ソースを参照

增加vllm加速推理框架

Sherlock1011 2 ヶ月 前
コミット
34f934c4c4
8 ファイル変更449 行追加58 行削除
  1. 22 36
      agent/agent.py
  2. 5 4
      api/run_api.py
  3. 1 1
      config/config.py
  4. 3 1
      model/__init__.py
  5. 10 10
      model/model_api.py
  6. 404 0
      model/qwen_ocr_vllm.py
  7. 0 3
      requirements.txt
  8. 4 3
      test_api.py

+ 22 - 36
agent/agent.py

@@ -46,17 +46,17 @@ class OcrAgent:
     def __init__(self):
         self._url = "http://127.0.0.1:8000/api/v1/ocr"
 
-    def extract_part_info(self, image_base64, prompt):
+    def extract_part_info(self, image_base64, prompts):
         """根据提示词提取信息"""
         response = requests.post(
             self._url,
             json={
                 "image": image_base64,
-                "text": prompt
+                "text": prompts
             }
         )
         result = response.json()
-        return json.loads(result['data'][0])
+        return result
 
     def agent_ocr(self, image):
         """qwen_ocr提取化学品安全标签信息"""
@@ -66,40 +66,25 @@ class OcrAgent:
         start_time = time.perf_counter()
 
         # 定义需要批量提交给模型的提取提示词(单次请求完成全部提取)
-        tasks = {
-            'icon': PROMPT_EXTRACT_ICON,
-            'name': PROMPT_EXTRACT_NAME,
-            'tag': PROMPT_EXTRACT_COMPONENTS,
-            'risk_notice': PROMPT_EXTRACT_KEYWORD,
-            'pre_notice': PROMPT_EXTRACT_PREVENTION,
-            'suppliers': PROMPT_EXTRACT_SUPPLIER
-        }
-
-        # 使用线程池并行执行所有提取任务
-        results = {}
-        with ThreadPoolExecutor(max_workers=6) as executor:
-            # 提交所有任务
-            future_to_task = {
-                executor.submit(self.extract_part_info, image_base64, prompt): task_name
-                for task_name, prompt in tasks.items()
-            }
-
-            # 收集结果
-            for future in as_completed(future_to_task):
-                task_name = future_to_task[future]
-                try:
-                    results[task_name] = future.result()
-                except Exception as e:
-                    print(f"任务 {task_name} 执行失败: {e}")
-                    results[task_name] = {}
+        prompts = [
+            PROMPT_EXTRACT_ICON,
+            PROMPT_EXTRACT_NAME,
+            PROMPT_EXTRACT_COMPONENTS,
+            PROMPT_EXTRACT_KEYWORD,
+            PROMPT_EXTRACT_PREVENTION,
+            PROMPT_EXTRACT_SUPPLIER
+        ]
+
+        results = self.extract_part_info(image_base64, prompts)
+        results = results["data"]
 
         # 从结果中提取数据
-        icon = results.get('icon', {})
-        name = results.get('name', {})
-        tag = results.get('tag', {})
-        risk_notice = results.get('risk_notice', {})
-        pre_notice = results.get('pre_notice', {})
-        suppliers = results.get('suppliers', {})
+        icon = json.loads(results[0])
+        name = json.loads(results[1])
+        tag = json.loads(results[2])
+        risk_notice = json.loads(results[3])
+        pre_notice = json.loads(results[4])
+        suppliers = json.loads(results[5])
 
         end_time = time.perf_counter()
         elapsed_time = end_time - start_time
@@ -125,4 +110,5 @@ class OcrAgent:
 if __name__ == "__main__":
     image = Image.open("./test1.jpg").convert("RGB")
     agent = OcrAgent()
-    agent.agent_ocr(image)
+    res = agent.agent_ocr(image)
+    print(res)

+ 5 - 4
api/run_api.py

@@ -13,7 +13,7 @@ from datetime import datetime
 
 from fastapi import FastAPI, HTTPException, status
 from fastapi.responses import JSONResponse
-from pydantic import BaseModel, Field, validator
+from pydantic import BaseModel, Field, field_validator
 from PIL import Image
 import uvicorn
 
@@ -37,7 +37,8 @@ class AgentOCRRequest(BaseModel):
     """Agent OCR 请求模型"""
     image: str = Field(..., description="Base64 编码的图像字符串")
 
-    @validator('image')
+    @field_validator('image')
+    @classmethod
     def validate_image(cls, v):
         """验证 base64 图像格式"""
         if not v:
@@ -346,9 +347,9 @@ async def global_exception_handler(request, exc):
 def main():
     """启动服务"""
     uvicorn.run(
-        "run_api:app",
+        "api.run_api:app",
         host="0.0.0.0",
-        port=7080,  # 使用 8001 端口,避免与 model_api 的 8000 端口冲突
+        port=6006,  # 使用 6006 端口,避免与 model_api 的 8000 端口冲突
         workers=1,  # 由于 Agent 占用资源,使用单 worker
         log_level="info",
         access_log=True,

+ 1 - 1
config/config.py

@@ -1,7 +1,7 @@
 # OCR配置文件
 
 # 模型路径
-MODEL_PATH = "/root/llm/Qwen3-VL-8B-Instruct"
+MODEL_PATH = "/root/autodl-tmp/llm/Qwen3-VL-8B-Instruct"
 
 
 # ========== OCR提示词 - 分步骤提取 ==========

+ 3 - 1
model/__init__.py

@@ -1,5 +1,7 @@
 from .qwen_ocr import QwenOcr
+from .qwen_ocr_vllm import QwenOcrVLLM
 
 __all__ = [
-    "QwenOcr"
+    "QwenOcr",
+    "QwenOcrVLLM"
 ]

+ 10 - 10
model/model_api.py

@@ -17,7 +17,7 @@ from pydantic import BaseModel, Field, validator
 from PIL import Image
 import uvicorn
 
-from model.qwen_ocr import QwenOcr
+from model import QwenOcr, QwenOcrVLLM
 
 
 # ==================== 日志配置 ====================
@@ -36,7 +36,7 @@ logger = logging.getLogger(__name__)
 class OCRRequest(BaseModel):
     """OCR 推理请求模型"""
     image: str = Field(..., description="Base64 编码的图像字符串")
-    text: str = Field(..., description="OCR 提示词文本")
+    text: list = Field(..., description="OCR 提示词文本列表")
 
     @validator('image')
     def validate_image(cls, v):
@@ -53,9 +53,9 @@ class OCRRequest(BaseModel):
     @validator('text')
     def validate_text(cls, v):
         """验证提示词文本"""
-        if not v or not v.strip():
+        if not v:
             raise ValueError("提示词不能为空")
-        return v.strip()
+        return v
 
 
 class OCRResponse(BaseModel):
@@ -114,7 +114,7 @@ class ModelManager:
             logger.info("开始加载 QwenOcr 模型...")
             # 在线程池中加载模型,避免阻塞事件循环
             loop = asyncio.get_event_loop()
-            self.model = await loop.run_in_executor(None, QwenOcr)
+            self.model = await loop.run_in_executor(None, QwenOcrVLLM)
 
             # 初始化并发控制
             self.max_concurrent_requests = max_concurrent
@@ -166,12 +166,12 @@ class ModelManager:
             logger.error(f"Base64 转换失败: {e}")
             raise ValueError(f"图像解码失败: {str(e)}")
 
-    async def inference(self, image_base64: str, prompt: str) -> list:
+    async def inference(self, image_base64: str, prompts: str) -> list:
         """
         执行 OCR 推理(带并发控制)
         Args:
             image_base64: base64 编码的图像
-            prompt: 提示词
+            prompts: 提示词列表
         Returns:
             推理结果
         """
@@ -191,9 +191,9 @@ class ModelManager:
                 loop = asyncio.get_event_loop()
                 result = await loop.run_in_executor(
                     None,
-                    self.model.inference,
-                    pil_image,
-                    prompt
+                    self.model.batch_inference,
+                    [pil_image] * len(prompts),
+                    prompts
                 )
 
                 return result

+ 404 - 0
model/qwen_ocr_vllm.py

@@ -0,0 +1,404 @@
+import base64
+from io import BytesIO
+from PIL import Image
+import json
+import os
+from pathlib import Path
+from typing import List, Dict, Any, Optional
+
+from qwen_vl_utils import process_vision_info
+from transformers import AutoProcessor
+import time
+import torch
+
+from config import MODEL_PATH, PROMPT_EXTRACT_NAME, PROMPT_EXTRACT_ICON, PROMPT_EXTRACT_COMPONENTS, PROMPT_EXTRACT_KEYWORD, PROMPT_EXTRACT_PREVENTION, PROMPT_EXTRACT_SUPPLIER
+
+# vLLM imports
+from vllm import LLM, SamplingParams
+from vllm.multimodal.utils import fetch_image
+
+
def image_to_base64(pil_image, image_format="JPEG"):
    """Encode a PIL image as a base64 string.

    Args:
        pil_image: PIL Image (anything exposing ``save(buffer, format=...)``).
        image_format: serialization format, e.g. "JPEG" or "PNG".

    Returns:
        Base64-encoded string of the serialized image bytes.
    """
    with BytesIO() as buffer:
        pil_image.save(buffer, format=image_format)
        raw_bytes = buffer.getvalue()
    return base64.b64encode(raw_bytes).decode('utf-8')
+
+
class QwenOcrVLLM:
    """Qwen OCR inference accelerated by the vLLM framework.

    Why vLLM:
    1. PagedAttention - efficient KV-cache management
    2. Continuous batching - better GPU utilization
    3. Fast execution - optimized CUDA/cuDNN kernels
    4. Quantization support - AWQ, GPTQ, etc.
    5. Tensor parallelism - multi-GPU inference
    """

    def __init__(
        self,
        icon_dir: str = "./icon",
        tensor_parallel_size: int = 1,
        gpu_memory_utilization: float = 0.9,
        max_model_len: int = 8192,
        dtype: str = "bfloat16",
        trust_remote_code: bool = True,
    ):
        """Initialize the vLLM engine and chat-template processor, then warm up.

        Args:
            icon_dir: directory holding GHS icon reference images (*.png).
            tensor_parallel_size: tensor-parallel degree (number of GPUs).
            gpu_memory_utilization: fraction of GPU memory vLLM may use (0.0-1.0).
            max_model_len: maximum model sequence length.
            dtype: data type ("auto", "half", "float16", "bfloat16",
                "float", "float32").
            trust_remote_code: whether to trust code shipped with the model repo.
        """
        print("=" * 60)
        print("初始化 vLLM 加速推理引擎...")
        print("=" * 60)

        # vLLM engine. limit_mm_per_prompt caps images per prompt; needed
        # because extract_icons packs several reference icons plus the target
        # image into a single prompt.
        self.llm = LLM(
            model=MODEL_PATH,
            tensor_parallel_size=tensor_parallel_size,
            gpu_memory_utilization=gpu_memory_utilization,
            max_model_len=max_model_len,
            dtype=dtype,
            trust_remote_code=trust_remote_code,
            limit_mm_per_prompt={"image": 10},
        )

        # The processor is only used to render the chat template into a prompt.
        self.processor = AutoProcessor.from_pretrained(
            MODEL_PATH,
            trust_remote_code=trust_remote_code
        )

        # BUG FIX: the eager _load_icon_images() call was commented out, yet
        # extract_icons still read self.icon_images -> AttributeError at runtime.
        # Keep startup fast by loading lazily on first extract_icons() call.
        self.icon_dir = icon_dir
        self.icon_images = None

        # Greedy decoding (temperature 0): deterministic output suits
        # structured OCR extraction.
        self.default_sampling_params = SamplingParams(
            temperature=0.0,
            top_p=1.0,
            max_tokens=512,
            stop_token_ids=None,
            skip_special_tokens=True,
        )

        print("=" * 60)
        print("vLLM 引擎初始化完成!")
        print(f"- 模型路径: {MODEL_PATH}")
        print(f"- 张量并行: {tensor_parallel_size} GPU(s)")
        print(f"- 显存利用率: {gpu_memory_utilization * 100:.1f}%")
        print(f"- 数据类型: {dtype}")
        print(f"- 最大序列长度: {max_model_len}")
        print("=" * 60)

        # Warm up so first real request does not pay compilation cost.
        print("模型预热中...")
        self._warmup()
        print("模型预热完成!")
        print("=" * 60)

    def _load_icon_images(self) -> Dict[str, Image.Image]:
        """Load all *.png reference images from the icon directory.

        Returns:
            Mapping from icon name (file stem, e.g. "GHS01") to RGB image.
            Empty dict when the directory does not exist.
        """
        icon_images = {}
        icon_path = Path(self.icon_dir)

        if not icon_path.exists():
            print(f"警告: icon目录 {self.icon_dir} 不存在")
            return icon_images

        for icon_file in icon_path.glob("*.png"):
            icon_name = icon_file.stem  # file name without extension, e.g. GHS01
            try:
                icon_image = Image.open(icon_file).convert("RGB")
                icon_images[icon_name] = icon_image
                print(f"已加载icon参考图像: {icon_name}")
            except Exception as e:
                # Skip unreadable files instead of failing the whole load.
                print(f"加载icon图像 {icon_file} 失败: {e}")

        return icon_images

    def _warmup(self):
        """Run one dummy inference to trigger engine compilation/optimization."""
        dummy_image = Image.new('RGB', (224, 224), color='white')
        prompt = PROMPT_EXTRACT_NAME
        try:
            self.inference(dummy_image, prompt, warmup=True)
        except Exception as e:
            # Warm-up is best-effort; a failure here must not block startup.
            print(f"预热过程中出现警告(可忽略): {e}")

    def _build_messages(self, image: Image.Image, prompt: str) -> List[Dict]:
        """Build a single-turn chat message with one image and one text part.

        Args:
            image: PIL image to attach to the prompt.
            prompt: instruction text.

        Returns:
            Message list in Qwen-VL chat format.
        """
        return [
            {
                "role": "user",
                "content": [
                    {"type": "image", "image": image},
                    {"type": "text", "text": prompt},
                ],
            }
        ]

    def _prepare_inputs(
        self,
        messages: List[Dict]
    ) -> Dict[str, Any]:
        """Convert chat messages into the vLLM generate() input format.

        Args:
            messages: chat messages (from _build_messages or extract_icons).

        Returns:
            Dict with "prompt" (templated text) and "multi_modal_data".
        """
        # Render the chat template into the raw prompt string.
        text = self.processor.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True
        )

        # Collect the PIL images referenced by the messages.
        image_inputs, video_inputs = process_vision_info(messages)

        # BUG FIX: previously only image_inputs[0] was forwarded, silently
        # dropping every extra image in multi-image prompts (extract_icons).
        # vLLM accepts a single image or a list; keep the single-image shape
        # for single-image prompts for backward compatibility.
        if not image_inputs:
            image_data = None
        elif len(image_inputs) == 1:
            image_data = image_inputs[0]
        else:
            image_data = image_inputs

        # vLLM 0.6.0+ API: prompt text plus multi-modal data in one dict.
        return {
            "prompt": text,
            "multi_modal_data": {
                "image": image_data
            }
        }

    def inference(
        self,
        image: Image.Image,
        prompt: str,
        warmup: bool = False,
        sampling_params: Optional[SamplingParams] = None
    ) -> List[str]:
        """Run OCR on a single image.

        Args:
            image: input PIL image.
            prompt: extraction prompt.
            warmup: kept for interface compatibility only; the previous
                `if not warmup` branch was dead code (both paths returned
                the same value), so the result is simply always returned.
            sampling_params: optional override of the default sampling params.

        Returns:
            Generated texts (one element per request).
        """
        messages = self._build_messages(image, prompt)
        inputs = self._prepare_inputs(messages)
        params = sampling_params if sampling_params else self.default_sampling_params

        # vLLM 0.6.0+ API: pass the inputs dict directly.
        outputs = self.llm.generate(inputs, params)
        return [output.outputs[0].text for output in outputs]

    def batch_inference(
        self,
        images: List[Image.Image],
        prompts: List[str],
        sampling_params: Optional[SamplingParams] = None
    ) -> List[str]:
        """Run OCR on a batch of image/prompt pairs (vLLM's core strength).

        Args:
            images: input images, paired positionally with prompts.
            prompts: extraction prompts.
            sampling_params: optional override of the default sampling params.

        Returns:
            Generated texts in input order.

        Raises:
            ValueError: when images and prompts differ in length.
        """
        if len(images) != len(prompts):
            raise ValueError(f"images数量({len(images)})和prompts数量({len(prompts)})不匹配")

        all_inputs = [
            self._prepare_inputs(self._build_messages(image, prompt))
            for image, prompt in zip(images, prompts)
        ]

        params = sampling_params if sampling_params else self.default_sampling_params

        # vLLM 0.6.0+ API: pass the list of inputs directly; the engine
        # batches them with continuous batching.
        outputs = self.llm.generate(all_inputs, params)
        return [output.outputs[0].text for output in outputs]

    def extract_icons(self, image: Image.Image) -> List[str]:
        """Identify GHS pictograms by showing reference icons next to the image.

        Args:
            image: chemical-label image to analyze.

        Returns:
            Generated texts.
        """
        # Lazy-load the icon references on first use (see __init__ note).
        if not self.icon_images:
            self.icon_images = self._load_icon_images()

        content_list = []

        # Reference icons in GHS-number order, each followed by its label.
        for icon_name, icon_image in sorted(self.icon_images.items(), key=lambda x: x[0]):
            content_list.append({"type": "image", "image": icon_image})
            content_list.append({"type": "text", "text": f"参考图像:{icon_name}"})

        # The image to classify, then the instruction prompt.
        content_list.append({"type": "image", "image": image})
        content_list.append({"type": "text", "text": PROMPT_EXTRACT_ICON})

        messages = [{"role": "user", "content": content_list}]
        inputs = self._prepare_inputs(messages)

        outputs = self.llm.generate(inputs, self.default_sampling_params)
        return [output.outputs[0].text for output in outputs]

    def __del__(self):
        """Release the vLLM engine on garbage collection (best-effort)."""
        try:
            if hasattr(self, 'llm'):
                del self.llm
                print("vLLM引擎已释放")
        except Exception:
            # Interpreter may already be tearing down; never raise from __del__.
            pass
+
+
if __name__ == '__main__':
    # Smoke test: single-image inference followed by a batched run.
    print("初始化 QwenOcrVLLM...")
    qwen_ocr = QwenOcrVLLM(
        tensor_parallel_size=1,      # single GPU
        gpu_memory_utilization=0.9,  # use 90% of GPU memory
        max_model_len=8192,          # maximum sequence length
        dtype="bfloat16"             # bfloat16 precision
    )

    # ---- Single-image inference ----
    print("\n" + "=" * 60)
    print("测试单张图像推理...")
    print("=" * 60)

    test_image_path = "./test3.jpg"
    if os.path.exists(test_image_path):
        image = Image.open(test_image_path).convert("RGB")

        start_time = time.time()
        result = qwen_ocr.inference(image, PROMPT_EXTRACT_PREVENTION)
        elapsed = time.time() - start_time

        print(f"\n推理耗时: {elapsed:.3f}秒")
        print(f"提取结果:\n{result[0]}")
    else:
        print(f"测试图像 {test_image_path} 不存在,跳过测试")

    # ---- Batched inference ----
    print("\n" + "=" * 60)
    print("测试批量推理...")
    print("=" * 60)

    if os.path.exists(test_image_path):
        prompts = [PROMPT_EXTRACT_ICON, PROMPT_EXTRACT_NAME, PROMPT_EXTRACT_COMPONENTS, PROMPT_EXTRACT_KEYWORD, PROMPT_EXTRACT_PREVENTION, PROMPT_EXTRACT_SUPPLIER]
        # One copy of the test image per prompt (the old comment said 3 copies
        # while 6 were actually created).
        images = [Image.open(test_image_path).convert("RGB") for _ in range(len(prompts))]

        start_time = time.time()
        results = qwen_ocr.batch_inference(images, prompts)
        elapsed = time.time() - start_time
        for text in results:
            print(text)

        # BUG FIX: the per-image averages previously divided by a hard-coded 3
        # although 6 images are processed; use the real batch size.
        print(f"\n批量推理耗时: {elapsed:.3f}秒")
        print(f"平均每张: {elapsed/len(images):.3f}秒")
        print(f"批量推理加速比: {(elapsed/len(images)):.3f}秒/张 vs 单张推理")

    print("\n" + "=" * 60)
    print("测试完成!")
    print("=" * 60)

+ 0 - 3
requirements.txt

@@ -176,9 +176,6 @@ terminado==0.18.1
 tinycss2==1.4.0
 tokenizers==0.22.1
 tomli==2.2.1
-torch==2.8.0
-torch_npu==2.8.0
-torchvision==0.23.0
 tornado==6.4.2
 tqdm @ file:///croot/tqdm_1724853943256/work
 traitlets==5.14.3

+ 4 - 3
test_api.py

@@ -3,6 +3,7 @@ from io import BytesIO
 import base64
 import json
 from PIL import Image, ImageFilter, ImageEnhance
+from config import PROMPT_EXTRACT_ICON, PROMPT_EXTRACT_NAME, PROMPT_EXTRACT_COMPONENTS, PROMPT_EXTRACT_KEYWORD, PROMPT_EXTRACT_PREVENTION, PROMPT_EXTRACT_SUPPLIER
 
 def image_to_base64(pil_image, image_format="JPEG"):
     """将PIL Image图像转换为Base64编码"""
@@ -42,16 +43,16 @@ image_base64 = image_to_base64(image)
 #     "http://127.0.0.1:8000/api/v1/ocr",
 #     json={
 #         "image": image_base64,
-#         "text": PROMPT_EXTRACT_NAME
+#         "text": [PROMPT_EXTRACT_ICON, PROMPT_EXTRACT_NAME, PROMPT_EXTRACT_COMPONENTS, PROMPT_EXTRACT_KEYWORD, PROMPT_EXTRACT_PREVENTION, PROMPT_EXTRACT_SUPPLIER]
 #     }
 # )
 
 response = requests.post(
-    "https://749757254390085-http-7080.edge-proxy.gpugeek.com:8443/api/v1/agent_ocr",
+    "https://u475436-9425-5ad0e9a4.gda1.seetacloud.com:6443/api/v1/agent_ocr",
     json={
         "image": image_base64,
     }
 )
 
 result = response.json()
-print(result)
+print(result['data'])