feat: 新增 OCR 单据字段精确提取模块

- 新增 backend/ocr_extractor.py: 两阶段提取流水线 (文档分析 + 字段提取)
- 四种提取策略: 精确KV匹配/模糊KV匹配/正则模式/表格结构匹配
- agent/state.py: 新增 ocr_extraction_result 和 uploaded_file_path 字段
- agent/nodes.py: process_input() 中自动触发 OCR 提取钩子
- app.py: 文件上传时保留图片路径, 总结卡片中展示提取结果
- .env.example: 新增 OCR_USE_GPU / OCR_CONFIDENCE_THRESHOLD 配置项
- tests/test_ocr_extraction.py: 48 个单元测试全部通过
This commit is contained in:
2026-05-20 08:06:55 +08:00
parent 067880bf2e
commit c9f003e1b7
6 changed files with 1417 additions and 2 deletions
+25
View File
@@ -7,6 +7,7 @@ import os
import re
import time
from datetime import datetime, timezone
from pathlib import Path
from typing import Dict
from dotenv import load_dotenv
@@ -114,6 +115,30 @@ def process_input(state: AgentState) -> Dict:
conv_history.append({"role": "user", "content": user_input})
state["conversation_history"] = conv_history
# OCR 单据字段精确提取(处理上传的图片文件)
uploaded_path = state.get("uploaded_file_path", "")
if uploaded_path and Path(uploaded_path).is_file():
suffix = Path(uploaded_path).suffix.lower()
if suffix in (".png", ".jpg", ".jpeg", ".bmp", ".webp"):
try:
from backend.ocr_extractor import OcrExtractor
extractor = OcrExtractor()
ocr_result = extractor.extract(uploaded_path, [])
if ocr_result.get("ocr_available"):
state["ocr_extraction_result"] = ocr_result
_node_log.info(
"OCR 字段提取完成",
extra={
"file": uploaded_path,
"elements": ocr_result.get("total_elements", 0),
"fields": len(ocr_result.get("fields", [])),
},
)
except Exception as e:
_node_log.warning(f"OCR 字段提取失败: {e}")
state["ocr_extraction_result"] = {"error": str(e)}
state["uploaded_file_path"] = ""
# 重置本轮请求字段
state["retry_count"] = 0
state["user_modification_request"] = user_input
+4
View File
@@ -40,3 +40,7 @@ class AgentState(TypedDict, total=False):
# 需求6:失败上下文传递 — 重试耗尽后暂存失败信息,下次用户输入时自动注入
pending_failure_context: dict
# 需求7:OCR 单据字段精确提取结果
ocr_extraction_result: dict
uploaded_file_path: str