feat: v4 multimodal chat input, multi-format support, and annotation detection
- Replace st.chat_input with st-multimodal-chatinput (Ctrl+V paste, drag-drop, file button)
- Extract _process_uploaded_file() shared handler (eliminates ~70 duplicated lines)
- Add XLSX (openpyxl), XLS (xlrd), DOC (olefile) parsers to file_parser.py
- Add backend/annotation_detector.py: circle detection (HoughCircles) + arrow detection (HoughLinesP clustering) + OCR correlation + LLM context formatting
- Add annotation_result field to AgentState with session persistence
- Wire annotation detection into process_input and _format_ocr_context
- Add 11 new tests: 7 annotation detector + 4 multi-format parser
- Update all docs: CLAUDE.md, README.md, CODE_GUIDE.md, ROADMAP.md
This commit is contained in:
+34
-34
@@ -373,40 +373,7 @@ def _load_image(path: Path) -> Optional[PIL.Image.Image]:
|
||||
def _ocr_elements(img: PIL.Image.Image, file_path: str) -> list[dict]:
|
||||
"""OCR 提取图片中的文字元素(位置+内容)。优先 EasyOCR,回退 PaddleOCR。"""
|
||||
|
||||
# 优先 EasyOCR
|
||||
try:
|
||||
import easyocr
|
||||
import numpy as np
|
||||
|
||||
reader = easyocr.Reader(["ch_sim", "en"], gpu=False, verbose=False)
|
||||
result = reader.readtext(np.array(img))
|
||||
|
||||
elements = []
|
||||
for (bbox, text, confidence) in result:
|
||||
if not text.strip():
|
||||
continue
|
||||
xs = [p[0] for p in bbox]
|
||||
ys = [p[1] for p in bbox]
|
||||
x_min, x_max = min(xs), max(xs)
|
||||
y_min, y_max = min(ys), max(ys)
|
||||
|
||||
elements.append({
|
||||
"x": round(x_min, 1),
|
||||
"y": round(y_min, 1),
|
||||
"w": round(x_max - x_min, 1),
|
||||
"h": round(y_max - y_min, 1),
|
||||
"font_size": round(y_max - y_min, 1),
|
||||
"text": text.strip(),
|
||||
})
|
||||
|
||||
elements.sort(key=lambda e: (e["y"], e["x"]))
|
||||
return elements
|
||||
except ImportError:
|
||||
pass
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
# 回退 PaddleOCR
|
||||
# 优先 PaddleOCR(精确识别)
|
||||
try:
|
||||
from paddleocr import PaddleOCR
|
||||
import numpy as np
|
||||
@@ -446,6 +413,39 @@ def _ocr_elements(img: PIL.Image.Image, file_path: str) -> list[dict]:
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
# 回退 EasyOCR
|
||||
try:
|
||||
import easyocr
|
||||
import numpy as np
|
||||
|
||||
reader = easyocr.Reader(["ch_sim", "en"], gpu=False, verbose=False)
|
||||
result = reader.readtext(np.array(img))
|
||||
|
||||
elements = []
|
||||
for (bbox, text, confidence) in result:
|
||||
if not text.strip():
|
||||
continue
|
||||
xs = [p[0] for p in bbox]
|
||||
ys = [p[1] for p in bbox]
|
||||
x_min, x_max = min(xs), max(xs)
|
||||
y_min, y_max = min(ys), max(ys)
|
||||
|
||||
elements.append({
|
||||
"x": round(x_min, 1),
|
||||
"y": round(y_min, 1),
|
||||
"w": round(x_max - x_min, 1),
|
||||
"h": round(y_max - y_min, 1),
|
||||
"font_size": round(y_max - y_min, 1),
|
||||
"text": text.strip(),
|
||||
})
|
||||
|
||||
elements.sort(key=lambda e: (e["y"], e["x"]))
|
||||
return elements
|
||||
except ImportError:
|
||||
pass
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
return []
|
||||
|
||||
|
||||
|
||||
Reference in New Issue
Block a user