feat: v4 multimodal chat input, multi-format support, and annotation detection

- Replace st.chat_input with st-multimodal-chatinput (Ctrl+V paste, drag-drop, file button)
- Extract _process_uploaded_file() shared handler (eliminates ~70 duplicated lines)
- Add XLSX (openpyxl), XLS (xlrd), DOC (olefile) parsers to file_parser.py
- Add backend/annotation_detector.py: circle detection (HoughCircles) + arrow detection (HoughLinesP clustering) + OCR correlation + LLM context formatting
- Add annotation_result field to AgentState with session persistence
- Wire annotation detection into process_input and _format_ocr_context
- Add 11 new tests: 7 annotation detector + 4 multi-format parser
- Update all docs: CLAUDE.md, README.md, CODE_GUIDE.md, ROADMAP.md
2026-05-20 23:43:16 +08:00
parent c9f003e1b7
commit 9bb011e429
16 changed files with 1257 additions and 164 deletions
@@ -373,40 +373,7 @@ def _load_image(path: Path) -> Optional[PIL.Image.Image]:
 def _ocr_elements(img: PIL.Image.Image, file_path: str) -> list[dict]:
    """OCR 提取图片中的文字元素（位置+内容）。优先 EasyOCR，回退 PaddleOCR。"""

-    # 优先 EasyOCR
-    try:
-        import easyocr
-        import numpy as np
-
-        reader = easyocr.Reader(["ch_sim", "en"], gpu=False, verbose=False)
-        result = reader.readtext(np.array(img))
-
-        elements = []
-        for (bbox, text, confidence) in result:
-            if not text.strip():
-                continue
-            xs = [p[0] for p in bbox]
-            ys = [p[1] for p in bbox]
-            x_min, x_max = min(xs), max(xs)
-            y_min, y_max = min(ys), max(ys)
-
-            elements.append({
-                "x": round(x_min, 1),
-                "y": round(y_min, 1),
-                "w": round(x_max - x_min, 1),
-                "h": round(y_max - y_min, 1),
-                "font_size": round(y_max - y_min, 1),
-                "text": text.strip(),
-            })
-
-        elements.sort(key=lambda e: (e["y"], e["x"]))
-        return elements
-    except ImportError:
-        pass
-    except Exception:
-        pass
-
-    # 回退 PaddleOCR
+    # 优先 PaddleOCR（精确识别）
    try:
        from paddleocr import PaddleOCR
        import numpy as np
@@ -446,6 +413,39 @@ def _ocr_elements(img: PIL.Image.Image, file_path: str) -> list[dict]:
    except Exception:
        pass

+    # 回退 EasyOCR
+    try:
+        import easyocr
+        import numpy as np
+
+        reader = easyocr.Reader(["ch_sim", "en"], gpu=False, verbose=False)
+        result = reader.readtext(np.array(img))
+
+        elements = []
+        for (bbox, text, confidence) in result:
+            if not text.strip():
+                continue
+            xs = [p[0] for p in bbox]
+            ys = [p[1] for p in bbox]
+            x_min, x_max = min(xs), max(xs)
+            y_min, y_max = min(ys), max(ys)
+
+            elements.append({
+                "x": round(x_min, 1),
+                "y": round(y_min, 1),
+                "w": round(x_max - x_min, 1),
+                "h": round(y_max - y_min, 1),
+                "font_size": round(y_max - y_min, 1),
+                "text": text.strip(),
+            })
+
+        elements.sort(key=lambda e: (e["y"], e["x"]))
+        return elements
+    except ImportError:
+        pass
+    except Exception:
+        pass
+
    return []