feat: v4 multimodal chat input, multi-format support, and annotation detection

- Replace st.chat_input with st-multimodal-chatinput (Ctrl+V paste, drag-drop, file button)
- Extract _process_uploaded_file() shared handler (eliminates ~70 duplicated lines)
- Add XLSX (openpyxl), XLS (xlrd), DOC (olefile) parsers to file_parser.py
- Add backend/annotation_detector.py: circle detection (HoughCircles) + arrow detection (HoughLinesP clustering) + OCR correlation + LLM context formatting
- Add annotation_result field to AgentState with session persistence
- Wire annotation detection into process_input and _format_ocr_context
- Add 11 new tests: 7 annotation detector + 4 multi-format parser
- Update all docs: CLAUDE.md, README.md, CODE_GUIDE.md, ROADMAP.md
2026-05-20 23:43:16 +08:00
parent c9f003e1b7
commit 9bb011e429
16 changed files with 1257 additions and 164 deletions
@@ -106,6 +106,81 @@ def _render_jrxml(jrxml: str, max_lines: int = 30):
    st.code(preview, language="xml")


+# ---- 共享文件上传处理 ----
+def _process_uploaded_file(uploaded_file, suffix: str) -> dict:
+    """处理单个上传文件：保存临时文件、解析、布局分析。
+
+    返回: {"name": str, "text": str, "type": str, "tmp_path": str|None}
+    """
+    import tempfile
+    from backend.file_parser import parse_file
+    from backend.layout_analyzer import analyze_layout
+
+    with tempfile.NamedTemporaryFile(suffix=suffix, delete=False) as tmp:
+        tmp.write(uploaded_file.getvalue())
+        tmp_path = tmp.name
+
+    result = parse_file(tmp_path, suffix)
+    parsed_text = result["text"]
+    parsed_type = result["file_type"]
+
+    # 对图片/PDF 进行 A4 模板布局分析
+    if suffix in (".png", ".jpg", ".jpeg", ".bmp", ".webp", ".pdf"):
+        layout = analyze_layout(tmp_path)
+        tt = layout.get("template_type", "unknown")
+        current_jrxml = st.session_state.agent_state.get("current_jrxml", "")
+
+        if tt == "full_a4":
+            parsed_text = layout["description"]
+            parsed_type = "a4_template"
+        elif tt == "partial_rows":
+            parsed_type = "a4_partial"
+            if current_jrxml.strip():
+                from backend.layout_analyzer import match_rows_to_jrxml
+                match = match_rows_to_jrxml(layout, current_jrxml)
+                parsed_text = (
+                    f"[行片段修改] 上传图片包含 {layout['total_rows']} 行，"
+                    f"视为 A4 报表的一部分。\n\n"
+                    f"{match['description']}\n\n"
+                    f"--- 行结构 ---\n{layout['description']}"
+                )
+            else:
+                parsed_text = layout["description"]
+        else:
+            has_ocr = result.get("method") not in ("metadata_only", None)
+            img_w, img_h = layout["image_size"]
+            ratio = layout["aspect_ratio"]
+            if has_ocr:
+                parsed_text = (
+                    f"[图片上传] 尺寸 {img_w}x{img_h}px, 比例 {ratio}。"
+                    f"未检测到 A4 报表结构，图片将被视为参考样式。\n"
+                    f"请根据用户的文字描述生成报表。"
+                )
+            else:
+                parsed_text = (
+                    f"[图片上传] 尺寸 {img_w}x{img_h}px, 比例 {ratio}。\n"
+                    f"⚠ OCR 引擎未安装，无法识别图片中的文字内容。\n"
+                    f"请严格根据用户的文字描述来推断图片中的报表需求。\n"
+                    f"（提示：如需图片文字识别，请运行 pip install paddleocr）"
+                )
+            parsed_type = "image_reference"
+
+    elif suffix in (".pdf", ".docx", ".xlsx", ".xls", ".doc"):
+        parsed_type = suffix.lstrip(".")
+
+    keep_temp = (
+        suffix in (".png", ".jpg", ".jpeg", ".bmp", ".webp")
+        and result.get("method") not in ("metadata_only", None)
+    )
+
+    return {
+        "name": uploaded_file.name,
+        "text": parsed_text,
+        "type": parsed_type,
+        "tmp_path": tmp_path if keep_temp else None,
+    }
+
+
 # ---- URL 参数 ----
 query_params = st.query_params
 url_session_id = query_params.get("session_id", "")
@@ -480,7 +555,8 @@ with st.sidebar:

    uploaded = st.file_uploader(
        "选择文件",
-        type=["png", "jpg", "jpeg", "bmp", "webp", "pdf", "docx", "txt", "csv", "json", "xml"],
+        type=["png", "jpg", "jpeg", "bmp", "webp", "pdf", "docx", "xlsx", "xls", "doc",
+              "txt", "csv", "json", "xml"],
        accept_multiple_files=True,
        key="file_uploader",
        label_visibility="collapsed",
@@ -491,77 +567,21 @@ with st.sidebar:
            # 去重
            if any(f["name"] == uf.name for f in st.session_state.uploaded_files):
                continue
-            import tempfile
-            from backend.file_parser import parse_file
-            from backend.layout_analyzer import analyze_layout

            suffix = Path(uf.name).suffix.lower()
-            with tempfile.NamedTemporaryFile(suffix=suffix, delete=False) as tmp:
-                tmp.write(uf.getvalue())
-                tmp_path = tmp.name
+            result = _process_uploaded_file(uf, suffix)

-            result = parse_file(tmp_path, suffix)
-
-            # 对图片/PDF 进行 A4 模板布局分析
-            parsed_text = result["text"]
-            parsed_type = result["file_type"]
-            if suffix in (".png", ".jpg", ".jpeg", ".bmp", ".webp", ".pdf"):
-                layout = analyze_layout(tmp_path)
-                tt = layout.get("template_type", "unknown")
-                current_jrxml = st.session_state.agent_state.get("current_jrxml", "")
-
-                if tt == "full_a4":
-                    parsed_text = layout["description"]
-                    parsed_type = "a4_template"
-                elif tt == "partial_rows":
-                    parsed_type = "a4_partial"
-                    if current_jrxml.strip():
-                        # 修改模式：尝试行匹配
-                        from backend.layout_analyzer import match_rows_to_jrxml
-                        match = match_rows_to_jrxml(layout, current_jrxml)
-                        parsed_text = (
-                            f"[行片段修改] 上传图片包含 {layout['total_rows']} 行，"
-                            f"视为 A4 报表的一部分。\n\n"
-                            f"{match['description']}\n\n"
-                            f"--- 行结构 ---\n{layout['description']}"
-                        )
-                    else:
-                        # 新建模式：按 A4 模板处理
-                        parsed_text = layout["description"]
-                else:
-                    # tt == "unknown": OCR 不可用或未检测到文字元素
-                    has_ocr = result.get("method") not in ("metadata_only", None)
-                    img_w, img_h = layout["image_size"]
-                    ratio = layout["aspect_ratio"]
-                    if has_ocr:
-                        parsed_text = (
-                            f"[图片上传] 尺寸 {img_w}x{img_h}px, 比例 {ratio}。"
-                            f"未检测到 A4 报表结构，图片将被视为参考样式。\n"
-                            f"请根据用户的文字描述生成报表。"
-                        )
-                    else:
-                        parsed_text = (
-                            f"[图片上传] 尺寸 {img_w}x{img_h}px, 比例 {ratio}。\n"
-                            f"⚠ OCR 引擎未安装，无法识别图片中的文字内容。\n"
-                            f"请严格根据用户的文字描述来推断图片中的报表需求。\n"
-                            f"（提示：如需图片文字识别，请运行 pip install paddleocr）"
-                        )
-                    parsed_type = "image_reference"
-
-            if parsed_text:
+            if result["text"]:
                st.session_state.uploaded_files.append({
-                    "name": uf.name,
-                    "text": parsed_text,
-                    "type": parsed_type,
+                    "name": result["name"],
+                    "text": result["text"],
+                    "type": result["type"],
                })

-            # 对图片类型，保存路径以便 OCR 字段提取（延迟到 process_input 阶段）
-            img_suffixes = (".png", ".jpg", ".jpeg", ".bmp", ".webp")
-            if suffix in img_suffixes and result.get("method") not in ("metadata_only", None):
+            tmp_path = result["tmp_path"]
+            if tmp_path:
                st.session_state.agent_state["uploaded_file_path"] = tmp_path
                st.session_state.uploaded_temp_paths.append(tmp_path)
-            else:
-                Path(tmp_path).unlink(missing_ok=True)

    if st.session_state.uploaded_files:
        for i, f in enumerate(st.session_state.uploaded_files):
@@ -632,34 +652,106 @@ for msg in st.session_state.messages:
        else:
            st.markdown(msg["content"])

-# ---- 聊天输入 ----
-if prompt := st.chat_input("描述您的报表需求..."):
-    # 拼接上传文件的文本
+# ---- 聊天输入（支持粘贴/拖拽文件） ----
+from st_multimodal_chatinput import multimodal_chatinput
+import base64
+import io
+from pathlib import Path as _Path
+
+# MIME type → 文件扩展名映射（用于剪贴板粘贴无扩展名的文件）
+MIME_TO_EXT = {
+    "image/png": ".png",
+    "image/jpeg": ".jpg",
+    "image/bmp": ".bmp",
+    "image/webp": ".webp",
+    "application/pdf": ".pdf",
+    "application/vnd.openxmlformats-officedocument.wordprocessingml.document": ".docx",
+    "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet": ".xlsx",
+    "application/vnd.ms-excel": ".xls",
+    "application/msword": ".doc",
+    "text/plain": ".txt",
+    "text/csv": ".csv",
+    "application/json": ".json",
+    "text/xml": ".xml",
+}
+
+chat_result = multimodal_chatinput()
+if chat_result:
+    prompt = (chat_result.get("textInput") or "").strip()
+    chat_files = chat_result.get("uploadedFiles") or []
+
+    # 处理聊天中上传/粘贴的文件
    uploaded_texts = []
    uploaded_files_info = []
+
+    # 先收集侧边栏已上传的文件
    if st.session_state.get("uploaded_files"):
        for f in st.session_state.uploaded_files:
            uploaded_texts.append(f"[上传文件: {f['name']}]\n{f['text']}")
            uploaded_files_info.append({"name": f["name"], "type": f["type"], "length": len(f["text"])})
-    if uploaded_texts:
-        full_prompt = "\n\n".join(uploaded_texts) + "\n\n---\n用户需求:\n" + prompt
-        st.session_state.uploaded_files = []  # 用后即清
-    else:
-        full_prompt = prompt
+        st.session_state.uploaded_files = []

-    _app_log.info(
-        "收到用户输入",
-        extra={
-            "session_id": current_session_id,
-            "prompt_preview": prompt[:200],
-            "prompt_length": len(prompt),
-            "has_uploaded_files": bool(uploaded_files_info),
-            "uploaded_files": uploaded_files_info,
-        },
-    )
+    # 处理聊天中的文件
+    class _Base64File:
+        """包装 base64 文件为类 UploadedFile 接口。"""
+        def __init__(self, name, data_bytes):
+            self.name = name
+            self._data = data_bytes

-    st.session_state.messages.append({"role": "user", "content": prompt})
-    with st.chat_message("user"):
-        st.markdown(prompt)
-    run_agent(full_prompt)
-    st.rerun()
+        def getvalue(self):
+            return self._data
+
+    for cf in chat_files:
+        name = cf.get("name", "clipboard_file")
+        mime = cf.get("type", "")
+        content_b64 = cf.get("content", "")
+        if not content_b64:
+            continue
+        try:
+            data = base64.b64decode(content_b64)
+        except Exception:
+            continue
+
+        suffix = _Path(name).suffix.lower()
+        if not suffix and mime in MIME_TO_EXT:
+            suffix = MIME_TO_EXT[mime]
+            name = f"{_Path(name).stem}{suffix}"
+
+        wrapper = _Base64File(name, data)
+        result = _process_uploaded_file(wrapper, suffix)
+
+        if result["text"]:
+            uploaded_texts.append(f"[上传文件: {result['name']}]\n{result['text']}")
+            uploaded_files_info.append({"name": result["name"], "type": result["type"], "length": len(result["text"])})
+
+        tmp_path = result["tmp_path"]
+        if tmp_path:
+            st.session_state.agent_state["uploaded_file_path"] = tmp_path
+            st.session_state.uploaded_temp_paths.append(tmp_path)
+
+    if prompt or uploaded_texts:
+        if uploaded_texts:
+            full_prompt = "\n\n".join(uploaded_texts)
+            if prompt:
+                full_prompt += "\n\n---\n用户需求:\n" + prompt
+        else:
+            full_prompt = prompt
+
+        displayed_prompt = prompt or "(已上传文件，未输入文字)"
+
+        _app_log.info(
+            "收到用户输入",
+            extra={
+                "session_id": current_session_id,
+                "prompt_preview": displayed_prompt[:200],
+                "prompt_length": len(full_prompt),
+                "has_uploaded_files": bool(uploaded_files_info),
+                "uploaded_files": uploaded_files_info,
+            },
+        )
+
+        st.session_state.messages.append({"role": "user", "content": displayed_prompt})
+        with st.chat_message("user"):
+            st.markdown(displayed_prompt)
+        run_agent(full_prompt)
+        st.rerun()