Merge remote v4/v5 features (multimodal chat input, layered generation, annotation detection) with local v3 features (dialog file upload, XLSX support, session fix)

Key resolutions:
- agent/nodes.py: Merged session_id exclusion fix with new persistable fields (ocr_extraction_result, annotation_result, layout_schema, ocr_elements)
- app.py: Adopted st-multimodal-chatinput for unified paste/drop/upload, removed custom JS paste bridge
- backend/file_parser.py: Kept local XLSX parser, added remote XLS/DOC parsers
- CLAUDE.md + CODE_GUIDE.md: Merged documentation from both branches

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-05-21 10:05:43 +08:00
parent 87ead4fa6a 43a0542a11
commit 2befd44430
22 changed files with 2114 additions and 507 deletions
@@ -52,6 +52,8 @@ def parse_file(file_path: str, file_type: str = "") -> dict:
        ".pdf":  _parse_pdf,
        ".docx": _parse_docx,
        ".xlsx": _parse_xlsx,
+        ".xls":  _parse_xls,
+        ".doc":  _parse_doc,
    }

    parser = parsers.get(suffix)
@@ -73,26 +75,7 @@ def _parse_image(path: Path) -> dict:
    except Exception:
        info = "[图片: 无法读取元数据]"

-    # 优先 EasyOCR（Windows 兼容性更好）
-    try:
-        import easyocr
-        import numpy as np
-        reader = easyocr.Reader(["ch_sim", "en"], gpu=False, verbose=False)
-        result = reader.readtext(np.array(img))
-        lines = [text.strip() for (_, text, _) in result if text.strip()]
-        if lines:
-            return {
-                "text": f"{info}\n识别文本:\n" + "\n".join(lines),
-                "file_type": "image",
-                "method": "easyocr",
-                "error": None,
-            }
-    except ImportError:
-        pass
-    except Exception:
-        pass
-
-    # 回退 PaddleOCR
+    # 优先 PaddleOCR（精确识别）
    try:
        from paddleocr import PaddleOCR
        ocr = PaddleOCR(lang="ch")
@@ -115,6 +98,25 @@ def _parse_image(path: Path) -> dict:
    except Exception:
        pass

+    # 回退 EasyOCR
+    try:
+        import easyocr
+        import numpy as np
+        reader = easyocr.Reader(["ch_sim", "en"], gpu=False, verbose=False)
+        result = reader.readtext(np.array(img))
+        lines = [text.strip() for (_, text, _) in result if text.strip()]
+        if lines:
+            return {
+                "text": f"{info}\n识别文本:\n" + "\n".join(lines),
+                "file_type": "image",
+                "method": "easyocr",
+                "error": None,
+            }
+    except ImportError:
+        pass
+    except Exception:
+        pass
+
    # OCR 不可用 → 返回图片元信息 + 安装提示
    return {
        "text": f"{info}\n(如需 OCR 文字识别，请安装: pip install easyocr)",
@@ -197,36 +199,91 @@ def _parse_docx(path: Path) -> dict:


 def _parse_xlsx(path: Path) -> dict:
-    """提取 Excel (.xlsx) 表格内容为文本。"""
+    """提取 Excel .xlsx 文件中的文本。"""
    try:
-        import openpyxl
-        wb = openpyxl.load_workbook(path, read_only=True, data_only=True)
-        sheets_text = []
-        for sheet_name in wb.sheetnames:
-            ws = wb[sheet_name]
+        from openpyxl import load_workbook
+        wb = load_workbook(path, read_only=True, data_only=True)
+        parts = []
+        for name in wb.sheetnames:
+            ws = wb[name]
            rows = []
            for row in ws.iter_rows(values_only=True):
                cells = [str(c) if c is not None else "" for c in row]
-                if any(c.strip() for c in cells):
-                    rows.append(" | ".join(cells))
+                if any(c for c in cells):
+                    rows.append("\t".join(cells))
            if rows:
-                sheets_text.append(f"--- 工作表: {sheet_name} ---\n" + "\n".join(rows))
+                parts.append(f"[Sheet: {name}]\n" + "\n".join(rows))
        wb.close()
-        if sheets_text:
-            return {
-                "text": "\n\n".join(sheets_text),
-                "file_type": "xlsx",
-                "method": "openpyxl",
-                "error": None,
-            }
+        text = "\n\n".join(parts)
+        return {"text": text, "file_type": "xlsx", "method": "openpyxl", "error": None}
    except ImportError:
        pass
-    except Exception:
-        pass
+    except Exception as e:
+        return {"text": "", "file_type": "xlsx", "method": "none",
+                "error": f"XLSX 解析失败: {e}"}
    return {"text": "", "file_type": "xlsx", "method": "none",
            "error": "XLSX 解析需要安装 openpyxl"}


+def _parse_xls(path: Path) -> dict:
+    """提取旧版 Excel .xls 文件中的文本。"""
+    try:
+        import xlrd
+        wb = xlrd.open_workbook(path)
+        parts = []
+        for name in wb.sheet_names():
+            ws = wb.sheet_by_name(name)
+            rows = []
+            for rx in range(ws.nrows):
+                cells = [str(ws.cell_value(rx, cx)) if ws.cell_value(rx, cx) != "" else ""
+                         for cx in range(ws.ncols)]
+                if any(c for c in cells):
+                    rows.append("\t".join(cells))
+            if rows:
+                parts.append(f"[Sheet: {name}]\n" + "\n".join(rows))
+        text = "\n\n".join(parts)
+        return {"text": text, "file_type": "xls", "method": "xlrd", "error": None}
+    except ImportError:
+        pass
+    except Exception as e:
+        return {"text": "", "file_type": "xls", "method": "none",
+                "error": f"XLS 解析失败: {e}"}
+    return {"text": "", "file_type": "xls", "method": "none",
+            "error": "XLS 解析需要安装 xlrd"}
+
+
+def _parse_doc(path: Path) -> dict:
+    """提取旧版 Word .doc 文件中的文本（尽力而为，二进制格式）。"""
+    try:
+        import olefile
+        ole = olefile.OleFileIO(path)
+        if not ole.exists("WordDocument"):
+            ole.close()
+            return {"text": "", "file_type": "doc", "method": "none",
+                    "error": "不是有效的 .doc 文件"}
+        raw = ole.openstream("WordDocument").read()
+        ole.close()
+        # 提取可打印 UTF-16LE 字符段
+        text = ""
+        try:
+            decoded = raw.decode("utf-16-le", errors="ignore")
+            text = "".join(c for c in decoded if c.isprintable() or c in "\n\r\t")
+        except Exception:
+            pass
+        if not text.strip():
+            return {"text": "", "file_type": "doc", "method": "olefile",
+                    "error": "无法提取文本（.doc 为二进制格式，建议转换为 .docx）"}
+        return {"text": text.strip(), "file_type": "doc", "method": "olefile", "error": None}
+    except ImportError:
+        pass
+    except Exception as e:
+        return {"text": "", "file_type": "doc", "method": "none",
+                "error": f"DOC 解析失败: {e}"}
+    return {"text": "", "file_type": "doc", "method": "none",
+            "error": "DOC 解析需要安装 olefile"}
+
+
+
 def _parse_text(path: Path) -> dict:
    """读取纯文本文件。"""
    try: