feat: comprehensive v2 upgrade — streaming, error KB, file upload, layout analysis

Major changes:
- Streaming: LLM统一 _BaseLLM 接口 (invoke + stream), generate/modify/correct 节点使用 get_stream_writer() 实现逐字输出, UI 节点平铺展开自动折叠
- Prompt外部化: 7个prompt拆分到 prompts/*.md, loader.py 支持热重载
- 错误自增长: backend/error_kb.py — 指纹去重 + ChromaDB持久化, correct_jrxml→validate 通过时自动入库, retrieve同时搜索错误KB
- 文件上传: backend/file_parser.py — PDF/DOCX/图片/文本解析, 侧边栏多文件上传, 文本自动注入下一条消息
- A4模板识别: backend/layout_analyzer.py — 三种模式(完整A4/行片段修改/行片段新建), PaddleOCR元素提取 + 行分组 + JRXML section匹配
- 会话历史下载: jrxml_versions版本追踪 + 侧边栏历史版本下载按钮
- 预览修复: route_after_save跳过预览/导出意图的验证循环
- Ctrl+C修复: JS注入拦截Streamlit裸c键清缓存

Docs: CLAUDE.md (完整项目文档), ROADMAP.md (改进路线图)

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-05-19 15:02:53 +08:00
parent b280c2b453
commit 70614dff5e
19 changed files with 1770 additions and 231 deletions
@@ -0,0 +1,193 @@
+"""File parser: convert uploaded files into text for the LLM to process.
+
+Supported inputs:
+- Images (.png/.jpg/.jpeg/.bmp/.webp) -> text extracted via OCR
+- PDF (.pdf) -> text extraction
+- Word (.docx) -> text extraction
+- Anything else (e.g. .txt/.csv/.json/.xml) -> read directly as text
+
+Strategy selection:
+- Native multimodal: pass the file directly when the model supports images
+  (original author's note: MiniMax currently does not, so the text
+  conversion path is used instead).
+- Text conversion: every file is converted to text and injected into the
+  prompt.
+"""
+
+import os
+import io
+from pathlib import Path
+from typing import Optional
+
+import PIL.Image
+
+# Name fragments identifying models that accept image input natively.
+# A model qualifies when any fragment occurs in its lower-cased name.
+MODELS_WITH_VISION = {
+    "gpt-4o",
+    "gpt-4-turbo",
+    "gpt-4-vision-preview",
+    "claude-3",
+    "claude-3.5",
+    "claude-4",
+    "gemini-1.5",
+    "gemini-2",
+}
+
+
+def can_use_vision(model: str = "") -> bool:
+    """Report whether a model supports native multimodal (image) input.
+
+    Args:
+        model: Model name to check. When empty, the ``LLM_MODEL``
+            environment variable is consulted instead.
+
+    Returns:
+        True if any entry of ``MODELS_WITH_VISION`` is a substring of the
+        lower-cased model name, False otherwise.
+    """
+    name = model or os.getenv("LLM_MODEL", "")
+    lowered = name.lower()
+    for marker in MODELS_WITH_VISION:
+        if marker in lowered:
+            return True
+    return False
+
+
+def parse_file(file_path: str, file_type: str = "") -> dict:
+    """Parse a file of any supported type into text.
+
+    Args:
+        file_path: Location of the file on disk.
+        file_type: Optional extension that overrides the suffix of
+            ``file_path``. Case, surrounding whitespace and a missing
+            leading dot are tolerated, so ``"PDF"``, ``"pdf"`` and
+            ``".pdf"`` all select the PDF parser.
+
+    Returns:
+        A dict ``{"text": str, "file_type": str, "method": str,
+        "error": Optional[str]}``. When the file does not exist, ``text``
+        is empty, ``method`` is ``"none"`` and ``file_type`` echoes the
+        argument unchanged.
+    """
+    path = Path(file_path)
+    if not path.exists():
+        return {"text": "", "file_type": file_type, "method": "none", "error": "文件不存在"}
+
+    # Normalise the lookup key: without this, an override such as "PDF" or
+    # "pdf" misses the table and the binary file is read as plain text.
+    suffix = (file_type or path.suffix).strip().lower()
+    if suffix and not suffix.startswith("."):
+        suffix = f".{suffix}"
+
+    parsers = {
+        ".png":  _parse_image,
+        ".jpg":  _parse_image,
+        ".jpeg": _parse_image,
+        ".bmp":  _parse_image,
+        ".webp": _parse_image,
+        ".pdf":  _parse_pdf,
+        ".docx": _parse_docx,
+    }
+
+    # Any suffix without a dedicated parser is treated as plain text.
+    parser = parsers.get(suffix, _parse_text)
+    return parser(path)
+
+
+# ---------------------------------------------------------------------------
+# 各类型解析器
+# ---------------------------------------------------------------------------
+
+def _parse_image(path: Path) -> dict:
+    """Extract the text in an image via OCR, falling back to metadata only.
+
+    Args:
+        path: Location of the image file.
+
+    Returns:
+        A dict with keys ``text``, ``file_type`` (always ``"image"``),
+        ``method`` and ``error``. ``method`` is ``"paddleocr"`` with
+        ``error`` set to ``None`` when text was recognised. Otherwise
+        ``method`` is ``"metadata_only"`` and ``error`` says whether the OCR
+        engine is not installed, failed, or found no text.
+    """
+    # Metadata is best-effort: any failure only degrades the header line.
+    try:
+        # The context manager releases the file handle PIL keeps open.
+        with PIL.Image.open(path) as img:
+            info = f"[图片: {img.size[0]}x{img.size[1]}, {img.mode}]"
+    except Exception:
+        info = "[图片: 无法读取元数据]"
+
+    lines: list = []
+    try:
+        from paddleocr import PaddleOCR
+
+        ocr = PaddleOCR(lang="ch", use_angle_cls=False, show_log=False)
+        result = ocr.ocr(str(path))
+        # Result parsing stays inside the try: the structure comes from a
+        # third-party library, so an unexpected shape must not crash.
+        for entry in (result[0] if result and result[0] else []):
+            text = entry[1][0] if len(entry) > 1 else ""
+            if text.strip():
+                lines.append(text.strip())
+    except ImportError:
+        note = "(如需 OCR 文字识别，请安装: pip install paddleocr)"
+        error = "OCR 引擎未安装，已返回图片元信息"
+    except Exception as exc:
+        # Report the real failure instead of claiming OCR is not installed.
+        note = "(OCR 识别失败)"
+        error = f"OCR 识别失败: {exc}"
+    else:
+        if lines:
+            return {
+                "text": f"{info}\n识别文本:\n" + "\n".join(lines),
+                "file_type": "image",
+                "method": "paddleocr",
+                "error": None,
+            }
+        note = "(OCR 未识别到文字)"
+        error = "OCR 未识别到文字，已返回图片元信息"
+
+    # No usable OCR text: return the image metadata plus the reason.
+    return {
+        "text": f"{info}\n{note}",
+        "file_type": "image",
+        "method": "metadata_only",
+        "error": error,
+    }
+
+
+def _parse_pdf(path: Path) -> dict:
+    """Extract the text of a PDF, trying pdfplumber first and then PyMuPDF.
+
+    Args:
+        path: Location of the PDF file.
+
+    Returns:
+        A dict with keys ``text``, ``file_type`` (always ``"pdf"``),
+        ``method`` and ``error``. ``method`` names the backend that
+        succeeded (``"pdfplumber"`` or ``"pymupdf"``). If neither produced
+        a result, ``method`` is ``"none"`` and ``error`` either lists the
+        backend failures or, when no backend is installed, asks for one to
+        be installed.
+    """
+    # Messages from backends that were importable but raised while parsing.
+    failures = []
+
+    try:
+        import pdfplumber
+
+        with pdfplumber.open(path) as pdf:
+            extracted = (page.extract_text() for page in pdf.pages)
+            # Skip pages for which no text came back.
+            pages = [text for text in extracted if text]
+        return {
+            "text": "\n\n".join(pages),
+            "file_type": "pdf",
+            "method": "pdfplumber",
+            "error": None,
+        }
+    except ImportError:
+        pass  # backend not installed; try the next one
+    except Exception as exc:
+        failures.append(f"pdfplumber: {exc}")
+
+    # Fallback: PyMuPDF
+    try:
+        import fitz
+
+        doc = fitz.open(path)
+        try:
+            pages = [page.get_text() for page in doc]
+        finally:
+            # Close the document even when text extraction raises.
+            doc.close()
+        return {
+            "text": "\n\n".join(pages),
+            "file_type": "pdf",
+            "method": "pymupdf",
+            "error": None,
+        }
+    except ImportError:
+        pass  # backend not installed
+    except Exception as exc:
+        failures.append(f"pymupdf: {exc}")
+
+    # Separate "a backend failed" from "nothing is installed" so the caller
+    # is not told to install a package that is already present.
+    if failures:
+        error = "PDF 解析失败: " + "; ".join(failures)
+    else:
+        error = "PDF 解析需要安装 pdfplumber 或 PyMuPDF"
+    return {"text": "", "file_type": "pdf", "method": "none", "error": error}
+
+
+def _parse_docx(path: Path) -> dict:
+    """Extract paragraph and table text from a Word (.docx) document.
+
+    Args:
+        path: Location of the .docx file.
+
+    Returns:
+        A dict with keys ``text``, ``file_type`` (always ``"docx"``),
+        ``method`` and ``error``. On success ``method`` is
+        ``"python-docx"`` and ``text`` holds the non-blank paragraphs
+        followed by one ``" | "``-joined line per table row, separated by
+        blank lines. On failure ``method`` is ``"none"`` and ``error`` says
+        whether python-docx is missing or parsing failed.
+    """
+    try:
+        from docx import Document
+
+        doc = Document(str(path))
+        blocks = [para.text for para in doc.paragraphs if para.text.strip()]
+        # Table content is appended after the body paragraphs.
+        for table in doc.tables:
+            for row in table.rows:
+                cells = [cell.text for cell in row.cells if cell.text.strip()]
+                if cells:
+                    blocks.append(" | ".join(cells))
+        return {
+            "text": "\n\n".join(blocks),
+            "file_type": "docx",
+            "method": "python-docx",
+            "error": None,
+        }
+    except ImportError:
+        error = "DOCX 解析需要安装 python-docx"
+    except Exception as exc:
+        # Surface the real failure rather than blaming a missing package.
+        error = f"DOCX 解析失败: {exc}"
+
+    return {"text": "", "file_type": "docx", "method": "none", "error": error}
+
+
+def _parse_text(path: Path) -> dict:
+    """Read a plain-text file, trying UTF-8 first and GBK second.
+
+    Args:
+        path: Location of the text file.
+
+    Returns:
+        A dict with keys ``text``, ``file_type`` (the path's suffix),
+        ``method`` and ``error``. ``method`` is ``"direct"`` for UTF-8,
+        ``"direct_gbk"`` for GBK and ``"none"`` on failure, in which case
+        ``error`` distinguishes a read failure from an undecodable file.
+    """
+    file_type = path.suffix
+    # "utf-8-sig" decodes plain UTF-8 too, but also drops a leading BOM so
+    # it does not end up in the extracted text as "\ufeff".
+    attempts = (("utf-8-sig", "direct"), ("gbk", "direct_gbk"))
+    for encoding, method in attempts:
+        try:
+            text = path.read_text(encoding=encoding)
+        except UnicodeDecodeError:
+            continue  # wrong encoding; try the next candidate
+        except Exception:
+            # Not a decoding problem (e.g. an I/O error): retrying with
+            # another encoding cannot help.
+            return {"text": "", "file_type": file_type, "method": "none",
+                    "error": "读取失败"}
+        return {"text": text, "file_type": file_type, "method": method, "error": None}
+
+    return {"text": "", "file_type": file_type, "method": "none",
+            "error": "无法解码文件"}