feat: comprehensive v2 upgrade — streaming, error KB, file upload, layout analysis

Major changes:
- Streaming: LLM统一 _BaseLLM 接口 (invoke + stream), generate/modify/correct
  节点使用 get_stream_writer() 实现逐字输出, UI 节点平铺展开自动折叠
- Prompt外部化: 7个prompt拆分到 prompts/*.md, loader.py 支持热重载
- 错误自增长: backend/error_kb.py — 指纹去重 + ChromaDB持久化,
  correct_jrxml→validate 通过时自动入库, retrieve同时搜索错误KB
- 文件上传: backend/file_parser.py — PDF/DOCX/图片/文本解析,
  侧边栏多文件上传, 文本自动注入下一条消息
- A4模板识别: backend/layout_analyzer.py — 三种模式(完整A4/行片段修改/行片段新建),
  PaddleOCR元素提取 + 行分组 + JRXML section匹配
- 会话历史下载: jrxml_versions版本追踪 + 侧边栏历史版本下载按钮
- 预览修复: route_after_save跳过预览/导出意图的验证循环
- Ctrl+C修复: JS注入拦截Streamlit裸c键清缓存

Docs: CLAUDE.md (完整项目文档), ROADMAP.md (改进路线图)

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
2026-05-19 15:02:53 +08:00
parent b280c2b453
commit 70614dff5e
19 changed files with 1770 additions and 231 deletions
+193
View File
@@ -0,0 +1,193 @@
"""文件解析器:将上传文件转为文本,供 LLM 处理。
支持:
- 图片 (.png/.jpg/.jpeg/.bmp/.webp) → OCR 提取文本
- PDF (.pdf) → 文本提取
- Word (.docx) → 文本提取
- 纯文本 (.txt/.csv/.json/.xml) → 直接读取
策略选择:
- 原生多模态: 模型支持图片时直接传文件(当前 MiniMax 不支持,自动退回文本转换)
- 文本转换: 所有文件转为 UTF-8 文本后注入 prompt
"""
import os
import io
from pathlib import Path
from typing import Optional
import PIL.Image
# Substrings that mark a model name as vision-capable; matched by
# can_use_vision() against the lower-cased model name.
MODELS_WITH_VISION = {
    "gpt-4o", "gpt-4-turbo", "gpt-4-vision-preview",
    "claude-3", "claude-3.5", "claude-4",
    "gemini-1.5", "gemini-2",
}


def can_use_vision(model: str = "") -> bool:
    """Report whether a model accepts images natively (direct image upload).

    Args:
        model: Model name to check. When empty, the ``LLM_MODEL`` environment
            variable is consulted instead (defaulting to an empty string).

    Returns:
        True if any entry of ``MODELS_WITH_VISION`` occurs as a substring of
        the lower-cased model name, False otherwise.
    """
    name = model if model else os.getenv("LLM_MODEL", "")
    lowered = name.lower()
    for marker in MODELS_WITH_VISION:
        if marker in lowered:
            return True
    return False
def parse_file(file_path: str, file_type: str = "") -> dict:
    """Parse an uploaded file into text.

    Args:
        file_path: Location of the file on disk.
        file_type: Optional extension override (for example ``".pdf"``). It is
            matched case-insensitively and the leading dot is optional, so
            ``"PDF"``, ``"pdf"`` and ``".Pdf"`` all select the PDF parser.
            When empty, the extension of ``file_path`` is used.

    Returns:
        A dict of the form
        ``{"text": str, "file_type": str, "method": str, "error": Optional[str]}``.
        If the path does not exist, ``text`` is empty, ``method`` is ``"none"``
        and ``file_type`` echoes the caller's argument unchanged.
    """
    path = Path(file_path)
    if not path.exists():
        return {"text": "", "file_type": file_type, "method": "none", "error": "文件不存在"}

    # Normalise the extension so a caller-supplied override such as "PNG" or
    # "docx" hits the dispatch table instead of silently falling through to
    # the plain-text reader (which would try to decode binary data).
    suffix = (file_type or path.suffix).strip().lower()
    if suffix and not suffix.startswith("."):
        suffix = f".{suffix}"

    parsers = {
        ".png": _parse_image,
        ".jpg": _parse_image,
        ".jpeg": _parse_image,
        ".bmp": _parse_image,
        ".webp": _parse_image,
        ".pdf": _parse_pdf,
        ".docx": _parse_docx,
    }
    # Any extension without a dedicated parser is read as plain text.
    parser = parsers.get(suffix, _parse_text)
    return parser(path)
# ---------------------------------------------------------------------------
# 各类型解析器
# ---------------------------------------------------------------------------
def _parse_image(path: Path) -> dict:
    """Extract text from an image with OCR, falling back to image metadata.

    Args:
        path: Image file to read.

    Returns:
        A dict with keys ``text``, ``file_type`` (always ``"image"``),
        ``method`` and ``error``. When PaddleOCR recognises text, ``method`` is
        ``"paddleocr"`` and ``error`` is ``None``. Otherwise ``method`` is
        ``"metadata_only"``, ``text`` holds the image metadata line plus a
        short note, and ``error`` states the actual reason: OCR engine not
        installed, OCR raised an exception, or OCR found no text.
    """
    # Metadata is best-effort: a failure here must not block the OCR attempt.
    try:
        # The context manager releases the underlying file handle promptly.
        with PIL.Image.open(path) as img:
            info = f"[图片: {img.size[0]}x{img.size[1]}, {img.mode}]"
    except Exception:
        info = "[图片: 无法读取元数据]"

    try:
        from paddleocr import PaddleOCR

        ocr = PaddleOCR(lang="ch", use_angle_cls=False, show_log=False)
        result = ocr.ocr(str(path))
        lines = []
        if result and result[0]:
            for entry in result[0]:
                # Each entry is indexed as entry[1][0] for the recognised text.
                text = entry[1][0] if len(entry) > 1 else ""
                if text.strip():
                    lines.append(text.strip())
        if lines:
            return {
                "text": f"{info}\n识别文本:\n" + "\n".join(lines),
                "file_type": "image",
                "method": "paddleocr",
                "error": None,
            }
        note = "(OCR 未识别到文字)"
        error = "OCR 未识别到文字,已返回图片元信息"
    except ImportError:
        # Only a missing dependency should produce the installation hint.
        note = "(如需 OCR 文字识别,请安装: pip install paddleocr)"
        error = "OCR 引擎未安装,已返回图片元信息"
    except Exception as exc:
        # OCR is installed but failed at runtime; surface the real cause
        # instead of claiming the engine is missing.
        note = "(OCR 识别失败,仅返回图片元信息)"
        error = f"OCR 识别失败: {exc}"

    return {
        "text": f"{info}\n{note}",
        "file_type": "image",
        "method": "metadata_only",
        "error": error,
    }
def _parse_pdf(path: Path) -> dict:
    """Extract the text of a PDF, trying pdfplumber first and PyMuPDF second.

    Args:
        path: PDF file to read.

    Returns:
        A dict with keys ``text``, ``file_type`` (always ``"pdf"``), ``method``
        (``"pdfplumber"``, ``"pymupdf"`` or ``"none"``) and ``error``. On
        success ``error`` is ``None``. When neither backend is installed the
        error is the installation hint; when an installed backend raised, the
        error lists each backend's failure message instead.
    """
    failures = []  # messages from backends that are installed but failed

    try:
        import pdfplumber

        with pdfplumber.open(path) as pdf:
            # Pages with no extractable text are skipped.
            pages = [
                text
                for text in (page.extract_text() for page in pdf.pages)
                if text
            ]
        return {
            "text": "\n\n".join(pages),
            "file_type": "pdf",
            "method": "pdfplumber",
            "error": None,
        }
    except ImportError:
        pass  # backend not installed; try the next one
    except Exception as exc:
        failures.append(f"pdfplumber: {exc}")

    # Fallback: PyMuPDF
    try:
        import fitz

        doc = fitz.open(str(path))
        try:
            pages = [page.get_text() for page in doc]
        finally:
            # Close the document even if text extraction raises.
            doc.close()
        return {
            "text": "\n\n".join(pages),
            "file_type": "pdf",
            "method": "pymupdf",
            "error": None,
        }
    except ImportError:
        pass  # backend not installed
    except Exception as exc:
        failures.append(f"pymupdf: {exc}")

    if failures:
        # At least one backend was available, so the file itself is the issue.
        error = "PDF 解析失败: " + "; ".join(failures)
    else:
        error = "PDF 解析需要安装 pdfplumber 或 PyMuPDF"
    return {"text": "", "file_type": "pdf", "method": "none", "error": error}
def _parse_docx(path: Path) -> dict:
    """Extract paragraph and table text from a Word ``.docx`` document.

    Args:
        path: DOCX file to read.

    Returns:
        A dict with keys ``text``, ``file_type`` (always ``"docx"``),
        ``method`` (``"python-docx"`` or ``"none"``) and ``error``. Non-empty
        paragraphs come first, followed by one line per table row with the
        non-empty cells joined by ``" | "``. ``error`` is ``None`` on success,
        the installation hint when python-docx is missing, or the failure
        message when the document could not be parsed.
    """
    try:
        from docx import Document

        doc = Document(str(path))
        paragraphs = [p.text for p in doc.paragraphs if p.text.strip()]
        # Table content is appended after the body paragraphs.
        for table in doc.tables:
            for row in table.rows:
                cells = [cell.text for cell in row.cells if cell.text.strip()]
                if cells:
                    paragraphs.append(" | ".join(cells))
        return {
            "text": "\n\n".join(paragraphs),
            "file_type": "docx",
            "method": "python-docx",
            "error": None,
        }
    except ImportError:
        error = "DOCX 解析需要安装 python-docx"
    except Exception as exc:
        # The library is present, so report the real parsing failure rather
        # than asking the user to install something they already have.
        error = f"DOCX 解析失败: {exc}"

    return {"text": "", "file_type": "docx", "method": "none", "error": error}
def _parse_text(path: Path) -> dict:
    """Read a plain-text file, trying UTF-8 first and GBK second.

    Args:
        path: Text file to read.

    Returns:
        A dict with keys ``text``, ``file_type`` (the path's suffix),
        ``method`` and ``error``. ``method`` is ``"direct"`` for UTF-8,
        ``"direct_gbk"`` for GBK, and ``"none"`` on failure. ``error`` is
        ``None`` on success, ``"无法解码文件"`` when neither encoding decodes
        the file, and ``"读取失败"`` when the file cannot be read at all.
    """
    # "utf-8-sig" decodes ordinary UTF-8 and also strips a leading BOM, so
    # the marker does not leak into the text as U+FEFF.
    attempts = (("utf-8-sig", "direct"), ("gbk", "direct_gbk"))
    for encoding, method in attempts:
        try:
            text = path.read_text(encoding=encoding)
        except UnicodeDecodeError:
            continue  # wrong encoding; try the next candidate
        except (OSError, ValueError):
            # The file could not be opened or read (missing, directory,
            # permissions, invalid path): retrying another encoding is useless.
            return {"text": "", "file_type": path.suffix, "method": "none",
                    "error": "读取失败"}
        return {"text": text, "file_type": path.suffix, "method": method, "error": None}

    return {"text": "", "file_type": path.suffix, "method": "none",
            "error": "无法解码文件"}