feat: comprehensive v2 upgrade — streaming, error KB, file upload, layout analysis
Major changes: - Streaming: LLM统一 _BaseLLM 接口 (invoke + stream), generate/modify/correct 节点使用 get_stream_writer() 实现逐字输出, UI 节点平铺展开自动折叠 - Prompt外部化: 7个prompt拆分到 prompts/*.md, loader.py 支持热重载 - 错误自增长: backend/error_kb.py — 指纹去重 + ChromaDB持久化, correct_jrxml→validate 通过时自动入库, retrieve同时搜索错误KB - 文件上传: backend/file_parser.py — PDF/DOCX/图片/文本解析, 侧边栏多文件上传, 文本自动注入下一条消息 - A4模板识别: backend/layout_analyzer.py — 三种模式(完整A4/行片段修改/行片段新建), PaddleOCR元素提取 + 行分组 + JRXML section匹配 - 会话历史下载: jrxml_versions版本追踪 + 侧边栏历史版本下载按钮 - 预览修复: route_after_save跳过预览/导出意图的验证循环 - Ctrl+C修复: JS注入拦截Streamlit裸c键清缓存 Docs: CLAUDE.md (完整项目文档), ROADMAP.md (改进路线图) Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,193 @@
|
||||
"""文件解析器:将上传文件转为文本,供 LLM 处理。
|
||||
|
||||
支持:
|
||||
- 图片 (.png/.jpg/.jpeg/.bmp/.webp) → OCR 提取文本
|
||||
- PDF (.pdf) → 文本提取
|
||||
- Word (.docx) → 文本提取
|
||||
- 纯文本 (.txt/.csv/.json/.xml) → 直接读取
|
||||
|
||||
策略选择:
|
||||
- 原生多模态: 模型支持图片时直接传文件(当前 MiniMax 不支持,自动退回文本转换)
|
||||
- 文本转换: 所有文件转为 UTF-8 文本后注入 prompt
|
||||
"""
|
||||
|
||||
import os
|
||||
import io
|
||||
from pathlib import Path
|
||||
from typing import Optional
|
||||
|
||||
import PIL.Image
|
||||
|
||||
# Lowercase name fragments; a model whose lowercased name contains any of
# them is treated as accepting image input natively.
MODELS_WITH_VISION = {
    "gpt-4o",
    "gpt-4-turbo",
    "gpt-4-vision-preview",
    "claude-3",
    "claude-3.5",
    "claude-4",
    "gemini-1.5",
    "gemini-2",
}


def can_use_vision(model: str = "") -> bool:
    """Report whether a model name matches a known vision-capable family.

    Args:
        model: Model identifier. When empty, the ``LLM_MODEL`` environment
            variable is consulted instead.

    Returns:
        True if any entry of ``MODELS_WITH_VISION`` occurs as a substring of
        the lowercased model name, otherwise False.
    """
    name = (model or os.getenv("LLM_MODEL", "")).lower()
    for marker in MODELS_WITH_VISION:
        if marker in name:
            return True
    return False
|
||||
|
||||
|
||||
def parse_file(file_path: str, file_type: str = "") -> dict:
    """Parse an uploaded file into plain text.

    Args:
        file_path: Location of the file on disk.
        file_type: Optional extension that overrides the one taken from
            ``file_path``. It is matched case-insensitively and with or
            without the leading dot, so ".PDF", "pdf" and ".pdf" all select
            the PDF parser.

    Returns:
        A dict ``{"text": str, "file_type": str, "method": str,
        "error": Optional[str]}``. When the path does not exist the dict
        carries an error and echoes ``file_type`` exactly as it was passed.
        Extensions without a dedicated parser are read as plain text.
    """
    path = Path(file_path)
    if not path.exists():
        return {"text": "", "file_type": file_type, "method": "none", "error": "文件不存在"}

    # Normalize the lookup key: callers may hand over "PDF" or ".Pdf", which
    # would otherwise miss the table and be read as plain text.
    suffix = (file_type or path.suffix).strip().lower()
    if suffix and not suffix.startswith("."):
        suffix = f".{suffix}"

    parsers = {
        ".png": _parse_image,
        ".jpg": _parse_image,
        ".jpeg": _parse_image,
        ".bmp": _parse_image,
        ".webp": _parse_image,
        ".pdf": _parse_pdf,
        ".docx": _parse_docx,
    }

    # Anything without a dedicated parser is treated as a text file.
    return parsers.get(suffix, _parse_text)(path)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# 各类型解析器
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def _parse_image(path: Path) -> dict:
    """Extract text from an image with PaddleOCR, falling back to metadata.

    Args:
        path: Location of the image file.

    Returns:
        A result dict with ``file_type`` set to "image". When OCR yields at
        least one non-blank line, ``method`` is "paddleocr" and ``error`` is
        None. Otherwise ``method`` is "metadata_only" and ``error`` states
        which of three things happened: PaddleOCR is not importable, OCR
        raised, or OCR ran but found no text.
    """
    # Metadata is best-effort: whatever goes wrong, a placeholder is used and
    # OCR is still attempted. The context manager releases the file handle.
    try:
        with PIL.Image.open(path) as img:
            info = f"[图片: {img.size[0]}x{img.size[1]}, {img.mode}]"
    except Exception:
        info = "[图片: 无法读取元数据]"

    try:
        from paddleocr import PaddleOCR

        ocr = PaddleOCR(lang="ch", use_angle_cls=False, show_log=False)
        result = ocr.ocr(str(path))
        # Only the first element of the result is inspected; it may be empty
        # or None when nothing was detected.
        detections = result[0] if result and result[0] else []
        candidates = [
            entry[1][0].strip() if len(entry) > 1 else ""
            for entry in detections
        ]
        lines = [text for text in candidates if text]
    except ImportError:
        # OCR engine unavailable: keep the install hint for the user.
        return {
            "text": f"{info}\n(如需 OCR 文字识别,请安装: pip install paddleocr)",
            "file_type": "image",
            "method": "metadata_only",
            "error": "OCR 引擎未安装,已返回图片元信息",
        }
    except Exception as exc:
        # The engine is present but failed; report the real cause instead of
        # suggesting an installation that would not help.
        return {
            "text": f"{info}\n(OCR 识别失败)",
            "file_type": "image",
            "method": "metadata_only",
            "error": f"OCR 识别失败: {exc}",
        }

    if lines:
        return {
            "text": f"{info}\n识别文本:\n" + "\n".join(lines),
            "file_type": "image",
            "method": "paddleocr",
            "error": None,
        }

    # OCR ran cleanly but produced no text.
    return {
        "text": f"{info}\n(OCR 未识别到文字)",
        "file_type": "image",
        "method": "metadata_only",
        "error": "OCR 未识别到文字,已返回图片元信息",
    }
|
||||
|
||||
|
||||
def _parse_pdf(path: Path) -> dict:
    """Extract text from a PDF, trying pdfplumber first and PyMuPDF second.

    Args:
        path: Location of the PDF file.

    Returns:
        A result dict with ``file_type`` set to "pdf". On success ``method``
        names the backend that produced the text ("pdfplumber" or "pymupdf")
        and pages are joined with blank lines. If no backend succeeds,
        ``method`` is "none" and ``error`` either lists the failure of each
        installed backend or, when neither is importable, asks for one to be
        installed.
    """
    # One entry per installed backend that raised, so the final message can
    # tell "not installed" apart from "installed but could not parse".
    failures = []

    try:
        import pdfplumber

        with pdfplumber.open(path) as pdf:
            extracted = [page.extract_text() for page in pdf.pages]
        # Pages without extractable text are dropped.
        return {
            "text": "\n\n".join(text for text in extracted if text),
            "file_type": "pdf",
            "method": "pdfplumber",
            "error": None,
        }
    except ImportError:
        pass
    except Exception as exc:
        failures.append(f"pdfplumber: {exc}")

    # Fallback: PyMuPDF.
    try:
        import fitz

        doc = fitz.open(path)
        try:
            pages = [page.get_text() for page in doc]
        finally:
            # Close the document even when text extraction raises.
            doc.close()
        return {
            "text": "\n\n".join(pages),
            "file_type": "pdf",
            "method": "pymupdf",
            "error": None,
        }
    except ImportError:
        pass
    except Exception as exc:
        failures.append(f"pymupdf: {exc}")

    if failures:
        error = "PDF 解析失败: " + "; ".join(failures)
    else:
        error = "PDF 解析需要安装 pdfplumber 或 PyMuPDF"
    return {"text": "", "file_type": "pdf", "method": "none", "error": error}
|
||||
|
||||
|
||||
def _parse_docx(path: Path) -> dict:
    """Extract paragraph and table text from a Word document.

    Args:
        path: Location of the .docx file.

    Returns:
        A result dict with ``file_type`` set to "docx". On success ``method``
        is "python-docx" and the text holds every non-blank paragraph
        followed by one " | "-joined line per table row, separated by blank
        lines. On failure ``method`` is "none" and ``error`` says whether
        python-docx is missing or parsing itself failed.
    """
    try:
        from docx import Document

        doc = Document(path)
        chunks = [para.text for para in doc.paragraphs if para.text.strip()]
        # Table rows are appended after the paragraphs; blank cells are skipped.
        for table in doc.tables:
            for row in table.rows:
                cells = [cell.text for cell in row.cells if cell.text.strip()]
                if cells:
                    chunks.append(" | ".join(cells))
    except ImportError:
        error = "DOCX 解析需要安装 python-docx"
    except Exception as exc:
        # The library is present; surface the real cause rather than an
        # install hint that would not help.
        error = f"DOCX 解析失败: {exc}"
    else:
        return {
            "text": "\n\n".join(chunks),
            "file_type": "docx",
            "method": "python-docx",
            "error": None,
        }

    return {"text": "", "file_type": "docx", "method": "none", "error": error}
|
||||
|
||||
|
||||
def _parse_text(path: Path) -> dict:
    """Read a plain-text file, trying UTF-8 first and GBK second.

    Args:
        path: Location of the text file.

    Returns:
        A result dict whose ``file_type`` is the path's suffix. ``method`` is
        "direct" for UTF-8 and "direct_gbk" for GBK. If both decodings fail,
        or the file cannot be read at all, ``method`` is "none", ``text`` is
        empty and ``error`` names the cause.
    """
    # "utf-8-sig" decodes ordinary UTF-8 identically but also strips a leading
    # byte-order mark, which would otherwise end up in the text as U+FEFF.
    attempts = (("utf-8-sig", "direct"), ("gbk", "direct_gbk"))

    for encoding, method in attempts:
        try:
            text = path.read_text(encoding=encoding)
        except UnicodeDecodeError:
            # Wrong encoding guess: move on to the next candidate.
            continue
        except Exception:
            # Boundary handler: any other failure is reported to the caller
            # as a read error instead of being raised.
            return {"text": "", "file_type": path.suffix, "method": "none",
                    "error": "读取失败"}
        return {"text": text, "file_type": path.suffix, "method": method, "error": None}

    return {"text": "", "file_type": path.suffix, "method": "none",
            "error": "无法解码文件"}
|
||||
Reference in New Issue
Block a user