6467fd4ae5
- OCR: EasyOCR (primary, ch_sim+en) with PaddleOCR fallback for Windows compatibility - Validation: _check_minimum_content() rejects empty-shell JRXML (no band/textField) - Retry: MAX_RETRY 3→5, exhaustion records pending_failure_context for next-turn auto-injection - Finalize: only saves jrxml_versions on pass, preserves last good final_jrxml on fail - Extract JRXML: improved empty markdown block handling and XML fragment fallback - UI: real-time node progress via placeholder updates, initial "analyzing" feedback - UI: use agent_state (full) instead of node_state (partial) for summary card routing - UI: unknown template_type now gives LLM meaningful image context instead of metadata - Docs: updated CLAUDE.md and CODE_GUIDE.md to reflect all v3 changes Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
213 lines
6.3 KiB
Python
213 lines
6.3 KiB
Python
"""文件解析器:将上传文件转为文本,供 LLM 处理。
|
||
|
||
支持:
|
||
- 图片 (.png/.jpg/.jpeg/.bmp) → OCR 提取文本
|
||
- PDF (.pdf) → 文本提取
|
||
- Word (.docx) → 文本提取
|
||
- 纯文本 (.txt/.csv/.json/.xml) → 直接读取
|
||
|
||
策略选择:
|
||
- 原生多模态: 模型支持图片时直接传文件(当前 MiniMax 不支持,自动退回文本转换)
|
||
- 文本转换: 所有文件转为 UTF-8 文本后注入 prompt
|
||
"""
|
||
|
||
import os
|
||
import io
|
||
from pathlib import Path
|
||
from typing import Optional
|
||
|
||
import PIL.Image
|
||
|
||
# Name fragments of model families treated as accepting image input natively.
# can_use_vision() matches these as substrings of the lower-cased model name.
MODELS_WITH_VISION = {
    "claude-3",
    "claude-3.5",
    "claude-4",
    "gemini-1.5",
    "gemini-2",
    "gpt-4-turbo",
    "gpt-4-vision-preview",
    "gpt-4o",
}
|
||
|
||
|
||
def can_use_vision(model: str = "") -> bool:
    """Report whether a model supports native multimodal (direct image) input.

    Args:
        model: Model name to check. When empty, the ``LLM_MODEL`` environment
            variable is consulted instead.

    Returns:
        True when the lower-cased model name contains any fragment listed in
        ``MODELS_WITH_VISION``; False otherwise.
    """
    candidate = (model or os.getenv("LLM_MODEL", "")).lower()
    for fragment in MODELS_WITH_VISION:
        if fragment in candidate:
            return True
    return False
|
||
|
||
|
||
def parse_file(file_path: str, file_type: str = "") -> dict:
    """Parse an arbitrary uploaded file into text.

    Args:
        file_path: Location of the file on disk.
        file_type: Optional extension override such as ``".pdf"``. Matching is
            case-insensitive and the leading dot is optional, so ``"PDF"`` and
            ``".pdf"`` select the same parser. When empty, the extension of
            ``file_path`` is used.

    Returns:
        ``{"text": str, "file_type": str, "method": str, "error": Optional[str]}``.
        A missing file yields empty text with ``method == "none"`` and echoes
        the caller's ``file_type`` unchanged. Extensions without a dedicated
        parser are read as plain text.
    """
    path = Path(file_path)
    if not path.exists():
        return {"text": "", "file_type": file_type, "method": "none", "error": "文件不存在"}

    # Normalise the extension before dispatching: a caller-supplied "PDF" or
    # "png" would otherwise miss the table and be read as plain text.
    suffix = (file_type or path.suffix).strip().lower()
    if suffix and not suffix.startswith("."):
        suffix = f".{suffix}"

    parsers = {
        ".png": _parse_image,
        ".jpg": _parse_image,
        ".jpeg": _parse_image,
        ".bmp": _parse_image,
        ".webp": _parse_image,
        ".pdf": _parse_pdf,
        ".docx": _parse_docx,
    }

    # Anything without a dedicated parser is treated as plain text.
    return parsers.get(suffix, _parse_text)(path)
|
||
|
||
|
||
# ---------------------------------------------------------------------------
# Per-type parsers
# ---------------------------------------------------------------------------
|
||
|
||
def _parse_image(path: Path) -> dict:
|
||
"""OCR 提取图片中的文字。优先 EasyOCR,回退 PaddleOCR。"""
|
||
try:
|
||
img = PIL.Image.open(path)
|
||
info = f"[图片: {img.size[0]}x{img.size[1]}, {img.mode}]"
|
||
except Exception:
|
||
info = "[图片: 无法读取元数据]"
|
||
|
||
# 优先 EasyOCR(Windows 兼容性更好)
|
||
try:
|
||
import easyocr
|
||
import numpy as np
|
||
reader = easyocr.Reader(["ch_sim", "en"], gpu=False, verbose=False)
|
||
result = reader.readtext(np.array(img))
|
||
lines = [text.strip() for (_, text, _) in result if text.strip()]
|
||
if lines:
|
||
return {
|
||
"text": f"{info}\n识别文本:\n" + "\n".join(lines),
|
||
"file_type": "image",
|
||
"method": "easyocr",
|
||
"error": None,
|
||
}
|
||
except ImportError:
|
||
pass
|
||
except Exception:
|
||
pass
|
||
|
||
# 回退 PaddleOCR
|
||
try:
|
||
from paddleocr import PaddleOCR
|
||
ocr = PaddleOCR(lang="ch")
|
||
result = ocr.ocr(str(path))
|
||
lines = []
|
||
if result and result[0]:
|
||
for line in result[0]:
|
||
text = line[1][0] if len(line) > 1 else ""
|
||
if text.strip():
|
||
lines.append(text.strip())
|
||
if lines:
|
||
return {
|
||
"text": f"{info}\n识别文本:\n" + "\n".join(lines),
|
||
"file_type": "image",
|
||
"method": "paddleocr",
|
||
"error": None,
|
||
}
|
||
except ImportError:
|
||
pass
|
||
except Exception:
|
||
pass
|
||
|
||
# OCR 不可用 → 返回图片元信息 + 安装提示
|
||
return {
|
||
"text": f"{info}\n(如需 OCR 文字识别,请安装: pip install easyocr)",
|
||
"file_type": "image",
|
||
"method": "metadata_only",
|
||
"error": "OCR 引擎未安装,已返回图片元信息",
|
||
}
|
||
|
||
|
||
def _parse_pdf(path: Path) -> dict:
|
||
"""提取 PDF 中的文本。"""
|
||
try:
|
||
import pdfplumber
|
||
with pdfplumber.open(path) as pdf:
|
||
pages = []
|
||
for page in pdf.pages:
|
||
text = page.extract_text()
|
||
if text:
|
||
pages.append(text)
|
||
full = "\n\n".join(pages)
|
||
return {
|
||
"text": full,
|
||
"file_type": "pdf",
|
||
"method": "pdfplumber",
|
||
"error": None,
|
||
}
|
||
except ImportError:
|
||
pass
|
||
except Exception as e:
|
||
pass
|
||
|
||
# Fallback: 尝试 PyMuPDF
|
||
try:
|
||
import fitz
|
||
doc = fitz.open(path)
|
||
pages = []
|
||
for page in doc:
|
||
pages.append(page.get_text())
|
||
doc.close()
|
||
return {
|
||
"text": "\n\n".join(pages),
|
||
"file_type": "pdf",
|
||
"method": "pymupdf",
|
||
"error": None,
|
||
}
|
||
except ImportError:
|
||
pass
|
||
except Exception:
|
||
pass
|
||
|
||
return {"text": "", "file_type": "pdf", "method": "none",
|
||
"error": "PDF 解析需要安装 pdfplumber 或 PyMuPDF"}
|
||
|
||
|
||
def _parse_docx(path: Path) -> dict:
|
||
"""提取 Word 文档中的文本。"""
|
||
try:
|
||
from docx import Document
|
||
doc = Document(path)
|
||
paragraphs = [p.text for p in doc.paragraphs if p.text.strip()]
|
||
# 同时提取表格内容
|
||
for table in doc.tables:
|
||
for row in table.rows:
|
||
cells = [cell.text for cell in row.cells if cell.text.strip()]
|
||
if cells:
|
||
paragraphs.append(" | ".join(cells))
|
||
return {
|
||
"text": "\n\n".join(paragraphs),
|
||
"file_type": "docx",
|
||
"method": "python-docx",
|
||
"error": None,
|
||
}
|
||
except ImportError:
|
||
pass
|
||
except Exception as e:
|
||
pass
|
||
|
||
return {"text": "", "file_type": "docx", "method": "none",
|
||
"error": "DOCX 解析需要安装 python-docx"}
|
||
|
||
|
||
def _parse_text(path: Path) -> dict:
|
||
"""读取纯文本文件。"""
|
||
try:
|
||
text = path.read_text(encoding="utf-8")
|
||
return {"text": text, "file_type": path.suffix, "method": "direct", "error": None}
|
||
except UnicodeDecodeError:
|
||
try:
|
||
text = path.read_text(encoding="gbk")
|
||
return {"text": text, "file_type": path.suffix, "method": "direct_gbk", "error": None}
|
||
except Exception:
|
||
return {"text": "", "file_type": path.suffix, "method": "none",
|
||
"error": "无法解码文件"}
|
||
except Exception:
|
||
return {"text": "", "file_type": path.suffix, "method": "none",
|
||
"error": "读取失败"}
|