2befd44430
Key resolutions: - agent/nodes.py: Merged session_id exclusion fix with new persistable fields (ocr_extraction_result, annotation_result, layout_schema, ocr_elements) - app.py: Adopted st-multimodal-chatinput for unified paste/drop/upload, removed custom JS paste bridge - backend/file_parser.py: Kept local XLSX parser, added remote XLS/DOC parsers - CLAUDE.md + CODE_GUIDE.md: Merged documentation from both branches Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
302 lines
9.8 KiB
Python
302 lines
9.8 KiB
Python
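"""File parser: converts uploaded files to text for the LLM to process.

Supported formats:
- Images (.png/.jpg/.jpeg/.bmp/.webp) → text extracted via OCR
- PDF (.pdf) → text extraction
- Word (.docx, legacy .doc) → text extraction
- Excel (.xlsx, legacy .xls) → text extraction
- Plain text (.txt/.csv/.json/.xml and any other extension) → read directly

Strategy selection:
- Native multimodal: pass the file straight to the model when it supports
  images (MiniMax currently does not, so it automatically falls back to text
  conversion)
- Text conversion: every file is converted to UTF-8 text and injected into
  the prompt
"""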
|
|
|
|
import os
|
|
import io
|
|
from pathlib import Path
|
|
from typing import Optional
|
|
|
|
import PIL.Image
|
|
|
|
# Model-name fragments of LLM families that accept image input natively.
# They are matched as substrings of the lowercased model name.
MODELS_WITH_VISION = {
    # OpenAI
    "gpt-4o",
    "gpt-4-turbo",
    "gpt-4-vision-preview",
    # Anthropic
    "claude-3",
    "claude-3.5",
    "claude-4",
    # Google
    "gemini-1.5",
    "gemini-2",
}
|
|
|
|
|
|
def can_use_vision(model: str = "") -> bool:
    """Report whether a model supports native multimodal input (direct image upload).

    Args:
        model: Model name to check. When empty, the ``LLM_MODEL`` environment
            variable is used instead (an unset variable counts as "").

    Returns:
        True when any entry of ``MODELS_WITH_VISION`` occurs as a substring of
        the lowercased model name, False otherwise.
    """
    name = (model or os.getenv("LLM_MODEL", "")).lower()
    for marker in MODELS_WITH_VISION:
        if marker in name:
            return True
    return False
|
|
|
|
|
|
def parse_file(file_path: str, file_type: str = "") -> dict:
    """Parse a file of any type into text.

    Args:
        file_path: Location of the file to parse.
        file_type: Optional extension that overrides the one in ``file_path``.
            It is matched case-insensitively and a missing leading dot is
            added, so ".pdf", "pdf" and "PDF" all select the PDF parser.

    Returns:
        {"text": str, "file_type": str, "method": str, "error": Optional[str]}
        Extensions without a dedicated parser are read as plain text.
    """
    path = Path(file_path)
    if not path.exists():
        return {"text": "", "file_type": file_type, "method": "none", "error": "文件不存在"}

    # Normalise before the lookup: an un-normalised override such as "pdf" or
    # ".PDF" would miss the table and send a binary file to the text reader.
    suffix = (file_type or path.suffix).strip().lower()
    if suffix and not suffix.startswith("."):
        suffix = "." + suffix

    parsers = {
        ".png": _parse_image,
        ".jpg": _parse_image,
        ".jpeg": _parse_image,
        ".bmp": _parse_image,
        ".webp": _parse_image,
        ".pdf": _parse_pdf,
        ".docx": _parse_docx,
        ".xlsx": _parse_xlsx,
        ".xls": _parse_xls,
        ".doc": _parse_doc,
    }

    # Anything unrecognised is treated as plain text.
    parser = parsers.get(suffix, _parse_text)
    return parser(path)
|
|
|
|
|
|
# ---------------------------------------------------------------------------
# Per-format parsers
# ---------------------------------------------------------------------------
|
|
|
|
def _parse_image(path: Path) -> dict:
    """Extract text from an image via OCR: PaddleOCR first, EasyOCR as fallback.

    Args:
        path: Location of the image file.

    Returns:
        {"text": str, "file_type": "image", "method": str, "error": Optional[str]}
        ``method`` is "paddleocr" or "easyocr" when that engine recognised
        text. Otherwise it is "metadata_only": ``text`` carries the image
        metadata line and ``error`` says whether the engines are missing or
        ran without producing text (including the reason for each).
    """
    # Metadata is best effort; a file Pillow cannot read is still handed to
    # the OCR engines below. The context manager releases the file handle.
    try:
        with PIL.Image.open(path) as img:
            info = f"[图片: {img.size[0]}x{img.size[1]}, {img.mode}]"
    except Exception:
        info = "[图片: 无法读取元数据]"

    # Why each installed engine produced nothing; stays empty when no engine
    # is installed at all.
    notes = []

    # Preferred engine: PaddleOCR (more accurate recognition).
    try:
        from paddleocr import PaddleOCR

        result = PaddleOCR(lang="ch").ocr(str(path))
        lines = []
        if result and result[0]:
            for entry in result[0]:
                # The recognised string sits at entry[1][0].
                text = entry[1][0] if len(entry) > 1 else ""
                if text.strip():
                    lines.append(text.strip())
        if lines:
            return {
                "text": f"{info}\n识别文本:\n" + "\n".join(lines),
                "file_type": "image",
                "method": "paddleocr",
                "error": None,
            }
        notes.append("PaddleOCR 未识别到文字")
    except ImportError:
        pass  # Not installed: fall through to EasyOCR.
    except Exception as exc:
        notes.append(f"PaddleOCR 失败: {exc}")

    # Fallback engine: EasyOCR.
    try:
        import easyocr
        import numpy as np

        try:
            with PIL.Image.open(path) as img:
                source = np.array(img)
        except Exception:
            # Pillow could not decode the file; let EasyOCR load it itself
            # instead of giving up on the fallback entirely.
            source = str(path)

        reader = easyocr.Reader(["ch_sim", "en"], gpu=False, verbose=False)
        result = reader.readtext(source)
        lines = [text.strip() for (_, text, _) in result if text.strip()]
        if lines:
            return {
                "text": f"{info}\n识别文本:\n" + "\n".join(lines),
                "file_type": "image",
                "method": "easyocr",
                "error": None,
            }
        notes.append("EasyOCR 未识别到文字")
    except ImportError:
        pass  # Not installed either.
    except Exception as exc:
        notes.append(f"EasyOCR 失败: {exc}")

    if notes:
        # At least one engine is installed but produced no text: report what
        # actually happened rather than asking the user to install an engine.
        return {
            "text": f"{info}\n(OCR 未能提取文字)",
            "file_type": "image",
            "method": "metadata_only",
            "error": "OCR 未能提取文字,已返回图片元信息(" + ";".join(notes) + ")",
        }

    # No OCR engine available → image metadata plus an install hint.
    return {
        "text": f"{info}\n(如需 OCR 文字识别,请安装: pip install easyocr)",
        "file_type": "image",
        "method": "metadata_only",
        "error": "OCR 引擎未安装,已返回图片元信息",
    }
|
|
|
|
|
|
def _parse_pdf(path: Path) -> dict:
    """Extract the text of a PDF, using pdfplumber first and PyMuPDF as fallback.

    Args:
        path: Location of the PDF file.

    Returns:
        {"text": str, "file_type": "pdf", "method": str, "error": Optional[str]}
        ``method`` is "pdfplumber" or "pymupdf" on success. On failure it is
        "none" and ``error`` either lists why each installed library failed
        or, when neither library is installed, asks for one to be installed.
    """
    # Reasons collected from libraries that are installed but failed, so a
    # corrupt PDF is not misreported as a missing dependency.
    failures = []

    try:
        import pdfplumber

        with pdfplumber.open(path) as pdf:
            extracted = (page.extract_text() for page in pdf.pages)
            # Pages for which extract_text() returned nothing are skipped.
            pages = [text for text in extracted if text]
        return {
            "text": "\n\n".join(pages),
            "file_type": "pdf",
            "method": "pdfplumber",
            "error": None,
        }
    except ImportError:
        pass  # Not installed: try PyMuPDF.
    except Exception as exc:
        failures.append(f"pdfplumber: {exc}")

    # Fallback: PyMuPDF.
    try:
        import fitz

        doc = fitz.open(path)
        try:
            pages = [page.get_text() for page in doc]
        finally:
            # Close the document even when a page fails to extract.
            doc.close()
        return {
            "text": "\n\n".join(pages),
            "file_type": "pdf",
            "method": "pymupdf",
            "error": None,
        }
    except ImportError:
        pass
    except Exception as exc:
        failures.append(f"PyMuPDF: {exc}")

    if failures:
        error = "PDF 解析失败: " + "; ".join(failures)
    else:
        error = "PDF 解析需要安装 pdfplumber 或 PyMuPDF"
    return {"text": "", "file_type": "pdf", "method": "none", "error": error}
|
|
|
|
|
|
def _parse_docx(path: Path) -> dict:
    """Extract the text of a Word .docx document, including table content.

    Args:
        path: Location of the .docx file.

    Returns:
        {"text": str, "file_type": "docx", "method": str, "error": Optional[str]}
        Non-empty paragraphs come first, then one " | "-joined line per
        non-empty table row. ``method`` is "python-docx" on success and
        "none" on failure, where ``error`` distinguishes a parse failure from
        python-docx not being installed.
    """
    try:
        from docx import Document

        # Passed as str: the documented input is a path string or file object.
        doc = Document(str(path))
        paragraphs = [p.text for p in doc.paragraphs if p.text.strip()]

        # Table rows are appended after the body paragraphs.
        for table in doc.tables:
            for row in table.rows:
                cells = [cell.text for cell in row.cells if cell.text.strip()]
                if cells:
                    paragraphs.append(" | ".join(cells))

        return {
            "text": "\n\n".join(paragraphs),
            "file_type": "docx",
            "method": "python-docx",
            "error": None,
        }
    except ImportError:
        pass
    except Exception as exc:
        # Report the real reason instead of blaming a missing dependency.
        return {"text": "", "file_type": "docx", "method": "none",
                "error": f"DOCX 解析失败: {exc}"}

    return {"text": "", "file_type": "docx", "method": "none",
            "error": "DOCX 解析需要安装 python-docx"}
|
|
|
|
|
|
def _parse_xlsx(path: Path) -> dict:
    """Extract the text of an Excel .xlsx workbook.

    Args:
        path: Location of the .xlsx file.

    Returns:
        {"text": str, "file_type": "xlsx", "method": str, "error": Optional[str]}
        Each sheet with content becomes a "[Sheet: name]" header followed by
        one tab-separated line per non-empty row; sheets are separated by a
        blank line. ``method`` is "openpyxl" on success and "none" on failure.
    """
    try:
        from openpyxl import load_workbook

        # data_only=True reads cached values rather than formula text.
        wb = load_workbook(path, read_only=True, data_only=True)
        try:
            parts = []
            for name in wb.sheetnames:
                rows = []
                for row in wb[name].iter_rows(values_only=True):
                    cells = ["" if c is None else str(c) for c in row]
                    if any(cells):  # skip rows that are entirely empty
                        rows.append("\t".join(cells))
                if rows:
                    parts.append(f"[Sheet: {name}]\n" + "\n".join(rows))
        finally:
            # A read-only workbook keeps the file open until closed, so close
            # it even when reading a sheet fails.
            wb.close()

        return {"text": "\n\n".join(parts), "file_type": "xlsx",
                "method": "openpyxl", "error": None}
    except ImportError:
        pass
    except Exception as e:
        return {"text": "", "file_type": "xlsx", "method": "none",
                "error": f"XLSX 解析失败: {e}"}

    return {"text": "", "file_type": "xlsx", "method": "none",
            "error": "XLSX 解析需要安装 openpyxl"}
|
|
|
|
|
|
def _parse_xls(path: Path) -> dict:
    """Extract the text of a legacy Excel .xls workbook.

    Args:
        path: Location of the .xls file.

    Returns:
        {"text": str, "file_type": "xls", "method": str, "error": Optional[str]}
        Each sheet with content becomes a "[Sheet: name]" header followed by
        one tab-separated line per non-empty row; sheets are separated by a
        blank line. Whole numbers are written without a trailing ".0" and
        date cells as ISO dates/times. ``method`` is "xlrd" on success and
        "none" on failure.
    """
    try:
        import xlrd

        wb = xlrd.open_workbook(path)

        def cell_text(cell) -> str:
            """Render one cell the way a reader of the sheet would expect."""
            value = cell.value
            if cell.ctype == xlrd.XL_CELL_DATE:
                # Dates are stored as serial numbers; convert to ISO text.
                try:
                    stamp = xlrd.xldate_as_datetime(value, wb.datemode)
                except Exception:
                    return str(value)  # unconvertible serial: keep the raw number
                if value < 1:
                    return stamp.time().isoformat()  # time of day only
                if float(value).is_integer():
                    return stamp.date().isoformat()  # date without a time part
                return stamp.isoformat(sep=" ")
            if (cell.ctype == xlrd.XL_CELL_NUMBER
                    and isinstance(value, float) and value.is_integer()):
                # xlrd returns every number as float; avoid "1.0" for 1.
                return str(int(value))
            return str(value)

        parts = []
        for name in wb.sheet_names():
            ws = wb.sheet_by_name(name)
            rows = []
            for rx in range(ws.nrows):
                cells = [cell_text(cell) for cell in ws.row(rx)]
                if any(cells):  # skip rows that are entirely empty
                    rows.append("\t".join(cells))
            if rows:
                parts.append(f"[Sheet: {name}]\n" + "\n".join(rows))

        return {"text": "\n\n".join(parts), "file_type": "xls",
                "method": "xlrd", "error": None}
    except ImportError:
        pass
    except Exception as e:
        return {"text": "", "file_type": "xls", "method": "none",
                "error": f"XLS 解析失败: {e}"}

    return {"text": "", "file_type": "xls", "method": "none",
            "error": "XLS 解析需要安装 xlrd"}
|
|
|
|
|
|
def _parse_doc(path: Path) -> dict:
    """Extract text from a legacy Word .doc file (best effort, binary format).

    The "WordDocument" stream of the OLE container is decoded as UTF-16LE and
    only printable characters plus newlines/tabs are kept. The stream is not
    parsed structurally, so the result may contain stray characters.

    Args:
        path: Location of the .doc file.

    Returns:
        {"text": str, "file_type": "doc", "method": str, "error": Optional[str]}
        ``method`` is "olefile" once the stream was read (even if no text
        survived filtering) and "none" otherwise.
    """
    try:
        import olefile

        ole = olefile.OleFileIO(path)
        try:
            if not ole.exists("WordDocument"):
                return {"text": "", "file_type": "doc", "method": "none",
                        "error": "不是有效的 .doc 文件"}
            raw = ole.openstream("WordDocument").read()
        finally:
            # Release the container on every path, including read errors.
            ole.close()

        # Keep the printable UTF-16LE characters; errors="ignore" drops
        # undecodable bytes instead of raising.
        decoded = raw.decode("utf-16-le", errors="ignore")
        text = "".join(ch for ch in decoded if ch.isprintable() or ch in "\n\r\t")

        if not text.strip():
            return {"text": "", "file_type": "doc", "method": "olefile",
                    "error": "无法提取文本(.doc 为二进制格式,建议转换为 .docx)"}
        return {"text": text.strip(), "file_type": "doc", "method": "olefile", "error": None}
    except ImportError:
        pass
    except Exception as e:
        return {"text": "", "file_type": "doc", "method": "none",
                "error": f"DOC 解析失败: {e}"}

    return {"text": "", "file_type": "doc", "method": "none",
            "error": "DOC 解析需要安装 olefile"}
|
|
|
|
|
|
|
|
def _parse_text(path: Path) -> dict:
    """Read a plain-text file, trying UTF-8 first and GBK second.

    Args:
        path: Location of the text file.

    Returns:
        {"text": str, "file_type": str, "method": str, "error": Optional[str]}
        ``file_type`` is the file's own suffix. ``method`` is "direct" for
        UTF-8 (a leading byte-order mark is removed), "direct_gbk" for the
        GBK fallback, and "none" when the file cannot be read or decoded.
    """
    try:
        # "utf-8-sig" decodes ordinary UTF-8 and also strips a leading BOM,
        # which would otherwise end up as "\ufeff" at the start of the text.
        text = path.read_text(encoding="utf-8-sig")
        return {"text": text, "file_type": path.suffix, "method": "direct", "error": None}
    except UnicodeDecodeError:
        # Not UTF-8: retry with GBK.
        try:
            text = path.read_text(encoding="gbk")
            return {"text": text, "file_type": path.suffix, "method": "direct_gbk", "error": None}
        except Exception:
            return {"text": "", "file_type": path.suffix, "method": "none",
                    "error": "无法解码文件"}
    except Exception:
        # Missing file, directory, permission problem, ...
        return {"text": "", "file_type": path.suffix, "method": "none",
                "error": "读取失败"}
|