Merge remote v4/v5 features (multimodal chat input, layered generation, annotation detection) with local v3 features (dialog file upload, XLSX support, session fix)
Key resolutions:
- agent/nodes.py: Merged session_id exclusion fix with new persistable fields (ocr_extraction_result, annotation_result, layout_schema, ocr_elements)
- app.py: Adopted st-multimodal-chatinput for unified paste/drop/upload, removed custom JS paste bridge
- backend/file_parser.py: Kept local XLSX parser, added remote XLS/DOC parsers
- CLAUDE.md + CODE_GUIDE.md: Merged documentation from both branches

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
+95
-38
@@ -52,6 +52,8 @@ def parse_file(file_path: str, file_type: str = "") -> dict:
|
||||
".pdf": _parse_pdf,
|
||||
".docx": _parse_docx,
|
||||
".xlsx": _parse_xlsx,
|
||||
".xls": _parse_xls,
|
||||
".doc": _parse_doc,
|
||||
}
|
||||
|
||||
parser = parsers.get(suffix)
|
||||
@@ -73,26 +75,7 @@ def _parse_image(path: Path) -> dict:
|
||||
except Exception:
|
||||
info = "[图片: 无法读取元数据]"
|
||||
|
||||
# 优先 EasyOCR(Windows 兼容性更好)
|
||||
try:
|
||||
import easyocr
|
||||
import numpy as np
|
||||
reader = easyocr.Reader(["ch_sim", "en"], gpu=False, verbose=False)
|
||||
result = reader.readtext(np.array(img))
|
||||
lines = [text.strip() for (_, text, _) in result if text.strip()]
|
||||
if lines:
|
||||
return {
|
||||
"text": f"{info}\n识别文本:\n" + "\n".join(lines),
|
||||
"file_type": "image",
|
||||
"method": "easyocr",
|
||||
"error": None,
|
||||
}
|
||||
except ImportError:
|
||||
pass
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
# 回退 PaddleOCR
|
||||
# 优先 PaddleOCR(精确识别)
|
||||
try:
|
||||
from paddleocr import PaddleOCR
|
||||
ocr = PaddleOCR(lang="ch")
|
||||
@@ -115,6 +98,25 @@ def _parse_image(path: Path) -> dict:
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
# 回退 EasyOCR
|
||||
try:
|
||||
import easyocr
|
||||
import numpy as np
|
||||
reader = easyocr.Reader(["ch_sim", "en"], gpu=False, verbose=False)
|
||||
result = reader.readtext(np.array(img))
|
||||
lines = [text.strip() for (_, text, _) in result if text.strip()]
|
||||
if lines:
|
||||
return {
|
||||
"text": f"{info}\n识别文本:\n" + "\n".join(lines),
|
||||
"file_type": "image",
|
||||
"method": "easyocr",
|
||||
"error": None,
|
||||
}
|
||||
except ImportError:
|
||||
pass
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
# OCR 不可用 → 返回图片元信息 + 安装提示
|
||||
return {
|
||||
"text": f"{info}\n(如需 OCR 文字识别,请安装: pip install easyocr)",
|
||||
@@ -197,36 +199,91 @@ def _parse_docx(path: Path) -> dict:
|
||||
|
||||
|
||||
def _parse_xlsx(path: Path) -> dict:
    """Extract the text of an Excel ``.xlsx`` workbook.

    Each worksheet that has at least one non-empty row is rendered as a
    ``[Sheet: <name>]`` header followed by one line per non-empty row, with
    the cell values joined by tabs. Sheets are separated by a blank line.

    Args:
        path: Location of the workbook on disk.

    Returns:
        A dict with the keys ``text``, ``file_type``, ``method`` and
        ``error``. On success ``method`` is ``"openpyxl"`` and ``error`` is
        ``None``. On failure ``text`` is empty, ``method`` is ``"none"`` and
        ``error`` says whether openpyxl is missing or the parse failed.
    """
    # Import separately so that only a missing dependency produces the
    # "please install" message; errors raised while parsing are reported
    # as parse failures below.
    try:
        from openpyxl import load_workbook
    except ImportError:
        return {"text": "", "file_type": "xlsx", "method": "none",
                "error": "XLSX 解析需要安装 openpyxl"}

    try:
        wb = load_workbook(path, read_only=True, data_only=True)
        try:
            parts = []
            for name in wb.sheetnames:
                ws = wb[name]
                rows = []
                for row in ws.iter_rows(values_only=True):
                    cells = ["" if c is None else str(c) for c in row]
                    # Skip rows in which every cell is empty.
                    if any(cells):
                        rows.append("\t".join(cells))
                if rows:
                    parts.append(f"[Sheet: {name}]\n" + "\n".join(rows))
        finally:
            # A read-only workbook keeps the file open until closed, so
            # close it even when iterating a sheet raises.
            wb.close()
    except Exception as e:
        return {"text": "", "file_type": "xlsx", "method": "none",
                "error": f"XLSX 解析失败: {e}"}

    return {"text": "\n\n".join(parts), "file_type": "xlsx",
            "method": "openpyxl", "error": None}
|
||||
|
||||
|
||||
def _parse_xls(path: Path) -> dict:
    """Extract the text of a legacy Excel ``.xls`` workbook.

    Each worksheet that has at least one non-empty row is rendered as a
    ``[Sheet: <name>]`` header followed by one line per non-empty row, with
    the cell values joined by tabs. Sheets are separated by a blank line.

    Args:
        path: Location of the workbook on disk.

    Returns:
        A dict with the keys ``text``, ``file_type``, ``method`` and
        ``error``. On success ``method`` is ``"xlrd"`` and ``error`` is
        ``None``. On failure ``text`` is empty, ``method`` is ``"none"`` and
        ``error`` says whether xlrd is missing or the parse failed.
    """
    # Import separately so that only a missing dependency produces the
    # "please install" message.
    try:
        import xlrd
    except ImportError:
        return {"text": "", "file_type": "xls", "method": "none",
                "error": "XLS 解析需要安装 xlrd"}

    try:
        wb = xlrd.open_workbook(path)
        parts = []
        for name in wb.sheet_names():
            ws = wb.sheet_by_name(name)
            rows = []
            for rx in range(ws.nrows):
                cells = []
                for value in ws.row_values(rx):
                    # xlrd reports every numeric cell as a float; drop the
                    # spurious ".0" so whole numbers read as they were typed.
                    if isinstance(value, float) and value.is_integer():
                        value = int(value)
                    cells.append(str(value))
                # Skip rows in which every cell is empty.
                if any(cells):
                    rows.append("\t".join(cells))
            if rows:
                parts.append(f"[Sheet: {name}]\n" + "\n".join(rows))
    except Exception as e:
        return {"text": "", "file_type": "xls", "method": "none",
                "error": f"XLS 解析失败: {e}"}

    return {"text": "\n\n".join(parts), "file_type": "xls",
            "method": "xlrd", "error": None}
|
||||
|
||||
|
||||
def _parse_doc(path: Path) -> dict:
    """Extract text from a legacy Word ``.doc`` file (best effort).

    The file is opened as an OLE container and the raw ``WordDocument``
    stream is decoded as UTF-16LE, keeping only printable characters plus
    newlines, carriage returns and tabs. The stream is not parsed
    structurally, so the result may contain noise.

    Args:
        path: Location of the document on disk.

    Returns:
        A dict with the keys ``text``, ``file_type``, ``method`` and
        ``error``. On success ``method`` is ``"olefile"`` and ``error`` is
        ``None``. ``text`` is empty and ``error`` is set when olefile is
        missing, the container has no ``WordDocument`` stream, nothing
        printable could be extracted, or opening/reading failed.
    """
    # Import separately so that only a missing dependency produces the
    # "please install" message.
    try:
        import olefile
    except ImportError:
        return {"text": "", "file_type": "doc", "method": "none",
                "error": "DOC 解析需要安装 olefile"}

    try:
        ole = olefile.OleFileIO(path)
        try:
            if not ole.exists("WordDocument"):
                return {"text": "", "file_type": "doc", "method": "none",
                        "error": "不是有效的 .doc 文件"}
            raw = ole.openstream("WordDocument").read()
        finally:
            # Release the container on every path, including when reading
            # the stream raises.
            ole.close()

        # Keep the printable UTF-16LE characters; errors="ignore" drops
        # undecodable bytes instead of raising.
        decoded = raw.decode("utf-16-le", errors="ignore")
        text = "".join(
            c for c in decoded if c.isprintable() or c in "\n\r\t"
        ).strip()
    except Exception as e:
        return {"text": "", "file_type": "doc", "method": "none",
                "error": f"DOC 解析失败: {e}"}

    if not text:
        return {"text": "", "file_type": "doc", "method": "olefile",
                "error": "无法提取文本(.doc 为二进制格式,建议转换为 .docx)"}
    return {"text": text, "file_type": "doc", "method": "olefile", "error": None}
|
||||
|
||||
|
||||
|
||||
def _parse_text(path: Path) -> dict:
|
||||
"""读取纯文本文件。"""
|
||||
try:
|
||||
|
||||
Reference in New Issue
Block a user