feat: v4 multimodal chat input, multi-format support, and annotation detection

- Replace st.chat_input with st-multimodal-chatinput (Ctrl+V paste, drag-drop, file button)
- Extract _process_uploaded_file() shared handler (eliminates ~70 duplicated lines)
- Add XLSX (openpyxl), XLS (xlrd), DOC (olefile) parsers to file_parser.py
- Add backend/annotation_detector.py: circle detection (HoughCircles) + arrow detection (HoughLinesP clustering) + OCR correlation + LLM context formatting
- Add annotation_result field to AgentState with session persistence
- Wire annotation detection into process_input and _format_ocr_context
- Add 11 new tests: 7 annotation detector + 4 multi-format parser
- Update all docs: CLAUDE.md, README.md, CODE_GUIDE.md, ROADMAP.md
This commit is contained in:
2026-05-20 23:43:16 +08:00
parent c9f003e1b7
commit 9bb011e429
16 changed files with 1257 additions and 164 deletions
+108 -20
View File
@@ -51,6 +51,9 @@ def parse_file(file_path: str, file_type: str = "") -> dict:
".webp": _parse_image,
".pdf": _parse_pdf,
".docx": _parse_docx,
".xlsx": _parse_xlsx,
".xls": _parse_xls,
".doc": _parse_doc,
}
parser = parsers.get(suffix)
@@ -72,26 +75,7 @@ def _parse_image(path: Path) -> dict:
except Exception:
info = "[图片: 无法读取元数据]"
# 优先 EasyOCRWindows 兼容性更好
try:
import easyocr
import numpy as np
reader = easyocr.Reader(["ch_sim", "en"], gpu=False, verbose=False)
result = reader.readtext(np.array(img))
lines = [text.strip() for (_, text, _) in result if text.strip()]
if lines:
return {
"text": f"{info}\n识别文本:\n" + "\n".join(lines),
"file_type": "image",
"method": "easyocr",
"error": None,
}
except ImportError:
pass
except Exception:
pass
# 回退 PaddleOCR
# 优先 PaddleOCR(精确识别
try:
from paddleocr import PaddleOCR
ocr = PaddleOCR(lang="ch")
@@ -114,6 +98,25 @@ def _parse_image(path: Path) -> dict:
except Exception:
pass
# 回退 EasyOCR
try:
import easyocr
import numpy as np
reader = easyocr.Reader(["ch_sim", "en"], gpu=False, verbose=False)
result = reader.readtext(np.array(img))
lines = [text.strip() for (_, text, _) in result if text.strip()]
if lines:
return {
"text": f"{info}\n识别文本:\n" + "\n".join(lines),
"file_type": "image",
"method": "easyocr",
"error": None,
}
except ImportError:
pass
except Exception:
pass
# OCR 不可用 → 返回图片元信息 + 安装提示
return {
"text": f"{info}\n(如需 OCR 文字识别,请安装: pip install easyocr)",
@@ -195,6 +198,91 @@ def _parse_docx(path: Path) -> dict:
"error": "DOCX 解析需要安装 python-docx"}
def _parse_xlsx(path: Path) -> dict:
"""提取 Excel .xlsx 文件中的文本。"""
try:
from openpyxl import load_workbook
wb = load_workbook(path, read_only=True, data_only=True)
parts = []
for name in wb.sheetnames:
ws = wb[name]
rows = []
for row in ws.iter_rows(values_only=True):
cells = [str(c) if c is not None else "" for c in row]
if any(c for c in cells):
rows.append("\t".join(cells))
if rows:
parts.append(f"[Sheet: {name}]\n" + "\n".join(rows))
wb.close()
text = "\n\n".join(parts)
return {"text": text, "file_type": "xlsx", "method": "openpyxl", "error": None}
except ImportError:
pass
except Exception as e:
return {"text": "", "file_type": "xlsx", "method": "none",
"error": f"XLSX 解析失败: {e}"}
return {"text": "", "file_type": "xlsx", "method": "none",
"error": "XLSX 解析需要安装 openpyxl"}
def _parse_xls(path: Path) -> dict:
"""提取旧版 Excel .xls 文件中的文本。"""
try:
import xlrd
wb = xlrd.open_workbook(path)
parts = []
for name in wb.sheet_names():
ws = wb.sheet_by_name(name)
rows = []
for rx in range(ws.nrows):
cells = [str(ws.cell_value(rx, cx)) if ws.cell_value(rx, cx) != "" else ""
for cx in range(ws.ncols)]
if any(c for c in cells):
rows.append("\t".join(cells))
if rows:
parts.append(f"[Sheet: {name}]\n" + "\n".join(rows))
text = "\n\n".join(parts)
return {"text": text, "file_type": "xls", "method": "xlrd", "error": None}
except ImportError:
pass
except Exception as e:
return {"text": "", "file_type": "xls", "method": "none",
"error": f"XLS 解析失败: {e}"}
return {"text": "", "file_type": "xls", "method": "none",
"error": "XLS 解析需要安装 xlrd"}
def _parse_doc(path: Path) -> dict:
"""提取旧版 Word .doc 文件中的文本(尽力而为,二进制格式)。"""
try:
import olefile
ole = olefile.OleFileIO(path)
if not ole.exists("WordDocument"):
ole.close()
return {"text": "", "file_type": "doc", "method": "none",
"error": "不是有效的 .doc 文件"}
raw = ole.openstream("WordDocument").read()
ole.close()
# 提取可打印 UTF-16LE 字符段
text = ""
try:
decoded = raw.decode("utf-16-le", errors="ignore")
text = "".join(c for c in decoded if c.isprintable() or c in "\n\r\t")
except Exception:
pass
if not text.strip():
return {"text": "", "file_type": "doc", "method": "olefile",
"error": "无法提取文本(.doc 为二进制格式,建议转换为 .docx)"}
return {"text": text.strip(), "file_type": "doc", "method": "olefile", "error": None}
except ImportError:
pass
except Exception as e:
return {"text": "", "file_type": "doc", "method": "none",
"error": f"DOC 解析失败: {e}"}
return {"text": "", "file_type": "doc", "method": "none",
"error": "DOC 解析需要安装 olefile"}
def _parse_text(path: Path) -> dict:
"""读取纯文本文件。"""
try: