feat: v4 multimodal chat input, multi-format support, and annotation detection

- Replace st.chat_input with st-multimodal-chatinput (Ctrl+V paste, drag-drop, file button)
- Extract _process_uploaded_file() shared handler (eliminates ~70 duplicated lines)
- Add XLSX (openpyxl), XLS (xlrd), DOC (olefile) parsers to file_parser.py
- Add backend/annotation_detector.py: circle detection (HoughCircles) + arrow detection (HoughLinesP clustering) + OCR correlation + LLM context formatting
- Add annotation_result field to AgentState with session persistence
- Wire annotation detection into process_input and _format_ocr_context
- Add 11 new tests: 7 annotation detector + 4 multi-format parser
- Update all docs: CLAUDE.md, README.md, CODE_GUIDE.md, ROADMAP.md
This commit is contained in:
2026-05-20 23:43:16 +08:00
parent c9f003e1b7
commit 9bb011e429
16 changed files with 1257 additions and 164 deletions
+179 -87
View File
@@ -106,6 +106,81 @@ def _render_jrxml(jrxml: str, max_lines: int = 30):
st.code(preview, language="xml")
# ---- 共享文件上传处理 ----
def _process_uploaded_file(uploaded_file, suffix: str) -> dict:
    """Parse one uploaded file and, for images/PDFs, run A4 layout analysis.

    The upload is written to a temporary file, parsed with ``parse_file``
    and, when ``suffix`` is an image or PDF extension, passed to
    ``analyze_layout``. The layout's ``template_type`` then decides the
    text and type that are returned.

    Args:
        uploaded_file: Object exposing ``name`` and ``getvalue()`` (the raw
            bytes of the upload).
        suffix: File extension including the leading dot (callers pass it
            lower-cased, e.g. ``".png"``).

    Returns:
        ``{"name": str, "text": str, "type": str, "tmp_path": str | None}``.
        ``tmp_path`` is only set for image files whose parse ``method`` is
        not metadata-only; the caller then owns that file. In every other
        case the temporary file is deleted here and ``tmp_path`` is ``None``.
    """
    import tempfile
    from pathlib import Path

    from backend.file_parser import parse_file
    from backend.layout_analyzer import analyze_layout

    image_suffixes = (".png", ".jpg", ".jpeg", ".bmp", ".webp")
    tmp_path = None
    keep_temp = False
    try:
        with tempfile.NamedTemporaryFile(suffix=suffix, delete=False) as tmp:
            tmp_path = tmp.name
            tmp.write(uploaded_file.getvalue())

        result = parse_file(tmp_path, suffix)
        parsed_text = result["text"]
        parsed_type = result["file_type"]
        # A parse method of "metadata_only" (or none at all) is treated as
        # "no OCR text available".
        has_ocr = result.get("method") not in ("metadata_only", None)

        # Images and PDFs go through A4 template layout analysis.
        if suffix in image_suffixes or suffix == ".pdf":
            layout = analyze_layout(tmp_path)
            tt = layout.get("template_type", "unknown")
            current_jrxml = st.session_state.agent_state.get("current_jrxml", "")
            if tt == "full_a4":
                parsed_text = layout["description"]
                parsed_type = "a4_template"
            elif tt == "partial_rows":
                parsed_type = "a4_partial"
                if current_jrxml.strip():
                    # A JRXML already exists: match the uploaded rows to it.
                    from backend.layout_analyzer import match_rows_to_jrxml

                    match = match_rows_to_jrxml(layout, current_jrxml)
                    parsed_text = (
                        f"[行片段修改] 上传图片包含 {layout['total_rows']} 行,"
                        f"视为 A4 报表的一部分。\n\n"
                        f"{match['description']}\n\n"
                        f"--- 行结构 ---\n{layout['description']}"
                    )
                else:
                    # No JRXML yet: use the layout description as-is.
                    parsed_text = layout["description"]
            else:
                # Any other template type: fall back to a generic
                # image-reference message.
                img_w, img_h = layout["image_size"]
                ratio = layout["aspect_ratio"]
                if has_ocr:
                    # The newline after the ratio keeps the size line and
                    # the following sentence from running together.
                    parsed_text = (
                        f"[图片上传] 尺寸 {img_w}x{img_h}px, 比例 {ratio}\n"
                        f"未检测到 A4 报表结构,图片将被视为参考样式。\n"
                        f"请根据用户的文字描述生成报表。"
                    )
                else:
                    parsed_text = (
                        f"[图片上传] 尺寸 {img_w}x{img_h}px, 比例 {ratio}\n"
                        f"⚠ OCR 引擎未安装,无法识别图片中的文字内容。\n"
                        f"请严格根据用户的文字描述来推断图片中的报表需求。\n"
                        f"(提示:如需图片文字识别,请运行 pip install paddleocr"
                    )
                parsed_type = "image_reference"
        elif suffix in (".docx", ".xlsx", ".xls", ".doc"):
            # ".pdf" is already handled by the branch above, so only the
            # office formats reach this point.
            parsed_type = suffix.lstrip(".")

        # Only image files with usable OCR output are handed back to the
        # caller via tmp_path.
        keep_temp = suffix in image_suffixes and has_ocr
        return {
            "name": uploaded_file.name,
            "text": parsed_text,
            "type": parsed_type,
            "tmp_path": tmp_path if keep_temp else None,
        }
    finally:
        # The file was created with delete=False, so it must be removed
        # explicitly whenever ownership is not passed to the caller
        # (including when parsing or layout analysis raised).
        if tmp_path is not None and not keep_temp:
            Path(tmp_path).unlink(missing_ok=True)
# ---- URL 参数 ----
query_params = st.query_params
url_session_id = query_params.get("session_id", "")
@@ -480,7 +555,8 @@ with st.sidebar:
uploaded = st.file_uploader(
"选择文件",
type=["png", "jpg", "jpeg", "bmp", "webp", "pdf", "docx", "txt", "csv", "json", "xml"],
type=["png", "jpg", "jpeg", "bmp", "webp", "pdf", "docx", "xlsx", "xls", "doc",
"txt", "csv", "json", "xml"],
accept_multiple_files=True,
key="file_uploader",
label_visibility="collapsed",
@@ -491,77 +567,21 @@ with st.sidebar:
# 去重
if any(f["name"] == uf.name for f in st.session_state.uploaded_files):
continue
import tempfile
from backend.file_parser import parse_file
from backend.layout_analyzer import analyze_layout
suffix = Path(uf.name).suffix.lower()
with tempfile.NamedTemporaryFile(suffix=suffix, delete=False) as tmp:
tmp.write(uf.getvalue())
tmp_path = tmp.name
result = _process_uploaded_file(uf, suffix)
result = parse_file(tmp_path, suffix)
# 对图片/PDF 进行 A4 模板布局分析
parsed_text = result["text"]
parsed_type = result["file_type"]
if suffix in (".png", ".jpg", ".jpeg", ".bmp", ".webp", ".pdf"):
layout = analyze_layout(tmp_path)
tt = layout.get("template_type", "unknown")
current_jrxml = st.session_state.agent_state.get("current_jrxml", "")
if tt == "full_a4":
parsed_text = layout["description"]
parsed_type = "a4_template"
elif tt == "partial_rows":
parsed_type = "a4_partial"
if current_jrxml.strip():
# 修改模式:尝试行匹配
from backend.layout_analyzer import match_rows_to_jrxml
match = match_rows_to_jrxml(layout, current_jrxml)
parsed_text = (
f"[行片段修改] 上传图片包含 {layout['total_rows']} 行,"
f"视为 A4 报表的一部分。\n\n"
f"{match['description']}\n\n"
f"--- 行结构 ---\n{layout['description']}"
)
else:
# 新建模式:按 A4 模板处理
parsed_text = layout["description"]
else:
# tt == "unknown": OCR 不可用或未检测到文字元素
has_ocr = result.get("method") not in ("metadata_only", None)
img_w, img_h = layout["image_size"]
ratio = layout["aspect_ratio"]
if has_ocr:
parsed_text = (
f"[图片上传] 尺寸 {img_w}x{img_h}px, 比例 {ratio}"
f"未检测到 A4 报表结构,图片将被视为参考样式。\n"
f"请根据用户的文字描述生成报表。"
)
else:
parsed_text = (
f"[图片上传] 尺寸 {img_w}x{img_h}px, 比例 {ratio}\n"
f"⚠ OCR 引擎未安装,无法识别图片中的文字内容。\n"
f"请严格根据用户的文字描述来推断图片中的报表需求。\n"
f"(提示:如需图片文字识别,请运行 pip install paddleocr"
)
parsed_type = "image_reference"
if parsed_text:
if result["text"]:
st.session_state.uploaded_files.append({
"name": uf.name,
"text": parsed_text,
"type": parsed_type,
"name": result["name"],
"text": result["text"],
"type": result["type"],
})
# 对图片类型,保存路径以便 OCR 字段提取(延迟到 process_input 阶段)
img_suffixes = (".png", ".jpg", ".jpeg", ".bmp", ".webp")
if suffix in img_suffixes and result.get("method") not in ("metadata_only", None):
tmp_path = result["tmp_path"]
if tmp_path:
st.session_state.agent_state["uploaded_file_path"] = tmp_path
st.session_state.uploaded_temp_paths.append(tmp_path)
else:
Path(tmp_path).unlink(missing_ok=True)
if st.session_state.uploaded_files:
for i, f in enumerate(st.session_state.uploaded_files):
@@ -632,34 +652,106 @@ for msg in st.session_state.messages:
else:
st.markdown(msg["content"])
# ---- 聊天输入 ----
if prompt := st.chat_input("描述您的报表需求..."):
# 拼接上传文件的文本
# ---- 聊天输入(支持粘贴/拖拽文件) ----
from st_multimodal_chatinput import multimodal_chatinput
import base64
import io
from pathlib import Path as _Path
# MIME type → 文件扩展名映射(用于剪贴板粘贴无扩展名的文件)
MIME_TO_EXT = {
    # Images
    "image/png": ".png",
    "image/jpeg": ".jpg",
    "image/bmp": ".bmp",
    "image/webp": ".webp",
    # Documents and spreadsheets
    "application/pdf": ".pdf",
    "application/vnd.openxmlformats-officedocument.wordprocessingml.document": ".docx",
    "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet": ".xlsx",
    "application/vnd.ms-excel": ".xls",
    "application/msword": ".doc",
    # Plain-text formats
    "text/plain": ".txt",
    "text/csv": ".csv",
    "application/json": ".json",
    "text/xml": ".xml",
    # XML has two registered media types; accept both.
    "application/xml": ".xml",
}
chat_result = multimodal_chatinput()
if chat_result:
prompt = (chat_result.get("textInput") or "").strip()
chat_files = chat_result.get("uploadedFiles") or []
# 处理聊天中上传/粘贴的文件
uploaded_texts = []
uploaded_files_info = []
# 先收集侧边栏已上传的文件
if st.session_state.get("uploaded_files"):
for f in st.session_state.uploaded_files:
uploaded_texts.append(f"[上传文件: {f['name']}]\n{f['text']}")
uploaded_files_info.append({"name": f["name"], "type": f["type"], "length": len(f["text"])})
if uploaded_texts:
full_prompt = "\n\n".join(uploaded_texts) + "\n\n---\n用户需求:\n" + prompt
st.session_state.uploaded_files = [] # 用后即清
else:
full_prompt = prompt
st.session_state.uploaded_files = []
_app_log.info(
"收到用户输入",
extra={
"session_id": current_session_id,
"prompt_preview": prompt[:200],
"prompt_length": len(prompt),
"has_uploaded_files": bool(uploaded_files_info),
"uploaded_files": uploaded_files_info,
},
)
# 处理聊天中的文件
class _Base64File:
"""包装 base64 文件为类 UploadedFile 接口。"""
def __init__(self, name, data_bytes):
self.name = name
self._data = data_bytes
st.session_state.messages.append({"role": "user", "content": prompt})
with st.chat_message("user"):
st.markdown(prompt)
run_agent(full_prompt)
st.rerun()
def getvalue(self):
return self._data
# Process every file attached to the chat input: decode its base64 payload,
# derive a suffix, run it through the shared upload handler and collect the
# resulting text for the prompt.
for cf in chat_files:
    # `or` fallbacks also cover keys that are present but None/empty,
    # which a plain .get(key, default) would let through.
    name = cf.get("name") or "clipboard_file"
    mime = cf.get("type") or ""
    content_b64 = cf.get("content") or ""
    if not content_b64:
        continue
    try:
        data = base64.b64decode(content_b64)
    except (ValueError, TypeError):
        # binascii.Error (malformed base64) is a ValueError; TypeError
        # covers a payload that is not str/bytes. Skip the file, keep going.
        continue
    suffix = _Path(name).suffix.lower()
    if not suffix and mime in MIME_TO_EXT:
        # Name has no extension: derive one from the MIME type and append
        # it to the name.
        suffix = MIME_TO_EXT[mime]
        name = f"{_Path(name).stem}{suffix}"
    wrapper = _Base64File(name, data)
    result = _process_uploaded_file(wrapper, suffix)
    if result["text"]:
        uploaded_texts.append(f"[上传文件: {result['name']}]\n{result['text']}")
        uploaded_files_info.append({"name": result["name"], "type": result["type"], "length": len(result["text"])})
    tmp_path = result["tmp_path"]
    if tmp_path:
        # Record the kept temp file on the agent state and in the
        # session's list of uploaded temp paths.
        st.session_state.agent_state["uploaded_file_path"] = tmp_path
        st.session_state.uploaded_temp_paths.append(tmp_path)
# Run the agent when the user supplied text, files, or both.
if prompt or uploaded_texts:
    # Text sent to the agent: file contents first, then the typed request
    # (if any) under a separator.
    if not uploaded_texts:
        full_prompt = prompt
    else:
        full_prompt = "\n\n".join(uploaded_texts)
        if prompt:
            full_prompt = full_prompt + "\n\n---\n用户需求:\n" + prompt

    # Text shown in the chat history; a placeholder stands in when only
    # files were submitted.
    displayed_prompt = prompt if prompt else "(已上传文件,未输入文字)"

    log_fields = {
        "session_id": current_session_id,
        "prompt_preview": displayed_prompt[:200],
        "prompt_length": len(full_prompt),
        "has_uploaded_files": bool(uploaded_files_info),
        "uploaded_files": uploaded_files_info,
    }
    _app_log.info("收到用户输入", extra=log_fields)

    user_message = {"role": "user", "content": displayed_prompt}
    st.session_state.messages.append(user_message)
    with st.chat_message("user"):
        st.markdown(displayed_prompt)

    run_agent(full_prompt)
    st.rerun()