feat: v4 multimodal chat input, multi-format support, and annotation detection
- Replace st.chat_input with st-multimodal-chatinput (Ctrl+V paste, drag-drop, file button)
- Extract _process_uploaded_file() shared handler (eliminates ~70 duplicated lines)
- Add XLSX (openpyxl), XLS (xlrd), DOC (olefile) parsers to file_parser.py
- Add backend/annotation_detector.py: circle detection (HoughCircles) + arrow detection (HoughLinesP clustering) + OCR correlation + LLM context formatting
- Add annotation_result field to AgentState with session persistence
- Wire annotation detection into process_input and _format_ocr_context
- Add 11 new tests: 7 annotation detector + 4 multi-format parser
- Update all docs: CLAUDE.md, README.md, CODE_GUIDE.md, ROADMAP.md
This commit is contained in:
@@ -106,6 +106,81 @@ def _render_jrxml(jrxml: str, max_lines: int = 30):
|
||||
st.code(preview, language="xml")
|
||||
|
||||
|
||||
# ---- 共享文件上传处理 ----
|
||||
def _process_uploaded_file(uploaded_file, suffix: str) -> dict:
    """Parse a single uploaded file and, for images/PDFs, analyze its layout.

    The upload is written to a temporary file, parsed with ``parse_file`` and,
    for image and PDF suffixes, passed through ``analyze_layout`` to decide how
    the file should be described to the agent.

    Args:
        uploaded_file: Object exposing ``.name`` and ``.getvalue()``; the value
            is written to a binary temp file, so it must be bytes.
        suffix: File extension including the leading dot (e.g. ``".png"``).
            It is compared against lower-case literals, so pass it lower-cased.

    Returns:
        ``{"name": str, "text": str, "type": str, "tmp_path": str | None}``.
        ``tmp_path`` is only set for image uploads whose parse result reports
        a ``method`` other than ``"metadata_only"``/``None``; the caller then
        owns that file. In every other case (including when parsing raises)
        the temporary file is deleted here and ``tmp_path`` is ``None``.
    """
    import tempfile
    from pathlib import Path

    from backend.file_parser import parse_file
    from backend.layout_analyzer import analyze_layout

    image_suffixes = (".png", ".jpg", ".jpeg", ".bmp", ".webp")

    tmp_path = None
    keep_temp = False
    try:
        with tempfile.NamedTemporaryFile(suffix=suffix, delete=False) as tmp:
            tmp_path = tmp.name
            tmp.write(uploaded_file.getvalue())

        result = parse_file(tmp_path, suffix)
        parsed_text = result["text"]
        parsed_type = result["file_type"]
        # The original code treats any method other than "metadata_only"/None
        # as "OCR available"; presumably set by parse_file -- confirm there.
        has_ocr = result.get("method") not in ("metadata_only", None)

        # Run A4 template layout analysis on images and PDFs.
        if suffix in image_suffixes + (".pdf",):
            layout = analyze_layout(tmp_path)
            tt = layout.get("template_type", "unknown")
            current_jrxml = st.session_state.agent_state.get("current_jrxml", "")

            if tt == "full_a4":
                parsed_text = layout["description"]
                parsed_type = "a4_template"
            elif tt == "partial_rows":
                parsed_type = "a4_partial"
                if current_jrxml.strip():
                    # Edit mode: try to match the uploaded rows against the
                    # report that is currently being worked on.
                    from backend.layout_analyzer import match_rows_to_jrxml

                    match = match_rows_to_jrxml(layout, current_jrxml)
                    parsed_text = (
                        f"[行片段修改] 上传图片包含 {layout['total_rows']} 行,"
                        f"视为 A4 报表的一部分。\n\n"
                        f"{match['description']}\n\n"
                        f"--- 行结构 ---\n{layout['description']}"
                    )
                else:
                    # No current report: treat the rows like an A4 template.
                    parsed_text = layout["description"]
            else:
                # Any other template type: no A4 structure was recognised, so
                # describe the upload as a plain reference image.
                img_w, img_h = layout["image_size"]
                ratio = layout["aspect_ratio"]
                if has_ocr:
                    parsed_text = (
                        f"[图片上传] 尺寸 {img_w}x{img_h}px, 比例 {ratio}。"
                        f"未检测到 A4 报表结构,图片将被视为参考样式。\n"
                        f"请根据用户的文字描述生成报表。"
                    )
                else:
                    parsed_text = (
                        f"[图片上传] 尺寸 {img_w}x{img_h}px, 比例 {ratio}。\n"
                        f"⚠ OCR 引擎未安装,无法识别图片中的文字内容。\n"
                        f"请严格根据用户的文字描述来推断图片中的报表需求。\n"
                        f"(提示:如需图片文字识别,请运行 pip install paddleocr)"
                    )
                parsed_type = "image_reference"
        elif suffix in (".docx", ".xlsx", ".xls", ".doc"):
            # ".pdf" is intentionally absent: it is always handled by the
            # layout branch above, so listing it here was unreachable.
            parsed_type = suffix.lstrip(".")

        # Only images with a usable parse method are kept on disk, so that a
        # later stage can read the file again via the returned path.
        keep_temp = suffix in image_suffixes and has_ocr

        return {
            "name": uploaded_file.name,
            "text": parsed_text,
            "type": parsed_type,
            "tmp_path": tmp_path if keep_temp else None,
        }
    finally:
        # delete=False means nothing else removes the file. Callers only learn
        # the path when keep_temp is true, so clean up here in all other cases,
        # including when parsing or layout analysis raised.
        if tmp_path is not None and not keep_temp:
            Path(tmp_path).unlink(missing_ok=True)
|
||||
|
||||
|
||||
# ---- URL 参数 ----
|
||||
query_params = st.query_params
|
||||
url_session_id = query_params.get("session_id", "")
|
||||
@@ -480,7 +555,8 @@ with st.sidebar:
|
||||
|
||||
uploaded = st.file_uploader(
|
||||
"选择文件",
|
||||
type=["png", "jpg", "jpeg", "bmp", "webp", "pdf", "docx", "txt", "csv", "json", "xml"],
|
||||
type=["png", "jpg", "jpeg", "bmp", "webp", "pdf", "docx", "xlsx", "xls", "doc",
|
||||
"txt", "csv", "json", "xml"],
|
||||
accept_multiple_files=True,
|
||||
key="file_uploader",
|
||||
label_visibility="collapsed",
|
||||
@@ -491,77 +567,21 @@ with st.sidebar:
|
||||
# 去重
|
||||
if any(f["name"] == uf.name for f in st.session_state.uploaded_files):
|
||||
continue
|
||||
import tempfile
|
||||
from backend.file_parser import parse_file
|
||||
from backend.layout_analyzer import analyze_layout
|
||||
|
||||
suffix = Path(uf.name).suffix.lower()
|
||||
with tempfile.NamedTemporaryFile(suffix=suffix, delete=False) as tmp:
|
||||
tmp.write(uf.getvalue())
|
||||
tmp_path = tmp.name
|
||||
result = _process_uploaded_file(uf, suffix)
|
||||
|
||||
result = parse_file(tmp_path, suffix)
|
||||
|
||||
# 对图片/PDF 进行 A4 模板布局分析
|
||||
parsed_text = result["text"]
|
||||
parsed_type = result["file_type"]
|
||||
if suffix in (".png", ".jpg", ".jpeg", ".bmp", ".webp", ".pdf"):
|
||||
layout = analyze_layout(tmp_path)
|
||||
tt = layout.get("template_type", "unknown")
|
||||
current_jrxml = st.session_state.agent_state.get("current_jrxml", "")
|
||||
|
||||
if tt == "full_a4":
|
||||
parsed_text = layout["description"]
|
||||
parsed_type = "a4_template"
|
||||
elif tt == "partial_rows":
|
||||
parsed_type = "a4_partial"
|
||||
if current_jrxml.strip():
|
||||
# 修改模式:尝试行匹配
|
||||
from backend.layout_analyzer import match_rows_to_jrxml
|
||||
match = match_rows_to_jrxml(layout, current_jrxml)
|
||||
parsed_text = (
|
||||
f"[行片段修改] 上传图片包含 {layout['total_rows']} 行,"
|
||||
f"视为 A4 报表的一部分。\n\n"
|
||||
f"{match['description']}\n\n"
|
||||
f"--- 行结构 ---\n{layout['description']}"
|
||||
)
|
||||
else:
|
||||
# 新建模式:按 A4 模板处理
|
||||
parsed_text = layout["description"]
|
||||
else:
|
||||
# tt == "unknown": OCR 不可用或未检测到文字元素
|
||||
has_ocr = result.get("method") not in ("metadata_only", None)
|
||||
img_w, img_h = layout["image_size"]
|
||||
ratio = layout["aspect_ratio"]
|
||||
if has_ocr:
|
||||
parsed_text = (
|
||||
f"[图片上传] 尺寸 {img_w}x{img_h}px, 比例 {ratio}。"
|
||||
f"未检测到 A4 报表结构,图片将被视为参考样式。\n"
|
||||
f"请根据用户的文字描述生成报表。"
|
||||
)
|
||||
else:
|
||||
parsed_text = (
|
||||
f"[图片上传] 尺寸 {img_w}x{img_h}px, 比例 {ratio}。\n"
|
||||
f"⚠ OCR 引擎未安装,无法识别图片中的文字内容。\n"
|
||||
f"请严格根据用户的文字描述来推断图片中的报表需求。\n"
|
||||
f"(提示:如需图片文字识别,请运行 pip install paddleocr)"
|
||||
)
|
||||
parsed_type = "image_reference"
|
||||
|
||||
if parsed_text:
|
||||
if result["text"]:
|
||||
st.session_state.uploaded_files.append({
|
||||
"name": uf.name,
|
||||
"text": parsed_text,
|
||||
"type": parsed_type,
|
||||
"name": result["name"],
|
||||
"text": result["text"],
|
||||
"type": result["type"],
|
||||
})
|
||||
|
||||
# 对图片类型,保存路径以便 OCR 字段提取(延迟到 process_input 阶段)
|
||||
img_suffixes = (".png", ".jpg", ".jpeg", ".bmp", ".webp")
|
||||
if suffix in img_suffixes and result.get("method") not in ("metadata_only", None):
|
||||
tmp_path = result["tmp_path"]
|
||||
if tmp_path:
|
||||
st.session_state.agent_state["uploaded_file_path"] = tmp_path
|
||||
st.session_state.uploaded_temp_paths.append(tmp_path)
|
||||
else:
|
||||
Path(tmp_path).unlink(missing_ok=True)
|
||||
|
||||
if st.session_state.uploaded_files:
|
||||
for i, f in enumerate(st.session_state.uploaded_files):
|
||||
@@ -632,34 +652,106 @@ for msg in st.session_state.messages:
|
||||
else:
|
||||
st.markdown(msg["content"])
|
||||
|
||||
# ---- 聊天输入 ----
|
||||
if prompt := st.chat_input("描述您的报表需求..."):
|
||||
# 拼接上传文件的文本
|
||||
# ---- 聊天输入(支持粘贴/拖拽文件) ----
|
||||
from st_multimodal_chatinput import multimodal_chatinput
|
||||
import base64
|
||||
import io
|
||||
from pathlib import Path as _Path
|
||||
|
||||
# MIME type → 文件扩展名映射(用于剪贴板粘贴无扩展名的文件)
|
||||
# Maps a MIME type to the file extension used when an uploaded/pasted file
# arrives without an extension in its name.
MIME_TO_EXT = {
    # Images
    "image/png": ".png",
    "image/jpeg": ".jpg",
    "image/bmp": ".bmp",
    "image/webp": ".webp",
    # Documents
    "application/pdf": ".pdf",
    "application/vnd.openxmlformats-officedocument.wordprocessingml.document": ".docx",
    "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet": ".xlsx",
    "application/vnd.ms-excel": ".xls",
    "application/msword": ".doc",
    # Plain-text formats
    "text/plain": ".txt",
    "text/csv": ".csv",
    "application/json": ".json",
    "text/xml": ".xml",
    # XML may be reported under either media type; accept both.
    "application/xml": ".xml",
}
|
||||
|
||||
chat_result = multimodal_chatinput()
|
||||
if chat_result:
|
||||
prompt = (chat_result.get("textInput") or "").strip()
|
||||
chat_files = chat_result.get("uploadedFiles") or []
|
||||
|
||||
# 处理聊天中上传/粘贴的文件
|
||||
uploaded_texts = []
|
||||
uploaded_files_info = []
|
||||
|
||||
# 先收集侧边栏已上传的文件
|
||||
if st.session_state.get("uploaded_files"):
|
||||
for f in st.session_state.uploaded_files:
|
||||
uploaded_texts.append(f"[上传文件: {f['name']}]\n{f['text']}")
|
||||
uploaded_files_info.append({"name": f["name"], "type": f["type"], "length": len(f["text"])})
|
||||
if uploaded_texts:
|
||||
full_prompt = "\n\n".join(uploaded_texts) + "\n\n---\n用户需求:\n" + prompt
|
||||
st.session_state.uploaded_files = [] # 用后即清
|
||||
else:
|
||||
full_prompt = prompt
|
||||
st.session_state.uploaded_files = []
|
||||
|
||||
_app_log.info(
|
||||
"收到用户输入",
|
||||
extra={
|
||||
"session_id": current_session_id,
|
||||
"prompt_preview": prompt[:200],
|
||||
"prompt_length": len(prompt),
|
||||
"has_uploaded_files": bool(uploaded_files_info),
|
||||
"uploaded_files": uploaded_files_info,
|
||||
},
|
||||
)
|
||||
# 处理聊天中的文件
|
||||
class _Base64File:
|
||||
"""包装 base64 文件为类 UploadedFile 接口。"""
|
||||
def __init__(self, name, data_bytes):
|
||||
self.name = name
|
||||
self._data = data_bytes
|
||||
|
||||
st.session_state.messages.append({"role": "user", "content": prompt})
|
||||
with st.chat_message("user"):
|
||||
st.markdown(prompt)
|
||||
run_agent(full_prompt)
|
||||
st.rerun()
|
||||
def getvalue(self):
|
||||
return self._data
|
||||
|
||||
for cf in chat_files:
|
||||
name = cf.get("name", "clipboard_file")
|
||||
mime = cf.get("type", "")
|
||||
content_b64 = cf.get("content", "")
|
||||
if not content_b64:
|
||||
continue
|
||||
try:
|
||||
data = base64.b64decode(content_b64)
|
||||
except Exception:
|
||||
continue
|
||||
|
||||
suffix = _Path(name).suffix.lower()
|
||||
if not suffix and mime in MIME_TO_EXT:
|
||||
suffix = MIME_TO_EXT[mime]
|
||||
name = f"{_Path(name).stem}{suffix}"
|
||||
|
||||
wrapper = _Base64File(name, data)
|
||||
result = _process_uploaded_file(wrapper, suffix)
|
||||
|
||||
if result["text"]:
|
||||
uploaded_texts.append(f"[上传文件: {result['name']}]\n{result['text']}")
|
||||
uploaded_files_info.append({"name": result["name"], "type": result["type"], "length": len(result["text"])})
|
||||
|
||||
tmp_path = result["tmp_path"]
|
||||
if tmp_path:
|
||||
st.session_state.agent_state["uploaded_file_path"] = tmp_path
|
||||
st.session_state.uploaded_temp_paths.append(tmp_path)
|
||||
|
||||
if prompt or uploaded_texts:
|
||||
if uploaded_texts:
|
||||
full_prompt = "\n\n".join(uploaded_texts)
|
||||
if prompt:
|
||||
full_prompt += "\n\n---\n用户需求:\n" + prompt
|
||||
else:
|
||||
full_prompt = prompt
|
||||
|
||||
displayed_prompt = prompt or "(已上传文件,未输入文字)"
|
||||
|
||||
_app_log.info(
|
||||
"收到用户输入",
|
||||
extra={
|
||||
"session_id": current_session_id,
|
||||
"prompt_preview": displayed_prompt[:200],
|
||||
"prompt_length": len(full_prompt),
|
||||
"has_uploaded_files": bool(uploaded_files_info),
|
||||
"uploaded_files": uploaded_files_info,
|
||||
},
|
||||
)
|
||||
|
||||
st.session_state.messages.append({"role": "user", "content": displayed_prompt})
|
||||
with st.chat_message("user"):
|
||||
st.markdown(displayed_prompt)
|
||||
run_agent(full_prompt)
|
||||
st.rerun()
|
||||
|
||||
Reference in New Issue
Block a user