feat: comprehensive v2 upgrade — streaming, error KB, file upload, layout analysis

Major changes:
- Streaming: LLM统一 _BaseLLM 接口 (invoke + stream), generate/modify/correct 节点使用 get_stream_writer() 实现逐字输出, UI 节点平铺展开自动折叠
- Prompt外部化: 7个prompt拆分到 prompts/*.md, loader.py 支持热重载
- 错误自增长: backend/error_kb.py — 指纹去重 + ChromaDB持久化, correct_jrxml→validate 通过时自动入库, retrieve同时搜索错误KB
- 文件上传: backend/file_parser.py — PDF/DOCX/图片/文本解析, 侧边栏多文件上传, 文本自动注入下一条消息
- A4模板识别: backend/layout_analyzer.py — 三种模式(完整A4/行片段修改/行片段新建), PaddleOCR元素提取 + 行分组 + JRXML section匹配
- 会话历史下载: jrxml_versions版本追踪 + 侧边栏历史版本下载按钮
- 预览修复: route_after_save跳过预览/导出意图的验证循环
- Ctrl+C修复: JS注入拦截Streamlit裸c键清缓存

Docs: CLAUDE.md (完整项目文档), ROADMAP.md (改进路线图)

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-05-19 15:02:53 +08:00
parent b280c2b453
commit 70614dff5e
19 changed files with 1770 additions and 231 deletions
@@ -0,0 +1,494 @@
+"""A4 图片模板布局分析器。
+
+检测上传图片并逐行识别每个元素的：
+- 位置 (x, y, w, h)
+- 字体大小（基于 OCR 边界框高度估算）
+- 文本内容
+
+支持三种模式：
+- 完整 A4 模板：比例匹配 + OCR 元素 ≥2 → 全量布局描述
+- 行片段（非 A4 但有元素）：视为 A4 中的某几行 → 部分布局描述
+- 修改匹配：将图片中的行与现有 JRXML 做匹配，定位修改位置
+
+用法:
+    from backend.layout_analyzer import analyze_layout, match_rows_to_jrxml
+    result = analyze_layout("row_snippet.png")
+    # result["template_type"] = "partial_rows"
+    match = match_rows_to_jrxml(result, current_jrxml)
+    # match["matched_rows"] = [{"row_index": 0, "jrxml_section": "detail", ...}]
+"""
+
+import re
+import xml.etree.ElementTree as ET
+from pathlib import Path
+from typing import Optional
+
+import PIL.Image
+
+# ISO A4 paper is 210 mm x 297 mm, so short side / long side is about 0.707.
+A4_RATIO = 210 / 297
+# Aspect-ratio window reported as an "exact" A4 match.
+A4_RATIO_EXACT_MIN = 0.686
+A4_RATIO_EXACT_MAX = 0.728
+# Wider aspect-ratio window reported as a "close" A4 match.
+A4_RATIO_CLOSE_MIN = 0.650
+A4_RATIO_CLOSE_MAX = 0.764
+
+
+def analyze_layout(
+    file_path: str,
+    row_tolerance_ratio: float = 0.02,
+) -> dict:
+    """Analyse the report-template layout of an image or PDF file.
+
+    The file is loaded as an image, classified by aspect ratio against the
+    A4 windows, run through OCR, and the detected text boxes are grouped
+    into rows.
+
+    Args:
+        file_path: Path of the file to analyse.
+        row_tolerance_ratio: Fraction of the image height within which two
+            element centres are treated as belonging to the same row.
+
+    Returns:
+        A dict with the keys ``is_a4_template``, ``is_partial``,
+        ``template_type`` ("full_a4" | "partial_rows" | "unknown"),
+        ``image_size`` (w, h), ``aspect_ratio``, ``a4_confidence``
+        ("exact" | "close" | "not_a4"), ``rows``, ``description``,
+        ``total_rows`` and ``total_elements``. When the file is missing or
+        cannot be loaded, the empty result carrying an error message in
+        ``description`` is returned instead.
+    """
+    source = Path(file_path)
+    if not source.exists():
+        return _empty_result("文件不存在")
+
+    image = _load_image(source)
+    if image is None:
+        return _empty_result("无法加载图片")
+
+    width, height = image.size
+    # Orientation-independent ratio: short side over long side.
+    aspect = min(width, height) / max(width, height)
+
+    if A4_RATIO_EXACT_MIN <= aspect <= A4_RATIO_EXACT_MAX:
+        confidence = "exact"
+    elif A4_RATIO_CLOSE_MIN <= aspect <= A4_RATIO_CLOSE_MAX:
+        confidence = "close"
+    else:
+        confidence = "not_a4"
+
+    detected = _ocr_elements(image, file_path)
+    # Without any OCR hit there is nothing to group; every count stays zero
+    # and the template type falls through to "unknown".
+    grouped = (
+        _group_into_rows(detected, height, row_tolerance_ratio)
+        if detected
+        else []
+    )
+    element_count = sum(len(row["elements"]) for row in grouped)
+
+    # A full template needs an A4-like ratio plus at least two elements;
+    # anything else that still contains text is treated as a row snippet.
+    full_page = confidence != "not_a4" and element_count >= 2
+    snippet = not full_page and element_count >= 1
+
+    if full_page:
+        kind = "full_a4"
+    elif snippet:
+        kind = "partial_rows"
+    else:
+        kind = "unknown"
+
+    return {
+        "is_a4_template": full_page,
+        "is_partial": snippet,
+        "template_type": kind,
+        "image_size": (width, height),
+        "aspect_ratio": round(aspect, 3),
+        "a4_confidence": confidence,
+        "rows": grouped,
+        "description": _build_description(
+            grouped, width, height, confidence, kind
+        ),
+        "total_rows": len(grouped),
+        "total_elements": element_count,
+    }
+
+
+def match_rows_to_jrxml(
+    layout_result: dict,
+    current_jrxml: str,
+) -> dict:
+    """Match the OCR rows of an image against the bands of a JRXML document.
+
+    For every row, the OCR texts are scored against the text of each parsed
+    JRXML section; the best-scoring section wins if its score exceeds 0.3.
+
+    Args:
+        layout_result: Result dict whose ``rows`` entry lists the OCR rows.
+        current_jrxml: JRXML source to match against.
+
+    Returns:
+        A dict with ``matched`` (bool), ``matched_rows``, ``unmatched_rows``
+        and a human-readable ``description``.
+    """
+    image_rows = layout_result.get("rows", [])
+    if not image_rows or not current_jrxml.strip():
+        return {"matched": False, "matched_rows": [], "unmatched_rows": image_rows,
+                "description": "无行数据或 JRXML 为空"}
+
+    candidates = _parse_jrxml_sections(current_jrxml)
+
+    hits = []
+    misses = []
+
+    for index, image_row in enumerate(image_rows):
+        texts = [item["text"] for item in image_row["elements"]]
+        # max() keeps the first of several equally good sections, exactly
+        # like a strict ">" comparison while scanning in order.
+        top_score, top_section = max(
+            ((_text_similarity(texts, sec["text_content"]), sec) for sec in candidates),
+            key=lambda pair: pair[0],
+            default=(0, None),
+        )
+
+        if top_section and top_score > 0.3:  # minimum score for a match
+            hits.append({
+                "row_index": index,
+                "row_y_center": image_row["y_center"],
+                "jrxml_section": top_section["name"],
+                "jrxml_section_type": top_section["type"],
+                "confidence": round(top_score, 2),
+                "matched_text": top_section["text_content"][:200],
+            })
+            continue
+
+        misses.append({
+            "row_index": index,
+            "row_y_center": image_row["y_center"],
+            "ocr_texts": texts,
+        })
+
+    # Human-readable summary of both outcomes.
+    summary = []
+    if hits:
+        summary.append(f"图片中 {len(hits)} 行匹配到当前 JRXML：")
+        summary.extend(
+            f"  - 图片第 {hit['row_index']+1} 行 → JRXML「{hit['jrxml_section']}」"
+            f"（{hit['jrxml_section_type']}，置信度 {hit['confidence']}）"
+            for hit in hits
+        )
+    if misses:
+        summary.append(f"图片中 {len(misses)} 行未匹配到 JRXML 现有区域：")
+        for miss in misses:
+            preview = ", ".join(miss["ocr_texts"][:3])
+            summary.append(f"  - 图片第 {miss['row_index']+1} 行：{preview}")
+
+    return {
+        "matched": len(hits) > 0,
+        "matched_rows": hits,
+        "unmatched_rows": misses,
+        "description": "\n".join(summary),
+    }
+
+
+def analyze_and_inject(file_path: str, base_prompt: str,
+                       current_jrxml: str = "") -> str:
+    """Analyse the layout of a file and prepend the findings to a prompt.
+
+    - Full A4 template: the complete layout description is prepended.
+    - Row snippet with an existing JRXML: the rows are matched against the
+      JRXML and modification guidance is prepended.
+    - Row snippet without a JRXML: the snippet is described as part of an
+      A4 template.
+    - Any other template type: ``base_prompt`` is returned unchanged.
+    """
+    layout = analyze_layout(file_path)
+    kind = layout.get("template_type", "unknown")
+
+    if kind == "full_a4":
+        return f"[图片模板分析 — 完整 A4 报表]\n{layout['description']}\n\n---\n原始需求:\n{base_prompt}"
+
+    if kind != "partial_rows":
+        return base_prompt
+
+    if not current_jrxml.strip():
+        # No report exists yet: treat the snippet as the seed of a full page.
+        return (
+            f"[图片模板分析 — 行片段（无现有报表，按 A4 模板处理）]\n"
+            f"图片包含 {layout['total_rows']} 行，请按 A4 报表模板的需求输出整张报表。\n"
+            f"{layout['description']}\n\n"
+            f"---\n原始需求:\n{base_prompt}"
+        )
+
+    outcome = match_rows_to_jrxml(layout, current_jrxml)
+    if not outcome["matched"]:
+        return (
+            f"[图片模板分析 — 行片段（未匹配到现有区域）]\n"
+            f"图片包含 {layout['total_rows']} 行。\n"
+            f"{layout['description']}\n\n"
+            f"---\n请根据以上行结构，在 JRXML 中找到合适位置进行修改：\n{base_prompt}"
+        )
+
+    return (
+        f"[图片模板分析 — 行片段修改]\n"
+        f"图片包含 {layout['total_rows']} 行，视为 A4 模板的一部分。\n"
+        f"{outcome['description']}\n\n"
+        f"{layout['description']}\n\n"
+        f"---\n请根据以上匹配结果，修改 JRXML 中对应区域的布局：\n{base_prompt}"
+    )
+
+
+# ---------------------------------------------------------------------------
+# JRXML 结构解析
+# ---------------------------------------------------------------------------
+
+def _parse_jrxml_sections(jrxml: str) -> list[dict]:
+    """Collect the ``band`` children of every known JRXML section element.
+
+    The document is parsed with ElementTree; each ``band`` that is a direct
+    child of a recognised section tag becomes one entry holding ``name``,
+    ``type`` and the serialised band as ``text_content``. If that yields
+    nothing (including when parsing fails), the regex-based parser is used
+    instead.
+    """
+    known_sections = {
+        "title", "pageHeader", "columnHeader", "detail",
+        "columnFooter", "pageFooter", "summary", "background",
+        "noData", "groupHeader", "groupFooter",
+    }
+    found = []
+
+    try:
+        for node in ET.fromstring(jrxml).iter():
+            kind = _tag(node)
+            if kind in known_sections:
+                for band in node:
+                    if _tag(band) != "band":
+                        continue
+                    band_name = band.get("name", "")
+                    found.append({
+                        "name": f"{kind}[{band_name}]" if band_name else kind,
+                        "type": kind,
+                        "text_content": ET.tostring(band, encoding="unicode"),
+                    })
+    except Exception:
+        # Best effort: whatever was collected so far is kept.
+        pass
+
+    # Fallback when structured parsing produced no section at all.
+    return found if found else _parse_jrxml_regex(jrxml)
+
+
+def _tag(elem) -> str:
+    """Return the element's tag name without a ``{namespace}`` prefix."""
+    qualified = elem.tag
+    if "}" not in qualified:
+        return qualified
+    return qualified.split("}")[-1]
+
+
+def _parse_jrxml_regex(jrxml: str) -> list[dict]:
+    """Regex fallback: locate section-wrapped ``band`` blocks in raw text.
+
+    Only attribute-less section tags that directly wrap a ``<band>`` are
+    recognised. Each hit yields a dict with ``name`` and ``type`` (both the
+    section tag) and ``text_content`` (the band markup).
+    """
+    band_pattern = re.compile(
+        r'<(title|pageHeader|columnHeader|detail|columnFooter|pageFooter|summary|background|noData|groupHeader|groupFooter)>\s*'
+        r'(<band[^>]*>.*?</band>)\s*'
+        r'</\1>',
+        re.DOTALL,
+    )
+    return [
+        {
+            "name": found.group(1),
+            "type": found.group(1),
+            # Store only the band (group 2), not the whole match: this
+            # mirrors the ElementTree-based parser and keeps the wrapping
+            # section tag from counting as matchable text.
+            "text_content": found.group(2),
+        }
+        for found in band_pattern.finditer(jrxml)
+    ]
+
+
+def _text_similarity(ocr_texts: list[str], jrxml_text: str) -> float:
+    """Score how well a list of OCR strings is reflected in a JRXML text.
+
+    Each non-blank OCR string contributes 1.0 when it occurs verbatim
+    (case-insensitively) in ``jrxml_text``; otherwise it contributes half of
+    the fraction of its ``\\w+`` words that occur there. The result is the
+    mean contribution, capped at 1.0.
+
+    Blank strings are ignored: an empty string is a substring of any text
+    and would otherwise count as a perfect match.
+
+    Returns:
+        A score in ``[0.0, 1.0]``; 0.0 when either input is empty or every
+        OCR string is blank.
+    """
+    if not ocr_texts or not jrxml_text:
+        return 0.0
+
+    candidates = [text for text in ocr_texts if text and text.strip()]
+    if not candidates:
+        return 0.0
+
+    haystack = jrxml_text.lower()
+    score = 0.0
+    for text in candidates:
+        if text.lower() in haystack:
+            # Verbatim occurrence.
+            score += 1.0
+            continue
+        # Partial credit for individual words that occur.
+        words = re.findall(r"\w+", text)
+        if words:
+            found = sum(1 for word in words if word.lower() in haystack)
+            score += found / len(words) * 0.5
+
+    return min(score / len(candidates), 1.0)
+
+
+# ---------------------------------------------------------------------------
+# 内部实现（不变）
+# ---------------------------------------------------------------------------
+
+def _load_image(path: Path) -> Optional[PIL.Image.Image]:
+    """Load a raster image, or the first page of a PDF, as an RGB image.
+
+    Args:
+        path: File to load; the format is chosen from the file suffix.
+
+    Returns:
+        An RGB ``PIL.Image.Image``, or ``None`` when the suffix is not
+        supported or loading fails. PDFs are rendered with pdfplumber first
+        and with PyMuPDF (``fitz``) as a fallback, both at 150 dpi.
+    """
+    suffix = path.suffix.lower()
+
+    if suffix in (".png", ".jpg", ".jpeg", ".bmp", ".webp"):
+        try:
+            # convert() returns an independent image, so the source file
+            # handle can be released right away instead of lingering
+            # until garbage collection.
+            with PIL.Image.open(path) as source:
+                return source.convert("RGB")
+        except Exception:
+            return None
+
+    if suffix == ".pdf":
+        try:
+            import pdfplumber
+            with pdfplumber.open(path) as pdf:
+                if pdf.pages:
+                    rendered = pdf.pages[0].to_image(resolution=150)
+                    return rendered.original.convert("RGB")
+        except Exception:
+            # Best effort: fall through to the PyMuPDF renderer.
+            pass
+
+        try:
+            import fitz
+            doc = fitz.open(path)
+            try:
+                pix = doc[0].get_pixmap(dpi=150)
+                return PIL.Image.frombytes(
+                    "RGB", (pix.width, pix.height), pix.samples
+                )
+            finally:
+                # Close the document even when rendering raises.
+                doc.close()
+        except Exception:
+            pass
+
+    return None
+
+
+def _ocr_elements(img: PIL.Image.Image, file_path: str) -> list[dict]:
+    """Run PaddleOCR on an image and return one dict per detected text box.
+
+    Each dict holds ``x``, ``y``, ``w``, ``h`` (axis-aligned bounds of the
+    detected box, rounded to one decimal), ``font_size`` (the box height)
+    and the stripped ``text``. The list is ordered by ``(y, x)``. Any
+    failure, including a missing OCR dependency, yields an empty list.
+    ``file_path`` is not used by this function.
+    """
+    try:
+        from paddleocr import PaddleOCR
+        import numpy as np
+
+        engine = PaddleOCR(lang="ch", use_angle_cls=True, show_log=False)
+        pages = engine.ocr(np.array(img))
+
+        detected = []
+        first_page = pages[0] if pages else None
+        for entry in (first_page or []):
+            if len(entry) < 2:
+                continue
+            corners, info = entry[0], entry[1]
+            if isinstance(info, (list, tuple)):
+                raw_text = info[0]
+            else:
+                raw_text = str(info)
+            if not raw_text.strip():
+                continue
+
+            # Axis-aligned bounds of the detected box.
+            xs = [corner[0] for corner in corners]
+            ys = [corner[1] for corner in corners]
+            left, right = min(xs), max(xs)
+            top, bottom = min(ys), max(ys)
+            box_height = round(bottom - top, 1)
+
+            detected.append({
+                "x": round(left, 1),
+                "y": round(top, 1),
+                "w": round(right - left, 1),
+                "h": box_height,
+                # The box height doubles as the font-size estimate.
+                "font_size": box_height,
+                "text": raw_text.strip(),
+            })
+
+        detected.sort(key=lambda item: (item["y"], item["x"]))
+        return detected
+    except Exception:
+        return []
+
+
+def _group_into_rows(elements: list[dict], img_height: int,
+                     tolerance_ratio: float = 0.02) -> list[dict]:
+    """Split elements, taken in the given order, into visual rows.
+
+    An element joins the current row while its vertical centre lies within
+    ``img_height * tolerance_ratio`` of the centre of that row's first
+    element; otherwise the current row is closed and a new one starts.
+    """
+    if not elements:
+        return []
+
+    max_offset = img_height * tolerance_ratio
+
+    def _center(item: dict) -> float:
+        return item["y"] + item["h"] / 2
+
+    grouped = []
+    head, *rest = elements
+    bucket = [head]
+
+    for item in rest:
+        # The first element of the open row is the anchor for comparison.
+        anchor_cy = _center(bucket[0])
+        if abs(_center(item) - anchor_cy) < max_offset:
+            bucket.append(item)
+            continue
+        grouped.append(_build_row(bucket))
+        bucket = [item]
+
+    grouped.append(_build_row(bucket))
+    return grouped
+
+
+def _build_row(elements: list[dict]) -> dict:
+    """Build a row record from the elements that share one visual row.
+
+    Args:
+        elements: Non-empty list of element dicts with ``x`` and ``y``
+            (and normally ``h``). The list itself is left untouched.
+
+    Returns:
+        ``{"y_center": float, "elements": list}`` where ``elements`` is a
+        new list ordered left to right and ``y_center`` is the mean
+        vertical centre (``y + h / 2``) rounded to one decimal. An element
+        without ``h`` is treated as having zero height.
+    """
+    ordered = sorted(elements, key=lambda item: item["x"])
+    # Use the box centres, matching how rows are grouped, rather than the
+    # top edges.
+    centers = [item["y"] + item.get("h", 0) / 2 for item in ordered]
+    return {"y_center": round(sum(centers) / len(centers), 1), "elements": ordered}
+
+
+def _element_label(index: int) -> str:
+    """Return a spreadsheet-style label: 0 -> a, 25 -> z, 26 -> aa, ..."""
+    label = ""
+    remaining = index + 1
+    while remaining > 0:
+        remaining, offset = divmod(remaining - 1, 26)
+        label = chr(ord("a") + offset) + label
+    return label
+
+
+def _build_description(rows: list[dict], img_w: int, img_h: int,
+                       a4_confidence: str, template_type: str) -> str:
+    """Render the detected rows as a textual layout description.
+
+    Args:
+        rows: Row dicts, each with an ``elements`` list of element dicts
+            (``x``, ``y``, ``w``, ``h``, ``font_size``, ``text``).
+        img_w: Image width in pixels.
+        img_h: Image height in pixels.
+        a4_confidence: Accepted for signature compatibility; not used in
+            the generated text.
+        template_type: "full_a4", "partial_rows" or anything else; selects
+            the header and the closing instruction.
+
+    Returns:
+        A multi-line description, or a single "no text detected" line when
+        ``rows`` is empty.
+    """
+    if not rows:
+        if template_type == "partial_rows":
+            return f"图片 {img_w}x{img_h}（非 A4 比例），未检测到文字元素。"
+        return f"图片共 {img_w}x{img_h} 像素，未检测到文字元素。"
+
+    lines = []
+    if template_type == "full_a4":
+        lines.append(f"图片为完整 A4 报表模板，共 {len(rows)} 行，像素区域 {img_w}x{img_h}：")
+    elif template_type == "partial_rows":
+        lines.append(f"图片为报表模板行片段（非完整 A4），包含 {len(rows)} 行，"
+                     f"像素区域 {img_w}x{img_h}，请按 A4 模板处理：")
+    else:
+        lines.append(f"图片共 {img_w}x{img_h} 像素，包含 {len(rows)} 行文字：")
+
+    for row_no, row in enumerate(rows, start=1):
+        elems = row["elements"]
+        lines.append(f"\n第 {row_no} 行有 {len(elems)} 个元素：")
+        for position, e in enumerate(elems):
+            # Labels continue as aa, ab, ... past the 26th element instead
+            # of running into punctuation characters.
+            letter = _element_label(position)
+            lines.append(
+                f"  元素 {letter}：位置(x={e['x']}, y={e['y']})，"
+                f"长 {e['w']}px，高 {e['h']}px，"
+                f"字体 {e['font_size']}px，"
+                f"内容「{e['text']}」"
+            )
+
+    if template_type == "full_a4":
+        lines.append("\n请根据以上布局生成对应的 JRXML 报表模板。")
+    elif template_type == "partial_rows":
+        lines.append(f"\n请将以上 {len(rows)} 行作为 A4 模板的一部分，"
+                     "生成或修改对应的 JRXML 报表区域。")
+
+    return "\n".join(lines)
+
+
+def _empty_result(error: str = "") -> dict:
+    """Return the result dict used when no layout could be analysed.
+
+    All flags are false, all counts are zero, and ``error`` is carried in
+    the ``description`` field.
+    """
+    return dict(
+        is_a4_template=False,
+        is_partial=False,
+        template_type="unknown",
+        image_size=(0, 0),
+        aspect_ratio=0,
+        a4_confidence="not_a4",
+        rows=[],
+        description=error,
+        total_rows=0,
+        total_elements=0,
+    )