fix: band-level windowed refine_layout + programmatic map_fields to prevent 91.5% content loss

Root cause: LLM receiving full 34k-char JRXML would regenerate from scratch instead of modifying coordinates in-place, shrinking output to ~3k chars. Solution (programmatic node control, not prompt engineering): - New agent/jrxml_windower.py: decompose JRXML into header (never sent to LLM) + individual bands. Split bands >4000 chars at element boundaries. Reassemble with element count validation (>10% change = rollback). - Rewrite refine_layout: per-band windowed LLM processing (~2-4k chars each). LLM cannot "reimagine" the entire report. - Rewrite map_fields: 100% programmatic regex $F{field_N} -> real name replacement. Zero LLM calls, zero content loss. - _sanitize_field_name: non-ASCII chars escaped to _uXXXX_ format for valid JRXML identifiers. - Tests: 48 new unit tests (windower 28 + map_fields 20). All passing. Full suite 385 tests, zero regressions.
2026-05-24 08:55:38 +08:00
parent bb6cc6e241
commit bd5bfbac2d
80 changed files with 39463 additions and 108 deletions
@@ -0,0 +1,157 @@
+"""field_matcher.py 测试 — OCR 字段 → KB 字段匹配, embedding + LLM。"""
+
+import sys
+from pathlib import Path
+from unittest.mock import patch, MagicMock
+
+import pytest
+
+sys.path.insert(0, str(Path(__file__).parent.parent))
+
+from backend.field_matcher import (
+    _cosine_similarity, match_ocr_to_kb, _match_via_llm,
+    format_field_mapping_context,
+)
+
+
+# ── 余弦相似度 ──────────────────────────────────────────────────
+
+class TestCosineSimilarity:
+    def test_identical_vectors(self):
+        assert _cosine_similarity([1, 0, 0], [1, 0, 0]) == 1.0
+
+    def test_orthogonal_vectors(self):
+        assert _cosine_similarity([1, 0, 0], [0, 1, 0]) == 0.0
+
+    def test_opposite_vectors(self):
+        assert _cosine_similarity([1, 0], [-1, 0]) == -1.0
+
+    def test_normalized_vectors_range(self):
+        sim = _cosine_similarity([0.6, 0.8], [0.8, 0.6])
+        assert -1.0 <= sim <= 1.0
+
+
+# ── LLM 匹配 ────────────────────────────────────────────────────
+
+class TestMatchViaLlm:
+    def test_returns_json_mapping(self):
+        mock_llm = MagicMock()
+        mock_response = MagicMock()
+        mock_response.content = '{"工单号": "billNo", "客户": "customerName"}'
+        mock_llm.invoke.return_value = mock_response
+
+        kb_fields = [
+            {"name": "billNo", "description": "工单号", "type": "String"},
+            {"name": "customerName", "description": "客户名称", "type": "String"},
+        ]
+        result = _match_via_llm(["工单号", "客户"], kb_fields, mock_llm)
+        assert result == {"工单号": "billNo", "客户": "customerName"}
+
+    def test_includes_candidates_hint_when_provided(self):
+        mock_llm = MagicMock()
+        mock_response = MagicMock()
+        mock_response.content = '{"工单号": "billNo"}'
+        mock_llm.invoke.return_value = mock_response
+        candidates = {"工单号": [("billNo", 0.85), ("orderId", 0.62)]}
+        result = _match_via_llm(
+            ["工单号"],
+            [{"name": "billNo", "description": "工单号", "type": "String"}],
+            mock_llm, candidates=candidates)
+        call_args = mock_llm.invoke.call_args[0][0]
+        assert "候选" in call_args
+        assert "billNo" in call_args
+
+    def test_llm_error_returns_empty_dict(self):
+        mock_llm = MagicMock()
+        mock_llm.invoke.side_effect = RuntimeError("LLM crash")
+        result = _match_via_llm(["x"], [{"name": "y", "description": "", "type": "String"}], mock_llm)
+        assert result == {}
+
+    def test_llm_returns_invalid_json_returns_empty(self):
+        mock_llm = MagicMock()
+        mock_response = MagicMock()
+        mock_response.content = "not json at all"
+        mock_llm.invoke.return_value = mock_response
+        result = _match_via_llm(["x"], [{"name": "y", "description": "", "type": "String"}], mock_llm)
+        assert result == {}
+
+
+# ── 完整匹配流程 ────────────────────────────────────────────────
+
+class TestMatchOcrToKb:
+    @pytest.fixture(autouse=True)
+    def mock_embed(self):
+        with patch("backend.field_matcher._embed") as mock_embed:
+            def _fake_embed(text):
+                if "billNo" in text or "工单" in text:
+                    return [1.0, 0.0, 0.0]
+                if "customerName" in text or "客户" in text:
+                    return [0.0, 1.0, 0.0]
+                if "amount" in text or "金额" in text:
+                    return [0.0, 0.0, 1.0]
+                return [0.0, 0.0, 0.0]
+            mock_embed.side_effect = _fake_embed
+            yield mock_embed
+
+    def test_matches_without_llm(self):
+        kb_fields = [
+            {"name": "billNo", "description": "工单号", "type": "String"},
+            {"name": "customerName", "description": "客户名称", "type": "String"},
+            {"name": "amount", "description": "金额", "type": "BigDecimal"},
+        ]
+        mapping = match_ocr_to_kb(
+            ["工单号", "客户名称", "金额"], kb_fields, llm=None)
+        assert mapping["工单号"] == "billNo"
+        assert mapping["客户名称"] == "customerName"
+        assert mapping["金额"] == "amount"
+
+    def test_empty_inputs_return_empty(self):
+        assert match_ocr_to_kb([], [], llm=None) == {}
+        assert match_ocr_to_kb(["x"], [], llm=None) == {}
+        assert match_ocr_to_kb([], [{"name": "y", "description": "", "type": "String"}], llm=None) == {}
+
+    def test_low_similarity_not_matched(self):
+        kb_fields = [{"name": "far", "description": "不相关字段", "type": "String"}]
+        mapping = match_ocr_to_kb(["无关"], kb_fields, llm=None)
+        assert "无关" not in mapping or mapping == {}
+
+    def test_uses_llm_when_provided(self):
+        mock_llm = MagicMock()
+        mock_response = MagicMock()
+        mock_response.content = '{"工单号": "billNo", "客户名称": "customerName"}'
+        mock_llm.invoke.return_value = mock_response
+        kb_fields = [
+            {"name": "billNo", "description": "工单号", "type": "String"},
+            {"name": "customerName", "description": "客户", "type": "String"},
+        ]
+        mapping = match_ocr_to_kb(["工单号", "客户名称"], kb_fields, llm=mock_llm)
+        assert mapping["工单号"] == "billNo"
+
+    def test_embedding_failure_falls_back_to_llm(self):
+        mock_llm = MagicMock()
+        mock_response = MagicMock()
+        mock_response.content = '{"工单号": "billNo"}'
+        mock_llm.invoke.return_value = mock_response
+        with patch("backend.field_matcher._embed", side_effect=RuntimeError("model error")):
+            kb_fields = [{"name": "billNo", "description": "工单号", "type": "String"}]
+            mapping = match_ocr_to_kb(["工单号"], kb_fields, llm=mock_llm)
+            assert mapping["工单号"] == "billNo"
+
+
+# ── 格式化上下文 ────────────────────────────────────────────────
+
+class TestFormatFieldMappingContext:
+    def test_formats_mapping_as_table(self):
+        ctx = format_field_mapping_context({"工单号": "billNo", "客户": "customerName"})
+        assert "[字段映射" in ctx
+        assert "$P{billNo}" in ctx
+        assert "$P{customerName}" in ctx
+        assert "工单号" in ctx
+        assert "客户" in ctx
+
+    def test_empty_mapping_returns_empty_string(self):
+        assert format_field_mapping_context({}) == ""
+
+    def test_single_entry(self):
+        ctx = format_field_mapping_context({"发票号码": "invoiceNo"})
+        assert "$P{invoiceNo}" in ctx