fix: band-level windowed refine_layout + programmatic map_fields to prevent 91.5% content loss

Root cause: LLM receiving full 34k-char JRXML would regenerate from scratch
instead of modifying coordinates in-place, shrinking output to ~3k chars.

Solution (programmatic node control, not prompt engineering):

- New agent/jrxml_windower.py: decompose JRXML into header (never sent to
  LLM) + individual bands. Split bands >4000 chars at element boundaries.
  Reassemble with element count validation (>10% change = rollback).

- Rewrite refine_layout: per-band windowed LLM processing (~2-4k chars
  each). LLM cannot "reimagine" the entire report.

- Rewrite map_fields: 100% programmatic regex $F{field_N} -> real name
  replacement. Zero LLM calls, zero content loss.

- _sanitize_field_name: non-ASCII chars escaped to _uXXXX_ format for
  valid JRXML identifiers.

- Tests: 48 new unit tests (windower 28 + map_fields 20). All passing.
  Full suite 385 tests, zero regressions.
This commit is contained in:
2026-05-24 08:55:38 +08:00
parent bb6cc6e241
commit bd5bfbac2d
80 changed files with 39463 additions and 108 deletions
+157
View File
@@ -0,0 +1,157 @@
"""field_matcher.py 测试 — OCR 字段 → KB 字段匹配, embedding + LLM。"""
import sys
from pathlib import Path
from unittest.mock import patch, MagicMock
import pytest
sys.path.insert(0, str(Path(__file__).parent.parent))
from backend.field_matcher import (
_cosine_similarity, match_ocr_to_kb, _match_via_llm,
format_field_mapping_context,
)
# ── 余弦相似度 ──────────────────────────────────────────────────
class TestCosineSimilarity:
    """Unit tests for ``_cosine_similarity`` on small hand-built vectors."""

    def test_identical_vectors(self):
        """Two identical vectors score 1."""
        # The score is a floating-point result, so compare with a tolerance
        # instead of exact equality.
        assert _cosine_similarity([1, 0, 0], [1, 0, 0]) == pytest.approx(1.0)

    def test_orthogonal_vectors(self):
        """Vectors with no shared component score 0."""
        assert _cosine_similarity([1, 0, 0], [0, 1, 0]) == pytest.approx(0.0)

    def test_opposite_vectors(self):
        """Vectors pointing in opposite directions score -1."""
        assert _cosine_similarity([1, 0], [-1, 0]) == pytest.approx(-1.0)

    def test_normalized_vectors_range(self):
        """The score for two unit-length vectors stays inside [-1, 1]."""
        sim = _cosine_similarity([0.6, 0.8], [0.8, 0.6])
        assert -1.0 <= sim <= 1.0
# ── LLM 匹配 ────────────────────────────────────────────────────
class TestMatchViaLlm:
    """Tests for ``_match_via_llm`` driven by a mocked LLM client."""

    @staticmethod
    def _llm_replying(content):
        """Return a mock LLM whose ``invoke`` yields a response carrying *content*."""
        llm = MagicMock()
        llm.invoke.return_value = MagicMock(content=content)
        return llm

    def test_returns_json_mapping(self):
        """A JSON object in the response content comes back as the mapping."""
        mock_llm = self._llm_replying('{"工单号": "billNo", "客户": "customerName"}')
        kb_fields = [
            {"name": "billNo", "description": "工单号", "type": "String"},
            {"name": "customerName", "description": "客户名称", "type": "String"},
        ]

        result = _match_via_llm(["工单号", "客户"], kb_fields, mock_llm)

        assert result == {"工单号": "billNo", "客户": "customerName"}

    def test_includes_candidates_hint_when_provided(self):
        """Candidates passed in must show up in the text handed to the LLM."""
        mock_llm = self._llm_replying('{"工单号": "billNo"}')
        candidates = {"工单号": [("billNo", 0.85), ("orderId", 0.62)]}

        result = _match_via_llm(
            ["工单号"],
            [{"name": "billNo", "description": "工单号", "type": "String"}],
            mock_llm,
            candidates=candidates,
        )

        # First positional argument of the (last) ``invoke`` call.
        prompt = mock_llm.invoke.call_args[0][0]
        assert "候选" in prompt
        assert "billNo" in prompt
        # Supplying candidates should not alter how the reply is turned into a mapping.
        assert result == {"工单号": "billNo"}

    def test_llm_error_returns_empty_dict(self):
        """An exception raised by ``invoke`` results in an empty mapping."""
        mock_llm = MagicMock()
        mock_llm.invoke.side_effect = RuntimeError("LLM crash")

        result = _match_via_llm(
            ["x"], [{"name": "y", "description": "", "type": "String"}], mock_llm)

        assert result == {}

    def test_llm_returns_invalid_json_returns_empty(self):
        """Response content that is not JSON results in an empty mapping."""
        mock_llm = self._llm_replying("not json at all")

        result = _match_via_llm(
            ["x"], [{"name": "y", "description": "", "type": "String"}], mock_llm)

        assert result == {}
# ── 完整匹配流程 ────────────────────────────────────────────────
class TestMatchOcrToKb:
    """Tests for ``match_ocr_to_kb`` with the embedding function replaced by a fake."""

    @pytest.fixture(autouse=True)
    def mock_embed(self):
        """Patch ``backend.field_matcher._embed`` with a keyword-based fake for every test.

        The fake returns one of three orthogonal unit vectors depending on which
        keyword the text contains, and the zero vector when none is present.
        """
        with patch("backend.field_matcher._embed") as mock_embed:
            def _fake_embed(text):
                if "billNo" in text or "工单" in text:
                    return [1.0, 0.0, 0.0]
                if "customerName" in text or "客户" in text:
                    return [0.0, 1.0, 0.0]
                if "amount" in text or "金额" in text:
                    return [0.0, 0.0, 1.0]
                return [0.0, 0.0, 0.0]

            mock_embed.side_effect = _fake_embed
            yield mock_embed

    def test_matches_without_llm(self):
        """With ``llm=None`` each OCR label maps to the KB field sharing its keyword."""
        kb_fields = [
            {"name": "billNo", "description": "工单号", "type": "String"},
            {"name": "customerName", "description": "客户名称", "type": "String"},
            {"name": "amount", "description": "金额", "type": "BigDecimal"},
        ]

        mapping = match_ocr_to_kb(
            ["工单号", "客户名称", "金额"], kb_fields, llm=None)

        assert mapping["工单号"] == "billNo"
        assert mapping["客户名称"] == "customerName"
        assert mapping["金额"] == "amount"

    def test_empty_inputs_return_empty(self):
        """An empty OCR list, an empty KB list, or both yield an empty mapping."""
        assert match_ocr_to_kb([], [], llm=None) == {}
        assert match_ocr_to_kb(["x"], [], llm=None) == {}
        assert match_ocr_to_kb(
            [], [{"name": "y", "description": "", "type": "String"}], llm=None) == {}

    def test_low_similarity_not_matched(self):
        """An OCR label with no similar KB field is left out of the mapping."""
        # Neither string below contains a keyword the fake embedder recognises.
        kb_fields = [{"name": "far", "description": "不相关字段", "type": "String"}]

        mapping = match_ocr_to_kb(["无关"], kb_fields, llm=None)

        # An empty mapping satisfies this too, so no separate ``== {}`` check is needed.
        assert "无关" not in mapping

    def test_uses_llm_when_provided(self):
        """Passing an LLM still yields the expected mapping for the matched label."""
        mock_llm = MagicMock()
        mock_llm.invoke.return_value = MagicMock(
            content='{"工单号": "billNo", "客户名称": "customerName"}')
        kb_fields = [
            {"name": "billNo", "description": "工单号", "type": "String"},
            {"name": "customerName", "description": "客户", "type": "String"},
        ]

        mapping = match_ocr_to_kb(["工单号", "客户名称"], kb_fields, llm=mock_llm)

        # NOTE(review): this does not verify that ``mock_llm.invoke`` was called; the fake
        # embeddings alone may already produce this result — confirm and tighten if intended.
        assert mapping["工单号"] == "billNo"

    def test_embedding_failure_falls_back_to_llm(self):
        """When ``_embed`` raises, the mapping is taken from the LLM reply."""
        mock_llm = MagicMock()
        mock_llm.invoke.return_value = MagicMock(content='{"工单号": "billNo"}')

        # This inner patch takes over from the autouse fixture's fake inside the ``with``.
        with patch("backend.field_matcher._embed", side_effect=RuntimeError("model error")):
            kb_fields = [{"name": "billNo", "description": "工单号", "type": "String"}]
            mapping = match_ocr_to_kb(["工单号"], kb_fields, llm=mock_llm)

        assert mapping["工单号"] == "billNo"
# ── 格式化上下文 ────────────────────────────────────────────────
class TestFormatFieldMappingContext:
    """Checks on the text returned by ``format_field_mapping_context``."""

    def test_formats_mapping_as_table(self):
        """The output carries the header marker, each ``$P{...}`` reference and each label."""
        rendered = format_field_mapping_context(
            {"工单号": "billNo", "客户": "customerName"})
        expected_fragments = (
            "[字段映射",
            "$P{billNo}",
            "$P{customerName}",
            "工单号",
            "客户",
        )
        for fragment in expected_fragments:
            assert fragment in rendered

    def test_empty_mapping_returns_empty_string(self):
        """An empty mapping produces an empty string."""
        rendered = format_field_mapping_context({})
        assert rendered == ""

    def test_single_entry(self):
        """A one-entry mapping still produces the ``$P{...}`` reference for its field."""
        single = {"发票号码": "invoiceNo"}
        rendered = format_field_mapping_context(single)
        assert "$P{invoiceNo}" in rendered