fix: band-level windowed refine_layout + programmatic map_fields to prevent 91.5% content loss
Root cause: LLM receiving full 34k-char JRXML would regenerate from scratch
instead of modifying coordinates in-place, shrinking output to ~3k chars.
Solution (programmatic node control, not prompt engineering):
- New agent/jrxml_windower.py: decompose JRXML into header (never sent to
LLM) + individual bands. Split bands >4000 chars at element boundaries.
Reassemble with element count validation (>10% change = rollback).
- Rewrite refine_layout: per-band windowed LLM processing (~2-4k chars
each). LLM cannot "reimagine" the entire report.
- Rewrite map_fields: 100% programmatic regex $F{field_N} -> real name
replacement. Zero LLM calls, zero content loss.
- _sanitize_field_name: non-ASCII chars escaped to _uXXXX_ format for
valid JRXML identifiers.
- Tests: 48 new unit tests (windower 28 + map_fields 20). All passing.
Full suite 385 tests, zero regressions.
This commit is contained in:
@@ -0,0 +1,200 @@
|
||||
"""程序化字段映射单元测试。
|
||||
|
||||
测试 _programmatic_map_fields 和 _sanitize_field_name
|
||||
的确定性替换行为,以及 validate_element_count 校验。
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import pytest
|
||||
from agent.nodes import _programmatic_map_fields, _sanitize_field_name
|
||||
from agent.jrxml_windower import count_elements, validate_element_count
|
||||
|
||||
# ── Minimal JRXML template (with placeholder fields) ────────────────
#
# Declares three placeholder fields (field_1..field_3). field_1 and field_2
# are each referenced twice (once in the title band and once in the combined
# detail expression); field_3 is referenced once. The template holds four
# <reportElement> nodes in total.
#
# The literal must start directly with the XML declaration and contain no
# stray text between elements, otherwise it is not well-formed XML.
JRXML_WITH_PLACEHOLDERS = """<?xml version="1.0" encoding="UTF-8"?>
<jasperReport name="test" pageWidth="595" pageHeight="842">
  <field name="field_1" class="java.lang.String"/>
  <field name="field_2" class="java.math.BigDecimal"/>
  <field name="field_3" class="java.lang.String"/>
  <queryString><![CDATA[SELECT * FROM t]]></queryString>
  <title>
    <band height="50">
      <staticText>
        <reportElement x="0" y="0" width="100" height="20"/>
        <text><![CDATA[$F{field_1}]]></text>
      </staticText>
      <textField>
        <reportElement x="100" y="0" width="80" height="20"/>
        <textFieldExpression><![CDATA[$F{field_2}]]></textFieldExpression>
      </textField>
      <textField>
        <reportElement x="200" y="0" width="80" height="20"/>
        <textFieldExpression><![CDATA[$F{field_3}]]></textFieldExpression>
      </textField>
    </band>
  </title>
  <detail>
    <band height="30">
      <textField>
        <reportElement x="0" y="0" width="100" height="20"/>
        <textFieldExpression><![CDATA[$F{field_1} + " " + $F{field_2}]]></textFieldExpression>
      </textField>
    </band>
  </detail>
</jasperReport>"""
|
||||
|
||||
|
||||
# ── _sanitize_field_name tests ──────────────────────────────────────


class TestSanitizeFieldName:
    """Pin down the output of ``_sanitize_field_name`` for assorted raw names.

    Each test feeds one raw name into the helper and asserts on the returned
    identifier; no fixtures or shared state are involved.
    """

    def test_ascii_name_passes_through(self):
        """An already-clean lowercase ASCII name is returned unchanged."""
        assert _sanitize_field_name("customer_name") == "customer_name"

    def test_uppercase_lowered(self):
        """Uppercase letters are folded to lowercase."""
        assert _sanitize_field_name("CustomerName") == "customername"

    def test_spaces_replaced(self):
        """A space inside the name becomes an underscore."""
        assert _sanitize_field_name("customer name") == "customer_name"

    def test_chinese_characters_escaped(self):
        """Chinese characters are replaced by their ``uXXXX_`` escapes."""
        result = _sanitize_field_name("发票代码")
        assert "发票" not in result
        assert "u53d1_" in result
        assert "u7968_" in result

    def test_mixed_ascii_chinese(self):
        """The ASCII part survives while the Chinese part is escaped."""
        result = _sanitize_field_name("发票_code")
        assert "_code" in result
        assert "u53d1_" in result

    def test_empty_returns_unnamed(self):
        """An empty name falls back to ``unnamed_field``."""
        assert _sanitize_field_name("") == "unnamed_field"

    def test_all_special_chars_returns_unnamed(self):
        """A name made only of punctuation falls back to ``unnamed_field``."""
        assert _sanitize_field_name("!!!") == "unnamed_field"

    def test_leading_digit_prefixed(self):
        """A name starting with a digit gets an ``f_`` prefix."""
        result = _sanitize_field_name("123abc")
        assert result == "f_123abc"

    def test_consecutive_underscores_collapsed(self):
        """Runs of underscores are collapsed to a single underscore."""
        result = _sanitize_field_name("a__b___c")
        assert result == "a_b_c"

    def test_japanese_characters_escaped(self):
        """Japanese characters are escaped the same way as Chinese ones."""
        result = _sanitize_field_name("請求書")
        assert "請求" not in result
        # Mirror the Chinese case: the first character must show up in the
        # u<4 hex digits>_ form. The code point is computed rather than
        # hard-coded so the expectation cannot drift from the input literal.
        assert f"u{ord('請'):04x}_" in result
|
||||
|
||||
|
||||
# ── _programmatic_map_fields tests ──────────────────────────────────


class TestProgrammaticMapFields:
    """Exercise ``_programmatic_map_fields`` against the placeholder template.

    Every test maps a list of OCR field dicts (``{"field_name": ...}``) onto
    ``JRXML_WITH_PLACEHOLDERS`` and asserts on the returned JRXML text.
    """

    @staticmethod
    def _three_named_fields() -> list[dict[str, str]]:
        """Return a fresh OCR list that names all three placeholders.

        A new list is built on every call so no test can observe mutations
        made by another one. The leading underscore keeps pytest from
        collecting this helper as a test.
        """
        return [
            {"field_name": "customer_name"},
            {"field_name": "total_amount"},
            {"field_name": "invoice_date"},
        ]

    def test_replaces_field_declarations(self):
        """``<field name=...>`` declarations pick up the OCR names."""
        result = _programmatic_map_fields(
            JRXML_WITH_PLACEHOLDERS, self._three_named_fields()
        )
        assert 'field name="customer_name"' in result
        assert 'field name="total_amount"' in result
        assert 'field name="invoice_date"' in result
        assert 'field name="field_1"' not in result

    def test_replaces_field_references(self):
        """``$F{...}`` references pick up the OCR names."""
        result = _programmatic_map_fields(
            JRXML_WITH_PLACEHOLDERS, self._three_named_fields()
        )
        assert "$F{field_1}" not in result
        assert "$F{customer_name}" in result
        assert "$F{total_amount}" in result
        assert "$F{invoice_date}" in result

    def test_preserves_element_count(self):
        """Mapping must not change what ``count_elements`` reports."""
        result = _programmatic_map_fields(
            JRXML_WITH_PLACEHOLDERS, self._three_named_fields()
        )
        orig = count_elements(JRXML_WITH_PLACEHOLDERS)
        mod = count_elements(result)
        assert orig == mod, f"Elements: {orig} -> {mod}"

    def test_preserves_coordinates(self):
        """Coordinate and size attributes are still present after mapping."""
        result = _programmatic_map_fields(
            JRXML_WITH_PLACEHOLDERS, self._three_named_fields()
        )
        assert 'x="0"' in result
        assert 'x="100"' in result
        assert 'x="200"' in result
        assert 'y="0"' in result
        assert 'width="100"' in result
        assert 'height="20"' in result

    def test_partial_fields_preserved(self):
        """With fewer OCR fields than placeholders, extra placeholders stay."""
        ocr = [
            {"field_name": "customer_name"},
            {"field_name": "total_amount"},
        ]
        result = _programmatic_map_fields(JRXML_WITH_PLACEHOLDERS, ocr)
        assert 'field name="field_3"' in result
        assert "$F{field_3}" in result

    def test_empty_field_name_skipped(self):
        """OCR fields with an empty ``field_name`` do not trigger replacement."""
        ocr = [
            {"field_name": ""},
            {"field_name": "total_amount"},
            {"field_name": ""},
        ]
        result = _programmatic_map_fields(JRXML_WITH_PLACEHOLDERS, ocr)
        assert '$F{field_1}' in result
        assert '$F{total_amount}' in result
        assert '$F{field_3}' in result

    def test_no_ocr_fields_no_change(self):
        """An empty OCR list returns the template unchanged."""
        result = _programmatic_map_fields(JRXML_WITH_PLACEHOLDERS, [])
        assert result == JRXML_WITH_PLACEHOLDERS

    def test_chinese_field_names_sanitized(self):
        """Raw Chinese OCR names never appear verbatim in the output."""
        raw_names = ("发票代码", "发票号码", "金额")
        ocr = [{"field_name": raw} for raw in raw_names]
        result = _programmatic_map_fields(JRXML_WITH_PLACEHOLDERS, ocr)
        # Check every supplied name, not only the first one.
        for raw in raw_names:
            assert raw not in result

    def test_validate_element_count_passes(self):
        """``validate_element_count`` accepts the mapped output."""
        result = _programmatic_map_fields(
            JRXML_WITH_PLACEHOLDERS, self._three_named_fields()
        )
        validation = validate_element_count(
            JRXML_WITH_PLACEHOLDERS, result, "map_fields"
        )
        assert validation["ok"] is True
        assert validation["modified"] == validation["original"]

    def test_expression_with_multiple_fields(self):
        """Expressions holding several ``$F{}`` references are all replaced."""
        ocr = [
            {"field_name": "unit_price"},
            {"field_name": "quantity"},
        ]
        result = _programmatic_map_fields(JRXML_WITH_PLACEHOLDERS, ocr)
        assert '$F{unit_price}' in result
        assert '$F{quantity}' in result
        assert '$F{field_1}' not in result
        assert '$F{field_2}' not in result
        # The detail band combines both references in one expression; make
        # sure that exact expression was rewritten in place.
        assert '$F{unit_price} + " " + $F{quantity}' in result
|
||||
Reference in New Issue
Block a user