fix: band-level windowed refine_layout + programmatic map_fields to prevent 91.5% content loss

Root cause: LLM receiving full 34k-char JRXML would regenerate from scratch
instead of modifying coordinates in-place, shrinking output to ~3k chars.

Solution (programmatic node control, not prompt engineering):

- New agent/jrxml_windower.py: decompose JRXML into header (never sent to
  LLM) + individual bands. Split bands >4000 chars at element boundaries.
  Reassemble with element count validation (>10% change = rollback).

- Rewrite refine_layout: per-band windowed LLM processing (~2-4k chars
  each). LLM cannot "reimagine" the entire report.

- Rewrite map_fields: 100% programmatic regex $F{field_N} -> real name
  replacement. Zero LLM calls, zero content loss.

- _sanitize_field_name: non-ASCII chars escaped to _uXXXX_ format for
  valid JRXML identifiers.

- Tests: 48 new unit tests (windower 28 + map_fields 20). All passing.
  Full suite 385 tests, zero regressions.
This commit is contained in:
2026-05-24 08:55:38 +08:00
parent bb6cc6e241
commit bd5bfbac2d
80 changed files with 39463 additions and 108 deletions
+214
View File
@@ -0,0 +1,214 @@
"""kb_searcher.py 测试 — KBChromaSearcher 创建, 搜索, 模板检索。"""
import sys
import tempfile
from pathlib import Path
from unittest.mock import patch, MagicMock
import pytest
sys.path.insert(0, str(Path(__file__).parent.parent))
from backend.kb_searcher import (
KBChromaSearcher, get_kb_searcher, search_kb, search_templates_in_kb,
)
@pytest.fixture
def mock_chromadb(monkeypatch):
mock_client = MagicMock()
mock_collection = MagicMock()
mock_client.get_collection.return_value = mock_collection
mock_client.create_collection.return_value = mock_collection
monkeypatch.setattr(
"chromadb.PersistentClient",
lambda path: mock_client)
mock_st = MagicMock()
mock_st_model = MagicMock()
mock_st_model.encode.return_value = MagicMock()
mock_st_model.encode.return_value.tolist.return_value = [0.1, 0.2, 0.3]
mock_st.return_value = mock_st_model
monkeypatch.setattr("sentence_transformers.SentenceTransformer", mock_st)
yield {"client": mock_client, "collection": mock_collection,
"st_model": mock_st_model, "st": mock_st}
@pytest.fixture
def searcher(mock_chromadb):
with tempfile.TemporaryDirectory(prefix="test_chroma_") as tmpdir:
s = KBChromaSearcher(chroma_path=tmpdir, collection_name="test_kb")
s._model = mock_chromadb["st_model"]
s._client = mock_chromadb["client"]
s._collection = mock_chromadb["collection"]
yield s
# ── 创建 & 就绪检查 ─────────────────────────────────────────────
class TestKBChromaSearcherInit:
def test_creates_with_defaults(self, mock_chromadb):
with tempfile.TemporaryDirectory(prefix="test_chroma_") as tmpdir:
s = KBChromaSearcher(chroma_path=tmpdir)
assert s.collection_name == "kb_chunks"
assert s.chroma_path == str(tmpdir)
def test_custom_collection_name(self, mock_chromadb):
with tempfile.TemporaryDirectory(prefix="test_chroma_") as tmpdir:
s = KBChromaSearcher(chroma_path=tmpdir, collection_name="custom")
assert s.collection_name == "custom"
def test_model_lazy_loaded(self, mock_chromadb):
with tempfile.TemporaryDirectory(prefix="test_chroma_") as tmpdir:
s = KBChromaSearcher(chroma_path=tmpdir)
assert s._model is None
def test_is_ready_true(self, searcher):
assert searcher.is_ready() is True
def test_is_ready_false(self, searcher, mock_chromadb):
mock_chromadb["client"].get_collection.side_effect = Exception("no collection")
assert searcher.is_ready() is False
# ── 搜索 ────────────────────────────────────────────────────────
class TestSearch:
def test_search_returns_empty_when_not_ready(self, searcher, mock_chromadb):
mock_chromadb["client"].get_collection.side_effect = Exception("no collection")
results = searcher.search("test query")
assert results == []
def test_search_calls_collection_query(self, searcher, mock_chromadb):
mock_chromadb["collection"].query.return_value = {
"ids": [["chunk_0", "chunk_1"]],
"documents": [["doc1", "doc2"]],
"metadatas": [[{"chunk_type": "md"}, {"chunk_type": "txt"}]],
"distances": [[0.1, 0.3]],
}
results = searcher.search("query", k=5)
assert len(results) == 2
assert results[0]["id"] == "chunk_0"
assert results[0]["content"] == "doc1"
assert results[0]["metadata"]["chunk_type"] == "md"
assert results[0]["distance"] == 0.1
def test_search_respects_threshold(self, searcher, mock_chromadb):
mock_chromadb["collection"].query.return_value = {
"ids": [["chunk_0", "chunk_1"]],
"documents": [["doc1", "doc2"]],
"metadatas": [[{}, {}]],
"distances": [[0.2, 0.8]],
}
results = searcher.search("query", threshold=0.5)
assert len(results) == 1
assert results[0]["id"] == "chunk_0"
def test_search_empty_results(self, searcher, mock_chromadb):
mock_chromadb["collection"].query.return_value = {
"ids": [[]], "documents": [[]], "metadatas": [[]], "distances": [[]],
}
assert searcher.search("query") == []
# ── 模板搜索 ────────────────────────────────────────────────────
class TestSearchTemplates:
def test_filters_jrxml_chunks(self, searcher, mock_chromadb):
mock_chromadb["collection"].query.return_value = {
"ids": [["c0", "c1", "c2"]],
"documents": [["t1", "t2", "t3"]],
"metadatas": [[
{"chunk_type": "jrxml_template", "report_name": "R1"},
{"chunk_type": "md_section"},
{"chunk_type": "jrxml_template", "report_name": "R2"},
]],
"distances": [[0.1, 0.2, 0.3]],
}
templates = searcher.search_templates("query", k=3)
assert len(templates) >= 1
for t in templates:
meta = t["metadata"]
assert "jrxml" in meta.get("chunk_type", "").lower() or meta.get("report_name")
def test_no_jrxml_chunks_returns_empty(self, searcher, mock_chromadb):
mock_chromadb["collection"].query.return_value = {
"ids": [["c0"]],
"documents": [["text"]],
"metadatas": [[{"chunk_type": "md_section"}]],
"distances": [[0.1]],
}
templates = searcher.search_templates("query")
assert templates == []
# ── search_as_context ───────────────────────────────────────────
class TestSearchAsContext:
def test_returns_formatted_context(self, searcher, mock_chromadb):
mock_chromadb["collection"].query.return_value = {
"ids": [["c0", "c1"]],
"documents": [["内容1", "内容2"]],
"metadatas": [[
{"chunk_type": "md", "report_name": "R1"},
{"chunk_type": "txt"},
]],
"distances": [[0.1, 0.2]],
}
ctx = searcher.search_as_context("q", k=2)
assert "内容1" in ctx
assert "内容2" in ctx
assert "类型" in ctx
assert "报表" in ctx
assert "---" in ctx
def test_empty_returns_empty_string(self, searcher, mock_chromadb):
mock_chromadb["collection"].query.return_value = {
"ids": [[]], "documents": [[]], "metadatas": [[]], "distances": [[]],
}
assert searcher.search_as_context("q") == ""
# ── add_chunks ──────────────────────────────────────────────────
class TestAddChunks:
def test_add_chunks_calls_upsert(self, searcher, mock_chromadb):
chunks = [{"id": "c0", "content": "test content", "metadata": {"chunk_type": "md"}}]
searcher.add_chunks(chunks)
mock_chromadb["collection"].upsert.assert_called_once()
kwargs = mock_chromadb["collection"].upsert.call_args[1]
assert kwargs["ids"] == ["c0"]
assert kwargs["documents"] == ["test content"]
def test_empty_chunks_noop(self, searcher, mock_chromadb):
searcher.add_chunks([])
mock_chromadb["collection"].upsert.assert_not_called()
# ── 工厂函数 ────────────────────────────────────────────────────
class TestGetKbSearcher:
def test_returns_cached_instance(self, monkeypatch, mock_chromadb):
with tempfile.TemporaryDirectory(prefix="test_chroma_") as tmpdir:
monkeypatch.setattr(
"backend.kb_manager.get_kb_chroma_path",
lambda kb_id: Path(tmpdir) if kb_id == "abcdef1234567890abcd" else None)
s1 = get_kb_searcher("abcdef1234567890abcd")
s2 = get_kb_searcher("abcdef1234567890abcd")
assert s1 is s2
def test_returns_none_for_invalid_kb(self, monkeypatch):
monkeypatch.setattr(
"backend.kb_manager.get_kb_chroma_path", lambda kb_id: None)
assert get_kb_searcher("deadbeef1234567890abcd") is None
class TestSearchKbFunction:
def test_returns_empty_for_invalid_kb(self, monkeypatch):
monkeypatch.setattr(
"backend.kb_manager.get_kb_chroma_path", lambda kb_id: None)
assert search_kb("deadbeef1234567890abcd", "query") == ""
def test_returns_empty_for_invalid_template_search(self, monkeypatch):
monkeypatch.setattr(
"backend.kb_manager.get_kb_chroma_path", lambda kb_id: None)
assert search_templates_in_kb("deadbeef1234567890abcd", "query") == []