fix: band-level windowed refine_layout + programmatic map_fields to prevent 91.5% content loss

Root cause: LLM receiving full 34k-char JRXML would regenerate from scratch
instead of modifying coordinates in-place, shrinking output to ~3k chars.

Solution (programmatic node control, not prompt engineering):

- New agent/jrxml_windower.py: decompose JRXML into header (never sent to
  LLM) + individual bands. Split bands >4000 chars at element boundaries.
  Reassemble with element count validation (>10% change = rollback).

- Rewrite refine_layout: per-band windowed LLM processing (~2-4k chars
  each). LLM cannot "reimagine" the entire report.

- Rewrite map_fields: 100% programmatic regex $F{field_N} -> real name
  replacement. Zero LLM calls, zero content loss.

- _sanitize_field_name: non-ASCII chars escaped to _uXXXX_ format for
  valid JRXML identifiers.

- Tests: 48 new unit tests (windower 28 + map_fields 20). All passing.
  Full suite 385 tests, zero regressions.
This commit is contained in:
2026-05-24 08:55:38 +08:00
parent bb6cc6e241
commit bd5bfbac2d
80 changed files with 39463 additions and 108 deletions
+134
View File
@@ -0,0 +1,134 @@
"""Initialize the default user and the preset knowledge base.

Parses every JRXML template and interface document under rag/jrxml_source/,
creates the default user "默认用户" and the knowledge base
"F6-汽车维修打印默认知识库", then runs the full parse -> chunk -> embed pipeline.

Usage:
    python scripts/init_default_kb.py
    python scripts/init_default_kb.py --force   # force a rebuild
"""
import os
import sys
import argparse
from pathlib import Path

# Two directory levels above this file; per the usage line in the docstring the
# script lives in scripts/, which makes this the repository root.
PROJECT_ROOT = Path(__file__).resolve().parent.parent
# Put that root first on sys.path before the `backend` imports below run
# (presumably so they resolve when the script is launched directly -- confirm).
sys.path.insert(0, str(PROJECT_ROOT))

from dotenv import load_dotenv

# Called before any project module is imported.
# NOTE(review): presumably the backend modules read configuration from the
# environment at import time -- confirm before reordering.
load_dotenv()

from backend.logger import get_logger

_log = get_logger("init_kb")

# Name used to look up / create the default user.
DEFAULT_USER_NAME = "默认用户"
# Name and description used to look up / create the default knowledge base.
DEFAULT_KB_NAME = "F6-汽车维修打印默认知识库"
DEFAULT_KB_DESC = "预置的汽车维修打印单模板与接口文档知识库"
def find_source_files(source_dir: Path) -> list[str]:
    """Collect every ``.jrxml`` and ``.md`` file under *source_dir*.

    Files directly inside *source_dir* are listed first (``.jrxml`` before
    ``.md``), followed by files found in subdirectories. Within each group the
    paths are sorted, so the returned order is reproducible regardless of the
    order in which the filesystem enumerates directory entries.

    Args:
        source_dir: Directory to scan.

    Returns:
        De-duplicated list of file paths as strings; empty when *source_dir*
        does not exist (a warning is logged in that case).
    """
    if not source_dir.exists():
        _log.warning("源目录不存在: %s", source_dir)
        return []

    # The recursive patterns also match top-level files; the dict both removes
    # those duplicates and remembers first-seen order.
    unique_paths: dict[str, None] = {}
    for pattern in ("*.jrxml", "*.md", "**/*.jrxml", "**/*.md"):
        # Directory enumeration order is filesystem-dependent, so sort each
        # pattern's matches to keep the result deterministic.
        for candidate in sorted(source_dir.glob(pattern)):
            if candidate.is_file():
                unique_paths.setdefault(str(candidate), None)
    return list(unique_paths)
def init_default_kb(force: bool = False) -> dict:
    """Ensure the default user and default KB exist, then build the KB.

    Args:
        force: When true and the default KB already exists, delete it and
            create a fresh one before building.

    Returns:
        A summary dict that always carries ``kb_id`` and ``user_id``:
        * ``status == "already_ready"`` plus ``chunk_count`` when the existing
          KB reports ``parse_status == "ready"`` and no rebuild was forced;
        * ``status == "no_files"`` when no source files were found;
        * otherwise the mapping returned by ``build_kb_from_files`` extended
          with ``kb_id``, ``user_id`` and ``file_count``.
    """
    from backend.kb_manager import create_user, list_users, create_kb, list_kbs
    from backend.kb_parser import build_kb_from_files

    # Step 1: find the default user by name, creating it when absent.
    user = next(
        (entry for entry in list_users() if entry.get("name") == DEFAULT_USER_NAME),
        None,
    )
    if user is not None:
        _log.info("默认用户已存在: %s", user["user_id"])
    else:
        user = create_user(DEFAULT_USER_NAME)
        _log.info("创建默认用户: %s", user["user_id"])
    user_id = user["user_id"]

    # Step 2: find the default KB for that user; optionally drop it first.
    knowledge_base = next(
        (entry for entry in list_kbs(user_id) if entry.get("name") == DEFAULT_KB_NAME),
        None,
    )
    if force and knowledge_base is not None:
        from backend.kb_manager import delete_kb

        delete_kb(knowledge_base["kb_id"])
        knowledge_base = None
        _log.info("强制重建: 已删除旧 KB")

    if knowledge_base is None:
        knowledge_base = create_kb(user_id, DEFAULT_KB_NAME, DEFAULT_KB_DESC)
        _log.info("创建默认知识库: %s", knowledge_base["kb_id"])
    elif knowledge_base.get("parse_status") == "ready":
        # Nothing to do: the existing KB is already built.
        chunk_total = knowledge_base.get("chunk_count", 0)
        _log.info("默认知识库已就绪: chunks=%s", chunk_total)
        return {
            "status": "already_ready",
            "kb_id": knowledge_base["kb_id"],
            "user_id": user_id,
            "chunk_count": chunk_total,
        }
    kb_id = knowledge_base["kb_id"]

    # Step 3: gather the source files.
    files = find_source_files(PROJECT_ROOT.joinpath("rag", "jrxml_source"))
    if not files:
        _log.warning("未找到源文件,跳过构建")
        return {"status": "no_files", "kb_id": kb_id, "user_id": user_id}
    file_total = len(files)
    _log.info("找到 %d 个源文件", file_total)

    # Step 4: build the KB and merge the identifiers into the build result.
    result = build_kb_from_files(kb_id, files)
    _log.info("KB 构建完成: %s", result)
    return {**result, "kb_id": kb_id, "user_id": user_id, "file_count": file_total}
def main():
    """Command-line entry point.

    Parses the optional ``--force`` flag, runs ``init_default_kb`` and prints
    a summary of the result (status, counts, and any per-file errors) to
    stdout.
    """
    parser = argparse.ArgumentParser(description="初始化默认知识库")
    # Help text carries a balanced parenthesis so `--help` output reads cleanly.
    parser.add_argument(
        "--force",
        action="store_true",
        help="强制重建(删除已有 KB)",
    )
    args = parser.parse_args()

    banner = "=" * 60
    print(banner)
    print("JRXML Agent - 默认知识库初始化")
    print(banner)

    result = init_default_kb(force=args.force)

    print(f"\n用户: {DEFAULT_USER_NAME}")
    print(f"知识库: {DEFAULT_KB_NAME}")
    print(f"状态: {result.get('status', 'unknown')}")
    print(f"字段数: {result.get('field_count', 0)}")
    print(f"模板数: {result.get('template_count', 0)}")
    print(f"Chunk数: {result.get('chunk_count', 0)}")

    # Per-file failures are only listed when the result reports any.
    errors = result.get("errors")
    if errors:
        print(f"错误: {len(errors)}")
        for e in errors:
            print(f"  - {e.get('file', '')}: {e.get('error', '')}")
    print("\n初始化完成!")
# Run the initializer only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == "__main__":
    main()