fix: band-level windowed refine_layout + programmatic map_fields to prevent 91.5% content loss
Root cause: LLM receiving full 34k-char JRXML would regenerate from scratch
instead of modifying coordinates in-place, shrinking output to ~3k chars.
Solution (programmatic node control, not prompt engineering):
- New agent/jrxml_windower.py: decompose JRXML into header (never sent to
LLM) + individual bands. Split bands >4000 chars at element boundaries.
Reassemble with element count validation (>10% change = rollback).
- Rewrite refine_layout: per-band windowed LLM processing (~2-4k chars
each). LLM cannot "reimagine" the entire report.
- Rewrite map_fields: 100% programmatic regex $F{field_N} -> real name
replacement. Zero LLM calls, zero content loss.
- _sanitize_field_name: non-ASCII chars escaped to _uXXXX_ format for
valid JRXML identifiers.
- Tests: 48 new unit tests (windower 28 + map_fields 20). All passing.
Full suite 385 tests, zero regressions.
This commit is contained in:
@@ -0,0 +1,134 @@
|
||||
"""初始化默认用户和预置知识库。
|
||||
|
||||
解析 rag/jrxml_source/ 下的全部 JRXML 模板 + 接口文档,
|
||||
创建默认用户 "默认用户" 和知识库 "F6-汽车维修打印默认知识库",
|
||||
执行完整的 parse -> chunk -> embed 管线。
|
||||
|
||||
用法:
|
||||
python scripts/init_default_kb.py
|
||||
python scripts/init_default_kb.py --force # 强制重建
|
||||
"""
|
||||
|
||||
import os
import sys
import argparse
from pathlib import Path

# Repository root: two directory levels above this file's resolved location.
PROJECT_ROOT = Path(__file__).resolve().parent.parent
# Put the repository root first on the import path. Presumably this is what
# lets the `backend` imports below resolve when the file is run directly as a
# script -- confirm against the project layout.
sys.path.insert(0, str(PROJECT_ROOT))

from dotenv import load_dotenv

# Called before any backend module is imported. NOTE(review): presumably so
# that values from a .env file are already in the environment when the backend
# modules are imported -- confirm.
load_dotenv()

from backend.logger import get_logger

# Module-level logger used by the functions in this script.
_log = get_logger("init_kb")

# Names used to look up (and, if missing, create) the default user and the
# default knowledge base.
DEFAULT_USER_NAME = "默认用户"
DEFAULT_KB_NAME = "F6-汽车维修打印默认知识库"
# Description passed to create_kb when the default knowledge base is created.
DEFAULT_KB_DESC = "预置的汽车维修打印单模板与接口文档知识库"
|
||||
|
||||
|
||||
def find_source_files(source_dir: Path) -> list[str]:
    """Collect every ``.jrxml`` and ``.md`` file found under *source_dir*.

    Args:
        source_dir: Directory to scan, including its sub-directories.

    Returns:
        The matching file paths as strings, without duplicates. Files that sit
        directly in *source_dir* are listed before files that are only
        reachable through the recursive patterns. An empty list is returned
        (and a warning is logged) when *source_dir* does not exist.
    """
    if not source_dir.exists():
        _log.warning("源目录不存在: %s", source_dir)
        return []

    # The recursive patterns match the top-level files a second time; the
    # duplicates are dropped below while keeping first-seen order.
    patterns = ("*.jrxml", "*.md", "**/*.jrxml", "**/*.md")
    matches = [
        str(candidate)
        for pattern in patterns
        for candidate in source_dir.glob(pattern)
        if candidate.is_file()
    ]
    return list(dict.fromkeys(matches))
|
||||
|
||||
|
||||
def init_default_kb(force: bool = False) -> dict:
    """Ensure the default user and default knowledge base exist, then build the KB.

    Args:
        force: When true and the default KB already exists, delete it and
            build a fresh one.

    Returns:
        A summary dict that always carries ``kb_id`` and ``user_id``:

        * ``status == "already_ready"`` (plus ``chunk_count``) when an
          existing KB reports ``parse_status == "ready"``;
        * ``status == "no_files"`` when no source files were found;
        * otherwise the mapping returned by ``build_kb_from_files`` extended
          with ``kb_id``, ``user_id`` and ``file_count``.
    """
    from backend.kb_manager import create_user, list_users, create_kb, list_kbs
    from backend.kb_parser import build_kb_from_files

    # Step 1: look up the default user by name, creating it when absent.
    owner = next(
        (user for user in list_users() if user.get("name") == DEFAULT_USER_NAME),
        None,
    )
    if owner is not None:
        _log.info("默认用户已存在: %s", owner["user_id"])
    else:
        owner = create_user(DEFAULT_USER_NAME)
        _log.info("创建默认用户: %s", owner["user_id"])
    user_id = owner["user_id"]

    # Step 2: look up the default KB among this user's KBs.
    target_kb = next(
        (kb for kb in list_kbs(user_id) if kb.get("name") == DEFAULT_KB_NAME),
        None,
    )

    if force and target_kb is not None:
        # A forced rebuild discards the existing KB so a new one is created below.
        from backend.kb_manager import delete_kb

        delete_kb(target_kb["kb_id"])
        target_kb = None
        _log.info("强制重建: 已删除旧 KB")

    if target_kb is None:
        target_kb = create_kb(user_id, DEFAULT_KB_NAME, DEFAULT_KB_DESC)
        _log.info("创建默认知识库: %s", target_kb["kb_id"])
    elif target_kb.get("parse_status") == "ready":
        # Nothing to do: the existing KB is already built.
        chunk_count = target_kb.get("chunk_count", 0)
        _log.info("默认知识库已就绪: chunks=%s", chunk_count)
        return {
            "status": "already_ready",
            "kb_id": target_kb["kb_id"],
            "user_id": user_id,
            "chunk_count": chunk_count,
        }

    kb_id = target_kb["kb_id"]

    # Step 3: gather the source files from <project root>/rag/jrxml_source.
    files = find_source_files(PROJECT_ROOT / "rag" / "jrxml_source")
    if not files:
        _log.warning("未找到源文件,跳过构建")
        return {"status": "no_files", "kb_id": kb_id, "user_id": user_id}

    file_count = len(files)
    _log.info("找到 %d 个源文件", file_count)

    # Step 4: build the KB from the collected files.
    result = build_kb_from_files(kb_id, files)
    _log.info("KB 构建完成: %s", result)

    return {**result, "kb_id": kb_id, "user_id": user_id, "file_count": file_count}
|
||||
|
||||
|
||||
def main():
    """Command-line entry point: parse ``--force``, run the initialisation, print a summary."""
    cli = argparse.ArgumentParser(description="初始化默认知识库")
    cli.add_argument("--force", action="store_true", help="强制重建(删除已有 KB)")
    options = cli.parse_args()

    banner = "=" * 60
    print(banner)
    print("JRXML Agent - 默认知识库初始化")
    print(banner)

    result = init_default_kb(force=options.force)

    # Fixed part of the report; counters missing from the result are shown as 0.
    summary_lines = (
        f"\n用户: {DEFAULT_USER_NAME}",
        f"知识库: {DEFAULT_KB_NAME}",
        f"状态: {result.get('status', 'unknown')}",
        f"字段数: {result.get('field_count', 0)}",
        f"模板数: {result.get('template_count', 0)}",
        f"Chunk数: {result.get('chunk_count', 0)}",
    )
    for line in summary_lines:
        print(line)

    # Per-file errors are listed only when the result contains a non-empty list.
    failures = result.get("errors")
    if failures:
        print(f"错误: {len(failures)} 条")
        for failure in failures:
            print(f" - {failure.get('file', '')}: {failure.get('error', '')}")

    print("\n初始化完成!")
|
||||
|
||||
|
||||
# Run the initialisation only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user