Files
agent_jrxml/scripts/init_default_kb.py
panda bd5bfbac2d fix: band-level windowed refine_layout + programmatic map_fields to prevent 91.5% content loss
Root cause: LLM receiving full 34k-char JRXML would regenerate from scratch
instead of modifying coordinates in-place, shrinking output to ~3k chars.

Solution (programmatic node control, not prompt engineering):

- New agent/jrxml_windower.py: decompose JRXML into header (never sent to
  LLM) + individual bands. Split bands >4000 chars at element boundaries.
  Reassemble with element count validation (>10% change = rollback).

- Rewrite refine_layout: per-band windowed LLM processing (~2-4k chars
  each). LLM cannot "reimagine" the entire report.

- Rewrite map_fields: 100% programmatic regex $F{field_N} -> real name
  replacement. Zero LLM calls, zero content loss.

- _sanitize_field_name: non-ASCII chars escaped to _uXXXX_ format for
  valid JRXML identifiers.

- Tests: 48 new unit tests (windower 28 + map_fields 20). All passing.
  Full suite 385 tests, zero regressions.
2026-05-24 08:55:38 +08:00

135 lines
4.3 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""Initialize the default user and the pre-built knowledge base.

Parses all JRXML templates and interface documents under rag/jrxml_source/,
creates the default user "默认用户" and the knowledge base
"F6-汽车维修打印默认知识库", and runs the full parse -> chunk -> embed pipeline.

Usage:
    python scripts/init_default_kb.py
    python scripts/init_default_kb.py --force  # force a rebuild
"""
import os
import sys
import argparse
from pathlib import Path
# Project root: the parent of the directory that contains this script.
PROJECT_ROOT = Path(__file__).resolve().parent.parent
# Put the project root first on sys.path so the `backend.*` imports below
# resolve when this file is executed directly as a script.
sys.path.insert(0, str(PROJECT_ROOT))
from dotenv import load_dotenv
# Load variables from a .env file before any backend module is imported.
# NOTE(review): presumably backend modules read their configuration from the
# environment at import time - confirm before reordering these lines.
load_dotenv()
from backend.logger import get_logger
# Module-level logger used by every function in this script.
_log = get_logger("init_kb")
# Name of the user that init_default_kb looks up or creates.
DEFAULT_USER_NAME = "默认用户"
# Name of the knowledge base that init_default_kb looks up or creates.
DEFAULT_KB_NAME = "F6-汽车维修打印默认知识库"
# Description passed to create_kb when the default knowledge base is created.
DEFAULT_KB_DESC = "预置的汽车维修打印单模板与接口文档知识库"
def find_source_files(source_dir: Path) -> list[str]:
    """Collect every .jrxml and .md file under ``source_dir``.

    Top-level files come first (``.jrxml`` before ``.md``), followed by files
    found in nested directories. Each glob pass is sorted so the result does
    not depend on the order in which the filesystem enumerates entries.

    Args:
        source_dir: Directory to scan.

    Returns:
        De-duplicated file paths as strings. An empty list is returned (and a
        warning is logged) when ``source_dir`` does not exist.
    """
    if not source_dir.exists():
        _log.warning("源目录不存在: %s", source_dir)
        return []
    # The recursive "**" patterns match the top-level files a second time, so
    # a dict is used as an insertion-ordered set to drop the repeats.
    found = {}
    for pattern in ("*.jrxml", "*.md", "**/*.jrxml", "**/*.md"):
        # Path.glob yields matches in arbitrary order; sort for reproducibility.
        for fp in sorted(source_dir.glob(pattern)):
            if fp.is_file():
                found.setdefault(str(fp), None)
    return list(found)
def init_default_kb(force: bool = False) -> dict:
    """Ensure the default user and knowledge base exist, then build the KB.

    Args:
        force: When true and the default KB already exists, delete it and
            create a fresh one before building.

    Returns:
        A summary dict that always carries ``kb_id`` and ``user_id``:
        * ``status == "already_ready"`` (plus ``chunk_count``) when the
          existing KB reports ``parse_status == "ready"`` and ``force`` is off;
        * ``status == "no_files"`` when no source files were found;
        * otherwise the result of ``build_kb_from_files`` merged with
          ``kb_id``, ``user_id`` and ``file_count``.
    """
    # Imported lazily, as in the rest of this script, so that importing the
    # module itself does not pull in the backend packages.
    from backend.kb_manager import create_user, list_users, create_kb, list_kbs
    from backend.kb_parser import build_kb_from_files

    # Step 1: locate the default user by name, creating it when absent.
    owner = next(
        (usr for usr in list_users() if usr.get("name") == DEFAULT_USER_NAME),
        None,
    )
    if owner is not None:
        _log.info("默认用户已存在: %s", owner["user_id"])
    else:
        owner = create_user(DEFAULT_USER_NAME)
        _log.info("创建默认用户: %s", owner["user_id"])
    user_id = owner["user_id"]

    # Step 2: locate the default KB among that user's KBs.
    target_kb = next(
        (entry for entry in list_kbs(user_id) if entry.get("name") == DEFAULT_KB_NAME),
        None,
    )
    if force and target_kb is not None:
        # A forced rebuild discards the existing KB entirely.
        from backend.kb_manager import delete_kb
        delete_kb(target_kb["kb_id"])
        target_kb = None
        _log.info("强制重建: 已删除旧 KB")

    if target_kb is None:
        target_kb = create_kb(user_id, DEFAULT_KB_NAME, DEFAULT_KB_DESC)
        _log.info("创建默认知识库: %s", target_kb["kb_id"])
    elif target_kb.get("parse_status") == "ready":
        # Nothing to do: the KB was already built by an earlier run.
        ready_chunks = target_kb.get("chunk_count", 0)
        _log.info("默认知识库已就绪: chunks=%s", ready_chunks)
        return {
            "status": "already_ready",
            "kb_id": target_kb["kb_id"],
            "user_id": user_id,
            "chunk_count": target_kb.get("chunk_count", 0),
        }
    kb_id = target_kb["kb_id"]

    # Step 3: gather the source files shipped with the project.
    files = find_source_files(PROJECT_ROOT / "rag" / "jrxml_source")
    if not files:
        _log.warning("未找到源文件,跳过构建")
        return {"status": "no_files", "kb_id": kb_id, "user_id": user_id}
    _log.info("找到 %d 个源文件", len(files))

    # Step 4: build the KB and attach identifying fields to the summary.
    result = build_kb_from_files(kb_id, files)
    _log.info("KB 构建完成: %s", result)
    summary = dict(result)
    summary.update(kb_id=kb_id, user_id=user_id, file_count=len(files))
    return summary
def main():
    """Command-line entry point.

    Parses ``--force``, runs ``init_default_kb`` and prints a human-readable
    summary of the returned result dict (status, counts and any per-file
    errors) to stdout.
    """
    parser = argparse.ArgumentParser(description="初始化默认知识库")
    # The help text previously had an unbalanced "(" that showed up in --help.
    parser.add_argument("--force", action="store_true", help="强制重建(删除已有 KB)")
    args = parser.parse_args()

    print("=" * 60)
    print("JRXML Agent - 默认知识库初始化")
    print("=" * 60)

    result = init_default_kb(force=args.force)

    # Counts fall back to 0 because not every result carries them.
    print(f"\n用户: {DEFAULT_USER_NAME}")
    print(f"知识库: {DEFAULT_KB_NAME}")
    print(f"状态: {result.get('status', 'unknown')}")
    print(f"字段数: {result.get('field_count', 0)}")
    print(f"模板数: {result.get('template_count', 0)}")
    print(f"Chunk数: {result.get('chunk_count', 0)}")
    if result.get("errors"):
        print(f"错误: {len(result['errors'])}")
        for e in result["errors"]:
            print(f" - {e.get('file', '')}: {e.get('error', '')}")
    print("\n初始化完成!")


if __name__ == "__main__":
    main()