bd5bfbac2d
Root cause: when given the full 34k-char JRXML, the LLM would regenerate the
report from scratch instead of modifying coordinates in place, shrinking the
output to ~3k chars.
Solution (programmatic node control, not prompt engineering):
- New agent/jrxml_windower.py: decompose JRXML into header (never sent to
LLM) + individual bands. Split bands >4000 chars at element boundaries.
Reassemble with element count validation (>10% change = rollback).
- Rewrite refine_layout: per-band windowed LLM processing (~2-4k chars
each). LLM cannot "reimagine" the entire report.
- Rewrite map_fields: 100% programmatic regex $F{field_N} -> real name
replacement. Zero LLM calls, zero content loss.
- _sanitize_field_name: non-ASCII chars escaped to _uXXXX_ format for
valid JRXML identifiers.
- Tests: 48 new unit tests (windower 28 + map_fields 20). All passing.
Full suite 385 tests, zero regressions.
135 lines
4.3 KiB
Python
135 lines
4.3 KiB
Python
"""初始化默认用户和预置知识库。
|
||
|
||
解析 rag/jrxml_source/ 下的全部 JRXML 模板 + 接口文档,
|
||
创建默认用户 "默认用户" 和知识库 "F6-汽车维修打印默认知识库",
|
||
执行完整的 parse -> chunk -> embed 管线。
|
||
|
||
用法:
|
||
python scripts/init_default_kb.py
|
||
python scripts/init_default_kb.py --force # 强制重建
|
||
"""
|
||
|
||
import argparse
import os
import sys
from pathlib import Path

# Two levels up from this file; prepended to sys.path ahead of the
# ``backend`` imports below (presumably so they resolve when this file is
# run directly as a script -- confirm against the project layout).
PROJECT_ROOT = Path(__file__).resolve().parents[1]
sys.path.insert(0, str(PROJECT_ROOT))

from dotenv import load_dotenv

# Runs before the backend import below; presumably backend modules read
# their configuration from the environment -- confirm.
load_dotenv()

from backend.logger import get_logger

_log = get_logger("init_kb")

# Names used to look up (or create) the default user and knowledge base.
DEFAULT_USER_NAME = "默认用户"
DEFAULT_KB_NAME = "F6-汽车维修打印默认知识库"
DEFAULT_KB_DESC = "预置的汽车维修打印单模板与接口文档知识库"
|
||
|
||
|
||
def find_source_files(source_dir: Path) -> list[str]:
    """Collect every ``.jrxml`` and ``.md`` file under *source_dir*.

    The directory itself is searched first, then the recursive patterns are
    applied. Entries that match a pattern but are not regular files are
    skipped, and each path is reported only once, in first-seen order.

    Args:
        source_dir: Directory to scan.

    Returns:
        The matching file paths as strings. An empty list is returned (after
        logging a warning) when *source_dir* does not exist.
    """
    if not source_dir.exists():
        _log.warning("源目录不存在: %s", source_dir)
        return []

    patterns = ("*.jrxml", "*.md", "**/*.jrxml", "**/*.md")

    # A dict preserves insertion order, so it serves as an ordered set: the
    # recursive patterns re-yield the top-level files already seen.
    unique_paths: dict[str, None] = {}
    for glob_pattern in patterns:
        for candidate in source_dir.glob(glob_pattern):
            if not candidate.is_file():
                continue
            unique_paths.setdefault(str(candidate), None)

    return list(unique_paths)
|
||
|
||
|
||
def init_default_kb(force: bool = False) -> dict:
    """Ensure the default user and default knowledge base exist and are built.

    Steps:
      1. Find the user named ``DEFAULT_USER_NAME`` or create it.
      2. Find that user's KB named ``DEFAULT_KB_NAME``; create it when
         missing, or delete and recreate it when *force* is set.
      3. Collect the source files under ``rag/jrxml_source``.
      4. Build the KB from those files via ``build_kb_from_files``.

    Args:
        force: When true, an existing default KB is deleted and rebuilt.
            The deletion happens only after source files have been found,
            so a forced run without any sources keeps the existing KB
            instead of replacing it with an empty one.

    Returns:
        A summary dict that always carries ``kb_id`` and ``user_id``:

        * ``status == "already_ready"`` (plus ``chunk_count``) when the KB
          already exists with ``parse_status == "ready"`` and *force* is
          not set;
        * ``status == "no_files"`` when no source files were found;
        * otherwise the dict returned by ``build_kb_from_files`` merged
          with ``kb_id``, ``user_id`` and ``file_count``.
    """
    from backend.kb_manager import create_user, list_users, create_kb, list_kbs
    from backend.kb_parser import build_kb_from_files

    # 1. Find or create the default user.
    default_user = next(
        (u for u in list_users() if u.get("name") == DEFAULT_USER_NAME),
        None,
    )
    if default_user is None:
        default_user = create_user(DEFAULT_USER_NAME)
        _log.info("创建默认用户: %s", default_user["user_id"])
    else:
        _log.info("默认用户已存在: %s", default_user["user_id"])

    user_id = default_user["user_id"]

    # 2. Find or create the default KB.
    default_kb = next(
        (kb for kb in list_kbs(user_id) if kb.get("name") == DEFAULT_KB_NAME),
        None,
    )

    source_dir = PROJECT_ROOT / "rag" / "jrxml_source"
    files = None  # collected at most once; see the force branch below

    if default_kb is not None and force:
        # Check for sources BEFORE deleting: otherwise a forced run with a
        # missing/empty source directory would destroy a working KB and
        # leave only an empty replacement behind.
        files = find_source_files(source_dir)
        if not files:
            _log.warning("未找到源文件,保留现有 KB,跳过强制重建")
            return {"status": "no_files", "kb_id": default_kb["kb_id"],
                    "user_id": user_id}

        from backend.kb_manager import delete_kb

        delete_kb(default_kb["kb_id"])
        default_kb = None
        _log.info("强制重建: 已删除旧 KB")

    if default_kb is None:
        default_kb = create_kb(user_id, DEFAULT_KB_NAME, DEFAULT_KB_DESC)
        _log.info("创建默认知识库: %s", default_kb["kb_id"])
    elif default_kb.get("parse_status") == "ready":
        _log.info("默认知识库已就绪: chunks=%s", default_kb.get("chunk_count", 0))
        return {"status": "already_ready", "kb_id": default_kb["kb_id"],
                "user_id": user_id, "chunk_count": default_kb.get("chunk_count", 0)}

    kb_id = default_kb["kb_id"]

    # 3. Collect the source files (already done on the force path).
    if files is None:
        files = find_source_files(source_dir)
    if not files:
        _log.warning("未找到源文件,跳过构建")
        return {"status": "no_files", "kb_id": kb_id, "user_id": user_id}

    _log.info("找到 %d 个源文件", len(files))

    # 4. Build the KB.
    result = build_kb_from_files(kb_id, files)
    _log.info("KB 构建完成: %s", result)

    return {**result, "kb_id": kb_id, "user_id": user_id, "file_count": len(files)}
|
||
|
||
|
||
def main():
    """Command-line entry point.

    Parses the optional ``--force`` flag, runs ``init_default_kb`` and
    prints a summary of the returned result (status, counts and any
    per-file errors) to standard output.
    """
    cli = argparse.ArgumentParser(description="初始化默认知识库")
    cli.add_argument("--force", action="store_true", help="强制重建(删除已有 KB)")
    options = cli.parse_args()

    banner = "=" * 60
    print(banner)
    print("JRXML Agent - 默认知识库初始化")
    print(banner)

    result = init_default_kb(force=options.force)

    # Keys missing from the result are reported as 0 / 'unknown'.
    summary_lines = [
        f"\n用户: {DEFAULT_USER_NAME}",
        f"知识库: {DEFAULT_KB_NAME}",
        f"状态: {result.get('status', 'unknown')}",
        f"字段数: {result.get('field_count', 0)}",
        f"模板数: {result.get('template_count', 0)}",
        f"Chunk数: {result.get('chunk_count', 0)}",
    ]
    for line in summary_lines:
        print(line)

    errors = result.get("errors")
    if errors:
        print(f"错误: {len(errors)} 条")
        for item in errors:
            print(f" - {item.get('file', '')}: {item.get('error', '')}")

    print("\n初始化完成!")


if __name__ == "__main__":
    main()
|