Files
panda 70614dff5e feat: comprehensive v2 upgrade — streaming, error KB, file upload, layout analysis
Major changes:
- Streaming: LLM统一 _BaseLLM 接口 (invoke + stream), generate/modify/correct
  节点使用 get_stream_writer() 实现逐字输出, UI 节点平铺展开自动折叠
- Prompt外部化: 7个prompt拆分到 prompts/*.md, loader.py 支持热重载
- 错误自增长: backend/error_kb.py — 指纹去重 + ChromaDB持久化,
  correct_jrxml→validate 通过时自动入库, retrieve同时搜索错误KB
- 文件上传: backend/file_parser.py — PDF/DOCX/图片/文本解析,
  侧边栏多文件上传, 文本自动注入下一条消息
- A4模板识别: backend/layout_analyzer.py — 三种模式(完整A4/行片段修改/行片段新建),
  PaddleOCR元素提取 + 行分组 + JRXML section匹配
- 会话历史下载: jrxml_versions版本追踪 + 侧边栏历史版本下载按钮
- 预览修复: route_after_save跳过预览/导出意图的验证循环
- Ctrl+C修复: JS注入拦截Streamlit裸c键清缓存

Docs: CLAUDE.md (完整项目文档), ROADMAP.md (改进路线图)

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-05-19 15:02:53 +08:00

226 lines
7.1 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""错误自增长知识库 — 记录修正成功的错误案例,用于未来参考。
原则:
- 仅记录"新错误"(指纹去重)
- 必须包含完整的修正方案(prompt、工具链、前后 JRXML)
- 存储于 ChromaDB,可被检索注入到生成 prompt 中
用法:
from backend.error_kb import ErrorKB
kb = ErrorKB()
kb.record(error_msg, bad_jrxml, good_jrxml, correction_prompt)
cases = kb.search("字段未声明", k=3)
"""
import hashlib
import json
import os
import re
from datetime import datetime, timezone
from pathlib import Path
from typing import Optional
from dotenv import load_dotenv
# Run python-dotenv's loader before the environment lookup below, so a local
# .env file (if present) can supply CHROMA_PERSIST_DIR.
load_dotenv()
# Directory handed to chromadb.PersistentClient; taken from CHROMA_PERSIST_DIR
# when set, otherwise "./db/chroma".
CHROMA_DIR = Path(os.getenv("CHROMA_PERSIST_DIR", "./db/chroma"))
# Name of the Chroma collection that holds the recorded error cases.
COLLECTION_NAME = "jrxml_error_cases"
def _make_fingerprint(error_msg: str) -> str:
    """Return a 16-hex-character fingerprint of an error message.

    The message is lowercased and its variable parts are replaced with fixed
    placeholders before hashing, so messages that differ only in field
    references, quoted values, standalone numbers or whitespace produce the
    same fingerprint. The fingerprint is used for de-duplication.
    """
    # (pattern, placeholder) pairs, applied in order to the lowercased text.
    substitutions = (
        (r'\$f\{[^}]+\}', '$f{<FIELD>}'),  # $F{...} references (already lowercased)
        (r"'[^']*'", "'<VALUE>'"),         # single-quoted values
        (r'"[^"]*"', '"<VALUE>"'),         # double-quoted values
        (r'\b\d+\b', '<NUM>'),             # standalone numbers
    )

    normalized = error_msg.lower()
    for pattern, placeholder in substitutions:
        normalized = re.sub(pattern, placeholder, normalized)

    # Collapse each whitespace run into a single space and trim both ends.
    normalized = re.sub(r'\s+', ' ', normalized)
    normalized = normalized.strip()

    digest = hashlib.md5(normalized.encode()).hexdigest()
    return digest[:16]
class ErrorKB:
    """Knowledge base of successfully corrected error cases, stored in ChromaDB.

    Each case is stored under the fingerprint of its error message, so an
    error with the same normalized shape is recorded only once. The Chroma
    client and the collection are created lazily on first access.
    """

    def __init__(self):
        # Both handles are filled in on first use by the properties below.
        self._client = None
        self._collection = None

    @property
    def client(self):
        """Return the persistent Chroma client, creating it on first access."""
        if self._client is None:
            # Lazy import: chromadb is loaded only when a client is first needed.
            import chromadb

            self._client = chromadb.PersistentClient(path=str(CHROMA_DIR))
        return self._client

    @property
    def collection(self):
        """Return the error-case collection, creating it if it does not exist.

        Uses ``get_or_create_collection`` so that only a genuinely missing
        collection leads to creation; other client errors propagate unchanged
        instead of being mistaken for "collection not found".
        """
        if self._collection is None:
            self._collection = self.client.get_or_create_collection(COLLECTION_NAME)
        return self._collection

    def exists(self, error_msg: str) -> bool:
        """Return True if a case with this error's fingerprint is already stored.

        Any failure while querying the collection is treated as "not stored".
        """
        fp = _make_fingerprint(error_msg)
        try:
            results = self.collection.get(ids=[fp])
            return bool(results and results["ids"])
        except Exception:
            # Best-effort lookup: a failed query is reported as "not stored".
            return False

    def record(
        self,
        error_msg: str,
        bad_jrxml: str,
        good_jrxml: str,
        correction_prompt: str,
        model: str = "",
        retry_count: int = 0,
    ) -> bool:
        """Record one successfully corrected error case.

        Args:
            error_msg: The error message that was fixed; also the source of
                the fingerprint used as the record id.
            bad_jrxml: JRXML before the fix (the first 2000 characters are stored).
            good_jrxml: JRXML after the fix (the first 2000 characters are stored).
            correction_prompt: Prompt used for the correction (the first 1500
                characters are stored).
            model: Name of the model that produced the fix.
            retry_count: Retry counter at the time the fix succeeded.

        Returns:
            True if the case was written, False if a case with the same
            fingerprint already exists.
        """
        if self.exists(error_msg):
            return False
        fp = _make_fingerprint(error_msg)
        now = datetime.now(timezone.utc).isoformat()
        # Document body: the structured record, serialized as JSON.
        doc = json.dumps({
            "error": error_msg,
            "bad_jrxml_snippet": bad_jrxml[:2000],
            "good_jrxml_snippet": good_jrxml[:2000],
            "correction_prompt": correction_prompt[:1500],
            "model": model,
            "retry_count": retry_count,
            "recorded_at": now,
            "tools": ["validation_service", "llm_correction"],
        }, ensure_ascii=False)
        # Metadata: flat fields kept alongside the document for filtering.
        error_keywords = _extract_keywords(error_msg)
        metadata = {
            "fingerprint": fp,
            "error_keywords": ", ".join(error_keywords[:5]),
            "recorded_at": now,
            # Attempt number of the successful fix, if retry_count is 0-based.
            "retry_success": retry_count + 1,
        }
        self.collection.add(
            ids=[fp],
            documents=[doc],
            metadatas=[metadata],
        )
        return True

    def search(self, error_msg: str, k: int = 3) -> list[dict]:
        """Search for stored cases similar to an error message.

        The query text is the space-joined keywords extracted from
        ``error_msg``. Query failures are treated as "no results".

        Args:
            error_msg: The error message to look up.
            k: Maximum number of cases to return.

        Returns:
            A list of dicts with keys ``id``, ``error``, ``fix_snippet``,
            ``prompt``, ``recorded_at`` and ``distance``; empty when nothing
            matches, ``k`` is not positive, or the query fails.
        """
        if k <= 0:
            return []
        keywords = _extract_keywords(error_msg)
        if not keywords:
            return []
        query_text = " ".join(keywords)
        try:
            available = self.collection.count()
            if available == 0:
                return []
            # Requesting more results than there are stored cases may raise or
            # warn depending on the chromadb version; a raise would be turned
            # into an empty result by the handler below, so clamp instead.
            results = self.collection.query(
                query_texts=[query_text],
                n_results=min(k, available),
                include=["documents", "metadatas", "distances"],
            )
        except Exception:
            return []
        if not results["ids"] or not results["ids"][0]:
            return []
        documents = (results.get("documents") or [[]])[0]
        distances = (results.get("distances") or [[]])[0]
        output = []
        for i, doc_id in enumerate(results["ids"][0]):
            try:
                data = json.loads(documents[i])
            except (json.JSONDecodeError, TypeError, IndexError):
                # Skip entries whose document is missing or not valid JSON.
                continue
            if not isinstance(data, dict):
                # Only JSON objects carry the fields read below.
                continue
            output.append({
                "id": doc_id,
                "error": data.get("error", ""),
                "fix_snippet": data.get("good_jrxml_snippet", ""),
                "prompt": data.get("correction_prompt", ""),
                "recorded_at": data.get("recorded_at", ""),
                "distance": distances[i] if i < len(distances) else None,
            })
        return output

    def search_as_context(self, error_msg: str, k: int = 3) -> str:
        """Search and return the matches as one text block for an LLM prompt.

        Each case contributes its error (first 200 characters) and corrected
        JRXML snippet (first 800 characters); cases are separated by a
        ``---`` line. Returns an empty string when there are no matches.
        """
        results = self.search(error_msg, k=k)
        if not results:
            return ""
        parts = [
            f"[历史错误案例]\n"
            f"错误: {r['error'][:200]}\n"
            f"修正后 JRXML 片段:\n{r['fix_snippet'][:800]}\n"
            for r in results
        ]
        return "\n---\n".join(parts)

    def stats(self) -> dict:
        """Return the number of stored cases and the collection name.

        If the count cannot be obtained, ``total_cases`` is reported as 0.
        """
        try:
            count = self.collection.count()
            return {"total_cases": count, "collection": COLLECTION_NAME}
        except Exception:
            return {"total_cases": 0, "collection": COLLECTION_NAME}
def _extract_keywords(error_msg: str) -> list[str]:
    """Collect keyword tokens from an error message.

    Returns, in this order: runs of two or more characters in the 一-鿿
    range, identifier-like ASCII tokens of at least three characters, and
    ``$F{...}`` references. Duplicates are kept.
    """
    token_patterns = (
        r'[一-鿿]{2,}',                 # runs of Chinese characters
        r'[a-zA-Z_][a-zA-Z0-9_]{2,}',  # camelCase / snake_case style tokens
        r'\$F\{[^}]*\}',               # JRXML field references
    )
    keywords: list[str] = []
    for pattern in token_patterns:
        keywords.extend(re.findall(pattern, error_msg))
    return keywords
# Module-level singleton; stays None until get_error_kb() is first called.
_kb: Optional[ErrorKB] = None


def get_error_kb() -> ErrorKB:
    """Return the shared ErrorKB instance, constructing it on first use."""
    global _kb
    if _kb is not None:
        return _kb
    _kb = ErrorKB()
    return _kb
def record_error(error_msg: str, bad_jrxml: str, good_jrxml: str,
                 correction_prompt: str, model: str = "", retry_count: int = 0) -> bool:
    """Convenience wrapper: record a successfully corrected error case.

    Forwards every argument to ``record`` on the instance returned by
    ``get_error_kb()`` and returns that call's result.
    """
    kb = get_error_kb()
    return kb.record(
        error_msg,
        bad_jrxml,
        good_jrxml,
        correction_prompt,
        model,
        retry_count,
    )
def search_error_cases(error_msg: str, k: int = 3) -> str:
    """Convenience wrapper: look up past error cases as a context string.

    Calls ``search_as_context`` on the instance returned by
    ``get_error_kb()`` and returns its result.
    """
    kb = get_error_kb()
    context = kb.search_as_context(error_msg, k=k)
    return context