feat: comprehensive v2 upgrade — streaming, error KB, file upload, layout analysis
Major changes: - Streaming: LLM统一 _BaseLLM 接口 (invoke + stream), generate/modify/correct 节点使用 get_stream_writer() 实现逐字输出, UI 节点平铺展开自动折叠 - Prompt外部化: 7个prompt拆分到 prompts/*.md, loader.py 支持热重载 - 错误自增长: backend/error_kb.py — 指纹去重 + ChromaDB持久化, correct_jrxml→validate 通过时自动入库, retrieve同时搜索错误KB - 文件上传: backend/file_parser.py — PDF/DOCX/图片/文本解析, 侧边栏多文件上传, 文本自动注入下一条消息 - A4模板识别: backend/layout_analyzer.py — 三种模式(完整A4/行片段修改/行片段新建), PaddleOCR元素提取 + 行分组 + JRXML section匹配 - 会话历史下载: jrxml_versions版本追踪 + 侧边栏历史版本下载按钮 - 预览修复: route_after_save跳过预览/导出意图的验证循环 - Ctrl+C修复: JS注入拦截Streamlit裸c键清缓存 Docs: CLAUDE.md (完整项目文档), ROADMAP.md (改进路线图) Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,225 @@
|
||||
"""错误自增长知识库 — 记录修正成功的错误案例,用于未来参考。
|
||||
|
||||
原则:
|
||||
- 仅记录"新错误"(指纹去重)
|
||||
- 必须包含完整的修正方案(prompt、工具链、前后 JRXML)
|
||||
- 存储于 ChromaDB,可被检索注入到生成 prompt 中
|
||||
|
||||
用法:
|
||||
from backend.error_kb import ErrorKB
|
||||
kb = ErrorKB()
|
||||
kb.record(error_msg, bad_jrxml, good_jrxml, correction_prompt)
|
||||
cases = kb.search("字段未声明", k=3)
|
||||
"""
|
||||
|
||||
import hashlib
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
from datetime import datetime, timezone
|
||||
from pathlib import Path
|
||||
from typing import Optional
|
||||
|
||||
from dotenv import load_dotenv
|
||||
|
||||
load_dotenv()
|
||||
|
||||
CHROMA_DIR = Path(os.getenv("CHROMA_PERSIST_DIR", "./db/chroma"))
|
||||
COLLECTION_NAME = "jrxml_error_cases"
|
||||
|
||||
|
||||
def _make_fingerprint(error_msg: str) -> str:
|
||||
"""生成错误指纹 — 标准化后取 hash,用于去重。
|
||||
|
||||
标准化规则:
|
||||
- 去除字段名、变量名等具体标识符(替换为占位符)
|
||||
- 小写化
|
||||
- 只保留错误的结构性特征
|
||||
"""
|
||||
text = error_msg.lower()
|
||||
# 替换变量名 / 字段名($F{xxx}, "name", 'value' 等)
|
||||
text = re.sub(r'\$f\{[^}]+\}', '$f{<FIELD>}', text)
|
||||
text = re.sub(r"'[^']*'", "'<VALUE>'", text)
|
||||
text = re.sub(r'"[^"]*"', '"<VALUE>"', text)
|
||||
# 替换数字
|
||||
text = re.sub(r'\b\d+\b', '<NUM>', text)
|
||||
# 压缩空白
|
||||
text = re.sub(r'\s+', ' ', text).strip()
|
||||
return hashlib.md5(text.encode()).hexdigest()[:16]
|
||||
|
||||
|
||||
class ErrorKB:
|
||||
"""错误案例知识库 — 包装 ChromaDB 持久化。"""
|
||||
|
||||
def __init__(self):
|
||||
self._client = None
|
||||
self._collection = None
|
||||
|
||||
@property
|
||||
def client(self):
|
||||
if self._client is None:
|
||||
import chromadb
|
||||
self._client = chromadb.PersistentClient(path=str(CHROMA_DIR))
|
||||
return self._client
|
||||
|
||||
@property
|
||||
def collection(self):
|
||||
if self._collection is None:
|
||||
try:
|
||||
self._collection = self.client.get_collection(COLLECTION_NAME)
|
||||
except Exception:
|
||||
self._collection = self.client.create_collection(COLLECTION_NAME)
|
||||
return self._collection
|
||||
|
||||
def exists(self, error_msg: str) -> bool:
|
||||
"""检查错误是否已存在于知识库中(按指纹去重)。"""
|
||||
fp = _make_fingerprint(error_msg)
|
||||
try:
|
||||
results = self.collection.get(ids=[fp])
|
||||
return bool(results and results["ids"])
|
||||
except Exception:
|
||||
return False
|
||||
|
||||
def record(
|
||||
self,
|
||||
error_msg: str,
|
||||
bad_jrxml: str,
|
||||
good_jrxml: str,
|
||||
correction_prompt: str,
|
||||
model: str = "",
|
||||
retry_count: int = 0,
|
||||
) -> bool:
|
||||
"""记录一个成功修正的错误案例。
|
||||
|
||||
仅当指纹不重复时写入。返回 True 表示已记录,False 表示重复。
|
||||
"""
|
||||
if self.exists(error_msg):
|
||||
return False
|
||||
|
||||
fp = _make_fingerprint(error_msg)
|
||||
now = datetime.now(timezone.utc).isoformat()
|
||||
|
||||
# 内容:结构化记录
|
||||
doc = json.dumps({
|
||||
"error": error_msg,
|
||||
"bad_jrxml_snippet": bad_jrxml[:2000],
|
||||
"good_jrxml_snippet": good_jrxml[:2000],
|
||||
"correction_prompt": correction_prompt[:1500],
|
||||
"model": model,
|
||||
"retry_count": retry_count,
|
||||
"recorded_at": now,
|
||||
"tools": ["validation_service", "llm_correction"],
|
||||
}, ensure_ascii=False)
|
||||
|
||||
# 元数据:用于检索过滤
|
||||
error_keywords = _extract_keywords(error_msg)
|
||||
metadata = {
|
||||
"fingerprint": fp,
|
||||
"error_keywords": ", ".join(error_keywords[:5]),
|
||||
"recorded_at": now,
|
||||
"retry_success": retry_count + 1, # 第几次修正成功的
|
||||
}
|
||||
|
||||
self.collection.add(
|
||||
ids=[fp],
|
||||
documents=[doc],
|
||||
metadatas=[metadata],
|
||||
)
|
||||
return True
|
||||
|
||||
def search(self, error_msg: str, k: int = 3) -> list[dict]:
|
||||
"""根据错误消息搜索相似的修正案例(ChromaDB 语义搜索)。
|
||||
|
||||
返回 [{error, fix_snippet, prompt, ...}, ...]
|
||||
"""
|
||||
keywords = _extract_keywords(error_msg)
|
||||
if not keywords:
|
||||
return []
|
||||
|
||||
query_text = " ".join(keywords)
|
||||
try:
|
||||
results = self.collection.query(
|
||||
query_texts=[query_text],
|
||||
n_results=k,
|
||||
include=["documents", "metadatas", "distances"],
|
||||
)
|
||||
except Exception:
|
||||
return []
|
||||
|
||||
output = []
|
||||
if not results["ids"] or not results["ids"][0]:
|
||||
return output
|
||||
|
||||
for i, doc_id in enumerate(results["ids"][0]):
|
||||
dist = results["distances"][0][i]
|
||||
try:
|
||||
data = json.loads(results["documents"][0][i])
|
||||
output.append({
|
||||
"id": doc_id,
|
||||
"error": data.get("error", ""),
|
||||
"fix_snippet": data.get("good_jrxml_snippet", ""),
|
||||
"prompt": data.get("correction_prompt", ""),
|
||||
"recorded_at": data.get("recorded_at", ""),
|
||||
"distance": dist,
|
||||
})
|
||||
except json.JSONDecodeError:
|
||||
continue
|
||||
|
||||
return output
|
||||
|
||||
def search_as_context(self, error_msg: str, k: int = 3) -> str:
|
||||
"""搜索并返回拼接好的错误案例上下文,可直接注入 LLM prompt。"""
|
||||
results = self.search(error_msg, k=k)
|
||||
if not results:
|
||||
return ""
|
||||
|
||||
parts = []
|
||||
for r in results:
|
||||
parts.append(
|
||||
f"[历史错误案例]\n"
|
||||
f"错误: {r['error'][:200]}\n"
|
||||
f"修正后 JRXML 片段:\n{r['fix_snippet'][:800]}\n"
|
||||
)
|
||||
return "\n---\n".join(parts)
|
||||
|
||||
def stats(self) -> dict:
|
||||
"""返回知识库统计信息。"""
|
||||
try:
|
||||
count = self.collection.count()
|
||||
return {"total_cases": count, "collection": COLLECTION_NAME}
|
||||
except Exception:
|
||||
return {"total_cases": 0, "collection": COLLECTION_NAME}
|
||||
|
||||
|
||||
def _extract_keywords(error_msg: str) -> list[str]:
|
||||
"""从错误消息中提取关键词(中文 + 英文 token)。"""
|
||||
# 中文字符作为独立关键词
|
||||
chinese = re.findall(r'[一-鿿]{2,}', error_msg)
|
||||
# 英文 camelCase / snake_case token
|
||||
english = re.findall(r'[a-zA-Z_][a-zA-Z0-9_]{2,}', error_msg)
|
||||
# JRXML 特有模式
|
||||
jrxml_patterns = re.findall(r'\$F\{[^}]*\}', error_msg)
|
||||
return chinese + english + jrxml_patterns
|
||||
|
||||
|
||||
# 全局单例
|
||||
_kb: Optional[ErrorKB] = None
|
||||
|
||||
|
||||
def get_error_kb() -> ErrorKB:
|
||||
global _kb
|
||||
if _kb is None:
|
||||
_kb = ErrorKB()
|
||||
return _kb
|
||||
|
||||
|
||||
def record_error(error_msg: str, bad_jrxml: str, good_jrxml: str,
|
||||
correction_prompt: str, model: str = "", retry_count: int = 0) -> bool:
|
||||
"""便捷函数:记录成功修正的错误案例。"""
|
||||
return get_error_kb().record(error_msg, bad_jrxml, good_jrxml,
|
||||
correction_prompt, model, retry_count)
|
||||
|
||||
|
||||
def search_error_cases(error_msg: str, k: int = 3) -> str:
|
||||
"""便捷函数:搜索历史错误案例并返回上下文字符串。"""
|
||||
return get_error_kb().search_as_context(error_msg, k=k)
|
||||
Reference in New Issue
Block a user