feat: comprehensive v2 upgrade — streaming, error KB, file upload, layout analysis

Major changes:
- Streaming: LLM统一 _BaseLLM 接口 (invoke + stream), generate/modify/correct
  节点使用 get_stream_writer() 实现逐字输出, UI 节点平铺展开自动折叠
- Prompt外部化: 7个prompt拆分到 prompts/*.md, loader.py 支持热重载
- 错误自增长: backend/error_kb.py — 指纹去重 + ChromaDB持久化,
  correct_jrxml→validate 通过时自动入库, retrieve同时搜索错误KB
- 文件上传: backend/file_parser.py — PDF/DOCX/图片/文本解析,
  侧边栏多文件上传, 文本自动注入下一条消息
- A4模板识别: backend/layout_analyzer.py — 三种模式(完整A4/行片段修改/行片段新建),
  PaddleOCR元素提取 + 行分组 + JRXML section匹配
- 会话历史下载: jrxml_versions版本追踪 + 侧边栏历史版本下载按钮
- 预览修复: route_after_save跳过预览/导出意图的验证循环
- Ctrl+C修复: JS注入拦截Streamlit裸c键清缓存

Docs: CLAUDE.md (完整项目文档), ROADMAP.md (改进路线图)

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
2026-05-19 15:02:53 +08:00
parent b280c2b453
commit 70614dff5e
19 changed files with 1770 additions and 231 deletions
+225
View File
@@ -0,0 +1,225 @@
"""错误自增长知识库 — 记录修正成功的错误案例,用于未来参考。
原则:
- 仅记录"新错误"(指纹去重)
- 必须包含完整的修正方案(prompt、工具链、前后 JRXML
- 存储于 ChromaDB,可被检索注入到生成 prompt 中
用法:
from backend.error_kb import ErrorKB
kb = ErrorKB()
kb.record(error_msg, bad_jrxml, good_jrxml, correction_prompt)
cases = kb.search("字段未声明", k=3)
"""
import hashlib
import json
import os
import re
from datetime import datetime, timezone
from pathlib import Path
from typing import Optional
from dotenv import load_dotenv
load_dotenv()
CHROMA_DIR = Path(os.getenv("CHROMA_PERSIST_DIR", "./db/chroma"))
COLLECTION_NAME = "jrxml_error_cases"
def _make_fingerprint(error_msg: str) -> str:
"""生成错误指纹 — 标准化后取 hash,用于去重。
标准化规则:
- 去除字段名、变量名等具体标识符(替换为占位符)
- 小写化
- 只保留错误的结构性特征
"""
text = error_msg.lower()
# 替换变量名 / 字段名($F{xxx}, "name", 'value' 等)
text = re.sub(r'\$f\{[^}]+\}', '$f{<FIELD>}', text)
text = re.sub(r"'[^']*'", "'<VALUE>'", text)
text = re.sub(r'"[^"]*"', '"<VALUE>"', text)
# 替换数字
text = re.sub(r'\b\d+\b', '<NUM>', text)
# 压缩空白
text = re.sub(r'\s+', ' ', text).strip()
return hashlib.md5(text.encode()).hexdigest()[:16]
class ErrorKB:
"""错误案例知识库 — 包装 ChromaDB 持久化。"""
def __init__(self):
self._client = None
self._collection = None
@property
def client(self):
if self._client is None:
import chromadb
self._client = chromadb.PersistentClient(path=str(CHROMA_DIR))
return self._client
@property
def collection(self):
if self._collection is None:
try:
self._collection = self.client.get_collection(COLLECTION_NAME)
except Exception:
self._collection = self.client.create_collection(COLLECTION_NAME)
return self._collection
def exists(self, error_msg: str) -> bool:
"""检查错误是否已存在于知识库中(按指纹去重)。"""
fp = _make_fingerprint(error_msg)
try:
results = self.collection.get(ids=[fp])
return bool(results and results["ids"])
except Exception:
return False
def record(
self,
error_msg: str,
bad_jrxml: str,
good_jrxml: str,
correction_prompt: str,
model: str = "",
retry_count: int = 0,
) -> bool:
"""记录一个成功修正的错误案例。
仅当指纹不重复时写入。返回 True 表示已记录,False 表示重复。
"""
if self.exists(error_msg):
return False
fp = _make_fingerprint(error_msg)
now = datetime.now(timezone.utc).isoformat()
# 内容:结构化记录
doc = json.dumps({
"error": error_msg,
"bad_jrxml_snippet": bad_jrxml[:2000],
"good_jrxml_snippet": good_jrxml[:2000],
"correction_prompt": correction_prompt[:1500],
"model": model,
"retry_count": retry_count,
"recorded_at": now,
"tools": ["validation_service", "llm_correction"],
}, ensure_ascii=False)
# 元数据:用于检索过滤
error_keywords = _extract_keywords(error_msg)
metadata = {
"fingerprint": fp,
"error_keywords": ", ".join(error_keywords[:5]),
"recorded_at": now,
"retry_success": retry_count + 1, # 第几次修正成功的
}
self.collection.add(
ids=[fp],
documents=[doc],
metadatas=[metadata],
)
return True
def search(self, error_msg: str, k: int = 3) -> list[dict]:
"""根据错误消息搜索相似的修正案例(ChromaDB 语义搜索)。
返回 [{error, fix_snippet, prompt, ...}, ...]
"""
keywords = _extract_keywords(error_msg)
if not keywords:
return []
query_text = " ".join(keywords)
try:
results = self.collection.query(
query_texts=[query_text],
n_results=k,
include=["documents", "metadatas", "distances"],
)
except Exception:
return []
output = []
if not results["ids"] or not results["ids"][0]:
return output
for i, doc_id in enumerate(results["ids"][0]):
dist = results["distances"][0][i]
try:
data = json.loads(results["documents"][0][i])
output.append({
"id": doc_id,
"error": data.get("error", ""),
"fix_snippet": data.get("good_jrxml_snippet", ""),
"prompt": data.get("correction_prompt", ""),
"recorded_at": data.get("recorded_at", ""),
"distance": dist,
})
except json.JSONDecodeError:
continue
return output
def search_as_context(self, error_msg: str, k: int = 3) -> str:
"""搜索并返回拼接好的错误案例上下文,可直接注入 LLM prompt。"""
results = self.search(error_msg, k=k)
if not results:
return ""
parts = []
for r in results:
parts.append(
f"[历史错误案例]\n"
f"错误: {r['error'][:200]}\n"
f"修正后 JRXML 片段:\n{r['fix_snippet'][:800]}\n"
)
return "\n---\n".join(parts)
def stats(self) -> dict:
"""返回知识库统计信息。"""
try:
count = self.collection.count()
return {"total_cases": count, "collection": COLLECTION_NAME}
except Exception:
return {"total_cases": 0, "collection": COLLECTION_NAME}
def _extract_keywords(error_msg: str) -> list[str]:
"""从错误消息中提取关键词(中文 + 英文 token)。"""
# 中文字符作为独立关键词
chinese = re.findall(r'[一-鿿]{2,}', error_msg)
# 英文 camelCase / snake_case token
english = re.findall(r'[a-zA-Z_][a-zA-Z0-9_]{2,}', error_msg)
# JRXML 特有模式
jrxml_patterns = re.findall(r'\$F\{[^}]*\}', error_msg)
return chinese + english + jrxml_patterns
# 全局单例
_kb: Optional[ErrorKB] = None
def get_error_kb() -> ErrorKB:
global _kb
if _kb is None:
_kb = ErrorKB()
return _kb
def record_error(error_msg: str, bad_jrxml: str, good_jrxml: str,
correction_prompt: str, model: str = "", retry_count: int = 0) -> bool:
"""便捷函数:记录成功修正的错误案例。"""
return get_error_kb().record(error_msg, bad_jrxml, good_jrxml,
correction_prompt, model, retry_count)
def search_error_cases(error_msg: str, k: int = 3) -> str:
"""便捷函数:搜索历史错误案例并返回上下文字符串。"""
return get_error_kb().search_as_context(error_msg, k=k)