From 4b43c5d3e4303d762ed0f4c44e0ab38b1242ecdd Mon Sep 17 00:00:00 2001 From: panda <1415243231@qq.com> Date: Thu, 14 May 2026 23:21:10 +0800 Subject: [PATCH] =?UTF-8?q?feat:=20LangGraph=E5=B7=A5=E4=BD=9C=E6=B5=81?= =?UTF-8?q?=E6=A0=B8=E5=BF=83=20=E2=80=94=20Agent=E7=8A=B6=E6=80=81/?= =?UTF-8?q?=E8=8A=82=E7=82=B9/=E5=9B=BE=20+=20=E9=AA=8C=E8=AF=81=E6=9C=8D?= =?UTF-8?q?=E5=8A=A1=20+=20=E7=9F=A5=E8=AF=86=E5=BA=93?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit agent/ state.py: AgentState TypedDict(20字段含意图/压缩/会话/撤销) nodes.py: 17个节点函数(生成/修改/验证/纠错/意图分类/压缩/撤销/重置) graph.py: 17节点状态图,8意图路由分发 验证服务 validation_service/ main.py: FastAPI服务,lxml XSD验证 + 结构化检查(字段引用/SQL/尺寸) 数据 data/ sample_templates/: 4个JRXML示例模板 corrections/: 3个错误修正案例 脚本 scripts/ init_kb.py: Chroma知识库初始化 --- agent/__init__.py | 0 agent/graph.py | 225 +++++++ agent/nodes.py | 571 ++++++++++++++++++ agent/state.py | 33 + data/corrections/empty_query_string.jrxml | 6 + data/corrections/missing_field.jrxml | 6 + data/corrections/missing_page_size.jrxml | 6 + data/sample_templates/employee_roster.jrxml | 69 +++ data/sample_templates/inventory_list.jrxml | 79 +++ .../report_with_summary.jrxml | 89 +++ data/sample_templates/sales_order.jrxml | 69 +++ scripts/init_kb.py | 87 +++ validation_service/main.py | 129 ++++ validation_service/validate.bat | 6 + 14 files changed, 1375 insertions(+) create mode 100644 agent/__init__.py create mode 100644 agent/graph.py create mode 100644 agent/nodes.py create mode 100644 agent/state.py create mode 100644 data/corrections/empty_query_string.jrxml create mode 100644 data/corrections/missing_field.jrxml create mode 100644 data/corrections/missing_page_size.jrxml create mode 100644 data/sample_templates/employee_roster.jrxml create mode 100644 data/sample_templates/inventory_list.jrxml create mode 100644 data/sample_templates/report_with_summary.jrxml create mode 100644 data/sample_templates/sales_order.jrxml create mode 100644 
scripts/init_kb.py create mode 100644 validation_service/main.py create mode 100644 validation_service/validate.bat diff --git a/agent/__init__.py b/agent/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/agent/graph.py b/agent/graph.py new file mode 100644 index 0000000..400e802 --- /dev/null +++ b/agent/graph.py @@ -0,0 +1,225 @@ +"""LangGraph JRXML 生成代理的状态图定义。""" + +import os +from typing import Literal + +from dotenv import load_dotenv +from langgraph.graph import StateGraph, END + +from agent.state import AgentState +from agent.nodes import ( + load_session_node, + process_input, + manage_context, + save_state_snapshot, + classify_intent, + retrieve, + generate, + modify_jrxml, + handle_consult, + handle_undo, + handle_reset, + save_session_node, + validate, + explain_error, + correct_jrxml, + finalize, +) + +load_dotenv() +MAX_RETRY = int(os.getenv("MAX_RETRY", "3")) + +# ============================================================ +# 路由函数 +# ============================================================ + +def route_by_intent(state: AgentState) -> Literal[ + "retrieve", "modify_jrxml", "save_session", + "handle_consult", "handle_undo", "handle_reset" +]: + """根据 classify_intent 的结果路由到对应的处理节点。""" + intent = state.get("intent", "initial_generation") + + if intent == "initial_generation": + return "retrieve" + elif intent == "modify_report": + return "modify_jrxml" + elif intent in ("preview_report", "export_pdf", "export_jrxml"): + return "save_session" + elif intent == "consult_question": + return "handle_consult" + elif intent == "undo_modification": + return "handle_undo" + elif intent == "reset_session": + return "handle_reset" + else: + # 兜底:根据是否有报表判断 + if state.get("current_jrxml"): + return "modify_jrxml" + return "retrieve" + + +def route_after_generate(state: AgentState) -> Literal["save_session"]: + return "save_session" + + +def route_after_modify(state: AgentState) -> Literal["save_session"]: + return "save_session" + + +def 
route_after_undo(state: AgentState) -> Literal["save_session"]: + return "save_session" + + +def route_after_save(state: AgentState) -> Literal["validate"]: + return "validate" + + +def route_after_validate(state: AgentState) -> Literal["finalize", "explain_error"]: + if state.get("status") == "pass": + return "finalize" + return "explain_error" + + +def route_after_explain(state: AgentState) -> Literal["correct_jrxml"]: + return "correct_jrxml" + + +def route_after_correct(state: AgentState) -> Literal["validate", "finalize"]: + retry = state.get("retry_count", 0) + if retry >= MAX_RETRY: + return "finalize" + return "validate" + + +# ============================================================ +# 图构建 +# ============================================================ + +def build_graph() -> StateGraph: + workflow = StateGraph(AgentState) + + # 现有节点 + workflow.add_node("load_session", load_session_node) + workflow.add_node("process_input", process_input) + workflow.add_node("manage_context", manage_context) + workflow.add_node("save_session", save_session_node) + workflow.add_node("retrieve", retrieve) + workflow.add_node("generate", generate) + workflow.add_node("modify_jrxml", modify_jrxml) + workflow.add_node("validate", validate) + workflow.add_node("explain_error", explain_error) + workflow.add_node("correct_jrxml", correct_jrxml) + workflow.add_node("finalize", finalize) + + # 新增节点:意图识别 + workflow.add_node("save_state_snapshot", save_state_snapshot) + workflow.add_node("classify_intent", classify_intent) + workflow.add_node("handle_consult", handle_consult) + workflow.add_node("handle_undo", handle_undo) + workflow.add_node("handle_reset", handle_reset) + + # ---- 入口和前置流程 ---- + workflow.set_entry_point("load_session") + workflow.add_edge("load_session", "process_input") + workflow.add_edge("process_input", "manage_context") + workflow.add_edge("manage_context", "save_state_snapshot") + workflow.add_edge("save_state_snapshot", "classify_intent") + + # ---- 意图路由 
---- + workflow.add_conditional_edges( + "classify_intent", + route_by_intent, + { + "retrieve": "retrieve", + "modify_jrxml": "modify_jrxml", + "save_session": "save_session", + "handle_consult": "handle_consult", + "handle_undo": "handle_undo", + "handle_reset": "handle_reset", + }, + ) + + # ---- 初始生成分支 ---- + workflow.add_edge("retrieve", "generate") + workflow.add_conditional_edges( + "generate", + route_after_generate, + {"save_session": "save_session"}, + ) + + # ---- 修改分支 ---- + workflow.add_conditional_edges( + "modify_jrxml", + route_after_modify, + {"save_session": "save_session"}, + ) + + # ---- 撤销分支 ---- + workflow.add_conditional_edges( + "handle_undo", + route_after_undo, + {"save_session": "save_session"}, + ) + + # ---- 保存后进入验证 ---- + workflow.add_conditional_edges( + "save_session", + route_after_save, + {"validate": "validate"}, + ) + + # ---- 验证 → 修正循环 ---- + workflow.add_conditional_edges( + "validate", + route_after_validate, + {"finalize": "finalize", "explain_error": "explain_error"}, + ) + workflow.add_conditional_edges( + "explain_error", + route_after_explain, + {"correct_jrxml": "correct_jrxml"}, + ) + workflow.add_conditional_edges( + "correct_jrxml", + route_after_correct, + {"validate": "validate", "finalize": "finalize"}, + ) + + # ---- 咨询 / 重置 → 直接结束 ---- + workflow.add_edge("handle_consult", "finalize") + workflow.add_edge("handle_reset", "finalize") + + # ---- 结束 ---- + workflow.add_edge("finalize", END) + + return workflow.compile() + + +# ============================================================ +# 初始状态 +# ============================================================ + +def create_initial_state() -> AgentState: + return AgentState( + conversation_history=[], + current_jrxml="", + user_input="", + status="", + error_msg="", + natural_explanation="", + retry_count=0, + user_modification_request="", + final_jrxml="", + stage="initial_generation", + retrieved_context="", + full_conversation_history=[], + compressed_history="", + 
current_token_count=0, + session_id="", + session_name="", + created_at="", + updated_at="", + intent="", + history_states=[], + ) diff --git a/agent/nodes.py b/agent/nodes.py new file mode 100644 index 0000000..6714845 --- /dev/null +++ b/agent/nodes.py @@ -0,0 +1,571 @@ +"""LangGraph JRXML 生成工作流的节点函数。""" + +import copy +import json +import os +import re +from datetime import datetime, timezone +from typing import Dict + +from dotenv import load_dotenv + +from agent.state import AgentState +from backend.embeddings import get_embeddings +from backend.llm import get_llm +from backend.validation import validate_jrxml + +load_dotenv() + +MAX_RETRY = int(os.getenv("MAX_RETRY", "3")) +CONTEXT_MAX_TOKENS = int(os.getenv("CONTEXT_MAX_TOKENS", "6000")) +CONTEXT_KEEP_RECENT = int(os.getenv("CONTEXT_KEEP_RECENT", "4")) +HISTORY_MAX_SNAPSHOTS = int(os.getenv("HISTORY_MAX_SNAPSHOTS", "10")) + +# ============================================================ +# 意图分类提示词(约 180 tokens,控制在 200 token 以内) +# ============================================================ +INTENT_CLASSIFY_PROMPT = """你是意图分类器。根据用户输入判断意图,只输出意图名称。 + +当前有报表:{has_report} +用户输入:{user_input} + +可选意图: +- initial_generation(新建报表,或无报表时的任何需求) +- modify_report(修改当前已有报表) +- preview_report(预览/查看当前报表) +- export_pdf(导出PDF文件) +- export_jrxml(下载/导出/保存JRXML文件) +- undo_modification(撤销/回退上一步修改) +- consult_question(咨询JasperReports相关知识或使用问题) +- reset_session(清空/重置/重新开始) + +意图名称:""" + +# ============================================================ +# 咨询回答提示词 +# ============================================================ +CONSULT_PROMPT = """你是 JasperReports 专家。用简洁清晰的中文回答用户关于 JasperReports 的问题。 + +用户问题:{question} + +直接回答:""" + +# ============================================================ +# 原有提示词(不变) +# ============================================================ +INITIAL_GENERATION_PROMPT = """你是一位资深 JasperReports 工程师。根据以下参考模板和用户需求,生成一个完整、可编译的 JRXML 文件。 +JRXML 必须兼容 JasperReports 7.0.6 schema。 + +关键规则: +- 只输出 JRXML 代码,不要解释,不要 
markdown 标记。 +- 报表正文中使用的每个字段必须在 部分中声明。 +- 根元素为 ,包含正确的 xmlns 属性。 +- 包含 ,在 中包含 SQL 查询。 +- 确保所有交叉引用(字段名称、band 元素)保持一致。 + +参考模板和组件: +{context} + +用户需求: +{user_request} +""" + +MODIFICATION_PROMPT = """你是一位资深 JasperReports 工程师。用户想要修改一个现有的、可编译的 JRXML 报表。精确应用请求的更改到当前 JRXML 并输出完整修改后的 JRXML。 + +关键规则: +- 只输出完整修改后的 JRXML 代码,不要解释,不要 markdown 标记。 +- 保留所有未被更改的现有结构。 +- 结果必须继续与 JasperReports 7.0.6 兼容。 +- 报表正文中使用的每个字段必须在 部分中声明。 +- 如果添加新字段,正确声明它们。 +- 确保 中有效的 SQL。 + +当前 JRXML: +{current_jrxml} + +对话历史: +{conversation_history} + +用户的修改请求: +{modification_request} +""" + +CORRECTION_PROMPT = """你是一位资深 JasperReports 工程师。你生成的 JRXML 文件编译失败。分析错误并修复 JRXML。 + +关键规则: +- 只输出完整修复后的 JRXML 代码,不要解释,不要 markdown 标记。 +- JRXML 必须与 JasperReports 7.0.6 兼容。 +- 解决下面列出的特定错误。 + +当前 JRXML(带错误): +{current_jrxml} + +编译错误: +{error_msg} + +错误的自然语言解释: +{explanation} + +立即生成修正后的 JRXML: +""" + +EXPLAIN_PROMPT = """你是一位 JasperReports 专家。用普通非技术语言解释以下 JRXML 编译错误,让业务用户能够理解。 + +错误消息: +{error_msg} + +当前 JRXML 片段(前 80 行): +{jrxml_snippet} + +用 2-3 句话解释哪里出了问题以及如何修复: +""" + +COMPRESSION_PROMPT = """你是一个信息压缩助手。以下是用户与报表生成助手之间的历史对话记录,请将其压缩为一份简洁的摘要(不超过200字)。 + +摘要必须保留以下关键信息: +- 用户提出的所有报表需求点(字段、标题、分组、汇总等) +- 用户提出的所有修改要求及其顺序 +- 当前报表的核心结构(字段列表、标题、分组方式) +- 任何特殊要求或约束条件 + +只输出摘要文本,不要添加任何解释或标记。 + +对话记录: +{conversation_text} +""" + + +# ============================================================ +# 核心工作流节点 +# ============================================================ + +def process_input(state: AgentState) -> Dict: + """记录用户输入到对话历史,重置本轮请求状态。""" + user_input = state.get("user_input", "") + + # 维护全量对话历史 + full_history = state.get("full_conversation_history", []) + full_history.append({"role": "user", "content": user_input, "ts": _now_iso()}) + state["full_conversation_history"] = full_history + + # 维护工作对话历史 + conv_history = state.get("conversation_history", []) + conv_history.append({"role": "user", "content": user_input}) + state["conversation_history"] = conv_history + + # 重置本轮请求字段 + state["retry_count"] = 0 + 
state["user_modification_request"] = user_input + + return state + + +def save_state_snapshot(state: AgentState) -> Dict: + """保存当前状态快照到 history_states,用于撤销操作。最多保留 N 个版本。""" + snapshots = state.get("history_states", []) + if not isinstance(snapshots, list): + snapshots = [] + + snapshot = { + "current_jrxml": state.get("current_jrxml", ""), + "final_jrxml": state.get("final_jrxml", ""), + "status": state.get("status", ""), + "conversation_history": copy.deepcopy(state.get("conversation_history", [])), + "user_modification_request": state.get("user_modification_request", ""), + "intent": state.get("intent", ""), + } + snapshots.append(snapshot) + + max_snap = HISTORY_MAX_SNAPSHOTS + if len(snapshots) > max_snap: + snapshots = snapshots[-max_snap:] + + state["history_states"] = snapshots + return state + + +def classify_intent(state: AgentState) -> Dict: + """使用 LLM 对用户输入进行意图分类(8 种意图)。""" + user_input = state.get("user_input", "") + has_report = "是" if state.get("current_jrxml", "").strip() else "否" + + intent = "initial_generation" + try: + llm = get_llm() + prompt = INTENT_CLASSIFY_PROMPT.format( + has_report=has_report, + user_input=user_input[:500], + ) + resp = llm.invoke(prompt) + raw = resp.content.strip().lower() + + valid_intents = [ + "initial_generation", "modify_report", "preview_report", + "export_pdf", "export_jrxml", "undo_modification", + "consult_question", "reset_session", + ] + for vi in valid_intents: + if vi in raw: + intent = vi + break + else: + # 兜底:有报表 → modify_report,无报表 → initial_generation + intent = "modify_report" if has_report == "是" else "initial_generation" + except Exception: + intent = "modify_report" if has_report == "是" else "initial_generation" + + state["intent"] = intent + return state + + +def handle_consult(state: AgentState) -> Dict: + """处理咨询类问题:调用 LLM 直接回答,不走报表生成流程。""" + user_input = state.get("user_input", "") + try: + llm = get_llm() + prompt = CONSULT_PROMPT.format(question=user_input) + resp = llm.invoke(prompt) + 
answer = resp.content.strip() + except Exception: + answer = "抱歉,暂时无法处理您的问题,请稍后再试。" + + state["consult_answer"] = answer + state["conversation_history"].append({"role": "assistant", "content": answer}) + state["full_conversation_history"].append( + {"role": "assistant", "content": answer, "ts": _now_iso()} + ) + return state + + +def handle_undo(state: AgentState) -> Dict: + """撤销上一步修改:从 history_states 恢复最近一个快照。""" + snapshots = state.get("history_states", []) + if not isinstance(snapshots, list) or not snapshots: + state["conversation_history"].append( + {"role": "assistant", "content": "没有可撤销的操作。"} + ) + return state + + prev = snapshots.pop() + state["history_states"] = snapshots + + state["current_jrxml"] = prev.get("current_jrxml", "") + state["final_jrxml"] = prev.get("final_jrxml", "") + state["status"] = prev.get("status", "") + state["conversation_history"] = prev.get("conversation_history", []) + state["user_modification_request"] = prev.get("user_modification_request", "") + + state["conversation_history"].append( + {"role": "assistant", "content": "已撤销上一步修改,恢复到之前的状态。"} + ) + state["full_conversation_history"].append( + {"role": "assistant", "content": "已撤销上一步修改。", "ts": _now_iso()} + ) + return state + + +def handle_reset(state: AgentState) -> Dict: + """重置当前会话:清空报表相关状态,保留会话信息。""" + state["current_jrxml"] = "" + state["final_jrxml"] = "" + state["status"] = "" + state["error_msg"] = "" + state["natural_explanation"] = "" + state["user_modification_request"] = "" + state["retrieved_context"] = "" + state["retry_count"] = 0 + state["compressed_history"] = "" + state["history_states"] = [] + state["intent"] = "initial_generation" + + state["conversation_history"] = [] + state["conversation_history"].append( + {"role": "assistant", "content": "会话已重置,请描述您要创建的新报表。"} + ) + state["full_conversation_history"].append( + {"role": "assistant", "content": "会话已重置。", "ts": _now_iso()} + ) + return state + + +def count_tokens(state: AgentState) -> int: + """使用 
tiktoken(gpt-4o 编码器)计算当前上下文 token 数量。""" + try: + import tiktoken + enc = tiktoken.encoding_for_model("gpt-4o") + except Exception: + # 回退方案:中英文混合场景下,近似 1 token ≈ 2.5 个字符 + text = json.dumps({ + "history": state.get("conversation_history", [])[-CONTEXT_KEEP_RECENT:], + "jrxml": state.get("current_jrxml", ""), + "compressed": state.get("compressed_history", ""), + }, ensure_ascii=False) + return int(len(text) // 2.5) + + text = json.dumps({ + "history": state.get("conversation_history", [])[-CONTEXT_KEEP_RECENT:], + "jrxml": state.get("current_jrxml", ""), + "compressed": state.get("compressed_history", ""), + }, ensure_ascii=False) + return len(enc.encode(text)) + + +def manage_context(state: AgentState) -> Dict: + """当 token 数量超过阈值时,压缩较早的对话轮次。""" + token_count = count_tokens(state) + state["current_token_count"] = token_count + + if token_count <= CONTEXT_MAX_TOKENS: + return state + + full_history = state.get("full_conversation_history", []) + if len(full_history) <= CONTEXT_KEEP_RECENT: + return state + + # 最近N轮保留完整,更早的轮次送去压缩 + recent = full_history[-CONTEXT_KEEP_RECENT:] + older = full_history[:-CONTEXT_KEEP_RECENT] + + if not older: + return state + + conv_text = json.dumps(older, ensure_ascii=False, indent=2) + + try: + llm = get_llm() + prompt = COMPRESSION_PROMPT.format(conversation_text=conv_text) + resp = llm.invoke(prompt) + new_compressed = resp.content.strip()[:300] + except Exception: + new_compressed = _simple_compress(older) + + # 合并已有压缩与新压缩 + existing = state.get("compressed_history", "") + if existing: + state["compressed_history"] = f"{existing}\n---\n{new_compressed}" + else: + state["compressed_history"] = new_compressed + + state["conversation_history"] = list(recent) + state["current_token_count"] = count_tokens(state) + return state + + +def load_session_node(state: AgentState) -> Dict: + """在请求开始时从磁盘加载会话状态。""" + session_id = state.get("session_id", "") + if not session_id: + return state + + try: + from backend.session import load_session + data
= load_session(session_id) + if data and data.get("agent_state"): + saved = data["agent_state"] + # 恢复核心字段(不覆盖当前请求的 user_input / stage) + for key in ("conversation_history", "full_conversation_history", + "current_jrxml", "final_jrxml", "compressed_history", + "session_name", "created_at", "history_states"): + if key in saved and key not in ("user_input", "stage"): + state[key] = saved[key] + state["session_name"] = data.get("session_name", "") + state["created_at"] = data.get("created_at", "") + except Exception: + pass + return state + + +def save_session_node(state: AgentState) -> Dict: + """将当前代理状态持久化到磁盘。""" + session_id = state.get("session_id", "") + if not session_id: + return state + + try: + from backend.session import save_session + persistable = {} + for key in ("conversation_history", "full_conversation_history", + "current_jrxml", "final_jrxml", "compressed_history", + "status", "error_msg", "history_states"): + if key in state: + persistable[key] = state[key] + persistable["updated_at"] = _now_iso() + + session_name = state.get("session_name", "") + if not session_name and state.get("conversation_history"): + first_user = next( + (m["content"][:50] for m in state["conversation_history"] + if m.get("role") == "user"), "") + if first_user: + session_name = first_user + + save_session(session_id, persistable, session_name) + if not state.get("session_name"): + state["session_name"] = session_name + state["updated_at"] = persistable["updated_at"] + except Exception: + pass + return state + + +def _simple_compress(messages: list[dict]) -> str: + """当 LLM 不可用时,基于简单规则的压缩回退方案。""" + points = [] + for m in messages: + if m.get("role") == "user": + points.append(f"用户提问:{m['content'][:100]}") + return "; ".join(points[-10:]) + + +def _now_iso() -> str: + return datetime.now(timezone.utc).isoformat() + + +def retrieve(state: AgentState) -> Dict: + """在 Chroma 中搜索相关的 JRXML 模板和组件。""" + try: + embeddings = get_embeddings() + from langchain_chroma import Chroma + + 
persist_dir = os.getenv("CHROMA_PERSIST_DIR", "./db/chroma") + if not os.path.exists(persist_dir) or not os.listdir(persist_dir): + state["retrieved_context"] = "" + return state + + vectorstore = Chroma( + embedding_function=embeddings, + persist_directory=persist_dir, + ) + user_input = state.get("user_input", "") + docs = vectorstore.similarity_search(user_input, k=5) + context_parts = [] + for d in docs: + context_parts.append(d.page_content) + state["retrieved_context"] = "\n\n---\n\n".join(context_parts) + except Exception: + state["retrieved_context"] = "" + return state + + +def generate(state: AgentState) -> Dict: + """根据用户需求和检索到的上下文生成初始 JRXML。""" + llm = get_llm() + prompt = INITIAL_GENERATION_PROMPT.format( + context=state.get("retrieved_context", ""), + user_request=state.get("user_input", ""), + ) + resp = llm.invoke(prompt) + jrxml = _extract_jrxml(resp.content) + state["current_jrxml"] = jrxml + state["conversation_history"].append({"role": "assistant", "content": jrxml}) + return state + + +def modify_jrxml(state: AgentState) -> Dict: + """根据用户的修改请求修改现有 JRXML。""" + llm = get_llm() + # 构建对话上下文:压缩摘要 + 最近对话 + compressed = state.get("compressed_history", "") + recent = state.get("conversation_history", [])[-6:] + conv_parts = [] + if compressed: + conv_parts.append(f"[早期对话摘要]\n{compressed}") + conv_parts.append(json.dumps(recent, ensure_ascii=False, indent=2)) + conv_text = "\n\n---\n\n".join(conv_parts) + + prompt = MODIFICATION_PROMPT.format( + current_jrxml=state.get("current_jrxml", ""), + conversation_history=conv_text, + modification_request=state.get("user_modification_request", ""), + ) + resp = llm.invoke(prompt) + jrxml = _extract_jrxml(resp.content) + state["current_jrxml"] = jrxml + state["conversation_history"].append( + { + "role": "user", + "content": state.get("user_modification_request", ""), + } + ) + state["conversation_history"].append({"role": "assistant", "content": jrxml}) + state["full_conversation_history"] = ( + 
list(state.get("full_conversation_history", [])) + + [ + {"role": "user", "content": state.get("user_modification_request", ""), "ts": _now_iso()}, + {"role": "assistant", "content": jrxml, "ts": _now_iso()}, + ] + ) + state["retry_count"] = 0 + return state + + +def validate(state: AgentState) -> Dict: + """根据 FastAPI 验证服务验证当前 JRXML。""" + jrxml = state.get("current_jrxml", "") + if not jrxml: + state["status"] = "fail" + state["error_msg"] = "没有 JRXML 内容可供验证。" + return state + + result = validate_jrxml(jrxml) + state["status"] = "pass" if result.get("valid") else "fail" + state["error_msg"] = result.get("error", "") + return state + + +def explain_error(state: AgentState) -> Dict: + """生成验证错误的可读解释。""" + llm = get_llm() + jrxml = state.get("current_jrxml", "") + lines = jrxml.split("\n")[:80] + snippet = "\n".join(lines) + + prompt = EXPLAIN_PROMPT.format( + error_msg=state.get("error_msg", "未知错误"), + jrxml_snippet=snippet, + ) + resp = llm.invoke(prompt) + state["natural_explanation"] = resp.content.strip() + return state + + +def correct_jrxml(state: AgentState) -> Dict: + """尝试自动修正验证失败的 JRXML。""" + llm = get_llm() + prompt = CORRECTION_PROMPT.format( + current_jrxml=state.get("current_jrxml", ""), + error_msg=state.get("error_msg", ""), + explanation=state.get("natural_explanation", ""), + ) + resp = llm.invoke(prompt) + jrxml = _extract_jrxml(resp.content) + state["current_jrxml"] = jrxml + state["retry_count"] = state.get("retry_count", 0) + 1 + state["conversation_history"].append( + {"role": "assistant", "content": f"[自动修正,第 {state['retry_count']} 次尝试]\n{jrxml}"} + ) + return state + + +def finalize(state: AgentState) -> Dict: + """保存最终验证通过的 JRXML 并更新对话历史。""" + state["final_jrxml"] = state.get("current_jrxml", "") + return state + + +def _extract_jrxml(text: str) -> str: + """从 LLM 响应中提取 JRXML 内容,如有 markdown 标记则去除。""" + text = text.strip() + xml_pattern = re.compile(r"```(?:xml|jrxml)?\s*([\s\S]*?)```", re.IGNORECASE) + m = xml_pattern.search(text) + if m: + 
return m.group(1).strip() + + jasper_tag = re.search(r"(<\?xml[\s\S]*?)", text, re.IGNORECASE) + if jasper_tag: + return jasper_tag.group(1).strip() + + if text.startswith(" is empty. + +FIX: Add a SQL query inside CDATA: diff --git a/data/corrections/missing_field.jrxml b/data/corrections/missing_field.jrxml new file mode 100644 index 0000000..3b1b7be --- /dev/null +++ b/data/corrections/missing_field.jrxml @@ -0,0 +1,6 @@ +# Error case: field used in expression but not declared +# Correction: add field declaration + +ERROR: textFieldExpression uses $F{total_amount} but no declared. + +FIX: Add to the field declarations section. diff --git a/data/corrections/missing_page_size.jrxml b/data/corrections/missing_page_size.jrxml new file mode 100644 index 0000000..643f1fc --- /dev/null +++ b/data/corrections/missing_page_size.jrxml @@ -0,0 +1,6 @@ +# Error case: jasperReport missing pageWidth and pageHeight +# Correction: add page dimensions + +ERROR: has no pageWidth/pageHeight attributes. + +FIX: Add pageWidth="595" pageHeight="842" to the root element. 
diff --git a/data/sample_templates/employee_roster.jrxml b/data/sample_templates/employee_roster.jrxml new file mode 100644 index 0000000..147062f --- /dev/null +++ b/data/sample_templates/employee_roster.jrxml @@ -0,0 +1,69 @@ + + + + + + + + + + + <band height="50"> + <staticText> + <reportElement x="0" y="10" width="555" height="30"/> + <textElement textAlignment="Center"> + <font size="16" isBold="true"/> + </textElement> + <text><![CDATA[Employee Roster]]></text> + </staticText> + </band> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/data/sample_templates/inventory_list.jrxml b/data/sample_templates/inventory_list.jrxml new file mode 100644 index 0000000..e09136b --- /dev/null +++ b/data/sample_templates/inventory_list.jrxml @@ -0,0 +1,79 @@ + + + + + + + + + + + + <band height="50"> + <staticText> + <reportElement x="0" y="10" width="555" height="30"/> + <textElement textAlignment="Center"> + <font size="16" isBold="true"/> + </textElement> + <text><![CDATA[Inventory List]]></text> + </staticText> + </band> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/data/sample_templates/report_with_summary.jrxml b/data/sample_templates/report_with_summary.jrxml new file mode 100644 index 0000000..a4ba2a2 --- /dev/null +++ b/data/sample_templates/report_with_summary.jrxml @@ -0,0 +1,89 @@ + + + + + + + + + + + + + + + + <band height="60"> + <staticText> + <reportElement x="0" y="10" width="555" height="30"/> + <textElement textAlignment="Center"> + <font size="16" isBold="true"/> + </textElement> + <text><![CDATA[Department Sales Summary]]></text> + </staticText> + </band> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/data/sample_templates/sales_order.jrxml b/data/sample_templates/sales_order.jrxml new file mode 100644 index 0000000..7651695 --- 
/dev/null +++ b/data/sample_templates/sales_order.jrxml @@ -0,0 +1,69 @@ + + + + + + + + + + + <band height="50"> + <staticText> + <reportElement x="0" y="10" width="555" height="30"/> + <textElement textAlignment="Center"> + <font size="16" isBold="true"/> + </textElement> + <text><![CDATA[Sales Orders Report]]></text> + </staticText> + </band> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/scripts/init_kb.py b/scripts/init_kb.py new file mode 100644 index 0000000..7aef308 --- /dev/null +++ b/scripts/init_kb.py @@ -0,0 +1,87 @@ +"""初始化 Chroma 知识库,加载示例 JRXML 模板和错误修正案例。 + +用法: python scripts/init_kb.py +""" + +import os +import sys +from pathlib import Path + +from dotenv import load_dotenv + +sys.path.insert(0, str(Path(__file__).parent.parent)) + +load_dotenv() + +from backend.embeddings import get_embeddings + + +def load_templates(template_dir: Path) -> list[dict]: + docs = [] + for fpath in template_dir.glob('*.jrxml'): + content = fpath.read_text(encoding='utf-8') + name = fpath.stem + docs.append({ + 'content': content, + 'metadata': { + 'source': str(fpath), + 'type': 'full_report', + 'name': name, + }, + }) + return docs + + +def load_corrections(corrections_dir: Path) -> list[dict]: + docs = [] + for fpath in corrections_dir.glob('*.jrxml'): + content = fpath.read_text(encoding='utf-8') + docs.append({ + 'content': content, + 'metadata': { + 'source': str(fpath), + 'type': 'correction_case', + 'name': fpath.stem, + }, + }) + return docs + + +def main(): + persist_dir = os.getenv('CHROMA_PERSIST_DIR', './db/chroma') + data_dir = Path(__file__).parent.parent / 'data' + + template_dir = data_dir / 'sample_templates' + corrections_dir = data_dir / 'corrections' + + docs = [] + if template_dir.exists(): + docs.extend(load_templates(template_dir)) + print(f'从 {template_dir} 加载了 {len(docs)} 个模板') + + if corrections_dir.exists(): + corr = load_corrections(corrections_dir) + docs.extend(corr) + print(f'从 
{corrections_dir} 加载了 {len(corr)} 个修正案例') + + if not docs: + print('未找到文档,无需索引。') + return + + embeddings = get_embeddings() + from langchain_chroma import Chroma + + texts = [d['content'] for d in docs] + metadatas = [d['metadata'] for d in docs] + + Chroma.from_texts( + texts=texts, + embedding=embeddings, + metadatas=metadatas, + persist_directory=persist_dir, + ) + print(f'已将 {len(docs)} 个文档索引到 Chroma,存储位置: {persist_dir}') + + +if __name__ == '__main__': + main() diff --git a/validation_service/main.py b/validation_service/main.py new file mode 100644 index 0000000..9b4de3d --- /dev/null +++ b/validation_service/main.py @@ -0,0 +1,129 @@ +"""JRXML 文件验证服务(FastAPI)。 + +使用 lxml XML Schema 验证作为 JasperReports 7.0.6 编译验证的第一阶段后备方案。 +要进行完整的编译验证,需要基于 Java 的验证器以及 JasperReports 7.0.6 + JDK 21。 + +启动: uvicorn validation_service.main:app --port 8001 +""" + +import re +import xml.etree.ElementTree as ET +from pathlib import Path + +from fastapi import FastAPI +from lxml import etree +from pydantic import BaseModel + +app = FastAPI(title="JRXML 验证服务") + +SCHEMA_DIR = Path(__file__).parent / "schemas" +SCHEMA_FILE = SCHEMA_DIR / "jasperreport_7_0_6.xsd" + + +class ValidationRequest(BaseModel): + jrxml: str + + +class ValidationResponse(BaseModel): + valid: bool + error: str + + +def _check_structural_issues(jrxml: str) -> list[str]: + """检查 JRXML 中常见的结构性问题。""" + issues = [] + root = None + + try: + root = ET.fromstring(jrxml) + except ET.ParseError as e: + issues.append(f"XML 解析错误:{e}") + return issues + + # 同时处理带命名空间和不带命名空间的元素名 + ns = "http://jasperreports.sourceforge.net/jasperreports" + + declared_fields = set() + for elem in root.iter(): + tag = elem.tag.split("}")[-1] if "}" in elem.tag else elem.tag + if tag == "field": + name = elem.get("name") + if name: + declared_fields.add(name) + + field_expr_pattern = re.compile(r'\$F\{(\w+)\}') + for m in field_expr_pattern.finditer(jrxml): + field_name = m.group(1) + if field_name not in declared_fields: + issues.append( + f"字段 
'{field_name}' 在表达式中使用但未在 部分声明" + ) + + query = None + for elem in root.iter(): + tag = elem.tag.split("}")[-1] if "}" in elem.tag else elem.tag + if tag == "queryString": + query = elem + break + if query is not None: + query_text = "".join(query.itertext()).strip() + if not query_text: + issues.append(" 为空 - 请在 CDATA 中添加 SQL 查询") + elif not any(kw in query_text.upper() for kw in ["SELECT"]): + issues.append(" 似乎不包含 SQL SELECT 查询") + + if not root.get("pageWidth"): + issues.append("缺少 上的 pageWidth 属性") + if not root.get("pageHeight"): + issues.append("缺少 上的 pageHeight 属性") + if not root.get("name"): + issues.append("缺少 上的 'name' 属性") + + return issues + + +def _validate_xsd(jrxml: str) -> tuple[bool, str]: + """根据 JasperReports XSD schema 验证 JRXML。""" + if not SCHEMA_FILE.exists(): + return True, "" + + try: + schema_doc = etree.parse(str(SCHEMA_FILE)) + xmlschema = etree.XMLSchema(schema_doc) + doc = etree.fromstring(jrxml.encode("utf-8"), parser=etree.XMLParser(resolve_entities=False, no_network=True)) + xmlschema.assertValid(doc) + return True, "" + except etree.DocumentInvalid as e: + return False, str(e) + except etree.XMLSchemaError as e: + return False, f"Schema 错误:{e}" + except Exception as e: + return False, f"XML 验证错误:{e}" + + +@app.post("/validate", response_model=ValidationResponse) +async def validate_jrxml(req: ValidationRequest): + jrxml = req.jrxml.strip() + if not jrxml: + return ValidationResponse(valid=False, error="JRXML 内容为空") + + structural_issues = _check_structural_issues(jrxml) + if structural_issues: + return ValidationResponse(valid=False, error="; ".join(structural_issues)) + + valid, xsd_error = _validate_xsd(jrxml) + if not valid: + return ValidationResponse(valid=False, error=xsd_error) + + return ValidationResponse(valid=True, error="") + + +@app.get("/health") +async def health(): + schema_available = SCHEMA_FILE.exists() + return { + "status": "ok", + "schema_available": schema_available, + "validation_type": "XSD" if schema_available else "仅结构检查", + "note": "如需完整的 JasperReports 7.0.6
编译验证,请使用基于 Java 的验证器", + } diff --git a/validation_service/validate.bat b/validation_service/validate.bat new file mode 100644 index 0000000..113a848 --- /dev/null +++ b/validation_service/validate.bat @@ -0,0 +1,6 @@ +@echo off +echo 正在启动 JRXML 验证服务... +echo. +cd /d "%~dp0.." +python -m uvicorn validation_service.main:app --host 0.0.0.0 --port 8001 --reload +pause