fix: per-node max_tokens + validation 502 guard + correct_jrxml output validity

- backend/llm.py: per-node max_tokens via get_llm(max_tokens=N), LLM_MAX_TOKENS env var (default 8192)
- agent/nodes.py: 5 generation nodes use max_tokens=32768, generate_skeleton retries at 65536
- agent/nodes.py: fix ns:field regex (<field → <[\w:]*field) to handle namespace prefixes
- agent/nodes.py: fix correct_jrxml never writing back to state["current_jrxml"]
- agent/nodes.py: correct_jrxml rejects non-JRXML output (contains neither a <jasperReport tag nor an <?xml declaration) and falls back to the previous version
- agent/nodes.py: _strip_continuation_wrapper strips markdown/prefixes from continuation rounds
- agent/nodes.py: _extract_jrxml iterates multiple markdown code blocks, skips fragments
- agent/graph.py: route_after_validate skips correction loop when service_unavailable
- agent/graph.py: route_after_save skips validation for empty JRXML
- backend/validation.py: returns service_unavailable: True for ConnectError and HTTP 5xx
- Docs: CLAUDE.md v14 changelog, README.md LLM_MAX_TOKENS, .env.example LLM_MAX_TOKENS
This commit is contained in:
2026-05-24 15:20:25 +08:00
parent e362f530ea
commit 4e14334030
8 changed files with 388 additions and 32 deletions
+72 -21
View File
@@ -543,9 +543,9 @@ def _programmatic_map_fields(jrxml: str, ocr_fields: list[dict]) -> str:
real_name = _sanitize_field_name(raw_name)
if real_name == placeholder:
continue
# 替换 field 声明: <field name="field_1" → <field name="customer_name"
# 替换 field 声明: <ns0:field name="field_1" → <ns0:field name="customer_name"
result = re.sub(
rf'(<field\b[^>]*\bname\s*=\s*"){re.escape(placeholder)}(")',
rf'(<[\w:]*field\b[^>]*\bname\s*=\s*"){re.escape(placeholder)}(")',
rf'\g<1>{real_name}\g<2>', result,
)
# 替换所有引用: $F{{field_1}} → $F{{customer_name}}
@@ -821,7 +821,7 @@ def generate(state: AgentState) -> Dict:
from langgraph.config import get_stream_writer
writer = get_stream_writer()
llm = get_llm(caller="generate")
llm = get_llm(caller="generate", max_tokens=32768)
user_request = state.get("user_input", "")
ocr_text = _format_ocr_context(state)
@@ -849,7 +849,6 @@ def generate_skeleton(state: AgentState) -> Dict:
from langgraph.config import get_stream_writer
writer = get_stream_writer()
llm = get_llm(caller="generate_skeleton")
schema = state.get("layout_schema", {})
schema_text = schema.get("schema_text", "") if isinstance(schema, dict) else ""
@@ -861,10 +860,16 @@ def generate_skeleton(state: AgentState) -> Dict:
user_request=user_request,
template_context=_build_template_context(state),
)
llm = get_llm(caller="generate_skeleton", max_tokens=32768)
prev_jrxml = state.get("current_jrxml", "")
full_text = _generate_with_continuation(llm, prompt, writer, "generate_skeleton")
if not full_text.strip():
_node_log.error("generate_skeleton LLM 返回空响应")
_node_log.warning("generate_skeleton 首次返回空响应,以更高 max_tokens 重试")
llm = get_llm(caller="generate_skeleton", max_tokens=65536)
full_text = _generate_with_continuation(llm, prompt, writer, "generate_skeleton")
if not full_text.strip():
_node_log.error("generate_skeleton LLM 返回空响应(含重试)")
return state
jrxml = _extract_jrxml(full_text)
if len(jrxml.strip()) < 200:
@@ -1025,7 +1030,7 @@ def modify_jrxml(state: AgentState) -> Dict:
from langgraph.config import get_stream_writer
writer = get_stream_writer()
llm = get_llm(caller="modify_jrxml")
llm = get_llm(caller="modify_jrxml", max_tokens=32768)
# 构建对话上下文:压缩摘要 + 最近对话
compressed = state.get("compressed_history", "")
recent = state.get("conversation_history", [])[-6:]
@@ -1278,6 +1283,7 @@ def validate(state: AgentState) -> Dict:
result = validate_jrxml(jrxml)
state["status"] = "pass" if result.get("valid") else "fail"
state["error_msg"] = result.get("error", "")
state["service_unavailable"] = result.get("service_unavailable", False)
# OCR 保真度检查:比对生成结果与原始图片的 OCR 提取内容
fidelity = _check_ocr_fidelity(jrxml, state)
@@ -1378,7 +1384,7 @@ def correct_jrxml(state: AgentState) -> Dict:
from langgraph.config import get_stream_writer
writer = get_stream_writer()
llm = get_llm(caller="correct_jrxml")
llm = get_llm(caller="correct_jrxml", max_tokens=32768)
ocr_context = _format_ocr_context(state)
layout_schema = state.get("layout_schema", {})
layout_text = ""
@@ -1432,6 +1438,13 @@ def correct_jrxml(state: AgentState) -> Dict:
_node_log.warning(f"correct_jrxml 输出过短({len(jrxml)} 字符),回退到前一版本")
jrxml = prev_jrxml
# 如果提取结果不是合法 JRXML(不含 <jasperReport),说明 LLM 返回了 HTML 等垃圾输出
if jrxml and "<jasperReport" not in jrxml and "<?xml" not in jrxml:
_node_log.warning(
f"correct_jrxml 输出不是合法 JRXML{jrxml[:100]}),回退到前一版本"
)
jrxml = prev_jrxml
# 去重检测:如果输出与输入完全相同(忽略空白差异),说明修正无效
_prev_norm = re.sub(r"\s+", "", prev_jrxml) if prev_jrxml else ""
_new_norm = re.sub(r"\s+", "", jrxml) if jrxml else ""
@@ -1442,6 +1455,7 @@ def correct_jrxml(state: AgentState) -> Dict:
state["retry_count"] = state.get("retry_count", 0) + 2
else:
state["retry_count"] = state.get("retry_count", 0) + 1
state["current_jrxml"] = jrxml
state["conversation_history"].append(
{"role": "assistant", "content": f"[自动修正,第 {state['retry_count']} 次尝试]\n{jrxml}"}
)
@@ -1510,6 +1524,31 @@ def finalize(state: AgentState) -> Dict:
return state
def _strip_continuation_wrapper(text: str) -> str:
"""去除续写响应中的 markdown 代码块标记和自然语言解释。
续写轮次的 LLM 可能会"忘记"原始 prompt 中的格式要求,
在响应开头加解释文字、用 ``` 包裹 XML 片段。
此函数提取其中的纯 XML 内容,去除包装。
"""
text = text.strip()
# 移除完整的 markdown 代码块包装: ```...```
m = re.search(r"```(?:xml|jrxml)?\s*([\s\S]*?)```", text, re.IGNORECASE)
if m:
inner = m.group(1).strip()
if inner:
return inner
# 移除开头/结尾的独立 ``` 标记(不完整代码块)
text = re.sub(r"^```(?:xml|jrxml)?\s*", "", text)
text = re.sub(r"```\s*$", "", text)
# 移除续写响应常见的自然语言前缀
text = re.sub(
r"^.{0,40}(继续输出|剩余|续写|补全|接上).{0,30}[:]?\s*",
"", text, flags=re.IGNORECASE
)
return text.strip()
def _generate_with_continuation(llm, prompt, writer, node_name, max_rounds=3) -> str:
"""Stream LLM generation with automatic truncation recovery.
@@ -1519,6 +1558,7 @@ def _generate_with_continuation(llm, prompt, writer, node_name, max_rounds=3) ->
Returns combined full text from all rounds.
"""
_jrxml_end = r"</(?:[\w:]+:)?(?:jasperReport|report)>\s*$"
full_text = ""
for round_num in range(max_rounds):
@@ -1529,7 +1569,8 @@ def _generate_with_continuation(llm, prompt, writer, node_name, max_rounds=3) ->
current_prompt = (
f"[系统指令] 你正在生成的 JRXML 在上一次响应中被截断。\n"
f"已生成内容的最后部分(请从此处继续):\n...{tail}\n\n"
f"请从截断点继续输出剩余内容,不要重复已输出的部分。"
f"请从截断点继续输出剩余内容,不要重复已输出的部分。\n"
f"不要输出 markdown 代码块、解释或任何非 JRXML 的内容。"
)
new_chunks = []
@@ -1538,10 +1579,12 @@ def _generate_with_continuation(llm, prompt, writer, node_name, max_rounds=3) ->
writer({"type": "stream", "node": node_name, "text": chunk})
new_text = "".join(new_chunks)
if round_num > 0:
new_text = _strip_continuation_wrapper(new_text)
full_text += new_text
jrxml = _extract_jrxml(full_text)
if re.search(r"</(?:[\w:]+:)?jasperReport>\s*$", jrxml, re.IGNORECASE):
if re.search(_jrxml_end, jrxml, re.IGNORECASE):
break
if not new_text.strip():
@@ -1554,17 +1597,26 @@ def _generate_with_continuation(llm, prompt, writer, node_name, max_rounds=3) ->
def _extract_jrxml(text: str) -> str:
"""从 LLM 响应中提取 JRXML 内容,如有 markdown 标记则去除。"""
text = text.strip()
xml_pattern = re.compile(r"```(?:xml|jrxml)?\s*([\s\S]*?)```", re.IGNORECASE)
m = xml_pattern.search(text)
if m:
content = m.group(1).strip()
if content:
return content
# markdown 代码块存在但内容为空 — 回退到直接匹配
"""从 LLM 响应中提取 JRXML 内容,如有 markdown 标记则去除。
_jrxml_close = r"</(?:[\w:]+:)?jasperReport>"
处理多种情况:
1. 完整的 markdown 代码块包裹(单轮输出)
2. 混合文本(多轮续写:第一轮无 markdown,续写轮添加了 markdown
3. 纯 JRXML 无包装
"""
text = text.strip()
# 检测并提取 markdown 代码块中的内容
# 如果第一个代码块的内容看起来是完整 JRXML(以 <?xml 或 <jasperReport 开头),
# 则返回它;否则跳过该块,回退到其他提取方式。
xml_pattern = re.compile(r"```(?:xml|jrxml)?\s*([\s\S]*?)```", re.IGNORECASE)
for m in xml_pattern.finditer(text):
content = m.group(1).strip()
if content and (content.startswith("<?xml") or content.startswith("<jasperReport")):
return content
# 非完整 JRXML 片段 — 跳过,继续搜索后续代码块
# 直接匹配 <?xml ... </jasperReport> 或 ... </report>
_jrxml_close = r"</(?:[\w:]+:)?(?:jasperReport|report)>"
jasper_tag = re.search(rf"(<\?xml[\s\S]*?{_jrxml_close})", text, re.IGNORECASE)
if jasper_tag:
return jasper_tag.group(1).strip()
@@ -1572,8 +1624,7 @@ def _extract_jrxml(text: str) -> str:
if text.startswith("<?xml") or text.startswith("<jasperReport"):
return text
# 最终回退:如果文本中包含 XML 片段但没有被捕获到,尝试直接提取
# 这处理 LLM 在代码块外用自然语言"包裹"JRXML 的情况
# 最终回退:尝试在文本中定位 XML 起始和结束
xml_start = text.find("<?xml")
jr_close = re.search(_jrxml_close, text, re.IGNORECASE)
if xml_start >= 0 and jr_close: