From 52eed4d0101d6299bfa012d4bd87a6a7f2990415 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=A9=AC=E4=B8=80=E4=B8=81?= <1769123563@qq.com> Date: Fri, 14 Nov 2025 17:55:28 +0800 Subject: [PATCH] Increase DeepSeek Compatibility --- ReportEngine/agent.py | 15 +- ReportEngine/nodes/__init__.py | 3 +- ReportEngine/nodes/chapter_generation_node.py | 251 +++++++++++++++++- ReportEngine/renderers/html_renderer.py | 203 +++++++++++++- 4 files changed, 460 insertions(+), 12 deletions(-) diff --git a/ReportEngine/agent.py b/ReportEngine/agent.py index 1d49cf2..2dc28a1 100644 --- a/ReportEngine/agent.py +++ b/ReportEngine/agent.py @@ -29,6 +29,7 @@ from .nodes import ( TemplateSelectionNode, ChapterGenerationNode, ChapterJsonParseError, + ChapterContentError, DocumentLayoutNode, WordBudgetNode, ) @@ -438,20 +439,26 @@ class ReportAgent: stream_callback=chunk_callback ) break - except ChapterJsonParseError as parse_error: + except (ChapterJsonParseError, ChapterContentError) as structured_error: + error_kind = ( + "content_sparse" if isinstance(structured_error, ChapterContentError) else "json_parse" + ) + readable_label = "内容密度异常" if error_kind == "content_sparse" else "JSON解析失败" logger.warning( - "章节 %s JSON解析失败(第 %s/%s 次尝试): %s", + "章节 %s %s(第 %s/%s 次尝试): %s", section.title, + readable_label, attempt, chapter_max_attempts, - parse_error, + structured_error, ) emit('chapter_status', { 'chapterId': section.chapter_id, 'title': section.title, 'status': 'retrying' if attempt < chapter_max_attempts else 'error', 'attempt': attempt, - 'error': str(parse_error), + 'error': str(structured_error), + 'reason': error_kind, }) if attempt >= chapter_max_attempts: raise diff --git a/ReportEngine/nodes/__init__.py b/ReportEngine/nodes/__init__.py index a24dc88..1f1b538 100644 --- a/ReportEngine/nodes/__init__.py +++ b/ReportEngine/nodes/__init__.py @@ -6,7 +6,7 @@ Report Engine节点处理模块。 from .base_node import BaseNode, StateMutationNode from .template_selection_node import TemplateSelectionNode -from .chapter_generation_node import ChapterGenerationNode, ChapterJsonParseError +from .chapter_generation_node import ChapterGenerationNode, ChapterJsonParseError, ChapterContentError from .document_layout_node import DocumentLayoutNode from .word_budget_node import WordBudgetNode @@ -16,6 +16,7 @@ __all__ = [ "TemplateSelectionNode", "ChapterGenerationNode", "ChapterJsonParseError", + "ChapterContentError", "DocumentLayoutNode", "WordBudgetNode", ] diff --git a/ReportEngine/nodes/chapter_generation_node.py b/ReportEngine/nodes/chapter_generation_node.py index a87c324..d647c23 100644 --- a/ReportEngine/nodes/chapter_generation_node.py +++ b/ReportEngine/nodes/chapter_generation_node.py @@ -36,6 +36,14 @@ class ChapterJsonParseError(ValueError): self.raw_text = raw_text +class ChapterContentError(ValueError): + """ + 章节内容稀疏异常。 + + 当LLM仅输出标题或正文不足以支撑一章时触发,驱动重试以保证报告质量。 + """ + + class ChapterGenerationNode(BaseNode): """ 负责按章节调用LLM并校验JSON结构。 @@ -71,6 +79,12 @@ class ChapterGenerationNode(BaseNode): "sub": "subscript", "sup": "superscript", } + # 章节若仅包含标题或字符过少则视为失败,强制LLM重新生成 + _MIN_NON_HEADING_BLOCKS = 2 + _MIN_BODY_CHARACTERS = 400 + _PARAGRAPH_FRAGMENT_MAX_CHARS = 80 + _PARAGRAPH_FRAGMENT_NO_TERMINATOR_MAX_CHARS = 240 + _TERMINATION_PUNCTUATION = set("。!?!?;;……") def __init__(self, llm_client, validator: IRValidator, storage: ChapterStorage): """ @@ -121,17 +135,32 @@ class ChapterGenerationNode(BaseNode): self._sanitize_chapter_blocks(chapter_json) valid, errors = self.validator.validate_chapter(chapter_json) + content_error: ChapterContentError | None = None + if valid: + try: + self._ensure_content_density(chapter_json) + except ChapterContentError as exc: + content_error = exc + + error_messages: List[str] = [] + if not valid and errors: + error_messages.extend(errors) + if content_error: + error_messages.append(str(content_error)) + self.storage.persist_chapter( run_dir, chapter_meta, chapter_json, - errors=None if valid else errors, + errors=None if not error_messages else error_messages, ) if not valid: raise ValueError( f"{section.title} 章节JSON校验失败: {'; '.join(errors[:5])}" ) + if content_error: + raise content_error return chapter_json @@ -488,6 +517,97 @@ class ChapterGenerationNode(BaseNode): walk(chapter.get("blocks")) + blocks = chapter.get("blocks") + if isinstance(blocks, list): + chapter["blocks"] = self._merge_fragment_sequences(blocks) + + def _ensure_content_density(self, chapter: Dict[str, Any]): + """ + 校验章节正文密度。 + + 若blocks缺失、除标题外无有效区块,或正文字符数低于阈值, + 则视为章节内容异常,触发ChapterContentError以便上游重试。 + """ + blocks = chapter.get("blocks") + if not isinstance(blocks, list) or not blocks: + raise ChapterContentError("章节缺少正文区块,无法输出内容") + + non_heading_blocks = [ + block + for block in blocks + if isinstance(block, dict) + and block.get("type") not in {"heading", "divider", "toc"} + ] + body_characters = self._count_body_characters(blocks) + + if len(non_heading_blocks) < self._MIN_NON_HEADING_BLOCKS or body_characters < self._MIN_BODY_CHARACTERS: + raise ChapterContentError( + f"{chapter.get('title') or '该章节'} 正文不足:有效区块 {len(non_heading_blocks)} 个,估算字符数 {body_characters}" + ) + + def _count_body_characters(self, blocks: Any) -> int: + """ + 递归统计正文字符数。 + + - 忽略heading/divider/widget等非正文类型; + - 对paragraph/list/table/callout等结构抽取嵌套文本; + - 仅用于粗粒度判断篇幅是否合理。 + """ + + def walk(node: Any) -> int: + if node is None: + return 0 + if isinstance(node, list): + return sum(walk(item) for item in node) + if isinstance(node, str): + return len(node.strip()) + if not isinstance(node, dict): + return 0 + + block_type = node.get("type") + if block_type in {"heading", "divider", "toc", "widget"}: + return 0 + + if block_type == "paragraph": + inlines = node.get("inlines") + if isinstance(inlines, list): + total = 0 + for run in inlines: + if isinstance(run, dict): + text = run.get("text") + if isinstance(text, str): + total += len(text.strip()) + return total + text_value = node.get("text") + if isinstance(text_value, str): + return len(text_value.strip()) + return len(self._extract_block_text(node).strip()) + + if block_type == "list": + total = 0 + for item in node.get("items", []): + total += walk(item) + return total + + if block_type in {"blockquote", "callout"}: + return walk(node.get("blocks")) + + if block_type == "table": + total = 0 + for row in node.get("rows", []): + cells = row.get("cells") or [] + for cell in cells: + total += walk(cell.get("blocks")) + return total + + nested = node.get("blocks") + if isinstance(nested, list): + return walk(nested) + + return len(self._extract_block_text(node).strip()) + + return walk(blocks) + def _sanitize_block_content(self, block: Dict[str, Any]): """根据类型做精细化修复,例如清理paragraph内的非法inline mark""" block_type = block.get("type") @@ -505,7 +625,134 @@ class ChapterGenerationNode(BaseNode): normalized_runs = [self._as_inline_run(self._extract_block_text(block))] if not normalized_runs: normalized_runs = [self._as_inline_run("")] - block["inlines"] = normalized_runs + block["inlines"] = self._strip_inline_artifacts(normalized_runs) + + def _strip_inline_artifacts(self, inlines: List[Dict[str, Any]]) -> List[Dict[str, Any]]: + """移除被LLM误写入的JSON哨兵文本,防止渲染出`{\"type\": \"\"}`等垃圾字符""" + cleaned: List[Dict[str, Any]] = [] + for run in inlines or []: + if not isinstance(run, dict): + continue + text = run.get("text") + if isinstance(text, str): + stripped = text.strip() + if stripped.startswith("{") and stripped.endswith("}"): + try: + payload = json.loads(stripped) + except json.JSONDecodeError: + payload = None + if isinstance(payload, dict) and set(payload.keys()).issubset({"type", "value"}): + continue + cleaned.append(run) + return cleaned or [self._as_inline_run("")] + + def _merge_fragment_sequences(self, blocks: List[Dict[str, Any]]) -> List[Dict[str, Any]]: + """合并被LLM拆成多段的句子片段,避免HTML出现大量孤立
""" + if not isinstance(blocks, list): + return blocks + + merged: List[Dict[str, Any]] = [] + fragment_buffer: List[Dict[str, Any]] = [] + + def flush_buffer(): + nonlocal fragment_buffer + if not fragment_buffer: + return + if len(fragment_buffer) == 1: + merged.append(fragment_buffer[0]) + else: + merged.append(self._combine_paragraph_fragments(fragment_buffer)) + fragment_buffer = [] + + for block in blocks: + if self._is_paragraph_fragment(block): + fragment_buffer.append(block) + continue + flush_buffer() + merged.append(self._merge_nested_fragments(block)) + + flush_buffer() + return merged + + def _merge_nested_fragments(self, block: Dict[str, Any]) -> Dict[str, Any]: + """对嵌套结构(callout/list/table)递归处理片段合并""" + block_type = block.get("type") + if block_type in {"callout", "blockquote"}: + nested = block.get("blocks") + if isinstance(nested, list): + block["blocks"] = self._merge_fragment_sequences(nested) + elif block_type == "list": + items = block.get("items") + if isinstance(items, list): + for entry in items: + if isinstance(entry, list): + merged_entry = self._merge_fragment_sequences(entry) + entry[:] = merged_entry + elif block_type == "table": + for row in block.get("rows", []): + cells = row.get("cells") or [] + for cell in cells: + nested_blocks = cell.get("blocks") + if isinstance(nested_blocks, list): + cell["blocks"] = self._merge_fragment_sequences(nested_blocks) + return block + + def _combine_paragraph_fragments(self, fragments: List[Dict[str, Any]]) -> Dict[str, Any]: + """将多个句子片段合并为单个paragraph block""" + template = dict(fragments[0]) + combined_inlines: List[Dict[str, Any]] = [] + for fragment in fragments: + runs = fragment.get("inlines") + if isinstance(runs, list) and runs: + combined_inlines.extend(runs) + else: + fallback_text = self._extract_block_text(fragment) + combined_inlines.append(self._as_inline_run(fallback_text)) + if not combined_inlines: + combined_inlines.append(self._as_inline_run("")) + template["inlines"] = combined_inlines + return template + + def _is_paragraph_fragment(self, block: Dict[str, Any]) -> bool: + """判断paragraph是否为被错误拆分的短片段""" + if not isinstance(block, dict) or block.get("type") != "paragraph": + return False + inlines = block.get("inlines") + text = "" + has_marks = False + if isinstance(inlines, list) and inlines: + parts: List[str] = [] + for run in inlines: + if not isinstance(run, dict): + continue + parts.append(str(run.get("text") or "")) + marks = run.get("marks") + if isinstance(marks, list) and any(marks): + has_marks = True + text = "".join(parts) + else: + text = self._extract_block_text(block) + stripped = (text or "").strip() + if not stripped: + return True + if has_marks: + return False + if "\n" in stripped: + return False + + short_limit = self._PARAGRAPH_FRAGMENT_MAX_CHARS + long_limit = getattr( + self, + "_PARAGRAPH_FRAGMENT_NO_TERMINATOR_MAX_CHARS", + short_limit * 3, + ) + + if stripped[-1] in self._TERMINATION_PUNCTUATION: + return len(stripped) <= short_limit + + if len(stripped) > long_limit: + return False + return True def _coerce_inline_run(self, run: Any) -> List[Dict[str, Any]]: """将任意inline写法规整为合法run""" diff --git a/ReportEngine/renderers/html_renderer.py b/ReportEngine/renderers/html_renderer.py index 5930563..e41447c 100644 --- a/ReportEngine/renderers/html_renderer.py +++ b/ReportEngine/renderers/html_renderer.py @@ -5,6 +5,7 @@ from __future__ import annotations import ast +import copy import html import json from typing import Any, Dict, List @@ -19,6 +20,31 @@ class HTMLRenderer: - 提供主题变量、编号映射等辅助功能。 """ + CALLOUT_ALLOWED_TYPES = { + "paragraph", + "list", + "table", + "blockquote", + "code", + "math", + "figure", + "kpiGrid", + } + INLINE_ARTIFACT_KEYS = { + "props", + "widgetId", + "widgetType", + "data", + "dataRef", + "datasets", + "labels", + "config", + "options", + } + TABLE_COMPLEX_CHARS = set( + "@%%()(),,。;;::、??!!·…-—_+<>[]{}|\\/\"'`~$^&*#" + ) + def __init__(self, config: Dict[str, Any] | None = None): """初始化渲染器缓存并允许注入额外配置(如主题覆盖)""" self.config = config or {} @@ -72,6 +98,7 @@ class HTMLRenderer: