Increase DeepSeek Compatibility

2025-11-14 17:55:28 +08:00
parent e267b1fc04
commit 52eed4d010
4 changed files with 460 additions and 12 deletions
@@ -6,7 +6,7 @@ Report Engine节点处理模块。

 from .base_node import BaseNode, StateMutationNode
 from .template_selection_node import TemplateSelectionNode
-from .chapter_generation_node import ChapterGenerationNode, ChapterJsonParseError
+from .chapter_generation_node import ChapterGenerationNode, ChapterJsonParseError, ChapterContentError
 from .document_layout_node import DocumentLayoutNode
 from .word_budget_node import WordBudgetNode

@@ -16,6 +16,7 @@ __all__ = [
    "TemplateSelectionNode",
    "ChapterGenerationNode",
    "ChapterJsonParseError",
+    "ChapterContentError",
    "DocumentLayoutNode",
    "WordBudgetNode",
 ]
@@ -36,6 +36,14 @@ class ChapterJsonParseError(ValueError):
        self.raw_text = raw_text


+class ChapterContentError(ValueError):
+    """
+    Raised when a chapter's content is too sparse.
+
+    Triggered when the LLM emits only headings, or body text that is not
+    enough to carry a chapter; raising it drives a retry so that report
+    quality is preserved.
+    """
+
+
 class ChapterGenerationNode(BaseNode):
    """
    负责按章节调用LLM并校验JSON结构。
@@ -71,6 +79,12 @@ class ChapterGenerationNode(BaseNode):
        "sub": "subscript",
        "sup": "superscript",
    }
+    # A chapter that contains only headings, or too few characters, is treated
+    # as a failure so that the LLM is forced to regenerate it.
+    # Minimum number of blocks other than heading/divider/toc per chapter.
+    _MIN_NON_HEADING_BLOCKS = 2
+    # Minimum estimated body character count per chapter.
+    _MIN_BODY_CHARACTERS = 400
+    # A paragraph that ends with terminal punctuation is still considered a
+    # fragment when its stripped text is at most this long.
+    _PARAGRAPH_FRAGMENT_MAX_CHARS = 80
+    # A paragraph without terminal punctuation is considered a fragment up to
+    # this length.
+    _PARAGRAPH_FRAGMENT_NO_TERMINATOR_MAX_CHARS = 240
+    # Characters treated as sentence terminators; the repeated ellipsis simply
+    # collapses into a single set member.
+    # NOTE(review): ASCII "." is not in this set — confirm that is intentional.
+    _TERMINATION_PUNCTUATION = set("。！？!?；;……")

    def __init__(self, llm_client, validator: IRValidator, storage: ChapterStorage):
        """
@@ -121,17 +135,32 @@ class ChapterGenerationNode(BaseNode):
        self._sanitize_chapter_blocks(chapter_json)

        valid, errors = self.validator.validate_chapter(chapter_json)
+        content_error: ChapterContentError | None = None
+        if valid:
+            try:
+                self._ensure_content_density(chapter_json)
+            except ChapterContentError as exc:
+                content_error = exc
+
+        error_messages: List[str] = []
+        if not valid and errors:
+            error_messages.extend(errors)
+        if content_error:
+            error_messages.append(str(content_error))
+
        self.storage.persist_chapter(
            run_dir,
            chapter_meta,
            chapter_json,
-            errors=None if valid else errors,
+            errors=None if not error_messages else error_messages,
        )

        if not valid:
            raise ValueError(
                f"{section.title} 章节JSON校验失败: {'; '.join(errors[:5])}"
            )
+        if content_error:
+            raise content_error

        return chapter_json

@@ -488,6 +517,97 @@ class ChapterGenerationNode(BaseNode):

        walk(chapter.get("blocks"))

+        blocks = chapter.get("blocks")
+        if isinstance(blocks, list):
+            chapter["blocks"] = self._merge_fragment_sequences(blocks)
+
+    def _ensure_content_density(self, chapter: Dict[str, Any]):
+        """
+        Verify that a chapter carries enough body content.
+
+        The chapter is rejected when ``blocks`` is missing or empty, when too
+        few blocks remain after discarding heading/divider/toc entries, or
+        when the estimated body character count is below the threshold.
+
+        Args:
+            chapter: Chapter JSON object; its ``blocks`` and ``title`` keys are read.
+
+        Raises:
+            ChapterContentError: If the chapter is judged too sparse.
+        """
+        blocks = chapter.get("blocks")
+        if not (isinstance(blocks, list) and blocks):
+            raise ChapterContentError("章节缺少正文区块，无法输出内容")
+
+        structural_types = {"heading", "divider", "toc"}
+        block_count = sum(
+            1
+            for entry in blocks
+            if isinstance(entry, dict) and entry.get("type") not in structural_types
+        )
+        char_count = self._count_body_characters(blocks)
+
+        # Both thresholds must be met for the chapter to pass.
+        has_enough_blocks = block_count >= self._MIN_NON_HEADING_BLOCKS
+        has_enough_text = char_count >= self._MIN_BODY_CHARACTERS
+        if has_enough_blocks and has_enough_text:
+            return
+
+        label = chapter.get("title") or "该章节"
+        raise ChapterContentError(
+            f"{label} 正文不足：有效区块 {block_count} 个，估算字符数 {char_count}"
+        )
+
+    def _count_body_characters(self, blocks: Any) -> int:
+        """
+        Recursively estimate the number of body-text characters in ``blocks``.
+
+        - ``heading``/``divider``/``toc``/``widget`` nodes contribute nothing;
+        - ``paragraph`` nodes count the stripped ``text`` of every inline run,
+          falling back to the block's ``text`` field and then to
+          ``_extract_block_text``;
+        - ``list`` items, ``blockquote``/``callout`` children and ``table``
+          cells are walked recursively;
+        - malformed containers (null or non-list ``items``/``rows``/``cells``,
+          non-dict rows or cells) count as zero instead of raising.
+
+        The result is only a coarse estimate used to judge chapter length.
+
+        Args:
+            blocks: A block, a list of blocks, a string, or ``None``.
+
+        Returns:
+            The summed stripped character count of the body text found.
+        """
+        # A tuple keeps the membership test safe when ``type`` is unhashable.
+        skipped_types = ("heading", "divider", "toc", "widget")
+
+        def walk(node: Any) -> int:
+            if node is None:
+                return 0
+            if isinstance(node, list):
+                return sum(walk(item) for item in node)
+            if isinstance(node, str):
+                return len(node.strip())
+            if not isinstance(node, dict):
+                return 0
+
+            block_type = node.get("type")
+            if block_type in skipped_types:
+                return 0
+
+            if block_type == "paragraph":
+                inlines = node.get("inlines")
+                if isinstance(inlines, list):
+                    return sum(
+                        len(run["text"].strip())
+                        for run in inlines
+                        if isinstance(run, dict) and isinstance(run.get("text"), str)
+                    )
+                text_value = node.get("text")
+                if isinstance(text_value, str):
+                    return len(text_value.strip())
+                return len(self._extract_block_text(node).strip())
+
+            if block_type == "list":
+                # ``items`` may be present but null; only a real list is walked.
+                items = node.get("items")
+                return walk(items) if isinstance(items, list) else 0
+
+            if block_type in ("blockquote", "callout"):
+                return walk(node.get("blocks"))
+
+            if block_type == "table":
+                total = 0
+                rows = node.get("rows")
+                for row in rows if isinstance(rows, list) else ():
+                    if not isinstance(row, dict):
+                        continue
+                    cells = row.get("cells")
+                    for cell in cells if isinstance(cells, list) else ():
+                        if isinstance(cell, dict):
+                            total += walk(cell.get("blocks"))
+                return total
+
+            nested = node.get("blocks")
+            if isinstance(nested, list):
+                return walk(nested)
+
+            return len(self._extract_block_text(node).strip())
+
+        return walk(blocks)
+
    def _sanitize_block_content(self, block: Dict[str, Any]):
        """根据类型做精细化修复，例如清理paragraph内的非法inline mark"""
        block_type = block.get("type")
@@ -505,7 +625,134 @@ class ChapterGenerationNode(BaseNode):
            normalized_runs = [self._as_inline_run(self._extract_block_text(block))]
        if not normalized_runs:
            normalized_runs = [self._as_inline_run("")]
-        block["inlines"] = normalized_runs
+        block["inlines"] = self._strip_inline_artifacts(normalized_runs)
+
+    def _strip_inline_artifacts(self, inlines: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
+        """
+        Drop inline runs whose text is a JSON sentinel the LLM wrote by mistake.
+
+        A run is discarded when its stripped text parses as a JSON object
+        whose keys are all within ``type``/``value``; runs that are not dicts
+        are discarded too. If nothing survives, a single empty run is
+        returned so the paragraph still has an ``inlines`` entry.
+        """
+        sentinel_keys = {"type", "value"}
+
+        def is_sentinel(candidate: Any) -> bool:
+            # Only brace-wrapped strings are worth attempting to parse.
+            if not isinstance(candidate, str):
+                return False
+            body = candidate.strip()
+            if not (body.startswith("{") and body.endswith("}")):
+                return False
+            try:
+                decoded = json.loads(body)
+            except json.JSONDecodeError:
+                return False
+            return isinstance(decoded, dict) and set(decoded.keys()).issubset(sentinel_keys)
+
+        kept: List[Dict[str, Any]] = [
+            item
+            for item in (inlines or [])
+            if isinstance(item, dict) and not is_sentinel(item.get("text"))
+        ]
+        return kept if kept else [self._as_inline_run("")]
+
+    def _merge_fragment_sequences(self, blocks: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
+        """
+        Collapse consecutive paragraph fragments into single paragraphs.
+
+        Runs of adjacent blocks that ``_is_paragraph_fragment`` accepts are
+        combined into one paragraph (a run of length one is kept as-is); every
+        other block is passed through ``_merge_nested_fragments`` so nested
+        containers get the same treatment. A non-list argument is returned
+        unchanged.
+        """
+        if not isinstance(blocks, list):
+            return blocks
+
+        result: List[Dict[str, Any]] = []
+        pending: List[Dict[str, Any]] = []
+
+        def drain(run: List[Dict[str, Any]]) -> None:
+            # A lone fragment stays untouched; two or more are fused.
+            if len(run) == 1:
+                result.append(run[0])
+            elif run:
+                result.append(self._combine_paragraph_fragments(run))
+
+        for entry in blocks:
+            if not self._is_paragraph_fragment(entry):
+                drain(pending)
+                pending = []
+                result.append(self._merge_nested_fragments(entry))
+            else:
+                pending.append(entry)
+
+        drain(pending)
+        return result
+
+    def _merge_nested_fragments(self, block: Dict[str, Any]) -> Dict[str, Any]:
+        """
+        Apply fragment merging inside nested containers of ``block``.
+
+        ``callout``/``blockquote`` children, list items that are lists, and
+        table cell ``blocks`` are each rewritten in place via
+        ``_merge_fragment_sequences``. Anything that does not have the
+        expected shape (a non-dict block, null or non-list ``rows``/``cells``,
+        non-dict rows or cells) is left untouched instead of raising, because
+        every non-fragment entry of a block list is routed through here.
+
+        Args:
+            block: The block to process.
+
+        Returns:
+            The same object that was passed in, mutated in place when it is a
+            recognised container.
+        """
+        if not isinstance(block, dict):
+            return block
+
+        block_type = block.get("type")
+        if block_type in ("callout", "blockquote"):
+            nested = block.get("blocks")
+            if isinstance(nested, list):
+                block["blocks"] = self._merge_fragment_sequences(nested)
+        elif block_type == "list":
+            items = block.get("items")
+            if isinstance(items, list):
+                for entry in items:
+                    if isinstance(entry, list):
+                        # Slice-assign so the item list object keeps its identity.
+                        entry[:] = self._merge_fragment_sequences(entry)
+        elif block_type == "table":
+            rows = block.get("rows")
+            for row in rows if isinstance(rows, list) else ():
+                if not isinstance(row, dict):
+                    continue
+                cells = row.get("cells")
+                for cell in cells if isinstance(cells, list) else ():
+                    if not isinstance(cell, dict):
+                        continue
+                    nested_blocks = cell.get("blocks")
+                    if isinstance(nested_blocks, list):
+                        cell["blocks"] = self._merge_fragment_sequences(nested_blocks)
+        return block
+
+    def _combine_paragraph_fragments(self, fragments: List[Dict[str, Any]]) -> Dict[str, Any]:
+        """
+        Fuse several paragraph fragments into one paragraph block.
+
+        The result is a shallow copy of the first fragment whose ``inlines``
+        is the concatenation of every fragment's runs, in order. A fragment
+        without a non-empty ``inlines`` list contributes one run built from
+        its extracted text.
+        """
+        merged_block = dict(fragments[0])
+        runs: List[Dict[str, Any]] = []
+        for piece in fragments:
+            piece_runs = piece.get("inlines")
+            if not (isinstance(piece_runs, list) and piece_runs):
+                # No usable runs: synthesise one from the block's plain text.
+                piece_runs = [self._as_inline_run(self._extract_block_text(piece))]
+            runs += piece_runs
+        merged_block["inlines"] = runs or [self._as_inline_run("")]
+        return merged_block
+
+    def _is_paragraph_fragment(self, block: Dict[str, Any]) -> bool:
+        """
+        Decide whether ``block`` is a short paragraph that was split off by mistake.
+
+        Only ``paragraph`` dicts qualify. An empty paragraph is a fragment; a
+        paragraph with any inline marks or an embedded newline is not.
+        Otherwise the stripped text is compared against a length limit: the
+        short limit when it ends with terminal punctuation, the longer limit
+        when it does not.
+        """
+        if not (isinstance(block, dict) and block.get("type") == "paragraph"):
+            return False
+
+        runs = block.get("inlines")
+        styled = False
+        if isinstance(runs, list) and runs:
+            dict_runs = [item for item in runs if isinstance(item, dict)]
+            raw = "".join(str(item.get("text") or "") for item in dict_runs)
+            styled = any(
+                isinstance(item.get("marks"), list) and any(item.get("marks"))
+                for item in dict_runs
+            )
+        else:
+            raw = self._extract_block_text(block)
+
+        content = (raw or "").strip()
+        if not content:
+            return True
+        if styled or "\n" in content:
+            return False
+
+        terminated_limit = self._PARAGRAPH_FRAGMENT_MAX_CHARS
+        open_limit = getattr(
+            self,
+            "_PARAGRAPH_FRAGMENT_NO_TERMINATOR_MAX_CHARS",
+            terminated_limit * 3,
+        )
+
+        # A finished sentence must be short to count as a fragment; an
+        # unfinished one is allowed to run longer.
+        ends_sentence = content[-1] in self._TERMINATION_PUNCTUATION
+        limit = terminated_limit if ends_sentence else open_limit
+        return len(content) <= limit

    def _coerce_inline_run(self, run: Any) -> List[Dict[str, Any]]:
        """将任意inline写法规整为合法run"""