Increase DeepSeek Compatibility

2025-11-14 17:55:28 +08:00
parent e267b1fc04
commit 52eed4d010
4 changed files with 460 additions and 12 deletions
@@ -29,6 +29,7 @@ from .nodes import (
    TemplateSelectionNode,
    ChapterGenerationNode,
    ChapterJsonParseError,
    ChapterContentError,
    DocumentLayoutNode,
    WordBudgetNode,
 )
@@ -438,20 +439,26 @@ class ReportAgent:
                            stream_callback=chunk_callback
                        )
                        break
-                    except ChapterJsonParseError as parse_error:
+                    except (ChapterJsonParseError, ChapterContentError) as structured_error:
                        error_kind = (
                            "content_sparse" if isinstance(structured_error, ChapterContentError) else "json_parse"
                        )
                        readable_label = "内容密度异常" if error_kind == "content_sparse" else "JSON解析失败"
                        logger.warning(
-                            "章节 %s JSON解析失败（第 %s/%s 次尝试）: %s",
+                            "章节 %s %s（第 %s/%s 次尝试）: %s",
                            section.title,
                            readable_label,
                            attempt,
                            chapter_max_attempts,
-                            parse_error,
+                            structured_error,
                        )
                        emit('chapter_status', {
                            'chapterId': section.chapter_id,
                            'title': section.title,
                            'status': 'retrying' if attempt < chapter_max_attempts else 'error',
                            'attempt': attempt,
-                            'error': str(parse_error),
+                            'error': str(structured_error),
                            'reason': error_kind,
                        })
                        if attempt >= chapter_max_attempts:
                            raise
@@ -6,7 +6,7 @@ Report Engine节点处理模块。
 from .base_node import BaseNode, StateMutationNode
 from .template_selection_node import TemplateSelectionNode
-from .chapter_generation_node import ChapterGenerationNode, ChapterJsonParseError
+from .chapter_generation_node import ChapterGenerationNode, ChapterJsonParseError, ChapterContentError
 from .document_layout_node import DocumentLayoutNode
 from .word_budget_node import WordBudgetNode
@@ -16,6 +16,7 @@ __all__ = [
    "TemplateSelectionNode",
    "ChapterGenerationNode",
    "ChapterJsonParseError",
    "ChapterContentError",
    "DocumentLayoutNode",
    "WordBudgetNode",
 ]
@@ -36,6 +36,14 @@ class ChapterJsonParseError(ValueError):
        self.raw_text = raw_text
 class ChapterContentError(ValueError):
    """
    Sparse-chapter-content error.

    Raised when the LLM emits only headings, or too little body text to carry
    a chapter, so that the caller can retry and keep report quality up.
    """
 class ChapterGenerationNode(BaseNode):
    """
    负责按章节调用LLM并校验JSON结构。
@@ -71,6 +79,12 @@ class ChapterGenerationNode(BaseNode):
        "sub": "subscript",
        "sup": "superscript",
    }
    # 章节若仅包含标题或字符过少则视为失败，强制LLM重新生成
    _MIN_NON_HEADING_BLOCKS = 2
    _MIN_BODY_CHARACTERS = 400
    _PARAGRAPH_FRAGMENT_MAX_CHARS = 80
    _PARAGRAPH_FRAGMENT_NO_TERMINATOR_MAX_CHARS = 240
    _TERMINATION_PUNCTUATION = set("。！？!?；;……")
    def __init__(self, llm_client, validator: IRValidator, storage: ChapterStorage):
        """
@@ -121,17 +135,32 @@ class ChapterGenerationNode(BaseNode):
        self._sanitize_chapter_blocks(chapter_json)
        valid, errors = self.validator.validate_chapter(chapter_json)
        content_error: ChapterContentError | None = None
        if valid:
            try:
                self._ensure_content_density(chapter_json)
            except ChapterContentError as exc:
                content_error = exc
        error_messages: List[str] = []
        if not valid and errors:
            error_messages.extend(errors)
        if content_error:
            error_messages.append(str(content_error))
        self.storage.persist_chapter(
            run_dir,
            chapter_meta,
            chapter_json,
-            errors=None if valid else errors,
+            errors=None if not error_messages else error_messages,
        )
        if not valid:
            raise ValueError(
                f"{section.title} 章节JSON校验失败: {'; '.join(errors[:5])}"
            )
        if content_error:
            raise content_error
        return chapter_json
@@ -488,6 +517,97 @@ class ChapterGenerationNode(BaseNode):
        walk(chapter.get("blocks"))
        blocks = chapter.get("blocks")
        if isinstance(blocks, list):
            chapter["blocks"] = self._merge_fragment_sequences(blocks)
    def _ensure_content_density(self, chapter: Dict[str, Any]):
        """
        Verify that a chapter carries enough body content.

        Args:
            chapter: Chapter payload; its ``blocks`` entry is inspected.

        Raises:
            ChapterContentError: If ``blocks`` is missing, empty or not a list,
                if fewer than ``_MIN_NON_HEADING_BLOCKS`` blocks remain after
                ignoring heading/divider/toc blocks, or if the estimated body
                character count is below ``_MIN_BODY_CHARACTERS``.
        """
        blocks = chapter.get("blocks")
        if not (isinstance(blocks, list) and blocks):
            raise ChapterContentError("章节缺少正文区块，无法输出内容")

        # Block types that give structure but no body content.
        structural_types = {"heading", "divider", "toc"}
        content_block_count = sum(
            1
            for candidate in blocks
            if isinstance(candidate, dict)
            and candidate.get("type") not in structural_types
        )
        char_estimate = self._count_body_characters(blocks)

        enough_blocks = content_block_count >= self._MIN_NON_HEADING_BLOCKS
        enough_text = char_estimate >= self._MIN_BODY_CHARACTERS
        if enough_blocks and enough_text:
            return

        label = chapter.get('title') or '该章节'
        raise ChapterContentError(
            f"{label} 正文不足：有效区块 {content_block_count} 个，估算字符数 {char_estimate}"
        )
    def _count_body_characters(self, blocks: Any) -> int:
        """
        递归统计正文字符数。
        - 忽略heading/divider/widget等非正文类型；
        - 对paragraph/list/table/callout等结构抽取嵌套文本；
        - 仅用于粗粒度判断篇幅是否合理。
        """
        def walk(node: Any) -> int:
            if node is None:
                return 0
            if isinstance(node, list):
                return sum(walk(item) for item in node)
            if isinstance(node, str):
                return len(node.strip())
            if not isinstance(node, dict):
                return 0
            block_type = node.get("type")
            if block_type in {"heading", "divider", "toc", "widget"}:
                return 0
            if block_type == "paragraph":
                inlines = node.get("inlines")
                if isinstance(inlines, list):
                    total = 0
                    for run in inlines:
                        if isinstance(run, dict):
                            text = run.get("text")
                            if isinstance(text, str):
                                total += len(text.strip())
                    return total
                text_value = node.get("text")
                if isinstance(text_value, str):
                    return len(text_value.strip())
                return len(self._extract_block_text(node).strip())
            if block_type == "list":
                total = 0
                for item in node.get("items", []):
                    total += walk(item)
                return total
            if block_type in {"blockquote", "callout"}:
                return walk(node.get("blocks"))
            if block_type == "table":
                total = 0
                for row in node.get("rows", []):
                    cells = row.get("cells") or []
                    for cell in cells:
                        total += walk(cell.get("blocks"))
                return total
            nested = node.get("blocks")
            if isinstance(nested, list):
                return walk(nested)
            return len(self._extract_block_text(node).strip())
        return walk(blocks)
    def _sanitize_block_content(self, block: Dict[str, Any]):
        """根据类型做精细化修复，例如清理paragraph内的非法inline mark"""
        block_type = block.get("type")
@@ -505,7 +625,134 @@ class ChapterGenerationNode(BaseNode):
            normalized_runs = [self._as_inline_run(self._extract_block_text(block))]
        if not normalized_runs:
            normalized_runs = [self._as_inline_run("")]
-        block["inlines"] = normalized_runs
+        block["inlines"] = self._strip_inline_artifacts(normalized_runs)
    def _strip_inline_artifacts(self, inlines: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        """移除被LLM误写入的JSON哨兵文本，防止渲染出`{\"type\": \"\"}`等垃圾字符"""
        cleaned: List[Dict[str, Any]] = []
        for run in inlines or []:
            if not isinstance(run, dict):
                continue
            text = run.get("text")
            if isinstance(text, str):
                stripped = text.strip()
                if stripped.startswith("{") and stripped.endswith("}"):
                    try:
                        payload = json.loads(stripped)
                    except json.JSONDecodeError:
                        payload = None
                    if isinstance(payload, dict) and set(payload.keys()).issubset({"type", "value"}):
                        continue
            cleaned.append(run)
        return cleaned or [self._as_inline_run("")]
    def _merge_fragment_sequences(self, blocks: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        """合并被LLM拆成多段的句子片段，避免HTML出现大量孤立<p>"""
        if not isinstance(blocks, list):
            return blocks
        merged: List[Dict[str, Any]] = []
        fragment_buffer: List[Dict[str, Any]] = []
        def flush_buffer():
            nonlocal fragment_buffer
            if not fragment_buffer:
                return
            if len(fragment_buffer) == 1:
                merged.append(fragment_buffer[0])
            else:
                merged.append(self._combine_paragraph_fragments(fragment_buffer))
            fragment_buffer = []
        for block in blocks:
            if self._is_paragraph_fragment(block):
                fragment_buffer.append(block)
                continue
            flush_buffer()
            merged.append(self._merge_nested_fragments(block))
        flush_buffer()
        return merged
    def _merge_nested_fragments(self, block: Dict[str, Any]) -> Dict[str, Any]:
        """对嵌套结构（callout/list/table）递归处理片段合并"""
        block_type = block.get("type")
        if block_type in {"callout", "blockquote"}:
            nested = block.get("blocks")
            if isinstance(nested, list):
                block["blocks"] = self._merge_fragment_sequences(nested)
        elif block_type == "list":
            items = block.get("items")
            if isinstance(items, list):
                for entry in items:
                    if isinstance(entry, list):
                        merged_entry = self._merge_fragment_sequences(entry)
                        entry[:] = merged_entry
        elif block_type == "table":
            for row in block.get("rows", []):
                cells = row.get("cells") or []
                for cell in cells:
                    nested_blocks = cell.get("blocks")
                    if isinstance(nested_blocks, list):
                        cell["blocks"] = self._merge_fragment_sequences(nested_blocks)
        return block
    def _combine_paragraph_fragments(self, fragments: List[Dict[str, Any]]) -> Dict[str, Any]:
        """将多个句子片段合并为单个paragraph block"""
        template = dict(fragments[0])
        combined_inlines: List[Dict[str, Any]] = []
        for fragment in fragments:
            runs = fragment.get("inlines")
            if isinstance(runs, list) and runs:
                combined_inlines.extend(runs)
            else:
                fallback_text = self._extract_block_text(fragment)
                combined_inlines.append(self._as_inline_run(fallback_text))
        if not combined_inlines:
            combined_inlines.append(self._as_inline_run(""))
        template["inlines"] = combined_inlines
        return template
    def _is_paragraph_fragment(self, block: Dict[str, Any]) -> bool:
        """判断paragraph是否为被错误拆分的短片段"""
        if not isinstance(block, dict) or block.get("type") != "paragraph":
            return False
        inlines = block.get("inlines")
        text = ""
        has_marks = False
        if isinstance(inlines, list) and inlines:
            parts: List[str] = []
            for run in inlines:
                if not isinstance(run, dict):
                    continue
                parts.append(str(run.get("text") or ""))
                marks = run.get("marks")
                if isinstance(marks, list) and any(marks):
                    has_marks = True
            text = "".join(parts)
        else:
            text = self._extract_block_text(block)
        stripped = (text or "").strip()
        if not stripped:
            return True
        if has_marks:
            return False
        if "\n" in stripped:
            return False
        short_limit = self._PARAGRAPH_FRAGMENT_MAX_CHARS
        long_limit = getattr(
            self,
            "_PARAGRAPH_FRAGMENT_NO_TERMINATOR_MAX_CHARS",
            short_limit * 3,
        )
        if stripped[-1] in self._TERMINATION_PUNCTUATION:
            return len(stripped) <= short_limit
        if len(stripped) > long_limit:
            return False
        return True
    def _coerce_inline_run(self, run: Any) -> List[Dict[str, Any]]:
        """将任意inline写法规整为合法run"""
@@ -5,6 +5,7 @@
 from __future__ import annotations
 import ast
 import copy
 import html
 import json
 from typing import Any, Dict, List
@@ -19,6 +20,31 @@ class HTMLRenderer:
    - 提供主题变量、编号映射等辅助功能。
    """
    CALLOUT_ALLOWED_TYPES = {
        "paragraph",
        "list",
        "table",
        "blockquote",
        "code",
        "math",
        "figure",
        "kpiGrid",
    }
    INLINE_ARTIFACT_KEYS = {
        "props",
        "widgetId",
        "widgetType",
        "data",
        "dataRef",
        "datasets",
        "labels",
        "config",
        "options",
    }
    TABLE_COMPLEX_CHARS = set(
        "@％%（）()，,。；;：:、？?！!·…-—_+<>[]{}|\\/\"'`~$^&*#"
    )
    def __init__(self, config: Dict[str, Any] | None = None):
        """初始化渲染器缓存并允许注入额外配置（如主题覆盖）"""
        self.config = config or {}
@@ -72,6 +98,7 @@ class HTMLRenderer:
  <meta name="viewport" content="width=device-width, initial-scale=1.0" />
  <title>{self._escape_html(title)}</title>
  <script src="https://cdn.jsdelivr.net/npm/chart.js"></script>
  <script src="https://cdn.jsdelivr.net/npm/chartjs-chart-sankey@4"></script>
  <script src="https://cdnjs.cloudflare.com/ajax/libs/html2canvas/1.4.1/html2canvas.min.js"></script>
  <script src="https://cdnjs.cloudflare.com/ajax/libs/jspdf/2.5.1/jspdf.umd.min.js"></script>
  <script>
@@ -442,8 +469,9 @@ class HTMLRenderer:
    def _render_table(self, block: Dict[str, Any]) -> str:
        """渲染表格，同时保留caption与单元格属性"""
        rows = self._normalize_table_rows(block.get("rows") or [])
        rows_html = ""
-        for row in block.get("rows", []):
+        for row in rows:
            row_cells = ""
            for cell in row.get("cells", []):
                cell_tag = "th" if cell.get("header") or cell.get("isHeader") else "td"
@@ -462,6 +490,105 @@ class HTMLRenderer:
        caption_html = f"<caption>{self._escape_html(caption)}</caption>" if caption else ""
        return f'<div class="table-wrap"><table>{caption_html}<tbody>{rows_html}</tbody></table></div>'
    def _normalize_table_rows(self, rows: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        """检测并修正仅有单列的竖排表，转换为标准网格"""
        if not rows:
            return []
        if not all(len((row.get("cells") or [])) == 1 for row in rows):
            return rows
        texts = [self._extract_row_text(row) for row in rows]
        header_span = self._detect_transposed_header_span(rows, texts)
        if not header_span:
            return rows
        normalized = self._transpose_single_cell_table(rows, header_span)
        return normalized or rows
    def _detect_transposed_header_span(self, rows: List[Dict[str, Any]], texts: List[str]) -> int:
        """推断竖排表头的行数，用于后续转置"""
        max_fields = min(8, len(rows) // 2)
        header_span = 0
        for idx, text in enumerate(texts):
            if idx >= max_fields:
                break
            if self._is_potential_table_header(text):
                header_span += 1
            else:
                break
        if header_span < 2:
            return 0
        remainder = texts[header_span:]
        if not remainder or (len(rows) - header_span) % header_span != 0:
            return 0
        if not any(self._looks_like_table_value(txt) for txt in remainder):
            return 0
        return header_span
    def _is_potential_table_header(self, text: str) -> bool:
        """根据长度与字符特征判断是否像表头字段"""
        if not text:
            return False
        stripped = text.strip()
        if not stripped or len(stripped) > 12:
            return False
        return not any(ch.isdigit() or ch in self.TABLE_COMPLEX_CHARS for ch in stripped)
    def _looks_like_table_value(self, text: str) -> bool:
        """判断该文本是否更像数据值，用于辅助判断转置"""
        if not text:
            return False
        stripped = text.strip()
        if len(stripped) >= 12:
            return True
        return any(ch.isdigit() or ch in self.TABLE_COMPLEX_CHARS for ch in stripped)
    def _transpose_single_cell_table(self, rows: List[Dict[str, Any]], span: int) -> List[Dict[str, Any]]:
        """将单列多行的表格转换为标准表头 + 若干数据行"""
        total = len(rows)
        if total <= span or (total - span) % span != 0:
            return []
        header_rows = rows[:span]
        data_rows = rows[span:]
        normalized: List[Dict[str, Any]] = []
        header_cells = []
        for row in header_rows:
            cell = copy.deepcopy((row.get("cells") or [{}])[0])
            cell["header"] = True
            header_cells.append(cell)
        normalized.append({"cells": header_cells})
        for start in range(0, len(data_rows), span):
            group = data_rows[start : start + span]
            if len(group) < span:
                break
            normalized.append(
                {
                    "cells": [
                        copy.deepcopy((item.get("cells") or [{}])[0])
                        for item in group
                    ]
                }
            )
        return normalized
    def _extract_row_text(self, row: Dict[str, Any]) -> str:
        """提取表格行中的纯文本，方便启发式分析"""
        cells = row.get("cells") or []
        if not cells:
            return ""
        cell = cells[0]
        texts: List[str] = []
        for block in cell.get("blocks", []):
            if isinstance(block, dict):
                if block.get("type") == "paragraph":
                    for inline in block.get("inlines") or []:
                        if isinstance(inline, dict):
                            value = inline.get("text")
                        else:
                            value = inline
                        if value is None:
                            continue
                        texts.append(str(value))
        return "".join(texts)
    def _render_blockquote(self, block: Dict[str, Any]) -> str:
        """渲染引用块，可嵌套其他block"""
        inner = self._render_blocks(block.get("blocks", []))
@@ -487,9 +614,63 @@ class HTMLRenderer:
        """渲染高亮提示盒，tone决定颜色"""
        tone = block.get("tone", "info")
        title = block.get("title")
-        inner = self._render_blocks(block.get("blocks", []))
+        safe_blocks, trailing_blocks = self._split_callout_content(block.get("blocks"))
        inner = self._render_blocks(safe_blocks)
        title_html = f"<strong>{self._escape_html(title)}</strong>" if title else ""
-        return f'<div class="callout tone-{tone}">{title_html}{inner}</div>'
+        callout_html = f'<div class="callout tone-{tone}">{title_html}{inner}</div>'
        trailing_html = self._render_blocks(trailing_blocks) if trailing_blocks else ""
        return callout_html + trailing_html
    def _split_callout_content(
        self, blocks: List[Dict[str, Any]] | None
    ) -> tuple[List[Dict[str, Any]], List[Dict[str, Any]]]:
        """限定callout内部仅包含轻量内容，其余块剥离到外层"""
        if not blocks:
            return [], []
        safe: List[Dict[str, Any]] = []
        trailing: List[Dict[str, Any]] = []
        for idx, child in enumerate(blocks):
            child_type = child.get("type")
            if child_type == "list":
                sanitized, overflow = self._sanitize_callout_list(child)
                if sanitized:
                    safe.append(sanitized)
                if overflow:
                    trailing.extend(overflow)
                    trailing.extend(copy.deepcopy(blocks[idx + 1 :]))
                    break
            elif child_type in self.CALLOUT_ALLOWED_TYPES:
                safe.append(child)
            else:
                trailing.extend(copy.deepcopy(blocks[idx:]))
                break
        else:
            return safe, []
        return safe, trailing
    def _sanitize_callout_list(
        self, block: Dict[str, Any]
    ) -> tuple[Dict[str, Any] | None, List[Dict[str, Any]]]:
        """当列表项包含结构型block时，将其截断移出callout"""
        items = block.get("items") or []
        if not items:
            return block, []
        sanitized_items: List[List[Dict[str, Any]]] = []
        trailing: List[Dict[str, Any]] = []
        for idx, item in enumerate(items):
            safe, overflow = self._split_callout_content(item)
            if safe:
                sanitized_items.append(safe)
            if overflow:
                trailing.extend(overflow)
                for rest in items[idx + 1 :]:
                    trailing.extend(copy.deepcopy(rest))
                break
        if not sanitized_items:
            return None, trailing
        new_block = copy.deepcopy(block)
        new_block["items"] = sanitized_items
        return new_block, trailing
    def _render_kpi_grid(self, block: Dict[str, Any]) -> str:
        """渲染KPI卡片栅格，包含指标值与涨跌幅"""
@@ -631,6 +812,8 @@ class HTMLRenderer:
                            nested_marks = inline_payload.get("marks")
                            if isinstance(nested_marks, list):
                                marks.extend(nested_marks)
                        elif any(key in payload for key in self.INLINE_ARTIFACT_KEYS):
                            text_value = ""
        return text_value, marks
@@ -1281,10 +1464,11 @@ function mergeOptions(base, override) {
 }
 function resolveChartTypes(payload) {
  const explicit = payload && payload.props && payload.props.type;
  const widgetType = payload && payload.widgetType ? payload.widgetType : 'chart.js/bar';
-  const primary = widgetType.includes('/') ? widgetType.split('/').pop() : widgetType;
+  const derived = widgetType && widgetType.includes('/') ? widgetType.split('/').pop() : widgetType;
  const extra = Array.isArray(payload && payload.preferredTypes) ? payload.preferredTypes : [];
-  const pipeline = [primary, ...extra, ...STABLE_CHART_TYPES];
+  const pipeline = [explicit, derived, ...extra, ...STABLE_CHART_TYPES].filter(Boolean);
  const result = [];
  pipeline.forEach(type => {
    if (type && !result.includes(type)) {
@@ -1456,6 +1640,15 @@ function buildChartOptions(payload) {
 }
 function instantiateChart(ctx, payload, optionsTemplate, type) {
  if (!ctx) {
    return null;
  }
  if (ctx.canvas && typeof Chart !== 'undefined' && typeof Chart.getChart === 'function') {
    const existing = Chart.getChart(ctx.canvas);
    if (existing) {
      existing.destroy();
    }
  }
  const data = cloneDeep(payload && payload.data ? payload.data : {});
  const config = {
    type,