Optimize the Handling of Low Word Counts

2025-11-15 17:46:42 +08:00
parent cab812e261
commit a12ac4234d
2 changed files with 114 additions and 9 deletions
@@ -10,6 +10,7 @@ Report Agent主类。
 import json
 import os
 from copy import deepcopy
 from pathlib import Path
 from uuid import uuid4
 from datetime import datetime
@@ -174,6 +175,8 @@ class ReportAgent:
    - 章节存储、IR装订、渲染器等产出链路；
    - 状态管理、日志、输入输出校验与持久化。
    """
    _CONTENT_SPARSE_MIN_ATTEMPTS = 3
    _CONTENT_SPARSE_WARNING_TEXT = "本章LLM生成的内容字数可能过低，必要时可以尝试重新运行程序。"
    def __init__(self, config: Optional[Settings] = None):
        """
@@ -466,7 +469,9 @@ class ReportAgent:
            emit('stage', {'stage': 'storage_ready', 'run_dir': str(run_dir)})
            chapters = []
-            chapter_max_attempts = max(1, self.config.CHAPTER_JSON_MAX_ATTEMPTS)
+            chapter_max_attempts = max(
                self._CONTENT_SPARSE_MIN_ATTEMPTS, self.config.CHAPTER_JSON_MAX_ATTEMPTS
            )
            for section in sections:
                logger.info(f"生成章节: {section.title}")
                emit('chapter_status', {
@@ -492,6 +497,9 @@ class ReportAgent:
                chapter_payload: Dict[str, Any] | None = None
                attempt = 1
                best_sparse_candidate: Dict[str, Any] | None = None
                best_sparse_score = -1
                fallback_used = False
                while attempt <= chapter_max_attempts:
                    try:
                        chapter_payload = self.chapter_generation_node.run(
@@ -506,6 +514,19 @@ class ReportAgent:
                            "content_sparse" if isinstance(structured_error, ChapterContentError) else "json_parse"
                        )
                        readable_label = "内容密度异常" if error_kind == "content_sparse" else "JSON解析失败"
                        if isinstance(structured_error, ChapterContentError):
                            candidate = getattr(structured_error, "chapter_payload", None)
                            candidate_score = getattr(structured_error, "body_characters", 0) or 0
                            if isinstance(candidate, dict) and candidate_score >= 0:
                                if candidate_score > best_sparse_score:
                                    best_sparse_candidate = deepcopy(candidate)
                                    best_sparse_score = candidate_score
                        will_fallback = (
                            isinstance(structured_error, ChapterContentError)
                            and attempt >= chapter_max_attempts
                            and attempt >= self._CONTENT_SPARSE_MIN_ATTEMPTS
                            and best_sparse_candidate is not None
                        )
                        logger.warning(
                            "章节 {title} {label}（第 {attempt}/{total} 次尝试）: {error}",
                            title=section.title,
@@ -514,14 +535,27 @@ class ReportAgent:
                            total=chapter_max_attempts,
                            error=structured_error,
                        )
-                        emit('chapter_status', {
+                        status_value = 'retrying' if attempt < chapter_max_attempts or will_fallback else 'error'
                        status_payload = {
                            'chapterId': section.chapter_id,
                            'title': section.title,
-                            'status': 'retrying' if attempt < chapter_max_attempts else 'error',
+                            'status': status_value,
                            'attempt': attempt,
                            'error': str(structured_error),
                            'reason': error_kind,
-                        })
+                        }
                        if will_fallback:
                            status_payload['warning'] = 'content_sparse_fallback_pending'
                        emit('chapter_status', status_payload)
                        if will_fallback:
                            logger.warning(
                                "章节 {title} 达到最大尝试次数，保留字数最多（约 {score} 字）的版本作为兜底输出",
                                title=section.title,
                                score=best_sparse_score,
                            )
                            chapter_payload = self._finalize_sparse_chapter(best_sparse_candidate)
                            fallback_used = True
                            break
                        if attempt >= chapter_max_attempts:
                            raise
                        attempt += 1
@@ -553,12 +587,16 @@ class ReportAgent:
                        f"{section.title} 章节JSON在 {chapter_max_attempts} 次尝试后仍无法解析"
                    )
                chapters.append(chapter_payload)
-                emit('chapter_status', {
+                completion_status = {
                    'chapterId': section.chapter_id,
                    'title': section.title,
                    'status': 'completed',
                    'attempt': attempt,
-                })
+                }
                if fallback_used:
                    completion_status['warning'] = 'content_sparse_fallback'
                    completion_status['warningMessage'] = self._CONTENT_SPARSE_WARNING_TEXT
                emit('chapter_status', completion_status)
            document_ir = self.document_composer.build_document(
                report_id,
@@ -779,6 +817,48 @@ class ReportAgent:
        ]
        return any(keyword in normalized for keyword in keywords)
    def _finalize_sparse_chapter(self, chapter: Optional[Dict[str, Any]]) -> Dict[str, Any]:
        """
        构造内容稀疏兜底章节：复制原始payload并插入温馨提示段落。
        """
        safe_chapter = deepcopy(chapter or {})
        if not isinstance(safe_chapter, dict):
            safe_chapter = {}
        self._ensure_sparse_warning_block(safe_chapter)
        return safe_chapter
    def _ensure_sparse_warning_block(self, chapter: Dict[str, Any]) -> None:
        """
        将提示段落插在章节标题后，提醒读者该章字数偏少。
        """
        warning_block = {
            "type": "paragraph",
            "inlines": [
                {
                    "text": self._CONTENT_SPARSE_WARNING_TEXT,
                    "marks": [{"type": "italic"}],
                }
            ],
            "meta": {"role": "content-sparse-warning"},
        }
        blocks = chapter.get("blocks")
        if isinstance(blocks, list) and blocks:
            inserted = False
            for idx, block in enumerate(blocks):
                if isinstance(block, dict) and block.get("type") == "heading":
                    blocks.insert(idx + 1, warning_block)
                    inserted = True
                    break
            if not inserted:
                blocks.insert(0, warning_block)
        else:
            chapter["blocks"] = [warning_block]
        meta = chapter.get("meta")
        if isinstance(meta, dict):
            meta["contentSparseWarning"] = True
        else:
            chapter["meta"] = {"contentSparseWarning": True}
    def _stringify(self, value: Any) -> str:
        """
        安全地将对象转成字符串。
@@ -55,6 +55,20 @@ class ChapterContentError(ValueError):
    当LLM仅输出标题或正文不足以支撑一章时触发，驱动重试以保证报告质量。
    """
    def __init__(
        self,
        message: str,
        chapter: Optional[Dict[str, Any]] = None,
        body_characters: int = 0,
        narrative_characters: int = 0,
        non_heading_blocks: int = 0,
    ):
        super().__init__(message)
        self.chapter_payload: Optional[Dict[str, Any]] = chapter
        self.body_characters: int = int(body_characters or 0)
        self.narrative_characters: int = int(narrative_characters or 0)
        self.non_heading_blocks: int = int(non_heading_blocks or 0)
 class ChapterGenerationNode(BaseNode):
    """
@@ -897,7 +911,13 @@ class ChapterGenerationNode(BaseNode):
        """
        blocks = chapter.get("blocks")
        if not isinstance(blocks, list) or not blocks:
-            raise ChapterContentError("章节缺少正文区块，无法输出内容")
+            raise ChapterContentError(
                "章节缺少正文区块，无法输出内容",
                chapter=chapter,
                body_characters=0,
                narrative_characters=0,
                non_heading_blocks=0,
            )
        non_heading_blocks = [
            block
@@ -905,16 +925,21 @@ class ChapterGenerationNode(BaseNode):
            if isinstance(block, dict)
            and block.get("type") not in {"heading", "divider", "toc"}
        ]
        valid_block_count = len(non_heading_blocks)
        body_characters = self._count_body_characters(blocks)
        narrative_characters = self._count_narrative_characters(blocks)
        if (
-            len(non_heading_blocks) < self._MIN_NON_HEADING_BLOCKS
+            valid_block_count < self._MIN_NON_HEADING_BLOCKS
            or body_characters < self._MIN_BODY_CHARACTERS
            or narrative_characters < self._MIN_NARRATIVE_CHARACTERS
        ):
            raise ChapterContentError(
-                f"{chapter.get('title') or '该章节'} 正文不足：有效区块 {len(non_heading_blocks)} 个，估算字符数 {body_characters}，叙述性字符数 {narrative_characters}"
+                f"{chapter.get('title') or '该章节'} 正文不足：有效区块 {valid_block_count} 个，估算字符数 {body_characters}，叙述性字符数 {narrative_characters}",
                chapter=chapter,
                body_characters=body_characters,
                narrative_characters=narrative_characters,
                non_heading_blocks=valid_block_count,
            )
    def _count_body_characters(self, blocks: Any) -> int: