Optimize the Handling of Low Word Counts

This commit is contained in:
马一丁
2025-11-15 17:46:42 +08:00
parent cab812e261
commit a12ac4234d
2 changed files with 114 additions and 9 deletions
+86 -6
View File
@@ -10,6 +10,7 @@ Report Agent主类。
import json import json
import os import os
from copy import deepcopy
from pathlib import Path from pathlib import Path
from uuid import uuid4 from uuid import uuid4
from datetime import datetime from datetime import datetime
@@ -174,6 +175,8 @@ class ReportAgent:
- 章节存储、IR装订、渲染器等产出链路; - 章节存储、IR装订、渲染器等产出链路;
- 状态管理、日志、输入输出校验与持久化。 - 状态管理、日志、输入输出校验与持久化。
""" """
_CONTENT_SPARSE_MIN_ATTEMPTS = 3
_CONTENT_SPARSE_WARNING_TEXT = "本章LLM生成的内容字数可能过低,必要时可以尝试重新运行程序。"
def __init__(self, config: Optional[Settings] = None): def __init__(self, config: Optional[Settings] = None):
""" """
@@ -466,7 +469,9 @@ class ReportAgent:
emit('stage', {'stage': 'storage_ready', 'run_dir': str(run_dir)}) emit('stage', {'stage': 'storage_ready', 'run_dir': str(run_dir)})
chapters = [] chapters = []
chapter_max_attempts = max(1, self.config.CHAPTER_JSON_MAX_ATTEMPTS) chapter_max_attempts = max(
self._CONTENT_SPARSE_MIN_ATTEMPTS, self.config.CHAPTER_JSON_MAX_ATTEMPTS
)
for section in sections: for section in sections:
logger.info(f"生成章节: {section.title}") logger.info(f"生成章节: {section.title}")
emit('chapter_status', { emit('chapter_status', {
@@ -492,6 +497,9 @@ class ReportAgent:
chapter_payload: Dict[str, Any] | None = None chapter_payload: Dict[str, Any] | None = None
attempt = 1 attempt = 1
best_sparse_candidate: Dict[str, Any] | None = None
best_sparse_score = -1
fallback_used = False
while attempt <= chapter_max_attempts: while attempt <= chapter_max_attempts:
try: try:
chapter_payload = self.chapter_generation_node.run( chapter_payload = self.chapter_generation_node.run(
@@ -506,6 +514,19 @@ class ReportAgent:
"content_sparse" if isinstance(structured_error, ChapterContentError) else "json_parse" "content_sparse" if isinstance(structured_error, ChapterContentError) else "json_parse"
) )
readable_label = "内容密度异常" if error_kind == "content_sparse" else "JSON解析失败" readable_label = "内容密度异常" if error_kind == "content_sparse" else "JSON解析失败"
if isinstance(structured_error, ChapterContentError):
candidate = getattr(structured_error, "chapter_payload", None)
candidate_score = getattr(structured_error, "body_characters", 0) or 0
if isinstance(candidate, dict) and candidate_score >= 0:
if candidate_score > best_sparse_score:
best_sparse_candidate = deepcopy(candidate)
best_sparse_score = candidate_score
will_fallback = (
isinstance(structured_error, ChapterContentError)
and attempt >= chapter_max_attempts
and attempt >= self._CONTENT_SPARSE_MIN_ATTEMPTS
and best_sparse_candidate is not None
)
logger.warning( logger.warning(
"章节 {title} {label}(第 {attempt}/{total} 次尝试): {error}", "章节 {title} {label}(第 {attempt}/{total} 次尝试): {error}",
title=section.title, title=section.title,
@@ -514,14 +535,27 @@ class ReportAgent:
total=chapter_max_attempts, total=chapter_max_attempts,
error=structured_error, error=structured_error,
) )
emit('chapter_status', { status_value = 'retrying' if attempt < chapter_max_attempts or will_fallback else 'error'
status_payload = {
'chapterId': section.chapter_id, 'chapterId': section.chapter_id,
'title': section.title, 'title': section.title,
'status': 'retrying' if attempt < chapter_max_attempts else 'error', 'status': status_value,
'attempt': attempt, 'attempt': attempt,
'error': str(structured_error), 'error': str(structured_error),
'reason': error_kind, 'reason': error_kind,
}) }
if will_fallback:
status_payload['warning'] = 'content_sparse_fallback_pending'
emit('chapter_status', status_payload)
if will_fallback:
logger.warning(
"章节 {title} 达到最大尝试次数,保留字数最多(约 {score} 字)的版本作为兜底输出",
title=section.title,
score=best_sparse_score,
)
chapter_payload = self._finalize_sparse_chapter(best_sparse_candidate)
fallback_used = True
break
if attempt >= chapter_max_attempts: if attempt >= chapter_max_attempts:
raise raise
attempt += 1 attempt += 1
@@ -553,12 +587,16 @@ class ReportAgent:
f"{section.title} 章节JSON在 {chapter_max_attempts} 次尝试后仍无法解析" f"{section.title} 章节JSON在 {chapter_max_attempts} 次尝试后仍无法解析"
) )
chapters.append(chapter_payload) chapters.append(chapter_payload)
emit('chapter_status', { completion_status = {
'chapterId': section.chapter_id, 'chapterId': section.chapter_id,
'title': section.title, 'title': section.title,
'status': 'completed', 'status': 'completed',
'attempt': attempt, 'attempt': attempt,
}) }
if fallback_used:
completion_status['warning'] = 'content_sparse_fallback'
completion_status['warningMessage'] = self._CONTENT_SPARSE_WARNING_TEXT
emit('chapter_status', completion_status)
document_ir = self.document_composer.build_document( document_ir = self.document_composer.build_document(
report_id, report_id,
@@ -779,6 +817,48 @@ class ReportAgent:
] ]
return any(keyword in normalized for keyword in keywords) return any(keyword in normalized for keyword in keywords)
def _finalize_sparse_chapter(self, chapter: Optional[Dict[str, Any]]) -> Dict[str, Any]:
    """
    Build the fallback chapter used when every attempt produced sparse content.

    The incoming payload is deep-copied so the caller's object is never mutated,
    then the low-word-count notice is added via ``_ensure_sparse_warning_block``.

    Args:
        chapter: Best sparse chapter payload collected so far. ``None`` or any
            non-dict value is treated as an empty chapter.

    Returns:
        A new dict holding the copied chapter plus the warning block and flag.
    """
    # Check the type *before* copying: a non-dict value would be discarded
    # anyway, so copying it first only wastes work and can raise for objects
    # that do not support deepcopy.
    safe_chapter: Dict[str, Any] = deepcopy(chapter) if isinstance(chapter, dict) else {}
    self._ensure_sparse_warning_block(safe_chapter)
    return safe_chapter
def _ensure_sparse_warning_block(self, chapter: Dict[str, Any]) -> None:
    """
    Add the low-word-count notice to ``chapter`` in place and flag its meta.

    Placement of the notice paragraph:
      * directly after the first block whose ``type`` is ``"heading"``;
      * at the front when the block list contains no heading;
      * as the only block when ``blocks`` is missing, empty or not a list
        (in that case ``chapter["blocks"]`` is replaced by a new list).

    The call is idempotent: if a block tagged with
    ``meta.role == "content-sparse-warning"`` is already present, no second
    notice is inserted. ``chapter["meta"]["contentSparseWarning"]`` is always
    set to ``True``; a non-dict ``meta`` is replaced by a fresh dict.

    Args:
        chapter: Chapter payload to mutate.
    """
    warning_role = "content-sparse-warning"

    blocks = chapter.get("blocks")
    if not isinstance(blocks, list) or not blocks:
        # Nothing usable to insert into: start from a fresh list.
        blocks = []
        chapter["blocks"] = blocks

    already_present = any(
        isinstance(existing, dict)
        and isinstance(existing.get("meta"), dict)
        and existing["meta"].get("role") == warning_role
        for existing in blocks
    )
    if not already_present:
        warning_block = {
            "type": "paragraph",
            "inlines": [
                {
                    "text": self._CONTENT_SPARSE_WARNING_TEXT,
                    "marks": [{"type": "italic"}],
                }
            ],
            "meta": {"role": warning_role},
        }
        # Position of the first heading, or None when the chapter has none.
        heading_index = next(
            (
                idx
                for idx, existing in enumerate(blocks)
                if isinstance(existing, dict) and existing.get("type") == "heading"
            ),
            None,
        )
        insert_at = 0 if heading_index is None else heading_index + 1
        blocks.insert(insert_at, warning_block)

    meta = chapter.get("meta")
    if isinstance(meta, dict):
        meta["contentSparseWarning"] = True
    else:
        chapter["meta"] = {"contentSparseWarning": True}
def _stringify(self, value: Any) -> str: def _stringify(self, value: Any) -> str:
""" """
安全地将对象转成字符串。 安全地将对象转成字符串。
+28 -3
View File
@@ -55,6 +55,20 @@ class ChapterContentError(ValueError):
当LLM仅输出标题或正文不足以支撑一章时触发,驱动重试以保证报告质量。 当LLM仅输出标题或正文不足以支撑一章时触发,驱动重试以保证报告质量。
""" """
def __init__(
    self,
    message: str,
    chapter: Optional[Dict[str, Any]] = None,
    body_characters: int = 0,
    narrative_characters: int = 0,
    non_heading_blocks: int = 0,
):
    """
    Create the error and attach the rejected chapter plus its size metrics.

    Args:
        message: Human-readable description passed on to the base exception.
        chapter: The chapter payload that failed validation; stored by
            reference (not copied) as ``chapter_payload``.
        body_characters: Body character count measured for the chapter.
        narrative_characters: Narrative character count measured for the chapter.
        non_heading_blocks: Number of non-heading blocks found in the chapter.

    Each counter is stored as an ``int``; falsy values (``None``, ``0``, ...)
    are normalised to ``0``.
    """
    super().__init__(message)
    self.chapter_payload: Optional[Dict[str, Any]] = chapter
    self.body_characters: int = int(body_characters) if body_characters else 0
    self.narrative_characters: int = (
        int(narrative_characters) if narrative_characters else 0
    )
    self.non_heading_blocks: int = (
        int(non_heading_blocks) if non_heading_blocks else 0
    )
class ChapterGenerationNode(BaseNode): class ChapterGenerationNode(BaseNode):
""" """
@@ -897,7 +911,13 @@ class ChapterGenerationNode(BaseNode):
""" """
blocks = chapter.get("blocks") blocks = chapter.get("blocks")
if not isinstance(blocks, list) or not blocks: if not isinstance(blocks, list) or not blocks:
raise ChapterContentError("章节缺少正文区块,无法输出内容") raise ChapterContentError(
"章节缺少正文区块,无法输出内容",
chapter=chapter,
body_characters=0,
narrative_characters=0,
non_heading_blocks=0,
)
non_heading_blocks = [ non_heading_blocks = [
block block
@@ -905,16 +925,21 @@ class ChapterGenerationNode(BaseNode):
if isinstance(block, dict) if isinstance(block, dict)
and block.get("type") not in {"heading", "divider", "toc"} and block.get("type") not in {"heading", "divider", "toc"}
] ]
valid_block_count = len(non_heading_blocks)
body_characters = self._count_body_characters(blocks) body_characters = self._count_body_characters(blocks)
narrative_characters = self._count_narrative_characters(blocks) narrative_characters = self._count_narrative_characters(blocks)
if ( if (
len(non_heading_blocks) < self._MIN_NON_HEADING_BLOCKS valid_block_count < self._MIN_NON_HEADING_BLOCKS
or body_characters < self._MIN_BODY_CHARACTERS or body_characters < self._MIN_BODY_CHARACTERS
or narrative_characters < self._MIN_NARRATIVE_CHARACTERS or narrative_characters < self._MIN_NARRATIVE_CHARACTERS
): ):
raise ChapterContentError( raise ChapterContentError(
f"{chapter.get('title') or '该章节'} 正文不足:有效区块 {len(non_heading_blocks)} 个,估算字符数 {body_characters},叙述性字符数 {narrative_characters}" f"{chapter.get('title') or '该章节'} 正文不足:有效区块 {valid_block_count} 个,估算字符数 {body_characters},叙述性字符数 {narrative_characters}",
chapter=chapter,
body_characters=body_characters,
narrative_characters=narrative_characters,
non_heading_blocks=valid_block_count,
) )
def _count_body_characters(self, blocks: Any) -> int: def _count_body_characters(self, blocks: Any) -> int: