Optimize Re-output Logic
This commit is contained in:
@@ -90,7 +90,8 @@ class ChapterGenerationNode(BaseNode):
|
|||||||
}
|
}
|
||||||
# 章节若仅包含标题或字符过少则视为失败,强制LLM重新生成
|
# 章节若仅包含标题或字符过少则视为失败,强制LLM重新生成
|
||||||
_MIN_NON_HEADING_BLOCKS = 2
|
_MIN_NON_HEADING_BLOCKS = 2
|
||||||
_MIN_BODY_CHARACTERS = 400
|
_MIN_BODY_CHARACTERS = 600
|
||||||
|
_MIN_NARRATIVE_CHARACTERS = 300
|
||||||
_PARAGRAPH_FRAGMENT_MAX_CHARS = 80
|
_PARAGRAPH_FRAGMENT_MAX_CHARS = 80
|
||||||
_PARAGRAPH_FRAGMENT_NO_TERMINATOR_MAX_CHARS = 240
|
_PARAGRAPH_FRAGMENT_NO_TERMINATOR_MAX_CHARS = 240
|
||||||
_TERMINATION_PUNCTUATION = set("。!?!?;;……")
|
_TERMINATION_PUNCTUATION = set("。!?!?;;……")
|
||||||
@@ -659,10 +660,15 @@ class ChapterGenerationNode(BaseNode):
|
|||||||
and block.get("type") not in {"heading", "divider", "toc"}
|
and block.get("type") not in {"heading", "divider", "toc"}
|
||||||
]
|
]
|
||||||
body_characters = self._count_body_characters(blocks)
|
body_characters = self._count_body_characters(blocks)
|
||||||
|
narrative_characters = self._count_narrative_characters(blocks)
|
||||||
|
|
||||||
if len(non_heading_blocks) < self._MIN_NON_HEADING_BLOCKS or body_characters < self._MIN_BODY_CHARACTERS:
|
if (
|
||||||
|
len(non_heading_blocks) < self._MIN_NON_HEADING_BLOCKS
|
||||||
|
or body_characters < self._MIN_BODY_CHARACTERS
|
||||||
|
or narrative_characters < self._MIN_NARRATIVE_CHARACTERS
|
||||||
|
):
|
||||||
raise ChapterContentError(
|
raise ChapterContentError(
|
||||||
f"{chapter.get('title') or '该章节'} 正文不足:有效区块 {len(non_heading_blocks)} 个,估算字符数 {body_characters}"
|
f"{chapter.get('title') or '该章节'} 正文不足:有效区块 {len(non_heading_blocks)} 个,估算字符数 {body_characters},叙述性字符数 {narrative_characters}"
|
||||||
)
|
)
|
||||||
|
|
||||||
def _count_body_characters(self, blocks: Any) -> int:
|
def _count_body_characters(self, blocks: Any) -> int:
|
||||||
@@ -696,19 +702,7 @@ class ChapterGenerationNode(BaseNode):
|
|||||||
return 0
|
return 0
|
||||||
|
|
||||||
if block_type == "paragraph":
|
if block_type == "paragraph":
|
||||||
inlines = node.get("inlines")
|
return self._estimate_paragraph_characters(node)
|
||||||
if isinstance(inlines, list):
|
|
||||||
total = 0
|
|
||||||
for run in inlines:
|
|
||||||
if isinstance(run, dict):
|
|
||||||
text = run.get("text")
|
|
||||||
if isinstance(text, str):
|
|
||||||
total += len(text.strip())
|
|
||||||
return total
|
|
||||||
text_value = node.get("text")
|
|
||||||
if isinstance(text_value, str):
|
|
||||||
return len(text_value.strip())
|
|
||||||
return len(self._extract_block_text(node).strip())
|
|
||||||
|
|
||||||
if block_type == "list":
|
if block_type == "list":
|
||||||
total = 0
|
total = 0
|
||||||
@@ -735,6 +729,57 @@ class ChapterGenerationNode(BaseNode):
|
|||||||
|
|
||||||
return walk(blocks)
|
return walk(blocks)
|
||||||
|
|
||||||
|
def _count_narrative_characters(self, blocks: Any) -> int:
|
||||||
|
"""
|
||||||
|
统计paragraph/callout/list/blockquote等叙述性结构的字符数,避免被表格/图表“刷长”。
|
||||||
|
"""
|
||||||
|
|
||||||
|
def walk(node: Any) -> int:
|
||||||
|
if node is None:
|
||||||
|
return 0
|
||||||
|
if isinstance(node, list):
|
||||||
|
return sum(walk(item) for item in node)
|
||||||
|
if isinstance(node, str):
|
||||||
|
return len(node.strip())
|
||||||
|
if not isinstance(node, dict):
|
||||||
|
return 0
|
||||||
|
|
||||||
|
block_type = node.get("type")
|
||||||
|
if block_type == "paragraph":
|
||||||
|
return self._estimate_paragraph_characters(node)
|
||||||
|
if block_type == "list":
|
||||||
|
total = 0
|
||||||
|
for item in node.get("items", []):
|
||||||
|
total += walk(item)
|
||||||
|
return total
|
||||||
|
if block_type in {"callout", "blockquote"}:
|
||||||
|
return walk(node.get("blocks"))
|
||||||
|
|
||||||
|
# list项可能是匿名dict,兼容性遍历
|
||||||
|
if block_type is None:
|
||||||
|
nested = node.get("blocks")
|
||||||
|
if isinstance(nested, list):
|
||||||
|
return walk(nested)
|
||||||
|
return 0
|
||||||
|
|
||||||
|
return walk(blocks)
|
||||||
|
|
||||||
|
def _estimate_paragraph_characters(self, block: Dict[str, Any]) -> int:
|
||||||
|
"""提取paragraph文本长度,复用在多种统计中。"""
|
||||||
|
inlines = block.get("inlines")
|
||||||
|
if isinstance(inlines, list):
|
||||||
|
total = 0
|
||||||
|
for run in inlines:
|
||||||
|
if isinstance(run, dict):
|
||||||
|
text = run.get("text")
|
||||||
|
if isinstance(text, str):
|
||||||
|
total += len(text.strip())
|
||||||
|
return total
|
||||||
|
text_value = block.get("text")
|
||||||
|
if isinstance(text_value, str):
|
||||||
|
return len(text_value.strip())
|
||||||
|
return len(self._extract_block_text(block).strip())
|
||||||
|
|
||||||
def _sanitize_block_content(self, block: Dict[str, Any]):
|
def _sanitize_block_content(self, block: Dict[str, Any]):
|
||||||
"""根据类型做精细化修复,例如清理paragraph内的非法inline mark"""
|
"""根据类型做精细化修复,例如清理paragraph内的非法inline mark"""
|
||||||
block_type = block.get("type")
|
block_type = block.get("type")
|
||||||
|
|||||||
Reference in New Issue
Block a user