Improved Rendering

This commit is contained in:
马一丁
2025-11-13 22:31:02 +08:00
parent fa787af135
commit 82152547e1
4 changed files with 1006 additions and 84 deletions
+163 -10
View File
@@ -10,12 +10,12 @@ from __future__ import annotations
import json
from pathlib import Path
import re
from typing import Any, Dict, List, Tuple
from typing import Any, Dict, List, Tuple, Callable, Optional
from loguru import logger
from ..core import TemplateSection, ChapterStorage
from ..ir import ALLOWED_BLOCK_TYPES, IRValidator
from ..ir import ALLOWED_BLOCK_TYPES, ALLOWED_INLINE_MARKS, IRValidator
from ..prompts import (
SYSTEM_PROMPT_CHAPTER_JSON,
build_chapter_user_prompt,
@@ -28,10 +28,41 @@ except ImportError: # pragma: no cover - optional dependency
_json_repair_fn = None
class ChapterJsonParseError(ValueError):
    """Signal that a chapter's LLM output could not be decoded as JSON.

    Subclasses ``ValueError`` so handlers written for ``ValueError`` still
    catch it.

    Args:
        message: Human-readable description of the failure.
        raw_text: Text supplied by the raiser for later inspection, or
            ``None`` when nothing was attached.
    """

    def __init__(self, message: str, raw_text: Optional[str] = None):
        # Stash the payload first so it is available even if a handler only
        # ever looks at the attribute and never at ``args``.
        self.raw_text = raw_text
        super().__init__(message)
class ChapterGenerationNode(BaseNode):
"""负责按章节调用LLM并校验JSON结构"""
# Matches a closing quote + colon (plus optional whitespace) that is directly
# followed by a stray "="; group 1 captures everything before that "=".
# NOTE(review): no use of this pattern is visible in this excerpt - presumably
# it repairs `"key": =value` slips in LLM JSON output; confirm at the call site.
_COLON_EQUALS_PATTERN = re.compile(r'(":\s*)=')
# Marker returned by _canonical_inline_mark_type for break/linebreak/br marks;
# _normalize_inline_mark turns it into a "\n" text suffix instead of a mark.
_LINE_BREAK_SENTINEL = "__LINE_BREAK__"
# Alias -> canonical inline mark type. Keys are lowercase because the lookup in
# _canonical_inline_mark_type lowercases the incoming type first; names missing
# from this table are passed through unchanged (lowercased).
_INLINE_MARK_ALIASES = {
"strong": "bold",
"b": "bold",
"em": "italic",
"emphasis": "italic",
"i": "italic",
"u": "underline",
"strike-through": "strike",
"strikethrough": "strike",
"s": "strike",
"codeblock": "code",
"monospace": "code",
"hyperlink": "link",
"url": "link",
"colour": "color",
"textcolor": "color",
"bgcolor": "highlight",
"background": "highlight",
"highlightcolor": "highlight",
"sub": "subscript",
"sup": "superscript",
}
def __init__(self, llm_client, validator: IRValidator, storage: ChapterStorage):
"""
@@ -51,6 +82,7 @@ class ChapterGenerationNode(BaseNode):
section: TemplateSection,
context: Dict[str, Any],
run_dir: Path,
stream_callback: Optional[Callable[[str, Dict[str, Any]], None]] = None,
**kwargs,
) -> Dict[str, Any]:
"""针对单个章节调用LLM,校验/落盘章节JSON并返回结构化结果"""
@@ -64,7 +96,13 @@ class ChapterGenerationNode(BaseNode):
llm_payload = self._build_payload(section, context)
user_message = build_chapter_user_prompt(llm_payload)
raw_text = self._stream_llm(user_message, chapter_dir, **kwargs)
raw_text = self._stream_llm(
user_message,
chapter_dir,
stream_callback=stream_callback,
section_meta=chapter_meta,
**kwargs,
)
chapter_json = self._parse_chapter(raw_text)
# 自动补全关键字段后再校验
@@ -150,8 +188,15 @@ class ChapterGenerationNode(BaseNode):
payload["globalContext"]["sectionBudgets"] = chapter_plan["sections"]
return payload
def _stream_llm(self, user_message: str, chapter_dir: Path, **kwargs) -> str:
"""流式调用LLM并实时写入raw文件"""
def _stream_llm(
self,
user_message: str,
chapter_dir: Path,
stream_callback: Optional[Callable[[str, Dict[str, Any]], None]] = None,
section_meta: Optional[Dict[str, Any]] = None,
**kwargs,
) -> str:
"""流式调用LLM并实时写入raw文件,同时通过回调将delta抛出。"""
chunks: List[str] = []
with self.storage.capture_stream(chapter_dir) as stream_fp:
stream = self.llm_client.stream_invoke(
@@ -163,6 +208,12 @@ class ChapterGenerationNode(BaseNode):
for delta in stream:
stream_fp.write(delta)
chunks.append(delta)
if stream_callback:
meta = section_meta or {}
try:
stream_callback(delta, meta)
except Exception as callback_error: # pragma: no cover - 仅记录,不阻断主流程
logger.warning(f"章节流式回调失败: {callback_error}")
return "".join(chunks)
def _parse_chapter(self, raw_text: str) -> Dict[str, Any]:
@@ -192,9 +243,13 @@ class ChapterGenerationNode(BaseNode):
try:
data = self._parse_with_candidates(candidate_payloads[-1:])
except json.JSONDecodeError as inner_exc:
raise ValueError(f"章节JSON解析失败: {inner_exc}") from inner_exc
raise ChapterJsonParseError(
f"章节JSON解析失败: {inner_exc}", raw_text=cleaned
) from inner_exc
else:
raise ValueError(f"章节JSON解析失败: {exc}") from exc
raise ChapterJsonParseError(
f"章节JSON解析失败: {exc}", raw_text=cleaned
) from exc
if "chapter" in data and isinstance(data["chapter"], dict):
return data["chapter"]
@@ -400,6 +455,7 @@ class ChapterGenerationNode(BaseNode):
if not isinstance(block, dict):
continue
self._ensure_block_type(block)
self._sanitize_block_content(block)
block_type = block.get("type")
if block_type == "list":
items = block.get("items")
@@ -424,6 +480,98 @@ class ChapterGenerationNode(BaseNode):
walk(chapter.get("blocks"))
def _sanitize_block_content(self, block: Dict[str, Any]):
"""根据类型做精细化修复,例如清理paragraph内的非法inline mark"""
block_type = block.get("type")
if block_type == "paragraph":
self._normalize_paragraph_block(block)
def _normalize_paragraph_block(self, block: Dict[str, Any]):
"""将paragraph的inlines统一规整,剔除非法marks"""
inlines = block.get("inlines")
normalized_runs: List[Dict[str, Any]] = []
if isinstance(inlines, list) and inlines:
for run in inlines:
normalized_runs.extend(self._coerce_inline_run(run))
else:
normalized_runs = [self._as_inline_run(self._extract_block_text(block))]
if not normalized_runs:
normalized_runs = [self._as_inline_run("")]
block["inlines"] = normalized_runs
def _coerce_inline_run(self, run: Any) -> List[Dict[str, Any]]:
"""将任意inline写法规整为合法run"""
if isinstance(run, dict):
normalized_run = dict(run)
text = normalized_run.get("text")
if not isinstance(text, str):
text = "" if text is None else str(text)
marks = normalized_run.get("marks")
sanitized_marks, extra_text = self._sanitize_inline_marks(marks)
normalized_run["marks"] = sanitized_marks
normalized_run["text"] = (text or "") + extra_text
return [normalized_run]
if isinstance(run, str):
return [self._as_inline_run(run)]
if isinstance(run, (int, float)):
return [self._as_inline_run(str(run))]
if isinstance(run, list):
normalized: List[Dict[str, Any]] = []
for item in run:
normalized.extend(self._coerce_inline_run(item))
return normalized
return [self._as_inline_run("" if run is None else str(run))]
def _sanitize_inline_marks(self, marks: Any) -> Tuple[List[Dict[str, Any]], str]:
"""过滤非法marks并将break类控制符转成文本"""
text_suffix = ""
if marks is None:
return [], text_suffix
mark_list = marks if isinstance(marks, list) else [marks]
sanitized: List[Dict[str, Any]] = []
for mark in mark_list:
normalized_mark, extra_text = self._normalize_inline_mark(mark)
if normalized_mark:
sanitized.append(normalized_mark)
if extra_text:
text_suffix += extra_text
return sanitized, text_suffix
def _normalize_inline_mark(self, mark: Any) -> Tuple[Dict[str, Any] | None, str]:
"""对单个mark做兼容映射,或者在必要时转换为文本"""
if not isinstance(mark, dict):
return None, ""
canonical_type = self._canonical_inline_mark_type(mark.get("type"))
if canonical_type == self._LINE_BREAK_SENTINEL:
return None, "\n"
if canonical_type in ALLOWED_INLINE_MARKS:
normalized = dict(mark)
normalized["type"] = canonical_type
return normalized, ""
return None, ""
def _canonical_inline_mark_type(self, mark_type: Any) -> str | None:
"""将mark type映射为Schema所支持的取值"""
if not isinstance(mark_type, str):
return None
normalized = mark_type.strip()
if not normalized:
return None
lowered = normalized.lower()
if lowered in {"break", "linebreak", "br"}:
return self._LINE_BREAK_SENTINEL
return self._INLINE_MARK_ALIASES.get(lowered, lowered)
def _extract_block_text(self, block: Dict[str, Any]) -> str:
"""优先从text/content等字段提取fallback文本"""
for key in ("text", "content", "value", "title"):
value = block.get(key)
if isinstance(value, str):
return value
if value is not None:
return str(value)
return ""
def _normalize_list_items(self, items: Any) -> List[List[Dict[str, Any]]]:
"""确保list block的items为[[block, block], ...]结构"""
if not isinstance(items, list):
@@ -490,16 +638,21 @@ class ChapterGenerationNode(BaseNode):
text = str(block)
block.clear()
block["type"] = "paragraph"
block["inlines"] = [{"text": text}]
block["inlines"] = [self._as_inline_run(text)]
@staticmethod
def _as_paragraph_block(text: str) -> Dict[str, Any]:
    """Wrap plain text in a minimal paragraph block.

    Args:
        text: Paragraph content; a falsy value produces an empty run.

    Returns:
        ``{"type": "paragraph", "inlines": [run]}`` where ``run`` is built by
        ``_as_inline_run`` and therefore always carries a ``marks`` list.
    """
    # Exactly one "inlines" entry: the earlier bare ``{"text": ...}`` variant
    # was shadowed by a duplicate key and never reached the result.
    return {
        "type": "paragraph",
        "inlines": [ChapterGenerationNode._as_inline_run(text)],
    }
@staticmethod
def _as_inline_run(text: str) -> Dict[str, Any]:
"""构造基础inline run,保证marks字段存在"""
return {"text": text or "", "marks": []}
@staticmethod
def _parse_with_candidates(payloads: List[str]) -> Dict[str, Any]:
"""按顺序尝试多个payload,直到解析成功"""
@@ -513,4 +666,4 @@ class ChapterGenerationNode(BaseNode):
raise last_exc
# Public API of this module. A single assignment: an earlier one listing only
# the node class was immediately overwritten and had no effect.
__all__ = ["ChapterGenerationNode", "ChapterJsonParseError"]