Blocked HTML
This commit is contained in:
@@ -5,11 +5,15 @@ Report Engine节点处理模块
|
||||
|
||||
from .base_node import BaseNode, StateMutationNode
|
||||
from .template_selection_node import TemplateSelectionNode
|
||||
from .html_generation_node import HTMLGenerationNode
|
||||
from .chapter_generation_node import ChapterGenerationNode
|
||||
from .document_layout_node import DocumentLayoutNode
|
||||
from .word_budget_node import WordBudgetNode
|
||||
|
||||
# Public API of the nodes package.
# Every entry carries its own trailing comma: adjacent string literals are
# concatenated implicitly, so a missing comma silently merges two export names
# into one bogus name. Each name is listed exactly once.
__all__ = [
    "BaseNode",
    "StateMutationNode",
    "TemplateSelectionNode",
    "HTMLGenerationNode",
    "ChapterGenerationNode",
    "DocumentLayoutNode",
    "WordBudgetNode",
]
|
||||
|
||||
@@ -0,0 +1,506 @@
|
||||
"""
|
||||
章节级JSON生成节点。
|
||||
|
||||
每个章节依据Markdown模板切片独立调用LLM,流式写入Raw文件,
|
||||
完成后校验并落盘标准化JSON。该节点只负责“拿到合规章节”。
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
from pathlib import Path
|
||||
import re
|
||||
from typing import Any, Dict, List, Tuple
|
||||
|
||||
from loguru import logger
|
||||
|
||||
from ..core import TemplateSection, ChapterStorage
|
||||
from ..ir import ALLOWED_BLOCK_TYPES, IRValidator
|
||||
from ..prompts import (
|
||||
SYSTEM_PROMPT_CHAPTER_JSON,
|
||||
build_chapter_user_prompt,
|
||||
)
|
||||
from .base_node import BaseNode
|
||||
|
||||
try:
|
||||
from json_repair import repair_json as _json_repair_fn
|
||||
except ImportError: # pragma: no cover - optional dependency
|
||||
_json_repair_fn = None
|
||||
|
||||
|
||||
class ChapterGenerationNode(BaseNode):
|
||||
"""负责按章节调用LLM并校验JSON结构"""
|
||||
|
||||
_COLON_EQUALS_PATTERN = re.compile(r'(":\s*)=')
|
||||
|
||||
def __init__(self, llm_client, validator: IRValidator, storage: ChapterStorage):
    """
    Set up the chapter generation node.

    Args:
        llm_client: LLM client passed on to the base node.
        validator: Validator used to check each generated chapter.
        storage: Storage used to capture raw streams and persist chapters.
    """
    node_name = "ChapterGenerationNode"
    super().__init__(llm_client, node_name)
    self.storage = storage
    self.validator = validator
|
||||
|
||||
def run(
    self,
    section: TemplateSection,
    context: Dict[str, Any],
    run_dir: Path,
    **kwargs,
) -> Dict[str, Any]:
    """
    Generate, validate and persist the JSON of a single chapter.

    Args:
        section: Template slice describing the chapter to write.
        context: Shared generation context used to build the LLM payload.
        run_dir: Directory of the current run, handed to the storage.
        **kwargs: Forwarded to the streaming call (``temperature`` / ``top_p``).

    Returns:
        The chapter dict after missing key fields were filled in and its
        blocks were sanitized.

    Raises:
        ValueError: If the LLM output cannot be parsed, or if validation
            fails (the chapter is persisted together with its errors first).
    """
    chapter_meta = dict(
        chapterId=section.chapter_id,
        slug=section.slug,
        title=section.title,
        order=section.order,
    )
    chapter_dir = self.storage.begin_chapter(run_dir, chapter_meta)
    user_message = build_chapter_user_prompt(self._build_payload(section, context))

    raw_text = self._stream_llm(user_message, chapter_dir, **kwargs)
    chapter_json = self._parse_chapter(raw_text)

    # Fill in key fields the model left out before validating.
    fallbacks = (
        ("chapterId", section.chapter_id),
        ("anchor", section.slug),
        ("title", section.title),
        ("order", section.order),
    )
    for field_name, fallback in fallbacks:
        chapter_json.setdefault(field_name, fallback)
    self._sanitize_chapter_blocks(chapter_json)

    valid, errors = self.validator.validate_chapter(chapter_json)
    persisted_errors = errors if not valid else None
    self.storage.persist_chapter(
        run_dir,
        chapter_meta,
        chapter_json,
        errors=persisted_errors,
    )

    if valid:
        return chapter_json
    raise ValueError(
        f"{section.title} 章节JSON校验失败: {'; '.join(errors[:5])}"
    )
|
||||
|
||||
# ====== 内部方法 ======
|
||||
|
||||
def _build_payload(self, section: TemplateSection, context: Dict[str, Any]) -> Dict[str, Any]:
    """
    Assemble the input payload for the chapter LLM call.

    Args:
        section: Template slice of the chapter being generated.
        context: Shared generation context (reports, layout, theme tokens,
            word plan, ...), read with ``dict.get``.

    Returns:
        Dict with the keys ``section``, ``globalContext``, ``reports``,
        ``forumLogs``, ``dataBundles``, ``constraints``, ``chapterPlan``
        and ``wordPlan``.
    """
    reports = context.get("reports", {})
    # Per-chapter length plan (from WordBudgetNode); guides word counts and emphasis.
    directives = context.get("chapter_directives", {})
    if directives:
        chapter_plan = directives.get(section.chapter_id)
    else:
        chapter_plan = {}

    section_info = {
        "chapterId": section.chapter_id,
        "title": section.title,
        "slug": section.slug,
        "order": section.order,
        "number": section.number,
        "outline": section.outline,
    }
    global_context = {
        "query": context.get("query"),
        "templateName": context.get("template_name"),
        "themeTokens": context.get("theme_tokens", {}),
        "styleDirectives": context.get("style_directives", {}),
        # layout carries title/TOC/hero info so chapters keep a unified visual tone
        "layout": context.get("layout"),
        "templateOverview": context.get("template_overview", {}),
    }
    engine_names = ("query_engine", "media_engine", "insight_engine")
    report_texts = {name: reports.get(name, "") for name in engine_names}
    constraints = {
        "language": "zh-CN",
        "maxTokens": context.get("max_tokens", 4096),
        "allowedBlocks": ALLOWED_BLOCK_TYPES,
        "styleHints": {
            "expectWidgets": True,
            "forceHeadingAnchors": True,
            "allowInlineMix": True,
        },
    }
    payload = {
        "section": section_info,
        "globalContext": global_context,
        "reports": report_texts,
        "forumLogs": context.get("forum_logs", ""),
        "dataBundles": context.get("data_bundles", []),
        "constraints": constraints,
        "chapterPlan": chapter_plan,
        "wordPlan": context.get("word_plan"),
    }

    if chapter_plan:
        # Copy only the plan entries that are present and truthy.
        plan_to_constraint = (
            ("targetWords", "wordTarget"),
            ("minWords", "minWords"),
            ("maxWords", "maxWords"),
            ("emphasis", "emphasis"),
        )
        for plan_key, constraint_key in plan_to_constraint:
            if chapter_plan.get(plan_key):
                constraints[constraint_key] = chapter_plan[plan_key]
        if chapter_plan.get("sections"):
            constraints["sectionBudgets"] = chapter_plan["sections"]
            global_context["sectionBudgets"] = chapter_plan["sections"]
    return payload
|
||||
|
||||
def _stream_llm(self, user_message: str, chapter_dir: Path, **kwargs) -> str:
    """
    Stream the LLM response, writing every delta to the raw capture file.

    Args:
        user_message: Rendered user prompt for the chapter.
        chapter_dir: Chapter directory handed to ``storage.capture_stream``.
        **kwargs: Optional ``temperature`` (default 0.2) and ``top_p``
            (default 0.95) overrides.

    Returns:
        The concatenation of all streamed deltas.
    """
    temperature = kwargs.get("temperature", 0.2)
    top_p = kwargs.get("top_p", 0.95)
    collected: List[str] = []
    with self.storage.capture_stream(chapter_dir) as raw_fp:
        deltas = self.llm_client.stream_invoke(
            SYSTEM_PROMPT_CHAPTER_JSON,
            user_message,
            temperature=temperature,
            top_p=top_p,
        )
        for piece in deltas:
            raw_fp.write(piece)
            collected.append(piece)
    return "".join(collected)
|
||||
|
||||
def _parse_chapter(self, raw_text: str) -> Dict[str, Any]:
|
||||
"""清洗LLM输出并解析JSON"""
|
||||
cleaned = raw_text.strip()
|
||||
if cleaned.startswith("```json"):
|
||||
cleaned = cleaned[7:]
|
||||
if cleaned.startswith("```"):
|
||||
cleaned = cleaned[3:]
|
||||
if cleaned.endswith("```"):
|
||||
cleaned = cleaned[:-3]
|
||||
cleaned = cleaned.strip()
|
||||
if not cleaned:
|
||||
raise ValueError("LLM返回空内容")
|
||||
|
||||
candidate_payloads = [cleaned]
|
||||
repaired = self._repair_llm_json(cleaned)
|
||||
if repaired != cleaned:
|
||||
candidate_payloads.append(repaired)
|
||||
|
||||
try:
|
||||
data = self._parse_with_candidates(candidate_payloads)
|
||||
except json.JSONDecodeError as exc:
|
||||
repaired_payload = self._attempt_json_repair(cleaned)
|
||||
if repaired_payload:
|
||||
candidate_payloads.append(repaired_payload)
|
||||
try:
|
||||
data = self._parse_with_candidates(candidate_payloads[-1:])
|
||||
except json.JSONDecodeError as inner_exc:
|
||||
raise ValueError(f"章节JSON解析失败: {inner_exc}") from inner_exc
|
||||
else:
|
||||
raise ValueError(f"章节JSON解析失败: {exc}") from exc
|
||||
|
||||
if "chapter" in data and isinstance(data["chapter"], dict):
|
||||
return data["chapter"]
|
||||
if isinstance(data, dict) and all(
|
||||
key in data for key in ("chapterId", "title", "blocks")
|
||||
):
|
||||
return data
|
||||
if isinstance(data, list):
|
||||
for item in data:
|
||||
if isinstance(item, dict):
|
||||
if "chapter" in item and isinstance(item["chapter"], dict):
|
||||
return item["chapter"]
|
||||
if all(key in item for key in ("chapterId", "title", "blocks")):
|
||||
return item
|
||||
raise ValueError("章节JSON缺少chapter字段")
|
||||
|
||||
def _repair_llm_json(self, text: str) -> str:
|
||||
"""处理常见的LLM错误(如\":=导致的非法JSON)"""
|
||||
repaired = text
|
||||
mutated = False
|
||||
|
||||
new_text = self._COLON_EQUALS_PATTERN.sub(r"\1", repaired)
|
||||
if new_text != repaired:
|
||||
logger.warning("检测到章节JSON中的\":=\"字符,已自动移除多余的'='号")
|
||||
repaired = new_text
|
||||
mutated = True
|
||||
|
||||
repaired, escaped = self._escape_in_string_controls(repaired)
|
||||
if escaped:
|
||||
logger.warning("检测到章节JSON字符串中存在未转义的控制字符,已自动转换为转义序列")
|
||||
mutated = True
|
||||
|
||||
repaired, balanced = self._balance_brackets(repaired)
|
||||
if balanced:
|
||||
logger.warning("检测到章节JSON括号不平衡,已自动补齐/剔除异常括号")
|
||||
mutated = True
|
||||
|
||||
repaired, commas_fixed = self._fix_missing_commas(repaired)
|
||||
if commas_fixed:
|
||||
logger.warning("检测到章节JSON对象/数组之间缺少逗号,已自动补齐")
|
||||
mutated = True
|
||||
|
||||
return repaired if mutated else text
|
||||
|
||||
def _escape_in_string_controls(self, text: str) -> Tuple[str, bool]:
|
||||
"""
|
||||
将字符串字面量中的裸换行/制表符/控制字符替换为JSON合法的转义序列。
|
||||
"""
|
||||
if not text:
|
||||
return text, False
|
||||
|
||||
result: List[str] = []
|
||||
in_string = False
|
||||
escaped = False
|
||||
mutated = False
|
||||
control_map = {"\n": "\\n", "\r": "\\n", "\t": "\\t"}
|
||||
|
||||
for ch in text:
|
||||
if escaped:
|
||||
result.append(ch)
|
||||
escaped = False
|
||||
continue
|
||||
|
||||
if ch == "\\":
|
||||
result.append(ch)
|
||||
escaped = True
|
||||
continue
|
||||
|
||||
if ch == '"':
|
||||
result.append(ch)
|
||||
in_string = not in_string
|
||||
continue
|
||||
|
||||
if in_string and ch in control_map:
|
||||
result.append(control_map[ch])
|
||||
mutated = True
|
||||
continue
|
||||
|
||||
if in_string and ord(ch) < 0x20:
|
||||
result.append(f"\\u{ord(ch):04x}")
|
||||
mutated = True
|
||||
continue
|
||||
|
||||
result.append(ch)
|
||||
|
||||
return "".join(result), mutated
|
||||
|
||||
def _fix_missing_commas(self, text: str) -> Tuple[str, bool]:
|
||||
"""在对象/数组连续出现时自动补逗号"""
|
||||
if not text:
|
||||
return text, False
|
||||
|
||||
chars: List[str] = []
|
||||
mutated = False
|
||||
in_string = False
|
||||
escaped = False
|
||||
length = len(text)
|
||||
i = 0
|
||||
while i < length:
|
||||
ch = text[i]
|
||||
chars.append(ch)
|
||||
if escaped:
|
||||
escaped = False
|
||||
i += 1
|
||||
continue
|
||||
if ch == "\\":
|
||||
escaped = True
|
||||
i += 1
|
||||
continue
|
||||
if ch == '"':
|
||||
in_string = not in_string
|
||||
i += 1
|
||||
continue
|
||||
if not in_string and ch in "}]":
|
||||
j = i + 1
|
||||
while j < length and text[j] in " \t\r\n":
|
||||
j += 1
|
||||
if j < length:
|
||||
next_ch = text[j]
|
||||
if next_ch in "{[":
|
||||
chars.append(",")
|
||||
mutated = True
|
||||
i += 1
|
||||
return "".join(chars), mutated
|
||||
|
||||
def _balance_brackets(self, text: str) -> Tuple[str, bool]:
|
||||
"""尝试修复因LLM多写/少写括号导致的不平衡结构"""
|
||||
if not text:
|
||||
return text, False
|
||||
|
||||
result: List[str] = []
|
||||
stack: List[str] = []
|
||||
mutated = False
|
||||
in_string = False
|
||||
escaped = False
|
||||
|
||||
opener_map = {"{": "}", "[": "]"}
|
||||
|
||||
for ch in text:
|
||||
if escaped:
|
||||
result.append(ch)
|
||||
escaped = False
|
||||
continue
|
||||
|
||||
if ch == "\\":
|
||||
result.append(ch)
|
||||
escaped = True
|
||||
continue
|
||||
|
||||
if ch == '"':
|
||||
result.append(ch)
|
||||
in_string = not in_string
|
||||
continue
|
||||
|
||||
if in_string:
|
||||
result.append(ch)
|
||||
continue
|
||||
|
||||
if ch in "{[":
|
||||
stack.append(ch)
|
||||
result.append(ch)
|
||||
continue
|
||||
|
||||
if ch in "}]":
|
||||
if stack and ((ch == "}" and stack[-1] == "{") or (ch == "]" and stack[-1] == "[")):
|
||||
stack.pop()
|
||||
result.append(ch)
|
||||
else:
|
||||
mutated = True
|
||||
continue
|
||||
|
||||
result.append(ch)
|
||||
|
||||
while stack:
|
||||
opener = stack.pop()
|
||||
result.append(opener_map[opener])
|
||||
mutated = True
|
||||
|
||||
return "".join(result), mutated
|
||||
|
||||
def _attempt_json_repair(self, text: str) -> str | None:
    """
    Try the optional ``json_repair`` library on text the local fixes could not save.

    Args:
        text: JSON text that failed to parse.

    Returns:
        The repaired text, or None when the library is unavailable, raised an
        error, or returned the text unchanged.
    """
    if _json_repair_fn:
        try:
            candidate = _json_repair_fn(text)
        except Exception as exc:  # pragma: no cover - library failure
            logger.warning(f"json_repair 修复章节JSON失败: {exc}")
        else:
            if candidate != text:
                logger.warning("已使用json_repair自动修复章节JSON语法")
                return candidate
    return None
|
||||
|
||||
def _sanitize_chapter_blocks(self, chapter: Dict[str, Any]):
    """
    Fix common structural mistakes in a chapter's blocks, in place.

    Walks the block tree recursively: every dict block gets a valid type, list
    blocks get their ``items`` normalized, widgets get a top-level ``data``
    entry, and nested ``blocks`` (callout/blockquote/table cells/others) are
    visited too.

    The walk tolerates malformed input — ``items``/``rows``/``cells`` that are
    null or not lists, and rows/cells that are not dicts are skipped instead
    of raising. Reporting such problems is left to the validation that runs
    afterwards.

    Args:
        chapter: Chapter dict whose ``blocks`` entry is sanitized.
    """

    def walk(blocks: Any) -> None:
        if not isinstance(blocks, list):
            return
        for block in blocks:
            if not isinstance(block, dict):
                continue
            self._ensure_block_type(block)
            block_type = block.get("type")
            if block_type == "list":
                normalized = self._normalize_list_items(block.get("items"))
                if normalized:
                    block["items"] = normalized
                # items is left as-is when nothing could be normalized, so it
                # may still be null or a scalar here.
                entries = block.get("items")
                if isinstance(entries, list):
                    for entry in entries:
                        walk(entry)
            elif block_type in {"callout", "blockquote"}:
                walk(block.get("blocks"))
            elif block_type == "table":
                rows = block.get("rows")
                for row in rows if isinstance(rows, list) else []:
                    if not isinstance(row, dict):
                        continue
                    cells = row.get("cells")
                    for cell in cells if isinstance(cells, list) else []:
                        if isinstance(cell, dict):
                            walk(cell.get("blocks"))
            elif block_type == "widget":
                self._normalize_widget_block(block)
            else:
                # walk() itself ignores anything that is not a list.
                walk(block.get("blocks"))

    walk(chapter.get("blocks"))
|
||||
|
||||
def _normalize_list_items(self, items: Any) -> List[List[Dict[str, Any]]]:
|
||||
"""确保list block的items为[[block, block], ...]结构"""
|
||||
if not isinstance(items, list):
|
||||
return []
|
||||
normalized: List[List[Dict[str, Any]]] = []
|
||||
for item in items:
|
||||
normalized.extend(self._coerce_list_item(item))
|
||||
return [entry for entry in normalized if entry]
|
||||
|
||||
def _coerce_list_item(self, item: Any) -> List[List[Dict[str, Any]]]:
|
||||
"""将各种嵌套写法统一折算为区块数组"""
|
||||
result: List[List[Dict[str, Any]]] = []
|
||||
if isinstance(item, dict):
|
||||
self._ensure_block_type(item)
|
||||
result.append([item])
|
||||
return result
|
||||
if isinstance(item, list):
|
||||
dicts = [elem for elem in item if isinstance(elem, dict)]
|
||||
if dicts:
|
||||
for elem in dicts:
|
||||
self._ensure_block_type(elem)
|
||||
result.append(dicts)
|
||||
for elem in item:
|
||||
if isinstance(elem, list):
|
||||
result.extend(self._coerce_list_item(elem))
|
||||
elif isinstance(elem, dict):
|
||||
continue
|
||||
elif isinstance(elem, str):
|
||||
result.append([self._as_paragraph_block(elem)])
|
||||
elif isinstance(elem, (int, float)):
|
||||
result.append([self._as_paragraph_block(str(elem))])
|
||||
elif isinstance(item, str):
|
||||
result.append([self._as_paragraph_block(item)])
|
||||
elif isinstance(item, (int, float)):
|
||||
result.append([self._as_paragraph_block(str(item))])
|
||||
return result
|
||||
|
||||
def _normalize_widget_block(self, block: Dict[str, Any]):
|
||||
"""确保widget具备顶层data或dataRef"""
|
||||
has_data = block.get("data") is not None or block.get("dataRef") is not None
|
||||
if has_data:
|
||||
return
|
||||
props = block.get("props")
|
||||
if isinstance(props, dict) and "data" in props:
|
||||
block["data"] = props.pop("data")
|
||||
return
|
||||
block["data"] = {"labels": [], "datasets": []}
|
||||
|
||||
def _ensure_block_type(self, block: Dict[str, Any]):
    """
    Downgrade a block without a valid ``type`` to a paragraph, in place.

    The paragraph text is the first non-blank string found under ``text``,
    ``content`` or ``title``; if there is none, the JSON dump of the block
    (or its ``str()`` when dumping fails) is used.

    Args:
        block: Block dict; left untouched when its type is allowed.
    """
    current_type = block.get("type")
    if isinstance(current_type, str) and current_type in ALLOWED_BLOCK_TYPES:
        return

    candidates = (block.get(key) for key in ("text", "content", "title"))
    text = next(
        (value.strip() for value in candidates if isinstance(value, str) and value.strip()),
        "",
    )
    if not text:
        try:
            text = json.dumps(block, ensure_ascii=False)
        except Exception:
            text = str(block)

    # Replace the content while keeping the same dict object for its holders.
    block.clear()
    block.update(type="paragraph", inlines=[{"text": text}])
|
||||
|
||||
@staticmethod
|
||||
def _as_paragraph_block(text: str) -> Dict[str, Any]:
|
||||
return {
|
||||
"type": "paragraph",
|
||||
"inlines": [{"text": text or ""}],
|
||||
}
|
||||
|
||||
@staticmethod
|
||||
def _parse_with_candidates(payloads: List[str]) -> Dict[str, Any]:
|
||||
"""按顺序尝试多个payload,直到解析成功"""
|
||||
last_exc: json.JSONDecodeError | None = None
|
||||
for payload in payloads:
|
||||
try:
|
||||
return json.loads(payload)
|
||||
except json.JSONDecodeError as exc:
|
||||
last_exc = exc
|
||||
assert last_exc is not None
|
||||
raise last_exc
|
||||
|
||||
|
||||
__all__ = ["ChapterGenerationNode"]
|
||||
@@ -0,0 +1,81 @@
|
||||
"""
|
||||
根据模板目录与多源报告,生成整本报告的标题/目录/主题设计。
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
from typing import Any, Dict, List
|
||||
|
||||
from loguru import logger
|
||||
|
||||
from ..core import TemplateSection
|
||||
from ..prompts import (
|
||||
SYSTEM_PROMPT_DOCUMENT_LAYOUT,
|
||||
build_document_layout_prompt,
|
||||
)
|
||||
from .base_node import BaseNode
|
||||
|
||||
|
||||
class DocumentLayoutNode(BaseNode):
|
||||
"""负责生成全局标题、目录与Hero设计"""
|
||||
|
||||
def __init__(self, llm_client):
    """
    Set up the document layout node.

    Args:
        llm_client: LLM client passed on to the base node.
    """
    node_name = "DocumentLayoutNode"
    super().__init__(llm_client, node_name)
|
||||
|
||||
def run(
    self,
    sections: List[TemplateSection],
    template_markdown: str,
    reports: Dict[str, str],
    forum_logs: str,
    query: str,
    template_overview: Dict[str, Any] | None = None,
) -> Dict[str, Any]:
    """
    Ask the LLM for the report-wide title, table of contents and theme design.

    Args:
        sections: Template slices of the report.
        template_markdown: Raw template text.
        reports: Source reports passed to the LLM as-is.
        forum_logs: Forum log text.
        query: The user query.
        template_overview: Optional overview; when falsy, one is derived from
            ``sections`` (first section title plus all section dicts).

    Returns:
        The design parsed from the LLM's JSON answer.

    Raises:
        ValueError: If the answer is empty or is not valid JSON.
    """
    # The raw template, its slices and the source reports are all handed to
    # the LLM so it can see both the hierarchy and the material.
    template_info = {
        "raw": template_markdown,
        "sections": [item.to_dict() for item in sections],
    }
    if template_overview:
        overview = template_overview
    else:
        first_title = sections[0].title if sections else ""
        overview = {
            "title": first_title,
            "chapters": [item.to_dict() for item in sections],
        }
    payload = {
        "query": query,
        "template": template_info,
        "templateOverview": overview,
        "reports": reports,
        "forumLogs": forum_logs,
    }

    user_message = build_document_layout_prompt(payload)
    response = self.llm_client.stream_invoke_to_string(
        SYSTEM_PROMPT_DOCUMENT_LAYOUT,
        user_message,
        temperature=0.3,
        top_p=0.9,
    )
    design = self._parse_response(response)
    logger.info("文档标题/目录设计已生成")
    return design
|
||||
|
||||
def _parse_response(self, raw: str) -> Dict[str, Any]:
|
||||
"""解析LLM返回的JSON文本,若失败则抛出友好错误"""
|
||||
cleaned = raw.strip()
|
||||
if cleaned.startswith("```json"):
|
||||
cleaned = cleaned[7:]
|
||||
if cleaned.startswith("```"):
|
||||
cleaned = cleaned[3:]
|
||||
if cleaned.endswith("```"):
|
||||
cleaned = cleaned[:-3]
|
||||
cleaned = cleaned.strip()
|
||||
if not cleaned:
|
||||
raise ValueError("文档设计LLM返回空内容")
|
||||
try:
|
||||
return json.loads(cleaned)
|
||||
except json.JSONDecodeError as exc:
|
||||
raise ValueError(f"文档设计JSON解析失败: {exc}") from exc
|
||||
|
||||
|
||||
__all__ = ["DocumentLayoutNode"]
|
||||
@@ -1,254 +0,0 @@
|
||||
"""
|
||||
HTML生成节点
|
||||
将整合后的内容转换为美观的HTML报告
|
||||
"""
|
||||
|
||||
import json
|
||||
from datetime import datetime
|
||||
from typing import Dict, Any
|
||||
from loguru import logger
|
||||
|
||||
from .base_node import StateMutationNode
|
||||
from ..llms.base import LLMClient
|
||||
from ..state.state import ReportState
|
||||
from ..prompts import SYSTEM_PROMPT_HTML_GENERATION
|
||||
# 不再需要text_processing依赖
|
||||
|
||||
|
||||
class HTMLGenerationNode(StateMutationNode):
|
||||
"""HTML生成处理节点"""
|
||||
|
||||
def __init__(self, llm_client: LLMClient):
    """
    Set up the HTML generation node.

    Args:
        llm_client: LLM client passed on to the base node.
    """
    node_name = "HTMLGenerationNode"
    super().__init__(llm_client, node_name)
|
||||
|
||||
def run(self, input_data: Dict[str, Any], **kwargs) -> str:
    """
    Generate the HTML report with the LLM.

    Args:
        input_data: Report data. The following keys are read (missing ones
            default to an empty string):
            - query: original query
            - query_engine_report: QueryEngine report text
            - media_engine_report: MediaEngine report text
            - insight_engine_report: InsightEngine report text
            - forum_logs: forum log text
            - selected_template: chosen template text
        **kwargs: Accepted but not used by this method.

    Returns:
        The generated HTML; if anything fails, the fallback HTML instead.
    """
    logger.info("开始生成HTML报告...")

    try:
        field_names = (
            "query",
            "query_engine_report",
            "media_engine_report",
            "insight_engine_report",
            "forum_logs",
            "selected_template",
        )
        llm_input = {name: input_data.get(name, '') for name in field_names}

        # The inputs travel to the LLM as one JSON document.
        message = json.dumps(llm_input, ensure_ascii=False, indent=2)
        raw_html = self.llm_client.stream_invoke_to_string(SYSTEM_PROMPT_HTML_GENERATION, message)
        html = self.process_output(raw_html)

        logger.info("HTML报告生成完成")
        return html

    except Exception as e:
        logger.exception(f"HTML生成失败: {str(e)}")
        # Best effort: fall back to the static report.
        return self._generate_fallback_html(input_data)
|
||||
|
||||
def mutate_state(self, input_data: Dict[str, Any], state: ReportState, **kwargs) -> ReportState:
    """
    Generate the HTML and record it on the report state.

    Args:
        input_data: Input data forwarded to ``run``.
        state: Current report state; updated in place.
        **kwargs: Extra arguments forwarded to ``run``.

    Returns:
        The same ``state`` object, with ``html_content`` set and marked completed.
    """
    state.html_content = self.run(input_data, **kwargs)
    state.mark_completed()
    return state
|
||||
|
||||
def process_output(self, output: str) -> str:
    """
    Extract the HTML from the raw LLM output.

    Strips a surrounding Markdown code fence (``` or ```html) if present.

    Args:
        output: Raw LLM output.

    Returns:
        The cleaned HTML; the raw output when cleaning leaves nothing or fails.
    """
    try:
        logger.info(f"处理LLM原始输出,长度: {len(output)} 字符")

        fence = '```'
        html_fence = '```html'
        text = output.strip()

        # Remove Markdown code-fence markers, if any.
        if text.startswith(html_fence):
            text = text[len(html_fence):]
            if text.endswith(fence):
                text = text[:-len(fence)]
        elif text.startswith(fence) and text.endswith(fence):
            text = text[len(fence):-len(fence)]

        text = text.strip()

        if not text:
            # Nothing left after cleaning: hand back the raw output.
            logger.info("处理后内容为空,返回原始输出")
            text = output

        logger.info(f"HTML处理完成,最终长度: {len(text)} 字符")
        return text

    except Exception as e:
        logger.exception(f"处理HTML输出失败: {str(e)},返回原始输出")
        return output
|
||||
|
||||
def _generate_fallback_html(self, input_data: Dict[str, Any]) -> str:
    """
    Build the static fallback HTML report (used when LLM generation fails).

    Every value taken from ``input_data`` is HTML-escaped before it is
    interpolated. The query and the reports are free text that may contain
    ``<``, ``>``, ``&`` or quotes; inserting them verbatim would break the
    page or allow markup/script injection.

    Args:
        input_data: Report data. Reads ``query``, ``query_engine_report``,
            ``media_engine_report``, ``insight_engine_report`` and
            ``forum_logs``; empty report values produce no section.

    Returns:
        A complete HTML document as a string.
    """
    # Function-scope import: this module does not import html at file level.
    from html import escape

    logger.info("使用备用HTML生成方法")

    def report_section(heading: str, key: str) -> str:
        """Render one optional report section; empty string if it has no content."""
        content = input_data.get(key, '')
        if not content:
            return ''
        return (
            f'<h2>{heading}</h2>'
            f'<div class="section"><pre>{escape(str(content))}</pre></div>'
        )

    query = escape(str(input_data.get('query', '智能舆情分析报告')))
    query_section = report_section('QueryEngine分析结果', 'query_engine_report')
    media_section = report_section('MediaEngine分析结果', 'media_engine_report')
    insight_section = report_section('InsightEngine分析结果', 'insight_engine_report')
    forum_section = report_section('论坛监控数据', 'forum_logs')

    generation_time = datetime.now().strftime("%Y年%m月%d日 %H:%M:%S")

    # Literal CSS braces are doubled because this is an f-string.
    html_content = f"""<!DOCTYPE html>
<html lang="zh-CN">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>{query} - 智能舆情分析报告</title>
    <style>
        body {{
            font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', sans-serif;
            line-height: 1.6;
            color: #333;
            max-width: 1200px;
            margin: 0 auto;
            padding: 20px;
            background: #f5f5f5;
        }}
        .container {{
            background: white;
            padding: 40px;
            border-radius: 8px;
            box-shadow: 0 2px 10px rgba(0,0,0,0.1);
        }}
        h1 {{
            color: #2c3e50;
            border-bottom: 3px solid #3498db;
            padding-bottom: 10px;
        }}
        h2 {{
            color: #34495e;
            margin-top: 30px;
            margin-bottom: 15px;
        }}
        .section {{
            margin-bottom: 30px;
            padding: 20px;
            border-left: 4px solid #3498db;
            background: #f8f9fa;
        }}
        .meta {{
            background: #e9ecef;
            padding: 15px;
            border-radius: 5px;
            margin-bottom: 20px;
        }}
        .footer {{
            margin-top: 40px;
            padding-top: 20px;
            border-top: 1px solid #eee;
            text-align: center;
            color: #666;
        }}
        pre {{
            background: #f4f4f4;
            padding: 15px;
            border-radius: 5px;
            overflow-x: auto;
            white-space: pre-wrap;
        }}
    </style>
</head>
<body>
    <div class="container">
        <h1>{query}</h1>

        <div class="meta">
            <strong>报告生成时间:</strong> {generation_time}<br>
            <strong>数据来源:</strong> QueryEngine、MediaEngine、InsightEngine、ForumEngine<br>
            <strong>报告类型:</strong> 综合舆情分析报告
        </div>

        <h2>执行摘要</h2>
        <div class="section">
            本报告整合了多个分析引擎的研究结果,为您提供全面的舆情分析洞察。
            通过对查询主题"{query}"的深度分析,我们从多个维度展现了当前的舆情态势。
        </div>

        {query_section}

        {media_section}

        {insight_section}

        {forum_section}

        <h2>综合结论</h2>
        <div class="section">
            基于多个分析引擎的综合研究,我们对"{query}"主题进行了全面分析。
            各引擎从不同角度提供了深入洞察,为决策提供了重要参考。
        </div>

        <div class="footer">
            <p>本报告由智能舆情分析平台自动生成</p>
            <p>ReportEngine v1.0 | 生成时间: {generation_time}</p>
        </div>
    </div>
</body>
</html>"""

    return html_content
|
||||
|
||||
|
||||
@@ -0,0 +1,78 @@
|
||||
"""
|
||||
章节篇幅规划节点。
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
from typing import Any, Dict, List
|
||||
|
||||
from loguru import logger
|
||||
|
||||
from ..core import TemplateSection
|
||||
from ..prompts import (
|
||||
SYSTEM_PROMPT_WORD_BUDGET,
|
||||
build_word_budget_prompt,
|
||||
)
|
||||
from .base_node import BaseNode
|
||||
|
||||
|
||||
class WordBudgetNode(BaseNode):
|
||||
"""规划各章节字数与重点"""
|
||||
|
||||
def __init__(self, llm_client):
    """
    Set up the word budget node.

    Args:
        llm_client: LLM client passed on to the base node.
    """
    node_name = "WordBudgetNode"
    super().__init__(llm_client, node_name)
|
||||
|
||||
def run(
    self,
    sections: List[TemplateSection],
    design: Dict[str, Any],
    reports: Dict[str, str],
    forum_logs: str,
    query: str,
    template_overview: Dict[str, Any] | None = None,
) -> Dict[str, Any]:
    """
    Ask the LLM to plan word counts and emphasis for every chapter.

    Args:
        sections: Template slices of the report.
        design: Output of the layout node, included in the payload so the
            length plan can take the layout into account.
        reports: Source reports passed to the LLM as-is.
        forum_logs: Forum log text.
        query: The user query.
        template_overview: Optional overview; when falsy, one is derived from
            ``sections`` (first section title plus all section dicts).

    Returns:
        The plan parsed from the LLM's JSON answer.

    Raises:
        ValueError: If the answer is empty or is not valid JSON.
    """
    section_dicts = [item.to_dict() for item in sections]
    if template_overview:
        overview = template_overview
    else:
        first_title = sections[0].title if sections else ""
        overview = {
            "title": first_title,
            "chapters": [item.to_dict() for item in sections],
        }
    payload = {
        "query": query,
        "design": design,
        "sections": section_dicts,
        "templateOverview": overview,
        "reports": reports,
        "forumLogs": forum_logs,
    }

    user = build_word_budget_prompt(payload)
    response = self.llm_client.stream_invoke_to_string(
        SYSTEM_PROMPT_WORD_BUDGET,
        user,
        temperature=0.25,
        top_p=0.85,
    )
    plan = self._parse_response(response)
    logger.info("章节字数规划已生成")
    return plan
|
||||
|
||||
def _parse_response(self, raw: str) -> Dict[str, Any]:
|
||||
"""将LLM输出的JSON文本转为字典,失败时提示规划异常"""
|
||||
cleaned = raw.strip()
|
||||
if cleaned.startswith("```json"):
|
||||
cleaned = cleaned[7:]
|
||||
if cleaned.startswith("```"):
|
||||
cleaned = cleaned[3:]
|
||||
if cleaned.endswith("```"):
|
||||
cleaned = cleaned[:-3]
|
||||
cleaned = cleaned.strip()
|
||||
if not cleaned:
|
||||
raise ValueError("篇幅规划LLM返回空内容")
|
||||
try:
|
||||
return json.loads(cleaned)
|
||||
except json.JSONDecodeError as exc:
|
||||
raise ValueError(f"篇幅规划JSON解析失败: {exc}") from exc
|
||||
|
||||
|
||||
__all__ = ["WordBudgetNode"]
|
||||
Reference in New Issue
Block a user