Blocked HTML

This commit is contained in:
马一丁
2025-11-13 10:56:28 +08:00
parent 403dbbd296
commit 4846b1f758
20 changed files with 3660 additions and 367 deletions
+357 -97
View File
@@ -5,15 +5,28 @@ Report Agent主类
import json
import os
from loguru import logger
from pathlib import Path
from uuid import uuid4
from datetime import datetime
from typing import Optional, Dict, Any, List
from loguru import logger
from .core import (
ChapterStorage,
DocumentComposer,
TemplateSection,
parse_template_sections,
)
from .ir import IRValidator
from .llms import LLMClient
from .nodes import (
TemplateSelectionNode,
HTMLGenerationNode
ChapterGenerationNode,
DocumentLayoutNode,
WordBudgetNode,
)
from .renderers import HTMLRenderer
from .state import ReportState
from .utils.config import settings, Settings
@@ -128,6 +141,12 @@ class ReportAgent:
# 初始化LLM客户端
self.llm_client = self._initialize_llm()
# 初始化章级存储/校验/渲染组件
self.chapter_storage = ChapterStorage(self.config.CHAPTER_OUTPUT_DIR)
self.document_composer = DocumentComposer()
self.validator = IRValidator()
self.renderer = HTMLRenderer()
# 初始化节点
self._initialize_nodes()
@@ -139,6 +158,7 @@ class ReportAgent:
# 确保输出目录存在
os.makedirs(self.config.OUTPUT_DIR, exist_ok=True)
os.makedirs(self.config.DOCUMENT_IR_OUTPUT_DIR, exist_ok=True)
logger.info("Report Agent已初始化")
logger.info(f"使用LLM: {self.llm_client.get_model_info()}")
@@ -175,61 +195,144 @@ class ReportAgent:
self.llm_client,
self.config.TEMPLATE_DIR
)
self.html_generation_node = HTMLGenerationNode(self.llm_client)
self.document_layout_node = DocumentLayoutNode(self.llm_client)
self.word_budget_node = WordBudgetNode(self.llm_client)
self.chapter_generation_node = ChapterGenerationNode(
self.llm_client,
self.validator,
self.chapter_storage
)
def generate_report(self, query: str, reports: List[Any], forum_logs: str = "",
custom_template: str = "", save_report: bool = True) -> str:
def generate_report(self, query: str, reports: List[Any], forum_logs: str = "",
custom_template: str = "", save_report: bool = True) -> str:
"""
生成综合报告
生成综合报告(章节JSON → IR → HTML
Args:
query: 原始查询
reports: 三个子agent的报告列表(按顺序:QueryEngine, MediaEngine, InsightEngine
forum_logs: 论坛日志内容
custom_template: 用户自定义模板(可选)
save_report: 是否保存报告到文件
Returns:
dict: 包含HTML内容保存文件信息
dict: HTML内容以及保存文件路径信息
"""
start_time = datetime.now()
# 为新的查询重置状态,确保文件命名信息完整
self.state = ReportState(query=query)
self.state.metadata.query = query
report_id = f"report-{uuid4().hex[:8]}"
self.state.task_id = report_id
self.state.query = query
self.state.metadata.query = query
self.state.mark_processing()
logger.info(f"开始生成报告: {query}")
logger.info(f"输入数据 - 报告数量: {len(reports)}, 论坛日志长度: {len(forum_logs)}")
normalized_reports = self._normalize_reports(reports)
logger.info(f"开始生成报告 {report_id}: {query}")
logger.info(f"输入数据 - 报告数量: {len(reports)}, 论坛日志长度: {len(str(forum_logs))}")
try:
# Step 1: 模板选择
template_result = self._select_template(query, reports, forum_logs, custom_template)
# Step 2: 直接生成HTML报告
html_report = self._generate_html_report(query, reports, forum_logs, template_result)
# Step 3: 保存报告
self.state.metadata.template_used = template_result.get('template_name', '')
sections = self._slice_template(template_result.get('template_content', ''))
if not sections:
raise ValueError("模板无法解析出章节,请检查模板内容。")
template_text = template_result.get('template_content', '')
template_overview = self._build_template_overview(template_text, sections)
# 基于模板骨架+三引擎内容设计全局标题、目录与视觉主题
layout_design = self.document_layout_node.run(
sections,
template_text,
normalized_reports,
forum_logs,
query,
template_overview,
)
# 使用刚生成的设计稿对全书进行篇幅规划,约束各章字数与重点
word_plan = self.word_budget_node.run(
sections,
layout_design,
normalized_reports,
forum_logs,
query,
template_overview,
)
# 记录每个章节的目标字数/强调点,后续传给章节LLM
chapter_targets = {
entry.get("chapterId"): entry
for entry in word_plan.get("chapters", [])
if entry.get("chapterId")
}
generation_context = self._build_generation_context(
query,
normalized_reports,
forum_logs,
template_result,
layout_design,
chapter_targets,
word_plan,
template_overview,
)
# IR/渲染需要的全局元数据,带上设计稿给出的标题/主题/目录/篇幅信息
manifest_meta = {
"query": query,
"title": layout_design.get("title") or (f"{query} - 舆情洞察报告" if query else template_result.get("template_name")),
"subtitle": layout_design.get("subtitle"),
"tagline": layout_design.get("tagline"),
"templateName": template_result.get("template_name"),
"selectionReason": template_result.get("selection_reason"),
"themeTokens": generation_context.get("theme_tokens", {}),
"toc": {
"depth": 3,
"autoNumbering": True,
"title": layout_design.get("tocTitle") or "目录",
},
"hero": layout_design.get("hero"),
"layoutNotes": layout_design.get("layoutNotes"),
"wordPlan": {
"totalWords": word_plan.get("totalWords"),
"globalGuidelines": word_plan.get("globalGuidelines"),
},
"templateOverview": template_overview,
}
if layout_design.get("themeTokens"):
manifest_meta["themeTokens"] = layout_design["themeTokens"]
if layout_design.get("tocPlan"):
manifest_meta["toc"]["customEntries"] = layout_design["tocPlan"]
# 初始化章节输出目录并写入manifest,方便流式存盘
run_dir = self.chapter_storage.start_session(report_id, manifest_meta)
self._persist_planning_artifacts(run_dir, layout_design, word_plan, template_overview)
chapters = []
for section in sections:
logger.info(f"生成章节: {section.title}")
chapter = self.chapter_generation_node.run(
section,
generation_context,
run_dir
)
chapters.append(chapter)
document_ir = self.document_composer.build_document(
report_id,
manifest_meta,
chapters
)
html_report = self.renderer.render(document_ir)
self.state.html_content = html_report
self.state.mark_completed()
saved_files = {}
if save_report:
saved_files = self._save_report(html_report)
# 更新生成时间
end_time = datetime.now()
generation_time = (end_time - start_time).total_seconds()
saved_files = self._save_report(html_report, document_ir, report_id)
generation_time = (datetime.now() - start_time).total_seconds()
self.state.metadata.generation_time = generation_time
logger.info(f"报告生成完成,耗时: {generation_time:.2f}")
return {
'html_content': html_report,
'report_id': report_id,
**saved_files
}
except Exception as e:
self.state.mark_failed(str(e))
logger.exception(f"报告生成过程中发生错误: {str(e)}")
raise e
raise
def _select_template(self, query: str, reports: List[Any], forum_logs: str, custom_template: str):
"""选择报告模板"""
@@ -271,38 +374,153 @@ class ReportAgent:
self.state.metadata.template_used = fallback_template['template_name']
return fallback_template
def _generate_html_report(self, query: str, reports: List[Any], forum_logs: str, template_result: Dict[str, Any]) -> str:
"""生成HTML报告"""
logger.info("多轮生成HTML报告...")
# 准备报告内容,确保有3个报告
query_report = reports[0] if len(reports) > 0 else ""
media_report = reports[1] if len(reports) > 1 else ""
insight_report = reports[2] if len(reports) > 2 else ""
# 转换为字符串格式
query_report = str(query_report) if query_report else ""
media_report = str(media_report) if media_report else ""
insight_report = str(insight_report) if insight_report else ""
html_input = {
'query': query,
'query_engine_report': query_report,
'media_engine_report': media_report,
'insight_engine_report': insight_report,
'forum_logs': forum_logs,
'selected_template': template_result.get('template_content', '')
def _slice_template(self, template_markdown: str) -> List[TemplateSection]:
    """Split the Markdown template into sections, never returning an empty list.

    Args:
        template_markdown: Raw Markdown text of the selected template.

    Returns:
        The sections produced by ``parse_template_sections``; when that
        yields nothing, a single hard-coded "1.0" section so the chapter
        pipeline always has something to generate.
    """
    parsed = parse_template_sections(template_markdown)
    if not parsed:
        logger.warning("模板未解析出章节,使用默认章节骨架")
        default_title = "1.0 综合分析"
        parsed = [
            TemplateSection(
                title=default_title,
                slug="section-1-0",
                order=10,
                depth=1,
                raw_title=default_title,
                number="1.0",
                chapter_id="S1",
                outline=["1.1 摘要", "1.2 数据亮点", "1.3 风险提示"],
            )
        ]
    return parsed
def _build_generation_context(
self,
query: str,
reports: Dict[str, str],
forum_logs: str,
template_result: Dict[str, Any],
layout_design: Dict[str, Any],
chapter_directives: Dict[str, Any],
word_plan: Dict[str, Any],
template_overview: Dict[str, Any],
) -> Dict[str, Any]:
"""
构造章节生成所需的共享上下文
这里把“全书设计稿”“章节篇幅约束”“统一主题配色”等一次性整理好,
避免每次章节调用都重新拼装上下文。
"""
# 优先使用设计稿定制的主题色,否则退回默认主题
theme_tokens = (
layout_design.get("themeTokens")
if layout_design else None
) or self._default_theme_tokens()
return {
"query": query,
"template_name": template_result.get("template_name"),
"reports": reports,
"forum_logs": self._stringify(forum_logs),
"theme_tokens": theme_tokens,
"style_directives": {
"tone": "analytical",
"audience": "executive",
"language": "zh-CN",
},
"data_bundles": [],
"max_tokens": min(self.config.MAX_CONTENT_LENGTH, 6000),
"layout": layout_design or {},
"template_overview": template_overview or {},
"chapter_directives": chapter_directives or {},
"word_plan": word_plan or {},
}
# 使用HTML生成节点生成报告
html_content = self.html_generation_node.run(html_input)
# 更新状态
self.state.html_content = html_content
self.state.mark_completed()
logger.info("HTML报告生成完成")
return html_content
def _normalize_reports(self, reports: List[Any]) -> Dict[str, str]:
"""将不同来源的报告统一转为字符串"""
keys = ["query_engine", "media_engine", "insight_engine"]
normalized: Dict[str, str] = {}
for idx, key in enumerate(keys):
value = reports[idx] if idx < len(reports) else ""
normalized[key] = self._stringify(value)
return normalized
def _stringify(self, value: Any) -> str:
"""安全地将对象转成字符串"""
if value is None:
return ""
if isinstance(value, str):
return value
if isinstance(value, (dict, list)):
try:
return json.dumps(value, ensure_ascii=False, indent=2)
except Exception:
return str(value)
return str(value)
def _default_theme_tokens(self) -> Dict[str, Any]:
"""默认的主题变量,供渲染器/LLM共用"""
return {
"colors": {
"bg": "#f8f9fa",
"text": "#212529",
"primary": "#007bff",
"secondary": "#6c757d",
"card": "#ffffff",
"border": "#dee2e6",
"accent1": "#17a2b8",
"accent2": "#28a745",
"accent3": "#ffc107",
"accent4": "#dc3545",
},
"fonts": {
"body": "-apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, 'Helvetica Neue', Arial, 'Noto Sans', sans-serif",
"heading": "'Source Han Sans SC', 'PingFang SC', 'Microsoft YaHei', sans-serif",
},
"spacing": {"container": "1200px", "gutter": "24px"},
"vars": {
"header_sticky": True,
"toc_depth": 3,
"enable_dark_mode": True,
},
}
def _build_template_overview(
self,
template_markdown: str,
sections: List[TemplateSection],
) -> Dict[str, Any]:
"""提取模板标题与章节骨架,供设计/篇幅规划统一引用"""
fallback_title = sections[0].title if sections else ""
overview = {
"title": self._extract_template_title(template_markdown, fallback_title),
"chapters": [],
}
for section in sections:
overview["chapters"].append(
{
"chapterId": section.chapter_id,
"title": section.title,
"rawTitle": section.raw_title,
"number": section.number,
"slug": section.slug,
"order": section.order,
"depth": section.depth,
"outline": section.outline,
}
)
return overview
@staticmethod
def _extract_template_title(template_markdown: str, fallback: str = "") -> str:
"""尝试从Markdown中提取首个标题,找不到时使用fallback"""
for line in template_markdown.splitlines():
stripped = line.strip()
if not stripped:
continue
if stripped.startswith("#"):
return stripped.lstrip("#").strip()
if stripped:
fallback = fallback or stripped
return fallback or "智能舆情分析报告"
def _get_fallback_template_content(self) -> str:
"""获取备用模板内容"""
@@ -353,40 +571,82 @@ class ReportAgent:
*生成时间:{generation_time}*
"""
def _save_report(self, html_content: str):
"""保存报告到文件"""
# 生成文件名
def _save_report(self, html_content: str, document_ir: Dict[str, Any], report_id: str) -> Dict[str, Any]:
"""保存HTML与IR到文件并返回路径信息"""
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
query_safe = "".join(c for c in self.state.metadata.query if c.isalnum() or c in (' ', '-', '_')).rstrip()
query_safe = query_safe.replace(' ', '_')[:30]
filename = f"final_report_{query_safe}_{timestamp}.html"
filepath = os.path.join(self.config.OUTPUT_DIR, filename)
# 保存HTML报告
with open(filepath, 'w', encoding='utf-8') as f:
f.write(html_content)
abs_report_path = os.path.abspath(filepath)
rel_report_path = os.path.relpath(abs_report_path, os.getcwd())
logger.info(f"报告已保存到: {abs_report_path}")
# 保存状态
query_safe = "".join(
c for c in self.state.metadata.query if c.isalnum() or c in (" ", "-", "_")
).rstrip()
query_safe = query_safe.replace(" ", "_")[:30] or "report"
html_filename = f"final_report_{query_safe}_{timestamp}.html"
html_path = Path(self.config.OUTPUT_DIR) / html_filename
html_path.write_text(html_content, encoding="utf-8")
html_abs = str(html_path.resolve())
html_rel = os.path.relpath(html_abs, os.getcwd())
ir_path = self._save_document_ir(document_ir, query_safe, timestamp)
ir_abs = str(ir_path.resolve())
ir_rel = os.path.relpath(ir_abs, os.getcwd())
state_filename = f"report_state_{query_safe}_{timestamp}.json"
state_filepath = os.path.join(self.config.OUTPUT_DIR, state_filename)
self.state.save_to_file(state_filepath)
abs_state_path = os.path.abspath(state_filepath)
rel_state_path = os.path.relpath(abs_state_path, os.getcwd())
logger.info(f"状态已保存到: {abs_state_path}")
state_path = Path(self.config.OUTPUT_DIR) / state_filename
self.state.save_to_file(str(state_path))
state_abs = str(state_path.resolve())
state_rel = os.path.relpath(state_abs, os.getcwd())
logger.info(f"HTML报告已保存: {html_path}")
logger.info(f"Document IR已保存: {ir_path}")
logger.info(f"状态已保存到: {state_path}")
return {
'report_filename': filename,
'report_filepath': abs_report_path,
'report_relative_path': rel_report_path,
'report_filename': html_filename,
'report_filepath': html_abs,
'report_relative_path': html_rel,
'ir_filename': ir_path.name,
'ir_filepath': ir_abs,
'ir_relative_path': ir_rel,
'state_filename': state_filename,
'state_filepath': abs_state_path,
'state_relative_path': rel_state_path
'state_filepath': state_abs,
'state_relative_path': state_rel,
}
def _save_document_ir(self, document_ir: Dict[str, Any], query_safe: str, timestamp: str) -> Path:
"""将整本IR写入独立目录"""
filename = f"report_ir_{query_safe}_{timestamp}.json"
ir_path = Path(self.config.DOCUMENT_IR_OUTPUT_DIR) / filename
ir_path.write_text(
json.dumps(document_ir, ensure_ascii=False, indent=2),
encoding="utf-8",
)
return ir_path
def _persist_planning_artifacts(
self,
run_dir: Path,
layout_design: Dict[str, Any],
word_plan: Dict[str, Any],
template_overview: Dict[str, Any],
):
"""
将文档设计稿、篇幅规划与模板概览另存成JSON
方便在调试或复盘时快速定位:标题/目录/主题是如何确定的、
字数分配有什么要求,以便后续人工校正。
"""
artifacts = {
"document_layout": layout_design,
"word_plan": word_plan,
"template_overview": template_overview,
}
for name, payload in artifacts.items():
if not payload:
continue
path = run_dir / f"{name}.json"
try:
path.write_text(json.dumps(payload, ensure_ascii=False, indent=2), encoding="utf-8")
except Exception as exc:
logger.warning(f"写入{name}失败: {exc}")
def get_progress_summary(self) -> Dict[str, Any]:
"""获取进度摘要"""
@@ -515,4 +775,4 @@ def create_agent(config_file: Optional[str] = None) -> ReportAgent:
"""
config = Settings() # 以空配置初始化,而从从环境变量初始化
return ReportAgent(config)
return ReportAgent(config)
+16
View File
@@ -0,0 +1,16 @@
"""
Report Engine核心工具集合。
包含模板切片、章节存储等基础能力,供agent流水线复用。
"""
from .template_parser import TemplateSection, parse_template_sections
from .chapter_storage import ChapterStorage
from .stitcher import DocumentComposer
__all__ = [
"TemplateSection",
"parse_template_sections",
"ChapterStorage",
"DocumentComposer",
]
+209
View File
@@ -0,0 +1,209 @@
"""
章节JSON的落盘与清单管理。
每一章在流式生成时会立即写入raw文件,完成校验后再写入
格式化的chapter.json,并在manifest中记录元数据,便于后续装订。
"""
from __future__ import annotations
import json
from contextlib import contextmanager
from dataclasses import dataclass, field
from datetime import datetime
from pathlib import Path
from typing import Dict, Generator, List, Optional
@dataclass
class ChapterRecord:
    """Metadata for one chapter as stored in the run manifest."""

    chapter_id: str
    slug: str
    title: str
    order: int
    status: str
    # Relative file paths keyed by kind (the storage class uses "raw" and "json").
    files: Dict[str, str] = field(default_factory=dict)
    errors: List[str] = field(default_factory=list)
    # UTC creation time of this record in ISO format with a trailing "Z".
    updated_at: str = field(default_factory=lambda: datetime.utcnow().isoformat() + "Z")

    def to_dict(self) -> Dict[str, object]:
        """Return the camelCase dict form written into the manifest."""
        exported: Dict[str, object] = dict(
            chapterId=self.chapter_id,
            slug=self.slug,
            title=self.title,
            order=self.order,
            status=self.status,
            files=self.files,
            errors=self.errors,
        )
        exported["updatedAt"] = self.updated_at
        return exported
class ChapterStorage:
    """
    Writer for per-chapter JSON files and the run-level manifest.

    Usage:
        run_dir = storage.start_session(report_id, {...})
        chapter_dir = storage.begin_chapter(run_dir, meta)
        with storage.capture_stream(chapter_dir) as fp:
            fp.write(chunk)
        storage.persist_chapter(run_dir, meta, payload, errors)
    """

    def __init__(self, base_dir: str):
        """Create the storage root if needed and start with no cached manifests."""
        self.base_dir = Path(base_dir)
        self.base_dir.mkdir(parents=True, exist_ok=True)
        # In-memory manifests keyed by the resolved run directory path.
        self._manifests: Dict[str, Dict[str, object]] = {}

    # ======== Session & manifest ========

    def start_session(self, report_id: str, metadata: Dict[str, object]) -> Path:
        """Create the output directory and initial manifest for one report run.

        Returns:
            The run directory ``<base_dir>/<report_id>``.
        """
        run_dir = self.base_dir / report_id
        run_dir.mkdir(parents=True, exist_ok=True)
        manifest = {
            "reportId": report_id,
            "createdAt": datetime.utcnow().isoformat() + "Z",
            "metadata": metadata,
            "chapters": [],
        }
        self._manifests[self._key(run_dir)] = manifest
        self._write_manifest(run_dir, manifest)
        return run_dir

    def begin_chapter(self, run_dir: Path, chapter_meta: Dict[str, object]) -> Path:
        """Create the chapter folder and mark the chapter as ``streaming``.

        Returns:
            The chapter directory inside ``run_dir``.
        """
        slug_value, order, chapter_dir = self._resolve_chapter(run_dir, chapter_meta)
        record = ChapterRecord(
            chapter_id=str(chapter_meta.get("chapterId")),
            slug=slug_value,
            title=str(chapter_meta.get("title")),
            order=order,
            status="streaming",
            files={"raw": str(self._raw_stream_path(chapter_dir).relative_to(run_dir))},
        )
        self._upsert_record(run_dir, record)
        return chapter_dir

    def persist_chapter(
        self,
        run_dir: Path,
        chapter_meta: Dict[str, object],
        payload: Dict[str, object],
        errors: Optional[List[str]] = None,
    ) -> Path:
        """Write the final ``chapter.json`` and update the manifest entry.

        The chapter status becomes ``ready`` when ``errors`` is empty or
        ``None``, otherwise ``invalid``.

        Returns:
            Path of the written ``chapter.json``.
        """
        slug_value, order, chapter_dir = self._resolve_chapter(run_dir, chapter_meta)
        final_path = chapter_dir / "chapter.json"
        final_path.write_text(
            json.dumps(payload, ensure_ascii=False, indent=2),
            encoding="utf-8",
        )
        record = ChapterRecord(
            chapter_id=str(chapter_meta.get("chapterId")),
            slug=slug_value,
            title=str(chapter_meta.get("title")),
            order=order,
            status="ready" if not errors else "invalid",
            files={
                "raw": str(self._raw_stream_path(chapter_dir).relative_to(run_dir)),
                "json": str(final_path.relative_to(run_dir)),
            },
            errors=errors or [],
        )
        self._upsert_record(run_dir, record)
        return final_path

    def load_chapters(self, run_dir: Path) -> List[Dict[str, object]]:
        """Load every readable ``chapter.json`` under ``run_dir``, sorted by ``order``.

        Sub-directories without a ``chapter.json`` and files that are not
        valid JSON are skipped.
        """
        payloads: List[Dict[str, object]] = []
        for child in sorted(run_dir.iterdir()):
            if not child.is_dir():
                continue
            chapter_path = child / "chapter.json"
            if not chapter_path.exists():
                continue
            try:
                payloads.append(json.loads(chapter_path.read_text(encoding="utf-8")))
            except json.JSONDecodeError:
                # A half-written or corrupt chapter must not block the others.
                continue
        payloads.sort(key=lambda x: x.get("order", 0))
        return payloads

    # ======== File operations ========

    @contextmanager
    def capture_stream(self, chapter_dir: Path) -> Generator:
        """Open the chapter's raw stream file for writing and yield the handle."""
        raw_path = self._raw_stream_path(chapter_dir)
        raw_path.parent.mkdir(parents=True, exist_ok=True)
        with raw_path.open("w", encoding="utf-8") as fp:
            yield fp

    # ======== Internal helpers ========

    def _resolve_chapter(self, run_dir: Path, chapter_meta: Dict[str, object]):
        """Return ``(slug, order, chapter_dir)`` for the chapter metadata."""
        slug_value = str(
            chapter_meta.get("slug") or chapter_meta.get("chapterId") or "section"
        )
        order = int(chapter_meta.get("order", 0))
        return slug_value, order, self._chapter_dir(run_dir, slug_value, order)

    def _chapter_dir(self, run_dir: Path, slug: str, order: int) -> Path:
        """Create (if needed) and return the ``NNN-slug`` chapter folder."""
        safe_slug = self._safe_slug(slug)
        folder = f"{order:03d}-{safe_slug}"
        path = run_dir / folder
        path.mkdir(parents=True, exist_ok=True)
        return path

    def _safe_slug(self, slug: str) -> str:
        """Make the slug usable as a single path component."""
        # Backslash is a path separator on Windows, so it is neutralised
        # alongside "/" to keep the chapter in one folder level.
        slug = slug.replace(" ", "-").replace("/", "-").replace("\\", "-")
        return slug or "section"

    def _raw_stream_path(self, chapter_dir: Path) -> Path:
        """Return the path of the raw stream capture file."""
        return chapter_dir / "stream.raw"

    def _key(self, run_dir: Path) -> str:
        """Return the cache key for a run directory (its resolved path)."""
        return str(run_dir.resolve())

    def _manifest_path(self, run_dir: Path) -> Path:
        """Return the path of the run's ``manifest.json``."""
        return run_dir / "manifest.json"

    def _write_manifest(self, run_dir: Path, manifest: Dict[str, object]):
        """Serialise the manifest to ``manifest.json`` as indented UTF-8 JSON."""
        self._manifest_path(run_dir).write_text(
            json.dumps(manifest, ensure_ascii=False, indent=2),
            encoding="utf-8",
        )

    def _read_manifest(self, run_dir: Path) -> Dict[str, object]:
        """Read the manifest from disk, or return an empty skeleton if absent."""
        manifest_path = self._manifest_path(run_dir)
        if manifest_path.exists():
            return json.loads(manifest_path.read_text(encoding="utf-8"))
        return {"reportId": run_dir.name, "chapters": []}

    def _upsert_record(self, run_dir: Path, record: "ChapterRecord"):
        """Insert or replace a chapter record, keeping chapters sorted by order."""
        key = self._key(run_dir)
        manifest = self._manifests.get(key) or self._read_manifest(run_dir)
        chapters: List[Dict[str, object]] = manifest.get("chapters", [])
        chapters = [c for c in chapters if c.get("chapterId") != record.chapter_id]
        chapters.append(record.to_dict())
        chapters.sort(key=lambda x: x.get("order", 0))
        manifest["chapters"] = chapters
        # Refresh on every change; setdefault() would freeze the first value.
        manifest["updatedAt"] = datetime.utcnow().isoformat() + "Z"
        self._manifests[key] = manifest
        self._write_manifest(run_dir, manifest)
__all__ = ["ChapterStorage", "ChapterRecord"]
+60
View File
@@ -0,0 +1,60 @@
"""
章节装订器:负责把多个章节JSON合并为整本IR。
"""
from __future__ import annotations
from datetime import datetime
from typing import Dict, List, Set
from ..ir import IR_VERSION
class DocumentComposer:
    """
    Simple binder that stitches chapter payloads into one Document IR.
    """

    def __init__(self):
        """Start with no anchors recorded."""
        self._seen_anchors: Set[str] = set()

    def build_document(
        self,
        report_id: str,
        metadata: Dict[str, object],
        chapters: List[Dict[str, object]],
    ) -> Dict[str, object]:
        """Sort chapters by ``order``, give each a unique anchor, and wrap them as IR.

        Chapter dicts are updated in place: missing ``chapterId`` and
        ``order`` get positional defaults, and ``anchor`` is always set to a
        value unique within this document.

        Args:
            report_id: Identifier stored as ``reportId``.
            metadata: Document metadata; ``generatedAt``, ``themeTokens``
                and ``assets`` are read from it when present.
            chapters: Chapter payloads to bind.

        Returns:
            The assembled document IR dict.
        """
        # Anchor uniqueness is per document. Without this reset a composer
        # reused for a second report would rename its anchors to
        # "section-1-2", "section-2-2", and so on.
        self._seen_anchors = set()

        ordered = sorted(chapters, key=lambda c: c.get("order", 0))
        for idx, chapter in enumerate(ordered, start=1):
            chapter.setdefault("chapterId", f"S{idx}")
            anchor = chapter.get("anchor") or f"section-{idx}"
            chapter["anchor"] = self._ensure_unique_anchor(anchor)
            chapter.setdefault("order", idx * 10)

        document = {
            "version": IR_VERSION,
            "reportId": report_id,
            "metadata": {
                **metadata,
                # Keep a caller-supplied timestamp; otherwise stamp with UTC now.
                "generatedAt": metadata.get("generatedAt")
                or datetime.utcnow().isoformat() + "Z",
            },
            "themeTokens": metadata.get("themeTokens", {}),
            "chapters": ordered,
            "assets": metadata.get("assets", {}),
        }
        return document

    def _ensure_unique_anchor(self, anchor: str) -> str:
        """Return ``anchor``, suffixed with ``-2``, ``-3``, ... if already used."""
        base = anchor
        counter = 2
        while anchor in self._seen_anchors:
            anchor = f"{base}-{counter}"
            counter += 1
        self._seen_anchors.add(anchor)
        return anchor
__all__ = ["DocumentComposer"]
+208
View File
@@ -0,0 +1,208 @@
"""
Markdown模板切片工具。
LLM需要“按章调用”,因此必须把Markdown模板解析为结构化章节队列。
这里通过轻量正则和缩进启发式,兼容“# 标题”与
“- **1.0 标题** / - 1.1 子标题”等多种写法。
"""
from __future__ import annotations
import re
import unicodedata
from dataclasses import dataclass, field
from typing import List, Optional
SECTION_ORDER_STEP = 10
@dataclass
class TemplateSection:
    """One section parsed out of a Markdown report template."""

    title: str
    slug: str
    order: int
    depth: int
    raw_title: str
    number: str = ""
    chapter_id: str = ""
    # Titles of the sub-items collected under this section.
    outline: List[str] = field(default_factory=list)

    def to_dict(self) -> dict:
        """Return a camelCase dict of the section; ``raw_title`` is not included."""
        exported = dict(
            title=self.title,
            slug=self.slug,
            order=self.order,
            depth=self.depth,
            number=self.number,
        )
        exported["chapterId"] = self.chapter_id
        exported["outline"] = self.outline
        return exported
# Markdown "#" heading: one to six "#" characters (group 1), whitespace, then
# the heading text (group 2).
heading_pattern = re.compile(r"^(#{1,6})\s+(.*)$")
# List item introduced by "-", "*" or "+" followed by whitespace; group 1 is
# the item text.
bullet_pattern = re.compile(r"^[-*+]\s+(.*)$")
# Dotted section number such as "1", "1.2" or "1.2.3" (group "num"), optionally
# followed by a run of separators (whitespace, "、", ":", "." or "-") and a
# free-text label (group "label").
number_pattern = re.compile(r"^(?P<num>\d+(?:\.\d+)*)(?:[\s、:.-]+(?P<label>.*))?$")
def parse_template_sections(template_md: str) -> List[TemplateSection]:
    """
    Split a Markdown template into a list of top-level sections.

    Lines classified as sections start a new ``TemplateSection``; every
    other recognised line is appended to the outline of the section
    currently open. Lines seen before the first section are dropped.

    Args:
        template_md: Raw Markdown of the template. ``None`` or empty text
            yields an empty list.

    Returns:
        Sections in document order. Each carries a unique slug, an
        ``order`` increasing in steps of ``SECTION_ORDER_STEP`` and a
        ``chapter_id`` of the form ``S1``, ``S2``, ...
    """
    sections: List[TemplateSection] = []
    current: Optional[TemplateSection] = None
    order = SECTION_ORDER_STEP
    used_slugs = set()

    for raw_line in (template_md or "").splitlines():
        stripped = raw_line.strip()
        if not stripped:
            continue

        # Measure indentation with tabs expanded. Counting only spaces would
        # give tab-indented sub-bullets an indent of 0 and promote each of
        # them to a top-level section.
        expanded = raw_line.expandtabs(4)
        indent = len(expanded) - len(expanded.lstrip(" "))

        meta = _classify_line(stripped, indent)
        if not meta:
            continue

        if meta["is_section"]:
            slug = _ensure_unique_slug(meta["slug"], used_slugs)
            section = TemplateSection(
                title=meta["title"],
                slug=slug,
                order=order,
                depth=meta["depth"],
                raw_title=meta["raw"],
                number=meta["number"],
            )
            sections.append(section)
            current = section
            order += SECTION_ORDER_STEP
            continue

        # Outline entry belonging to the section currently open.
        if current:
            current.outline.append(meta["title"])

    for idx, section in enumerate(sections, start=1):
        # Stable positional id so later stages can reference the chapter.
        section.chapter_id = f"S{idx}"

    return sections
def _classify_line(stripped: str, indent: int) -> Optional[dict]:
    """Classify one non-empty template line as a section or an outline entry.

    Three shapes are recognised, tried in this order: "#" headings, bullet
    items, and bare numbered lines such as "1.1 Title".

    Args:
        stripped: The line with surrounding whitespace removed.
        indent: Number of leading spaces of the original line.

    Returns:
        A dict with ``is_section``, ``depth``, ``title``, ``raw``,
        ``number`` and ``slug``, or ``None`` when the line matches none of
        the shapes.
    """

    def _describe(is_section, depth, title, raw, number, slug):
        return {
            "is_section": is_section,
            "depth": depth,
            "title": title,
            "raw": raw,
            "number": number,
            "slug": slug,
        }

    as_heading = heading_pattern.match(stripped)
    if as_heading is not None:
        hashes, text = as_heading.groups()
        payload = _strip_markup(text.strip())
        info = _split_number(payload)
        slug = _build_slug(info["number"], info["title"])
        level = len(hashes)
        # Only level-1 and level-2 headings open a new section.
        return _describe(level <= 2, level, info["display"], payload, info["number"], slug)

    as_bullet = bullet_pattern.match(stripped)
    if as_bullet is not None:
        payload = _strip_markup(as_bullet.group(1).strip())
        info = _split_number(payload)
        slug = _build_slug(info["number"], info["title"])
        # Bullets indented by at most one space count as sections.
        top_level = indent <= 1
        return _describe(
            top_level,
            1 if top_level else 2,
            info["display"],
            payload,
            info["number"],
            slug,
        )

    # Lines like "1.1 ..." that carry no heading or bullet prefix.
    as_numbered = number_pattern.match(stripped)
    if as_numbered is None or not as_numbered.group("label"):
        return None

    title = as_numbered.group("label").strip()
    number = as_numbered.group("num")
    slug = _build_slug(number, title)
    top_level = indent == 0 and number.count(".") <= 1
    display = f"{number} {title}" if title else number
    return _describe(top_level, 1 if top_level else 2, display, stripped, number, slug)
def _strip_markup(text: str) -> str:
"""去除包裹的**、__等简单强调标记"""
if text.startswith(("**", "__")) and text.endswith(("**", "__")) and len(text) > 4:
return text[2:-2].strip()
return text
def _split_number(payload: str) -> dict:
    """Separate a leading section number from the rest of a title.

    Args:
        payload: Title text, possibly starting with a number like "1.2".

    Returns:
        Dict with ``number`` (empty when none was found), ``title`` (the
        label, or the whole payload when there is no label) and ``display``
        (number and label joined by a space, or the label/payload alone).
    """
    found = number_pattern.match(payload)
    if found is None:
        number, label = "", payload
    else:
        number, label = found.group("num"), found.group("label")
    label = (label or "").strip()

    if number:
        display = f"{number} {label}".strip()
    else:
        display = label or payload

    return {
        "number": number,
        "title": label or payload,
        "display": display,
    }
def _build_slug(number: str, title: str) -> str:
"""根据编号/标题生成锚点"""
if number:
token = number.replace(".", "-")
else:
token = _slugify_text(title)
token = token or "section"
return f"section-{token}"
def _slugify_text(text: str) -> str:
text = unicodedata.normalize("NFKD", text)
text = text.replace("·", "-").replace(" ", "-")
text = re.sub(r"[^0-9a-zA-Z\u4e00-\u9fff-]+", "-", text)
text = re.sub(r"-{2,}", "-", text)
return text.strip("-").lower()
def _ensure_unique_slug(slug: str, used: set) -> str:
if slug not in used:
used.add(slug)
return slug
base = slug
idx = 2
while slug in used:
slug = f"{base}-{idx}"
idx += 1
used.add(slug)
return slug
__all__ = ["TemplateSection", "parse_template_sections"]
+17 -9
View File
@@ -78,7 +78,9 @@ class ReportTask:
'has_result': bool(self.html_content),
'report_file_ready': bool(self.report_file_path),
'report_file_name': self.report_file_name,
'report_file_path': self.report_file_relative_path
'report_file_path': self.report_file_relative_path or self.report_file_path,
'state_file_ready': bool(self.state_file_path),
'state_file_path': self.state_file_relative_path or self.state_file_path
}
@@ -135,17 +137,21 @@ def run_report_generation(task: ReportTask, query: str, custom_template: str = "
save_report=True
)
html_report = generation_result.get('html_content', '')
if isinstance(generation_result, dict):
html_report = generation_result.get('html_content', '')
else:
html_report = generation_result
task.update_status("running", 90)
# 保存结果
task.html_content = html_report
task.report_file_path = generation_result.get('report_filepath', '')
task.report_file_relative_path = generation_result.get('report_relative_path', '')
task.report_file_name = generation_result.get('report_filename', '')
task.state_file_path = generation_result.get('state_filepath', '')
task.state_file_relative_path = generation_result.get('state_relative_path', '')
if isinstance(generation_result, dict):
task.report_file_path = generation_result.get('report_filepath', '')
task.report_file_relative_path = generation_result.get('report_relative_path', '')
task.report_file_name = generation_result.get('report_filename', '')
task.state_file_path = generation_result.get('state_filepath', '')
task.state_file_relative_path = generation_result.get('state_relative_path', '')
task.update_status("completed", 100)
except Exception as e:
@@ -269,7 +275,9 @@ def get_progress(task_id: str):
'has_result': True,
'report_file_ready': False,
'report_file_name': '',
'report_file_path': ''
'report_file_path': '',
'state_file_ready': False,
'state_file_path': ''
}
})
@@ -534,4 +542,4 @@ def clear_log():
return jsonify({
'success': False,
'error': f'清空日志失败: {str(e)}'
}), 500
}), 500
+24
View File
@@ -0,0 +1,24 @@
"""
Report Engine的可执行JSON契约(IR)定义与校验工具。
该模块暴露统一的Schema文本与校验器,供提示词、章节生成、
以及最终装订流程共同复用,确保从LLM到渲染的产物结构一致。
"""
from .schema import (
IR_VERSION,
CHAPTER_JSON_SCHEMA,
CHAPTER_JSON_SCHEMA_TEXT,
ALLOWED_BLOCK_TYPES,
ALLOWED_INLINE_MARKS,
)
from .validator import IRValidator
__all__ = [
"IR_VERSION",
"CHAPTER_JSON_SCHEMA",
"CHAPTER_JSON_SCHEMA_TEXT",
"ALLOWED_BLOCK_TYPES",
"ALLOWED_INLINE_MARKS",
"IRValidator",
]
+369
View File
@@ -0,0 +1,369 @@
"""
Report Engine JSON契约(IR)Schema定义。
这里集中维护所有章节级别的Schema与可用于提示词的文本表示,
确保章节生成、校验与渲染对同一个结构有统一认知。
"""
from __future__ import annotations
import json
from typing import Any, Dict, List
IR_VERSION = "1.0"
# ====== 基础常量 ======
ALLOWED_INLINE_MARKS: List[str] = [
"bold",
"italic",
"underline",
"strike",
"code",
"link",
"color",
"font",
"highlight",
"subscript",
"superscript",
"math",
]
ALLOWED_BLOCK_TYPES: List[str] = [
"heading",
"paragraph",
"list",
"table",
"blockquote",
"hr",
"code",
"math",
"figure",
"callout",
"kpiGrid",
"widget",
"toc",
]
# ====== Schema定义 ======
inline_mark_schema: Dict[str, Any] = {
"type": "object",
"required": ["type"],
"properties": {
"type": {"type": "string", "enum": ALLOWED_INLINE_MARKS},
"value": {"type": ["string", "number", "object"]},
"href": {"type": "string", "format": "uri-reference"},
"title": {"type": "string"},
"style": {"type": "object"},
},
"additionalProperties": True,
}
inline_run_schema: Dict[str, Any] = {
"type": "object",
"required": ["text"],
"properties": {
"text": {"type": "string"},
"marks": {
"type": "array",
"items": {"$ref": "#/definitions/inlineMark"},
},
},
"additionalProperties": True,
}
heading_block: Dict[str, Any] = {
"title": "HeadingBlock",
"type": "object",
"properties": {
"type": {"const": "heading"},
"level": {"type": "integer", "minimum": 1, "maximum": 6},
"text": {"type": "string"},
"anchor": {"type": "string"},
"numbering": {"type": "string"},
"subtitle": {"type": "string"},
},
"required": ["type", "level", "text", "anchor"],
"additionalProperties": True,
}
paragraph_block: Dict[str, Any] = {
"title": "ParagraphBlock",
"type": "object",
"properties": {
"type": {"const": "paragraph"},
"inlines": {
"type": "array",
"items": {"$ref": "#/definitions/inlineRun"},
},
"align": {"type": "string", "enum": ["left", "center", "right", "justify"]},
},
"required": ["type", "inlines"],
"additionalProperties": True,
}
list_block: Dict[str, Any] = {
"title": "ListBlock",
"type": "object",
"properties": {
"type": {"const": "list"},
"listType": {"type": "string", "enum": ["ordered", "bullet", "task"]},
"items": {
"type": "array",
"items": {
"type": "array",
"items": {"$ref": "#/definitions/block"},
},
},
},
"required": ["type", "listType", "items"],
"additionalProperties": True,
}
table_block: Dict[str, Any] = {
"title": "TableBlock",
"type": "object",
"properties": {
"type": {"const": "table"},
"colgroup": {"type": "array", "items": {"type": "object"}},
"rows": {
"type": "array",
"items": {
"type": "object",
"properties": {
"cells": {
"type": "array",
"items": {
"type": "object",
"properties": {
"rowspan": {"type": "integer", "minimum": 1},
"colspan": {"type": "integer", "minimum": 1},
"align": {
"type": "string",
"enum": ["left", "center", "right"],
},
"blocks": {
"type": "array",
"items": {"$ref": "#/definitions/block"},
},
},
"required": ["blocks"],
"additionalProperties": True,
},
}
},
"required": ["cells"],
"additionalProperties": True,
},
},
"caption": {"type": "string"},
"zebra": {"type": "boolean"},
},
"required": ["type", "rows"],
"additionalProperties": True,
}
blockquote_block: Dict[str, Any] = {
"title": "BlockquoteBlock",
"type": "object",
"properties": {
"type": {"const": "blockquote"},
"blocks": {
"type": "array",
"items": {"$ref": "#/definitions/block"},
},
"variant": {"type": "string"},
},
"required": ["type", "blocks"],
"additionalProperties": True,
}
hr_block: Dict[str, Any] = {
"title": "HorizontalRuleBlock",
"type": "object",
"properties": {
"type": {"const": "hr"},
"variant": {"type": "string"},
},
"required": ["type"],
"additionalProperties": True,
}
code_block: Dict[str, Any] = {
"title": "CodeBlock",
"type": "object",
"properties": {
"type": {"const": "code"},
"lang": {"type": "string"},
"content": {"type": "string"},
"caption": {"type": "string"},
},
"required": ["type", "content"],
"additionalProperties": True,
}
math_block: Dict[str, Any] = {
"title": "MathBlock",
"type": "object",
"properties": {
"type": {"const": "math"},
"latex": {"type": "string"},
"displayMode": {"type": "boolean"},
},
"required": ["type", "latex"],
"additionalProperties": True,
}
figure_block: Dict[str, Any] = {
"title": "FigureBlock",
"type": "object",
"properties": {
"type": {"const": "figure"},
"img": {
"type": "object",
"properties": {
"src": {"type": "string"},
"alt": {"type": "string"},
"width": {"type": "number"},
"height": {"type": "number"},
"srcset": {"type": "string"},
},
"required": ["src"],
"additionalProperties": True,
},
"caption": {"type": "string"},
"responsive": {"type": "boolean"},
},
"required": ["type", "img"],
"additionalProperties": True,
}
callout_block: Dict[str, Any] = {
"title": "CalloutBlock",
"type": "object",
"properties": {
"type": {"const": "callout"},
"tone": {
"type": "string",
"enum": ["info", "warning", "success", "danger"],
},
"title": {"type": "string"},
"blocks": {
"type": "array",
"items": {"$ref": "#/definitions/block"},
},
},
"required": ["type", "tone", "blocks"],
"additionalProperties": True,
}
kpi_block: Dict[str, Any] = {
"title": "KPIGridBlock",
"type": "object",
"properties": {
"type": {"const": "kpiGrid"},
"items": {
"type": "array",
"items": {
"type": "object",
"properties": {
"label": {"type": "string"},
"value": {"type": "string"},
"unit": {"type": "string"},
"delta": {"type": "string"},
"deltaTone": {"type": "string", "enum": ["up", "down", "neutral"]},
},
"required": ["label", "value"],
"additionalProperties": True,
},
},
"cols": {"type": "integer"},
},
"required": ["type", "items"],
"additionalProperties": True,
}
widget_block: Dict[str, Any] = {
"title": "WidgetBlock",
"type": "object",
"properties": {
"type": {"const": "widget"},
"widgetId": {"type": "string"},
"widgetType": {"type": "string"},
"props": {"type": "object"},
"data": {"type": "object"},
"dataRef": {"type": "string"},
},
"required": ["type", "widgetId", "widgetType"],
"additionalProperties": True,
}
toc_block: Dict[str, Any] = {
"title": "TOCBlock",
"type": "object",
"properties": {
"type": {"const": "toc"},
"depth": {"type": "integer", "minimum": 1, "maximum": 4},
"autoNumbering": {"type": "boolean"},
},
"required": ["type"],
"additionalProperties": True,
}
block_variants: List[Dict[str, Any]] = [
heading_block,
paragraph_block,
list_block,
table_block,
blockquote_block,
hr_block,
code_block,
math_block,
figure_block,
callout_block,
kpi_block,
widget_block,
toc_block,
]
CHAPTER_JSON_SCHEMA: Dict[str, Any] = {
"$schema": "https://json-schema.org/draft/2020-12/schema",
"title": "ReportEngineChapterIR",
"type": "object",
"required": ["chapterId", "title", "anchor", "order", "blocks"],
"properties": {
"chapterId": {"type": "string"},
"anchor": {"type": "string"},
"title": {"type": "string"},
"order": {"type": "number"},
"summary": {"type": "string"},
"blocks": {
"type": "array",
"items": {"$ref": "#/definitions/block"},
},
"xrefs": {"type": "object"},
"widgets": {"type": "array", "items": {"type": "string"}},
"footnotes": {"type": "array", "items": {"type": "object"}},
"errors": {"type": "array", "items": {"type": "string"}},
"metadata": {"type": "object"},
},
"additionalProperties": True,
"definitions": {
"inlineMark": inline_mark_schema,
"inlineRun": inline_run_schema,
"block": {"oneOf": block_variants},
},
}
CHAPTER_JSON_SCHEMA_TEXT: str = json.dumps(
CHAPTER_JSON_SCHEMA,
ensure_ascii=False,
indent=2,
)
__all__ = [
"IR_VERSION",
"ALLOWED_INLINE_MARKS",
"ALLOWED_BLOCK_TYPES",
"CHAPTER_JSON_SCHEMA",
"CHAPTER_JSON_SCHEMA_TEXT",
]
+218
View File
@@ -0,0 +1,218 @@
"""
章节级JSON结构校验器。
LLM按章节生成IR后,需要在落盘与装订前经过严格校验,以避免
渲染期的结构性崩溃。本模块实现轻量级的Python校验逻辑,
无需依赖jsonschema库即可快速定位错误。
"""
from __future__ import annotations
from typing import Any, Dict, List, Tuple
from .schema import ALLOWED_BLOCK_TYPES, ALLOWED_INLINE_MARKS, IR_VERSION
class IRValidator:
"""
章节IR结构校验器。
说明:
- validate_chapter返回(是否通过, 错误列表)
- 错误定位采用path语法,便于快速追踪
"""
def __init__(self, schema_version: str = IR_VERSION) -> None:
    """Create a validator tagged with an IR schema version.

    Args:
        schema_version: Version label stored on the instance; defaults to
            the imported ``IR_VERSION``.
    """
    self.schema_version = schema_version
# ======== 对外接口 ========
def validate_chapter(self, chapter: Dict[str, Any]) -> Tuple[bool, List[str]]:
    """Check a chapter's required fields and every top-level block.

    Args:
        chapter: Parsed chapter object.

    Returns:
        ``(ok, errors)`` where ``ok`` is True only when ``errors`` is empty.
        Validation stops early when ``blocks`` is not a non-empty list.
    """
    if not isinstance(chapter, dict):
        return False, ["chapter必须是对象"]

    required = ("chapterId", "title", "anchor", "order", "blocks")
    problems: List[str] = [
        f"missing chapter.{name}" for name in required if name not in chapter
    ]

    body = chapter.get("blocks")
    if not (isinstance(body, list) and body):
        problems.append("chapter.blocks必须是非空数组")
        return False, problems

    for position, entry in enumerate(body):
        self._validate_block(entry, f"blocks[{position}]", problems)
    return not problems, problems
# ======== 内部工具 ========
def _validate_block(self, block: Any, path: str, errors: List[str]):
"""根据block类型调用不同的校验器"""
if not isinstance(block, dict):
errors.append(f"{path} 必须是对象")
return
block_type = block.get("type")
if block_type not in ALLOWED_BLOCK_TYPES:
errors.append(f"{path}.type 不被支持: {block_type}")
return
validator = getattr(self, f"_validate_{block_type}_block", None)
if validator:
validator(block, path, errors)
def _validate_heading_block(self, block: Dict[str, Any], path: str, errors: List[str]):
"""heading必须有level/text/anchor"""
if "level" not in block or not isinstance(block["level"], int):
errors.append(f"{path}.level 必须是整数")
if "text" not in block:
errors.append(f"{path}.text 缺失")
if "anchor" not in block:
errors.append(f"{path}.anchor 缺失")
def _validate_paragraph_block(self, block: Dict[str, Any], path: str, errors: List[str]):
"""paragraph需要非空inlines,并逐条校验"""
inlines = block.get("inlines")
if not isinstance(inlines, list) or not inlines:
errors.append(f"{path}.inlines 必须是非空数组")
return
for idx, run in enumerate(inlines):
self._validate_inline_run(run, f"{path}.inlines[{idx}]", errors)
def _validate_list_block(self, block: Dict[str, Any], path: str, errors: List[str]):
"""列表需要声明listType且每个item都是block数组"""
if block.get("listType") not in {"ordered", "bullet", "task"}:
errors.append(f"{path}.listType 取值非法")
items = block.get("items")
if not isinstance(items, list) or not items:
errors.append(f"{path}.items 必须是非空列表")
return
for i, item in enumerate(items):
if not isinstance(item, list):
errors.append(f"{path}.items[{i}] 必须是区块数组")
continue
for j, sub_block in enumerate(item):
self._validate_block(sub_block, f"{path}.items[{i}][{j}]", errors)
def _validate_table_block(self, block: Dict[str, Any], path: str, errors: List[str]):
"""表格需提供rows/cells/blocks,递归校验单元格内容"""
rows = block.get("rows")
if not isinstance(rows, list) or not rows:
errors.append(f"{path}.rows 必须是非空数组")
return
for r_idx, row in enumerate(rows):
cells = row.get("cells") if isinstance(row, dict) else None
if not isinstance(cells, list) or not cells:
errors.append(f"{path}.rows[{r_idx}].cells 必须是非空数组")
continue
for c_idx, cell in enumerate(cells):
if not isinstance(cell, dict):
errors.append(f"{path}.rows[{r_idx}].cells[{c_idx}] 必须是对象")
continue
blocks = cell.get("blocks")
if not isinstance(blocks, list) or not blocks:
errors.append(
f"{path}.rows[{r_idx}].cells[{c_idx}].blocks 必须是非空数组"
)
continue
for b_idx, sub_block in enumerate(blocks):
self._validate_block(
sub_block,
f"{path}.rows[{r_idx}].cells[{c_idx}].blocks[{b_idx}]",
errors,
)
def _validate_blockquote_block(
self, block: Dict[str, Any], path: str, errors: List[str]
):
"""引用块内部需要至少一个子block"""
inner = block.get("blocks")
if not isinstance(inner, list) or not inner:
errors.append(f"{path}.blocks 必须是非空数组")
return
for idx, sub_block in enumerate(inner):
self._validate_block(sub_block, f"{path}.blocks[{idx}]", errors)
def _validate_callout_block(self, block: Dict[str, Any], path: str, errors: List[str]):
"""callout需声明tone,并至少有一个子block"""
tone = block.get("tone")
if tone not in {"info", "warning", "success", "danger"}:
errors.append(f"{path}.tone 取值非法: {tone}")
blocks = block.get("blocks")
if not isinstance(blocks, list) or not blocks:
errors.append(f"{path}.blocks 必须是非空数组")
return
for idx, sub_block in enumerate(blocks):
self._validate_block(sub_block, f"{path}.blocks[{idx}]", errors)
def _validate_kpiGrid_block(self, block: Dict[str, Any], path: str, errors: List[str]):
"""KPI卡需要非空items,每项包含label/value"""
items = block.get("items")
if not isinstance(items, list) or not items:
errors.append(f"{path}.items 必须是非空数组")
return
for idx, item in enumerate(items):
if not isinstance(item, dict):
errors.append(f"{path}.items[{idx}] 必须是对象")
continue
if "label" not in item or "value" not in item:
errors.append(f"{path}.items[{idx}] 需要label与value")
def _validate_widget_block(self, block: Dict[str, Any], path: str, errors: List[str]):
"""widget必须声明widgetId/type,并提供数据或数据引用"""
if "widgetId" not in block:
errors.append(f"{path}.widgetId 缺失")
if "widgetType" not in block:
errors.append(f"{path}.widgetType 缺失")
if "data" not in block and "dataRef" not in block:
errors.append(f"{path} 需要 data 或 dataRef 其一")
def _validate_code_block(self, block: Dict[str, Any], path: str, errors: List[str]):
"""code block至少要有content"""
if "content" not in block:
errors.append(f"{path}.content 缺失")
def _validate_math_block(self, block: Dict[str, Any], path: str, errors: List[str]):
"""数学块要求latex字段"""
if "latex" not in block:
errors.append(f"{path}.latex 缺失")
def _validate_figure_block(
self, block: Dict[str, Any], path: str, errors: List[str]
):
"""figure需要img对象且至少带src"""
img = block.get("img")
if not isinstance(img, dict):
errors.append(f"{path}.img 必须是对象")
return
if "src" not in img:
errors.append(f"{path}.img.src 缺失")
def _validate_inline_run(
self, run: Any, path: str, errors: List[str]
):
"""校验paragraph中的inline run与marks合法性"""
if not isinstance(run, dict):
errors.append(f"{path} 必须是对象")
return
if "text" not in run:
errors.append(f"{path}.text 缺失")
marks = run.get("marks", [])
if marks is None:
return
if not isinstance(marks, list):
errors.append(f"{path}.marks 必须是数组")
return
for m_idx, mark in enumerate(marks):
if not isinstance(mark, dict):
errors.append(f"{path}.marks[{m_idx}] 必须是对象")
continue
m_type = mark.get("type")
if m_type not in ALLOWED_INLINE_MARKS:
errors.append(f"{path}.marks[{m_idx}].type 不被支持: {m_type}")
__all__ = ["IRValidator"]
+7 -3
View File
@@ -5,11 +5,15 @@ Report Engine节点处理模块
from .base_node import BaseNode, StateMutationNode
from .template_selection_node import TemplateSelectionNode
from .html_generation_node import HTMLGenerationNode
from .chapter_generation_node import ChapterGenerationNode
from .document_layout_node import DocumentLayoutNode
from .word_budget_node import WordBudgetNode
__all__ = [
"BaseNode",
"StateMutationNode",
"StateMutationNode",
"TemplateSelectionNode",
"HTMLGenerationNode"
"ChapterGenerationNode",
"DocumentLayoutNode",
"WordBudgetNode",
]
@@ -0,0 +1,506 @@
"""
章节级JSON生成节点。
每个章节依据Markdown模板切片独立调用LLM,流式写入Raw文件,
完成后校验并落盘标准化JSON。该节点只负责“拿到合规章节”。
"""
from __future__ import annotations
import json
from pathlib import Path
import re
from typing import Any, Dict, List, Tuple
from loguru import logger
from ..core import TemplateSection, ChapterStorage
from ..ir import ALLOWED_BLOCK_TYPES, IRValidator
from ..prompts import (
SYSTEM_PROMPT_CHAPTER_JSON,
build_chapter_user_prompt,
)
from .base_node import BaseNode
try:
from json_repair import repair_json as _json_repair_fn
except ImportError: # pragma: no cover - optional dependency
_json_repair_fn = None
class ChapterGenerationNode(BaseNode):
"""负责按章节调用LLM并校验JSON结构"""
_COLON_EQUALS_PATTERN = re.compile(r'(":\s*)=')
def __init__(self, llm_client, validator: IRValidator, storage: ChapterStorage) -> None:
    """Register the node and keep its validator and storage collaborators.

    Args:
        llm_client: Client passed to the base node constructor.
        validator: Used by ``run`` to validate the parsed chapter.
        storage: Used to open chapter directories, capture the raw stream
            and persist the final chapter.
    """
    super().__init__(llm_client, "ChapterGenerationNode")
    self.validator = validator
    self.storage = storage
def run(
    self,
    section: TemplateSection,
    context: Dict[str, Any],
    run_dir: Path,
    **kwargs,
) -> Dict[str, Any]:
    """Generate, validate and persist the JSON for one chapter.

    Args:
        section: Template slice describing the chapter.
        context: Shared generation context handed to the payload builder.
        run_dir: Directory of the current run, forwarded to the storage.
        **kwargs: Forwarded to the streaming LLM call.

    Returns:
        The parsed chapter object.

    Raises:
        ValueError: If the chapter fails validation. The chapter is
            persisted together with its errors before raising.
    """
    meta = {
        "chapterId": section.chapter_id,
        "slug": section.slug,
        "title": section.title,
        "order": section.order,
    }
    workdir = self.storage.begin_chapter(run_dir, meta)

    prompt = build_chapter_user_prompt(self._build_payload(section, context))
    chapter = self._parse_chapter(self._stream_llm(prompt, workdir, **kwargs))

    # Fill in identifying fields the LLM may have omitted before validating.
    defaults = (
        ("chapterId", section.chapter_id),
        ("anchor", section.slug),
        ("title", section.title),
        ("order", section.order),
    )
    for key, fallback in defaults:
        chapter.setdefault(key, fallback)
    self._sanitize_chapter_blocks(chapter)

    ok, problems = self.validator.validate_chapter(chapter)
    self.storage.persist_chapter(
        run_dir,
        meta,
        chapter,
        errors=None if ok else problems,
    )
    if ok:
        return chapter
    raise ValueError(
        f"{section.title} 章节JSON校验失败: {'; '.join(problems[:5])}"
    )
# ====== 内部方法 ======
def _build_payload(self, section: TemplateSection, context: Dict[str, Any]) -> Dict[str, Any]:
    """Assemble the LLM input payload for one chapter.

    Combines the section description, global context, the three engine
    reports, forum logs, data bundles and generation constraints. When a
    per-chapter plan exists in ``context["chapter_directives"]``, its
    non-empty word budgets and emphasis are copied into the constraints.
    """
    reports = context.get("reports", {})
    # Per-chapter length plan (comes from WordBudgetNode); guides word count and emphasis.
    plan_by_chapter = context.get("chapter_directives", {})
    if plan_by_chapter:
        plan = plan_by_chapter.get(section.chapter_id)
    else:
        plan = {}

    section_info = {
        "chapterId": section.chapter_id,
        "title": section.title,
        "slug": section.slug,
        "order": section.order,
        "number": section.number,
        "outline": section.outline,
    }
    global_context = {
        "query": context.get("query"),
        "templateName": context.get("template_name"),
        "themeTokens": context.get("theme_tokens", {}),
        "styleDirectives": context.get("style_directives", {}),
        # layout carries title/TOC/hero info so chapters keep a consistent look.
        "layout": context.get("layout"),
        "templateOverview": context.get("template_overview", {}),
    }
    engine_reports = {
        "query_engine": reports.get("query_engine", ""),
        "media_engine": reports.get("media_engine", ""),
        "insight_engine": reports.get("insight_engine", ""),
    }
    constraints = {
        "language": "zh-CN",
        "maxTokens": context.get("max_tokens", 4096),
        "allowedBlocks": ALLOWED_BLOCK_TYPES,
        "styleHints": {
            "expectWidgets": True,
            "forceHeadingAnchors": True,
            "allowInlineMix": True,
        },
    }
    payload = {
        "section": section_info,
        "globalContext": global_context,
        "reports": engine_reports,
        "forumLogs": context.get("forum_logs", ""),
        "dataBundles": context.get("data_bundles", []),
        "constraints": constraints,
        "chapterPlan": plan,
        "wordPlan": context.get("word_plan"),
    }

    if plan:
        copied = (
            ("targetWords", "wordTarget"),
            ("minWords", "minWords"),
            ("maxWords", "maxWords"),
            ("emphasis", "emphasis"),
        )
        for source_key, target_key in copied:
            if plan.get(source_key):
                constraints[target_key] = plan[source_key]
        if plan.get("sections"):
            constraints["sectionBudgets"] = plan["sections"]
            global_context["sectionBudgets"] = plan["sections"]
    return payload
def _stream_llm(self, user_message: str, chapter_dir: Path, **kwargs) -> str:
    """Stream the LLM response, writing each delta to the raw capture file.

    Args:
        user_message: User prompt for the chapter.
        chapter_dir: Chapter directory handed to ``storage.capture_stream``.
        **kwargs: Optional ``temperature`` (default 0.2) and ``top_p``
            (default 0.95).

    Returns:
        The concatenation of all streamed deltas.
    """
    pieces: List[str] = []
    with self.storage.capture_stream(chapter_dir) as sink:
        deltas = self.llm_client.stream_invoke(
            SYSTEM_PROMPT_CHAPTER_JSON,
            user_message,
            temperature=kwargs.get("temperature", 0.2),
            top_p=kwargs.get("top_p", 0.95),
        )
        for piece in deltas:
            sink.write(piece)
            pieces.append(piece)
    return "".join(pieces)
def _parse_chapter(self, raw_text: str) -> Dict[str, Any]:
"""清洗LLM输出并解析JSON"""
cleaned = raw_text.strip()
if cleaned.startswith("```json"):
cleaned = cleaned[7:]
if cleaned.startswith("```"):
cleaned = cleaned[3:]
if cleaned.endswith("```"):
cleaned = cleaned[:-3]
cleaned = cleaned.strip()
if not cleaned:
raise ValueError("LLM返回空内容")
candidate_payloads = [cleaned]
repaired = self._repair_llm_json(cleaned)
if repaired != cleaned:
candidate_payloads.append(repaired)
try:
data = self._parse_with_candidates(candidate_payloads)
except json.JSONDecodeError as exc:
repaired_payload = self._attempt_json_repair(cleaned)
if repaired_payload:
candidate_payloads.append(repaired_payload)
try:
data = self._parse_with_candidates(candidate_payloads[-1:])
except json.JSONDecodeError as inner_exc:
raise ValueError(f"章节JSON解析失败: {inner_exc}") from inner_exc
else:
raise ValueError(f"章节JSON解析失败: {exc}") from exc
if "chapter" in data and isinstance(data["chapter"], dict):
return data["chapter"]
if isinstance(data, dict) and all(
key in data for key in ("chapterId", "title", "blocks")
):
return data
if isinstance(data, list):
for item in data:
if isinstance(item, dict):
if "chapter" in item and isinstance(item["chapter"], dict):
return item["chapter"]
if all(key in item for key in ("chapterId", "title", "blocks")):
return item
raise ValueError("章节JSON缺少chapter字段")
def _repair_llm_json(self, text: str) -> str:
    """Apply the built-in repairs for common LLM JSON mistakes.

    In order: drop a stray ``=`` after ``":``, escape raw control characters
    inside strings, balance brackets, and insert missing commas. Each repair
    that changes the text logs a warning.

    Returns:
        The repaired text, or ``text`` itself when nothing changed.
    """
    working = self._COLON_EQUALS_PATTERN.sub(r"\1", text)
    mutated = working != text
    if mutated:
        logger.warning("检测到章节JSON中的\":=\"字符,已自动移除多余的'='")

    passes = (
        (
            self._escape_in_string_controls,
            "检测到章节JSON字符串中存在未转义的控制字符,已自动转换为转义序列",
        ),
        (
            self._balance_brackets,
            "检测到章节JSON括号不平衡,已自动补齐/剔除异常括号",
        ),
        (
            self._fix_missing_commas,
            "检测到章节JSON对象/数组之间缺少逗号,已自动补齐",
        ),
    )
    for repair, notice in passes:
        working, changed = repair(working)
        if changed:
            logger.warning(notice)
            mutated = True

    if mutated:
        return working
    return text
def _escape_in_string_controls(self, text: str) -> Tuple[str, bool]:
"""
将字符串字面量中的裸换行/制表符/控制字符替换为JSON合法的转义序列。
"""
if not text:
return text, False
result: List[str] = []
in_string = False
escaped = False
mutated = False
control_map = {"\n": "\\n", "\r": "\\n", "\t": "\\t"}
for ch in text:
if escaped:
result.append(ch)
escaped = False
continue
if ch == "\\":
result.append(ch)
escaped = True
continue
if ch == '"':
result.append(ch)
in_string = not in_string
continue
if in_string and ch in control_map:
result.append(control_map[ch])
mutated = True
continue
if in_string and ord(ch) < 0x20:
result.append(f"\\u{ord(ch):04x}")
mutated = True
continue
result.append(ch)
return "".join(result), mutated
def _fix_missing_commas(self, text: str) -> Tuple[str, bool]:
"""在对象/数组连续出现时自动补逗号"""
if not text:
return text, False
chars: List[str] = []
mutated = False
in_string = False
escaped = False
length = len(text)
i = 0
while i < length:
ch = text[i]
chars.append(ch)
if escaped:
escaped = False
i += 1
continue
if ch == "\\":
escaped = True
i += 1
continue
if ch == '"':
in_string = not in_string
i += 1
continue
if not in_string and ch in "}]":
j = i + 1
while j < length and text[j] in " \t\r\n":
j += 1
if j < length:
next_ch = text[j]
if next_ch in "{[":
chars.append(",")
mutated = True
i += 1
return "".join(chars), mutated
def _balance_brackets(self, text: str) -> Tuple[str, bool]:
"""尝试修复因LLM多写/少写括号导致的不平衡结构"""
if not text:
return text, False
result: List[str] = []
stack: List[str] = []
mutated = False
in_string = False
escaped = False
opener_map = {"{": "}", "[": "]"}
for ch in text:
if escaped:
result.append(ch)
escaped = False
continue
if ch == "\\":
result.append(ch)
escaped = True
continue
if ch == '"':
result.append(ch)
in_string = not in_string
continue
if in_string:
result.append(ch)
continue
if ch in "{[":
stack.append(ch)
result.append(ch)
continue
if ch in "}]":
if stack and ((ch == "}" and stack[-1] == "{") or (ch == "]" and stack[-1] == "[")):
stack.pop()
result.append(ch)
else:
mutated = True
continue
result.append(ch)
while stack:
opener = stack.pop()
result.append(opener_map[opener])
mutated = True
return "".join(result), mutated
def _attempt_json_repair(self, text: str) -> str | None:
    """Run the optional ``json_repair`` helper over the text.

    Returns:
        The repaired text, or ``None`` when the helper is unavailable,
        raises, or returns the input unchanged.
    """
    if _json_repair_fn:
        try:
            candidate = _json_repair_fn(text)
        except Exception as exc:  # pragma: no cover - library failure
            logger.warning(f"json_repair 修复章节JSON失败: {exc}")
        else:
            if candidate != text:
                logger.warning("已使用json_repair自动修复章节JSON语法")
                return candidate
    return None
def _sanitize_chapter_blocks(self, chapter: Dict[str, Any]):
    """Normalise common structural mistakes in a chapter's blocks, in place.

    Blocks without a valid type are downgraded to paragraphs, list items are
    coerced to arrays of blocks, widgets get a top-level ``data`` entry, and
    nested containers are walked recursively. Malformed containers (a
    non-list ``items``/``rows``/``cells``, non-object rows or cells) are
    skipped rather than raising, so the validator can report them afterwards.

    Args:
        chapter: Parsed chapter object; mutated in place.
    """

    def walk(blocks: List[Dict[str, Any]] | None):
        if not isinstance(blocks, list):
            return
        for block in blocks:
            if not isinstance(block, dict):
                continue
            self._ensure_block_type(block)
            block_type = block.get("type")
            if block_type == "list":
                normalized = self._normalize_list_items(block.get("items"))
                if normalized:
                    block["items"] = normalized
                items = block.get("items")
                # ``items`` may be null or a non-list when normalisation found nothing.
                if isinstance(items, list):
                    for entry in items:
                        walk(entry)
            elif block_type in {"callout", "blockquote"}:
                walk(block.get("blocks"))
            elif block_type == "table":
                rows = block.get("rows")
                if not isinstance(rows, list):
                    continue
                for row in rows:
                    if not isinstance(row, dict):
                        continue
                    cells = row.get("cells")
                    if not isinstance(cells, list):
                        continue
                    for cell in cells:
                        if isinstance(cell, dict):
                            walk(cell.get("blocks"))
            elif block_type == "widget":
                self._normalize_widget_block(block)
            else:
                nested = block.get("blocks")
                if isinstance(nested, list):
                    walk(nested)

    walk(chapter.get("blocks"))
def _normalize_list_items(self, items: Any) -> List[List[Dict[str, Any]]]:
"""确保list block的items为[[block, block], ...]结构"""
if not isinstance(items, list):
return []
normalized: List[List[Dict[str, Any]]] = []
for item in items:
normalized.extend(self._coerce_list_item(item))
return [entry for entry in normalized if entry]
def _coerce_list_item(self, item: Any) -> List[List[Dict[str, Any]]]:
"""将各种嵌套写法统一折算为区块数组"""
result: List[List[Dict[str, Any]]] = []
if isinstance(item, dict):
self._ensure_block_type(item)
result.append([item])
return result
if isinstance(item, list):
dicts = [elem for elem in item if isinstance(elem, dict)]
if dicts:
for elem in dicts:
self._ensure_block_type(elem)
result.append(dicts)
for elem in item:
if isinstance(elem, list):
result.extend(self._coerce_list_item(elem))
elif isinstance(elem, dict):
continue
elif isinstance(elem, str):
result.append([self._as_paragraph_block(elem)])
elif isinstance(elem, (int, float)):
result.append([self._as_paragraph_block(str(elem))])
elif isinstance(item, str):
result.append([self._as_paragraph_block(item)])
elif isinstance(item, (int, float)):
result.append([self._as_paragraph_block(str(item))])
return result
def _normalize_widget_block(self, block: Dict[str, Any]):
"""确保widget具备顶层data或dataRef"""
has_data = block.get("data") is not None or block.get("dataRef") is not None
if has_data:
return
props = block.get("props")
if isinstance(props, dict) and "data" in props:
block["data"] = props.pop("data")
return
block["data"] = {"labels": [], "datasets": []}
def _ensure_block_type(self, block: Dict[str, Any]):
    """Downgrade a block without a valid ``type`` to a paragraph, in place.

    The paragraph text is the first non-blank string among ``text``,
    ``content`` and ``title``; failing that, a JSON dump (or ``str``) of the
    original block.
    """
    kind = block.get("type")
    if isinstance(kind, str) and kind in ALLOWED_BLOCK_TYPES:
        return

    values = (block.get(key) for key in ("text", "content", "title"))
    text = next(
        (value.strip() for value in values if isinstance(value, str) and value.strip()),
        "",
    )
    if not text:
        try:
            text = json.dumps(block, ensure_ascii=False)
        except Exception:
            text = str(block)

    block.clear()
    block["type"] = "paragraph"
    block["inlines"] = [{"text": text}]
@staticmethod
def _as_paragraph_block(text: str) -> Dict[str, Any]:
return {
"type": "paragraph",
"inlines": [{"text": text or ""}],
}
@staticmethod
def _parse_with_candidates(payloads: List[str]) -> Dict[str, Any]:
"""按顺序尝试多个payload,直到解析成功"""
last_exc: json.JSONDecodeError | None = None
for payload in payloads:
try:
return json.loads(payload)
except json.JSONDecodeError as exc:
last_exc = exc
assert last_exc is not None
raise last_exc
__all__ = ["ChapterGenerationNode"]
@@ -0,0 +1,81 @@
"""
根据模板目录与多源报告,生成整本报告的标题/目录/主题设计。
"""
from __future__ import annotations
import json
from typing import Any, Dict, List
from loguru import logger
from ..core import TemplateSection
from ..prompts import (
SYSTEM_PROMPT_DOCUMENT_LAYOUT,
build_document_layout_prompt,
)
from .base_node import BaseNode
class DocumentLayoutNode(BaseNode):
"""负责生成全局标题、目录与Hero设计"""
def __init__(self, llm_client) -> None:
    """Register the node under the name ``DocumentLayoutNode``.

    Args:
        llm_client: Client passed to the base node constructor.
    """
    super().__init__(llm_client, "DocumentLayoutNode")
def run(
    self,
    sections: List[TemplateSection],
    template_markdown: str,
    reports: Dict[str, str],
    forum_logs: str,
    query: str,
    template_overview: Dict[str, Any] | None = None,
) -> Dict[str, Any]:
    """Ask the LLM for the document-level design and return it parsed.

    The prompt payload bundles the query, the raw template with its section
    dictionaries, a template overview, the reports and the forum logs. When
    no overview is supplied, one is derived from ``sections``.

    Returns:
        The parsed JSON design returned by the LLM.
    """
    overview = template_overview
    if not overview:
        overview = {
            "title": sections[0].title if sections else "",
            "chapters": [section.to_dict() for section in sections],
        }

    # Give the LLM the raw template, its sliced structure and all source
    # reports so it can see both the hierarchy and the material.
    template_info = {
        "raw": template_markdown,
        "sections": [section.to_dict() for section in sections],
    }
    payload = {
        "query": query,
        "template": template_info,
        "templateOverview": overview,
        "reports": reports,
        "forumLogs": forum_logs,
    }

    answer = self.llm_client.stream_invoke_to_string(
        SYSTEM_PROMPT_DOCUMENT_LAYOUT,
        build_document_layout_prompt(payload),
        temperature=0.3,
        top_p=0.9,
    )
    layout = self._parse_response(answer)
    logger.info("文档标题/目录设计已生成")
    return layout
def _parse_response(self, raw: str) -> Dict[str, Any]:
"""解析LLM返回的JSON文本,若失败则抛出友好错误"""
cleaned = raw.strip()
if cleaned.startswith("```json"):
cleaned = cleaned[7:]
if cleaned.startswith("```"):
cleaned = cleaned[3:]
if cleaned.endswith("```"):
cleaned = cleaned[:-3]
cleaned = cleaned.strip()
if not cleaned:
raise ValueError("文档设计LLM返回空内容")
try:
return json.loads(cleaned)
except json.JSONDecodeError as exc:
raise ValueError(f"文档设计JSON解析失败: {exc}") from exc
__all__ = ["DocumentLayoutNode"]
-254
View File
@@ -1,254 +0,0 @@
"""
HTML生成节点
将整合后的内容转换为美观的HTML报告
"""
import json
from datetime import datetime
from typing import Dict, Any
from loguru import logger
from .base_node import StateMutationNode
from ..llms.base import LLMClient
from ..state.state import ReportState
from ..prompts import SYSTEM_PROMPT_HTML_GENERATION
# 不再需要text_processing依赖
class HTMLGenerationNode(StateMutationNode):
"""HTML生成处理节点"""
def __init__(self, llm_client: LLMClient):
"""
初始化HTML生成节点
Args:
llm_client: LLM客户端
"""
super().__init__(llm_client, "HTMLGenerationNode")
def run(self, input_data: Dict[str, Any], **kwargs) -> str:
"""
执行HTML生成
Args:
input_data: 包含报告数据的字典
- query: 原始查询
- query_engine_report: QueryEngine报告内容
- media_engine_report: MediaEngine报告内容
- insight_engine_report: InsightEngine报告内容
- forum_logs: 论坛日志内容
- selected_template: 选择的模板内容
Returns:
生成的HTML内容
"""
logger.info("开始生成HTML报告...")
try:
# 准备LLM输入数据
llm_input = {
"query": input_data.get('query', ''),
"query_engine_report": input_data.get('query_engine_report', ''),
"media_engine_report": input_data.get('media_engine_report', ''),
"insight_engine_report": input_data.get('insight_engine_report', ''),
"forum_logs": input_data.get('forum_logs', ''),
"selected_template": input_data.get('selected_template', '')
}
# 转换为JSON格式传递给LLM
message = json.dumps(llm_input, ensure_ascii=False, indent=2)
# 调用LLM生成HTML
response = self.llm_client.stream_invoke_to_string(SYSTEM_PROMPT_HTML_GENERATION, message)
# 处理响应(简化版)
processed_response = self.process_output(response)
logger.info("HTML报告生成完成")
return processed_response
except Exception as e:
logger.exception(f"HTML生成失败: {str(e)}")
# 返回备用HTML
return self._generate_fallback_html(input_data)
def mutate_state(self, input_data: Dict[str, Any], state: ReportState, **kwargs) -> ReportState:
"""
修改报告状态,添加生成的HTML内容
Args:
input_data: 输入数据
state: 当前报告状态
**kwargs: 额外参数
Returns:
更新后的报告状态
"""
# 生成HTML
html_content = self.run(input_data, **kwargs)
# 更新状态
state.html_content = html_content
state.mark_completed()
return state
def process_output(self, output: str) -> str:
"""
处理LLM输出,提取HTML内容
Args:
output: LLM原始输出
Returns:
HTML内容
"""
try:
logger.info(f"处理LLM原始输出,长度: {len(output)} 字符")
html_content = output.strip()
# 清理markdown代码块标记(如果存在)
if html_content.startswith('```html'):
html_content = html_content[7:] # 移除 '```html'
if html_content.endswith('```'):
html_content = html_content[:-3] # 移除结尾的 '```'
elif html_content.startswith('```') and html_content.endswith('```'):
html_content = html_content[3:-3] # 移除前后的 '```'
html_content = html_content.strip()
# 如果内容为空,返回原始输出
if not html_content:
logger.info("处理后内容为空,返回原始输出")
html_content = output
logger.info(f"HTML处理完成,最终长度: {len(html_content)} 字符")
return html_content
except Exception as e:
logger.exception(f"处理HTML输出失败: {str(e)},返回原始输出")
return output
def _generate_fallback_html(self, input_data: Dict[str, Any]) -> str:
    """
    Build a static fallback HTML report (used when the LLM call fails).

    All values taken from ``input_data`` are HTML-escaped before being
    placed into the page, so report text containing ``<``, ``>`` or ``&``
    is shown literally instead of being interpreted as markup.

    Args:
        input_data: Mapping read for the keys ``query``,
            ``query_engine_report``, ``media_engine_report``,
            ``insight_engine_report`` and ``forum_logs``.

    Returns:
        A complete HTML document as a string. A report section is emitted
        only for inputs that are non-empty.
    """
    # Function-scope import keeps this block self-contained.
    import html

    logger.info("使用备用HTML生成方法")

    def _escaped(key: str) -> str:
        """Return the escaped value stored under ``key`` ('' when empty)."""
        value = input_data.get(key, '')
        return html.escape(str(value)) if value else ''

    def _section(title: str, body: str) -> str:
        """Render one titled <pre> section, or '' when there is no body."""
        if not body:
            return ''
        return f'<h2>{title}</h2><div class="section"><pre>{body}</pre></div>'

    query = html.escape(str(input_data.get('query', '智能舆情分析报告')))
    query_section = _section('QueryEngine分析结果', _escaped('query_engine_report'))
    media_section = _section('MediaEngine分析结果', _escaped('media_engine_report'))
    insight_section = _section('InsightEngine分析结果', _escaped('insight_engine_report'))
    forum_section = _section('论坛监控数据', _escaped('forum_logs'))

    # Day and hour are separated explicitly; "%d%H" would fuse them into
    # a single ambiguous number.
    generation_time = datetime.now().strftime("%Y年%m月%d日 %H:%M:%S")

    html_content = f"""<!DOCTYPE html>
<html lang="zh-CN">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>{query} - 智能舆情分析报告</title>
<style>
body {{
font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', sans-serif;
line-height: 1.6;
color: #333;
max-width: 1200px;
margin: 0 auto;
padding: 20px;
background: #f5f5f5;
}}
.container {{
background: white;
padding: 40px;
border-radius: 8px;
box-shadow: 0 2px 10px rgba(0,0,0,0.1);
}}
h1 {{
color: #2c3e50;
border-bottom: 3px solid #3498db;
padding-bottom: 10px;
}}
h2 {{
color: #34495e;
margin-top: 30px;
margin-bottom: 15px;
}}
.section {{
margin-bottom: 30px;
padding: 20px;
border-left: 4px solid #3498db;
background: #f8f9fa;
}}
.meta {{
background: #e9ecef;
padding: 15px;
border-radius: 5px;
margin-bottom: 20px;
}}
.footer {{
margin-top: 40px;
padding-top: 20px;
border-top: 1px solid #eee;
text-align: center;
color: #666;
}}
pre {{
background: #f4f4f4;
padding: 15px;
border-radius: 5px;
overflow-x: auto;
white-space: pre-wrap;
}}
</style>
</head>
<body>
<div class="container">
<h1>{query}</h1>
<div class="meta">
<strong>报告生成时间:</strong> {generation_time}<br>
<strong>数据来源:</strong> QueryEngine、MediaEngine、InsightEngine、ForumEngine<br>
<strong>报告类型:</strong> 综合舆情分析报告
</div>
<h2>执行摘要</h2>
<div class="section">
本报告整合了多个分析引擎的研究结果,为您提供全面的舆情分析洞察。
通过对查询主题"{query}"的深度分析,我们从多个维度展现了当前的舆情态势。
</div>
{query_section}
{media_section}
{insight_section}
{forum_section}
<h2>综合结论</h2>
<div class="section">
基于多个分析引擎的综合研究,我们对"{query}"主题进行了全面分析。
各引擎从不同角度提供了深入洞察,为决策提供了重要参考。
</div>
<div class="footer">
<p>本报告由智能舆情分析平台自动生成</p>
<p>ReportEngine v1.0 | 生成时间: {generation_time}</p>
</div>
</div>
</body>
</html>"""
    return html_content
+78
View File
@@ -0,0 +1,78 @@
"""
章节篇幅规划节点。
"""
from __future__ import annotations
import json
from typing import Any, Dict, List
from loguru import logger
from ..core import TemplateSection
from ..prompts import (
SYSTEM_PROMPT_WORD_BUDGET,
build_word_budget_prompt,
)
from .base_node import BaseNode
class WordBudgetNode(BaseNode):
    """Node that asks the LLM to plan per-chapter word counts and emphasis."""

    def __init__(self, llm_client):
        """Register the node with the base class under the name ``WordBudgetNode``."""
        super().__init__(llm_client, "WordBudgetNode")

    def run(
        self,
        sections: List[TemplateSection],
        design: Dict[str, Any],
        reports: Dict[str, str],
        forum_logs: str,
        query: str,
        template_overview: Dict[str, Any] | None = None,
    ) -> Dict[str, Any]:
        """
        Ask the LLM for a word budget covering every chapter.

        Args:
            sections: Template sections; each is serialized with ``to_dict()``.
            design: Layout design passed through to the prompt payload.
            reports: Engine reports passed through to the prompt payload.
            forum_logs: Forum log text passed through to the prompt payload.
            query: The user query the report is about.
            template_overview: Optional overview; when falsy, one is derived
                from ``sections`` (first section title plus all chapters).

        Returns:
            The parsed JSON plan returned by the LLM.

        Raises:
            ValueError: If the LLM response is empty or is not valid JSON.
        """
        section_dicts = [section.to_dict() for section in sections]

        # Fall back to an overview derived from the section skeleton.
        overview = template_overview
        if not overview:
            overview = {
                "title": sections[0].title if sections else "",
                "chapters": [section.to_dict() for section in sections],
            }

        payload = {
            "query": query,
            "design": design,
            "sections": section_dicts,
            "templateOverview": overview,
            "reports": reports,
            "forumLogs": forum_logs,
        }

        prompt_text = build_word_budget_prompt(payload)
        raw_plan = self.llm_client.stream_invoke_to_string(
            SYSTEM_PROMPT_WORD_BUDGET,
            prompt_text,
            temperature=0.25,
            top_p=0.85,
        )
        plan = self._parse_response(raw_plan)
        logger.info("章节字数规划已生成")
        return plan

    def _parse_response(self, raw: str) -> Dict[str, Any]:
        """
        Decode the LLM's JSON text, tolerating a Markdown code fence.

        Raises:
            ValueError: If nothing remains after stripping, or if the
                remaining text is not valid JSON.
        """
        text = raw.strip()

        # Opening fences are checked in this order so that "```json" is
        # removed as a whole before the bare "```" check runs.
        for opener in ("```json", "```"):
            if text.startswith(opener):
                text = text[len(opener):]
        if text.endswith("```"):
            text = text[:-3]
        text = text.strip()

        if not text:
            raise ValueError("篇幅规划LLM返回空内容")

        try:
            plan = json.loads(text)
        except json.JSONDecodeError as exc:
            raise ValueError(f"篇幅规划JSON解析失败: {exc}") from exc
        return plan
__all__ = ["WordBudgetNode"]
+17 -3
View File
@@ -6,13 +6,27 @@ Report Engine提示词模块
from .prompts import (
SYSTEM_PROMPT_TEMPLATE_SELECTION,
SYSTEM_PROMPT_HTML_GENERATION,
SYSTEM_PROMPT_CHAPTER_JSON,
SYSTEM_PROMPT_DOCUMENT_LAYOUT,
SYSTEM_PROMPT_WORD_BUDGET,
output_schema_template_selection,
input_schema_html_generation
input_schema_html_generation,
chapter_generation_input_schema,
build_chapter_user_prompt,
build_document_layout_prompt,
build_word_budget_prompt,
)
__all__ = [
"SYSTEM_PROMPT_TEMPLATE_SELECTION",
"SYSTEM_PROMPT_HTML_GENERATION",
"SYSTEM_PROMPT_HTML_GENERATION",
"SYSTEM_PROMPT_CHAPTER_JSON",
"SYSTEM_PROMPT_DOCUMENT_LAYOUT",
"SYSTEM_PROMPT_WORD_BUDGET",
"output_schema_template_selection",
"input_schema_html_generation"
"input_schema_html_generation",
"chapter_generation_input_schema",
"build_chapter_user_prompt",
"build_document_layout_prompt",
"build_word_budget_prompt",
]
+238
View File
@@ -5,6 +5,12 @@ Report Engine 的所有提示词定义
import json
from ..ir import (
ALLOWED_BLOCK_TYPES,
CHAPTER_JSON_SCHEMA_TEXT,
IR_VERSION,
)
# ===== JSON Schema 定义 =====
# 模板选择输出Schema
@@ -30,6 +36,58 @@ input_schema_html_generation = {
}
}
# Input schema for per-chapter JSON generation (documents the payload fields
# for the prompt). Top-level keys: "section", "globalContext", "reports",
# "forumLogs", "dataBundles" and "constraints"; only the first three are
# required.
chapter_generation_input_schema = {
"type": "object",
"properties": {
# The chapter being generated; "title", "slug" and "order" are required.
"section": {
"type": "object",
"properties": {
"title": {"type": "string"},
"slug": {"type": "string"},
"order": {"type": "number"},
"number": {"type": "string"},
"outline": {"type": "array", "items": {"type": "string"}}
},
"required": ["title", "slug", "order"]
},
# Report-wide context shared by every chapter.
"globalContext": {
"type": "object",
"properties": {
"query": {"type": "string"},
"templateName": {"type": "string"},
"themeTokens": {"type": "object"},
"styleDirectives": {"type": "object"}
}
},
# One string report per analysis engine.
"reports": {
"type": "object",
"properties": {
"query_engine": {"type": "string"},
"media_engine": {"type": "string"},
"insight_engine": {"type": "string"}
}
},
"forumLogs": {"type": "string"},
"dataBundles": {
"type": "array",
"items": {"type": "object"}
},
# Optional generation limits: language, token cap, permitted block types.
"constraints": {
"type": "object",
"properties": {
"language": {"type": "string"},
"maxTokens": {"type": "number"},
"allowedBlocks": {
"type": "array",
"items": {"type": "string"}
}
}
}
},
"required": ["section", "globalContext", "reports"]
}
# HTML报告生成输出Schema - 已简化,不再使用JSON格式
# output_schema_html_generation = {
# "type": "object",
@@ -39,6 +97,96 @@ input_schema_html_generation = {
# "required": ["html_content"]
# }
# Output schema for the document title / table-of-contents design: constrains
# the fields DocumentLayoutNode expects. It is serialized with json.dumps and
# embedded into SYSTEM_PROMPT_DOCUMENT_LAYOUT. Only "title" and "tocPlan" are
# required.
document_layout_output_schema = {
"type": "object",
"properties": {
"title": {"type": "string"},
"subtitle": {"type": "string"},
"tagline": {"type": "string"},
"tocTitle": {"type": "string"},
# Hero area: summary text, highlight bullets, KPI cards and actions.
"hero": {
"type": "object",
"properties": {
"summary": {"type": "string"},
"highlights": {"type": "array", "items": {"type": "string"}},
"kpis": {
"type": "array",
"items": {
"type": "object",
"properties": {
"label": {"type": "string"},
"value": {"type": "string"},
"delta": {"type": "string"},
"tone": {"type": "string", "enum": ["up", "down", "neutral"]},
},
"required": ["label", "value"],
},
},
"actions": {"type": "array", "items": {"type": "string"}},
},
},
"themeTokens": {"type": "object"},
# One entry per table-of-contents item; "chapterId" and "display" are required.
"tocPlan": {
"type": "array",
"items": {
"type": "object",
"properties": {
"chapterId": {"type": "string"},
"anchor": {"type": "string"},
"display": {"type": "string"},
"description": {"type": "string"},
},
"required": ["chapterId", "display"],
},
},
"layoutNotes": {"type": "array", "items": {"type": "string"}},
},
"required": ["title", "tocPlan"],
}
# Chapter word-budget schema: constrains the output structure of
# WordBudgetNode. It is serialized with json.dumps and embedded into
# SYSTEM_PROMPT_WORD_BUDGET. Only "totalWords" and "chapters" are required.
word_budget_output_schema = {
"type": "object",
"properties": {
"totalWords": {"type": "number"},
"tolerance": {"type": "number"},
"globalGuidelines": {"type": "array", "items": {"type": "string"}},
# One entry per chapter; "chapterId" and "targetWords" are required.
"chapters": {
"type": "array",
"items": {
"type": "object",
"properties": {
"chapterId": {"type": "string"},
"title": {"type": "string"},
"targetWords": {"type": "number"},
"minWords": {"type": "number"},
"maxWords": {"type": "number"},
"emphasis": {"type": "array", "items": {"type": "string"}},
"rationale": {"type": "string"},
# Optional per-subsection budget; "title" and "targetWords" are required.
"sections": {
"type": "array",
"items": {
"type": "object",
"properties": {
"title": {"type": "string"},
"anchor": {"type": "string"},
"targetWords": {"type": "number"},
"minWords": {"type": "number"},
"maxWords": {"type": "number"},
"notes": {"type": "string"},
},
"required": ["title", "targetWords"],
},
},
},
"required": ["chapterId", "targetWords"],
},
},
},
"required": ["totalWords", "chapters"],
}
# ===== 系统提示词定义 =====
# 模板选择的系统提示词
@@ -133,3 +281,93 @@ SYSTEM_PROMPT_HTML_GENERATION = f"""
**重要:直接返回完整的HTML代码,不要包含任何解释、说明或其他文本。只返回HTML代码本身。**
"""
# System prompt for per-chapter JSON generation.
# This is an f-string: IR_VERSION, the comma-joined ALLOWED_BLOCK_TYPES and
# CHAPTER_JSON_SCHEMA_TEXT are interpolated when the module is imported, and
# doubled braces ({{ / }}) produce literal braces in the final prompt text.
# The prompt text is runtime data sent to the LLM; edit it with care.
SYSTEM_PROMPT_CHAPTER_JSON = f"""
你是Report Engine的“章节装配工厂”,负责把不同章节的素材铣削成
符合《可执行JSON契约(IR)》的章节JSON。稍后我会提供单个章节要点、
全局数据与风格指令,你需要:
1. 完全遵循IR版本 {IR_VERSION} 的结构,严禁输出HTML或Markdown。
2. 仅使用以下Block类型:{', '.join(ALLOWED_BLOCK_TYPES)};其中图表用block.type=widget并填充Chart.js配置。
3. 所有段落都放入paragraph.inlines,混排样式通过marks表示(bold/italic/color/link等)。
4. 所有heading必须包含anchor,锚点与编号保持模板一致,比如section-2-1。
5. 表格需给出rows/cells/alignKPI卡请使用kpiGrid,分割线用hr。
6. 如需引用图表/交互组件,统一用widgetType表示(例如chart.js/line、chart.js/doughnut)。
7. 鼓励结合outline中列出的子标题,生成多层heading与细粒度内容,同时可补充callout、blockquote等。
8. 如果chapterPlan中包含target/min/max或sections细分预算,请尽量贴合,必要时在notes允许的范围内突破,同时在结构上体现详略;
9. 一级标题需使用中文数字(“一、二、三”),二级标题使用阿拉伯数字(“1.1、1.2”),heading.text中直接写好编号,与outline顺序对应;
10. 严禁输出外部图片/AI生图链接,仅可使用Chart.js图表、表格、色块、callout等HTML原生组件;如需视觉辅助请改为文字描述或数据表;
11. 段落混排需通过marks表达粗体、斜体、下划线、颜色等样式,禁止残留Markdown语法(如**text**);
12. 行间公式用block.type="math"并填入math.latex,行内公式在paragraph.inlines里将文本设为Latex并加上marks.type="math",渲染层会用MathJax处理;
13. widget配色需与CSS变量兼容,不要硬编码背景色或文字色,legend/ticks由渲染层控制;
14. 善用callout、kpiGrid、表格、widget等提升版面丰富度,但必须遵守模板章节范围。
15. 输出前务必自检JSON语法:禁止出现`{{}}{{`或`][`相连缺少逗号、列表项嵌套超过一层、未闭合的括号或未转义换行,`list` block的items必须是`[[block,...], ...]`结构,若无法满足则返回错误提示而不是输出不合法JSON。
16. 所有widget块必须在顶层提供`data`或`dataRef`(可将props中的`data`上移),确保Chart.js能够直接渲染;缺失数据时宁可输出表格或段落,绝不留空。
17. 任何block都必须声明合法`type`heading/paragraph/list/...);若需要普通文本请使用`paragraph`并给出`inlines`,禁止返回`type:null`或未知值。
<CHAPTER JSON SCHEMA>
{CHAPTER_JSON_SCHEMA_TEXT}
</CHAPTER JSON SCHEMA>
输出格式:
{{"chapter": {{...遵循上述Schema的章节JSON...}}}}
严禁添加除JSON以外的任何文本或注释。
"""
# System prompt for designing the document title, table of contents and theme.
# This is an f-string: document_layout_output_schema is serialized with
# json.dumps (non-ASCII kept, 2-space indent) and embedded between the
# <OUTPUT JSON SCHEMA> tags when the module is imported.
SYSTEM_PROMPT_DOCUMENT_LAYOUT = f"""
你是报告首席设计官,需要结合模板大纲与三个分析引擎的内容,为整本报告确定最终的标题、导语区、目录样式与美学要素。
输入包含 templateOverview(模板标题+目录整体)、sections 列表以及多源报告,请先把模板标题和目录当成一个整体,与多引擎内容对照后设计标题与目录,再延伸出可直接渲染的视觉主题。你的输出会被独立存储以便后续拼接,请确保字段齐备。
目标:
1. 生成具有中文叙事风格的 title/subtitle/tagline,并确保可直接放在封面中央,文案中需自然提到“文章总览”;
2. 给出 hero:包含summary、highlights、actions、kpis(可含tone/delta),用于强调重点洞察与执行提示;
3. 输出 tocPlan,一级目录固定用中文数字(“一、二、三”),二级目录用“1.1/1.2”,可在description里说明详略;如需定制目录标题,请填写 tocTitle;
4. 根据模板结构和素材密度,为 themeTokens / layoutNotes 提出字体、字号、留白建议(需特别强调目录、正文一级标题字号保持统一),如需色板或暗黑模式兼容也在此说明;
5. 严禁要求外部图片或AI生图,推荐Chart.js图表、表格、色块、KPI卡等可直接渲染的原生组件;
6. 不随意增删章节,仅优化命名或描述;若有排版或章节合并提示,请放入 layoutNotes,渲染层会严格遵循。
输出必须满足下述JSON Schema
<OUTPUT JSON SCHEMA>
{json.dumps(document_layout_output_schema, ensure_ascii=False, indent=2)}
</OUTPUT JSON SCHEMA>
只返回JSON,勿附加额外文本。
"""
# System prompt for word-budget planning; WordBudgetNode.run passes it as the
# system message. This is an f-string: word_budget_output_schema is serialized
# with json.dumps (non-ASCII kept, 2-space indent) and embedded between the
# <OUTPUT JSON SCHEMA> tags when the module is imported.
SYSTEM_PROMPT_WORD_BUDGET = f"""
你是报告篇幅规划官,会拿到 templateOverview(模板标题+目录)、最新的标题/目录设计稿与全部素材,需要给每章及其子主题分配字数。
要求:
1. 总字数约40000字,可上下浮动5%,并给出 globalGuidelines 说明整体详略策略;
2. chapters 中每章需包含 targetWords/min/max、需要额外展开的 emphasis、sections 数组(为该章各小节/提纲分配字数与注意事项,可注明“允许在必要时超出10%补充案例”等);
3. rationale 必须解释该章篇幅配置理由,引用模板/素材中的关键信息;
4. 章节编号遵循一级中文数字、二级阿拉伯数字,便于后续统一字号;
5. 结果写成JSON并满足下述Schema,仅用于内部存储与章节生成,不直接输出给读者。
<OUTPUT JSON SCHEMA>
{json.dumps(word_budget_output_schema, ensure_ascii=False, indent=2)}
</OUTPUT JSON SCHEMA>
只返回JSON,无额外说明。
"""
def build_chapter_user_prompt(payload: dict) -> str:
    """
    Serialize the chapter context into the user-prompt text.

    Args:
        payload: JSON-serializable chapter context.

    Returns:
        ``payload`` as pretty-printed JSON (2-space indent) with non-ASCII
        characters kept as-is rather than escaped.
    """
    serialized = json.dumps(payload, indent=2, ensure_ascii=False)
    return serialized
def build_document_layout_prompt(payload: dict) -> str:
    """
    Serialize the document-design context into a JSON string.

    Args:
        payload: JSON-serializable layout context.

    Returns:
        ``payload`` as pretty-printed JSON (2-space indent) with non-ASCII
        characters kept as-is rather than escaped.
    """
    dump_options = {"ensure_ascii": False, "indent": 2}
    return json.dumps(payload, **dump_options)
def build_word_budget_prompt(payload: dict) -> str:
    """
    Turn the word-budget planning input into the string sent to the LLM.

    Args:
        payload: JSON-serializable planning context.

    Returns:
        ``payload`` as pretty-printed JSON (2-space indent) with non-ASCII
        characters kept as-is rather than escaped.
    """
    prompt_text = json.dumps(
        payload,
        ensure_ascii=False,
        indent=2,
    )
    return prompt_text
+7
View File
@@ -0,0 +1,7 @@
"""
Report Engine渲染器集合。
"""
from .html_renderer import HTMLRenderer
__all__ = ["HTMLRenderer"]
File diff suppressed because it is too large Load Diff
+10
View File
@@ -17,6 +17,14 @@ class Settings(BaseSettings):
REPORT_ENGINE_PROVIDER: Optional[str] = Field(None, description="模型服务商,仅兼容保留")
MAX_CONTENT_LENGTH: int = Field(200000, description="最大内容长度")
OUTPUT_DIR: str = Field("final_reports", description="主输出目录")
# 章节分块JSON会存储在该目录,便于溯源与断点续传
CHAPTER_OUTPUT_DIR: str = Field(
"final_reports/chapters", description="章节JSON缓存目录"
)
# 装订后的整本IR/manifest也会持久化,方便调试与审计
DOCUMENT_IR_OUTPUT_DIR: str = Field(
"final_reports/ir", description="整本IR/Manifest输出目录"
)
TEMPLATE_DIR: str = Field("ReportEngine/report_template", description="多模板目录")
API_TIMEOUT: float = Field(900.0, description="单API超时时间(秒)")
MAX_RETRY_DELAY: float = Field(180.0, description="最大重试间隔(秒)")
@@ -41,6 +49,8 @@ def print_config(config: Settings):
message += f"LLM Base URL: {config.REPORT_ENGINE_BASE_URL or '(默认)'}\n"
message += f"最大内容长度: {config.MAX_CONTENT_LENGTH}\n"
message += f"输出目录: {config.OUTPUT_DIR}\n"
message += f"章节JSON目录: {config.CHAPTER_OUTPUT_DIR}\n"
message += f"整本IR目录: {config.DOCUMENT_IR_OUTPUT_DIR}\n"
message += f"模板目录: {config.TEMPLATE_DIR}\n"
message += f"API 超时时间: {config.API_TIMEOUT}\n"
message += f"最大重试间隔: {config.MAX_RETRY_DELAY}\n"
+1 -1
View File
@@ -15,6 +15,7 @@ requests==2.31.0
httpx==0.28.1
aiofiles==23.2.1
aiohttp>=3.8.0
PySocks>=1.7.1
# ===== LLM接口 =====
openai>=1.3.0
@@ -32,7 +33,6 @@ jieba==0.42.1
# ===== 数据库 =====
pymysql==1.1.0
aiomysql==0.2.0
asyncmy==0.2.9
aiosqlite==0.21.0
redis>=4.6.0
SQLAlchemy==2.0.35