更新部分爬虫以兼容本地运行及数据库存储

本地化&2.0
Update Bocha API base URL in .env.example and config.py
2025-12-16 10:56:56 +08:00 · 2025-12-02 14:01:39 +08:00 · 2025-11-29 20:25:01 +08:00 · 2025-11-29 14:26:26 +08:00 · 2025-11-28 10:08:13 +08:00 · 2025-11-28 00:59:20 +08:00
108 changed files with 29390 additions and 1525 deletions
@@ -41,6 +41,8 @@ QUERY_ENGINE_BASE_URL=
 QUERY_ENGINE_MODEL_NAME=

 # Report Agent（推荐gemini-2.5-pro，中转厂商申请地址：https://aihubmix.com/?aff=8Ds9）
+# 注意：Report Agent需要相对较强的模型能力，如果最终报告出现图表空白/异常段落的情况，
+#      请尝试更换更强的模型，建议使用能力不低于DeepSeek-V3.2-Exp的非思考模式的模型
 REPORT_ENGINE_API_KEY=
 REPORT_ENGINE_BASE_URL=
 REPORT_ENGINE_MODEL_NAME=
@@ -65,5 +67,5 @@ KEYWORD_OPTIMIZER_MODEL_NAME=
 TAVILY_API_KEY=

 # Bocha AI Search BASEURL，用于Bocha多模态搜索，这里密钥名称虽然是Web Search，但其实是要AI Search的，申请地址：https://open.bochaai.com/
-BOCHA_BASE_URL=https://api.bochaai.com/v1/ai-search
+BOCHA_BASE_URL=https://api.bocha.cn/v1/ai-search
 BOCHA_WEB_SEARCH_API_KEY=
@@ -1,3 +1,6 @@
 *.ipynb binary

-* text=auto
+* text=auto
+
+*.html linguist-vendored
+*.js linguist-vendored
@@ -1,28 +1,33 @@
 name: Docker Image CI
-
 on:
  push:
    tags:
      - 'v*'
-
 jobs:
  build_and_publish:
    runs-on: ubuntu-latest
    permissions:
      contents: read
      packages: write
-
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4
-
+      
+      - name: Free Disk Space
+        run: |
+          sudo rm -rf /usr/share/dotnet
+          sudo rm -rf /usr/local/lib/android
+          sudo rm -rf /opt/ghc
+          sudo rm -rf /opt/hostedtoolcache/CodeQL
+          sudo docker image prune --all --force
+          df -h
+      
      - name: Log in to the Container registry
        uses: docker/login-action@v3
        with:
          registry: ghcr.io
          username: ${{ github.actor }}
          password: ${{ secrets.GITHUB_TOKEN }}
-
      - name: Extract metadata (tags, labels) for Docker
        id: meta
        uses: docker/metadata-action@v5
@@ -32,15 +37,12 @@ jobs:
            type=ref,event=tag
            type=semver,pattern={{version}}
            type=raw,value=latest
-
      - name: Set up QEMU
        uses: docker/setup-qemu-action@v3
-
      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v3
        with:
          driver: docker-container
-
      - name: Build and push Docker image
        uses: docker/build-push-action@v5
        with:
@@ -9,34 +9,46 @@ ENV PYTHONDONTWRITEBYTECODE=1 \
    PATH="/root/.local/bin:${PATH}" \
    PLAYWRIGHT_BROWSERS_PATH=/ms-playwright

-# Install system dependencies required by scientific Python stack, Playwright, and Streamlit
-RUN apt-get update && apt-get install -y --no-install-recommends \
-    build-essential \
-    curl \
-    git \
-    libgl1 \
-    libglib2.0-0 \
-    libgtk-3-0 \
-    libpango-1.0-0 \
-    libpangocairo-1.0-0 \
-    libatk1.0-0 \
-    libatk-bridge2.0-0 \
-    libxcb1 \
-    libxcomposite1 \
-    libxdamage1 \
-    libxext6 \
-    libxfixes3 \
-    libxi6 \
-    libxtst6 \
-    libnss3 \
-    libxrandr2 \
-    libxkbcommon0 \
-    libasound2 \
-    libx11-xcb1 \
-    libxshmfence1 \
-    libgbm1 \
-    ffmpeg \
-    && apt-get clean && rm -rf /var/lib/apt/lists/*
+# Install system dependencies required by scientific Python stack, Playwright, Streamlit, and WeasyPrint PDF
+RUN set -euo pipefail; \
+    apt-get update; \
+    if apt-cache show libgdk-pixbuf-2.0-0 >/dev/null 2>&1; then \
+        GDK_PIXBUF_PKG=libgdk-pixbuf-2.0-0; \
+    else \
+        GDK_PIXBUF_PKG=libgdk-pixbuf2.0-0; \
+    fi; \
+    apt-get install -y --no-install-recommends \
+        build-essential \
+        curl \
+        git \
+        libgl1 \
+        libglib2.0-0 \
+        libgtk-3-0 \
+        libpango-1.0-0 \
+        libpangocairo-1.0-0 \
+        libpangoft2-1.0-0 \
+        "${GDK_PIXBUF_PKG}" \
+        libffi-dev \
+        libcairo2 \
+        libatk1.0-0 \
+        libatk-bridge2.0-0 \
+        libxcb1 \
+        libxcomposite1 \
+        libxdamage1 \
+        libxext6 \
+        libxfixes3 \
+        libxi6 \
+        libxtst6 \
+        libnss3 \
+        libxrandr2 \
+        libxkbcommon0 \
+        libasound2 \
+        libx11-xcb1 \
+        libxshmfence1 \
+        libgbm1 \
+        ffmpeg; \
+    apt-get clean; \
+    rm -rf /var/lib/apt/lists/*

 # Install the latest uv release and expose it on PATH
 RUN curl -LsSf --retry 3 --retry-delay 2 --proto '=https' --proto-redir '=https' --tlsv1.2 https://astral.sh/uv/install.sh | sh
@@ -263,7 +263,12 @@ class DeepSearchAgent:
            logger.info("  - 未找到搜索结果")
        
        # 更新状态中的搜索历史
-        paragraph.research.add_search_results(search_query, search_results)
+        paragraph.research.add_search_results(
+            search_query,
+            search_results,
+            search_tool=search_tool,
+            paragraph_title=paragraph.title,
+        )
        
        # 生成初始总结
        logger.info("  - 生成初始总结...")
@@ -341,7 +346,12 @@ class DeepSearchAgent:
                logger.info("    未找到反思搜索结果")
            
            # 更新搜索历史
-            paragraph.research.add_search_results(search_query, search_results)
+            paragraph.research.add_search_results(
+                search_query,
+                search_results,
+                search_tool=search_tool,
+                paragraph_title=paragraph.title,
+            )
            
            # 生成反思总结
            reflection_summary_input = {
@@ -17,6 +17,9 @@ class Search:
    title: str = ""                    # 搜索结果标题
    content: str = ""                  # 搜索返回的内容
    score: Optional[float] = None      # 相关度评分
+    paragraph_title: str = ""          # 段落标题，便于展示归属
+    search_tool: str = ""              # 使用的搜索工具
+    has_result: bool = True            # 是否有返回结果
    timestamp: str = field(default_factory=lambda: datetime.now().isoformat())
    
    def to_dict(self) -> Dict[str, Any]:
@@ -27,6 +30,9 @@ class Search:
            "title": self.title,
            "content": self.content,
            "score": self.score,
+            "paragraph_title": self.paragraph_title,
+            "search_tool": self.search_tool,
+            "has_result": self.has_result,
            "timestamp": self.timestamp
        }
    
@@ -39,6 +45,9 @@ class Search:
            title=data.get("title", ""),
            content=data.get("content", ""),
            score=data.get("score"),
+            paragraph_title=data.get("paragraph_title", ""),
+            search_tool=data.get("search_tool", ""),
+            has_result=data.get("has_result", True),
            timestamp=data.get("timestamp", datetime.now().isoformat())
        )

@@ -55,17 +64,42 @@ class Research:
        """添加搜索记录"""
        self.search_history.append(search)
    
-    def add_search_results(self, query: str, results: List[Dict[str, Any]]):
+    def add_search_results(self, query: str, results: List[Dict[str, Any]], search_tool: str = "", paragraph_title: str = ""):
        """批量添加搜索结果"""
-        for result in results:
-            search = Search(
-                query=query,
-                url=result.get("url", ""),
-                title=result.get("title", ""),
-                content=result.get("content", ""),
-                score=result.get("score")
+        if not results:
+            # 记录一次“无结果”搜索，方便前端显示搜索轨迹
+            self.add_search(
+                Search(
+                    query=query or "",
+                    title="未找到结果",
+                    content="本次搜索未返回结果或调用失败",
+                    url="",
+                    score=None,
+                    paragraph_title=paragraph_title,
+                    search_tool=search_tool,
+                    has_result=False,
+                )
+            )
+            return
+
+        for result in results:
+            url = result.get("url") or ""
+            title = result.get("title") or ""
+            content = result.get("content") or result.get("raw_content") or ""
+            if not isinstance(content, str):
+                content = str(content)
+            self.add_search(
+                Search(
+                    query=query or "",
+                    url=url,
+                    title=title,
+                    content=content,
+                    score=result.get("score"),
+                    paragraph_title=paragraph_title or result.get("paragraph_title", ""),
+                    search_tool=search_tool or result.get("search_tool", ""),
+                    has_result=True,
+                )
            )
-            self.add_search(search)
    
    def get_search_count(self) -> int:
        """获取搜索次数"""
@@ -9,8 +9,8 @@
 # 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。

 # 基础配置
-PLATFORM = "bili"  # 平台，xhs | dy | ks | bili | wb | tieba | zhihu
-KEYWORDS = "电影鬼灭之刃,亲属想侵吞3姐妹亡父赔偿款,网警斩断侵害未成年人网络黑色产业链,2007年后出生的人不能在马尔代夫吸烟,沈月,是公主也是自己的骑士,以军虐囚视频,唐朝诡事录,广州地铁回应APP乘车码频繁弹窗广告,全红婵的减肥计划精确到克"  # 关键词搜索配置，以英文逗号分隔
+PLATFORM = "ks"  # 平台，xhs | dy | ks | bili | wb | tieba | zhihu
+KEYWORDS = "F6智慧门店,F6智数,中国汽车后市场白皮书,南京爱福路汽车科技有限公司,汽车后市场,汽车修理厂,新康众,天猫养车,汽后,汽修厂,爱福路,康众"  # 关键词搜索配置，以英文逗号分隔
 LOGIN_TYPE = "qrcode"  # qrcode or phone or cookie
 COOKIES = ""
 CRAWLER_TYPE = "search"  # 爬取类型，search(关键词搜索) | detail(帖子详情)| creator(创作者主页数据)
@@ -30,6 +30,12 @@ IP_PROXY_PROVIDER_NAME = "kuaidaili"  # kuaidaili | wandouhttp
 # 抖音如果一直提示失败，打开浏览器看下是否扫码登录之后出现了手机号验证，如果出现了手动过一下再试。
 HEADLESS = True

+# HTTP / network settings
+# TLS verification stays on by default; set to False only temporarily (e.g. a self-signed cert on an intranet breaks TLS)
+HTTPX_VERIFY = True
+# Upstream proxy URL (e.g. http://user:pass@host:port); leave empty to fall back to system/environment settings
+HTTPX_PROXY = ""
+
 # 是否保存登录状态
 SAVE_LOGIN_STATE = True

@@ -61,7 +67,7 @@ BROWSER_LAUNCH_TIMEOUT = 30
 AUTO_CLOSE_BROWSER = True

 # 数据保存类型选项配置,支持五种类型：csv、db、json、sqlite、postgresql, 最好保存到DB，有排重的功能。
-SAVE_DATA_OPTION = "postgresql"  # csv or db or json or sqlite or postgresql
+SAVE_DATA_OPTION = "db"  # csv or db or json or sqlite or postgresql

 # 用户浏览器缓存的浏览器文件配置
 USER_DATA_DIR = "%s_user_data_dir"  # %s will be replaced by platform name
@@ -70,7 +76,7 @@ USER_DATA_DIR = "%s_user_data_dir"  # %s will be replaced by platform name
 START_PAGE = 1

 # 爬取视频/帖子的数量控制
-CRAWLER_MAX_NOTES_COUNT = 5
+CRAWLER_MAX_NOTES_COUNT = 50

 # 并发爬虫数量控制
 MAX_CONCURRENCY_NUM = 1
@@ -84,6 +90,11 @@ ENABLE_GET_COMMENTS = True
 # 爬取一级评论的数量控制(单视频/帖子)
 CRAWLER_MAX_COMMENTS_COUNT_SINGLENOTES = 20

+# 是否对评论做去重及重复页跳出（针对贴吧等平台）
+ENABLE_COMMENT_DEDUP = True
+# 连续多少页没有新评论时中断评论循环
+COMMENT_DUP_BREAK_THRESHOLD = 2
+
 # 是否开启爬二级评论模式, 默认不开启爬二级评论
 # 老版本项目使用了 db, 则需参考 schema/tables.sql line 287 增加表字段
 ENABLE_GET_SUB_COMMENTS = False
@@ -12,10 +12,10 @@
 import os

 # mysql config - 使用MindSpider的数据库配置
-MYSQL_DB_PWD = "bettafish"
-MYSQL_DB_USER = "bettafish"
-MYSQL_DB_HOST = "127.0.0.1"
-MYSQL_DB_PORT = 5444
+MYSQL_DB_PWD = os.getenv("MYSQL_DB_PWD", "bettafish")  # never commit real credentials; override via environment
+MYSQL_DB_USER = os.getenv("MYSQL_DB_USER", "bettafish")
+MYSQL_DB_HOST = os.getenv("MYSQL_DB_HOST", "127.0.0.1")  # local default; point at a remote server via environment
+MYSQL_DB_PORT = int(os.getenv("MYSQL_DB_PORT", "3306"))  # kept as int, like the previous literal
 MYSQL_DB_NAME = "bettafish"

 mysql_db_config = {
@@ -48,7 +48,7 @@ sqlite_db_config = {
 POSTGRESQL_DB_PWD = os.getenv("POSTGRESQL_DB_PWD", "bettafish")
 POSTGRESQL_DB_USER = os.getenv("POSTGRESQL_DB_USER", "bettafish")
 POSTGRESQL_DB_HOST = os.getenv("POSTGRESQL_DB_HOST", "127.0.0.1")
-POSTGRESQL_DB_PORT = os.getenv("POSTGRESQL_DB_PORT", "5444")
+POSTGRESQL_DB_PORT = os.getenv("POSTGRESQL_DB_PORT", "5432")
 POSTGRESQL_DB_NAME = os.getenv("POSTGRESQL_DB_NAME", "bettafish")

 postgresql_db_config = {
@@ -13,10 +13,11 @@ _engines = {}
 async def create_database_if_not_exists(db_type: str):
    if db_type == "mysql" or db_type == "db":
        # Connect to the server without a database
-        server_url = f"mysql+asyncmy://{mysql_db_config['user']}:{mysql_db_config['password']}@{mysql_db_config['host']}:{mysql_db_config['port']}"
+        server_url = f"mysql+asyncmy://{mysql_db_config['user']}:{mysql_db_config['password']}@{mysql_db_config['host']}:{mysql_db_config['port']}?charset=utf8mb4"
        engine = create_async_engine(server_url, echo=False)
        async with engine.connect() as conn:
-            await conn.execute(text(f"CREATE DATABASE IF NOT EXISTS {mysql_db_config['db_name']}"))
+            # 确保数据库使用utf8mb4字符集
+            await conn.execute(text(f"CREATE DATABASE IF NOT EXISTS {mysql_db_config['db_name']} CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci"))
        await engine.dispose()
    elif db_type == "postgresql":
        # Connect to PostgreSQL default database (postgres) to create target database
@@ -48,7 +49,8 @@ def get_async_engine(db_type: str = None):
    if db_type == "sqlite":
        db_url = f"sqlite+aiosqlite:///{sqlite_db_config['db_path']}"
    elif db_type == "mysql" or db_type == "db":
-        db_url = f"mysql+asyncmy://{mysql_db_config['user']}:{mysql_db_config['password']}@{mysql_db_config['host']}:{mysql_db_config['port']}/{mysql_db_config['db_name']}"
+        # 添加charset=utf8mb4以支持完整的UTF-8编码（包括emoji和中文）
+        db_url = f"mysql+asyncmy://{mysql_db_config['user']}:{mysql_db_config['password']}@{mysql_db_config['host']}:{mysql_db_config['port']}/{mysql_db_config['db_name']}?charset=utf8mb4"
    elif db_type == "postgresql":
        db_url = f"postgresql+asyncpg://{postgresql_db_config['user']}:{postgresql_db_config['password']}@{postgresql_db_config['host']}:{postgresql_db_config['port']}/{postgresql_db_config['db_name']}"
    else:
@@ -11,42 +11,54 @@

 import asyncio
 import sys
-from typing import Optional
+from typing import Dict, Optional, Type
+
+import importlib

 import cmd_arg
 import config
 from database import db
 from base.base_crawler import AbstractCrawler
-from media_platform.bilibili import BilibiliCrawler
-from media_platform.douyin import DouYinCrawler
-from media_platform.kuaishou import KuaishouCrawler
-from media_platform.tieba import TieBaCrawler
-from media_platform.weibo import WeiboCrawler
-from media_platform.xhs import XiaoHongShuCrawler
-from media_platform.zhihu import ZhihuCrawler
 from tools.async_file_writer import AsyncFileWriter
 from var import crawler_type_var


 class CrawlerFactory:
-    CRAWLERS = {
-        "xhs": XiaoHongShuCrawler,
-        "dy": DouYinCrawler,
-        "ks": KuaishouCrawler,
-        "bili": BilibiliCrawler,
-        "wb": WeiboCrawler,
-        "tieba": TieBaCrawler,
-        "zhihu": ZhihuCrawler,
+    _CRAWLER_PATHS = {
+        "xhs": "media_platform.xhs.XiaoHongShuCrawler",
+        "dy": "media_platform.douyin.DouYinCrawler",
+        "ks": "media_platform.kuaishou.KuaishouCrawler",
+        "bili": "media_platform.bilibili.BilibiliCrawler",
+        "wb": "media_platform.weibo.WeiboCrawler",
+        "tieba": "media_platform.tieba.TieBaCrawler",
+        "zhihu": "media_platform.zhihu.ZhihuCrawler",
    }
+    _cache: Dict[str, Type[AbstractCrawler]] = {}

    @staticmethod
    def create_crawler(platform: str) -> AbstractCrawler:
-        crawler_class = CrawlerFactory.CRAWLERS.get(platform)
-        if not crawler_class:
+        path = CrawlerFactory._CRAWLER_PATHS.get(platform)
+        if not path:
            raise ValueError(
                "Invalid Media Platform Currently only supported xhs or dy or ks or bili ..."
            )
-        return crawler_class()
+
+        if platform not in CrawlerFactory._cache:
+            module_name, class_name = path.rsplit(".", 1)
+            try:
+                module = importlib.import_module(module_name)
+                crawler_class = getattr(module, class_name)
+            except ModuleNotFoundError as exc:
+                hint = (
+                    "Please install optional dependency 'xhshow' (pip install xhshow) "
+                    "or disable the xhs platform."
+                    if platform == "xhs" and exc.name == "xhshow"
+                    else f"Missing dependency while importing {module_name}"
+                )
+                raise ModuleNotFoundError(f"{exc}: {hint}") from exc
+            CrawlerFactory._cache[platform] = crawler_class
+
+        return CrawlerFactory._cache[platform]()


 crawler: Optional[AbstractCrawler] = None
@@ -59,6 +71,12 @@ crawler: Optional[AbstractCrawler] = None
 async def main():
    # Init crawler
    global crawler
+    
+    # 导入工具模块以初始化日志
+    from tools import utils
+    utils.logger.info("=" * 60)
+    utils.logger.info("MediaCrawler 启动")
+    utils.logger.info("=" * 60)

    # parse cmd
    args = await cmd_arg.parse_cmd()
@@ -69,30 +87,40 @@ async def main():
        print(f"Database {args.init_db} initialized successfully.")
        return  # Exit the main function cleanly

+    crawler = None
+    try:
+        crawler = CrawlerFactory.create_crawler(platform=config.PLATFORM)
+        await crawler.start()

-
-    crawler = CrawlerFactory.create_crawler(platform=config.PLATFORM)
-    await crawler.start()
-
-    # Generate wordcloud after crawling is complete
-    # Only for JSON save mode
-    if config.SAVE_DATA_OPTION == "json" and config.ENABLE_GET_WORDCLOUD:
-        try:
-            file_writer = AsyncFileWriter(
-                platform=config.PLATFORM,
-                crawler_type=crawler_type_var.get()
-            )
-            await file_writer.generate_wordcloud_from_comments()
-        except Exception as e:
-            print(f"Error generating wordcloud: {e}")
+        # Generate wordcloud after crawling is complete
+        # Only for JSON save mode
+        if config.SAVE_DATA_OPTION == "json" and config.ENABLE_GET_WORDCLOUD:
+            try:
+                file_writer = AsyncFileWriter(
+                    platform=config.PLATFORM,
+                    crawler_type=crawler_type_var.get()
+                )
+                await file_writer.generate_wordcloud_from_comments()
+            except Exception as e:
+                print(f"Error generating wordcloud: {e}")
+    finally:
+        # 确保爬虫结束后关闭浏览器
+        if crawler:
+            try:
+                await crawler.close()
+                print(f"[MediaCrawler] 浏览器已关闭")
+            except Exception as e:
+                print(f"[MediaCrawler] 关闭浏览器时出错: {e}")


 def cleanup():
-    if crawler:
-        # asyncio.run(crawler.close())
-        pass
-    if config.SAVE_DATA_OPTION in ["db", "sqlite"]:
-        asyncio.run(db.close())
+    # 注意：crawler.close() 已经在 main() 的 finally 块中调用
+    # 这里只处理数据库关闭
+    if config.SAVE_DATA_OPTION in ["db", "sqlite", "postgresql"]:
+        try:
+            asyncio.run(db.close())
+        except Exception as e:
+            print(f"[MediaCrawler] 关闭数据库连接时出错: {e}")


 if __name__ == "__main__":
@@ -49,8 +49,27 @@ class BilibiliClient(AbstractApiClient):
        self.cookie_dict = cookie_dict

    async def request(self, method, url, **kwargs) -> Any:
-        async with httpx.AsyncClient(proxy=self.proxy) as client:
-            response = await client.request(method, url, timeout=self.timeout, **kwargs)
+        """
+        Basic HTTP request wrapper with retries for transient network errors.
+        """
+        verify = getattr(config, "HTTPX_VERIFY", True)
+        # 优先使用传入 proxy，其次是 config.HTTPX_PROXY，最后走系统环境变量
+        proxy = self.proxy or getattr(config, "HTTPX_PROXY", "") or None
+
+        async with httpx.AsyncClient(proxy=proxy, timeout=self.timeout, verify=verify) as client:
+            # 简单重试，处理短暂的连接失败
+            last_exc: Optional[Exception] = None
+            for attempt in range(3):
+                try:
+                    response = await client.request(method, url, **kwargs)
+                    break
+                except httpx.HTTPError as e:
+                    last_exc = e
+                    if attempt == 2:
+                        # 3rd failure -> give up
+                        utils.logger.error(f"[BilibiliClient.request] Network error on {method} {url}: {repr(e)}")
+                        raise DataFetchError(f"network error: {e}") from e
+                    await asyncio.sleep(1)
        try:
            data: Dict = response.json()
        except json.JSONDecodeError:
@@ -68,10 +68,23 @@ class BilibiliLogin(AbstractLogin):
            return True
        return False

+    async def _has_valid_login_cookie(self) -> bool:
+        """Report whether the browser context already holds a login cookie.
+
+        Reads the cookies of ``self.browser_context`` and returns True when
+        either ``SESSDATA`` or ``DedeUserID`` has a truthy value, which lets
+        the caller avoid a repeated QR-code scan.
+        """
+        context_cookies = await self.browser_context.cookies()
+        _, cookies_by_name = utils.convert_cookies(context_cookies)
+        for cookie_name in ("SESSDATA", "DedeUserID"):
+            if cookies_by_name.get(cookie_name):
+                return True
+        return False
+
    async def login_by_qrcode(self):
        """login bilibili website and keep webdriver login state"""
        utils.logger.info("[BilibiliLogin.login_by_qrcode] Begin login bilibili by qrcode ...")

+        # 如果已经登录则直接跳过扫码流程
+        if await self._has_valid_login_cookie():
+            utils.logger.info("[BilibiliLogin.login_by_qrcode] 已检测到有效登录态，跳过扫码登录")
+            return
+
        # click login button
        login_button_ele = self.context_page.locator(
            "xpath=//div[@class='right-entry__outside go-login-btn']//div"
@@ -95,15 +95,25 @@ class DouYinClient(AbstractApiClient):
        params["a_bogus"] = a_bogus

    async def request(self, method, url, **kwargs):
-        async with httpx.AsyncClient(proxy=self.proxy) as client:
-            response = await client.request(method, url, timeout=self.timeout, **kwargs)
        try:
-            if response.text == "" or response.text == "blocked":
-                utils.logger.error(f"request params incrr, response.text: {response.text}")
-                raise Exception("account blocked")
-            return response.json()
+            async with httpx.AsyncClient(proxy=self.proxy) as client:
+                response = await client.request(method, url, timeout=self.timeout, **kwargs)
+            try:
+                if response.text == "" or response.text == "blocked":
+                    utils.logger.error(f"request params incrr, response.text: {response.text}")
+                    raise Exception("account blocked")
+                return response.json()
+            except Exception as e:
+                raise DataFetchError(f"{e}, {response.text}")
+        except (httpx.ConnectError, httpx.ConnectTimeout, httpx.ReadTimeout, httpx.WriteTimeout) as e:
+            utils.logger.error(f"网络连接错误: {type(e).__name__}: {e}")
+            raise DataFetchError(f"网络连接失败: {type(e).__name__}: {e}")
+        except httpx.TimeoutException as e:
+            utils.logger.error(f"请求超时: {e}")
+            raise DataFetchError(f"请求超时: {e}")
        except Exception as e:
-            raise DataFetchError(f"{e}, {response.text}")
+            utils.logger.error(f"请求异常: {type(e).__name__}: {e}")
+            raise DataFetchError(f"请求失败: {type(e).__name__}: {e}")

    async def get(self, uri: str, params: Optional[Dict] = None, headers: Optional[Dict] = None):
        """
@@ -121,6 +121,8 @@ class DouYinCrawler(AbstractCrawler):
                    utils.logger.info(f"[DouYinCrawler.search] Skip {page}")
                    page += 1
                    continue
+                posts_res = None
+                retry_success = False
                try:
                    utils.logger.info(f"[DouYinCrawler.search] search douyin keyword: {keyword}, page: {page}")
                    posts_res = await self.dy_client.search_info_by_keyword(
@@ -129,11 +131,36 @@ class DouYinCrawler(AbstractCrawler):
                        publish_time=PublishTimeType(config.PUBLISH_TIME_TYPE),
                        search_id=dy_search_id,
                    )
-                    if posts_res.get("data") is None or posts_res.get("data") == []:
-                        utils.logger.info(f"[DouYinCrawler.search] search douyin keyword: {keyword}, page: {page} is empty,{posts_res.get('data')}`")
+                    retry_success = True
+                except DataFetchError as e:
+                    utils.logger.error(f"[DouYinCrawler.search] search douyin keyword: {keyword} failed: {e}")
+                    # 如果是网络连接错误，等待后重试一次
+                    if "网络连接" in str(e) or "ConnectError" in str(e) or "超时" in str(e):
+                        utils.logger.warning(f"[DouYinCrawler.search] 网络错误，等待3秒后重试...")
+                        await asyncio.sleep(3)
+                        try:
+                            posts_res = await self.dy_client.search_info_by_keyword(
+                                keyword=keyword,
+                                offset=page * dy_limit_count - dy_limit_count,
+                                publish_time=PublishTimeType(config.PUBLISH_TIME_TYPE),
+                                search_id=dy_search_id,
+                            )
+                            retry_success = True
+                        except Exception as retry_e:
+                            utils.logger.error(f"[DouYinCrawler.search] 重试失败: {retry_e}")
+                            break
+                    else:
                        break
-                except DataFetchError:
-                    utils.logger.error(f"[DouYinCrawler.search] search douyin keyword: {keyword} failed")
+                except Exception as e:
+                    utils.logger.error(f"[DouYinCrawler.search] search douyin keyword: {keyword} unexpected error: {type(e).__name__}: {e}")
+                    break
+                
+                # 如果请求失败（包括重试失败），跳过后续处理
+                if not retry_success or posts_res is None:
+                    break
+                    
+                if posts_res.get("data") is None or posts_res.get("data") == []:
+                    utils.logger.info(f"[DouYinCrawler.search] search douyin keyword: {keyword}, page: {page} is empty,{posts_res.get('data')}`")
                    break

                page += 1
@@ -362,13 +389,16 @@ class DouYinCrawler(AbstractCrawler):

    async def close(self) -> None:
        """Close browser context"""
-        # 如果使用CDP模式，需要特殊处理
-        if self.cdp_manager:
-            await self.cdp_manager.cleanup()
-            self.cdp_manager = None
-        else:
-            await self.browser_context.close()
-        utils.logger.info("[DouYinCrawler.close] Browser context closed ...")
+        try:
+            # 如果使用CDP模式，需要特殊处理
+            if self.cdp_manager:
+                await self.cdp_manager.cleanup()
+                self.cdp_manager = None
+            elif self.browser_context:
+                await self.browser_context.close()
+            utils.logger.info("[DouYinCrawler.close] Browser context closed ...")
+        except Exception as e:
+            utils.logger.error(f"[DouYinCrawler.close] An error occurred during close: {e}")

    async def get_aweme_media(self, aweme_item: Dict):
        """
@@ -45,13 +45,51 @@ class KuaiShouClient(AbstractApiClient):
        self.graphql = KuaiShouGraphQL()

    async def request(self, method, url, **kwargs) -> Any:
-        async with httpx.AsyncClient(proxy=self.proxy) as client:
-            response = await client.request(method, url, timeout=self.timeout, **kwargs)
-        data: Dict = response.json()
-        if data.get("errors"):
-            raise DataFetchError(data.get("errors", "unkonw error"))
-        else:
-            return data.get("data", {})
+        """Make HTTP request with retry and proxy fallback."""
+        max_retries = 3
+
+        # build proxy attempts: try proxy first (if set), then no-proxy
+        proxy_attempts: List[Optional[str]] = []
+        if self.proxy:
+            proxy_attempts.append(self.proxy)
+        proxy_attempts.append(None)  # always allow a direct attempt
+
+        last_exc: Optional[Exception] = None
+
+        for attempt in range(max_retries):
+            proxy_to_use = proxy_attempts[min(attempt, len(proxy_attempts) - 1)]
+            try:
+                async with httpx.AsyncClient(proxy=proxy_to_use) as client:
+                    response = await client.request(method, url, timeout=self.timeout, **kwargs)
+                data: Dict = response.json()
+                if data.get("errors"):
+                    raise DataFetchError(data.get("errors", "unkonw error"))
+                return data.get("data", {})
+            except (httpx.ConnectError, httpx.ConnectTimeout, httpx.NetworkError) as e:
+                last_exc = e
+                utils.logger.warning(
+                    f"[KuaiShouClient.request] Network error (attempt {attempt+1}/{max_retries}) "
+                    f"proxy={proxy_to_use} url={url} err={e!r}"
+                )
+                if attempt < max_retries - 1:
+                    await asyncio.sleep(1)
+                    continue
+                utils.logger.error(
+                    f"[KuaiShouClient.request] Network failed after {max_retries} attempts "
+                    f"proxy={proxy_to_use} url={url} err={e!r}"
+                )
+                raise
+            except Exception as e:
+                # For other exceptions (like DataFetchError), don't retry
+                last_exc = e
+                utils.logger.error(
+                    f"[KuaiShouClient.request] Request failed proxy={proxy_to_use} url={url} err={e!r}"
+                )
+                raise
+
+        # If somehow we exit the loop without returning, raise last exception
+        if last_exc:
+            raise last_exc

    async def get(self, uri: str, params=None) -> Dict:
        final_uri = uri
@@ -83,7 +83,26 @@ class KuaishouCrawler(AbstractCrawler):


            self.context_page = await self.browser_context.new_page()
-            await self.context_page.goto(f"{self.index_url}?isHome=1")
+            # 添加重试机制处理网络连接错误
+            max_retries = 3
+            retry_count = 0
+            while retry_count < max_retries:
+                try:
+                    await self.context_page.goto(f"{self.index_url}?isHome=1", timeout=30000)
+                    break
+                except Exception as e:
+                    retry_count += 1
+                    error_msg = str(e)
+                    if "ERR_CONNECTION_RESET" in error_msg or "net::" in error_msg or "Connection" in error_msg:
+                        if retry_count < max_retries:
+                            utils.logger.warning(f"[KuaishouCrawler] 网络连接错误，第 {retry_count} 次重试: {e}")
+                            await asyncio.sleep(2 * retry_count)  # 递增等待时间
+                        else:
+                            utils.logger.error(f"[KuaishouCrawler] 网络连接失败，已重试 {max_retries} 次: {e}")
+                            raise
+                    else:
+                        # 非网络错误直接抛出
+                        raise

            # Create a client to interact with the kuaishou website.
            self.ks_client = await self.create_ks_client(httpx_proxy_format)
@@ -426,10 +445,13 @@ class KuaishouCrawler(AbstractCrawler):

    async def close(self):
        """Close browser context"""
-        # 如果使用CDP模式，需要特殊处理
-        if self.cdp_manager:
-            await self.cdp_manager.cleanup()
-            self.cdp_manager = None
-        else:
-            await self.browser_context.close()
-        utils.logger.info("[KuaishouCrawler.close] Browser context closed ...")
+        try:
+            # 如果使用CDP模式，需要特殊处理
+            if self.cdp_manager:
+                await self.cdp_manager.cleanup()
+                self.cdp_manager = None
+            elif self.browser_context:
+                await self.browser_context.close()
+            utils.logger.info("[KuaishouCrawler.close] Browser context closed ...")
+        except Exception as e:
+            utils.logger.error(f"[KuaishouCrawler.close] An error occurred during close: {e}")
@@ -49,6 +49,21 @@ class KuaishouLogin(AbstractLogin):
        else:
            raise ValueError("[KuaishouLogin.begin] Invalid Login Type Currently only supported qrcode or phone or cookie ...")

+    async def _quick_check_login_state(self) -> bool:
+        """Check once, without retrying, whether a login cookie is present.
+
+        Returns:
+            True when the cookies of ``self.browser_context`` contain a truthy
+            ``passToken``; False otherwise, including when reading or
+            converting the cookies raises.
+        """
+        try:
+            context_cookies = await self.browser_context.cookies()
+            _, cookies_by_name = utils.convert_cookies(context_cookies)
+            return bool(cookies_by_name.get("passToken"))
+        except Exception:
+            # Best-effort probe: any failure is reported as "not logged in".
+            return False
+
    @retry(stop=stop_after_attempt(600), wait=wait_fixed(1), retry=retry_if_result(lambda value: value is False))
    async def check_login_state(self) -> bool:
        """
@@ -67,11 +82,47 @@ class KuaishouLogin(AbstractLogin):
        """login kuaishou website and keep webdriver login state"""
        utils.logger.info("[KuaishouLogin.login_by_qrcode] Begin login kuaishou by qrcode ...")

-        # click login button
+        # Check if already logged in (quick check without retry)
+        is_logged_in = await self._quick_check_login_state()
+        if is_logged_in:
+            utils.logger.info("[KuaishouLogin.login_by_qrcode] Already logged in, skipping login button click ...")
+            return
+
+        # Check if login button exists (if not, might already be logged in)
        login_button_ele = self.context_page.locator(
            "xpath=//p[text()='登录']"
        )
-        await login_button_ele.click()
+        
+        try:
+            # Wait for the element to be visible with a shorter timeout
+            await login_button_ele.wait_for(state="visible", timeout=3000)
+            utils.logger.info("[KuaishouLogin.login_by_qrcode] Login button found, attempting to click ...")
+            
+            # Try normal click first
+            await login_button_ele.click(timeout=5000)
+        except Exception as e:
+            # If login button is not found, might already be logged in
+            if "timeout" in str(e).lower() or "waiting for" in str(e).lower():
+                utils.logger.info("[KuaishouLogin.login_by_qrcode] Login button not found, checking if already logged in ...")
+                # Double check login state (quick check)
+                is_logged_in = await self._quick_check_login_state()
+                if is_logged_in:
+                    utils.logger.info("[KuaishouLogin.login_by_qrcode] Already logged in, skipping login ...")
+                    return
+                utils.logger.warning(f"[KuaishouLogin.login_by_qrcode] Login button not found and not logged in: {e}")
+                raise
+            else:
+                utils.logger.warning(f"[KuaishouLogin.login_by_qrcode] Normal click failed: {e}, trying force click...")
+                try:
+                    # If normal click fails, try force click to bypass overlay
+                    await login_button_ele.click(force=True, timeout=5000)
+                except Exception as e2:
+                    utils.logger.warning(f"[KuaishouLogin.login_by_qrcode] Force click failed: {e2}, trying JavaScript click...")
+                    # If force click also fails, use JavaScript to click directly
+                    await login_button_ele.evaluate("element => element.click()")
+        
+        # Wait a moment for the login modal to appear
+        await asyncio.sleep(1)

        # find login qrcode
        qrcode_img_selector = "//div[@class='qrcode-img']//img"
@@ -10,7 +10,7 @@

 import asyncio
 import json
-from typing import Any, Callable, Dict, List, Optional, Union
+from typing import Any, Callable, Dict, List, Optional, Union, Set
 from urllib.parse import urlencode, quote

 import requests
@@ -48,6 +48,8 @@ class BaiduTieBaClient(AbstractApiClient):
        self._page_extractor = TieBaExtractor()
        self.default_ip_proxy = default_ip_proxy
        self.playwright_page = playwright_page  # Playwright页面对象
+        self._last_captcha_check_time = 0  # 上次验证码验证通过的时间戳（time.time()），用于短时间内跳过重复检测
+        self._captcha_verified_recently = False  # 是否最近完成过验证码

    def _sync_request(self, method, url, proxy=None, **kwargs):
        """
@@ -210,6 +212,287 @@ class BaiduTieBaClient(AbstractApiClient):
        self.headers["Cookie"] = cookie_str
        utils.logger.info("[BaiduTieBaClient.update_cookies] Cookie has been updated")

+    async def _wait_for_captcha_completion(self, max_wait_time: int = 300):
+        """
+        检测并等待百度验证码完成（在爬虫过程中使用）
+        等待用户手动拖动验证码，验证成功后自动识别并继续
+        
+        Args:
+            max_wait_time: 最大等待时间（秒），默认300秒
+        """
+        if not self.playwright_page:
+            return
+        
+        import time
+
+        async def _detect_captcha() -> bool:
+            """更全面地检测验证码（包含文本、URL 及常见容器）"""
+            # DOM 选择器
+            selector_hits = [
+                '.tang-pass-slider',
+                '#captcha',
+                '.vcode-img',
+                '.pass-verify',
+                '.tang-pass-verify',
+                '.pass-verify-slider',
+                'div[id*="captcha"]',
+                'div[class*="verify"]',
+                'div[class*="captcha"]',
+                'text=安全验证',
+                'text=请输入验证码',
+                'text=拖动',
+                'text=滑动',
+            ]
+            for selector in selector_hits:
+                try:
+                    element = await self.playwright_page.query_selector(selector)
+                    if element and await element.is_visible():
+                        return True
+                except Exception:
+                    continue
+
+            # URL 关键词
+            url_lower = (self.playwright_page.url or "").lower()
+            if any(key in url_lower for key in ["verify", "captcha", "wappass"]):
+                return True
+
+            # 页面文本关键词（截断以降低开销）
+            try:
+                page_text = (await self.playwright_page.content())[:4000]
+                if any(
+                    kw in page_text
+                    for kw in ["安全验证", "请输入验证码", "完成验证", "滑块", "拖动完成验证"]
+                ):
+                    return True
+            except Exception:
+                pass
+            return False
+
+        # 如果最近5秒内刚完成过验证码，跳过检测（避免重复检测）
+        if self._captcha_verified_recently:
+            time_since_last_check = time.time() - self._last_captcha_check_time
+            if time_since_last_check < 5:
+                utils.logger.debug(
+                    f"[BaiduTieBaClient] 最近 {time_since_last_check:.1f} 秒内完成过验证码，跳过检测"
+                )
+                return
+            else:
+                self._captcha_verified_recently = False
+
+        # 基础选择器（用于后续反复检测）
+        captcha_selectors = [
+            '.tang-pass-slider',
+            '#captcha',
+            '.vcode-img',
+            '.pass-verify',
+            '.tang-pass-verify',
+            '.pass-verify-slider',
+            'div[id*="captcha"]',
+            'div[class*="verify"]',
+            'div[class*="captcha"]',
+        ]
+        success_selectors = [
+            '.tang-pass-success',
+            '.pass-verify-success',
+            'div[class*="success"]',
+        ]
+
+        # 检测验证码是否存在
+        captcha_found = await _detect_captcha()
+        if captcha_found:
+            utils.logger.warning("[BaiduTieBaClient] 🔐 检测到验证码，请手动拖动完成验证...")
+        if not captcha_found:
+            return
+        
+        # 记录当前URL，用于检测页面跳转
+        initial_url = self.playwright_page.url
+        utils.logger.info(f"[BaiduTieBaClient] 当前页面URL: {initial_url}")
+        utils.logger.info(f"[BaiduTieBaClient] ⏳ 等待用户手动完成验证码（最多等待 {max_wait_time} 秒）...")
+        
+        start_time = time.time()
+        last_log_time = 0
+        check_interval = 1  # 检查间隔改为1秒，更快响应
+        
+        while True:
+            # 检查是否超时
+            elapsed_time = time.time() - start_time
+            if elapsed_time >= max_wait_time:
+                utils.logger.warning(
+                    f"[BaiduTieBaClient] ⏰ 等待验证码超时（{max_wait_time}秒），跳过当前百度贴吧爬取任务"
+                )
+                # 超时直接中断本次百度贴吧爬虫，交给上层捕获处理
+                raise TimeoutError(
+                    f"Baidu captcha wait timeout ({max_wait_time}s), skip tieba crawling"
+                )
+            
+            try:
+                # 检测验证成功的标识
+                verification_success = False
+                for selector in success_selectors:
+                    try:
+                        element = await self.playwright_page.query_selector(selector)
+                        if element:
+                            is_visible = await element.is_visible()
+                            if is_visible:
+                                verification_success = True
+                                utils.logger.info(f"[BaiduTieBaClient] ✅ 检测到验证成功标识 (selector: {selector})")
+                                break
+                    except Exception:
+                        continue
+                
+                # 检测验证码是否还存在
+                captcha_still_exists = False
+                for selector in captcha_selectors:
+                    try:
+                        element = await self.playwright_page.query_selector(selector)
+                        if element:
+                            is_visible = await element.is_visible()
+                            if is_visible:
+                                captcha_still_exists = True
+                                break
+                    except Exception:
+                        continue
+                
+                # 检测页面URL是否变化（验证成功后可能会跳转）
+                current_url = self.playwright_page.url
+                url_changed = current_url != initial_url
+                
+                # 判断验证是否成功
+                # 进入确认流程的条件：1. 检测到成功标识 2. 或者验证码消失且URL变化；验证码消失但URL未变化的情况由下方 elif 分支处理
+                if verification_success or (not captcha_still_exists and url_changed):
+                    # 检测到成功标识，或验证码消失且URL变化（可能是验证成功后的跳转）
+                    utils.logger.info("[BaiduTieBaClient] 🔍 验证码已消失，检测到页面变化，等待3秒确认验证完成...")
+                    await asyncio.sleep(3)
+                    
+                    # 再次确认验证码是否真的消失了
+                    captcha_still_exists = False
+                    for selector in captcha_selectors:
+                        try:
+                            element = await self.playwright_page.query_selector(selector)
+                            if element:
+                                is_visible = await element.is_visible()
+                                if is_visible:
+                                    captcha_still_exists = True
+                                    break
+                        except Exception:
+                            continue
+                    
+                    if not captcha_still_exists:
+                        # 确认验证成功
+                        final_url = self.playwright_page.url
+                        utils.logger.info(f"[BaiduTieBaClient] ✅ 验证码验证成功！")
+                        if url_changed:
+                            utils.logger.info(f"[BaiduTieBaClient] 📍 页面已跳转: {initial_url} -> {final_url}")
+                        else:
+                            utils.logger.info(f"[BaiduTieBaClient] 📍 页面URL未变化，验证在当前页面完成")
+                        
+                        # 标记最近完成过验证码，避免立即再次检测
+                        self._captcha_verified_recently = True
+                        import time
+                        self._last_captcha_check_time = time.time()
+                        
+                        # 等待页面稳定，避免立即再次检测验证码
+                        await asyncio.sleep(3)
+                        
+                        # 验证成功后，再次检查是否又出现了验证码（防止跳转到新的验证码页面）
+                        utils.logger.info("[BaiduTieBaClient] 🔍 验证成功后，检查是否又出现验证码...")
+                        await asyncio.sleep(2)
+                        
+                        captcha_reappeared = False
+                        for selector in captcha_selectors:
+                            try:
+                                element = await self.playwright_page.query_selector(selector)
+                                if element:
+                                    is_visible = await element.is_visible()
+                                    if is_visible:
+                                        captcha_reappeared = True
+                                        utils.logger.warning(f"[BaiduTieBaClient] ⚠️  验证成功后检测到新的验证码 (selector: {selector})，继续等待...")
+                                        break
+                            except Exception:
+                                continue
+                        
+                        if not captcha_reappeared:
+                            utils.logger.info("[BaiduTieBaClient] ✅ 确认验证成功，未出现新的验证码，继续执行...")
+                            break
+                        else:
+                            # 如果又出现了验证码，重置状态继续等待
+                            utils.logger.warning("[BaiduTieBaClient] ⚠️  检测到新的验证码，重置等待状态...")
+                            initial_url = self.playwright_page.url
+                            start_time = time.time()
+                            continue
+                    else:
+                        # 验证码又出现了，可能验证失败或页面刷新
+                        utils.logger.warning("[BaiduTieBaClient] ⚠️  验证码重新出现，可能验证失败，继续等待...")
+                elif not captcha_still_exists and not url_changed:
+                    # 验证码消失但URL未变化，可能是验证成功但未跳转
+                    utils.logger.info("[BaiduTieBaClient] 🔍 验证码已消失，等待3秒确认验证完成...")
+                    await asyncio.sleep(3)
+                    
+                    # 再次确认
+                    captcha_still_exists = False
+                    for selector in captcha_selectors:
+                        try:
+                            element = await self.playwright_page.query_selector(selector)
+                            if element:
+                                is_visible = await element.is_visible()
+                                if is_visible:
+                                    captcha_still_exists = True
+                                    break
+                        except Exception:
+                            continue
+                    
+                    if not captcha_still_exists:
+                        utils.logger.info("[BaiduTieBaClient] ✅ 验证码验证成功！")
+                        
+                        # 标记最近完成过验证码
+                        self._captcha_verified_recently = True
+                        import time
+                        self._last_captcha_check_time = time.time()
+                        
+                        # 等待页面稳定
+                        await asyncio.sleep(3)
+                        
+                        # 验证成功后，再次检查是否又出现了验证码
+                        utils.logger.info("[BaiduTieBaClient] 🔍 验证成功后，检查是否又出现验证码...")
+                        await asyncio.sleep(2)
+                        
+                        captcha_reappeared = False
+                        for selector in captcha_selectors:
+                            try:
+                                element = await self.playwright_page.query_selector(selector)
+                                if element:
+                                    is_visible = await element.is_visible()
+                                    if is_visible:
+                                        captcha_reappeared = True
+                                        utils.logger.warning(f"[BaiduTieBaClient] ⚠️  验证成功后检测到新的验证码 (selector: {selector})，继续等待...")
+                                        break
+                            except Exception:
+                                continue
+                        
+                        if not captcha_reappeared:
+                            utils.logger.info("[BaiduTieBaClient] ✅ 确认验证成功，未出现新的验证码，继续执行...")
+                            break
+                        else:
+                            # 如果又出现了验证码，重置状态继续等待
+                            utils.logger.warning("[BaiduTieBaClient] ⚠️  检测到新的验证码，重置等待状态...")
+                            initial_url = self.playwright_page.url
+                            start_time = time.time()
+                            continue
+                
+            except Exception as e:
+                # 如果检测过程中出现异常，继续等待
+                utils.logger.debug(f"[BaiduTieBaClient] 验证码检测异常: {e}")
+            
+            # 等待一段时间后再次检查
+            await asyncio.sleep(check_interval)
+            
+            # 每10秒输出一次提示
+            current_time = int(elapsed_time)
+            if current_time != last_log_time and current_time % 10 == 0 and current_time > 0:
+                remaining_time = max_wait_time - current_time
+                utils.logger.info(f"[BaiduTieBaClient] ⏳ 仍在等待验证码完成...（剩余 {remaining_time} 秒）")
+                last_log_time = current_time
+
    async def get_notes_by_keyword(
        self,
        keyword: str,
@@ -253,6 +536,9 @@ class BaiduTieBaClient(AbstractApiClient):
            # 使用Playwright访问搜索页面
            await self.playwright_page.goto(full_url, wait_until="domcontentloaded")

+            # 检测并等待验证码完成
+            await self._wait_for_captcha_completion()
+
            # 等待页面加载,使用配置文件中的延时设置
            await asyncio.sleep(config.CRAWLER_MAX_SLEEP_SEC)

@@ -290,6 +576,9 @@ class BaiduTieBaClient(AbstractApiClient):
            # 使用Playwright访问帖子详情页面
            await self.playwright_page.goto(note_url, wait_until="domcontentloaded")

+            # 检测并等待验证码完成
+            await self._wait_for_captcha_completion()
+
            # 等待页面加载,使用配置文件中的延时设置
            await asyncio.sleep(config.CRAWLER_MAX_SLEEP_SEC)

@@ -328,6 +617,8 @@ class BaiduTieBaClient(AbstractApiClient):

        result: List[TiebaComment] = []
        current_page = 1
+        seen_comment_ids: Set[str] = set()
+        duplicate_page_count = 0

        while note_detail.total_replay_page >= current_page and len(result) < max_count:
            # 构造评论页URL
@@ -338,6 +629,9 @@ class BaiduTieBaClient(AbstractApiClient):
                # 使用Playwright访问评论页面
                await self.playwright_page.goto(comment_url, wait_until="domcontentloaded")

+                # 检测并等待验证码完成
+                await self._wait_for_captcha_completion()
+
                # 等待页面加载,使用配置文件中的延时设置
                await asyncio.sleep(config.CRAWLER_MAX_SLEEP_SEC)

@@ -353,6 +647,26 @@ class BaiduTieBaClient(AbstractApiClient):
                    utils.logger.info(f"[BaiduTieBaClient.get_note_all_comments] 第{current_page}页没有评论,停止爬取")
                    break

+                if config.ENABLE_COMMENT_DEDUP:
+                    new_comments: List[TiebaComment] = []
+                    for comment in comments:
+                        comment_id = getattr(comment, "comment_id", None)
+                        if comment_id and comment_id not in seen_comment_ids:
+                            seen_comment_ids.add(comment_id)
+                            new_comments.append(comment)
+                    if not new_comments:
+                        duplicate_page_count += 1
+                        utils.logger.info(
+                            f"[BaiduTieBaClient.get_note_all_comments] 第{current_page}页没有出现新的评论(重复数据)，计数={duplicate_page_count}"
+                        )
+                        if duplicate_page_count >= config.COMMENT_DUP_BREAK_THRESHOLD:
+                            utils.logger.info(f"[BaiduTieBaClient.get_note_all_comments] 连续 {duplicate_page_count} 页无新增评论，提前结束抓取")
+                            break
+                        current_page += 1
+                        continue
+                    comments = new_comments
+                    duplicate_page_count = 0
+
                # 限制评论数量
                if len(result) + len(comments) > max_count:
                    comments = comments[:max_count - len(result)]
@@ -408,6 +722,8 @@ class BaiduTieBaClient(AbstractApiClient):

            current_page = 1
            max_sub_page_num = parment_comment.sub_comment_count // 10 + 1
+            seen_sub_ids: Set[str] = set()
+            duplicate_page_count = 0

            while max_sub_page_num >= current_page:
                # 构造子评论URL
@@ -424,6 +740,9 @@ class BaiduTieBaClient(AbstractApiClient):
                    # 使用Playwright访问子评论页面
                    await self.playwright_page.goto(sub_comment_url, wait_until="domcontentloaded")

+                    # 检测并等待验证码完成
+                    await self._wait_for_captcha_completion()
+
                    # 等待页面加载,使用配置文件中的延时设置
                    await asyncio.sleep(config.CRAWLER_MAX_SLEEP_SEC)

@@ -442,6 +761,28 @@ class BaiduTieBaClient(AbstractApiClient):
                        )
                        break

+                    if config.ENABLE_COMMENT_DEDUP:
+                        new_sub_comments: List[TiebaComment] = []
+                        for sub_comment in sub_comments:
+                            sub_comment_id = getattr(sub_comment, "comment_id", None)
+                            if sub_comment_id and sub_comment_id not in seen_sub_ids:
+                                seen_sub_ids.add(sub_comment_id)
+                                new_sub_comments.append(sub_comment)
+                        if not new_sub_comments:
+                            duplicate_page_count += 1
+                            utils.logger.info(
+                                f"[BaiduTieBaClient.get_comments_all_sub_comments] 评论{parment_comment.comment_id}第{current_page}页未出现新子评论，计数={duplicate_page_count}"
+                            )
+                            if duplicate_page_count >= config.COMMENT_DUP_BREAK_THRESHOLD:
+                                utils.logger.info(
+                                    f"[BaiduTieBaClient.get_comments_all_sub_comments] 评论{parment_comment.comment_id}连续 {duplicate_page_count} 页无新增子评论，提前结束"
+                                )
+                                break
+                            current_page += 1
+                            continue
+                        sub_comments = new_sub_comments
+                        duplicate_page_count = 0
+
                    if callback:
                        await callback(parment_comment.note_id, sub_comments)

@@ -481,6 +822,9 @@ class BaiduTieBaClient(AbstractApiClient):
            # 使用Playwright访问贴吧页面
            await self.playwright_page.goto(tieba_url, wait_until="domcontentloaded")

+            # 检测并等待验证码完成
+            await self._wait_for_captcha_completion()
+
            # 等待页面加载,使用配置文件中的延时设置
            await asyncio.sleep(config.CRAWLER_MAX_SLEEP_SEC)

@@ -516,6 +860,9 @@ class BaiduTieBaClient(AbstractApiClient):
            # 使用Playwright访问创作者主页
            await self.playwright_page.goto(creator_url, wait_until="domcontentloaded")

+            # 检测并等待验证码完成
+            await self._wait_for_captcha_completion()
+
            # 等待页面加载,使用配置文件中的延时设置
            await asyncio.sleep(config.CRAWLER_MAX_SLEEP_SEC)

@@ -551,6 +898,9 @@ class BaiduTieBaClient(AbstractApiClient):
            # 使用Playwright访问创作者帖子列表页面
            await self.playwright_page.goto(creator_url, wait_until="domcontentloaded")

+            # 检测并等待验证码完成
+            await self._wait_for_captcha_completion()
+
            # 等待页面加载,使用配置文件中的延时设置
            await asyncio.sleep(config.CRAWLER_MAX_SLEEP_SEC)

@@ -662,10 +662,13 @@ class TieBaCrawler(AbstractCrawler):
        Returns:

        """
-        # 如果使用CDP模式，需要特殊处理
-        if self.cdp_manager:
-            await self.cdp_manager.cleanup()
-            self.cdp_manager = None
-        else:
-            await self.browser_context.close()
-        utils.logger.info("[BaiduTieBaCrawler.close] Browser context closed ...")
+        try:
+            # 如果使用CDP模式，需要特殊处理
+            if self.cdp_manager:
+                await self.cdp_manager.cleanup()
+                self.cdp_manager = None
+            elif self.browser_context:
+                await self.browser_context.close()
+            utils.logger.info("[BaiduTieBaCrawler.close] Browser context closed ...")
+        except Exception as e:
+            utils.logger.error(f"[BaiduTieBaCrawler.close] An error occurred during close: {e}")
@@ -58,11 +58,47 @@ class WeiboClient:
        if enable_return_response:
            return response

-        data: Dict = response.json()
+        # 检查响应状态码
+        if response.status_code != 200:
+            error_msg = f"HTTP {response.status_code}: {response.text[:200]}"
+            utils.logger.error(f"[WeiboClient.request] request {method}:{url} failed with status {response.status_code}")
+            raise DataFetchError(error_msg)
+
+        # 检查响应内容类型
+        content_type = response.headers.get("content-type", "").lower()
+        if "application/json" not in content_type and "text/json" not in content_type:
+            # 可能是HTML响应（如登录页面）
+            response_text = response.text[:500]
+            utils.logger.warning(f"[WeiboClient.request] Unexpected content type: {content_type}, response preview: {response_text}")
+            # 如果看起来像是HTML，可能是需要登录
+            if "<html" in response_text.lower() or "<!doctype" in response_text.lower():
+                raise DataFetchError("Response is HTML, may need to login or cookie expired")
+            raise DataFetchError(f"Unexpected content type: {content_type}")
+
+        # 安全地解析JSON
+        try:
+            data: Dict = response.json()
+        except ValueError as e:
+            # JSON解析失败
+            response_text = response.text[:500]
+            utils.logger.error(f"[WeiboClient.request] JSON decode error for {method}:{url}")
+            utils.logger.error(f"[WeiboClient.request] Response text (first 500 chars): {response_text}")
+            raise DataFetchError(f"Failed to parse JSON response: {e}")
+
+        # 检查响应是否为空
+        if not data:
+            utils.logger.warning(f"[WeiboClient.request] Empty response for {method}:{url}")
+            return {"cards": []}
+
        ok_code = data.get("ok")
        if ok_code == 0:  # response error
+            msg = data.get("msg", "response error")
+            # "这里还没有内容" 是正常情况，表示没有更多数据，不应该抛出异常
+            if msg == "这里还没有内容" or "还没有内容" in msg:
+                utils.logger.info(f"[WeiboClient.request] No more content available: {msg}")
+                return {"cards": []}  # 返回空结果，而不是抛出异常
            utils.logger.error(f"[WeiboClient.request] request {method}:{url} err, res:{data}")
-            raise DataFetchError(data.get("msg", "response error"))
+            raise DataFetchError(msg)
        elif ok_code != 1:  # unknown error
            utils.logger.error(f"[WeiboClient.request] request {method}:{url} err, res:{data}")
            raise DataFetchError(data.get("msg", "unknown error"))
@@ -15,6 +15,7 @@

 import asyncio
 import os
+import re
 # import random  # Removed as we now use fixed config.CRAWLER_MAX_SLEEP_SEC intervals
 from asyncio import Task
 from typing import Dict, List, Optional, Tuple
@@ -56,6 +57,17 @@ class WeiboCrawler(AbstractCrawler):
        self.cdp_manager = None

    async def start(self):
+        # 初始化数据库表（如果需要）
+        if config.SAVE_DATA_OPTION in ["db", "sqlite", "postgresql"]:
+            try:
+                from database.db_session import create_tables
+                utils.logger.info(f"[WeiboCrawler.start] Initializing database tables for {config.SAVE_DATA_OPTION}...")
+                await create_tables(config.SAVE_DATA_OPTION)
+                utils.logger.info(f"[WeiboCrawler.start] Database tables initialized successfully")
+            except Exception as e:
+                utils.logger.error(f"[WeiboCrawler.start] Failed to initialize database tables: {e}", exc_info=True)
+                raise
+        
        playwright_proxy_format, httpx_proxy_format = None, None
        if config.ENABLE_IP_PROXY:
            ip_proxy_pool = await create_ip_pool(config.IP_PROXY_POOL_COUNT, enable_validate_ip=True)
@@ -151,16 +163,39 @@ class WeiboCrawler(AbstractCrawler):
                    page += 1
                    continue
                utils.logger.info(f"[WeiboCrawler.search] search weibo keyword: {keyword}, page: {page}")
-                search_res = await self.wb_client.get_note_by_keyword(keyword=keyword, page=page, search_type=search_type)
+                try:
+                    search_res = await self.wb_client.get_note_by_keyword(keyword=keyword, page=page, search_type=search_type)
+                    cards = search_res.get("cards", [])
+                    utils.logger.info(f"[WeiboCrawler.search] Received {len(cards)} cards from search API")
+                    
+                    # 如果没有更多内容，跳出循环
+                    if len(cards) == 0:
+                        utils.logger.info(f"[WeiboCrawler.search] No more content for keyword '{keyword}', stopping pagination")
+                        break
+                except DataFetchError as e:
+                    # 如果是"没有内容"的错误，正常结束
+                    if "还没有内容" in str(e) or "没有内容" in str(e):
+                        utils.logger.info(f"[WeiboCrawler.search] No more content for keyword '{keyword}': {e}")
+                        break
+                    # 其他错误继续抛出
+                    raise
+                
                note_id_list: List[str] = []
-                note_list = filter_search_result_card(search_res.get("cards"))
+                note_list = filter_search_result_card(cards)
+                utils.logger.info(f"[WeiboCrawler.search] Filtered to {len(note_list)} notes (card_type=9)")
+                
                for note_item in note_list:
                    if note_item:
                        mblog: Dict = note_item.get("mblog")
                        if mblog:
-                            note_id_list.append(mblog.get("id"))
-                            await weibo_store.update_weibo_note(note_item)
-                            await self.get_note_images(mblog)
+                            note_id = mblog.get("id")
+                            note_id_list.append(note_id)
+                            try:
+                                await weibo_store.update_weibo_note(note_item)
+                                await self.get_note_images(mblog)
+                            except Exception as e:
+                                utils.logger.error(f"[WeiboCrawler.search] Failed to save note {note_id}: {e}", exc_info=True)
+                                # 继续处理其他笔记，不中断整个流程

                page += 1
                
@@ -383,10 +418,13 @@ class WeiboCrawler(AbstractCrawler):

    async def close(self):
        """Close browser context"""
-        # 如果使用CDP模式，需要特殊处理
-        if self.cdp_manager:
-            await self.cdp_manager.cleanup()
-            self.cdp_manager = None
-        else:
-            await self.browser_context.close()
-        utils.logger.info("[WeiboCrawler.close] Browser context closed ...")
+        try:
+            # 如果使用CDP模式，需要特殊处理
+            if self.cdp_manager:
+                await self.cdp_manager.cleanup()
+                self.cdp_manager = None
+            elif self.browser_context:
+                await self.browser_context.close()
+            utils.logger.info("[WeiboCrawler.close] Browser context closed ...")
+        except Exception as e:
+            utils.logger.error(f"[WeiboCrawler.close] An error occurred during close: {e}")
@@ -17,6 +17,7 @@ from urllib.parse import urlencode
 import httpx
 from playwright.async_api import BrowserContext, Page
 from tenacity import retry, stop_after_attempt, wait_fixed
+from xhshow import Xhshow

 import config
 from base.base_crawler import AbstractApiClient
@@ -27,7 +28,6 @@ from .exception import DataFetchError, IPBlockError
 from .field import SearchNoteType, SearchSortType
 from .help import get_search_id, sign
 from .extractor import XiaoHongShuExtractor
-from .secsign import seccore_signv2_playwright


 class XiaoHongShuClient(AbstractApiClient):
@@ -53,24 +53,51 @@ class XiaoHongShuClient(AbstractApiClient):
        self.playwright_page = playwright_page
        self.cookie_dict = cookie_dict
        self._extractor = XiaoHongShuExtractor()
+        # 初始化 xhshow 客户端用于签名生成
+        self._xhshow_client = Xhshow()

    async def _pre_headers(self, url: str, data=None) -> Dict:
        """
-        请求头参数签名
+        请求头参数签名，使用 xhshow 库生成签名
        Args:
-            url:
-            data:
+            url: 完整的 URI（GET 请求包含查询参数）
+            data: POST 请求的请求体数据

        Returns:

        """
-        x_s = await seccore_signv2_playwright(self.playwright_page, url, data)
-        local_storage = await self.playwright_page.evaluate("() => window.localStorage")
+        # 获取 a1 cookie 值
+        a1_value = self.cookie_dict.get("a1", "")
+
+        # 根据请求类型使用不同的签名方法
+        if data is None:
+            # GET 请求：从 url 中提取参数
+            from urllib.parse import urlparse, parse_qs
+            parsed = urlparse(url)
+            params = {k: v[0] if len(v) == 1 else v for k, v in parse_qs(parsed.query).items()}
+            # 使用完整的 URL（包含 host）
+            full_url = f"{self._host}{url}"
+            x_s = self._xhshow_client.sign_xs_get(uri=full_url, a1_value=a1_value, params=params)
+        else:
+            # POST 请求：使用 data 作为 payload
+            full_url = f"{self._host}{url}"
+            x_s = self._xhshow_client.sign_xs_post(uri=full_url, a1_value=a1_value, payload=data)
+
+        # 尝试获取 b1 值（从 localStorage），如果获取失败则使用空字符串
+        b1_value = ""
+        try:
+            if self.playwright_page:
+                local_storage = await self.playwright_page.evaluate("() => window.localStorage")
+                b1_value = local_storage.get("b1", "")
+        except Exception as e:
+            utils.logger.warning(f"[XiaoHongShuClient._pre_headers] Failed to get b1 from localStorage: {e}, using empty string")
+
+        # 使用 sign 函数生成其他签名头
        signs = sign(
-            a1=self.cookie_dict.get("a1", ""),
-            b1=local_storage.get("b1", ""),
+            a1=a1_value,
+            b1=b1_value,
            x_s=x_s,
-            x_t=str(int(time.time())),
+            x_t=str(int(time.time() * 1000)),  # x-t 使用毫秒时间戳
        )

        headers = {
@@ -115,7 +142,8 @@ class XiaoHongShuClient(AbstractApiClient):
        elif data["code"] == self.IP_ERROR_CODE:
            raise IPBlockError(self.IP_ERROR_STR)
        else:
-            raise DataFetchError(data.get("msg", None))
+            err_msg = data.get("msg", None) or f"{response.text}"
+            raise DataFetchError(err_msg)

    async def get(self, uri: str, params=None) -> Dict:
        """
@@ -480,6 +508,8 @@ class XiaoHongShuClient(AbstractApiClient):
        creator: str,
        cursor: str,
        page_size: int = 30,
+        xsec_token: str = "",
+        xsec_source: str = "pc_feed",
    ) -> Dict:
        """
        获取博主的笔记
@@ -487,24 +517,22 @@ class XiaoHongShuClient(AbstractApiClient):
            creator: 博主ID
            cursor: 上一页最后一条笔记的ID
            page_size: 分页数据长度
+            xsec_token: 验证token
+            xsec_source: 渠道来源

        Returns:

        """
-        uri = "/api/sns/web/v1/user_posted"
-        data = {
-            "user_id": creator,
-            "cursor": cursor,
-            "num": page_size,
-            "image_formats": "jpg,webp,avif",
-        }
-        return await self.get(uri, data)
+        uri = f"/api/sns/web/v1/user_posted?num={page_size}&cursor={cursor}&user_id={creator}&xsec_token={xsec_token}&xsec_source={xsec_source}"
+        return await self.get(uri)

    async def get_all_notes_by_creator(
        self,
        user_id: str,
        crawl_interval: float = 1.0,
        callback: Optional[Callable] = None,
+        xsec_token: str = "",
+        xsec_source: str = "pc_feed",
    ) -> List[Dict]:
        """
        获取指定用户下的所有发过的帖子，该方法会一直查找一个用户下的所有帖子信息
@@ -512,6 +540,8 @@ class XiaoHongShuClient(AbstractApiClient):
            user_id: 用户ID
            crawl_interval: 爬取一次的延迟单位（秒）
            callback: 一次分页爬取结束后的更新回调函数
+            xsec_token: 验证token
+            xsec_source: 渠道来源

        Returns:

@@ -520,7 +550,7 @@ class XiaoHongShuClient(AbstractApiClient):
        notes_has_more = True
        notes_cursor = ""
        while notes_has_more and len(result) < config.CRAWLER_MAX_NOTES_COUNT:
-            notes_res = await self.get_notes_by_creator(user_id, notes_cursor)
+            notes_res = await self.get_notes_by_creator(user_id, notes_cursor, xsec_token=xsec_token, xsec_source=xsec_source)
            if not notes_res:
                utils.logger.error(
                    f"[XiaoHongShuClient.get_notes_by_creator] The current creator may have been banned by xhs, so they cannot access the data."
@@ -201,6 +201,8 @@ class XiaoHongShuCrawler(AbstractCrawler):
                user_id=user_id,
                crawl_interval=crawl_interval,
                callback=self.fetch_creator_notes_detail,
+                xsec_token=creator_info.xsec_token,
+                xsec_source=creator_info.xsec_source,
            )

            note_ids = []
@@ -279,12 +281,19 @@ class XiaoHongShuCrawler(AbstractCrawler):
            Dict: note detail
        """
        note_detail = None
+        utils.logger.info(f"[get_note_detail_async_task] Begin get note detail, note_id: {note_id}")
        async with semaphore:
            try:
-                utils.logger.info(f"[get_note_detail_async_task] Begin get note detail, note_id: {note_id}")
-                note_detail = await self.xhs_client.get_note_by_id_from_html(note_id, xsec_source, xsec_token, enable_cookie=True)
+                try:
+                    note_detail = await self.xhs_client.get_note_by_id(note_id, xsec_source, xsec_token)
+                except RetryError:
+                    pass
+
                if not note_detail:
-                    raise Exception(f"[get_note_detail_async_task] Failed to get note detail, Id: {note_id}")
+                    note_detail = await self.xhs_client.get_note_by_id_from_html(note_id, xsec_source, xsec_token,
+                                                                                 enable_cookie=True)
+                    if not note_detail:
+                        raise Exception(f"[get_note_detail_async_task] Failed to get note detail, Id: {note_id}")

                note_detail.update({"xsec_token": xsec_token, "xsec_source": xsec_source})
                
@@ -428,13 +437,16 @@ class XiaoHongShuCrawler(AbstractCrawler):

    async def close(self):
        """Close browser context"""
-        # 如果使用CDP模式，需要特殊处理
-        if self.cdp_manager:
-            await self.cdp_manager.cleanup()
-            self.cdp_manager = None
-        else:
-            await self.browser_context.close()
-        utils.logger.info("[XiaoHongShuCrawler.close] Browser context closed ...")
+        try:
+            # 如果使用CDP模式，需要特殊处理
+            if self.cdp_manager:
+                await self.cdp_manager.cleanup()
+                self.cdp_manager = None
+            elif self.browser_context:
+                await self.browser_context.close()
+            utils.logger.info("[XiaoHongShuCrawler.close] Browser context closed ...")
+        except Exception as e:
+            utils.logger.error(f"[XiaoHongShuCrawler.close] An error occurred during close: {e}")

    async def get_notice_media(self, note_detail: Dict):
        if not config.ENABLE_GET_MEIDAS:
@@ -34,7 +34,7 @@ class ZhiHuClient(AbstractApiClient):

    def __init__(
        self,
-        timeout=10,
+        timeout=30,  # 增加超时时间到30秒，避免请求卡住
        proxy=None,
        *,
        headers: Dict[str, str],
@@ -57,7 +57,8 @@ class ZhiHuClient(AbstractApiClient):
        """
        d_c0 = self.cookie_dict.get("d_c0")
        if not d_c0:
-            raise Exception("d_c0 not found in cookies")
+            utils.logger.error(f"[ZhiHuClient._pre_headers] d_c0 not found in cookies. Available cookies: {list(self.cookie_dict.keys())}")
+            raise Exception("d_c0 not found in cookies. Please make sure you have logged in and cookies are updated.")
        sign_res = sign(url, self.default_headers["cookie"])
        headers = self.default_headers.copy()
        headers['x-zst-81'] = sign_res["x-zst-81"]
@@ -184,6 +185,7 @@ class ZhiHuClient(AbstractApiClient):
        Returns:

        """
+        utils.logger.info(f"[ZhiHuClient.get_note_by_keyword] 开始搜索关键词: {keyword}, 页码: {page}")
        uri = "/api/v4/search_v3"
        params = {
            "gk_version": "gz-gaokao",
@@ -200,9 +202,16 @@ class ZhiHuClient(AbstractApiClient):
            "sort": sort.value,
            "vertical": note_type.value,
        }
-        search_res = await self.get(uri, params)
-        utils.logger.info(f"[ZhiHuClient.get_note_by_keyword] Search result: {search_res}")
-        return self._extractor.extract_contents_from_search(search_res)
+        try:
+            utils.logger.debug(f"[ZhiHuClient.get_note_by_keyword] 发送搜索请求: {uri}, params: {params}")
+            search_res = await self.get(uri, params)
+            utils.logger.info(f"[ZhiHuClient.get_note_by_keyword] 搜索请求成功，开始解析结果")
+            contents = self._extractor.extract_contents_from_search(search_res)
+            utils.logger.info(f"[ZhiHuClient.get_note_by_keyword] 解析完成，找到 {len(contents)} 条内容")
+            return contents
+        except Exception as e:
+            utils.logger.error(f"[ZhiHuClient.get_note_by_keyword] 搜索失败: {e}", exc_info=True)
+            raise

    async def get_root_comments(
        self,
@@ -90,7 +90,9 @@ class ZhihuCrawler(AbstractCrawler):
                await self.browser_context.add_init_script(path="libs/stealth.min.js")

            self.context_page = await self.browser_context.new_page()
-            await self.context_page.goto(self.index_url, wait_until="domcontentloaded")
+            # 设置页面超时时间为30秒
+            self.context_page.set_default_timeout(30000)
+            await self.context_page.goto(self.index_url, wait_until="domcontentloaded", timeout=30000)

            # Create a client to interact with the zhihu website.
            self.zhihu_client = await self.create_zhihu_client(httpx_proxy_format)
@@ -103,43 +105,106 @@ class ZhihuCrawler(AbstractCrawler):
                    cookie_str=config.COOKIES,
                )
                await login_obj.begin()
+                # 登录后等待页面稳定
+                await asyncio.sleep(2)
                await self.zhihu_client.update_cookies(
                    browser_context=self.browser_context
                )

            # 知乎的搜索接口需要打开搜索页面之后cookies才能访问API，单独的首页不行
+            # 使用用户配置的第一个关键词，如果没有关键词则使用默认的"test"
+            search_keyword = "test"  # 默认关键词
+            if config.KEYWORDS and config.KEYWORDS.strip():
+                keywords_list = [k.strip() for k in config.KEYWORDS.split(",") if k.strip()]
+                if keywords_list:
+                    search_keyword = keywords_list[0]
+                    utils.logger.info(f"[ZhihuCrawler.start] 使用用户关键词 '{search_keyword}' 初始化搜索页面")
+                else:
+                    utils.logger.warning(f"[ZhihuCrawler.start] 关键词配置为空，使用默认关键词 'test'")
+            else:
+                utils.logger.warning(f"[ZhihuCrawler.start] 未配置关键词，使用默认关键词 'test'")
+            
            utils.logger.info(
-                "[ZhihuCrawler.start] Zhihu跳转到搜索页面获取搜索页面的Cookies，该过程需要5秒左右"
+                f"[ZhihuCrawler.start] ========== 准备跳转到搜索页面获取Cookies =========="
            )
-            await self.context_page.goto(
-                f"{self.index_url}/search?q=python&search_source=Guess&utm_content=search_hot&type=content"
+            utils.logger.info(
+                f"[ZhihuCrawler.start] 关键词: {search_keyword}"
            )
-            await asyncio.sleep(5)
-            await self.zhihu_client.update_cookies(browser_context=self.browser_context)
+            try:
+                # 使用用户配置的关键词，而不是硬编码的python
+                from urllib.parse import quote
+                encoded_keyword = quote(search_keyword)
+                search_url = f"{self.index_url}/search?q={encoded_keyword}&search_source=Guess&utm_content=search_hot&type=content"
+                utils.logger.info(f"[ZhihuCrawler.start] 跳转到搜索页面: {search_url}")
+                
+                # 添加超时时间，避免卡住
+                await self.context_page.goto(
+                    search_url,
+                    wait_until="domcontentloaded",
+                    timeout=30000
+                )
+                utils.logger.info("[ZhihuCrawler.start] 页面跳转完成，等待页面稳定...")
+                # 等待页面基本加载完成，不等待networkidle（知乎页面可能一直有请求）
+                await asyncio.sleep(3)
+                utils.logger.info("[ZhihuCrawler.start] 搜索页面已加载，开始更新cookies")
+                await self.zhihu_client.update_cookies(browser_context=self.browser_context)
+                utils.logger.info("[ZhihuCrawler.start] ========== Cookies更新完成 ==========")
+            except Exception as e:
+                utils.logger.error(f"[ZhihuCrawler.start] 跳转到搜索页面失败: {e}，尝试继续执行", exc_info=True)
+                # 即使跳转失败，也尝试更新cookies
+                try:
+                    await self.zhihu_client.update_cookies(browser_context=self.browser_context)
+                    utils.logger.info("[ZhihuCrawler.start] Cookies更新完成（跳转失败后）")
+                except Exception as cookie_error:
+                    utils.logger.error(f"[ZhihuCrawler.start] 更新cookies失败: {cookie_error}", exc_info=True)

            crawler_type_var.set(config.CRAWLER_TYPE)
+            utils.logger.info(f"[ZhihuCrawler.start] ========== 开始执行爬取任务 ==========")
+            utils.logger.info(f"[ZhihuCrawler.start] 爬取类型: {config.CRAWLER_TYPE}")
+            
            if config.CRAWLER_TYPE == "search":
                # Search for notes and retrieve their comment information.
+                utils.logger.info("[ZhihuCrawler.start] 准备开始搜索关键词")
                await self.search()
            elif config.CRAWLER_TYPE == "detail":
                # Get the information and comments of the specified post
+                utils.logger.info("[ZhihuCrawler.start] 准备开始获取指定帖子详情")
                await self.get_specified_notes()
            elif config.CRAWLER_TYPE == "creator":
                # Get creator's information and their notes and comments
+                utils.logger.info("[ZhihuCrawler.start] 准备开始获取创作者信息")
                await self.get_creators_and_notes()
            else:
-                pass
+                utils.logger.warning(f"[ZhihuCrawler.start] 未知的爬取类型: {config.CRAWLER_TYPE}")

            utils.logger.info("[ZhihuCrawler.start] Zhihu Crawler finished ...")

    async def search(self) -> None:
        """Search for notes and retrieve their comment information."""
-        utils.logger.info("[ZhihuCrawler.search] Begin search zhihu keywords")
+        utils.logger.info("[ZhihuCrawler.search] ========== 开始搜索知乎关键词 ==========")
        zhihu_limit_count = 20  # zhihu limit page fixed value
        if config.CRAWLER_MAX_NOTES_COUNT < zhihu_limit_count:
            config.CRAWLER_MAX_NOTES_COUNT = zhihu_limit_count
        start_page = config.START_PAGE
-        for keyword in config.KEYWORDS.split(","):
+        
+        # 统计信息
+        total_saved_contents = 0
+        total_failed_contents = 0
+        total_saved_comments = 0
+        
+        # 安全地处理关键词列表
+        if not config.KEYWORDS or not config.KEYWORDS.strip():
+            utils.logger.error("[ZhihuCrawler.search] 关键词配置为空，无法执行搜索任务")
+            return
+        
+        keywords_list = [k.strip() for k in config.KEYWORDS.split(",") if k.strip()]
+        if not keywords_list:
+            utils.logger.error("[ZhihuCrawler.search] 关键词列表为空，无法执行搜索任务")
+            return
+        
+        utils.logger.info(f"[ZhihuCrawler.search] 关键词列表: {keywords_list}, 共 {len(keywords_list)} 个关键词")
+        
+        for keyword in keywords_list:
            source_keyword_var.set(keyword)
            utils.logger.info(
                f"[ZhihuCrawler.search] Current search keyword: {keyword}"
@@ -164,7 +229,7 @@ class ZhihuCrawler(AbstractCrawler):
                        )
                    )
                    utils.logger.info(
-                        f"[ZhihuCrawler.search] Search contents :{content_list}"
+                        f"[ZhihuCrawler.search] Search contents :{len(content_list)} 条"
                    )
                    if not content_list:
                        utils.logger.info("No more content!")
@@ -175,13 +240,41 @@ class ZhihuCrawler(AbstractCrawler):
                    utils.logger.info(f"[ZhihuCrawler.search] Sleeping for {config.CRAWLER_MAX_SLEEP_SEC} seconds after page {page-1}")
                    
                    page += 1
+                    # 保存内容，添加异常处理和统计
+                    saved_count = 0
+                    failed_count = 0
                    for content in content_list:
-                        await zhihu_store.update_zhihu_content(content)
+                        try:
+                            await zhihu_store.update_zhihu_content(content)
+                            saved_count += 1
+                        except Exception as e:
+                            failed_count += 1
+                            utils.logger.error(
+                                f"[ZhihuCrawler.search] 保存内容失败 (content_id={content.content_id}): {e}"
+                            )
+                    
+                    if saved_count > 0:
+                        utils.logger.info(
+                            f"[ZhihuCrawler.search] 关键词 '{keyword}' 第 {page-1} 页: 成功保存 {saved_count} 条内容"
+                        )
+                        total_saved_contents += saved_count
+                    if failed_count > 0:
+                        utils.logger.warning(
+                            f"[ZhihuCrawler.search] 关键词 '{keyword}' 第 {page-1} 页: 保存失败 {failed_count} 条内容"
+                        )
+                        total_failed_contents += failed_count

                    await self.batch_get_content_comments(content_list)
                except DataFetchError:
                    utils.logger.error("[ZhihuCrawler.search] Search content error")
                    return
+        
+        # 输出最终统计信息
+        utils.logger.info(
+            f"[ZhihuCrawler.search] 关键词搜索完成统计: "
+            f"成功保存 {total_saved_contents} 条内容, "
+            f"失败 {total_failed_contents} 条内容"
+        )

    async def batch_get_content_comments(self, content_list: List[ZhihuContent]):
        """
@@ -386,6 +479,18 @@ class ZhihuCrawler(AbstractCrawler):
        cookie_str, cookie_dict = utils.convert_cookies(
            await self.browser_context.cookies()
        )
+        
+        # 获取用户配置的关键词用于 referer，如果没有则使用默认值
+        referer_keyword = "test"
+        if config.KEYWORDS and config.KEYWORDS.strip():
+            keywords_list = [k.strip() for k in config.KEYWORDS.split(",") if k.strip()]
+            if keywords_list:
+                referer_keyword = keywords_list[0]
+        
+        from urllib.parse import quote
+        encoded_referer_keyword = quote(referer_keyword)
+        referer_url = f"https://www.zhihu.com/search?q={encoded_referer_keyword}&time_interval=a_year&type=content"
+        
        zhihu_client_obj = ZhiHuClient(
            proxy=httpx_proxy,
            headers={
@@ -393,7 +498,7 @@ class ZhihuCrawler(AbstractCrawler):
                "accept-language": "zh-CN,zh;q=0.9",
                "cookie": cookie_str,
                "priority": "u=1, i",
-                "referer": "https://www.zhihu.com/search?q=python&time_interval=a_year&type=content",
+                "referer": referer_url,
                "user-agent": self.user_agent,
                "x-api-version": "3.0.91",
                "x-app-za": "OS=Web",
@@ -473,10 +578,13 @@ class ZhihuCrawler(AbstractCrawler):

    async def close(self):
        """Close browser context"""
-        # 如果使用CDP模式，需要特殊处理
-        if self.cdp_manager:
-            await self.cdp_manager.cleanup()
-            self.cdp_manager = None
-        else:
-            await self.browser_context.close()
-        utils.logger.info("[ZhihuCrawler.close] Browser context closed ...")
+        try:
+            # 如果使用CDP模式，需要特殊处理
+            if self.cdp_manager:
+                await self.cdp_manager.cleanup()
+                self.cdp_manager = None
+            elif self.browser_context:
+                await self.browser_context.close()
+            utils.logger.info("[ZhihuCrawler.close] Browser context closed ...")
+        except Exception as e:
+            utils.logger.error(f"[ZhihuCrawler.close] An error occurred during close: {e}")
@@ -24,3 +24,4 @@ cryptography>=45.0.7
 alembic>=1.16.5
 asyncmy>=0.2.10
 sqlalchemy>=2.0.43
+xhshow>=0.1.3
@@ -119,6 +119,32 @@ class BiliDbStoreImplement(AbstractStore):
            content_item: content item dict
        """
        video_id = content_item.get("video_id")
+        if not video_id:
+            return
+        
+        # 关键词过滤：仅在落库时进行，仅对主贴/视频过滤，不过滤评论
+        # 支持精确匹配和模糊匹配两种模式
+        try:
+            import sys
+            from pathlib import Path
+            project_root = Path(__file__).resolve().parents[4]
+            if str(project_root) not in sys.path:
+                sys.path.insert(0, str(project_root))
+            from config import settings
+            
+            title = content_item.get("title", "")
+            desc = content_item.get("desc", "")
+            content_text = title + " " + desc
+            strict_keywords = getattr(settings, 'STRICT_KEYWORDS', None)
+            fuzzy_keywords = getattr(settings, 'FUZZY_KEYWORDS', None)
+            
+            if strict_keywords or fuzzy_keywords:
+                if not utils.check_keyword_match_with_modes(content_text, strict_keywords, fuzzy_keywords):
+                    utils.logger.warning(f"[BilibiliDbStoreImplement.store_content] ❌ Filtered video {video_id} - content does not match any keyword")
+                    return
+        except Exception as e:
+            utils.logger.debug(f"[BilibiliDbStoreImplement.store_content] Failed to load keyword config: {e}")
+        
        # 确保 video_id 为整数类型，匹配数据库 BigInteger 字段
        if video_id is not None:
            video_id = int(video_id) if not isinstance(video_id, int) else video_id
@@ -88,6 +88,30 @@ class DouyinDbStoreImplement(AbstractStore):
            content_item: content item dict
        """
        aweme_id = content_item.get("aweme_id")
+        if not aweme_id:
+            return
+        
+        # 关键词过滤：仅在落库时进行，仅对主贴/视频过滤，不过滤评论
+        # 支持精确匹配和模糊匹配两种模式
+        try:
+            import sys
+            from pathlib import Path
+            project_root = Path(__file__).resolve().parents[4]
+            if str(project_root) not in sys.path:
+                sys.path.insert(0, str(project_root))
+            from config import settings
+            
+            desc = content_item.get("desc", "")
+            strict_keywords = getattr(settings, 'STRICT_KEYWORDS', None)
+            fuzzy_keywords = getattr(settings, 'FUZZY_KEYWORDS', None)
+            
+            if strict_keywords or fuzzy_keywords:
+                if not utils.check_keyword_match_with_modes(desc, strict_keywords, fuzzy_keywords):
+                    utils.logger.warning(f"[DouyinDbStoreImplement.store_content] ❌ Filtered aweme {aweme_id} - content does not match any keyword")
+                    return
+        except Exception as e:
+            utils.logger.debug(f"[DouyinDbStoreImplement.store_content] Failed to load keyword config: {e}")
+        
        async with get_session() as session:
            result = await session.execute(select(DouyinAweme).where(DouyinAweme.aweme_id == aweme_id))
            aweme_detail = result.scalar_one_or_none()
@@ -89,6 +89,30 @@ class KuaishouDbStoreImplement(AbstractStore):
            content_item: content item dict
        """
        video_id = content_item.get("video_id")
+        if not video_id:
+            return
+        
+        # 关键词过滤：仅在落库时进行，仅对主贴/视频过滤，不过滤评论
+        # 支持精确匹配和模糊匹配两种模式
+        try:
+            import sys
+            from pathlib import Path
+            project_root = Path(__file__).resolve().parents[4]
+            if str(project_root) not in sys.path:
+                sys.path.insert(0, str(project_root))
+            from config import settings
+            
+            caption = content_item.get("caption", "")
+            strict_keywords = getattr(settings, 'STRICT_KEYWORDS', None)
+            fuzzy_keywords = getattr(settings, 'FUZZY_KEYWORDS', None)
+            
+            if strict_keywords or fuzzy_keywords:
+                if not utils.check_keyword_match_with_modes(caption, strict_keywords, fuzzy_keywords):
+                    utils.logger.warning(f"[KuaishouDbStoreImplement.store_content] ❌ Filtered video {video_id} - content does not match any keyword")
+                    return
+        except Exception as e:
+            utils.logger.debug(f"[KuaishouDbStoreImplement.store_content] Failed to load keyword config: {e}")
+        
        async with get_session() as session:
            result = await session.execute(select(KuaishouVideo).where(KuaishouVideo.video_id == video_id))
            video_detail = result.scalar_one_or_none()
@@ -95,6 +95,32 @@ class TieBaDbStoreImplement(AbstractStore):
            content_item: content item dict
        """
        note_id = content_item.get("note_id")
+        if not note_id:
+            return
+        
+        # 关键词过滤：仅在落库时进行，仅对主贴/视频过滤，不过滤评论
+        # 支持精确匹配和模糊匹配两种模式
+        try:
+            import sys
+            from pathlib import Path
+            project_root = Path(__file__).resolve().parents[4]
+            if str(project_root) not in sys.path:
+                sys.path.insert(0, str(project_root))
+            from config import settings
+            
+            title = content_item.get("title", "")
+            text = content_item.get("text", "")
+            content_text = title + " " + text
+            strict_keywords = getattr(settings, 'STRICT_KEYWORDS', None)
+            fuzzy_keywords = getattr(settings, 'FUZZY_KEYWORDS', None)
+            
+            if strict_keywords or fuzzy_keywords:
+                if not utils.check_keyword_match_with_modes(content_text, strict_keywords, fuzzy_keywords):
+                    utils.logger.warning(f"[TiebaDbStoreImplement.store_content] ❌ Filtered note {note_id} - content does not match any keyword")
+                    return
+        except Exception as e:
+            utils.logger.debug(f"[TiebaDbStoreImplement.store_content] Failed to load keyword config: {e}")
+        
        async with get_session() as session:
            stmt = select(TiebaNote).where(TiebaNote.note_id == note_id)
            res = await session.execute(stmt)
@@ -93,7 +93,12 @@ async def update_weibo_note(note_item: Dict):
        "source_keyword": source_keyword_var.get(),
    }
    utils.logger.info(f"[store.weibo.update_weibo_note] weibo note id:{note_id}, title:{save_content_item.get('content')[:24]} ...")
-    await WeibostoreFactory.create_store().store_content(content_item=save_content_item)
+    try:
+        await WeibostoreFactory.create_store().store_content(content_item=save_content_item)
+        utils.logger.debug(f"[store.weibo.update_weibo_note] Successfully saved note {note_id}")
+    except Exception as e:
+        utils.logger.error(f"[store.weibo.update_weibo_note] Failed to save note {note_id}: {e}", exc_info=True)
+        raise


 async def batch_update_weibo_note_comments(note_id: str, comments: List[Dict]):
@@ -148,7 +153,12 @@ async def update_weibo_note_comment(note_id: str, comment_item: Dict):
        "avatar": user_info.get("profile_image_url", ""),
    }
    utils.logger.info(f"[store.weibo.update_weibo_note_comment] Weibo note comment: {comment_id}, content: {save_comment_item.get('content', '')[:24]} ...")
-    await WeibostoreFactory.create_store().store_comment(comment_item=save_comment_item)
+    try:
+        await WeibostoreFactory.create_store().store_comment(comment_item=save_comment_item)
+        utils.logger.debug(f"[store.weibo.update_weibo_note_comment] Successfully saved comment {comment_id}")
+    except Exception as e:
+        utils.logger.error(f"[store.weibo.update_weibo_note_comment] Failed to save comment {comment_id}: {e}", exc_info=True)
+        raise


 async def update_weibo_note_image(picid: str, pic_content, extension_file_name):
@@ -21,7 +21,7 @@ import pathlib
 from typing import Dict

 import aiofiles
-from sqlalchemy import select
+from sqlalchemy import select, text
 from sqlalchemy.ext.asyncio import AsyncSession

 import config
@@ -29,7 +29,7 @@ from base.base_crawler import AbstractStore
 from database.models import WeiboCreator, WeiboNote, WeiboNoteComment
 from tools import utils, words
 from tools.async_file_writer import AsyncFileWriter
-from database.db_session import get_session
+from database.db_session import get_session, get_async_engine
 from var import crawler_type_var


@@ -88,6 +88,33 @@ class WeiboCsvStoreImplement(AbstractStore):


 class WeiboDbStoreImplement(AbstractStore):
+    
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+
+    async def _check_connection(self):
+        """检查数据库连接是否正常（使用类变量缓存检查结果）"""
+        # 使用类变量缓存检查结果，避免重复检查
+        if not hasattr(WeiboDbStoreImplement, '_global_connection_checked'):
+            WeiboDbStoreImplement._global_connection_checked = False
+        
+        if WeiboDbStoreImplement._global_connection_checked:
+            return True
+        
+        try:
+            engine = get_async_engine(config.SAVE_DATA_OPTION)
+            if engine is None:
+                utils.logger.error(f"[WeiboDbStoreImplement._check_connection] Engine is None for SAVE_DATA_OPTION={config.SAVE_DATA_OPTION}")
+                return False
+            
+            async with engine.connect() as conn:
+                await conn.execute(text("SELECT 1"))
+            WeiboDbStoreImplement._global_connection_checked = True
+            utils.logger.info(f"[WeiboDbStoreImplement._check_connection] Database connection verified")
+            return True
+        except Exception as e:
+            utils.logger.error(f"[WeiboDbStoreImplement._check_connection] Database connection failed: {e}", exc_info=True)
+            return False

    async def store_content(self, content_item: Dict):
        """
@@ -99,21 +126,62 @@ class WeiboDbStoreImplement(AbstractStore):

        """
        note_id = content_item.get("note_id")
-        async with get_session() as session:
-            stmt = select(WeiboNote).where(WeiboNote.note_id == note_id)
-            res = await session.execute(stmt)
-            db_note = res.scalar_one_or_none()
-            if db_note:
-                db_note.last_modify_ts = utils.get_current_timestamp()
-                for key, value in content_item.items():
-                    if hasattr(db_note, key):
-                        setattr(db_note, key, value)
-            else:
-                content_item["add_ts"] = utils.get_current_timestamp()
-                content_item["last_modify_ts"] = utils.get_current_timestamp()
-                db_note = WeiboNote(**content_item)
-                session.add(db_note)
-            await session.commit()
+        if not note_id:
+            utils.logger.error(f"[WeiboDbStoreImplement.store_content] note_id is missing in content_item: {content_item}")
+            return
+        
+        # 关键词过滤：仅在落库时进行，仅对主贴/视频过滤，不过滤评论
+        # 支持精确匹配和模糊匹配两种模式
+        try:
+            import sys
+            from pathlib import Path
+            project_root = Path(__file__).resolve().parents[4]
+            if str(project_root) not in sys.path:
+                sys.path.insert(0, str(project_root))
+            from config import settings
+            
+            content_text = content_item.get("content", "")
+            strict_keywords = getattr(settings, 'STRICT_KEYWORDS', None)
+            fuzzy_keywords = getattr(settings, 'FUZZY_KEYWORDS', None)
+            
+            if strict_keywords or fuzzy_keywords:
+                if not utils.check_keyword_match_with_modes(content_text, strict_keywords, fuzzy_keywords):
+                    utils.logger.warning(f"[WeiboDbStoreImplement.store_content] ❌ Filtered note {note_id} - content does not match any keyword")
+                    return
+        except Exception as e:
+            utils.logger.debug(f"[WeiboDbStoreImplement.store_content] Failed to load keyword config: {e}")
+        
+        # 检查数据库连接
+        if not await self._check_connection():
+            utils.logger.error(f"[WeiboDbStoreImplement.store_content] Database connection check failed, skipping save for note {note_id}")
+            return
+        
+        try:
+            async with get_session() as session:
+                if session is None:
+                    utils.logger.error(f"[WeiboDbStoreImplement.store_content] Database session is None, check SAVE_DATA_OPTION config")
+                    return
+                
+                stmt = select(WeiboNote).where(WeiboNote.note_id == note_id)
+                res = await session.execute(stmt)
+                db_note = res.scalar_one_or_none()
+                if db_note:
+                    utils.logger.debug(f"[WeiboDbStoreImplement.store_content] Updating existing note {note_id}")
+                    db_note.last_modify_ts = utils.get_current_timestamp()
+                    for key, value in content_item.items():
+                        if hasattr(db_note, key):
+                            setattr(db_note, key, value)
+                else:
+                    utils.logger.debug(f"[WeiboDbStoreImplement.store_content] Creating new note {note_id}")
+                    content_item["add_ts"] = utils.get_current_timestamp()
+                    content_item["last_modify_ts"] = utils.get_current_timestamp()
+                    db_note = WeiboNote(**content_item)
+                    session.add(db_note)
+                await session.commit()
+                utils.logger.debug(f"[WeiboDbStoreImplement.store_content] Successfully committed note {note_id} to database")
+        except Exception as e:
+            utils.logger.error(f"[WeiboDbStoreImplement.store_content] Database error saving note {note_id}: {e}", exc_info=True)
+            raise

    async def store_comment(self, comment_item: Dict):
        """
@@ -125,21 +193,36 @@ class WeiboDbStoreImplement(AbstractStore):

        """
        comment_id = comment_item.get("comment_id")
-        async with get_session() as session:
-            stmt = select(WeiboNoteComment).where(WeiboNoteComment.comment_id == comment_id)
-            res = await session.execute(stmt)
-            db_comment = res.scalar_one_or_none()
-            if db_comment:
-                db_comment.last_modify_ts = utils.get_current_timestamp()
-                for key, value in comment_item.items():
-                    if hasattr(db_comment, key):
-                        setattr(db_comment, key, value)
-            else:
-                comment_item["add_ts"] = utils.get_current_timestamp()
-                comment_item["last_modify_ts"] = utils.get_current_timestamp()
-                db_comment = WeiboNoteComment(**comment_item)
-                session.add(db_comment)
-            await session.commit()
+        if not comment_id:
+            utils.logger.error(f"[WeiboDbStoreImplement.store_comment] comment_id is missing in comment_item: {comment_item}")
+            return
+        
+        try:
+            async with get_session() as session:
+                if session is None:
+                    utils.logger.error(f"[WeiboDbStoreImplement.store_comment] Database session is None, check SAVE_DATA_OPTION config")
+                    return
+                
+                stmt = select(WeiboNoteComment).where(WeiboNoteComment.comment_id == comment_id)
+                res = await session.execute(stmt)
+                db_comment = res.scalar_one_or_none()
+                if db_comment:
+                    utils.logger.debug(f"[WeiboDbStoreImplement.store_comment] Updating existing comment {comment_id}")
+                    db_comment.last_modify_ts = utils.get_current_timestamp()
+                    for key, value in comment_item.items():
+                        if hasattr(db_comment, key):
+                            setattr(db_comment, key, value)
+                else:
+                    utils.logger.debug(f"[WeiboDbStoreImplement.store_comment] Creating new comment {comment_id}")
+                    comment_item["add_ts"] = utils.get_current_timestamp()
+                    comment_item["last_modify_ts"] = utils.get_current_timestamp()
+                    db_comment = WeiboNoteComment(**comment_item)
+                    session.add(db_comment)
+                await session.commit()
+                utils.logger.debug(f"[WeiboDbStoreImplement.store_comment] Successfully committed comment {comment_id} to database")
+        except Exception as e:
+            utils.logger.error(f"[WeiboDbStoreImplement.store_comment] Database error saving comment {comment_id}: {e}", exc_info=True)
+            raise

    async def store_creator(self, creator: Dict):
        """
@@ -89,6 +89,34 @@ class XhsDbStoreImplement(AbstractStore):
        note_id = content_item.get("note_id")
        if not note_id:
            return
+        
+        # 关键词过滤：仅在落库时进行，仅对主贴/视频过滤，不过滤评论
+        # 支持精确匹配和模糊匹配两种模式
+        try:
+            import sys
+            from pathlib import Path
+            # 添加项目根目录到路径，以便导入 MindSpider 的 config
+            project_root = Path(__file__).resolve().parents[4]
+            if str(project_root) not in sys.path:
+                sys.path.insert(0, str(project_root))
+            from config import settings
+            
+            title = content_item.get("title", "")
+            desc = content_item.get("desc", "")
+            content_text = title + " " + desc
+            
+            strict_keywords = getattr(settings, 'STRICT_KEYWORDS', None)
+            fuzzy_keywords = getattr(settings, 'FUZZY_KEYWORDS', None)
+            
+            # 如果配置了关键词，进行匹配检查
+            if strict_keywords or fuzzy_keywords:
+                if not utils.check_keyword_match_with_modes(content_text, strict_keywords, fuzzy_keywords):
+                    utils.logger.warning(f"[XhsDbStoreImplement.store_content] ❌ Filtered note {note_id} - content does not match any keyword")
+                    return
+        except Exception as e:
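+            # Any failure while loading the keyword config or matching is only logged at debug level; saving still proceeds (backward compatible)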
+            utils.logger.debug(f"[XhsDbStoreImplement.store_content] Failed to load keyword config: {e}")
+        
        async with get_session() as session:
            if await self.content_is_exist(session, note_id):
                await self.update_content(session, content_item)
@@ -16,9 +16,9 @@ import config
 from base.base_crawler import AbstractStore
 from model.m_zhihu import ZhihuComment, ZhihuContent, ZhihuCreator
 from ._store_impl import (ZhihuCsvStoreImplement,
-                                          ZhihuDbStoreImplement,
-                                          ZhihuJsonStoreImplement,
-                                          ZhihuSqliteStoreImplement)
+                          ZhihuDbStoreImplement,
+                          ZhihuJsonStoreImplement,
+                          ZhihuSqliteStoreImplement)
 from tools import utils
 from var import source_keyword_var

@@ -36,9 +36,11 @@ class ZhihuStoreFactory:
    def create_store() -> AbstractStore:
        store_class = ZhihuStoreFactory.STORES.get(config.SAVE_DATA_OPTION)
        if not store_class:
-            raise ValueError("[ZhihuStoreFactory.create_store] Invalid save option only supported csv or db or json or sqlite or postgresql ...")
+            raise ValueError(
+                "[ZhihuStoreFactory.create_store] Invalid save option only supported csv or db or json or sqlite or postgresql ...")
        return store_class()

+
 async def batch_update_zhihu_contents(contents: List[ZhihuContent]):
    """
    批量更新知乎内容
@@ -54,6 +56,7 @@ async def batch_update_zhihu_contents(contents: List[ZhihuContent]):
    for content_item in contents:
        await update_zhihu_content(content_item)

+
 async def update_zhihu_content(content_item: ZhihuContent):
    """
    更新知乎内容
@@ -70,7 +73,6 @@ async def update_zhihu_content(content_item: ZhihuContent):
    await ZhihuStoreFactory.create_store().store_content(local_db_item)


-
 async def batch_update_zhihu_note_comments(comments: List[ZhihuComment]):
    """
    批量更新知乎内容评论
@@ -82,9 +84,22 @@ async def batch_update_zhihu_note_comments(comments: List[ZhihuComment]):
    """
    if not comments:
        return
-    
+
+    success_count = 0
+    error_count = 0
    for comment_item in comments:
-        await update_zhihu_content_comment(comment_item)
+        try:
+            await update_zhihu_content_comment(comment_item)
+            success_count += 1
+        except Exception as e:
+            error_count += 1
+            comment_id = getattr(comment_item, 'comment_id', 'unknown')
+            utils.logger.error(f"[store.zhihu.batch_update_zhihu_note_comments] 保存评论失败 (comment_id={comment_id}): {e}", exc_info=True)
+    
+    if error_count > 0:
+        utils.logger.warning(f"[store.zhihu.batch_update_zhihu_note_comments] 批量保存完成: 成功 {success_count} 条, 失败 {error_count} 条")
+    else:
+        utils.logger.info(f"[store.zhihu.batch_update_zhihu_note_comments] 批量保存完成: 成功 {success_count} 条")


 async def update_zhihu_content_comment(comment_item: ZhihuComment):
@@ -96,10 +111,17 @@ async def update_zhihu_content_comment(comment_item: ZhihuComment):
    Returns:

    """
-    local_db_item = comment_item.model_dump()
-    local_db_item.update({"last_modify_ts": utils.get_current_timestamp()})
-    utils.logger.info(f"[store.zhihu.update_zhihu_note_comment] zhihu content comment:{local_db_item}")
-    await ZhihuStoreFactory.create_store().store_comment(local_db_item)
+    try:
+        local_db_item = comment_item.model_dump()
+        local_db_item.update({"last_modify_ts": utils.get_current_timestamp()})
+        # 使用更安全的日志记录方式，避免编码问题导致日志输出异常
+        comment_id = local_db_item.get('comment_id', 'unknown')
+        utils.logger.debug(f"[store.zhihu.update_zhihu_note_comment] 准备保存评论: comment_id={comment_id}")
+        await ZhihuStoreFactory.create_store().store_comment(local_db_item)
+    except Exception as e:
+        comment_id = getattr(comment_item, 'comment_id', 'unknown')
+        utils.logger.error(f"[store.zhihu.update_zhihu_note_comment] 保存评论异常 (comment_id={comment_id}): {e}", exc_info=True)
+        raise


 async def save_creator(creator: ZhihuCreator):
@@ -94,17 +94,72 @@ class ZhihuDbStoreImplement(AbstractStore):
            content_item: content item dict
        """
        content_id = content_item.get("content_id")
-        async with get_session() as session:
-            stmt = select(ZhihuContent).where(ZhihuContent.content_id == content_id)
-            result = await session.execute(stmt)
-            existing_content = result.scalars().first()
-            if existing_content:
-                for key, value in content_item.items():
-                    setattr(existing_content, key, value)
-            else:
-                new_content = ZhihuContent(**content_item)
-                session.add(new_content)
-            await session.commit()
+        if not content_id:
+            return
+        
+        # 关键词过滤：仅在落库时进行，仅对主贴/视频过滤，不过滤评论
+        # 支持精确匹配和模糊匹配两种模式
+        try:
+            import sys
+            from pathlib import Path
+            project_root = Path(__file__).resolve().parents[4]
+            if str(project_root) not in sys.path:
+                sys.path.insert(0, str(project_root))
+            from config import settings
+            
+            title = content_item.get("title", "")
+            content = content_item.get("content", "")
+            content_text = title + " " + content
+            strict_keywords = getattr(settings, 'STRICT_KEYWORDS', None)
+            fuzzy_keywords = getattr(settings, 'FUZZY_KEYWORDS', None)
+            
+            if strict_keywords or fuzzy_keywords:
+                if not utils.check_keyword_match_with_modes(content_text, strict_keywords, fuzzy_keywords):
+                    utils.logger.warning(f"[ZhihuDbStoreImplement.store_content] ❌ Filtered content {content_id} - content does not match any keyword")
+                    return
+        except Exception as e:
+            utils.logger.debug(f"[ZhihuDbStoreImplement.store_content] Failed to load keyword config: {e}")
+        
+        try:
+            # 确保所有字符串值都是正确的UTF-8编码
+            cleaned_item = {}
+            for key, value in content_item.items():
+                if isinstance(value, bytes):
+                    # 如果是bytes类型，尝试解码为UTF-8
+                    try:
+                        value = value.decode('utf-8')
+                    except UnicodeDecodeError:
+                        # 如果UTF-8解码失败，尝试其他编码
+                        try:
+                            value = value.decode('gbk', errors='replace')
+                        except:
+                            value = value.decode('utf-8', errors='replace')
+                elif isinstance(value, str):
+                    # 确保字符串是有效的UTF-8
+                    try:
+                        value.encode('utf-8')
+                    except UnicodeEncodeError:
+                        # 如果编码失败，尝试修复
+                        value = value.encode('utf-8', errors='replace').decode('utf-8', errors='replace')
+                cleaned_item[key] = value
+            
+            async with get_session() as session:
+                stmt = select(ZhihuContent).where(ZhihuContent.content_id == content_id)
+                result = await session.execute(stmt)
+                existing_content = result.scalars().first()
+                if existing_content:
+                    for key, value in cleaned_item.items():
+                        setattr(existing_content, key, value)
+                    utils.logger.debug(f"[ZhihuDbStore] 更新内容: {content_id}")
+                else:
+                    new_content = ZhihuContent(**cleaned_item)
+                    session.add(new_content)
+                    utils.logger.debug(f"[ZhihuDbStore] 新增内容: {content_id}")
+                await session.commit()
+                utils.logger.info(f"[ZhihuDbStore] 成功保存内容到数据库: {content_id}")
+        except Exception as e:
+            utils.logger.error(f"[ZhihuDbStore] 保存内容失败 (content_id={content_id}): {e}", exc_info=True)
+            raise

    async def store_comment(self, comment_item: Dict):
        """
@@ -113,17 +168,46 @@ class ZhihuDbStoreImplement(AbstractStore):
            comment_item: comment item dict
        """
        comment_id = comment_item.get("comment_id")
-        async with get_session() as session:
-            stmt = select(ZhihuComment).where(ZhihuComment.comment_id == comment_id)
-            result = await session.execute(stmt)
-            existing_comment = result.scalars().first()
-            if existing_comment:
-                for key, value in comment_item.items():
-                    setattr(existing_comment, key, value)
-            else:
-                new_comment = ZhihuComment(**comment_item)
-                session.add(new_comment)
-            await session.commit()
+        try:
+            # 确保所有字符串值都是正确的UTF-8编码
+            cleaned_item = {}
+            for key, value in comment_item.items():
+                if isinstance(value, bytes):
+                    # 如果是bytes类型，尝试解码为UTF-8
+                    try:
+                        value = value.decode('utf-8')
+                    except UnicodeDecodeError:
+                        # 如果UTF-8解码失败，尝试其他编码
+                        try:
+                            value = value.decode('gbk', errors='replace')
+                        except:
+                            value = value.decode('utf-8', errors='replace')
+                elif isinstance(value, str):
+                    # 确保字符串是有效的UTF-8
+                    try:
+                        value.encode('utf-8')
+                    except UnicodeEncodeError:
+                        # 如果编码失败，尝试修复
+                        value = value.encode('utf-8', errors='replace').decode('utf-8', errors='replace')
+                cleaned_item[key] = value
+            
+            async with get_session() as session:
+                stmt = select(ZhihuComment).where(ZhihuComment.comment_id == comment_id)
+                result = await session.execute(stmt)
+                existing_comment = result.scalars().first()
+                if existing_comment:
+                    for key, value in cleaned_item.items():
+                        setattr(existing_comment, key, value)
+                    utils.logger.debug(f"[ZhihuDbStore] 更新评论: {comment_id}")
+                else:
+                    new_comment = ZhihuComment(**cleaned_item)
+                    session.add(new_comment)
+                    utils.logger.debug(f"[ZhihuDbStore] 新增评论: {comment_id}")
+                await session.commit()
+                utils.logger.info(f"[ZhihuDbStore] 成功保存评论到数据库: {comment_id}")
+        except Exception as e:
+            utils.logger.error(f"[ZhihuDbStore] 保存评论失败 (comment_id={comment_id}): {e}", exc_info=True)
+            raise

    async def store_creator(self, creator: Dict):
        """
@@ -11,6 +11,11 @@

 import argparse
 import logging
+import os
+import re
+import sys
+from logging.handlers import RotatingFileHandler
+from pathlib import Path

 from .crawler_util import *
 from .slider_util import *
@@ -19,17 +24,80 @@ from .time_util import *

 def init_loging_config():
    level = logging.INFO
-    logging.basicConfig(
-        level=level,
-        format="%(asctime)s %(name)s %(levelname)s (%(filename)s:%(lineno)d) - %(message)s",
-        datefmt='%Y-%m-%d %H:%M:%S'
-    )
+    
+    # 日志格式
+    log_format = "%(asctime)s %(name)s %(levelname)s (%(filename)s:%(lineno)d) - %(message)s"
+    date_format = '%Y-%m-%d %H:%M:%S'
+    
+    # 创建日志目录（项目根目录的 logs 文件夹）
+    # 从当前文件位置向上查找，直到找到包含 logs 目录的项目根目录
+    current_file = Path(__file__).resolve()
+    project_root = None
+    
+    # Method 1: walk up the parents until one contains a logs directory or is itself named "BettaFish-1.2.0"
+    for parent in current_file.parents:
+        logs_dir = parent / "logs"
+        if logs_dir.exists() or parent.name == "BettaFish-1.2.0":
+            project_root = parent
+            break
+    
+    # 方法2: 如果没找到，使用当前工作目录
+    if project_root is None:
+        project_root = Path.cwd()
+        # If the working directory is named MediaCrawler, use the directory two levels above it
+        if project_root.name == "MediaCrawler":
+            project_root = project_root.parent.parent
+    
+    log_dir = project_root / "logs"
+    log_dir.mkdir(exist_ok=True)
+    
+    # 日志文件路径
+    log_file = log_dir / "mediacrawler.log"
+    
+    # 配置根日志记录器
+    root_logger = logging.getLogger()
+    root_logger.setLevel(level)
+    
+    # 清除已有的处理器，避免重复
+    root_logger.handlers.clear()
+    
+    # 控制台处理器 - 明确使用 sys.stdout 确保输出到控制台
+    console_handler = logging.StreamHandler(sys.stdout)
+    console_handler.setLevel(level)
+    console_formatter = logging.Formatter(log_format, datefmt=date_format)
+    console_handler.setFormatter(console_formatter)
+    root_logger.addHandler(console_handler)
+    
+    # Flush whatever is already buffered on stdout/stderr (one-off; later writes keep their normal buffering)
+    sys.stdout.flush()
+    sys.stderr.flush()
+    
+    # 文件处理器（带轮转，最大10MB，保留5个备份）
+    try:
+        file_handler = RotatingFileHandler(
+            log_file,
+            maxBytes=10 * 1024 * 1024,  # 10MB
+            backupCount=5,
+            encoding='utf-8'
+        )
+        file_handler.setLevel(level)
+        file_formatter = logging.Formatter(log_format, datefmt=date_format)
+        file_handler.setFormatter(file_formatter)
+        root_logger.addHandler(file_handler)
+    except Exception as e:
+        # 如果文件日志初始化失败，至少保证控制台日志可用
+        print(f"警告: 无法初始化文件日志: {e}")
+    
+    # 创建 MediaCrawler 专用日志记录器
    _logger = logging.getLogger("MediaCrawler")
    _logger.setLevel(level)
-
+    
    # 关闭 httpx 的 INFO 日志
    logging.getLogger("httpx").setLevel(logging.WARNING)
-
+    
+    # 输出日志文件位置
+    _logger.info(f"日志文件: {log_file}")
+    
    return _logger


@@ -44,3 +112,101 @@ def str2bool(v):
        return False
    else:
        raise argparse.ArgumentTypeError('Boolean value expected.')
+
+
+def check_keyword_match_strict(content: str, keyword: str) -> bool:
+    """
+    严格关键词匹配：检查内容是否包含关键词（严格模式）
+    
+    Args:
+        content: 要检查的内容文本
+        keyword: 关键词（可以是单个关键词，也可以是逗号分隔的多个关键词）
+    
+    Returns:
+        bool: 如果内容包含任意一个关键词返回True，否则返回False
+    """
+    if not content or not keyword:
+        return False
+    
+    # 清理HTML标签
+    clean_content = re.sub(r"<.*?>", "", content)
+    # 转换为小写进行匹配
+    clean_content_lower = clean_content.lower()
+    
+    # 支持多个关键词（逗号分隔），只要匹配任意一个即可
+    keywords = [k.strip().lower() for k in keyword.split(",") if k.strip()]
+    
+    # 检查内容是否包含任意一个关键词
+    for kw in keywords:
+        if kw in clean_content_lower:
+            return True
+    
+    return False
+
+
+def check_keyword_match_fuzzy(content: str, keyword: str) -> bool:
+    """
+    模糊关键词匹配：检查内容是否包含关键词（模糊模式，支持部分匹配）
+    
+    Args:
+        content: 要检查的内容文本
+        keyword: 关键词（可以是单个关键词，也可以是逗号分隔的多个关键词）
+    
+    Returns:
+        bool: 如果内容包含任意一个关键词（或关键词的部分）返回True，否则返回False
+    """
+    if not content or not keyword:
+        return False
+    
+    # 清理HTML标签
+    clean_content = re.sub(r"<.*?>", "", content)
+    # 转换为小写进行匹配
+    clean_content_lower = clean_content.lower()
+    
+    # 支持多个关键词（逗号分隔），只要匹配任意一个即可
+    keywords = [k.strip().lower() for k in keyword.split(",") if k.strip()]
+    
+    # 检查内容是否包含任意一个关键词（或关键词的部分）
+    for kw in keywords:
+        # 精确匹配
+        if kw in clean_content_lower:
+            return True
+        # 模糊匹配：如果关键词长度>=3，检查是否包含关键词的主要部分
+        if len(kw) >= 3:
+            # 去除空格后匹配
+            kw_no_space = kw.replace(" ", "")
+            content_no_space = clean_content_lower.replace(" ", "")
+            if kw_no_space in content_no_space:
+                return True
+            # 检查关键词的前半部分（至少2个字符）
+            if len(kw) >= 4:
+                half_kw = kw[:len(kw)//2]
+                if half_kw in clean_content_lower:
+                    return True
+    
+    return False
+
+
+def check_keyword_match_with_modes(content: str, strict_keywords: str = None, fuzzy_keywords: str = None) -> bool:
+    """
+    使用精确和模糊两种模式检查关键词匹配
+    
+    Args:
+        content: 要检查的内容文本
+        strict_keywords: 精确匹配关键词（逗号分隔）
+        fuzzy_keywords: 模糊匹配关键词（逗号分隔）
+    
+    Returns:
+        bool: 如果内容匹配任意一个关键词（精确或模糊）返回True，否则返回False
+    """
+    # 先检查精确匹配关键词
+    if strict_keywords:
+        if check_keyword_match_strict(content, strict_keywords):
+            return True
+    
+    # 再检查模糊匹配关键词
+    if fuzzy_keywords:
+        if check_keyword_match_fuzzy(content, fuzzy_keywords):
+            return True
+    
+    return False
@@ -32,6 +32,7 @@ class KeywordManager:
    def __init__(self):
        """初始化关键词管理器"""
        self.engine: Engine = None
+        self.custom_keywords_path: Optional[Path] = self._resolve_custom_keywords_path()
        self.connect()
    
    def connect(self):
@@ -68,24 +69,31 @@ class KeywordManager:
        Returns:
            关键词列表
        """
+        if not getattr(settings, "ENABLE_KEYWORD_SEARCH", True):
+            logger.info("关键词搜索已通过配置禁用，返回默认关键词列表")
+            return self._limit_keywords(self._get_default_keywords(), max_keywords)
+
        if not target_date:
            target_date = date.today()
        
+        if getattr(settings, "USE_DEFAULT_KEYWORDS_ONLY", False):
+            logger.info("配置启用默认关键词模式，直接返回默认关键词")
+            return self._limit_keywords(self._get_default_keywords(), max_keywords)
+        
        logger.info(f"正在获取 {target_date} 的关键词...")
        
+        # 优先使用自定义关键词
+        custom_keywords = self._get_custom_keywords(target_date, max_keywords)
+        if custom_keywords:
+            return custom_keywords
+        
        # 首先尝试获取指定日期的关键词
        topics_data = self.get_daily_topics(target_date)
        
        if topics_data and topics_data.get('keywords'):
            keywords = topics_data['keywords']
            logger.info(f"成功获取 {target_date} 的 {len(keywords)} 个关键词")
-            
-            # 如果关键词太多，随机选择指定数量
-            if len(keywords) > max_keywords:
-                keywords = random.sample(keywords, max_keywords)
-                logger.info(f"随机选择了 {max_keywords} 个关键词")
-            
-            return keywords
+            return self._limit_keywords(keywords, max_keywords)
        
        # 如果没有当天的关键词，尝试获取最近几天的
        logger.info(f"{target_date} 没有关键词数据，尝试获取最近的关键词...")
@@ -100,15 +108,14 @@ class KeywordManager:
            
            # 去重并限制数量
            unique_keywords = list(set(all_keywords))
-            if len(unique_keywords) > max_keywords:
-                unique_keywords = random.sample(unique_keywords, max_keywords)
+            limited_keywords = self._limit_keywords(unique_keywords, max_keywords)
            
-            logger.info(f"从最近7天的数据中获取到 {len(unique_keywords)} 个关键词")
-            return unique_keywords
+            logger.info(f"从最近7天的数据中获取到 {len(limited_keywords)} 个关键词")
+            return limited_keywords
        
        # 如果都没有，返回默认关键词
        logger.info("没有找到任何关键词数据，使用默认关键词")
-        return self._get_default_keywords()
+        return self._limit_keywords(self._get_default_keywords(), max_keywords)
    
    def get_daily_topics(self, extract_date: date = None) -> Optional[Dict]:
        """
@@ -176,17 +183,76 @@ class KeywordManager:
        except Exception as e:
            logger.exception(f"获取最近话题分析失败: {e}")
            return []
+
+    def _resolve_custom_keywords_path(self) -> Optional[Path]:
+        """解析自定义关键词文件路径"""
+        raw_path = getattr(settings, "CUSTOM_KEYWORDS_FILE", None)
+        if not raw_path:
+            return None
+        path = Path(raw_path).expanduser()
+        if not path.is_absolute():
+            path = project_root / path
+        return path
+
+    @staticmethod
+    def _limit_keywords(keywords: List[str], max_keywords: int) -> List[str]:
+        """根据最大数量限制关键词"""
+        if not keywords:
+            return []
+        if max_keywords and len(keywords) > max_keywords:
+            keywords = random.sample(keywords, max_keywords)
+        return keywords
+
+    def _get_custom_keywords(self, target_date: date, max_keywords: int) -> Optional[List[str]]:
+        """
+        从自定义关键词文件中获取指定日期的关键词
+        支持格式：
+        1. ["AI", "大模型"]
+        2. {"2025-11-26": ["AI"], "default": ["科技"]} (a dict with a top-level "keywords" list is also accepted)
+        """
+        if not self.custom_keywords_path:
+            return None
+
+        path = self.custom_keywords_path
+        if not path.exists():
+            logger.warning(f"自定义关键词文件不存在: {path}")
+            return None
+
+        try:
+            with path.open("r", encoding="utf-8") as f:
+                data = json.load(f)
+        except Exception as e:
+            logger.error(f"读取自定义关键词文件失败({path}): {e}")
+            return None
+
+        keywords: Optional[List[str]] = None
+        if isinstance(data, list):
+            keywords = data
+        elif isinstance(data, dict):
+            date_key = target_date.isoformat()
+            if date_key in data and isinstance(data[date_key], list):
+                keywords = data[date_key]
+            elif "default" in data and isinstance(data["default"], list):
+                keywords = data["default"]
+            elif "keywords" in data and isinstance(data["keywords"], list):
+                keywords = data["keywords"]
+
+        if not keywords:
+            return None
+
+        cleaned_keywords = [kw.strip() for kw in keywords if isinstance(kw, str) and kw.strip()]
+        if not cleaned_keywords:
+            return None
+
+        limited = self._limit_keywords(cleaned_keywords, max_keywords)
+        logger.info(f"使用自定义关键词（{path}）共 {len(limited)} 个")
+        return limited
    
    def _get_default_keywords(self) -> List[str]:
        """获取默认关键词列表"""
        return [
-            "科技", "人工智能", "AI", "编程", "互联网",
-            "创业", "投资", "理财", "股市", "经济",
-            "教育", "学习", "考试", "大学", "就业",
-            "健康", "养生", "运动", "美食", "旅游",
-            "时尚", "美妆", "购物", "生活", "家居",
-            "电影", "音乐", "游戏", "娱乐", "明星",
-            "新闻", "热点", "社会", "政策", "环保"
+            "F6智慧门店","F6智数","中国汽车后市场白皮书","南京爱福路汽车科技有限公司","汽车后市场","汽车修理厂",
+            "新康众","天猫养车","汽后","汽修厂","爱福路","康众",
        ]
    
    def get_all_keywords_for_platforms(self, platforms: List[str], target_date: date = None, 
@@ -286,6 +352,16 @@ class KeywordManager:
        if not target_date:
            target_date = date.today()
        
+        # 当配置为仅使用默认关键词时，直接返回默认关键词的摘要，避免上层误判为“无数据”
+        if getattr(settings, "USE_DEFAULT_KEYWORDS_ONLY", False):
+            default_keywords = self._get_default_keywords()
+            return {
+                'date': target_date,
+                'keywords_count': len(default_keywords),
+                'summary': '使用默认关键词模式',
+                'has_data': bool(default_keywords)
+            }
+        
        topics_data = self.get_daily_topics(target_date)
        
        if topics_data:
@@ -251,16 +251,92 @@ postgresql_db_config = {{
            
            logger.info(f"执行命令: {' '.join(cmd)}")
            
-            # 切换到MediaCrawler目录并执行
-            result = subprocess.run(
+            # 切换到MediaCrawler目录并执行，捕获输出
+            # 使用utf-8编码，errors='surrogateescape'可以更好地处理编码问题
+            # 设置环境变量确保子进程使用UTF-8编码
+            env = os.environ.copy()
+            env['PYTHONIOENCODING'] = 'utf-8'
+            env['PYTHONUTF8'] = '1'
+            
+            # 使用 Popen 实时输出日志，而不是等到结束才显示
+            import subprocess as sp
+            process = sp.Popen(
                cmd,
                cwd=self.mediacrawler_path,
-                timeout=3600  # 60分钟超时
+                stdout=sp.PIPE,
+                stderr=sp.STDOUT,  # 将stderr合并到stdout
+                text=True,
+                encoding='utf-8',
+                errors='surrogateescape',
+                env=env,
+                bufsize=1,  # 行缓冲
+                universal_newlines=True
+            )
+            
+            # 实时读取并输出日志
+            output_lines = []
+            error_lines = []
+            try:
+                for line in process.stdout:
+                    line = line.rstrip()
+                    if line:
+                        output_lines.append(line)
+                        # 实时输出到控制台
+                        print(f"[{platform}] {line}", flush=True)
+                        logger.info(f"[{platform}] {line}")
+                
+                # 等待进程完成
+                return_code = process.wait(timeout=3600)
+            except sp.TimeoutExpired:
+                process.kill()
+                process.wait()
+                return_code = -1
+                logger.error(f"[{platform}] 爬取超时")
+            except Exception as e:
+                process.kill()
+                process.wait()
+                return_code = -1
+                logger.error(f"[{platform}] 执行异常: {e}", exc_info=True)
+            
+            # 创建类似 subprocess.run 的 result 对象
+            class Result:
+                def __init__(self, returncode, stdout, stderr):
+                    self.returncode = returncode
+                    self.stdout = stdout
+                    self.stderr = stderr
+            
+            result = Result(
+                returncode=return_code,
+                stdout='\n'.join(output_lines),
+                stderr='\n'.join(error_lines)
            )
            
            end_time = datetime.now()
            duration = (end_time - start_time).total_seconds()
            
+            # 解析输出，提取实际保存的数据量
+            output_lines = result.stdout.split('\n') if result.stdout else []
+            error_lines = result.stderr.split('\n') if result.stderr else []
+            
+            # 输出日志到控制台和日志文件
+            if output_lines:
+                logger.info(f"[{platform}] 爬虫标准输出:")
+                for line in output_lines:
+                    if line.strip():  # 忽略空行
+                        logger.info(f"[{platform}] {line}")
+            
+            if error_lines:
+                logger.warning(f"[{platform}] 爬虫错误输出:")
+                for line in error_lines:
+                    if line.strip():  # 忽略空行
+                        logger.warning(f"[{platform}] {line}")
+            
+            # 合并所有输出行用于解析
+            all_lines = output_lines + error_lines
+            
+            # 解析统计信息
+            parsed_stats = self._parse_crawl_output(all_lines, error_lines)
+            
            # 创建统计信息
            crawl_stats = {
                "platform": platform,
@@ -270,9 +346,11 @@ postgresql_db_config = {{
                "end_time": end_time.isoformat(),
                "return_code": result.returncode,
                "success": result.returncode == 0,
-                "notes_count": 0,
-                "comments_count": 0,
-                "errors_count": 0
+                "notes_count": parsed_stats.get("notes_count", 0),
+                "comments_count": parsed_stats.get("comments_count", 0),
+                "errors_count": parsed_stats.get("errors_count", 0),
+                "output_preview": '\n'.join(output_lines[-20:]) if output_lines else "",  # 最后20行输出
+                "error_preview": '\n'.join(error_lines[-20:]) if error_lines else ""  # 最后20行错误
            }
            
            # 保存统计信息
@@ -280,8 +358,16 @@ postgresql_db_config = {{
            
            if result.returncode == 0:
                logger.info(f"✅ {platform} 爬取完成，耗时: {duration:.1f}秒")
+                logger.info(f"   保存内容: {crawl_stats['notes_count']} 条，评论: {crawl_stats['comments_count']} 条")
+                if crawl_stats['notes_count'] == 0 and crawl_stats['comments_count'] == 0:
+                    logger.warning(f"⚠️ {platform} 爬取成功但未保存任何数据，请检查数据库连接和保存逻辑")
+                    # 输出部分日志用于调试
+                    if crawl_stats['error_preview']:
+                        logger.warning(f"   错误信息: {crawl_stats['error_preview'][:500]}")
            else:
                logger.error(f"❌ {platform} 爬取失败，返回码: {result.returncode}")
+                if error_lines:
+                    logger.error(f"   错误信息: {crawl_stats['error_preview'][:500]}")
            
            return crawl_stats
            
@@ -294,6 +380,7 @@ postgresql_db_config = {{
    
    def _parse_crawl_output(self, output_lines: List[str], error_lines: List[str]) -> Dict:
        """解析爬取输出，提取统计信息"""
+        import re
        stats = {
            "notes_count": 0,
            "comments_count": 0,
@@ -301,32 +388,122 @@ postgresql_db_config = {{
            "login_required": False
        }
        
-        # 解析输出行
-        for line in output_lines:
-            if "条笔记" in line or "条内容" in line:
+        # 合并所有行用于解析
+        all_lines = output_lines + error_lines
+        
+        # 用于统计各平台的保存操作次数（通过日志关键字统计）
+        # 视频/内容保存操作的关键字
+        content_save_keywords = [
+            "[store.bilibili.update_bilibili_video]",
+            "update_bilibili_video",
+            "[store.douyin.update_dy_aweme]",
+            "update_dy_aweme",
+            "[store.kuaishou.update_kuaishou_video]",
+            "update_kuaishou_video",
+            "[store.xhs.update_xhs_note]",
+            "update_xhs_note",
+            "[store.weibo.update_weibo_note]",
+            "update_weibo_note",
+            "[store.tieba.update_tieba_note]",
+            "update_tieba_note",
+            "[store.zhihu.update_zhihu_content]",
+            "update_zhihu_content",
+        ]
+        
+        # 评论保存操作的关键字
+        comment_save_keywords = [
+            "[store.bilibili.update_bilibili_video_comment]",
+            "update_bilibili_video_comment",
+            "[store.douyin.update_dy_aweme_comment]",
+            "update_dy_aweme_comment",
+            "[store.kuaishou.update_ks_video_comment]",
+            "update_ks_video_comment",
+            "[store.xhs.update_xhs_note_comment]",
+            "update_xhs_note_comment",
+            "[store.weibo.update_weibo_note_comment]",
+            "update_weibo_note_comment",
+            "[store.tieba.update_tieba_note_comment]",
+            "update_tieba_note_comment",
+            "[store.zhihu.update_zhihu_content_comment]",
+            "update_zhihu_note_comment",
+            "update_zhihu_content_comment",
+        ]
+        
+        # 先统计日志关键字出现的次数（用于bilibili等没有汇总信息的平台）
+        log_keyword_content_count = 0
+        log_keyword_comment_count = 0
+        
+        # 解析输出行，查找各种可能的数据保存信息
+        for line in all_lines:
+            line_lower = line.lower()
+            
+            # 统计视频/内容保存操作（通过日志关键字）
+            for keyword in content_save_keywords:
+                if keyword in line or keyword.lower() in line_lower:
+                    log_keyword_content_count += 1
+                    break  # 避免重复计数
+            
+            # 统计评论保存操作（通过日志关键字）
+            for keyword in comment_save_keywords:
+                if keyword in line or keyword.lower() in line_lower:
+                    log_keyword_comment_count += 1
+                    break  # 避免重复计数
+            
+            # 查找保存的内容数量（多种可能的格式）
+            # 例如："保存了 10 条笔记"、"成功保存 5 条内容"、"inserted 3 records"等
+            if any(keyword in line_lower for keyword in ["条笔记", "条内容", "条视频", "条帖子", "条回答"]):
                try:
-                    # 提取数字
-                    import re
+                    # Extract every number found on the line
                    numbers = re.findall(r'\d+', line)
                    if numbers:
-                        stats["notes_count"] = int(numbers[0])
+                        # 如果找到多个数字，取最大的（通常是总数）
+                        potential_count = max([int(n) for n in numbers])
+                        if potential_count > stats["notes_count"]:
+                            stats["notes_count"] = potential_count
                except:
                    pass
-            elif "条评论" in line:
+            
+            # 查找保存的评论数量
+            if "条评论" in line_lower:
                try:
-                    import re
                    numbers = re.findall(r'\d+', line)
                    if numbers:
-                        stats["comments_count"] = int(numbers[0])
+                        potential_count = max([int(n) for n in numbers])
+                        if potential_count > stats["comments_count"]:
+                            stats["comments_count"] = potential_count
                except:
                    pass
-            elif "登录" in line or "扫码" in line:
+            
+            # 查找数据库相关错误
+            if any(keyword in line_lower for keyword in ["数据库", "database", "connection", "连接失败", "保存失败"]):
+                if "error" in line_lower or "失败" in line_lower or "异常" in line_lower:
+                    stats["errors_count"] += 1
+            
+            # 查找登录相关
+            if any(keyword in line_lower for keyword in ["登录", "扫码", "login", "需要登录"]):
                stats["login_required"] = True
        
-        # 解析错误行
-        for line in error_lines:
-            if "error" in line.lower() or "异常" in line:
-                stats["errors_count"] += 1
+        # 如果通过汇总信息没有找到保存数量，使用日志关键字统计的结果
+        # 这样可以支持bilibili等没有输出汇总信息的平台
+        if stats["notes_count"] == 0 and log_keyword_content_count > 0:
+            stats["notes_count"] = log_keyword_content_count
+        if stats["comments_count"] == 0 and log_keyword_comment_count > 0:
+            stats["comments_count"] = log_keyword_comment_count
+        
+        # 如果没有找到明确的保存数量，尝试从数据库操作日志中提取
+        if stats["notes_count"] == 0 and stats["comments_count"] == 0:
+            # 查找可能的数据库插入信息
+            for line in all_lines:
+                line_lower = line.lower()
+                # 查找类似 "insert into" 或 "保存到" 的信息
+                if "insert" in line_lower or "保存到" in line_lower:
+                    try:
+                        numbers = re.findall(r'\d+', line)
+                        if numbers:
+                            # 尝试提取可能的记录数
+                            pass  # 这里可以进一步解析
+                    except:
+                        pass
        
        return stats
    
@@ -22,6 +22,26 @@ class Settings(BaseSettings):
    DB_PASSWORD: str = Field("your_password", description="数据库密码")
    DB_NAME: str = Field("mindspider", description="数据库名称")
    DB_CHARSET: str = Field("utf8mb4", description="数据库字符集")
+    CUSTOM_KEYWORDS_FILE: Optional[str] = Field(
+        None,
+        description="自定义关键词文件路径（可为绝对路径或相对MindSpider目录的路径）"
+    )
+    USE_DEFAULT_KEYWORDS_ONLY: bool = Field(
+        True,
+        description="为True时忽略数据库/自定义结果，直接使用默认关键词"
+    )
+    ENABLE_KEYWORD_SEARCH: bool = Field(
+        True,
+        description="开启后运行基于关键词的爬取流程，关闭则完全跳过关键词搜索"
+    )
+    STRICT_KEYWORDS: Optional[str] = Field(
+        None,
+        description="精确匹配关键词（逗号分隔），内容必须完整包含这些关键词才能落库"
+    )
+    FUZZY_KEYWORDS: Optional[str] = Field(
+        None,
+        description="模糊匹配关键词（逗号分隔），内容包含这些关键词的部分即可落库"
+    )
    MINDSPIDER_API_KEY: Optional[str] = Field(None, description="MINDSPIDER API密钥")
    MINDSPIDER_BASE_URL: Optional[str] = Field("https://api.deepseek.com", description="MINDSPIDER API基础URL，推荐deepseek-chat模型使用https://api.deepseek.com")
    MINDSPIDER_MODEL_NAME: Optional[str] = Field("deepseek-chat", description="MINDSPIDER API模型名称, 推荐deepseek-chat")
@@ -49,6 +49,7 @@ parsel==1.9.1
 pyexecjs==1.5.1
 typer>=0.12.3
 pyhumps==3.8.0
+xhshow>=0.1.3

 # ===============================
 # 工具包
@@ -8,6 +8,8 @@
 <a href="https://lioncc.ai/" target="_blank"><img src="./static/image/logo_loincc.png" alt="666ghj%2FBettaFish | Trendshift" height="40"/></a>&ensp;
 <a href="https://share.302.ai/P66Qe3" target="_blank"><img src="./static/image/logo_302ai.png" alt="666ghj%2FBettaFish | Trendshift" height="40"/></a>

+<a href="https://open.anspire.cn/?share_code=3E1FUOUH" target="_blank"><img src="./static/image/logo_anspire.png" alt="Anspire Open" height="50"/></a>
+
 [![GitHub Stars](https://img.shields.io/github/stars/666ghj/BettaFish?style=flat-square)](https://github.com/666ghj/BettaFish/stargazers)
 [![GitHub Watchers](https://img.shields.io/github/watchers/666ghj/BettaFish?style=flat-square)](https://github.com/666ghj/BettaFish/watchers)
 [![GitHub Forks](https://img.shields.io/github/forks/666ghj/BettaFish?style=flat-square)](https://github.com/666ghj/BettaFish/network)
@@ -23,13 +25,6 @@

 </div>

-## 🌟 Join Our Official Community
-
-<div align="center">
-  <img src="https://capsule-render.vercel.app/api?type=waving&color=gradient&height=200&section=header&text=Scan%20to%20Join%20Our%20QQ%20Group&fontSize=40&fontAlignY=35&desc=Welcome%20to%20Our%20Community!&descAlignY=55" alt="Welcome%20to%20Our%20Community!" style="width:60%; max-width:900px; display:block; margin:0 auto;">
-  <img src="static/image/QQ_Light_Horizenal.png" alt="BettaFish QQ Group QR Code" style="width:60%; max-width:360px; display:block; margin:20px auto 0;">
-</div>
-
 ## ⚡ Project Overview

 **"BettaFish"** is an innovative multi-agent public opinion analysis system built from scratch. It helps break information cocoons, restore the original public sentiment, predict future trends, and assist decision-making. Users only need to raise analysis needs like chatting; the agents automatically analyze 30+ mainstream social platforms at home and abroad and millions of public comments.
@@ -88,6 +83,11 @@ Solomon LionCC BettaFish WeiYu Benefits: Open codecodex.ai Lion Programming Chan
 302.AI is a pay-as-you-go enterprise AI resource hub that offers the latest and most comprehensive AI models and APIs on the market, along with a variety of ready-to-use online AI applications.
 </details>

+<details>
+<summary>Provider of core agent capabilities including AI web search, file parsing, and web content scraping: <span style="margin-left: 10px"><a href="https://open.anspire.cn/?share_code=3E1FUOUH" target="_blank"><img src="./static/image/logo_anspire.png" alt="Anspire Open" height="50"/></a></span></summary>
+Anspire Open is a leading infrastructure provider for the agent era. We offer developers the core capability stack needed to build powerful agents. Currently available services include AI web search (multiple versions, highly competitive pricing), file parsing (free for a limited time), web content scraping (free for a limited time), cloud browser automation (Anspire Browser Agent, in beta), multi-turn rewriting, and more. We continue to provide a solid foundation for agents to connect and operate in complex digital worlds. Our services integrate seamlessly with mainstream agent platforms such as Dify, Coze, and Yuanqi. Through a transparent credit-based billing system and modular design, we provide enterprises with efficient, low-cost customized support to accelerate intelligent transformation.
+</details>
+
 ## 🏗️ System Architecture

 ### Overall Architecture Diagram
@@ -114,104 +114,183 @@ Solomon LionCC BettaFish WeiYu Benefits: Open codecodex.ai Lion Programming Chan
 | 4 | Strategy Formulation | Develop segmented research strategies based on preliminary results | Internal Decision Modules of Each Agent | - |
 | 5-N | **Iterative Phase** | **Forum Collaboration + In-depth Research** | **ForumEngine + All Agents** | **Multi-round cycles** |
 | 5.1 | In-depth Research | Each Agent conducts specialized search guided by forum host | Each Agent + Reflection Mechanisms + Forum Guidance | Each cycle |
-| 5.2 | Forum Collaboration | ForumEngine monitors Agent communications and generates host summaries | ForumEngine + LLM Host | Each cycle |
+| 5.2 | Forum Collaboration | ForumEngine monitors Agent communications and generates host guidance | ForumEngine + LLM Host | Each cycle |
 | 5.3 | Communication Integration | Each Agent adjusts research directions based on discussions | Each Agent + forum_reader tool | Each cycle |
 | N+1 | Result Integration | Report Agent collects all analysis results and forum content | Report Agent | - |
-| N+2 | Report Generation | Dynamically select templates and styles, generate final reports through multiple rounds | Report Agent + Template Engine | - |
+| N+2 | Intermediate Representation (IR) | Dynamically select templates and styles, generate metadata through multiple rounds, and assemble it into the intermediate representation (IR) | Report Agent + Template Engine | - |
+| N+3 | Report Generation | Perform quality checks on chunks, then render the IR into an interactive HTML report | Report Agent + Stitching Engine | - |

 ### Project Code Structure Tree

 ```
 BettaFish/
-├── QueryEngine/                   # Domestic and international news breadth search Agent
-│   ├── agent.py                   # Agent main logic
-│   ├── llms/                      # LLM interface wrapper
-│   ├── nodes/                     # Processing nodes
-│   ├── tools/                     # Search tools
-│   ├── utils/                     # Utility functions
-│   └── ...                        # Other modules
-├── MediaEngine/                   # Powerful multimodal understanding Agent
-│   ├── agent.py                   # Agent main logic
-│   ├── nodes/                     # Processing nodes
-│   ├── llms/                      # LLM interfaces
-│   ├── tools/                     # Search tools
-│   ├── utils/                     # Utility functions
-│   └── ...                        # Other modules
-├── InsightEngine/                 # Private database mining Agent
-│   ├── agent.py                   # Agent main logic
-│   ├── llms/                      # LLM interface wrapper
-│   │   └── base.py                # Unified OpenAI-compatible client
-│   ├── nodes/                     # Processing nodes
-│   │   ├── base_node.py           # Base node class
-│   │   ├── formatting_node.py     # Formatting node
-│   │   ├── report_structure_node.py # Report structure node
-│   │   ├── search_node.py         # Search node
-│   │   └── summary_node.py        # Summary node
-│   ├── tools/                     # Database query and analysis tools
-│   │   ├── keyword_optimizer.py   # Qwen keyword optimization middleware
-│   │   ├── search.py              # Database operation toolkit
-│   │   └── sentiment_analyzer.py  # Sentiment analysis integration tool
-│   ├── state/                     # State management
-│   │   ├── __init__.py
-│   │   └── state.py               # Agent state definition
-│   ├── prompts/                   # Prompt templates
-│   │   ├── __init__.py
-│   │   └── prompts.py             # Various prompts
-│   └── utils/                     # Utility functions
-│       ├── __init__.py
-│       ├── config.py              # Configuration management
-│       └── text_processing.py     # Text processing tools
-├── ReportEngine/                  # Multi-round report generation Agent
-│   ├── agent.py                   # Agent main logic
-│   ├── llms/                      # LLM interfaces
-│   ├── nodes/                     # Report generation nodes
-│   │   ├── template_selection.py  # Template selection node
-│   │   └── html_generation.py     # HTML generation node
-│   ├── report_template/           # Report template library
-│   │   ├── 社会公共热点事件分析.md
-│   │   ├── 商业品牌舆情监测.md
-│   │   └── ...                    # More templates
-│   └── flask_interface.py         # Flask API interface
-├── ForumEngine/                   # Forum engine simple implementation
-│   ├── monitor.py                 # Log monitoring and forum management
-│   └── llm_host.py                # Forum host LLM module
-├── MindSpider/                    # Weibo crawler system
-│   ├── main.py                    # Crawler main program
-│   ├── config.py                  # Crawler configuration file
-│   ├── BroadTopicExtraction/      # Topic extraction module
-│   │   ├── database_manager.py    # Database manager
-│   │   ├── get_today_news.py      # Today's news fetching
-│   │   ├── main.py                # Topic extraction main program
-│   │   └── topic_extractor.py     # Topic extractor
-│   ├── DeepSentimentCrawling/     # Deep sentiment crawling
-│   │   ├── keyword_manager.py     # Keyword manager
-│   │   ├── main.py                # Deep crawling main program
-│   │   ├── MediaCrawler/          # Media crawler core
-│   │   └── platform_crawler.py    # Platform crawler management
-│   └── schema/                    # Database schema
-│       ├── db_manager.py          # Database manager
-│       ├── init_database.py       # Database initialization
-│       └── mindspider_tables.sql  # Database table structure
-├── SentimentAnalysisModel/        # Sentiment analysis model collection
-│   ├── WeiboSentiment_Finetuned/  # Fine-tuned BERT/GPT-2 models
-│   ├── WeiboMultilingualSentiment/# Multilingual sentiment analysis (recommended)
-│   ├── WeiboSentiment_SmallQwen/  # Small parameter Qwen3 fine-tuning
-│   └── WeiboSentiment_MachineLearning/ # Traditional machine learning methods
-├── SingleEngineApp/               # Individual Agent Streamlit applications
-│   ├── query_engine_streamlit_app.py
-│   ├── media_engine_streamlit_app.py
-│   └── insight_engine_streamlit_app.py
-├── templates/                     # Flask templates
-│   └── index.html                 # Main interface frontend
-├── static/                        # Static resources
-├── logs/                          # Runtime log directory
-├── final_reports/                 # Final generated HTML report files
-├── utils/                         # Common utility functions
-│   ├── forum_reader.py            # Agent forum communication
-│   └── retry_helper.py            # Network request retry mechanism tool
-├── app.py                         # Flask main application entry
-├── config.py                      # Global configuration file
-└── requirements.txt               # Python dependency list
+├── QueryEngine/                            # Domestic and international news breadth search Agent
+│   ├── agent.py                            # Agent main logic, coordinates search and analysis workflow
+│   ├── llms/                               # LLM interface wrapper
+│   ├── nodes/                              # Processing nodes: search, formatting, summarization, etc.
+│   ├── tools/                              # Domestic and international news search toolkit
+│   ├── utils/                              # Utility functions
+│   ├── state/                              # State management
+│   ├── prompts/                            # Prompt templates
+│   └── ...
+├── MediaEngine/                            # Powerful multimodal understanding Agent
+│   ├── agent.py                            # Agent main logic, handles video/image multimodal content
+│   ├── llms/                               # LLM interface wrapper
+│   ├── nodes/                              # Processing nodes: search, formatting, summarization, etc.
+│   ├── tools/                              # Multimodal search toolkit
+│   ├── utils/                              # Utility functions
+│   ├── state/                              # State management
+│   ├── prompts/                            # Prompt templates
+│   └── ...
+├── InsightEngine/                          # Private database mining Agent
+│   ├── agent.py                            # Agent main logic, coordinates database queries and analysis
+│   ├── llms/                               # LLM interface wrapper
+│   │   └── base.py                         # Unified OpenAI-compatible client
+│   ├── nodes/                              # Processing nodes: search, formatting, summarization, etc.
+│   │   ├── base_node.py                    # Base node class
+│   │   ├── search_node.py                  # Search node
+│   │   ├── formatting_node.py              # Formatting node
+│   │   ├── report_structure_node.py        # Report structure node
+│   │   └── summary_node.py                 # Summary node
+│   ├── tools/                              # Database query and analysis toolkit
+│   │   ├── keyword_optimizer.py            # Qwen keyword optimization middleware
+│   │   ├── search.py                       # Database operation toolkit (topic search, comment retrieval, etc.)
+│   │   └── sentiment_analyzer.py           # Sentiment analysis integration tool
+│   ├── utils/                              # Utility functions
+│   │   ├── config.py                       # Configuration management
+│   │   ├── db.py                           # SQLAlchemy async engine + read-only query wrapper
+│   │   └── text_processing.py              # Text processing utilities
+│   ├── state/                              # State management
+│   │   └── state.py                        # Agent state definition
+│   ├── prompts/                            # Prompt templates
+│   │   └── prompts.py                      # Various prompt templates
+│   └── __init__.py
+├── ReportEngine/                           # Multi-round report generation Agent
+│   ├── agent.py                            # Master orchestrator: template selection → layout → budget → chapter → render
+│   ├── flask_interface.py                  # Flask/SSE entry point, manages task queuing and streaming events
+│   ├── llms/                               # OpenAI-compatible LLM wrappers
+│   │   └── base.py                         # Unified streaming/retry client
+│   ├── core/                               # Core functionalities: template parsing, chapter storage, document stitching
+│   │   ├── template_parser.py              # Markdown template slicer and slug generator
+│   │   ├── chapter_storage.py              # Chapter run directory, manifest, and raw stream writer
+│   │   └── stitcher.py                     # Document IR stitcher, adds anchors/metadata
+│   ├── ir/                                 # Report Intermediate Representation (IR) contract & validation
+│   │   ├── schema.py                       # Block/mark schema constant definitions
+│   │   └── validator.py                    # Chapter JSON structure validator
+│   ├── nodes/                              # Full workflow reasoning nodes
+│   │   ├── base_node.py                    # Node base class + logging/state hooks
+│   │   ├── template_selection_node.py      # Template candidate collection and LLM selection
+│   │   ├── document_layout_node.py         # Title/TOC/theme designer
+│   │   ├── word_budget_node.py             # Word budget planning and chapter directive generation
+│   │   └── chapter_generation_node.py      # Chapter-level JSON generation + validation
+│   ├── prompts/                            # Prompt library and schema descriptions
+│   │   └── prompts.py                      # Template selection/layout/budget/chapter prompts
+│   ├── renderers/                          # IR renderers
+│   │   ├── html_renderer.py                # Document IR→interactive HTML
+│   │   ├── pdf_renderer.py                 # HTML→PDF export (WeasyPrint)
+│   │   ├── pdf_layout_optimizer.py         # PDF layout optimizer
+│   │   └── chart_to_svg.py                 # Chart to SVG conversion tool
+│   ├── state/                              # Task/metadata state models
+│   │   └── state.py                        # ReportState and serialization utilities
+│   ├── utils/                              # Configuration and helper utilities
+│   │   ├── config.py                       # Pydantic settings + printer helper
+│   │   ├── dependency_check.py             # Dependency checking tool
+│   │   ├── json_parser.py                  # JSON parsing utilities
+│   │   ├── chart_validator.py              # Chart validation tool
+│   │   └── chart_repair_api.py             # Chart repair API
+│   ├── report_template/                    # Markdown template library
+│   │   ├── 企业品牌声誉分析报告.md
+│   │   └── ...
+│   └── __init__.py
+├── ForumEngine/                            # Forum engine: Agent collaboration mechanism
+│   ├── monitor.py                          # Log monitoring and forum management core
+│   ├── llm_host.py                         # Forum moderator LLM module
+│   └── __init__.py
+├── MindSpider/                             # Social media crawler system
+│   ├── main.py                             # Crawler main program entry
+│   ├── config.py                           # Crawler configuration file
+│   ├── BroadTopicExtraction/               # Topic extraction module
+│   │   ├── main.py                         # Topic extraction main program
+│   │   ├── database_manager.py             # Database manager
+│   │   ├── get_today_news.py               # Today's news fetcher
+│   │   └── topic_extractor.py              # Topic extractor
+│   ├── DeepSentimentCrawling/              # Deep sentiment crawling module
+│   │   ├── main.py                         # Deep crawling main program
+│   │   ├── keyword_manager.py              # Keyword manager
+│   │   ├── platform_crawler.py             # Platform crawler manager
+│   │   └── MediaCrawler/                   # Media crawler core
+│   │       ├── main.py
+│   │       ├── config/                     # Platform configurations
+│   │       ├── media_platform/             # Platform crawler implementations
+│   │       └── ...
+│   └── schema/                             # Database schema definitions
+│       ├── db_manager.py                   # Database manager
+│       ├── init_database.py                # Database initialization script
+│       ├── mindspider_tables.sql           # Database table structure SQL
+│       ├── models_bigdata.py               # SQLAlchemy mappings for large-scale media opinion tables
+│       └── models_sa.py                    # ORM models for DailyTopic/Task extension tables
+├── SentimentAnalysisModel/                 # Sentiment analysis model collection
+│   ├── WeiboSentiment_Finetuned/           # Fine-tuned BERT/GPT-2 models
+│   │   ├── BertChinese-Lora/               # BERT Chinese LoRA fine-tuning
+│   │   │   ├── train.py
+│   │   │   ├── predict.py
+│   │   │   └── ...
+│   │   └── GPT2-Lora/                      # GPT-2 LoRA fine-tuning
+│   │       ├── train.py
+│   │       ├── predict.py
+│   │       └── ...
+│   ├── WeiboMultilingualSentiment/         # Multilingual sentiment analysis
+│   │   ├── train.py
+│   │   ├── predict.py
+│   │   └── ...
+│   ├── WeiboSentiment_SmallQwen/           # Small parameter Qwen3 fine-tuning
+│   │   ├── train.py
+│   │   ├── predict_universal.py
+│   │   └── ...
+│   └── WeiboSentiment_MachineLearning/     # Traditional machine learning methods
+│       ├── train.py
+│       ├── predict.py
+│       └── ...
+├── SingleEngineApp/                        # Individual Agent Streamlit applications
+│   ├── query_engine_streamlit_app.py       # QueryEngine standalone app
+│   ├── media_engine_streamlit_app.py       # MediaEngine standalone app
+│   └── insight_engine_streamlit_app.py     # InsightEngine standalone app
+├── query_engine_streamlit_reports/         # QueryEngine standalone app outputs
+├── media_engine_streamlit_reports/         # MediaEngine standalone app outputs
+├── insight_engine_streamlit_reports/       # InsightEngine standalone app outputs
+├── templates/                              # Flask frontend templates
+│   └── index.html                          # Main interface HTML
+├── static/                                 # Static resources
+│   └── image/                              # Image resources
+│       ├── logo_compressed.png
+│       ├── framework.png
+│       └── ...
+├── logs/                                   # Runtime log directory
+├── final_reports/                          # Final generated report files
+│   ├── ir/                                 # Report IR JSON files
+│   └── *.html                              # Final HTML reports
+├── utils/                                  # Common utility functions
+│   ├── forum_reader.py                     # Agent inter-communication forum tool
+│   ├── github_issues.py                    # Unified GitHub issue link generator and error formatter
+│   └── retry_helper.py                     # Network request retry mechanism utility
+├── tests/                                  # Unit tests and integration tests
+│   ├── run_tests.py                        # pytest entry script
+│   ├── test_monitor.py                     # ForumEngine monitoring unit tests
+│   ├── test_report_engine_sanitization.py  # ReportEngine security tests
+│   └── ...
+├── app.py                                  # Flask main application entry point
+├── config.py                               # Global configuration file
+├── .env.example                            # Environment variable example file
+├── docker-compose.yml                      # Docker multi-service orchestration config
+├── Dockerfile                              # Docker image build file
+├── requirements.txt                        # Python dependency list
+├── regenerate_latest_pdf.py                # PDF regeneration utility script
+├── report_engine_only.py                   # Report Engine CLI version
+├── README.md                               # Chinese documentation
+├── README-EN.md                            # English documentation
+├── CONTRIBUTING.md                         # Chinese contribution guide
+├── CONTRIBUTING-EN.md                      # English contribution guide
+└── LICENSE                                 # GPL-2.0 open source license
 ```

 ## 🚀 Quick Start (Docker)
@@ -276,7 +355,13 @@ conda activate your_conda_name
 uv venv --python 3.11 # Create Python 3.11 environment
 ```

-### 2. Install Dependencies
+### 2. Install System Dependencies for PDF Export (Optional)
+
+For detailed configuration instructions, see [Configure the dependencies](./static/Partial%20README%20for%20PDF%20Exporting/README-EN.md).
+
+### 3. Install Dependencies
+
+> If Step 2 is skipped, the WeasyPrint library may not install correctly, and the PDF functionality may be unavailable.

 ```bash
 # Basic dependency installation
@@ -287,14 +372,14 @@ uv pip install -r requirements.txt
 # If you do not want to use the local sentiment analysis model (which has low computational requirements and defaults to the CPU version), you can comment out the 'Machine Learning' section in this file before executing the command.
 ```

-### 3. Install Playwright Browser Drivers
+### 4. Install Playwright Browser Drivers

 ```bash
 # Install browser drivers (for crawler functionality)
 playwright install chromium
 ```

-### 4. Configure LLM and Database
+### 5. Configure LLM and Database

 Copy the `.env.example` file in the project root directory and rename it to `.env`.

@@ -318,23 +403,22 @@ DB_CHARSET=utf8mb4
 DB_DIALECT=postgresql
 # Database initialization is not required, as it will be checked automatically upon executing app.py

-# LLM configuration
+# ====================== LLM Configuration ======================
 # You can switch each Engine's LLM provider as long as it follows the OpenAI-compatible request format
+# The configuration file provides recommended LLMs for each Agent. For initial deployment, please refer to the recommended settings first

 # Insight Agent
 INSIGHT_ENGINE_API_KEY=
-# Insight Agent LLM API BaseUrl, customize API provider
 INSIGHT_ENGINE_BASE_URL=
-# Insight Agent LLM Model Name, e.g., kimi-k2-0711-preview
 INSIGHT_ENGINE_MODEL_NAME=
+
 # Media Agent
 ...
 ```
-Recommended LLM API Provider: [aihubmix](https://aihubmix.com/?aff=8Ds9)

-### 5. Launch System
+### 6. Launch System

-#### 5.1 Complete System Launch (Recommended)
+#### 6.1 Complete System Launch (Recommended)

 ```bash
 # In project root directory, activate conda environment
@@ -357,11 +441,9 @@ python app.py

 > Note 2: Data scraping needs to be performed as a separate operation. Please refer to the instructions in section 6.3.

-> Note 3: If page display issues occur during remote server deployment, see [PR#45](https://github.com/666ghj/BettaFish/pull/45)
-
 Visit http://localhost:5000 to use the complete system

-#### 5.2 Launch Individual Agents
+#### 6.2 Launch Individual Agents

 ```bash
 # Start QueryEngine
@@ -374,7 +456,7 @@ streamlit run SingleEngineApp/media_engine_streamlit_app.py --server.port 8502
 streamlit run SingleEngineApp/insight_engine_streamlit_app.py --server.port 8501
 ```

-#### 5.3 Crawler System Standalone Use
+#### 6.3 Crawler System Standalone Use

 This section has detailed configuration documentation: [MindSpider Usage Guide](./MindSpider/README.md)

@@ -404,6 +486,44 @@ python main.py --broad-topic --date 2024-01-20
 python main.py --deep-sentiment --platforms xhs dy wb
 ```

+#### 6.4 Command-line Report Generation Tool
+
+If you don't need the Web interface, you can use the command-line tool to generate reports directly. This tool automatically retrieves the latest report files from the three analysis engines, skips file addition verification, and directly generates comprehensive reports.
+
+```bash
+# Basic usage (automatically extract topic from filename)
+python report_engine_only.py
+
+# Specify report topic
+python report_engine_only.py --query "Civil Engineering Industry Analysis"
+
+# Skip PDF generation (even if system supports it)
+python report_engine_only.py --skip-pdf
+
+# Show verbose logging
+python report_engine_only.py --verbose
+
+# Show help information
+python report_engine_only.py --help
+```
+
+**Features:**
+
+1. **Automatic Dependency Check**: The program automatically checks system dependencies required for PDF generation and provides installation instructions if missing
+2. **Get Latest Files**: Automatically retrieves the latest analysis reports from three engine directories (`insight_engine_streamlit_reports`, `media_engine_streamlit_reports`, `query_engine_streamlit_reports`)
+3. **File Confirmation**: Displays all selected file names, paths, and modification times, waiting for user confirmation (default input `y` to continue, input `n` to exit)
+4. **Direct Report Generation**: Skips file addition verification and directly calls Report Engine to generate comprehensive reports
+5. **Automatic File Saving**:
+   - HTML reports saved to `final_reports/` directory
+   - PDF reports (if dependencies available) saved to `final_reports/pdf/` directory
+   - File naming format: `final_report_{topic}_{timestamp}.html/pdf`
+
+**Notes:**
+
+- Ensure at least one of the three engine directories contains `.md` report files
+- The command-line tool and the Web interface are independent and do not interfere with each other
+- PDF generation requires system dependencies; see the "Install System Dependencies for PDF Export" section above
+
 ## ⚙️ Advanced Configuration (Deprecated: Configuration has been unified to the `.env` file in the project root directory, and other sub-agents automatically inherit the root directory configuration)

 ### Modify Key Parameters
@@ -654,6 +774,13 @@ Thanks to these excellent contributors:

 [![Contributors](https://contrib.rocks/image?repo=666ghj/BettaFish)](https://github.com/666ghj/BettaFish/graphs/contributors)

+## 🌟 Join Our Official Community
+
+<div align="center">
+  <img src="https://capsule-render.vercel.app/api?type=waving&color=gradient&height=200&section=header&text=Welcome%20to%20Our%20QQ%20Group!&fontSize=40&fontAlignY=35&desc=Scan%20to%20Join%20Our%20Community&descAlignY=55" alt="Welcome to Our QQ Group!" style="width:60%; max-width:900px; display:block; margin:0 auto;">
+  <img src="static/image/QQ_Light_Horizenal.png" alt="BettaFish QQ Group QR Code" style="width:60%; max-width:360px; display:block; margin:20px auto 0;">
+</div>
+
 ## 📈 Project Statistics

 <a href="https://www.star-history.com/#666ghj/BettaFish&type=date&legend=top-left">
@@ -26,13 +26,6 @@

 </div>

-## 🌟 加入官方交流群
-
-<div align="center">
-  <img src="https://capsule-render.vercel.app/api?type=waving&color=gradient&height=200&section=header&text=欢迎加入我们的技术交流QQ群！&fontSize=40&fontAlignY=35&desc=扫描下方二维码加入群聊&descAlignY=55" alt="欢迎加入我们的技术交流QQ群！" style="width:60%; max-width:900px; display:block; margin:0 auto;">
-  <img src="static/image/QQ_Light_Horizenal.png" alt="BettaFish 技术交流群二维码" style="width:60%; max-width:360px; display:block; margin:20px auto 0;">
-</div>
-
 ## ⚡ 项目概述

 “**微舆**” 是一个从0实现的创新型 多智能体 舆情分析系统，帮助大家破除信息茧房，还原舆情原貌，预测未来走向，辅助决策。用户只需像聊天一样提出分析需求，智能体开始全自动分析 国内外30+主流社媒 与 数百万条大众评论。
@@ -121,104 +114,183 @@ LLM模型API赞助：<a href="https://aihubmix.com/?aff=8Ds9" target="_blank"><i
 | 4 | 策略制定 | 基于初步结果制定分块研究策略 | 各Agent内部决策模块 | - |
 | 5-N | **循环阶段** | **论坛协作 + 深度研究** | **ForumEngine + 所有Agent** | **多轮循环** |
 | 5.1 | 深度研究 | 各Agent基于论坛主持人引导进行专项搜索 | 各Agent + 反思机制 + 论坛引导 | 每轮循环 |
-| 5.2 | 论坛协作 | ForumEngine监控Agent发言并生成主持人总结 | ForumEngine + LLM主持人 | 每轮循环 |
+| 5.2 | 论坛协作 | ForumEngine监控Agent发言并生成主持人引导 | ForumEngine + LLM主持人 | 每轮循环 |
 | 5.3 | 交流融合 | 各Agent根据讨论调整研究方向 | 各Agent + forum_reader工具 | 每轮循环 |
 | N+1 | 结果整合 | Report Agent收集所有分析结果和论坛内容 | Report Agent | - |
-| N+2 | 报告生成 | 动态选择模板和样式，多轮生成最终报告 | Report Agent + 模板引擎 | - |
+| N+2 | IR中间表示 | 动态选择模板和样式，多轮生成元数据，装订为IR中间表示 | Report Agent + 模板引擎 | - |
+| N+3 | 报告生成 | 分块进行质量检测，基于IR渲染成交互式 HTML 报告 | Report Agent + 装订引擎 | - |

 ### 项目代码结构树

 ```
 BettaFish/
-├── QueryEngine/                   # 国内外新闻广度搜索Agent
-│   ├── agent.py                   # Agent主逻辑
-│   ├── llms/                      # LLM接口封装
-│   ├── nodes/                     # 处理节点
-│   ├── tools/                     # 搜索工具
-│   ├── utils/                     # 工具函数
-│   └── ...                        # 其他模块
-├── MediaEngine/                   # 强大的多模态理解Agent
-│   ├── agent.py                   # Agent主逻辑
-│   ├── nodes/                     # 处理节点
-│   ├── llms/                      # LLM接口
-│   ├── tools/                     # 搜索工具
-│   ├── utils/                     # 工具函数
-│   └── ...                        # 其他模块
-├── InsightEngine/                 # 私有数据库挖掘Agent
-│   ├── agent.py                   # Agent主逻辑
-│   ├── llms/                      # LLM接口封装
-│   │   └── base.py                # 统一的 OpenAI 兼容客户端
-│   ├── nodes/                     # 处理节点
-│   │   ├── base_node.py           # 基础节点类
-│   │   ├── formatting_node.py     # 格式化节点
-│   │   ├── report_structure_node.py # 报告结构节点
-│   │   ├── search_node.py         # 搜索节点
-│   │   └── summary_node.py        # 总结节点
-│   ├── tools/                     # 数据库查询和分析工具
-│   │   ├── keyword_optimizer.py   # Qwen关键词优化中间件
-│   │   ├── search.py              # 数据库操作工具集
-│   │   └── sentiment_analyzer.py  # 情感分析集成工具
-│   ├── state/                     # 状态管理
-│   │   ├── __init__.py
-│   │   └── state.py               # Agent状态定义
-│   ├── prompts/                   # 提示词模板
-│   │   ├── __init__.py
-│   │   └── prompts.py             # 各类提示词
-│   └── utils/                     # 工具函数
-│       ├── __init__.py
-│       ├── config.py              # 配置管理
-│       └── text_processing.py     # 文本处理工具
-├── ReportEngine/                  # 多轮报告生成Agent
-│   ├── agent.py                   # Agent主逻辑
-│   ├── llms/                      # LLM接口
-│   ├── nodes/                     # 报告生成节点
-│   │   ├── template_selection.py  # 模板选择节点
-│   │   └── html_generation.py     # HTML生成节点
-│   ├── report_template/           # 报告模板库
-│   │   ├── 社会公共热点事件分析.md
-│   │   ├── 商业品牌舆情监测.md
-│   │   └── ...                    # 更多模板
-│   └── flask_interface.py         # Flask API接口
-├── ForumEngine/                   # 论坛引擎简易实现
-│   ├── monitor.py                 # 日志监控和论坛管理
-│   └── llm_host.py                # 论坛主持人LLM模块
-├── MindSpider/                    # 微博爬虫系统
-│   ├── main.py                    # 爬虫主程序
-│   ├── config.py                  # 爬虫配置文件
-│   ├── BroadTopicExtraction/      # 话题提取模块
-│   │   ├── database_manager.py    # 数据库管理器
-│   │   ├── get_today_news.py      # 今日新闻获取
-│   │   ├── main.py                # 话题提取主程序
-│   │   └── topic_extractor.py     # 话题提取器
-│   ├── DeepSentimentCrawling/     # 深度舆情爬取
-│   │   ├── keyword_manager.py     # 关键词管理器
-│   │   ├── main.py                # 深度爬取主程序
-│   │   ├── MediaCrawler/          # 媒体爬虫核心
-│   │   └── platform_crawler.py    # 平台爬虫管理
-│   └── schema/                    # 数据库结构
-│       ├── db_manager.py          # 数据库管理器
-│       ├── init_database.py       # 数据库初始化
-│       └── mindspider_tables.sql  # 数据库表结构
-├── SentimentAnalysisModel/        # 情感分析模型集合
-│   ├── WeiboSentiment_Finetuned/  # 微调BERT/GPT-2模型
-│   ├── WeiboMultilingualSentiment/# 多语言情感分析（推荐）
-│   ├── WeiboSentiment_SmallQwen/  # 小参数Qwen3微调
-│   └── WeiboSentiment_MachineLearning/ # 传统机器学习方法
-├── SingleEngineApp/               # 单独Agent的Streamlit应用
-│   ├── query_engine_streamlit_app.py
-│   ├── media_engine_streamlit_app.py
-│   └── insight_engine_streamlit_app.py
-├── templates/                     # Flask模板
-│   └── index.html                 # 主界面前端
-├── static/                        # 静态资源
-├── logs/                          # 运行日志目录
-├── final_reports/                 # 最终生成的HTML报告文件
-├── utils/                         # 通用工具函数
-│   ├── forum_reader.py            # Agent间论坛通信
-│   └── retry_helper.py            # 网络请求重试机制工具
-├── app.py                         # Flask主应用入口
-├── config.py                      # 全局配置文件
-└── requirements.txt               # Python依赖包清单
+├── QueryEngine/                            # 国内外新闻广度搜索Agent
+│   ├── agent.py                            # Agent主逻辑，协调搜索与分析流程
+│   ├── llms/                               # LLM接口封装
+│   ├── nodes/                              # 处理节点：搜索、格式化、总结等
+│   ├── tools/                              # 国内外新闻搜索工具集
+│   ├── utils/                              # 工具函数
+│   ├── state/                              # 状态管理
+│   ├── prompts/                            # 提示词模板
+│   └── ...
+├── MediaEngine/                            # 强大的多模态理解Agent
+│   ├── agent.py                            # Agent主逻辑，处理视频/图片等多模态内容
+│   ├── llms/                               # LLM接口封装
+│   ├── nodes/                              # 处理节点：搜索、格式化、总结等
+│   ├── tools/                              # 多模态搜索工具集
+│   ├── utils/                              # 工具函数
+│   ├── state/                              # 状态管理
+│   ├── prompts/                            # 提示词模板
+│   └── ...
+├── InsightEngine/                          # 私有数据库挖掘Agent
+│   ├── agent.py                            # Agent主逻辑，协调数据库查询与分析
+│   ├── llms/                               # LLM接口封装
+│   │   └── base.py                         # 统一的OpenAI兼容客户端
+│   ├── nodes/                              # 处理节点：搜索、格式化、总结等
+│   │   ├── base_node.py                    # 基础节点类
+│   │   ├── search_node.py                  # 搜索节点
+│   │   ├── formatting_node.py              # 格式化节点
+│   │   ├── report_structure_node.py        # 报告结构节点
+│   │   └── summary_node.py                 # 总结节点
+│   ├── tools/                              # 数据库查询和分析工具集
+│   │   ├── keyword_optimizer.py            # Qwen关键词优化中间件
+│   │   ├── search.py                       # 数据库操作工具集（话题搜索、评论获取等）
+│   │   └── sentiment_analyzer.py           # 情感分析集成工具
+│   ├── utils/                              # 工具函数
+│   │   ├── config.py                       # 配置管理
+│   │   ├── db.py                           # SQLAlchemy异步引擎与只读查询封装
+│   │   └── text_processing.py              # 文本处理工具
+│   ├── state/                              # 状态管理
+│   │   └── state.py                        # Agent状态定义
+│   ├── prompts/                            # 提示词模板
+│   │   └── prompts.py                      # 各类提示词
+│   └── __init__.py
+├── ReportEngine/                           # 多轮报告生成Agent
+│   ├── agent.py                            # 总调度器：模板选择→布局→篇幅→章节→渲染
+│   ├── flask_interface.py                  # Flask/SSE入口，管理任务排队与流式事件
+│   ├── llms/                               # OpenAI兼容LLM封装
+│   │   └── base.py                         # 统一的流式/重试客户端
+│   ├── core/                               # 核心功能：模板解析、章节存储、文档装订
+│   │   ├── template_parser.py              # Markdown模板切片与slug生成
+│   │   ├── chapter_storage.py              # 章节run目录、manifest与raw流写入
+│   │   └── stitcher.py                     # Document IR装订器，补齐锚点/元数据
+│   ├── ir/                                 # 报告中间表示（IR）契约与校验
+│   │   ├── schema.py                       # 块/标记Schema常量定义
+│   │   └── validator.py                    # 章节JSON结构校验器
+│   ├── nodes/                              # 全流程推理节点
+│   │   ├── base_node.py                    # 节点基类+日志/状态钩子
+│   │   ├── template_selection_node.py      # 模板候选收集与LLM筛选
+│   │   ├── document_layout_node.py         # 标题/目录/主题设计
+│   │   ├── word_budget_node.py             # 篇幅规划与章节指令生成
+│   │   └── chapter_generation_node.py      # 章节级JSON生成+校验
+│   ├── prompts/                            # 提示词库与Schema说明
+│   │   └── prompts.py                      # 模板选择/布局/篇幅/章节提示词
+│   ├── renderers/                          # IR渲染器
+│   │   ├── html_renderer.py                # Document IR→交互式HTML
+│   │   ├── pdf_renderer.py                 # HTML→PDF导出（WeasyPrint）
+│   │   ├── pdf_layout_optimizer.py         # PDF布局优化器
+│   │   └── chart_to_svg.py                 # 图表转SVG工具
+│   ├── state/                              # 任务/元数据状态模型
+│   │   └── state.py                        # ReportState与序列化工具
+│   ├── utils/                              # 配置与辅助工具
+│   │   ├── config.py                       # Pydantic Settings与打印助手
+│   │   ├── dependency_check.py             # 依赖检查工具
+│   │   ├── json_parser.py                  # JSON解析工具
+│   │   ├── chart_validator.py              # 图表校验工具
+│   │   └── chart_repair_api.py             # 图表修复API
+│   ├── report_template/                    # Markdown模板库
+│   │   ├── 企业品牌声誉分析报告.md
+│   │   └── ...
+│   └── __init__.py
+├── ForumEngine/                            # 论坛引擎：Agent协作机制
+│   ├── monitor.py                          # 日志监控和论坛管理核心
+│   ├── llm_host.py                         # 论坛主持人LLM模块
+│   └── __init__.py
+├── MindSpider/                             # 社交媒体爬虫系统
+│   ├── main.py                             # 爬虫主程序入口
+│   ├── config.py                           # 爬虫配置文件
+│   ├── BroadTopicExtraction/               # 话题提取模块
+│   │   ├── main.py                         # 话题提取主程序
+│   │   ├── database_manager.py             # 数据库管理器
+│   │   ├── get_today_news.py               # 今日新闻获取
+│   │   └── topic_extractor.py              # 话题提取器
+│   ├── DeepSentimentCrawling/              # 深度舆情爬取模块
+│   │   ├── main.py                         # 深度爬取主程序
+│   │   ├── keyword_manager.py              # 关键词管理器
+│   │   ├── platform_crawler.py             # 平台爬虫管理
+│   │   └── MediaCrawler/                   # 社媒爬虫核心
+│   │       ├── main.py
+│   │       ├── config/                     # 各平台配置
+│   │       ├── media_platform/             # 各平台爬虫实现
+│   │       └── ...
+│   └── schema/                             # 数据库结构定义
+│       ├── db_manager.py                   # 数据库管理器
+│       ├── init_database.py                # 数据库初始化脚本
+│       ├── mindspider_tables.sql           # 数据库表结构SQL
+│       ├── models_bigdata.py               # 大规模媒体舆情表的SQLAlchemy映射
+│       └── models_sa.py                    # DailyTopic/Task等扩展表ORM模型
+├── SentimentAnalysisModel/                 # 情感分析模型集合
+│   ├── WeiboSentiment_Finetuned/           # 微调BERT/GPT-2模型
+│   │   ├── BertChinese-Lora/               # BERT中文LoRA微调
+│   │   │   ├── train.py
+│   │   │   ├── predict.py
+│   │   │   └── ...
+│   │   └── GPT2-Lora/                      # GPT-2 LoRA微调
+│   │       ├── train.py
+│   │       ├── predict.py
+│   │       └── ...
+│   ├── WeiboMultilingualSentiment/         # 多语言情感分析
+│   │   ├── train.py
+│   │   ├── predict.py
+│   │   └── ...
+│   ├── WeiboSentiment_SmallQwen/           # 小参数Qwen3微调
+│   │   ├── train.py
+│   │   ├── predict_universal.py
+│   │   └── ...
+│   └── WeiboSentiment_MachineLearning/     # 传统机器学习方法
+│       ├── train.py
+│       ├── predict.py
+│       └── ...
+├── SingleEngineApp/                        # 单独Agent的Streamlit应用
+│   ├── query_engine_streamlit_app.py       # QueryEngine独立应用
+│   ├── media_engine_streamlit_app.py       # MediaEngine独立应用
+│   └── insight_engine_streamlit_app.py     # InsightEngine独立应用
+├── query_engine_streamlit_reports/         # QueryEngine单应用运行输出
+├── media_engine_streamlit_reports/         # MediaEngine单应用运行输出
+├── insight_engine_streamlit_reports/       # InsightEngine单应用运行输出
+├── templates/                              # Flask前端模板
+│   └── index.html                          # 主界面HTML
+├── static/                                 # 静态资源
+│   └── image/                              # 图片资源
+│       ├── logo_compressed.png
+│       ├── framework.png
+│       └── ...
+├── logs/                                   # 运行日志目录
+├── final_reports/                          # 最终生成的报告文件
+│   ├── ir/                                 # 报告IR JSON文件
+│   └── *.html                              # 最终HTML报告
+├── utils/                                  # 通用工具函数
+│   ├── forum_reader.py                     # Agent间论坛通信工具
+│   ├── github_issues.py                    # 统一生成GitHub Issue链接与错误提示
+│   └── retry_helper.py                     # 网络请求重试机制工具
+├── tests/                                  # 单元测试与集成测试
+│   ├── run_tests.py                        # pytest入口脚本
+│   ├── test_monitor.py                     # ForumEngine监控单元测试
+│   ├── test_report_engine_sanitization.py  # ReportEngine安全性测试
+│   └── ...
+├── app.py                                  # Flask主应用入口
+├── config.py                               # 全局配置文件
+├── .env.example                            # 环境变量示例文件
+├── docker-compose.yml                      # Docker多服务编排配置
+├── Dockerfile                              # Docker镜像构建文件
+├── requirements.txt                        # Python依赖包清单
+├── regenerate_latest_pdf.py                # PDF重新生成工具脚本
+├── report_engine_only.py                   # Report Engine命令行版本
+├── README.md                               # 中文说明文档
+├── README-EN.md                            # 英文说明文档
+├── CONTRIBUTING.md                         # 中文贡献指南
+├── CONTRIBUTING-EN.md                      # 英文贡献指南
+└── LICENSE                                 # GPL-2.0开源许可证
 ```

 ## 🚀 快速开始（Docker）
@@ -287,7 +359,13 @@ conda activate your_conda_name
 uv venv --python 3.11 # 创建3.11环境
 ```

-### 2. 安装依赖包
+### 2. 安装 PDF 导出所需系统依赖（可选）
+
+这部分有详细的配置说明：[配置所需依赖](./static/Partial%20README%20for%20PDF%20Exporting/README.md)
+
+### 3. 安装依赖包
+
+> 如果跳过了步骤2，weasyprint库可能无法安装，PDF功能可能无法正常使用。

 ```bash
 # 基础依赖安装
@@ -295,17 +373,17 @@ pip install -r requirements.txt

 # uv版本命令（更快速安装）
 uv pip install -r requirements.txt
-# 如果不想使用本地情感分析模型（算力需求很小，默认安装cpu版本），可以将该文件中的“机器学习”部分注释掉再执行指令
+# 如果不想使用本地情感分析模型（算力需求很小，默认安装cpu版本），可以将该文件中的"机器学习"部分注释掉再执行指令
 ```

-### 3. 安装Playwright浏览器驱动
+### 4. 安装Playwright浏览器驱动

 ```bash
 # 安装浏览器驱动（用于爬虫功能）
 playwright install chromium
 ```

-### 4. 配置LLM与数据库
+### 5. 配置LLM与数据库

 复制一份项目根目录 `.env.example` 文件，命名为 `.env`

@@ -331,22 +409,20 @@ DB_DIALECT=postgresql

 # ====================== LLM配置 ======================
 # 您可以更改每个部分LLM使用的API，只要兼容OpenAI请求格式都可以
+# 配置文件内部给了每一个Agent的推荐LLM，初次部署请先参考推荐设置

 # Insight Agent
 INSIGHT_ENGINE_API_KEY=
-# Insight Agent LLM接口BaseUrl，可自定义厂商API
 INSIGHT_ENGINE_BASE_URL=
-# Insight Agent LLM模型名称，如kimi-k2-0711-preview
 INSIGHT_ENGINE_MODEL_NAME=

 # Media Agent
 ...
 ```
-推荐LLM API供应商：[推理时代](https://aihubmix.com/?aff=8Ds9)

-### 5. 启动系统
+### 6. 启动系统

-#### 5.1 完整系统启动（推荐）
+#### 6.1 完整系统启动（推荐）

 ```bash
 # 在项目根目录下，激活conda环境
@@ -367,13 +443,11 @@ python app.py

 > 注1：一次运行终止后，streamlit app可能结束异常仍然占用端口，此时搜索占用端口的进程kill掉即可

-> 注2：数据爬取需要单独操作，见5.3指引
-
-> 注3：如果服务器远程部署出现页面显示问题，见[PR#45](https://github.com/666ghj/BettaFish/pull/45)
+> 注2：数据爬取需要单独操作，见6.3指引

 访问 http://localhost:5000 即可使用完整系统

-#### 5.2 单独启动某个Agent
+#### 6.2 单独启动某个Agent

 ```bash
 # 启动QueryEngine
@@ -386,7 +460,7 @@ streamlit run SingleEngineApp/media_engine_streamlit_app.py --server.port 8502
 streamlit run SingleEngineApp/insight_engine_streamlit_app.py --server.port 8501
 ```

-#### 5.3 爬虫系统单独使用
+#### 6.3 爬虫系统单独使用

 这部分有详细的配置文档：[MindSpider使用说明](./MindSpider/README.md)

@@ -416,6 +490,44 @@ python main.py --broad-topic --date 2024-01-20
 python main.py --deep-sentiment --platforms xhs dy wb
 ```

+#### 6.4 命令行报告生成工具
+
+如果您不需要Web界面，可以使用命令行工具直接生成报告。该工具会自动获取三个分析引擎的最新报告文件，跳过文件增加审核，直接生成综合报告。
+
+```bash
+# 基本使用（自动从文件名提取主题）
+python report_engine_only.py
+
+# 指定报告主题
+python report_engine_only.py --query "土木工程行业分析"
+
+# 跳过PDF生成（即使系统支持）
+python report_engine_only.py --skip-pdf
+
+# 显示详细日志
+python report_engine_only.py --verbose
+
+# 查看帮助信息
+python report_engine_only.py --help
+```
+
+**功能说明：**
+
+1. **自动检查依赖**：程序会自动检查PDF生成所需的系统依赖，如果缺失会给出安装提示
+2. **获取最新文件**：自动从三个引擎目录（`insight_engine_streamlit_reports`、`media_engine_streamlit_reports`、`query_engine_streamlit_reports`）获取最新的分析报告
+3. **文件确认**：显示所有选择的文件名、路径和修改时间，等待用户确认（默认输入 `y` 继续，输入 `n` 退出）
+4. **直接生成报告**：跳过文件增加审核程序，直接调用Report Engine生成综合报告
+5. **自动保存文件**：
+   - HTML报告保存到 `final_reports/` 目录
+   - PDF报告（如果有依赖）保存到 `final_reports/pdf/` 目录
+   - 文件命名格式：`final_report_{主题}_{时间戳}.html/pdf`
+
+**注意事项：**
+
+- 确保三个引擎目录中至少有一个包含`.md`报告文件
+- 命令行工具与Web界面相互独立，不会相互影响
+- PDF生成需要安装系统依赖，详见上文"安装 PDF 导出所需系统依赖"部分
+
 ## ⚙️ 高级配置（已过时，已经统一为项目根目录.env文件管理，其他子agent自动继承根目录配置）

 ### 修改关键参数
@@ -664,6 +776,13 @@ class DeepSearchAgent:

 [![Contributors](https://contrib.rocks/image?repo=666ghj/BettaFish)](https://github.com/666ghj/BettaFish/graphs/contributors)

+## 🌟 加入官方交流群
+
+<div align="center">
+  <img src="https://capsule-render.vercel.app/api?type=waving&color=gradient&height=200&section=header&text=欢迎加入我们的技术交流QQ群！&fontSize=40&fontAlignY=35&desc=扫描下方二维码加入群聊&descAlignY=55" alt="欢迎加入我们的技术交流QQ群！" style="width:60%; max-width:900px; display:block; margin:0 auto;">
+  <img src="static/image/QQ_Light_Horizenal.png" alt="BettaFish 技术交流群二维码" style="width:60%; max-width:360px; display:block; margin:20px auto 0;">
+</div>
+
 ## 📈 项目统计

 <a href="https://www.star-history.com/#666ghj/BettaFish&type=date&legend=top-left">
@@ -1,7 +1,8 @@
 """
-Report Engine
-一个智能报告生成AI代理实现
-基于三个子agent的输出和论坛日志生成综合HTML报告
+Report Engine。
+
+一个智能报告生成AI代理实现，聚合 Query/Media/Insight 三个子引擎的
+Markdown 与论坛讨论，最终落地结构化HTML报告。
 """

 from .agent import ReportAgent, create_agent
@@ -0,0 +1,17 @@
+"""
+Report Engine核心工具集合。
+
+该包封装了模板切片、章节存储与章节装订三大基础能力，
+所有上层节点都会复用这些工具保证结构一致。
+"""
+
+from .template_parser import TemplateSection, parse_template_sections
+from .chapter_storage import ChapterStorage
+from .stitcher import DocumentComposer
+
+# Names exported by a star-import of this package; each one is imported above.
+__all__ = [
+    "TemplateSection",
+    "parse_template_sections",
+    "ChapterStorage",
+    "DocumentComposer",
+]
@@ -0,0 +1,290 @@
+"""
+章节JSON的落盘与清单管理。
+
+每一章在流式生成时会立即写入raw文件，完成校验后再写入
+格式化的chapter.json，并在manifest中记录元数据，便于后续装订。
+"""
+
+from __future__ import annotations
+
+import json
+from contextlib import contextmanager
+from dataclasses import dataclass, field
+from datetime import datetime
+from pathlib import Path
+from typing import Dict, Generator, List, Optional
+
+
+@dataclass
+class ChapterRecord:
+    """
+    Metadata entry describing one chapter inside a manifest.
+
+    A record carries the chapter's identity (id, slug, title, order), its
+    current status, the files written for it and any collected errors.
+    `to_dict` turns it into the camelCase mapping stored in `manifest.json`.
+    """
+
+    chapter_id: str
+    slug: str
+    title: str
+    order: int
+    status: str
+    # Mapping of file role -> path string; empty unless the caller supplies one.
+    files: Dict[str, str] = field(default_factory=dict)
+    # Error messages attached to the chapter; empty by default.
+    errors: List[str] = field(default_factory=list)
+    # Defaults to the UTC time the record is created, ISO-8601 plus a "Z" suffix.
+    updated_at: str = field(default_factory=lambda: datetime.utcnow().isoformat() + "Z")
+
+    def to_dict(self) -> Dict[str, object]:
+        """Return the record as a plain dict keyed the way manifest.json expects."""
+        serialized: Dict[str, object] = dict(
+            chapterId=self.chapter_id,
+            slug=self.slug,
+            title=self.title,
+            order=self.order,
+            status=self.status,
+            files=self.files,
+            errors=self.errors,
+            updatedAt=self.updated_at,
+        )
+        return serialized
+
+
+class ChapterStorage:
+    """
+    Writes chapter JSON to disk and maintains the per-run manifest.
+
+    Responsibilities:
+        - create a dedicated run directory and manifest for each report;
+        - write streamed chapter output to `stream.raw` while it is generated;
+        - persist `chapter.json` for a finished chapter and update its
+          status in the manifest.
+    """
+
+    # Path separators plus the characters Windows rejects in file names.
+    # Each one is replaced with "-" when a slug is turned into a folder name.
+    _UNSAFE_SLUG_CHARS = str.maketrans({ch: "-" for ch in ' /\\:*?"<>|'})
+
+    def __init__(self, base_dir: str):
+        """
+        Create the chapter storage.
+
+        Args:
+            base_dir: Root path under which every run directory is created.
+                It is created (including parents) if it does not exist.
+        """
+        self.base_dir = Path(base_dir)
+        self.base_dir.mkdir(parents=True, exist_ok=True)
+        # In-memory manifest cache, keyed by the resolved run directory path.
+        self._manifests: Dict[str, Dict[str, object]] = {}
+
+    # ======== Sessions and manifest ========
+
+    def start_session(self, report_id: str, metadata: Dict[str, object]) -> Path:
+        """
+        Create the run directory and a fresh manifest for one report.
+
+        The manifest (report id, creation time, metadata and an empty
+        chapter list) is cached in memory and written to `manifest.json`.
+        Calling this again for the same `report_id` resets the manifest.
+
+        Args:
+            report_id: Task/report identifier; used as the directory name.
+            metadata: Report-level metadata stored verbatim in the manifest.
+
+        Returns:
+            Path: The run directory.
+        """
+        run_dir = self.base_dir / report_id
+        run_dir.mkdir(parents=True, exist_ok=True)
+        manifest = {
+            "reportId": report_id,
+            "createdAt": datetime.utcnow().isoformat() + "Z",
+            "metadata": metadata,
+            "chapters": [],
+        }
+        self._manifests[self._key(run_dir)] = manifest
+        self._write_manifest(run_dir, manifest)
+        return run_dir
+
+    def begin_chapter(self, run_dir: Path, chapter_meta: Dict[str, object]) -> Path:
+        """
+        Create the chapter folder and register the chapter as "streaming".
+
+        The folder is named `<order>-<slug>` and the path of the raw stream
+        file is recorded in the manifest before anything is written to it.
+
+        Args:
+            run_dir: Run directory returned by `start_session`.
+            chapter_meta: Mapping read for chapterId / title / slug / order.
+
+        Returns:
+            Path: The chapter directory.
+        """
+        slug_value = self._slug_of(chapter_meta)
+        order = int(chapter_meta.get("order", 0))
+        chapter_dir = self._chapter_dir(run_dir, slug_value, order)
+        record = ChapterRecord(
+            chapter_id=str(chapter_meta.get("chapterId")),
+            slug=slug_value,
+            title=str(chapter_meta.get("title")),
+            order=order,
+            status="streaming",
+            files={"raw": str(self._raw_stream_path(chapter_dir).relative_to(run_dir))},
+        )
+        self._upsert_record(run_dir, record)
+        return chapter_dir
+
+    def persist_chapter(
+        self,
+        run_dir: Path,
+        chapter_meta: Dict[str, object],
+        payload: Dict[str, object],
+        errors: Optional[List[str]] = None,
+    ) -> Path:
+        """
+        Write the final chapter JSON and update the chapter's manifest entry.
+
+        The chapter is marked "ready" when `errors` is empty or None and
+        "invalid" otherwise; the error list is stored in the manifest.
+
+        Args:
+            run_dir: Run directory returned by `start_session`.
+            chapter_meta: Mapping read for chapterId / title / slug / order.
+            payload: Chapter JSON to write to `chapter.json`.
+            errors: Optional error messages; a non-empty list marks the
+                chapter as invalid.
+
+        Returns:
+            Path: Path of the written `chapter.json`.
+        """
+        slug_value = self._slug_of(chapter_meta)
+        order = int(chapter_meta.get("order", 0))
+        chapter_dir = self._chapter_dir(run_dir, slug_value, order)
+        final_path = chapter_dir / "chapter.json"
+        final_path.write_text(
+            json.dumps(payload, ensure_ascii=False, indent=2),
+            encoding="utf-8",
+        )
+
+        record = ChapterRecord(
+            chapter_id=str(chapter_meta.get("chapterId")),
+            slug=slug_value,
+            title=str(chapter_meta.get("title")),
+            order=order,
+            status="ready" if not errors else "invalid",
+            files={
+                "raw": str(self._raw_stream_path(chapter_dir).relative_to(run_dir)),
+                "json": str(final_path.relative_to(run_dir)),
+            },
+            errors=errors or [],
+        )
+        self._upsert_record(run_dir, record)
+        return final_path
+
+    def load_chapters(self, run_dir: Path) -> List[Dict[str, object]]:
+        """
+        Read every `chapter.json` under `run_dir`, sorted by its "order" key.
+
+        Sub-directories without a `chapter.json`, files that are not valid
+        JSON, and JSON documents that are not objects are skipped.
+
+        Args:
+            run_dir: Run directory returned by `start_session`.
+
+        Returns:
+            list[dict]: Chapter payloads in ascending "order" (missing = 0).
+        """
+        payloads: List[Dict[str, object]] = []
+        for child in sorted(run_dir.iterdir()):
+            if not child.is_dir():
+                continue
+            chapter_path = child / "chapter.json"
+            if not chapter_path.exists():
+                continue
+            try:
+                payload = json.loads(chapter_path.read_text(encoding="utf-8"))
+            except json.JSONDecodeError:
+                # Best effort: one unreadable chapter must not abort the load.
+                continue
+            # Only JSON objects are chapters; any other value would break the
+            # `.get("order")` sort below.
+            if isinstance(payload, dict):
+                payloads.append(payload)
+        payloads.sort(key=lambda x: x.get("order", 0))
+        return payloads
+
+    # ======== File operations ========
+
+    @contextmanager
+    def capture_stream(self, chapter_dir: Path) -> Generator:
+        """
+        Open the chapter's raw stream file for writing.
+
+        Used as a context manager; the file is opened in "w" mode (any
+        previous content is truncated) and closed when the block exits.
+
+        Args:
+            chapter_dir: Directory of the chapter being generated.
+
+        Yields:
+            The open UTF-8 text file object for `stream.raw`.
+        """
+        raw_path = self._raw_stream_path(chapter_dir)
+        raw_path.parent.mkdir(parents=True, exist_ok=True)
+        with raw_path.open("w", encoding="utf-8") as fp:
+            yield fp
+
+    # ======== Internal helpers ========
+
+    @staticmethod
+    def _slug_of(chapter_meta: Dict[str, object]) -> str:
+        """Pick the folder slug: `slug`, else `chapterId`, else "section"."""
+        return str(
+            chapter_meta.get("slug") or chapter_meta.get("chapterId") or "section"
+        )
+
+    def _chapter_dir(self, run_dir: Path, slug: str, order: int) -> Path:
+        """Create (if needed) and return the `<order:03d>-<slug>` chapter folder."""
+        safe_slug = self._safe_slug(slug)
+        folder = f"{order:03d}-{safe_slug}"
+        path = run_dir / folder
+        path.mkdir(parents=True, exist_ok=True)
+        return path
+
+    def _safe_slug(self, slug: str) -> str:
+        """
+        Make `slug` usable as a single folder name.
+
+        Spaces, both path separators ("/" and "\\") and the characters
+        Windows forbids in file names are replaced with "-", so a slug can
+        neither introduce extra path components nor fail directory creation
+        on Windows. An empty slug falls back to "section".
+        """
+        cleaned = slug.translate(self._UNSAFE_SLUG_CHARS)
+        return cleaned or "section"
+
+    def _raw_stream_path(self, chapter_dir: Path) -> Path:
+        """Return the path of the chapter's raw stream file."""
+        return chapter_dir / "stream.raw"
+
+    def _key(self, run_dir: Path) -> str:
+        """Return the manifest-cache key: the resolved path of `run_dir`."""
+        return str(run_dir.resolve())
+
+    def _manifest_path(self, run_dir: Path) -> Path:
+        """Return the path of `manifest.json` inside `run_dir`."""
+        return run_dir / "manifest.json"
+
+    def _write_manifest(self, run_dir: Path, manifest: Dict[str, object]):
+        """Write the whole manifest to disk, replacing the previous file."""
+        self._manifest_path(run_dir).write_text(
+            json.dumps(manifest, ensure_ascii=False, indent=2),
+            encoding="utf-8",
+        )
+
+    def _read_manifest(self, run_dir: Path) -> Dict[str, object]:
+        """
+        Load the manifest from disk.
+
+        Used when the manifest is not in the in-memory cache. If no
+        `manifest.json` exists, a minimal manifest with an empty chapter
+        list is returned instead.
+        """
+        manifest_path = self._manifest_path(run_dir)
+        if manifest_path.exists():
+            return json.loads(manifest_path.read_text(encoding="utf-8"))
+        return {"reportId": run_dir.name, "chapters": []}
+
+    def _upsert_record(self, run_dir: Path, record: ChapterRecord):
+        """
+        Insert or replace one chapter record in the manifest.
+
+        Any existing entry with the same chapterId is dropped, the new entry
+        is appended, the list is re-sorted by "order", and the manifest is
+        written back to both the cache and disk.
+        """
+        key = self._key(run_dir)
+        manifest = self._manifests.get(key) or self._read_manifest(run_dir)
+        chapters: List[Dict[str, object]] = [
+            c for c in manifest.get("chapters", [])
+            if c.get("chapterId") != record.chapter_id
+        ]
+        chapters.append(record.to_dict())
+        chapters.sort(key=lambda x: x.get("order", 0))
+        manifest["chapters"] = chapters
+        # Assign (not setdefault) so the timestamp is refreshed on every
+        # change instead of being frozen at the first update.
+        manifest["updatedAt"] = datetime.utcnow().isoformat() + "Z"
+        self._manifests[key] = manifest
+        self._write_manifest(run_dir, manifest)
+
+
+# Names exported by a star-import of this module.
+__all__ = ["ChapterStorage", "ChapterRecord"]
@@ -0,0 +1,133 @@
+"""
+章节装订器：负责把多个章节JSON合并为整本IR。
+
+DocumentComposer 会注入缺失锚点、统一顺序，并补齐 IR 级元数据。
+"""
+
+from __future__ import annotations
+
+from datetime import datetime
+from typing import Dict, List, Set
+
+from ..ir import IR_VERSION
+
+
+class DocumentComposer:
+    """
+    Binder that stitches chapter payloads into one Document IR.
+
+    Responsibilities:
+        - sort chapters by ``order`` and fill in a default ``chapterId``;
+        - make every chapter anchor unique within the document;
+        - inject the IR version and a generation timestamp.
+    """
+
+    def __init__(self):
+        """Create the composer with an empty set of already-used anchors."""
+        self._seen_anchors: Set[str] = set()
+
+    def build_document(
+        self,
+        report_id: str,
+        metadata: Dict[str, object],
+        chapters: List[Dict[str, object]],
+    ) -> Dict[str, object]:
+        """
+        Sort the chapters, assign unique anchors and assemble the Document IR.
+
+        The chapter dicts are updated in place (``chapterId``, ``anchor``,
+        ``order`` and, for error placeholders, a heading block).
+
+        Args:
+            report_id: ID of the report being built.
+            metadata: Global metadata; ``toc``, ``themeTokens``, ``assets``
+                and ``generatedAt`` are read from it.
+            chapters: Chapter payloads.
+
+        Returns:
+            dict: The assembled Document IR.
+        """
+        # Anchors only have to be unique inside one document. Start from a
+        # clean slate so that reusing the composer for another report does not
+        # append "-2" to anchors that the table of contents still points at.
+        self._seen_anchors = set()
+
+        # chapterId -> anchor configured in the table of contents
+        toc_anchor_map = self._build_toc_anchor_map(metadata)
+
+        # ``or 0`` keeps a null order from breaking the comparison.
+        ordered = sorted(chapters, key=lambda c: c.get("order") or 0)
+        for idx, chapter in enumerate(ordered, start=1):
+            chapter.setdefault("chapterId", f"S{idx}")
+
+            # Priority: 1. anchor from the toc config 2. the chapter's own
+            # anchor 3. positional default
+            chapter_id = chapter.get("chapterId")
+            anchor = (
+                toc_anchor_map.get(chapter_id) or
+                chapter.get("anchor") or
+                f"section-{idx}"
+            )
+            chapter["anchor"] = self._ensure_unique_anchor(anchor)
+            chapter.setdefault("order", idx * 10)
+            if chapter.get("errorPlaceholder"):
+                self._ensure_heading_block(chapter)
+
+        document = {
+            "version": IR_VERSION,
+            "reportId": report_id,
+            "metadata": {
+                **metadata,
+                "generatedAt": metadata.get("generatedAt")
+                or datetime.utcnow().isoformat() + "Z",
+            },
+            "themeTokens": metadata.get("themeTokens", {}),
+            "chapters": ordered,
+            "assets": metadata.get("assets", {}),
+        }
+        return document
+
+    def _ensure_unique_anchor(self, anchor: str) -> str:
+        """Return ``anchor``, suffixed with ``-2``, ``-3``, ... if it was already used, and record it."""
+        base = anchor
+        counter = 2
+        while anchor in self._seen_anchors:
+            anchor = f"{base}-{counter}"
+            counter += 1
+        self._seen_anchors.add(anchor)
+        return anchor
+
+    def _build_toc_anchor_map(self, metadata: Dict[str, object]) -> Dict[str, str]:
+        """
+        Build the chapterId -> anchor mapping from ``metadata["toc"]["customEntries"]``.
+
+        Malformed input (``toc`` that is not a dict, ``customEntries`` that is
+        not a list, entries that are not dicts or lack either field) is
+        ignored rather than raising.
+
+        Args:
+            metadata: Document metadata.
+
+        Returns:
+            dict: chapterId -> anchor.
+        """
+        toc_config = metadata.get("toc")
+        if not isinstance(toc_config, dict):
+            return {}
+        custom_entries = toc_config.get("customEntries")
+        if not isinstance(custom_entries, list):
+            return {}
+
+        anchor_map: Dict[str, str] = {}
+        for entry in custom_entries:
+            if isinstance(entry, dict):
+                chapter_id = entry.get("chapterId")
+                anchor = entry.get("anchor")
+                if chapter_id and anchor:
+                    anchor_map[chapter_id] = anchor
+
+        return anchor_map
+
+    def _ensure_heading_block(self, chapter: Dict[str, object]) -> None:
+        """Give a placeholder chapter a level-2 heading block if it has none."""
+        blocks = chapter.get("blocks")
+        if isinstance(blocks, list):
+            for block in blocks:
+                if isinstance(block, dict) and block.get("type") == "heading":
+                    return
+        heading = {
+            "type": "heading",
+            "level": 2,
+            "text": chapter.get("title") or "占位章节",
+            "anchor": chapter.get("anchor"),
+        }
+        if isinstance(blocks, list):
+            blocks.insert(0, heading)
+        else:
+            # No usable block list: start one that holds just the heading.
+            chapter["blocks"] = [heading]
+
+
+# Names exported by this module.
+__all__ = ["DocumentComposer"]
@@ -0,0 +1,302 @@
+"""
+Markdown模板切片工具。
+
+LLM需要“按章调用”，因此必须把Markdown模板解析为结构化章节队列。
+这里通过轻量正则和缩进启发式，兼容“# 标题”与
+“- **1.0 标题** /   - 1.1 子标题”等多种写法。
+"""
+
+from __future__ import annotations
+
+import re
+import unicodedata
+from dataclasses import dataclass, field
+from typing import List, Optional
+
+SECTION_ORDER_STEP = 10
+
+
+@dataclass
+class TemplateSection:
+    """
+    A single chapter extracted from a Markdown template.
+
+    Holds the display title, slug, order value, depth, the raw title text,
+    the chapter number, the chapter id and the outline entries collected
+    under the chapter.
+    """
+
+    title: str
+    slug: str
+    order: int
+    depth: int
+    raw_title: str
+    number: str = ""
+    chapter_id: str = ""
+    outline: List[str] = field(default_factory=list)
+
+    def to_dict(self) -> dict:
+        """
+        Serialize the section to a plain dict.
+
+        ``chapter_id`` is exposed under the key ``chapterId``; ``raw_title``
+        is not included.
+        """
+        payload = dict(
+            title=self.title,
+            slug=self.slug,
+            order=self.order,
+            depth=self.depth,
+            number=self.number,
+        )
+        payload["chapterId"] = self.chapter_id
+        payload["outline"] = self.outline
+        return payload
+
+
+# The parsing expressions deliberately avoid `.*` to keep matching
+# deterministic and to sidestep the regex-DoS risk that comes with
+# untrusted template text.
+# heading_pattern: 1-6 `#` characters, whitespace, then the title.
+heading_pattern = re.compile(
+    r"""
+    (?P<marker>\#{1,6})       # Markdown标题标记
+    [ \t]+                    # 必需的空白字符
+    (?P<title>[^\r\n]+)       # 不包含换行的标题文本
+    """,
+    re.VERBOSE,
+)
+# bullet_pattern: one of `-`, `*`, `+`, whitespace, then the item text.
+bullet_pattern = re.compile(
+    r"""
+    (?P<marker>[-*+])         # 列表项目符号
+    [ \t]+
+    (?P<title>[^\r\n]+)
+    """,
+    re.VERBOSE,
+)
+# number_pattern: a dotted number without leading zeros (`num`), optionally
+# followed by separators and the remaining text (`label`).
+number_pattern = re.compile(
+    r"""
+    (?P<num>
+        (?:0|[1-9]\d*)
+        (?:\.(?:0|[1-9]\d*))*
+    )
+    (?:
+        (?:[ \t\u00A0\u3000、:：-]+|\.(?!\d))+
+        (?P<label>[^\r\n]*)
+    )?
+    """,
+    re.VERBOSE,
+)
+
+
+def parse_template_sections(template_md: str) -> List[TemplateSection]:
+    """
+    Split a Markdown template into a list of chapters.
+
+    Each non-blank line is classified by ``_classify_line``. Lines classified
+    as sections start a new ``TemplateSection``; every other recognized line
+    is appended to the outline of the most recent section. Sections receive
+    unique slugs, ``order`` values in steps of ``SECTION_ORDER_STEP`` and
+    chapter ids ``S1``, ``S2``, ... in document order.
+
+    Args:
+        template_md: Full Markdown text of the template.
+
+    Returns:
+        list[TemplateSection]: The chapters in document order.
+    """
+
+    sections: List[TemplateSection] = []
+    current: Optional[TemplateSection] = None
+    order = SECTION_ORDER_STEP
+    used_slugs = set()
+
+    for raw_line in template_md.splitlines():
+        stripped = raw_line.strip()
+        if not stripped:
+            continue
+
+        # Measure indentation over all leading whitespace, with tabs expanded.
+        # Counting only literal spaces made a tab-indented outline bullet look
+        # like indent 0 and promoted it to a top-level section.
+        expanded = raw_line.expandtabs(4)
+        indent = len(expanded) - len(expanded.lstrip())
+
+        meta = _classify_line(stripped, indent)
+        if not meta:
+            continue
+
+        if meta["is_section"]:
+            slug = _ensure_unique_slug(meta["slug"], used_slugs)
+            section = TemplateSection(
+                title=meta["title"],
+                slug=slug,
+                order=order,
+                depth=meta["depth"],
+                raw_title=meta["raw"],
+                number=meta["number"],
+            )
+            sections.append(section)
+            current = section
+            order += SECTION_ORDER_STEP
+            continue
+
+        # Outline entry: lines that appear before the first section are dropped.
+        if current:
+            current.outline.append(meta["title"])
+
+    for idx, section in enumerate(sections, start=1):
+        # Position-based chapter id for later reference.
+        section.chapter_id = f"S{idx}"
+
+    return sections
+
+
+def _classify_line(stripped: str, indent: int) -> Optional[dict]:
+    """
+    Classify one template line as a section, an outline entry, or neither.
+
+    Three shapes are recognized, tried in this order: Markdown headings,
+    bullet items, and bare numbered lines such as ``1.1 ...``.
+
+    Args:
+        stripped: The line with surrounding whitespace removed.
+        indent: Width of the line's leading indentation.
+
+    Returns:
+        dict | None: Keys ``is_section``, ``depth``, ``title``, ``raw``,
+        ``number`` and ``slug``; ``None`` when the line matches no shape.
+    """
+
+    def _from_payload(text: str, is_section: bool, depth: int) -> dict:
+        # Shared by headings and bullets: strip emphasis, split off the number.
+        payload = _strip_markup(text.strip())
+        info = _split_number(payload)
+        return {
+            "is_section": is_section,
+            "depth": depth,
+            "title": info["display"],
+            "raw": payload,
+            "number": info["number"],
+            "slug": _build_slug(info["number"], info["title"]),
+        }
+
+    as_heading = heading_pattern.fullmatch(stripped)
+    if as_heading is not None:
+        # Only level 1 and 2 headings open a new section.
+        level = len(as_heading.group("marker"))
+        return _from_payload(as_heading.group("title"), level <= 2, level)
+
+    as_bullet = bullet_pattern.fullmatch(stripped)
+    if as_bullet is not None:
+        top_level = indent <= 1
+        return _from_payload(
+            as_bullet.group("title"), top_level, 1 if top_level else 2
+        )
+
+    # Lines such as "1.1 ..." that carry no heading or bullet marker.
+    as_number = number_pattern.fullmatch(stripped)
+    if as_number is None or not as_number.group("label"):
+        return None
+
+    number = as_number.group("num")
+    title = as_number.group("label").strip()
+    # Unindented numbers with at most one dot are treated as sections.
+    top_level = indent == 0 and number.count(".") <= 1
+    return {
+        "is_section": top_level,
+        "depth": 1 if top_level else 2,
+        "title": f"{number} {title}" if title else number,
+        "raw": stripped,
+        "number": number,
+        "slug": _build_slug(number, title),
+    }
+
+
+def _strip_markup(text: str) -> str:
+    """
+    Remove one pair of ``**`` or ``__`` emphasis markers wrapping the whole text.
+
+    The markers are only stripped when the same marker opens and closes the
+    text and does not occur in between, so ``**A** and **B**`` and mismatched
+    input such as ``**A__`` are returned unchanged.
+
+    Args:
+        text: Title text, possibly wrapped in emphasis markers.
+
+    Returns:
+        str: The unwrapped, whitespace-trimmed text, or ``text`` as is.
+    """
+    if len(text) <= 4:
+        # Nothing would be left between the markers.
+        return text
+    for marker in ("**", "__"):
+        if text.startswith(marker) and text.endswith(marker):
+            inner = text[2:-2]
+            if marker not in inner:
+                return inner.strip()
+    return text
+
+
+def _split_number(payload: str) -> dict:
+    """
+    Separate a leading chapter number from the rest of a title.
+
+    For example ``1.2 市场趋势`` yields number ``1.2`` and title ``市场趋势``.
+    When the payload does not match ``number_pattern`` the number is empty
+    and the whole payload is used as the title.
+
+    Args:
+        payload: Title string as written in the template.
+
+    Returns:
+        dict: Keys ``number``, ``title`` and ``display``.
+    """
+    found = number_pattern.fullmatch(payload)
+    if found is None:
+        number, label = "", payload
+    else:
+        number, label = found.group("num"), found.group("label")
+    # The label group is optional and may be None.
+    label = (label or "").strip()
+
+    if number:
+        display = f"{number} {label}".strip()
+    else:
+        display = label or payload
+
+    return {
+        "number": number,
+        "title": label or payload,
+        "display": display,
+    }
+
+
+def _build_slug(number: str, title: str) -> str:
+    """
+    Build an anchor slug, preferring the chapter number over the title.
+
+    Args:
+        number: Chapter number; its dots become hyphens.
+        title: Title text, slugified only when ``number`` is empty.
+
+    Returns:
+        str: A slug such as ``section-1-0``; ``section-section`` when neither
+        input yields a token.
+    """
+    token = number.replace(".", "-") if number else _slugify_text(title)
+    if not token:
+        token = "section"
+    return "section-" + token
+
+
+def _slugify_text(text: str) -> str:
+    """
+    Reduce arbitrary text to a lowercase, hyphen-separated slug fragment.
+
+    The text is NFKD-normalized and combining marks are dropped, so accented
+    Latin letters collapse to their base letter. ASCII letters, digits,
+    hyphens and characters in U+4E00..U+9FFF are kept; every other run of
+    characters becomes a single hyphen.
+
+    Args:
+        text: Text to slugify.
+
+    Returns:
+        str: The slug fragment; empty when no allowed character remains.
+    """
+    text = unicodedata.normalize("NFKD", text)
+    # NFKD splits "é" into "e" + U+0301. Drop the mark; otherwise the filter
+    # below turns it into a hyphen in the middle of the word.
+    text = "".join(ch for ch in text if not unicodedata.combining(ch))
+    text = text.replace("·", "-").replace(" ", "-")
+    text = re.sub(r"[^0-9a-zA-Z\u4e00-\u9fff-]+", "-", text)
+    text = re.sub(r"-{2,}", "-", text)
+    return text.strip("-").lower()
+
+
+def _ensure_unique_slug(slug: str, used: set) -> str:
+    """
+    Return a slug that is not yet in ``used`` and register it there.
+
+    The slug is returned unchanged when free; otherwise ``-2``, ``-3``, ...
+    is appended to the original slug until an unused value is found.
+
+    Args:
+        slug: Preferred slug.
+        used: Set of slugs already taken; updated in place.
+
+    Returns:
+        str: The unique slug.
+    """
+    candidate = slug
+    suffix = 1
+    while candidate in used:
+        suffix += 1
+        candidate = f"{slug}-{suffix}"
+    used.add(candidate)
+    return candidate
+
+
+# Names exported by this module.
+__all__ = ["TemplateSection", "parse_template_sections"]
@@ -0,0 +1,24 @@
+"""
+Report Engine的可执行JSON契约(IR)定义与校验工具。
+
+该模块暴露统一的Schema文本与校验器，供提示词、章节生成、
+以及最终装订流程共同复用，确保从LLM到渲染的产物结构一致。
+"""
+
+from .schema import (
+    IR_VERSION,
+    CHAPTER_JSON_SCHEMA,
+    CHAPTER_JSON_SCHEMA_TEXT,
+    ALLOWED_BLOCK_TYPES,
+    ALLOWED_INLINE_MARKS,
+)
+from .validator import IRValidator
+
+# Names re-exported from .schema and .validator.
+__all__ = [
+    "IR_VERSION",
+    "CHAPTER_JSON_SCHEMA",
+    "CHAPTER_JSON_SCHEMA_TEXT",
+    "ALLOWED_BLOCK_TYPES",
+    "ALLOWED_INLINE_MARKS",
+    "IRValidator",
+]
@@ -0,0 +1,369 @@
+"""
+Report Engine JSON契约（IR）Schema定义。
+
+这里集中维护所有章节级别的Schema与可用于提示词的文本表示，
+确保章节生成、校验与渲染对同一个结构有统一认知。
+"""
+
+from __future__ import annotations
+
+import json
+from typing import Any, Dict, List
+
+# Version string of the IR contract defined in this module.
+IR_VERSION = "1.0"
+
+# ====== Basic constants ======
+# Mark types allowed on an inline run (used as the enum of inlineMark.type).
+ALLOWED_INLINE_MARKS: List[str] = [
+    "bold",
+    "italic",
+    "underline",
+    "strike",
+    "code",
+    "link",
+    "color",
+    "font",
+    "highlight",
+    "subscript",
+    "superscript",
+    "math",
+]
+
+# Block types the IR knows about; each has a matching schema below.
+ALLOWED_BLOCK_TYPES: List[str] = [
+    "heading",
+    "paragraph",
+    "list",
+    "table",
+    "blockquote",
+    "hr",
+    "code",
+    "math",
+    "figure",
+    "callout",
+    "kpiGrid",
+    "widget",
+    "toc",
+]
+
+# ====== Schema definitions ======
+# A single mark on an inline run; only "type" is required.
+inline_mark_schema: Dict[str, Any] = {
+    "type": "object",
+    "required": ["type"],
+    "properties": {
+        "type": {"type": "string", "enum": ALLOWED_INLINE_MARKS},
+        "value": {"type": ["string", "number", "object"]},
+        "href": {"type": "string", "format": "uri-reference"},
+        "title": {"type": "string"},
+        "style": {"type": "object"},
+    },
+    "additionalProperties": True,
+}
+
+# A run of text with an optional list of marks.
+inline_run_schema: Dict[str, Any] = {
+    "type": "object",
+    "required": ["text"],
+    "properties": {
+        "text": {"type": "string"},
+        "marks": {
+            "type": "array",
+            "items": {"$ref": "#/definitions/inlineMark"},
+        },
+    },
+    "additionalProperties": True,
+}
+
+# Heading: level 1-6 with text and an anchor.
+heading_block: Dict[str, Any] = {
+    "title": "HeadingBlock",
+    "type": "object",
+    "properties": {
+        "type": {"const": "heading"},
+        "level": {"type": "integer", "minimum": 1, "maximum": 6},
+        "text": {"type": "string"},
+        "anchor": {"type": "string"},
+        "numbering": {"type": "string"},
+        "subtitle": {"type": "string"},
+    },
+    "required": ["type", "level", "text", "anchor"],
+    "additionalProperties": True,
+}
+
+# Paragraph: a list of inline runs with optional alignment.
+paragraph_block: Dict[str, Any] = {
+    "title": "ParagraphBlock",
+    "type": "object",
+    "properties": {
+        "type": {"const": "paragraph"},
+        "inlines": {
+            "type": "array",
+            "items": {"$ref": "#/definitions/inlineRun"},
+        },
+        "align": {"type": "string", "enum": ["left", "center", "right", "justify"]},
+    },
+    "required": ["type", "inlines"],
+    "additionalProperties": True,
+}
+
+# List: every item is itself an array of blocks.
+list_block: Dict[str, Any] = {
+    "title": "ListBlock",
+    "type": "object",
+    "properties": {
+        "type": {"const": "list"},
+        "listType": {"type": "string", "enum": ["ordered", "bullet", "task"]},
+        "items": {
+            "type": "array",
+            "items": {
+                "type": "array",
+                "items": {"$ref": "#/definitions/block"},
+            },
+        },
+    },
+    "required": ["type", "listType", "items"],
+    "additionalProperties": True,
+}
+
+# Table: rows -> cells -> blocks; each cell must carry a "blocks" array.
+table_block: Dict[str, Any] = {
+    "title": "TableBlock",
+    "type": "object",
+    "properties": {
+        "type": {"const": "table"},
+        "colgroup": {"type": "array", "items": {"type": "object"}},
+        "rows": {
+            "type": "array",
+            "items": {
+                "type": "object",
+                "properties": {
+                    "cells": {
+                        "type": "array",
+                        "items": {
+                            "type": "object",
+                            "properties": {
+                                "rowspan": {"type": "integer", "minimum": 1},
+                                "colspan": {"type": "integer", "minimum": 1},
+                                "align": {
+                                    "type": "string",
+                                    "enum": ["left", "center", "right"],
+                                },
+                                "blocks": {
+                                    "type": "array",
+                                    "items": {"$ref": "#/definitions/block"},
+                                },
+                            },
+                            "required": ["blocks"],
+                            "additionalProperties": True,
+                        },
+                    }
+                },
+                "required": ["cells"],
+                "additionalProperties": True,
+            },
+        },
+        "caption": {"type": "string"},
+        "zebra": {"type": "boolean"},
+    },
+    "required": ["type", "rows"],
+    "additionalProperties": True,
+}
+
+# Blockquote: a container of nested blocks.
+blockquote_block: Dict[str, Any] = {
+    "title": "BlockquoteBlock",
+    "type": "object",
+    "properties": {
+        "type": {"const": "blockquote"},
+        "blocks": {
+            "type": "array",
+            "items": {"$ref": "#/definitions/block"},
+        },
+        "variant": {"type": "string"},
+    },
+    "required": ["type", "blocks"],
+    "additionalProperties": True,
+}
+
+# Horizontal rule: only "type" is required.
+hr_block: Dict[str, Any] = {
+    "title": "HorizontalRuleBlock",
+    "type": "object",
+    "properties": {
+        "type": {"const": "hr"},
+        "variant": {"type": "string"},
+    },
+    "required": ["type"],
+    "additionalProperties": True,
+}
+
+# Code block: requires "content".
+code_block: Dict[str, Any] = {
+    "title": "CodeBlock",
+    "type": "object",
+    "properties": {
+        "type": {"const": "code"},
+        "lang": {"type": "string"},
+        "content": {"type": "string"},
+        "caption": {"type": "string"},
+    },
+    "required": ["type", "content"],
+    "additionalProperties": True,
+}
+
+# Math block: requires a "latex" string.
+math_block: Dict[str, Any] = {
+    "title": "MathBlock",
+    "type": "object",
+    "properties": {
+        "type": {"const": "math"},
+        "latex": {"type": "string"},
+        "displayMode": {"type": "boolean"},
+    },
+    "required": ["type", "latex"],
+    "additionalProperties": True,
+}
+
+# Figure: an "img" object that must at least carry "src".
+figure_block: Dict[str, Any] = {
+    "title": "FigureBlock",
+    "type": "object",
+    "properties": {
+        "type": {"const": "figure"},
+        "img": {
+            "type": "object",
+            "properties": {
+                "src": {"type": "string"},
+                "alt": {"type": "string"},
+                "width": {"type": "number"},
+                "height": {"type": "number"},
+                "srcset": {"type": "string"},
+            },
+            "required": ["src"],
+            "additionalProperties": True,
+        },
+        "caption": {"type": "string"},
+        "responsive": {"type": "boolean"},
+    },
+    "required": ["type", "img"],
+    "additionalProperties": True,
+}
+
+# Callout: a toned container of nested blocks.
+callout_block: Dict[str, Any] = {
+    "title": "CalloutBlock",
+    "type": "object",
+    "properties": {
+        "type": {"const": "callout"},
+        "tone": {
+            "type": "string",
+            "enum": ["info", "warning", "success", "danger"],
+        },
+        "title": {"type": "string"},
+        "blocks": {
+            "type": "array",
+            "items": {"$ref": "#/definitions/block"},
+        },
+    },
+    "required": ["type", "tone", "blocks"],
+    "additionalProperties": True,
+}
+
+# KPI grid: items with a required label and value.
+kpi_block: Dict[str, Any] = {
+    "title": "KPIGridBlock",
+    "type": "object",
+    "properties": {
+        "type": {"const": "kpiGrid"},
+        "items": {
+            "type": "array",
+            "items": {
+                "type": "object",
+                "properties": {
+                    "label": {"type": "string"},
+                    "value": {"type": "string"},
+                    "unit": {"type": "string"},
+                    "delta": {"type": "string"},
+                    "deltaTone": {"type": "string", "enum": ["up", "down", "neutral"]},
+                },
+                "required": ["label", "value"],
+                "additionalProperties": True,
+            },
+        },
+        "cols": {"type": "integer"},
+    },
+    "required": ["type", "items"],
+    "additionalProperties": True,
+}
+
+# Widget: requires widgetId and widgetType; data/dataRef are optional here.
+widget_block: Dict[str, Any] = {
+    "title": "WidgetBlock",
+    "type": "object",
+    "properties": {
+        "type": {"const": "widget"},
+        "widgetId": {"type": "string"},
+        "widgetType": {"type": "string"},
+        "props": {"type": "object"},
+        "data": {"type": "object"},
+        "dataRef": {"type": "string"},
+    },
+    "required": ["type", "widgetId", "widgetType"],
+    "additionalProperties": True,
+}
+
+# Table of contents: optional depth (1-4) and auto-numbering flag.
+toc_block: Dict[str, Any] = {
+    "title": "TOCBlock",
+    "type": "object",
+    "properties": {
+        "type": {"const": "toc"},
+        "depth": {"type": "integer", "minimum": 1, "maximum": 4},
+        "autoNumbering": {"type": "boolean"},
+    },
+    "required": ["type"],
+    "additionalProperties": True,
+}
+
+# All block schemas; used as the "oneOf" alternatives of definitions.block.
+block_variants: List[Dict[str, Any]] = [
+    heading_block,
+    paragraph_block,
+    list_block,
+    table_block,
+    blockquote_block,
+    hr_block,
+    code_block,
+    math_block,
+    figure_block,
+    callout_block,
+    kpi_block,
+    widget_block,
+    toc_block,
+]
+
+# Schema of one chapter; the "$ref" strings used by the block schemas above
+# resolve against the "definitions" section at the bottom.
+# NOTE(review): "$schema" declares draft 2020-12 while shared schemas live
+# under the draft-07 style "definitions" key - confirm consumers accept this.
+CHAPTER_JSON_SCHEMA: Dict[str, Any] = {
+    "$schema": "https://json-schema.org/draft/2020-12/schema",
+    "title": "ReportEngineChapterIR",
+    "type": "object",
+    "required": ["chapterId", "title", "anchor", "order", "blocks"],
+    "properties": {
+        "chapterId": {"type": "string"},
+        "anchor": {"type": "string"},
+        "title": {"type": "string"},
+        "order": {"type": "number"},
+        "summary": {"type": "string"},
+        "blocks": {
+            "type": "array",
+            "items": {"$ref": "#/definitions/block"},
+        },
+        "xrefs": {"type": "object"},
+        "widgets": {"type": "array", "items": {"type": "string"}},
+        "footnotes": {"type": "array", "items": {"type": "object"}},
+        "errors": {"type": "array", "items": {"type": "string"}},
+        "metadata": {"type": "object"},
+    },
+    "additionalProperties": True,
+    "definitions": {
+        "inlineMark": inline_mark_schema,
+        "inlineRun": inline_run_schema,
+        "block": {"oneOf": block_variants},
+    },
+}
+
+# The same schema pretty-printed as JSON text, non-ASCII characters kept as is.
+CHAPTER_JSON_SCHEMA_TEXT: str = json.dumps(
+    CHAPTER_JSON_SCHEMA,
+    ensure_ascii=False,
+    indent=2,
+)
+
+# Names exported by this module.
+__all__ = [
+    "IR_VERSION",
+    "ALLOWED_INLINE_MARKS",
+    "ALLOWED_BLOCK_TYPES",
+    "CHAPTER_JSON_SCHEMA",
+    "CHAPTER_JSON_SCHEMA_TEXT",
+]
@@ -0,0 +1,220 @@
+"""
+章节级JSON结构校验器。
+
+LLM按章节生成IR后，需要在落盘与装订前经过严格校验，以避免
+渲染期的结构性崩溃。本模块实现轻量级的Python校验逻辑，
+无需依赖jsonschema库即可快速定位错误。
+"""
+
+from __future__ import annotations
+
+from typing import Any, Dict, List, Tuple
+
+from .schema import ALLOWED_BLOCK_TYPES, ALLOWED_INLINE_MARKS, IR_VERSION
+
+
+class IRValidator:
+    """
+    Structural validator for chapter-level IR.
+
+    Notes:
+        - ``validate_chapter`` returns ``(passed, errors)``.
+        - Every error message starts with the path of the offending node.
+        - Block types without a ``_validate_<type>_block`` method (for
+          example ``hr`` and ``toc``) are only checked for an allowed type.
+    """
+
+    def __init__(self, schema_version: str = IR_VERSION):
+        """Remember the schema version this validator was created for."""
+        self.schema_version = schema_version
+
+    # ======== Public interface ========
+
+    def validate_chapter(self, chapter: Dict[str, Any]) -> Tuple[bool, List[str]]:
+        """
+        Validate the required fields and the block tree of one chapter.
+
+        Args:
+            chapter: Chapter object to check.
+
+        Returns:
+            tuple: ``(True, [])`` when valid, otherwise ``(False, errors)``.
+            Blocks are not inspected when ``blocks`` is missing, empty or not
+            a list.
+        """
+        errors: List[str] = []
+        if not isinstance(chapter, dict):
+            return False, ["chapter必须是对象"]
+
+        for field in ("chapterId", "title", "anchor", "order", "blocks"):
+            if field not in chapter:
+                errors.append(f"missing chapter.{field}")
+
+        if not isinstance(chapter.get("blocks"), list) or not chapter.get("blocks"):
+            errors.append("chapter.blocks必须是非空数组")
+            return False, errors
+
+        blocks = chapter.get("blocks", [])
+        for idx, block in enumerate(blocks):
+            self._validate_block(block, f"blocks[{idx}]", errors)
+
+        return len(errors) == 0, errors
+
+    # ======== Internal helpers ========
+
+    def _validate_block(self, block: Any, path: str, errors: List[str]):
+        """Check the block type and dispatch to ``_validate_<type>_block`` when it exists."""
+        if not isinstance(block, dict):
+            errors.append(f"{path} 必须是对象")
+            return
+
+        block_type = block.get("type")
+        if block_type not in ALLOWED_BLOCK_TYPES:
+            errors.append(f"{path}.type 不被支持: {block_type}")
+            return
+
+        validator = getattr(self, f"_validate_{block_type}_block", None)
+        if validator:
+            validator(block, path, errors)
+
+    def _validate_heading_block(self, block: Dict[str, Any], path: str, errors: List[str]):
+        """A heading needs an integer level in 1..6, a text and an anchor."""
+        level = block.get("level")
+        # bool is a subclass of int, so True/False must be rejected explicitly.
+        if isinstance(level, bool) or not isinstance(level, int):
+            errors.append(f"{path}.level 必须是整数")
+        elif not 1 <= level <= 6:
+            # Same bounds as the heading schema (minimum 1, maximum 6).
+            errors.append(f"{path}.level 必须在1到6之间")
+        if "text" not in block:
+            errors.append(f"{path}.text 缺失")
+        if "anchor" not in block:
+            errors.append(f"{path}.anchor 缺失")
+
+    def _validate_paragraph_block(self, block: Dict[str, Any], path: str, errors: List[str]):
+        """A paragraph needs a non-empty ``inlines`` list; each run is validated."""
+        inlines = block.get("inlines")
+        if not isinstance(inlines, list) or not inlines:
+            errors.append(f"{path}.inlines 必须是非空数组")
+            return
+        for idx, run in enumerate(inlines):
+            self._validate_inline_run(run, f"{path}.inlines[{idx}]", errors)
+
+    def _validate_list_block(self, block: Dict[str, Any], path: str, errors: List[str]):
+        """A list needs a valid ``listType`` and items that are each a list of blocks."""
+        if block.get("listType") not in {"ordered", "bullet", "task"}:
+            errors.append(f"{path}.listType 取值非法")
+        items = block.get("items")
+        if not isinstance(items, list) or not items:
+            errors.append(f"{path}.items 必须是非空列表")
+            return
+        for i, item in enumerate(items):
+            if not isinstance(item, list):
+                errors.append(f"{path}.items[{i}] 必须是区块数组")
+                continue
+            for j, sub_block in enumerate(item):
+                self._validate_block(sub_block, f"{path}.items[{i}][{j}]", errors)
+
+    def _validate_table_block(self, block: Dict[str, Any], path: str, errors: List[str]):
+        """A table needs non-empty rows, cells and cell blocks; cell content is validated recursively."""
+        rows = block.get("rows")
+        if not isinstance(rows, list) or not rows:
+            errors.append(f"{path}.rows 必须是非空数组")
+            return
+        for r_idx, row in enumerate(rows):
+            cells = row.get("cells") if isinstance(row, dict) else None
+            if not isinstance(cells, list) or not cells:
+                errors.append(f"{path}.rows[{r_idx}].cells 必须是非空数组")
+                continue
+            for c_idx, cell in enumerate(cells):
+                if not isinstance(cell, dict):
+                    errors.append(f"{path}.rows[{r_idx}].cells[{c_idx}] 必须是对象")
+                    continue
+                blocks = cell.get("blocks")
+                if not isinstance(blocks, list) or not blocks:
+                    errors.append(
+                        f"{path}.rows[{r_idx}].cells[{c_idx}].blocks 必须是非空数组"
+                    )
+                    continue
+                for b_idx, sub_block in enumerate(blocks):
+                    self._validate_block(
+                        sub_block,
+                        f"{path}.rows[{r_idx}].cells[{c_idx}].blocks[{b_idx}]",
+                        errors,
+                    )
+
+    def _validate_blockquote_block(
+        self, block: Dict[str, Any], path: str, errors: List[str]
+    ):
+        """A blockquote needs at least one nested block."""
+        inner = block.get("blocks")
+        if not isinstance(inner, list) or not inner:
+            errors.append(f"{path}.blocks 必须是非空数组")
+            return
+        for idx, sub_block in enumerate(inner):
+            self._validate_block(sub_block, f"{path}.blocks[{idx}]", errors)
+
+    def _validate_callout_block(self, block: Dict[str, Any], path: str, errors: List[str]):
+        """A callout needs a valid ``tone`` and at least one nested block."""
+        tone = block.get("tone")
+        if tone not in {"info", "warning", "success", "danger"}:
+            errors.append(f"{path}.tone 取值非法: {tone}")
+        blocks = block.get("blocks")
+        if not isinstance(blocks, list) or not blocks:
+            errors.append(f"{path}.blocks 必须是非空数组")
+            return
+        for idx, sub_block in enumerate(blocks):
+            self._validate_block(sub_block, f"{path}.blocks[{idx}]", errors)
+
+    def _validate_kpiGrid_block(self, block: Dict[str, Any], path: str, errors: List[str]):
+        """A KPI grid needs non-empty items, each an object with ``label`` and ``value``."""
+        items = block.get("items")
+        if not isinstance(items, list) or not items:
+            errors.append(f"{path}.items 必须是非空数组")
+            return
+        for idx, item in enumerate(items):
+            if not isinstance(item, dict):
+                errors.append(f"{path}.items[{idx}] 必须是对象")
+                continue
+            if "label" not in item or "value" not in item:
+                errors.append(f"{path}.items[{idx}] 需要label与value")
+
+    def _validate_widget_block(self, block: Dict[str, Any], path: str, errors: List[str]):
+        """A widget needs ``widgetId``, ``widgetType`` and either ``data`` or ``dataRef``."""
+        if "widgetId" not in block:
+            errors.append(f"{path}.widgetId 缺失")
+        if "widgetType" not in block:
+            errors.append(f"{path}.widgetType 缺失")
+        if "data" not in block and "dataRef" not in block:
+            errors.append(f"{path} 需要 data 或 dataRef 其一")
+
+    def _validate_code_block(self, block: Dict[str, Any], path: str, errors: List[str]):
+        """A code block needs ``content``."""
+        if "content" not in block:
+            errors.append(f"{path}.content 缺失")
+
+    def _validate_math_block(self, block: Dict[str, Any], path: str, errors: List[str]):
+        """A math block needs ``latex``."""
+        if "latex" not in block:
+            errors.append(f"{path}.latex 缺失")
+
+    def _validate_figure_block(
+        self, block: Dict[str, Any], path: str, errors: List[str]
+    ):
+        """A figure needs an ``img`` object that carries ``src``."""
+        img = block.get("img")
+        if not isinstance(img, dict):
+            errors.append(f"{path}.img 必须是对象")
+            return
+        if "src" not in img:
+            errors.append(f"{path}.img.src 缺失")
+
+    def _validate_inline_run(
+        self, run: Any, path: str, errors: List[str]
+    ):
+        """Check that an inline run is an object with ``text`` and only allowed mark types."""
+        if not isinstance(run, dict):
+            errors.append(f"{path} 必须是对象")
+            return
+        if "text" not in run:
+            errors.append(f"{path}.text 缺失")
+        marks = run.get("marks", [])
+        if marks is None:
+            # An explicit null is treated like "no marks".
+            return
+        if not isinstance(marks, list):
+            errors.append(f"{path}.marks 必须是数组")
+            return
+        for m_idx, mark in enumerate(marks):
+            if not isinstance(mark, dict):
+                errors.append(f"{path}.marks[{m_idx}] 必须是对象")
+                continue
+            m_type = mark.get("type")
+            if m_type not in ALLOWED_INLINE_MARKS:
+                errors.append(f"{path}.marks[{m_idx}].type 不被支持: {m_type}")
+
+
+# Names exported by this module.
+__all__ = ["IRValidator"]
@@ -1,5 +1,7 @@
 """
-LLM module for the Report Engine.
+Report Engine LLM子模块。
+
+目前主要暴露 OpenAI 兼容的 `LLMClient` 封装。
 """

 from .base import LLMClient
@@ -1,5 +1,7 @@
 """
-Unified OpenAI-compatible LLM client for the Report Engine, with retry support.
+Report Engine 默认的OpenAI兼容LLM客户端封装。
+
+提供统一的非流式/流式调用、可选重试、字节安全拼接与模型元信息查询。
 """

 import os
@@ -19,7 +21,9 @@ try:
    from retry_helper import with_retry, LLM_RETRY_CONFIG
 except ImportError:
    def with_retry(config=None):
+        """简化版with_retry占位，实现与真实装饰器一致的调用签名"""
        def decorator(func):
+            """直接返回原函数，确保无retry依赖时代码仍可运行"""
            return func
        return decorator

@@ -27,9 +31,17 @@ except ImportError:


 class LLMClient:
-    """Minimal wrapper around the OpenAI-compatible chat completion API."""
+    """针对OpenAI Chat Completion API的轻量封装，统一Report Engine调用入口。"""

    def __init__(self, api_key: str, model_name: str, base_url: Optional[str] = None):
+        """
+        初始化LLM客户端并保存基础连接信息。
+
+        Args:
+            api_key: 用于鉴权的API Token
+            model_name: 具体模型ID，用于定位供应商能力
+            base_url: 自定义兼容接口地址，默认为OpenAI官方
+        """
        if not api_key:
            raise ValueError("Report Engine LLM API key is required.")
        if not model_name:
@@ -55,6 +67,17 @@ class LLMClient:

    @with_retry(LLM_RETRY_CONFIG)
    def invoke(self, system_prompt: str, user_prompt: str, **kwargs) -> str:
+        """
+        以非流式方式调用LLM，并返回一次性完成的完整响应。
+
+        Args:
+            system_prompt: 系统角色提示
+            user_prompt: 用户高优先级指令
+            **kwargs: 允许透传temperature/top_p等采样参数
+
+        Returns:
+            去除首尾空白后的LLM响应文本
+        """
        messages = [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt},
@@ -78,15 +101,15 @@ class LLMClient:

    def stream_invoke(self, system_prompt: str, user_prompt: str, **kwargs) -> Generator[str, None, None]:
        """
-        流式调用LLM，逐步返回响应内容
+        流式调用LLM，逐步返回响应内容。
        
-        Args:
-            system_prompt: 系统提示词
-            user_prompt: 用户提示词
-            **kwargs: 额外参数（temperature, top_p等）
+        参数:
+            system_prompt: 系统提示词。
+            user_prompt: 用户提示词。
+            **kwargs: 采样参数（temperature、top_p等）。
            
-        Yields:
-            响应文本块（str）
+        产出:
+            str: 每次yield一段delta文本，方便上层实时渲染。
        """
        messages = [
            {"role": "system", "content": system_prompt},
@@ -120,15 +143,15 @@ class LLMClient:
    @with_retry(LLM_RETRY_CONFIG)
    def stream_invoke_to_string(self, system_prompt: str, user_prompt: str, **kwargs) -> str:
        """
-        流式调用LLM并安全地拼接为完整字符串（避免UTF-8多字节字符截断）
+        流式调用LLM并安全地拼接为完整字符串（避免UTF-8多字节字符截断）。
        
-        Args:
-            system_prompt: 系统提示词
-            user_prompt: 用户提示词
-            **kwargs: 额外参数（temperature, top_p等）
+        参数:
+            system_prompt: 系统提示词。
+            user_prompt: 用户提示词。
+            **kwargs: 采样或超时配置。
            
-        Returns:
-            完整的响应字符串
+        返回:
+            str: 将所有delta拼接后的完整响应。
        """
        # 以字节形式收集所有块
        byte_chunks = []
@@ -142,11 +165,13 @@ class LLMClient:

    @staticmethod
    def validate_response(response: Optional[str]) -> str:
+        """兜底处理None/空白字符串，防止上层逻辑崩溃"""
        if response is None:
            return ""
        return response.strip()

    def get_model_info(self) -> Dict[str, Any]:
+        """以字典形式返回当前客户端的模型/提供方/基础URL信息"""
        return {
            "provider": self.provider,
            "model": self.model_name,
@@ -1,15 +1,22 @@
 """
-Report Engine节点处理模块
-实现报告生成的各个处理步骤
+Report Engine节点处理模块。
+
+封装模板选择、章节生成、文档布局、篇幅规划等流水线节点。
 """

 from .base_node import BaseNode, StateMutationNode
 from .template_selection_node import TemplateSelectionNode
-from .html_generation_node import HTMLGenerationNode
+from .chapter_generation_node import ChapterGenerationNode, ChapterJsonParseError, ChapterContentError
+from .document_layout_node import DocumentLayoutNode
+from .word_budget_node import WordBudgetNode

 __all__ = [
    "BaseNode",
-    "StateMutationNode", 
+    "StateMutationNode",
    "TemplateSelectionNode",
-    "HTMLGenerationNode"
+    "ChapterGenerationNode",
+    "ChapterJsonParseError",
+    "ChapterContentError",
+    "DocumentLayoutNode",
+    "WordBudgetNode",
 ]
@@ -1,6 +1,7 @@
 """
-Report Engine节点基类
-定义所有处理节点的基础接口
+Report Engine节点基类。
+
+所有高阶推理节点都继承于此，统一日志、输入校验与状态变更接口。
 """

 from abc import ABC, abstractmethod
@@ -10,7 +11,12 @@ from ..state.state import ReportState
 from loguru import logger

 class BaseNode(ABC):
-    """节点基类"""
+    """
+    节点基类。
+
+    统一实现日志工具、输入/输出钩子以及LLM客户端依赖注入，
+    便于所有节点只专注业务逻辑。
+    """
    
    def __init__(self, llm_client: LLMClient, node_name: str = ""):
        """
@@ -19,6 +25,8 @@ class BaseNode(ABC):
        Args:
            llm_client: LLM客户端
            node_name: 节点名称
+
+        BaseNode 会保存节点名以便统一输出日志前缀。
        """
        self.llm_client = llm_client
        self.node_name = node_name or self.__class__.__name__
@@ -39,7 +47,8 @@ class BaseNode(ABC):
    
    def validate_input(self, input_data: Any) -> bool:
        """
-        验证输入数据
+        验证输入数据。
+        默认直接通过，子类可按需覆写实现字段检查。
        
        Args:
            input_data: 输入数据
@@ -51,7 +60,8 @@ class BaseNode(ABC):
    
    def process_output(self, output: Any) -> Any:
        """
-        处理输出数据
+        处理输出数据。
+        子类可覆写进行结构化或校验。
        
        Args:
            output: 原始输出
@@ -62,23 +72,29 @@ class BaseNode(ABC):
        return output
    
    def log_info(self, message: str):
-        """记录信息日志"""
+        """记录信息日志，并自动带上节点名作为前缀。"""
        formatted_message = f"[{self.node_name}] {message}"
        logger.info(formatted_message)
    
    def log_error(self, message: str):
-        """记录错误日志"""
+        """记录错误日志，便于排障。"""
        formatted_message = f"[{self.node_name}] {message}"
        logger.error(formatted_message)


 class StateMutationNode(BaseNode):
-    """带状态修改功能的节点基类"""
+    """
+    带状态修改功能的节点基类。
+
+    适用于节点需要直接写入 ReportState 的场景。
+    """
    
    @abstractmethod
    def mutate_state(self, input_data: Any, state: ReportState, **kwargs) -> ReportState:
        """
-        修改状态
+        修改状态。
+
+        子类需返回新的状态对象或在原地修改后回传，供流水线记录。
        
        Args:
            input_data: 输入数据
@@ -0,0 +1,207 @@
+"""
+根据模板目录与多源报告，生成整本报告的标题/目录/主题设计。
+"""
+
+from __future__ import annotations
+
+import json
+from typing import Any, Dict, List
+
+from loguru import logger
+
+from ..core import TemplateSection
+from ..prompts import (
+    SYSTEM_PROMPT_DOCUMENT_LAYOUT,
+    build_document_layout_prompt,
+)
+from ..utils.json_parser import RobustJSONParser, JSONParseError
+from .base_node import BaseNode
+
+
+class DocumentLayoutNode(BaseNode):
+    """
+    Generate the report-wide title, table of contents and hero design.
+
+    Sends the template sections, source reports and forum logs to the LLM.
+    """
+
+    def __init__(self, llm_client):
+        """Register the LLM client and node name, then build the JSON parser."""
+        super().__init__(llm_client, "DocumentLayoutNode")
+        # Parser for LLM output: json_repair enabled, LLM-assisted repair off.
+        self.json_parser = RobustJSONParser(
+            enable_json_repair=True,
+            enable_llm_repair=False,  # switch on if LLM-assisted repair is wanted
+            max_repair_attempts=3,
+        )
+
+    def run(
+        self,
+        sections: List[TemplateSection],
+        template_markdown: str,
+        reports: Dict[str, str],
+        forum_logs: str,
+        query: str,
+        template_overview: Dict[str, Any] | None = None,
+    ) -> Dict[str, Any]:
+        """
+        Ask the LLM for the document design and return it as a dict.
+
+        Args:
+            sections: Template sections; each is serialised with ``to_dict()``.
+            template_markdown: Raw template text, sent as ``template.raw``.
+            reports: Report contents, forwarded unchanged as ``reports``.
+            forum_logs: Forum log text, forwarded unchanged as ``forumLogs``.
+            query: The user's query string.
+            template_overview: Optional overview; built from ``sections`` if falsy.
+
+        Returns:
+            dict: The parsed design as returned by ``_parse_response``.
+        """
+        # Bundle template text, section structure and all source material.
+        payload = {
+            "query": query,
+            "template": {
+                "raw": template_markdown,
+                "sections": [section.to_dict() for section in sections],
+            },
+            "templateOverview": template_overview
+            or {
+                "title": sections[0].title if sections else "",
+                "chapters": [section.to_dict() for section in sections],
+            },
+            "reports": reports,
+            "forumLogs": forum_logs,
+        }
+
+        user_message = build_document_layout_prompt(payload)
+        response = self.llm_client.stream_invoke_to_string(
+            SYSTEM_PROMPT_DOCUMENT_LAYOUT,
+            user_message,
+            temperature=0.3,
+            top_p=0.9,
+        )
+        design = self._parse_response(response)
+        logger.info("文档标题/目录设计已生成")
+        return design
+
+    def _parse_response(self, raw: str) -> Dict[str, Any]:
+        """
+        Parse the LLM response into a design dict with usable key fields.
+
+        Parsing is delegated to ``self.json_parser`` (json_repair enabled,
+        LLM repair disabled, see ``__init__``). After parsing, the fields
+        ``title`` (str), ``tocPlan`` (list) and ``hero`` (dict) are forced
+        to usable values: a missing or wrongly typed field is replaced by
+        its default instead of being left as-is.
+
+        Args:
+            raw: Raw LLM response text.
+
+        Returns:
+            dict: The parsed design with ``title``/``tocPlan``/``hero`` normalised.
+
+        Raises:
+            ValueError: When the parser raises ``JSONParseError``.
+        """
+        try:
+            result = self.json_parser.parse(
+                raw,
+                context_name="文档设计",
+                expected_keys=["title", "toc", "hero"],  # NOTE(review): code below reads "tocPlan", not "toc" - confirm
+            )
+            # Overwrite (not setdefault): a present-but-invalid value must go too.
+            if not isinstance(result.get("title"), str):
+                logger.warning("文档设计缺少title字段或类型错误，使用默认值")
+                result["title"] = "未命名报告"
+
+            # Normalise tocPlan: must be a list, otherwise fall back to [].
+            toc_plan = result.get("tocPlan", [])
+            if not isinstance(toc_plan, list):
+                logger.warning("文档设计缺少tocPlan字段或类型错误，使用空列表")
+                result["tocPlan"] = []
+            else:
+                # Strip stray JSON fragments from each entry's description.
+                result["tocPlan"] = self._clean_toc_plan_descriptions(toc_plan)
+
+            if not isinstance(result.get("hero"), dict):
+                logger.warning("文档设计缺少hero字段或类型错误，使用空对象")
+                result["hero"] = {}
+
+            return result
+        except JSONParseError as exc:
+            # Re-raise as ValueError, the exception type callers already handle.
+            raise ValueError(f"文档设计JSON解析失败: {exc}") from exc
+
+    def _clean_toc_plan_descriptions(self, toc_plan: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
+        """
+        Remove leaked JSON fragments from each tocPlan entry's description.
+
+        Args:
+            toc_plan: TOC plan entries; non-dict entries are dropped.
+
+        Returns:
+            List[Dict[str, Any]]: The dict entries, descriptions cleaned in place.
+        """
+        import re
+
+        def clean_text(text: Any) -> str:
+            """Return text without trailing JSON debris; "" for non-strings."""
+            if not text or not isinstance(text, str):
+                return ""
+
+            cleaned = text
+
+            # Drop a trailing ", {" fragment whose object is never closed.
+            cleaned = re.sub(r',\s*\{[^}]*$', '', cleaned)
+
+            # Drop a trailing ", [" fragment whose array is never closed.
+            cleaned = re.sub(r',\s*\[[^\]]*$', '', cleaned)
+
+            # Cut from the last "{" when no "}" follows it.
+            open_brace_pos = cleaned.rfind('{')
+            if open_brace_pos != -1:
+                close_brace_pos = cleaned.rfind('}')
+                if close_brace_pos < open_brace_pos:
+                    cleaned = cleaned[:open_brace_pos].rstrip(',，、 \t\n')
+
+            # Cut from the last "[" when no "]" follows it.
+            open_bracket_pos = cleaned.rfind('[')
+            if open_bracket_pos != -1:
+                close_bracket_pos = cleaned.rfind(']')
+                if close_bracket_pos < open_bracket_pos:
+                    cleaned = cleaned[:open_bracket_pos].rstrip(',，、 \t\n')
+
+            # Drop trailing text shaped like a JSON `"key": value` pair.
+            cleaned = re.sub(r',?\s*"[^"]+"\s*:\s*"[^"]*$', '', cleaned)
+            cleaned = re.sub(r',?\s*"[^"]+"\s*:\s*[^,}\]]*$', '', cleaned)
+
+            # Trim separators and whitespace left at the end.
+            cleaned = cleaned.rstrip(',，、 \t\n')
+
+            return cleaned.strip()
+
+        cleaned_plan = []
+        for entry in toc_plan:
+            if not isinstance(entry, dict):
+                continue
+
+            # Only entries that carry a description are rewritten.
+            if "description" in entry:
+                original_desc = entry["description"]
+                cleaned_desc = clean_text(original_desc)
+
+                if cleaned_desc != original_desc:
+                    logger.warning(
+                        f"清理目录项 '{entry.get('display', 'unknown')}' 的description字段中的JSON片段:\n"
+                        f"  原文: {str(original_desc)[:100]}...\n"
+                        f"  清理后: {cleaned_desc[:100]}..."
+                    )
+                    entry["description"] = cleaned_desc
+
+            cleaned_plan.append(entry)
+
+        return cleaned_plan
+
+
+__all__ = ["DocumentLayoutNode"]
@@ -1,254 +0,0 @@
-"""
-HTML生成节点
-将整合后的内容转换为美观的HTML报告
-"""
-
-import json
-from datetime import datetime
-from typing import Dict, Any
-from loguru import logger
-
-from .base_node import StateMutationNode
-from ..llms.base import LLMClient
-from ..state.state import ReportState
-from ..prompts import SYSTEM_PROMPT_HTML_GENERATION
-# 不再需要text_processing依赖
-
-
-class HTMLGenerationNode(StateMutationNode):
-    """HTML生成处理节点"""
-    
-    def __init__(self, llm_client: LLMClient):
-        """
-        初始化HTML生成节点
-        
-        Args:
-            llm_client: LLM客户端
-        """
-        super().__init__(llm_client, "HTMLGenerationNode")
-    
-    def run(self, input_data: Dict[str, Any], **kwargs) -> str:
-        """
-        执行HTML生成
-        
-        Args:
-            input_data: 包含报告数据的字典
-                - query: 原始查询
-                - query_engine_report: QueryEngine报告内容
-                - media_engine_report: MediaEngine报告内容  
-                - insight_engine_report: InsightEngine报告内容
-                - forum_logs: 论坛日志内容
-                - selected_template: 选择的模板内容
-                
-        Returns:
-            生成的HTML内容
-        """
-        logger.info("开始生成HTML报告...")
-        
-        try:
-            # 准备LLM输入数据
-            llm_input = {
-                "query": input_data.get('query', ''),
-                "query_engine_report": input_data.get('query_engine_report', ''),
-                "media_engine_report": input_data.get('media_engine_report', ''),
-                "insight_engine_report": input_data.get('insight_engine_report', ''),
-                "forum_logs": input_data.get('forum_logs', ''),
-                "selected_template": input_data.get('selected_template', '')
-            }
-            
-            # 转换为JSON格式传递给LLM
-            message = json.dumps(llm_input, ensure_ascii=False, indent=2)
-            
-            # 调用LLM生成HTML
-            response = self.llm_client.stream_invoke_to_string(SYSTEM_PROMPT_HTML_GENERATION, message)
-            
-            # 处理响应（简化版）
-            processed_response = self.process_output(response)
-            
-            logger.info("HTML报告生成完成")
-            return processed_response
-            
-        except Exception as e:
-            logger.exception(f"HTML生成失败: {str(e)}")
-            # 返回备用HTML
-            return self._generate_fallback_html(input_data)
-    
-    def mutate_state(self, input_data: Dict[str, Any], state: ReportState, **kwargs) -> ReportState:
-        """
-        修改报告状态，添加生成的HTML内容
-        
-        Args:
-            input_data: 输入数据
-            state: 当前报告状态
-            **kwargs: 额外参数
-            
-        Returns:
-            更新后的报告状态
-        """
-        # 生成HTML
-        html_content = self.run(input_data, **kwargs)
-        
-        # 更新状态
-        state.html_content = html_content
-        state.mark_completed()
-        
-        return state
-    
-    def process_output(self, output: str) -> str:
-        """
-        处理LLM输出，提取HTML内容
-        
-        Args:
-            output: LLM原始输出
-            
-        Returns:
-            HTML内容
-        """
-        try:
-            logger.info(f"处理LLM原始输出，长度: {len(output)} 字符")
-            
-            html_content = output.strip()
-            
-            # 清理markdown代码块标记（如果存在）
-            if html_content.startswith('```html'):
-                html_content = html_content[7:]  # 移除 '```html'
-                if html_content.endswith('```'):
-                    html_content = html_content[:-3]  # 移除结尾的 '```'
-            elif html_content.startswith('```') and html_content.endswith('```'):
-                html_content = html_content[3:-3]  # 移除前后的 '```'
-            
-            html_content = html_content.strip()
-            
-            # 如果内容为空，返回原始输出
-            if not html_content:
-                logger.info("处理后内容为空，返回原始输出")
-                html_content = output
-            
-            logger.info(f"HTML处理完成，最终长度: {len(html_content)} 字符")
-            return html_content
-            
-        except Exception as e:
-            logger.exception(f"处理HTML输出失败: {str(e)}，返回原始输出")
-            return output
-    
-    def _generate_fallback_html(self, input_data: Dict[str, Any]) -> str:
-        """
-        生成备用HTML报告（当LLM失败时使用）
-        
-        Args:
-            input_data: 输入数据
-            
-        Returns:
-            备用HTML内容
-        """
-        logger.info("使用备用HTML生成方法")
-        
-        query = input_data.get('query', '智能舆情分析报告')
-        query_report = input_data.get('query_engine_report', '')
-        media_report = input_data.get('media_engine_report', '')
-        insight_report = input_data.get('insight_engine_report', '')
-        forum_logs = input_data.get('forum_logs', '')
-        
-        generation_time = datetime.now().strftime("%Y年%m月%d日 %H:%M:%S")
-        
-        html_content = f"""<!DOCTYPE html>
-<html lang="zh-CN">
-<head>
-    <meta charset="UTF-8">
-    <meta name="viewport" content="width=device-width, initial-scale=1.0">
-    <title>{query} - 智能舆情分析报告</title>
-    <style>
-        body {{
-            font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', sans-serif;
-            line-height: 1.6;
-            color: #333;
-            max-width: 1200px;
-            margin: 0 auto;
-            padding: 20px;
-            background: #f5f5f5;
-        }}
-        .container {{
-            background: white;
-            padding: 40px;
-            border-radius: 8px;
-            box-shadow: 0 2px 10px rgba(0,0,0,0.1);
-        }}
-        h1 {{
-            color: #2c3e50;
-            border-bottom: 3px solid #3498db;
-            padding-bottom: 10px;
-        }}
-        h2 {{
-            color: #34495e;
-            margin-top: 30px;
-            margin-bottom: 15px;
-        }}
-        .section {{
-            margin-bottom: 30px;
-            padding: 20px;
-            border-left: 4px solid #3498db;
-            background: #f8f9fa;
-        }}
-        .meta {{
-            background: #e9ecef;
-            padding: 15px;
-            border-radius: 5px;
-            margin-bottom: 20px;
-        }}
-        .footer {{
-            margin-top: 40px;
-            padding-top: 20px;
-            border-top: 1px solid #eee;
-            text-align: center;
-            color: #666;
-        }}
-        pre {{
-            background: #f4f4f4;
-            padding: 15px;
-            border-radius: 5px;
-            overflow-x: auto;
-            white-space: pre-wrap;
-        }}
-    </style>
-</head>
-<body>
-    <div class="container">
-        <h1>{query}</h1>
-        
-        <div class="meta">
-            <strong>报告生成时间:</strong> {generation_time}<br>
-            <strong>数据来源:</strong> QueryEngine、MediaEngine、InsightEngine、ForumEngine<br>
-            <strong>报告类型:</strong> 综合舆情分析报告
-        </div>
-        
-        <h2>执行摘要</h2>
-        <div class="section">
-            本报告整合了多个分析引擎的研究结果，为您提供全面的舆情分析洞察。
-            通过对查询主题"{query}"的深度分析，我们从多个维度展现了当前的舆情态势。
-        </div>
-        
-        {f'<h2>QueryEngine分析结果</h2><div class="section"><pre>{query_report}</pre></div>' if query_report else ''}
-        
-        {f'<h2>MediaEngine分析结果</h2><div class="section"><pre>{media_report}</pre></div>' if media_report else ''}
-        
-        {f'<h2>InsightEngine分析结果</h2><div class="section"><pre>{insight_report}</pre></div>' if insight_report else ''}
-        
-        {f'<h2>论坛监控数据</h2><div class="section"><pre>{forum_logs}</pre></div>' if forum_logs else ''}
-        
-        <h2>综合结论</h2>
-        <div class="section">
-            基于多个分析引擎的综合研究，我们对"{query}"主题进行了全面分析。
-            各引擎从不同角度提供了深入洞察，为决策提供了重要参考。
-        </div>
-        
-        <div class="footer">
-            <p>本报告由智能舆情分析平台自动生成</p>
-            <p>ReportEngine v1.0 | 生成时间: {generation_time}</p>
-        </div>
-    </div>
-</body>
-</html>"""
-        
-        return html_content
-    
-
@@ -1,6 +1,8 @@
 """
-模板选择节点
-根据查询内容和可用模板选择最合适的报告模板
+模板选择节点。
+
+综合用户查询、三引擎报告、论坛日志与本地模板库，
+调用LLM挑选最合适的报告骨架。
 """

 import os
@@ -10,25 +12,37 @@ from loguru import logger

 from .base_node import BaseNode
 from ..prompts import SYSTEM_PROMPT_TEMPLATE_SELECTION
+from ..utils.json_parser import RobustJSONParser, JSONParseError


 class TemplateSelectionNode(BaseNode):
-    """模板选择处理节点"""
+    """
+    模板选择处理节点。
+
+    负责准备模板候选列表、构建提示词、解析LLM返回结果，
+    并在失败时回退到内置模板。
+    """
    
    def __init__(self, llm_client, template_dir: str = "ReportEngine/report_template"):
        """
        初始化模板选择节点
-        
+
        Args:
            llm_client: LLM客户端
            template_dir: 模板目录路径
        """
        super().__init__(llm_client, "TemplateSelectionNode")
        self.template_dir = template_dir
+        # 初始化鲁棒JSON解析器，启用所有修复策略
+        self.json_parser = RobustJSONParser(
+            enable_json_repair=True,
+            enable_llm_repair=False,
+            max_repair_attempts=3,
+        )
        
    def run(self, input_data: Dict[str, Any], **kwargs) -> Dict[str, Any]:
        """
-        执行模板选择
+        执行模板选择。
        
        Args:
            input_data: 包含查询和报告内容的字典
@@ -37,7 +51,7 @@ class TemplateSelectionNode(BaseNode):
                - forum_logs: 论坛日志内容
                
        Returns:
-            选择的模板信息
+            选择的模板信息，包含名称、内容与选择理由
        """
        logger.info("开始模板选择...")
        
@@ -67,7 +81,21 @@ class TemplateSelectionNode(BaseNode):
    
    def _llm_template_selection(self, query: str, reports: List[Any], forum_logs: str, 
                              available_templates: List[Dict[str, Any]]) -> Optional[Dict[str, Any]]:
-        """使用LLM进行模板选择"""
+        """
+        使用LLM进行模板选择。
+
+        构造模板列表与报告摘要 → 调用LLM → 解析JSON →
+        验证模板是否存在并返回标准结构。
+
+        参数:
+            query: 用户输入的主题词。
+            reports: 多个分析引擎的报告内容。
+            forum_logs: 论坛日志，可能为空。
+            available_templates: 本地可用模板清单。
+
+        返回:
+            dict | None: 若LLM成功返回合法结果则包含模板信息，否则为None。
+        """
        logger.info("尝试使用LLM进行模板选择...")
        
        # 构建模板列表
@@ -116,20 +144,22 @@ class TemplateSelectionNode(BaseNode):
        
        # 调用LLM
        response = self.llm_client.stream_invoke_to_string(SYSTEM_PROMPT_TEMPLATE_SELECTION, user_message)
-        
+
        # 检查响应是否为空
        if not response or not response.strip():
            logger.error("LLM返回空响应")
            return None
-        
+
        logger.info(f"LLM原始响应: {response}")
-        
-        # 尝试解析JSON响应
+
+        # 尝试解析JSON响应，使用鲁棒解析器
        try:
-            # 清理响应文本
-            cleaned_response = self._clean_llm_response(response)
-            result = json.loads(cleaned_response)
-            
+            result = self.json_parser.parse(
+                response,
+                context_name="模板选择",
+                expected_keys=["template_name", "selection_reason"],
+            )
+
            # 验证选择的模板是否存在
            selected_template_name = result.get('template_name', '')
            for template in available_templates:
@@ -140,30 +170,29 @@ class TemplateSelectionNode(BaseNode):
                        'template_content': template['content'],
                        'selection_reason': result.get('selection_reason', 'LLM智能选择')
                    }
-            
+
            logger.error(f"LLM选择的模板不存在: {selected_template_name}")
            return None
-            
-        except json.JSONDecodeError as e:
+
+        except JSONParseError as e:
            logger.error(f"JSON解析失败: {str(e)}")
            # 尝试从文本响应中提取模板信息
            return self._extract_template_from_text(response, available_templates)
    
-    def _clean_llm_response(self, response: str) -> str:
-        """清理LLM响应"""
-        # 移除可能的markdown代码块标记
-        if '```json' in response:
-            response = response.split('```json')[1].split('```')[0]
-        elif '```' in response:
-            response = response.split('```')[1].split('```')[0]
-        
-        # 移除前后空白
-        response = response.strip()
-        
-        return response
-    
+
    def _extract_template_from_text(self, response: str, available_templates: List[Dict[str, Any]]) -> Optional[Dict[str, Any]]:
-        """从文本响应中提取模板信息"""
+        """
+        从文本响应中提取模板信息。
+
+        当LLM未输出合法JSON时，尝试匹配模板名称关键字做降级。
+
+        参数:
+            response: 非结构化的LLM文本。
+            available_templates: 可选模板列表。
+
+        返回:
+            dict | None: 匹配成功时返回模板详情，否则为None。
+        """
        logger.info("尝试从文本响应中提取模板信息")
        
        # 查找响应中是否包含模板名称
@@ -186,7 +215,14 @@ class TemplateSelectionNode(BaseNode):
        return None
    
    def _get_available_templates(self) -> List[Dict[str, Any]]:
-        """获取可用的模板列表"""
+        """
+        获取可用的模板列表。
+
+        枚举模板目录下的 `.md` 文件并读取内容与描述字段。
+
+        返回:
+            list[dict]: 每项包含 name/path/content/description。
+        """
        templates = []
        
        if not os.path.exists(self.template_dir):
@@ -216,7 +252,7 @@ class TemplateSelectionNode(BaseNode):
        return templates
    
    def _extract_template_description(self, template_name: str) -> str:
-        """根据模板名称生成描述"""
+        """根据模板名称生成描述，方便LLM理解模板定位。"""
        if '企业品牌' in template_name:
            return "适用于企业品牌声誉和形象分析"
        elif '市场竞争' in template_name:
@@ -235,7 +271,12 @@ class TemplateSelectionNode(BaseNode):

    
    def _get_fallback_template(self) -> Dict[str, Any]:
-        """获取备用默认模板（空模板，让LLM自行发挥）"""
+        """
+        获取备用默认模板（空模板，让LLM自行发挥）。
+
+        返回:
+            dict: 结构体字段与LLM返回一致，方便直接替换。
+        """
        logger.info("未找到合适模板，使用空模板让LLM自行发挥")
        
        return {
@@ -0,0 +1,126 @@
+"""
+章节篇幅规划节点。
+"""
+
+from __future__ import annotations
+
+import json
+from typing import Any, Dict, List
+
+from loguru import logger
+
+from ..core import TemplateSection
+from ..prompts import (
+    SYSTEM_PROMPT_WORD_BUDGET,
+    build_word_budget_prompt,
+)
+from ..utils.json_parser import RobustJSONParser, JSONParseError
+from .base_node import BaseNode
+
+
+class WordBudgetNode(BaseNode):
+    """
+    Plan the word budget and focus of each chapter.
+
+    Asks the LLM for a plan holding ``totalWords``, ``globalGuidelines`` and ``chapters``.
+    """
+
+    def __init__(self, llm_client):
+        """Register the LLM client and node name, then build the JSON parser."""
+        super().__init__(llm_client, "WordBudgetNode")
+        # Parser for LLM output: json_repair enabled, LLM-assisted repair off.
+        self.json_parser = RobustJSONParser(
+            enable_json_repair=True,
+            enable_llm_repair=False,  # switch on if LLM-assisted repair is wanted
+            max_repair_attempts=3,
+        )
+
+    def run(
+        self,
+        sections: List[TemplateSection],
+        design: Dict[str, Any],
+        reports: Dict[str, str],
+        forum_logs: str,
+        query: str,
+        template_overview: Dict[str, Any] | None = None,
+    ) -> Dict[str, Any]:
+        """
+        Ask the LLM for a per-chapter word budget and return it as a dict.
+
+        Args:
+            sections: Template sections; each is serialised with ``to_dict()``.
+            design: Design dict, forwarded unchanged as ``design``.
+            reports: Report contents, forwarded unchanged as ``reports``.
+            forum_logs: Forum log text, forwarded unchanged as ``forumLogs``.
+            query: The user's query string.
+            template_overview: Optional overview; built from ``sections`` if falsy.
+
+        Returns:
+            dict: The parsed plan as returned by ``_parse_response``.
+        """
+        # The design is sent along with the section skeleton and source material.
+        payload = {
+            "query": query,
+            "design": design,
+            "sections": [section.to_dict() for section in sections],
+            "templateOverview": template_overview
+            or {
+                "title": sections[0].title if sections else "",
+                "chapters": [section.to_dict() for section in sections],
+            },
+            "reports": reports,
+            "forumLogs": forum_logs,
+        }
+        user = build_word_budget_prompt(payload)
+        response = self.llm_client.stream_invoke_to_string(
+            SYSTEM_PROMPT_WORD_BUDGET,
+            user,
+            temperature=0.25,
+            top_p=0.85,
+        )
+        plan = self._parse_response(response)
+        logger.info("章节字数规划已生成")
+        return plan
+
+    def _parse_response(self, raw: str) -> Dict[str, Any]:
+        """
+        Parse the LLM response into a plan dict with usable key fields.
+
+        Parsing is delegated to ``self.json_parser`` (json_repair enabled,
+        LLM repair disabled, see ``__init__``). After parsing, the fields
+        ``totalWords`` (number), ``globalGuidelines`` (list) and ``chapters``
+        (list or dict) are forced to usable values: a missing or wrongly
+        typed field is replaced by its default instead of being left as-is.
+
+        Args:
+            raw: Raw LLM response text.
+
+        Returns:
+            dict: The parsed plan with the three key fields normalised.
+
+        Raises:
+            ValueError: When the parser raises ``JSONParseError``.
+        """
+        try:
+            result = self.json_parser.parse(
+                raw,
+                context_name="篇幅规划",
+                expected_keys=["totalWords", "globalGuidelines", "chapters"],
+            )
+            # Overwrite (not setdefault): a present-but-invalid value must go too.
+            if not isinstance(result.get("totalWords"), (int, float)):
+                logger.warning("篇幅规划缺少totalWords字段或类型错误，使用默认值")
+                result["totalWords"] = 10000
+            if not isinstance(result.get("globalGuidelines"), list):
+                logger.warning("篇幅规划缺少globalGuidelines字段或类型错误，使用空列表")
+                result["globalGuidelines"] = []
+            if not isinstance(result.get("chapters"), (list, dict)):
+                logger.warning("篇幅规划缺少chapters字段或类型错误，使用空列表")
+                result["chapters"] = []
+            return result
+        except JSONParseError as exc:
+            # Re-raise as ValueError, the exception type callers already handle.
+            raise ValueError(f"篇幅规划JSON解析失败: {exc}") from exc
+
+
+__all__ = ["WordBudgetNode"]
@@ -1,18 +1,41 @@
 """
-Report Engine提示词模块
-定义报告生成各个阶段使用的系统提示词
+Report Engine提示词模块。
+
+集中导出各阶段系统提示词与辅助函数，其他模块可直接from prompts import。
 """

 from .prompts import (
    SYSTEM_PROMPT_TEMPLATE_SELECTION,
    SYSTEM_PROMPT_HTML_GENERATION,
+    SYSTEM_PROMPT_CHAPTER_JSON,
+    SYSTEM_PROMPT_CHAPTER_JSON_REPAIR,
+    SYSTEM_PROMPT_CHAPTER_JSON_RECOVERY,
+    SYSTEM_PROMPT_DOCUMENT_LAYOUT,
+    SYSTEM_PROMPT_WORD_BUDGET,
    output_schema_template_selection,
-    input_schema_html_generation
+    input_schema_html_generation,
+    chapter_generation_input_schema,
+    build_chapter_user_prompt,
+    build_chapter_repair_prompt,
+    build_chapter_recovery_payload,
+    build_document_layout_prompt,
+    build_word_budget_prompt,
 )

 __all__ = [
    "SYSTEM_PROMPT_TEMPLATE_SELECTION",
-    "SYSTEM_PROMPT_HTML_GENERATION", 
+    "SYSTEM_PROMPT_HTML_GENERATION",
+    "SYSTEM_PROMPT_CHAPTER_JSON",
+    "SYSTEM_PROMPT_CHAPTER_JSON_REPAIR",
+    "SYSTEM_PROMPT_DOCUMENT_LAYOUT",
+    "SYSTEM_PROMPT_WORD_BUDGET",
+    "SYSTEM_PROMPT_CHAPTER_JSON_RECOVERY",
    "output_schema_template_selection",
-    "input_schema_html_generation"
+    "input_schema_html_generation",
+    "chapter_generation_input_schema",
+    "build_chapter_user_prompt",
+    "build_chapter_repair_prompt",
+    "build_chapter_recovery_payload",
+    "build_document_layout_prompt",
+    "build_word_budget_prompt",
 ]
@@ -1,10 +1,19 @@
 """
-Report Engine 的所有提示词定义
-参考MediaEngine的结构，专门用于报告生成
+Report Engine 的所有提示词定义。
+
+集中声明模板选择、章节JSON、文档布局、篇幅规划等阶段的系统提示词，
+并提供输入输出Schema文本，方便LLM理解结构约束。
 """

 import json

+from ..ir import (
+    ALLOWED_BLOCK_TYPES,
+    ALLOWED_INLINE_MARKS,
+    CHAPTER_JSON_SCHEMA_TEXT,
+    IR_VERSION,
+)
+
 # ===== JSON Schema 定义 =====

 # 模板选择输出Schema
@@ -30,6 +39,58 @@ input_schema_html_generation = {
    }
 }

+# Input schema for per-chapter JSON generation (describes the fields to the prompt)
+chapter_generation_input_schema = {
+    "type": "object",
+    "properties": {
+        "section": {
+            "type": "object",
+            "properties": {
+                "title": {"type": "string"},
+                "slug": {"type": "string"},
+                "order": {"type": "number"},
+                "number": {"type": "string"},
+                "outline": {"type": "array", "items": {"type": "string"}}
+            },
+            "required": ["title", "slug", "order"]
+        },
+        "globalContext": {
+            "type": "object",
+            "properties": {
+                "query": {"type": "string"},
+                "templateName": {"type": "string"},
+                "themeTokens": {"type": "object"},
+                "styleDirectives": {"type": "object"}
+            }
+        },
+        "reports": {
+            "type": "object",
+            "properties": {
+                "query_engine": {"type": "string"},
+                "media_engine": {"type": "string"},
+                "insight_engine": {"type": "string"}
+            }
+        },
+        "forumLogs": {"type": "string"},
+        "dataBundles": {
+            "type": "array",
+            "items": {"type": "object"}
+        },
+        "constraints": {
+            "type": "object",
+            "properties": {
+                "language": {"type": "string"},
+                "maxTokens": {"type": "number"},
+                "allowedBlocks": {
+                    "type": "array",
+                    "items": {"type": "string"}
+                }
+            }
+        }
+    },
+    "required": ["section", "globalContext", "reports"]
+}
+
 # HTML报告生成输出Schema - 已简化，不再使用JSON格式
 # output_schema_html_generation = {
 #     "type": "object",
@@ -39,6 +100,96 @@ input_schema_html_generation = {
 #     "required": ["html_content"]
 # }

+# Output schema for the document title / table-of-contents design step:
+# constrains the fields that DocumentLayoutNode expects.
+# Only `title` and `tocPlan` are required.
+document_layout_output_schema = {
+    "type": "object",
+    "properties": {
+        "title": {"type": "string"},
+        "subtitle": {"type": "string"},
+        "tagline": {"type": "string"},
+        "tocTitle": {"type": "string"},
+        # Hero area: summary text, highlight bullets, KPI cards and action items.
+        "hero": {
+            "type": "object",
+            "properties": {
+                "summary": {"type": "string"},
+                "highlights": {"type": "array", "items": {"type": "string"}},
+                "kpis": {
+                    "type": "array",
+                    "items": {
+                        "type": "object",
+                        "properties": {
+                            "label": {"type": "string"},
+                            "value": {"type": "string"},
+                            "delta": {"type": "string"},
+                            "tone": {"type": "string", "enum": ["up", "down", "neutral"]},
+                        },
+                        "required": ["label", "value"],
+                    },
+                },
+                "actions": {"type": "array", "items": {"type": "string"}},
+            },
+        },
+        "themeTokens": {"type": "object"},
+        # One entry per table-of-contents item; `chapterId` and `display` are mandatory.
+        "tocPlan": {
+            "type": "array",
+            "items": {
+                "type": "object",
+                "properties": {
+                    "chapterId": {"type": "string"},
+                    "anchor": {"type": "string"},
+                    "display": {"type": "string"},
+                    "description": {"type": "string"},
+                },
+                "required": ["chapterId", "display"],
+            },
+        },
+        "layoutNotes": {"type": "array", "items": {"type": "string"}},
+    },
+    "required": ["title", "tocPlan"],
+}
+
+# Word-budget planning schema: constrains the structure emitted by WordBudgetNode.
+# Each chapter entry carries its own target/min/max word counts and may break the
+# budget down further into per-section entries.
+word_budget_output_schema = {
+    "type": "object",
+    "properties": {
+        "totalWords": {"type": "number"},
+        "tolerance": {"type": "number"},
+        "globalGuidelines": {"type": "array", "items": {"type": "string"}},
+        "chapters": {
+            "type": "array",
+            "items": {
+                "type": "object",
+                "properties": {
+                    "chapterId": {"type": "string"},
+                    "title": {"type": "string"},
+                    "targetWords": {"type": "number"},
+                    "minWords": {"type": "number"},
+                    "maxWords": {"type": "number"},
+                    "emphasis": {"type": "array", "items": {"type": "string"}},
+                    "rationale": {"type": "string"},
+                    "sections": {
+                        "type": "array",
+                        "items": {
+                            "type": "object",
+                            "properties": {
+                                "title": {"type": "string"},
+                                "anchor": {"type": "string"},
+                                "targetWords": {"type": "number"},
+                                "minWords": {"type": "number"},
+                                "maxWords": {"type": "number"},
+                                "notes": {"type": "string"},
+                            },
+                            "required": ["title", "targetWords"],
+                        },
+                    },
+                },
+                "required": ["chapterId", "targetWords"],
+            },
+        },
+    },
+    "required": ["totalWords", "chapters"],
+}
+
 # ===== 系统提示词定义 =====

 # 模板选择的系统提示词
@@ -65,8 +216,17 @@ SYSTEM_PROMPT_TEMPLATE_SELECTION = f"""
 {json.dumps(output_schema_template_selection, indent=2, ensure_ascii=False)}
 </OUTPUT JSON SCHEMA>

-确保输出是一个符合上述输出JSON模式定义的JSON对象。
-只返回JSON对象，不要有解释或额外文本。
+**重要的输出格式要求：**
+1. 只返回符合上述Schema的纯JSON对象
+2. 严禁在JSON外添加任何思考过程、说明文字或解释
+3. 可以使用```json和```标记包裹JSON，但不要添加其他内容
+4. 确保JSON语法完全正确：
+   - 对象和数组元素之间必须有逗号分隔
+   - 字符串中的特殊字符必须正确转义（\n, \t, \"等）
+   - 括号必须成对且正确嵌套
+   - 不要使用尾随逗号（最后一个元素后不加逗号）
+   - 不要在JSON中添加注释
+5. 所有字符串值使用双引号，数值不使用引号
 """

 # HTML报告生成的系统提示词
@@ -133,3 +293,190 @@ SYSTEM_PROMPT_HTML_GENERATION = f"""

 **重要：直接返回完整的HTML代码，不要包含任何解释、说明或其他文本。只返回HTML代码本身。**
 """
+
+# System prompt for per-chapter JSON generation.
+# This is an f-string: IR_VERSION, ALLOWED_BLOCK_TYPES and CHAPTER_JSON_SCHEMA_TEXT are
+# interpolated when the module is imported, so every literal brace in the prompt text
+# must be doubled (`{{` renders `{`, `}}` renders `}`).
+SYSTEM_PROMPT_CHAPTER_JSON = f"""
+你是Report Engine的“章节装配工厂”，负责把不同章节的素材铣削成
+符合《可执行JSON契约(IR)》的章节JSON。稍后我会提供单个章节要点、
+全局数据与风格指令，你需要：
+1. 完全遵循IR版本 {IR_VERSION} 的结构，严禁输出HTML或Markdown。
+2. 仅使用以下Block类型：{', '.join(ALLOWED_BLOCK_TYPES)}；其中图表用block.type=widget并填充Chart.js配置。
+3. 所有段落都放入paragraph.inlines，混排样式通过marks表示（bold/italic/color/link等）。
+4. 所有heading必须包含anchor，锚点与编号保持模板一致，比如section-2-1。
+5. 表格需给出rows/cells/align，KPI卡请使用kpiGrid，分割线用hr。
+6. 如需引用图表/交互组件，统一用widgetType表示（例如chart.js/line、chart.js/doughnut）。
+7. 鼓励结合outline中列出的子标题，生成多层heading与细粒度内容，同时可补充callout、blockquote等。
+8. 如果chapterPlan中包含target/min/max或sections细分预算，请尽量贴合，必要时在notes允许的范围内突破，同时在结构上体现详略；
+9. 一级标题需使用中文数字（“一、二、三”），二级标题使用阿拉伯数字（“1.1、1.2”），heading.text中直接写好编号，与outline顺序对应；
+10. 严禁输出外部图片/AI生图链接，仅可使用Chart.js图表、表格、色块、callout等HTML原生组件；如需视觉辅助请改为文字描述或数据表；
+11. 段落混排需通过marks表达粗体、斜体、下划线、颜色等样式，禁止残留Markdown语法（如**text**）；
+12. 行间公式用block.type="math"并填入math.latex，行内公式在paragraph.inlines里将文本设为Latex并加上marks.type="math"，渲染层会用MathJax处理；
+13. widget配色需与CSS变量兼容，不要硬编码背景色或文字色，legend/ticks由渲染层控制；
+14. 善用callout、kpiGrid、表格、widget等提升版面丰富度，但必须遵守模板章节范围。
+15. 输出前务必自检JSON语法：禁止出现`}}{{`或`][`相连缺少逗号、列表项嵌套超过一层、未闭合的括号或未转义换行，`list` block的items必须是`[[block,...], ...]`结构，若无法满足则返回错误提示而不是输出不合法JSON。
+16. 所有widget块必须在顶层提供`data`或`dataRef`（可将props中的`data`上移），确保Chart.js能够直接渲染；缺失数据时宁可输出表格或段落，绝不留空。
+17. 任何block都必须声明合法`type`（heading/paragraph/list/...）；若需要普通文本请使用`paragraph`并给出`inlines`，禁止返回`type:null`或未知值。
+
+<CHAPTER JSON SCHEMA>
+{CHAPTER_JSON_SCHEMA_TEXT}
+</CHAPTER JSON SCHEMA>
+
+输出格式：
+{{"chapter": {{...遵循上述Schema的章节JSON...}}}}
+
+严禁添加除JSON以外的任何文本或注释。
+"""
+
+# System prompt for the fallback "chapter JSON repair" pass, used when a chapter draft
+# fails IR validation. It is an f-string that interpolates IR_VERSION, the allowed block
+# types, the allowed inline marks and the chapter JSON schema text.
+SYSTEM_PROMPT_CHAPTER_JSON_REPAIR = f"""
+你现在扮演Report Engine的“章节JSON修复官”，负责在章节草稿无法通过IR校验时进行兜底修复。
+
+请牢记：
+1. 所有chapter必须满足IR版本 {IR_VERSION} 约束，仅允许以下block.type：{', '.join(ALLOWED_BLOCK_TYPES)}；
+2. paragraph.inlines中的marks必须来自以下集合：{', '.join(ALLOWED_INLINE_MARKS)}；
+3. 允许的结构、字段与嵌套规则全部写在《CHAPTER JSON SCHEMA》中，任何缺少字段、数组嵌套错误或list.items不是二维数组的情况都必须修复；
+4. 不得更改事实、数值与结论，只能对结构/字段名/嵌套层级做最小修改以通过校验；
+5. 最终输出只能包含合法JSON，格式严格为：{{"chapter": {{...修复后的章节JSON...}}}}，禁止额外解释或Markdown。
+
+<CHAPTER JSON SCHEMA>
+{CHAPTER_JSON_SCHEMA_TEXT}
+</CHAPTER JSON SCHEMA>
+
+只返回JSON，不要添加注释或自然语言。
+"""
+
+# System prompt for the cross-engine "JSON rescue" pass. The prompt text describes an
+# input with generationPayload, rawChapterOutput and section fields. It is an f-string
+# that interpolates IR_VERSION, the allowed block types and the allowed inline marks.
+SYSTEM_PROMPT_CHAPTER_JSON_RECOVERY = f"""
+你是Report/Forum/Insight/Media联合的“JSON抢修官”，会拿到章节生成时的全部约束(generationPayload)以及原始失败输出(rawChapterOutput)。
+
+请遵守：
+1. 章节必须满足IR版本 {IR_VERSION} 规范，block.type 仅能使用：{', '.join(ALLOWED_BLOCK_TYPES)}；
+2. paragraph.inlines中的marks仅可出现：{', '.join(ALLOWED_INLINE_MARKS)}，并保留原始文字顺序；
+3. 请以 generationPayload 中的 section 信息为主导，heading.text 与 anchor 必须与章节slug保持一致；
+4. 仅对JSON语法/字段/嵌套做最小必要修复，不改写事实与结论；
+5. 输出严格遵循 {{\"chapter\": {{...}}}} 格式，不添加说明。
+
+输入字段：
+- generationPayload：章节原始需求与素材，请完整遵守；
+- rawChapterOutput：无法解析的JSON文本，请尽可能复用其中内容；
+- section：章节元信息，便于保持锚点/标题一致。
+
+请直接返回修复后的JSON。
+"""
+
+# System prompt for the document title / table-of-contents / theme design step.
+# This is a non-raw f-string, so backslash escapes are processed by Python: the escape
+# sequences shown to the model in the formatting rules are written with a doubled
+# backslash (`\\n`) so that the rendered prompt contains the two characters `\n` rather
+# than a real line break.
+SYSTEM_PROMPT_DOCUMENT_LAYOUT = f"""
+你是报告首席设计官，需要结合模板大纲与三个分析引擎的内容，为整本报告确定最终的标题、导语区、目录样式与美学要素。
+
+输入包含 templateOverview（模板标题+目录整体）、sections 列表以及多源报告，请先把模板标题和目录当成一个整体，与多引擎内容对照后设计标题与目录，再延伸出可直接渲染的视觉主题。你的输出会被独立存储以便后续拼接，请确保字段齐备。
+
+目标：
+1. 生成具有中文叙事风格的 title/subtitle/tagline，并确保可直接放在封面中央，文案中需自然提到"文章总览"；
+2. 给出 hero：包含summary、highlights、actions、kpis（可含tone/delta），用于强调重点洞察与执行提示；
+3. 输出 tocPlan，一级目录固定用中文数字（"一、二、三"），二级目录用"1.1/1.2"，可在description里说明详略；如需定制目录标题，请填写 tocTitle；
+4. 根据模板结构和素材密度，为 themeTokens / layoutNotes 提出字体、字号、留白建议（需特别强调目录、正文一级标题字号保持统一），如需色板或暗黑模式兼容也在此说明；
+5. 严禁要求外部图片或AI生图，推荐Chart.js图表、表格、色块、KPI卡等可直接渲染的原生组件；
+6. 不随意增删章节，仅优化命名或描述；若有排版或章节合并提示，请放入 layoutNotes，渲染层会严格遵循。
+
+**tocPlan的description字段特别要求：**
+- description字段必须是纯文本描述，用于在目录中展示章节简介
+- 严禁在description字段中嵌套JSON结构、对象、数组或任何特殊标记
+- description应该是简洁的一句话或一小段话，描述该章节的核心内容
+- 错误示例：{{"description": "描述内容，{{\"chapterId\": \"S3\"}}"}}
+- 正确示例：{{"description": "描述内容，详细分析章节要点"}}
+- 如果需要关联chapterId，请使用tocPlan对象的chapterId字段，不要写在description中
+
+输出必须满足下述JSON Schema：
+<OUTPUT JSON SCHEMA>
+{json.dumps(document_layout_output_schema, ensure_ascii=False, indent=2)}
+</OUTPUT JSON SCHEMA>
+
+**重要的输出格式要求：**
+1. 只返回符合上述Schema的纯JSON对象
+2. 严禁在JSON外添加任何思考过程、说明文字或解释
+3. 可以使用```json和```标记包裹JSON，但不要添加其他内容
+4. 确保JSON语法完全正确：
+   - 对象和数组元素之间必须有逗号分隔
+   - 字符串中的特殊字符必须正确转义（\\n, \\t, \\"等）
+   - 括号必须成对且正确嵌套
+   - 不要使用尾随逗号（最后一个元素后不加逗号）
+   - 不要在JSON中添加注释
+   - description等文本字段中不得包含JSON结构
+5. 所有字符串值使用双引号，数值不使用引号
+6. 再次强调：tocPlan中每个条目的description必须是纯文本，不能包含任何JSON片段
+"""
+
+# System prompt for the word-budget planning step.
+# This is a non-raw f-string, so backslash escapes are processed by Python: the escape
+# sequences shown to the model in the formatting rules are written with a doubled
+# backslash (`\\n`) so that the rendered prompt contains the two characters `\n` rather
+# than a real line break.
+SYSTEM_PROMPT_WORD_BUDGET = f"""
+你是报告篇幅规划官，会拿到 templateOverview（模板标题+目录）、最新的标题/目录设计稿与全部素材，需要给每章及其子主题分配字数。
+
+要求：
+1. 总字数约40000字，可上下浮动5%，并给出 globalGuidelines 说明整体详略策略；
+2. chapters 中每章需包含 targetWords/min/max、需要额外展开的 emphasis、sections 数组（为该章各小节/提纲分配字数与注意事项，可注明“允许在必要时超出10%补充案例”等）；
+3. rationale 必须解释该章篇幅配置理由，引用模板/素材中的关键信息；
+4. 章节编号遵循一级中文数字、二级阿拉伯数字，便于后续统一字号；
+5. 结果写成JSON并满足下述Schema，仅用于内部存储与章节生成，不直接输出给读者。
+
+<OUTPUT JSON SCHEMA>
+{json.dumps(word_budget_output_schema, ensure_ascii=False, indent=2)}
+</OUTPUT JSON SCHEMA>
+
+**重要的输出格式要求：**
+1. 只返回符合上述Schema的纯JSON对象
+2. 严禁在JSON外添加任何思考过程、说明文字或解释
+3. 可以使用```json和```标记包裹JSON，但不要添加其他内容
+4. 确保JSON语法完全正确：
+   - 对象和数组元素之间必须有逗号分隔
+   - 字符串中的特殊字符必须正确转义（\\n, \\t, \\"等）
+   - 括号必须成对且正确嵌套
+   - 不要使用尾随逗号（最后一个元素后不加逗号）
+   - 不要在JSON中添加注释
+5. 所有字符串值使用双引号，数值不使用引号
+"""
+
+
+def build_chapter_user_prompt(payload: dict) -> str:
+    """
+    Serialize the chapter context into the user-prompt text.
+
+    The payload is dumped as JSON with a 2-space indent and with non-ASCII
+    characters kept verbatim (`ensure_ascii=False`).
+    """
+    serialized = json.dumps(payload, indent=2, ensure_ascii=False)
+    return serialized
+
+
+def build_chapter_repair_prompt(chapter: dict, errors, original_text=None) -> str:
+    """
+    Build the JSON input for the chapter-repair prompt.
+
+    The result always contains the failed chapter (`failedChapter`) and the
+    validator errors (`validatorErrors`). When `original_text` is truthy, its
+    last 2000 characters are attached as `rawOutputTail`.
+    """
+    repair_input: dict = dict(failedChapter=chapter, validatorErrors=errors)
+    if original_text:
+        repair_input.update(rawOutputTail=original_text[-2000:])
+    return json.dumps(repair_input, indent=2, ensure_ascii=False)
+
+
+def build_chapter_recovery_payload(
+    section: dict, generation_payload: dict, raw_output: str
+) -> str:
+    """
+    Build the JSON input for the cross-engine JSON rescue prompt.
+
+    The result bundles the chapter metadata (`section`), the original generation
+    instructions (`generationPayload`) and the failed output (`rawChapterOutput`).
+    To keep the prompt short, a string `raw_output` is cut down to its last 8000
+    characters; a non-string value is passed through unchanged.
+    """
+    raw_tail = raw_output
+    if isinstance(raw_output, str):
+        raw_tail = raw_output[-8000:]
+    recovery_input = {
+        "section": section,
+        "generationPayload": generation_payload,
+        "rawChapterOutput": raw_tail,
+    }
+    return json.dumps(recovery_input, indent=2, ensure_ascii=False)
+
+
+def build_document_layout_prompt(payload: dict) -> str:
+    """Serialize the document-design context to an indented JSON string for the layout prompt."""
+    serialized = json.dumps(payload, indent=2, ensure_ascii=False)
+    return serialized
+
+
+def build_word_budget_prompt(payload: dict) -> str:
+    """Serialize the word-budget planning input to an indented JSON string for the prompt."""
+    serialized = json.dumps(payload, indent=2, ensure_ascii=False)
+    return serialized
@@ -0,0 +1,31 @@
+"""
+Report Engine渲染器集合。
+
+提供 HTMLRenderer 和 PDFRenderer，支持HTML和PDF输出。
+"""
+
+from .html_renderer import HTMLRenderer
+from .pdf_renderer import PDFRenderer
+from .pdf_layout_optimizer import (
+    PDFLayoutOptimizer,
+    PDFLayoutConfig,
+    PageLayout,
+    KPICardLayout,
+    CalloutLayout,
+    TableLayout,
+    ChartLayout,
+    GridLayout,
+)
+
+# Names exported by `from <package> import *`: the two renderers plus the
+# PDF layout optimizer and its layout configuration classes.
+__all__ = [
+    "HTMLRenderer",
+    "PDFRenderer",
+    "PDFLayoutOptimizer",
+    "PDFLayoutConfig",
+    "PageLayout",
+    "KPICardLayout",
+    "CalloutLayout",
+    "TableLayout",
+    "ChartLayout",
+    "GridLayout",
+]
@@ -0,0 +1,96 @@
+Copyright 2017-2022 Adobe (http://www.adobe.com/), with Reserved Font
+Name 'Source'. Source is a trademark of Adobe in the United States
+and/or other countries.
+
+This Font Software is licensed under the SIL Open Font License,
+Version 1.1.
+
+This license is copied below, and is also available with a FAQ at:
+http://scripts.sil.org/OFL
+
+-----------------------------------------------------------
+SIL OPEN FONT LICENSE Version 1.1 - 26 February 2007
+-----------------------------------------------------------
+
+PREAMBLE
+The goals of the Open Font License (OFL) are to stimulate worldwide
+development of collaborative font projects, to support the font
+creation efforts of academic and linguistic communities, and to
+provide a free and open framework in which fonts may be shared and
+improved in partnership with others.
+
+The OFL allows the licensed fonts to be used, studied, modified and
+redistributed freely as long as they are not sold by themselves. The
+fonts, including any derivative works, can be bundled, embedded,
+redistributed and/or sold with any software provided that any reserved
+names are not used by derivative works. The fonts and derivatives,
+however, cannot be released under any other type of license. The
+requirement for fonts to remain under this license does not apply to
+any document created using the fonts or their derivatives.
+
+DEFINITIONS
+"Font Software" refers to the set of files released by the Copyright
+Holder(s) under this license and clearly marked as such. This may
+include source files, build scripts and documentation.
+
+"Reserved Font Name" refers to any names specified as such after the
+copyright statement(s).
+
+"Original Version" refers to the collection of Font Software
+components as distributed by the Copyright Holder(s).
+
+"Modified Version" refers to any derivative made by adding to,
+deleting, or substituting -- in part or in whole -- any of the
+components of the Original Version, by changing formats or by porting
+the Font Software to a new environment.
+
+"Author" refers to any designer, engineer, programmer, technical
+writer or other person who contributed to the Font Software.
+
+PERMISSION & CONDITIONS
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of the Font Software, to use, study, copy, merge, embed,
+modify, redistribute, and sell modified and unmodified copies of the
+Font Software, subject to the following conditions:
+
+1) Neither the Font Software nor any of its individual components, in
+Original or Modified Versions, may be sold by itself.
+
+2) Original or Modified Versions of the Font Software may be bundled,
+redistributed and/or sold with any software, provided that each copy
+contains the above copyright notice and this license. These can be
+included either as stand-alone text files, human-readable headers or
+in the appropriate machine-readable metadata fields within text or
+binary files as long as those fields can be easily viewed by the user.
+
+3) No Modified Version of the Font Software may use the Reserved Font
+Name(s) unless explicit written permission is granted by the
+corresponding Copyright Holder. This restriction only applies to the
+primary font name as presented to the users.
+
+4) The name(s) of the Copyright Holder(s) or the Author(s) of the Font
+Software shall not be used to promote, endorse or advertise any
+Modified Version, except to acknowledge the contribution(s) of the
+Copyright Holder(s) and the Author(s) or with their explicit written
+permission.
+
+5) The Font Software, modified or unmodified, in part or in whole,
+must be distributed entirely under this license, and must not be
+distributed under any other license. The requirement for fonts to
+remain under this license does not apply to any document created using
+the Font Software.
+
+TERMINATION
+This license becomes null and void if any of the above conditions are
+not met.
+
+DISCLAIMER
+THE FONT SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO ANY WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT
+OF COPYRIGHT, PATENT, TRADEMARK, OR OTHER RIGHT. IN NO EVENT SHALL THE
+COPYRIGHT HOLDER BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+INCLUDING ANY GENERAL, SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL
+DAMAGES, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+FROM, OUT OF THE USE OR INABILITY TO USE THE FONT SOFTWARE OR FROM
+OTHER DEALINGS IN THE FONT SOFTWARE.
@@ -0,0 +1,52 @@
+# 第三方JavaScript库
+
+本目录包含HTML报告渲染所需的第三方JavaScript库。渲染器在生成报告时会将这些库内联到HTML文件中，以便在离线环境中使用。
+
+## 包含的库
+
+1. **chart.js** (204KB) - 用于图表渲染
+   - 版本: 4.5.1
+   - 来源: https://cdn.jsdelivr.net/npm/chart.js
+
+2. **chartjs-chart-sankey.js** (10KB) - Sankey图表插件
+   - 版本: 0.12.0
+   - 来源: https://unpkg.com/chartjs-chart-sankey@0.12.0/dist/chartjs-chart-sankey.min.js
+
+3. **html2canvas.min.js** (194KB) - HTML转Canvas工具
+   - 版本: 1.4.1
+   - 来源: https://cdnjs.cloudflare.com/ajax/libs/html2canvas/1.4.1/html2canvas.min.js
+
+4. **jspdf.umd.min.js** (356KB) - PDF导出库
+   - 版本: 2.5.1
+   - 来源: https://cdnjs.cloudflare.com/ajax/libs/jspdf/2.5.1/jspdf.umd.min.js
+
+5. **mathjax.js** (1.1MB) - 数学公式渲染引擎
+   - 版本: 3.2.2
+   - 来源: https://cdn.jsdelivr.net/npm/mathjax@3.2.2/es5/tex-mml-chtml.js
+
+## 功能说明
+
+HTML渲染器(`html_renderer.py`)会自动从本目录加载这些库文件，并将它们内联到生成的HTML中。这样做有以下优点：
+
+- ✅ 离线环境可用 - 无需网络连接即可正常显示报告
+- ✅ 加载速度快 - 不依赖外部CDN
+- ✅ 稳定性高 - 不受CDN服务中断影响
+- ✅ 版本固定 - 确保功能的一致性
+
+## 备用机制
+
+如果库文件加载失败（如文件不存在或读取错误），渲染器会自动回退到使用CDN链接；此时报告需要网络连接才能正常加载这些库。
+
+## 更新库文件
+
+如需更新库文件，请：
+
+1. 从相应的CDN下载最新版本
+2. 替换本目录中的对应文件
+3. 更新本README文件中的版本信息
+
+## 注意事项
+
+- 总大小约为1.86MB，会增加生成的HTML文件大小
+- 对于不需要图表和数学公式的简单报告，这些库仍然会被包含
+- 如果需要减小文件大小，可以考虑使用更轻量的替代方案
@@ -0,0 +1,223 @@
+"""
+LaTeX 数学公式转 SVG 渲染器
+使用 matplotlib 将 LaTeX 公式渲染为 SVG 格式，用于 PDF 导出
+"""
+
+import io
+import re
+from typing import Optional
+import matplotlib
+import matplotlib.pyplot as plt
+from matplotlib import mathtext
+from loguru import logger
+
+# 使用非交互式后端
+matplotlib.use('Agg')
+
+
+class MathToSVG:
+    """
+    Convert LaTeX math formulas to SVG strings.
+
+    Formulas are drawn with matplotlib's built-in mathtext engine
+    (`usetex=False`) rather than a full LaTeX toolchain.
+    """
+
+    # Outer math delimiters removed before rendering: $$...$$, $...$, \[...\], \(...\).
+    # Order matters: `$$` has to be tried before `$`.
+    _DELIMITER_PATTERNS = (
+        r'^\$\$(.*)\$\$$',
+        r'^\$(.*)\$$',
+        r'^\\\[(.*)\\\]$',
+        r'^\\\((.*)\\\)$',
+    )
+
+    def __init__(self, font_size: int = 14, color: str = 'black'):
+        """
+        Store the default rendering style.
+
+        Args:
+            font_size: Font size in points.
+            color: Text color.
+        """
+        self.font_size = font_size
+        self.color = color
+
+    @classmethod
+    def _normalize_latex(cls, latex: Optional[str]) -> str:
+        r"""Strip outer delimiters and control characters; map \tfrac/\dfrac to \frac."""
+        latex = (latex or "").strip()
+        for pattern in cls._DELIMITER_PATTERNS:
+            match = re.match(pattern, latex, re.DOTALL)
+            if match:
+                latex = match.group(1).strip()
+                break
+        # Drop control characters, then apply the compatibility rewrites.
+        latex = re.sub(r'[\x00-\x1f\x7f]', '', latex)
+        # In a raw string a single backslash is written once: r'\tfrac' is the
+        # LaTeX command itself (r'\\tfrac' would only match a doubled backslash).
+        return latex.replace(r'\tfrac', r'\frac').replace(r'\dfrac', r'\frac')
+
+    def convert_to_svg(self, latex: str, display_mode: bool = True) -> Optional[str]:
+        """
+        Render a LaTeX formula to an SVG string.
+
+        Args:
+            latex: LaTeX source; surrounding `$`, `$$`, `\\(...\\)` or `\\[...\\]`
+                delimiters are stripped if present.
+            display_mode: True renders a block-level formula (centered, font size
+                scaled by 1.2); False renders an inline formula.
+
+        Returns:
+            The SVG document as a string, or None when the formula is empty or
+            rendering fails (the failure is logged, never raised).
+        """
+        fig = None
+        try:
+            latex = self._normalize_latex(latex)
+            if not latex:
+                logger.warning("空的 LaTeX 公式")
+                return None
+
+            fig = plt.figure(figsize=(10, 2) if display_mode else (6, 1))
+            fig.patch.set_alpha(0)  # transparent background
+
+            if display_mode:
+                # Display mode: centered, slightly larger font.
+                x_pos, font_size, h_align = 0.5, self.font_size * 1.2, 'center'
+            else:
+                # Inline mode: left-aligned, regular font size.
+                x_pos, font_size, h_align = 0.1, self.font_size, 'left'
+            text = fig.text(
+                x_pos, 0.5,
+                f'${latex}$',
+                fontsize=font_size,
+                color=self.color,
+                ha=h_align,
+                va='center',
+                usetex=False  # use mathtext, not an external LaTeX installation
+            )
+
+            # Measure the rendered text and convert its extent from pixels to inches.
+            fig.canvas.draw()
+            bbox = text.get_window_extent(renderer=fig.canvas.get_renderer())
+            bbox_inches = bbox.transformed(fig.dpi_scale_trans.inverted())
+
+            # Shrink the figure to the text plus a small margin, then re-center the text.
+            margin = 0.1  # inches
+            fig.set_size_inches(
+                bbox_inches.width + 2 * margin,
+                bbox_inches.height + 2 * margin
+            )
+            text.set_position((0.5, 0.5))
+
+            # Save through the figure object so the output cannot pick up a
+            # different "current" pyplot figure.
+            with io.StringIO() as svg_buffer:
+                fig.savefig(
+                    svg_buffer,
+                    format='svg',
+                    bbox_inches='tight',
+                    pad_inches=0.1,
+                    transparent=True,
+                    dpi=300
+                )
+                return svg_buffer.getvalue()
+
+        except Exception as e:
+            # str() keeps the log call itself from failing on non-string input.
+            logger.error(f"LaTeX 公式转换失败: {str(latex)[:100]}... 错误: {str(e)}")
+            return None
+        finally:
+            # Always release the figure, including when rendering raised.
+            if fig is not None:
+                plt.close(fig)
+
+    def convert_inline_to_svg(self, latex: str) -> Optional[str]:
+        """
+        Render an inline formula to SVG.
+
+        Args:
+            latex: LaTeX source.
+
+        Returns:
+            The SVG string, or None if the conversion fails.
+        """
+        return self.convert_to_svg(latex, display_mode=False)
+
+    def convert_display_to_svg(self, latex: str) -> Optional[str]:
+        """
+        Render a display-mode (block-level) formula to SVG.
+
+        Args:
+            latex: LaTeX source.
+
+        Returns:
+            The SVG string, or None if the conversion fails.
+        """
+        return self.convert_to_svg(latex, display_mode=True)
+
+
+def convert_math_block_to_svg(
+    latex: str,
+    font_size: int = 16,
+    color: str = 'black'
+) -> Optional[str]:
+    """
+    Convenience wrapper: render a block-level (display mode) formula to SVG.
+
+    Args:
+        latex: LaTeX source.
+        font_size: Font size passed to the converter.
+        color: Text color passed to the converter.
+
+    Returns:
+        The SVG string, or None if the conversion fails.
+    """
+    return MathToSVG(font_size=font_size, color=color).convert_display_to_svg(latex)
+
+
+def convert_math_inline_to_svg(
+    latex: str,
+    font_size: int = 14,
+    color: str = 'black'
+) -> Optional[str]:
+    """
+    Convenience wrapper: render an inline formula to SVG.
+
+    Args:
+        latex: LaTeX source.
+        font_size: Font size passed to the converter.
+        color: Text color passed to the converter.
+
+    Returns:
+        The SVG string, or None if the conversion fails.
+    """
+    return MathToSVG(font_size=font_size, color=color).convert_inline_to_svg(latex)
+
+
+if __name__ == "__main__":
+    # Manual smoke test: render a few sample formulas in display mode and write
+    # each resulting SVG to a file in the current working directory.
+    test_formulas = [
+        r"E = mc^2",
+        r"\frac{-b \pm \sqrt{b^2 - 4ac}}{2a}",
+        r"\int_{-\infty}^{\infty} e^{-x^2} dx = \sqrt{\pi}",
+        r"\sum_{i=1}^{n} i = \frac{n(n+1)}{2}",
+    ]
+
+    converter = MathToSVG(font_size=16)
+
+    for index, formula in enumerate(test_formulas, start=1):
+        logger.info(f"测试公式 {index}: {formula}")
+        svg = converter.convert_display_to_svg(formula)
+        if not svg:
+            logger.error(f"公式 {index} 转换失败")
+            continue
+        filename = f"test_math_{index}.svg"
+        with open(filename, 'w', encoding='utf-8') as f:
+            f.write(svg)
+        logger.info(f"成功保存到 {filename}")
+
+    logger.info("测试完成")
@@ -0,0 +1,80 @@
+#!/usr/bin/env python3
+"""
+PDF导出工具 - 使用Python直接生成PDF，无乱码
+
+用法:
+    python ReportEngine/scripts/export_to_pdf.py <报告IR JSON文件> [输出PDF路径]
+
+示例:
+    python ReportEngine/scripts/export_to_pdf.py final_reports/ir/report_ir_xxx.json output.pdf
+    python ReportEngine/scripts/export_to_pdf.py final_reports/ir/report_ir_xxx.json
+"""
+
+import sys
+import json
+from pathlib import Path
+from loguru import logger
+
+from ReportEngine.renderers import PDFRenderer
+
+
+def export_to_pdf(ir_json_path: str, output_pdf_path: str = None):
+    """
+    Render a Document IR JSON file to PDF.
+
+    Args:
+        ir_json_path: Path to the Document IR JSON file.
+        output_pdf_path: Destination PDF path. When omitted, the PDF is written
+            next to the IR file, using the IR file's stem plus `.pdf`.
+
+    Returns:
+        True when the PDF was generated; False when the input file is missing,
+        cannot be read or parsed as JSON, or rendering fails. Each failure is logged.
+    """
+    ir_path = Path(ir_json_path)
+
+    if not ir_path.exists():
+        logger.error(f"文件不存在: {ir_path}")
+        return False
+
+    # Load the IR. Read and parse errors are reported the same way as every
+    # other failure (log + False) instead of escaping as a traceback.
+    logger.info(f"读取报告: {ir_path}")
+    try:
+        with open(ir_path, 'r', encoding='utf-8') as f:
+            document_ir = json.load(f)
+    except (OSError, ValueError) as e:
+        # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
+        logger.error(f"✗ 读取报告失败: {e}")
+        return False
+
+    # Resolve the output path.
+    if output_pdf_path is None:
+        output_pdf_path = ir_path.parent / f"{ir_path.stem}.pdf"
+    else:
+        output_pdf_path = Path(output_pdf_path)
+
+    logger.info("开始生成PDF...")
+    renderer = PDFRenderer()
+
+    try:
+        renderer.render_to_pdf(document_ir, output_pdf_path)
+        logger.success(f"✓ PDF已生成: {output_pdf_path}")
+        return True
+    except Exception as e:
+        logger.error(f"✗ PDF生成失败: {e}")
+        logger.exception("详细错误信息:")
+        return False
+
+
+def main():
+    """
+    Command-line entry point.
+
+    Reads the IR JSON path (required) and the output PDF path (optional) from
+    `sys.argv`, prints the module usage text and exits with status 1 when no
+    argument is given, and otherwise exits with 0 on success or 1 on failure.
+    """
+    if len(sys.argv) < 2:
+        print(__doc__)
+        sys.exit(1)
+
+    ir_json_path = sys.argv[1]
+    output_pdf_path = sys.argv[2] if len(sys.argv) > 2 else None
+
+    # DYLD_LIBRARY_PATH is a macOS dynamic-loader variable and /opt/homebrew/lib is a
+    # Homebrew location, so the workaround (and its warning) is limited to macOS.
+    # NOTE(review): the loader may only read this variable at process start, and
+    # PDFRenderer is already imported by the time this runs - confirm that setting it
+    # here has any effect.
+    import os
+    if sys.platform == 'darwin' and 'DYLD_LIBRARY_PATH' not in os.environ:
+        logger.warning("未设置DYLD_LIBRARY_PATH，尝试自动设置...")
+        os.environ['DYLD_LIBRARY_PATH'] = '/opt/homebrew/lib'
+
+    success = export_to_pdf(ir_json_path, output_pdf_path)
+    sys.exit(0 if success else 1)
+
+
+if __name__ == "__main__":
+    main()
@@ -1,6 +1,7 @@
 """
-Report Engine状态管理模块
-定义报告生成过程中的简化状态数据结构
+Report Engine状态管理模块。
+
+导出 ReportState/ReportMetadata，供Agent与Flask接口共享。
 """

 from .state import ReportState, ReportMetadata
@@ -29,7 +29,11 @@ class ReportMetadata:

@dataclass 
 class ReportState:
-    """简化的报告状态管理"""
+    """
+    简化的报告状态管理。
+
+    存储任务基本信息、输入、输出与元数据，供Agent与Flask层共享。
+    """
    # 基本信息
    task_id: str = ""                    # 任务ID
    query: str = ""                      # 原始查询
@@ -55,24 +59,24 @@ class ReportState:
        self.metadata.query = self.query
    
    def mark_processing(self):
-        """标记为处理中"""
+        """标记为处理中，后台线程开始调度生成流程。"""
        self.status = "processing"
    
    def mark_completed(self):
-        """标记为完成"""
+        """标记为完成，同时意味着 `html_content` 已可用。"""
        self.status = "completed"
    
    def mark_failed(self, error_message: str = ""):
-        """标记为失败"""
+        """标记为失败，并记录最后一次错误消息。"""
        self.status = "failed"
        self.error_message = error_message
    
    def is_completed(self) -> bool:
-        """检查是否完成"""
+        """检查是否完成，包括状态为completed且存在HTML内容。"""
        return self.status == "completed" and bool(self.html_content)
    
    def get_progress(self) -> float:
-        """获取进度百分比"""
+        """获取进度百分比，按照模板/内容两个阶段粗略估算。"""
        if self.status == "completed":
            return 100.0
        elif self.status == "processing":
@@ -87,7 +91,7 @@ class ReportState:
            return 0.0
    
    def to_dict(self) -> Dict[str, Any]:
-        """转换为字典格式"""
+        """转换为字典格式，方便序列化给前端。"""
        return {
            "task_id": self.task_id,
            "query": self.query,
@@ -100,7 +104,7 @@ class ReportState:
        }
    
    def save_to_file(self, file_path: str):
-        """保存状态到文件"""
+        """保存状态到文件，排除HTML正文以控制体积。"""
        try:
            state_data = self.to_dict()
            # 不保存完整的HTML内容到状态文件（太大）
@@ -113,7 +117,7 @@ class ReportState:
    
    @classmethod
    def load_from_file(cls, file_path: str) -> Optional["ReportState"]:
-        """从文件加载状态"""
+        """从文件加载状态，仅恢复关键字段便于调试。"""
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                data = json.load(f)
@@ -135,4 +139,4 @@ class ReportState:
            
        except Exception as e:
            print(f"加载状态文件失败: {str(e)}")
-            return None
+            return None
@@ -1,6 +1,7 @@
 """
-Report Engine工具模块
-包含配置管理
+Report Engine工具模块。
+
+当前主要暴露配置读取逻辑，后续可扩展更多通用工具。
 """


@@ -0,0 +1,279 @@
+"""
+图表API修复模块。
+
+提供调用4个Engine（ReportEngine, ForumEngine, InsightEngine, MediaEngine）的LLM API
+来修复图表数据的功能。
+"""
+
+from __future__ import annotations
+
+import json
+from typing import Any, Dict, List, Optional
+from loguru import logger
+
+from ReportEngine.utils.config import settings
+
+
+# System prompt for chart repair. It is a plain (non f-string) literal, so the braces
+# in the embedded JSON examples are written as-is. It shows the expected Chart.js
+# widget layouts and lists the repair principles and common fixes.
+CHART_REPAIR_SYSTEM_PROMPT = """你是一个专业的图表数据修复助手。你的任务是修复Chart.js图表数据中的格式错误，确保图表能够正常渲染。
+
+**Chart.js标准数据格式：**
+
+1. 标准图表（line, bar, pie, doughnut, radar, polarArea）：
+```json
+{
+  "type": "widget",
+  "widgetType": "chart.js/bar",
+  "widgetId": "chart-001",
+  "props": {
+    "type": "bar",
+    "title": "图表标题",
+    "options": {
+      "responsive": true,
+      "plugins": {
+        "legend": {
+          "display": true
+        }
+      }
+    }
+  },
+  "data": {
+    "labels": ["A", "B", "C"],
+    "datasets": [
+      {
+        "label": "系列1",
+        "data": [10, 20, 30]
+      }
+    ]
+  }
+}
+```
+
+2. 特殊图表（scatter, bubble）：
+```json
+{
+  "data": {
+    "datasets": [
+      {
+        "label": "系列1",
+        "data": [
+          {"x": 10, "y": 20},
+          {"x": 15, "y": 25}
+        ]
+      }
+    ]
+  }
+}
+```
+
+**修复原则：**
+1. **宁愿不改，也不要改错** - 如果不确定如何修复，保持原始数据
+2. **最小改动** - 只修复明确的错误，不要过度修改
+3. **保持数据完整性** - 不要丢失原始数据
+4. **验证修复结果** - 确保修复后符合Chart.js格式
+
+**常见错误及修复方法：**
+1. 缺少labels字段 → 根据数据生成默认labels
+2. datasets不是数组 → 转换为数组格式
+3. 数据长度不匹配 → 截断或补null
+4. 非数值数据 → 尝试转换或设为null
+5. 缺少必需字段 → 添加默认值
+
+请根据错误信息修复图表数据，并返回修复后的完整widget block（JSON格式）。
+"""
+
+
+def build_chart_repair_prompt(
+    widget_block: Dict[str, Any],
+    validation_errors: List[str]
+) -> str:
+    """
+    Build the user prompt asking the LLM to repair one chart widget.
+
+    Args:
+        widget_block: The original widget block; embedded as indented JSON.
+        validation_errors: Validation messages; rendered as a "- item" bullet list.
+
+    Returns:
+        str: The complete prompt text.
+    """
+    bullet_lines = ["- " + str(message) for message in validation_errors]
+    error_section = "\n".join(bullet_lines)
+    widget_json = json.dumps(widget_block, indent=2, ensure_ascii=False)
+
+    return f"""请修复以下图表数据中的错误：
+
+**原始数据：**
+```json
+{widget_json}
+```
+
+**检测到的错误：**
+{error_section}
+
+**要求：**
+1. 返回修复后的完整widget block（JSON格式）
+2. 只修复明确的错误，保持其他数据不变
+3. 确保修复后的数据符合Chart.js格式要求
+4. 如果无法确定如何修复，保持原始数据
+
+**重要的输出格式要求：**
+1. 只返回纯JSON对象，不要添加任何说明文字
+2. 不要使用```json```标记包裹
+3. 确保JSON语法完全正确
+4. 所有字符串使用双引号
+"""
+
+
+def _strip_code_fence(text: str) -> str:
+    """Return `text` without a surrounding Markdown code fence (``` or ```json), if any."""
+    stripped = text.strip()
+    if not stripped.startswith("```"):
+        return stripped
+    # Drop the opening fence line (it may carry a language tag) and the closing fence.
+    first_newline = stripped.find("\n")
+    body = stripped[first_newline + 1:] if first_newline != -1 else stripped[3:]
+    body = body.rstrip()
+    if body.endswith("```"):
+        body = body[:-3]
+    return body.strip()
+
+
+def _make_engine_repair_function(engine_label: str, settings_prefix: str, function_name: str):
+    """Build a repair callable that uses the LLM settings stored under `settings_prefix`."""
+
+    def repair(widget_block: Dict[str, Any], errors: List[str]) -> Optional[Dict[str, Any]]:
+        """Ask this engine's LLM to repair the widget; return the repaired block or None."""
+        try:
+            # Imported lazily, on each call, as in the original per-engine closures.
+            from ReportEngine.llms import LLMClient
+
+            client = LLMClient(
+                api_key=getattr(settings, f"{settings_prefix}_API_KEY"),
+                base_url=getattr(settings, f"{settings_prefix}_BASE_URL"),
+                model_name=getattr(settings, f"{settings_prefix}_MODEL_NAME") or "gpt-4",
+            )
+
+            prompt = build_chart_repair_prompt(widget_block, errors)
+            response = client.invoke(
+                CHART_REPAIR_SYSTEM_PROMPT,
+                prompt,
+                temperature=0.0,
+                top_p=0.05
+            )
+
+            if not response:
+                return None
+
+            # Tolerate a fenced reply even though the prompt asks for bare JSON.
+            repaired = json.loads(_strip_code_fence(response))
+            if not isinstance(repaired, dict):
+                # A widget block is a JSON object; anything else is a failed repair.
+                raise ValueError("LLM response is not a JSON object")
+            return repaired
+
+        except Exception as e:
+            logger.error(f"{engine_label}图表修复失败: {e}")
+            return None
+
+    # Keep the historical function names for callers that log or inspect them.
+    repair.__name__ = function_name
+    return repair
+
+
+def create_llm_repair_functions() -> List:
+    """
+    Create the list of LLM-backed chart repair functions.
+
+    One function is created for each engine whose API key and base URL are both
+    set in `settings`, in this order:
+    1. ReportEngine
+    2. ForumEngine (configured through the FORUM_HOST_* settings)
+    3. InsightEngine
+    4. MediaEngine
+
+    Each function takes `(widget_block, errors)` and returns the repaired widget
+    block, or None when the call fails or the reply is not a JSON object.
+
+    Returns:
+        List[Callable]: The repair functions; empty (with a logged warning) when
+        no engine is configured.
+    """
+    engines = (
+        ("ReportEngine", "REPORT_ENGINE", "repair_with_report_engine"),
+        ("ForumEngine", "FORUM_HOST", "repair_with_forum_engine"),
+        ("InsightEngine", "INSIGHT_ENGINE", "repair_with_insight_engine"),
+        ("MediaEngine", "MEDIA_ENGINE", "repair_with_media_engine"),
+    )
+
+    repair_functions = []
+    for engine_label, settings_prefix, function_name in engines:
+        api_key = getattr(settings, f"{settings_prefix}_API_KEY")
+        base_url = getattr(settings, f"{settings_prefix}_BASE_URL")
+        if api_key and base_url:
+            repair_functions.append(
+                _make_engine_repair_function(engine_label, settings_prefix, function_name)
+            )
+
+    if not repair_functions:
+        logger.warning("未配置任何Engine API，图表API修复功能将不可用")
+
+    return repair_functions
@@ -0,0 +1,703 @@
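+"""
+Chart validation and repair utilities.
+
+Validation and repair support for Chart.js chart data:
+1. Validate that chart data matches the format Chart.js requires
+2. Repair common problems with local rules
+3. Use LLM APIs to help repair complex problems
+4. Follow the principle "better to leave it unchanged than to change it wrongly"
+
+Supported chart types:
+- line
+- bar
+- pie
+- doughnut
+- radar
+- polarArea
+- scatter
+- bubble
+- horizontalBar
+"""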
+
+from __future__ import annotations
+
+import copy
+import json
+import hashlib
+from typing import Any, Dict, List, Optional, Tuple, Callable
+from dataclasses import dataclass
+from loguru import logger
+
+
+@dataclass
+class ValidationResult:
+    """
+    Outcome of validating one widget block.
+
+    Attributes:
+        is_valid: Whether the block passed validation.
+        errors: Error messages collected during validation.
+        warnings: Warning messages collected during validation.
+    """
+    is_valid: bool
+    errors: List[str]
+    warnings: List[str]
+
+    def has_critical_errors(self) -> bool:
+        """Return True only when the block is invalid and at least one error was recorded."""
+        if self.is_valid:
+            return False
+        error_count = len(self.errors)
+        return error_count > 0
+
+
+@dataclass
+class RepairResult:
+    """
+    Outcome of a repair attempt.
+
+    Attributes:
+        success: Whether the attempt is considered successful.
+        repaired_block: The resulting widget block, or None when there is none.
+        method: How the block was produced: 'none', 'local' or 'api'.
+        changes: Human-readable descriptions of each modification made.
+    """
+    success: bool
+    repaired_block: Optional[Dict[str, Any]]
+    method: str  # 'none', 'local', 'api'
+    changes: List[str]
+
+    def has_changes(self) -> bool:
+        """Return True when at least one modification was recorded."""
+        change_count = len(self.changes)
+        return change_count > 0
+
+
+class ChartValidator:
+    """
+    Validate Chart.js widget blocks before they are rendered.
+
+    Checks performed by ``validate``:
+    1. Basic structure: widgetType, props and data fields.
+    2. Chart type: whether the type is listed in SUPPORTED_CHART_TYPES.
+    3. Data format: structure of labels and datasets.
+    4. Consistency: each dataset's length against the labels length.
+    5. Value types: data values are numeric where the chart type needs it.
+    """
+
+    # Chart types known to the validator, in Chart.js spelling.
+    SUPPORTED_CHART_TYPES = {
+        'line', 'bar', 'pie', 'doughnut', 'radar', 'polarArea', 'scatter',
+        'bubble', 'horizontalBar'
+    }
+
+    # Chart types that must carry a labels array.
+    LABEL_REQUIRED_TYPES = {
+        'line', 'bar', 'radar', 'polarArea', 'pie', 'doughnut'
+    }
+
+    # Chart types whose dataset values must be numbers.
+    NUMERIC_DATA_TYPES = {
+        'line', 'bar', 'radar', 'polarArea', 'pie', 'doughnut'
+    }
+
+    # Chart types whose data points are objects; maps type -> required keys.
+    SPECIAL_DATA_TYPES = {
+        'scatter': {'x', 'y'},
+        'bubble': {'x', 'y', 'r'}
+    }
+
+    # _extract_chart_type() lower-cases its result, so camelCase names such as
+    # 'polarArea' must be mapped back before they are compared with the sets
+    # above; otherwise those types never match.
+    _CANONICAL_CHART_TYPES = {name.lower(): name for name in SUPPORTED_CHART_TYPES}
+
+    def __init__(self):
+        """Create a validator. No state is stored on the instance."""
+
+    @staticmethod
+    def _has_object_points(points: Any) -> bool:
+        """Return True when ``points`` is a list holding at least one dict with an 'x', 'y' or 't' key."""
+        if not isinstance(points, list):
+            return False
+        return any(
+            isinstance(point, dict) and any(key in point for key in ('x', 'y', 't'))
+            for point in points
+        )
+
+    def validate(self, widget_block: Dict[str, Any]) -> ValidationResult:
+        """
+        Validate a widget block.
+
+        Blocks whose widgetType does not start with 'chart.js' are not charts
+        and are reported as valid without further checks.
+
+        Args:
+            widget_block: A widget block containing widgetId/widgetType/props/data.
+
+        Returns:
+            ValidationResult: is_valid is True when no error was recorded.
+        """
+        errors: List[str] = []
+        warnings: List[str] = []
+
+        # 1. Basic structure.
+        if not isinstance(widget_block, dict):
+            errors.append("widget_block必须是字典类型")
+            return ValidationResult(False, errors, warnings)
+
+        # 2. widgetType must be a non-empty string.
+        widget_type = widget_block.get('widgetType', '')
+        if not widget_type or not isinstance(widget_type, str):
+            errors.append("缺少widgetType字段或类型不正确")
+            return ValidationResult(False, errors, warnings)
+
+        # Not a chart widget: nothing to validate.
+        if not widget_type.startswith('chart.js'):
+            return ValidationResult(True, errors, warnings)
+
+        # 3. Determine the chart type and restore its Chart.js spelling.
+        chart_type = self._extract_chart_type(widget_block)
+        if not chart_type:
+            errors.append("无法确定图表类型")
+            return ValidationResult(False, errors, warnings)
+        chart_type = self._CANONICAL_CHART_TYPES.get(chart_type, chart_type)
+
+        # 4. Unknown types only produce a warning.
+        if chart_type not in self.SUPPORTED_CHART_TYPES:
+            warnings.append(f"图表类型 '{chart_type}' 可能不被支持，将尝试降级渲染")
+
+        # 5. The data field must be a dict.
+        data = widget_block.get('data')
+        if not isinstance(data, dict):
+            errors.append("data字段必须是字典类型")
+            return ValidationResult(False, errors, warnings)
+
+        # Detect {x, y}-style points. Only a real list is scanned, so a malformed
+        # 'datasets' value (e.g. a number) is reported by the checks below
+        # instead of raising TypeError here.
+        datasets_for_detection = data.get('datasets')
+        if not isinstance(datasets_for_detection, list):
+            datasets_for_detection = []
+        uses_object_points = any(
+            isinstance(ds, dict) and self._has_object_points(ds.get('data'))
+            for ds in datasets_for_detection
+        )
+
+        # 6. Validate the data according to the chart type.
+        if chart_type in self.SPECIAL_DATA_TYPES:
+            # Object-point formats (scatter, bubble).
+            self._validate_special_data(data, chart_type, errors, warnings)
+        else:
+            # Standard format (labels + datasets).
+            self._validate_standard_data(data, chart_type, errors, warnings, uses_object_points)
+
+        # 7. props, when present, should be a dict (warning only).
+        props = widget_block.get('props')
+        if props is not None and not isinstance(props, dict):
+            warnings.append("props字段应该是字典类型")
+
+        return ValidationResult(len(errors) == 0, errors, warnings)
+
+    def _extract_chart_type(self, widget_block: Dict[str, Any]) -> Optional[str]:
+        """
+        Extract the chart type, lower-cased.
+
+        Lookup order:
+        1. props.type
+        2. The suffix of widgetType (chart.js/bar -> bar)
+        3. data.type
+
+        Returns:
+            The lower-cased type, or None when none of the sources provides one.
+        """
+        # 1. props.type
+        props = widget_block.get('props') or {}
+        if isinstance(props, dict):
+            chart_type = props.get('type')
+            if chart_type and isinstance(chart_type, str):
+                return chart_type.lower()
+
+        # 2. widgetType suffix; a non-string widgetType is skipped rather than
+        #    raising on the membership test.
+        widget_type = widget_block.get('widgetType', '')
+        if isinstance(widget_type, str) and '/' in widget_type:
+            chart_type = widget_type.split('/')[-1]
+            if chart_type:
+                return chart_type.lower()
+
+        # 3. data.type
+        data = widget_block.get('data') or {}
+        if isinstance(data, dict):
+            chart_type = data.get('type')
+            if chart_type and isinstance(chart_type, str):
+                return chart_type.lower()
+
+        return None
+
+    def _validate_standard_data(
+        self,
+        data: Dict[str, Any],
+        chart_type: str,
+        errors: List[str],
+        warnings: List[str],
+        uses_object_points: bool = False
+    ):
+        """
+        Validate the standard data format (labels + datasets).
+
+        Findings are appended to ``errors`` and ``warnings`` in place.
+
+        Args:
+            data: The block's data dict.
+            chart_type: Chart type in Chart.js spelling.
+            errors: List receiving error messages.
+            warnings: List receiving warning messages.
+            uses_object_points: True when some dataset uses {x, y}-style points,
+                in which case missing labels is only a warning.
+        """
+        labels = data.get('labels')
+        datasets = data.get('datasets')
+
+        # labels: a missing value and an empty list are treated alike.
+        if chart_type in self.LABEL_REQUIRED_TYPES:
+            if not labels:
+                if uses_object_points:
+                    warnings.append(
+                        f"{chart_type}类型图表缺少labels，已根据数据点渲染（使用x值）"
+                    )
+                else:
+                    errors.append(f"{chart_type}类型图表必须包含labels字段")
+            elif not isinstance(labels, list):
+                errors.append("labels必须是数组类型")
+
+        # datasets
+        if datasets is None:
+            errors.append("缺少datasets字段")
+            return
+
+        if not isinstance(datasets, list):
+            errors.append("datasets必须是数组类型")
+            return
+
+        if len(datasets) == 0:
+            errors.append("datasets数组为空")
+            return
+
+        for idx, dataset in enumerate(datasets):
+            if not isinstance(dataset, dict):
+                errors.append(f"datasets[{idx}]必须是对象类型")
+                continue
+
+            ds_data = dataset.get('data')
+            if ds_data is None:
+                errors.append(f"datasets[{idx}]缺少data字段")
+                continue
+
+            if not isinstance(ds_data, list):
+                errors.append(f"datasets[{idx}].data必须是数组类型")
+                continue
+
+            if len(ds_data) == 0:
+                warnings.append(f"datasets[{idx}].data数组为空")
+                continue
+
+            # Datasets made of {x, y} objects skip the length and numeric checks.
+            object_points = self._has_object_points(ds_data)
+
+            # Length consistency (warning only).
+            if labels and isinstance(labels, list) and not object_points:
+                if len(ds_data) != len(labels):
+                    warnings.append(
+                        f"datasets[{idx}].data长度({len(ds_data)})与labels长度({len(labels)})不匹配"
+                    )
+
+            # Numeric values; only the first offending value is reported.
+            if chart_type in self.NUMERIC_DATA_TYPES and not object_points:
+                for data_idx, value in enumerate(ds_data):
+                    if value is not None and not isinstance(value, (int, float)):
+                        errors.append(
+                            f"datasets[{idx}].data[{data_idx}]的值'{value}'不是有效的数值类型"
+                        )
+                        break
+
+    def _validate_special_data(
+        self,
+        data: Dict[str, Any],
+        chart_type: str,
+        errors: List[str],
+        warnings: List[str]
+    ):
+        """
+        Validate object-point data formats (scatter, bubble).
+
+        Findings are appended to ``errors`` and ``warnings`` in place. For each
+        dataset only the first offending point is reported.
+        """
+        datasets = data.get('datasets')
+
+        # Same three-way distinction as the standard format: missing, wrong
+        # type, empty.
+        if datasets is None:
+            errors.append("缺少datasets字段")
+            return
+
+        if not isinstance(datasets, list):
+            errors.append("datasets必须是数组类型")
+            return
+
+        if len(datasets) == 0:
+            errors.append("datasets数组为空")
+            return
+
+        required_keys = self.SPECIAL_DATA_TYPES.get(chart_type, set())
+
+        for idx, dataset in enumerate(datasets):
+            if not isinstance(dataset, dict):
+                errors.append(f"datasets[{idx}]必须是对象类型")
+                continue
+
+            ds_data = dataset.get('data')
+            if ds_data is None:
+                errors.append(f"datasets[{idx}]缺少data字段")
+                continue
+
+            if not isinstance(ds_data, list):
+                errors.append(f"datasets[{idx}].data必须是数组类型")
+                continue
+
+            if len(ds_data) == 0:
+                warnings.append(f"datasets[{idx}].data数组为空")
+                continue
+
+            for data_idx, point in enumerate(ds_data):
+                if not isinstance(point, dict):
+                    errors.append(
+                        f"datasets[{idx}].data[{data_idx}]必须是对象类型（包含{required_keys}字段）"
+                    )
+                    break
+
+                missing_keys = required_keys - set(point.keys())
+                if missing_keys:
+                    errors.append(
+                        f"datasets[{idx}].data[{data_idx}]缺少必需字段: {missing_keys}"
+                    )
+                    break
+
+                # Keys are visited in sorted order so the reported key is
+                # deterministic; a bad value also ends the scan of this dataset.
+                bad_value_found = False
+                for key in sorted(required_keys):
+                    value = point.get(key)
+                    if value is not None and not isinstance(value, (int, float)):
+                        errors.append(
+                            f"datasets[{idx}].data[{data_idx}].{key}的值'{value}'不是有效的数值类型"
+                        )
+                        bad_value_found = True
+                        break
+                if bad_value_found:
+                    break
+
+    def can_render(self, widget_block: Dict[str, Any]) -> bool:
+        """
+        Quick check: does the block validate without errors?
+
+        Args:
+            widget_block: A widget block.
+
+        Returns:
+            bool: The is_valid flag of ``validate(widget_block)``.
+        """
+        return self.validate(widget_block).is_valid
+
+
+class ChartRepairer:
+    """
+    Try to repair chart widget blocks.
+
+    Strategy:
+    1. Local rules for common problems.
+    2. LLM-backed repair functions for problems the local rules cannot fix.
+    3. Every candidate is re-validated; a candidate that does not validate is
+       never returned in place of the original block.
+    """
+
+    def __init__(
+        self,
+        validator: ChartValidator,
+        llm_repair_fns: Optional[List[Callable]] = None
+    ):
+        """
+        Initialise the repairer.
+
+        Args:
+            validator: Validator used to check originals and repair candidates.
+            llm_repair_fns: Repair callables, each invoked as
+                ``fn(widget_block, errors)`` and expected to return a repaired
+                dict or None.
+        """
+        self.validator = validator
+        self.llm_repair_fns = llm_repair_fns or []
+        # Results are cached so the same chart does not trigger repeated LLM calls.
+        self._result_cache: Dict[str, RepairResult] = {}
+
+    def build_cache_key(self, widget_block: Dict[str, Any]) -> str:
+        """
+        Build a cache key for a widget block.
+
+        The key is ``"<widgetId or id>:<md5 of the serialised block>"``, so a
+        block that keeps its id but changes content gets a different key.
+        """
+        widget_id = ""
+        if isinstance(widget_block, dict):
+            widget_id = widget_block.get('widgetId') or widget_block.get('id') or ""
+        try:
+            serialized = json.dumps(
+                widget_block,
+                ensure_ascii=False,
+                sort_keys=True,
+                default=str
+            )
+        except Exception:
+            # Fall back to repr() for blocks json cannot serialise.
+            serialized = repr(widget_block)
+        digest = hashlib.md5(serialized.encode('utf-8', errors='ignore')).hexdigest()
+        return f"{widget_id}:{digest}"
+
+    @staticmethod
+    def _canonical_chart_type(chart_type: str) -> str:
+        """
+        Map a chart type to its spelling in ChartValidator.SUPPORTED_CHART_TYPES.
+
+        The validator's type extraction lower-cases its result, which would turn
+        'polarArea' into 'polararea'. Unknown types are returned unchanged.
+        """
+        by_lower = {name.lower(): name for name in ChartValidator.SUPPORTED_CHART_TYPES}
+        return by_lower.get(chart_type.lower(), chart_type)
+
+    @staticmethod
+    def _has_object_points(points: Any) -> bool:
+        """Return True when ``points`` is a list holding at least one dict with an 'x', 'y' or 't' key."""
+        if not isinstance(points, list):
+            return False
+        return any(
+            isinstance(point, dict) and any(key in point for key in ('x', 'y', 't'))
+            for point in points
+        )
+
+    def repair(
+        self,
+        widget_block: Dict[str, Any],
+        validation_result: Optional[ValidationResult] = None
+    ) -> RepairResult:
+        """
+        Try to repair a chart widget block.
+
+        Args:
+            widget_block: The widget block to repair.
+            validation_result: Validation result for the block; computed here
+                when omitted.
+
+        Returns:
+            RepairResult: A validated repair (method 'local' or 'api'), or the
+            original block with method 'none'. ``success`` is False only when
+            the original is invalid and no candidate validated.
+        """
+        cache_key = self.build_cache_key(widget_block)
+
+        cached = self._result_cache.get(cache_key)
+        if cached is not None:
+            # Hand out a deep copy so callers cannot mutate the cached result.
+            return copy.deepcopy(cached)
+
+        def _cache_and_return(res: RepairResult) -> RepairResult:
+            """Store a copy of the result in the cache and return the result."""
+            try:
+                self._result_cache[cache_key] = copy.deepcopy(res)
+            except Exception:
+                self._result_cache[cache_key] = res
+            return res
+
+        # 1. Validate first when the caller did not.
+        if validation_result is None:
+            validation_result = self.validator.validate(widget_block)
+
+        # 2. Local repair is attempted even for valid blocks (they may carry warnings).
+        logger.info("尝试本地修复图表")
+        local_result = self.repair_locally(widget_block, validation_result)
+
+        # 3. A local repair is accepted only when its result validates.
+        if local_result.has_changes():
+            repaired_validation = self.validator.validate(local_result.repaired_block)
+            if repaired_validation.is_valid:
+                logger.info(f"本地修复成功: {local_result.changes}")
+                return _cache_and_return(
+                    RepairResult(True, local_result.repaired_block, 'local', local_result.changes)
+                )
+            logger.warning(f"本地修复后仍然无效: {repaired_validation.errors}")
+
+        # 4. Fall back to the LLM repair functions for blocks with real errors.
+        if validation_result.has_critical_errors() and self.llm_repair_fns:
+            logger.info("本地修复失败，尝试API修复")
+            api_result = self.repair_with_api(widget_block, validation_result)
+
+            if api_result.success:
+                repaired_validation = self.validator.validate(api_result.repaired_block)
+                if repaired_validation.is_valid:
+                    logger.info(f"API修复成功: {api_result.changes}")
+                    return _cache_and_return(api_result)
+                logger.warning(f"API修复后仍然无效: {repaired_validation.errors}")
+
+        # 5. The original was already valid. Reaching this point means the local
+        #    repair either changed nothing or produced a block that failed
+        #    validation, so the untouched original is returned.
+        if validation_result.is_valid:
+            return _cache_and_return(RepairResult(True, widget_block, 'none', []))
+
+        # 6. Nothing worked: keep the original data.
+        logger.warning("所有修复尝试失败，保持原始数据")
+        return _cache_and_return(RepairResult(False, widget_block, 'none', []))
+
+    def repair_locally(
+        self,
+        widget_block: Dict[str, Any],
+        validation_result: ValidationResult
+    ) -> RepairResult:
+        """
+        Repair a block with local rules, working on a deep copy.
+
+        Rules:
+        1. Add missing props / data containers.
+        2. Make sure props.type holds the chart type.
+        3. Build datasets from 'values' / 'series' fields, or zero-fill from labels.
+        4. Generate default labels for label-based charts that have none.
+        5. Pad / truncate dataset data to the labels length.
+        6. Convert numeric strings to numbers.
+
+        Rules 4-6 leave datasets made of {x, y} objects alone, and rule 5 also
+        leaves scatter/bubble charts and charts without labels alone.
+        Reconciling those against an empty labels list would erase their data.
+
+        Args:
+            widget_block: The block to repair; it is not modified.
+            validation_result: Validation result of the block (not consulted by
+                the local rules).
+
+        Returns:
+            RepairResult: method 'local'; success is True when any change was made.
+        """
+        if not isinstance(widget_block, dict):
+            # Nothing can be repaired locally on a non-dict block.
+            return RepairResult(False, widget_block, 'local', [])
+
+        repaired = copy.deepcopy(widget_block)
+        changes: List[str] = []
+
+        # 1. Basic containers.
+        if not isinstance(repaired.get('props'), dict):
+            repaired['props'] = {}
+            changes.append("添加缺失的props字段")
+
+        if not isinstance(repaired.get('data'), dict):
+            repaired['data'] = {}
+            changes.append("添加缺失的data字段")
+
+        # 2. Chart type.
+        chart_type = self.validator._extract_chart_type(repaired)
+        props = repaired['props']
+
+        if chart_type:
+            chart_type = self._canonical_chart_type(chart_type)
+            if not props.get('type'):
+                props['type'] = chart_type
+                changes.append(f"将推断的图表类型添加到props: {chart_type}")
+        else:
+            widget_type = repaired.get('widgetType', '')
+            suffix = ''
+            if isinstance(widget_type, str) and '/' in widget_type:
+                suffix = widget_type.split('/')[-1]
+            if suffix:
+                chart_type = self._canonical_chart_type(suffix.lower())
+                props['type'] = chart_type
+                changes.append(f"从widgetType推断图表类型: {chart_type}")
+            else:
+                # No usable hint anywhere (including 'chart.js/'): default to bar.
+                chart_type = 'bar'
+                props['type'] = chart_type
+                changes.append("设置默认图表类型: bar")
+
+        # 3. datasets.
+        data = repaired['data']
+
+        if not isinstance(data.get('datasets'), list):
+            data['datasets'] = []
+            changes.append("添加缺失的datasets字段")
+
+        if len(data['datasets']) == 0:
+            constructed = self._try_construct_datasets(data, chart_type)
+            if constructed:
+                data['datasets'] = constructed
+                changes.append("从data中构造datasets")
+            elif isinstance(data.get('labels'), list) and len(data['labels']) > 0:
+                # Labels without any data: create one zero-valued dataset.
+                data['datasets'] = [{
+                    'label': '数据',
+                    'data': [0] * len(data['labels'])
+                }]
+                changes.append("根据labels创建默认dataset（使用零值）")
+
+        # 4. Default labels, derived from the first dataset's length. Skipped for
+        #    {x, y} points, which the validator accepts without labels.
+        if chart_type in ChartValidator.LABEL_REQUIRED_TYPES:
+            current_labels = data.get('labels')
+            if not isinstance(current_labels, list) or len(current_labels) == 0:
+                first_ds = data['datasets'][0] if data['datasets'] else None
+                first_data = first_ds.get('data') if isinstance(first_ds, dict) else None
+                if (
+                    isinstance(first_data, list)
+                    and len(first_data) > 0
+                    and not self._has_object_points(first_data)
+                ):
+                    data_len = len(first_data)
+                    data['labels'] = [f"项目 {i+1}" for i in range(data_len)]
+                    changes.append(f"生成{data_len}个默认labels")
+
+        # 5/6. Per-dataset fixes.
+        labels = data.get('labels')
+        # Lengths are reconciled only against a real, non-empty labels list and
+        # never for scatter/bubble charts.
+        reconcile_lengths = (
+            chart_type not in ChartValidator.SPECIAL_DATA_TYPES
+            and isinstance(labels, list)
+            and len(labels) > 0
+        )
+
+        for idx, dataset in enumerate(data['datasets']):
+            if not isinstance(dataset, dict):
+                continue
+
+            if not isinstance(dataset.get('data'), list):
+                dataset['data'] = []
+                changes.append(f"为datasets[{idx}]添加空data数组")
+
+            if 'label' not in dataset:
+                dataset['label'] = f"系列 {idx + 1}"
+                changes.append(f"为datasets[{idx}]添加默认label")
+
+            ds_data = dataset['data']
+            object_points = self._has_object_points(ds_data)
+
+            if reconcile_lengths and not object_points:
+                if len(ds_data) < len(labels):
+                    # Too few values: pad with null.
+                    dataset['data'] = ds_data + [None] * (len(labels) - len(ds_data))
+                    changes.append(f"datasets[{idx}]数据长度不足，补充null")
+                elif len(ds_data) > len(labels):
+                    # Too many values: truncate.
+                    dataset['data'] = ds_data[:len(labels)]
+                    changes.append(f"datasets[{idx}]数据长度过长，截断")
+
+            if chart_type in ChartValidator.NUMERIC_DATA_TYPES and not object_points:
+                values = dataset['data']
+                converted = False
+                for i, value in enumerate(values):
+                    # Only strings are converted; other values are left as they are.
+                    if not isinstance(value, str):
+                        continue
+                    try:
+                        number = float(value)
+                    except (ValueError, TypeError):
+                        number = None
+                    else:
+                        # "nan" / "inf" parse as floats but are not valid JSON
+                        # numbers, so they become null as well.
+                        if not float('-inf') < number < float('inf'):
+                            number = None
+                    values[i] = number
+                    converted = True
+                if converted:
+                    changes.append(f"datasets[{idx}]包含非数值数据，已尝试转换")
+
+        success = len(changes) > 0
+
+        return RepairResult(success, repaired, 'local', changes)
+
+    def _try_construct_datasets(
+        self,
+        data: Dict[str, Any],
+        chart_type: str
+    ) -> Optional[List[Dict[str, Any]]]:
+        """
+        Try to build a datasets list from alternative fields of ``data``.
+
+        Recognised shapes: a 'values' list (one dataset), or a 'series' list
+        whose items are dicts with 'name'/'data' or plain lists.
+
+        Returns:
+            The constructed datasets, or None when nothing usable was found.
+        """
+        if 'values' in data and isinstance(data['values'], list):
+            return [{
+                'label': '数据',
+                'data': data['values']
+            }]
+
+        if 'series' in data and isinstance(data['series'], list):
+            datasets = []
+            for idx, series in enumerate(data['series']):
+                if isinstance(series, dict):
+                    datasets.append({
+                        'label': series.get('name', f'系列 {idx + 1}'),
+                        'data': series.get('data', [])
+                    })
+                elif isinstance(series, list):
+                    datasets.append({
+                        'label': f'系列 {idx + 1}',
+                        'data': series
+                    })
+            if datasets:
+                return datasets
+
+        return None
+
+    def repair_with_api(
+        self,
+        widget_block: Dict[str, Any],
+        validation_result: ValidationResult
+    ) -> RepairResult:
+        """
+        Repair a block with the configured LLM repair functions.
+
+        The functions are tried in order; the first one whose result is a dict
+        that validates wins. A function that raises is logged and skipped.
+
+        Returns:
+            RepairResult: method 'api'; on failure success is False and
+            repaired_block is None.
+        """
+        if not self.llm_repair_fns:
+            return RepairResult(False, None, 'api', [])
+
+        for idx, repair_fn in enumerate(self.llm_repair_fns):
+            try:
+                logger.info(f"尝试使用Engine {idx + 1}修复图表")
+                repaired = repair_fn(widget_block, validation_result.errors)
+
+                if repaired and isinstance(repaired, dict):
+                    repaired_validation = self.validator.validate(repaired)
+                    if repaired_validation.is_valid:
+                        return RepairResult(
+                            True,
+                            repaired,
+                            'api',
+                            [f"使用Engine {idx + 1}修复成功"]
+                        )
+            except Exception as e:
+                logger.error(f"Engine {idx + 1}修复失败: {e}")
+                continue
+
+        return RepairResult(False, None, 'api', [])
+
+
+def create_chart_validator() -> ChartValidator:
+    """Build and return a new ChartValidator."""
+    validator = ChartValidator()
+    return validator
+
+
+def create_chart_repairer(
+    validator: Optional[ChartValidator] = None,
+    llm_repair_fns: Optional[List[Callable]] = None
+) -> ChartRepairer:
+    """
+    Build a ChartRepairer.
+
+    Args:
+        validator: Validator to use; a new one is created when omitted.
+        llm_repair_fns: Optional LLM repair callables handed to the repairer.
+    """
+    active_validator = create_chart_validator() if validator is None else validator
+    return ChartRepairer(active_validator, llm_repair_fns)
@@ -1,5 +1,5 @@
 """
-Configuration management module for the Report Engine.
+Report Engine 配置模块，统一读取环境变量并提供类型安全的访问方式。
 """

 import os
@@ -15,8 +15,47 @@ class Settings(BaseSettings):
    REPORT_ENGINE_BASE_URL: Optional[str] = Field(None, description="Report Engine LLM基础URL")
    REPORT_ENGINE_MODEL_NAME: Optional[str] = Field(None, description="Report Engine LLM模型名称")
    REPORT_ENGINE_PROVIDER: Optional[str] = Field(None, description="模型服务商，仅兼容保留")
+    # 其他引擎API（用于跨引擎修复）
+    FORUM_HOST_API_KEY: Optional[str] = Field(
+        None, description="Forum Engine / Forum Host 的LLM API密钥（用于章节修复兜底）"
+    )
+    FORUM_HOST_BASE_URL: Optional[str] = Field(
+        None, description="Forum Engine API Base URL（为空则使用LLM默认配置）"
+    )
+    FORUM_HOST_MODEL_NAME: Optional[str] = Field(
+        None, description="Forum Engine LLM模型名称"
+    )
+    INSIGHT_ENGINE_API_KEY: Optional[str] = Field(
+        None, description="Insight Engine LLM API密钥，用于跨引擎章节修复"
+    )
+    INSIGHT_ENGINE_BASE_URL: Optional[str] = Field(
+        None, description="Insight Engine API Base URL"
+    )
+    INSIGHT_ENGINE_MODEL_NAME: Optional[str] = Field(
+        None, description="Insight Engine LLM模型名称"
+    )
+    MEDIA_ENGINE_API_KEY: Optional[str] = Field(
+        None, description="Media Engine LLM API密钥，用于跨引擎章节修复"
+    )
+    MEDIA_ENGINE_BASE_URL: Optional[str] = Field(
+        None, description="Media Engine API Base URL"
+    )
+    MEDIA_ENGINE_MODEL_NAME: Optional[str] = Field(
+        None, description="Media Engine LLM模型名称"
+    )
    MAX_CONTENT_LENGTH: int = Field(200000, description="最大内容长度")
    OUTPUT_DIR: str = Field("final_reports", description="主输出目录")
+    # 章节分块JSON会存储在该目录，便于溯源与断点续传
+    CHAPTER_OUTPUT_DIR: str = Field(
+        "final_reports/chapters", description="章节JSON缓存目录"
+    )
+    # 装订后的整本IR/manifest也会持久化，方便调试与审计
+    DOCUMENT_IR_OUTPUT_DIR: str = Field(
+        "final_reports/ir", description="整本IR/Manifest输出目录"
+    )
+    CHAPTER_JSON_MAX_ATTEMPTS: int = Field(
+        2, description="章节JSON解析失败时的最大尝试次数"
+    )
    TEMPLATE_DIR: str = Field("ReportEngine/report_template", description="多模板目录")
    API_TIMEOUT: float = Field(900.0, description="单API超时时间（秒）")
    MAX_RETRY_DELAY: float = Field(180.0, description="最大重试间隔（秒）")
@@ -24,8 +63,12 @@ class Settings(BaseSettings):
    LOG_FILE: str = Field("logs/report.log", description="日志输出文件")
    ENABLE_PDF_EXPORT: bool = Field(True, description="是否允许导出PDF")
    CHART_STYLE: str = Field("modern", description="图表样式：modern/classic/")
+    JSON_ERROR_LOG_DIR: str = Field(
+        "logs/json_repair_failures", description="无法修复的JSON块落盘目录"
+    )

    class Config:
+        """Pydantic配置：允许从.env读取并兼容大小写"""
        env_file = ".env"
        env_prefix = ""
        case_sensitive = False
@@ -35,12 +78,21 @@ settings = Settings()


 def print_config(config: Settings):
+    """
+    将当前配置项按人类可读格式输出到日志，方便排障。
+
+    参数:
+        config: Settings实例，通常为全局settings。
+    """
    message = ""
    message += "\n=== Report Engine 配置 ===\n"
    message += f"LLM 模型: {config.REPORT_ENGINE_MODEL_NAME}\n"
    message += f"LLM Base URL: {config.REPORT_ENGINE_BASE_URL or '(默认)'}\n"
    message += f"最大内容长度: {config.MAX_CONTENT_LENGTH}\n"
    message += f"输出目录: {config.OUTPUT_DIR}\n"
+    message += f"章节JSON目录: {config.CHAPTER_OUTPUT_DIR}\n"
+    message += f"章节JSON最大尝试次数: {config.CHAPTER_JSON_MAX_ATTEMPTS}\n"
+    message += f"整本IR目录: {config.DOCUMENT_IR_OUTPUT_DIR}\n"
    message += f"模板目录: {config.TEMPLATE_DIR}\n"
    message += f"API 超时时间: {config.API_TIMEOUT} 秒\n"
    message += f"最大重试间隔: {config.MAX_RETRY_DELAY} 秒\n"
@@ -0,0 +1,339 @@
+"""
+检测系统依赖工具
+用于检测 PDF 生成所需的系统依赖
+"""
+import os
+import sys
+import platform
+from pathlib import Path
+from loguru import logger
+from ctypes import util as ctypes_util
+
+# Number of characters _box_line pads each row's text to (the box borders and
+# the two-space left margin are added on top of this).
+BOX_CONTENT_WIDTH = 62
+
+
+def _box_line(text: str = "") -> str:
+    """Return *text* formatted as one row of the help box.
+
+    The row consists of the left border, two spaces, the text left-aligned
+    and space-padded to ``BOX_CONTENT_WIDTH`` characters, the right border
+    and a trailing newline.  Text longer than the width is not truncated.
+    """
+    padded = format(text, f"<{BOX_CONTENT_WIDTH}")
+    return "║  " + padded + "║\n"
+
+
+def _get_platform_specific_instructions():
+    """
+    Build the boxed installation guide for the current operating system.
+
+    Returns:
+        str: One ``_box_line`` row per instruction line for macOS, Linux or
+        Windows, or a single generic row for any other platform.
+    """
+    # Instruction text per platform.system() value; every entry becomes one row.
+    guides = {
+        "Darwin": [
+            "🍎 macOS 系统解决方案：",
+            "",
+            "步骤 1: 安装依赖（宿主机执行）",
+            "  brew install pango gdk-pixbuf libffi",
+            "",
+            "步骤 2: 设置 DYLD_LIBRARY_PATH（必做）",
+            "  Apple Silicon:",
+            " export DYLD_LIBRARY_PATH=/opt/homebrew/lib:$DYLD_LIBRARY_PATH",
+            "  Intel:",
+            " export DYLD_LIBRARY_PATH=/usr/local/lib:$DYLD_LIBRARY_PATH",
+            "",
+            "步骤 3: 永久生效（推荐）",
+            "  将 export DYLD_LIBRARY_PATH=... 追加到 ~/.zshrc",
+            "  Apple 用 /opt/homebrew/lib，Intel 用 /usr/local/lib",
+            "  执行 source ~/.zshrc 后再打开新终端",
+            "",
+            "步骤 4: 新开终端执行验证",
+            "  python -m ReportEngine.utils.dependency_check",
+            "  输出含 “✓ Pango 依赖检测通过” 即配置正确",
+        ],
+        "Linux": [
+            "🐧 Linux 系统解决方案：",
+            "",
+            "Ubuntu/Debian（宿主机执行）：",
+            "  sudo apt-get update",
+            "  sudo apt-get install -y \\",
+            "    libpango-1.0-0 libpangoft2-1.0-0 libffi-dev libcairo2",
+            "    libgdk-pixbuf-2.0-0（缺失时改为 libgdk-pixbuf2.0-0）",
+            "",
+            "CentOS/RHEL：",
+            "  sudo yum install -y pango gdk-pixbuf2 libffi-devel cairo",
+            "",
+            "Docker 部署无需额外安装，镜像已包含依赖",
+        ],
+        "Windows": [
+            "🪟 Windows 系统解决方案：",
+            "",
+            "步骤 1: 安装 GTK3 Runtime（宿主机执行）",
+            "  下载页: README 中的 GTK3 Runtime 链接（建议默认路径）",
+            "",
+            "步骤 2: 将 GTK 安装目录下的 bin 加入 PATH（需新终端）",
+            "  set PATH=C:\\Program Files\\GTK3-Runtime Win64\\bin;%PATH%",
+            "  自定义路径请替换，或设置环境变量 GTK_BIN_PATH",
+            "  可选: 永久添加 PATH 示例:",
+            "    setx PATH \"C:\\Program Files\\GTK3-Runtime Win64\\bin;%PATH%\"",
+            "",
+            "步骤 3: 验证（新终端执行）",
+            "  python -m ReportEngine.utils.dependency_check",
+            "  输出含 “✓ Pango 依赖检测通过” 即配置正确",
+        ],
+    }
+    fallback = ["请查看 PDF 导出 README 了解您系统的安装方法"]
+
+    rows = guides.get(platform.system(), fallback)
+    return "".join(map(_box_line, rows))
+
+
+def _ensure_windows_gtk_paths():
+    """
+    Locate a GTK/Pango runtime on Windows and expose it to the DLL loader.
+
+    Candidate directories are collected, in priority order, from:
+      1. the GTK3_RUNTIME_PATH / GTK_RUNTIME_PATH / GTK_BIN_PATH /
+         GTK_BIN_DIR / GTK_PATH environment variables,
+      2. well-known install folders under Program Files and on drives C-F,
+      3. every "GTK*" folder directly under Program Files,
+      4. existing PATH entries whose text contains "gtk" or "pango".
+
+    The first candidate that contains a Pango DLL is registered through
+    ``os.add_dll_directory`` (when available) and prepended to PATH.
+
+    Returns:
+        str | None: The directory that was registered, or None when not on
+        Windows or when no candidate contains a Pango DLL.
+    """
+    if platform.system() != "Windows":
+        return None
+
+    candidates = []
+    seen = set()
+
+    def _add_candidate(path_like):
+        """Queue path_like (and its bin/ sub-folder) when it exists and is new."""
+        if not path_like:
+            return
+        base = Path(path_like)
+        if base.is_dir() and base.name.lower() == "bin":
+            options = (base,)
+        else:
+            # The value may be the install root, so its bin/ folder is tried too.
+            options = (base, base / "bin")
+        for option in options:
+            try:
+                if not option.exists():
+                    continue
+                key = str(option.resolve()).lower()
+            except OSError:
+                # Malformed or inaccessible entry: it cannot be a usable runtime.
+                continue
+            if key not in seen:
+                seen.add(key)
+                candidates.append(option)
+
+    def _has_pango_dll(directory):
+        """Return True when directory holds a Pango DLL under either naming scheme."""
+        # MSVC builds ship "pango-1.0-0.dll"; MSYS2/MinGW builds use a "lib" prefix.
+        for pattern in ("pango*-1.0-*.dll", "libpango*-1.0-*.dll"):
+            try:
+                if any(directory.glob(pattern)):
+                    return True
+            except OSError:
+                return False
+        return False
+
+    # Explicit user hints take precedence over every guessed location.
+    for env_var in ("GTK3_RUNTIME_PATH", "GTK_RUNTIME_PATH", "GTK_BIN_PATH", "GTK_BIN_DIR", "GTK_PATH"):
+        _add_candidate(os.environ.get(env_var))
+
+    program_files = os.environ.get("ProgramFiles", r"C:\Program Files")
+    program_files_x86 = os.environ.get("ProgramFiles(x86)", r"C:\Program Files (x86)")
+    default_dirs = [
+        Path(program_files) / "GTK3-Runtime Win64",
+        Path(program_files_x86) / "GTK3-Runtime Win64",
+        Path(program_files) / "GTK3-Runtime Win32",
+        Path(program_files_x86) / "GTK3-Runtime Win32",
+        Path(program_files) / "GTK3-Runtime",
+        Path(program_files_x86) / "GTK3-Runtime",
+    ]
+
+    # Common custom install locations (other drives / a DevelopSoftware folder).
+    common_names = ["GTK3-Runtime Win64", "GTK3-Runtime Win32", "GTK3-Runtime"]
+    for drive in ["C", "D", "E", "F"]:
+        root = Path(f"{drive}:/")
+        for name in common_names:
+            default_dirs.append(root / name)
+            default_dirs.append(root / "DevelopSoftware" / name)
+
+    # Any folder under Program Files whose name starts with GTK.
+    for root in (program_files, program_files_x86):
+        root_path = Path(root)
+        if root_path.exists():
+            default_dirs.extend(root_path.glob("GTK*"))
+
+    for d in default_dirs:
+        _add_candidate(d)
+
+    # Directories the user already put on PATH; coarse filter on gtk/pango.
+    for entry in os.environ.get("PATH", "").split(os.pathsep):
+        if entry and ("gtk" in entry.lower() or "pango" in entry.lower()):
+            _add_candidate(entry)
+
+    for path in candidates:
+        if not path.exists() or not _has_pango_dll(path):
+            continue
+
+        if hasattr(os, "add_dll_directory"):
+            try:
+                os.add_dll_directory(str(path))
+            except OSError as exc:
+                # Best effort only: the PATH update below still applies.
+                logger.debug(f"os.add_dll_directory({path}) failed: {exc}")
+
+        current_path = os.environ.get("PATH", "")
+        # Windows paths are case-insensitive, so compare lowered entries.
+        known_entries = {entry.lower() for entry in current_path.split(os.pathsep)}
+        if str(path).lower() not in known_entries:
+            os.environ["PATH"] = f"{path}{os.pathsep}{current_path}"
+
+        return str(path)
+
+    return None
+
+
+def prepare_pango_environment():
+    """
+    Extend the native-library search path before WeasyPrint is imported.
+
+    On Windows this delegates to ``_ensure_windows_gtk_paths``.  On macOS it
+    prepends the Homebrew library folders (Apple Silicon and Intel) that
+    exist and are not yet listed to ``DYLD_LIBRARY_PATH``.  Other platforms
+    are left untouched.
+
+    Returns:
+        str | None: The Windows directory that was added, the new
+        ``DYLD_LIBRARY_PATH`` value on macOS, or None when nothing changed.
+    """
+    os_name = platform.system()
+    if os_name == "Windows":
+        return _ensure_windows_gtk_paths()
+    if os_name != "Darwin":
+        return None
+
+    existing = os.environ.get("DYLD_LIBRARY_PATH", "")
+    already_listed = existing.split(":")
+    extra = [
+        str(lib_dir)
+        for lib_dir in (Path("/opt/homebrew/lib"), Path("/usr/local/lib"))
+        if lib_dir.exists() and str(lib_dir) not in already_listed
+    ]
+    if not extra:
+        return None
+
+    # New folders go first; the previous value (if any) is kept at the end.
+    if existing:
+        extra.append(existing)
+    os.environ["DYLD_LIBRARY_PATH"] = ":".join(extra)
+    return os.environ["DYLD_LIBRARY_PATH"]
+
+
+def _probe_native_libs():
+    """
+    Look up the key native libraries with ``ctypes.util.find_library``.
+
+    Each logical dependency is tried under several file-name variants; it is
+    reported as missing only when none of the variants can be located.  On
+    Windows both the plain names and the "lib"-prefixed names (as shipped by
+    MSYS2/MinGW-built GTK runtimes) are tried.
+
+    Returns:
+        list[str]: Keys ("pango", "gobject", "gdk-pixbuf", "cairo") of the
+        libraries that could not be found, in that order.
+    """
+    if platform.system() == "Windows":
+        targets = [
+            ("pango", ["pango-1.0-0", "libpango-1.0-0"]),
+            ("gobject", ["gobject-2.0-0", "libgobject-2.0-0"]),
+            ("gdk-pixbuf", ["gdk_pixbuf-2.0-0", "libgdk_pixbuf-2.0-0"]),
+            ("cairo", ["cairo-2", "libcairo-2"]),
+        ]
+    else:
+        targets = [
+            ("pango", ["pango-1.0"]),
+            ("gobject", ["gobject-2.0"]),
+            ("gdk-pixbuf", ["gdk_pixbuf-2.0"]),
+            ("cairo", ["cairo", "cairo-2"]),
+        ]
+
+    return [
+        key
+        for key, variants in targets
+        if not any(ctypes_util.find_library(v) for v in variants)
+    ]
+
+
+def check_pango_available():
+    """
+    Check whether WeasyPrint can load Pango, i.e. whether PDF export can work.
+
+    The search path is prepared first and the native libraries are probed so
+    that a failure message can name what was not found.
+
+    Returns:
+        tuple: ``(is_available: bool, message: str)``.  On failure the message
+        is either a boxed, platform-specific installation guide (when the
+        OSError text mentions gobject/pango/gdk) or a one-line description.
+    """
+    added_path = prepare_pango_environment()
+    missing_native = _probe_native_libs()
+
+    try:
+        # Importing these modules is what makes WeasyPrint load the native libraries.
+        from weasyprint import HTML
+        from weasyprint.text.ffi import ffi, pango
+
+        # Call into Pango to confirm the library is really usable.
+        pango.pango_version()
+    except OSError as e:
+        # The native libraries are missing or could not be loaded.
+        error_msg = str(e)
+        lowered = error_msg.lower()
+        if not any(tag in lowered for tag in ("gobject", "pango", "gdk")):
+            missing_text = ", ".join(missing_native) if missing_native else "未知"
+            return False, f"⚠ PDF 依赖加载失败: {error_msg}；缺失/未识别: {missing_text}"
+
+        windows_hint = ""
+        arch_note = ""
+        if platform.system() == "Windows":
+            prefix = "已尝试自动添加 GTK 路径: "
+            room = BOX_CONTENT_WIDTH - len(prefix)
+            shown_path = added_path or "未找到默认路径"
+            if len(shown_path) > room:
+                # Keep the row inside the box by truncating with an ellipsis.
+                shown_path = shown_path[: room - 3] + "..."
+            windows_hint = _box_line(prefix + shown_path)
+            arch_note = _box_line("🔍 若已安装仍报错：确认 Python 与 GTK 位数一致后重开终端")
+
+        missing_note = ""
+        if missing_native:
+            missing_note = _box_line(f"未识别到的依赖: {', '.join(missing_native)}")
+
+        parts = [
+            "╔" + "═" * 64 + "╗\n",
+            _box_line("⚠️  PDF 导出依赖缺失"),
+            _box_line(),
+            _box_line("📄 PDF 导出功能将不可用（其他功能不受影响）"),
+            _box_line(),
+            windows_hint,
+            arch_note,
+            missing_note,
+            _get_platform_specific_instructions(),
+            _box_line(),
+            _box_line("📖 文档：static/Partial README for PDF Exporting/README.md"),
+            "╚" + "═" * 64 + "╝",
+        ]
+        return False, "".join(parts)
+    except ImportError:
+        # WeasyPrint itself is not installed.
+        return False, "⚠ WeasyPrint 未安装\n" + "解决方法: pip install weasyprint"
+    except Exception as e:
+        # Any other unexpected failure.
+        return False, f"⚠ PDF 依赖检测失败: {e}"
+    else:
+        return True, "✓ Pango 依赖检测通过，PDF 导出功能可用"
+
+
+def log_dependency_status():
+    """
+    Run the Pango check and write its outcome to the log.
+
+    Success is logged at SUCCESS level.  Failure is logged as a warning,
+    followed by two informational hints.
+
+    Returns:
+        bool: The availability flag reported by ``check_pango_available``.
+    """
+    available, report = check_pango_available()
+
+    if available:
+        logger.success(report)
+        return available
+
+    logger.warning(report)
+    for hint in (
+        "💡 提示：PDF 导出功能需要 Pango 库支持，但不影响系统其他功能的正常使用",
+        "📚 安装说明请参考：static/Partial README for PDF Exporting/README.md",
+    ):
+        logger.info(hint)
+    return available
+
+
+if __name__ == "__main__":
+    # Stand-alone run: print the report, exit with 0 on success and 1 otherwise.
+    ok, report = check_pango_available()
+    print(report)
+    exit_code = 0 if ok else 1
+    sys.exit(exit_code)
@@ -0,0 +1,763 @@
+"""
+统一的JSON解析和修复工具。
+
+提供鲁棒的JSON解析能力，支持：
+1. 自动清理markdown代码块标记和思考内容
+2. 本地语法修复（括号平衡、逗号补全、控制字符转义等）
+3. 使用json_repair库进行高级修复
+4. LLM辅助修复（可选）
+5. 详细的错误日志和调试信息
+"""
+
+from __future__ import annotations
+
+import json
+import re
+from typing import Any, Dict, List, Optional, Tuple, Callable
+from loguru import logger
+
+try:
+    from json_repair import repair_json as _json_repair_fn
+except ImportError:
+    _json_repair_fn = None
+
+
+class JSONParseError(ValueError):
+    """Raised when JSON parsing fails; carries the offending text for diagnosis."""
+
+    def __init__(self, message: str, raw_text: Optional[str] = None):
+        """
+        Record the error message together with the text that failed to parse.
+
+        Args:
+            message: Human-readable description of the failure.
+            raw_text: The complete LLM output that triggered the error, if known.
+        """
+        self.raw_text = raw_text
+        super().__init__(message)
+
+
+class RobustJSONParser:
+    """
+    鲁棒的JSON解析器。
+
+    集成多种修复策略，确保LLM返回的内容能够被正确解析：
+    - 清理markdown包裹、思考内容等额外信息
+    - 修复常见语法错误（缺少逗号、括号不平衡等）
+    - 转义未转义的控制字符
+    - 使用第三方库进行高级修复
+    - 可选的LLM辅助修复
+    """
+
+    # 常见的LLM思考内容模式
+    _THINKING_PATTERNS = [
+        r"^\s*<thinking>.*?</thinking>\s*",
+        r"^\s*<thought>.*?</thought>\s*",
+        r"^\s*让我想想.*?(?=\{|\[|$)",
+        r"^\s*首先.*?(?=\{|\[|$)",
+        r"^\s*分析.*?(?=\{|\[|$)",
+        r"^\s*根据.*?(?=\{|\[|$)",
+    ]
+
+    # 冒号等号模式（LLM常见错误）
+    _COLON_EQUALS_PATTERN = re.compile(r'(":\s*)=')
+
+    def __init__(
+        self,
+        llm_repair_fn: Optional[Callable[[str, str], Optional[str]]] = None,
+        enable_json_repair: bool = True,
+        enable_llm_repair: bool = False,
+        max_repair_attempts: int = 3,
+    ):
+        """
+        Configure which repair strategies the parser may use.
+
+        Args:
+            llm_repair_fn: Optional callable taking (broken JSON, error message)
+                and returning repaired JSON text.
+            enable_json_repair: Whether to use the json_repair library; this is
+                only honoured when the library could be imported.
+            enable_llm_repair: Whether to fall back to ``llm_repair_fn``.
+            max_repair_attempts: Maximum number of repair attempts; stored on
+                the instance.
+        """
+        library_available = _json_repair_fn is not None
+        self.max_repair_attempts = max_repair_attempts
+        self.enable_llm_repair = enable_llm_repair
+        self.enable_json_repair = enable_json_repair and library_available
+        self.llm_repair_fn = llm_repair_fn
+
+    def parse(
+        self,
+        raw_text: str,
+        context_name: str = "JSON",
+        expected_keys: Optional[List[str]] = None,
+        extract_wrapper_key: Optional[str] = None,
+    ) -> Dict[str, Any]:
+        """
+        Parse the JSON contained in an LLM response.
+
+        Strategies are tried in order: the cleaned text and its locally
+        repaired variants, then the json_repair library, then the optional
+        LLM repair callback.  The first payload that decodes is passed to
+        ``_extract_and_validate`` and its result is returned.
+
+        Args:
+            raw_text: Raw LLM output (may contain code fences, reasoning text, ...).
+            context_name: Label used in log and error messages.
+            expected_keys: Keys the resulting object is expected to contain.
+            extract_wrapper_key: If the object is wrapped under this key, unwrap it.
+
+        Returns:
+            dict: The parsed JSON object.
+
+        Raises:
+            JSONParseError: If the input is empty, no strategy yields valid
+                JSON, or the decoded value fails validation.  ``raw_text`` on
+                the exception always holds the original input.
+        """
+        if not raw_text or not raw_text.strip():
+            raise JSONParseError(f"{context_name}返回空内容", raw_text=raw_text)
+
+        # Keep the untouched input for logging and error reporting.
+        original_text = raw_text
+
+        def _validated(data: Any) -> Dict[str, Any]:
+            """Validate decoded data, attaching the raw input to any failure."""
+            try:
+                return self._extract_and_validate(
+                    data, expected_keys, extract_wrapper_key, context_name
+                )
+            except JSONParseError as exc:
+                if exc.raw_text is None:
+                    exc.raw_text = original_text
+                raise
+
+        # Step 1: build candidate payloads using different cleaning strategies.
+        candidates = self._build_candidate_payloads(raw_text, context_name)
+
+        # Step 2: try to decode every candidate as-is.
+        last_error: Optional[json.JSONDecodeError] = None
+        for i, candidate in enumerate(candidates):
+            try:
+                data = json.loads(candidate)
+                logger.debug(f"{context_name} JSON解析成功（候选{i + 1}/{len(candidates)}）")
+                return _validated(data)
+            except json.JSONDecodeError as exc:
+                last_error = exc
+                logger.debug(f"{context_name} 候选{i + 1}解析失败: {exc}")
+
+        cleaned = candidates[0] if candidates else original_text
+
+        # Step 3: json_repair library.
+        if self.enable_json_repair:
+            repaired = self._attempt_json_repair(cleaned, context_name)
+            if repaired:
+                try:
+                    data = json.loads(repaired)
+                    logger.info(f"{context_name} JSON通过json_repair库修复成功")
+                    return _validated(data)
+                except json.JSONDecodeError as exc:
+                    last_error = exc
+                    logger.debug(f"{context_name} json_repair修复后仍无法解析: {exc}")
+
+        # Step 4: LLM-assisted repair (only when enabled and configured).
+        if self.enable_llm_repair and self.llm_repair_fn:
+            llm_repaired = self._attempt_llm_repair(cleaned, str(last_error), context_name)
+            if llm_repaired:
+                try:
+                    data = json.loads(llm_repaired)
+                    logger.info(f"{context_name} JSON通过LLM修复成功")
+                    return _validated(data)
+                except json.JSONDecodeError as exc:
+                    last_error = exc
+                    logger.warning(f"{context_name} LLM修复后仍无法解析: {exc}")
+
+        # Every strategy failed.
+        error_msg = f"{context_name} JSON解析失败: {last_error}"
+        logger.error(error_msg)
+        logger.debug(f"原始文本前500字符: {original_text[:500]}")
+        raise JSONParseError(error_msg, raw_text=original_text) from last_error
+
+    def _build_candidate_payloads(self, raw_text: str, context_name: str) -> List[str]:
+        """
+        Derive the list of JSON strings that ``parse`` should try, in order.
+
+        The list holds the cleaned text, then the locally repaired text (if it
+        differs), then a variant with triple-nested arrays flattened (if it is
+        not already present).
+
+        Args:
+            raw_text: Raw LLM output.
+            context_name: Label for the calling context (not used here).
+
+        Returns:
+            List[str]: Candidate JSON texts without duplicates.
+        """
+        base = self._clean_response(raw_text)
+        patched = self._apply_local_repairs(base)
+        payloads = [base] if patched == base else [base, patched]
+
+        # Force one extra flattening pass for content with three-level lists.
+        squashed = self._flatten_nested_arrays(patched)
+        if squashed not in payloads:
+            payloads.append(squashed)
+
+        return payloads
+
+    def _clean_response(self, raw: str) -> str:
+        """
+        Strip reasoning text and markdown fences from an LLM response.
+
+        Args:
+            raw: Raw LLM output.
+
+        Returns:
+            str: The first JSON object/array found in the cleaned text, or the
+            cleaned text itself when it contains no ``{`` or ``[``.
+        """
+        cleaned = raw.strip()
+
+        # Remove leading reasoning/thinking content.
+        for pattern in self._THINKING_PATTERNS:
+            cleaned = re.sub(pattern, "", cleaned, flags=re.DOTALL | re.IGNORECASE)
+
+        # When the text already begins with a JSON opener, any ``` fence further
+        # in belongs to a string value (e.g. a code sample inside the report);
+        # extracting it would throw the real payload away.
+        starts_with_json = cleaned.lstrip().startswith(("{", "["))
+        fenced_match = (
+            None
+            if starts_with_json
+            else re.search(r"```(?:json)?\s*([\s\S]*?)\s*```", cleaned)
+        )
+
+        if fenced_match:
+            cleaned = fenced_match.group(1).strip()
+        else:
+            # No complete fenced block: drop a dangling fence prefix/suffix.
+            if cleaned.startswith("```json"):
+                cleaned = cleaned[7:]
+            elif cleaned.startswith("```"):
+                cleaned = cleaned[3:]
+
+            if cleaned.endswith("```"):
+                cleaned = cleaned[:-3]
+
+            cleaned = cleaned.strip()
+
+        # Keep only the first complete JSON object or array.
+        return self._extract_first_json_structure(cleaned)
+
+    def _extract_first_json_structure(self, text: str) -> str:
+        """
+        Cut the first complete JSON object or array out of *text*.
+
+        Useful when the LLM adds explanatory prose before or after the JSON.
+        Brackets inside double-quoted strings are ignored, and a backslash
+        always skips the character that follows it.
+
+        Args:
+            text: Text that may contain JSON.
+
+        Returns:
+            str: The balanced structure starting at the first ``{`` or ``[``;
+            everything from that opener to the end when it is never closed;
+            or *text* unchanged when it contains neither opener.
+        """
+        openers_found = [pos for pos in (text.find("{"), text.find("[")) if pos != -1]
+        if not openers_found:
+            return text
+        begin = min(openers_found)
+
+        nesting = 0
+        inside_string = False
+        skip_next = False
+
+        for offset, char in enumerate(text[begin:], start=begin):
+            if skip_next:
+                skip_next = False
+            elif char == "\\":
+                skip_next = True
+            elif char == '"':
+                inside_string = not inside_string
+            elif inside_string:
+                continue
+            elif char in "{[":
+                nesting += 1
+            elif char in "}]":
+                nesting -= 1
+                if nesting == 0:
+                    return text[begin : offset + 1]
+
+        # Never balanced: hand back everything from the opener onwards.
+        return text[begin:]
+
+    def _apply_local_repairs(self, text: str) -> str:
+        """
+        Run the local (library-free) repair passes over *text*.
+
+        Passes, in order: drop the ``=`` of a ``":=`` typo, escape control
+        characters, add missing commas, collapse redundant brackets, balance
+        brackets, remove trailing commas.  A warning is logged for every pass
+        that changed something.
+
+        Args:
+            text: JSON text to repair.
+
+        Returns:
+            str: The repaired text, or *text* itself when no pass changed it.
+        """
+        changed_any = False
+
+        # The ":=" typo is fixed with a plain regex substitution.
+        current = self._COLON_EQUALS_PATTERN.sub(r"\1", text)
+        if current != text:
+            logger.warning("检测到\":=\"字符，已自动移除多余的'='号")
+            changed_any = True
+
+        # Remaining passes share the (text) -> (text, changed) shape.
+        pipeline = (
+            (self._escape_control_characters, "检测到未转义的控制字符，已自动转换为转义序列"),
+            (self._fix_missing_commas, "检测到对象/数组之间缺少逗号，已自动补齐"),
+            (self._collapse_redundant_brackets, "检测到连续的方括号嵌套，已尝试折叠为二维结构"),
+            (self._balance_brackets, "检测到括号不平衡，已自动补齐/剔除异常括号"),
+            (self._remove_trailing_commas, "检测到尾随逗号，已自动移除"),
+        )
+        for repair_step, notice in pipeline:
+            current, step_changed = repair_step(current)
+            if step_changed:
+                logger.warning(notice)
+                changed_any = True
+
+        return current if changed_any else text
+
+    def _escape_control_characters(self, text: str) -> Tuple[str, bool]:
+        """
+        Escape raw control characters that appear inside string literals.
+
+        Newline, carriage return and tab become ``\\n``, ``\\r`` and ``\\t``;
+        any other character below 0x20 becomes a ``\\uXXXX`` escape.  Text
+        outside double-quoted strings is copied unchanged.
+
+        Args:
+            text: JSON text to scan.
+
+        Returns:
+            Tuple[str, bool]: (resulting text, whether anything was replaced).
+        """
+        if not text:
+            return text, False
+
+        short_escapes = {"\n": "\\n", "\r": "\\r", "\t": "\\t"}
+        pieces: List[str] = []
+        inside_string = False
+        skip_next = False
+        changed = False
+
+        for char in text:
+            if skip_next:
+                # Character following a backslash is copied verbatim.
+                skip_next = False
+            elif char == "\\":
+                skip_next = True
+            elif char == '"':
+                inside_string = not inside_string
+            elif inside_string and (char in short_escapes or ord(char) < 0x20):
+                if char in short_escapes:
+                    pieces.append(short_escapes[char])
+                else:
+                    pieces.append(f"\\u{ord(char):04x}")
+                changed = True
+                continue
+            pieces.append(char)
+
+        return "".join(pieces), changed
+
+    def _fix_missing_commas(self, text: str) -> Tuple[str, bool]:
+        """
+        Insert commas that are missing between adjacent values.
+
+        A comma is added after a closing quote, ``}`` or ``]`` when the next
+        non-whitespace character is ``"``, ``[``, ``{`` or a digit.  After a
+        closing quote this only happens when the nearest bracket character
+        already emitted is an opener.
+
+        Args:
+            text: JSON text to scan.
+
+        Returns:
+            Tuple[str, bool]: (resulting text, whether a comma was inserted).
+        """
+        if not text:
+            return text, False
+
+        total = len(text)
+
+        def _next_visible(pos: int) -> str:
+            """Return the first non-whitespace character at/after pos, or ''."""
+            while pos < total and text[pos] in " \t\r\n":
+                pos += 1
+            return text[pos] if pos < total else ""
+
+        def _starts_value(symbol: str) -> bool:
+            """Tell whether symbol can begin a new value (quote, bracket, digit)."""
+            return symbol != "" and (symbol in "\"[{" or symbol.isdigit())
+
+        emitted: List[str] = []
+        changed = False
+        inside_string = False
+        skip_next = False
+
+        for index, char in enumerate(text):
+            emitted.append(char)
+
+            if skip_next:
+                skip_next = False
+                continue
+
+            if char == "\\":
+                skip_next = True
+                continue
+
+            if char == '"':
+                # Only a closing quote can be followed by a missing comma.
+                if inside_string and _starts_value(_next_visible(index + 1)):
+                    # Walk back to the nearest bracket character emitted so far.
+                    nearest = next(
+                        (seen for seen in reversed(emitted) if seen in "{[]}"), None
+                    )
+                    if nearest is not None and nearest in "{[":
+                        emitted.append(",")
+                        changed = True
+                inside_string = not inside_string
+                continue
+
+            # A closed object/array directly followed by a new value.
+            if not inside_string and char in "}]":
+                if _starts_value(_next_visible(index + 1)):
+                    emitted.append(",")
+                    changed = True
+
+        return "".join(emitted), changed
+
+    def _collapse_redundant_brackets(self, text: str) -> Tuple[str, bool]:
+        """
+        Collapse runs of three square brackets into two.
+
+        Targets LLM output that writes an extra array level, e.g. ``]]], [[``
+        or ``[[[``, so tables/lists do not gain an extra dimension.  Each
+        rewrite is applied once, in the order listed below.
+
+        Args:
+            text: JSON text to rewrite.
+
+        Returns:
+            Tuple[str, bool]: (resulting text, whether anything was replaced).
+        """
+        if not text:
+            return text, False
+
+        rewrites = (
+            # Typical mistake: "]]], [[{...}" -> "]],[{...}"
+            (r"\]\s*\]\s*\]\s*,\s*\[\s*\[", "]],["),
+            # Three opening brackets in a row: "[[[" -> "[["
+            (r"\[\s*\[\s*\[", "[["),
+            # Three closing brackets in a row: "]]]" -> "]]"
+            (r"\]\s*\]\s*\]", "]]"),
+        )
+
+        total_hits = 0
+        current = text
+        for regex, replacement in rewrites:
+            current, hits = re.subn(regex, replacement, current)
+            total_hits += hits
+
+        return current, total_hits > 0
+
+    def _flatten_nested_arrays(self, text: str) -> str:
+        """
+        Drop one obviously redundant list level, e.g. ``[[[x]]]`` -> ``[[x]]``.
+
+        Closing triples are rewritten first, then opening triples; whitespace
+        between the brackets is tolerated.
+
+        Args:
+            text: JSON text to rewrite.
+
+        Returns:
+            str: The rewritten text (empty input is returned unchanged).
+        """
+        if not text:
+            return text
+        for regex, replacement in ((r"\]\s*\]\s*\]", "]]"), (r"\[\s*\[\s*\[", "[[")):
+            text = re.sub(regex, replacement, text)
+        return text
+
+    def _balance_brackets(self, text: str) -> Tuple[str, bool]:
+        """
+        Repair unbalanced ``{}`` / ``[]`` caused by extra or missing brackets.
+
+        A closer that does not match the innermost open bracket is dropped;
+        brackets still open at the end are closed in reverse order.  Brackets
+        inside double-quoted strings are left alone.
+
+        Args:
+            text: JSON text to scan.
+
+        Returns:
+            Tuple[str, bool]: (resulting text, whether anything was changed).
+        """
+        if not text:
+            return text, False
+
+        closer_for = {"{": "}", "[": "]"}
+        kept: List[str] = []
+        open_stack: List[str] = []
+        changed = False
+        inside_string = False
+        skip_next = False
+
+        for char in text:
+            if skip_next:
+                skip_next = False
+            elif char == "\\":
+                skip_next = True
+            elif char == '"':
+                inside_string = not inside_string
+            elif inside_string:
+                pass
+            elif char in closer_for:
+                open_stack.append(char)
+            elif char in "}]":
+                if open_stack and closer_for[open_stack[-1]] == char:
+                    open_stack.pop()
+                else:
+                    # Unmatched closer: leave it out of the output.
+                    changed = True
+                    continue
+            kept.append(char)
+
+        # Close whatever is still open, innermost first.
+        if open_stack:
+            changed = True
+            kept.extend(closer_for[opener] for opener in reversed(open_stack))
+
+        return "".join(kept), changed
+
+    def _remove_trailing_commas(self, text: str) -> Tuple[str, bool]:
+        """
+        Remove trailing commas in front of ``}`` or ``]``.
+
+        A comma is dropped when only whitespace separates it from the next
+        ``}`` or ``]``; that whitespace is kept.  Commas inside double-quoted
+        strings are never touched, so string values such as ``"a,]"`` are
+        preserved.  String tracking follows the same rules as the other
+        scanners in this class (a backslash skips the next character).
+
+        Args:
+            text: JSON text to scan.
+
+        Returns:
+            Tuple[str, bool]: (resulting text, whether a comma was removed).
+        """
+        if not text:
+            return text, False
+
+        total = len(text)
+        kept: List[str] = []
+        removed = False
+        inside_string = False
+        skip_next = False
+
+        for index, char in enumerate(text):
+            if skip_next:
+                skip_next = False
+            elif char == "\\":
+                skip_next = True
+            elif char == '"':
+                inside_string = not inside_string
+            elif char == "," and not inside_string:
+                # Look past whitespace for the character that follows the comma.
+                probe = index + 1
+                while probe < total and text[probe].isspace():
+                    probe += 1
+                if probe < total and text[probe] in "}]":
+                    removed = True
+                    continue
+            kept.append(char)
+
+        return "".join(kept), removed
+
+    def _attempt_json_repair(self, text: str, context_name: str) -> Optional[str]:
+        """
+        Try to repair *text* with the json_repair library.
+
+        Args:
+            text: JSON text to repair.
+            context_name: Label used in log messages.
+
+        Returns:
+            Optional[str]: The repaired text when the library returned a
+            non-empty result that differs from *text*; otherwise None (also
+            when the library is unavailable or raised).
+        """
+        if not _json_repair_fn:
+            return None
+
+        try:
+            fixed = _json_repair_fn(text)
+        except Exception as exc:
+            logger.debug(f"{context_name} json_repair修复失败: {exc}")
+            return None
+
+        if not fixed or fixed == text:
+            return None
+        logger.info(f"{context_name} 使用json_repair库自动修复JSON")
+        return fixed
+
+    def _attempt_llm_repair(
+        self, text: str, error_msg: str, context_name: str
+    ) -> Optional[str]:
+        """
+        Ask the configured LLM callback to repair *text*.
+
+        Args:
+            text: JSON text to repair.
+            error_msg: Parser error message passed on to the callback.
+            context_name: Label used in log messages.
+
+        Returns:
+            Optional[str]: The callback's result when it is non-empty and
+            differs from *text*; otherwise None (also when no callback is
+            configured or the callback raised).
+        """
+        repair = self.llm_repair_fn
+        if not repair:
+            return None
+
+        try:
+            logger.info(f"{context_name} 尝试使用LLM修复JSON")
+            candidate = repair(text, error_msg)
+        except Exception as exc:
+            logger.warning(f"{context_name} LLM修复失败: {exc}")
+            return None
+
+        return candidate if candidate and candidate != text else None
+
+    def _extract_and_validate(
+        self,
+        data: Any,
+        expected_keys: Optional[List[str]],
+        extract_wrapper_key: Optional[str],
+        context_name: str,
+    ) -> Dict[str, Any]:
+        """
+        Unwrap decoded JSON data and make sure the result is an object.
+
+        When the data is a list, the dict element matching the most
+        ``expected_keys`` is chosen (the first one on ties, including the case
+        where nothing matches, so alias recovery still gets a chance to run).
+        Missing expected keys are only logged and, where an alias is known,
+        recovered; they never raise.
+
+        Args:
+            data: Value returned by ``json.loads``.
+            expected_keys: Keys the resulting object is expected to contain.
+            extract_wrapper_key: Key under which the payload may be wrapped.
+            context_name: Label used in log and error messages.
+
+        Returns:
+            Dict[str, Any]: The extracted object.
+
+        Raises:
+            JSONParseError: If the data is an empty list, a list without any
+                dict element, or neither a dict nor a list.
+        """
+        # Unwrap the payload when it sits under the given wrapper key.
+        if extract_wrapper_key and isinstance(data, dict):
+            if extract_wrapper_key in data:
+                data = data[extract_wrapper_key]
+            else:
+                logger.warning(
+                    f"{context_name} 未找到包裹键'{extract_wrapper_key}'，使用原始数据"
+                )
+
+        if isinstance(data, list):
+            if not data:
+                raise JSONParseError(f"{context_name} 返回空数组")
+
+            dict_items = [item for item in data if isinstance(item, dict)]
+            if not dict_items:
+                raise JSONParseError(
+                    f"{context_name} 返回的数组中没有有效的对象"
+                )
+
+            if expected_keys:
+                # Score every object by how many expected keys it contains.
+                scores = [
+                    sum(1 for key in expected_keys if key in item)
+                    for item in dict_items
+                ]
+                max_match_count = max(scores)
+                best_match = dict_items[scores.index(max_match_count)]
+            else:
+                max_match_count = 0
+                best_match = dict_items[0]
+
+            logger.warning(
+                f"{context_name} 返回数组，自动提取最佳匹配元素（匹配{max_match_count}/{len(expected_keys or [])}个键）"
+            )
+            data = best_match
+        elif not isinstance(data, dict):
+            raise JSONParseError(
+                f"{context_name} 返回的不是JSON对象: {type(data).__name__}"
+            )
+
+        # Report missing keys and try well-known aliases for them.
+        if expected_keys:
+            missing_keys = [key for key in expected_keys if key not in data]
+            if missing_keys:
+                logger.warning(
+                    f"{context_name} 缺少预期的键: {', '.join(missing_keys)}"
+                )
+                data = self._try_recover_missing_keys(data, missing_keys, context_name)
+
+        return data
+
+    def _try_recover_missing_keys(
+        self, data: Dict[str, Any], missing_keys: List[str], context_name: str
+    ) -> Dict[str, Any]:
+        """
+        Fill in missing keys from known alias spellings present in *data*.
+
+        For each missing key that has an alias list, the first alias found in
+        *data* is copied under the expected key name (the alias entry is kept).
+        *data* is modified in place.
+
+        Args:
+            data: Parsed object to patch.
+            missing_keys: Expected keys that are absent from *data*.
+            context_name: Label used in log messages.
+
+        Returns:
+            Dict[str, Any]: The same *data* object, possibly with added keys.
+        """
+        # Known alternative spellings per expected key.
+        key_aliases = {
+            "template_name": ["templateName", "name", "template"],
+            "selection_reason": ["selectionReason", "reason", "explanation"],
+            "title": ["reportTitle", "documentTitle"],
+            "chapters": ["chapterList", "chapterPlan", "sections"],
+            "totalWords": ["total_words", "wordCount", "totalWordCount"],
+        }
+
+        for wanted in missing_keys:
+            found = next(
+                (alias for alias in key_aliases.get(wanted, ()) if alias in data),
+                None,
+            )
+            if found is None:
+                continue
+            logger.info(
+                f"{context_name} 找到键'{wanted}'的别名'{found}'，自动映射"
+            )
+            data[wanted] = data[found]
+
+        return data
+
+
+__all__ = ["RobustJSONParser", "JSONParseError"]
@@ -0,0 +1,456 @@
+"""
+图表验证器和修复器的测试用例。
+
+运行测试：
+    python -m pytest ReportEngine/utils/test_chart_validator.py -v
+"""
+
+import pytest
+from ReportEngine.utils.chart_validator import (
+    ChartValidator,
+    ChartRepairer,
+    ValidationResult,
+    RepairResult,
+    create_chart_validator,
+    create_chart_repairer
+)
+
+
+class TestChartValidator:
+    """测试ChartValidator类"""
+
+    def setup_method(self):
+        """每个测试前初始化"""
+        self.validator = create_chart_validator()
+
+    def test_valid_bar_chart(self):
+        """测试有效的柱状图"""
+        widget_block = {
+            "type": "widget",
+            "widgetType": "chart.js/bar",
+            "widgetId": "chart-001",
+            "props": {
+                "type": "bar",
+                "title": "销售数据"
+            },
+            "data": {
+                "labels": ["一月", "二月", "三月"],
+                "datasets": [
+                    {
+                        "label": "销售额",
+                        "data": [100, 200, 150]
+                    }
+                ]
+            }
+        }
+
+        result = self.validator.validate(widget_block)
+        assert result.is_valid
+        assert len(result.errors) == 0
+
+    def test_valid_line_chart(self):
+        """测试有效的折线图"""
+        widget_block = {
+            "type": "widget",
+            "widgetType": "chart.js/line",
+            "widgetId": "chart-002",
+            "props": {
+                "type": "line"
+            },
+            "data": {
+                "labels": ["周一", "周二", "周三"],
+                "datasets": [
+                    {
+                        "label": "访问量",
+                        "data": [50, 75, 60]
+                    }
+                ]
+            }
+        }
+
+        result = self.validator.validate(widget_block)
+        assert result.is_valid
+
+    def test_valid_pie_chart(self):
+        """测试有效的饼图"""
+        widget_block = {
+            "widgetType": "chart.js/pie",
+            "props": {"type": "pie"},
+            "data": {
+                "labels": ["A", "B", "C"],
+                "datasets": [
+                    {
+                        "data": [30, 40, 30]
+                    }
+                ]
+            }
+        }
+
+        result = self.validator.validate(widget_block)
+        assert result.is_valid
+
+    def test_missing_widgetType(self):
+        """测试缺少widgetType"""
+        widget_block = {
+            "props": {},
+            "data": {}
+        }
+
+        result = self.validator.validate(widget_block)
+        assert not result.is_valid
+        assert "widgetType" in result.errors[0]
+
+    def test_missing_data_field(self):
+        """测试缺少data字段"""
+        widget_block = {
+            "widgetType": "chart.js/bar",
+            "props": {"type": "bar"}
+        }
+
+        result = self.validator.validate(widget_block)
+        assert not result.is_valid
+        assert "data" in result.errors[0]
+
+    def test_missing_datasets(self):
+        """测试缺少datasets"""
+        widget_block = {
+            "widgetType": "chart.js/bar",
+            "props": {"type": "bar"},
+            "data": {
+                "labels": ["A", "B"]
+            }
+        }
+
+        result = self.validator.validate(widget_block)
+        assert not result.is_valid
+        assert "datasets" in result.errors[0]
+
+    def test_empty_datasets(self):
+        """测试空datasets"""
+        widget_block = {
+            "widgetType": "chart.js/bar",
+            "props": {"type": "bar"},
+            "data": {
+                "labels": ["A", "B"],
+                "datasets": []
+            }
+        }
+
+        result = self.validator.validate(widget_block)
+        assert not result.is_valid
+        assert "空" in result.errors[0]
+
+    def test_missing_labels_for_bar_chart(self):
+        """测试柱状图缺少labels"""
+        widget_block = {
+            "widgetType": "chart.js/bar",
+            "props": {"type": "bar"},
+            "data": {
+                "datasets": [
+                    {
+                        "label": "系列1",
+                        "data": [10, 20, 30]
+                    }
+                ]
+            }
+        }
+
+        result = self.validator.validate(widget_block)
+        assert not result.is_valid
+        assert "labels" in result.errors[0]
+
+    def test_invalid_data_type(self):
+        """测试数据类型错误"""
+        widget_block = {
+            "widgetType": "chart.js/bar",
+            "props": {"type": "bar"},
+            "data": {
+                "labels": ["A", "B"],
+                "datasets": [
+                    {
+                        "label": "系列1",
+                        "data": ["abc", "def"]  # 应该是数值
+                    }
+                ]
+            }
+        }
+
+        result = self.validator.validate(widget_block)
+        assert not result.is_valid
+        assert "数值类型" in result.errors[0]
+
+    def test_data_length_mismatch_warning(self):
+        """测试数据长度不匹配（警告）"""
+        widget_block = {
+            "widgetType": "chart.js/bar",
+            "props": {"type": "bar"},
+            "data": {
+                "labels": ["A", "B", "C"],
+                "datasets": [
+                    {
+                        "label": "系列1",
+                        "data": [10, 20]  # 长度不匹配
+                    }
+                ]
+            }
+        }
+
+        result = self.validator.validate(widget_block)
+        # 长度不匹配是警告，不是错误
+        assert len(result.warnings) > 0
+        assert "不匹配" in result.warnings[0]
+
+    def test_scatter_chart(self):
+        """测试散点图（特殊数据格式）"""
+        widget_block = {
+            "widgetType": "chart.js/scatter",
+            "props": {"type": "scatter"},
+            "data": {
+                "datasets": [
+                    {
+                        "label": "数据点",
+                        "data": [
+                            {"x": 10, "y": 20},
+                            {"x": 15, "y": 25}
+                        ]
+                    }
+                ]
+            }
+        }
+
+        result = self.validator.validate(widget_block)
+        assert result.is_valid
+
+    def test_non_chart_widget(self):
+        """测试非图表类型的widget（应该跳过验证）"""
+        widget_block = {
+            "widgetType": "custom/widget",
+            "props": {},
+            "data": {}
+        }
+
+        result = self.validator.validate(widget_block)
+        # 非chart.js类型，跳过验证，返回valid
+        assert result.is_valid
+
+
+class TestChartRepairer:
+    """测试ChartRepairer类"""
+
+    def setup_method(self):
+        """每个测试前初始化"""
+        self.validator = create_chart_validator()
+        self.repairer = create_chart_repairer(validator=self.validator)
+
+    def test_repair_missing_props(self):
+        """测试修复缺少props字段"""
+        widget_block = {
+            "widgetType": "chart.js/bar",
+            "data": {
+                "labels": ["A", "B"],
+                "datasets": [
+                    {
+                        "label": "系列1",
+                        "data": [10, 20]
+                    }
+                ]
+            }
+        }
+
+        result = self.repairer.repair(widget_block)
+        assert result.success
+        assert "props" in result.repaired_block
+        assert result.method == "local"
+
+    def test_repair_missing_chart_type(self):
+        """测试修复缺少图表类型"""
+        widget_block = {
+            "widgetType": "chart.js/bar",
+            "props": {},
+            "data": {
+                "labels": ["A", "B"],
+                "datasets": [
+                    {
+                        "label": "系列1",
+                        "data": [10, 20]
+                    }
+                ]
+            }
+        }
+
+        result = self.repairer.repair(widget_block)
+        assert result.success
+        assert result.repaired_block["props"]["type"] == "bar"
+        assert "图表类型" in str(result.changes)
+
+    def test_repair_missing_datasets(self):
+        """测试修复缺少datasets"""
+        widget_block = {
+            "widgetType": "chart.js/bar",
+            "props": {"type": "bar"},
+            "data": {
+                "labels": ["A", "B"]
+            }
+        }
+
+        result = self.repairer.repair(widget_block)
+        assert result.success
+        assert "datasets" in result.repaired_block["data"]
+        assert isinstance(result.repaired_block["data"]["datasets"], list)
+
+    def test_repair_missing_labels(self):
+        """测试修复缺少labels"""
+        widget_block = {
+            "widgetType": "chart.js/bar",
+            "props": {"type": "bar"},
+            "data": {
+                "datasets": [
+                    {
+                        "label": "系列1",
+                        "data": [10, 20, 30]
+                    }
+                ]
+            }
+        }
+
+        result = self.repairer.repair(widget_block)
+        assert result.success
+        assert "labels" in result.repaired_block["data"]
+        assert len(result.repaired_block["data"]["labels"]) == 3
+
+    def test_repair_data_length_mismatch(self):
+        """测试修复数据长度不匹配"""
+        widget_block = {
+            "widgetType": "chart.js/bar",
+            "props": {"type": "bar"},
+            "data": {
+                "labels": ["A", "B", "C", "D"],
+                "datasets": [
+                    {
+                        "label": "系列1",
+                        "data": [10, 20]  # 长度不足
+                    }
+                ]
+            }
+        }
+
+        result = self.repairer.repair(widget_block)
+        assert result.success
+        # 应该补充到4个元素
+        assert len(result.repaired_block["data"]["datasets"][0]["data"]) == 4
+
+    def test_repair_string_to_number(self):
+        """测试修复字符串类型的数值"""
+        widget_block = {
+            "widgetType": "chart.js/bar",
+            "props": {"type": "bar"},
+            "data": {
+                "labels": ["A", "B"],
+                "datasets": [
+                    {
+                        "label": "系列1",
+                        "data": ["10", "20"]  # 字符串数值
+                    }
+                ]
+            }
+        }
+
+        result = self.repairer.repair(widget_block)
+        assert result.success
+        # 应该转换为数值
+        assert isinstance(result.repaired_block["data"]["datasets"][0]["data"][0], float)
+
+    def test_repair_construct_datasets_from_values(self):
+        """测试从values字段构造datasets"""
+        widget_block = {
+            "widgetType": "chart.js/bar",
+            "props": {"type": "bar"},
+            "data": {
+                "labels": ["A", "B"],
+                "values": [10, 20]  # 使用values而不是datasets
+            }
+        }
+
+        result = self.repairer.repair(widget_block)
+        assert result.success
+        assert "datasets" in result.repaired_block["data"]
+        assert len(result.repaired_block["data"]["datasets"]) > 0
+
+    def test_no_repair_needed(self):
+        """测试不需要修复的情况"""
+        widget_block = {
+            "widgetType": "chart.js/bar",
+            "props": {"type": "bar"},
+            "data": {
+                "labels": ["A", "B"],
+                "datasets": [
+                    {
+                        "label": "系列1",
+                        "data": [10, 20]
+                    }
+                ]
+            }
+        }
+
+        result = self.repairer.repair(widget_block)
+        assert result.success
+        assert result.method == "none"
+        assert len(result.changes) == 0
+
+    def test_repair_adds_default_label(self):
+        """测试修复添加默认label"""
+        widget_block = {
+            "widgetType": "chart.js/bar",
+            "props": {"type": "bar"},
+            "data": {
+                "labels": ["A", "B"],
+                "datasets": [
+                    {
+                        # 缺少label
+                        "data": [10, 20]
+                    }
+                ]
+            }
+        }
+
+        result = self.repairer.repair(widget_block)
+        assert result.success
+        assert "label" in result.repaired_block["data"]["datasets"][0]
+
+
+class TestValidatorIntegration:
+    """Integration test: validator and repairer used together."""
+
+    def test_full_validation_and_repair_workflow(self):
+        """An invalid chart block fails validation, is repaired, then validates."""
+        checker = create_chart_validator()
+        fixer = create_chart_repairer(validator=checker)
+
+        # A chart block with several problems at once; the values are
+        # numeric strings instead of numbers.
+        string_values = ["10", "20", "30"]
+        broken_chart = {
+            "widgetType": "chart.js/bar",
+            "data": {"datasets": [{"data": string_values}]},
+        }
+
+        # Step 1: the raw block must be rejected.
+        first_pass = checker.validate(broken_chart)
+        assert not first_pass.is_valid
+
+        # Step 2: repair, handing over the validation result from step 1.
+        outcome = fixer.repair(broken_chart, first_pass)
+        assert outcome.success
+
+        # Step 3: the repaired block must now pass validation.
+        second_pass = checker.validate(outcome.repaired_block)
+        assert second_pass.is_valid
+
+
+if __name__ == "__main__":
+    # 运行测试
+    pytest.main([__file__, "-v", "--tb=short"])
@@ -0,0 +1,290 @@
+"""
+测试RobustJSONParser的各种修复能力。
+
+验证解析器能够处理：
+1. 基本的markdown包裹
+2. 思考内容清理
+3. 缺少逗号的修复
+4. 括号不平衡的修复
+5. 控制字符转义
+6. 尾随逗号移除
+"""
+
+import json
+import unittest
+from json_parser import RobustJSONParser, JSONParseError
+
+
+class TestRobustJSONParser(unittest.TestCase):
+    """测试鲁棒JSON解析器的各种修复策略。"""
+
+    def setUp(self):
+        """初始化解析器。"""
+        self.parser = RobustJSONParser(
+            enable_json_repair=False,  # 先测试本地修复
+            enable_llm_repair=False,
+        )
+
+    def test_basic_json(self):
+        """测试解析基本的合法JSON。"""
+        json_str = '{"name": "test", "value": 123}'
+        result = self.parser.parse(json_str, "基本测试")
+        self.assertEqual(result["name"], "test")
+        self.assertEqual(result["value"], 123)
+
+    def test_markdown_wrapped(self):
+        """测试解析被```json包裹的JSON。"""
+        json_str = """```json
+{
+  "name": "test",
+  "value": 123
+}
+```"""
+        result = self.parser.parse(json_str, "Markdown包裹测试")
+        self.assertEqual(result["name"], "test")
+        self.assertEqual(result["value"], 123)
+
+    def test_thinking_content_removal(self):
+        """测试清理思考内容。"""
+        json_str = """<thinking>让我想想如何构造这个JSON</thinking>
+{
+  "name": "test",
+  "value": 123
+}"""
+        result = self.parser.parse(json_str, "思考内容清理测试")
+        self.assertEqual(result["name"], "test")
+        self.assertEqual(result["value"], 123)
+
+    def test_missing_comma_fix(self):
+        """测试修复缺少的逗号。"""
+        # 这是实际错误中常见的情况：数组元素之间缺少逗号
+        json_str = """{
+  "totalWords": 40000,
+  "globalGuidelines": [
+    "重点突出技术红利分配失衡"
+    "详略策略：技术创新"
+  ],
+  "chapters": []
+}"""
+        result = self.parser.parse(json_str, "缺少逗号修复测试")
+        self.assertEqual(len(result["globalGuidelines"]), 2)
+
+    def test_unbalanced_brackets(self):
+        """测试修复括号不平衡。"""
+        # 缺少结束括号
+        json_str = """{
+  "name": "test",
+  "nested": {
+    "value": 123
+  }
+"""  # 缺少最外层的 }
+        result = self.parser.parse(json_str, "括号不平衡测试")
+        self.assertEqual(result["name"], "test")
+        self.assertEqual(result["nested"]["value"], 123)
+
+    def test_control_character_escape(self):
+        """测试转义控制字符。"""
+        # JSON字符串中的裸换行符应该被转义
+        json_str = """{
+  "text": "这是第一行
+这是第二行",
+  "value": 123
+}"""
+        result = self.parser.parse(json_str, "控制字符转义测试")
+        # 确保换行符被正确处理
+        self.assertIn("第一行", result["text"])
+        self.assertIn("第二行", result["text"])
+
+    def test_trailing_comma_removal(self):
+        """测试移除尾随逗号。"""
+        json_str = """{
+  "name": "test",
+  "value": 123,
+  "items": [1, 2, 3,],
+}"""
+        result = self.parser.parse(json_str, "尾随逗号测试")
+        self.assertEqual(result["name"], "test")
+        self.assertEqual(len(result["items"]), 3)
+
+    def test_colon_equals_fix(self):
+        """测试修复冒号等号错误。"""
+        json_str = """{
+  "name":= "test",
+  "value": 123
+}"""
+        result = self.parser.parse(json_str, "冒号等号测试")
+        self.assertEqual(result["name"], "test")
+
+    def test_extract_first_json(self):
+        """测试从文本中提取第一个JSON结构。"""
+        json_str = """这是一些说明文字，下面是JSON：
+{
+  "name": "test",
+  "value": 123
+}
+后面还有一些其他文字"""
+        result = self.parser.parse(json_str, "提取JSON测试")
+        self.assertEqual(result["name"], "test")
+        self.assertEqual(result["value"], 123)
+
+    def test_unterminated_string_with_json_repair(self):
+        """Parse a small document with the json_repair strategy enabled.
+
+        NOTE(review): despite the test name, the input below is well-formed
+        JSON (no unterminated string, no stray control characters), so this
+        only checks that parsing still succeeds when json_repair is switched
+        on. Consider adding a genuinely broken input to cover the repair path.
+        """
+        # Build a parser with json_repair enabled (LLM repair stays off)
+        parser_with_repair = RobustJSONParser(
+            enable_json_repair=True,
+            enable_llm_repair=False,
+        )
+
+        # Intended to mimic a real failure (unescaped control characters or
+        # quotes inside a string); the literal used here is actually valid JSON
+        json_str = """{
+  "template_name": "特定政策报告",
+  "selection_reason": "这是测试内容"
+}"""
+        result = parser_with_repair.parse(json_str, "未终止字符串测试")
+        # Success criterion: parsing returns a dict without raising
+        self.assertIsInstance(result, dict)
+        self.assertIn("template_name", result)
+
+    def test_array_with_best_match(self):
+        """测试从数组中提取最佳匹配的元素。"""
+        json_str = """[
+  {
+    "name": "test",
+    "value": 123
+  },
+  {
+    "totalWords": 40000,
+    "globalGuidelines": ["guide1", "guide2"],
+    "chapters": []
+  }
+]"""
+        result = self.parser.parse(
+            json_str,
+            "数组最佳匹配测试",
+            expected_keys=["totalWords", "globalGuidelines", "chapters"],
+        )
+        # 应该提取第二个元素，因为它匹配了3个键
+        self.assertEqual(result["totalWords"], 40000)
+        self.assertEqual(len(result["globalGuidelines"]), 2)
+
+    def test_key_alias_recovery(self):
+        """测试键名别名恢复。"""
+        json_str = """{
+  "templateName": "test_template",
+  "selectionReason": "This is a test"
+}"""
+        result = self.parser.parse(
+            json_str,
+            "键别名测试",
+            expected_keys=["template_name", "selection_reason"],
+        )
+        # 应该自动映射 templateName -> template_name
+        self.assertEqual(result["template_name"], "test_template")
+        self.assertEqual(result["selection_reason"], "This is a test")
+
+    def test_complex_real_world_case(self):
+        """测试真实世界的复杂案例（类似实际错误）。"""
+        # 模拟实际错误：缺少逗号、有markdown包裹、有思考内容
+        json_str = """<thinking>我需要构造一个篇幅规划</thinking>
+```json
+{
+  "totalWords": 40000,
+  "tolerance": 2000,
+  "globalGuidelines": [
+    "重点突出技术红利分配失衡、人才流失与职业认同危机等结构性矛盾"
+    "详略策略：技术创新与传统技艺的碰撞"
+    "案例导向：优先引用真实数据和调研"
+  ],
+  "chapters": [
+    {
+      "chapterId": "ch1",
+      "targetWords": 5000
+    }
+  ]
+}
+```"""
+        result = self.parser.parse(json_str, "复杂真实案例测试")
+        self.assertEqual(result["totalWords"], 40000)
+        self.assertEqual(result["tolerance"], 2000)
+        self.assertEqual(len(result["globalGuidelines"]), 3)
+        self.assertEqual(len(result["chapters"]), 1)
+
+    def test_expected_keys_validation(self):
+        """测试期望键的验证。"""
+        json_str = '{"name": "test"}'
+        # 不应该因为缺少键而失败，只是警告
+        result = self.parser.parse(
+            json_str, "键验证测试", expected_keys=["name", "value"]
+        )
+        self.assertIn("name", result)
+
+    def test_wrapper_key_extraction(self):
+        """测试从包裹键中提取数据。"""
+        json_str = """{
+  "wrapper": {
+    "name": "test",
+    "value": 123
+  }
+}"""
+        result = self.parser.parse(
+            json_str, "包裹键测试", extract_wrapper_key="wrapper"
+        )
+        self.assertEqual(result["name"], "test")
+        self.assertEqual(result["value"], 123)
+
+    def test_empty_input(self):
+        """测试空输入。"""
+        with self.assertRaises(JSONParseError):
+            self.parser.parse("", "空输入测试")
+
+    def test_invalid_json_after_all_repairs(self):
+        """测试所有修复策略都无法处理的情况。"""
+        # 这是一个严重损坏的JSON，无法修复
+        json_str = "{完全不是JSON格式的内容###"
+        with self.assertRaises(JSONParseError):
+            self.parser.parse(json_str, "无法修复测试")
+
+
+def run_manual_test():
+    """Run one hand-picked parse case and print the input and the outcome."""
+    separator = "=" * 60
+
+    print(separator)
+    print("开始测试RobustJSONParser")
+    print(separator)
+
+    # Local repair strategies only: json_repair and LLM repair are both disabled
+    manual_parser = RobustJSONParser(enable_json_repair=False, enable_llm_repair=False)
+
+    # Markdown-fenced JSON with a missing comma between two array items
+    test_case = """```json
+{
+  "totalWords": 40000,
+  "tolerance": 2000,
+  "globalGuidelines": [
+    "重点突出技术红利分配失衡、人才流失与职业认同危机等结构性矛盾"
+    "详略策略：技术创新与传统技艺的碰撞"
+  ],
+  "chapters": []
+}
+```"""
+
+    print("\n测试案例：")
+    print(test_case)
+    print("\n" + separator)
+
+    try:
+        parsed = manual_parser.parse(test_case, "手动测试")
+        print("\n✓ 解析成功！")
+        print("\n解析结果：")
+        pretty = json.dumps(parsed, ensure_ascii=False, indent=2)
+        print(pretty)
+    except Exception as e:
+        # Any failure (parsing or dumping) is reported instead of raised
+        print(f"\n✗ 解析失败: {e}")
+
+    print("\n" + separator)
+
+
+if __name__ == "__main__":
+    # 运行手动测试
+    run_manual_test()
+
+    # 运行单元测试
+    print("\n\n运行单元测试...")
+    unittest.main(verbosity=2)
@@ -220,11 +220,23 @@ def display_results(agent: DeepSearchAgent, final_report: str):

        if all_searches:
            for i, search in enumerate(all_searches):
-                with st.expander(f"搜索 {i + 1}: {search.query}"):
-                    st.write("**URL:**", search.url)
-                    st.write("**标题:**", search.title)
-                    st.write("**内容预览:**",
-                             search.content[:200] + "..." if len(search.content) > 200 else search.content)
+                query_label = search.query if search.query else "未记录查询"
+                with st.expander(f"搜索 {i + 1}: {query_label}"):
+                    paragraph_title = getattr(search, "paragraph_title", "") or "未标注段落"
+                    search_tool = getattr(search, "search_tool", "") or "未标注工具"
+                    has_result = getattr(search, "has_result", True)
+                    st.write("**段落:**", paragraph_title)
+                    st.write("**使用的工具:**", search_tool)
+                    preview = search.content or ""
+                    if not isinstance(preview, str):
+                        preview = str(preview)
+                    if len(preview) > 200:
+                        preview = preview[:200] + "..."
+                    st.write("**URL:**", search.url or "无")
+                    st.write("**标题:**", search.title or "无")
+                    st.write("**内容预览:**", preview if preview else "无可用内容")
+                    if not has_result:
+                        st.info("本次搜索未返回结果")
                    if search.score:
                        st.write("**相关度评分:**", search.score)

@@ -4,6 +4,12 @@ Flask主应用 - 统一管理三个Streamlit应用

 import os
 import sys
+
+# 【修复】尽早设置环境变量，确保所有模块都使用无缓冲模式
+os.environ['PYTHONIOENCODING'] = 'utf-8'
+os.environ['PYTHONUTF8'] = '1'
+os.environ['PYTHONUNBUFFERED'] = '1'  # 禁用Python输出缓冲，确保日志实时输出
+
 import subprocess
 import time
 import threading
@@ -30,6 +36,39 @@ app = Flask(__name__)
 app.config['SECRET_KEY'] = 'Dedicated-to-creating-a-concise-and-versatile-public-opinion-analysis-platform'
 socketio = SocketIO(app, cors_allowed_origins="*")

+# eventlet 在客户端主动断开时偶尔会抛出 ConnectionAbortedError，这里做一次防御性包裹，
+# 避免无意义的堆栈污染日志（仅在 eventlet 可用时启用）。
+def _patch_eventlet_disconnect_logging():
+    """Patch eventlet's WSGI ``HttpProtocol.finish`` to log client disconnects.
+
+    The replacement calls the original ``finish`` and, if it raises
+    ``BrokenPipeError``, ``ConnectionResetError`` or ``ConnectionAbortedError``,
+    logs a single warning (with the request method and path when they can be
+    read from ``self.environ``) and returns ``None`` instead of propagating.
+
+    If ``eventlet.wsgi`` cannot be imported, or ``HttpProtocol.finish`` cannot
+    be read, a debug message is logged and nothing is patched.
+    """
+    import functools
+
+    try:
+        import eventlet.wsgi  # type: ignore
+    except Exception as exc:  # pragma: no cover - only succeeds where eventlet is installed
+        logger.debug(f"eventlet 不可用，跳过断开补丁: {exc}")
+        return
+
+    try:
+        original_finish = eventlet.wsgi.HttpProtocol.finish  # type: ignore[attr-defined]
+    except Exception as exc:  # pragma: no cover
+        logger.debug(f"eventlet 缺少 HttpProtocol.finish，跳过断开补丁: {exc}")
+        return
+
+    # wraps() keeps the original name/docstring on the replacement and exposes
+    # the unpatched function as __wrapped__
+    @functools.wraps(original_finish)
+    def _safe_finish(self, *args, **kwargs):  # pragma: no cover - only triggered at runtime
+        try:
+            return original_finish(self, *args, **kwargs)
+        except (BrokenPipeError, ConnectionResetError, ConnectionAbortedError) as exc:
+            try:
+                environ = getattr(self, 'environ', {}) or {}
+                method = environ.get('REQUEST_METHOD', '')
+                path = environ.get('PATH_INFO', '')
+                logger.warning(f"客户端已主动断开，忽略异常: {method} {path} ({exc})")
+            except Exception:
+                # Reading the request details failed; still log the disconnect itself
+                logger.warning(f"客户端已主动断开，忽略异常: {exc}")
+            return
+
+    eventlet.wsgi.HttpProtocol.finish = _safe_finish  # type: ignore[attr-defined]
+    logger.info("已对 eventlet 连接中断进行安全防护")
+
+_patch_eventlet_disconnect_logging()
+
 # 注册ReportEngine Blueprint
 if REPORT_ENGINE_AVAILABLE:
    app.register_blueprint(report_bp, url_prefix='/api/report')
@@ -37,10 +76,6 @@ if REPORT_ENGINE_AVAILABLE:
 else:
    logger.info("ReportEngine不可用，跳过接口注册")

-# 设置UTF-8编码环境
-os.environ['PYTHONIOENCODING'] = 'utf-8'
-os.environ['PYTHONUTF8'] = '1'
-
 # 创建日志目录
 LOG_DIR = Path('logs')
 LOG_DIR.mkdir(exist_ok=True)
@@ -342,79 +377,88 @@ def parse_forum_log_line(line):
    """解析forum.log行内容，提取对话信息"""
    import re
    
-    # 匹配格式: [时间] [来源] 内容
-    pattern = r'\[(\d{2}:\d{2}:\d{2})\]\s*\[([A-Z]+)\]\s*(.*)'
+    # 匹配格式: [时间] [来源] 内容（来源允许大小写及空格）
+    pattern = r'\[(\d{2}:\d{2}:\d{2})\]\s*\[([^\]]+)\]\s*(.*)'
    match = re.match(pattern, line)
    
-    if match:
-        timestamp, source, content = match.groups()
-        
-        # 过滤掉系统消息和空内容
-        if source == 'SYSTEM' or not content.strip():
-            return None
-        
-        # 只处理三个Engine的消息
-        if source not in ['QUERY', 'INSIGHT', 'MEDIA']:
-            return None
-        
-        # 根据来源确定消息类型和发送者
-        message_type = 'agent'
-        sender = f'{source} Engine'
-        
-        return {
-            'type': message_type,
-            'sender': sender,
-            'content': content.strip(),
-            'timestamp': timestamp,
-            'source': source
-        }
+    if not match:
+        return None
+
+    timestamp, raw_source, content = match.groups()
+    source = raw_source.strip().upper()
+
+    # 过滤掉系统消息和空内容
+    if source == 'SYSTEM' or not content.strip():
+        return None
    
-    return None
+    # 支持三个Agent和主持人
+    if source not in ['QUERY', 'INSIGHT', 'MEDIA', 'HOST']:
+        return None
+    
+    # 解码日志中的转义换行，保留多行格式
+    cleaned_content = content.replace('\\n', '\n').replace('\\r', '').strip()
+    
+    # 根据来源确定消息类型和发送者
+    if source == 'HOST':
+        message_type = 'host'
+        sender = 'Forum Host'
+    else:
+        message_type = 'agent'
+        sender = f'{source.title()} Engine'
+    
+    return {
+        'type': message_type,
+        'sender': sender,
+        'content': cleaned_content,
+        'timestamp': timestamp,
+        'source': source
+    }

 # Forum日志监听器
+# 存储每个客户端的历史日志发送位置
+forum_log_positions = {}
+
 def monitor_forum_log():
    """监听forum.log文件变化并推送到前端"""
    import time
    from pathlib import Path
-    
+
    forum_log_file = LOG_DIR / "forum.log"
    last_position = 0
    processed_lines = set()  # 用于跟踪已处理的行，避免重复
-    
-    # 如果文件存在，获取初始位置
+
+    # If the file already exists, start tailing from its current end; existing lines are not marked as processed here
    if forum_log_file.exists():
        with open(forum_log_file, 'r', encoding='utf-8', errors='ignore') as f:
-            # 初始化时读取所有现有行，避免重复处理
-            existing_lines = f.readlines()
-            for line in existing_lines:
-                line_hash = hash(line.strip())
-                processed_lines.add(line_hash)
+            # 记录文件大小，但不添加到processed_lines
+            # 这样用户打开forum标签时可以获取历史
+            f.seek(0, 2)  # 移到文件末尾
            last_position = f.tell()
-    
+
    while True:
        try:
            if forum_log_file.exists():
                with open(forum_log_file, 'r', encoding='utf-8', errors='ignore') as f:
                    f.seek(last_position)
                    new_lines = f.readlines()
-                    
+
                    if new_lines:
                        for line in new_lines:
                            line = line.rstrip('\n\r')
                            if line.strip():
                                line_hash = hash(line.strip())
-                                
+
                                # 避免重复处理同一行
                                if line_hash in processed_lines:
                                    continue
-                                
+
                                processed_lines.add(line_hash)
-                                
+
                                # 解析日志行并发送forum消息
                                parsed_message = parse_forum_log_line(line)
                                if parsed_message:
                                    socketio.emit('forum_message', parsed_message)
-                                
+
                                # 只有在控制台显示forum时才发送控制台消息
                                timestamp = datetime.now().strftime('%H:%M:%S')
                                formatted_line = f"[{timestamp}] {line}"
@@ -422,13 +466,15 @@ def monitor_forum_log():
                                    'app': 'forum',
                                    'line': formatted_line
                                })
-                        
+
                        last_position = f.tell()
-                        
+
                        # 清理processed_lines集合，避免内存泄漏（保留最近1000行的哈希）
                        if len(processed_lines) > 1000:
-                            processed_lines.clear()
-            
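+                            # NOTE: a set has no insertion order, so this keeps an arbitrary 500 hashes (not necessarily the most recent ones)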
+                            recent_hashes = list(processed_lines)[-500:]
+                            processed_lines = set(recent_hashes)
+
            time.sleep(1)  # 每秒检查一次
        except Exception as e:
            logger.error(f"Forum日志监听错误: {e}")
@@ -903,6 +949,57 @@ def get_forum_log():
    except Exception as e:
        return jsonify({'success': False, 'message': f'读取forum.log失败: {str(e)}'})

+@app.route('/api/forum/log/history', methods=['POST'])
+def get_forum_log_history():
+    """Return a page of forum.log lines starting at a client-supplied position.
+
+    Request JSON (all fields optional):
+        position: File position returned by a previous call (default 0;
+            negative values are treated as 0).
+        max_lines: Maximum number of non-empty lines to return (default 1000).
+
+    Response JSON:
+        success, log_lines (each line prefixed with the current server time),
+        position (where the next call should resume) and has_more.
+        On any failure: success=False and a message.
+    """
+    try:
+        # silent=True: a missing or malformed JSON body falls back to the defaults
+        data = request.get_json(silent=True) or {}
+        requested_position = data.get('position')
+        start_position = max(0, int(requested_position)) if requested_position is not None else 0
+        requested_limit = data.get('max_lines')
+        max_lines = int(requested_limit) if requested_limit is not None else 1000
+
+        forum_log_file = LOG_DIR / "forum.log"
+        if not forum_log_file.exists():
+            return jsonify({
+                'success': True,
+                'log_lines': [],
+                'position': 0,
+                'has_more': False
+            })
+
+        with open(forum_log_file, 'r', encoding='utf-8', errors='ignore') as f:
+            # Resume from the position the client last received
+            f.seek(start_position)
+            lines = []
+
+            # Use readline() instead of `for line in f`: iterating a text file
+            # disables tell() until the iterator is exhausted, so breaking out
+            # early at max_lines made f.tell() raise OSError and also consumed
+            # one line that was never returned.
+            while len(lines) < max_lines:
+                raw_line = f.readline()
+                if not raw_line:
+                    break  # end of file
+                line = raw_line.rstrip('\n\r')
+                if line.strip():
+                    # Prefix with the current server time (not the original log time)
+                    timestamp = datetime.now().strftime('%H:%M:%S')
+                    lines.append(f"[{timestamp}] {line}")
+
+            # Position right after the last line that was read
+            current_position = f.tell()
+
+            # Compare against the end of the file to tell whether more remains
+            f.seek(0, 2)
+            end_position = f.tell()
+            has_more = current_position < end_position
+
+        return jsonify({
+            'success': True,
+            'log_lines': lines,
+            'position': current_position,
+            'has_more': has_more
+        })
+    except Exception as e:
+        return jsonify({'success': False, 'message': f'读取forum历史失败: {str(e)}'})
+
@app.route('/api/search', methods=['POST'])
 def search():
    """统一搜索接口"""
@@ -1057,4 +1154,4 @@ if __name__ == '__main__':
        logger.info("\n正在关闭应用...")
        cleanup_processes()
        
-    
+    
@@ -81,7 +81,7 @@ class Settings(BaseSettings):
    TAVILY_API_KEY: Optional[str] = Field(None, description="Tavily API（申请地址：https://www.tavily.com/）API密钥，用于Tavily网络搜索")
    
    # Bocha API（申请地址：https://open.bochaai.com/）
-    BOCHA_BASE_URL: Optional[str] = Field("https://api.bochaai.com/v1/ai-search", description="Bocha AI 搜索BaseUrl或博查网页搜索BaseUrl")
+    BOCHA_BASE_URL: Optional[str] = Field("https://api.bocha.cn/v1/ai-search", description="Bocha AI 搜索BaseUrl或博查网页搜索BaseUrl")
    BOCHA_WEB_SEARCH_API_KEY: Optional[str] = Field(None, description="Bocha API（申请地址：https://open.bochaai.com/）API密钥，用于Bocha搜索")
    
    # ================== Insight Engine 搜索配置 ====================
@@ -0,0 +1,70 @@
+#!/usr/bin/env python
+"""
+PDF导出脚本
+"""
+import json
+import os
+import sys
+from datetime import datetime
+from pathlib import Path
+
+# Make project packages importable from any checkout location instead of one
+# developer's absolute path.
+# NOTE(review): assumes this script sits in the project root — confirm.
+sys.path.insert(0, str(Path(__file__).resolve().parent))
+
+def export_pdf(ir_file_path, output_dir=None):
+    """
+    Render a Document IR JSON file to a PDF and write it to disk.
+
+    Args:
+        ir_file_path: Path of the Document IR JSON file to render.
+        output_dir: Optional directory for the generated PDF. When omitted,
+            ``final_reports/pdf`` next to this script is used.
+
+    Returns:
+        str | None: Path of the written PDF, or None if any step failed
+        (the error and its traceback are printed).
+    """
+    try:
+        # Load the IR document.
+        print(f"正在读取报告文件: {ir_file_path}")
+        with open(ir_file_path, 'r', encoding='utf-8') as f:
+            document_ir = json.load(f)
+
+        # Imported lazily so an import failure is reported by the handler below.
+        from ReportEngine.renderers.pdf_renderer import PDFRenderer
+
+        print("正在初始化PDF渲染器...")
+        renderer = PDFRenderer()
+
+        print("正在生成PDF...")
+        pdf_bytes = renderer.render_to_bytes(document_ir, optimize_layout=True)
+
+        # `metadata` may be missing or null; fall back to a generic topic.
+        topic = (document_ir.get('metadata') or {}).get('topic') or 'report'
+        # The topic is free text: replace everything that is not alphanumeric,
+        # '-' or '_' so it cannot inject path separators into the file name.
+        safe_topic = "".join(
+            c if c.isalnum() or c in "-_" else "_" for c in str(topic)
+        ) or "report"
+
+        # Default to a location relative to this script rather than a
+        # machine-specific absolute path.
+        if output_dir is None:
+            output_dir = Path(__file__).resolve().parent / 'final_reports' / 'pdf'
+        output_dir = Path(output_dir)
+        output_dir.mkdir(parents=True, exist_ok=True)
+
+        pdf_filename = f"report_{safe_topic}_{datetime.now().strftime('%Y%m%d_%H%M%S')}.pdf"
+        output_path = output_dir / pdf_filename
+
+        print(f"正在保存PDF到: {output_path}")
+        with open(output_path, 'wb') as f:
+            f.write(pdf_bytes)
+
+        print("✅ PDF导出成功！")
+        print(f"文件位置: {output_path}")
+        print(f"文件大小: {len(pdf_bytes) / 1024 / 1024:.2f} MB")
+
+        return str(output_path)
+
+    except Exception as e:
+        # Best-effort script: report the failure and signal it with None.
+        print(f"❌ PDF导出失败: {str(e)}")
+        import traceback
+        traceback.print_exc()
+        return None
+
+if __name__ == "__main__":
+    # Report selection: an explicit path given as the first command-line
+    # argument wins; otherwise pick the most recently modified IR file from
+    # `final_reports/ir` next to this script.
+    ir_dir = Path(__file__).resolve().parent / 'final_reports' / 'ir'
+    if len(sys.argv) > 1:
+        latest_report = sys.argv[1]
+    else:
+        candidates = list(ir_dir.glob('*.json')) if ir_dir.is_dir() else []
+        latest_report = (
+            str(max(candidates, key=lambda p: p.stat().st_mtime)) if candidates else ""
+        )
+
+    if latest_report and os.path.exists(latest_report):
+        print("="*50)
+        print("开始导出PDF")
+        print("="*50)
+        result = export_pdf(latest_report)
+        if result:
+            print(f"\n📄 PDF文件已生成: {result}")
+    else:
+        # When nothing was found, show the directory that was searched.
+        print(f"❌ 报告文件不存在: {latest_report or ir_dir}")
@@ -0,0 +1,335 @@
+"""
+使用最新的章节JSON重新装订并渲染HTML报告。
+"""
+
+import json
+import sys
+from datetime import datetime
+from pathlib import Path
+from loguru import logger
+
+# 确保可以找到项目内模块
+sys.path.insert(0, str(Path(__file__).parent))
+
+from ReportEngine.core import ChapterStorage, DocumentComposer
+from ReportEngine.ir import IRValidator
+from ReportEngine.renderers import HTMLRenderer
+from ReportEngine.utils.config import settings
+
+
+def find_latest_run_dir(chapter_root: Path):
+    """
+    Locate the most recent run directory under the chapter root.
+
+    Every sub-directory of `chapter_root` that contains a `manifest.json`
+    is a candidate; the one whose manifest has the newest modification time
+    wins. An error is logged and None returned when the root does not exist
+    or no candidate is found.
+
+    Args:
+        chapter_root: Root directory of chapter output.
+
+    Returns:
+        Path | None: The newest run directory, or None if none was found.
+    """
+    if not chapter_root.exists():
+        logger.error(f"章节目录不存在: {chapter_root}")
+        return None
+
+    # Pair each run directory with its manifest's mtime.
+    stamped = [
+        (entry, (entry / "manifest.json").stat().st_mtime)
+        for entry in chapter_root.iterdir()
+        if entry.is_dir() and (entry / "manifest.json").exists()
+    ]
+
+    if not stamped:
+        logger.error("未找到带 manifest.json 的章节目录")
+        return None
+
+    # max() returns the first maximal element, like a stable descending sort.
+    newest, _ = max(stamped, key=lambda pair: pair[1])
+    logger.info(f"找到最新run目录: {newest.name}")
+    return newest
+
+
+def load_manifest(run_dir: Path):
+    """
+    Read `manifest.json` from a single run directory.
+
+    Args:
+        run_dir: Run directory that contains `manifest.json`.
+
+    Returns:
+        tuple[str | None, dict | None]: `(report_id, metadata)`. The report
+        id falls back to the directory name and the metadata to `{}` when
+        absent. `(None, None)` is returned (after logging) if the manifest
+        cannot be read or parsed.
+    """
+    manifest_file = run_dir / "manifest.json"
+    try:
+        manifest = json.loads(manifest_file.read_text(encoding="utf-8"))
+        report_id = manifest.get("reportId") or run_dir.name
+        metadata = manifest.get("metadata") or {}
+        logger.info(f"报告ID: {report_id}")
+        created_at = manifest.get("createdAt")
+        if created_at:
+            logger.info(f"创建时间: {created_at}")
+        return report_id, metadata
+    except Exception as exc:
+        logger.error(f"读取manifest失败: {exc}")
+        return None, None
+
+
+def load_chapters(run_dir: Path):
+    """
+    Load every chapter JSON of one run directory.
+
+    Delegates to `ChapterStorage.load_chapters` and logs how many chapters
+    were loaded.
+
+    Args:
+        run_dir: Chapter directory of a single report run.
+
+    Returns:
+        The chapter list returned by `ChapterStorage.load_chapters`.
+    """
+    chapters = ChapterStorage(settings.CHAPTER_OUTPUT_DIR).load_chapters(run_dir)
+    logger.info(f"加载章节数: {len(chapters)}")
+    return chapters
+
+
+def validate_chapters(chapters):
+    """
+    Run a quick structural check on every chapter with `IRValidator`.
+
+    Failures are only logged (chapter id plus the first three errors); the
+    function never raises on an invalid chapter and returns nothing.
+
+    Args:
+        chapters: List of chapter JSON objects.
+    """
+    validator = IRValidator()
+    failures = []
+    for chapter in chapters:
+        passed, problems = validator.validate_chapter(chapter)
+        if passed:
+            continue
+        failures.append((chapter.get("chapterId") or "unknown", problems))
+
+    if not failures:
+        logger.info("章节结构校验通过")
+        return
+
+    logger.warning(f"有 {len(failures)} 个章节未通过结构校验，将继续装订：")
+    for chapter_id, problems in failures:
+        # Keep the log short: show at most three errors per chapter.
+        preview = "; ".join(problems[:3])
+        logger.warning(f"  - {chapter_id}: {preview}")
+
+
+def stitch_document(report_id, metadata, chapters):
+    """
+    Bind chapters and metadata into a complete Document IR.
+
+    Uses `DocumentComposer.build_document` and logs the resulting chapter
+    and chart counts.
+
+    Args:
+        report_id: Report id (from the manifest or the directory name).
+        metadata: Global metadata from the manifest.
+        chapters: Chapters that were already loaded.
+
+    Returns:
+        dict: The composed Document IR.
+    """
+    document_ir = DocumentComposer().build_document(report_id, metadata, chapters)
+    chapter_total = len(document_ir.get('chapters', []))
+    chart_total = count_charts(document_ir)
+    logger.info(f"装订完成: {chapter_total} 个章节，{chart_total} 个图表")
+    return document_ir
+
+
+def count_charts(document_ir):
+    """
+    Count the Chart.js widgets in a whole Document IR.
+
+    Sums `_count_chart_blocks` over the `blocks` of every chapter.
+
+    Args:
+        document_ir: Complete Document IR.
+
+    Returns:
+        int: Total number of charts.
+    """
+    return sum(
+        _count_chart_blocks(chapter.get("blocks", []))
+        for chapter in document_ir.get("chapters", [])
+    )
+
+
+def _count_chart_blocks(blocks):
+    """
+    Recursively count Chart.js widgets in a list of blocks.
+
+    A block counts when its `type` is "widget" and its `widgetType` starts
+    with "chart.js". Nested content is followed through a block's `blocks`
+    list, the sub-lists of a "list" block's `items`, and the `blocks` of each
+    cell in a "table" block's `rows`. Malformed nodes (null or non-list
+    containers, non-dict rows/cells/blocks) are skipped instead of raising,
+    since the IR may come from imperfect generated JSON.
+
+    Args:
+        blocks: Block list at any nesting level; anything that is not a
+            list/tuple is treated as empty.
+
+    Returns:
+        int: Number of chart.js widgets found.
+    """
+    if not isinstance(blocks, (list, tuple)):
+        return 0
+
+    count = 0
+    for block in blocks:
+        if not isinstance(block, dict):
+            continue
+        block_type = block.get("type")
+        if block_type == "widget" and str(block.get("widgetType", "")).startswith("chart.js"):
+            count += 1
+        # Generic nesting; a non-list value is ignored by the guard above.
+        nested = block.get("blocks")
+        if isinstance(nested, list):
+            count += _count_chart_blocks(nested)
+        if block_type == "list":
+            items = block.get("items")
+            if isinstance(items, list):
+                for item in items:
+                    if isinstance(item, list):
+                        count += _count_chart_blocks(item)
+        if block_type == "table":
+            rows = block.get("rows")
+            if isinstance(rows, list):
+                for row in rows:
+                    if not isinstance(row, dict):
+                        continue
+                    cells = row.get("cells")
+                    if not isinstance(cells, list):
+                        continue
+                    for cell in cells:
+                        if isinstance(cell, dict):
+                            cell_blocks = cell.get("blocks", [])
+                            if isinstance(cell_blocks, list):
+                                count += _count_chart_blocks(cell_blocks)
+    return count
+
+
+def save_document_ir(document_ir, base_name, timestamp):
+    """
+    Persist the re-bound Document IR as JSON.
+
+    The file is written to `settings.DOCUMENT_IR_OUTPUT_DIR` (created when
+    missing) as `report_ir_{base_name}_{timestamp}_regen.json`.
+
+    Args:
+        document_ir: The composed Document IR.
+        base_name: File-name-safe fragment derived from the topic/title.
+        timestamp: Timestamp string that distinguishes regenerations.
+
+    Returns:
+        Path: Location of the saved IR file.
+    """
+    target_dir = Path(settings.DOCUMENT_IR_OUTPUT_DIR)
+    target_dir.mkdir(parents=True, exist_ok=True)
+    ir_path = target_dir / f"report_ir_{base_name}_{timestamp}_regen.json"
+    serialized = json.dumps(document_ir, ensure_ascii=False, indent=2)
+    ir_path.write_text(serialized, encoding="utf-8")
+    logger.info(f"IR已保存: {ir_path}")
+    return ir_path
+
+
+def render_html(document_ir, base_name, timestamp):
+    """
+    Render a Document IR to HTML with `HTMLRenderer` and save it.
+
+    The HTML is written to the `html` sub-directory of `settings.OUTPUT_DIR`
+    as `report_html_{base_name}_{timestamp}.html`. The file size and the
+    renderer's chart validation statistics are logged afterwards.
+
+    Args:
+        document_ir: The composed Document IR.
+        base_name: File-name fragment derived from the topic/title.
+        timestamp: Timestamp string.
+
+    Returns:
+        Path: Location of the generated HTML file.
+    """
+    renderer = HTMLRenderer()
+    html_content = renderer.render(document_ir)
+
+    html_dir = Path(settings.OUTPUT_DIR) / "html"
+    html_dir.mkdir(parents=True, exist_ok=True)
+    html_path = html_dir / f"report_html_{base_name}_{timestamp}.html"
+    html_path.write_text(html_content, encoding="utf-8")
+
+    size_mb = html_path.stat().st_size / (1024 * 1024)
+    logger.info(f"HTML生成成功: {html_path} ({size_mb:.2f} MB)")
+
+    # Read the counters in the same order they appear in the log line.
+    total = renderer.chart_validation_stats.get('total', 0)
+    valid = renderer.chart_validation_stats.get('valid', 0)
+    repaired = (
+        renderer.chart_validation_stats.get('repaired_locally', 0)
+        + renderer.chart_validation_stats.get('repaired_api', 0)
+    )
+    failed = renderer.chart_validation_stats.get('failed', 0)
+    logger.info(
+        f"图表验证统计: total={total}, valid={valid}, repaired={repaired}, failed={failed}"
+    )
+    return html_path
+
+
+def build_slug(text):
+    """
+    Turn a topic/title into a file-system-safe fragment.
+
+    Keeps alphanumeric characters, spaces, hyphens and underscores, trims
+    surrounding whitespace, converts spaces to underscores and truncates to
+    60 characters. Falsy input or an empty result yields "report".
+
+    Args:
+        text: Original topic or title.
+
+    Returns:
+        str: The cleaned fragment.
+    """
+    raw = str(text or "report")
+    extra_allowed = {" ", "-", "_"}
+    kept = [ch for ch in raw if ch.isalnum() or ch in extra_allowed]
+    slug = "".join(kept).strip().replace(" ", "_")[:60]
+    return slug if slug else "report"
+
+
+def main():
+    """
+    Entry point: re-bind the newest chapters into an IR and render HTML.
+
+    Steps:
+        1) find the newest chapter run directory and read its manifest;
+        2) load the chapters and validate them (warnings only);
+        3) compose the Document IR and save a copy;
+        4) render the HTML and log both output paths.
+
+    Returns:
+        int: 0 on success, 1 when a required input is missing.
+    """
+    logger.info("🚀 使用最新的LLM章节重新装订并渲染HTML")
+
+    latest_run = find_latest_run_dir(Path(settings.CHAPTER_OUTPUT_DIR))
+    if not latest_run:
+        return 1
+
+    report_id, metadata = load_manifest(latest_run)
+    if not report_id or metadata is None:
+        return 1
+
+    chapters = load_chapters(latest_run)
+    if not chapters:
+        logger.error("未找到章节JSON，无法装订")
+        return 1
+
+    validate_chapters(chapters)
+    document_ir = stitch_document(report_id, metadata, chapters)
+
+    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+    # First non-empty of query / title / reportId, else the manifest report id.
+    slug_source = (
+        metadata.get("query") or metadata.get("title") or metadata.get("reportId") or report_id
+    )
+    base_name = build_slug(slug_source)
+
+    ir_path = save_document_ir(document_ir, base_name, timestamp)
+    html_path = render_html(document_ir, base_name, timestamp)
+
+    for line in (
+        "",
+        "🎉 HTML装订与渲染完成",
+        f"IR文件: {ir_path.resolve()}",
+        f"HTML文件: {html_path.resolve()}",
+    ):
+        logger.info(line)
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())
@@ -0,0 +1,194 @@
+"""
+使用新的SVG矢量图表功能重新生成最新报告的PDF
+"""
+
+import json
+import sys
+from pathlib import Path
+from datetime import datetime
+from loguru import logger
+
+# 添加项目路径
+sys.path.insert(0, str(Path(__file__).parent))
+
+from ReportEngine.renderers import PDFRenderer
+
+def find_latest_report():
+    """
+    Find the newest report IR JSON in `final_reports/ir`.
+
+    The directory is resolved relative to the current working directory.
+    The `*.json` file with the latest modification time is chosen; an error
+    is logged and None returned if the directory or files are missing.
+
+    Returns:
+        Path | None: The newest IR file, or None when nothing was found.
+    """
+    ir_dir = Path("final_reports/ir")
+    if not ir_dir.exists():
+        logger.error(f"报告目录不存在: {ir_dir}")
+        return None
+
+    candidates = list(ir_dir.glob("*.json"))
+    if not candidates:
+        logger.error("未找到报告文件")
+        return None
+
+    # max() picks the first file with the greatest mtime.
+    newest = max(candidates, key=lambda path: path.stat().st_mtime)
+    logger.info(f"找到最新报告: {newest.name}")
+    return newest
+
+def load_document_ir(file_path):
+    """
+    Load a Document IR JSON file and log its chapter and chart counts.
+
+    Only failing to read/parse the file, or a top-level value that is not a
+    JSON object, makes the load fail. The chapter/chart statistics are
+    informational: a problem while computing them is logged as a warning and
+    the loaded document is still returned.
+
+    Args:
+        file_path: Path of the IR file (`str` or `Path`).
+
+    Returns:
+        dict | None: The parsed Document IR, or None on failure.
+    """
+    try:
+        with open(file_path, 'r', encoding='utf-8') as f:
+            document_ir = json.load(f)
+    except Exception as e:
+        logger.error(f"加载报告失败: {e}")
+        return None
+
+    if not isinstance(document_ir, dict):
+        logger.error(f"加载报告失败: 顶层JSON不是对象（{type(document_ir).__name__}）")
+        return None
+
+    # Path() makes `.name` work for plain string paths too.
+    logger.info(f"成功加载报告: {Path(file_path).name}")
+
+    def count_charts(blocks):
+        """Recursively count Chart.js widgets, skipping malformed nodes."""
+        if not isinstance(blocks, list):
+            return 0
+        count = 0
+        for block in blocks:
+            if not isinstance(block, dict):
+                continue
+            # str() guards against a null or non-string widgetType.
+            widget_type = str(block.get('widgetType') or '')
+            if block.get('type') == 'widget' and widget_type.startswith('chart.js'):
+                count += 1
+            # Follow nested blocks.
+            count += count_charts(block.get('blocks'))
+        return count
+
+    try:
+        chapters = document_ir.get('chapters') or []
+        chart_count = sum(
+            count_charts(chapter.get('blocks'))
+            for chapter in chapters
+            if isinstance(chapter, dict)
+        )
+        logger.info(f"报告包含 {len(chapters)} 个章节，{chart_count} 个图表")
+    except Exception as e:
+        # Statistics must never discard a successfully loaded document.
+        logger.warning(f"统计章节/图表数量失败: {e}")
+
+    return document_ir
+
+def generate_pdf_with_vector_charts(document_ir, output_path):
+    """
+    Render a Document IR to a PDF with `PDFRenderer`.
+
+    Calls `render_to_pdf` with layout optimisation enabled, then logs the
+    result path and file size. Any exception is logged with its traceback
+    and converted into a None return value.
+
+    Args:
+        document_ir: Complete Document IR.
+        output_path: Target PDF path.
+
+    Returns:
+        Path | None: Path of the generated PDF, or None on failure.
+    """
+    try:
+        logger.info("=" * 60)
+        logger.info("开始生成PDF（带矢量图表）")
+        logger.info("=" * 60)
+
+        renderer = PDFRenderer()
+
+        # Path() tolerates a renderer that returns the location as a string.
+        result_path = Path(
+            renderer.render_to_pdf(
+                document_ir,
+                output_path,
+                optimize_layout=True
+            )
+        )
+
+        logger.info("=" * 60)
+        logger.info(f"✓ PDF生成成功: {result_path}")
+        logger.info("=" * 60)
+
+        # Report the file size.
+        file_size = result_path.stat().st_size
+        size_mb = file_size / (1024 * 1024)
+        logger.info(f"文件大小: {size_mb:.2f} MB")
+
+        return result_path
+
+    except Exception as e:
+        # loguru has no `exc_info` option: extra kwargs are treated as
+        # str.format() arguments, which drops the traceback and can raise if
+        # the message contains braces. logger.exception() records the traceback.
+        logger.exception(f"生成PDF失败: {e}")
+        return None
+
+def main():
+    """
+    Entry point: regenerate the newest report as a vector-chart PDF.
+
+    Steps:
+        1) find the newest IR file;
+        2) load it and log its structure;
+        3) build the output file name and make sure the directory exists;
+        4) render the PDF and log the resulting path.
+
+    Returns:
+        int: 0 on success, 1 on failure.
+    """
+    logger.info("🚀 使用SVG矢量图表重新生成最新报告的PDF")
+    logger.info("")
+
+    # 1. Newest report.
+    latest_report = find_latest_report()
+    if not latest_report:
+        logger.error("未找到报告文件")
+        return 1
+
+    # 2. Report data.
+    document_ir = load_document_ir(latest_report)
+    if not document_ir:
+        logger.error("加载报告失败")
+        return 1
+
+    # 3. Output location.
+    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+    report_name = latest_report.stem.replace("report_ir_", "")
+    output_path = Path("final_reports/pdf") / f"report_vector_{report_name}_{timestamp}.pdf"
+    output_path.parent.mkdir(parents=True, exist_ok=True)
+
+    logger.info(f"输出路径: {output_path}")
+    logger.info("")
+
+    # 4. Render.
+    result = generate_pdf_with_vector_charts(document_ir, output_path)
+    if not result:
+        logger.error("❌ PDF生成失败")
+        return 1
+
+    for line in (
+        "",
+        "🎉 PDF生成完成！",
+        "",
+        "特性说明:",
+        "  ✓ 图表以SVG矢量格式渲染",
+        "  ✓ 支持无限缩放不失真",
+        "  ✓ 保留完整的图表视觉效果",
+        "  ✓ 折线图、柱状图、饼图等均为矢量曲线",
+        "",
+    ):
+        logger.info(line)
+    logger.info(f"PDF文件位置: {result.absolute()}")
+    return 0
+
+if __name__ == "__main__":
+    sys.exit(main())
@@ -0,0 +1,484 @@
+#!/usr/bin/env python
+"""
+Report Engine 命令行版本
+
+这是一个不需要前端的命令行报告生成程序。
+主要流程：
+1. 检查PDF依赖
+2. 获取最新的log、md文件
+3. 直接调用Report Engine生成报告（跳过文件增加审核）
+4. 自动保存HTML和PDF（如果有依赖）到final_reports/
+
+使用方法：
+    python report_engine_only.py [选项]
+
+选项：
+    --query QUERY     指定报告主题（可选，默认从文件名提取）
+    --skip-pdf        跳过PDF生成（即使有依赖）
+    --verbose         显示详细日志
+    --help            显示帮助信息
+"""
+
+import os
+import sys
+import json
+import argparse
+from pathlib import Path
+from datetime import datetime
+from typing import Dict, Any, Optional
+
+from loguru import logger
+
+# 全局配置
+VERBOSE = False
+
+# 配置日志
+def setup_logger(verbose: bool = False):
+    """
+    Configure loguru to log to stdout.
+
+    Stores the flag in the module-level `VERBOSE`, removes the existing
+    handlers and adds one stdout sink at DEBUG (verbose) or INFO level.
+
+    Args:
+        verbose: Whether to emit DEBUG-level messages.
+    """
+    global VERBOSE
+    VERBOSE = verbose
+
+    log_level = "DEBUG" if verbose else "INFO"
+    log_format = "<green>{time:YYYY-MM-DD HH:mm:ss}</green> | <level>{level: <8}</level> | <level>{message}</level>"
+
+    logger.remove()  # drop the default handler
+    logger.add(sys.stdout, format=log_format, level=log_level)
+
+
+def check_dependencies() -> tuple[bool, Optional[str]]:
+    """
+    Check the system dependencies needed for PDF generation.
+
+    Delegates to `check_pango_available` and logs the outcome. If the check
+    itself raises, the error is logged and PDF support is reported as
+    unavailable.
+
+    Returns:
+        tuple: `(is_available, message)`
+            - is_available: whether PDF generation can be used
+            - message: result message of the check (the error text on failure)
+    """
+    separator = "=" * 70
+    logger.info(separator)
+    logger.info("步骤 1/4: 检查系统依赖")
+    logger.info(separator)
+
+    try:
+        from ReportEngine.utils.dependency_check import check_pango_available
+        is_available, message = check_pango_available()
+
+        if not is_available:
+            logger.warning("⚠ PDF 依赖缺失，仅生成 HTML 文件")
+            logger.info("\n" + message)
+            return is_available, message
+
+        logger.success("✓ PDF 依赖检测通过，将同时生成 HTML 和 PDF 文件")
+        return is_available, message
+    except Exception as e:
+        logger.error(f"依赖检查失败: {e}")
+        return False, str(e)
+
+
+def get_latest_engine_reports() -> Dict[str, str]:
+    """
+    Collect the newest `.md` report of each of the three engine directories.
+
+    Directories are resolved relative to the current working directory.
+    Missing or empty directories are skipped with a warning. The process
+    exits with status 1 if no report is found at all.
+
+    Returns:
+        Dict[str, str]: Engine name mapped to the path of its newest report.
+    """
+    logger.info("\n" + "=" * 70)
+    logger.info("步骤 2/4: 获取最新的分析引擎报告")
+    logger.info("=" * 70)
+
+    # Report directory of each engine.
+    directories = {
+        'insight': 'insight_engine_streamlit_reports',
+        'media': 'media_engine_streamlit_reports',
+        'query': 'query_engine_streamlit_reports'
+    }
+
+    latest_files = {}
+    for engine, directory in directories.items():
+        label = engine.capitalize()
+
+        if not os.path.exists(directory):
+            logger.warning(f"⚠ {label} Engine 目录不存在: {directory}")
+            continue
+
+        # Full paths of all Markdown files in the directory.
+        candidates = [
+            os.path.join(directory, name)
+            for name in os.listdir(directory)
+            if name.endswith('.md')
+        ]
+        if not candidates:
+            logger.warning(f"⚠ {label} Engine 目录中没有找到 .md 文件")
+            continue
+
+        # Most recently modified file wins.
+        latest_files[engine] = max(candidates, key=os.path.getmtime)
+        logger.info(f"✓ 找到 {label} Engine 最新报告")
+
+    if not latest_files:
+        logger.error("❌ 未找到任何引擎报告文件，请先运行分析引擎生成报告")
+        sys.exit(1)
+
+    logger.info(f"\n共找到 {len(latest_files)} 个引擎的最新报告")
+    return latest_files
+
+
+def confirm_file_selection(latest_files: Dict[str, str]) -> bool:
+    """
+    Show the selected files and ask the user to confirm them.
+
+    Args:
+        latest_files: Engine name mapped to report path.
+
+    Returns:
+        bool: True when the user answers with an empty line, "y" or "yes"
+        (case-insensitive); False for any other answer or on Ctrl-C / EOF.
+    """
+    divider = "=" * 70
+    logger.info("\n" + divider)
+    logger.info("请确认以下选择的文件：")
+    logger.info(divider)
+
+    for engine, file_path in latest_files.items():
+        # Human-readable modification time of the file.
+        modified = datetime.fromtimestamp(os.path.getmtime(file_path))
+        mtime_str = modified.strftime('%Y-%m-%d %H:%M:%S')
+
+        logger.info(f"  {engine.capitalize()} Engine:")
+        logger.info(f"    文件名: {os.path.basename(file_path)}")
+        logger.info(f"    路径: {file_path}")
+        logger.info(f"    修改时间: {mtime_str}")
+        logger.info("")
+
+    logger.info(divider)
+
+    try:
+        answer = input("是否使用以上文件生成报告? [Y/n]: ").strip().lower()
+
+        # Yes is the default, so an empty answer also confirms.
+        if answer not in ('', 'y', 'yes'):
+            logger.warning("✗ 用户取消操作")
+            return False
+
+        logger.success("✓ 用户确认，继续生成报告")
+        return True
+    except (KeyboardInterrupt, EOFError):
+        logger.warning("\n✗ 用户取消操作")
+        return False
+
+
+def load_engine_reports(latest_files: Dict[str, str]) -> list[str]:
+    """
+    Read the text of every selected engine report.
+
+    A file that cannot be read is logged and skipped, so the result may
+    contain fewer entries than `latest_files`.
+
+    Args:
+        latest_files: Engine name mapped to report path.
+
+    Returns:
+        list[str]: Report contents in the iteration order of `latest_files`.
+    """
+    loaded = []
+    for engine in latest_files:
+        try:
+            with open(latest_files[engine], 'r', encoding='utf-8') as handle:
+                text = handle.read()
+                loaded.append(text)
+                logger.debug(f"已加载 {engine} 报告，长度: {len(text)} 字符")
+        except Exception as e:
+            logger.error(f"加载 {engine} 报告失败: {e}")
+    return loaded
+
+
+def extract_query_from_reports(latest_files: Dict[str, str]) -> str:
+    """
+    Derive the report topic from the report file names.
+
+    File names are expected to look like `<prefix>_<topic>_<timestamp>.md`.
+    The prefix (first `_` segment) and the timestamp are removed and the
+    remainder is the topic. A timestamp written as `YYYYMMDD_HHMMSS` spans
+    two segments and is removed as a unit; otherwise only the last segment
+    is dropped. The first file that yields a non-empty topic wins.
+
+    Args:
+        latest_files: Engine name mapped to report path.
+
+    Returns:
+        str: The extracted topic, or "综合分析报告" if none can be extracted.
+    """
+    for file_path in latest_files.values():
+        # splitext removes only the real extension, not ".md" inside the name.
+        stem, _ext = os.path.splitext(os.path.basename(file_path))
+        if '_' not in stem:
+            continue
+
+        parts = stem.split('_')
+        has_split_timestamp = (
+            len(parts) > 3
+            and len(parts[-2]) == 8 and parts[-2].isdigit()
+            and len(parts[-1]) == 6 and parts[-1].isdigit()
+        )
+        if has_split_timestamp:
+            # "..._20251119_235407": drop both timestamp segments.
+            topic_parts = parts[1:-2]
+        elif len(parts) > 2:
+            topic_parts = parts[1:-1]
+        else:
+            # Only "<prefix>_<topic>": there is no timestamp to drop.
+            topic_parts = parts[1:]
+
+        topic = '_'.join(topic_parts)
+        if topic:
+            return topic
+
+    # Nothing usable found: fall back to the default title.
+    return "综合分析报告"
+
+
+def generate_report(reports: list[str], query: str, pdf_available: bool) -> Dict[str, Any]:
+    """
+    Generate the combined report with the Report Engine.
+
+    Creates a `ReportAgent`, streams its progress events to the log and
+    returns whatever `agent.generate_report` returns. On any exception the
+    error is logged with its traceback and the process exits with status 1.
+
+    Args:
+        reports: Contents of the engine reports.
+        query: Report topic.
+        pdf_available: Whether PDF generation is available (accepted but not
+            read inside this function).
+
+    Returns:
+        Dict[str, Any]: Result dictionary produced by the agent.
+    """
+    logger.info("\n" + "=" * 70)
+    logger.info("步骤 3/4: 生成综合报告")
+    logger.info("=" * 70)
+    logger.info(f"报告主题: {query}")
+    logger.info(f"输入报告数量: {len(reports)}")
+
+    try:
+        from ReportEngine.agent import ReportAgent
+
+        logger.info("正在初始化 Report Engine...")
+        agent = ReportAgent()
+
+        # Stage name -> builder of the INFO lines to log for that stage.
+        stage_lines = {
+            'agent_start': lambda p: [f"开始生成报告: {p.get('report_id', '')}"],
+            'template_selected': lambda p: [f"✓ 已选择模板: {p.get('template', '')}"],
+            'template_sliced': lambda p: [f"✓ 模板解析完成，共 {p.get('section_count', 0)} 个章节"],
+            'layout_designed': lambda p: [
+                "✓ 文档布局设计完成",
+                f"  标题: {p.get('title', '')}",
+            ],
+            'word_plan_ready': lambda p: [f"✓ 篇幅规划完成，目标章节数: {p.get('chapter_targets', 0)}"],
+            'chapters_compiled': lambda p: [f"✓ 章节生成完成，共 {p.get('chapter_count', 0)} 个章节"],
+            'html_rendered': lambda p: ["✓ HTML 渲染完成"],
+            'report_saved': lambda p: ["✓ 报告已保存"],
+        }
+
+        def stream_handler(event_type: str, payload: Dict[str, Any]):
+            """Translate Report Engine stream events into log lines."""
+            if event_type == 'error':
+                logger.error(f"错误: {payload.get('message', '')}")
+                return
+
+            if event_type == 'stage':
+                stage = payload.get('stage', '')
+                # Only string stage names can match; unknown stages are ignored.
+                build_lines = stage_lines.get(stage) if isinstance(stage, str) else None
+                if build_lines is not None:
+                    for line in build_lines(payload):
+                        logger.info(line)
+                return
+
+            if event_type != 'chapter_status':
+                return
+
+            title = payload.get('title', '')
+            status = payload.get('status', '')
+            if status == 'generating':
+                logger.info(f"  正在生成章节: {title}")
+            elif status == 'completed':
+                attempt = payload.get('attempt', 1)
+                if payload.get('warning', ''):
+                    logger.warning(f"  ✓ 章节完成: {title} (第 {attempt} 次尝试，{payload.get('warningMessage', '')})")
+                else:
+                    logger.success(f"  ✓ 章节完成: {title}")
+
+        logger.info("开始生成报告，这可能需要几分钟时间...")
+        result = agent.generate_report(
+            query=query,
+            reports=reports,
+            forum_logs="",        # forum logs are not used
+            custom_template="",   # let the engine choose the template
+            save_report=True,     # persist the report automatically
+            stream_handler=stream_handler
+        )
+
+        logger.success("✓ 报告生成成功！")
+        return result
+
+    except Exception as e:
+        logger.exception(f"❌ 报告生成失败: {e}")
+        sys.exit(1)
+
+
+def save_pdf(document_ir_path: str, query: str) -> Optional[str]:
+    """
+    Render a PDF from an IR file and save it under `final_reports/pdf`.
+
+    The file is named `final_report_{query}_{timestamp}.pdf`, where the
+    query is reduced to alphanumerics, '-' and '_' (spaces become '_') and
+    cut to 30 characters.
+
+    Args:
+        document_ir_path: Path of the Document IR file.
+        query: Report topic.
+
+    Returns:
+        Optional[str]: Path of the PDF, or None if generation failed (the
+        error is logged with its traceback).
+    """
+    logger.info("\n正在生成 PDF 文件...")
+
+    try:
+        # Load the IR data.
+        with open(document_ir_path, 'r', encoding='utf-8') as ir_file:
+            document_ir = json.load(ir_file)
+
+        from ReportEngine.renderers import PDFRenderer
+        renderer = PDFRenderer()
+
+        # Build a safe output file name.
+        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+        kept_chars = [c for c in query if c.isalnum() or c in (" ", "-", "_")]
+        query_safe = "".join(kept_chars).rstrip()
+        query_safe = query_safe.replace(" ", "_")[:30] or "report"
+
+        pdf_dir = Path("final_reports") / "pdf"
+        pdf_dir.mkdir(parents=True, exist_ok=True)
+        pdf_path = pdf_dir / f"final_report_{query_safe}_{timestamp}.pdf"
+
+        # Write the PDF directly through render_to_pdf.
+        logger.info(f"开始渲染PDF: {pdf_path}")
+        result_path = renderer.render_to_pdf(
+            document_ir,
+            pdf_path,
+            optimize_layout=True
+        )
+
+        # Report the file size.
+        size_mb = result_path.stat().st_size / (1024 * 1024)
+        logger.success(f"✓ PDF 已保存: {pdf_path}")
+        logger.info(f"  文件大小: {size_mb:.2f} MB")
+
+        return str(result_path)
+
+    except Exception as e:
+        logger.exception(f"❌ PDF 生成失败: {e}")
+        return None
+
+
+def parse_arguments():
+    """
+    Parse the command-line arguments.
+
+    Supported options: `--query` (report topic, default None), `--skip-pdf`
+    (flag) and `--verbose` (flag).
+
+    Returns:
+        argparse.Namespace: The parsed arguments.
+    """
+    usage_epilog = """
+示例:
+  python report_engine_only.py
+  python report_engine_only.py --query "土木工程行业分析"
+  python report_engine_only.py --skip-pdf --verbose
+
+注意:
+  程序会自动获取三个引擎目录中的最新报告文件，
+  不进行文件增加审核，直接生成综合报告。
+        """
+
+    parser = argparse.ArgumentParser(
+        description="Report Engine 命令行版本 - 无需前端的报告生成工具",
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+        epilog=usage_epilog
+    )
+
+    parser.add_argument('--query', type=str, default=None, help='指定报告主题（默认从文件名自动提取）')
+    parser.add_argument('--skip-pdf', action='store_true', help='跳过PDF生成（即使系统支持）')
+    parser.add_argument('--verbose', action='store_true', help='显示详细日志信息')
+
+    return parser.parse_args()
+
+
+def main():
+    """
+    Command-line entry point of the Report Engine.
+
+    Flow: parse arguments and configure logging, check the PDF dependencies,
+    pick and confirm the newest engine reports, generate the combined report
+    and, when PDF support is available and not skipped, render a PDF from
+    the saved IR. The final summary lists the PDF only if one was actually
+    produced.
+
+    Exits with status 0 when the user declines the file selection and with
+    status 1 when no report content could be loaded.
+    """
+    # Parse the command line and configure logging.
+    args = parse_arguments()
+    setup_logger(verbose=args.verbose)
+
+    logger.info("\n")
+    logger.info("╔" + "═" * 68 + "╗")
+    logger.info("║" + " " * 20 + "Report Engine 命令行版本" + " " * 24 + "║")
+    logger.info("╚" + "═" * 68 + "╝")
+    logger.info("\n")
+
+    # Step 1: dependency check.
+    pdf_available, _ = check_dependencies()
+
+    # An explicit --skip-pdf disables PDF generation.
+    if args.skip_pdf:
+        logger.info("用户指定 --skip-pdf，将跳过 PDF 生成")
+        pdf_available = False
+
+    # Step 2: newest engine reports.
+    latest_files = get_latest_engine_reports()
+
+    if not confirm_file_selection(latest_files):
+        logger.info("\n程序已退出")
+        sys.exit(0)
+
+    reports = load_engine_reports(latest_files)
+    if not reports:
+        logger.error("❌ 未能加载任何报告内容")
+        sys.exit(1)
+
+    # Use the given topic, or derive one from the file names.
+    query = args.query if args.query else extract_query_from_reports(latest_files)
+    logger.info(f"使用报告主题: {query}")
+
+    # Step 3: generate the report.
+    result = generate_report(reports, query, pdf_available)
+
+    # Step 4: save the outputs.
+    logger.info("\n" + "=" * 70)
+    logger.info("步骤 4/4: 保存生成的文件")
+    logger.info("=" * 70)
+
+    # The HTML file was already written during report generation.
+    html_path = result.get('report_filepath', '')
+    if html_path:
+        logger.success(f"✓ HTML 已保存: {result.get('report_relative_path', html_path)}")
+
+    # Remember the real outcome so the summary does not claim a PDF that
+    # was never written.
+    pdf_path = None
+    if pdf_available:
+        ir_path = result.get('ir_filepath', '')
+        if ir_path and os.path.exists(ir_path):
+            pdf_path = save_pdf(ir_path, query)
+        else:
+            logger.warning("⚠ 未找到 IR 文件，无法生成 PDF")
+    else:
+        logger.info("⚠ 跳过 PDF 生成（缺少系统依赖或用户指定跳过）")
+
+    # Summary.
+    logger.info("\n" + "=" * 70)
+    logger.success("✓ 报告生成完成！")
+    logger.info("=" * 70)
+    logger.info(f"报告 ID: {result.get('report_id', 'N/A')}")
+    logger.info(f"HTML 文件: {result.get('report_relative_path', 'N/A')}")
+    if pdf_path:
+        logger.info(f"PDF 文件: {pdf_path}")
+    elif pdf_available:
+        logger.warning("PDF 文件: 未生成")
+    logger.info("=" * 70)
+    logger.info("\n程序结束")
+
+
+if __name__ == "__main__":
+    try:
+        main()
+    except KeyboardInterrupt:
+        logger.warning("\n\n用户中断程序")
+        sys.exit(0)
+    except Exception as e:
+        logger.exception(f"\n程序异常退出: {e}")
+        sys.exit(1)
@@ -13,8 +13,10 @@ eventlet==0.33.3
 # ===== HTTP请求和异步 =====
 requests==2.31.0
 httpx==0.28.1
+socksio==1.0.0
 aiofiles==23.2.1
 aiohttp>=3.8.0
+PySocks>=1.7.1

 # ===== LLM接口 =====
 openai>=1.3.0
@@ -32,11 +34,11 @@ jieba==0.42.1
 # ===== 数据库 =====
 pymysql==1.1.0
 aiomysql==0.2.0
-asyncmy==0.2.9
 aiosqlite==0.21.0
 redis>=4.6.0
 SQLAlchemy==2.0.35
 asyncpg==0.29.0
+cryptography==42.0.7

 # ===== 爬虫相关 =====
 playwright==1.45.0
@@ -46,12 +48,16 @@ beautifulsoup4>=4.12.0
 lxml>=4.9.0
 parsel==1.9.1
 pyexecjs==1.5.1
+xhshow>=0.1.3

 # ===== 可视化 =====
 plotly>=5.17.0
 matplotlib==3.9.0
 wordcloud==1.9.3

+# ===== PDF生成 =====
+weasyprint>=60.0  # PDF导出，支持Python 3.9-3.13
+
 # ===== 机器学习（可选，用于情感分析，不安装也没事写了容错程序） =====
 torch>=2.0.0 # CPU版本
 transformers>=4.30.0
@@ -68,6 +74,7 @@ tenacity==8.2.2
 loguru>=0.7.0
 pydantic==2.5.2
 pydantic-settings==2.2.1
+json-repair==0.53.0

 # ===== 开发工具（可选） =====
 pytest>=7.4.0
@@ -76,4 +83,4 @@ flake8>=6.0.0

 # ===== Web服务器 =====
 fastapi==0.110.2
-uvicorn==0.29.0
+uvicorn==0.29.0
--- a/Show More
+++ b/Show More
Author	SHA1	Message	Date
panda	ff1ce2a3ba	更新部分爬虫以兼容本地运行及数据库存储	2025-12-16 10:56:56 +08:00
panda	a9eda60493	本地化&2.0	2025-12-02 14:01:39 +08:00
666ghj	ec1baf539c	Update Bocha API base URL in .env.example and config.py	2025-11-29 20:25:01 +08:00
老葛	64b94d79f9	feat(xhs): 集成xhshow库优化签名生成与请求参数 (#330 ) * feat(xhs): 集成xhshow库优化签名生成与请求参数 - 引入xhshow库用于小红书API签名生成 - 替换原有的seccore_signv2_playwright签名校验方式 - 支持GET和POST请求的差异化签名处理 - 增加对b1值从localStorage获取的容错处理 - 更新x-t时间戳为毫秒级精度 - 在获取博主笔记接口中增加xsec_token和xsec_source参数- 支持通过配置传递验证token和渠道来源 - 更新依赖文件引入xhshow库- 调整配置示例适配新的token参数要求 * Delete MindSpider/DeepSentimentCrawling/MediaCrawler/config/xhs_config.py 移除配置文件 * Add xhs_config.py for Xiaohongshu platform settings 恢复错误删除的文件 --------- Co-authored-by: gehongbin <gehongbin@autohome.com.cn> Co-authored-by: Doiiars <doiiars@qq.com>	2025-11-29 14:26:26 +08:00
666ghj	6d04ddd0c9	Update README.	2025-11-28 10:08:13 +08:00
666ghj	68b739440e	Add report example.	2025-11-28 00:59:20 +08:00
BaiFu	9c57ba1327	Add the complete demonstration file of the report component library.	2025-11-28 00:50:30 +08:00
666ghj	e251f2dbf1	Update README.	2025-11-28 00:37:24 +08:00
BaiFu	9321f854ef	Update .gitattributes for file handling	2025-11-27 20:56:05 +08:00
马一丁	86dfa2cea8	Merge pull request #341 from 666ghj/Blocked-HTML Enable the Blocked-HTML scheme in Report Engine	2025-11-27 15:54:25 +08:00
马一丁	b4319e6871	Optimize the Rendering of Hybrid-form Formulas	2025-11-27 15:22:27 +08:00
马一丁	c9023982c9	Improve the Rendering of Inline Formulas	2025-11-27 15:12:33 +08:00
马一丁	336e24f03a	Updata .env.example	2025-11-27 14:50:20 +08:00
马一丁	a3cefc6239	Optimize the Layout of the PDF’s First Page	2025-11-27 11:44:43 +08:00
马一丁	092a43694b	Fix the Issue Where Words in the Word Cloud are Displayed Incorrectly.	2025-11-27 10:58:23 +08:00
马一丁	87382f29f3	Adjust the Formatting and Layout of Data Blocks in the Generated PDF	2025-11-27 10:44:47 +08:00
马一丁	5e9da9cfbf	Add Support for Rendering Various Inline and Block-level Mathematical Formulas	2025-11-27 10:29:27 +08:00
马一丁	4e882560da	Add Comments	2025-11-27 09:51:42 +08:00
马一丁	23356631f4	Optimize the Rendering of Inline Formulas, Subscripts and Superscripts, Bubble Charts, and Horizontal Bars	2025-11-27 03:35:55 +08:00
马一丁	2a5d984aeb	Improved Support for Word Clouds	2025-11-27 03:10:30 +08:00
马一丁	e362ef6bdc	Improve the Rendering of Donut Charts	2025-11-27 01:19:12 +08:00
马一丁	b41a783d52	Optimize Health Check Procedures	2025-11-26 00:45:17 +08:00
ghmark675	e417214523	Add disk space cleanup step in CI workflow Added a step to free up disk space before building the Docker image.	2025-11-25 17:25:55 +08:00
马一丁	ce9130c1d9	Improve the Handling of Color Information Format During PDF Generation	2025-11-25 16:48:20 +08:00
马一丁	09c83af057	Add a Program for Quickly Regenerating HTML	2025-11-25 16:32:14 +08:00
马一丁	2e0a526d22	Optimize the Color Replacement Scheme for Pie Charts	2025-11-25 15:42:00 +08:00
马一丁	a7ff9edd29	Update .env.example File	2025-11-25 15:40:46 +08:00
马一丁	7e3f3a2a9a	Merge pull request #411 from 666ghj/main Update MindSpider via main	2025-11-25 10:21:49 +08:00
马一丁	6ac676efb3	New Version Dependency Name	2025-11-25 10:18:16 +08:00
马一丁	6c145399fe	Update Dockerfile	2025-11-25 10:10:19 +08:00
马一丁	49cda6d863	Reverse the Code that Caused the PDF Rendering Error	2025-11-25 01:52:00 +08:00
马一丁	e9b7a91722	PDF Enhancement Generation	2025-11-25 01:41:30 +08:00
马一丁	8505a8aafb	Merge pull request #409 from 666ghj/main Update via the main Branch	2025-11-24 20:28:41 +08:00
马一丁	107a58a08a	Fixed the Issue of Charts being Repeatedly Repaired	2025-11-24 19:27:02 +08:00
马一丁	4be9cc8a19	Update README-EN.md	2025-11-24 15:08:50 +08:00
马一丁	205665ee4f	Update README.md	2025-11-24 15:08:44 +08:00
马一丁	63592b510d	Update README-EN.md	2025-11-24 15:04:26 +08:00
马一丁	1439ab604f	Update README.md	2025-11-24 15:02:53 +08:00
马一丁	89ea04c86d	Revert "Fix the Issue of HTML Data Blocks Overlapping with Other Content" This reverts commit `79f080fa5c`.	2025-11-22 18:37:59 +08:00
马一丁	fd1c8ed895	Fixed the Error "AttributeError: 'list' object has no attribute 'get'"	2025-11-22 17:11:43 +08:00
马一丁	dfb06970b2	Fixed the Issue Where Search History was not Displayed in Media Engine	2025-11-22 16:22:30 +08:00
马一丁	bdf50ff7b5	Fixed the Issue of HOST Information not being Displayed in Forum Engine	2025-11-22 14:49:56 +08:00
马一丁	79f080fa5c	Fix the Issue of HTML Data Blocks Overlapping with Other Content	2025-11-22 13:42:07 +08:00
马一丁	8fbac74808	Fixed the Issue Where the Host Information was not Displayed	2025-11-22 13:31:42 +08:00
马一丁	1765751347	Resolve the Issue of Search History not Displaying	2025-11-22 13:24:31 +08:00
马一丁	6419d1cc08	Improve HTML's Automatic Color Replacement Function	2025-11-22 03:34:04 +08:00
马一丁	057229b35d	Added Support for Word Cloud Association When Generating PDFs	2025-11-22 02:40:15 +08:00
马一丁	37dc8e0a5d	Fix the Issue Where Dependencies for PDF Generation are Installed but not Recognized by the Program (Perhaps?)	2025-11-22 02:31:35 +08:00
马一丁	a22be6d7dd	Fixes Situations Where TCP Connections Might be Refused	2025-11-22 02:06:53 +08:00
马一丁	56b6bdd1b4	Enhance the Performance of Frontend Log Output	2025-11-22 01:48:39 +08:00
马一丁	b4b749b4c7	Add HTML Support for Word Clouds	2025-11-22 01:47:17 +08:00
马一丁	f67af71e96	Update README-EN.md	2025-11-21 12:10:33 +08:00
马一丁	80f1d9b6b0	Update README.md	2025-11-21 12:10:24 +08:00
马一丁	452640f8c5	Optimize the Method of Automatically Repairing Charts in PDF	2025-11-21 05:51:51 +08:00
马一丁	373b85a3f8	Repair the Logic of Automatically Repairing Charts	2025-11-21 05:33:47 +08:00
马一丁	a2c42bda8b	Fix Report Engine Progress Bar Error	2025-11-21 05:33:05 +08:00
马一丁	bac28b43f4	Update requirements.txt	2025-11-21 05:32:35 +08:00
马一丁	079a1f17e8	Allows for Fixing Graphic Colors and Styles When Rendering HTML	2025-11-21 03:47:30 +08:00
马一丁	ea3b4266af	Change Template Name	2025-11-21 03:46:53 +08:00
马一丁	1bba68da3b	Update README-EN.md	2025-11-21 03:42:57 +08:00
马一丁	01b29fb55b	Update README.md	2025-11-21 03:42:50 +08:00
马一丁	de8f253fa0	Improves the Front-End Console Experience	2025-11-21 00:25:48 +08:00
马一丁	b10dd449fa	Resolving Report Engine Logging Issues	2025-11-20 22:36:49 +08:00
马一丁	2fb15f5efc	Fix the Display of Report Engine Console Logs	2025-11-20 21:17:52 +08:00
马一丁	ceeac9e5e0	Update Report Engine Log Display Method	2025-11-20 18:48:20 +08:00
马一丁	5925f49669	Fix Report Engine Log Output Priority	2025-11-20 12:36:33 +08:00
马一丁	2da43dbaf6	Adjust the Front-End Log Query Mode	2025-11-20 12:07:03 +08:00
马一丁	3eff6b06c5	Displays Information above DEBUG	2025-11-20 02:31:47 +08:00
马一丁	994477fd60	Fixed Chart Handling Issues in HTML and PDF and Improved Chart Readability	2025-11-20 02:18:52 +08:00
马一丁	f98d36062e	Optimize the Rendering of Charts in HTML and PDF	2025-11-20 01:06:20 +08:00
马一丁	488122ae9d	Delete Useless Docs	2025-11-20 00:48:22 +08:00
马一丁	a89bf7de89	Fixed Bug - Fail to Export as PDF	2025-11-20 00:34:01 +08:00
马一丁	cad25b63c1	Optimize Log Output Efficiency	2025-11-20 00:32:45 +08:00
马一丁	70b6e9872a	Change Report Engine Log Output Level	2025-11-19 22:23:47 +08:00
马一丁	269b2ec5dd	Optimize Log Display Logic	2025-11-19 20:12:37 +08:00
马一丁	d8afd95c1b	Optimize the Binding of the Catalog	2025-11-19 19:13:03 +08:00
马一丁	147edbe8c7	Added Support for Formulas and Optimize the Rendering of Data Blocks When Exporting to PDF	2025-11-19 18:54:29 +08:00
马一丁	f1285c63fd	Optimize the Progress Bar Display in Report Engine	2025-11-19 16:22:02 +08:00
马一丁	cafde07233	Resolves Display Issues with Pie Charts and Line Charts When Exporting to PDF	2025-11-19 16:12:43 +08:00
马一丁	bd0aa987ad	Update Report Engine Log Display Logic	2025-11-19 15:44:05 +08:00
马一丁	66240bbf23	Update Log Display Logic	2025-11-19 15:30:51 +08:00
马一丁	29dd025778	Optimize PDF Export Logic When Using report_engine_only.py	2025-11-19 14:45:17 +08:00
马一丁	511452b284	Update README-EN.md	2025-11-19 14:13:44 +08:00
马一丁	24ffbb0374	Update README.md	2025-11-19 14:13:33 +08:00
马一丁	5b001bf1d2	Support Using Only Report Engine	2025-11-19 14:09:59 +08:00
马一丁	69ba0f22e3	Update the Dependency Handling Scheme	2025-11-19 13:37:59 +08:00
马一丁	be1fe539b5	Updata README-EN.md	2025-11-19 12:39:33 +08:00
马一丁	87a70719f5	Updata README.md	2025-11-19 12:39:22 +08:00
马一丁	d744acfd9e	Modify the Error Message When the Pango Library is Missing	2025-11-19 12:26:53 +08:00
马一丁	1cf82adef6	Optimize the Front-End Console Log Display Logic	2025-11-19 11:58:59 +08:00
马一丁	d4f8301fd5	Fixed the PDF Rendering Overflow Issue and Updated the Logic for Rendering PDFs	2025-11-19 11:35:58 +08:00
马一丁	a07d6c5292	Update the PDF Rendering Logic and Add Support for Vector Graphics	2025-11-19 00:14:40 +08:00
马一丁	d397b98d2b	Preventing Errors and Overflow During PDF Rendering	2025-11-18 23:57:08 +08:00
马一丁	52755dfbcf	Fix the Error "'ReportTask' object has no attribute 'ir_file_path'" When Exporting PDF and Ensure that Pango Dependencies are not Missing	2025-11-18 23:01:06 +08:00
马一丁	a465b5677e	Update README-EN.md	2025-11-18 22:58:00 +08:00
马一丁	ab59c0e23e	Update README.md	2025-11-18 22:57:04 +08:00
马一丁	da7c8ce2eb	Embedding Third-Party Libraries in HTML	2025-11-18 20:40:08 +08:00
马一丁	8ba2441be1	Delete the Button of "Export as PDF" in HTML and Cancel Font Embedding	2025-11-18 20:30:46 +08:00
马一丁	80adc1bf5c	Update requirements.txt	2025-11-18 20:11:27 +08:00
马一丁	26ab0616e9	Add README of "Export as PDF"	2025-11-18 20:11:03 +08:00
马一丁	5e82185bee	Modify the Logic for "Export as PDF"	2025-11-18 20:10:11 +08:00
马一丁	acfe77a326	Improve PDF Export	2025-11-18 17:33:55 +08:00
马一丁	82f3ccc3e3	Small Set of SourceHanSerifSC	2025-11-18 17:33:25 +08:00
马一丁	49c42a17ea	Optimize Progress Bar Display Issues	2025-11-18 15:59:46 +08:00
马一丁	1a302ca975	Solving the Problem of Garbled Characters in PDF Rendering	2025-11-18 14:21:41 +08:00
马一丁	85d75d6f74	Fixed the Front-End Progress Bar Display Logic	2025-11-18 13:53:15 +08:00
马一丁	eb036655a2	Fix the Front-End Console Display Logic	2025-11-18 12:58:40 +08:00
马一丁	939fea26d9	Modify the Logic for Downloading PDFs	2025-11-18 12:31:59 +08:00
马一丁	eea356f38c	Improve Rendering Compatibility	2025-11-18 11:38:31 +08:00
马一丁	90f5986284	Optimize Front-End Memory Usage	2025-11-18 11:38:03 +08:00
马一丁	80bbd0d243	Optimize Front-End Memory Usage	2025-11-18 02:10:31 +08:00
马一丁	d9f72313a3	Fix Forum Engine's Frontend Log Output	2025-11-18 01:52:40 +08:00
马一丁	0707c6f7a7	Optimize Export to PDF	2025-11-18 01:18:04 +08:00
马一丁	dffe1618d5	Add an "Export to PDF" Button and Define the Font for Exporting to PDF	2025-11-18 01:13:25 +08:00
马一丁	fdd836bf2e	Add SourceHanSerifSC	2025-11-18 01:12:22 +08:00
马一丁	19efa81802	Improve the Way to Deal with Error JSON	2025-11-18 00:53:58 +08:00
马一丁	ad36c03be0	Add the Ability to Parse JSON	2025-11-18 00:35:09 +08:00
马一丁	bf1e2bfa9c	Repair the Logic for Cleaning Data Returned by LLM	2025-11-17 22:10:37 +08:00
马一丁	b31be56297	Fixed Directory Parsing Issues and Optimized Directory Rendering	2025-11-17 21:10:13 +08:00
马一丁	f6714a35e0	Fix 'fonts' Fype Matching Issues	2025-11-17 21:10:13 +08:00
马一丁	a5f3964a73	Optimize JSON Parsing Compatibility	2025-11-17 21:10:13 +08:00
马一丁	c20cc24c78	Repair and Optimize the Chart Rendering	2025-11-17 21:10:13 +08:00
马一丁	50b6ab403e	Cleaning Data Returned by Report Engine's LLM	2025-11-17 21:10:13 +08:00
马一丁	26c133c998	Offline JS Library	2025-11-17 21:10:13 +08:00
马一丁	dd963d6689	Merge pull request #365 from 666ghj/main Update config.	2025-11-17 17:19:21 +08:00
马一丁	45ef1b0779	Merge pull request #354 from 666ghj/main Sync	2025-11-15 22:49:13 +08:00
马一丁	a5b9b5a670	Fixed Export Garbled Character Issue	2025-11-15 20:15:50 +08:00
马一丁	904df34294	Optimize the Rendering Process	2025-11-15 18:01:55 +08:00
马一丁	a12ac4234d	Optimize the Handling of Low Word Counts	2025-11-15 17:46:42 +08:00
马一丁	cab812e261	Repair the Logic of the Log Viewing System	2025-11-15 16:21:49 +08:00
马一丁	6e8741f0ae	Optimize Rendering Logic with New IR Binding	2025-11-15 15:24:10 +08:00
马一丁	fd1a23c7fb	Improve IR Binding Logic	2025-11-15 15:23:05 +08:00
马一丁	90d12a092d	Enhance Repair Capabilities	2025-11-15 15:22:31 +08:00
马一丁	fa1ebc07ec	Optimize Re-output Logic	2025-11-15 14:45:20 +08:00
马一丁	bae13bf434	Improved Formatted Output	2025-11-15 13:39:05 +08:00
马一丁	b6b2a0fb76	Correction Retry Logic	2025-11-15 10:55:44 +08:00
马一丁	f87389c7b6	Add Comments	2025-11-15 10:55:15 +08:00
马一丁	6e3abf8d15	Update README.md	2025-11-15 03:07:50 +08:00
马一丁	3e1b47d1f9	Update README-EN.md	2025-11-15 03:07:46 +08:00
马一丁	5e1fbc97cc	Add Comments	2025-11-15 02:54:29 +08:00
马一丁	5ef63ece78	Improve the Security of Regular Expression Matching	2025-11-15 02:46:09 +08:00
马一丁	79a015b77a	Improve PDF Export	2025-11-15 02:37:01 +08:00
马一丁	4798424f23	Allow LLM Repair	2025-11-15 02:21:12 +08:00
马一丁	9b9188faba	Feature-Test RE Sanitization	2025-11-15 02:09:18 +08:00
马一丁	8aa114df4a	Fix-Multiple Directories	2025-11-15 02:08:24 +08:00
马一丁	62b8276aa8	Fix-“必须是非空数组”	2025-11-15 02:07:56 +08:00
马一丁	f64f973f57	Improved Rendering	2025-11-14 23:45:28 +08:00
马一丁	2209319264	Update README-EN.md	2025-11-14 23:21:44 +08:00
马一丁	d6f1afd4e4	Update README.md	2025-11-14 23:21:36 +08:00
马一丁	6d0e8f4b8c	Add Comments	2025-11-14 19:44:04 +08:00
马一丁	52eed4d010	Increase DeepSeek Compatibility	2025-11-14 17:55:28 +08:00
马一丁	e267b1fc04	Add Comments	2025-11-13 22:49:59 +08:00
马一丁	82152547e1	Improved Rendering	2025-11-13 22:31:02 +08:00
马一丁	fa787af135	Streaming	2025-11-13 22:30:36 +08:00
马一丁	1c2f82e285	Add Cryptography	2025-11-13 11:37:50 +08:00
马一丁	3e4aa6366d	Add Comments	2025-11-13 11:37:13 +08:00
马一丁	4846b1f758	Blocked HTML	2025-11-13 10:56:28 +08:00