Fixed table-of-contents (TOC) parsing issues and optimized TOC rendering

2025-11-17 21:05:00 +08:00
parent f6714a35e0
commit b31be56297
3 changed files with 215 additions and 8 deletions
@@ -9,6 +9,7 @@ import copy
 import html
 import json
 import os
+import re
 from pathlib import Path
 from typing import Any, Dict, List
 from loguru import logger
@@ -451,23 +452,44 @@ class HTMLRenderer:
            chapters: Document IR中的章节数组。

        返回:
-            list[dict]: 规范化后的目录条目，包含level/text/anchor。
+            list[dict]: 规范化后的目录条目，包含level/text/anchor/description。
        """
        metadata = self.metadata
        toc_config = metadata.get("toc") or {}
        custom_entries = toc_config.get("customEntries")
        entries: List[Dict[str, Any]] = []
+
        if custom_entries:
            for entry in custom_entries:
                anchor = entry.get("anchor") or self.chapter_anchor_map.get(entry.get("chapterId"))
+
+                # 验证anchor是否有效
                if not anchor:
+                    logger.warning(
+                        f"目录项 '{entry.get('display') or entry.get('title')}' "
+                        f"缺少有效的anchor，已跳过"
+                    )
                    continue
+
+                # 验证anchor是否在chapter_anchor_map中或在chapters的blocks中
+                anchor_valid = self._validate_toc_anchor(anchor, chapters)
+                if not anchor_valid:
+                    logger.warning(
+                        f"目录项 '{entry.get('display') or entry.get('title')}' "
+                        f"的anchor '{anchor}' 在文档中未找到对应的章节"
+                    )
+
+                # 清理描述文本
+                description = entry.get("description")
+                if description:
+                    description = self._clean_text_from_json_artifacts(description)
+
                entries.append(
                    {
                        "level": entry.get("level", 2),
                        "text": entry.get("display") or entry.get("title") or "",
                        "anchor": anchor,
-                        "description": entry.get("description"),
+                        "description": description,
                    }
                )
            return entries
@@ -479,16 +501,52 @@ class HTMLRenderer:
                    if not anchor:
                        continue
                    mapped = self.heading_label_map.get(anchor, {})
+                    # 清理描述文本
+                    description = mapped.get("description")
+                    if description:
+                        description = self._clean_text_from_json_artifacts(description)
                    entries.append(
                        {
                            "level": block.get("level", 2),
                            "text": mapped.get("display") or block.get("text", ""),
                            "anchor": anchor,
-                            "description": mapped.get("description"),
+                            "description": description,
                        }
                    )
        return entries

+    def _validate_toc_anchor(self, anchor: str, chapters: List[Dict[str, Any]]) -> bool:
+        """
+        Check whether a TOC anchor matches a known chapter, heading or block anchor.
+
+        Parameters:
+            anchor: The anchor to validate.
+            chapters: The chapter list of the Document IR; None or empty is accepted.
+
+        Returns:
+            bool: True if the anchor was found by any of the checks below, else False.
+        """
+        # Match against the values stored in chapter_anchor_map (chapter anchors).
+        if anchor in self.chapter_anchor_map.values():
+            return True
+
+        # Match against the keys of heading_label_map.
+        if anchor in self.heading_label_map:
+            return True
+
+        # Fall back to scanning chapters and their direct blocks (assumes dict blocks - TODO confirm).
+        for chapter in chapters or []:
+            chapter_anchor = chapter.get("anchor")
+            if chapter_anchor == anchor:
+                return True
+
+            for block in chapter.get("blocks", []):
+                block_anchor = block.get("anchor")
+                if block_anchor == anchor:
+                    return True
+
+        return False
+
    def _prepare_chapters(self, chapters: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        """复制章节并展开其中序列化的block，避免渲染缺失"""
        prepared: List[Dict[str, Any]] = []
@@ -640,6 +698,9 @@ class HTMLRenderer:
            str: `<li>` 形式的HTML。
        """
        desc = entry.get("description")
+        # 清理描述文本中的JSON片段
+        if desc:
+            desc = self._clean_text_from_json_artifacts(desc)
        desc_html = f'<p class="toc-desc">{self._escape_html(desc)}</p>' if desc else ""
        level = entry.get("level", 2)
        css_level = 1 if level <= 2 else min(level, 4)
@@ -1576,6 +1637,64 @@ class HTMLRenderer:

    # ====== 文本 / 安全工具 ======

+    def _clean_text_from_json_artifacts(self, text: Any) -> str:
+        """
+        Strip trailing JSON fragments and stray structural markers from a text value.
+
+        LLM output sometimes leaks an unfinished JSON fragment into a text field, e.g.
+        "description text, { \"chapterId\": \"S3" or "description text, { \"level\": 2"
+
+        This method:
+        1. Drops a trailing JSON object that opens with { but is never closed
+        2. Drops a trailing JSON array that opens with [ but is never closed
+        3. Drops a dangling JSON key/value fragment at the end of the text
+
+        Parameters:
+            text: A value that may contain JSON fragments; it is converted via _safe_text
+
+        Returns:
+            str: The cleaned text, or "" when text is falsy
+        """
+        if not text:
+            return ""
+
+        text_str = self._safe_text(text)
+
+        # Pattern 1: drop an unclosed JSON object introduced by ASCII comma + whitespace + {
+        # e.g. "text, { \"key\": \"value\"" or "text, {\\n  \"key\"" (no } up to the end)
+        text_str = re.sub(r',\s*\{[^}]*$', '', text_str)
+
+        # Pattern 2: same for an unclosed JSON array introduced by ASCII comma + whitespace + [
+        text_str = re.sub(r',\s*\[[^\]]*$', '', text_str)
+
+        # Pattern 3: drop a lone { and everything after it when no } follows it
+        # Only the position of the last { is compared with the position of the last }
+        open_brace_pos = text_str.rfind('{')
+        if open_brace_pos != -1:
+            close_brace_pos = text_str.rfind('}')
+            if close_brace_pos < open_brace_pos:
+                # The last { comes after the last } (rfind gives -1 if there is no }): unclosed
+                # Cut just before the { and trim trailing separators and whitespace
+                text_str = text_str[:open_brace_pos].rstrip(',，、 \t\n')
+
+        # Pattern 4: the same handling for [
+        open_bracket_pos = text_str.rfind('[')
+        if open_bracket_pos != -1:
+            close_bracket_pos = text_str.rfind(']')
+            if close_bracket_pos < open_bracket_pos:
+                # The last [ comes after the last ] (rfind gives -1 if there is no ]): unclosed
+                text_str = text_str[:open_bracket_pos].rstrip(',，、 \t\n')
+
+        # Pattern 5: drop a trailing fragment that looks like a JSON key/value pair, e.g. "chapterId": "S3
+        # Such leftovers usually remain after the patterns above have run
+        text_str = re.sub(r',?\s*"[^"]+"\s*:\s*"[^"]*$', '', text_str)
+        text_str = re.sub(r',?\s*"[^"]+"\s*:\s*[^,}\]]*$', '', text_str)
+
+        # Trim trailing commas and whitespace
+        text_str = text_str.rstrip(',，、 \t\n')
+
+        return text_str.strip()
+
    def _safe_text(self, value: Any) -> str:
        """将任意值安全转换为字符串，None与复杂对象容错"""
        if value is None: