Blocked HTML

This commit is contained in:
马一丁
2025-11-13 10:56:28 +08:00
parent 403dbbd296
commit 4846b1f758
20 changed files with 3660 additions and 367 deletions
+208
View File
@@ -0,0 +1,208 @@
"""
Markdown模板切片工具。
LLM需要“按章调用”,因此必须把Markdown模板解析为结构化章节队列。
这里通过轻量正则和缩进启发式,兼容“# 标题”与
“- **1.0 标题** / - 1.1 子标题”等多种写法。
"""
from __future__ import annotations
import re
import unicodedata
from dataclasses import dataclass, field
from typing import List, Optional
# Increment between the `order` values of consecutive sections: the first
# section gets this value and each following section adds it again
# (10, 20, 30, ...). Presumably the gap leaves room for inserting sections
# later without renumbering -- confirm with callers.
SECTION_ORDER_STEP = 10
@dataclass
class TemplateSection:
    """A chapter-level entry extracted from a Markdown template."""

    # Display title; includes the section number when one was detected.
    title: str
    # Anchor slug for the section.
    slug: str
    # Sort key assigned in document order by the parser.
    order: int
    # Nesting depth (heading level, or 1/2 for bullet and numbered lines).
    depth: int
    # Title text before the number/label split.
    raw_title: str
    # Dotted section number such as "1.2"; empty when the title has none.
    number: str = ""
    # Identifier of the form "S<n>", filled in by parse_template_sections.
    chapter_id: str = ""
    # Titles of the non-section lines that follow this section.
    outline: List[str] = field(default_factory=list)

    def to_dict(self) -> dict:
        """Return the section as a plain dict with camelCase ``chapterId``.

        ``raw_title`` is not part of the exported mapping. The ``outline``
        value is the same list object held by the instance, not a copy.
        """
        exported = (
            ("title", "title"),
            ("slug", "slug"),
            ("order", "order"),
            ("depth", "depth"),
            ("number", "number"),
            ("chapterId", "chapter_id"),
            ("outline", "outline"),
        )
        return {key: getattr(self, attr) for key, attr in exported}
# ATX heading: 1-6 '#' characters, whitespace, then the heading text.
# group(1) is the run of '#', group(2) the remaining text.
heading_pattern = re.compile(r"^(#{1,6})\s+(.*)$")
# List item: a single '-', '*' or '+' marker, whitespace, then the item text.
bullet_pattern = re.compile(r"^[-*+]\s+(.*)$")
# Leading dotted number ("1", "1.2", "1.2.3") optionally followed by one or
# more separator characters (whitespace, '、', ':', '.', '-') and a label.
# Text where the number is followed directly by a non-separator character
# (e.g. "3D") does not match at all.
number_pattern = re.compile(r"^(?P<num>\d+(?:\.\d+)*)(?:[\s、:.-]+(?P<label>.*))?$")
def parse_template_sections(template_md: str) -> List[TemplateSection]:
    """Split a Markdown template into an ordered list of sections.

    Each non-blank line is classified by ``_classify_line``. Lines classified
    as sections start a new ``TemplateSection``; every other recognised line
    is appended to the ``outline`` of the most recent section. Recognised
    lines that appear before the first section are dropped.

    Args:
        template_md: Raw Markdown text of the template. Empty or ``None``
            input yields an empty list.

    Returns:
        Sections in document order. ``order`` increases by
        ``SECTION_ORDER_STEP`` per section, ``slug`` is unique within the
        result, and ``chapter_id`` is ``"S1"``, ``"S2"``, ... by position.
    """
    if not template_md:
        return []

    sections: List[TemplateSection] = []
    current: Optional[TemplateSection] = None
    order = SECTION_ORDER_STEP
    used_slugs = set()

    for raw_line in template_md.splitlines():
        stripped = raw_line.strip()
        if not stripped:
            continue

        # Expand tabs before measuring indentation: counting only literal
        # spaces would give a tab-indented sub-item an indent of 0 and
        # wrongly promote it to a top-level section.
        expanded = raw_line.expandtabs(4)
        indent = len(expanded) - len(expanded.lstrip(" "))

        meta = _classify_line(stripped, indent)
        if not meta:
            continue

        if meta["is_section"]:
            section = TemplateSection(
                title=meta["title"],
                slug=_ensure_unique_slug(meta["slug"], used_slugs),
                order=order,
                depth=meta["depth"],
                raw_title=meta["raw"],
                number=meta["number"],
            )
            sections.append(section)
            current = section
            order += SECTION_ORDER_STEP
            continue

        # Non-section line: attach it to the enclosing section's outline.
        if current is not None:
            current.outline.append(meta["title"])

    # Assign position-based chapter ids once the full list is known.
    for idx, section in enumerate(sections, start=1):
        section.chapter_id = f"S{idx}"
    return sections
def _classify_line(stripped: str, indent: int) -> Optional[dict]:
    """Classify one stripped template line by its marker and indentation.

    Three line shapes are recognised, checked in this order:

    * ``#`` headings: levels 1-2 are sections, deeper levels are outline
      entries; ``depth`` is the heading level.
    * ``-``/``*``/``+`` bullets: a bullet indented by at most one column is a
      section (depth 1), otherwise an outline entry (depth 2).
    * bare numbered lines such as ``1.1 Title``: a section only when not
      indented and the number has at most one dot.

    Args:
        stripped: The line with surrounding whitespace removed.
        indent: Width of the line's leading indentation.

    Returns:
        A dict with keys ``is_section``, ``depth``, ``title``, ``raw``,
        ``number`` and ``slug``, or ``None`` when the line matches none of
        the shapes (including a numbered line without a label).
    """

    def _entry(is_section, depth, title, raw, number, slug):
        # Single place that fixes the shape of the returned mapping.
        return {
            "is_section": is_section,
            "depth": depth,
            "title": title,
            "raw": raw,
            "number": number,
            "slug": slug,
        }

    heading = heading_pattern.match(stripped)
    if heading is not None:
        level = len(heading.group(1))
        text = _strip_markup(heading.group(2).strip())
        info = _split_number(text)
        anchor = _build_slug(info["number"], info["title"])
        return _entry(level <= 2, level, info["display"], text, info["number"], anchor)

    bullet = bullet_pattern.match(stripped)
    if bullet is not None:
        text = _strip_markup(bullet.group(1).strip())
        info = _split_number(text)
        anchor = _build_slug(info["number"], info["title"])
        top_level = indent <= 1
        return _entry(
            top_level,
            1 if top_level else 2,
            info["display"],
            text,
            info["number"],
            anchor,
        )

    # Lines such as "1.1 Title" that carry no heading or bullet marker.
    numbered = number_pattern.match(stripped)
    if numbered is None or not numbered.group("label"):
        return None

    label = numbered.group("label").strip()
    number = numbered.group("num")
    anchor = _build_slug(number, label)
    top_level = indent == 0 and number.count(".") <= 1
    if label:
        shown = f"{number} {label}"
    else:
        shown = number
    return _entry(top_level, 1 if top_level else 2, shown, stripped, number, anchor)
def _strip_markup(text: str) -> str:
"""去除包裹的**、__等简单强调标记"""
if text.startswith(("**", "__")) and text.endswith(("**", "__")) and len(text) > 4:
return text[2:-2].strip()
return text
def _split_number(payload: str) -> dict:
    """Separate a leading section number from the rest of a title.

    Args:
        payload: Title text, possibly starting with a dotted number.

    Returns:
        A dict with ``number`` (empty when no number was found), ``title``
        (the label without the number, falling back to *payload* when the
        label is empty) and ``display`` (number and label joined by a single
        space, or just the label/payload when there is no number).
    """
    matched = number_pattern.match(payload)
    if matched is None:
        number = ""
        label = (payload or "").strip()
    else:
        number = matched.group("num")
        label = (matched.group("label") or "").strip()

    if number:
        # strip() drops the trailing space left behind by an empty label.
        display = f"{number} {label}".strip()
    else:
        display = label or payload

    return {
        "number": number,
        "title": label or payload,
        "display": display,
    }
def _build_slug(number: str, title: str) -> str:
"""根据编号/标题生成锚点"""
if number:
token = number.replace(".", "-")
else:
token = _slugify_text(title)
token = token or "section"
return f"section-{token}"
def _slugify_text(text: str) -> str:
text = unicodedata.normalize("NFKD", text)
text = text.replace("·", "-").replace(" ", "-")
text = re.sub(r"[^0-9a-zA-Z\u4e00-\u9fff-]+", "-", text)
text = re.sub(r"-{2,}", "-", text)
return text.strip("-").lower()
def _ensure_unique_slug(slug: str, used: set) -> str:
if slug not in used:
used.add(slug)
return slug
base = slug
idx = 2
while slug in used:
slug = f"{base}-{idx}"
idx += 1
used.add(slug)
return slug
# Public API of this module; the underscore-prefixed helpers are not exported.
__all__ = ["TemplateSection", "parse_template_sections"]