Improve the Security of Regular Expression Matching

This commit is contained in:
马一丁
2025-11-15 02:46:09 +08:00
parent 79a015b77a
commit 5ef63ece78
+38 -10
View File
@@ -51,9 +51,37 @@ class TemplateSection:
} }
heading_pattern = re.compile(r"^(#{1,6})\s+(.*)$") # The parsing expressions intentionally avoid `.*` to keep matching deterministic and
bullet_pattern = re.compile(r"^[-*+]\s+(.*)$") # eliminate easy Regular-Expression-DoS gadgets on untrusted template text.
number_pattern = re.compile(r"^(?P<num>\d+(?:\.\d+)*)(?:[\s、:.-]+(?P<label>.*))?$") heading_pattern = re.compile(
r"""
(?P<marker>\#{1,6}) # Markdown heading markers
[ \t]+ # required whitespace
(?P<title>[^\r\n]+) # heading text without newline characters
""",
re.VERBOSE,
)
# Unordered-list line: exactly one "-", "*" or "+" marker, then at least one
# space or tab, then a non-empty title that contains no CR/LF characters.
# Exposes the named groups "marker" and "title".
bullet_pattern = re.compile(
    r"""
(?P<marker>[-*+]) # list bullet symbol
[ \t]+
(?P<title>[^\r\n]+)
""",
    flags=re.VERBOSE,
)
# Outline number ("1", "1.2", "1.2.3", ...) optionally followed by a separator
# run and a label. Exposes the named groups "num" and "label"; "label" is None
# when no separator follows the number.
#
# The separator run consumes exactly ONE character per iteration. Writing it as
# `(?:[...]+|\.(?!\d))+` (a quantified class inside a quantified group) lets a
# run of n separator characters be split in 2**(n-1) ways, so a failing
# fullmatch (e.g. a stray CR/LF after many spaces) backtracks exponentially.
# The single-character form accepts the same strings with the same captures.
number_pattern = re.compile(
    r"""
    (?P<num>
        (?:0|[1-9]\d*)                  # first component, no leading zeros
        (?:\.(?:0|[1-9]\d*))*           # further dot-separated components
    )
    (?:
        (?:
            [ \t\u00A0\u3000、:-]       # blank, NBSP, ideographic space, or 、 : -
          | \.(?!\d)                    # a dot that does not start another component
        )+
        (?P<label>[^\r\n]*)             # rest of the line
    )?
    """,
    re.VERBOSE,
)
def parse_template_sections(template_md: str) -> List[TemplateSection]: def parse_template_sections(template_md: str) -> List[TemplateSection]:
@@ -128,10 +156,10 @@ def _classify_line(stripped: str, indent: int) -> Optional[dict]:
dict | None: 识别后的元数据;无法识别时返回None。 dict | None: 识别后的元数据;无法识别时返回None。
""" """
heading_match = heading_pattern.match(stripped) heading_match = heading_pattern.fullmatch(stripped)
if heading_match: if heading_match:
level = len(heading_match.group(1)) level = len(heading_match.group("marker"))
payload = _strip_markup(heading_match.group(2).strip()) payload = _strip_markup(heading_match.group("title").strip())
title_info = _split_number(payload) title_info = _split_number(payload)
slug = _build_slug(title_info["number"], title_info["title"]) slug = _build_slug(title_info["number"], title_info["title"])
return { return {
@@ -143,9 +171,9 @@ def _classify_line(stripped: str, indent: int) -> Optional[dict]:
"slug": slug, "slug": slug,
} }
bullet_match = bullet_pattern.match(stripped) bullet_match = bullet_pattern.fullmatch(stripped)
if bullet_match: if bullet_match:
payload = _strip_markup(bullet_match.group(1).strip()) payload = _strip_markup(bullet_match.group("title").strip())
title_info = _split_number(payload) title_info = _split_number(payload)
slug = _build_slug(title_info["number"], title_info["title"]) slug = _build_slug(title_info["number"], title_info["title"])
is_section = indent <= 1 is_section = indent <= 1
@@ -160,7 +188,7 @@ def _classify_line(stripped: str, indent: int) -> Optional[dict]:
} }
# 兼容“1.1 ...”没有前缀符号的行 # 兼容“1.1 ...”没有前缀符号的行
number_match = number_pattern.match(stripped) number_match = number_pattern.fullmatch(stripped)
if number_match and number_match.group("label"): if number_match and number_match.group("label"):
payload = stripped payload = stripped
title = number_match.group("label").strip() title = number_match.group("label").strip()
@@ -201,7 +229,7 @@ def _split_number(payload: str) -> dict:
返回: 返回:
dict: 包含 number/title/display。 dict: 包含 number/title/display。
""" """
match = number_pattern.match(payload) match = number_pattern.fullmatch(payload)
number = match.group("num") if match else "" number = match.group("num") if match else ""
label = match.group("label") if match else payload label = match.group("label") if match else payload
label = (label or "").strip() label = (label or "").strip()