Improve the Security of Regular Expression Matching

2025-11-15 02:46:09 +08:00
parent 79a015b77a
commit 5ef63ece78
1 changed files with 38 additions and 10 deletions
@@ -51,9 +51,37 @@ class TemplateSection:
        }
-heading_pattern = re.compile(r"^(#{1,6})\s+(.*)$")
+# The parsing expressions intentionally avoid `.*` to keep matching deterministic and
-bullet_pattern = re.compile(r"^[-*+]\s+(.*)$")
+# eliminate easy Regular-Expression-DoS gadgets on untrusted template text.
-number_pattern = re.compile(r"^(?P<num>\d+(?:\.\d+)*)(?:[\s、:：.-]+(?P<label>.*))?$")
+heading_pattern = re.compile(
    # Markdown heading line: 1-6 "#" markers, at least one space/tab, then the
    # heading text. _classify_line applies this with fullmatch() to its
    # `stripped` argument, so no ^/$ anchors are needed here.
    # Named groups: "marker" (its length is used as the heading level) and
    # "title" (the raw heading text, cleaned up by the caller).
    r"""
    (?P<marker>\#{1,6})       # Markdown heading markers
    [ \t]+                    # required whitespace
    (?P<title>[^\r\n]+)       # heading text without newline characters
    """,
    re.VERBOSE,  # permits the layout whitespace and inline comments above
 )
 bullet_pattern = re.compile(
    # List-item line: one bullet symbol (-, * or +), at least one space/tab,
    # then the item text. _classify_line applies this with fullmatch() to its
    # `stripped` argument, so no ^/$ anchors are needed here.
    # Named groups: "marker" (the bullet symbol) and "title" (the item text,
    # cleaned up by the caller).
    r"""
    (?P<marker>[-*+])         # list bullet symbol
    [ \t]+
    (?P<title>[^\r\n]+)
    """,
    re.VERBOSE,  # permits the layout whitespace and inline comments above
 )
 number_pattern = re.compile(
    # Hierarchical section number ("1", "2.3", "1.0.4"; no leading zeros),
    # optionally followed by separators and a free-text label. Callers use
    # fullmatch(), so the whole string must fit this shape.
    # Named groups: "num" (the dotted number) and "label" (text after the
    # separators; None when the string is only a number).
    #
    # The separator alternative is repeated ONE character at a time. Writing
    # it as `(?:[...]+|\.(?!\d))+` nests a `+` inside a `+`, which lets the
    # backtracking engine split a run of separators in exponentially many
    # ways whenever the overall match fails (for example when a newline
    # follows a long run of spaces). The single-character form accepts
    # exactly the same strings with the same captures.
    r"""
    (?P<num>
        (?:0|[1-9]\d*)
        (?:\.(?:0|[1-9]\d*))*
    )
    (?:
        (?:[ \t\u00A0\u3000、:：-]|\.(?!\d))+
        (?P<label>[^\r\n]*)
    )?
    """,
    re.VERBOSE,
 )
 def parse_template_sections(template_md: str) -> List[TemplateSection]:
@@ -128,10 +156,10 @@ def _classify_line(stripped: str, indent: int) -> Optional[dict]:
        dict | None: 识别后的元数据；无法识别时返回None。
    """
-    heading_match = heading_pattern.match(stripped)
+    heading_match = heading_pattern.fullmatch(stripped)
    if heading_match:
-        level = len(heading_match.group(1))
+        level = len(heading_match.group("marker"))
-        payload = _strip_markup(heading_match.group(2).strip())
+        payload = _strip_markup(heading_match.group("title").strip())
        title_info = _split_number(payload)
        slug = _build_slug(title_info["number"], title_info["title"])
        return {
@@ -143,9 +171,9 @@ def _classify_line(stripped: str, indent: int) -> Optional[dict]:
            "slug": slug,
        }
-    bullet_match = bullet_pattern.match(stripped)
+    bullet_match = bullet_pattern.fullmatch(stripped)
    if bullet_match:
-        payload = _strip_markup(bullet_match.group(1).strip())
+        payload = _strip_markup(bullet_match.group("title").strip())
        title_info = _split_number(payload)
        slug = _build_slug(title_info["number"], title_info["title"])
        is_section = indent <= 1
@@ -160,7 +188,7 @@ def _classify_line(stripped: str, indent: int) -> Optional[dict]:
        }
    # 兼容“1.1 ...”没有前缀符号的行
-    number_match = number_pattern.match(stripped)
+    number_match = number_pattern.fullmatch(stripped)
    if number_match and number_match.group("label"):
        payload = stripped
        title = number_match.group("label").strip()
@@ -201,7 +229,7 @@ def _split_number(payload: str) -> dict:
    返回:
        dict: 包含 number/title/display。
    """
-    match = number_pattern.match(payload)
+    match = number_pattern.fullmatch(payload)
    number = match.group("num") if match else ""
    label = match.group("label") if match else payload
    label = (label or "").strip()