Fix the logic for cleaning data returned by the LLM

2025-11-17 22:10:37 +08:00
parent b31be56297
commit bf1e2bfa9c
1 changed file with 19 additions and 14 deletions
@@ -51,12 +51,12 @@ class RobustJSONParser:
    # 常见的LLM思考内容模式
    _THINKING_PATTERNS = [
-        r"<thinking>.*?</thinking>",
+        r"^\s*<thinking>.*?</thinking>\s*",
-        r"<thought>.*?</thought>",
+        r"^\s*<thought>.*?</thought>\s*",
-        r"让我想想.*?(?=\{|\[|$)",
+        r"^\s*让我想想.*?(?=\{|\[|$)",
-        r"首先.*?(?=\{|\[|$)",
+        r"^\s*首先.*?(?=\{|\[|$)",
-        r"分析.*?(?=\{|\[|$)",
+        r"^\s*分析.*?(?=\{|\[|$)",
-        r"根据.*?(?=\{|\[|$)",
+        r"^\s*根据.*?(?=\{|\[|$)",
    ]
    # 冒号等号模式（LLM常见错误）
@@ -182,16 +182,21 @@ class RobustJSONParser:
        for pattern in self._THINKING_PATTERNS:
            cleaned = re.sub(pattern, "", cleaned, flags=re.DOTALL | re.IGNORECASE)
-        # 移除markdown代码块标记
+        # 优先提取任意位置的```json```包裹内容
-        if cleaned.startswith("```json"):
+        fenced_match = re.search(r"```(?:json)?\s*([\s\S]*?)\s*```", cleaned)
-            cleaned = cleaned[7:]
+        if fenced_match:
-        elif cleaned.startswith("```"):
+            cleaned = fenced_match.group(1).strip()
-            cleaned = cleaned[3:]
+        else:
            # 如果没有找到完整代码块，再尝试移除前后缀
            if cleaned.startswith("```json"):
                cleaned = cleaned[7:]
            elif cleaned.startswith("```"):
                cleaned = cleaned[3:]
-        if cleaned.endswith("```"):
+            if cleaned.endswith("```"):
-            cleaned = cleaned[:-3]
+                cleaned = cleaned[:-3]
-        cleaned = cleaned.strip()
+            cleaned = cleaned.strip()
        # 尝试提取第一个完整的JSON对象或数组
        cleaned = self._extract_first_json_structure(cleaned)