From bf1e2bfa9c1a18a0ddd3b3815991593e42c52835 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=A9=AC=E4=B8=80=E4=B8=81?= <1769123563@qq.com> Date: Mon, 17 Nov 2025 22:10:37 +0800 Subject: [PATCH] Repair the Logic for Cleaning Data Returned by LLM --- ReportEngine/utils/json_parser.py | 33 ++++++++++++++++++------------- 1 file changed, 19 insertions(+), 14 deletions(-) diff --git a/ReportEngine/utils/json_parser.py b/ReportEngine/utils/json_parser.py index 1de6a19..0184591 100644 --- a/ReportEngine/utils/json_parser.py +++ b/ReportEngine/utils/json_parser.py @@ -51,12 +51,12 @@ class RobustJSONParser: # 常见的LLM思考内容模式 _THINKING_PATTERNS = [ - r".*?", - r".*?", - r"让我想想.*?(?=\{|\[|$)", - r"首先.*?(?=\{|\[|$)", - r"分析.*?(?=\{|\[|$)", - r"根据.*?(?=\{|\[|$)", + r"^\s*.*?\s*", + r"^\s*.*?\s*", + r"^\s*让我想想.*?(?=\{|\[|$)", + r"^\s*首先.*?(?=\{|\[|$)", + r"^\s*分析.*?(?=\{|\[|$)", + r"^\s*根据.*?(?=\{|\[|$)", ] # 冒号等号模式(LLM常见错误) @@ -182,16 +182,21 @@ class RobustJSONParser: for pattern in self._THINKING_PATTERNS: cleaned = re.sub(pattern, "", cleaned, flags=re.DOTALL | re.IGNORECASE) - # 移除markdown代码块标记 - if cleaned.startswith("```json"): - cleaned = cleaned[7:] - elif cleaned.startswith("```"): - cleaned = cleaned[3:] + # 优先提取任意位置的```json```包裹内容 + fenced_match = re.search(r"```(?:json)?\s*([\s\S]*?)\s*```", cleaned) + if fenced_match: + cleaned = fenced_match.group(1).strip() + else: + # 如果没有找到完整代码块,再尝试移除前后缀 + if cleaned.startswith("```json"): + cleaned = cleaned[7:] + elif cleaned.startswith("```"): + cleaned = cleaned[3:] - if cleaned.endswith("```"): - cleaned = cleaned[:-3] + if cleaned.endswith("```"): + cleaned = cleaned[:-3] - cleaned = cleaned.strip() + cleaned = cleaned.strip() # 尝试提取第一个完整的JSON对象或数组 cleaned = self._extract_first_json_structure(cleaned)