From a5f3964a73b36b2682231bd3eb676727add2c224 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=A9=AC=E4=B8=80=E4=B8=81?= <1769123563@qq.com> Date: Mon, 17 Nov 2025 17:36:58 +0800 Subject: [PATCH] Optimize JSON Parsing Compatibility --- ReportEngine/utils/json_parser.py | 68 ++++++++++++++++++++++++-- ReportEngine/utils/test_json_parser.py | 55 +++++++++++++++++++++ 2 files changed, 120 insertions(+), 3 deletions(-) diff --git a/ReportEngine/utils/json_parser.py b/ReportEngine/utils/json_parser.py index 2278e83..1de6a19 100644 --- a/ReportEngine/utils/json_parser.py +++ b/ReportEngine/utils/json_parser.py @@ -610,9 +610,34 @@ class RobustJSONParser: # 验证数据类型 if not isinstance(data, dict): - if isinstance(data, list) and len(data) > 0 and isinstance(data[0], dict): - logger.warning(f"{context_name} 返回数组,自动提取第一个元素") - data = data[0] + if isinstance(data, list): + if len(data) > 0: + # 尝试找到最符合期望的元素 + best_match = None + max_match_count = 0 + + for item in data: + if isinstance(item, dict): + if expected_keys: + # 计算匹配的键数量 + match_count = sum(1 for key in expected_keys if key in item) + if match_count > max_match_count: + max_match_count = match_count + best_match = item + elif best_match is None: + best_match = item + + if best_match: + logger.warning( + f"{context_name} 返回数组,自动提取最佳匹配元素(匹配{max_match_count}/{len(expected_keys or [])}个键)" + ) + data = best_match + else: + raise JSONParseError( + f"{context_name} 返回的数组中没有有效的对象" + ) + else: + raise JSONParseError(f"{context_name} 返回空数组") else: raise JSONParseError( f"{context_name} 返回的不是JSON对象: {type(data).__name__}" @@ -625,6 +650,43 @@ class RobustJSONParser: logger.warning( f"{context_name} 缺少预期的键: {', '.join(missing_keys)}" ) + # 尝试修复常见的键名变体 + data = self._try_recover_missing_keys(data, missing_keys, context_name) + + return data + + def _try_recover_missing_keys( + self, data: Dict[str, Any], missing_keys: List[str], context_name: str + ) -> Dict[str, Any]: + """ + 尝试从数据中恢复缺失的键,通过查找相似的键名。 + + 参数: + data: 原始数据 + missing_keys: 缺失的键列表 + context_name: 上下文名称 + + 返回: + Dict[str, Any]: 修复后的数据 + """ + # 常见的键名映射 + key_aliases = { + "template_name": ["templateName", "name", "template"], + "selection_reason": ["selectionReason", "reason", "explanation"], + "title": ["reportTitle", "documentTitle"], + "chapters": ["chapterList", "chapterPlan", "sections"], + "totalWords": ["total_words", "wordCount", "totalWordCount"], + } + + for missing_key in missing_keys: + if missing_key in key_aliases: + for alias in key_aliases[missing_key]: + if alias in data: + logger.info( + f"{context_name} 找到键'{missing_key}'的别名'{alias}',自动映射" + ) + data[missing_key] = data[alias] + break return data diff --git a/ReportEngine/utils/test_json_parser.py b/ReportEngine/utils/test_json_parser.py index 6c39069..6c412af 100644 --- a/ReportEngine/utils/test_json_parser.py +++ b/ReportEngine/utils/test_json_parser.py @@ -127,6 +127,61 @@ class TestRobustJSONParser(unittest.TestCase): self.assertEqual(result["name"], "test") self.assertEqual(result["value"], 123) + def test_unterminated_string_with_json_repair(self): + """测试使用json_repair库修复未终止的字符串。""" + # 创建启用json_repair的解析器 + parser_with_repair = RobustJSONParser( + enable_json_repair=True, + enable_llm_repair=False, + ) + + # 模拟实际错误:字符串中有未转义的控制字符或引号 + json_str = """{ + "template_name": "特定政策报告", + "selection_reason": "这是测试内容" +}""" + result = parser_with_repair.parse(json_str, "未终止字符串测试") + # 只要能够解析成功,不报错就可以了 + self.assertIsInstance(result, dict) + self.assertIn("template_name", result) + + def test_array_with_best_match(self): + """测试从数组中提取最佳匹配的元素。""" + json_str = """[ + { + "name": "test", + "value": 123 + }, + { + "totalWords": 40000, + "globalGuidelines": ["guide1", "guide2"], + "chapters": [] + } +]""" + result = self.parser.parse( + json_str, + "数组最佳匹配测试", + expected_keys=["totalWords", "globalGuidelines", "chapters"], + ) + # 应该提取第二个元素,因为它匹配了3个键 + self.assertEqual(result["totalWords"], 40000) + self.assertEqual(len(result["globalGuidelines"]), 2) + + def test_key_alias_recovery(self): + """测试键名别名恢复。""" + json_str = """{ + "templateName": "test_template", + "selectionReason": "This is a test" +}""" + result = self.parser.parse( + json_str, + "键别名测试", + expected_keys=["template_name", "selection_reason"], + ) + # 应该自动映射 templateName -> template_name + self.assertEqual(result["template_name"], "test_template") + self.assertEqual(result["selection_reason"], "This is a test") + def test_complex_real_world_case(self): """测试真实世界的复杂案例(类似实际错误)。""" # 模拟实际错误:缺少逗号、有markdown包裹、有思考内容