import os import json import pandas as pd import anthropic from datetime import datetime import time class AutoReview: def __init__(self): self.api_key = "sk-cp-ayedGY_WYs9N0n2hYlAhbYYAYodr7ym7a1y8DgdyCcgx439ONVJzIgZmaR7JmB5bh4iA5ZiLlFy6dOLpHSLtmG8G5WH4EKLDLZXM9gbwAupxZUuqIAUnUEk" self.client = anthropic.Anthropic( api_key=self.api_key, base_url="https://api.minimaxi.com/anthropic", timeout=30.0 ) self.test_cases = { "如何创建一人多车?": "会员营销-客户车辆-客户信息里面搜索对应车主的手机号码,点击操作列的修改按钮,车辆信息下方可添加车辆。", "卡开重了,如何撤销?": "开卡单未结算的,会员营销-卡券积分-卡单据,找到对应的开卡单号,操作列做删除;开卡单已结算的,联系财务处理。" } def match_question(self, user_query, threshold=0.7): from difflib import SequenceMatcher best_match = None best_score = 0 for std_question in self.test_cases.keys(): score = SequenceMatcher(None, user_query, std_question).ratio() if score > best_score and score >= threshold: best_score = score best_match = std_question return best_match, best_score def evaluate_consistency(self, generated_answer, standard_answer): system_prompt = """ 你是一个专业的答案一致性评审助手。请按照以下标准评判生成答案与标准答案的一致性: 一致性评分标准: - 10分:完全一致,内容、逻辑、步骤完全相同 - 8-9分:基本一致,核心内容相同,表述略有差异 - 6-7分:部分一致,核心思路相同,但有遗漏或错误步骤 - 4-5分:不太一致,只有部分内容相关 - 0-3分:不一致,内容无关或错误 请输出JSON格式,包含: - score: 0-10的整数分数 - confidence: 0-100的整数置信度 - reason: 简短的评审理由(不超过100字) """ user_prompt = f""" 【生成答案】 {generated_answer} 【标准答案】 {standard_answer} 请根据上述标准进行评审,输出JSON格式结果。 """ try: message = self.client.messages.create( model="MiniMax-M2.7", max_tokens=500, system=system_prompt, messages=[ { "role": "user", "content": [ { "type": "text", "text": user_prompt } ] } ] ) response_text = "" for block in message.content: if block.type == "text": response_text += block.text response_text = response_text.strip() if response_text.startswith("```json"): response_text = response_text[7:] if response_text.endswith("```"): response_text = response_text[:-3] response_text = response_text.strip() try: result = json.loads(response_text) return result except json.JSONDecodeError: return { "score": 0, "confidence": 50, "reason": f"解析失败: {response_text[:50]}" } except Exception as e: return { "score": 0, "confidence": 0, "reason": f"API调用失败: {str(e)[:50]}" } def run_review(self, log_data): results = [] for idx, record in enumerate(log_data): user_query = record.get('user_query', '').strip() key_outputs = record.get('key_outputs', {}) print(f"\n处理第 {idx+1} 条记录: {user_query}") matched_question, match_score = self.match_question(user_query) if matched_question and match_score >= 0.7: print(f"匹配到测试用例: {matched_question} (匹配度: {match_score:.2f})") standard_answer = self.test_cases[matched_question] for node_name, generated_answer in key_outputs.items(): if generated_answer.strip() and node_name != '单次反思': print(f" 评审节点: {node_name}") evaluation = self.evaluate_consistency(generated_answer, standard_answer) results.append({ "序号": idx + 1, "用户问题": user_query, "匹配问题": matched_question, "匹配度": f"{match_score:.2f}", "节点名称": node_name, "生成答案": generated_answer[:100] + "..." if len(generated_answer) > 100 else generated_answer, "标准答案": standard_answer[:100] + "..." if len(standard_answer) > 100 else standard_answer, "一致性评分": evaluation["score"], "置信度": evaluation["confidence"], "评审理由": evaluation["reason"] }) print(f" 评分: {evaluation['score']}, 置信度: {evaluation['confidence']}%, 理由: {evaluation['reason']}") else: print(f"未匹配到测试用例 (匹配度: {match_score:.2f})") return results def export_results(self, results): if not results: print("没有评审结果可导出") return df = pd.DataFrame(results) timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") output_path = f"自动评审结果_{timestamp}.xlsx" df.to_excel(output_path, index=False) print(f"\n评审结果已导出到: {output_path}") return output_path if __name__ == "__main__": print("=== 自动评审系统 ===") reviewer = AutoReview() print(f"已加载 {len(reviewer.test_cases)} 条测试用例") log_data = [ { 'user_query': '如何创建一人多车?', 'key_outputs': { '大模型': '会员营销-客户车辆-客户信息里面搜索对应车主的手机号码,点击操作列的修改按钮,车辆信息下方可添加车辆。', '单次反思': '步骤清晰,确认操作路径正确', '大模型二次生成': '在会员营销模块中找到客户车辆管理,搜索车主手机号后点击修改,在车辆信息区域添加新车辆即可。' } } ] print(f"\n开始评审 {len(log_data)} 条记录...") results = reviewer.run_review(log_data) reviewer.export_results(results) if results: scores = [r["一致性评分"] for r in results if isinstance(r["一致性评分"], int)] if scores: avg_score = sum(scores) / len(scores) print(f"\n平均一致性评分: {avg_score:.2f}") print("\n评审完成!")