# Files
# F6--/张阳脚本/udesk/自动评审_simple.py
# T
# 2026-06-02 15:08:26 +08:00
#
# 185 lines
# 7.3 KiB
# Python

import os
import json
import pandas as pd
import anthropic
from datetime import datetime
import time
class AutoReview:
    """Score generated answers against reference answers using an LLM judge.

    A user query is fuzzy-matched against the keys of ``test_cases``; when a
    match is found, each generated answer in the record is sent to the model
    together with the reference answer, and the model's JSON verdict is
    collected into a row suitable for export to Excel.
    """

    # Endpoint, model and request limits used for the judge call.
    BASE_URL = "https://api.minimaxi.com/anthropic"
    MODEL = "MiniMax-M2.7"
    REQUEST_TIMEOUT = 30.0
    MAX_TOKENS = 500

    # Environment variables consulted, in order, when no key is passed in.
    API_KEY_ENV_VARS = ("MINIMAX_API_KEY", "ANTHROPIC_API_KEY")

    # Minimum similarity ratio for a query to count as a known test case.
    MATCH_THRESHOLD = 0.7
    # Output node that run_review never sends to the judge.
    SKIPPED_NODE = "单次反思"
    # Maximum number of characters of an answer kept in the exported row.
    PREVIEW_LENGTH = 100

    SYSTEM_PROMPT = (
        "\n"
        "你是一个专业的答案一致性评审助手。请按照以下标准评判生成答案与标准答案的一致性:\n"
        "一致性评分标准:\n"
        "- 10分:完全一致,内容、逻辑、步骤完全相同\n"
        "- 8-9分:基本一致,核心内容相同,表述略有差异\n"
        "- 6-7分:部分一致,核心思路相同,但有遗漏或错误步骤\n"
        "- 4-5分:不太一致,只有部分内容相关\n"
        "- 0-3分:不一致,内容无关或错误\n"
        "请输出JSON格式,包含:\n"
        "- score: 0-10的整数分数\n"
        "- confidence: 0-100的整数置信度\n"
        "- reason: 简短的评审理由(不超过100字)\n"
    )

    def __init__(self, api_key=None):
        """Create the API client and load the built-in test cases.

        Args:
            api_key: Key for the judge endpoint. When omitted, the first
                non-empty variable in ``API_KEY_ENV_VARS`` is used.

        Raises:
            ValueError: If no key is passed and none is set in the environment.
        """
        # The key is taken from the caller or the environment instead of being
        # embedded in the source, so it cannot leak through the repository.
        self.api_key = api_key or self._api_key_from_env()
        if not self.api_key:
            raise ValueError(
                "No API key configured: pass api_key=... or set one of "
                + ", ".join(self.API_KEY_ENV_VARS)
            )
        self.client = anthropic.Anthropic(
            api_key=self.api_key,
            base_url=self.BASE_URL,
            timeout=self.REQUEST_TIMEOUT,
        )
        # Maps a standard question to its reference answer.
        self.test_cases = {
            "如何创建一人多车?": "会员营销-客户车辆-客户信息里面搜索对应车主的手机号码,点击操作列的修改按钮,车辆信息下方可添加车辆。",
            "卡开重了,如何撤销?": "开卡单未结算的,会员营销-卡券积分-卡单据,找到对应的开卡单号,操作列做删除;开卡单已结算的,联系财务处理。",
        }

    @classmethod
    def _api_key_from_env(cls):
        """Return the first non-empty key found in ``API_KEY_ENV_VARS``, else None."""
        for name in cls.API_KEY_ENV_VARS:
            value = os.environ.get(name)
            if value:
                return value
        return None

    def match_question(self, user_query, threshold=0.7):
        """Find the standard question most similar to ``user_query``.

        Args:
            user_query: The question text to look up.
            threshold: Minimum ``SequenceMatcher`` ratio for a match.

        Returns:
            ``(question, score)`` for the best match at or above the
            threshold, or ``(None, 0)`` when nothing qualifies.
        """
        from difflib import SequenceMatcher

        best_match = None
        best_score = 0
        for std_question in self.test_cases:
            score = SequenceMatcher(None, user_query, std_question).ratio()
            # Strict ">" keeps the first question when two scores tie.
            if score >= threshold and score > best_score:
                best_match, best_score = std_question, score
        return best_match, best_score

    @staticmethod
    def _coerce_int(value, low, high):
        """Convert ``value`` to an int clamped to ``[low, high]``.

        Numeric strings and floats are accepted (fractions are truncated);
        anything that cannot be converted yields ``low``.
        """
        try:
            number = int(float(value))
        except (TypeError, ValueError, OverflowError):
            return low
        return max(low, min(high, number))

    @classmethod
    def _parse_evaluation(cls, response_text):
        """Turn the model's raw reply into a normalized verdict dict.

        The first JSON object found in the reply is decoded, so Markdown code
        fences or prose around the JSON do not break parsing.

        Returns:
            A dict that always contains an int ``score`` (0-10), an int
            ``confidence`` (0-100) and a string ``reason``. Any extra keys the
            model returned are kept. When no JSON object can be decoded the
            result is ``score`` 0, ``confidence`` 50 and a reason quoting the
            start of the reply.
        """
        text = response_text.strip()
        failure = {
            "score": 0,
            "confidence": 50,
            "reason": f"解析失败: {text[:50]}",
        }
        start = text.find("{")
        if start == -1:
            return failure
        try:
            # raw_decode stops at the end of the object and ignores trailing text.
            parsed, _ = json.JSONDecoder().raw_decode(text, start)
        except json.JSONDecodeError:
            return failure

        result = dict(parsed)
        # Callers index these keys directly and average only int scores, so
        # make sure they exist and have the expected types.
        result["score"] = cls._coerce_int(parsed.get("score"), 0, 10)
        result["confidence"] = cls._coerce_int(parsed.get("confidence"), 0, 100)
        reason = parsed.get("reason")
        result["reason"] = "" if reason is None else str(reason)
        return result

    def evaluate_consistency(self, generated_answer, standard_answer):
        """Ask the model how consistent a generated answer is with the reference.

        Args:
            generated_answer: Answer text produced by the system under review.
            standard_answer: Reference answer to compare against.

        Returns:
            A dict with ``score``, ``confidence`` and ``reason``. A failed API
            call yields ``score`` 0 / ``confidence`` 0; an unparseable reply
            yields ``score`` 0 / ``confidence`` 50. This method does not raise
            for API or parsing errors.
        """
        user_prompt = (
            "\n【生成答案】\n"
            f"{generated_answer}\n"
            "【标准答案】\n"
            f"{standard_answer}\n"
            "请根据上述标准进行评审,输出JSON格式结果。\n"
        )
        try:
            message = self.client.messages.create(
                model=self.MODEL,
                max_tokens=self.MAX_TOKENS,
                system=self.SYSTEM_PROMPT,
                messages=[
                    {
                        "role": "user",
                        "content": [{"type": "text", "text": user_prompt}],
                    }
                ],
            )
            # Only text blocks carry the verdict; other block types are ignored.
            response_text = "".join(
                block.text for block in message.content if block.type == "text"
            )
        except Exception as e:
            # Boundary with the remote service: report the failure as a
            # zero-score row instead of aborting the whole review run.
            return {
                "score": 0,
                "confidence": 0,
                "reason": f"API调用失败: {str(e)[:50]}",
            }
        return self._parse_evaluation(response_text)

    @classmethod
    def _preview(cls, text):
        """Return ``text`` cut to ``PREVIEW_LENGTH`` characters plus "..." if longer."""
        if len(text) > cls.PREVIEW_LENGTH:
            return text[:cls.PREVIEW_LENGTH] + "..."
        return text

    def run_review(self, log_data):
        """Review every record whose query matches a known test case.

        Args:
            log_data: Iterable of dicts with a ``user_query`` string and a
                ``key_outputs`` dict mapping node name to generated answer.

        Returns:
            A list of row dicts, one per reviewed node. Records that match no
            test case contribute no rows.
        """
        results = []
        for idx, record in enumerate(log_data, start=1):
            # Missing or None fields are treated as empty rather than crashing.
            user_query = str(record.get('user_query') or '').strip()
            key_outputs = record.get('key_outputs') or {}
            print(f"\n处理第 {idx} 条记录: {user_query}")

            matched_question, match_score = self.match_question(
                user_query, self.MATCH_THRESHOLD
            )
            if not matched_question:
                print(f"未匹配到测试用例 (匹配度: {match_score:.2f})")
                continue

            print(f"匹配到测试用例: {matched_question} (匹配度: {match_score:.2f})")
            standard_answer = self.test_cases[matched_question]
            for node_name, generated_answer in key_outputs.items():
                if node_name == self.SKIPPED_NODE:
                    continue
                # Nothing to judge when the node produced no text.
                if not isinstance(generated_answer, str) or not generated_answer.strip():
                    continue
                print(f" 评审节点: {node_name}")
                evaluation = self.evaluate_consistency(generated_answer, standard_answer)
                results.append({
                    "序号": idx,
                    "用户问题": user_query,
                    "匹配问题": matched_question,
                    "匹配度": f"{match_score:.2f}",
                    "节点名称": node_name,
                    "生成答案": self._preview(generated_answer),
                    "标准答案": self._preview(standard_answer),
                    "一致性评分": evaluation["score"],
                    "置信度": evaluation["confidence"],
                    "评审理由": evaluation["reason"],
                })
                print(f" 评分: {evaluation['score']}, 置信度: {evaluation['confidence']}%, 理由: {evaluation['reason']}")
        return results

    def export_results(self, results):
        """Write the review rows to a timestamped ``.xlsx`` file.

        Args:
            results: Row dicts as returned by ``run_review``.

        Returns:
            The output file name, or ``None`` when ``results`` is empty and
            nothing is written.
        """
        if not results:
            print("没有评审结果可导出")
            return None
        df = pd.DataFrame(results)
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        output_path = f"自动评审结果_{timestamp}.xlsx"
        df.to_excel(output_path, index=False)
        print(f"\n评审结果已导出到: {output_path}")
        return output_path
if __name__ == "__main__":
    # Demo run: review one hard-coded log record, export the rows to Excel
    # and print the mean score of the reviewed nodes.
    sample_records = [
        {
            'user_query': '如何创建一人多车?',
            'key_outputs': {
                '大模型': '会员营销-客户车辆-客户信息里面搜索对应车主的手机号码,点击操作列的修改按钮,车辆信息下方可添加车辆。',
                '单次反思': '步骤清晰,确认操作路径正确',
                '大模型二次生成': '在会员营销模块中找到客户车辆管理,搜索车主手机号后点击修改,在车辆信息区域添加新车辆即可。',
            },
        },
    ]

    print("=== 自动评审系统 ===")
    auto_reviewer = AutoReview()
    print(f"已加载 {len(auto_reviewer.test_cases)} 条测试用例")

    print(f"\n开始评审 {len(sample_records)} 条记录...")
    review_rows = auto_reviewer.run_review(sample_records)
    auto_reviewer.export_results(review_rows)

    # Only integer scores take part in the average; other values are ignored.
    int_scores = []
    for row in review_rows:
        value = row["一致性评分"]
        if isinstance(value, int):
            int_scores.append(value)

    if int_scores:
        mean_score = sum(int_scores) / len(int_scores)
        print(f"\n平均一致性评分: {mean_score:.2f}")

    print("\n评审完成!")