# Python · 238 lines · 9.4 KiB
import os
|
|
import json
|
|
import pandas as pd
|
|
import anthropic
|
|
from datetime import datetime
|
|
import time
|
|
|
|
class AutoReview:
    """Score generated answers against reference answers with an LLM judge.

    Typical flow: ``load_test_cases`` reads reference question/answer pairs
    from an Excel sheet, ``run_review`` fuzzy-matches each logged user query
    to a reference question and asks the model (``evaluate_consistency``) to
    rate every node output against the reference answer, and
    ``export_results`` writes the resulting rows to an Excel report.
    """

    # Endpoint used when the caller does not pass ``base_url``.
    DEFAULT_BASE_URL = "https://api.minimaxi.com/anthropic"
    # Model name sent with every request; ``__init__`` may override it per instance.
    model = "MiniMax-M2.7"
    # Environment variables consulted, in order, when no ``api_key`` is passed.
    API_KEY_ENV_VARS = ("MINIMAX_API_KEY", "ANTHROPIC_API_KEY")
    # Longest answer excerpt written to the report before "..." is appended.
    PREVIEW_CHARS = 200

    # Grading rubric sent as the system prompt (runtime text, kept in Chinese).
    SYSTEM_PROMPT = (
        "\n"
        "你是一个专业的答案一致性评审助手。请按照以下标准评判生成答案与标准答案的一致性:\n"
        "\n"
        "一致性评分标准:\n"
        "- 10分:完全一致,内容、逻辑、步骤完全相同\n"
        "- 8-9分:基本一致,核心内容相同,表述略有差异\n"
        "- 6-7分:部分一致,核心思路相同,但有遗漏或错误步骤\n"
        "- 4-5分:不太一致,只有部分内容相关\n"
        "- 0-3分:不一致,内容无关或错误\n"
        "\n"
        "请输出JSON格式,包含:\n"
        "- score: 0-10的整数分数\n"
        "- confidence: 0-100的整数置信度\n"
        "- reason: 简短的评审理由(不超过100字)\n"
    )

    def __init__(self, api_key=None, base_url=DEFAULT_BASE_URL, model=None, timeout=30.0):
        """Create the API client.

        Args:
            api_key: Credential for the endpoint. When omitted, the first
                non-empty variable in ``API_KEY_ENV_VARS`` is used. The key is
                deliberately never embedded in source code.
            base_url: Anthropic-compatible endpoint URL.
            model: Model name; defaults to the class-level ``model``.
            timeout: Request timeout in seconds passed to the client.

        Raises:
            ValueError: If no API key is supplied or found in the environment.
        """
        if not api_key:
            api_key = next(
                (os.environ[name] for name in self.API_KEY_ENV_VARS if os.environ.get(name)),
                None,
            )
        if not api_key:
            raise ValueError(
                "未提供 API 密钥: 请传入 api_key 或设置环境变量 "
                + " / ".join(self.API_KEY_ENV_VARS)
            )

        self.api_key = api_key
        if model is not None:
            self.model = model
        # Initialised here so match_question/run_review work before any load.
        self.test_cases = {}
        self.client = anthropic.Anthropic(
            api_key=self.api_key,
            base_url=base_url,
            timeout=timeout,
        )

    def load_test_cases(self, excel_path):
        """Load reference question -> answer pairs from an Excel sheet.

        The sheet must contain the columns ``提问问题`` (question) and ``答案``
        (answer). Rows whose question is empty or NaN are skipped; a NaN
        answer is stored as an empty string rather than the text "nan".

        Args:
            excel_path: Path of the workbook, passed to ``pandas.read_excel``.

        Returns:
            The mapping that was stored on ``self.test_cases``.

        Raises:
            KeyError: If a required column is missing.
        """
        df = pd.read_excel(excel_path)

        missing = [col for col in ('提问问题', '答案') if col not in df.columns]
        if missing:
            raise KeyError(f"Excel 缺少必需的列: {missing}")

        self.test_cases = {}
        for _, row in df.iterrows():
            raw_question = row['提问问题']
            raw_answer = row['答案']
            # An empty cell arrives as NaN; str(NaN) would yield the key "nan".
            if pd.isna(raw_question):
                continue
            question = str(raw_question).strip()
            if not question:
                continue
            answer = "" if pd.isna(raw_answer) else str(raw_answer).strip()
            self.test_cases[question] = answer

        print(f"已加载 {len(self.test_cases)} 条测试用例")
        return self.test_cases

    def load_log_data(self, log_path):
        """Read log records from a UTF-8 JSON file.

        Args:
            log_path: Path of the JSON file.

        Returns:
            The decoded JSON content, or ``[]`` when the file does not exist.
            Malformed JSON is not handled here; ``json.load`` errors propagate.
        """
        try:
            with open(log_path, 'r', encoding='utf-8') as f:
                data = json.load(f)
        except FileNotFoundError:
            print(f"日志文件不存在: {log_path}")
            return []

        print(f"已加载 {len(data)} 条日志记录")
        return data

    def match_question(self, user_query, threshold=0.7):
        """Find the reference question most similar to ``user_query``.

        Similarity is ``difflib.SequenceMatcher(...).ratio()``; only candidates
        scoring at least ``threshold`` are considered.

        Args:
            user_query: Query text to look up.
            threshold: Minimum similarity ratio for a match.

        Returns:
            ``(question, score)`` for the best candidate, or ``(None, 0)``
            when no reference question reaches the threshold.
        """
        from difflib import SequenceMatcher

        best_match = None
        best_score = 0
        for std_question in self.test_cases:
            score = SequenceMatcher(None, user_query, std_question).ratio()
            if score >= threshold and score > best_score:
                best_match, best_score = std_question, score

        return best_match, best_score

    @staticmethod
    def _bounded_int(value, low, high, default):
        """Coerce ``value`` to an int clamped to ``[low, high]``, else ``default``."""
        try:
            number = int(round(float(value)))
        except (TypeError, ValueError, OverflowError):
            return default
        return max(low, min(high, number))

    @staticmethod
    def _parse_evaluation(response_text):
        """Turn the model's raw reply into a normalised evaluation dict.

        The reply may be wrapped in a Markdown fence or surrounded by prose,
        so only the outermost ``{...}`` span is parsed. The result always
        contains ``score`` (int 0-10), ``confidence`` (int 0-100) and
        ``reason`` (str); any extra keys the model returned are kept.
        """
        text = response_text.strip()
        start = text.find("{")
        end = text.rfind("}")
        candidate = text[start:end + 1] if start != -1 and end > start else text

        try:
            parsed = json.loads(candidate)
        except json.JSONDecodeError:
            parsed = None

        if not isinstance(parsed, dict):
            return {
                "score": 0,
                "confidence": 50,
                "reason": f"解析失败: {text[:100]}",
            }

        normalised = dict(parsed)
        normalised["score"] = AutoReview._bounded_int(parsed.get("score"), 0, 10, 0)
        normalised["confidence"] = AutoReview._bounded_int(parsed.get("confidence"), 0, 100, 0)
        reason = parsed.get("reason")
        normalised["reason"] = "" if reason is None else str(reason)
        return normalised

    def evaluate_consistency(self, generated_answer, standard_answer):
        """Ask the model to rate ``generated_answer`` against ``standard_answer``.

        Args:
            generated_answer: Answer text produced by the system under review.
            standard_answer: Reference answer text.

        Returns:
            A dict with ``score``, ``confidence`` and ``reason``. Any failure
            while calling the API yields score 0 / confidence 0 with the error
            text as the reason; an unparseable reply yields score 0 /
            confidence 50. This method never raises for those cases, so one
            bad call does not abort a whole review run.
        """
        user_prompt = (
            "\n"
            "【生成答案】\n"
            f"{generated_answer}\n"
            "\n"
            "【标准答案】\n"
            f"{standard_answer}\n"
            "\n"
            "请根据上述标准进行评审,输出JSON格式结果。\n"
        )

        try:
            print("正在调用大模型进行评审...")
            started = time.perf_counter()
            message = self.client.messages.create(
                model=self.model,
                max_tokens=500,
                system=self.SYSTEM_PROMPT,
                messages=[
                    {
                        "role": "user",
                        "content": [{"type": "text", "text": user_prompt}],
                    }
                ],
            )
            elapsed = time.perf_counter() - started
            print(f"大模型调用完成,耗时: {elapsed:.2f}秒")

            # Only text blocks carry the answer; other block types are ignored.
            response_text = "".join(
                block.text for block in message.content if block.type == "text"
            )
        except Exception as e:  # API boundary: report the failure as a result row
            return {
                "score": 0,
                "confidence": 0,
                "reason": f"API调用失败: {str(e)}",
            }

        return self._parse_evaluation(response_text)

    def _preview(self, text):
        """Return ``text`` cut to ``PREVIEW_CHARS`` characters plus "..." if longer."""
        limit = self.PREVIEW_CHARS
        return text[:limit] + "..." if len(text) > limit else text

    def run_review(self, log_data, threshold=0.7):
        """Evaluate every node output of every log record.

        Each record is expected to be a dict with ``user_query`` (text) and
        ``key_outputs`` (node name -> generated answer). Missing, ``None`` or
        non-string values are tolerated: they are coerced to text or skipped
        instead of raising.

        Args:
            log_data: Iterable of log record dicts.
            threshold: Minimum similarity passed to ``match_question``.

        Returns:
            A list of report rows (dicts keyed by the Chinese column names).
            A record with no matching reference question yields one
            placeholder row; a matched record yields one row per non-blank
            node output.
        """
        results = []

        for idx, record in enumerate(log_data, start=1):
            user_query = str(record.get('user_query') or '').strip()
            key_outputs = record.get('key_outputs')
            if not isinstance(key_outputs, dict):
                key_outputs = {}

            matched_question, match_score = self.match_question(user_query, threshold)

            if not matched_question:
                results.append({
                    "序号": idx,
                    "用户问题": user_query,
                    "匹配问题": "未匹配",
                    "匹配度": f"{match_score:.2f}",
                    "节点名称": "-",
                    "生成答案": "-",
                    "标准答案": "-",
                    "一致性评分": "-",
                    "置信度": "-",
                    "评审理由": "未找到匹配的测试用例",
                })
                continue

            standard_answer = str(self.test_cases[matched_question])

            for node_name, raw_answer in key_outputs.items():
                generated_answer = "" if raw_answer is None else str(raw_answer)
                if not generated_answer.strip():
                    continue

                evaluation = self.evaluate_consistency(generated_answer, standard_answer)

                results.append({
                    "序号": idx,
                    "用户问题": user_query,
                    "匹配问题": matched_question,
                    "匹配度": f"{match_score:.2f}",
                    "节点名称": node_name,
                    "生成答案": self._preview(generated_answer),
                    "标准答案": self._preview(standard_answer),
                    "一致性评分": evaluation["score"],
                    "置信度": evaluation["confidence"],
                    "评审理由": evaluation["reason"],
                })
                print(f"已评审第 {idx} 条记录,节点: {node_name},评分: {evaluation['score']}")

        return results

    def export_results(self, results, output_path=None):
        """Write review rows to an Excel file.

        Args:
            results: Rows as returned by ``run_review``.
            output_path: Destination path; when ``None`` a timestamped name
                ``自动评审结果_<YYYYmmdd_HHMMSS>.xlsx`` in the current working
                directory is used.

        Returns:
            The path written, or ``None`` when ``results`` is empty.
        """
        if not results:
            print("没有评审结果可导出")
            return

        df = pd.DataFrame(results)

        if output_path is None:
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            output_path = f"自动评审结果_{timestamp}.xlsx"

        df.to_excel(output_path, index=False)
        print(f"评审结果已导出到: {output_path}")
        return output_path
|
|
|
|
if __name__ == "__main__":
    # Demo run: load the reference cases, review three built-in sample
    # records, export the report and print the mean numeric score.
    reviewer = AutoReview()

    test_cases_path = r"C:\Users\hp_z66\Desktop\自动化测试.xlsx"
    # Defined for reference only; the code below does not read this file.
    log_cache_path = r"d:\Idea Project\F6+宜搭+其它(1)\张阳脚本\udesk\audit_cache.json"

    print("正在加载测试用例...")
    reviewer.load_test_cases(test_cases_path)

    print("正在加载日志数据...")
    # Every sample record carries the same three pipeline node outputs, in this order.
    node_names = ('大模型', '单次反思', '大模型二次生成')
    samples = (
        (
            '如何创建一人多车?',
            (
                '会员营销-客户车辆-客户信息里面搜索对应车主的手机号码,点击操作列的修改按钮,车辆信息下方可添加车辆。',
                '步骤清晰,确认操作路径正确',
                '在会员营销模块中找到客户车辆管理,搜索车主手机号后点击修改,在车辆信息区域添加新车辆即可。',
            ),
        ),
        (
            '卡开重了,如何撤销?',
            (
                '开卡单未结算的可以删除,已结算的需要联系管理员处理。',
                '回答不够详细,缺少具体路径',
                '会员营销-卡券积分-卡单据,找到对应的开卡单号,未结算可直接删除;已结算需联系财务处理。',
            ),
        ),
        (
            '如何修改卡可用车辆?',
            (
                '会员营销卡券积分卡管理找到对应的卡信息点击修改可用车辆选择指定车辆即可。',
                '缺少分隔符,步骤不够清晰',
                '会员营销→卡券积分→卡管理,找到对应的卡信息,点击修改,可用车辆中选择指定车辆,即可勾选具体车辆。',
            ),
        ),
    )
    log_data = [
        {'user_query': query, 'key_outputs': dict(zip(node_names, outputs))}
        for query, outputs in samples
    ]
    print(f"已加载 {len(log_data)} 条测试日志数据")

    print("开始自动评审...")
    results = reviewer.run_review(log_data)

    print("导出评审结果...")
    output_path = reviewer.export_results(results)

    print("\n评审完成!")
    # Placeholder rows for unmatched queries hold "-" and are left out of the mean.
    scores = []
    for row in results:
        value = row["一致性评分"]
        if isinstance(value, int):
            scores.append(value)
    if scores:
        avg_score = sum(scores) / len(scores)
        print(f"平均一致性评分: {avg_score:.2f}")