# F6--/张阳脚本/udesk/自动评审_simple.py

import os
import json
import pandas as pd
import anthropic
from datetime import datetime
import time

class AutoReview:
    """Score generated answers against reference answers with an LLM judge.

    Workflow: ``run_review`` fuzzy-matches each logged user query against the
    known test cases, asks the model (through an Anthropic-compatible
    endpoint) to rate every non-empty node output against the reference
    answer, and collects one result row per node. ``export_results`` writes
    those rows to an Excel workbook.
    """

    # Environment variables consulted, in order, when no key is passed in.
    _API_KEY_ENV_VARS = ("MINIMAX_API_KEY", "ANTHROPIC_API_KEY")

    def __init__(self, api_key=None, test_cases=None):
        """Create the API client and load the reference test cases.

        Args:
            api_key: Credential for the endpoint. When omitted, the first
                non-empty variable in ``_API_KEY_ENV_VARS`` is used.
            test_cases: Optional mapping of standard question -> standard
                answer. Defaults to the two built-in cases.

        Raises:
            ValueError: If no API key is supplied and none is found in the
                environment.
        """
        # SECURITY: the credential must never live in source control, so it
        # is taken from the caller or the environment only.
        if api_key is None:
            api_key = next(
                (
                    os.environ[name]
                    for name in self._API_KEY_ENV_VARS
                    if os.environ.get(name)
                ),
                None,
            )
        if not api_key:
            raise ValueError(
                "未找到 API 密钥：请设置环境变量 MINIMAX_API_KEY 或通过 api_key 参数传入"
            )

        self.api_key = api_key
        self.client = anthropic.Anthropic(
            api_key=self.api_key,
            base_url="https://api.minimaxi.com/anthropic",
            timeout=30.0
        )

        if test_cases is not None:
            # Copy so later edits by the caller do not change this instance.
            self.test_cases = dict(test_cases)
        else:
            self.test_cases = {
                "如何创建一人多车？": "会员营销-客户车辆-客户信息里面搜索对应车主的手机号码，点击操作列的修改按钮，车辆信息下方可添加车辆。",
                "卡开重了，如何撤销？": "开卡单未结算的，会员营销-卡券积分-卡单据，找到对应的开卡单号，操作列做删除；开卡单已结算的，联系财务处理。"
            }

    def match_question(self, user_query, threshold=0.7):
        """Find the test-case question most similar to ``user_query``.

        Similarity is ``difflib.SequenceMatcher(...).ratio()``.

        Args:
            user_query: The question text to look up.
            threshold: Minimum ratio a candidate must reach to be accepted.

        Returns:
            ``(best_question, best_score)``. When no candidate reaches the
            threshold the result is ``(None, 0)``.
        """
        from difflib import SequenceMatcher

        best_match = None
        best_score = 0

        for std_question in self.test_cases:
            score = SequenceMatcher(None, user_query, std_question).ratio()
            if score > best_score and score >= threshold:
                best_score = score
                best_match = std_question

        return best_match, best_score

    @staticmethod
    def _to_int(value, default=0):
        """Coerce a model-supplied number (int, float or numeric string) to int."""
        if isinstance(value, int) and not isinstance(value, bool):
            return value
        try:
            return int(round(float(value)))
        except (TypeError, ValueError, OverflowError):
            # None, non-numeric text, NaN and infinity all land here.
            return default

    @staticmethod
    def _truncate(text, limit=100):
        """Shorten ``text`` to ``limit`` characters plus "..." for display."""
        return text[:limit] + "..." if len(text) > limit else text

    @staticmethod
    def _parse_evaluation(response_text):
        """Turn the judge's raw reply into a ``score/confidence/reason`` dict.

        Accepts bare JSON, JSON wrapped in a Markdown code fence (with or
        without a ``json`` tag), and JSON embedded in surrounding prose.
        ``score`` and ``confidence`` are coerced to ``int`` and ``reason`` to
        ``str``; any extra keys the model returned are kept. If no JSON object
        can be recovered, a ``score`` 0 / ``confidence`` 50 placeholder
        carrying the first 50 characters of the reply is returned.
        """
        text = (response_text or "").strip()
        if text.startswith("```json"):
            text = text[7:]
        elif text.startswith("```"):
            text = text[3:]
        if text.endswith("```"):
            text = text[:-3]
        text = text.strip()

        candidates = [text]
        # Fallback: the outermost {...} span, for replies that add prose
        # before or after the JSON object.
        start, end = text.find("{"), text.rfind("}")
        if 0 <= start < end:
            candidates.append(text[start:end + 1])

        for candidate in candidates:
            try:
                parsed = json.loads(candidate)
            except json.JSONDecodeError:
                continue
            if not isinstance(parsed, dict):
                continue
            normalized = dict(parsed)
            normalized["score"] = AutoReview._to_int(parsed.get("score"))
            normalized["confidence"] = AutoReview._to_int(parsed.get("confidence"))
            normalized["reason"] = str(parsed.get("reason", ""))
            return normalized

        return {
            "score": 0,
            "confidence": 50,
            "reason": f"解析失败: {text[:50]}"
        }

    def evaluate_consistency(self, generated_answer, standard_answer):
        """Ask the model to rate ``generated_answer`` against ``standard_answer``.

        Returns:
            A dict that always contains ``score`` (int), ``confidence`` (int)
            and ``reason`` (str). API errors and unparseable replies are
            reported through ``reason`` with ``score`` 0 instead of raising,
            so a single failure does not abort a batch.
        """
        system_prompt = """
你是一个专业的答案一致性评审助手。请按照以下标准评判生成答案与标准答案的一致性：

一致性评分标准：
- 10分：完全一致，内容、逻辑、步骤完全相同
- 8-9分：基本一致，核心内容相同，表述略有差异
- 6-7分：部分一致，核心思路相同，但有遗漏或错误步骤
- 4-5分：不太一致，只有部分内容相关
- 0-3分：不一致，内容无关或错误

请输出JSON格式，包含：
- score: 0-10的整数分数
- confidence: 0-100的整数置信度
- reason: 简短的评审理由（不超过100字）
"""

        user_prompt = f"""
【生成答案】
{generated_answer}

【标准答案】
{standard_answer}

请根据上述标准进行评审，输出JSON格式结果。
"""

        try:
            message = self.client.messages.create(
                model="MiniMax-M2.7",
                max_tokens=500,
                system=system_prompt,
                messages=[
                    {
                        "role": "user",
                        "content": [
                            {
                                "type": "text",
                                "text": user_prompt
                            }
                        ]
                    }
                ]
            )
            # Only text blocks carry the answer; other block types are skipped.
            response_text = "".join(
                block.text for block in message.content if block.type == "text"
            )
        except Exception as e:
            # Deliberate best-effort boundary: report the failure as a row.
            return {
                "score": 0,
                "confidence": 0,
                "reason": f"API调用失败: {str(e)[:50]}"
            }

        return self._parse_evaluation(response_text)

    def run_review(self, log_data):
        """Review every log record and return one result row per node output.

        Args:
            log_data: Iterable of dicts with a ``user_query`` string and a
                ``key_outputs`` mapping of node name -> generated answer.
                Missing, ``None`` or non-string values are tolerated and
                skipped rather than raising.

        Returns:
            A list of row dicts (Chinese column names) ready for export.
            Records whose query matches no test case produce no rows.
        """
        results = []

        for idx, record in enumerate(log_data):
            user_query = str(record.get('user_query') or '').strip()
            key_outputs = record.get('key_outputs')
            if not isinstance(key_outputs, dict):
                key_outputs = {}

            print(f"\n处理第 {idx+1} 条记录: {user_query}")

            # match_question already enforces the similarity threshold.
            matched_question, match_score = self.match_question(user_query)

            if not matched_question:
                print(f"未匹配到测试用例 (匹配度: {match_score:.2f})")
                continue

            print(f"匹配到测试用例: {matched_question} (匹配度: {match_score:.2f})")
            standard_answer = self.test_cases[matched_question]

            for node_name, generated_answer in key_outputs.items():
                # The '单次反思' node is excluded from scoring; blank or
                # non-text outputs have nothing to score.
                if node_name == '单次反思':
                    continue
                if not isinstance(generated_answer, str) or not generated_answer.strip():
                    continue

                print(f"  评审节点: {node_name}")
                evaluation = self.evaluate_consistency(generated_answer, standard_answer)
                # .get guards against a subclass/override returning a sparse dict.
                score = evaluation.get("score", 0)
                confidence = evaluation.get("confidence", 0)
                reason = evaluation.get("reason", "")

                results.append({
                    "序号": idx + 1,
                    "用户问题": user_query,
                    "匹配问题": matched_question,
                    "匹配度": f"{match_score:.2f}",
                    "节点名称": node_name,
                    "生成答案": self._truncate(generated_answer),
                    "标准答案": self._truncate(standard_answer),
                    "一致性评分": score,
                    "置信度": confidence,
                    "评审理由": reason
                })
                print(f"    评分: {score}, 置信度: {confidence}%, 理由: {reason}")

        return results

    def export_results(self, results, output_dir=None):
        """Write review rows to a timestamped ``.xlsx`` file.

        Args:
            results: Rows as produced by ``run_review``.
            output_dir: Optional directory for the workbook (created if
                missing). Defaults to the current working directory.

        Returns:
            The path of the written file, or ``None`` when ``results`` is
            empty and nothing is written.
        """
        if not results:
            print("没有评审结果可导出")
            return None

        df = pd.DataFrame(results)
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        output_path = f"自动评审结果_{timestamp}.xlsx"
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)
            output_path = os.path.join(output_dir, output_path)
        df.to_excel(output_path, index=False)
        print(f"\n评审结果已导出到: {output_path}")
        return output_path

if __name__ == "__main__":
    # Demo entry point: review one hand-written log record, export the rows,
    # and print the mean of the integer consistency scores.
    print("=== 自动评审系统 ===")
    reviewer = AutoReview()
    case_count = len(reviewer.test_cases)
    print(f"已加载 {case_count} 条测试用例")

    # One sample record: the user's question plus the output of each node.
    sample_record = {
        'user_query': '如何创建一人多车？',
        'key_outputs': {
            '大模型': '会员营销-客户车辆-客户信息里面搜索对应车主的手机号码，点击操作列的修改按钮，车辆信息下方可添加车辆。',
            '单次反思': '步骤清晰，确认操作路径正确',
            '大模型二次生成': '在会员营销模块中找到客户车辆管理，搜索车主手机号后点击修改，在车辆信息区域添加新车辆即可。'
        }
    }
    log_data = [sample_record]

    print(f"\n开始评审 {len(log_data)} 条记录...")
    results = reviewer.run_review(log_data)

    reviewer.export_results(results)

    # Only integer scores take part in the average; an empty result list
    # simply yields no scores and therefore no average line.
    int_scores = []
    for row in results:
        value = row["一致性评分"]
        if isinstance(value, int):
            int_scores.append(value)

    if int_scores:
        avg_score = sum(int_scores) / len(int_scores)
        print(f"\n平均一致性评分: {avg_score:.2f}")

    print("\n评审完成！")