diff --git a/README-CN.md b/README-CN.md index 0d029fc..420654a 100644 --- a/README-CN.md +++ b/README-CN.md @@ -42,6 +42,7 @@ - [MySQL](https://www.mysql.com/) 数据库 - [Conda](https://docs.conda.io/en/latest/)(可选,用于环境管理) - 合法的微博账号(用于数据采集) +- OpenAI API密钥或Anthropic(Claude)API密钥(用于AI分析功能) ### 安装步骤 @@ -68,7 +69,20 @@ - 运行 `createTables.sql` 创建所需的数据库表。 - 修改 `config.py` 中的数据库连接配置,确保与您的MySQL设置匹配。 -4. 启动Flask应用: +4. 配置AI分析功能(可选): + + 设置AI分析功能所需的环境变量: + ```bash + # OpenAI API配置(使用GPT模型必需) + export OPENAI_API_KEY="你的openai密钥" + + # Anthropic API配置(使用Claude模型必需) + export ANTHROPIC_API_KEY="你的anthropic密钥" + ``` + + 注意:至少需要配置一个API密钥才能使用AI分析功能。 + +5. 启动Flask应用: ```bash python app.py @@ -90,6 +104,8 @@ - **[Matplotlib](https://matplotlib.org/)** - 数据可视化库。 - **[Scikit-learn](https://scikit-learn.org/)** - 机器学习库,用于模型训练和评估。 - **[TensorFlow](https://www.tensorflow.org/)** 或 **[PyTorch](https://pytorch.org/)** - 深度学习框架,用于高级模型开发。 +- **[OpenAI GPT](https://openai.com/)** - 先进的语言模型,用于文本分析。 +- **[Anthropic Claude](https://www.anthropic.com/)** - 智能AI模型,用于复杂文本分析。 ## 🤝 贡献 diff --git a/README.md b/README.md index 1a771dd..92fe2e3 100644 --- a/README.md +++ b/README.md @@ -40,6 +40,7 @@ Follow the steps below to run the project on your system. - [MySQL](https://www.mysql.com/) Database - [Conda](https://docs.conda.io/en/latest/) (optional, for environment management) - A valid Weibo account (for data collection) +- OpenAI API key or Anthropic (Claude) API key for AI analysis features ### Installation Steps @@ -66,13 +67,26 @@ Follow the steps below to run the project on your system. - Run `createTables.sql` to create the necessary database tables. - Modify the database connection settings in `config.py` to match your MySQL configuration. -5. Start the Flask application: +5. 
Configure AI Analysis (Optional): + + Set up environment variables for AI analysis features: + ```bash + # For OpenAI API (Required for GPT models) + export OPENAI_API_KEY="your-openai-key" + + # For Anthropic API (Required for Claude models) + export ANTHROPIC_API_KEY="your-anthropic-key" + ``` + + Note: At least one API key must be configured to use AI analysis features. + +6. Start the Flask application: ```bash python app.py ``` -6. Access the application: Open your browser and navigate to http://localhost:5000 to use the system. +7. Access the application: Open your browser and navigate to http://localhost:5000 to use the system. ## 🛠️ Technology Stack @@ -88,6 +102,8 @@ The Weibo Public Opinion Analysis and Prediction System employs a range of moder - **[Matplotlib](https://matplotlib.org/)** - A data visualization library. - **[Scikit-learn](https://scikit-learn.org/)** - A machine learning library used for model training and evaluation. - **[TensorFlow](https://www.tensorflow.org/)** or **[PyTorch](https://pytorch.org/)** - Deep learning frameworks used for advanced model development. +- **[OpenAI GPT](https://openai.com/)** - Advanced language models for text analysis. +- **[Anthropic Claude](https://www.anthropic.com/)** - AI models for sophisticated text analysis. 
## 🤝 Contribution diff --git a/utils/ai_analyzer.py b/utils/ai_analyzer.py index f5cb0ac..083209c 100644 --- a/utils/ai_analyzer.py +++ b/utils/ai_analyzer.py @@ -1,4 +1,5 @@ import openai +import anthropic import json from typing import List, Dict import os @@ -8,11 +9,34 @@ from utils.logger import app_logger as logging class AIAnalyzer: def __init__(self): # 从环境变量获取API密钥 - self.api_key = os.getenv('OPENAI_API_KEY') - if not self.api_key: - raise ValueError("请设置OPENAI_API_KEY环境变量") + self.openai_key = os.getenv('OPENAI_API_KEY') + self.claude_key = os.getenv('ANTHROPIC_API_KEY') - openai.api_key = self.api_key + if not self.openai_key and not self.claude_key: + raise ValueError("请至少设置一个API密钥 (OPENAI_API_KEY 或 ANTHROPIC_API_KEY)") + + if self.openai_key: + openai.api_key = self.openai_key + if self.claude_key: + self.claude_client = anthropic.Anthropic(api_key=self.claude_key) + + # 支持的模型列表 + self.supported_models = { + # OpenAI 模型 + 'gpt-3.5-turbo': {'provider': 'openai', 'max_tokens': 2000, 'cost_per_1k': 0.0015}, + 'gpt-3.5-turbo-16k': {'provider': 'openai', 'max_tokens': 16000, 'cost_per_1k': 0.003}, + 'gpt-4': {'provider': 'openai', 'max_tokens': 8000, 'cost_per_1k': 0.03}, + 'gpt-4-32k': {'provider': 'openai', 'max_tokens': 32000, 'cost_per_1k': 0.06}, + 'gpt-4-turbo-preview': {'provider': 'openai', 'max_tokens': 128000, 'cost_per_1k': 0.01}, + + # Claude 模型 + 'claude-3-opus-20240229': {'provider': 'anthropic', 'max_tokens': 4000, 'cost_per_1k': 0.015}, + 'claude-3-sonnet-20240229': {'provider': 'anthropic', 'max_tokens': 3000, 'cost_per_1k': 0.003}, + 'claude-3-haiku-20240307': {'provider': 'anthropic', 'max_tokens': 2000, 'cost_per_1k': 0.0025}, + 'claude-2.1': {'provider': 'anthropic', 'max_tokens': 100000, 'cost_per_1k': 0.008}, + 'claude-2.0': {'provider': 'anthropic', 'max_tokens': 100000, 'cost_per_1k': 0.008}, + 'claude-instant-1.2': {'provider': 'anthropic', 'max_tokens': 100000, 'cost_per_1k': 0.0015} + } # 不同深度的分析提示词 self.prompt_templates = { @@ 
-73,48 +97,144 @@ class AIAnalyzer: analysis_depth: str = "standard") -> List[Dict]: """分析一批消息并返回分析结果""" try: + if model_type not in self.supported_models: + raise ValueError(f"不支持的模型类型: {model_type}") + + model_info = self.supported_models[model_type] + provider = model_info['provider'] + max_tokens = model_info['max_tokens'] + + # 根据模型类型调整批处理大小 + adjusted_batch_size = min(batch_size, self._get_optimal_batch_size(model_type)) + if adjusted_batch_size != batch_size: + logging.info(f"已将批处理大小从 {batch_size} 调整为 {adjusted_batch_size}") + all_results = [] + total_cost = 0 # 分批处理消息 - for i in range(0, len(messages), batch_size): - batch = messages[i:i + batch_size] + for i in range(0, len(messages), adjusted_batch_size): + batch = messages[i:i + adjusted_batch_size] formatted_messages = [] for msg in batch: formatted_messages.append(f"消息ID: {msg['id']}\n内容: {msg['content']}") messages_text = "\n---\n".join(formatted_messages) - - # 获取对应深度的提示词 system_prompt = self.prompt_templates.get(analysis_depth, self.prompt_templates['standard']) - # 调用OpenAI API - response = await openai.ChatCompletion.acreate( - model=model_type, - messages=[ - {"role": "system", "content": system_prompt}, - {"role": "user", "content": f"请分析以下消息:\n{messages_text}"} - ], - temperature=0.3, # 降低随机性 - max_tokens=2000 if analysis_depth != 'deep' else 3000, - n=1 - ) - - try: - result = json.loads(response.choices[0].message.content) - if isinstance(result, dict) and 'analysis_results' in result: - all_results.extend(result['analysis_results']) - else: - logging.error(f"API返回格式不正确: {response.choices[0].message.content}") - except json.JSONDecodeError as e: - logging.error(f"JSON解析失败: {e}") - continue + if provider == 'openai': + result = await self._analyze_with_openai( + messages_text, + system_prompt, + model_type, + max_tokens + ) + else: # anthropic + result = await self._analyze_with_claude( + messages_text, + system_prompt, + model_type, + max_tokens + ) + if result: + all_results.extend(result) + 
# Helper methods of AIAnalyzer, re-expanded from the collapsed patch hunk.
# Each one takes ``self`` and reads ``self.supported_models``; the Claude
# helper additionally reads ``self.claude_client``.


def _get_optimal_batch_size(self, model_type: str) -> int:
    """Return how many messages to send per request for ``model_type``.

    The size is derived from the model's ``max_tokens`` entry in
    ``self.supported_models``: 80% of that budget is divided by an assumed
    average of 200 tokens per message, and the result is kept within 1..100.

    Args:
        model_type: Key into ``self.supported_models``.

    Returns:
        The batch size, always between 1 and 100 inclusive.

    Raises:
        KeyError: If ``model_type`` is not in ``self.supported_models``.
    """
    max_tokens = self.supported_models[model_type]['max_tokens']

    # Assumed average size of one message, in tokens.
    avg_tokens_per_message = 200

    # Keep 20% of the budget back for the system prompt and the response.
    available_tokens = int(max_tokens * 0.8)

    return max(1, min(100, available_tokens // avg_tokens_per_message))


def _calculate_cost(self, input_length: int, model_type: str) -> float:
    """Estimate the cost in US dollars of one request from its input length.

    Only the input text is counted; the response is not included.

    Args:
        input_length: Number of characters sent to the model.
        model_type: Key into ``self.supported_models``.

    Returns:
        Estimated tokens / 1000 multiplied by the model's ``cost_per_1k``.

    Raises:
        KeyError: If ``model_type`` is not in ``self.supported_models``.
    """
    cost_per_1k = self.supported_models[model_type]['cost_per_1k']

    # Rough heuristic: about 4 characters per token.
    # NOTE(review): this ratio is probably too optimistic for Chinese text --
    # confirm before relying on the figure for budgeting.
    estimated_tokens = input_length // 4

    return (estimated_tokens / 1000) * cost_per_1k


async def _analyze_with_openai(self, messages_text: str, system_prompt: str,
                               model: str, max_tokens: int) -> List[Dict]:
    """Analyze a batch of messages with the OpenAI chat completion API.

    Args:
        messages_text: The already formatted batch of messages.
        system_prompt: Prompt template selected for the analysis depth.
        model: OpenAI model name.
        max_tokens: Completion token limit forwarded to the API.

    Returns:
        The ``analysis_results`` list from the JSON reply, or ``[]`` when the
        call fails or the reply does not have the expected shape. Errors are
        logged, never raised.
    """
    try:
        # NOTE(review): ``response_format`` (JSON mode) is presumably accepted
        # only by newer models, and ``max_tokens`` comes from a table that
        # looks like context-window sizes -- confirm both against the models
        # in ``supported_models``.
        response = await openai.ChatCompletion.acreate(
            model=model,
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": f"请分析以下消息:\n{messages_text}"}
            ],
            temperature=0.3,
            max_tokens=max_tokens,
            n=1,
            response_format={"type": "json_object"}  # force a JSON reply
        )

        result = json.loads(response.choices[0].message.content)
        if isinstance(result, dict) and 'analysis_results' in result:
            return result['analysis_results']

        logging.error(f"OpenAI API返回格式不正确: {response.choices[0].message.content}")
        return []

    except Exception as e:
        # Best effort: one failed batch must not abort the whole analysis.
        logging.error(f"OpenAI API调用失败: {e}")
        return []


async def _analyze_with_claude(self, messages_text: str, system_prompt: str,
                               model: str, max_tokens: int) -> List[Dict]:
    """Analyze a batch of messages with the Anthropic (Claude) Messages API.

    ``self.claude_client`` may be a synchronous client, whose ``create()``
    result cannot be awaited. The request is therefore issued in a worker
    thread so the event loop is not blocked; if the client turns out to be
    asynchronous, the coroutine it returns is awaited afterwards.

    Args:
        messages_text: The already formatted batch of messages.
        system_prompt: Prompt template selected for the analysis depth.
        model: Claude model name.
        max_tokens: Requested completion limit; capped at 4096.

    Returns:
        The ``analysis_results`` list from the JSON reply, or ``[]`` when no
        client is configured, the call fails, or the reply does not have the
        expected shape. Errors are logged, never raised.
    """
    import asyncio
    import functools
    import inspect

    client = getattr(self, 'claude_client', None)
    if client is None:
        # No Anthropic key was configured, so there is nothing to call.
        logging.error("Claude API调用失败: 未配置ANTHROPIC_API_KEY")
        return []

    try:
        request = functools.partial(
            client.messages.create,
            model=model,
            # The Messages API rejects completion limits above the model's
            # output cap (4096 tokens for the Claude models in use here).
            max_tokens=min(max_tokens, 4096),
            temperature=0.3,
            system=system_prompt,
            messages=[
                {
                    "role": "user",
                    "content": f"请分析以下消息:\n{messages_text}"
                }
            ]
        )

        response = await asyncio.get_running_loop().run_in_executor(None, request)
        if inspect.isawaitable(response):
            # An async client returned a coroutine instead of a message.
            response = await response

        result = json.loads(response.content[0].text)
        if isinstance(result, dict) and 'analysis_results' in result:
            return result['analysis_results']

        logging.error(f"Claude API返回格式不正确: {response.content[0].text}")
        return []

    except Exception as e:
        # Best effort: one failed batch must not abort the whole analysis.
        logging.error(f"Claude API调用失败: {e}")
        return []