From 339545f7fb0cd486453200633e76f4a6f02e8ee8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=88=92=E9=85=92=E7=9A=84=E6=9D=8E=E7=99=BD?= <670939375@qq.com> Date: Sat, 23 Aug 2025 20:19:57 +0800 Subject: [PATCH] Different types of base models adapted for each agent. --- InsightEngine/agent.py | 214 ++++++++- InsightEngine/llms/__init__.py | 3 +- InsightEngine/llms/kimi.py | 144 ++++++ InsightEngine/prompts/prompts.py | 79 ++-- InsightEngine/tools/__init__.py | 14 +- InsightEngine/tools/keyword_optimizer.py | 2 +- InsightEngine/tools/sentiment_analyzer.py | 445 ++++++++++++++++++ InsightEngine/utils/config.py | 22 +- MediaEngine/agent.py | 7 +- MediaEngine/llms/__init__.py | 3 +- MediaEngine/llms/gemini_llm.py | 95 ++++ MediaEngine/utils/config.py | 14 +- .../WeiboMultilingualSentiment/predict.py | 25 + .../insight_engine_streamlit_app.py | 37 +- SingleEngineApp/media_engine_streamlit_app.py | 15 +- config.py | 42 +- 16 files changed, 1072 insertions(+), 89 deletions(-) create mode 100644 InsightEngine/llms/kimi.py create mode 100644 InsightEngine/tools/sentiment_analyzer.py create mode 100644 MediaEngine/llms/gemini_llm.py diff --git a/InsightEngine/agent.py b/InsightEngine/agent.py index 5b9e41c..7e729cd 100644 --- a/InsightEngine/agent.py +++ b/InsightEngine/agent.py @@ -7,9 +7,9 @@ import json import os import re from datetime import datetime -from typing import Optional, Dict, Any, List +from typing import Optional, Dict, Any, List, Union -from .llms import DeepSeekLLM, OpenAILLM, BaseLLM +from .llms import DeepSeekLLM, OpenAILLM, KimiLLM, BaseLLM from .nodes import ( ReportStructureNode, FirstSearchNode, @@ -19,7 +19,7 @@ from .nodes import ( ReportFormattingNode ) from .state import State -from .tools import MediaCrawlerDB, DBResponse, keyword_optimizer +from .tools import MediaCrawlerDB, DBResponse, keyword_optimizer, multilingual_sentiment_analyzer from .utils import Config, load_config, format_search_results_for_prompt @@ -50,6 +50,9 @@ class DeepSearchAgent: # 初始化搜索工具集 self.search_agency = MediaCrawlerDB() + # 初始化情感分析器 + self.sentiment_analyzer = multilingual_sentiment_analyzer + # 初始化节点 self._initialize_nodes() @@ -62,6 +65,7 @@ class DeepSearchAgent: print(f"Deep Search Agent 已初始化") print(f"使用LLM: {self.llm_client.get_model_info()}") print(f"搜索工具集: MediaCrawlerDB (支持5种本地数据库查询工具)") + print(f"情感分析: WeiboMultilingualSentiment (支持22种语言的情感分析)") def _initialize_llm(self) -> BaseLLM: """初始化LLM客户端""" @@ -75,6 +79,11 @@ class DeepSearchAgent: api_key=self.config.openai_api_key, model_name=self.config.openai_model ) + elif self.config.default_llm_provider == "kimi": + return KimiLLM( + api_key=self.config.kimi_api_key, + model_name=self.config.kimi_model + ) else: raise ValueError(f"不支持的LLM提供商: {self.config.default_llm_provider}") @@ -113,7 +122,7 @@ class DeepSearchAgent: def execute_search_tool(self, tool_name: str, query: str, **kwargs) -> DBResponse: """ - 执行指定的数据库查询工具(集成关键词优化中间件) + 执行指定的数据库查询工具(集成关键词优化中间件和情感分析) Args: tool_name: 工具名称,可选值: @@ -122,11 +131,13 @@ class DeepSearchAgent: - "search_topic_by_date": 按日期搜索话题 - "get_comments_for_topic": 获取话题评论 - "search_topic_on_platform": 平台定向搜索 + - "analyze_sentiment": 对查询结果进行情感分析 query: 搜索关键词/话题 - **kwargs: 额外参数(如start_date, end_date, platform, limit等) + **kwargs: 额外参数(如start_date, end_date, platform, limit, enable_sentiment等) + enable_sentiment: 是否自动对搜索结果进行情感分析(默认True) Returns: - DBResponse对象 + DBResponse对象(可能包含情感分析结果) """ print(f" → 执行数据库查询工具: {tool_name}") @@ -134,7 +145,36 @@ class DeepSearchAgent: if tool_name == "search_hot_content": time_period = kwargs.get("time_period", "week") limit = kwargs.get("limit", 100) - return self.search_agency.search_hot_content(time_period=time_period, limit=limit) + response = self.search_agency.search_hot_content(time_period=time_period, limit=limit) + + # 检查是否需要进行情感分析 + enable_sentiment = kwargs.get("enable_sentiment", True) + if enable_sentiment and response.results and len(response.results) > 0: + print(f" 🎭 开始对热点内容进行情感分析...") + sentiment_analysis = self._perform_sentiment_analysis(response.results) + if sentiment_analysis: + # 将情感分析结果添加到响应的parameters中 + response.parameters["sentiment_analysis"] = sentiment_analysis + print(f" ✅ 情感分析完成") + + return response + + # 独立情感分析工具 + if tool_name == "analyze_sentiment": + texts = kwargs.get("texts", query) # 可以通过texts参数传递,或使用query + sentiment_result = self.analyze_sentiment_only(texts) + + # 构建DBResponse格式的响应 + return DBResponse( + tool_name="analyze_sentiment", + parameters={ + "texts": texts if isinstance(texts, list) else [texts], + **kwargs + }, + results=[], # 情感分析不返回搜索结果 + results_count=0, + metadata=sentiment_result + ) # 对于需要搜索词的工具,使用关键词优化中间件 optimized_response = keyword_optimizer.optimize_keywords( @@ -154,31 +194,35 @@ class DeepSearchAgent: try: if tool_name == "search_topic_globally": - limit_per_table = kwargs.get("limit_per_table", 100) + # 使用配置文件中的默认值,忽略agent提供的limit_per_table参数 + limit_per_table = self.config.default_search_topic_globally_limit_per_table response = self.search_agency.search_topic_globally(topic=keyword, limit_per_table=limit_per_table) elif tool_name == "search_topic_by_date": start_date = kwargs.get("start_date") end_date = kwargs.get("end_date") - limit_per_table = kwargs.get("limit_per_table", 100) + # 使用配置文件中的默认值,忽略agent提供的limit_per_table参数 + limit_per_table = self.config.default_search_topic_by_date_limit_per_table if not start_date or not end_date: raise ValueError("search_topic_by_date工具需要start_date和end_date参数") response = self.search_agency.search_topic_by_date(topic=keyword, start_date=start_date, end_date=end_date, limit_per_table=limit_per_table) elif tool_name == "get_comments_for_topic": - limit = kwargs.get("limit", 500) // len(optimized_response.optimized_keywords) + # 使用配置文件中的默认值,按关键词数量分配,但保证最小值 + limit = self.config.default_get_comments_for_topic_limit // len(optimized_response.optimized_keywords) limit = max(limit, 50) response = self.search_agency.get_comments_for_topic(topic=keyword, limit=limit) elif tool_name == "search_topic_on_platform": platform = kwargs.get("platform") start_date = kwargs.get("start_date") end_date = kwargs.get("end_date") - limit = kwargs.get("limit", 200) // len(optimized_response.optimized_keywords) + # 使用配置文件中的默认值,按关键词数量分配,但保证最小值 + limit = self.config.default_search_topic_on_platform_limit // len(optimized_response.optimized_keywords) limit = max(limit, 30) if not platform: raise ValueError("search_topic_on_platform工具需要platform参数") response = self.search_agency.search_topic_on_platform(platform=platform, topic=keyword, start_date=start_date, end_date=end_date, limit=limit) else: print(f" 未知的搜索工具: {tool_name},使用默认全局搜索") - response = self.search_agency.search_topic_globally(topic=keyword, limit_per_table=100) + response = self.search_agency.search_topic_globally(topic=keyword, limit_per_table=self.config.default_search_topic_globally_limit_per_table) # 收集结果 if response.results: @@ -209,6 +253,16 @@ class DeepSearchAgent: results_count=len(unique_results) ) + # 检查是否需要进行情感分析 + enable_sentiment = kwargs.get("enable_sentiment", True) + if enable_sentiment and unique_results and len(unique_results) > 0: + print(f" 🎭 开始对搜索结果进行情感分析...") + sentiment_analysis = self._perform_sentiment_analysis(unique_results) + if sentiment_analysis: + # 将情感分析结果添加到响应的parameters中 + integrated_response.parameters["sentiment_analysis"] = sentiment_analysis + print(f" ✅ 情感分析完成") + return integrated_response def _deduplicate_results(self, results: List) -> List: @@ -227,6 +281,99 @@ class DeepSearchAgent: return unique_results + def _perform_sentiment_analysis(self, results: List) -> Optional[Dict[str, Any]]: + """ + 对搜索结果执行情感分析 + + Args: + results: 搜索结果列表 + + Returns: + 情感分析结果字典,如果失败则返回None + """ + try: + # 初始化情感分析器(如果尚未初始化) + if not self.sentiment_analyzer.is_initialized: + print(" 初始化情感分析模型...") + if not self.sentiment_analyzer.initialize(): + print(" ❌ 情感分析模型初始化失败") + return None + + # 将查询结果转换为字典格式 + results_dict = [] + for result in results: + result_dict = { + "content": result.title_or_content, + "platform": result.platform, + "author": result.author_nickname, + "url": result.url, + "publish_time": str(result.publish_time) if result.publish_time else None + } + results_dict.append(result_dict) + + # 执行情感分析 + sentiment_analysis = self.sentiment_analyzer.analyze_query_results( + query_results=results_dict, + text_field="content", + min_confidence=0.5 + ) + + return sentiment_analysis.get("sentiment_analysis") + + except Exception as e: + print(f" ❌ 情感分析过程中发生错误: {str(e)}") + return None + + def analyze_sentiment_only(self, texts: Union[str, List[str]]) -> Dict[str, Any]: + """ + 独立的情感分析工具 + + Args: + texts: 单个文本或文本列表 + + Returns: + 情感分析结果 + """ + print(f" → 执行独立情感分析") + + try: + # 初始化情感分析器(如果尚未初始化) + if not self.sentiment_analyzer.is_initialized: + print(" 初始化情感分析模型...") + if not self.sentiment_analyzer.initialize(): + return { + "success": False, + "error": "情感分析模型初始化失败", + "results": [] + } + + # 执行分析 + if isinstance(texts, str): + result = self.sentiment_analyzer.analyze_single_text(texts) + return { + "success": True, + "total_analyzed": 1, + "results": [result.__dict__] + } + else: + batch_result = self.sentiment_analyzer.analyze_batch(texts, show_progress=True) + return { + "success": True, + "total_analyzed": batch_result.total_processed, + "success_count": batch_result.success_count, + "failed_count": batch_result.failed_count, + "average_confidence": batch_result.average_confidence, + "results": [result.__dict__ for result in batch_result.results] + } + + except Exception as e: + print(f" ❌ 情感分析过程中发生错误: {str(e)}") + return { + "success": False, + "error": str(e), + "results": [] + } + def research(self, query: str, save_report: bool = True) -> str: """ 执行深度研究 @@ -356,17 +503,23 @@ class DeepSearchAgent: print(f" ⚠️ search_topic_on_platform工具缺少平台参数,改用全局搜索") search_tool = "search_topic_globally" - # 处理限制参数 + # 处理限制参数,使用配置文件中的默认值而不是agent提供的参数 if search_tool == "search_hot_content": time_period = search_output.get("time_period", "week") - limit = search_output.get("limit", 100) + limit = self.config.default_search_hot_content_limit search_kwargs["time_period"] = time_period search_kwargs["limit"] = limit elif search_tool in ["search_topic_globally", "search_topic_by_date"]: - limit_per_table = search_output.get("limit_per_table", 100) + if search_tool == "search_topic_globally": + limit_per_table = self.config.default_search_topic_globally_limit_per_table + else: # search_topic_by_date + limit_per_table = self.config.default_search_topic_by_date_limit_per_table search_kwargs["limit_per_table"] = limit_per_table elif search_tool in ["get_comments_for_topic", "search_topic_on_platform"]: - limit = search_output.get("limit", 200) + if search_tool == "get_comments_for_topic": + limit = self.config.default_get_comments_for_topic_limit + else: # search_topic_on_platform + limit = self.config.default_search_topic_on_platform_limit search_kwargs["limit"] = limit search_response = self.execute_search_tool(search_tool, search_query, **search_kwargs) @@ -374,8 +527,11 @@ class DeepSearchAgent: # 转换为兼容格式 search_results = [] if search_response and search_response.results: - # 每种搜索工具都有其特定的结果数量,这里取前100个作为上限 - max_results = min(len(search_response.results), 100) + # 使用配置文件控制传递给LLM的结果数量,0表示不限制 + if self.config.max_search_results_for_llm > 0: + max_results = min(len(search_response.results), self.config.max_search_results_for_llm) + else: + max_results = len(search_response.results) # 不限制,传递所有结果 for result in search_response.results[:max_results]: search_results.append({ 'title': result.title_or_content, @@ -479,14 +635,23 @@ class DeepSearchAgent: # 处理限制参数 if search_tool == "search_hot_content": time_period = reflection_output.get("time_period", "week") - limit = reflection_output.get("limit", 10) + # 使用配置文件中的默认值,不允许agent控制limit参数 + limit = self.config.default_search_hot_content_limit search_kwargs["time_period"] = time_period search_kwargs["limit"] = limit elif search_tool in ["search_topic_globally", "search_topic_by_date"]: - limit_per_table = reflection_output.get("limit_per_table", 5) + # 使用配置文件中的默认值,不允许agent控制limit_per_table参数 + if search_tool == "search_topic_globally": + limit_per_table = self.config.default_search_topic_globally_limit_per_table + else: # search_topic_by_date + limit_per_table = self.config.default_search_topic_by_date_limit_per_table search_kwargs["limit_per_table"] = limit_per_table elif search_tool in ["get_comments_for_topic", "search_topic_on_platform"]: - limit = reflection_output.get("limit", 20) + # 使用配置文件中的默认值,不允许agent控制limit参数 + if search_tool == "get_comments_for_topic": + limit = self.config.default_get_comments_for_topic_limit + else: # search_topic_on_platform + limit = self.config.default_search_topic_on_platform_limit search_kwargs["limit"] = limit search_response = self.execute_search_tool(search_tool, search_query, **search_kwargs) @@ -494,8 +659,11 @@ class DeepSearchAgent: # 转换为兼容格式 search_results = [] if search_response and search_response.results: - # 每种搜索工具都有其特定的结果数量,这里取前100个作为上限 - max_results = min(len(search_response.results), 100) + # 使用配置文件控制传递给LLM的结果数量,0表示不限制 + if self.config.max_search_results_for_llm > 0: + max_results = min(len(search_response.results), self.config.max_search_results_for_llm) + else: + max_results = len(search_response.results) # 不限制,传递所有结果 for result in search_response.results[:max_results]: search_results.append({ 'title': result.title_or_content, diff --git a/InsightEngine/llms/__init__.py b/InsightEngine/llms/__init__.py index 2e1602a..597daca 100644 --- a/InsightEngine/llms/__init__.py +++ b/InsightEngine/llms/__init__.py @@ -6,5 +6,6 @@ LLM调用模块 from .base import BaseLLM from .deepseek import DeepSeekLLM from .openai_llm import OpenAILLM +from .kimi import KimiLLM -__all__ = ["BaseLLM", "DeepSeekLLM", "OpenAILLM"] +__all__ = ["BaseLLM", "DeepSeekLLM", "OpenAILLM", "KimiLLM"] diff --git a/InsightEngine/llms/kimi.py b/InsightEngine/llms/kimi.py new file mode 100644 index 0000000..e52a61e --- /dev/null +++ b/InsightEngine/llms/kimi.py @@ -0,0 +1,144 @@ +""" +Kimi LLM实现 +使用Moonshot AI的Kimi API进行文本生成 +""" + +import os +from typing import Optional, Dict, Any +from openai import OpenAI +# 假设 .base 模块和 BaseLLM 类已存在 +from .base import BaseLLM + + +class KimiLLM(BaseLLM): + """Kimi LLM实现类""" + + def __init__(self, api_key: Optional[str] = None, model_name: Optional[str] = None): + """ + 初始化Kimi客户端 + + Args: + api_key: Kimi API密钥,如果不提供则从环境变量读取 + model_name: 模型名称,默认使用kimi-k2-0711-preview + """ + if api_key is None: + api_key = os.getenv("KIMI_API_KEY") + if not api_key: + raise ValueError("Kimi API Key未找到!请设置KIMI_API_KEY环境变量或在初始化时提供") + + super().__init__(api_key, model_name) + + # 初始化OpenAI客户端,使用Kimi的endpoint + self.client = OpenAI( + api_key=self.api_key, + base_url="https://api.moonshot.cn/v1" + ) + + self.default_model = model_name or self.get_default_model() + + def get_default_model(self) -> str: + """获取默认模型名称""" + return "kimi-k2-0711-preview" + + def invoke(self, system_prompt: str, user_prompt: str, **kwargs) -> str: + """ + 调用Kimi API生成回复 + + Args: + system_prompt: 系统提示词 + user_prompt: 用户输入 + **kwargs: 其他参数,如temperature、max_tokens等 + + Returns: + Kimi生成的回复文本 + """ + try: + # 构建消息 + messages = [ + {"role": "system", "content": system_prompt}, + {"role": "user", "content": user_prompt} + ] + + # 智能计算max_tokens - 根据输入长度自动调整输出长度 + input_length = len(system_prompt) + len(user_prompt) + if input_length > 100000: # 超长文本 + default_max_tokens = 81920 + elif input_length > 50000: # 超长文本 + default_max_tokens = 40960 + elif input_length > 20000: # 长文本 + default_max_tokens = 16384 + elif input_length > 5000: # 中等文本 + default_max_tokens = 8192 + else: # 短文本 + default_max_tokens = 4096 + + # 设置默认参数,针对长文本处理优化 + params = { + "model": self.default_model, + "messages": messages, + "temperature": kwargs.get("temperature", 0.6), # Kimi建议使用0.6 + "max_tokens": kwargs.get("max_tokens", default_max_tokens), # 智能调整token限制 + "stream": False + } + + # 添加其他可选参数 + if "top_p" in kwargs: + params["top_p"] = kwargs["top_p"] + if "presence_penalty" in kwargs: + params["presence_penalty"] = kwargs["presence_penalty"] + if "frequency_penalty" in kwargs: + params["frequency_penalty"] = kwargs["frequency_penalty"] + if "stop" in kwargs: + params["stop"] = kwargs["stop"] + + # 输出调试信息(仅在使用Kimi时) + print(f"[Kimi] 输入长度: {input_length}, 使用max_tokens: {params['max_tokens']}") + + # 调用API + response = self.client.chat.completions.create(**params) + + # 提取回复内容 + if response.choices and response.choices[0].message: + content = response.choices[0].message.content + return self.validate_response(content) + else: + return "" + + except Exception as e: + print(f"Kimi API调用错误: {str(e)}") + raise e + + def get_model_info(self) -> Dict[str, Any]: + """ + 获取当前模型信息 + + Returns: + 模型信息字典 + """ + return { + "provider": "Kimi", + "model": self.default_model, + "api_base": "https://api.moonshot.cn/v1", + "max_context_length": "长文本支持(200K+ tokens)" + } + + # ==================== 代码修改部分 ==================== + def invoke_long_context(self, system_prompt: str, user_prompt: str, **kwargs) -> str: + """ + 专门用于长文本处理的调用方法 (作为invoke的兼容接口)。 + 此方法通过设置推荐的默认参数,然后调用通用的invoke方法来处理请求。 + + Args: + system_prompt: 系统提示词 + user_prompt: 用户输入 + **kwargs: 其他参数 + + Returns: + Kimi生成的回复文本 + """ + # 为长文本场景,设置一个慷慨的默认 max_tokens,仅当用户未指定时生效。 + # 您原有的16384是一个非常合理的值。 + kwargs.setdefault("max_tokens", 16384) + + # 直接调用核心的invoke方法,将所有参数(包括预设的默认值)传递给它。 + return self.invoke(system_prompt, user_prompt, **kwargs) \ No newline at end of file diff --git a/InsightEngine/prompts/prompts.py b/InsightEngine/prompts/prompts.py index a00196f..aa7ded4 100644 --- a/InsightEngine/prompts/prompts.py +++ b/InsightEngine/prompts/prompts.py @@ -39,8 +39,8 @@ output_schema_first_search = { "end_date": {"type": "string", "description": "结束日期,格式YYYY-MM-DD,search_topic_by_date和search_topic_on_platform工具可能需要"}, "platform": {"type": "string", "description": "平台名称,search_topic_on_platform工具必需,可选值:bilibili, weibo, douyin, kuaishou, xhs, zhihu, tieba"}, "time_period": {"type": "string", "description": "时间周期,search_hot_content工具可选,可选值:24h, week, year"}, - "limit": {"type": "integer", "description": "结果数量限制,各工具可选参数"}, - "limit_per_table": {"type": "integer", "description": "每表结果数量限制,search_topic_globally和search_topic_by_date工具可选"} + "enable_sentiment": {"type": "boolean", "description": "是否启用自动情感分析,默认为true,适用于除analyze_sentiment外的所有搜索工具"}, + "texts": {"type": "array", "items": {"type": "string"}, "description": "文本列表,仅用于analyze_sentiment工具"} }, "required": ["search_query", "search_tool", "reasoning"] } @@ -88,8 +88,8 @@ output_schema_reflection = { "end_date": {"type": "string", "description": "结束日期,格式YYYY-MM-DD,search_topic_by_date和search_topic_on_platform工具可能需要"}, "platform": {"type": "string", "description": "平台名称,search_topic_on_platform工具必需,可选值:bilibili, weibo, douyin, kuaishou, xhs, zhihu, tieba"}, "time_period": {"type": "string", "description": "时间周期,search_hot_content工具可选,可选值:24h, week, year"}, - "limit": {"type": "integer", "description": "结果数量限制,各工具可选参数"}, - "limit_per_table": {"type": "integer", "description": "每表结果数量限制,search_topic_globally和search_topic_by_date工具可选"} + "enable_sentiment": {"type": "boolean", "description": "是否启用自动情感分析,默认为true,适用于除analyze_sentiment外的所有搜索工具"}, + "texts": {"type": "array", "items": {"type": "string"}, "description": "文本列表,仅用于analyze_sentiment工具"} }, "required": ["search_query", "search_tool", "reasoning"] } @@ -155,34 +155,40 @@ SYSTEM_PROMPT_FIRST_SEARCH = f""" {json.dumps(input_schema_first_search, indent=2, ensure_ascii=False)} -你可以使用以下5种专业的本地舆情数据库查询工具来挖掘真实的民意和公众观点: +你可以使用以下6种专业的本地舆情数据库查询工具来挖掘真实的民意和公众观点: 1. **search_hot_content** - 查找热点内容工具 - 适用于:挖掘当前最受关注的舆情事件和话题 - - 特点:基于真实的点赞、评论、分享数据发现热门话题 - - 参数:time_period ('24h', 'week', 'year'),limit(数量限制) + - 特点:基于真实的点赞、评论、分享数据发现热门话题,自动进行情感分析 + - 参数:time_period ('24h', 'week', 'year'),limit(数量限制),enable_sentiment(是否启用情感分析,默认True) 2. **search_topic_globally** - 全局话题搜索工具 - 适用于:全面了解公众对特定话题的讨论和观点 - - 特点:覆盖B站、微博、抖音、快手、小红书、知乎、贴吧等主流平台的真实用户声音 - - 参数:limit_per_table(每个表的结果数量限制) + - 特点:覆盖B站、微博、抖音、快手、小红书、知乎、贴吧等主流平台的真实用户声音,自动进行情感分析 + - 参数:limit_per_table(每个表的结果数量限制),enable_sentiment(是否启用情感分析,默认True) 3. **search_topic_by_date** - 按日期搜索话题工具 - 适用于:追踪舆情事件的时间线发展和公众情绪变化 - - 特点:精确的时间范围控制,适合分析舆情演变过程 + - 特点:精确的时间范围控制,适合分析舆情演变过程,自动进行情感分析 - 特殊要求:需要提供start_date和end_date参数,格式为'YYYY-MM-DD' - - 参数:limit_per_table(每个表的结果数量限制) + - 参数:limit_per_table(每个表的结果数量限制),enable_sentiment(是否启用情感分析,默认True) 4. **get_comments_for_topic** - 获取话题评论工具 - 适用于:深度挖掘网民的真实态度、情感和观点 - - 特点:直接获取用户评论,了解民意走向和情感倾向 - - 参数:limit(评论总数量限制) + - 特点:直接获取用户评论,了解民意走向和情感倾向,自动进行情感分析 + - 参数:limit(评论总数量限制),enable_sentiment(是否启用情感分析,默认True) 5. **search_topic_on_platform** - 平台定向搜索工具 - 适用于:分析特定社交平台用户群体的观点特征 - - 特点:针对不同平台用户群体的观点差异进行精准分析 + - 特点:针对不同平台用户群体的观点差异进行精准分析,自动进行情感分析 - 特殊要求:需要提供platform参数,可选start_date和end_date - - 参数:platform(必须),start_date, end_date(可选),limit(数量限制) + - 参数:platform(必须),start_date, end_date(可选),limit(数量限制),enable_sentiment(是否启用情感分析,默认True) + +6. **analyze_sentiment** - 多语言情感分析工具 + - 适用于:对文本内容进行专门的情感倾向分析 + - 特点:支持中文、英文、西班牙文、阿拉伯文、日文、韩文等22种语言的情感分析,输出5级情感等级(非常负面、负面、中性、正面、非常正面) + - 参数:texts(文本或文本列表),query也可用作单个文本输入 + - 用途:当搜索结果的情感倾向不明确或需要专门的情感分析时使用 **你的核心使命:挖掘真实的民意和人情味** @@ -195,11 +201,16 @@ SYSTEM_PROMPT_FIRST_SEARCH = f""" - **贴近生活语言**:用简单、直接、口语化的词汇 - **包含情感词汇**:网民常用的褒贬词、情绪词 - **考虑话题热词**:相关的网络流行语、缩写、昵称 -4. **参数优化配置**: +4. **情感分析策略选择**: + - **自动情感分析**:默认启用(enable_sentiment: true),适用于搜索工具,能自动分析搜索结果的情感倾向 + - **专门情感分析**:当需要对特定文本进行详细情感分析时,使用analyze_sentiment工具 + - **关闭情感分析**:在某些特殊情况下(如纯事实性内容),可设置enable_sentiment: false +5. **参数优化配置**: - search_topic_by_date: 必须提供start_date和end_date参数(格式:YYYY-MM-DD) - search_topic_on_platform: 必须提供platform参数(bilibili, weibo, douyin, kuaishou, xhs, zhihu, tieba之一) - - 其他工具:合理配置limit参数以获取足够的样本(建议:search_hot_content limit>=100,search_topic_globally limit_per_table>=50,search_topic_by_date limit_per_table>=100,get_comments_for_topic limit>=500,search_topic_on_platform limit>=200) -5. **阐述选择理由**:说明为什么这样的查询能够获得最真实的民意反馈 + - analyze_sentiment: 使用texts参数提供文本列表,或使用search_query作为单个文本 + - 系统自动配置数据量参数,无需手动设置limit或limit_per_table参数 +6. **阐述选择理由**:说明为什么这样的查询和情感分析策略能够获得最真实的民意反馈 **搜索词设计核心原则**: - **想象网友怎么说**:如果你是个普通网友,你会怎么讨论这个话题? @@ -251,7 +262,12 @@ SYSTEM_PROMPT_FIRST_SUMMARY = f""" 2. **展现多元观点**:呈现不同平台、不同群体的观点差异和讨论重点 3. **数据支撑分析**:用具体的点赞数、评论数、转发数等数据说明舆情热度 4. **情感色彩描述**:准确描述公众的情感倾向(愤怒、支持、担忧、期待等) -5. **避免套话官话**:使用贴近民众的语言,避免过度官方化的表述 +5. **智能运用情感分析**: + - **整合情感数据**:如果搜索结果包含自动情感分析,要充分利用情感分布数据(如"正面情感占60%,负面情感占25%") + - **情感趋势描述**:描述主要情感倾向和情感分布特征 + - **高置信度引用**:优先引用高置信度的情感分析结果 + - **情感细节分析**:结合具体的情感标签(非常正面、正面、中性、负面、非常负面)进行深度分析 +6. **避免套话官话**:使用贴近民众的语言,避免过度官方化的表述 撰写风格: - 语言生动,有感染力 @@ -277,13 +293,14 @@ SYSTEM_PROMPT_REFLECTION = f""" {json.dumps(input_schema_reflection, indent=2, ensure_ascii=False)} -你可以使用以下5种专业的本地舆情数据库查询工具来深度挖掘民意: +你可以使用以下6种专业的本地舆情数据库查询工具来深度挖掘民意: -1. **search_hot_content** - 查找热点内容工具 -2. **search_topic_globally** - 全局话题搜索工具 -3. **search_topic_by_date** - 按日期搜索话题工具 -4. **get_comments_for_topic** - 获取话题评论工具 -5. **search_topic_on_platform** - 平台定向搜索工具 +1. **search_hot_content** - 查找热点内容工具(自动情感分析) +2. **search_topic_globally** - 全局话题搜索工具(自动情感分析) +3. **search_topic_by_date** - 按日期搜索话题工具(自动情感分析) +4. **get_comments_for_topic** - 获取话题评论工具(自动情感分析) +5. **search_topic_on_platform** - 平台定向搜索工具(自动情感分析) +6. **analyze_sentiment** - 多语言情感分析工具(专门的情感分析) **反思的核心目标:让报告更有人情味和真实感** @@ -311,7 +328,7 @@ SYSTEM_PROMPT_REFLECTION = f""" 4. **参数配置要求**: - search_topic_by_date: 必须提供start_date和end_date参数(格式:YYYY-MM-DD) - search_topic_on_platform: 必须提供platform参数(bilibili, weibo, douyin, kuaishou, xhs, zhihu, tieba之一) - - 其他工具:合理配置参数以获取多样化的民意样本(建议:search_hot_content limit>=100,search_topic_globally limit_per_table>=50,search_topic_by_date limit_per_table>=100,get_comments_for_topic limit>=500,search_topic_on_platform limit>=200) + - 系统自动配置数据量参数,无需手动设置limit或limit_per_table参数 5. **阐述补充理由**:明确说明为什么需要这些额外的民意数据 @@ -357,9 +374,13 @@ SYSTEM_PROMPT_REFLECTION_SUMMARY = f""" 优化策略: 1. **融入新的民意数据**:将补充搜索到的真实用户声音整合到段落中 2. **丰富情感表达**:增加具体的情感描述和社会情绪分析 -3. **补充遗漏观点**:添加之前缺失的不同群体、平台的观点 -4. **强化数据支撑**:用具体数字和案例让分析更有说服力 -5. **优化语言表达**:让文字更生动、更贴近民众,减少官方套话 +3. **深化情感分析**: + - **整合情感变化**:如果有新的情感分析数据,对比前后情感变化趋势 + - **细化情感层次**:区分不同群体、平台的情感差异 + - **量化情感描述**:用具体的情感分布数据支撑分析(如"新增数据显示负面情感比例上升至40%") +4. **补充遗漏观点**:添加之前缺失的不同群体、平台的观点 +5. **强化数据支撑**:用具体数字和案例让分析更有说服力 +6. **优化语言表达**:让文字更生动、更贴近民众,减少官方套话 注意事项: - 保留段落的核心观点和重要信息 diff --git a/InsightEngine/tools/__init__.py b/InsightEngine/tools/__init__.py index 4866524..50783f0 100644 --- a/InsightEngine/tools/__init__.py +++ b/InsightEngine/tools/__init__.py @@ -14,6 +14,13 @@ from .keyword_optimizer import ( KeywordOptimizationResponse, keyword_optimizer ) +from .sentiment_analyzer import ( + WeiboMultilingualSentimentAnalyzer, + SentimentResult, + BatchSentimentResult, + multilingual_sentiment_analyzer, + analyze_sentiment +) __all__ = [ "MediaCrawlerDB", @@ -22,5 +29,10 @@ __all__ = [ "print_response_summary", "KeywordOptimizer", "KeywordOptimizationResponse", - "keyword_optimizer" + "keyword_optimizer", + "WeiboMultilingualSentimentAnalyzer", + "SentimentResult", + "BatchSentimentResult", + "multilingual_sentiment_analyzer", + "analyze_sentiment" ] diff --git a/InsightEngine/tools/keyword_optimizer.py b/InsightEngine/tools/keyword_optimizer.py index 8590823..81eb0a8 100644 --- a/InsightEngine/tools/keyword_optimizer.py +++ b/InsightEngine/tools/keyword_optimizer.py @@ -228,7 +228,7 @@ class KeywordOptimizer: # 清理和验证关键词 cleaned_keywords = [] - for keyword in keywords[:20]: # 最多5个 + for keyword in keywords[:20]: # 最多20个 keyword = keyword.strip().strip('"\'""''') if keyword and len(keyword) <= 20: # 合理长度 cleaned_keywords.append(keyword) diff --git a/InsightEngine/tools/sentiment_analyzer.py b/InsightEngine/tools/sentiment_analyzer.py new file mode 100644 index 0000000..1ff3a8f --- /dev/null +++ b/InsightEngine/tools/sentiment_analyzer.py @@ -0,0 +1,445 @@ +""" +多语言情感分析工具 +基于WeiboMultilingualSentiment模型为InsightEngine提供情感分析功能 +""" + +import torch +from transformers import AutoTokenizer, AutoModelForSequenceClassification +import os +import sys +from typing import List, Dict, Any, Optional, Union +from dataclasses import dataclass +import re + +# 添加项目根目录到路径,以便导入WeiboMultilingualSentiment +project_root = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) +weibo_sentiment_path = os.path.join(project_root, "SentimentAnalysisModel", "WeiboMultilingualSentiment") +sys.path.append(weibo_sentiment_path) + + +@dataclass +class SentimentResult: + """情感分析结果数据类""" + text: str + sentiment_label: str + confidence: float + probability_distribution: Dict[str, float] + success: bool = True + error_message: Optional[str] = None + + +@dataclass +class BatchSentimentResult: + """批量情感分析结果数据类""" + results: List[SentimentResult] + total_processed: int + success_count: int + failed_count: int + average_confidence: float + + +class WeiboMultilingualSentimentAnalyzer: + """ + 多语言情感分析器 + 封装WeiboMultilingualSentiment模型,为AI Agent提供情感分析功能 + """ + + def __init__(self): + """初始化情感分析器""" + self.model = None + self.tokenizer = None + self.device = None + self.is_initialized = False + + # 情感标签映射(5级分类) + self.sentiment_map = { + 0: "非常负面", + 1: "负面", + 2: "中性", + 3: "正面", + 4: "非常正面" + } + + print("WeiboMultilingualSentimentAnalyzer 已创建,调用 initialize() 来加载模型") + + def initialize(self) -> bool: + """ + 初始化模型和分词器 + + Returns: + 是否初始化成功 + """ + if self.is_initialized: + print("模型已经初始化,无需重复加载") + return True + + try: + print("正在加载多语言情感分析模型...") + + # 使用多语言情感分析模型 + model_name = "tabularisai/multilingual-sentiment-analysis" + local_model_path = os.path.join(weibo_sentiment_path, "model") + + # 检查本地是否已有模型 + if os.path.exists(local_model_path): + print("从本地加载模型...") + self.tokenizer = AutoTokenizer.from_pretrained(local_model_path) + self.model = AutoModelForSequenceClassification.from_pretrained(local_model_path) + else: + print("首次使用,正在下载模型到本地...") + # 下载并保存到本地 + self.tokenizer = AutoTokenizer.from_pretrained(model_name) + self.model = AutoModelForSequenceClassification.from_pretrained(model_name) + + # 保存到本地 + os.makedirs(local_model_path, exist_ok=True) + self.tokenizer.save_pretrained(local_model_path) + self.model.save_pretrained(local_model_path) + print(f"模型已保存到: {local_model_path}") + + # 设置设备 + self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') + self.model.to(self.device) + self.model.eval() + self.is_initialized = True + + print(f"模型加载成功! 使用设备: {self.device}") + print("支持语言: 中文、英文、西班牙文、阿拉伯文、日文、韩文等22种语言") + print("情感等级: 非常负面、负面、中性、正面、非常正面") + + return True + + except Exception as e: + print(f"模型加载失败: {e}") + print("请检查网络连接或模型文件") + self.is_initialized = False + return False + + def _preprocess_text(self, text: str) -> str: + """ + 文本预处理 + + Args: + text: 输入文本 + + Returns: + 处理后的文本 + """ + # 基本文本清理 + if not text or not text.strip(): + return "" + + # 去除多余空格 + text = re.sub(r'\s+', ' ', text.strip()) + + return text + + def analyze_single_text(self, text: str) -> SentimentResult: + """ + 对单个文本进行情感分析 + + Args: + text: 要分析的文本 + + Returns: + SentimentResult对象 + """ + if not self.is_initialized: + return SentimentResult( + text=text, + sentiment_label="未初始化", + confidence=0.0, + probability_distribution={}, + success=False, + error_message="模型未初始化,请先调用 initialize() 方法" + ) + + try: + # 预处理文本 + processed_text = self._preprocess_text(text) + + if not processed_text: + return SentimentResult( + text=text, + sentiment_label="输入错误", + confidence=0.0, + probability_distribution={}, + success=False, + error_message="输入文本为空或无效" + ) + + # 分词编码 + inputs = self.tokenizer( + processed_text, + max_length=512, + padding=True, + truncation=True, + return_tensors='pt' + ) + + # 转移到设备 + inputs = {k: v.to(self.device) for k, v in inputs.items()} + + # 预测 + with torch.no_grad(): + outputs = self.model(**inputs) + logits = outputs.logits + probabilities = torch.softmax(logits, dim=1) + prediction = torch.argmax(probabilities, dim=1).item() + + # 构建结果 + confidence = probabilities[0][prediction].item() + label = self.sentiment_map[prediction] + + # 构建概率分布字典 + prob_dist = {} + for i, (label_name, prob) in enumerate(zip(self.sentiment_map.values(), probabilities[0])): + prob_dist[label_name] = prob.item() + + return SentimentResult( + text=text, + sentiment_label=label, + confidence=confidence, + probability_distribution=prob_dist, + success=True + ) + + except Exception as e: + return SentimentResult( + text=text, + sentiment_label="分析失败", + confidence=0.0, + probability_distribution={}, + success=False, + error_message=f"预测时发生错误: {str(e)}" + ) + + def analyze_batch(self, texts: List[str], show_progress: bool = True) -> BatchSentimentResult: + """ + 批量情感分析 + + Args: + texts: 文本列表 + show_progress: 是否显示进度 + + Returns: + BatchSentimentResult对象 + """ + if not texts: + return BatchSentimentResult( + results=[], + total_processed=0, + success_count=0, + failed_count=0, + average_confidence=0.0 + ) + + results = [] + success_count = 0 + total_confidence = 0.0 + + for i, text in enumerate(texts): + if show_progress and len(texts) > 1: + print(f"处理进度: {i+1}/{len(texts)}") + + result = self.analyze_single_text(text) + results.append(result) + + if result.success: + success_count += 1 + total_confidence += result.confidence + + average_confidence = total_confidence / success_count if success_count > 0 else 0.0 + failed_count = len(texts) - success_count + + return BatchSentimentResult( + results=results, + total_processed=len(texts), + success_count=success_count, + failed_count=failed_count, + average_confidence=average_confidence + ) + + def analyze_query_results(self, query_results: List[Dict[str, Any]], + text_field: str = "content", + min_confidence: float = 0.5) -> Dict[str, Any]: + """ + 对查询结果进行情感分析 + 专门用于分析从MediaCrawlerDB返回的查询结果 + + Args: + query_results: 查询结果列表,每个元素包含文本内容 + text_field: 文本内容字段名,默认为"content" + min_confidence: 最小置信度阈值 + + Returns: + 包含情感分析结果的字典 + """ + if not query_results: + return { + "sentiment_analysis": { + "total_analyzed": 0, + "sentiment_distribution": {}, + "high_confidence_results": [], + "summary": "没有内容需要分析" + } + } + + # 提取文本内容 + texts_to_analyze = [] + original_data = [] + + for item in query_results: + # 尝试多个可能的文本字段 + text_content = "" + for field in [text_field, "title_or_content", "content", "title", "text"]: + if field in item and item[field]: + text_content = str(item[field]) + break + + if text_content.strip(): + texts_to_analyze.append(text_content) + original_data.append(item) + + if not texts_to_analyze: + return { + "sentiment_analysis": { + "total_analyzed": 0, + "sentiment_distribution": {}, + "high_confidence_results": [], + "summary": "查询结果中没有找到可分析的文本内容" + } + } + + # 执行批量情感分析 + print(f"正在对{len(texts_to_analyze)}条内容进行情感分析...") + batch_result = self.analyze_batch(texts_to_analyze, show_progress=True) + + # 统计情感分布 + sentiment_distribution = {} + high_confidence_results = [] + + for result, original_item in zip(batch_result.results, original_data): + if result.success: + # 统计情感分布 + sentiment = result.sentiment_label + if sentiment not in sentiment_distribution: + sentiment_distribution[sentiment] = 0 + sentiment_distribution[sentiment] += 1 + + # 收集高置信度结果 + if result.confidence >= min_confidence: + high_confidence_results.append({ + "original_data": original_item, + "sentiment": result.sentiment_label, + "confidence": result.confidence, + "text_preview": result.text[:100] + "..." if len(result.text) > 100 else result.text + }) + + # 生成情感分析摘要 + total_analyzed = batch_result.success_count + if total_analyzed > 0: + dominant_sentiment = max(sentiment_distribution.items(), key=lambda x: x[1]) + sentiment_summary = f"共分析{total_analyzed}条内容,主要情感倾向为'{dominant_sentiment[0]}'({dominant_sentiment[1]}条,占{dominant_sentiment[1]/total_analyzed*100:.1f}%)" + else: + sentiment_summary = "情感分析失败" + + return { + "sentiment_analysis": { + "total_analyzed": total_analyzed, + "success_rate": f"{batch_result.success_count}/{batch_result.total_processed}", + "average_confidence": round(batch_result.average_confidence, 4), + "sentiment_distribution": sentiment_distribution, + "high_confidence_results": high_confidence_results, # 返回所有高置信度结果,不做限制 + "summary": sentiment_summary + } + } + + def get_model_info(self) -> Dict[str, Any]: + """ + 获取模型信息 + + Returns: + 模型信息字典 + """ + return { + "model_name": "tabularisai/multilingual-sentiment-analysis", + "supported_languages": [ + "中文", "英文", "西班牙文", "阿拉伯文", "日文", "韩文", + "德文", "法文", "意大利文", "葡萄牙文", "俄文", "荷兰文", + "波兰文", "土耳其文", "丹麦文", "希腊文", "芬兰文", + "瑞典文", "挪威文", "匈牙利文", "捷克文", "保加利亚文" + ], + "sentiment_levels": list(self.sentiment_map.values()), + "is_initialized": self.is_initialized, + "device": str(self.device) if self.device else "未设置" + } + + +# 创建全局实例(延迟初始化) +multilingual_sentiment_analyzer = WeiboMultilingualSentimentAnalyzer() + + +def analyze_sentiment(text_or_texts: Union[str, List[str]], + initialize_if_needed: bool = True) -> Union[SentimentResult, BatchSentimentResult]: + """ + 便捷的情感分析函数 + + Args: + text_or_texts: 单个文本或文本列表 + initialize_if_needed: 如果模型未初始化,是否自动初始化 + + Returns: + SentimentResult或BatchSentimentResult + """ + if initialize_if_needed and not multilingual_sentiment_analyzer.is_initialized: + if not multilingual_sentiment_analyzer.initialize(): + # 如果初始化失败,返回失败结果 + if isinstance(text_or_texts, str): + return SentimentResult( + text=text_or_texts, + sentiment_label="初始化失败", + confidence=0.0, + probability_distribution={}, + success=False, + error_message="模型初始化失败" + ) + else: + return BatchSentimentResult( + results=[], + total_processed=0, + success_count=0, + failed_count=len(text_or_texts), + average_confidence=0.0 + ) + + if isinstance(text_or_texts, str): + return multilingual_sentiment_analyzer.analyze_single_text(text_or_texts) + else: + return multilingual_sentiment_analyzer.analyze_batch(text_or_texts) + + +if __name__ == "__main__": + # 测试代码 + analyzer = WeiboMultilingualSentimentAnalyzer() + + if analyzer.initialize(): + # 测试单个文本 + result = analyzer.analyze_single_text("今天天气真好,心情特别棒!") + print(f"单个文本分析: {result.sentiment_label} (置信度: {result.confidence:.4f})") + + # 测试批量文本 + test_texts = [ + "这家餐厅的菜味道非常棒!", + "服务态度太差了,很失望", + "I absolutely love this product!", + "The customer service was disappointing." + ] + + batch_result = analyzer.analyze_batch(test_texts) + print(f"\n批量分析: 成功 {batch_result.success_count}/{batch_result.total_processed}") + + for result in batch_result.results: + print(f"'{result.text[:30]}...' -> {result.sentiment_label} ({result.confidence:.4f})") + else: + print("模型初始化失败,无法进行测试") diff --git a/InsightEngine/utils/config.py b/InsightEngine/utils/config.py index e472564..463034a 100644 --- a/InsightEngine/utils/config.py +++ b/InsightEngine/utils/config.py @@ -14,6 +14,7 @@ class Config: # API密钥 deepseek_api_key: Optional[str] = None openai_api_key: Optional[str] = None + kimi_api_key: Optional[str] = None # 数据库配置 db_host: Optional[str] = None @@ -24,13 +25,14 @@ class Config: db_charset: str = "utf8mb4" # 模型配置 - default_llm_provider: str = "deepseek" # deepseek 或 openai + default_llm_provider: str = "deepseek" # deepseek、openai 或 kimi deepseek_model: str = "deepseek-chat" openai_model: str = "gpt-4o-mini" + kimi_model: str = "kimi-k2-0711-preview" # 搜索配置 search_timeout: int = 240 - max_content_length: int = 100000 + max_content_length: int = 500000 # 提高5倍以充分利用Kimi的长文本能力 # 数据库查询限制 default_search_hot_content_limit: int = 100 @@ -43,6 +45,10 @@ class Config: max_reflections: int = 3 max_paragraphs: int = 6 + # 结果处理限制 + max_search_results_for_llm: int = 0 # 0表示不限制,传递所有搜索结果给LLM + max_high_confidence_sentiment_results: int = 0 # 0表示不限制,返回所有高置信度情感分析结果 + # 输出配置 output_dir: str = "reports" save_intermediate_states: bool = True @@ -102,6 +108,10 @@ class Config: max_reflections=getattr(config_module, "MAX_REFLECTIONS", 2), max_paragraphs=getattr(config_module, "MAX_PARAGRAPHS", 5), + + max_search_results_for_llm=getattr(config_module, "MAX_SEARCH_RESULTS_FOR_LLM", 0), + max_high_confidence_sentiment_results=getattr(config_module, "MAX_HIGH_CONFIDENCE_SENTIMENT_RESULTS", 0), + output_dir=getattr(config_module, "OUTPUT_DIR", "reports"), save_intermediate_states=getattr(config_module, "SAVE_INTERMEDIATE_STATES", True) ) @@ -120,6 +130,7 @@ class Config: return cls( deepseek_api_key=config_dict.get("DEEPSEEK_API_KEY"), openai_api_key=config_dict.get("OPENAI_API_KEY"), + kimi_api_key=config_dict.get("KIMI_API_KEY"), db_host=config_dict.get("DB_HOST"), db_user=config_dict.get("DB_USER"), @@ -131,9 +142,10 @@ class Config: default_llm_provider=config_dict.get("DEFAULT_LLM_PROVIDER", "deepseek"), deepseek_model=config_dict.get("DEEPSEEK_MODEL", "deepseek-chat"), openai_model=config_dict.get("OPENAI_MODEL", "gpt-4o-mini"), + kimi_model=config_dict.get("KIMI_MODEL", "kimi-k2-0711-preview"), search_timeout=int(config_dict.get("SEARCH_TIMEOUT", "240")), - max_content_length=int(config_dict.get("SEARCH_CONTENT_MAX_LENGTH", "200000")), + max_content_length=int(config_dict.get("SEARCH_CONTENT_MAX_LENGTH", "500000")), default_search_hot_content_limit=int(config_dict.get("DEFAULT_SEARCH_HOT_CONTENT_LIMIT", "100")), default_search_topic_globally_limit_per_table=int(config_dict.get("DEFAULT_SEARCH_TOPIC_GLOBALLY_LIMIT_PER_TABLE", "50")), @@ -143,6 +155,10 @@ class Config: max_reflections=int(config_dict.get("MAX_REFLECTIONS", "2")), max_paragraphs=int(config_dict.get("MAX_PARAGRAPHS", "5")), + + max_search_results_for_llm=int(config_dict.get("MAX_SEARCH_RESULTS_FOR_LLM", "0")), + max_high_confidence_sentiment_results=int(config_dict.get("MAX_HIGH_CONFIDENCE_SENTIMENT_RESULTS", "0")), + output_dir=config_dict.get("OUTPUT_DIR", "reports"), save_intermediate_states=config_dict.get("SAVE_INTERMEDIATE_STATES", "true").lower() == "true" ) diff --git a/MediaEngine/agent.py b/MediaEngine/agent.py index 3c28323..36a0ff0 100644 --- a/MediaEngine/agent.py +++ b/MediaEngine/agent.py @@ -9,7 +9,7 @@ import re from datetime import datetime from typing import Optional, Dict, Any, List -from .llms import DeepSeekLLM, OpenAILLM, BaseLLM +from .llms import DeepSeekLLM, OpenAILLM, GeminiLLM, BaseLLM from .nodes import ( ReportStructureNode, FirstSearchNode, @@ -67,6 +67,11 @@ class DeepSearchAgent: api_key=self.config.openai_api_key, model_name=self.config.openai_model ) + elif self.config.default_llm_provider == "gemini": + return GeminiLLM( + api_key=self.config.gemini_api_key, + model_name=self.config.gemini_model + ) else: raise ValueError(f"不支持的LLM提供商: {self.config.default_llm_provider}") diff --git a/MediaEngine/llms/__init__.py b/MediaEngine/llms/__init__.py index 2e1602a..e2a168e 100644 --- a/MediaEngine/llms/__init__.py +++ b/MediaEngine/llms/__init__.py @@ -6,5 +6,6 @@ LLM调用模块 from .base import BaseLLM from .deepseek import DeepSeekLLM from .openai_llm import OpenAILLM +from .gemini_llm import GeminiLLM -__all__ = ["BaseLLM", "DeepSeekLLM", "OpenAILLM"] +__all__ = ["BaseLLM", "DeepSeekLLM", "OpenAILLM", "GeminiLLM"] diff --git a/MediaEngine/llms/gemini_llm.py b/MediaEngine/llms/gemini_llm.py new file mode 100644 index 0000000..2cdbe21 --- /dev/null +++ b/MediaEngine/llms/gemini_llm.py @@ -0,0 +1,95 @@ +""" +Gemini LLM实现 +使用Gemini 2.5-pro中转API进行文本生成 +""" + +import os +from typing import Optional, Dict, Any +from openai import OpenAI +from .base import BaseLLM + + +class GeminiLLM(BaseLLM): + """Gemini LLM实现类""" + + def __init__(self, api_key: Optional[str] = None, model_name: Optional[str] = None): + """ + 初始化Gemini客户端 + + Args: + api_key: Gemini API密钥,如果不提供则从环境变量读取 + model_name: 模型名称,默认使用gemini-2.5-pro + """ + if api_key is None: + api_key = os.getenv("GEMINI_API_KEY") + if not api_key: + raise ValueError("Gemini API Key未找到!请设置GEMINI_API_KEY环境变量或在初始化时提供") + + super().__init__(api_key, model_name) + + # 初始化OpenAI客户端,使用Gemini的中转endpoint + self.client = OpenAI( + api_key=self.api_key, + base_url="https://www.chataiapi.com/v1" + ) + + self.default_model = model_name or self.get_default_model() + + def get_default_model(self) -> str: + """获取默认模型名称""" + return "gemini-2.5-pro" + + def invoke(self, system_prompt: str, user_prompt: str, **kwargs) -> str: + """ + 调用Gemini API生成回复 + + Args: + system_prompt: 系统提示词 + user_prompt: 用户输入 + **kwargs: 其他参数,如temperature、max_tokens等 + + Returns: + Gemini生成的回复文本 + """ + try: + # 构建消息 + messages = [ + {"role": "system", "content": system_prompt}, + {"role": "user", "content": user_prompt} + ] + + # 设置默认参数 + params = { + "model": self.default_model, + "messages": messages, + "temperature": kwargs.get("temperature", 0.7), + "max_tokens": kwargs.get("max_tokens", 4000), + "stream": False + } + + # 调用API + response = self.client.chat.completions.create(**params) + + # 提取回复内容 + if response.choices and response.choices[0].message: + content = response.choices[0].message.content + return self.validate_response(content) + else: + return "" + + except Exception as e: + print(f"Gemini API调用错误: {str(e)}") + raise e + + def get_model_info(self) -> Dict[str, Any]: + """ + 获取当前模型信息 + + Returns: + 模型信息字典 + """ + return { + "provider": "Gemini", + "model": self.default_model, + "api_base": "https://www.chataiapi.com/v1" + } diff --git a/MediaEngine/utils/config.py b/MediaEngine/utils/config.py index 3772743..4760b3f 100644 --- a/MediaEngine/utils/config.py +++ b/MediaEngine/utils/config.py @@ -14,12 +14,14 @@ class Config: # API密钥 deepseek_api_key: Optional[str] = None openai_api_key: Optional[str] = None + gemini_api_key: Optional[str] = None bocha_api_key: Optional[str] = None # 模型配置 - default_llm_provider: str = "deepseek" # deepseek 或 openai + default_llm_provider: str = "deepseek" # deepseek、openai 或 gemini deepseek_model: str = "deepseek-chat" openai_model: str = "gpt-4o-mini" + gemini_model: str = "gemini-2.5-pro" # 搜索配置 search_timeout: int = 240 @@ -44,6 +46,10 @@ class Config: print("错误: OpenAI API Key未设置") return False + if self.default_llm_provider == "gemini" and not self.gemini_api_key: + print("错误: Gemini API Key未设置") + return False + if not self.bocha_api_key: print("错误: Bocha API Key未设置") return False @@ -65,11 +71,12 @@ class Config: return cls( deepseek_api_key=getattr(config_module, "DEEPSEEK_API_KEY", None), openai_api_key=getattr(config_module, "OPENAI_API_KEY", None), + gemini_api_key=getattr(config_module, "GEMINI_API_KEY", None), bocha_api_key=getattr(config_module, "BOCHA_API_KEY", None), default_llm_provider=getattr(config_module, "DEFAULT_LLM_PROVIDER", "deepseek"), deepseek_model=getattr(config_module, "DEEPSEEK_MODEL", "deepseek-chat"), openai_model=getattr(config_module, "OPENAI_MODEL", "gpt-4o-mini"), - + gemini_model=getattr(config_module, "GEMINI_MODEL", "gemini-2.5-pro"), search_timeout=getattr(config_module, "SEARCH_TIMEOUT", 240), max_content_length=getattr(config_module, "SEARCH_CONTENT_MAX_LENGTH", 20000), max_reflections=getattr(config_module, "MAX_REFLECTIONS", 2), @@ -92,11 +99,12 @@ class Config: return cls( deepseek_api_key=config_dict.get("DEEPSEEK_API_KEY"), openai_api_key=config_dict.get("OPENAI_API_KEY"), + gemini_api_key=config_dict.get("GEMINI_API_KEY"), bocha_api_key=config_dict.get("BOCHA_API_KEY"), default_llm_provider=config_dict.get("DEFAULT_LLM_PROVIDER", "deepseek"), deepseek_model=config_dict.get("DEEPSEEK_MODEL", "deepseek-chat"), openai_model=config_dict.get("OPENAI_MODEL", "gpt-4o-mini"), - + gemini_model=config_dict.get("GEMINI_MODEL", "gemini-2.5-pro"), search_timeout=int(config_dict.get("SEARCH_TIMEOUT", "240")), max_content_length=int(config_dict.get("SEARCH_CONTENT_MAX_LENGTH", "20000")), max_reflections=int(config_dict.get("MAX_REFLECTIONS", "2")), diff --git a/SentimentAnalysisModel/WeiboMultilingualSentiment/predict.py b/SentimentAnalysisModel/WeiboMultilingualSentiment/predict.py index 6f92096..595c067 100644 --- a/SentimentAnalysisModel/WeiboMultilingualSentiment/predict.py +++ b/SentimentAnalysisModel/WeiboMultilingualSentiment/predict.py @@ -160,6 +160,31 @@ def show_multilingual_demo(tokenizer, model, device, sentiment_map): print(f"处理 {text} 时出错: {e}") print("\n=== 示例结束 ===") + + ''' + 正在加载多语言情感分析模型... +从本地加载模型... +模型加载成功! 使用设备: cuda + +============= 多语言情感分析 ============= +支持语言: 中文、英文、西班牙文、阿拉伯文、日文、韩文等22种语言 +情感等级: 非常负面、负面、中性、正面、非常正面 +输入文本进行分析 (输入 'q' 退出): +输入 'demo' 查看多语言示例 + +请输入文本: 我喜欢你 +C:\Users\67093\.conda\envs\pytorch_python11\Lib\site-packages\transformers\models\distilbert\modeling_distilbert.py:401: UserWarning: 1Torch was not compiled with flash attention. (Triggered internally at C:\cb\pytorch_1000000000000\work\aten\src\ATen\native\transformers\cuda\sdp_utils.cpp:263.) + attn_output = torch.nn.functional.scaled_dot_product_attention( +预测结果: 正面 (置信度: 0.5204) +详细概率分布: + 非常负面: 0.0329 + 负面: 0.0263 + 中性: 0.1987 + 正面: 0.5204 + 非常正面: 0.2216 + +请输入文本: + ''' if __name__ == "__main__": main() \ No newline at end of file diff --git a/SingleEngineApp/insight_engine_streamlit_app.py b/SingleEngineApp/insight_engine_streamlit_app.py index f5b8319..25e951c 100644 --- a/SingleEngineApp/insight_engine_streamlit_app.py +++ b/SingleEngineApp/insight_engine_streamlit_app.py @@ -10,10 +10,10 @@ from datetime import datetime import json # 添加src目录到Python路径 -sys.path.insert(0, os.path.join(os.path.dirname(__file__), '.')) +sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..')) from InsightEngine import DeepSearchAgent, Config -from config import DEEPSEEK_API_KEY, DB_HOST, DB_USER, DB_PASSWORD, DB_NAME, DB_PORT, DB_CHARSET +from config import DEEPSEEK_API_KEY, KIMI_API_KEY, DB_HOST, DB_USER, DB_PASSWORD, DB_NAME, DB_PORT, DB_CHARSET def main(): @@ -31,20 +31,38 @@ def main(): with st.sidebar: st.header("配置") + # 模型选择 + llm_provider = st.selectbox("LLM提供商", ["deepseek", "openai", "kimi"]) + # 高级配置 st.subheader("高级配置") max_reflections = st.slider("反思次数", 1, 5, 2) - max_content_length = st.number_input("最大内容长度", 10000, 500000, 200000) # 提高10倍:1000-50000-20000 → 10000-500000-200000 - # 模型选择 - llm_provider = st.selectbox("LLM提供商", ["deepseek", "openai"]) + # 根据选择的模型动态调整默认值 + if llm_provider == "kimi": + default_content_length = 500000 # Kimi支持长文本,使用更大的默认值 + max_limit = 1000000 # 提高上限 + st.info("💡 Kimi模型支持超长文本处理,建议使用更大的内容长度以充分利用其能力") + else: + default_content_length = 200000 + max_limit = 500000 + + max_content_length = st.number_input("最大内容长度", 10000, max_limit, default_content_length) + + # 初始化所有可能的变量 + openai_key = "" + kimi_key = "" if llm_provider == "deepseek": model_name = st.selectbox("DeepSeek模型", ["deepseek-chat"]) - else: + elif llm_provider == "openai": model_name = st.selectbox("OpenAI模型", ["gpt-4o-mini", "gpt-4o"]) openai_key = st.text_input("OpenAI API Key", type="password", value="") + else: # kimi + model_name = st.selectbox("Kimi模型", ["kimi-k2-0711-preview"]) + kimi_key = st.text_input("Kimi API Key", type="password", + value="") # 主界面 col1, col2 = st.columns([2, 1]) @@ -96,8 +114,13 @@ def main(): st.error("请提供OpenAI API Key") return + if llm_provider == "kimi" and not kimi_key and not KIMI_API_KEY: + st.error("请提供Kimi API Key或在配置文件中设置KIMI_API_KEY") + return + # 自动使用配置文件中的API密钥和数据库配置 deepseek_key = DEEPSEEK_API_KEY + kimi_key_final = kimi_key if kimi_key else KIMI_API_KEY db_host = DB_HOST db_user = DB_USER db_password = DB_PASSWORD @@ -109,6 +132,7 @@ def main(): config = Config( deepseek_api_key=deepseek_key if llm_provider == "deepseek" else None, openai_api_key=openai_key if llm_provider == "openai" else None, + kimi_api_key=kimi_key_final if llm_provider == "kimi" else None, db_host=db_host, db_user=db_user, db_password=db_password, @@ -118,6 +142,7 @@ def main(): default_llm_provider=llm_provider, deepseek_model=model_name if llm_provider == "deepseek" else "deepseek-chat", openai_model=model_name if llm_provider == "openai" else "gpt-4o-mini", + kimi_model=model_name if llm_provider == "kimi" else "kimi-k2-0711-preview", max_reflections=max_reflections, max_content_length=max_content_length, output_dir="insight_engine_streamlit_reports" diff --git a/SingleEngineApp/media_engine_streamlit_app.py b/SingleEngineApp/media_engine_streamlit_app.py index 714fbef..c79d945 100644 --- a/SingleEngineApp/media_engine_streamlit_app.py +++ b/SingleEngineApp/media_engine_streamlit_app.py @@ -13,7 +13,7 @@ import json sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..')) from MediaEngine import DeepSearchAgent, Config -from config import DEEPSEEK_API_KEY, BOCHA_Web_Search_API_KEY +from config import DEEPSEEK_API_KEY, BOCHA_Web_Search_API_KEY, GEMINI_API_KEY def main(): @@ -37,14 +37,16 @@ def main(): max_content_length = st.number_input("最大内容长度", 1000, 50000, 20000) # 模型选择 - llm_provider = st.selectbox("LLM提供商", ["deepseek", "openai"]) + llm_provider = st.selectbox("LLM提供商", ["deepseek", "openai", "gemini"]) + openai_key = "" # 初始化变量 if llm_provider == "deepseek": model_name = st.selectbox("DeepSeek模型", ["deepseek-chat"]) - else: + elif llm_provider == "openai": model_name = st.selectbox("OpenAI模型", ["gpt-4o-mini", "gpt-4o"]) - openai_key = st.text_input("OpenAI API Key", type="password", - value="") + openai_key = st.text_input("OpenAI API Key", type="password", value="") + else: # gemini + model_name = st.selectbox("Gemini模型", ["gemini-2.5-pro"]) # 主界面 col1, col2 = st.columns([2, 1]) @@ -98,16 +100,19 @@ def main(): # 自动使用配置文件中的API密钥 deepseek_key = DEEPSEEK_API_KEY + gemini_key = GEMINI_API_KEY # 使用config.py中的Gemini API密钥 bocha_key = BOCHA_Web_Search_API_KEY # 创建配置 config = Config( deepseek_api_key=deepseek_key if llm_provider == "deepseek" else None, openai_api_key=openai_key if llm_provider == "openai" else None, + gemini_api_key=gemini_key if llm_provider == "gemini" else None, bocha_api_key=bocha_key, default_llm_provider=llm_provider, deepseek_model=model_name if llm_provider == "deepseek" else "deepseek-chat", openai_model=model_name if llm_provider == "openai" else "gpt-4o-mini", + gemini_model=model_name if llm_provider == "gemini" else "gemini-2.5-pro", max_reflections=max_reflections, max_content_length=max_content_length, output_dir="media_engine_streamlit_reports" diff --git a/config.py b/config.py index 89a0f3d..a10f183 100644 --- a/config.py +++ b/config.py @@ -1,25 +1,37 @@ # -*- coding: utf-8 -*- """ -智能舆情分析平台配置文件 -存储数据库连接信息和API密钥 +Intelligence Public Opinion Analysis Platform Configuration File +Stores database connection information and API keys """ -# MySQL数据库配置 -DB_HOST = "rm-2zeib6b13f6tt9kncoo.mysql.rds.aliyuncs.com" +# MySQL Database Configuration +DB_HOST = "your_database_host" # e.g., "localhost" or "127.0.0.1" DB_PORT = 3306 -DB_USER = "root" -DB_PASSWORD = "mneDccc7sHHANtFk" -DB_NAME = "media_crawler" +DB_USER = "your_database_user" +DB_PASSWORD = "your_database_password" +DB_NAME = "your_database_name" DB_CHARSET = "utf8mb4" -# agent1 DeepSeek API密钥 -DEEPSEEK_API_KEY = "sk-4bbc57fadd234666a3840f1a7edc1f2e" +# DeepSeek API Key +# 申请地址https://www.deepseek.com/ +DEEPSEEK_API_KEY = "your_deepseek_api_key" -# agent2 DeepSeek API密钥 -DEEPSEEK_API_KEY_2 = "sk-b26405d2e02f475c960d21c2acce61e7" +# Tavily Search API Key +# 申请地址https://www.tavily.com/ +TAVILY_API_KEY = "your_tavily_api_key" -# Tavily搜索API密钥 -TAVILY_API_KEY = "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" +# Kimi API Key +# 申请地址https://www.kimi.com/ +KIMI_API_KEY = "your_kimi_api_key" -# 博查Web Search API密钥 -BOCHA_Web_Search_API_KEY = "sk-496b37a2a1ee4915b438dd822b03de8d" \ No newline at end of file +# Gemini API Key (via OpenAI format proxy) +# 申请地址hapi.chataiapi.com/ +GEMINI_API_KEY = "your_gemini_api_key" + +# Bocha Search API Key +# 申请地址https://open.bochaai.com/ +BOCHA_Web_Search_API_KEY = "your_bocha_web_search_api_key" + +# Guiji Flow API Key +# 申请地址https://siliconflow.cn/ +GUIJI_QWEN3_API_KEY = "your_guiji_qwen3_api_key" \ No newline at end of file