diff --git a/InsightEngine/tools/sentiment_analyzer.py b/InsightEngine/tools/sentiment_analyzer.py index 37a39da..b585772 100644 --- a/InsightEngine/tools/sentiment_analyzer.py +++ b/InsightEngine/tools/sentiment_analyzer.py @@ -11,6 +11,7 @@ import re try: import torch + TORCH_AVAILABLE = True except ImportError: torch = None # type: ignore @@ -18,6 +19,7 @@ except ImportError: try: from transformers import AutoTokenizer, AutoModelForSequenceClassification + TRANSFORMERS_AVAILABLE = True except ImportError: AutoTokenizer = None # type: ignore @@ -28,6 +30,7 @@ except ImportError: # INFO:若想跳过情感分析,可手动切换此开关为False SENTIMENT_ANALYSIS_ENABLED = True + def _describe_missing_dependencies() -> str: missing = [] if not TORCH_AVAILABLE: @@ -36,14 +39,21 @@ def _describe_missing_dependencies() -> str: missing.append("Transformers") return " / ".join(missing) + # 添加项目根目录到路径,以便导入WeiboMultilingualSentiment -project_root = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) -weibo_sentiment_path = os.path.join(project_root, "SentimentAnalysisModel", "WeiboMultilingualSentiment") +project_root = os.path.dirname( + os.path.dirname(os.path.dirname(os.path.abspath(__file__))) +) +weibo_sentiment_path = os.path.join( + project_root, "SentimentAnalysisModel", "WeiboMultilingualSentiment" +) sys.path.append(weibo_sentiment_path) + @dataclass class SentimentResult: """情感分析结果数据类""" + text: str sentiment_label: str confidence: float @@ -53,9 +63,10 @@ class SentimentResult: analysis_performed: bool = True -@dataclass +@dataclass class BatchSentimentResult: """批量情感分析结果数据类""" + results: List[SentimentResult] total_processed: int success_count: int @@ -69,7 +80,7 @@ class WeiboMultilingualSentimentAnalyzer: 多语言情感分析器 封装WeiboMultilingualSentiment模型,为AI Agent提供情感分析功能 """ - + def __init__(self): """初始化情感分析器""" self.model = None @@ -78,14 +89,14 @@ class WeiboMultilingualSentimentAnalyzer: self.is_initialized = False self.is_disabled = False self.disable_reason: Optional[str] = None - + # 情感标签映射(5级分类) self.sentiment_map = { - 0: "非常负面", - 1: "负面", - 2: "中性", - 3: "正面", - 4: "非常正面" + 0: "非常负面", + 1: "负面", + 2: "中性", + 3: "正面", + 4: "非常正面", } if not SENTIMENT_ANALYSIS_ENABLED: @@ -96,9 +107,13 @@ class WeiboMultilingualSentimentAnalyzer: if self.is_disabled: reason = self.disable_reason or "Sentiment analysis disabled." - print(f"WeiboMultilingualSentimentAnalyzer initialized but disabled: {reason}") + print( + f"WeiboMultilingualSentimentAnalyzer initialized but disabled: {reason}" + ) else: - print("WeiboMultilingualSentimentAnalyzer 已创建,调用 initialize() 来加载模型") + print( + "WeiboMultilingualSentimentAnalyzer 已创建,调用 initialize() 来加载模型" + ) def disable(self, reason: Optional[str] = None, drop_state: bool = False) -> None: """Disable sentiment analysis, optionally clearing loaded resources.""" @@ -130,14 +145,18 @@ class WeiboMultilingualSentimentAnalyzer: if torch.cuda.is_available(): return torch.device("cuda") mps_backend = getattr(torch.backends, "mps", None) - if mps_backend and getattr(mps_backend, "is_available", lambda: False)() and getattr(mps_backend, "is_built", lambda: False)(): + if ( + mps_backend + and getattr(mps_backend, "is_available", lambda: False)() + and getattr(mps_backend, "is_built", lambda: False)() + ): return torch.device("mps") return torch.device("cpu") - + def initialize(self) -> bool: """ 初始化模型和分词器 - + Returns: 是否初始化成功 """ @@ -155,31 +174,35 @@ class WeiboMultilingualSentimentAnalyzer: if self.is_initialized: print("模型已经初始化,无需重复加载") return True - + try: print("正在加载多语言情感分析模型...") - + # 使用多语言情感分析模型 model_name = "tabularisai/multilingual-sentiment-analysis" local_model_path = os.path.join(weibo_sentiment_path, "model") - + # 检查本地是否已有模型 if os.path.exists(local_model_path): print("从本地加载模型...") self.tokenizer = AutoTokenizer.from_pretrained(local_model_path) - self.model = AutoModelForSequenceClassification.from_pretrained(local_model_path) + self.model = AutoModelForSequenceClassification.from_pretrained( + local_model_path + ) else: print("首次使用,正在下载模型到本地...") # 下载并保存到本地 self.tokenizer = AutoTokenizer.from_pretrained(model_name) - self.model = AutoModelForSequenceClassification.from_pretrained(model_name) - + self.model = AutoModelForSequenceClassification.from_pretrained( + model_name + ) + # 保存到本地 os.makedirs(local_model_path, exist_ok=True) self.tokenizer.save_pretrained(local_model_path) self.model.save_pretrained(local_model_path) print(f"模型已保存到: {local_model_path}") - + # 设置设备 device = self._select_device() if device is None: @@ -198,46 +221,46 @@ class WeiboMultilingualSentimentAnalyzer: print("检测到 Apple MPS 设备,已使用 MPS 进行推理。") else: print("未检测到 GPU,自动使用 CPU 进行推理。") - + print(f"模型加载成功! 使用设备: {self.device}") print("支持语言: 中文、英文、西班牙文、阿拉伯文、日文、韩文等22种语言") print("情感等级: 非常负面、负面、中性、正面、非常正面") - + return True - + except Exception as e: error_message = f"模型加载失败: {e}" print(error_message) print("请检查网络连接或模型文件") self.disable(error_message, drop_state=True) return False - + def _preprocess_text(self, text: str) -> str: """ 文本预处理 - + Args: text: 输入文本 - + Returns: 处理后的文本 """ # 基本文本清理 if not text or not text.strip(): return "" - + # 去除多余空格 - text = re.sub(r'\s+', ' ', text.strip()) - + text = re.sub(r"\s+", " ", text.strip()) + return text - + def analyze_single_text(self, text: str) -> SentimentResult: """ 对单个文本进行情感分析 - + Args: text: 要分析的文本 - + Returns: SentimentResult对象 """ @@ -249,7 +272,7 @@ class WeiboMultilingualSentimentAnalyzer: probability_distribution={}, success=False, error_message=self.disable_reason or "情感分析功能已禁用", - analysis_performed=False + analysis_performed=False, ) if not self.is_initialized: @@ -260,7 +283,7 @@ class WeiboMultilingualSentimentAnalyzer: probability_distribution={}, success=False, error_message="模型未初始化,请先调用initialize() 方法", - analysis_performed=False + analysis_performed=False, ) try: @@ -275,7 +298,7 @@ class WeiboMultilingualSentimentAnalyzer: probability_distribution={}, success=False, error_message="输入文本为空或无效内容", - analysis_performed=False + analysis_performed=False, ) # 分词编码 @@ -284,7 +307,7 @@ class WeiboMultilingualSentimentAnalyzer: max_length=512, padding=True, truncation=True, - return_tensors='pt' + return_tensors="pt", ) # 转移到设备 @@ -311,7 +334,7 @@ class WeiboMultilingualSentimentAnalyzer: sentiment_label=label, confidence=confidence, probability_distribution=prob_dist, - success=True + success=True, ) except Exception as e: @@ -322,17 +345,19 @@ class WeiboMultilingualSentimentAnalyzer: probability_distribution={}, success=False, error_message=f"预测时发生错误: {str(e)}", - analysis_performed=False + analysis_performed=False, ) - def analyze_batch(self, texts: List[str], show_progress: bool = True) -> BatchSentimentResult: + def analyze_batch( + self, texts: List[str], show_progress: bool = True + ) -> BatchSentimentResult: """ 批量情感分析 - + Args: texts: 文本列表 show_progress: 是否显示进度 - + Returns: BatchSentimentResult对象 """ @@ -343,9 +368,9 @@ class WeiboMultilingualSentimentAnalyzer: success_count=0, failed_count=0, average_confidence=0.0, - analysis_performed=not self.is_disabled and self.is_initialized + analysis_performed=not self.is_disabled and self.is_initialized, ) - + if self.is_disabled or not self.is_initialized: passthrough_results = [ SentimentResult( @@ -355,7 +380,7 @@ class WeiboMultilingualSentimentAnalyzer: probability_distribution={}, success=False, error_message=self.disable_reason or "情感分析功能不可用", - analysis_performed=False + analysis_performed=False, ) for text in texts ] @@ -365,42 +390,44 @@ class WeiboMultilingualSentimentAnalyzer: success_count=0, failed_count=len(texts), average_confidence=0.0, - analysis_performed=False + analysis_performed=False, ) - + results = [] success_count = 0 total_confidence = 0.0 - + for i, text in enumerate(texts): if show_progress and len(texts) > 1: - print(f"处理进度: {i+1}/{len(texts)}") - + print(f"处理进度: {i + 1}/{len(texts)}") + result = self.analyze_single_text(text) results.append(result) - + if result.success: success_count += 1 total_confidence += result.confidence - - average_confidence = total_confidence / success_count if success_count > 0 else 0.0 + + average_confidence = ( + total_confidence / success_count if success_count > 0 else 0.0 + ) failed_count = len(texts) - success_count - + return BatchSentimentResult( results=results, total_processed=len(texts), success_count=success_count, failed_count=failed_count, average_confidence=average_confidence, - analysis_performed=True + analysis_performed=True, ) - + def _build_passthrough_analysis( self, original_data: List[Dict[str, Any]], reason: str, texts: Optional[List[str]] = None, - results: Optional[List[SentimentResult]] = None + results: Optional[List[SentimentResult]] = None, ) -> Dict[str, Any]: """ 构建在情感分析不可用时的透传结果 @@ -416,33 +443,36 @@ class WeiboMultilingualSentimentAnalyzer: "sentiment_distribution": {}, "high_confidence_results": [], "summary": f"情感分析未执行:{reason}", - "original_texts": original_data + "original_texts": original_data, } } - + if texts is not None: response["sentiment_analysis"]["passthrough_texts"] = texts - + if results is not None: response["sentiment_analysis"]["results"] = [ result.__dict__ if isinstance(result, SentimentResult) else result for result in results ] - + return response - - def analyze_query_results(self, query_results: List[Dict[str, Any]], - text_field: str = "content", - min_confidence: float = 0.5) -> Dict[str, Any]: + + def analyze_query_results( + self, + query_results: List[Dict[str, Any]], + text_field: str = "content", + min_confidence: float = 0.5, + ) -> Dict[str, Any]: """ 对查询结果进行情感分析 专门用于分析从MediaCrawlerDB返回的查询结果 - + Args: query_results: 查询结果列表,每个元素包含文本内容 text_field: 文本内容字段名,默认为"content" min_confidence: 最小置信度阈值 - + Returns: 包含情感分析结果的字典 """ @@ -452,14 +482,14 @@ class WeiboMultilingualSentimentAnalyzer: "total_analyzed": 0, "sentiment_distribution": {}, "high_confidence_results": [], - "summary": "没有内容需要分析" + "summary": "没有内容需要分析", } } - + # 提取文本内容 texts_to_analyze = [] original_data = [] - + for item in query_results: # 尝试多个可能的文本字段 text_content = "" @@ -467,49 +497,52 @@ class WeiboMultilingualSentimentAnalyzer: if field in item and item[field]: text_content = str(item[field]) break - + if text_content.strip(): texts_to_analyze.append(text_content) original_data.append(item) - + if not texts_to_analyze: return { "sentiment_analysis": { "total_analyzed": 0, "sentiment_distribution": {}, "high_confidence_results": [], - "summary": "查询结果中没有找到可分析的文本内容" + "summary": "查询结果中没有找到可分析的文本内容", } } - + if self.is_disabled: return self._build_passthrough_analysis( original_data=original_data, reason=self.disable_reason or "情感分析模型不可用", - texts=texts_to_analyze + texts=texts_to_analyze, ) - + # 执行批量情感分析 print(f"正在对{len(texts_to_analyze)}条内容进行情感分析...") batch_result = self.analyze_batch(texts_to_analyze, show_progress=True) - + if not batch_result.analysis_performed: reason = self.disable_reason or "情感分析功能不可用" if batch_result.results: - candidate_error = next((r.error_message for r in batch_result.results if r.error_message), None) + candidate_error = next( + (r.error_message for r in batch_result.results if r.error_message), + None, + ) if candidate_error: reason = candidate_error return self._build_passthrough_analysis( original_data=original_data, reason=reason, texts=texts_to_analyze, - results=batch_result.results + results=batch_result.results, ) - + # 统计情感分布 sentiment_distribution = {} high_confidence_results = [] - + for result, original_item in zip(batch_result.results, original_data): if result.success: # 统计情感分布 @@ -517,24 +550,28 @@ class WeiboMultilingualSentimentAnalyzer: if sentiment not in sentiment_distribution: sentiment_distribution[sentiment] = 0 sentiment_distribution[sentiment] += 1 - + # 收集高置信度结果 if result.confidence >= min_confidence: - high_confidence_results.append({ - "original_data": original_item, - "sentiment": result.sentiment_label, - "confidence": result.confidence, - "text_preview": result.text[:100] + "..." if len(result.text) > 100 else result.text - }) - + high_confidence_results.append( + { + "original_data": original_item, + "sentiment": result.sentiment_label, + "confidence": result.confidence, + "text_preview": result.text[:100] + "..." + if len(result.text) > 100 + else result.text, + } + ) + # 生成情感分析摘要 total_analyzed = batch_result.success_count if total_analyzed > 0: dominant_sentiment = max(sentiment_distribution.items(), key=lambda x: x[1]) - sentiment_summary = f"共分析{total_analyzed}条内容,主要情感倾向为'{dominant_sentiment[0]}'({dominant_sentiment[1]}条,占{dominant_sentiment[1]/total_analyzed*100:.1f}%)" + sentiment_summary = f"共分析{total_analyzed}条内容,主要情感倾向为'{dominant_sentiment[0]}'({dominant_sentiment[1]}条,占{dominant_sentiment[1] / total_analyzed * 100:.1f}%)" else: sentiment_summary = "情感分析失败" - + return { "sentiment_analysis": { "total_analyzed": total_analyzed, @@ -542,28 +579,46 @@ class WeiboMultilingualSentimentAnalyzer: "average_confidence": round(batch_result.average_confidence, 4), "sentiment_distribution": sentiment_distribution, "high_confidence_results": high_confidence_results, # 返回所有高置信度结果,不做限制 - "summary": sentiment_summary + "summary": sentiment_summary, } } - + def get_model_info(self) -> Dict[str, Any]: """ 获取模型信息 - + Returns: 模型信息字典 """ return { "model_name": "tabularisai/multilingual-sentiment-analysis", "supported_languages": [ - "中文", "英文", "西班牙文", "阿拉伯文", "日文", "韩文", - "德文", "法文", "意大利文", "葡萄牙文", "俄文", "荷兰文", - "波兰文", "土耳其文", "丹麦文", "希腊文", "芬兰文", - "瑞典文", "挪威文", "匈牙利文", "捷克文", "保加利亚文" + "中文", + "英文", + "西班牙文", + "阿拉伯文", + "日文", + "韩文", + "德文", + "法文", + "意大利文", + "葡萄牙文", + "俄文", + "荷兰文", + "波兰文", + "土耳其文", + "丹麦文", + "希腊文", + "芬兰文", + "瑞典文", + "挪威文", + "匈牙利文", + "捷克文", + "保加利亚文", ], "sentiment_levels": list(self.sentiment_map.values()), "is_initialized": self.is_initialized, - "device": str(self.device) if self.device else "未设置" + "device": str(self.device) if self.device else "未设置", } @@ -576,20 +631,23 @@ def enable_sentiment_analysis() -> bool: return multilingual_sentiment_analyzer.enable() -def disable_sentiment_analysis(reason: Optional[str] = None, drop_state: bool = False) -> None: +def disable_sentiment_analysis( + reason: Optional[str] = None, drop_state: bool = False +) -> None: """Public helper to disable sentiment analysis at runtime.""" multilingual_sentiment_analyzer.disable(reason=reason, drop_state=drop_state) -def analyze_sentiment(text_or_texts: Union[str, List[str]], - initialize_if_needed: bool = True) -> Union[SentimentResult, BatchSentimentResult]: +def analyze_sentiment( + text_or_texts: Union[str, List[str]], initialize_if_needed: bool = True +) -> Union[SentimentResult, BatchSentimentResult]: """ 便捷的情感分析函数 - + Args: text_or_texts: 单个文本或文本列表 initialize_if_needed: 如果模型未初始化,是否自动初始化 - + Returns: SentimentResult或BatchSentimentResult """ @@ -599,7 +657,7 @@ def analyze_sentiment(text_or_texts: Union[str, List[str]], and not multilingual_sentiment_analyzer.is_disabled ): multilingual_sentiment_analyzer.initialize() - + if isinstance(text_or_texts, str): return multilingual_sentiment_analyzer.analyze_single_text(text_or_texts) else: @@ -610,24 +668,30 @@ def analyze_sentiment(text_or_texts: Union[str, List[str]], if __name__ == "__main__": # 测试代码 analyzer = WeiboMultilingualSentimentAnalyzer() - + if analyzer.initialize(): # 测试单个文本 result = analyzer.analyze_single_text("今天天气真好,心情特别棒!") - print(f"单个文本分析: {result.sentiment_label} (置信度: {result.confidence:.4f})") - + print( + f"单个文本分析: {result.sentiment_label} (置信度: {result.confidence:.4f})" + ) + # 测试批量文本 test_texts = [ "这家餐厅的菜味道非常棒!", "服务态度太差了,很失望", "I absolutely love this product!", - "The customer service was disappointing." + "The customer service was disappointing.", ] - + batch_result = analyzer.analyze_batch(test_texts) - print(f"\n批量分析: 成功 {batch_result.success_count}/{batch_result.total_processed}") - + print( + f"\n批量分析: 成功 {batch_result.success_count}/{batch_result.total_processed}" + ) + for result in batch_result.results: - print(f"'{result.text[:30]}...' -> {result.sentiment_label} ({result.confidence:.4f})") + print( + f"'{result.text[:30]}...' -> {result.sentiment_label} ({result.confidence:.4f})" + ) else: print("模型初始化失败,无法进行测试")