diff --git a/.gitignore b/.gitignore index b2a1796..a63202a 100644 --- a/.gitignore +++ b/.gitignore @@ -182,6 +182,7 @@ WeiboSentiment_Finetuned/GPT2-AdapterTuning/models/ WeiboSentiment_Finetuned/BertChinese-Lora/models/ WeiboSentiment_LLM/models/ WeiboSentiment_Finetuned/BertChinese-Lora/model/ +WeiboMultilingualSentiment/model/ # LoRA 和 Adapter 权重 */adapter_model.safetensors diff --git a/WeiboMultilingualSentiment/README.md b/WeiboMultilingualSentiment/README.md new file mode 100644 index 0000000..deb7bd1 --- /dev/null +++ b/WeiboMultilingualSentiment/README.md @@ -0,0 +1,119 @@ +# 多语言情感分析 - Multilingual Sentiment Analysis + +本模块使用HuggingFace上的多语言情感分析模型进行情感分析,支持22种语言。 + +## 模型信息 + +- **模型名称**: tabularisai/multilingual-sentiment-analysis +- **基础模型**: distilbert-base-multilingual-cased +- **支持语言**: 22种语言,包括: + - 中文 (中文) + - English (英语) + - Español (西班牙语) + - 日本語 (日语) + - 한국어 (韩语) + - Français (法语) + - Deutsch (德语) + - Русский (俄语) + - العربية (阿拉伯语) + - हिन्दी (印地语) + - Português (葡萄牙语) + - Italiano (意大利语) + - 等等... + +- **输出类别**: 5级情感分类 + - 非常负面 (Very Negative) + - 负面 (Negative) + - 中性 (Neutral) + - 正面 (Positive) + - 非常正面 (Very Positive) + +## 快速开始 + +1. 确保已安装依赖: +```bash +pip install transformers torch +``` + +2. 运行预测程序: +```bash +python predict.py +``` + +3. 输入任意语言的文本进行分析: +``` +请输入文本: I love this product! +预测结果: 非常正面 (置信度: 0.9456) +``` + +4. 查看多语言示例: +``` +请输入文本: demo +``` + +## 代码示例 + +```python +from transformers import AutoTokenizer, AutoModelForSequenceClassification +import torch + +# 加载模型 +model_name = "tabularisai/multilingual-sentiment-analysis" +tokenizer = AutoTokenizer.from_pretrained(model_name) +model = AutoModelForSequenceClassification.from_pretrained(model_name) + +# 预测 +texts = [ + "今天心情很好", # 中文 + "I love this!", # 英文 + "¡Me encanta!" 
# Patterns are compiled once at import time so repeated calls (one per user
# input) do not pay for pattern lookup again.
_TEMPLATE_TAG_RE = re.compile(r"\{%.+?%\}")
_MENTION_RE = re.compile(r"@.+?( |$)")
_BRACKET_TAG_RE = re.compile(r"【.+?】")
_ZERO_WIDTH_SPACE_RE = re.compile(r"\u200b")
_URL_RE = re.compile(
    r"http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+"
)
# NOTE: the previous pattern contained the range U+24C2-U+1F251, which spans
# every CJK ideograph, kana, Hangul syllable and full-width punctuation mark,
# so Chinese/Japanese/Korean input was deleted entirely. The ranges below list
# only the intended symbol blocks.
_EMOJI_RE = re.compile(
    "["
    r"\U0001F600-\U0001F64F"  # emoticons
    r"\U0001F300-\U0001F5FF"  # symbols and pictographs
    r"\U0001F680-\U0001F6FF"  # transport and map symbols
    r"\U0001F1E0-\U0001F1FF"  # regional indicators (flags)
    r"\U00002600-\U000027BF"  # miscellaneous symbols and dingbats
    r"\U0001F900-\U0001F9FF"  # supplemental symbols and pictographs
    r"\U0001F018-\U0001F270"  # tiles, playing cards, enclosed characters
    r"\U0000231A-\U0000231B"  # watch, hourglass
    r"\U0000238D"
    r"\U000024C2"             # circled M (single code point, not a range start)
    r"\U00002B00-\U00002BFF"  # arrows and stars such as U+2B50
    r"\U0000FE0F"             # emoji variation selector left behind by e.g. U+2764
    "]+"
)
_WHITESPACE_RE = re.compile(r"\s+")


def preprocess_text(text):
    """Clean a raw social-media post before it is tokenized.

    The following are removed or neutralised, in this order:
    ``{%...%}`` template tags, ``@mention`` tokens, ``【...】`` bracketed tags,
    zero-width spaces, http/https URLs and emoji. Runs of whitespace are then
    collapsed to a single space and the result is stripped.

    Text in any script (Chinese, Japanese, Korean, Latin, ...) is preserved;
    only the symbol blocks listed in ``_EMOJI_RE`` are dropped.

    Args:
        text: The raw input string.

    Returns:
        The cleaned string; may be empty if the input held nothing but
        removable content.
    """
    text = _TEMPLATE_TAG_RE.sub(" ", text)
    text = _MENTION_RE.sub(" ", text)
    text = _BRACKET_TAG_RE.sub(" ", text)
    text = _ZERO_WIDTH_SPACE_RE.sub(" ", text)
    text = _URL_RE.sub("", text)
    text = _EMOJI_RE.sub("", text)
    return _WHITESPACE_RE.sub(" ", text).strip()
def _load_tokenizer_and_model(model_name, local_model_path):
    """Return ``(tokenizer, model)``, preferring the local copy.

    If ``local_model_path`` exists but cannot be loaded (for example because a
    previous save was interrupted), the model is downloaded again from
    ``model_name`` and re-saved instead of failing on every later run.
    """
    import os  # kept local, as in the original script

    if os.path.isdir(local_model_path):
        print("从本地加载模型...")
        try:
            tokenizer = AutoTokenizer.from_pretrained(local_model_path)
            model = AutoModelForSequenceClassification.from_pretrained(local_model_path)
            return tokenizer, model
        except (OSError, ValueError) as e:
            # An incomplete or corrupted directory: fall through to download.
            print(f"本地模型加载失败 ({e}),尝试重新下载...")
    else:
        print("首次使用,正在下载模型到本地...")

    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSequenceClassification.from_pretrained(model_name)

    # Persist both parts so later runs can work offline.
    tokenizer.save_pretrained(local_model_path)
    model.save_pretrained(local_model_path)
    print(f"模型已保存到: {local_model_path}")
    return tokenizer, model


def _predict_probabilities(text, tokenizer, model, device):
    """Run one text through the classifier and return its class probabilities.

    Returns a 1-D tensor holding the softmax over the model's logits for the
    single input text.
    """
    inputs = tokenizer(
        text,
        max_length=512,
        padding=True,
        truncation=True,
        return_tensors='pt'
    )
    # Move every encoded tensor to the device the model lives on.
    inputs = {k: v.to(device) for k, v in inputs.items()}

    with torch.no_grad():
        logits = model(**inputs).logits
    return torch.softmax(logits, dim=1)[0]


def main():
    """Interactive command-line loop for multilingual sentiment analysis.

    Loads the model (from ``./model`` when available, otherwise downloading
    and caching it there), then repeatedly reads a line of text and prints the
    predicted 5-level sentiment with the full probability distribution.
    ``q`` quits, ``demo`` prints the built-in multilingual examples, and
    end-of-input or Ctrl-C exits the loop cleanly.
    """
    print("正在加载多语言情感分析模型...")

    model_name = "tabularisai/multilingual-sentiment-analysis"
    local_model_path = "./model"

    try:
        tokenizer, model = _load_tokenizer_and_model(model_name, local_model_path)

        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        model.to(device)
        model.eval()
        print(f"模型加载成功! 使用设备: {device}")
    except Exception as e:
        # Top-level boundary of the script: report and exit instead of a traceback.
        print(f"模型加载失败: {e}")
        print("请检查网络连接")
        return

    # Class index -> label (5-level classification).
    sentiment_map = {
        0: "非常负面", 1: "负面", 2: "中性", 3: "正面", 4: "非常正面"
    }

    print("\n============= 多语言情感分析 =============")
    print("支持语言: 中文、英文、西班牙文、阿拉伯文、日文、韩文等22种语言")
    print("情感等级: 非常负面、负面、中性、正面、非常正面")
    print("输入文本进行分析 (输入 'q' 退出):")
    print("输入 'demo' 查看多语言示例")

    while True:
        try:
            text = input("\n请输入文本: ")
        except (EOFError, KeyboardInterrupt):
            # Piped input ran out or the user pressed Ctrl-C: leave quietly.
            print()
            break

        # Commands are matched after trimming so " q" or "demo " still work.
        command = text.strip().lower()
        if command == 'q':
            break

        if command == 'demo':
            show_multilingual_demo(tokenizer, model, device, sentiment_map)
            continue

        if not command:
            print("输入不能为空,请重新输入")
            continue

        try:
            processed_text = preprocess_text(text)
            if not processed_text:
                # Nothing but mentions/URLs/emoji: a prediction would be meaningless.
                print("预处理后文本为空,请重新输入")
                continue

            probabilities = _predict_probabilities(processed_text, tokenizer, model, device)
            prediction = torch.argmax(probabilities).item()
            confidence = probabilities[prediction].item()
            label = sentiment_map[prediction]

            print(f"预测结果: {label} (置信度: {confidence:.4f})")

            print("详细概率分布:")
            for label_name, prob in zip(sentiment_map.values(), probabilities):
                print(f"  {label_name}: {prob.item():.4f}")

        except Exception as e:
            # Keep the interactive session alive after a failed prediction.
            print(f"预测时发生错误: {e}")
            continue


def show_multilingual_demo(tokenizer, model, device, sentiment_map):
    """Print predictions for a fixed set of example sentences.

    Args:
        tokenizer: Callable that encodes a text into a dict of tensors.
        model: Callable returning an object with a ``logits`` attribute.
        device: Torch device the encoded tensors are moved to.
        sentiment_map: Mapping from class index to the label to display.

    A failure on one example is reported and does not stop the others.
    """
    print("\n=== 多语言情感分析示例 ===")

    demo_texts = [
        # Chinese
        ("今天天气真好,心情特别棒!", "中文"),
        ("这家餐厅的菜味道非常棒!", "中文"),
        ("服务态度太差了,很失望", "中文"),

        # English
        ("I absolutely love this product!", "英文"),
        ("The customer service was disappointing.", "英文"),
        ("The weather is fine, nothing special.", "英文"),

        # Japanese
        ("このレストランの料理は本当に美味しいです!", "日文"),
        ("このホテルのサービスはがっかりしました。", "日文"),

        # Korean
        ("이 가게의 케이크는 정말 맛있어요!", "韩文"),
        ("서비스가 너무 별로였어요。", "韩文"),

        # Spanish
        ("¡Me encanta cómo quedó la decoración!", "西班牙文"),
        ("El servicio fue terrible y muy lento.", "西班牙文"),
    ]

    for text, language in demo_texts:
        try:
            probabilities = _predict_probabilities(text, tokenizer, model, device)
            prediction = torch.argmax(probabilities).item()
            confidence = probabilities[prediction].item()
            label = sentiment_map[prediction]

            print(f"\n{language}: {text}")
            print(f"结果: {label} (置信度: {confidence:.4f})")

        except Exception as e:
            print(f"处理 {text} 时出错: {e}")

    print("\n=== 示例结束 ===")


if __name__ == "__main__":
    main()
# NOTE: the previous emoji pattern contained the range U+24C2-U+1F251, which
# spans every CJK ideograph and full-width punctuation mark, so Chinese Weibo
# text was deleted entirely before reaching the model. Only the intended
# symbol blocks are listed here.
_EMOJI_RE = re.compile(
    "["
    r"\U0001F600-\U0001F64F"  # emoticons
    r"\U0001F300-\U0001F5FF"  # symbols and pictographs
    r"\U0001F680-\U0001F6FF"  # transport and map symbols
    r"\U0001F1E0-\U0001F1FF"  # regional indicators (flags)
    r"\U00002600-\U000027BF"  # miscellaneous symbols and dingbats
    r"\U0001F900-\U0001F9FF"  # supplemental symbols and pictographs
    r"\U0001F018-\U0001F270"  # tiles, playing cards, enclosed characters
    r"\U0000231A-\U0000231B"  # watch, hourglass
    r"\U0000238D"
    r"\U000024C2"             # circled M (single code point, not a range start)
    r"\U00002B00-\U00002BFF"  # arrows and stars such as U+2B50
    r"\U0000FE0F"             # emoji variation selector left behind by e.g. U+2764
    "]+"
)


def preprocess_text(text):
    """Clean a raw Weibo post before it is tokenized.

    Removes ``{%...%}`` template tags, ``@mention`` tokens, ``【...】``
    bracketed tags, zero-width spaces and emoji, then collapses runs of
    whitespace to a single space and strips the result. Ordinary text,
    including Chinese characters and full-width punctuation, is preserved.

    Args:
        text: The raw input string.

    Returns:
        The cleaned string; may be empty if the input held nothing but
        removable content.
    """
    text = re.sub(r"\{%.+?%\}", " ", text)   # {%xxx%} template tags
    text = re.sub(r"@.+?( |$)", " ", text)   # @mentions up to the next space
    text = re.sub(r"【.+?】", " ", text)      # 【xx】 bracketed tags
    text = re.sub(r"\u200b", " ", text)      # zero-width space
    text = _EMOJI_RE.sub("", text)           # emoji only, never CJK text
    text = re.sub(r"\s+", " ", text)         # collapse whitespace
    return text.strip()