Local sentiment analysis upload.

This commit is contained in:
戒酒的李白
2025-08-23 15:55:07 +08:00
parent 4e33224633
commit f448ddd466
65 changed files with 1563359 additions and 11 deletions
@@ -0,0 +1,78 @@
# 微博情感分析 - 基于BertChinese的微调模型
本模块使用HuggingFace上的预训练微博情感分析模型进行情感分析。
## 模型信息
- **模型名称**: wsqstar/GISchat-weibo-100k-fine-tuned-bert
- **模型类型**: BERT中文情感分类模型
- **训练数据**: 10万条微博数据
- **输出**: 二分类(正面/负面情感)
## 使用方法
### 方法1: 直接模型调用 (推荐)
```bash
python predict.py
```
### 方法2: Pipeline方式
```bash
python predict_pipeline.py
```
## 快速开始
1. 确保已安装依赖:
```bash
pip install transformers torch
```
2. 运行预测程序:
```bash
python predict.py
```
3. 输入微博文本进行分析:
```
请输入微博内容: 今天天气真好,心情特别棒!
预测结果: 正面情感 (置信度: 0.9234)
```
## 代码示例
```python
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
# 加载模型
model_name = "wsqstar/GISchat-weibo-100k-fine-tuned-bert"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)
# 预测
text = "今天心情很好"
inputs = tokenizer(text, return_tensors="pt")
outputs = model(**inputs)
prediction = torch.argmax(outputs.logits, dim=1).item()
print("正面情感" if prediction == 1 else "负面情感")
```
## 文件说明
- `predict.py`: 主预测程序,使用直接模型调用
- `predict_pipeline.py`: 使用pipeline方式的预测程序
- `README.md`: 使用说明
## 模型存储
- 首次运行时会自动下载模型到当前目录的 `model` 文件夹
- 后续运行会直接从本地加载,无需重复下载
- 模型大小约400MB,首次下载需要网络连接
## 注意事项
- 首次运行时会自动下载模型,需要网络连接
- 模型会保存到当前目录,方便后续使用
- 支持GPU加速,会自动检测可用设备
- 如需清理模型文件,删除 `model` 文件夹即可
@@ -0,0 +1,90 @@
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
import re
def preprocess_text(text):
    """Return the input text unchanged.

    No cleaning is applied here; the value is handed to the tokenizer as-is.
    """
    processed = text
    return processed
def main():
    """Run an interactive Weibo sentiment prediction loop.

    Loads the tokenizer and classifier from ``./model`` when a saved copy is
    present; otherwise downloads them from the Hugging Face Hub and saves them
    there. It then reads lines from stdin until the user enters ``q`` (or
    stdin is closed) and prints the predicted label together with its softmax
    probability.
    """
    import os

    print("正在加载微博情感分析模型...")

    # Pretrained model on the Hugging Face Hub and its local copy.
    model_name = "wsqstar/GISchat-weibo-100k-fine-tuned-bert"
    local_model_path = "./model"

    try:
        # Look for config.json rather than the bare directory: a folder left
        # behind by an interrupted download would otherwise be treated as a
        # usable model and make every later run fail.
        if os.path.isfile(os.path.join(local_model_path, "config.json")):
            print("从本地加载模型...")
            tokenizer = AutoTokenizer.from_pretrained(local_model_path)
            model = AutoModelForSequenceClassification.from_pretrained(local_model_path)
        else:
            print("首次使用,正在下载模型到本地...")
            # Download, then keep a local copy for later runs.
            tokenizer = AutoTokenizer.from_pretrained(model_name)
            model = AutoModelForSequenceClassification.from_pretrained(model_name)

            tokenizer.save_pretrained(local_model_path)
            model.save_pretrained(local_model_path)
            print(f"模型已保存到: {local_model_path}")

        # Pick the GPU when one is available.
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        model.to(device)
        model.eval()
        print(f"模型加载成功! 使用设备: {device}")

    except Exception as e:
        print(f"模型加载失败: {e}")
        print("请检查网络连接或使用pipeline方式")
        return

    print("\n============= 微博情感分析 =============")
    print("输入微博内容进行分析 (输入 'q' 退出):")

    while True:
        try:
            text = input("\n请输入微博内容: ")
        except (EOFError, KeyboardInterrupt):
            # stdin was closed (e.g. piped input ran out) or the user pressed
            # Ctrl-C: leave the loop instead of dying with a traceback.
            print()
            break

        # Surrounding whitespace is ignored for the quit command.
        if text.strip().lower() == 'q':
            break

        if not text.strip():
            print("输入不能为空,请重新输入")
            continue

        try:
            processed_text = preprocess_text(text)

            # Tokenize and encode as a batch of one.
            inputs = tokenizer(
                processed_text,
                max_length=512,
                padding=True,
                truncation=True,
                return_tensors='pt'
            )

            # Move every tensor to the model's device.
            inputs = {k: v.to(device) for k, v in inputs.items()}

            with torch.no_grad():
                outputs = model(**inputs)
                logits = outputs.logits
                probabilities = torch.softmax(logits, dim=1)
                prediction = torch.argmax(probabilities, dim=1).item()

            # Class index 1 is reported as positive, anything else as negative.
            confidence = probabilities[0][prediction].item()
            label = "正面情感" if prediction == 1 else "负面情感"

            print(f"预测结果: {label} (置信度: {confidence:.4f})")

        except Exception as e:
            print(f"预测时发生错误: {e}")
            continue


if __name__ == "__main__":
    main()
@@ -0,0 +1,101 @@
from transformers import pipeline
import re
# Emoji / pictograph code points stripped by preprocess_text().
# The ranges are deliberately narrow: a single wide range such as
# U+24C2..U+1F251 also contains every CJK ideograph and fullwidth punctuation
# mark, and would therefore erase the Chinese text itself.
_EMOJI_PATTERN = re.compile(
    r"["
    r"\U0001F600-\U0001F64F"  # emoticons
    r"\U0001F300-\U0001F5FF"  # miscellaneous symbols and pictographs
    r"\U0001F680-\U0001F6FF"  # transport and map symbols
    r"\U0001F1E0-\U0001F1FF"  # regional indicator letters (flags)
    r"\U00002600-\U000027BF"  # miscellaneous symbols and dingbats
    r"\U0001F900-\U0001F9FF"  # supplemental symbols and pictographs
    r"\U0001F018-\U0001F270"  # tiles, cards, enclosed alphanumerics/ideographs
    r"\U0000231A-\U0000231B"  # watch, hourglass
    r"\U0000238D"
    r"\U000024C2"             # circled M
    r"\U00002B00-\U00002BFF"  # miscellaneous symbols and arrows
    r"\U0000FE0F"             # variation selector-16 trailing an emoji
    r"]+"
)


def preprocess_text(text):
    """Clean a Weibo post before it is classified.

    Removes ``{%...%}`` template tags, ``@mention`` spans (up to the next
    space or the end of the text), ``【...】`` bracketed titles, zero-width
    spaces and emoji, then collapses runs of whitespace.

    Args:
        text: Raw post text.

    Returns:
        The cleaned text with leading and trailing whitespace stripped.
    """
    text = re.sub(r"\{%.+?%\}", " ", text)   # {%xxx%}
    text = re.sub(r"@.+?( |$)", " ", text)   # @xxx
    text = re.sub(r"【.+?】", " ", text)      # 【xx】
    text = re.sub(r"\u200b", " ", text)      # zero-width space
    text = _EMOJI_PATTERN.sub("", text)      # emoji only, CJK text is kept
    text = re.sub(r"\s+", " ", text)         # collapse whitespace
    return text.strip()
def main():
    """Run an interactive Weibo sentiment loop backed by a transformers pipeline.

    Ensures a local copy of the model exists in ``./model`` (downloading and
    saving it on first use), builds a ``text-classification`` pipeline from
    that copy, then reads lines from stdin until the user enters ``q`` (or
    stdin is closed) and prints the predicted label and its score.
    """
    import os

    print("正在加载微博情感分析模型...")

    model_name = "wsqstar/GISchat-weibo-100k-fine-tuned-bert"
    local_model_path = "./model"

    try:
        # Look for config.json rather than the bare directory so that a folder
        # left behind by an interrupted download triggers a fresh download
        # instead of a permanent load failure.
        if os.path.isfile(os.path.join(local_model_path, "config.json")):
            print("从本地加载模型...")
        else:
            print("首次使用,正在下载模型到本地...")
            from transformers import AutoTokenizer, AutoModelForSequenceClassification
            tokenizer = AutoTokenizer.from_pretrained(model_name)
            model = AutoModelForSequenceClassification.from_pretrained(model_name)

            tokenizer.save_pretrained(local_model_path)
            model.save_pretrained(local_model_path)
            print(f"模型已保存到: {local_model_path}")

        # Both paths end up loading from the local copy, so the pipeline is
        # built once.
        # NOTE(review): return_all_scores is reported as deprecated in recent
        # transformers releases in favour of top_k=None; the nesting of the
        # result may differ between the two, so it is kept as is — confirm
        # against the installed version before changing it.
        classifier = pipeline(
            "text-classification",
            model=local_model_path,
            return_all_scores=True
        )
        print("模型加载成功!")

    except Exception as e:
        print(f"模型加载失败: {e}")
        print("请检查网络连接")
        return

    print("\n============= 微博情感分析 (Pipeline版) =============")
    print("输入微博内容进行分析 (输入 'q' 退出):")

    while True:
        try:
            text = input("\n请输入微博内容: ")
        except (EOFError, KeyboardInterrupt):
            # stdin closed or Ctrl-C: leave the loop without a traceback.
            print()
            break

        # Surrounding whitespace is ignored for the quit command.
        if text.strip().lower() == 'q':
            break

        if not text.strip():
            print("输入不能为空,请重新输入")
            continue

        try:
            processed_text = preprocess_text(text)

            outputs = classifier(processed_text)

            # outputs[0] is iterated as a list of {'label', 'score'} entries.
            scores = {item['label']: item['score'] for item in outputs[0]}
            positive_score = scores.get('LABEL_1')  # positive
            negative_score = scores.get('LABEL_0')  # negative

            if positive_score is None or negative_score is None:
                # The model uses other label names: report the best raw label
                # instead of failing on a None comparison.
                best = max(outputs[0], key=lambda item: item['score'])
                label = best['label']
                confidence = best['score']
            elif positive_score > negative_score:
                label = "正面情感"
                confidence = positive_score
            else:
                label = "负面情感"
                confidence = negative_score

            print(f"预测结果: {label} (置信度: {confidence:.4f})")

        except Exception as e:
            print(f"预测时发生错误: {e}")
            continue


if __name__ == "__main__":
    main()
@@ -0,0 +1,93 @@
# 微博情感识别模型-GPT2-Adapter微调
## 项目说明
这是一个基于GPT2的微博情感二分类模型,采用Adapter微调技术。通过Adapter微调,只需训练少量参数就可以让模型适应情感分析任务,大幅降低计算资源需求和模型体积。
## 数据集
使用微博情感数据集(weibo_senti_100k),包含约10万条带情感标注的微博内容,正负向评论各约5万条。数据集标签:
- 标签0:负面情感
- 标签1:正面情感
## 文件结构
```
GPT2-Adpter-tuning/
├── adapter.py # Adapter层的实现
├── gpt2_adapter.py # 针对GPT2模型的Adapter实现
├── train.py # 训练脚本
├── predict.py # 简化版预测脚本(交互式使用)
├── models/ # 本地存储的预训练模型
│ └── gpt2-chinese/ # 中文GPT2模型及配置
├── dataset/ # 数据集目录
│ └── weibo_senti_100k.csv # 微博情感数据集
└── best_weibo_sentiment_model.pth # 训练好的最佳模型
```
## 技术特点
1. **参数高效微调**:相比全参数微调,仅训练约3%的参数
2. **模型性能保持**:在仅训练少量参数的情况下,保持良好的分类性能
3. **适用于资源受限环境**:模型体积小,推理速度快
## 环境依赖
- Python 3.6+
- PyTorch
- Transformers
- Pandas
- NumPy
- Scikit-learn
- Tqdm
## 使用方法
### 训练模型
```bash
python train.py
```
训练过程会自动:
- 下载并本地保存中文GPT2预训练模型
- 加载微博情感数据集
- 训练模型并保存最佳模型
### 情感分析预测
```bash
python predict.py
```
运行后将进入交互模式:
- 在控制台输入要分析的微博文本
- 系统会返回情感分析结果(正面/负面)和置信度
- 输入'q'退出程序
## 模型结构
- 基础模型:`uer/gpt2-chinese-cluecorpussmall`中文预训练模型
- 模型本地保存路径:`./models/gpt2-chinese/`
- 通过在每个GPT2Block后添加Adapter层进行微调
- 冻结原始GPT2参数,仅训练分类器和Adapter层参数
## Adapter技术
Adapter是一种参数高效的微调技术,通过在Transformer层中插入小型的瓶颈层,实现用少量参数适应下游任务的目的。主要特点:
1. **参数高效**:相比全参数微调,Adapter只需训练很小一部分参数
2. **防止遗忘**:保持原始预训练模型的参数不变,避免灾难性遗忘
3. **适应多任务**:可以为不同任务训练不同的Adapter,共享同一个基础模型
在本项目中,我们在每个GPT2Block后添加了一个Adapter层,Adapter的隐藏层大小为64,远小于原始模型的隐藏层大小(通常为768或1024)。
## 使用示例
```
使用设备: cuda
加载模型: best_weibo_sentiment_model.pth
============= 微博情感分析 =============
输入微博内容进行分析 (输入 'q' 退出):
请输入微博内容: 这部电影真是太好看了,我非常喜欢!
预测结果: 正面情感 (置信度: 0.9876)
请输入微博内容: 服务态度差,价格还贵,一点都不推荐
预测结果: 负面情感 (置信度: 0.9742)
```
## 注意事项
- 预测脚本使用本地模型路径,不需要在线下载模型
- 确保`models/gpt2-chinese/`目录包含从训练过程中保存的模型文件
- 首次运行train.py时会自动下载并保存模型,请确保网络连接
@@ -0,0 +1,42 @@
import torch
import torch.nn as nn
class AdapterLayer(nn.Module):
    """Bottleneck adapter with a residual connection.

    The input is projected down to ``adapter_size``, passed through ReLU,
    projected back up to ``input_size`` and added to the original input.
    """

    def __init__(self, input_size, adapter_size):
        super().__init__()
        # Down-projection -> non-linearity -> up-projection.
        self.down_project = nn.Linear(input_size, adapter_size)
        self.activation = nn.ReLU()
        self.up_project = nn.Linear(adapter_size, input_size)
        self._init_weights()

    def _init_weights(self):
        # Both projections get small normal weights (std 1e-2) and zero
        # biases, which keeps the adapter's contribution small at the start
        # of training.
        for projection in (self.down_project, self.up_project):
            nn.init.normal_(projection.weight, std=1e-2)
            nn.init.zeros_(projection.bias)

    def forward(self, x):
        # Residual connection: input plus the bottleneck output.
        bottleneck = self.activation(self.down_project(x))
        return x + self.up_project(bottleneck)
@@ -0,0 +1,280 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# weibo_senti_100k 说明\n",
"0. **下载地址:** [百度网盘](https://pan.baidu.com/s/1DoQbki3YwqkuwQUOj64R_g)\n",
"1. **数据概览:** 10 万多条,带情感标注 新浪微博,正负向评论约各 5 万条\n",
"2. **推荐实验:** 情感/观点/评论 倾向性分析\n",
"2. **数据来源:** [新浪微博](https://weibo.com/)\n",
"3. **原数据集:** [新浪微博,情感分析标记语料共12万条](https://download.csdn.net/download/weixin_38442818/10214750),网上搜集,具体作者、来源不详\n",
"4. **加工处理:**\n",
" 1. 将原来的 2 份文档,整合成 1 份 csv 文件\n",
" 2. 编码统一为 UTF-8\n",
" 3. 去重"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"path = 'weibo_senti_100k_文件夹_所在_路径'"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# 1. weibo_senti_100k.csv"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 加载数据"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"评论数目(总体):119988\n",
"评论数目(正向):59993\n",
"评论数目(负向):59995\n"
]
}
],
"source": [
"pd_all = pd.read_csv(path + 'weibo_senti_100k.csv')\n",
"\n",
"print('评论数目(总体):%d' % pd_all.shape[0])\n",
"print('评论数目(正向):%d' % pd_all[pd_all.label==1].shape[0])\n",
"print('评论数目(负向):%d' % pd_all[pd_all.label==0].shape[0])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 字段说明\n",
"\n",
"| 字段 | 说明 |\n",
"| ---- | ---- |\n",
"| label | 1 表示正向评论,0 表示负向评论 |\n",
"| review | 微博内容 |"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>label</th>\n",
" <th>review</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>62050</th>\n",
" <td>0</td>\n",
" <td>太过分了@Rexzhenghao //@Janie_Zhang:招行最近负面新闻越来越多呀...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>68263</th>\n",
" <td>0</td>\n",
" <td>希望你?得好?我本"?肥血?史"[晕][哈哈]@Pete三姑父</td>\n",
" </tr>\n",
" <tr>\n",
" <th>81472</th>\n",
" <td>0</td>\n",
" <td>有点想参加????[偷?]想安排下时间再决定[抓狂]//@黑晶晶crystal: @细腿大羽...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>42021</th>\n",
" <td>1</td>\n",
" <td>[给力]感谢所有支持雯婕的芝麻![爱你]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7777</th>\n",
" <td>1</td>\n",
" <td>2013最后一天,在新加坡开心度过,向所有的朋友们问声:新年快乐!2014年,我们会更好[调...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>100399</th>\n",
" <td>0</td>\n",
" <td>大中午出门办事找错路,曝晒中。要多杯具有多杯具。[泪][泪][汗]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>82398</th>\n",
" <td>0</td>\n",
" <td>马航还会否认吗?到底在隐瞒啥呢?[抓狂]//@头条新闻: 转发微博</td>\n",
" </tr>\n",
" <tr>\n",
" <th>106423</th>\n",
" <td>0</td>\n",
" <td>克罗地亚球迷很爱放烟火!球又没进,就硝烟四起。[晕]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>24798</th>\n",
" <td>1</td>\n",
" <td>[抱抱]福芦 TangRoulou 吉祥书 8.8折优惠 &gt;&gt;&gt; http://t.cn/z...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6598</th>\n",
" <td>1</td>\n",
" <td>回复@钱旭明QXM:[嘻嘻][嘻嘻] //@钱旭明QXM:杨大哥[good][good][g...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>53920</th>\n",
" <td>1</td>\n",
" <td>人家这脸长的!!!!!![哈哈]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>15587</th>\n",
" <td>1</td>\n",
" <td>这个价不算高,和一天内训相比相差无几。。[哈哈]//@博通传媒v: 6个月!一个月工资1万,...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>101237</th>\n",
" <td>0</td>\n",
" <td>终于收工啦,脚丫子快冻掉了[泪][泪][泪]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>82449</th>\n",
" <td>0</td>\n",
" <td>我决定从今天开始我想吃什么就去吃什么,一个人吃也无所谓,重点是不要因为别人的意见委屈了自己[...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>32537</th>\n",
" <td>1</td>\n",
" <td>飘雪的北京 需要双份早餐.......//@美食天下: [哈哈]//@王淼Margay: 屁...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10630</th>\n",
" <td>1</td>\n",
" <td>[耶],这个太赞了,生活大爆炸第六季马上要出啦[鼓掌] //@-郑瑜-:这个不错 //@经典...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>85130</th>\n",
" <td>0</td>\n",
" <td>刚追完#倾世皇妃#,#千山暮雪#又紧随其后,网速和更新速度都太不给力,尽管我看过原著,还是焦...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>105956</th>\n",
" <td>0</td>\n",
" <td>晚上看金二胖?察前?,推出的火炮基座?糟了,可以PK了[泪] //@艾米粒er: //@wi...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>72391</th>\n",
" <td>0</td>\n",
" <td>必须把中国足球的伟大,用我的职业演说出来 //@袁腾飞:[泪]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10761</th>\n",
" <td>1</td>\n",
" <td>[鼓掌] //@宁波香格里拉大酒店: 小编来答疑,周五晚惊艳全场的树根蛋糕到底有多长?蛋糕全...</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" label review\n",
"62050 0 太过分了@Rexzhenghao //@Janie_Zhang:招行最近负面新闻越来越多呀...\n",
"68263 0 希望你?得好?我本"?肥血?史"[晕][哈哈]@Pete三姑父\n",
"81472 0 有点想参加????[偷?]想安排下时间再决定[抓狂]//@黑晶晶crystal: @细腿大羽...\n",
"42021 1 [给力]感谢所有支持雯婕的芝麻![爱你]\n",
"7777 1 2013最后一天,在新加坡开心度过,向所有的朋友们问声:新年快乐!2014年,我们会更好[调...\n",
"100399 0 大中午出门办事找错路,曝晒中。要多杯具有多杯具。[泪][泪][汗]\n",
"82398 0 马航还会否认吗?到底在隐瞒啥呢?[抓狂]//@头条新闻: 转发微博\n",
"106423 0 克罗地亚球迷很爱放烟火!球又没进,就硝烟四起。[晕]\n",
"24798 1 [抱抱]福芦 TangRoulou 吉祥书 8.8折优惠 >>> http://t.cn/z...\n",
"6598 1 回复@钱旭明QXM:[嘻嘻][嘻嘻] //@钱旭明QXM:杨大哥[good][good][g...\n",
"53920 1 人家这脸长的!!!!!![哈哈]\n",
"15587 1 这个价不算高,和一天内训相比相差无几。。[哈哈]//@博通传媒v: 6个月!一个月工资1万,...\n",
"101237 0 终于收工啦,脚丫子快冻掉了[泪][泪][泪]\n",
"82449 0 我决定从今天开始我想吃什么就去吃什么,一个人吃也无所谓,重点是不要因为别人的意见委屈了自己[...\n",
"32537 1 飘雪的北京 需要双份早餐.......//@美食天下: [哈哈]//@王淼Margay: 屁...\n",
"10630 1 [耶],这个太赞了,生活大爆炸第六季马上要出啦[鼓掌] //@-郑瑜-:这个不错 //@经典...\n",
"85130 0 刚追完#倾世皇妃#,#千山暮雪#又紧随其后,网速和更新速度都太不给力,尽管我看过原著,还是焦...\n",
"105956 0 晚上看金二胖?察前?,推出的火炮基座?糟了,可以PK了[泪] //@艾米粒er: //@wi...\n",
"72391 0 必须把中国足球的伟大,用我的职业演说出来 //@袁腾飞:[泪]\n",
"10761 1 [鼓掌] //@宁波香格里拉大酒店: 小编来答疑,周五晚惊艳全场的树根蛋糕到底有多长?蛋糕全..."
]
},
"execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"pd_all.sample(20)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.4"
},
"widgets": {
"state": {},
"version": "1.1.2"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
@@ -0,0 +1,20 @@
import torch
import torch.nn as nn
class AdapterLayer(nn.Module):
    """Bottleneck adapter: down-project, ReLU, up-project, residual add.

    Args:
        input_size: Size of the last dimension of the input (and output).
        adapter_size: Size of the bottleneck between the two projections.
    """

    def __init__(self, input_size, adapter_size):
        super(AdapterLayer, self).__init__()
        # First linear layer reduces the dimensionality.
        self.down_project = nn.Linear(input_size, adapter_size)
        self.relu = nn.ReLU()
        # Second linear layer restores the original dimensionality.
        self.up_project = nn.Linear(adapter_size, input_size)

    def forward(self, x):
        """Return ``x`` plus the adapter's bottleneck transformation of ``x``."""
        down_projected = self.down_project(x)
        activated = self.relu(down_projected)
        # The up-projection must consume the bottleneck activation; feeding it
        # ``x`` bypasses the adapter and fails with a shape mismatch whenever
        # input_size != adapter_size.
        up_projected = self.up_project(activated)
        # Residual connection.
        return x + up_projected
@@ -0,0 +1,11 @@
一种Adapter-tuning的实现方式,这里仅提供思路,具体实现可以视情况稍作修改。
这里补充一些模型层数:
GPT-2 Small:12个GPT2Block,约有1.17亿个参数。
GPT-2 Medium:24个GPT2Block,约有3.48亿个参数。
GPT-2 Large:36个GPT2Block,约有7.55亿个参数。
GPT-2 XL(也称为Extra Large):48个GPT2Block,约有15.54亿个参数。
RoBERTa Base:12个RobertaLayer,总共约有1.25亿个参数。
RoBERTa Large:24个RobertaLayer,总共约有3.55亿个参数。
@@ -0,0 +1,22 @@
from transformers.models.roberta.modeling_roberta import RobertaLayer
class RobertaLayerWithAdapter(RobertaLayer):
    """RobertaLayer whose output hidden states are passed through an adapter.

    NOTE(review): AdapterLayer is not imported in the visible part of this
    file — confirm it is in scope where this class is used.
    """

    def __init__(self, config):
        super().__init__(config)
        # Adapter bottleneck width; 64 is an assumed value.
        bottleneck_size = 64
        self.adapter = AdapterLayer(config.hidden_size, bottleneck_size)

    def forward(self, hidden_states, attention_mask=None, head_mask=None, encoder_hidden_states=None, encoder_attention_mask=None, past_key_value=None, output_attentions=False):
        # Run the unmodified layer first.
        layer_outputs = super().forward(
            hidden_states,
            attention_mask,
            head_mask,
            encoder_hidden_states,
            encoder_attention_mask,
            past_key_value,
            output_attentions,
        )
        # Only the first element (the layer output) goes through the adapter;
        # the remaining elements are returned untouched.
        adapted = self.adapter(layer_outputs[0])
        return (adapted,) + layer_outputs[1:]
"""
RoBERTa的每个RobertaLayer包含一个自注意力(self-attention)机制和一个前馈网络,这些层共同构成了RoBERTa的基础架构。
"""
@@ -0,0 +1,40 @@
from transformers.models.gpt2.modeling_gpt2 import GPT2Block
class GPT2BlockWithAdapter(GPT2Block):
    """GPT2Block whose first output is passed through an adapter layer.

    NOTE(review): AdapterLayer is not imported in the visible part of this
    file — confirm it is in scope where this class is used.
    """

    def __init__(self, config):
        super().__init__(config)
        # Adapter bottleneck width; 64 is an assumed value.
        bottleneck_size = 64
        self.adapter = AdapterLayer(config.n_embd, bottleneck_size)

    def forward(
        self,
        hidden_states,
        layer_past=None,
        attention_mask=None,
        head_mask=None,
        use_cache=False,
        output_attentions=False,
    ):
        # Run the unmodified block first.
        block_outputs = super().forward(
            hidden_states,
            layer_past=layer_past,
            attention_mask=attention_mask,
            head_mask=head_mask,
            use_cache=use_cache,
            output_attentions=output_attentions,
        )
        # Only the first element of the block's output goes through the
        # adapter; the remaining elements are returned untouched.
        adapted = self.adapter(block_outputs[0])
        return (adapted,) + block_outputs[1:]
"""
每个GPT2Block包含了一系列的自注意力(Self-Attention)和前馈网络(Feed-Forward)层,这些层共同构成了模型的基础架构。
"""
@@ -0,0 +1,60 @@
import torch
import torch.nn as nn
from transformers.models.gpt2.modeling_gpt2 import GPT2Block
from adapter import AdapterLayer
class GPT2BlockWithAdapter(nn.Module):
    """A GPT2Block followed by an adapter layer.

    Wraps a standard ``GPT2Block`` (stored as ``original_block``) and passes
    the first element of its output through an ``AdapterLayer`` (stored as
    ``adapter``).
    """

    def __init__(self, config):
        super(GPT2BlockWithAdapter, self).__init__()
        # The wrapped, standard GPT2Block.
        self.original_block = GPT2Block(config)
        adapter_size = 64  # hidden (bottleneck) size of the adapter
        self.adapter = AdapterLayer(config.hidden_size, adapter_size)

    def forward(
        self,
        hidden_states,
        layer_past=None,
        attention_mask=None,
        head_mask=None,
        encoder_hidden_states=None,
        encoder_attention_mask=None,
        use_cache=False,
        output_attentions=False,
        **kwargs  # accepted and ignored; only the named arguments are forwarded
    ):
        """Run the wrapped block, then apply the adapter to its first output.

        Returns:
            The wrapped block's output tuple with its first element replaced
            by the adapter's output.
        """
        outputs = self.original_block(
            hidden_states,
            layer_past=layer_past,
            attention_mask=attention_mask,
            head_mask=head_mask,
            encoder_hidden_states=encoder_hidden_states,
            encoder_attention_mask=encoder_attention_mask,
            use_cache=use_cache,
            output_attentions=output_attentions,
        )

        # The first element of the block's output is the hidden state.
        hidden_states = self.adapter(outputs[0])

        return (hidden_states,) + outputs[1:]

    def load_state_dict(self, state_dict, strict=True, **kwargs):
        """Load weights from either a plain GPT2Block or this wrapper.

        A state dict whose keys carry this wrapper's own prefixes
        (``original_block.`` / ``adapter.``), i.e. one produced by this
        module's ``state_dict()``, is loaded through the normal ``nn.Module``
        path. Any other state dict is treated as belonging to a plain
        ``GPT2Block`` and is loaded into ``original_block`` only, leaving the
        adapter's weights as they are.

        Args:
            state_dict: Mapping of parameter names to tensors.
            strict: Forwarded to the underlying ``load_state_dict``.
            **kwargs: Any further keyword arguments are forwarded unchanged.

        Returns:
            Whatever the underlying ``load_state_dict`` call returns.
        """
        wrapper_prefixes = ("original_block.", "adapter.")
        if any(key.startswith(wrapper_prefixes) for key in state_dict):
            # Without this branch the wrapper could not reload its own
            # state_dict(): the prefixed keys are unknown to original_block.
            return super().load_state_dict(state_dict, strict=strict, **kwargs)
        return self.original_block.load_state_dict(state_dict, strict=strict, **kwargs)
@@ -0,0 +1,69 @@
import torch
from transformers import BertTokenizer
from train import GPT2ClassifierWithAdapter
import re
def preprocess_text(text):
    """Return the text unchanged; no cleaning is applied here."""
    processed = text
    return processed
def main():
    """Run an interactive sentiment loop with the adapter-tuned GPT-2 model.

    Loads the tokenizer and base model from ``./models/gpt2-chinese``,
    restores the fine-tuned weights from ``best_weibo_sentiment_model.pth``,
    then reads lines from stdin until the user enters ``q`` (or stdin is
    closed) and prints the predicted label with its softmax probability.
    """
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(f"使用设备: {device}")

    # Local paths are used instead of the online model name.
    local_model_path = './models/gpt2-chinese'
    model_path = 'best_weibo_sentiment_model.pth'
    print(f"加载模型: {model_path}")

    tokenizer = BertTokenizer.from_pretrained(local_model_path)
    if tokenizer.pad_token is None:
        tokenizer.pad_token = '[PAD]'

    model = GPT2ClassifierWithAdapter(local_model_path)
    model.load_state_dict(torch.load(model_path, map_location=device))
    # The checkpoint is a state dict (weights only), so the pad_token_id set
    # inside GPT2ClassifierWithAdapter would otherwise stay in effect. The
    # training script overrides it with the tokenizer's pad id; do the same
    # here so the sequence classifier locates the last non-padding token the
    # same way as during training.
    model.gpt2.config.pad_token_id = tokenizer.pad_token_id
    model.to(device)
    model.eval()

    print("\n============= 微博情感分析 =============")
    print("输入微博内容进行分析 (输入 'q' 退出):")

    while True:
        try:
            text = input("\n请输入微博内容: ")
        except (EOFError, KeyboardInterrupt):
            # stdin closed or Ctrl-C: leave the loop without a traceback.
            print()
            break

        # Surrounding whitespace is ignored for the quit command.
        if text.strip().lower() == 'q':
            break

        # Skip blank input instead of classifying an empty string.
        if not text.strip():
            print("输入不能为空,请重新输入")
            continue

        processed_text = preprocess_text(text)

        # Encode with the same fixed-length padding used by the dataset class
        # in the training script (max_length=128, padding='max_length').
        encoding = tokenizer(
            processed_text,
            max_length=128,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        input_ids = encoding['input_ids'].to(device)
        attention_mask = encoding['attention_mask'].to(device)

        with torch.no_grad():
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            probabilities = torch.softmax(logits, dim=1)
            prediction = torch.argmax(probabilities, dim=1).item()

        # Class index 1 is reported as positive, anything else as negative.
        confidence = probabilities[0][prediction].item()
        label = "正面情感" if prediction == 1 else "负面情感"

        print(f"预测结果: {label} (置信度: {confidence:.4f})")


if __name__ == "__main__":
    main()
@@ -0,0 +1,310 @@
import os
import random
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import GPT2Config, GPT2ForSequenceClassification, BertTokenizer, get_linear_schedule_with_warmup
from torch.optim import AdamW
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
from tqdm import tqdm
from adapter import AdapterLayer
from gpt2_adapter import GPT2BlockWithAdapter
# 设置随机种子
def set_seed(seed):
    """Seed Python's ``random``, NumPy and PyTorch with the same value.

    When CUDA is available, all CUDA devices are seeded as well.
    """
    for seeder in (random.seed, np.random.seed, torch.manual_seed):
        seeder(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)


# Seed everything as soon as the module is imported.
set_seed(42)
# 定义微博情感分析数据集
class WeiboSentimentDataset(Dataset):
    """Dataset that tokenizes one review per item and pairs it with its label.

    Each item is a dict with flattened ``input_ids`` and ``attention_mask``
    tensors plus a ``labels`` tensor of dtype ``torch.long``.
    """

    def __init__(self, reviews, labels, tokenizer, max_length=128):
        self.reviews, self.labels = reviews, labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.reviews)

    def __getitem__(self, idx):
        # Reviews are coerced to str before tokenization.
        text = str(self.reviews[idx])
        target = self.labels[idx]

        encoded = self.tokenizer(
            text,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt',
        )

        # flatten() turns each returned tensor into a 1-D tensor per sample.
        sample = {
            key: encoded[key].flatten()
            for key in ('input_ids', 'attention_mask')
        }
        sample['labels'] = torch.tensor(target, dtype=torch.long)
        return sample
# 定义GPT2分类模型,带Adapter
class GPT2ClassifierWithAdapter(nn.Module):
    """GPT-2 sequence classifier whose transformer blocks carry adapters.

    Every block in ``gpt2.transformer.h`` is replaced by a
    ``GPT2BlockWithAdapter`` initialised from the original block's weights.
    All parameters are frozen except the classification head (``score``) and
    the adapters.
    """

    def __init__(self, pretrained_model_name, num_labels=2):
        super().__init__()

        # Load the pretrained classifier.
        self.gpt2 = GPT2ForSequenceClassification.from_pretrained(
            pretrained_model_name,
            num_labels=num_labels,
        )

        # Make sure the config has a pad_token_id.
        self.gpt2.config.pad_token_id = self.gpt2.config.eos_token_id

        # Swap each GPT2Block for its adapter-equipped counterpart.
        config = self.gpt2.config
        blocks = self.gpt2.transformer.h
        for index, pretrained_block in enumerate(list(blocks)):
            wrapped = GPT2BlockWithAdapter(config)
            # Copy the pretrained block's weights into the wrapper.
            wrapped.load_state_dict(pretrained_block.state_dict(), strict=False)
            blocks[index] = wrapped

        # Freeze everything, then re-enable the head and the adapters.
        self.gpt2.requires_grad_(False)
        self.gpt2.score.requires_grad_(True)
        for block in self.gpt2.transformer.h:
            block.adapter.requires_grad_(True)

    def forward(self, input_ids, attention_mask, labels=None):
        """Delegate to the wrapped GPT-2 classifier and return its output."""
        return self.gpt2(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels,
        )
# 训练函数
def train_model(model, train_dataloader, val_dataloader, optimizer, scheduler, device, epochs=3):
    """Train for ``epochs`` epochs, validating after each one.

    After every epoch the model is evaluated on ``val_dataloader``; whenever
    the validation F1 exceeds the best seen so far, the model's state dict is
    written to ``best_weibo_sentiment_model.pth``.
    """
    top_f1 = 0.0

    for epoch in range(epochs):
        print(f"======== Epoch {epoch+1} / {epochs} ========")

        model.train()
        running_loss = 0

        batches = tqdm(train_dataloader, desc="Training", position=0, leave=True)
        for raw_batch in batches:
            # Move every tensor in the batch to the target device.
            batch = {name: tensor.to(device) for name, tensor in raw_batch.items()}

            optimizer.zero_grad()

            # Forward pass; the model returns the loss when labels are given.
            result = model(
                input_ids=batch['input_ids'],
                attention_mask=batch['attention_mask'],
                labels=batch['labels'],
            )
            loss = result.loss
            running_loss += loss.item()

            loss.backward()

            # Clip the gradient norm at 1.0 before the optimizer step.
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

            optimizer.step()
            scheduler.step()

            batches.set_postfix({"loss": loss.item()})

        mean_loss = running_loss / len(train_dataloader)
        print(f"Average training loss: {mean_loss:.4f}")

        metrics = evaluate_model(model, val_dataloader, device)
        print(f"Validation Loss: {metrics['loss']:.4f}")
        print(f"Validation Accuracy: {metrics['accuracy']:.4f}")
        print(f"Validation F1 Score: {metrics['f1']:.4f}")

        # Nothing to save unless this epoch beat the best F1 so far.
        if not metrics['f1'] > top_f1:
            continue
        top_f1 = metrics['f1']
        torch.save(model.state_dict(), "best_weibo_sentiment_model.pth")
        print("Saved best model!")
# 评估函数
def evaluate_model(model, dataloader, device):
    """Evaluate ``model`` on ``dataloader`` without gradient tracking.

    Returns:
        A dict with the mean batch ``loss``, the ``accuracy`` and the
        macro-averaged ``f1`` over all samples.
    """
    model.eval()

    running_loss = 0
    predicted = []
    expected = []

    with torch.no_grad():
        for raw_batch in tqdm(dataloader, desc="Evaluating"):
            batch = {name: tensor.to(device) for name, tensor in raw_batch.items()}

            result = model(
                input_ids=batch['input_ids'],
                attention_mask=batch['attention_mask'],
                labels=batch['labels'],
            )
            running_loss += result.loss.item()

            # Predicted class = index of the largest logit.
            predicted.extend(torch.argmax(result.logits, dim=1).cpu().numpy())
            expected.extend(batch['labels'].cpu().numpy())

    metrics = {}
    # Metrics are computed in the same order as they are reported.
    accuracy = accuracy_score(expected, predicted)
    f1 = f1_score(expected, predicted, average='macro')
    metrics['loss'] = running_loss / len(dataloader)
    metrics['accuracy'] = accuracy
    metrics['f1'] = f1
    return metrics
def main():
    """Fine-tune the adapter-equipped GPT-2 classifier on weibo_senti_100k.

    Reads ``dataset/weibo_senti_100k.csv``, splits off 10% for validation,
    makes sure the tokenizer and base model exist under
    ``./models/gpt2-chinese`` (downloading and saving them if needed), then
    trains with AdamW and a linear warmup schedule via ``train_model``.
    """
    model_name = 'uer/gpt2-chinese-cluecorpussmall'
    local_model_path = './models/gpt2-chinese'

    # One definition for the values shared by the data loaders, the scheduler
    # length and the training loop, so they cannot drift apart.
    num_epochs = 2
    batch_size = 16

    os.makedirs(local_model_path, exist_ok=True)

    print("加载微博情感数据集...")
    df = pd.read_csv('dataset/weibo_senti_100k.csv')

    # Stratified 90/10 train/validation split.
    train_df, val_df = train_test_split(df, test_size=0.1, random_state=42, stratify=df['label'])

    print("加载预训练模型和tokenizer...")

    # Reuse the local tokenizer when a config.json is already present.
    if os.path.exists(os.path.join(local_model_path, 'config.json')):
        print(f"从本地路径加载模型: {local_model_path}")
        tokenizer = BertTokenizer.from_pretrained(local_model_path)
    else:
        print(f"从Hugging Face下载模型到: {local_model_path}")
        tokenizer = BertTokenizer.from_pretrained(model_name, cache_dir=local_model_path)
        tokenizer.save_pretrained(local_model_path)

    # Fall back to '[PAD]' if the tokenizer has no padding token.
    if tokenizer.pad_token is None:
        tokenizer.pad_token = '[PAD]'

    # Remembered so the model can be given the same pad_token_id below.
    pad_token_id = tokenizer.pad_token_id

    train_dataset = WeiboSentimentDataset(
        train_df['review'].values,
        train_df['label'].values,
        tokenizer
    )

    val_dataset = WeiboSentimentDataset(
        val_df['review'].values,
        val_df['label'].values,
        tokenizer
    )

    train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_dataloader = DataLoader(val_dataset, batch_size=batch_size)

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(f"使用设备: {device}")

    # Build the model from local weights, downloading them first if absent.
    if (os.path.exists(os.path.join(local_model_path, 'pytorch_model.bin')) or
            os.path.exists(os.path.join(local_model_path, 'model.safetensors'))):
        print(f"从本地路径加载模型权重: {local_model_path}")
        model = GPT2ClassifierWithAdapter(local_model_path)
    else:
        print(f"从Hugging Face下载模型权重到: {local_model_path}")
        # Download the full model once and save it locally ...
        temp_model = GPT2ForSequenceClassification.from_pretrained(model_name)
        temp_model.save_pretrained(local_model_path)
        # ... then build the adapter model from the saved copy.
        model = GPT2ClassifierWithAdapter(local_model_path)

    # Keep the model's pad_token_id in sync with the tokenizer.
    model.gpt2.config.pad_token_id = pad_token_id

    model.to(device)

    # Report how many parameters will actually be trained.
    total_params = sum(p.numel() for p in model.parameters())
    trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print(f"模型总参数量: {total_params}")
    print(f"需要训练的参数量: {trainable_params} ({trainable_params/total_params*100:.2f}%)")

    # Only parameters that require gradients are handed to the optimizer.
    optimizer = AdamW(
        [p for p in model.parameters() if p.requires_grad],
        lr=5e-5,
        eps=1e-8
    )

    # The scheduler is stepped once per batch, so its length is
    # batches-per-epoch times the number of epochs; 10% of it is warmup.
    total_steps = len(train_dataloader) * num_epochs
    warmup_steps = int(total_steps * 0.1)

    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=warmup_steps,
        num_training_steps=total_steps
    )

    print("开始训练...")
    train_model(
        model=model,
        train_dataloader=train_dataloader,
        val_dataloader=val_dataloader,
        optimizer=optimizer,
        scheduler=scheduler,
        device=device,
        epochs=num_epochs
    )

    print("训练完成!")


if __name__ == "__main__":
    main()
@@ -0,0 +1,142 @@
# 微博情感识别模型-GPT2-LoRA微调
## 项目说明
这是一个基于GPT2的微博情感二分类模型,采用LoRA(Low-Rank Adaptation)微调技术。通过PEFT库实现的LoRA微调,只需训练极少量参数就可以让模型适应情感分析任务,大幅降低计算资源需求和模型体积。
## 数据集
使用微博情感数据集(weibo_senti_100k),包含约10万条带情感标注的微博内容,正负向评论各约5万条。数据集标签:
- 标签0:负面情感
- 标签1:正面情感
## 文件结构
```
GPT2-Lora/
├── train.py # 训练脚本(基于PEFT库的LoRA实现)
├── predict.py # 预测脚本(交互式使用)
├── requirements.txt # 依赖包列表
├── models/ # 本地存储的预训练模型
│ └── gpt2-chinese/ # 中文GPT2模型及配置
├── dataset/ # 数据集目录
│ └── weibo_senti_100k.csv # 微博情感数据集
└── best_weibo_sentiment_lora/ # 训练好的LoRA权重(训练后生成)
```
## 技术特点
1. **极度参数高效**:相比全参数微调,仅训练约0.1%-1%的参数
2. **使用PEFT库**:基于Hugging Face官方的参数高效微调库,稳定可靠
3. **模型性能保持**:在仅训练极少参数的情况下,保持良好的分类性能
4. **部署友好**:LoRA权重文件小,便于模型部署和分享
## LoRA技术优势
LoRA (Low-Rank Adaptation) 是目前最流行的参数高效微调技术:
1. **超低参数量**:通过低秩分解,将大矩阵分解为两个小矩阵的乘积
2. **插件式设计**:LoRA权重可以动态加载和卸载,一个基础模型支持多个任务
3. **训练速度快**:参数少,训练时间短,内存占用小
4. **无损原模型**:原始预训练模型权重保持不变,避免灾难性遗忘
## 环境依赖
安装所需依赖:
```bash
pip install -r requirements.txt
```
主要依赖包:
- Python 3.8+
- PyTorch 1.13+
- Transformers 4.28+
- PEFT 0.4+
- Pandas, NumPy, Scikit-learn
## 使用方法
### 1. 安装依赖
```bash
pip install -r requirements.txt
```
### 2. 训练模型
```bash
python train.py
```
训练过程会自动:
- 下载并本地保存中文GPT2预训练模型
- 加载微博情感数据集
- 使用LoRA技术训练模型
- 保存最佳LoRA权重到 `./best_weibo_sentiment_lora/`
### 3. 情感分析预测
```bash
python predict.py
```
运行后将进入交互模式:
- 在控制台输入要分析的微博文本
- 系统会返回情感分析结果(正面/负面)和置信度
- 输入'q'退出程序
## 模型配置
- **基础模型**: `uer/gpt2-chinese-cluecorpussmall` 中文预训练模型
- **模型本地保存路径**: `./models/gpt2-chinese/`
- **LoRA配置**:
- rank (r): 8 - 低秩矩阵的秩
- alpha: 32 - 缩放因子
- target_modules: ["c_attn", "c_proj"] - 目标线性层
- dropout: 0.1 - 防止过拟合
## 性能对比
| 方法 | 可训练参数占比 | 模型文件大小 | 训练时间 | 推理速度 |
|------|----------------|--------------|----------|----------|
| 全参数微调 | 100% | ~500MB | 长 | 慢 |
| Adapter微调 | ~3% | ~50MB | 中等 | 中等 |
| **LoRA微调** | **~0.5%** | **~2MB** | **短** | **快** |
## 使用示例
```
使用设备: cuda
LoRA模型加载成功!
============= 微博情感分析 (LoRA版) =============
输入微博内容进行分析 (输入 'q' 退出):
请输入微博内容: 这部电影真是太好看了,我非常喜欢!
预测结果: 正面情感 (置信度: 0.9876)
请输入微博内容: 服务态度差,价格还贵,一点都不推荐
预测结果: 负面情感 (置信度: 0.9742)
请输入微博内容: q
```
## 注意事项
1. **首次运行**:首次运行 `train.py` 时会自动下载预训练模型,请确保网络连接
2. **GPU推荐**:虽然LoRA参数少,但建议使用GPU加速训练
3. **模型加载**:预测时需要先有训练好的LoRA权重文件
4. **兼容性**:基于PEFT库实现,与Hugging Face生态系统完全兼容
## 扩展功能
- **多任务支持**:可以为不同任务训练不同的LoRA权重,共享同一个基础模型
- **权重合并**:可以将多个LoRA权重合并,或将LoRA权重合并到基础模型中
- **动态切换**:支持运行时动态加载和切换不同的LoRA权重
## 技术原理
LoRA通过在原始线性层旁边添加两个小的矩阵A和B,使得:
```
h = W₀x + BAx
```
其中:
- W₀是冻结的预训练权重
- B ∈ ℝᵈˣʳ, A ∈ ℝʳˣᵏ是可训练的低秩矩阵
- r << min(d,k),大大减少了参数量
这种设计既保持了预训练模型的知识,又能高效地适应新任务。
@@ -0,0 +1,280 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# weibo_senti_100k 说明\n",
"0. **下载地址:** [百度网盘](https://pan.baidu.com/s/1DoQbki3YwqkuwQUOj64R_g)\n",
"1. **数据概览:** 约 12 万条 (119988 条) 带情感标注的新浪微博,正负向评论约各 6 万条\n",
"2. **推荐实验:** 情感/观点/评论 倾向性分析\n",
"2. **数据来源:** [新浪微博](https://weibo.com/)\n",
"3. **原数据集:** [新浪微博,情感分析标记语料共12万条](https://download.csdn.net/download/weixin_38442818/10214750),网上搜集,具体作者、来源不详\n",
"4. **加工处理:**\n",
" 1. 将原来的 2 份文档,整合成 1 份 csv 文件\n",
" 2. 编码统一为 UTF-8\n",
" 3. 去重"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"path = 'weibo_senti_100k_文件夹_所在_路径'"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# 1. weibo_senti_100k.csv"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 加载数据"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"评论数目(总体):119988\n",
"评论数目(正向):59993\n",
"评论数目(负向):59995\n"
]
}
],
"source": [
"pd_all = pd.read_csv(path + 'weibo_senti_100k.csv')\n",
"\n",
"print('评论数目(总体):%d' % pd_all.shape[0])\n",
"print('评论数目(正向):%d' % pd_all[pd_all.label==1].shape[0])\n",
"print('评论数目(负向):%d' % pd_all[pd_all.label==0].shape[0])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 字段说明\n",
"\n",
"| 字段 | 说明 |\n",
"| ---- | ---- |\n",
"| label | 1 表示正向评论,0 表示负向评论 |\n",
"| review | 微博内容 |"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>label</th>\n",
" <th>review</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>62050</th>\n",
" <td>0</td>\n",
" <td>太过分了@Rexzhenghao //@Janie_Zhang:招行最近负面新闻越来越多呀...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>68263</th>\n",
" <td>0</td>\n",
"      <td>希望你?得好?我本\"?肥血?史\"[晕][哈哈]@Pete三姑父</td>\n",
" </tr>\n",
" <tr>\n",
" <th>81472</th>\n",
" <td>0</td>\n",
" <td>有点想参加????[偷?]想安排下时间再决定[抓狂]//@黑晶晶crystal: @细腿大羽...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>42021</th>\n",
" <td>1</td>\n",
" <td>[给力]感谢所有支持雯婕的芝麻![爱你]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7777</th>\n",
" <td>1</td>\n",
" <td>2013最后一天,在新加坡开心度过,向所有的朋友们问声:新年快乐!2014年,我们会更好[调...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>100399</th>\n",
" <td>0</td>\n",
" <td>大中午出门办事找错路,曝晒中。要多杯具有多杯具。[泪][泪][汗]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>82398</th>\n",
" <td>0</td>\n",
" <td>马航还会否认吗?到底在隐瞒啥呢?[抓狂]//@头条新闻: 转发微博</td>\n",
" </tr>\n",
" <tr>\n",
" <th>106423</th>\n",
" <td>0</td>\n",
" <td>克罗地亚球迷很爱放烟火!球又没进,就硝烟四起。[晕]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>24798</th>\n",
" <td>1</td>\n",
" <td>[抱抱]福芦 TangRoulou 吉祥书 8.8折优惠 &gt;&gt;&gt; http://t.cn/z...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6598</th>\n",
" <td>1</td>\n",
" <td>回复@钱旭明QXM:[嘻嘻][嘻嘻] //@钱旭明QXM:杨大哥[good][good][g...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>53920</th>\n",
" <td>1</td>\n",
" <td>人家这脸长的!!!!!![哈哈]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>15587</th>\n",
" <td>1</td>\n",
" <td>这个价不算高,和一天内训相比相差无几。。[哈哈]//@博通传媒v: 6个月!一个月工资1万,...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>101237</th>\n",
" <td>0</td>\n",
" <td>终于收工啦,脚丫子快冻掉了[泪][泪][泪]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>82449</th>\n",
" <td>0</td>\n",
" <td>我决定从今天开始我想吃什么就去吃什么,一个人吃也无所谓,重点是不要因为别人的意见委屈了自己[...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>32537</th>\n",
" <td>1</td>\n",
" <td>飘雪的北京 需要双份早餐.......//@美食天下: [哈哈]//@王淼Margay: 屁...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10630</th>\n",
" <td>1</td>\n",
" <td>[耶],这个太赞了,生活大爆炸第六季马上要出啦[鼓掌] //@-郑瑜-:这个不错 //@经典...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>85130</th>\n",
" <td>0</td>\n",
" <td>刚追完#倾世皇妃#,#千山暮雪#又紧随其后,网速和更新速度都太不给力,尽管我看过原著,还是焦...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>105956</th>\n",
" <td>0</td>\n",
" <td>晚上看金二胖?察前?,推出的火炮基座?糟了,可以PK了[泪] //@艾米粒er: //@wi...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>72391</th>\n",
" <td>0</td>\n",
" <td>必须把中国足球的伟大,用我的职业演说出来 //@袁腾飞:[泪]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10761</th>\n",
" <td>1</td>\n",
" <td>[鼓掌] //@宁波香格里拉大酒店: 小编来答疑,周五晚惊艳全场的树根蛋糕到底有多长?蛋糕全...</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" label review\n",
"62050 0 太过分了@Rexzhenghao //@Janie_Zhang:招行最近负面新闻越来越多呀...\n",
"68263       0                   希望你?得好?我本\"?肥血?史\"[晕][哈哈]@Pete三姑父\n",
"81472 0 有点想参加????[偷?]想安排下时间再决定[抓狂]//@黑晶晶crystal: @细腿大羽...\n",
"42021 1 [给力]感谢所有支持雯婕的芝麻![爱你]\n",
"7777 1 2013最后一天,在新加坡开心度过,向所有的朋友们问声:新年快乐!2014年,我们会更好[调...\n",
"100399 0 大中午出门办事找错路,曝晒中。要多杯具有多杯具。[泪][泪][汗]\n",
"82398 0 马航还会否认吗?到底在隐瞒啥呢?[抓狂]//@头条新闻: 转发微博\n",
"106423 0 克罗地亚球迷很爱放烟火!球又没进,就硝烟四起。[晕]\n",
"24798 1 [抱抱]福芦 TangRoulou 吉祥书 8.8折优惠 >>> http://t.cn/z...\n",
"6598 1 回复@钱旭明QXM:[嘻嘻][嘻嘻] //@钱旭明QXM:杨大哥[good][good][g...\n",
"53920 1 人家这脸长的!!!!!![哈哈]\n",
"15587 1 这个价不算高,和一天内训相比相差无几。。[哈哈]//@博通传媒v: 6个月!一个月工资1万,...\n",
"101237 0 终于收工啦,脚丫子快冻掉了[泪][泪][泪]\n",
"82449 0 我决定从今天开始我想吃什么就去吃什么,一个人吃也无所谓,重点是不要因为别人的意见委屈了自己[...\n",
"32537 1 飘雪的北京 需要双份早餐.......//@美食天下: [哈哈]//@王淼Margay: 屁...\n",
"10630 1 [耶],这个太赞了,生活大爆炸第六季马上要出啦[鼓掌] //@-郑瑜-:这个不错 //@经典...\n",
"85130 0 刚追完#倾世皇妃#,#千山暮雪#又紧随其后,网速和更新速度都太不给力,尽管我看过原著,还是焦...\n",
"105956 0 晚上看金二胖?察前?,推出的火炮基座?糟了,可以PK了[泪] //@艾米粒er: //@wi...\n",
"72391 0 必须把中国足球的伟大,用我的职业演说出来 //@袁腾飞:[泪]\n",
"10761 1 [鼓掌] //@宁波香格里拉大酒店: 小编来答疑,周五晚惊艳全场的树根蛋糕到底有多长?蛋糕全..."
]
},
"execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"pd_all.sample(20)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.4"
},
"widgets": {
"state": {},
"version": "1.1.2"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
@@ -0,0 +1,107 @@
import torch
from transformers import GPT2ForSequenceClassification, BertTokenizer
from peft import PeftModel
import os
import re
def preprocess_text(text):
    """Return ``text`` exactly as received.

    This is a pass-through hook: no cleaning or normalisation is applied.
    """
    untouched = text
    return untouched
def main():
    """Run an interactive Weibo sentiment classifier backed by a LoRA adapter.

    Loads the tokenizer and the GPT-2 sequence-classification base model from
    ``./models/gpt2-chinese``, attaches the LoRA adapter stored in
    ``./best_weibo_sentiment_lora`` and then reads posts from stdin, printing
    the predicted label (class 1 is reported as positive, anything else as
    negative) together with its softmax probability.

    The session ends when the user enters ``q`` (case-insensitive, surrounding
    whitespace ignored), when stdin reaches EOF, or on Ctrl-C.  Loading
    problems are printed and make the function return early; a failure while
    classifying one post is printed and the loop continues.

    Returns:
        None.
    """
    # Select the device
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(f"使用设备: {device}")

    # Base model and adapter locations
    base_model_path = './models/gpt2-chinese'
    lora_model_path = './best_weibo_sentiment_lora'

    print("加载模型和tokenizer...")

    # train.py creates the adapter directory before training starts, so the
    # directory alone does not prove that an adapter was ever saved.  Look for
    # the adapter config file that PEFT writes alongside the weights instead.
    adapter_config = os.path.join(lora_model_path, 'adapter_config.json')
    if not os.path.isfile(adapter_config):
        print(f"错误: 找不到LoRA模型路径 {lora_model_path}")
        print("请先运行 train.py 进行训练")
        return

    # Load the tokenizer
    try:
        tokenizer = BertTokenizer.from_pretrained(base_model_path)
        if tokenizer.pad_token is None:
            tokenizer.pad_token = '[PAD]'
    except Exception as e:
        print(f"加载tokenizer失败: {e}")
        print("请确保models/gpt2-chinese目录包含tokenizer文件")
        return

    # Load the base model
    try:
        base_model = GPT2ForSequenceClassification.from_pretrained(
            base_model_path,
            num_labels=2
        )
        # Keep the model's padding id in sync with the tokenizer's.
        base_model.config.pad_token_id = tokenizer.pad_token_id
    except Exception as e:
        print(f"加载基础模型失败: {e}")
        print("请确保models/gpt2-chinese目录包含模型文件")
        return

    # Attach the LoRA weights
    try:
        model = PeftModel.from_pretrained(base_model, lora_model_path)
        model.to(device)
        model.eval()
        print("LoRA模型加载成功!")
    except Exception as e:
        print(f"加载LoRA权重失败: {e}")
        print("请确保LoRA权重文件存在且格式正确")
        return

    print("\n============= 微博情感分析 (LoRA版) =============")
    print("输入微博内容进行分析 (输入 'q' 退出):")

    while True:
        try:
            text = input("\n请输入微博内容: ")
        except (EOFError, KeyboardInterrupt):
            # stdin was closed or the user pressed Ctrl-C: finish the session
            # the same way an explicit 'q' does instead of dumping a traceback.
            print()
            break

        stripped = text.strip()
        if stripped.lower() == 'q':
            break

        if not stripped:
            print("输入不能为空,请重新输入")
            continue

        try:
            # Pre-process the text
            processed_text = preprocess_text(text)

            # Encode the text to a fixed length of 128 tokens
            encoding = tokenizer(
                processed_text,
                max_length=128,
                padding='max_length',
                truncation=True,
                return_tensors='pt'
            )

            # Move the tensors to the selected device
            input_ids = encoding['input_ids'].to(device)
            attention_mask = encoding['attention_mask'].to(device)

            # Predict without tracking gradients
            with torch.no_grad():
                outputs = model(input_ids=input_ids, attention_mask=attention_mask)
                logits = outputs.logits
                probabilities = torch.softmax(logits, dim=1)
                prediction = torch.argmax(probabilities, dim=1).item()

            # Report the result
            confidence = probabilities[0][prediction].item()
            label = "正面情感" if prediction == 1 else "负面情感"

            print(f"预测结果: {label} (置信度: {confidence:.4f})")

        except Exception as e:
            # Keep the interactive session alive if a single prediction fails.
            print(f"预测时发生错误: {e}")


if __name__ == "__main__":
    main()
@@ -0,0 +1,10 @@
torch>=1.13.0
transformers>=4.28.0
peft>=0.4.0
pandas>=1.5.0
numpy>=1.21.0
scikit-learn>=1.0.0
tqdm>=4.64.0
datasets>=2.0.0
accelerate>=0.20.0
safetensors>=0.3.0
@@ -0,0 +1,283 @@
import os
import random
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import (
GPT2ForSequenceClassification,
BertTokenizer,
get_linear_schedule_with_warmup,
TrainingArguments,
Trainer
)
from torch.optim import AdamW
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
from tqdm import tqdm
# 导入PEFT库中的LoRA相关组件
from peft import LoraConfig, TaskType, get_peft_model
# Seed every random number generator used by the script
def set_seed(seed):
    """Seed Python's ``random``, NumPy and PyTorch with the same value.

    Args:
        seed: Integer seed handed to each generator.  When CUDA is available
            the CUDA generators on all devices are seeded as well.
    """
    for seeder in (random.seed, np.random.seed, torch.manual_seed):
        seeder(seed)

    if not torch.cuda.is_available():
        return
    torch.cuda.manual_seed_all(seed)
# Runs when the module is loaded: fix the seed of every RNG to 42.
set_seed(42)
# Dataset wrapping the Weibo sentiment reviews
class WeiboSentimentDataset(Dataset):
    """Map-style dataset that tokenizes one review per item.

    Args:
        reviews: Indexable collection of review texts; each entry is passed
            through ``str()`` before tokenization.
        labels: Indexable collection of integer class labels, aligned with
            ``reviews``.
        tokenizer: Callable invoked as ``tokenizer(text, max_length=...,
            padding='max_length', truncation=True, return_tensors='pt')`` and
            returning a mapping with ``input_ids`` and ``attention_mask``.
        max_length: Length every encoded review is padded/truncated to.
    """

    def __init__(self, reviews, labels, tokenizer, max_length=128):
        self.reviews, self.labels = reviews, labels
        self.tokenizer, self.max_length = tokenizer, max_length

    def __len__(self):
        """Return the number of reviews."""
        return len(self.reviews)

    def __getitem__(self, idx):
        """Return the encoded review at ``idx`` with its label.

        The result is a dict with flattened ``input_ids`` and
        ``attention_mask`` tensors plus a ``labels`` tensor of dtype long.
        """
        text = str(self.reviews[idx])
        target = self.labels[idx]

        encoded = self.tokenizer(
            text,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt',
        )

        # The tokenizer output carries a leading batch axis; flatten it away.
        sample = {key: encoded[key].flatten() for key in ('input_ids', 'attention_mask')}
        sample['labels'] = torch.tensor(target, dtype=torch.long)
        return sample
# Training loop
def train_model(model, train_dataloader, val_dataloader, optimizer, scheduler, device, epochs=3):
    """Train ``model`` and save it whenever the validation F1 improves.

    Args:
        model: Model called with ``input_ids``, ``attention_mask`` and
            ``labels``; its output must expose ``loss``.
        train_dataloader: Yields dict batches of tensors for training.
        val_dataloader: Batches handed to ``evaluate_model`` after each epoch.
        optimizer: Optimizer stepped once per batch.
        scheduler: Learning-rate scheduler stepped once per batch.
        device: Device every batch tensor is moved to.
        epochs: Number of passes over ``train_dataloader``.

    Each time the validation F1 exceeds the best value seen so far, the model
    is written to ``./best_weibo_sentiment_lora`` via ``save_pretrained``.
    """
    best_f1 = 0.0

    for epoch_number in range(1, epochs + 1):
        print(f"======== Epoch {epoch_number} / {epochs} ========")
        model.train()
        loss_sum = 0

        # One optimisation step per batch
        batches = tqdm(train_dataloader, desc="Training", position=0, leave=True)
        for raw_batch in batches:
            # Move the batch to the target device
            on_device = {name: tensor.to(device) for name, tensor in raw_batch.items()}

            # Reset gradients accumulated by the previous step
            optimizer.zero_grad()

            # Forward pass
            result = model(
                input_ids=on_device['input_ids'],
                attention_mask=on_device['attention_mask'],
                labels=on_device['labels'],
            )
            step_loss = result.loss
            loss_sum += step_loss.item()

            # Backward pass
            step_loss.backward()

            # Clip the gradient norm to 1.0 to guard against exploding gradients
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

            # Update the weights, then advance the learning-rate schedule
            optimizer.step()
            scheduler.step()

            # Show the current loss on the progress bar
            batches.set_postfix({"loss": step_loss.item()})

        # Mean training loss over the epoch
        epoch_loss = loss_sum / len(train_dataloader)
        print(f"Average training loss: {epoch_loss:.4f}")

        # Validate
        metrics = evaluate_model(model, val_dataloader, device)
        print(f"Validation Loss: {metrics['loss']:.4f}")
        print(f"Validation Accuracy: {metrics['accuracy']:.4f}")
        print(f"Validation F1 Score: {metrics['f1']:.4f}")

        # Keep only checkpoints that beat the best F1 so far
        improved = metrics['f1'] > best_f1
        if not improved:
            continue
        best_f1 = metrics['f1']
        # Save the LoRA weights
        model.save_pretrained("./best_weibo_sentiment_lora")
        print("Saved best LoRA model!")
# Evaluation routine
def evaluate_model(model, dataloader, device):
    """Evaluate ``model`` on ``dataloader`` without tracking gradients.

    Args:
        model: Model called with ``input_ids``, ``attention_mask`` and
            ``labels``; its output must expose ``loss`` and ``logits``.
        dataloader: Yields dict batches of tensors.
        device: Device every batch tensor is moved to.

    Returns:
        dict with ``loss`` (mean batch loss), ``accuracy`` and ``f1``
        (macro-averaged), computed over all batches.
    """
    model.eval()
    loss_sum = 0
    predicted, expected = [], []

    with torch.no_grad():
        for raw_batch in tqdm(dataloader, desc="Evaluating"):
            on_device = {name: tensor.to(device) for name, tensor in raw_batch.items()}

            result = model(
                input_ids=on_device['input_ids'],
                attention_mask=on_device['attention_mask'],
                labels=on_device['labels'],
            )
            loss_sum += result.loss.item()

            # Predicted class = index of the largest logit
            predicted.extend(torch.argmax(result.logits, dim=1).cpu().numpy())
            expected.extend(on_device['labels'].cpu().numpy())

    # Aggregate the metrics
    accuracy = accuracy_score(expected, predicted)
    macro_f1 = f1_score(expected, predicted, average='macro')
    mean_loss = loss_sum / len(dataloader)

    return {'loss': mean_loss, 'accuracy': accuracy, 'f1': macro_f1}
def main():
    """Fine-tune the Chinese GPT-2 classifier on Weibo sentiment data with LoRA.

    Steps performed:
      1. Read ``dataset/weibo_senti_100k.csv`` and make a stratified 90/10
         train/validation split on the ``label`` column.
      2. Load the tokenizer and the two-label GPT-2 classification model from
         ``./models/gpt2-chinese`` when already present there, otherwise fetch
         ``uer/gpt2-chinese-cluecorpussmall`` and save a local copy.
      3. Wrap the model with a LoRA adapter (r=8, alpha=32, dropout=0.1) on
         the ``c_attn`` and ``c_proj`` modules.
      4. Train with AdamW and a linear warmup/decay schedule; ``train_model``
         writes the best adapter to ``./best_weibo_sentiment_lora``.

    Returns:
        None.
    """
    # Hub id of the pretrained model and where its local copy lives
    model_name = 'uer/gpt2-chinese-cluecorpussmall'
    local_model_path = './models/gpt2-chinese'

    # Single source of truth for the epoch count: the scheduler's total step
    # count and the training loop must agree, otherwise the learning rate
    # would reach zero too early or never finish decaying.
    epochs = 3

    # Make sure the output directories exist
    os.makedirs(local_model_path, exist_ok=True)
    os.makedirs('./best_weibo_sentiment_lora', exist_ok=True)

    # Load the dataset
    print("加载微博情感数据集...")
    df = pd.read_csv('dataset/weibo_senti_100k.csv')

    # Stratified split so both parts keep the same label balance
    train_df, val_df = train_test_split(df, test_size=0.1, random_state=42, stratify=df['label'])

    # Load the tokenizer
    print("加载预训练模型和tokenizer...")

    # config.json is the marker used to decide whether a local copy exists
    if os.path.exists(os.path.join(local_model_path, 'config.json')):
        print(f"从本地路径加载tokenizer: {local_model_path}")
        tokenizer = BertTokenizer.from_pretrained(local_model_path)
    else:
        print(f"从Hugging Face下载tokenizer到: {local_model_path}")
        tokenizer = BertTokenizer.from_pretrained(model_name, cache_dir=local_model_path)
        # Save the tokenizer locally for later runs
        tokenizer.save_pretrained(local_model_path)

    # Make sure a padding token is defined
    if tokenizer.pad_token is None:
        tokenizer.pad_token = '[PAD]'

    # Remember the padding id so the model can be configured with it
    pad_token_id = tokenizer.pad_token_id

    # Build the datasets
    train_dataset = WeiboSentimentDataset(
        train_df['review'].values,
        train_df['label'].values,
        tokenizer
    )

    val_dataset = WeiboSentimentDataset(
        val_df['review'].values,
        val_df['label'].values,
        tokenizer
    )

    # Build the data loaders
    train_dataloader = DataLoader(train_dataset, batch_size=16, shuffle=True)
    val_dataloader = DataLoader(val_dataset, batch_size=16)

    # Select the device
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(f"使用设备: {device}")

    # Load the pretrained GPT-2 model
    print("加载GPT2模型...")
    if (os.path.exists(os.path.join(local_model_path, 'pytorch_model.bin')) or
            os.path.exists(os.path.join(local_model_path, 'model.safetensors'))):
        print(f"从本地路径加载模型权重: {local_model_path}")
        model = GPT2ForSequenceClassification.from_pretrained(local_model_path, num_labels=2)
    else:
        print(f"从Hugging Face下载模型权重到: {local_model_path}")
        # Download from the Hub and save the complete model locally
        model = GPT2ForSequenceClassification.from_pretrained(model_name, num_labels=2)
        model.save_pretrained(local_model_path)

    # The model must use the same padding id as the tokenizer
    model.config.pad_token_id = pad_token_id

    # Configure LoRA
    print("配置LoRA参数...")
    lora_config = LoraConfig(
        task_type=TaskType.SEQ_CLS,  # sequence classification
        target_modules=["c_attn", "c_proj"],  # GPT-2 modules that receive LoRA adapters
        inference_mode=False,  # training mode
        r=8,  # LoRA rank
        lora_alpha=32,  # LoRA scaling factor
        lora_dropout=0.1,  # dropout applied inside the LoRA layers
    )

    # Wrap the model as a PEFT LoRA model
    print("创建LoRA模型...")
    model = get_peft_model(model, lora_config)
    model.print_trainable_parameters()  # report trainable parameter count and share

    model.to(device)

    # Optimizer and learning-rate schedule
    print("设置优化器...")
    optimizer = AdamW(
        model.parameters(),  # per the original note, PEFT handles parameter filtering
        lr=5e-4,  # LoRA is usually trained with a comparatively high learning rate
        eps=1e-8
    )

    # Total optimisation steps and the 10% warmup share
    total_steps = len(train_dataloader) * epochs
    warmup_steps = int(total_steps * 0.1)

    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=warmup_steps,
        num_training_steps=total_steps
    )

    # Train
    print("开始训练...")
    train_model(
        model=model,
        train_dataloader=train_dataloader,
        val_dataloader=val_dataloader,
        optimizer=optimizer,
        scheduler=scheduler,
        device=device,
        epochs=epochs
    )

    print("训练完成!")
    print("LoRA权重已保存到: ./best_weibo_sentiment_lora/")


if __name__ == "__main__":
    main()