Local sentiment analysis upload.
This commit is contained in:
@@ -0,0 +1,413 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
BERT情感分析模型训练脚本
|
||||
"""
|
||||
import argparse
|
||||
import os
|
||||
import pandas as pd
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
from torch.utils.data import Dataset, DataLoader
|
||||
from transformers import BertTokenizer, BertModel
|
||||
from sklearn.metrics import accuracy_score, f1_score, classification_report, roc_auc_score
|
||||
from typing import List, Tuple
|
||||
import warnings
|
||||
import requests
|
||||
from pathlib import Path
|
||||
|
||||
from base_model import BaseModel
|
||||
from utils import load_corpus_bert
|
||||
|
||||
# Silence every Python warning for this process; the original intent was to
# hide the warnings emitted by transformers.
warnings.simplefilter("ignore")
# NOTE(review): presumably a workaround for the "duplicate OpenMP runtime"
# abort seen with some PyTorch/MKL installs -- confirm it is still needed.
os.environ.update({"KMP_DUPLICATE_LIB_OK": "TRUE"})
|
||||
|
||||
|
||||
class BertDataset(Dataset):
    """Dataset of ``(text, label)`` samples stored as two parallel lists.

    Each input item is read by position: element 0 is kept in ``self.data``
    and element 1 in ``self.labels``.
    """

    def __init__(self, data: List[Tuple[str, int]]):
        """Split ``data`` into the parallel ``data`` / ``labels`` lists."""
        self.data = [sample[0] for sample in data]
        self.labels = [sample[1] for sample in data]

    def __getitem__(self, index):
        """Return the ``(text, label)`` pair stored at ``index``."""
        text = self.data[index]
        label = self.labels[index]
        return text, label

    def __len__(self):
        """Return the number of samples (the length of the label list)."""
        return len(self.labels)
|
||||
|
||||
|
||||
class BertClassifier(nn.Module):
    """Classification head: one linear layer followed by a sigmoid.

    Maps a feature vector of width ``input_size`` to a single value in (0, 1).
    """

    def __init__(self, input_size):
        """Create the ``input_size -> 1`` linear layer and the sigmoid."""
        super().__init__()
        # Attribute names are part of the state-dict keys ("fc.weight",
        # "fc.bias"), so they must stay as they are.
        self.fc = nn.Linear(input_size, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Return ``sigmoid(fc(x))``; the last dimension of the result is 1."""
        return self.sigmoid(self.fc(x))
|
||||
|
||||
|
||||
class BertModel_Custom(BaseModel):
    """Sentiment model: a frozen BERT encoder plus a small trainable head.

    Texts are tokenized and run through BERT; the hidden state at sequence
    position 0 (the [CLS] token) is fed to a ``BertClassifier`` that outputs a
    probability. Only the classifier is optimized: every BERT parameter has
    ``requires_grad`` switched off and BERT is always run under
    ``torch.no_grad()``.

    NOTE(review): ``self.model_name`` is read and ``self.is_trained`` is
    read/written here, but neither is defined in this class; they are
    presumably set up by ``BaseModel.__init__`` -- confirm.
    """

    # Longest token sequence handed to BERT; longer inputs are truncated.
    MAX_LENGTH = 512
    # Hub model used for the download and for the online fallback.
    HUB_MODEL_NAME = "bert-base-chinese"

    def __init__(self, model_path: str = "./model/chinese_wwm_pytorch"):
        """Remember where the pretrained BERT lives; nothing is loaded yet.

        Args:
            model_path: Directory holding (or that will receive) the
                pretrained BERT tokenizer and weights.
        """
        super().__init__("BERT")
        self.model_path = model_path
        self.tokenizer = None
        self.bert = None
        self.classifier = None
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    def _download_bert_model(self):
        """Download ``bert-base-chinese`` and save it under ``self.model_path``.

        Returns:
            True when both tokenizer and model were saved, False when any step
            failed (the error and manual-download hints are printed).
        """
        print("BERT模型不存在,正在下载中文BERT预训练模型...")
        print("下载来源: bert-base-chinese (Hugging Face)")

        try:
            os.makedirs(self.model_path, exist_ok=True)

            model_name = self.HUB_MODEL_NAME
            print(f"正在从Hugging Face下载 {model_name}...")

            print("下载分词器...")
            tokenizer = BertTokenizer.from_pretrained(model_name)
            tokenizer.save_pretrained(self.model_path)

            print("下载BERT模型...")
            bert_model = BertModel.from_pretrained(model_name)
            bert_model.save_pretrained(self.model_path)

            print(f"✅ BERT模型下载完成,保存在: {self.model_path}")
            return True

        except Exception as e:
            # Deliberately broad: any failure is reported to the user and
            # signalled to the caller through the False return value.
            print(f"❌ BERT模型下载失败: {e}")
            print("\n💡 您可以手动下载BERT模型:")
            print("1. 访问 https://huggingface.co/bert-base-chinese")
            print("2. 或使用哈工大中文BERT: https://github.com/ymcui/Chinese-BERT-wwm")
            print(f"3. 将模型文件解压到: {self.model_path}")
            return False

    def _freeze_bert(self):
        """Turn off gradient tracking for every parameter of ``self.bert``."""
        for param in self.bert.parameters():
            param.requires_grad = False

    def _load_bert(self):
        """Load tokenizer and BERT onto ``self.device`` and freeze BERT.

        If ``self.model_path`` is missing or empty a download is attempted
        first. If loading from the local directory fails, the hub model is
        tried directly.

        Raises:
            FileNotFoundError: If the download fails, or if neither the local
                nor the online model can be loaded.
        """
        print(f"加载BERT模型: {self.model_path}")

        # os.listdir instead of any(os.scandir(...)): no iterator is left
        # open, and a non-directory path is treated as "model missing".
        if not os.path.isdir(self.model_path) or not os.listdir(self.model_path):
            print("BERT模型不存在,尝试自动下载...")
            if not self._download_bert_model():
                raise FileNotFoundError(f"BERT模型下载失败,请手动下载到: {self.model_path}")

        try:
            self.tokenizer = BertTokenizer.from_pretrained(self.model_path)
            self.bert = BertModel.from_pretrained(self.model_path).to(self.device)
            self._freeze_bert()
            print("✅ BERT模型加载完成")

        except Exception as e:
            print(f"❌ BERT模型加载失败: {e}")
            print("尝试使用在线模型...")

            # Local files were unusable; fall back to the hub model.
            try:
                model_name = self.HUB_MODEL_NAME
                self.tokenizer = BertTokenizer.from_pretrained(model_name)
                self.bert = BertModel.from_pretrained(model_name).to(self.device)
                self._freeze_bert()
                print("✅ 在线BERT模型加载完成")

            except Exception as e2:
                print(f"❌ 在线模型也加载失败: {e2}")
                # Chain the cause so the real failure stays in the traceback.
                raise FileNotFoundError(
                    f"无法加载BERT模型,请检查网络连接或手动下载模型到: {self.model_path}"
                ) from e2

    def _encode_cls(self, texts):
        """Tokenize ``texts`` and return BERT's output at sequence position 0.

        BERT always runs under ``torch.no_grad()``, so the returned tensor
        carries no gradient history.

        Args:
            texts: A batch (list/tuple) of strings.

        Returns:
            ``bert(...)[0][:, 0]`` -- one feature row per input text.
        """
        tokens = self.tokenizer(texts, padding=True, truncation=True,
                                max_length=self.MAX_LENGTH, return_tensors='pt')
        input_ids = tokens["input_ids"].to(self.device)
        attention_mask = tokens["attention_mask"].to(self.device)

        with torch.no_grad():
            bert_outputs = self.bert(input_ids, attention_mask=attention_mask)
        return bert_outputs[0][:, 0]  # [CLS] token output

    def train(self, train_data: List[Tuple[str, int]], **kwargs) -> None:
        """Train the classifier head on ``train_data``.

        Args:
            train_data: ``(text, label)`` samples; labels are converted to
                float32 targets for ``BCELoss``.
            **kwargs: Optional hyperparameters --
                ``learning_rate`` (default 1e-3), ``num_epochs`` (10),
                ``batch_size`` (100), ``input_size`` (768, width of the
                classifier input), ``decay_rate`` (0.9, per-epoch
                ``ExponentialLR`` gamma), ``save_each_epoch`` (False; when
                true the classifier state dict is written to
                ``./model/bert_epoch_<n>.pth`` after every epoch).

        Raises:
            ValueError: If ``train_data`` is empty.
            FileNotFoundError: If BERT cannot be loaded (see ``_load_bert``).
        """
        if len(train_data) == 0:
            # Fail before the expensive BERT load; an empty dataset cannot
            # produce a single optimization step.
            raise ValueError("训练数据为空,无法训练")

        print(f"开始训练 {self.model_name} 模型...")

        self._load_bert()

        learning_rate = kwargs.get('learning_rate', 1e-3)
        num_epochs = kwargs.get('num_epochs', 10)
        batch_size = kwargs.get('batch_size', 100)
        input_size = kwargs.get('input_size', 768)
        decay_rate = kwargs.get('decay_rate', 0.9)

        print(f"BERT超参数: lr={learning_rate}, epochs={num_epochs}, "
              f"batch_size={batch_size}, input_size={input_size}")

        train_dataset = BertDataset(train_data)
        train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

        self.classifier = BertClassifier(input_size).to(self.device)

        criterion = nn.BCELoss()
        optimizer = torch.optim.Adam(self.classifier.parameters(), lr=learning_rate)
        scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma=decay_rate)

        self.bert.eval()  # BERT stays in eval mode for the whole run
        self.classifier.train()

        for epoch in range(num_epochs):
            # Running totals since the last progress line (reset every 10 steps).
            running_loss = 0
            running_batches = 0

            for step, (words, labels) in enumerate(train_loader, start=1):
                # The DataLoader may already hand back a tensor; as_tensor
                # converts either a tensor or a plain sequence without the
                # copy-construct warning torch.tensor(tensor) triggers.
                targets = torch.as_tensor(labels, dtype=torch.float32).to(self.device)

                features = self._encode_cls(list(words))

                optimizer.zero_grad()
                probabilities = self.classifier(features).view(-1)
                loss = criterion(probabilities, targets)

                loss.backward()
                optimizer.step()

                running_loss += loss.item()
                running_batches += 1

                if step % 10 == 0:
                    avg_loss = running_loss / running_batches
                    print(f"Epoch [{epoch+1}/{num_epochs}], Step [{step}], Loss: {avg_loss:.4f}")
                    running_loss = 0
                    running_batches = 0

            # Learning-rate decay once per epoch.
            scheduler.step()

            if kwargs.get('save_each_epoch', False):
                epoch_model_path = f"./model/bert_epoch_{epoch+1}.pth"
                os.makedirs(os.path.dirname(epoch_model_path), exist_ok=True)
                torch.save(self.classifier.state_dict(), epoch_model_path)
                print(f"已保存模型: {epoch_model_path}")

        self.is_trained = True
        print(f"{self.model_name} 模型训练完成!")

    def predict(self, texts: List[str], *, batch_size: int = 32) -> List[int]:
        """Predict a 0/1 label for every text.

        Args:
            texts: Texts to classify; processed in slices of ``batch_size``.
            batch_size: Number of texts per forward pass (positive integer).

        Returns:
            One int per text: 1 when the classifier output is > 0.5, else 0.
            An empty ``texts`` yields an empty list.

        Raises:
            ValueError: If the model has not been trained or loaded.
        """
        if not self.is_trained:
            raise ValueError(f"模型 {self.model_name} 尚未训练,请先调用train方法")

        predictions = []

        self.bert.eval()
        self.classifier.eval()

        with torch.no_grad():
            for start in range(0, len(texts), batch_size):
                batch_texts = texts[start:start + batch_size]
                probabilities = self.classifier(self._encode_cls(batch_texts)).view(-1)
                # Threshold at 0.5 and convert to plain Python ints.
                predictions.extend((probabilities > 0.5).long().cpu().tolist())

        return predictions

    def predict_single(self, text: str) -> Tuple[int, float]:
        """Predict the label of one text together with a confidence.

        Returns:
            ``(prediction, confidence)`` where ``prediction`` is 1 when the
            classifier output ``p`` is > 0.5 (else 0) and ``confidence`` is
            ``p`` for label 1 or ``1 - p`` for label 0.

        Raises:
            ValueError: If the model has not been trained or loaded.
        """
        if not self.is_trained:
            raise ValueError(f"模型 {self.model_name} 尚未训练,请先调用train方法")

        self.bert.eval()
        self.classifier.eval()

        with torch.no_grad():
            prob = self.classifier(self._encode_cls([text])).item()

        prediction = int(prob > 0.5)
        confidence = prob if prediction == 1 else 1 - prob

        return prediction, confidence

    def save_model(self, model_path: str = None) -> None:
        """Save the classifier weights and the metadata needed to reload them.

        Args:
            model_path: Destination file. Defaults to
                ``./model/<model_name lowercased>_model.pth``.

        Raises:
            ValueError: If the model has not been trained or loaded.
        """
        if not self.is_trained:
            raise ValueError(f"模型 {self.model_name} 尚未训练,无法保存")

        if model_path is None:
            model_path = f"./model/{self.model_name.lower()}_model.pth"

        # A bare file name has no directory part; os.makedirs('') would raise.
        target_dir = os.path.dirname(model_path)
        if target_dir:
            os.makedirs(target_dir, exist_ok=True)

        model_data = {
            'classifier_state_dict': self.classifier.state_dict(),
            'model_path': self.model_path,
            # Record the width the classifier was actually built with, so a
            # model trained with a custom input_size can be reloaded.
            'input_size': self.classifier.fc.in_features,
            'device': str(self.device),
        }

        torch.save(model_data, model_path)
        print(f"模型已保存到: {model_path}")

    def load_model(self, model_path: str) -> None:
        """Restore a model written by ``save_model``.

        The BERT directory stored in the checkpoint replaces
        ``self.model_path`` before BERT is loaded.

        Args:
            model_path: Checkpoint file produced by ``save_model``.

        Raises:
            FileNotFoundError: If ``model_path`` does not exist, or if BERT
                cannot be loaded (see ``_load_bert``).
        """
        if not os.path.exists(model_path):
            raise FileNotFoundError(f"模型文件不存在: {model_path}")

        # NOTE(review): torch.load unpickles the file -- only load
        # checkpoints that come from a trusted source.
        model_data = torch.load(model_path, map_location=self.device)

        self.model_path = model_data['model_path']
        self._load_bert()

        # Rebuild the head with the saved width, then restore its weights.
        input_size = model_data['input_size']
        self.classifier = BertClassifier(input_size).to(self.device)
        self.classifier.load_state_dict(model_data['classifier_state_dict'])

        self.is_trained = True
        print(f"已加载模型: {model_path}")

    @staticmethod
    def load_data(train_path: str, test_path: str) -> Tuple[List[Tuple[str, int]], List[Tuple[str, int]]]:
        """Load the train and test corpora via ``load_corpus_bert``.

        Returns:
            ``(train_data, test_data)`` exactly as returned by
            ``load_corpus_bert`` for each path.
        """
        print("加载训练数据...")
        train_data = load_corpus_bert(train_path)
        print(f"训练数据量: {len(train_data)}")

        print("加载测试数据...")
        test_data = load_corpus_bert(test_path)
        print(f"测试数据量: {len(test_data)}")

        return train_data, test_data
|
||||
|
||||
|
||||
def main():
    """Command-line entry point: train or only evaluate the BERT model.

    With ``--eval_only`` an existing checkpoint is loaded and evaluated on the
    test set. Otherwise the model is trained, evaluated and saved. In both
    cases a few sample sentences are classified and printed at the end.
    """
    parser = argparse.ArgumentParser(description='BERT情感分析模型训练')

    # (flag, add_argument keyword arguments), registered in this order.
    option_specs = (
        ('--train_path', dict(type=str, default='./data/weibo2018/train.txt',
                              help='训练数据路径')),
        ('--test_path', dict(type=str, default='./data/weibo2018/test.txt',
                             help='测试数据路径')),
        ('--model_path', dict(type=str, default='./model/bert_model.pth',
                              help='模型保存路径')),
        ('--bert_path', dict(type=str, default='./model/chinese_wwm_pytorch',
                             help='BERT预训练模型路径')),
        ('--epochs', dict(type=int, default=10,
                          help='训练轮数')),
        ('--batch_size', dict(type=int, default=100,
                              help='批大小')),
        ('--learning_rate', dict(type=float, default=1e-3,
                                 help='学习率')),
        ('--eval_only', dict(action='store_true',
                             help='仅评估已有模型,不进行训练')),
    )
    for flag, options in option_specs:
        parser.add_argument(flag, **options)

    args = parser.parse_args()

    model = BertModel_Custom(args.bert_path)

    if not args.eval_only:
        # Training mode: load data, fit, evaluate, then persist the model.
        train_data, test_data = model.load_data(args.train_path, args.test_path)
        model.train(
            train_data,
            num_epochs=args.epochs,
            batch_size=args.batch_size,
            learning_rate=args.learning_rate
        )
        model.evaluate(test_data)
        model.save_model(args.model_path)
    else:
        # Evaluation-only mode: restore a saved model and score the test set.
        print("评估模式:加载已有模型进行评估")
        model.load_model(args.model_path)
        _, test_data = model.load_data(args.train_path, args.test_path)
        model.evaluate(test_data)

    # Demo predictions on a few hand-written sentences.
    print("\n示例预测:")
    sample_texts = (
        "今天天气真好,心情很棒",
        "这部电影太无聊了,浪费时间",
        "哈哈哈,太有趣了",
    )
    for text in sample_texts:
        pred, conf = model.predict_single(text)
        if pred == 1:
            sentiment = "正面"
        else:
            sentiment = "负面"
        print(f"文本: {text}")
        print(f"预测: {sentiment} (置信度: {conf:.4f})")
        print()


if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user