Local sentiment analysis upload.

This commit is contained in:
戒酒的李白
2025-08-23 15:55:07 +08:00
parent 4e33224633
commit f448ddd466
65 changed files with 1563359 additions and 11 deletions
@@ -0,0 +1,468 @@
# -*- coding: utf-8 -*-
"""
Qwen3-LoRA通用训练脚本
支持0.6B、4B、8B三种规模的模型
"""
import argparse
import os
import torch
from transformers import (
AutoTokenizer,
AutoModelForCausalLM,
TrainingArguments,
Trainer,
DataCollatorForLanguageModeling
)
from peft import LoraConfig, get_peft_model, TaskType, PeftModel
from datasets import Dataset
from typing import List, Tuple
import warnings
from tqdm import tqdm
from base_model import BaseQwenModel
from models_config import QWEN3_MODELS, MODEL_PATHS
warnings.filterwarnings("ignore")
class Qwen3LoRAUniversal(BaseQwenModel):
    """Qwen3 causal LM fine-tuned with LoRA for binary sentiment classification.

    The classifier is instruction-based: each text is wrapped in a fixed
    prompt and the model is trained to (and later asked to) generate the label
    word "正面" (positive, label 1) or "负面" (negative, label 0).

    ``model_size`` must be a key of ``QWEN3_MODELS``; the matching entry
    supplies the Hugging Face model id and the recommended hyperparameters.
    """

    # Maximum token length used when tokenizing training examples.
    MAX_SEQ_LENGTH = 512

    def __init__(self, model_size: str = "0.6B"):
        """Create an untrained wrapper for the given Qwen3 model size.

        Args:
            model_size: Key into ``QWEN3_MODELS``.

        Raises:
            ValueError: If ``model_size`` is not a key of ``QWEN3_MODELS``.
        """
        if model_size not in QWEN3_MODELS:
            raise ValueError(f"不支持的模型大小: {model_size}")
        super().__init__(f"Qwen3-{model_size}-LoRA")
        self.model_size = model_size
        self.config = QWEN3_MODELS[model_size]
        self.model_name_hf = self.config["base_model"]
        # Populated lazily by _load_base_model / _setup_lora / load_model.
        self.tokenizer = None
        self.base_model = None
        self.lora_model = None
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    @staticmethod
    def _build_instruction(text: str) -> str:
        """Return the prompt for *text*.

        Training and inference must use exactly the same prompt, so both go
        through this single helper.
        """
        return f"请分析以下微博文本的情感倾向,回答'正面''负面'\n\n文本:{text}\n\n情感:"

    def _load_pretrained(self, source: str, **hub_kwargs) -> None:
        """Load tokenizer and base model from *source* (local dir or hub id).

        ``hub_kwargs`` are forwarded to both ``from_pretrained`` calls (used
        for ``force_download=True``). ``self.tokenizer`` / ``self.base_model``
        are only assigned once both loads succeeded, so a failed attempt does
        not leave a half-initialised pair behind.
        """
        use_cuda = torch.cuda.is_available()
        tokenizer = AutoTokenizer.from_pretrained(source, **hub_kwargs)
        base_model = AutoModelForCausalLM.from_pretrained(
            source,
            torch_dtype=torch.float16 if use_cuda else torch.float32,
            device_map="auto" if use_cuda else None,
            **hub_kwargs,
        )
        # Padding during tokenization needs a pad token; fall back to EOS.
        if tokenizer.pad_token is None:
            tokenizer.pad_token = tokenizer.eos_token
            tokenizer.pad_token_id = tokenizer.eos_token_id
        self.tokenizer = tokenizer
        self.base_model = base_model

    def _save_base_model_locally(self, local_model_dir: str) -> bool:
        """Best-effort copy of the loaded base model into *local_model_dir*.

        Returns:
            True if the copy was written, False if saving failed. A failure is
            only reported: the model is already loaded in memory, so it must
            not be treated as a load failure.
        """
        try:
            os.makedirs(local_model_dir, exist_ok=True)
            self.tokenizer.save_pretrained(local_model_dir)
            self.base_model.save_pretrained(local_model_dir)
        except Exception as e:
            print(f"保存模型到本地失败: {e}")
            return False
        return True

    def _load_base_model(self):
        """Load the Qwen3 base model, trying three sources in order.

        1. ``./models/qwen3-<size>`` (must contain ``config.json``).
        2. ``from_pretrained`` on the hub id with default settings.
        3. ``from_pretrained`` on the hub id with ``force_download=True``.

        After a successful load in step 2 or 3 a copy is saved to the local
        directory so that step 1 can succeed on the next run.

        Raises:
            RuntimeError: If all three attempts fail.
        """
        print(f"加载{self.model_size}基础模型: {self.model_name_hf}")
        local_model_dir = f"./models/qwen3-{self.model_size.lower()}"

        # Step 1: local copy next to the script.
        if os.path.exists(os.path.join(local_model_dir, "config.json")):
            try:
                print(f"发现本地模型,从本地加载: {local_model_dir}")
                self._load_pretrained(local_model_dir)
                print(f"从本地模型加载{self.model_size}基础模型成功")
                return
            except Exception as e:
                print(f"本地模型加载失败: {e}")

        # Step 2: default from_pretrained on the hub id.
        # The cache path is purely informational, so a failed import must not
        # count as a failed load.
        try:
            from transformers.utils import default_cache_path
            print(f"检查HuggingFace缓存: {default_cache_path}")
        except ImportError:
            print("检查HuggingFace缓存")
        try:
            self._load_pretrained(self.model_name_hf)
        except Exception as e:
            print(f"从HuggingFace缓存加载失败: {e}")
        else:
            print(f"从HuggingFace缓存加载{self.model_size}基础模型成功")
            print(f"保存模型到本地: {local_model_dir}")
            if self._save_base_model_locally(local_model_dir):
                print(f"模型已保存到: {local_model_dir}")
            return

        # Step 3: forced fresh download.
        try:
            print(f"正在从HuggingFace下载{self.model_size}模型...")
            self._load_pretrained(self.model_name_hf, force_download=True)
        except Exception as e2:
            print(f"从HuggingFace下载也失败: {e2}")
            raise RuntimeError(f"无法加载{self.model_size}模型,所有方法都失败了") from e2
        if self._save_base_model_locally(local_model_dir):
            print(f"{self.model_size}模型下载并保存到: {local_model_dir}")

    def _create_instruction_data(self, data: List[Tuple[str, int]]) -> Dataset:
        """Convert ``(text, label)`` pairs into an instruction-style dataset.

        Label 1 maps to "正面"; every other label maps to "负面". Each record
        has the keys ``instruction``, ``response`` and ``text``, where ``text``
        is prompt + answer + the tokenizer's EOS token. Requires
        ``self.tokenizer`` to be loaded.
        """
        records = []
        for text, label in data:
            response = "正面" if label == 1 else "负面"
            instruction = self._build_instruction(text)
            records.append({
                "instruction": instruction,
                "response": response,
                "text": f"{instruction}{response}{self.tokenizer.eos_token}",
            })
        return Dataset.from_list(records)

    def _tokenize_function(self, examples):
        """Tokenize a batch of examples for causal-LM training.

        Sequences are truncated/padded to ``MAX_SEQ_LENGTH`` tokens and
        ``labels`` is set to a copy of ``input_ids``.
        """
        tokenized = self.tokenizer(
            examples["text"],
            truncation=True,
            padding="max_length",
            max_length=self.MAX_SEQ_LENGTH,
            return_tensors=None,
        )
        tokenized["labels"] = tokenized["input_ids"].copy()
        return tokenized

    def _setup_lora(self, **kwargs):
        """Wrap the base model with LoRA adapters and print parameter counts.

        Keyword Args:
            lora_r: LoRA rank (default ``self.config['lora_r']``).
            lora_alpha: LoRA alpha (default ``self.config['lora_alpha']``).
            lora_dropout: LoRA dropout (default 0.1).

        Returns:
            The ``LoraConfig`` that was applied. Sets ``self.lora_model``.
        """
        lora_r = kwargs.get('lora_r', self.config['lora_r'])
        lora_alpha = kwargs.get('lora_alpha', self.config['lora_alpha'])
        lora_config = LoraConfig(
            task_type=TaskType.CAUSAL_LM,
            r=lora_r,
            lora_alpha=lora_alpha,
            lora_dropout=kwargs.get('lora_dropout', 0.1),
            # Adapters are attached to the attention projections only.
            target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
        )
        self.lora_model = get_peft_model(self.base_model, lora_config)

        total_params = sum(p.numel() for p in self.lora_model.parameters())
        trainable_params = sum(
            p.numel() for p in self.lora_model.parameters() if p.requires_grad
        )
        print(f"LoRA配置完成 (r={lora_r}, alpha={lora_alpha})")
        print(f"总参数: {total_params:,}")
        print(f"可训练参数: {trainable_params:,}")
        print(f"可训练参数比例: {trainable_params / total_params * 100:.2f}%")
        self.lora_model.print_trainable_parameters()  # PEFT's own summary
        return lora_config

    def train(self, train_data: List[Tuple[str, int]], **kwargs) -> None:
        """Fine-tune the base model with LoRA on ``(text, label)`` pairs.

        Args:
            train_data: Training samples; label 1 is positive, anything else
                is treated as negative.

        Keyword Args:
            num_epochs: Number of epochs (default 3).
            batch_size: Per-device batch size (default: half of
                ``config['recommended_batch_size']``, at least 1).
            learning_rate: Learning rate (default: half of
                ``config['recommended_lr']``).
            output_dir: Checkpoint/adapter directory (default
                ``./models/qwen3_lora_<size>_checkpoints``).
            lora_r, lora_alpha, lora_dropout: Forwarded to ``_setup_lora``.

        Raises:
            ValueError: If ``train_data`` is empty.
            RuntimeError: If the base model cannot be loaded.
        """
        if not train_data:
            raise ValueError("训练数据为空")
        print(f"开始训练 Qwen3-{self.model_size}-LoRA 模型...")

        self._load_base_model()
        self._setup_lora(**kwargs)

        # Hyperparameters: recommended values from the config unless overridden.
        num_epochs = kwargs.get('num_epochs', 3)
        # Floor division could otherwise yield 0 for a recommended size of 1.
        batch_size = kwargs.get(
            'batch_size', max(1, self.config['recommended_batch_size'] // 2)
        )
        learning_rate = kwargs.get('learning_rate', self.config['recommended_lr'] / 2)
        output_dir = kwargs.get(
            'output_dir', f'./models/qwen3_lora_{self.model_size.lower()}_checkpoints'
        )
        print(f"超参数: epochs={num_epochs}, batch_size={batch_size}, lr={learning_rate}")

        train_dataset = self._create_instruction_data(train_data)
        tokenized_dataset = train_dataset.map(
            self._tokenize_function,
            batched=True,
            remove_columns=train_dataset.column_names,
        )

        training_args = TrainingArguments(
            output_dir=output_dir,
            num_train_epochs=num_epochs,
            per_device_train_batch_size=batch_size,
            gradient_accumulation_steps=2,
            learning_rate=learning_rate,
            logging_steps=10,
            save_steps=100,
            save_total_limit=2,
            remove_unused_columns=False,
            dataloader_drop_last=False,
            # The string "none" is the documented way to disable all logging
            # integrations; passing None does not reliably do so.
            report_to="none",
        )
        data_collator = DataCollatorForLanguageModeling(
            tokenizer=self.tokenizer,
            mlm=False,
        )
        trainer = Trainer(
            model=self.lora_model,
            args=training_args,
            train_dataset=tokenized_dataset,
            data_collator=data_collator,
            tokenizer=self.tokenizer,
        )

        print(f"开始LoRA微调...")
        trainer.train()

        # Persist the adapter and tokenizer next to the checkpoints.
        self.lora_model.save_pretrained(output_dir)
        self.tokenizer.save_pretrained(output_dir)
        self.model = self.lora_model
        self.is_trained = True
        print(f"Qwen3-{self.model_size}-LoRA 模型训练完成!")

    def _extract_sentiment(self, generated_text: str, instruction: str) -> int:
        """Map generated text to a label.

        Args:
            generated_text: Prompt followed by the model's completion.
            instruction: The prompt; its length is used to cut it off.

        Returns:
            1 if "正面" is the first label word in the completion, otherwise 0
            (including when neither label word is present).
        """
        response = generated_text[len(instruction):].strip()
        positive_at = response.find("正面")
        if positive_at == -1:
            return 0
        negative_at = response.find("负面")
        if negative_at == -1:
            return 1
        # Both words present: the one the model said first is its answer.
        return 1 if positive_at < negative_at else 0

    def predict(self, texts: List[str]) -> List[int]:
        """Predict a 0/1 sentiment label for every text.

        Raises:
            ValueError: If the model has not been trained or loaded.
        """
        if not self.is_trained:
            raise ValueError(f"模型 {self.model_name} 尚未训练")
        predictions = []
        self.lora_model.eval()
        with torch.no_grad():
            for text in tqdm(texts, desc=f"Qwen3-{self.model_size}预测中"):
                label, _ = self.predict_single(text)
                predictions.append(label)
        return predictions

    def predict_single(self, text: str) -> Tuple[int, float]:
        """Predict the sentiment of one text.

        Returns:
            ``(label, confidence)``, where ``label`` is 1 (positive) or 0
            (negative). ``confidence`` is a fixed placeholder of 0.8; no
            probability is computed.

        Raises:
            ValueError: If the model has not been trained or loaded.
        """
        if not self.is_trained:
            raise ValueError(f"模型 {self.model_name} 尚未训练")

        instruction = self._build_instruction(text)
        inputs = self.tokenizer(instruction, return_tensors="pt")
        if torch.cuda.is_available():
            inputs = {k: v.to(self.device) for k, v in inputs.items()}
        prompt_length = inputs["input_ids"].shape[1]

        self.lora_model.eval()
        with torch.no_grad():
            outputs = self.lora_model.generate(
                **inputs,
                max_new_tokens=10,
                do_sample=True,
                temperature=0.1,
                pad_token_id=self.tokenizer.pad_token_id,
                eos_token_id=self.tokenizer.eos_token_id,
            )

        # Decode only the newly generated tokens. Slicing the fully decoded
        # string by len(instruction) assumes the tokenizer reproduces the
        # prompt character-for-character, which is not guaranteed.
        completion = self.tokenizer.decode(
            outputs[0][prompt_length:], skip_special_tokens=True
        )
        prediction = self._extract_sentiment(instruction + completion, instruction)
        confidence = 0.8  # placeholder; see docstring
        return prediction, confidence

    def save_model(self, model_path: str = None) -> None:
        """Save the LoRA adapter and tokenizer.

        Args:
            model_path: Target directory; defaults to
                ``MODEL_PATHS["lora"][self.model_size]``.

        Raises:
            ValueError: If the model has not been trained or loaded.
        """
        if not self.is_trained:
            raise ValueError(f"模型 {self.model_name} 尚未训练")
        if model_path is None:
            model_path = MODEL_PATHS["lora"][self.model_size]
        os.makedirs(model_path, exist_ok=True)
        self.lora_model.save_pretrained(model_path)
        self.tokenizer.save_pretrained(model_path)
        print(f"LoRA模型已保存到: {model_path}")

    def load_model(self, model_path: str) -> None:
        """Load the base model and attach the LoRA adapter from *model_path*.

        Raises:
            FileNotFoundError: If ``model_path`` does not exist.
            RuntimeError: If the base model cannot be loaded.
        """
        if not os.path.exists(model_path):
            raise FileNotFoundError(f"模型文件不存在: {model_path}")
        self._load_base_model()
        self.lora_model = PeftModel.from_pretrained(self.base_model, model_path)
        self.model = self.lora_model
        self.is_trained = True
        print(f"已加载Qwen3-{self.model_size}-LoRA模型: {model_path}")
def main():
    """Command-line entry point: train or evaluate a Qwen3-LoRA model.

    If ``--model_size`` is not given, the size is chosen interactively.
    With ``--eval_only`` a saved adapter is loaded and evaluated on the first
    50 test samples; otherwise the model is trained, evaluated on the first
    50 test samples and saved. A few example predictions are printed last.
    """
    parser = argparse.ArgumentParser(description='Qwen3-LoRA通用训练脚本')
    parser.add_argument('--model_size', type=str, choices=['0.6B', '4B', '8B'],
                        help='模型大小')
    parser.add_argument('--train_path', type=str, default='./dataset/train.txt',
                        help='训练数据路径')
    parser.add_argument('--test_path', type=str, default='./dataset/test.txt',
                        help='测试数据路径')
    parser.add_argument('--model_path', type=str, help='模型保存路径(可选)')
    parser.add_argument('--epochs', type=int, default=3, help='训练轮数')
    parser.add_argument('--batch_size', type=int, help='批大小(可选,使用推荐值)')
    parser.add_argument('--learning_rate', type=float, help='学习率(可选,使用推荐值)')
    parser.add_argument('--lora_r', type=int, help='LoRA秩(可选,使用推荐值)')
    parser.add_argument('--max_samples', type=int, default=0, help='最大训练样本数(0表示使用全部数据)')
    parser.add_argument('--eval_only', action='store_true', help='仅评估模式')
    args = parser.parse_args()

    # No size on the command line: show the menu and keep asking until valid.
    if not args.model_size:
        banner_lines = (
            "Qwen3-LoRA模型训练",
            "=" * 40,
            "可用模型大小:",
            " 1. 0.6B - 轻量级,训练快速,显存需求约8GB",
            " 2. 4B - 中等规模,性能均衡,显存需求约32GB",
            " 3. 8B - 大规模,性能最佳,显存需求约64GB",
            "\n注意: LoRA微调比Embedding方法需要更多显存",
        )
        for banner_line in banner_lines:
            print(banner_line)

        size_by_choice = {'1': '0.6B', '2': '4B', '3': '8B'}
        while True:
            answer = input("\n请选择模型大小 (1/2/3): ").strip()
            if answer in size_by_choice:
                args.model_size = size_by_choice[answer]
                break
            print("无效选择,请输入 1、2 或 3")
        print(f"已选择: Qwen3-{args.model_size} + LoRA")
        print()

    # The models directory must exist before anything is loaded or saved.
    os.makedirs('./models', exist_ok=True)
    model = Qwen3LoRAUniversal(args.model_size)
    model_path = args.model_path or MODEL_PATHS["lora"][args.model_size]

    if args.eval_only:
        # Evaluation only: load the saved adapter, score a small test subset.
        print(f"评估模式:加载Qwen3-{args.model_size}-LoRA模型")
        model.load_model(model_path)
        _, test_data = BaseQwenModel.load_data(args.train_path, args.test_path)
        model.evaluate(test_data[:50])
    else:
        train_data, test_data = BaseQwenModel.load_data(args.train_path, args.test_path)

        # Optionally cap the number of training samples.
        if args.max_samples > 0:
            train_subset = train_data[:args.max_samples]
            print(f"使用 {len(train_subset)} 条数据进行LoRA训练")
        else:
            train_subset = train_data
            print(f"使用全部 {len(train_subset)} 条数据进行LoRA训练")

        # Only forward the optional overrides the user actually supplied.
        optional_overrides = {
            'batch_size': args.batch_size,
            'learning_rate': args.learning_rate,
            'lora_r': args.lora_r,
        }
        train_kwargs = {'num_epochs': args.epochs}
        train_kwargs.update(
            {key: value for key, value in optional_overrides.items() if value}
        )

        model.train(train_subset, **train_kwargs)
        # Generation-based evaluation uses only a small test subset.
        model.evaluate(test_data[:50])
        model.save_model(model_path)

    # Example predictions.
    print(f"\nQwen3-{args.model_size}-LoRA 示例预测:")
    sample_texts = (
        "今天天气真好,心情很棒",
        "这部电影太无聊了,浪费时间",
        "哈哈哈,太有趣了",
    )
    for sample in sample_texts:
        label, confidence = model.predict_single(sample)
        sentiment = "正面" if label == 1 else "负面"
        print(f"文本: {sample}")
        print(f"预测: {sentiment} (置信度: {confidence:.4f})")
        print()
# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()