From 5adabea09717b1bfe974184e1ef7bcc143bc3381 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=88=92=E9=85=92=E7=9A=84=E6=9D=8E=E7=99=BD?= <670939375@qq.com> Date: Fri, 4 Oct 2024 23:15:44 +0800 Subject: [PATCH] BCAT is basically completed. --- model_pro/BCAT.py | 113 +++++++++++++++++++++++----------------------- 1 file changed, 56 insertions(+), 57 deletions(-) diff --git a/model_pro/BCAT.py b/model_pro/BCAT.py index 4f916cc..cc54dcd 100644 --- a/model_pro/BCAT.py +++ b/model_pro/BCAT.py @@ -8,49 +8,49 @@ from MHA import MultiHeadAttentionLayer from classifier import FinalClassifier from BERT_CTM import BERT_CTM_Model import os -from tqdm import tqdm +from tqdm import tqdm # 导入 tqdm 库用于进度条 from sklearn.metrics import confusion_matrix -# BERT_CTM embeddings generation and loading function +# BERT_CTM 嵌入生成和加载函数 def get_bert_ctm_embeddings(texts, bert_model_path, ctm_tokenizer_path, n_components=12, num_epochs=20, save_path=None): - # Check if saved embeddings already exist + # 检查是否已经存在保存的嵌入文件 if save_path and os.path.exists(save_path): - print(f"Loading embeddings from {save_path}...") + print(f"从文件 {save_path} 加载嵌入...") embeddings = np.load(save_path) else: - print("Generating BERT+CTM embeddings...") + print("生成 BERT+CTM 嵌入...") bert_ctm_model = BERT_CTM_Model( bert_model_path=bert_model_path, ctm_tokenizer_path=ctm_tokenizer_path, n_components=n_components, num_epochs=num_epochs ) - embeddings = bert_ctm_model.train(texts) # Generate embeddings + embeddings = bert_ctm_model.train(texts) # 生成嵌入 - # Save embeddings to file + # 保存嵌入到文件 if save_path: - print(f"Saving embeddings to file {save_path}...") + print(f"保存嵌入到文件 {save_path}...") np.save(save_path, embeddings) return embeddings -# Data loading and preparation function +# 数据加载和准备函数 def prepare_dataloader(features, labels, batch_size): - """Create DataLoader for training, validation, and testing""" + """创建 DataLoader 用于训练、验证和测试""" tensor_x = torch.tensor(features, dtype=torch.float32) tensor_y = torch.tensor(labels, dtype=torch.long) dataset = TensorDataset(tensor_x, tensor_y) return DataLoader(dataset, batch_size=batch_size, shuffle=True) -# Model training function +# 训练模型函数 def train_model(train_data_path, valid_data_path, test_data_path, train_labels, valid_labels, test_labels, bert_model_path, ctm_tokenizer_path, num_heads=8, num_classes=2, epochs=10, batch_size=128, learning_rate=5e-3, model_save_path='./final_model.pt'): - # Step 1: Get BERT+CTM embeddings - print("Step 1: Getting BERT+CTM embeddings...") + # Step 1: 获取 BERT+CTM 嵌入 + print("Step 1: 获取 BERT+CTM 嵌入...") valid_features = get_bert_ctm_embeddings(valid_data_path, bert_model_path, ctm_tokenizer_path, save_path='valid_embeddings.npy') test_features = get_bert_ctm_embeddings(test_data_path, bert_model_path, ctm_tokenizer_path, @@ -58,54 +58,53 @@ def train_model(train_data_path, valid_data_path, test_data_path, train_labels, train_features = get_bert_ctm_embeddings(train_data_path, bert_model_path, ctm_tokenizer_path, save_path='train_embeddings.npy') - # Save labels to .npy file - print("Saving labels to labels.npy file...") + # 保存标签到 .npy 文件 + print("保存标签到 labels.npy 文件...") np.save('train_labels.npy', train_labels) np.save('valid_labels.npy', valid_labels) np.save('test_labels.npy', test_labels) - # Step 2: Validate label correctness - print("Step 2: Validating label correctness...") + # Step 2: 检查标签的合理性 + print("Step 2: 检查标签的合理性...") unique_labels_train = np.unique(train_labels) unique_labels_valid = np.unique(valid_labels) unique_labels_test = np.unique(test_labels) - print(f"Unique train labels: {unique_labels_train}") - print(f"Train set class distribution: {np.bincount(train_labels)}") - print(f"Unique validation labels: {unique_labels_valid}") - print(f"Validation set class distribution: {np.bincount(valid_labels)}") - print(f"Unique test labels: {unique_labels_test}") - print(f"Test set class distribution: {np.bincount(test_labels)}") + print(f"训练标签的唯一值: {unique_labels_train}") + print(f"训练集类别分布: {np.bincount(train_labels)}") + print(f"验证标签的唯一值: {unique_labels_valid}") + print(f"验证集类别分布: {np.bincount(valid_labels)}") + print(f"测试标签的唯一值: {unique_labels_test}") + print(f"测试集类别分布: {np.bincount(test_labels)}") if len(unique_labels_train) != num_classes or len(unique_labels_valid) != num_classes or len( unique_labels_test) != num_classes: - raise ValueError(f"Number of classes in labels does not match expected: expected {num_classes}, " - f"but found different classes in training, validation, or test sets") + raise ValueError(f"标签中的类别数量与期望的不符: 期望 {num_classes}, 但训练集、验证集或测试集中发现了其他类别") - # Step 3: Create DataLoader - print("Step 3: Creating DataLoader...") + # Step 3: 创建 DataLoader + print("Step 3: 创建 DataLoader...") train_loader = prepare_dataloader(train_features, train_labels, batch_size) valid_loader = prepare_dataloader(valid_features, valid_labels, batch_size) test_loader = prepare_dataloader(test_features, test_labels, batch_size) - # Step 4: Initialize CNN - print("Step 4: Initializing CNN...") - num_filters = 256 # Use 256 convolutional output channels - kernel_sizes = [2, 3, 4] # Kernel sizes for convolution + # Step 4: 初始化CNN + print("Step 4: 初始化CNN...") + num_filters = 256 # 使用256个卷积输出通道 + kernel_sizes = [2, 3, 4] # 卷积核大小 k = 3 * len(kernel_sizes) - cnn_output_dim = num_filters * (k + 1) # Calculate the output feature dimension of CNN + cnn_output_dim = num_filters * (k + 1) # 计算CNN输出的特征维度 - # Step 5: Initialize attention mechanism - print("Step 5: Initializing multi-head attention...") + # Step 5: 初始化注意力机制 + print("Step 5: 初始化多头注意力机制...") attention_model = MultiHeadAttentionLayer(embed_size=768, num_heads=8) - # Step 6: Initialize classifier - print("Step 6: Initializing classifier...") + # Step 6: 初始化分类器 + print("Step 6: 初始化分类器...") classifier_model = FinalClassifier(input_dim=768, num_classes=num_classes) optimizer = torch.optim.Adam(classifier_model.parameters(), lr=learning_rate) criterion = torch.nn.CrossEntropyLoss() - # Step 7: Start training - print("Starting training...") + # Step 7: 开始训练 + print("开始训练...") torch.autograd.set_detect_anomaly(True) for epoch in range(epochs): classifier_model.train() @@ -113,28 +112,28 @@ def train_model(train_data_path, valid_data_path, test_data_path, train_labels, y_true = [] y_pred = [] - # Use tqdm to add progress bar for CNN feature extraction + # 使用 tqdm 为 CNN 特征提取添加进度条 for batch_x, batch_y in tqdm(train_loader, desc=f"Epoch {epoch + 1}/{epochs} - Training"): optimizer.zero_grad() batch_x = torch.mean(batch_x, dim=1) - # Extract features from CNN + # 从CNN提取特征 # cnn_output = extract_CNN_features(batch_x) # batch_x = torch.mean(batch_x, dim=1) - # cnn_output = torch.cat((batch_x, cnn_output), dim=-1) + # cnn_output = torch.cat((batch_x,cnn_output), dim=-1) attention_output = attention_model(batch_x, batch_x, batch_x) outputs = classifier_model(attention_output) outputs = torch.mean(outputs, dim=1) - loss = criterion(outputs, batch_y) # Compute loss - loss.backward() # Backpropagation - optimizer.step() # Optimize + loss = criterion(outputs, batch_y) # 计算损失 + loss.backward() # 反向传播 + optimizer.step() # 优化 epoch_loss += loss.item() - _, predicted = torch.max(outputs, 1) # Get predicted class + _, predicted = torch.max(outputs, 1) # 获取预测类别 y_true.extend(batch_y.tolist()) y_pred.extend(predicted.tolist()) - # Calculate training accuracy, precision, recall, and F1 score + # 计算训练准确率、精确率、召回率和F1分数 accuracy = accuracy_score(y_true, y_pred) precision = precision_score(y_true, y_pred, average='macro') recall = recall_score(y_true, y_pred, average='macro') @@ -144,11 +143,11 @@ def train_model(train_data_path, valid_data_path, test_data_path, train_labels, f"Epoch [{epoch + 1}/{epochs}] Loss: {epoch_loss:.4f}, Accuracy: {accuracy:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}, F1: {f1:.4f}") print(confusion_matrix(y_true, y_pred)) - # Save model + # 保存模型 torch.save(classifier_model, model_save_path) - print(f"Trained model has been saved to {model_save_path}") + print(f"训练好的模型已经保存到 {model_save_path}") - # Validation set evaluation + # 验证集评估 classifier_model.eval() y_true = [] y_pred = [] @@ -158,7 +157,7 @@ def train_model(train_data_path, valid_data_path, test_data_path, train_labels, batch_x = torch.mean(batch_x, dim=1) # cnn_output = extract_CNN_features(batch_x) # batch_x = torch.mean(batch_x, dim=1) - # cnn_output = torch.cat((batch_x, cnn_output), dim=-1) + # cnn_output = torch.cat((batch_x,cnn_output), dim=-1) attention_output = attention_model(batch_x, batch_x, batch_x) outputs = classifier_model(attention_output) outputs = torch.mean(outputs, dim=1) @@ -166,7 +165,7 @@ def train_model(train_data_path, valid_data_path, test_data_path, train_labels, y_true.extend(batch_y.tolist()) y_pred.extend(predicted.tolist()) - # Validation accuracy, precision, recall, and F1 score + # 验证集准确率、精确率、召回率和F1分数 accuracy = accuracy_score(y_true, y_pred) precision = precision_score(y_true, y_pred, average='macro') recall = recall_score(y_true, y_pred, average='macro') @@ -175,7 +174,7 @@ def train_model(train_data_path, valid_data_path, test_data_path, train_labels, print(f"\nValidation - Accuracy: {accuracy:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}, F1: {f1:.4f}") print(confusion_matrix(y_true, y_pred)) - # Test set evaluation + # 测试集评估 y_true = [] y_pred = [] @@ -184,14 +183,14 @@ def train_model(train_data_path, valid_data_path, test_data_path, train_labels, batch_x = torch.mean(batch_x, dim=1) # cnn_output = extract_CNN_features(batch_x) # batch_x = torch.mean(batch_x, dim=1) - # cnn_output = torch.cat((batch_x, cnn_output), dim=-1) + # cnn_output = torch.cat((batch_x,cnn_output), dim=-1) attention_output = attention_model(batch_x, batch_x, batch_x) outputs = classifier_model(attention_output) outputs = torch.mean(outputs, dim=1) _, predicted = torch.max(outputs, 1) y_true.extend(batch_y.tolist()) y_pred.extend(predicted.tolist()) - # Test accuracy, precision, recall, and F1 score + # 测试集准确率、精确率、召回率和F1分数 accuracy = accuracy_score(y_true, y_pred) precision = precision_score(y_true, y_pred, average='macro') recall = recall_score(y_true, y_pred, average='macro') @@ -202,7 +201,7 @@ def train_model(train_data_path, valid_data_path, test_data_path, train_labels, if __name__ == "__main__": - # Load and prepare data + # 加载和准备数据 train_data_path = './train.csv' valid_data_path = './dev.csv' test_data_path = './test.csv' @@ -215,10 +214,10 @@ if __name__ == "__main__": valid_labels = valid_data['label'].values test_labels = test_data['label'].values - # Train model + # 训练模型 bert_model_path = './bert_model' ctm_tokenizer_path = './sentence_bert_model' - # Train model + # 训练模型 train_model(train_data_path, valid_data_path, test_data_path, train_labels, valid_labels, test_labels, bert_model_path, ctm_tokenizer_path, num_heads=12, num_classes=2, model_save_path='./final_model.pt')