Added a base model class and training scripts for various sentiment analysis models, including Naive Bayes, SVM, XGBoost, LSTM, and BERT. Also, improved prediction functionality and the model loading mechanism.

2025-08-04 22:07:30 +08:00
parent bd60e2ed1b
commit 43525c5ca8
23 changed files with 1940 additions and 2362 deletions
@@ -1,12 +1,20 @@
 # -*- coding: utf-8 -*-
 import jieba
 import re
+import os
+import pickle
+from typing import List, Tuple, Any


+# 加载停用词
 stopwords = []
-with open("data/stopwords.txt", "r", encoding="utf8") as f:
-    for w in f:
-        stopwords.append(w.strip())
+stopwords_path = "data/stopwords.txt"
+if os.path.exists(stopwords_path):
+    with open(stopwords_path, "r", encoding="utf8") as f:
+        for w in f:
+            stopwords.append(w.strip())
+else:
+    print(f"警告: 停用词文件 {stopwords_path} 不存在，将使用空停用词列表")


 def load_corpus(path):
@@ -66,4 +74,65 @@ def processing_bert(text):
    text = re.sub("@.+?( |$)", " ", text)           # 去除 @xxx (用户名)
    text = re.sub("【.+?】", " ", text)              # 去除 【xx】 (里面的内容通常都不是用户自己写的)
    text = re.sub("\u200b", " ", text)              # '\u200b'是这个数据集中的一个bad case, 不用特别在意
-    return text
+    return text
+
+
+def save_model(model: Any, model_path: str) -> None:
+    """
+    保存模型到文件
+    
+    Args:
+        model: 要保存的模型对象
+        model_path: 保存路径
+    """
+    os.makedirs(os.path.dirname(model_path), exist_ok=True)
+    
+    with open(model_path, 'wb') as f:
+        pickle.dump(model, f)
+    
+    print(f"模型已保存到: {model_path}")
+
+
+def load_model(model_path: str) -> Any:
+    """
+    从文件加载模型
+    
+    Args:
+        model_path: 模型文件路径
+        
+    Returns:
+        加载的模型对象
+    """
+    if not os.path.exists(model_path):
+        raise FileNotFoundError(f"模型文件不存在: {model_path}")
+    
+    with open(model_path, 'rb') as f:
+        model = pickle.load(f)
+    
+    print(f"已加载模型: {model_path}")
+    return model
+
+
+def preprocess_text_simple(text: str) -> str:
+    """
+    简单的文本预处理函数，用于预测时的文本清洗
+    
+    Args:
+        text: 原始文本
+        
+    Returns:
+        清洗后的文本
+    """
+    # 数据清洗
+    text = re.sub("\{%.+?%\}", " ", text)           # 去除 {%xxx%}
+    text = re.sub("@.+?( |$)", " ", text)           # 去除 @xxx
+    text = re.sub("【.+?】", " ", text)              # 去除 【xx】
+    text = re.sub("\u200b", " ", text)              # 去除特殊字符
+    
+    # 删除表情符号
+    text = re.sub(r'[\U0001F600-\U0001F64F\U0001F300-\U0001F5FF\U0001F680-\U0001F6FF\U0001F1E0-\U0001F1FF\U00002600-\U000027BF\U0001f900-\U0001f9ff\U0001f018-\U0001f270\U0000231a-\U0000231b\U0000238d-\U0000238d\U000024c2-\U0001f251]+', '', text)
+    
+    # 多个空格合并为一个
+    text = re.sub(r"\s+", " ", text)
+    
+    return text.strip()