A new multilingual sentiment analysis module has been added.

2025-08-04 19:49:59 +08:00
parent 645242a552
commit bd60e2ed1b
7 changed files with 329 additions and 2 deletions
@@ -8,6 +8,8 @@ def preprocess_text(text):
    text = re.sub(r"@.+?( |$)", " ", text)           # 去除 @xxx
    text = re.sub(r"【.+?】", " ", text)              # 去除 【xx】
    text = re.sub(r"\u200b", " ", text)              # 去除特殊字符
+    # 删除表情符号
+    text = re.sub(r'[\U0001F600-\U0001F64F\U0001F300-\U0001F5FF\U0001F680-\U0001F6FF\U0001F1E0-\U0001F1FF\U00002600-\U000027BF\U0001f900-\U0001f9ff\U0001f018-\U0001f270\U0000231a-\U0000231b\U0000238d-\U0000238d\U000024c2-\U0001f251]+', '', text)
    text = re.sub(r"\s+", " ", text)                 # 多个空格合并
    return text.strip()

@@ -7,6 +7,8 @@ def preprocess_text(text):
    text = re.sub(r"@.+?( |$)", " ", text)           # 去除 @xxx
    text = re.sub(r"【.+?】", " ", text)              # 去除 【xx】
    text = re.sub(r"\u200b", " ", text)              # 去除特殊字符
+    # 删除表情符号
+    text = re.sub(r'[\U0001F600-\U0001F64F\U0001F300-\U0001F5FF\U0001F680-\U0001F6FF\U0001F1E0-\U0001F1FF\U00002600-\U000027BF\U0001f900-\U0001f9ff\U0001f018-\U0001f270\U0000231a-\U0000231b\U0000238d-\U0000238d\U000024c2-\U0001f251]+', '', text)
    text = re.sub(r"\s+", " ", text)                 # 多个空格合并
    return text.strip()

@@ -1,6 +1,18 @@
 import torch
 from transformers import BertTokenizer
 from train import GPT2ClassifierWithAdapter
+import re
+
+def preprocess_text(text):
+    """Clean raw text: drop {%..%} tags, @mentions, 【..】 tags and emoji, then collapse whitespace and strip."""
+    text = re.sub(r"\{%.+?%\}", " ", text)           # drop {%xxx%} template tags
+    text = re.sub(r"@.+?( |$)", " ", text)           # drop @xxx up to the next space or end of text
+    text = re.sub(r"【.+?】", " ", text)              # drop 【xx】 bracketed tags
+    text = re.sub(r"\u200b", " ", text)              # replace zero-width spaces with a plain space
+    # Strip emoji only. The former \U000024c2-\U0001f251 range also spanned the CJK blocks and erased Chinese text.
+    text = re.sub(r'[\U0001F600-\U0001F64F\U0001F300-\U0001F5FF\U0001F680-\U0001F6FF\U0001F1E0-\U0001F1FF\U00002600-\U000027BF\U0001f900-\U0001f9ff\U0001f018-\U0001f270\U0000231a-\U0000231b\U0000238d\U000024c2\U00002b00-\U00002bff\U0000fe0f]+', '', text)
+    text = re.sub(r"\s+", " ", text)                 # collapse whitespace runs into one space
+    return text.strip()

 def main():
    # 设置设备
@@ -31,9 +43,12 @@ def main():
        if text.lower() == 'q':
            break
        
+        # 预处理文本
+        processed_text = preprocess_text(text)
+        
        # 对文本进行编码
        encoding = tokenizer(
-            text,
+            processed_text,
            max_length=128,
            padding='max_length',
            truncation=True,
@@ -2,6 +2,18 @@ import torch
 from transformers import GPT2ForSequenceClassification, BertTokenizer
 from peft import PeftModel
 import os
+import re
+
+def preprocess_text(text):
+    """Clean raw text: drop {%..%} tags, @mentions, 【..】 tags and emoji, then collapse whitespace and strip."""
+    text = re.sub(r"\{%.+?%\}", " ", text)           # drop {%xxx%} template tags
+    text = re.sub(r"@.+?( |$)", " ", text)           # drop @xxx up to the next space or end of text
+    text = re.sub(r"【.+?】", " ", text)              # drop 【xx】 bracketed tags
+    text = re.sub(r"\u200b", " ", text)              # replace zero-width spaces with a plain space
+    # Strip emoji only. The former \U000024c2-\U0001f251 range also spanned the CJK blocks and erased Chinese text.
+    text = re.sub(r'[\U0001F600-\U0001F64F\U0001F300-\U0001F5FF\U0001F680-\U0001F6FF\U0001F1E0-\U0001F1FF\U00002600-\U000027BF\U0001f900-\U0001f9ff\U0001f018-\U0001f270\U0000231a-\U0000231b\U0000238d\U000024c2\U00002b00-\U00002bff\U0000fe0f]+', '', text)
+    text = re.sub(r"\s+", " ", text)                 # collapse whitespace runs into one space
+    return text.strip()

 def main():
    # 设置设备
@@ -66,9 +78,12 @@ def main():
            continue
        
        try:
+            # 预处理文本
+            processed_text = preprocess_text(text)
+            
            # 对文本进行编码
            encoding = tokenizer(
-                text,
+                processed_text,
                max_length=128,
                padding='max_length',
                truncation=True,