Files
bettafish-company/WeiboSentiment_MachineLearning/utils.py
T

69 lines
2.3 KiB
Python

# -*- coding: utf-8 -*-
import jieba
import re
# Stop-word list, read once at import time: one entry per line of the file,
# with surrounding whitespace removed. The path is resolved relative to the
# current working directory.
with open("data/stopwords.txt", "r", encoding="utf8") as stopword_file:
    stopwords = [entry.strip() for entry in stopword_file]
def load_corpus(path):
    """
    Load a labelled corpus and preprocess every text with ``processing``.

    Each non-blank line is split on its first two commas into three fields:
    the first is ignored, the second is parsed as an integer sentiment label
    and the third is the raw text handed to ``processing``.

    Args:
        path: Path of the UTF-8 encoded corpus file.

    Returns:
        A list of ``(processed_text, sentiment)`` tuples in file order.

    Raises:
        ValueError: If a non-blank line has fewer than three comma-separated
            fields, or its label field is not an integer.
    """
    data = []
    with open(path, "r", encoding="utf8") as f:
        for line in f:
            # A blank line (e.g. a trailing newline at the end of the file)
            # carries no sample; skip it instead of failing on the unpack.
            if not line.strip():
                continue
            # maxsplit=2 keeps commas that appear inside the text intact.
            _, sentiment, content = line.split(",", 2)
            data.append((processing(content), int(sentiment)))
    return data
def load_corpus_bert(path):
    """
    Load a labelled corpus and clean every text with ``processing_bert``.

    Each non-blank line is split on its first two commas into three fields:
    the first is ignored, the second is parsed as an integer sentiment label
    and the third is the raw text handed to ``processing_bert``.

    Args:
        path: Path of the UTF-8 encoded corpus file.

    Returns:
        A list of ``(cleaned_text, sentiment)`` tuples in file order.

    Raises:
        ValueError: If a non-blank line has fewer than three comma-separated
            fields, or its label field is not an integer.
    """
    data = []
    with open(path, "r", encoding="utf8") as f:
        for line in f:
            # A blank line (e.g. a trailing newline at the end of the file)
            # carries no sample; skip it instead of failing on the unpack.
            if not line.strip():
                continue
            # maxsplit=2 keeps commas that appear inside the text intact.
            _, sentiment, content = line.split(",", 2)
            data.append((processing_bert(content), int(sentiment)))
    return data
def processing(text):
    """
    Clean a raw text and segment it into space-separated words.

    This is the default preprocessing step; replace it to suit other needs.

    Steps:
      1. Replace ``{%...%}`` markers, ``@name`` mentions, ``【...】`` spans and
         zero-width spaces with a single space each.
      2. Segment the text with ``jieba.lcut`` and keep only the tokens for
         which ``str.isalpha()`` is true (punctuation, digits and whitespace
         are dropped).
      3. Glue the negation word ``不`` onto the token that follows it.
      4. Join the remaining tokens with single spaces.

    Args:
        text: Raw text of one sample.

    Returns:
        The processed tokens joined by single spaces.
    """
    # --- Cleaning -----------------------------------------------------------
    # Raw strings keep the regex escapes (``\{``) out of Python's own string
    # escape handling.
    text = re.sub(r"\{%.+?%\}", " ", text)  # {%xxx%}: geo-location, Weibo topics, ...
    text = re.sub(r"@.+?( |$)", " ", text)  # @xxx: user mentions
    text = re.sub(r"【.+?】", " ", text)  # 【xx】: usually not written by the user
    text = text.replace("\u200b", " ")  # zero-width space, a known bad case in this dataset

    # --- Segmentation ---------------------------------------------------------
    tokens = [tok for tok in jieba.lcut(text) if tok.isalpha()]

    # --- Negation handling ----------------------------------------------------
    # Merge each standalone `不` with the token right after it so the negation
    # stays attached to the word it modifies. A trailing `不` has nothing to
    # merge with and is kept unchanged (``next`` falls back to "").
    merged = []
    remaining = iter(tokens)
    for tok in remaining:
        if tok == "不":
            tok += next(remaining, "")
        merged.append(tok)

    return " ".join(merged)
def processing_bert(text):
    """
    Clean a raw text without segmenting it (used by ``load_corpus_bert``).

    This is the default cleaning step; replace it to suit other needs.
    ``{%...%}`` markers, ``@name`` mentions, ``【...】`` spans and zero-width
    spaces are each replaced with a single space; everything else is left
    untouched.

    Args:
        text: Raw text of one sample.

    Returns:
        The cleaned text.
    """
    # Raw strings keep the regex escapes (``\{``) out of Python's own string
    # escape handling.
    text = re.sub(r"\{%.+?%\}", " ", text)  # {%xxx%}: geo-location, Weibo topics, ...
    text = re.sub(r"@.+?( |$)", " ", text)  # @xxx: user mentions
    text = re.sub(r"【.+?】", " ", text)  # 【xx】: usually not written by the user
    text = text.replace("\u200b", " ")  # zero-width space, a known bad case in this dataset
    return text