69 lines
2.3 KiB
Python
69 lines
2.3 KiB
Python
# -*- coding: utf-8 -*-
|
|
import jieba
|
|
import re
|
|
|
|
|
|
# Module-level stopword list: one entry per line of data/stopwords.txt,
# with surrounding whitespace (including the newline) stripped.
# The file is read once, when this module is imported.
with open("data/stopwords.txt", "r", encoding="utf8") as stopword_file:
    stopwords = [entry.strip() for entry in stopword_file]
|
|
|
|
|
|
def load_corpus(path):
    """
    Load a labelled corpus and preprocess every text with ``processing``.

    Each non-blank line of the file is split on its first two commas into
    three fields: the first field is ignored, the second is parsed with
    ``int`` as the label, and the remainder (which may itself contain
    commas) is the raw text.

    :param path: path of the UTF-8 encoded corpus file
    :return: list of ``(processed_text, label)`` tuples in file order
    :raises ValueError: if a non-blank line has fewer than three
        comma-separated fields, or its label field is not an integer
    """
    data = []
    with open(path, "r", encoding="utf8") as f:
        for line in f:
            # A blank line (e.g. a trailing empty line) carries no record;
            # skip it instead of failing on the three-way unpack below.
            if not line.strip():
                continue
            # maxsplit=2 keeps commas inside the text itself intact.
            _, sentiment, content = line.split(",", 2)
            data.append((processing(content), int(sentiment)))
    return data
|
|
|
|
|
|
def load_corpus_bert(path):
    """
    Load a labelled corpus and clean every text with ``processing_bert``.

    Each non-blank line of the file is split on its first two commas into
    three fields: the first field is ignored, the second is parsed with
    ``int`` as the label, and the remainder (which may itself contain
    commas) is the raw text.

    :param path: path of the UTF-8 encoded corpus file
    :return: list of ``(cleaned_text, label)`` tuples in file order
    :raises ValueError: if a non-blank line has fewer than three
        comma-separated fields, or its label field is not an integer
    """
    data = []
    with open(path, "r", encoding="utf8") as f:
        for line in f:
            # A blank line (e.g. a trailing empty line) carries no record;
            # skip it instead of failing on the three-way unpack below.
            if not line.strip():
                continue
            # maxsplit=2 keeps commas inside the text itself intact.
            _, sentiment, content = line.split(",", 2)
            data.append((processing_bert(content), int(sentiment)))
    return data
|
|
|
|
|
|
def processing(text):
    """
    Clean a raw post, tokenize it, and return the tokens joined by spaces.

    Adapt this function to your own preprocessing needs.

    Steps:
      1. Replace ``{%...%}`` markers, ``@name`` mentions, ``【...】`` spans
         and zero-width spaces (U+200B) with a single space each.
      2. Tokenize with ``jieba.lcut`` and keep only tokens for which
         ``str.isalpha()`` is true.
      3. Glue every standalone negation token "不" onto the token that
         follows it; a "不" that is the last token is left unchanged.

    :param text: raw text of one post
    :return: the processed tokens joined by single spaces
    """
    # --- Cleaning ---
    # Raw strings keep the regex escapes (\{ and \}) out of Python's own
    # string-escape handling, which warns about unknown escapes.
    text = re.sub(r"\{%.+?%\}", " ", text)  # {%xxx%}: geo-location, Weibo topics, etc.
    text = re.sub(r"@.+?( |$)", " ", text)  # @xxx: user names
    text = re.sub(r"【.+?】", " ", text)  # 【xx】: content usually not written by the user
    text = text.replace("\u200b", " ")  # zero-width space: a known bad case in this dataset

    # --- Tokenization ---
    tokens = [w for w in jieba.lcut(text) if w.isalpha()]

    # --- Negation handling ---
    # Single left-to-right pass: a "不" with a successor consumes that
    # successor (even if it is another "不"); a trailing "不" stays as-is.
    merged = []
    i = 0
    n = len(tokens)
    while i < n:
        if tokens[i] == "不" and i + 1 < n:
            merged.append(tokens[i] + tokens[i + 1])
            i += 2
        else:
            merged.append(tokens[i])
            i += 1

    # Join the tokens into one space-separated string.
    return " ".join(merged)
|
|
|
|
|
|
def processing_bert(text):
    """
    Clean a raw post without tokenizing it.

    Adapt this function to your own preprocessing needs.

    Each ``{%...%}`` marker, ``@name`` mention (up to and including the
    next space, or up to the end of the line), ``【...】`` span and
    zero-width space (U+200B) is replaced by a single space. All other
    characters are returned unchanged.

    :param text: raw text of one post
    :return: the cleaned text
    """
    # Raw strings keep the regex escapes (\{ and \}) out of Python's own
    # string-escape handling, which warns about unknown escapes.
    text = re.sub(r"\{%.+?%\}", " ", text)  # {%xxx%}: geo-location, Weibo topics, etc.
    text = re.sub(r"@.+?( |$)", " ", text)  # @xxx: user names
    text = re.sub(r"【.+?】", " ", text)  # 【xx】: content usually not written by the user
    text = text.replace("\u200b", " ")  # zero-width space: a known bad case in this dataset
    return text