【yuqing.py】更新代码,能在文件夹下直接运行
This commit is contained in:
+45
-38
@@ -1,48 +1,55 @@
"""Label scraped comments with a SnowNLP sentiment class and append them to target.csv.

The module can be executed directly from inside its own folder: the project
root (two directory levels above this file) is appended to ``sys.path`` before
the project-local ``utils`` package is imported.
"""
from snownlp import SnowNLP  # Chinese sentiment analysis
import csv  # CSV reading/writing
import os  # path handling
import sys

# Absolute path of this file.
current_file_path = os.path.abspath(__file__)

# Directory that contains this file.
parent_dir = os.path.dirname(current_file_path)

# Parent of that directory, i.e. the project root.
project_root_dir = os.path.dirname(parent_dir)

# Make the project root importable so that `utils` resolves when this file is
# run as a script rather than imported as part of the package.
sys.path.append(project_root_dir)

# Must come after the sys.path tweak above.
from utils.getPublicData import getAllCommentsData  # noqa: E402  project helper returning the comment rows


def _sentiment_label(value):
    """Map a SnowNLP sentiment value to its label, or ``None`` if it is not comparable.

    ``> 0.5`` is positive, ``== 0.5`` is neutral, ``< 0.5`` is negative. A value
    that satisfies none of the comparisons (e.g. NaN) yields ``None`` so the
    caller can skip it.
    """
    if value > 0.5:
        return '正面'
    if value == 0.5:
        return '中性'
    if value < 0.5:
        return '负面'
    return None


def targetFile():
    """Score every comment and append ``[content, label]`` rows to ``target.csv``.

    Each row returned by ``getAllCommentsData()`` is expected to hold the
    comment text at index 4 (layout noted by the original author:
    articleId, created_at, likes_counts, region, content, authorName,
    authorGender, authorAddress, authorAvatar) -- NOTE(review): confirm against
    ``utils.getPublicData``.

    The output path is relative, so the file is created in (or appended to in)
    the current working directory. Rows are appended, never truncated, so
    running this twice duplicates the data.
    """
    target_path = 'target.csv'
    comments = getAllCommentsData()

    rate_data = []
    for comment in comments:
        content = comment[4]
        label = _sentiment_label(SnowNLP(content).sentiments)
        if label is not None:
            rate_data.append([content, label])

    # Nothing to write: do not create an empty file.
    if not rate_data:
        return

    # Open the file once for the whole batch instead of once per row.
    with open(target_path, 'a+', encoding='utf8', newline='') as f:
        csv.writer(f).writerows(rate_data)


def main():
    """Entry point: build the labelled comment file."""
    targetFile()


if __name__ == '__main__':
    main()
Reference in New Issue
Block a user