# Reconstructed from a git format-patch whose line breaks had been lost.
#   Commit:  a94d2cdf80a55d3bc50f52af3a0c4a86c0bbff7f
#   Author:  juanboy <2980526980@qq.com>
#   Date:    Tue, 2 Jul 2024 17:49:46 +0800
#   Subject: [PATCH] [cutComments.py] define the word-segmentation and
#            word-frequency functions
#   Adds:    utils/cipingTotal.py and utils/cutComments.py (both new files)
#
# The two modules are kept as separate sections below, each with its own
# ``__main__`` guard, so they can be split back into their own files.
# jieba and the project-local ``utils.getPublicData`` are imported inside the
# functions that use them so the pure helpers can be imported without them.

import re
from collections import Counter


# ---------------------------------------------------------------------------
# utils/cipingTotal.py
# ---------------------------------------------------------------------------

# A token is discarded when it contains any digit or any non-word character.
_NOISE = re.compile(r"[\d\W]")


def _top_words(tokens, limit=100):
    """Return up to *limit* ``(word, count)`` pairs, most frequent first.

    Tokens of length 1, and tokens containing a digit or a non-word
    character, are ignored.  Fewer than *limit* pairs are returned when the
    input does not contain that many distinct words.
    """
    kept = (tok for tok in tokens if len(tok) > 1 and not _NOISE.search(tok))
    return Counter(kept).most_common(limit)


def main():
    """Write the 100 most frequent words of ``./cutComments.txt``.

    The text is segmented with ``jieba.cut(..., cut_all=True)``, filtered,
    counted, and written to ``./cipingTotal.csv``.  Each output line is the
    ``repr`` of a ``(word, count)`` tuple, e.g. ``('word', 12)``; that format
    is kept as-is because consumers of the file are not visible from here.

    When fewer than 100 distinct words survive filtering, only those are
    written (the previous index loop raised ``IndexError`` in that case).
    """
    import jieba

    with open('./cutComments.txt', 'r', encoding='utf8') as reader:
        text = reader.read()

    # Segment, filter and count in a single pass over the token stream.
    ranked = _top_words(jieba.cut(text, cut_all=True))

    with open('./cipingTotal.csv', 'w', encoding='utf8') as result:
        for entry in ranked:
            print(entry, file=result)


if __name__ == '__main__':
    main()


# ---------------------------------------------------------------------------
# utils/cutComments.py
# ---------------------------------------------------------------------------

# File that receives the segmented comment text (opened in append mode).
targetTxt = 'cutComments.txt'


def stopWordList(path='./stopWords.txt'):
    """Return the stop words stored one per line in *path*.

    Surrounding whitespace is stripped from every line; blank lines therefore
    yield empty strings.  *path* defaults to the location that was previously
    hard-coded, so existing zero-argument calls behave as before.
    """
    with open(path, encoding='utf8') as handle:
        return [line.strip() for line in handle]


def seg_depart(sentence):
    """Return the comment text of *sentence* with stop words removed.

    *sentence* is an iterable of indexable rows; element ``[4]`` of each row
    is taken as the text (presumably the comment body -- verify against
    ``getAllCommentsData``).  The texts are joined with spaces, segmented
    with ``jieba.cut``, and every token that is neither a stop word nor a tab
    is concatenated back together without a separator.
    """
    import jieba

    text = " ".join(row[4] for row in sentence).strip()

    # A set gives O(1) membership tests; the tab is dropped like a stop word.
    blocked = set(stopWordList())
    blocked.add('\t')

    return ''.join(tok for tok in jieba.cut(text) if tok not in blocked)


def writer_comments_cuts():
    """Append the segmented comment corpus to ``targetTxt`` as one line.

    The stop-word-filtered text from ``seg_depart`` is segmented again with
    ``cut_all=True`` and the tokens are joined with single spaces.  The
    success message is printed only after the file has been closed.
    """
    import jieba
    from utils.getPublicData import getAllCommentsData

    cleaned = seg_depart(getAllCommentsData())
    output = ' '.join(jieba.cut(cleaned, cut_all=True))

    with open(targetTxt, 'a', encoding='utf-8') as targetFile:
        targetFile.write(output + '\n')
    print('写入成功')


if __name__ == '__main__':
    writer_comments_cuts()