diff --git a/.gitignore b/.gitignore
index 10c57a2..ac842c8 100644
--- a/.gitignore
+++ b/.gitignore
@@ -12,3 +12,4 @@ model2/*
 *.pyz
 *.pywz
 .vscode
+.VSCodeCounter
\ No newline at end of file
diff --git a/spider/saveData.py b/spider/saveData.py
index 9c4c973..a2792c2 100644
--- a/spider/saveData.py
+++ b/spider/saveData.py
@@ -1,35 +1,141 @@
 import os
-from sqlalchemy import create_engine
 import pandas as pd
-from spiderDataPackage.settings import articleAddr,commentsAddr
-# from ..model.topicDefine import *
+from sqlalchemy import create_engine, inspect
+from sqlalchemy.engine import URL
+from getpass import getpass
+import logging
+import sys
 
-engine = create_engine('mysql+pymysql://XiaoXueQi:XiaoXueQi@47.92.235.6/Weibo_PublicOpinion_AnalysisSystem?charset=utf8mb4')
+# Configure logging: messages go to save_data.log and to the console.
+logging.basicConfig(
+    level=logging.INFO,
+    format='%(asctime)s [%(levelname)s] %(message)s',
+    handlers=[
+        logging.FileHandler("save_data.log"),
+        logging.StreamHandler()
+    ]
+)
 
-def saveData():
+# Assumes articleAddr and commentsAddr are absolute paths or paths relative to the script.
+from spiderDataPackage.settings import articleAddr, commentsAddr
+
+def get_db_connection_interactive():
+    """
+    Prompt on the terminal for the database connection parameters; pressing
+    Enter accepts the default shown in each prompt.
+
+    Returns a SQLAlchemy engine whose connection has been tested. Exits the
+    process with status 1 when the connection cannot be established.
+    """
+    print("请依次输入数据库连接信息(直接按回车使用默认值):")
+
+    host = input(" 1. 主机 (默认: localhost): ") or "localhost"
+    port_str = input(" 2. 端口 (默认: 3306): ") or "3306"
     try:
-        oldArticle = pd.read_sql('select * from article',engine)
+        port = int(port_str)
+    except ValueError:
+        logging.warning("端口号无效,使用默认端口 3306。")
+        port = 3306
+
+    user = input(" 3. 用户名 (默认: root): ") or "root"
+    password = getpass(" 4. 密码 (默认: 12345678): ") or "12345678"
+    db_name = input(" 5. 数据库名 (默认: Weibo_PublicOpinion_AnalysisSystem): ") or "Weibo_PublicOpinion_AnalysisSystem"
+
+    # Build the URL from its parts so that special characters in the
+    # credentials (e.g. '@' or '/') need no manual escaping.
+    connection_url = URL.create(
+        "mysql+pymysql",
+        username=user,
+        password=password,
+        host=host,
+        port=port,
+        database=db_name,
+        query={"charset": "utf8mb4"},
+    )
+
+    try:
+        engine = create_engine(connection_url)
+        # create_engine() is lazy; open a connection to verify the parameters.
+        with engine.connect():
+            logging.info(f"成功连接到数据库: {user}@{host}:{port}/{db_name}")
+        return engine
+    except Exception as e:
+        logging.error(f"无法连接到数据库: {e}")
+        sys.exit(1)
+
+def _read_table_if_exists(engine, table_name):
+    """
+    Return the full contents of ``table_name`` as a DataFrame, or None when
+    the table does not exist yet (e.g. on the very first import).
+    """
+    if inspect(engine).has_table(table_name):
+        return pd.read_sql(f'SELECT * FROM {table_name}', engine)
+    logging.warning(f"数据表 {table_name} 不存在,将仅使用CSV中的新数据。")
+    return None
+
+def saveData(engine):
+    """
+    Merge the CSV files written by the spider into the ``article`` and
+    ``comments`` tables, drop duplicates, write the result back and delete
+    the CSV files.
+
+    A missing table is treated as empty so the first run can create it.
+    Errors are logged rather than raised; the CSV files are deleted only
+    when the whole import succeeded.
+    """
+    try:
+        # Existing rows (None for a table that has not been created yet).
+        oldArticle = _read_table_if_exists(engine, 'article')
+        oldComment = _read_table_if_exists(engine, 'comments')
+        logging.info("成功从数据库读取旧的文章和评论数据。")
+
+        # Newly crawled rows.
         newArticle = pd.read_csv(articleAddr)
-        oldComment = pd.read_sql('select * from comments',engine)
         newComment = pd.read_csv(commentsAddr)
-
-        mergeArticle = pd.concat([newArticle,oldArticle],join='inner')
-        mergeComment = pd.concat([newComment,oldComment],join='inner')
-
-        mergeArticle.drop_duplicates(subset='id',keep='last',inplace=True)
-        mergeComment.drop_duplicates(subset='content',keep='last',inplace=True)
-
+        logging.info("成功从CSV文件读取新的文章和评论数据。")
+
+        # Merge new and old rows, skipping tables that did not exist.
+        mergeArticle = pd.concat(
+            [df for df in (newArticle, oldArticle) if df is not None],
+            ignore_index=True, sort=False)
+        mergeComment = pd.concat(
+            [df for df in (newComment, oldComment) if df is not None],
+            ignore_index=True, sort=False)
+        logging.info("成功合并新旧文章和评论数据。")
+
+        # De-duplicate: articles by id, comments by their text.
+        mergeArticle.drop_duplicates(subset='id', keep='last', inplace=True)
+        mergeComment.drop_duplicates(subset='content', keep='last', inplace=True)
+        logging.info("成功去除重复的文章和评论数据。")
+
+        # Write the merged data back, replacing the tables.
         mergeArticle.to_sql('article', con=engine, if_exists='replace', index=False)
         mergeComment.to_sql('comments', con=engine, if_exists='replace', index=False)
-    except:
-        newArticle = pd.read_csv(articleAddr)
-        newComment = pd.read_csv(commentsAddr)
-        newArticle.to_sql('article',con=engine,if_exists='replace',index=False)
-        newComment.to_sql('comments',con=engine,if_exists='replace',index=False)
+        logging.info("成功将合并后的数据保存回数据库。")
+
+    except pd.errors.EmptyDataError as e:
+        logging.error(f"读取CSV文件时出错: {e}")
+    except Exception as e:
+        logging.error(f"保存数据时出错: {e}")
+    else:
+        # Delete the CSV files only after a successful import.
+        try:
+            os.remove(articleAddr)
+            os.remove(commentsAddr)
+            logging.info("成功删除CSV文件。")
+        except Exception as e:
+            logging.warning(f"删除CSV文件时出错: {e}")
 
-    os.remove(articleAddr)
-    os.remove(commentsAddr)
-    # update_data()
+def main():
+    # Obtain the database engine interactively.
+    engine = get_db_connection_interactive()
+
+    # Import the crawled data.
+    saveData(engine)
+
+    # Release pooled connections (optional; the engine manages its own pool).
+    engine.dispose()
+    logging.info("数据库连接已关闭。")
 
 if __name__ == '__main__':
-    saveData()
\ No newline at end of file
+    main()
diff --git a/utils/query.py b/utils/query.py
index 1829d34..7386155 100644
--- a/utils/query.py
+++ b/utils/query.py
@@ -1,13 +1,122 @@
-from pymysql import *
-conn = connect(host='47.92.235.6',port=3306,user='XiaoXueQi',password='XiaoXueQi',database='Weibo_PublicOpinion_AnalysisSystem')
+import getpass
+import pymysql
+import logging
+import sys
+
+# Configure logging: messages go to database_operations.log and to the console.
+logging.basicConfig(
+    level=logging.INFO,
+    format='%(asctime)s [%(levelname)s] %(message)s',
+    handlers=[
+        logging.FileHandler("database_operations.log"),
+        logging.StreamHandler()
+    ]
+)
+
+def get_db_connection_interactive():
+    """
+    Prompt on the terminal for the database connection parameters; pressing
+    Enter accepts the default shown in each prompt.
+
+    Returns an open pymysql connection. Exits the process with status 1 when
+    the connection cannot be established.
+    """
+    print("请依次输入数据库连接信息(直接按回车使用默认值):")
+
+    host = input(" 1. 主机 (默认: localhost): ") or "localhost"
+    port_str = input(" 2. 端口 (默认: 3306): ") or "3306"
+    try:
+        port = int(port_str)
+    except ValueError:
+        logging.warning("端口号无效,使用默认端口 3306。")
+        port = 3306
+
+    user = input(" 3. 用户名 (默认: root): ") or "root"
+    password = getpass.getpass(" 4. 密码 (默认: 312517): ") or "312517"
+    db_name = input(" 5. 数据库名 (默认: Weibo_PublicOpinion_AnalysisSystem): ") or "Weibo_PublicOpinion_AnalysisSystem"
+
+    logging.info(f"尝试连接到数据库: {user}@{host}:{port}/{db_name}")
+
+    try:
+        # The default (tuple) cursor is kept on purpose: query() has always
+        # returned rows as tuples, and callers may index them by position.
+        connection = pymysql.connect(
+            host=host,
+            port=port,
+            user=user,
+            password=password,
+            database=db_name,
+            charset='utf8mb4'
+        )
+        logging.info("数据库连接成功。")
+        return connection
+    except pymysql.MySQLError as e:
+        logging.error(f"数据库连接失败: {e}")
+        sys.exit(1)
+
+# NOTE(review): the connection is opened at import time, so importing this
+# module prompts on the terminal. Kept because callers use the module-level
+# `conn` and `cursor`.
+conn = get_db_connection_interactive()
+
+# Shared cursor used by query().
 cursor = conn.cursor()
-def query(sql,params,type="no_select"):
-    params = tuple(params)
-    cursor.execute(sql,params)
-    conn.ping(reconnect=True)
-    if type != 'no_select':
-        data_list = cursor.fetchall()
-        conn.commit()
-        return data_list
-    else:
-        conn.commit()
+
+def query(sql, params=None, query_type="no_select"):
+    """
+    Execute a SQL statement on the shared module-level connection.
+
+    :param sql: SQL statement, using %s placeholders for parameters
+    :param params: optional sequence of parameter values
+    :param query_type: "no_select" (default) only commits; any other value
+                       also fetches and returns the result rows
+    :return: the fetched rows when query_type is not "no_select"; otherwise
+             None. None is also returned when the statement fails.
+    """
+    try:
+        # Revive a dropped connection *before* using it; pinging after
+        # execute() cannot rescue a statement that already failed.
+        conn.ping(reconnect=True)
+
+        if params:
+            cursor.execute(sql, tuple(params))
+        else:
+            cursor.execute(sql)
+
+        if query_type != "no_select":
+            data_list = cursor.fetchall()
+            conn.commit()
+            logging.info("查询成功,已获取数据。")
+            return data_list
+        else:
+            conn.commit()
+            logging.info("操作成功,已提交事务。")
+    except pymysql.MySQLError as e:
+        logging.error(f"执行SQL时出错: {e}")
+        conn.rollback()
+        return None
+
+def main():
+    # Example usage.
+
+    # Run a SELECT.
+    select_sql = "SELECT * FROM article LIMIT 5"
+    articles = query(select_sql, query_type="select")
+    if articles:
+        for article in articles:
+            print(article)
+
+    # Run an INSERT (adjust to the real table schema).
+    insert_sql = "INSERT INTO article (id, content) VALUES (%s, %s)"
+    new_article = (12345, "这是一条新的文章内容。")
+    result = query(insert_sql, params=new_article, query_type="no_select")
+    if result is None:
+        logging.info("插入操作完成。")
+
+    # Close the cursor and the connection.
+    cursor.close()
+    conn.close()
+    logging.info("数据库连接已关闭。")
+
+if __name__ == '__main__':
+    main()
diff --git a/wordCloudPicture.py b/wordCloudPicture.py
index 00f15ae..40b6b73 100644
--- a/wordCloudPicture.py
+++ b/wordCloudPicture.py
@@ -1,47 +1,123 @@
+import os
 import jieba
 from wordcloud import WordCloud
 import matplotlib.pyplot as plt
-from PIL import Image,ImageDraw
-from pymysql import *
-import json
+from PIL import Image
 import numpy as np
+import pymysql
+from getpass import getpass
+
 
 def stopWordList():
-    return [line.strip() for line in open('./model/stopWords.txt',encoding='utf8').readlines()]
+    """
+    Return the stop words listed in ./model/stopWords.txt, one per line.
+
+    If the file is large or read frequently, consider caching the result
+    to avoid re-reading it.
+    """
+    with open('./model/stopWords.txt', encoding='utf8') as f:
+        return [line.strip() for line in f]
 
-def get_img(field,tableName,targetImgSrc,resImgSrc):
-    con = connect(host='47.92.235.6',user='XiaoXueQi',password='XiaoXueQi',database='Weibo_PublicOpinion_AnalysisSystem',port=3306,charset='utf8mb4')
-    cuser = con.cursor()
-    sql = f'select {field} from {tableName}'
-    cuser.execute(sql)
-    data = cuser.fetchall()
-    text = ''
-    for item in data:
-        text += item[0]
-    cuser.close()
-    con.close()
-
-    cut = jieba.cut(text)
-    newCut = []
-    for word in cut:
-        if word not in stopWordList():newCut.append(word)
-    string = ' '.join(newCut)
-
-    img = Image.open(targetImgSrc)
+def generate_word_cloud(text, mask_path, font_path, output_path):
+    """Render a word cloud of ``text`` shaped by the mask image and save it to output_path."""
+    img = Image.open(mask_path)
     img_arr = np.array(img)
+
     wc = WordCloud(
         background_color="#fff",
         mask=img_arr,
-        font_path='STHUPO.TTF'
+        font_path=font_path
     )
-    wc.generate_from_text(string)
-
-    fig = plt.figure(1)
-    plt.imshow(wc)
+    wc.generate_from_text(text)
+    plt.figure(figsize=(8, 6))
+    plt.imshow(wc, interpolation='bilinear')
     plt.axis('off')
+    plt.savefig(output_path, dpi=300, bbox_inches='tight')
+    plt.close()  # Release the figure once it has been saved.
 
-    plt.savefig(resImgSrc,dpi=500)
-
-
-# get_img('content','comments','./static/comment.jpg','./static/commentCloud.jpg')
-get_img('content','article','./static/content.jpg','./static/contentCloud.jpg')
+def get_db_connection_interactive():
+    """
+    Prompt on the terminal for the database connection parameters; pressing
+    Enter accepts the default shown in each prompt.
+
+    Returns an open pymysql connection.
+    """
+    print("请依次输入数据库连接信息(直接按回车使用默认值):")
+    host = input(" 1. 主机 (默认: localhost): ") or "localhost"
+    port_str = input(" 2. 端口 (默认: 3306): ") or "3306"
+    try:
+        port = int(port_str)
+    except ValueError:
+        print("端口号无效,使用默认端口 3306。")
+        port = 3306
+
+    user = input(" 3. 用户名 (默认: root): ") or "root"
+    # Read the password without echoing it; the fallback now matches the
+    # default advertised in the prompt.
+    password = getpass(" 4. 密码 (默认: 312517): ") or "312517"
+    db_name = input(" 5. 数据库名 (默认: Weibo_PublicOpinion_AnalysisSystem): ") or "Weibo_PublicOpinion_AnalysisSystem"
+
+    print(f"\n即将连接到数据库: {user}@{host}:{port}/{db_name}\n")
+
+    return pymysql.connect(
+        host=host,
+        user=user,
+        password=password,
+        database=db_name,
+        port=port,
+        charset='utf8mb4'
+    )
+
+def get_img(field, table_name, target_img_src, res_img_src, connection, font_path='STHUPO.TTF'):
+    """
+    Fetch one text column from the database, segment it with jieba, drop
+    stop words and render the remaining words as a word cloud.
+
+    :param field: name of the column to read
+    :param table_name: name of the table to read from
+    :param target_img_src: path of the image used as the word-cloud mask
+    :param res_img_src: path the rendered word cloud is written to
+    :param connection: an open database connection (not closed here)
+    :param font_path: path of the font file used for rendering
+    """
+    # field and table_name are interpolated as identifiers; they must come
+    # from trusted code, never from user input.
+    sql = f'SELECT {field} FROM {table_name}'
+    # The context manager closes the cursor even if the query fails.
+    with connection.cursor() as cursor:
+        cursor.execute(sql)
+        data = cursor.fetchall()
+
+    # Each row is a 1-tuple (value,); skip NULLs, which cannot be joined.
+    text = ''.join(item[0] for item in data if item[0])
+
+    # Segment the text and filter out stop words.
+    cut_words = jieba.cut(text)
+    stop_words = set(stopWordList())
+    filtered_words = [word for word in cut_words if word not in stop_words]
+    final_text = ' '.join(filtered_words)
+
+    # Render the word cloud.
+    generate_word_cloud(final_text, target_img_src, font_path, res_img_src)
+
+def main():
+    # 1. Obtain the database connection (interactive input).
+    connection = get_db_connection_interactive()
+
+    # 2. Generate the word clouds that are needed,
+    #    e.g. from the content column of the article table.
+    try:
+        get_img(
+            field='content',
+            table_name='article',
+            target_img_src='./static/content.jpg',
+            res_img_src='./static/contentCloud.jpg',
+            connection=connection
+        )
+        print("词云生成完毕!")
+    finally:
+        # Close the database connection.
+        connection.close()
+
+if __name__ == '__main__':
+    main()