From e58f105761cabeec154a9352c1f0e03eaec09780 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=88=92=E9=85=92=E7=9A=84=E6=9D=8E=E7=99=BD?= <670939375@qq.com> Date: Thu, 9 Jan 2025 23:30:15 +0800 Subject: [PATCH] More convenient project initialization, with database initialization added to app.py. --- app.log | 2 + app.py | 137 +++++++++++++++++++++++++++++++++------- database_operations.log | 0 spider/saveData.py | 72 ++++++++++----------- utils/query.py | 66 +++++++++---------- 5 files changed, 186 insertions(+), 91 deletions(-) create mode 100644 app.log create mode 100644 database_operations.log diff --git a/app.log b/app.log new file mode 100644 index 0000000..c7f343d --- /dev/null +++ b/app.log @@ -0,0 +1,2 @@ +2025-01-09 23:29:06,246 [INFO] 尝试连接到数据库: root@localhost:3306/Weibo_PublicOpinion_AnalysisSystem +2025-01-09 23:29:06,346 [ERROR] 数据库连接失败: (1045, "Access denied for user 'root'@'localhost' (using password: YES)") diff --git a/app.py b/app.py index dcf5e39..86b2594 100644 --- a/app.py +++ b/app.py @@ -1,10 +1,104 @@ -from flask import Flask, session, request, redirect, render_template -import re -from apscheduler.schedulers.background import BackgroundScheduler -import subprocess import os -from pytz import utc +import re import logging +import getpass +import pymysql +import subprocess +from flask import Flask, session, request, redirect, render_template +from apscheduler.schedulers.background import BackgroundScheduler +from pytz import utc + +# 鍒濆鍖栨棩蹇楄褰 +logging.basicConfig( + level=logging.INFO, + format='%(asctime)s [%(levelname)s] %(message)s', + handlers=[ + logging.FileHandler("app.log"), + logging.StreamHandler() + ] +) + +def get_db_connection_interactive(): + """ + 閫氳繃缁堢浜や簰鑾峰彇鏁版嵁搴撹繛鎺ュ弬鏁帮紝鑻ユ寜鍥炶溅鍒欎娇鐢ㄩ粯璁ゅ笺 + 杩斿洖涓涓繛鎺ュ璞° + """ + print("璇蜂緷娆¤緭鍏ユ暟鎹簱杩炴帴淇℃伅锛堢洿鎺ユ寜鍥炶溅浣跨敤榛樿鍊硷級锛") + + host = input(" 1. 涓绘満 (榛樿: localhost): ") or "localhost" + port_str = input(" 2. 绔彛 (榛樿: 3306): ") or "3306" + try: + port = int(port_str) + except ValueError: + logging.warning("绔彛鍙锋棤鏁堬紝浣跨敤榛樿绔彛 3306銆") + port = 3306 + + user = input(" 3. 鐢ㄦ埛鍚 (榛樿: root): ") or "root" + password = getpass.getpass(" 4. 瀵嗙爜 (榛樿: 12345678): ") or "12345678" + db_name = input(" 5. 鏁版嵁搴撳悕 (榛樿: Weibo_PublicOpinion_AnalysisSystem): ") or "Weibo_PublicOpinion_AnalysisSystem" + + logging.info(f"灏濊瘯杩炴帴鍒版暟鎹簱: {user}@{host}:{port}/{db_name}") + + try: + connection = pymysql.connect( + host=host, + port=port, + user=user, + password=password, + database=db_name, + charset='utf8mb4', + cursorclass=pymysql.cursors.DictCursor # 杩斿洖瀛楀吀鏍煎紡 + ) + logging.info("鏁版嵁搴撹繛鎺ユ垚鍔熴") + return connection + except pymysql.MySQLError as e: + logging.error(f"鏁版嵁搴撹繛鎺ュけ璐: {e}") + exit(1) + +def initialize_database(connection, sql_file_path): + """ + 鎵ц SQL 鏂囦欢涓殑璇彞浠ュ垵濮嬪寲鏁版嵁搴撱 + + :param connection: 宸插缓绔嬬殑鏁版嵁搴撹繛鎺 + :param sql_file_path: SQL 鏂囦欢鐨勮矾寰 + """ + try: + with open(sql_file_path, 'r', encoding='utf8') as file: + sql_commands = file.read() + + with connection.cursor() as cursor: + for statement in sql_commands.split(';'): + statement = statement.strip() + if statement: + cursor.execute(statement) + connection.commit() + logging.info("鏁版嵁搴撳垵濮嬪寲鎴愬姛銆") + except FileNotFoundError: + logging.error(f"SQL 鏂囦欢鏈壘鍒: {sql_file_path}") + exit(1) + except pymysql.MySQLError as e: + logging.error(f"鎵ц SQL 鏃跺嚭閿: {e}") + connection.rollback() + exit(1) + except Exception as e: + logging.error(f"鍒濆鍖栨暟鎹簱鏃跺嚭閿: {e}") + connection.rollback() + exit(1) + +def prompt_first_run(): + """ + 璇㈤棶鐢ㄦ埛鏄惁棣栨杩愯锛岄渶瑕佸垵濮嬪寲鏁版嵁搴撱 + + :return: Boolean锛孴rue 琛ㄧず闇瑕佸垵濮嬪寲鏁版嵁搴 + """ + while True: + choice = input("鏄惁棣栨杩愯璇ラ」鐩紝闇瑕佸垵濮嬪寲鏁版嵁搴擄紵(Y/n): ").strip().lower() + if choice in ['y', 'yes', '']: + return True + elif choice in ['n', 'no']: + return False + else: + print("璇疯緭鍏 Y 鎴 N銆") # 鍒濆鍖 Flask 搴旂敤 app = Flask(__name__) @@ -14,25 +108,13 @@ app.secret_key = 'this is secret_key you know ?' # 璁剧疆 Flask 鐨勫瘑閽ワ紝鐢 from views.page import page from views.user import user app.register_blueprint(page.pb) # 娉ㄥ唽椤甸潰钃濆浘 -app.register_blueprint(user.ub) # 娉ㄥ唽鐢ㄦ埛钃濆浘 +app.register_blueprint(user.ub) # 娉ㄥ唽鐢ㄦ埛钃濆浘 # 棣栭〉璺敱锛屾竻绌 session @app.route('/') def hello_world(): - return session.clear() # 娓呯┖ session锛岀敤鎴烽鍑虹櫥褰 - -""" -@app.before_request -def before_reuqest(): - pat = re.compile(r'^/static') # 姝e垯鍖归厤闈欐佹枃浠惰矾寰 - if re.search(pat, request.path): # 濡傛灉鏄潤鎬佹枃浠讹紝鐩存帴杩斿洖 - return - elif request.path == '/user/login' or request.path == '/user/register': # 鐧诲綍鎴栨敞鍐岄〉闈㈡棤闇楠岃瘉 - return - elif session.get('username'): # 濡傛灉 session 涓湁鐢ㄦ埛鍚嶏紝鍒欏厑璁哥户缁 - return - return redirect('/user/login') # 鍚﹀垯閲嶅畾鍚戝埌鐧诲綍椤甸潰 -""" + session.clear() # 娓呯┖ session锛岀敤鎴烽鍑虹櫥褰 + return "Session Cleared" # 涓棿浠讹細澶勭悊璇锋眰鍓嶇殑閫昏緫 @app.before_request @@ -79,6 +161,19 @@ def run_script(): # 涓荤▼搴忓叆鍙 if __name__ == '__main__': + # 妫娴嬫槸鍚﹂渶瑕佸垵濮嬪寲鏁版嵁搴 + if prompt_first_run(): + # 鑾峰彇鏁版嵁搴撹繛鎺 + connection = get_db_connection_interactive() + + # 鎵ц鏁版嵁搴撳垵濮嬪寲 + sql_file = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'createTables.sql') + initialize_database(connection, sql_file) + + # 鍏抽棴鏁版嵁搴撹繛鎺 + connection.close() + logging.info("鏁版嵁搴撹繛鎺ュ凡鍏抽棴銆") + # 璁剧疆瀹氭椂浠诲姟锛屽畾鏈熸墽琛岀埇铏剼鏈 scheduler = BackgroundScheduler(timezone=utc) # 鍒涘缓鍚庡彴浠诲姟璋冨害鍣 scheduler.add_job(run_script, 'interval', hours=5) # 姣5灏忔椂鎵ц涓娆$埇铏剼鏈 @@ -90,8 +185,6 @@ if __name__ == '__main__': scheduler.shutdown() # 纭繚鍦ㄥ簲鐢ㄥ叧闂椂鍏抽棴璋冨害鍣 # 璁剧疆鏃ュ織璁板綍锛屾崟鑾峰簲鐢ㄧ殑璇锋眰淇℃伅 -logging.basicConfig(level=logging.INFO) # 閰嶇疆鏃ュ織璁板綍锛岃缃棩蹇楃骇鍒负 INFO - @app.before_request def log_request_info(): # 璁板綍姣忔璇锋眰鐨勪俊鎭紝渚夸簬璋冭瘯鍜岀洃鎺 diff --git a/database_operations.log b/database_operations.log new file mode 100644 index 0000000..e69de29 diff --git a/spider/saveData.py b/spider/saveData.py index a2792c2..ff88cb1 100644 --- a/spider/saveData.py +++ b/spider/saveData.py @@ -4,7 +4,7 @@ from sqlalchemy import create_engine from getpass import getpass import logging -# 配置日志 +# 閰嶇疆鏃ュ織 logging.basicConfig( level=logging.INFO, format='%(asctime)s [%(levelname)s] %(message)s', @@ -14,95 +14,95 @@ logging.basicConfig( ] ) -# 假设 articleAddr 和 commentsAddr 是绝对路径或相对于脚本的路径 +# 鍋囪 articleAddr 鍜 commentsAddr 鏄粷瀵硅矾寰勬垨鐩稿浜庤剼鏈殑璺緞 from spiderDataPackage.settings import articleAddr, commentsAddr def get_db_connection_interactive(): """ - 通过终端交互获取数据库连接参数,若按回车则使用默认值。 - 返回 SQLAlchemy 的数据库引擎。 + 閫氳繃缁堢浜や簰鑾峰彇鏁版嵁搴撹繛鎺ュ弬鏁帮紝鑻ユ寜鍥炶溅鍒欎娇鐢ㄩ粯璁ゅ笺 + 杩斿洖 SQLAlchemy 鐨勬暟鎹簱寮曟搸銆 """ - print("请依次输入数据库连接信息(直接按回车使用默认值):") + print("璇蜂緷娆¤緭鍏ユ暟鎹簱杩炴帴淇℃伅锛堢洿鎺ユ寜鍥炶溅浣跨敤榛樿鍊硷級锛") - host = input(" 1. 主机 (默认: localhost): ") or "localhost" - port_str = input(" 2. 端口 (默认: 3306): ") or "3306" + host = input(" 1. 涓绘満 (榛樿: localhost): ") or "localhost" + port_str = input(" 2. 绔彛 (榛樿: 3306): ") or "3306" try: port = int(port_str) except ValueError: - logging.warning("端口号无效,使用默认端口 3306。") + logging.warning("绔彛鍙锋棤鏁堬紝浣跨敤榛樿绔彛 3306銆") port = 3306 - user = input(" 3. 用户名 (默认: root): ") or "root" - password = getpass(" 4. 密码 (默认: 12345678): ") or "12345678" - db_name = input(" 5. 数据库名 (默认: Weibo_PublicOpinion_AnalysisSystem): ") or "Weibo_PublicOpinion_AnalysisSystem" + user = input(" 3. 鐢ㄦ埛鍚 (榛樿: root): ") or "root" + password = getpass(" 4. 瀵嗙爜 (榛樿: 12345678): ") or "12345678" + db_name = input(" 5. 鏁版嵁搴撳悕 (榛樿: Weibo_PublicOpinion_AnalysisSystem): ") or "Weibo_PublicOpinion_AnalysisSystem" - # 构建数据库连接字符串 + # 鏋勫缓鏁版嵁搴撹繛鎺ュ瓧绗︿覆 connection_str = f"mysql+pymysql://{user}:{password}@{host}:{port}/{db_name}?charset=utf8mb4" try: engine = create_engine(connection_str) - # 测试连接 + # 娴嬭瘯杩炴帴 with engine.connect() as connection: - logging.info(f"成功连接到数据库: {user}@{host}:{port}/{db_name}") + logging.info(f"鎴愬姛杩炴帴鍒版暟鎹簱: {user}@{host}:{port}/{db_name}") return engine except Exception as e: - logging.error(f"无法连接到数据库: {e}") + logging.error(f"鏃犳硶杩炴帴鍒版暟鎹簱: {e}") exit(1) def saveData(engine): """ - 从数据库和CSV文件读取数据,合并后去重并保存回数据库。 - 最后删除CSV文件。 + 浠庢暟鎹簱鍜孋SV鏂囦欢璇诲彇鏁版嵁锛屽悎骞跺悗鍘婚噸骞朵繚瀛樺洖鏁版嵁搴撱 + 鏈鍚庡垹闄SV鏂囦欢銆 """ try: - # 读取旧数据 + # 璇诲彇鏃ф暟鎹 oldArticle = pd.read_sql('SELECT * FROM article', engine) oldComment = pd.read_sql('SELECT * FROM comments', engine) - logging.info("成功从数据库读取旧的文章和评论数据。") + logging.info("鎴愬姛浠庢暟鎹簱璇诲彇鏃х殑鏂囩珷鍜岃瘎璁烘暟鎹") - # 读取新数据 + # 璇诲彇鏂版暟鎹 newArticle = pd.read_csv(articleAddr) newComment = pd.read_csv(commentsAddr) - logging.info("成功从CSV文件读取新的文章和评论数据。") + logging.info("鎴愬姛浠嶤SV鏂囦欢璇诲彇鏂扮殑鏂囩珷鍜岃瘎璁烘暟鎹") - # 合并数据 + # 鍚堝苟鏁版嵁 mergeArticle = pd.concat([newArticle, oldArticle], ignore_index=True, sort=False) mergeComment = pd.concat([newComment, oldComment], ignore_index=True, sort=False) - logging.info("成功合并新旧文章和评论数据。") + logging.info("鎴愬姛鍚堝苟鏂版棫鏂囩珷鍜岃瘎璁烘暟鎹") - # 去重 + # 鍘婚噸 mergeArticle.drop_duplicates(subset='id', keep='last', inplace=True) mergeComment.drop_duplicates(subset='content', keep='last', inplace=True) - logging.info("成功去除重复的文章和评论数据。") + logging.info("鎴愬姛鍘婚櫎閲嶅鐨勬枃绔犲拰璇勮鏁版嵁銆") - # 保存回数据库 + # 淇濆瓨鍥炴暟鎹簱 mergeArticle.to_sql('article', con=engine, if_exists='replace', index=False) mergeComment.to_sql('comments', con=engine, if_exists='replace', index=False) - logging.info("成功将合并后的数据保存回数据库。") + logging.info("鎴愬姛灏嗗悎骞跺悗鐨勬暟鎹繚瀛樺洖鏁版嵁搴撱") except pd.errors.EmptyDataError as e: - logging.error(f"读取CSV文件时出错: {e}") + logging.error(f"璇诲彇CSV鏂囦欢鏃跺嚭閿: {e}") except Exception as e: - logging.error(f"保存数据时出错: {e}") + logging.error(f"淇濆瓨鏁版嵁鏃跺嚭閿: {e}") else: - # 删除CSV文件 + # 鍒犻櫎CSV鏂囦欢 try: os.remove(articleAddr) os.remove(commentsAddr) - logging.info("成功删除CSV文件。") + logging.info("鎴愬姛鍒犻櫎CSV鏂囦欢銆") except Exception as e: - logging.warning(f"删除CSV文件时出错: {e}") + logging.warning(f"鍒犻櫎CSV鏂囦欢鏃跺嚭閿: {e}") def main(): - # 获取数据库连接 + # 鑾峰彇鏁版嵁搴撹繛鎺 engine = get_db_connection_interactive() - # 保存数据 + # 淇濆瓨鏁版嵁 saveData(engine) - # 关闭引擎(可选,因为SQLAlchemy引擎会自动管理连接池) + # 鍏抽棴寮曟搸锛堝彲閫夛紝鍥犱负SQLAlchemy寮曟搸浼氳嚜鍔ㄧ鐞嗚繛鎺ユ睜锛 engine.dispose() - logging.info("数据库连接已关闭。") + logging.info("鏁版嵁搴撹繛鎺ュ凡鍏抽棴銆") if __name__ == '__main__': main() diff --git a/utils/query.py b/utils/query.py index 7386155..10a9877 100644 --- a/utils/query.py +++ b/utils/query.py @@ -2,7 +2,7 @@ import getpass import pymysql import logging -# 配置日志 +# 閰嶇疆鏃ュ織 logging.basicConfig( level=logging.INFO, format='%(asctime)s [%(levelname)s] %(message)s', @@ -14,24 +14,24 @@ logging.basicConfig( def get_db_connection_interactive(): """ - 通过终端交互获取数据库连接参数,若按回车则使用默认值。 - 返回一个连接对象。 + 閫氳繃缁堢浜や簰鑾峰彇鏁版嵁搴撹繛鎺ュ弬鏁帮紝鑻ユ寜鍥炶溅鍒欎娇鐢ㄩ粯璁ゅ笺 + 杩斿洖涓涓繛鎺ュ璞° """ - print("请依次输入数据库连接信息(直接按回车使用默认值):") + print("璇蜂緷娆¤緭鍏ユ暟鎹簱杩炴帴淇℃伅锛堢洿鎺ユ寜鍥炶溅浣跨敤榛樿鍊硷級锛") - host = input(" 1. 主机 (默认: localhost): ") or "localhost" - port_str = input(" 2. 端口 (默认: 3306): ") or "3306" + host = input(" 1. 涓绘満 (榛樿: localhost): ") or "localhost" + port_str = input(" 2. 绔彛 (榛樿: 3306): ") or "3306" try: port = int(port_str) except ValueError: - logging.warning("端口号无效,使用默认端口 3306。") + logging.warning("绔彛鍙锋棤鏁堬紝浣跨敤榛樿绔彛 3306銆") port = 3306 - user = input(" 3. 用户名 (默认: root): ") or "root" - password = getpass.getpass(" 4. 密码 (默认: 312517): ") or "312517" - db_name = input(" 5. 数据库名 (默认: Weibo_PublicOpinion_AnalysisSystem): ") or "Weibo_PublicOpinion_AnalysisSystem" + user = input(" 3. 鐢ㄦ埛鍚 (榛樿: root): ") or "root" + password = getpass.getpass(" 4. 瀵嗙爜 (榛樿: 12345678): ") or "12345678" + db_name = input(" 5. 鏁版嵁搴撳悕 (榛樿: Weibo_PublicOpinion_AnalysisSystem): ") or "Weibo_PublicOpinion_AnalysisSystem" - logging.info(f"尝试连接到数据库: {user}@{host}:{port}/{db_name}") + logging.info(f"灏濊瘯杩炴帴鍒版暟鎹簱: {user}@{host}:{port}/{db_name}") try: connection = pymysql.connect( @@ -41,29 +41,29 @@ def get_db_connection_interactive(): password=password, database=db_name, charset='utf8mb4', - cursorclass=pymysql.cursors.DictCursor # 返回字典格式 + cursorclass=pymysql.cursors.DictCursor # 杩斿洖瀛楀吀鏍煎紡 ) - logging.info("数据库连接成功。") + logging.info("鏁版嵁搴撹繛鎺ユ垚鍔熴") return connection except pymysql.MySQLError as e: - logging.error(f"数据库连接失败: {e}") + logging.error(f"鏁版嵁搴撹繛鎺ュけ璐: {e}") exit(1) -# 获取数据库连接 +# 鑾峰彇鏁版嵁搴撹繛鎺 conn = get_db_connection_interactive() -# 获取游标 +# 鑾峰彇娓告爣 cursor = conn.cursor() def query(sql, params=None, query_type="no_select"): """ - 执行SQL查询或操作。 + 鎵цSQL鏌ヨ鎴栨搷浣溿 - :param sql: SQL语句 - :param params: SQL参数(可选) - :param query_type: 查询类型,默认为 "no_select" - 如果不是 "no_select",则执行 fetch 操作 - :return: 如果是查询操作,返回数据列表;否则返回 None + :param sql: SQL璇彞 + :param params: SQL鍙傛暟锛堝彲閫夛級 + :param query_type: 鏌ヨ绫诲瀷锛岄粯璁や负 "no_select" + 濡傛灉涓嶆槸 "no_select"锛屽垯鎵ц fetch 鎿嶄綔 + :return: 濡傛灉鏄煡璇㈡搷浣滐紝杩斿洖鏁版嵁鍒楄〃锛涘惁鍒欒繑鍥 None """ try: if params: @@ -72,43 +72,43 @@ def query(sql, params=None, query_type="no_select"): else: cursor.execute(sql) - # 确保连接保持活跃 + # 纭繚杩炴帴淇濇寔娲昏穬 conn.ping(reconnect=True) if query_type != "no_select": data_list = cursor.fetchall() conn.commit() - logging.info("查询成功,已获取数据。") + logging.info("鏌ヨ鎴愬姛锛屽凡鑾峰彇鏁版嵁銆") return data_list else: conn.commit() - logging.info("操作成功,已提交事务。") + logging.info("鎿嶄綔鎴愬姛锛屽凡鎻愪氦浜嬪姟銆") except pymysql.MySQLError as e: - logging.error(f"执行SQL时出错: {e}") + logging.error(f"鎵цSQL鏃跺嚭閿: {e}") conn.rollback() return None def main(): - # 示例用法 + # 绀轰緥鐢ㄦ硶 - # 执行查询操作 + # 鎵ц鏌ヨ鎿嶄綔 select_sql = "SELECT * FROM article LIMIT 5" articles = query(select_sql, query_type="select") if articles: for article in articles: print(article) - # 执行插入操作(根据实际表结构修改) + # 鎵ц鎻掑叆鎿嶄綔锛堟牴鎹疄闄呰〃缁撴瀯淇敼锛 insert_sql = "INSERT INTO article (id, content) VALUES (%s, %s)" - new_article = (12345, "这是一条新的文章内容。") + new_article = (12345, "杩欐槸涓鏉℃柊鐨勬枃绔犲唴瀹广") result = query(insert_sql, params=new_article, query_type="no_select") if result is None: - logging.info("插入操作完成。") + logging.info("鎻掑叆鎿嶄綔瀹屾垚銆") - # 关闭游标和连接 + # 鍏抽棴娓告爣鍜岃繛鎺 cursor.close() conn.close() - logging.info("数据库连接已关闭。") + logging.info("鏁版嵁搴撹繛鎺ュ凡鍏抽棴銆") if __name__ == '__main__': main()