Replace the hard-coded database connection with an interactive command-line prompt for the connection parameters.

This commit is contained in:
戒酒的李白
2025-01-09 23:08:55 +08:00
parent ba56f3b0d4
commit a30773715e
4 changed files with 304 additions and 66 deletions
+96 -23
View File
@@ -1,35 +1,108 @@
import os
from sqlalchemy import create_engine
import pandas as pd
from spiderDataPackage.settings import articleAddr,commentsAddr
# from ..model.topicDefine import *
from sqlalchemy import create_engine
from getpass import getpass
import logging
# Module-level engine built from a URL that embeds the user name, password and
# host literally.
# NOTE(review): these credentials are stored in source. The functions further
# down build their own engine from prompted values and receive it as a
# parameter, so this object looks unused within this file -- confirm that no
# other module imports `engine` from here before removing it.
engine = create_engine('mysql+pymysql://XiaoXueQi:XiaoXueQi@47.92.235.6/Weibo_PublicOpinion_AnalysisSystem?charset=utf8mb4')
# Configure logging at import time: records of level INFO and above are
# formatted as "<time> [<level>] <message>" and sent to two handlers, the file
# "save_data.log" and a stream handler.
logging.basicConfig(
level=logging.INFO,
format='%(asctime)s [%(levelname)s] %(message)s',
handlers=[
logging.FileHandler("save_data.log"),
logging.StreamHandler()
]
)
def get_db_connection_interactive():
    """
    Prompt on the terminal for MySQL connection parameters and return an engine.

    Pressing Enter at a prompt accepts the default shown in that prompt. A port
    that cannot be parsed as an integer falls back to 3306 with a warning. The
    user name and password are percent-encoded before being placed in the URL
    so that characters such as ``@``, ``/`` or ``:`` cannot corrupt it.

    The connection is opened once to verify the parameters. On any failure the
    error is logged and the process exits with status 1.

    Returns:
        The engine produced by ``create_engine`` for the assembled
        ``mysql+pymysql`` URL.
    """
    # Function-scope imports keep this block independent of the file's
    # top-level import list.
    import sys
    from urllib.parse import quote

    print("请依次输入数据库连接信息(直接按回车使用默认值):")
    # Surrounding whitespace is stripped from the non-secret answers so that an
    # accidental space still selects the default instead of a bogus value.
    host = input(" 1. 主机 (默认: localhost): ").strip() or "localhost"
    port_str = input(" 2. 端口 (默认: 3306): ").strip() or "3306"
    try:
        port = int(port_str)
    except ValueError:
        logging.warning("端口号无效,使用默认端口 3306。")
        port = 3306
    user = input(" 3. 用户名 (默认: root): ").strip() or "root"
    # The password is taken verbatim: leading/trailing spaces may be intentional.
    password = getpass(" 4. 密码 (默认: 12345678): ") or "12345678"
    db_name = (
        input(" 5. 数据库名 (默认: Weibo_PublicOpinion_AnalysisSystem): ").strip()
        or "Weibo_PublicOpinion_AnalysisSystem"
    )

    # Build the connection URL; safe="" forces every reserved character in the
    # credentials to be percent-encoded.
    connection_str = (
        f"mysql+pymysql://{quote(user, safe='')}:{quote(password, safe='')}"
        f"@{host}:{port}/{db_name}?charset=utf8mb4"
    )
    try:
        engine = create_engine(connection_str)
        # Open and immediately release one connection to validate the settings.
        with engine.connect():
            logging.info(f"成功连接到数据库: {user}@{host}:{port}/{db_name}")
    except Exception as e:
        logging.error(f"无法连接到数据库: {e}")
        sys.exit(1)
    return engine
def saveData(engine):
    """
    Merge newly collected CSV rows into the ``article`` and ``comments`` tables.

    Existing rows are read from both tables through ``engine``; new rows are
    read from the CSV files at ``articleAddr`` and ``commentsAddr``. New rows
    are concatenated ahead of the existing ones, duplicates are dropped
    (articles by ``id``, comments by ``content``) keeping the last occurrence,
    so an existing database row wins over a CSV row with the same key. Both
    tables are then replaced with the merged result.

    The CSV files are deleted only when every step above succeeded. Errors are
    logged and not re-raised.

    Args:
        engine: Database connectable passed to ``pandas.read_sql`` and
            ``DataFrame.to_sql``.
    """
    # NOTE(review): if either table does not exist yet, the read below
    # presumably fails and nothing is written -- confirm whether a first-run
    # bootstrap (writing the CSV data as-is) is still required.
    try:
        # Existing data.
        oldArticle = pd.read_sql('SELECT * FROM article', engine)
        oldComment = pd.read_sql('SELECT * FROM comments', engine)
        logging.info("成功从数据库读取旧的文章和评论数据。")

        # Newly collected data.
        newArticle = pd.read_csv(articleAddr)
        newComment = pd.read_csv(commentsAddr)
        logging.info("成功从CSV文件读取新的文章和评论数据。")

        # Merge, new rows first so that keep='last' below prefers stored rows.
        mergeArticle = pd.concat([newArticle, oldArticle], ignore_index=True, sort=False)
        mergeComment = pd.concat([newComment, oldComment], ignore_index=True, sort=False)
        logging.info("成功合并新旧文章和评论数据。")

        # De-duplicate.
        mergeArticle.drop_duplicates(subset='id', keep='last', inplace=True)
        mergeComment.drop_duplicates(subset='content', keep='last', inplace=True)
        logging.info("成功去除重复的文章和评论数据。")

        # Write back, replacing the previous table contents.
        mergeArticle.to_sql('article', con=engine, if_exists='replace', index=False)
        mergeComment.to_sql('comments', con=engine, if_exists='replace', index=False)
        logging.info("成功将合并后的数据保存回数据库。")
    except pd.errors.EmptyDataError as e:
        logging.error(f"读取CSV文件时出错: {e}")
    except Exception as e:
        # logging.exception records the traceback as well as the message.
        logging.exception(f"保存数据时出错: {e}")
    else:
        # Delete each CSV on its own so that a failure on one file does not
        # leave the other behind; the files are removed exactly once.
        removed_all = True
        for csv_path in (articleAddr, commentsAddr):
            try:
                os.remove(csv_path)
            except Exception as e:
                removed_all = False
                logging.warning(f"删除CSV文件时出错: {e}")
        if removed_all:
            logging.info("成功删除CSV文件。")
# update_data()
def main():
    """Prompt for a database connection, run the data merge, then release the engine."""
    # Obtain an engine from the interactive prompt.
    db_engine = get_db_connection_interactive()
    # Merge and store the data.
    saveData(db_engine)
    # Dispose of the engine (optional, since a SQLAlchemy engine manages its
    # connection pool automatically).
    db_engine.dispose()
    logging.info("数据库连接已关闭。")
if __name__ == '__main__':
    # saveData() requires an engine argument; main() obtains one interactively
    # and passes it on, so it is the only entry point invoked here.
    main()