Replace the hardcoded database credentials with an interactive command-line prompt for the connection settings.

This commit is contained in:
戒酒的李白
2025-01-09 23:08:55 +08:00
parent ba56f3b0d4
commit a30773715e
4 changed files with 304 additions and 66 deletions
+1
View File
@@ -12,3 +12,4 @@ model2/*
*.pyz *.pyz
*.pywz *.pywz
.vscode .vscode
.VSCodeCounter
+93 -20
View File
@@ -1,35 +1,108 @@
import os import os
from sqlalchemy import create_engine
import pandas as pd import pandas as pd
from spiderDataPackage.settings import articleAddr,commentsAddr from sqlalchemy import create_engine
# from ..model.topicDefine import * from getpass import getpass
import logging
engine = create_engine('mysql+pymysql://XiaoXueQi:XiaoXueQi@47.92.235.6/Weibo_PublicOpinion_AnalysisSystem?charset=utf8mb4') # 配置日志
# Send every record at INFO level or above to both a log file and the
# console, so interactive runs also leave a persistent trace on disk.
_log_handlers = [
    logging.FileHandler("save_data.log"),
    logging.StreamHandler(),
]
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s [%(levelname)s] %(message)s',
    handlers=_log_handlers,
)
def saveData(): # 假设 articleAddr 和 commentsAddr 是绝对路径或相对于脚本的路径
from spiderDataPackage.settings import articleAddr, commentsAddr
def get_db_connection_interactive():
    """Prompt for MySQL connection settings and return a verified engine.

    Every prompt accepts an empty answer (just Enter), in which case the
    default shown in the prompt is used. The password is read with
    ``getpass`` so it is not echoed to the terminal.

    Returns:
        A SQLAlchemy engine. One connection is opened and closed before
        returning to prove that the settings actually work.

    Raises:
        SystemExit: With status 1 when the engine cannot be created or the
            test connection fails.
    """
    # Function-scope import: only this function needs it.
    from urllib.parse import quote_plus

    print("请依次输入数据库连接信息(直接按回车使用默认值):")
    host = input(" 1. 主机 (默认: localhost): ") or "localhost"
    port_str = input(" 2. 端口 (默认: 3306): ") or "3306"
    try:
        port = int(port_str)
    except ValueError:
        # A non-numeric port is not fatal; fall back to the MySQL default.
        logging.warning("端口号无效,使用默认端口 3306。")
        port = 3306
    user = input(" 3. 用户名 (默认: root): ") or "root"
    password = getpass(" 4. 密码 (默认: 12345678): ") or "12345678"
    db_name = input(" 5. 数据库名 (默认: Weibo_PublicOpinion_AnalysisSystem): ") or "Weibo_PublicOpinion_AnalysisSystem"

    # Credentials must be URL-encoded: a raw '@', ':' or '/' in the user
    # name or password would otherwise be parsed as URL structure.
    connection_str = (
        f"mysql+pymysql://{quote_plus(user)}:{quote_plus(password)}"
        f"@{host}:{port}/{db_name}?charset=utf8mb4"
    )

    try:
        engine = create_engine(connection_str)
        # create_engine() is lazy, so open one connection to test the settings.
        with engine.connect():
            logging.info(f"成功连接到数据库: {user}@{host}:{port}/{db_name}")
    except Exception as e:
        logging.error(f"无法连接到数据库: {e}")
        # Raise SystemExit directly; the exit() helper is injected by the
        # site module and is not guaranteed to exist in every interpreter.
        raise SystemExit(1)
    return engine
def _load_existing_table(table_name, engine):
    """Return all rows of *table_name*, or None when the table does not exist."""
    # sqlalchemy is already a dependency of this file (create_engine).
    from sqlalchemy import inspect

    if table_name not in inspect(engine).get_table_names():
        logging.info(f"数据表 {table_name} 不存在,将直接写入新数据。")
        return None
    return pd.read_sql(f'SELECT * FROM {table_name}', engine)


def _merge_and_dedupe(new_df, old_df, key):
    """Stack new rows on top of old rows and keep the last row per *key*."""
    if old_df is None:
        merged = new_df.copy()
    else:
        merged = pd.concat([new_df, old_df], ignore_index=True, sort=False)
    return merged.drop_duplicates(subset=key, keep='last')


def saveData(engine):
    """Merge the scraped CSV files into the ``article`` and ``comments`` tables.

    Existing rows are read from the database, combined with the rows from
    the CSV files at ``articleAddr`` / ``commentsAddr``, de-duplicated
    (articles by ``id``, comments by ``content``) and written back with
    ``if_exists='replace'``. Because the CSV rows are placed first and the
    last duplicate is kept, a row already in the database wins over a CSV
    row with the same key.

    A table that does not exist yet (first run on an empty database) is
    treated as having no old rows, so the CSV data is still imported.

    The CSV files are deleted only when the whole import succeeded. Errors
    are logged rather than raised.

    Args:
        engine: SQLAlchemy engine connected to the target database.
    """
    try:
        # Existing data (None when the table has not been created yet).
        oldArticle = _load_existing_table('article', engine)
        oldComment = _load_existing_table('comments', engine)
        logging.info("成功从数据库读取旧的文章和评论数据。")

        # Freshly scraped data.
        newArticle = pd.read_csv(articleAddr)
        newComment = pd.read_csv(commentsAddr)
        logging.info("成功从CSV文件读取新的文章和评论数据。")

        mergeArticle = _merge_and_dedupe(newArticle, oldArticle, 'id')
        mergeComment = _merge_and_dedupe(newComment, oldComment, 'content')
        logging.info("成功合并新旧文章和评论数据。")
        logging.info("成功去除重复的文章和评论数据。")

        # Replace both tables with the merged result.
        mergeArticle.to_sql('article', con=engine, if_exists='replace', index=False)
        mergeComment.to_sql('comments', con=engine, if_exists='replace', index=False)
        logging.info("成功将合并后的数据保存回数据库。")
    except pd.errors.EmptyDataError as e:
        logging.error(f"读取CSV文件时出错: {e}")
    except Exception as e:
        logging.error(f"保存数据时出错: {e}")
    else:
        # Only discard the CSV files once their content is safely stored.
        try:
            os.remove(articleAddr)
            os.remove(commentsAddr)
            logging.info("成功删除CSV文件。")
        except Exception as e:
            logging.warning(f"删除CSV文件时出错: {e}")
def main():
    """Script entry point: connect interactively, import the CSV data, clean up."""
    db_engine = get_db_connection_interactive()
    saveData(db_engine)

    # Explicitly release the engine's pooled connections before exiting.
    db_engine.dispose()
    logging.info("数据库连接已关闭。")
if __name__ == '__main__': if __name__ == '__main__':
saveData() main()
+113 -12
View File
@@ -1,13 +1,114 @@
from pymysql import * import getpass
conn = connect(host='47.92.235.6',port=3306,user='XiaoXueQi',password='XiaoXueQi',database='Weibo_PublicOpinion_AnalysisSystem') import pymysql
import logging
# 配置日志
# Send every record at INFO level or above to both a log file and the
# console, so interactive runs also leave a persistent trace on disk.
_log_handlers = [
    logging.FileHandler("database_operations.log"),
    logging.StreamHandler(),
]
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s [%(levelname)s] %(message)s',
    handlers=_log_handlers,
)
def get_db_connection_interactive():
    """Prompt for MySQL connection settings and return an open connection.

    Every prompt accepts an empty answer (just Enter), in which case the
    default shown in the prompt is used. The password is read with
    ``getpass`` so it is not echoed to the terminal.

    Returns:
        An open PyMySQL connection using the ``utf8mb4`` charset and
        ``DictCursor``, so fetched rows are dictionaries.

    Raises:
        SystemExit: With status 1 when the connection attempt fails.
    """
    print("请依次输入数据库连接信息(直接按回车使用默认值):")
    host = input(" 1. 主机 (默认: localhost): ") or "localhost"
    port_str = input(" 2. 端口 (默认: 3306): ") or "3306"
    try:
        port = int(port_str)
    except ValueError:
        # A non-numeric port is not fatal; fall back to the MySQL default.
        logging.warning("端口号无效,使用默认端口 3306。")
        port = 3306
    user = input(" 3. 用户名 (默认: root): ") or "root"
    password = getpass.getpass(" 4. 密码 (默认: 312517): ") or "312517"
    db_name = input(" 5. 数据库名 (默认: Weibo_PublicOpinion_AnalysisSystem): ") or "Weibo_PublicOpinion_AnalysisSystem"

    logging.info(f"尝试连接到数据库: {user}@{host}:{port}/{db_name}")
    try:
        connection = pymysql.connect(
            host=host,
            port=port,
            user=user,
            password=password,
            database=db_name,
            charset='utf8mb4',
            cursorclass=pymysql.cursors.DictCursor,  # rows come back as dicts
        )
    except pymysql.MySQLError as e:
        logging.error(f"数据库连接失败: {e}")
        # Raise SystemExit directly; the exit() helper is injected by the
        # site module and is not guaranteed to exist in every interpreter.
        raise SystemExit(1)
    logging.info("数据库连接成功。")
    return connection
# 获取数据库连接
conn = get_db_connection_interactive()
# 获取游标
cursor = conn.cursor() cursor = conn.cursor()
def query(sql, params=None, query_type="no_select"):
    """Run one SQL statement on the module-level connection and commit it.

    :param sql: SQL text, using ``%s`` placeholders for bound values.
    :param params: Optional sequence of values for the placeholders.
    :param query_type: ``"no_select"`` (default) for statements without a
        result set; any other value makes the function fetch all rows.
    :return: The fetched rows when ``query_type`` is not ``"no_select"``;
        otherwise None. None is also returned when the statement fails,
        in which case the error is logged and a rollback is attempted.
    """
    try:
        # Check the connection (and reconnect if it was dropped) BEFORE
        # executing; pinging afterwards cannot rescue a statement that
        # already failed on a dead connection.
        conn.ping(reconnect=True)

        if params:
            cursor.execute(sql, tuple(params))
        else:
            cursor.execute(sql)

        if query_type != "no_select":
            data_list = cursor.fetchall()
            conn.commit()
            logging.info("查询成功,已获取数据。")
            return data_list

        conn.commit()
        logging.info("操作成功,已提交事务。")
        return None
    except pymysql.MySQLError as e:
        logging.error(f"执行SQL时出错: {e}")
        try:
            conn.rollback()
        except pymysql.MySQLError as rollback_error:
            # Rolling back on a broken connection can fail too; keep the
            # documented contract of returning None instead of raising.
            logging.error(f"回滚事务时出错: {rollback_error}")
        return None
def main():
    """Demonstrate query(): read a few articles, insert a sample row, clean up."""
    # Read example: fetch at most five rows and print each of them.
    articles = query("SELECT * FROM article LIMIT 5", query_type="select")
    for article in articles or ():
        print(article)

    # Write example (adapt the column list to the real table schema).
    insert_sql = "INSERT INTO article (id, content) VALUES (%s, %s)"
    result = query(
        insert_sql,
        params=(12345, "这是一条新的文章内容。"),
        query_type="no_select",
    )
    # NOTE: query() returns None for a non-select statement both when it was
    # committed and when it failed, so this message is not proof of success.
    if result is None:
        logging.info("插入操作完成。")

    # Release the module-level cursor and connection.
    cursor.close()
    conn.close()
    logging.info("数据库连接已关闭。")
if __name__ == '__main__':
main()
+94 -31
View File
@@ -1,47 +1,110 @@
import os
import jieba import jieba
from wordcloud import WordCloud from wordcloud import WordCloud
import matplotlib.pyplot as plt import matplotlib.pyplot as plt
from PIL import Image,ImageDraw from PIL import Image
from pymysql import *
import json
import numpy as np import numpy as np
import pymysql
def stopWordList():
    """Return the stop words listed in ``./model/stopWords.txt``.

    The file is read as UTF-8, one entry per line, and each line is
    stripped of surrounding whitespace. The file is re-read on every call;
    callers that need the list repeatedly should keep the result.
    """
    with open('./model/stopWords.txt', encoding='utf8') as stop_file:
        return list(map(str.strip, stop_file))
def generate_word_cloud(text, mask_path, font_path, output_path):
    """Render a word cloud for *text* and save it as an image.

    :param text: Space-separated words to draw.
    :param mask_path: Image file whose shape is used as the cloud mask.
    :param font_path: Font file used to draw the words.
    :param output_path: Destination path of the rendered image.
    """
    # Load the mask inside a context manager so the file handle is closed
    # as soon as the pixel data has been copied into the array.
    with Image.open(mask_path) as img:
        img_arr = np.array(img)

    wc = WordCloud(
        background_color="#fff",
        mask=img_arr,
        font_path=font_path
    )
    wc.generate_from_text(text)

    fig = plt.figure(figsize=(8, 6))
    try:
        plt.imshow(wc, interpolation='bilinear')
        plt.axis('off')
        plt.savefig(output_path, dpi=300, bbox_inches='tight')
    finally:
        # Always release the figure, even when saving fails, so repeated
        # calls do not accumulate open figures.
        plt.close(fig)
def get_db_connection_interactive():
    """Prompt for MySQL connection settings and return an open connection.

    Every prompt accepts an empty answer (just Enter), in which case the
    default shown in the prompt is used. An invalid port falls back to
    3306. The password is read with ``getpass`` so it is not echoed.

    Returns:
        An open PyMySQL connection using the ``utf8mb4`` charset.
    """
    # Function-scope import: only this function needs it.
    from getpass import getpass

    print("请依次输入数据库连接信息(直接按回车使用默认值):")
    host = input(" 1. 主机 (默认: localhost): ") or "localhost"
    port_str = input(" 2. 端口 (默认: 3306): ") or "3306"
    try:
        port = int(port_str)
    except ValueError:
        # A typo in the port should not crash the script with a traceback.
        print("端口号无效,使用默认端口 3306。")
        port = 3306

    user = input(" 3. 用户名 (默认: root): ") or "root"
    # The prompt now names the default that is really applied; previously
    # it advertised a different value than the fallback below.
    password = getpass(" 4. 密码 (默认: 12345678): ") or "12345678"
    db_name = input(" 5. 数据库名 (默认: Weibo_PublicOpinion_AnalysisSystem): ") or "Weibo_PublicOpinion_AnalysisSystem"

    print(f"\n即将连接到数据库: {user}@{host}:{port}/{db_name}\n")

    return pymysql.connect(
        host=host,
        user=user,
        password=password,
        database=db_name,
        port=port,
        charset='utf8mb4'
    )
def get_img(field, table_name, target_img_src, res_img_src, connection, font_path='STHUPO.TTF'):
    """Build a word cloud from one text column of a database table.

    All values of *field* in *table_name* are concatenated, segmented with
    jieba, filtered against the stop-word list and rendered as a word cloud.

    :param field: Column to read.
    :param table_name: Table to read from.
    :param target_img_src: Image file used as the word-cloud mask.
    :param res_img_src: Path the rendered word cloud is written to.
    :param connection: Open database connection whose cursor returns rows
        as tuples (the first element of each row is used).
    :param font_path: Font file used to draw the words.
    """
    # NOTE(review): identifiers cannot be bound as query parameters, so they
    # are interpolated into the SQL text. Pass trusted names only.
    sql = f'SELECT {field} FROM {table_name}'

    cursor = connection.cursor()
    try:
        cursor.execute(sql)
        rows = cursor.fetchall()
    finally:
        # Close the cursor even when the query fails.
        cursor.close()

    # join() avoids quadratic string building; NULL values (None) are
    # skipped because they cannot be concatenated with text.
    text = ''.join(row[0] for row in rows if row[0])

    # Segment the text and drop stop words; a set makes each lookup O(1).
    stop_words = set(stopWordList())
    final_text = ' '.join(
        word for word in jieba.cut(text) if word not in stop_words
    )

    generate_word_cloud(final_text, target_img_src, font_path, res_img_src)
def main():
    """Script entry point: connect interactively and render the article word cloud."""
    connection = get_db_connection_interactive()

    try:
        # Word cloud built from the `content` column of the `article` table.
        get_img(
            'content',
            'article',
            './static/content.jpg',
            './static/contentCloud.jpg',
            connection,
        )
        print("词云生成完毕!")
    finally:
        # The connection is closed whether or not rendering succeeded.
        connection.close()
if __name__ == '__main__':
main()