minio对象存储数据库链接

This commit is contained in:
z66
2025-09-16 17:35:53 +08:00
parent 8e92acf5d5
commit 9afa9d2e58
10 changed files with 7291 additions and 347 deletions
+277
View File
@@ -0,0 +1,277 @@
import feedparser
import requests
from datetime import datetime
import pandas as pd
import os
import pickle
import time
from concurrent.futures import ThreadPoolExecutor, as_completed
import pymysql
# Database connection settings.
# Every value can be overridden through an environment variable so that real
# credentials do not have to be committed to source control. The defaults are
# the original hard-coded local-development values, so behaviour is unchanged
# when no variable is set.
local_DB_Config = {
    'host': os.environ.get('RSS_DB_HOST', "localhost"),
    'user': os.environ.get('RSS_DB_USER', "root"),
    'password': os.environ.get('RSS_DB_PASSWORD', "123123"),
    'database': os.environ.get('RSS_DB_NAME', "intelligence_system"),
    'charset': 'utf8mb4',
}
# Name of the table that receives the collected RSS articles.
table_name = "collector_rss_subscriptions"
def verify_database():
    """Check that the database is reachable and the target table is usable.

    Three checks are performed in order: the table named by ``table_name``
    exists in the connected schema, its column names can be listed (they are
    printed), and a probe row can be inserted. The probe insert is rolled
    back afterwards.

    Returns:
        bool: True when every check passes; False when the table is missing
        or any step raises (the error is printed, not re-raised).
    """
    conn = None
    try:
        conn = pymysql.connect(**local_DB_Config)
        with conn.cursor() as cursor:
            # Exact-match existence check. ``SHOW TABLES LIKE`` would treat
            # every ``_`` in the table name as a single-character wildcard.
            cursor.execute(
                "SELECT 1 FROM information_schema.tables "
                "WHERE table_schema = DATABASE() AND table_name = %s",
                (table_name,),
            )
            if not cursor.fetchone():
                print(f"错误: 表 {table_name} 不存在!")
                return False

            # List the table's columns.
            cursor.execute(f"DESCRIBE `{table_name}`")
            columns = [col[0] for col in cursor.fetchall()]
            print("表列名:", columns)

            # Probe INSERT permission with a throw-away row.
            test_sql = f"""INSERT INTO `{table_name}`
                (`文章标题`, `文章链接`, `文章摘要`, `发布时间`, `来源URL`)
                VALUES (%s, %s, %s, %s, %s)"""
            cursor.execute(
                test_sql,
                ('测试标题', 'http://test.com', '测试内容', datetime.now(), '测试来源'),
            )
            # NOTE(review): the rollback only removes the probe row when the
            # table uses a transactional storage engine - confirm the engine.
            conn.rollback()
        print("数据库验证通过!")
        return True
    except Exception as e:
        # Top-level boundary: report the failure and signal it to the caller.
        print("数据库验证失败:", e)
        return False
    finally:
        if conn is not None:
            conn.close()
def load_last_update_time():
    """Load the timestamp saved by the previous run.

    The value is read from ``output/last_update.pkl`` under the current
    working directory.

    Returns:
        datetime | None: The stored timestamp, or None when the cache file
        does not exist, cannot be read/unpickled, or does not contain a
        ``datetime`` (a warning is printed in the latter two cases).
    """
    cache_file = os.path.join(os.getcwd(), 'output', 'last_update.pkl')
    if not os.path.exists(cache_file):
        return None
    try:
        with open(cache_file, 'rb') as f:
            # NOTE: unpickling can execute arbitrary code; this file must only
            # ever be written by this program.
            value = pickle.load(f)
    except (OSError, EOFError, pickle.UnpicklingError, AttributeError,
            ImportError, IndexError, ValueError) as e:
        # A truncated or corrupt cache must not abort the whole run; treat it
        # as "no previous run".
        print(f"警告: 无法读取缓存文件 {cache_file}: {e}")
        return None
    if not isinstance(value, datetime):
        # Callers compare the value with datetimes and call strftime on it.
        print(f"警告: 缓存文件 {cache_file} 内容无效, 已忽略")
        return None
    return value
def save_last_update_time(last_update):
    """Persist the timestamp of this run for the next run to read.

    The value is pickled to ``output/last_update.pkl`` under the current
    working directory; the ``output`` directory is created when missing.

    Args:
        last_update: The object to store (the callers in this file pass a
            ``datetime``).
    """
    output_dir = os.path.join(os.getcwd(), 'output')
    os.makedirs(output_dir, exist_ok=True)
    target = os.path.join(output_dir, 'last_update.pkl')
    with open(target, 'wb') as handle:
        handle.write(pickle.dumps(last_update))
def fetch_single_rss(url, timeout=15, max_attempts=3, backoff=5):
    """Download and parse a single RSS feed, retrying on network errors.

    Args:
        url: Address of the RSS feed.
        timeout: Per-request timeout in seconds, passed to ``requests.get``.
        max_attempts: Maximum number of download attempts (default 3, the
            previously hard-coded value).
        backoff: Base delay in seconds between attempts; the wait after the
            n-th failed attempt is ``backoff * n`` (default 5, as before).

    Returns:
        The object returned by ``feedparser.parse`` on success, or None when
        every attempt failed with a ``requests.RequestException``.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }
    for attempt in range(max_attempts):
        try:
            response = requests.get(url, headers=headers, timeout=timeout)
            response.raise_for_status()
            # Decode the body with the encoding detected from its content.
            response.encoding = response.apparent_encoding
            feed = feedparser.parse(response.text)
            if feed.bozo:
                # The feed is still returned; the parser only flagged a problem.
                print(f"警告: 解析可能存在问题: {feed.bozo_exception}")
            return feed
        except requests.RequestException as e:
            print(f"{attempt + 1} 次尝试获取 {url} 失败: {e}")
            # Wait a little longer after each failure, but not after the last.
            if attempt < max_attempts - 1:
                time.sleep(max(0, backoff * (attempt + 1)))
    return None
def fetch_all_rss(urls, timeout=15):
    """Fetch several RSS feeds concurrently on a small thread pool.

    Args:
        urls: Iterable of feed URLs.
        timeout: Per-request timeout in seconds, forwarded to
            ``fetch_single_rss``.

    Returns:
        dict: Maps each URL to its parsed feed. URLs whose fetch returned a
        falsy result or raised are left out (exceptions are printed).
    """
    collected = {}
    with ThreadPoolExecutor(max_workers=3) as pool:
        # Remember which URL each future belongs to.
        pending = {}
        for source in urls:
            pending[pool.submit(fetch_single_rss, source, timeout)] = source

        # Handle results in completion order rather than submission order.
        for finished in as_completed(pending):
            url = pending[finished]
            try:
                parsed = finished.result()
                if parsed:
                    collected[url] = parsed
            except Exception as e:
                print(f"获取 {url} 时发生异常: {e}")
    return collected
def process_feed_entry(entry, url):
    """Convert one RSS entry into the flat record stored in the database.

    Args:
        entry: Mapping-like feed entry (anything supporting ``.get``).
        url: URL of the feed the entry came from; may be None or empty.

    Returns:
        dict: Keys ``文章标题``, ``文章链接``, ``文章摘要``, ``发布时间``
        (formatted ``%Y-%m-%d %H:%M:%S``) and ``来源URL``. The title is capped
        at 255 characters, the link and source URL at 1024.
    """
    # Local import: the module itself only imports ``datetime``.
    from datetime import timezone

    # Title, truncated with an ellipsis so it never exceeds 255 characters.
    title = entry.get('title', '无标题') or '无标题'
    if len(title) > 255:
        title = title[:252] + '...'

    # Link, capped at 1024 characters.
    link = entry.get('link', '无链接') or '无链接'
    if len(link) > 1024:
        link = link[:1021] + '...'

    # Summary: prefer the entry's summary, otherwise fall back to the first
    # content item. An empty or None summary also triggers the fallback.
    summary = entry.get('summary') or ''
    content_list = entry.get('content') or []
    content = ''
    if content_list:
        first = content_list[0]
        if isinstance(first, dict):
            content = first.get('value') or ''
        else:
            content = getattr(first, 'value', '') or ''
    if summary and summary != '无内容摘要':
        description = summary
    elif content:
        description = content[:200] + '...'
    else:
        description = '无内容摘要'

    # Publication time. feedparser documents its *_parsed tuples as UTC, so
    # both fallbacks below are normalised to naive UTC as well; otherwise the
    # timestamps compared against the last-update marker would be skewed by
    # the local UTC offset.
    published_parsed = entry.get('published_parsed') or entry.get('updated_parsed')
    entry_time = None
    if published_parsed:
        try:
            entry_time = datetime(*published_parsed[:6])
        except (TypeError, ValueError):
            entry_time = None
    if entry_time is None:
        pub_str = entry.get('published') or entry.get('updated') or ''
        try:
            aware = datetime.strptime(pub_str, '%a, %d %b %Y %H:%M:%S %z')
            entry_time = aware.astimezone(timezone.utc).replace(tzinfo=None)
        except (TypeError, ValueError):
            # No usable date at all: stamp the entry with the current time.
            entry_time = datetime.now(timezone.utc).replace(tzinfo=None)

    # Source URL, capped at 1024 characters.
    source_url = url or '未知来源'
    if len(source_url) > 1024:
        source_url = source_url[:1021] + '...'

    return {
        '文章标题': title,
        '文章链接': link,
        '文章摘要': description,
        '发布时间': entry_time.strftime('%Y-%m-%d %H:%M:%S'),
        '来源URL': source_url,
    }
def display_feed_info(feed, last_update=None, url=None):
    """Print the new entries of one feed and write them to the database.

    Args:
        feed: Parsed feed exposing an ``entries`` attribute; a falsy value
            prints a notice and yields None.
        last_update: Entries published at or before this time are skipped;
            None means every entry is treated as new.
        url: Feed URL, used for logging and as each record's source.

    Returns:
        The latest publication time among the entries that were kept, or
        ``last_update`` when none was newer; None when ``feed`` is falsy.
    """
    if not feed:
        print("无法显示信息:feed 为 None")
        return None

    print("=" * 80)
    print(f"处理 RSS 源: {url}")

    data_list = []
    newest_seen = last_update
    for i, entry in enumerate(feed.entries, start=1):
        entry_data = process_feed_entry(entry, url)
        published_at = datetime.strptime(entry_data['发布时间'], '%Y-%m-%d %H:%M:%S')

        # Skip anything already covered by the previous run.
        already_seen = bool(last_update) and published_at <= last_update
        if already_seen:
            continue

        # Track the most recent publication time seen so far.
        if newest_seen is None or published_at > newest_seen:
            newest_seen = published_at

        print(f"\n--- 条目 {i} ---")
        print(f"标题: {entry_data['文章标题']}")
        print(f"链接: {entry_data['文章链接']}")
        print(f"摘要: {entry_data['文章摘要'][:100]}...")
        print(f"时间: {entry_data['发布时间']}")
        data_list.append(entry_data)

    # Only touch the database when there is something new.
    if data_list:
        write_to_database(pd.DataFrame(data_list))
    return newest_seen
def write_to_database(df):
    """Insert the rows of a DataFrame into the RSS table.

    Rows are inserted one by one with ``INSERT IGNORE``; a row that fails is
    reported and skipped so the remaining rows are still attempted. All
    successful inserts are committed together at the end.

    Args:
        df: DataFrame with the columns ``文章标题``, ``文章链接``,
            ``文章摘要``, ``发布时间`` and ``来源URL``. None or an empty frame
            is a no-op.

    Returns:
        None. Connection-level errors are printed, not raised.
    """
    if df is None or df.empty:
        print("没有新数据需要写入")
        return

    print("\n准备写入数据库的数据样例:")
    print(df.iloc[0].to_dict())

    # Loop-invariant pieces are built once, before connecting.
    columns = ('文章标题', '文章链接', '文章摘要', '发布时间', '来源URL')
    sql = f"""INSERT IGNORE INTO `{table_name}`
        (`文章标题`, `文章链接`, `文章摘要`, `发布时间`, `来源URL`)
        VALUES (%s, %s, %s, %s, %s)"""

    conn = None
    try:
        conn = pymysql.connect(**local_DB_Config)
        with conn.cursor() as cursor:
            success_count = 0
            for _, row in df.iterrows():
                try:
                    cursor.execute(sql, tuple(row[col] for col in columns))
                    # The cursor's rowcount is what gets tallied as "written".
                    success_count += cursor.rowcount
                except Exception as e:
                    # Best effort: report the bad row and keep going.
                    print(f"插入记录时出错: {e}")
                    print(f"问题数据: {row.to_dict()}")
                    continue
            conn.commit()
            print(f"成功写入 {success_count}/{len(df)} 条记录")
    except Exception as e:
        print("数据库操作失败:", e)
    finally:
        if conn is not None:
            conn.close()
def main():
    """Run one collection pass: verify the DB, fetch all feeds, store new items.

    The newest publication time reported across the feeds is saved so the
    next run can skip entries that were already handled.
    """
    if not verify_database():
        print("数据库验证失败,程序终止")
        return

    rss_urls = [
        "https://www.chinanews.com.cn/rss/finance.xml",
        "https://www.chinanews.com.cn/rss/world.xml",
        "https://www.chinanews.com.cn/rss/china.xml",
        "https://www.chinanews.com.cn/rss/scroll-news.xml"
    ]

    last_update = load_last_update_time()
    if last_update:
        print(f"上次更新时间: {last_update.strftime('%Y-%m-%d %H:%M:%S')}")

    print("\n开始获取RSS源数据...")
    start_time = time.time()
    feeds = fetch_all_rss(rss_urls)
    print(f"获取完成,耗时: {time.time() - start_time:.2f}")

    # Keep the latest timestamp reported by any of the feeds.
    new_last_update = None
    for url, feed in feeds.items():
        current_last_update = display_feed_info(feed, last_update, url)
        if not current_last_update:
            continue
        if new_last_update is None or current_last_update > new_last_update:
            new_last_update = current_last_update

    if not new_last_update:
        print("\n没有获取到新的内容")
        return

    save_last_update_time(new_last_update)
    print(f"\n本次最新更新时间: {new_last_update.strftime('%Y-%m-%d %H:%M:%S')}")


if __name__ == "__main__":
    main()