minio对象存储数据库链接
This commit is contained in:
@@ -0,0 +1,277 @@
|
||||
import feedparser
|
||||
import requests
|
||||
from datetime import datetime
|
||||
import pandas as pd
|
||||
import os
|
||||
import pickle
|
||||
import time
|
||||
from concurrent.futures import ThreadPoolExecutor, as_completed
|
||||
import pymysql
|
||||
|
||||
# Database connection settings.
# Each value can be overridden through an environment variable so that real
# credentials do not have to be committed to source control. The literals are
# the original development defaults and are used when a variable is unset.
local_DB_Config = {
    'host': os.environ.get('RSS_DB_HOST', "localhost"),
    'user': os.environ.get('RSS_DB_USER', "root"),
    'password': os.environ.get('RSS_DB_PASSWORD', "123123"),
    'database': os.environ.get('RSS_DB_NAME', "intelligence_system"),
    'charset': 'utf8mb4',
}

# Name of the table that receives the collected RSS entries.
table_name = "collector_rss_subscriptions"
|
||||
|
||||
|
||||
def verify_database():
    """Check that the database is reachable and the target table is usable.

    Connects with ``local_DB_Config``, confirms that ``table_name`` exists,
    prints its column names, then executes a trial INSERT and rolls it back.

    Returns:
        bool: True when every check succeeds. False when the table is missing
        or any step raises; the error is printed rather than propagated.
    """
    # Stays None when connect() itself fails, so the cleanup below can tell
    # whether there is anything to close.
    conn = None
    try:
        conn = pymysql.connect(**local_DB_Config)
        with conn.cursor() as cursor:
            # Confirm the table exists.
            cursor.execute("SHOW TABLES LIKE %s", (table_name,))
            if not cursor.fetchone():
                print(f"错误: 表 {table_name} 不存在!")
                return False

            # Inspect the table structure; the first field of each DESCRIBE
            # row is the column name.
            cursor.execute(f"DESCRIBE `{table_name}`")
            columns = [col[0] for col in cursor.fetchall()]
            print("表列名:", columns)

            # Confirm INSERT works, then undo it.
            # NOTE(review): the rollback only removes the test row if the
            # table uses a transactional storage engine - confirm.
            test_sql = f"""INSERT INTO `{table_name}`
                (`文章标题`, `文章链接`, `文章摘要`, `发布时间`, `来源URL`)
                VALUES (%s, %s, %s, %s, %s)"""
            cursor.execute(
                test_sql,
                ('测试标题', 'http://test.com', '测试内容', datetime.now(), '测试来源'),
            )
            conn.rollback()

        print("数据库验证通过!")
        return True
    except Exception as e:
        # Top-level boundary for the check: report and signal failure.
        print("数据库验证失败:", e)
        return False
    finally:
        if conn is not None:
            conn.close()
|
||||
|
||||
|
||||
def load_last_update_time():
    """Load the timestamp saved by the previous run.

    Reads ``output/last_update.pkl`` under the current working directory.

    Returns:
        datetime | None: the stored timestamp, or None when the cache file is
        missing, cannot be read or unpickled, or holds something other than
        a ``datetime``. Returning None makes the caller treat every entry as
        new instead of aborting the run.
    """
    cache_file = os.path.join(os.getcwd(), 'output', 'last_update.pkl')
    if not os.path.exists(cache_file):
        return None

    try:
        with open(cache_file, 'rb') as f:
            # NOTE: unpickling can execute arbitrary code. This is acceptable
            # only because the file is produced by this program itself.
            last_update = pickle.load(f)
    except (OSError, EOFError, pickle.UnpicklingError, AttributeError,
            ImportError, IndexError, ValueError) as e:
        # A truncated or corrupt cache must not abort the whole run.
        print(f"警告: 无法读取上次更新时间缓存,已忽略: {e}")
        return None

    if not isinstance(last_update, datetime):
        # Callers call .strftime() on the value and compare it with datetimes.
        print("警告: 上次更新时间缓存内容无效,已忽略")
        return None
    return last_update
|
||||
|
||||
|
||||
def save_last_update_time(last_update):
    """Persist the timestamp of this run for the next run to read.

    Writes ``output/last_update.pkl`` under the current working directory,
    creating the ``output`` directory if needed.

    Args:
        last_update: the value to pickle (the caller passes a ``datetime``).
    """
    cache_file = os.path.join(os.getcwd(), 'output', 'last_update.pkl')
    os.makedirs(os.path.dirname(cache_file), exist_ok=True)

    # Write to a sibling temp file first and swap it in afterwards, so an
    # interrupted write cannot leave a truncated cache file behind.
    tmp_file = cache_file + '.tmp'
    with open(tmp_file, 'wb') as f:
        pickle.dump(last_update, f)
    os.replace(tmp_file, cache_file)
|
||||
|
||||
|
||||
def fetch_single_rss(url, timeout=15, max_retries=3, backoff=5):
    """Download and parse a single RSS feed, retrying on request errors.

    Args:
        url: address of the feed.
        timeout: timeout in seconds passed to ``requests.get``.
        max_retries: total number of download attempts. A value of 0 or less
            makes the function return None without any request.
        backoff: base delay in seconds; the wait after failed attempt ``k``
            (1-based) is ``backoff * k``. No wait follows the last attempt.

    Returns:
        The object returned by ``feedparser.parse`` for the first successful
        download, or None when every attempt raised a
        ``requests.RequestException``.
    """
    headers = {
        'User-Agent': (
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
            'AppleWebKit/537.36 (KHTML, like Gecko) '
            'Chrome/91.0.4472.124 Safari/537.36'
        )
    }

    for attempt in range(max_retries):
        try:
            response = requests.get(url, headers=headers, timeout=timeout)
            response.raise_for_status()
        except requests.RequestException as e:
            print(f"第 {attempt + 1} 次尝试获取 {url} 失败: {e}")
            # Wait before the next attempt, but not after the final one.
            if attempt < max_retries - 1:
                time.sleep(backoff * (attempt + 1))
            continue

        # Decode using the encoding detected from the body rather than the
        # one guessed from the HTTP headers.
        response.encoding = response.apparent_encoding
        feed = feedparser.parse(response.text)

        # bozo is set when feedparser hit a problem; the result is still
        # returned, only a warning is printed.
        if feed.bozo:
            print(f"警告: 解析可能存在问题: {feed.bozo_exception}")

        return feed

    return None
|
||||
|
||||
|
||||
def fetch_all_rss(urls, timeout=15):
    """Fetch several RSS feeds concurrently on a three-worker thread pool.

    Args:
        urls: iterable of feed URLs.
        timeout: per-request timeout forwarded to ``fetch_single_rss``.

    Returns:
        dict mapping each URL to its parsed feed. URLs whose fetch returned a
        falsy result or raised are left out; exceptions are printed.
    """
    collected = {}
    with ThreadPoolExecutor(max_workers=3) as pool:
        # Remember which URL each submitted task belongs to.
        pending = {}
        for target in urls:
            pending[pool.submit(fetch_single_rss, target, timeout)] = target

        # Handle results in completion order, not submission order.
        for finished in as_completed(pending):
            url = pending[finished]
            try:
                parsed = finished.result()
                if parsed:
                    collected[url] = parsed
            except Exception as e:
                print(f"获取 {url} 时发生异常: {e}")

    return collected
|
||||
|
||||
|
||||
def process_feed_entry(entry, url):
    """Convert one RSS entry into the row dict stored in the database.

    Args:
        entry: mapping-like feed entry; only ``.get`` is used on it.
        url: URL of the feed the entry came from. A falsy value is replaced
            by the placeholder '未知来源'.

    Returns:
        dict with the keys '文章标题', '文章链接', '文章摘要', '发布时间'
        (formatted ``YYYY-MM-DD HH:MM:SS``) and '来源URL'. Title is capped at
        255 characters, link and source URL at 1024, each ending in '...'
        when truncated.
    """
    # Local import: the module only imports `datetime` from `datetime`.
    from datetime import timezone

    # Title: placeholder when missing or empty, truncated to 255 chars.
    title = entry.get('title') or '无标题'
    if len(title) > 255:
        title = title[:252] + '...'

    # Link: placeholder when missing or empty, truncated to 1024 chars.
    link = entry.get('link') or '无链接'
    if len(link) > 1024:
        link = link[:1021] + '...'

    # Summary: prefer the entry's summary; otherwise fall back to the first
    # content item, shortened to 200 characters.
    description = entry.get('summary') or ''
    if not description:
        content_list = entry.get('content') or []
        content = ''
        if content_list:
            first = content_list[0]
            # Accept both mapping-style and attribute-style content items.
            if hasattr(first, 'get'):
                content = first.get('value') or ''
            else:
                content = getattr(first, 'value', '') or ''
        if not content:
            description = '无内容摘要'
        elif len(content) > 200:
            description = content[:200] + '...'
        else:
            description = content

    # Publication time: use the pre-parsed tuple when the entry has one.
    published_parsed = entry.get('published_parsed') or entry.get('updated_parsed')
    if published_parsed:
        entry_time = datetime(*published_parsed[:6])
    else:
        pub_str = entry.get('published') or entry.get('updated') or ''
        try:
            entry_time = datetime.strptime(pub_str, '%a, %d %b %Y %H:%M:%S %z')
        except (TypeError, ValueError):
            # Missing or unparsable date string: fall back to the current time.
            # NOTE(review): this is local time, while the other branches give
            # UTC (feedparser documents *_parsed as UTC) - confirm intended.
            entry_time = datetime.now()
        else:
            # %z gives an aware datetime; convert to naive UTC so the value
            # matches what the *_parsed branch produces.
            entry_time = entry_time.astimezone(timezone.utc).replace(tzinfo=None)

    # Source URL: placeholder when missing, truncated to 1024 chars.
    source_url = url or '未知来源'
    if len(source_url) > 1024:
        source_url = source_url[:1021] + '...'

    return {
        '文章标题': title,
        '文章链接': link,
        '文章摘要': description,
        '发布时间': entry_time.strftime('%Y-%m-%d %H:%M:%S'),
        '来源URL': source_url,
    }
|
||||
|
||||
|
||||
def display_feed_info(feed, last_update=None, url=None):
    """Print the new entries of one feed and hand them to the database writer.

    Args:
        feed: parsed feed exposing ``.entries``; a falsy value is reported and
            skipped.
        last_update: entries published at or before this time are skipped;
            a falsy value disables the filter.
        url: feed URL, used for the log line and recorded with each entry.

    Returns:
        None when ``feed`` is falsy; otherwise the latest publication time
        among the kept entries, or ``last_update`` if none were kept.
    """
    if not feed:
        print("无法显示信息:feed 为 None")
        return None

    print("=" * 80)
    print(f"处理 RSS 源: {url}")

    fresh_rows = []
    new_last_update = last_update
    for i, entry in enumerate(feed.entries, 1):
        entry_data = process_feed_entry(entry, url)
        entry_time = datetime.strptime(entry_data['发布时间'], '%Y-%m-%d %H:%M:%S')

        # Keep only entries strictly newer than the previous run's mark.
        is_new = (not last_update) or entry_time > last_update
        if not is_new:
            continue

        # Track the newest timestamp seen so far.
        if new_last_update is None or entry_time > new_last_update:
            new_last_update = entry_time

        print(f"\n--- 条目 {i} ---")
        print(f"标题: {entry_data['文章标题']}")
        print(f"链接: {entry_data['文章链接']}")
        print(f"摘要: {entry_data['文章摘要'][:100]}...")
        print(f"时间: {entry_data['发布时间']}")

        fresh_rows.append(entry_data)

    # Only touch the database when there is something to store.
    if fresh_rows:
        write_to_database(pd.DataFrame(fresh_rows))

    return new_last_update
|
||||
|
||||
|
||||
def write_to_database(df):
    """Insert the rows of ``df`` into ``table_name`` with ``INSERT IGNORE``.

    Args:
        df: pandas DataFrame with the columns '文章标题', '文章链接',
            '文章摘要', '发布时间' and '来源URL'. An empty frame is reported
            and nothing is written.

    Returns:
        None. Errors are printed rather than raised: a failing row is logged
        and skipped, and a connection-level failure ends the write.
    """
    if df.empty:
        print("没有新数据需要写入")
        return

    print("\n准备写入数据库的数据样例:")
    print(df.iloc[0].to_dict())

    # Stays None when connect() itself fails, so the cleanup below can tell
    # whether there is anything to close.
    conn = None
    try:
        conn = pymysql.connect(**local_DB_Config)
        with conn.cursor() as cursor:
            sql = f"""INSERT IGNORE INTO `{table_name}`
                (`文章标题`, `文章链接`, `文章摘要`, `发布时间`, `来源URL`)
                VALUES (%s, %s, %s, %s, %s)"""

            success_count = 0
            for _, row in df.iterrows():
                try:
                    cursor.execute(sql, (
                        row['文章标题'],
                        row['文章链接'],
                        row['文章摘要'],
                        row['发布时间'],
                        row['来源URL'],
                    ))
                    # Count the rows the statement reports as affected.
                    success_count += cursor.rowcount
                except Exception as e:
                    # Best effort: one bad row must not block the others.
                    print(f"插入记录时出错: {e}")
                    print(f"问题数据: {row.to_dict()}")
                    continue

            # Single commit for the whole batch.
            conn.commit()
            print(f"成功写入 {success_count}/{len(df)} 条记录")

    except Exception as e:
        print("数据库操作失败:", e)
    finally:
        if conn is not None:
            conn.close()
|
||||
|
||||
|
||||
def main():
    """Run one collection pass: verify the DB, fetch the feeds, store new items.

    Stops early when the database check fails. Otherwise loads the previous
    run's timestamp, fetches the configured feeds, processes each one, and
    saves the newest timestamp returned if there is one.
    """
    if not verify_database():
        print("数据库验证失败,程序终止")
        return

    # The four chinanews.com.cn channels that are collected.
    channels = ("finance", "world", "china", "scroll-news")
    rss_urls = [f"https://www.chinanews.com.cn/rss/{channel}.xml" for channel in channels]

    last_update = load_last_update_time()
    if last_update:
        print(f"上次更新时间: {last_update.strftime('%Y-%m-%d %H:%M:%S')}")

    print("\n开始获取RSS源数据...")
    start_time = time.time()
    feeds = fetch_all_rss(rss_urls)
    print(f"获取完成,耗时: {time.time() - start_time:.2f}秒")

    # Keep the most recent timestamp reported by any feed.
    new_last_update = None
    for url, feed in feeds.items():
        latest_for_feed = display_feed_info(feed, last_update, url)
        if not latest_for_feed:
            continue
        if new_last_update is None or latest_for_feed > new_last_update:
            new_last_update = latest_for_feed

    if not new_last_update:
        print("\n没有获取到新的内容")
        return

    save_last_update_time(new_last_update)
    print(f"\n本次最新更新时间: {new_last_update.strftime('%Y-%m-%d %H:%M:%S')}")


# Run the collector only when executed as a script, not on import.
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user