277 lines
8.6 KiB
Python
277 lines
8.6 KiB
Python
import feedparser
|
|
import requests
|
|
from datetime import datetime
|
|
import pandas as pd
|
|
import os
|
|
import pickle
|
|
import time
|
|
from concurrent.futures import ThreadPoolExecutor, as_completed
|
|
import pymysql
|
|
|
|
# Database connection settings.
# Every value can be overridden through an environment variable so that real
# credentials do not have to be committed with the script; the fallbacks are
# the original development defaults, so existing setups keep working.
local_DB_Config = {
    'host': os.environ.get('RSS_DB_HOST', "localhost"),
    'user': os.environ.get('RSS_DB_USER', "root"),
    'password': os.environ.get('RSS_DB_PASSWORD', "123123"),
    'database': os.environ.get('RSS_DB_NAME', "intelligence_system"),
    'charset': 'utf8mb4',
}

# Name of the table the collected articles are written to.
table_name = "collector_rss_subscriptions"
|
|
|
|
|
|
def verify_database():
    """Check that the database is reachable and the target table is usable.

    Three checks run in order: the table named by ``table_name`` exists, its
    columns can be listed (they are printed for inspection), and a probe row
    can be inserted. The probe insert is rolled back afterwards.

    Returns:
        bool: True when every check succeeds; False when the table is missing
        or any step raises (the error is printed, not propagated).
    """
    # Sentinel so the cleanup below can tell whether connect() succeeded
    # without inspecting locals().
    conn = None
    try:
        conn = pymysql.connect(**local_DB_Config)
        with conn.cursor() as cursor:
            # Does the table exist? The name is bound as a parameter rather
            # than spliced into the statement text.
            cursor.execute("SHOW TABLES LIKE %s", (table_name,))
            if not cursor.fetchone():
                print(f"错误: 表 {table_name} 不存在!")
                return False

            # List the table's columns (identifier quoted like in the INSERT).
            cursor.execute(f"DESCRIBE `{table_name}`")
            columns = [col[0] for col in cursor.fetchall()]
            print("表列名:", columns)

            # Probe insert permission with a throw-away row.
            test_sql = f"""INSERT INTO `{table_name}`
                (`文章标题`, `文章链接`, `文章摘要`, `发布时间`, `来源URL`)
                VALUES (%s, %s, %s, %s, %s)"""
            cursor.execute(
                test_sql,
                ('测试标题', 'http://test.com', '测试内容', datetime.now(), '测试来源'),
            )
            # NOTE(review): the rollback only removes the probe row if the
            # table uses a transactional storage engine — confirm.
            conn.rollback()

        print("数据库验证通过!")
        return True
    except Exception as e:
        print("数据库验证失败:", e)
        return False
    finally:
        if conn is not None:
            conn.close()
|
|
|
|
|
|
def load_last_update_time():
    """Load the timestamp stored by the previous run.

    The value is read from ``output/last_update.pkl`` under the current
    working directory.

    Returns:
        datetime | None: the cached timestamp, or None when the cache file is
        missing, cannot be read or unpickled, or does not contain a
        ``datetime`` (callers format the result with ``strftime`` and compare
        it with entry times).
    """
    cache_file = os.path.join(os.getcwd(), 'output', 'last_update.pkl')
    if not os.path.exists(cache_file):
        return None

    try:
        # SECURITY: unpickling can execute arbitrary code. This is acceptable
        # only because the file is a local cache written by this script;
        # never point it at data from an untrusted source.
        with open(cache_file, 'rb') as f:
            cached = pickle.load(f)
    except Exception as e:
        # Unpickling damaged data can raise many different exception types
        # (EOFError on a truncated file, UnpicklingError, ...). A broken cache
        # is treated as "no previous run" instead of aborting the program.
        print(f"警告: 无法读取更新时间缓存 {cache_file}: {e}")
        return None

    if not isinstance(cached, datetime):
        print(f"警告: 更新时间缓存内容无效: {cached!r}")
        return None
    return cached
|
|
|
|
|
|
def save_last_update_time(last_update):
    """Persist the timestamp of the current run.

    The value is pickled to ``output/last_update.pkl`` under the current
    working directory; the ``output`` directory is created if needed.

    Args:
        last_update: the value to store (the caller passes a ``datetime``).
    """
    cache_file = os.path.join(os.getcwd(), 'output', 'last_update.pkl')
    os.makedirs(os.path.dirname(cache_file), exist_ok=True)

    # Write to a sibling temp file and swap it in afterwards, so an
    # interrupted write cannot leave a truncated cache file behind.
    tmp_file = cache_file + '.tmp'
    with open(tmp_file, 'wb') as f:
        pickle.dump(last_update, f)
    os.replace(tmp_file, cache_file)
|
|
|
|
|
|
def fetch_single_rss(url, timeout=15, max_attempts=3, backoff=5):
    """Download and parse a single RSS feed, retrying on request errors.

    Args:
        url: address of the feed.
        timeout: timeout in seconds passed to ``requests.get``.
        max_attempts: how many times the request is tried before giving up.
        backoff: base delay in seconds; after the k-th failed attempt the
            function sleeps ``backoff * k`` seconds before trying again.

    Returns:
        The object returned by ``feedparser.parse`` on success, or None when
        every attempt failed with a ``requests.RequestException``.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }

    for attempt in range(1, max_attempts + 1):
        try:
            response = requests.get(url, headers=headers, timeout=timeout)
            response.raise_for_status()
            # Decode with the encoding guessed from the body rather than the
            # one taken from the HTTP headers.
            response.encoding = response.apparent_encoding
            feed = feedparser.parse(response.text)

            # bozo marks a feed the parser had trouble with; the feed is
            # still returned, only a warning is printed.
            if feed.bozo:
                print(f"警告: 解析可能存在问题: {feed.bozo_exception}")

            return feed
        except requests.RequestException as e:
            print(f"第 {attempt} 次尝试获取 {url} 失败: {e}")
            # No point in sleeping after the final attempt.
            if attempt < max_attempts:
                time.sleep(backoff * attempt)

    return None
|
|
|
|
|
|
def fetch_all_rss(urls, timeout=15):
    """Fetch several RSS feeds concurrently using a pool of three threads.

    Args:
        urls: iterable of feed addresses.
        timeout: per-request timeout forwarded to ``fetch_single_rss``.

    Returns:
        dict: maps each URL to its fetched feed. URLs whose fetch returned a
        falsy result or raised are left out (exceptions are printed).
    """
    results = {}
    with ThreadPoolExecutor(max_workers=3) as pool:
        # Remember which URL each submitted job belongs to.
        pending = {}
        for feed_url in urls:
            job = pool.submit(fetch_single_rss, feed_url, timeout)
            pending[job] = feed_url

        # Collect results in completion order, not submission order.
        for done in as_completed(pending):
            origin = pending[done]
            try:
                parsed = done.result()
                if not parsed:
                    continue
                results[origin] = parsed
            except Exception as e:
                print(f"获取 {origin} 时发生异常: {e}")

    return results
|
|
|
|
|
|
def process_feed_entry(entry, url):
    """Convert one feed entry into the row format written to the database.

    Args:
        entry: mapping-like feed entry; fields are read with ``.get``.
        url: address of the feed the entry came from.

    Returns:
        dict: keys ``文章标题`` (at most 255 chars), ``文章链接`` (at most 1024
        chars), ``文章摘要``, ``发布时间`` (``'%Y-%m-%d %H:%M:%S'`` string) and
        ``来源URL`` (at most 1024 chars).
    """
    # Local import: the module itself only imports ``datetime``.
    from datetime import timezone

    no_summary = '无内容摘要'

    # Title: placeholder when missing or empty, truncated to 255 characters.
    title = entry.get('title') or '无标题'
    if len(title) > 255:
        title = title[:252] + '...'

    # Link: placeholder when missing or empty, truncated to 1024 characters.
    link = entry.get('link') or '无链接'
    if len(link) > 1024:
        link = link[:1021] + '...'

    # Summary: a missing, None or empty summary falls back to the first
    # content item (first 200 characters), then to the placeholder.
    summary = entry.get('summary') or no_summary
    if summary != no_summary:
        description = summary
    else:
        content_list = entry.get('content') or []
        first = content_list[0] if content_list else None
        if first is None:
            content = ''
        elif isinstance(first, dict):
            content = first.get('value') or ''
        else:
            content = getattr(first, 'value', '') or ''
        description = content[:200] + '...' if content else no_summary

    # Publication time: prefer the pre-parsed time tuple.
    published_parsed = entry.get('published_parsed') or entry.get('updated_parsed')
    if published_parsed:
        entry_time = datetime(*published_parsed[:6])
    else:
        pub_str = entry.get('published') or entry.get('updated') or ''
        try:
            entry_time = datetime.strptime(pub_str, '%a, %d %b %Y %H:%M:%S %z')
            # feedparser documents its *_parsed tuples as UTC; convert the
            # offset-aware fallback to naive UTC so both paths agree.
            entry_time = entry_time.astimezone(timezone.utc).replace(tzinfo=None)
        except (ValueError, TypeError):
            # Unparseable or missing date: use the current time.
            # NOTE(review): this is local time, unlike the UTC values above.
            entry_time = datetime.now()

    # Source URL: placeholder when missing, truncated to 1024 characters.
    source_url = url or '未知来源'
    if len(source_url) > 1024:
        source_url = source_url[:1021] + '...'

    return {
        '文章标题': title,
        '文章链接': link,
        '文章摘要': description,
        '发布时间': entry_time.strftime('%Y-%m-%d %H:%M:%S'),
        '来源URL': source_url,
    }
|
|
|
|
|
|
def display_feed_info(feed, last_update=None, url=None):
    """Print the new entries of one feed and hand them to the database writer.

    Args:
        feed: parsed feed object exposing ``entries``; a falsy value is
            reported and skipped.
        last_update: entries published at or before this time are ignored;
            None (or any falsy value) keeps every entry.
        url: address of the feed, used for output and as the source column.

    Returns:
        The newest publication time among the kept entries, or ``last_update``
        unchanged when nothing was kept; None when ``feed`` is falsy.
    """
    if not feed:
        print("无法显示信息:feed 为 None")
        return None

    print("=" * 80)
    print(f"处理 RSS 源: {url}")

    time_format = '%Y-%m-%d %H:%M:%S'
    fresh_rows = []
    newest_seen = last_update

    # The counter covers every entry, including the skipped ones, so the
    # printed number is the entry's position within the feed.
    position = 0
    for entry in feed.entries:
        position += 1
        row = process_feed_entry(entry, url)
        published_at = datetime.strptime(row['发布时间'], time_format)

        already_seen = last_update and published_at <= last_update
        if already_seen:
            continue

        if newest_seen is None or published_at > newest_seen:
            newest_seen = published_at

        report_lines = (
            f"\n--- 条目 {position} ---",
            f"标题: {row['文章标题']}",
            f"链接: {row['文章链接']}",
            f"摘要: {row['文章摘要'][:100]}...",
            f"时间: {row['发布时间']}",
        )
        for line in report_lines:
            print(line)

        fresh_rows.append(row)

    if fresh_rows:
        write_to_database(pd.DataFrame(fresh_rows))

    return newest_seen
|
|
|
|
|
|
def write_to_database(df):
    """Insert the rows of ``df`` into the table named by ``table_name``.

    Rows are inserted one by one with ``INSERT IGNORE``; a row that raises is
    printed and skipped so the remaining rows are still attempted. Errors are
    printed, never propagated.

    Args:
        df: DataFrame with the columns ``文章标题``, ``文章链接``, ``文章摘要``,
            ``发布时间`` and ``来源URL``. An empty frame is a no-op.
    """
    if df.empty:
        print("没有新数据需要写入")
        return

    print("\n准备写入数据库的数据样例:")
    print(df.iloc[0].to_dict())

    # Column order must match the placeholder order in the INSERT below.
    columns = ('文章标题', '文章链接', '文章摘要', '发布时间', '来源URL')

    # Sentinel so the cleanup below can tell whether connect() succeeded
    # without inspecting locals().
    conn = None
    try:
        conn = pymysql.connect(**local_DB_Config)
        with conn.cursor() as cursor:
            sql = f"""INSERT IGNORE INTO `{table_name}`
                (`文章标题`, `文章链接`, `文章摘要`, `发布时间`, `来源URL`)
                VALUES (%s, %s, %s, %s, %s)"""

            success_count = 0
            for _, row in df.iterrows():
                try:
                    cursor.execute(sql, tuple(row[name] for name in columns))
                    # The success counter is driven by the cursor's rowcount
                    # rather than by the number of statements executed.
                    success_count += cursor.rowcount
                except Exception as e:
                    print(f"插入记录时出错: {e}")
                    print(f"问题数据: {row.to_dict()}")
                    continue

            conn.commit()
            print(f"成功写入 {success_count}/{len(df)} 条记录")
    except Exception as e:
        print("数据库操作失败:", e)
    finally:
        if conn is not None:
            conn.close()
|
|
|
|
|
|
def main():
    """Run one collection pass.

    Verifies the database, fetches the configured feeds, processes each feed
    against the timestamp stored by the previous run and stores the newest
    timestamp seen when new content was found.
    """
    if not verify_database():
        print("数据库验证失败,程序终止")
        return

    rss_urls = [
        "https://www.chinanews.com.cn/rss/finance.xml",
        "https://www.chinanews.com.cn/rss/world.xml",
        "https://www.chinanews.com.cn/rss/china.xml",
        "https://www.chinanews.com.cn/rss/scroll-news.xml",
    ]

    last_update = load_last_update_time()
    if last_update:
        print(f"上次更新时间: {last_update.strftime('%Y-%m-%d %H:%M:%S')}")

    print("\n开始获取RSS源数据...")
    start_time = time.time()
    feeds = fetch_all_rss(rss_urls)
    print(f"获取完成,耗时: {time.time() - start_time:.2f}秒")

    # Track the newest timestamp reported by any feed.
    new_last_update = None
    for url, feed in feeds.items():
        current_last_update = display_feed_info(feed, last_update, url)
        if current_last_update and (new_last_update is None or current_last_update > new_last_update):
            new_last_update = current_last_update

    # display_feed_info hands back the old watermark when a feed has nothing
    # new, so a truthy value alone does not mean new content: it must also be
    # later than the stored timestamp.
    has_new_content = bool(new_last_update) and (
        not last_update or new_last_update > last_update
    )

    if has_new_content:
        save_last_update_time(new_last_update)
        print(f"\n本次最新更新时间: {new_last_update.strftime('%Y-%m-%d %H:%M:%S')}")
    else:
        print("\n没有获取到新的内容")


if __name__ == "__main__":
    main()