"""Collect entries from a fixed list of RSS feeds and store them in MySQL.

Workflow (see ``main``):

1. Verify the database connection and the target table.
2. Load the timestamp of the newest entry stored by the previous run.
3. Download all feeds concurrently.
4. Insert entries newer than that timestamp and persist the new timestamp.
"""
import os
import pickle
import time
from concurrent.futures import ThreadPoolExecutor, as_completed
from datetime import datetime, timezone

import feedparser
import pandas as pd
import pymysql
import requests

# Database connection settings.
# NOTE(review): credentials are hard-coded in source; consider loading them
# from the environment or a config file kept out of version control.
local_DB_Config = {
    'host': "localhost",
    'user': "root",
    'password': "123123",
    'database': "intelligence_system",
    'charset': 'utf8mb4'
}

# Target table name.
table_name = "collector_rss_subscriptions"

# Columns written for every entry, in INSERT order.
INSERT_COLUMNS = ('文章标题', '文章链接', '文章摘要', '发布时间', '来源URL')

# Feeds polled by main().
RSS_URLS = [
    "https://www.chinanews.com.cn/rss/finance.xml",
    "https://www.chinanews.com.cn/rss/world.xml",
    "https://www.chinanews.com.cn/rss/china.xml",
    "https://www.chinanews.com.cn/rss/scroll-news.xml"
]

# Format used for the '发布时间' field handed to the database.
_TIME_FORMAT = '%Y-%m-%d %H:%M:%S'


def _build_insert_sql(ignore_duplicates=False):
    """Return the parameterized INSERT statement for ``table_name``."""
    verb = "INSERT IGNORE" if ignore_duplicates else "INSERT"
    column_list = ", ".join(f"`{name}`" for name in INSERT_COLUMNS)
    placeholders = ", ".join(["%s"] * len(INSERT_COLUMNS))
    return (f"{verb} INTO `{table_name}` ({column_list}) "
            f"VALUES ({placeholders})")


def _cache_file_path():
    """Return the path of the pickle file holding the last update time."""
    return os.path.join(os.getcwd(), 'output', 'last_update.pkl')


def verify_database():
    """Check the database connection, the table, and INSERT permission.

    Returns:
        True when the table exists and a test row can be inserted (the test
        insert is rolled back); False on any failure, which is printed.
    """
    conn = None
    try:
        conn = pymysql.connect(**local_DB_Config)
        with conn.cursor() as cursor:
            # Check that the table exists.
            cursor.execute("SHOW TABLES LIKE %s", (table_name,))
            if not cursor.fetchone():
                print(f"错误: 表 {table_name} 不存在!")
                return False

            # Show the table structure.
            cursor.execute(f"DESCRIBE `{table_name}`")
            columns = [col[0] for col in cursor.fetchall()]
            print("表列名:", columns)

            # Check INSERT permission with a throw-away row.
            # NOTE(review): the rollback only undoes the row on a
            # transactional storage engine — confirm the table's engine.
            cursor.execute(
                _build_insert_sql(),
                ('测试标题', 'http://test.com', '测试内容',
                 datetime.now(), '测试来源'),
            )
            conn.rollback()
        print("数据库验证通过!")
        return True
    except Exception as e:
        # Top-level boundary: report any driver/SQL error and signal failure.
        print("数据库验证失败:", e)
        return False
    finally:
        if conn is not None:
            conn.close()


def load_last_update_time():
    """Load the timestamp saved by the previous run.

    Returns:
        The stored ``datetime``, or None when no cache file exists or it
        cannot be read as a ``datetime`` (a warning is printed in that case).
    """
    cache_file = _cache_file_path()
    if not os.path.exists(cache_file):
        return None
    try:
        with open(cache_file, 'rb') as f:
            # NOTE: pickle must only be used on files this program wrote
            # itself; unpickling untrusted data can execute arbitrary code.
            last_update = pickle.load(f)
    except (OSError, EOFError, pickle.UnpicklingError,
            AttributeError, ImportError, IndexError) as e:
        # A corrupt cache must not abort the run; fall back to a full fetch.
        print(f"警告: 无法读取上次更新时间缓存: {e}")
        return None
    if not isinstance(last_update, datetime):
        print(f"警告: 上次更新时间缓存内容无效: {last_update!r}")
        return None
    return last_update


def save_last_update_time(last_update):
    """Persist ``last_update`` so the next run can skip older entries."""
    cache_file = _cache_file_path()
    os.makedirs(os.path.dirname(cache_file), exist_ok=True)
    with open(cache_file, 'wb') as f:
        pickle.dump(last_update, f)


def fetch_single_rss(url, timeout=15, max_attempts=3, backoff=5):
    """Download and parse one RSS feed, retrying on request errors.

    Args:
        url: Feed URL.
        timeout: Per-request timeout in seconds.
        max_attempts: Total number of download attempts.
        backoff: Base delay in seconds; the wait after the n-th failed
            attempt is ``backoff * n``.

    Returns:
        The object returned by ``feedparser.parse``, or None when every
        attempt failed.
    """
    headers = {
        'User-Agent': (
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
            'AppleWebKit/537.36 (KHTML, like Gecko) '
            'Chrome/91.0.4472.124 Safari/537.36'
        )
    }

    for attempt in range(1, max_attempts + 1):
        try:
            response = requests.get(url, headers=headers, timeout=timeout)
            response.raise_for_status()
        except requests.RequestException as e:
            print(f"第 {attempt} 次尝试获取 {url} 失败: {e}")
            if attempt < max_attempts:
                time.sleep(backoff * attempt)
            continue

        # Decode with the encoding detected from the body rather than the
        # one declared in the HTTP headers.
        response.encoding = response.apparent_encoding
        feed = feedparser.parse(response.text)
        if feed.bozo:
            print(f"警告: 解析可能存在问题: {feed.bozo_exception}")
        return feed
    return None


def fetch_all_rss(urls, timeout=15, max_workers=3):
    """Fetch several feeds concurrently with a thread pool.

    Args:
        urls: Iterable of feed URLs.
        timeout: Per-request timeout passed to ``fetch_single_rss``.
        max_workers: Size of the thread pool.

    Returns:
        Dict mapping each successfully fetched URL to its parsed feed.
    """
    feeds = {}
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        future_to_url = {
            executor.submit(fetch_single_rss, url, timeout): url
            for url in urls
        }
        for future in as_completed(future_to_url):
            url = future_to_url[future]
            try:
                feed = future.result()
            except Exception as e:
                # One failing feed must not prevent the others from loading.
                print(f"获取 {url} 时发生异常: {e}")
                continue
            if feed:
                feeds[url] = feed
    return feeds


def _truncate(text, limit):
    """Cut ``text`` to at most ``limit`` characters, ending in '...'."""
    if len(text) > limit:
        return text[:limit - 3] + '...'
    return text


def _entry_datetime(entry):
    """Return the entry's publication time as a naive ``datetime``.

    Prefers the pre-parsed ``published_parsed`` / ``updated_parsed`` fields.
    Otherwise the raw date string is parsed as an RFC 822 date and converted
    to UTC so both paths yield comparable values. If no usable date is
    present the current local time is used.
    """
    parsed = entry.get('published_parsed') or entry.get('updated_parsed')
    if parsed:
        return datetime(*parsed[:6])

    pub_str = entry.get('published') or entry.get('updated') or ''
    try:
        aware = datetime.strptime(pub_str, '%a, %d %b %Y %H:%M:%S %z')
    except (ValueError, TypeError):
        return datetime.now()
    return aware.astimezone(timezone.utc).replace(tzinfo=None)


def process_feed_entry(entry, url):
    """Convert one RSS entry into a dict keyed by the database columns.

    Args:
        entry: A feed entry supporting dict-style ``get``.
        url: URL of the feed the entry came from.

    Returns:
        Dict with the keys '文章标题', '文章链接', '文章摘要', '发布时间'
        (formatted ``YYYY-MM-DD HH:MM:SS``) and '来源URL'. Title is limited
        to 255 characters, link and source URL to 1024.
    """
    title = _truncate(entry.get('title') or '无标题', 255)
    link = _truncate(entry.get('link') or '无链接', 1024)

    # Prefer the summary; fall back to the first content block, then to a
    # placeholder. A missing, None, or empty summary all count as absent.
    summary = entry.get('summary') or ''
    content_list = entry.get('content') or []
    content = (content_list[0].get('value') or '') if content_list else ''
    if summary:
        description = summary
    elif content:
        description = content[:200] + '...'
    else:
        description = '无内容摘要'

    entry_time = _entry_datetime(entry)
    source_url = _truncate(url or '未知来源', 1024)

    return {
        '文章标题': title,
        '文章链接': link,
        '文章摘要': description,
        '发布时间': entry_time.strftime(_TIME_FORMAT),
        '来源URL': source_url
    }


def display_feed_info(feed, last_update=None, url=None):
    """Print the new entries of one feed and write them to the database.

    Args:
        feed: Parsed feed with an ``entries`` attribute, or None.
        last_update: Entries at or before this time are skipped; None means
            every entry is treated as new.
        url: Feed URL, used for display and as the entry source.

    Returns:
        The newest entry time seen once the entries were written; the
        unchanged ``last_update`` when there was nothing new or the write
        failed (so the entries are retried next run); None when ``feed`` is
        empty.
    """
    if not feed:
        print("无法显示信息:feed 为 None")
        return None

    print("=" * 80)
    print(f"处理 RSS 源: {url}")

    data_list = []
    new_last_update = last_update

    for i, entry in enumerate(feed.entries, 1):
        entry_data = process_feed_entry(entry, url)
        # Compare at the same one-second resolution that gets stored.
        entry_time = datetime.strptime(entry_data['发布时间'], _TIME_FORMAT)

        if last_update and entry_time <= last_update:
            continue

        if new_last_update is None or entry_time > new_last_update:
            new_last_update = entry_time

        print(f"\n--- 条目 {i} ---")
        print(f"标题: {entry_data['文章标题']}")
        print(f"链接: {entry_data['文章链接']}")
        print(f"摘要: {entry_data['文章摘要'][:100]}...")
        print(f"时间: {entry_data['发布时间']}")

        data_list.append(entry_data)

    if not data_list:
        return last_update

    if not write_to_database(pd.DataFrame(data_list)):
        # Do not advance the watermark past entries that were never stored.
        return last_update

    return new_last_update


def write_to_database(df):
    """Insert the rows of ``df`` into the table, ignoring duplicates.

    Rows that fail individually are reported and skipped; the remaining
    rows are still committed.

    Args:
        df: DataFrame containing the columns listed in ``INSERT_COLUMNS``.

    Returns:
        True when there was nothing to write or the batch was committed;
        False when the database operation as a whole failed.
    """
    if df.empty:
        print("没有新数据需要写入")
        return True

    print("\n准备写入数据库的数据样例:")
    print(df.iloc[0].to_dict())

    conn = None
    try:
        conn = pymysql.connect(**local_DB_Config)
        with conn.cursor() as cursor:
            sql = _build_insert_sql(ignore_duplicates=True)
            rows = df[list(INSERT_COLUMNS)].itertuples(index=False, name=None)

            success_count = 0
            for row in rows:
                try:
                    cursor.execute(sql, row)
                except Exception as e:
                    # Best effort: report the bad row and keep going.
                    print(f"插入记录时出错: {e}")
                    print(f"问题数据: {dict(zip(INSERT_COLUMNS, row))}")
                    continue
                # rowcount is 0 when INSERT IGNORE skipped a duplicate.
                success_count += cursor.rowcount

            conn.commit()
            print(f"成功写入 {success_count}/{len(df)} 条记录")
        return True
    except Exception as e:
        print("数据库操作失败:", e)
        return False
    finally:
        if conn is not None:
            conn.close()


def main():
    """Run one collection cycle over ``RSS_URLS``."""
    if not verify_database():
        print("数据库验证失败,程序终止")
        return

    last_update = load_last_update_time()
    if last_update:
        print(f"上次更新时间: {last_update.strftime(_TIME_FORMAT)}")

    print("\n开始获取RSS源数据...")
    start_time = time.time()
    feeds = fetch_all_rss(RSS_URLS)
    print(f"获取完成,耗时: {time.time() - start_time:.2f}秒")

    # Track the newest entry time across all feeds.
    new_last_update = None
    for url, feed in feeds.items():
        current_last_update = display_feed_info(feed, last_update, url)
        if current_last_update and (new_last_update is None
                                    or current_last_update > new_last_update):
            new_last_update = current_last_update

    # Only persist and report when the watermark actually moved forward.
    if new_last_update and (last_update is None
                            or new_last_update > last_update):
        save_last_update_time(new_last_update)
        print(f"\n本次最新更新时间: {new_last_update.strftime(_TIME_FORMAT)}")
    else:
        print("\n没有获取到新的内容")


if __name__ == "__main__":
    main()