# intelligence_system/collectors/news_api.py

import feedparser
import requests
from datetime import datetime
import pandas as pd
import os
import pickle
import time
from concurrent.futures import ThreadPoolExecutor, as_completed
import pymysql

# Database connection settings; unpacked as keyword arguments into
# pymysql.connect() by the functions below.
# NOTE(review): the credentials are hard-coded in source; consider loading
# them from the environment or from a config file kept out of version control.
local_DB_Config = {
    'host': "localhost",
    'user': "root",
    'password': "123123",
    'database': "intelligence_system",
    'charset': 'utf8mb4'
}

# Name of the table that collected RSS articles are inserted into.
table_name = "collector_rss_subscriptions"


def verify_database():
    """Check that the database is reachable and the target table is usable.

    Connects with ``local_DB_Config``, confirms that ``table_name`` exists,
    prints its column names, and executes a trial INSERT that is rolled back.

    Returns:
        bool: True when every check succeeds; False when the table is missing
        or any step raises.
    """
    conn = None
    try:
        conn = pymysql.connect(**local_DB_Config)
        with conn.cursor() as cursor:
            # Step 1: the table must exist.
            cursor.execute(f"SHOW TABLES LIKE '{table_name}'")
            table_row = cursor.fetchone()
            if not table_row:
                print(f"错误: 表 {table_name} 不存在!")
                return False

            # Step 2: report the column names (first field of each DESCRIBE row).
            cursor.execute(f"DESCRIBE {table_name}")
            columns = []
            for column_info in cursor.fetchall():
                columns.append(column_info[0])
            print("表列名:", columns)

            # Step 3: trial insert to confirm the statement is accepted; the
            # rollback below is meant to discard the sample row again.
            test_sql = f"""INSERT INTO `{table_name}`
                         (`文章标题`, `文章链接`, `文章摘要`, `发布时间`, `来源URL`)
                         VALUES (%s, %s, %s, %s, %s)"""
            sample_row = ('测试标题', 'http://test.com', '测试内容', datetime.now(), '测试来源')
            cursor.execute(test_sql, sample_row)
            conn.rollback()

        print("数据库验证通过!")
        return True
    except Exception as e:
        print("数据库验证失败:", e)
        return False
    finally:
        # conn stays None when connecting itself failed.
        if conn is not None:
            conn.close()


def load_last_update_time():
    """Load the timestamp stored by the previous run.

    Reads ``output/last_update.pkl`` under the current working directory.

    Returns:
        datetime | None: The stored timestamp, or None when the cache file is
        missing, cannot be read or unpickled, or does not hold a ``datetime``.
    """
    cache_file = os.path.join(os.getcwd(), 'output', 'last_update.pkl')
    if not os.path.exists(cache_file):
        return None

    try:
        # NOTE: unpickling can execute arbitrary code, so this file must only
        # ever be produced by this program (see save_last_update_time).
        with open(cache_file, 'rb') as f:
            last_update = pickle.load(f)
    except (OSError, EOFError, pickle.UnpicklingError, AttributeError,
            ImportError, IndexError, ValueError, TypeError) as e:
        # A truncated or corrupt cache should not abort the run; behave as if
        # there had been no previous run.
        print(f"警告: 无法读取上次更新时间缓存 {cache_file}: {e}")
        return None

    # Callers compare the value with datetimes and call strftime() on it.
    if not isinstance(last_update, datetime):
        print(f"警告: 缓存 {cache_file} 中的内容不是有效的时间，已忽略")
        return None

    return last_update


def save_last_update_time(last_update):
    """Persist the timestamp of this run for the next one.

    Pickles ``last_update`` to ``output/last_update.pkl`` under the current
    working directory, creating the ``output`` directory when it is missing.

    Args:
        last_update: Value to store; ``main`` passes a ``datetime``.
    """
    output_dir = os.path.join(os.getcwd(), 'output')
    os.makedirs(output_dir, exist_ok=True)

    target_path = os.path.join(output_dir, 'last_update.pkl')
    with open(target_path, 'wb') as handle:
        pickle.dump(last_update, handle)


def fetch_single_rss(url, timeout=15):
    """Download one RSS feed and parse it, retrying on request failures.

    Up to three GET attempts are made. After a failed first or second attempt
    the function sleeps 5 or 10 seconds respectively before trying again.

    Args:
        url: Address of the RSS feed.
        timeout: Timeout in seconds handed to ``requests.get``.

    Returns:
        The object returned by ``feedparser.parse`` on success, or None when
        all three attempts raise ``requests.RequestException``.
    """
    max_attempts = 3
    request_headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }

    for attempt_no in range(1, max_attempts + 1):
        try:
            resp = requests.get(url, headers=request_headers, timeout=timeout)
            resp.raise_for_status()
            # Decode the body with the encoding requests detects from the
            # content instead of the one taken from the response headers.
            resp.encoding = resp.apparent_encoding
            parsed = feedparser.parse(resp.text)

            # bozo is feedparser's flag for a feed that was not well-formed.
            if parsed.bozo:
                print(f"警告: 解析可能存在问题: {parsed.bozo_exception}")

            return parsed

        except requests.RequestException as e:
            print(f"第 {attempt_no} 次尝试获取 {url} 失败: {e}")
            # Linear back-off; no pause after the final attempt.
            if attempt_no < max_attempts:
                time.sleep(5 * attempt_no)

    return None


def fetch_all_rss(urls, timeout=15):
    """Fetch several RSS feeds concurrently on a pool of three worker threads.

    Args:
        urls: Iterable of feed URLs.
        timeout: Per-request timeout forwarded to ``fetch_single_rss``.

    Returns:
        dict: Maps each URL to its parsed feed. URLs whose fetch returned a
        falsy result or raised an exception are left out.
    """
    results = {}
    with ThreadPoolExecutor(max_workers=3) as pool:
        # Remember which URL each future belongs to.
        pending = {}
        for feed_url in urls:
            pending[pool.submit(fetch_single_rss, feed_url, timeout)] = feed_url

        # Collect results in completion order, not submission order.
        for finished in as_completed(pending):
            url = pending[finished]
            try:
                parsed = finished.result()
                if parsed:
                    results[url] = parsed
            except Exception as e:
                print(f"获取 {url} 时发生异常: {e}")

    return results


def _truncate(text, max_len):
    """Return ``text`` limited to ``max_len`` characters, ending in '...' if cut."""
    if len(text) > max_len:
        return text[:max_len - 3] + '...'
    return text


def _entry_content(entry):
    """Return the text of the entry's first ``content`` item, or '' if absent."""
    content_list = entry.get('content') or []
    if not content_list:
        return ''
    first = content_list[0]
    # Accept both mapping-style items and objects exposing a ``value`` attribute.
    if isinstance(first, dict):
        value = first.get('value')
    else:
        value = getattr(first, 'value', None)
    return value or ''


def _entry_time(entry):
    """Return the entry's publication time as a naive datetime."""
    parsed = entry.get('published_parsed') or entry.get('updated_parsed')
    if parsed:
        try:
            return datetime(*parsed[:6])
        except (TypeError, ValueError):
            # Out-of-range fields (e.g. month 13); try the raw string instead.
            pass

    pub_str = entry.get('published') or entry.get('updated') or ''
    try:
        stamp = datetime.strptime(pub_str, '%a, %d %b %Y %H:%M:%S %z')
    except (TypeError, ValueError):
        # NOTE(review): this fallback is local wall-clock time, whereas
        # feedparser documents the *_parsed tuples as UTC — confirm which
        # is wanted for entries without a usable date.
        return datetime.now()
    # Normalise the offset-aware value to UTC so that it is comparable with
    # the values built from the *_parsed tuples above.
    return datetime(*stamp.utctimetuple()[:6])


def process_feed_entry(entry, url):
    """Convert one RSS entry into the flat record stored in the database.

    Args:
        entry: Mapping-like feed entry (anything with ``.get``), such as the
            items of ``feed.entries``.
        url: URL of the feed the entry came from; may be None.

    Returns:
        dict: Keys ``文章标题`` (title, at most 255 characters), ``文章链接``
        (link, at most 1024), ``文章摘要`` (summary, or the first 200
        characters of the content followed by '...', or a placeholder),
        ``发布时间`` (``'%Y-%m-%d %H:%M:%S'`` string) and ``来源URL`` (source
        URL, at most 1024). Missing or empty fields are replaced by
        placeholder texts.
    """
    no_summary = '无内容摘要'

    title = _truncate(entry.get('title') or '无标题', 255)
    link = _truncate(entry.get('link') or '无链接', 1024)

    # An empty or None summary falls back to the content, just like a missing one.
    summary = entry.get('summary')
    if summary and summary != no_summary:
        description = summary
    else:
        content = _entry_content(entry)
        description = content[:200] + '...' if content else no_summary

    entry_time = _entry_time(entry)
    source_url = _truncate(url or '未知来源', 1024)

    return {
        '文章标题': title,
        '文章链接': link,
        '文章摘要': description,
        '发布时间': entry_time.strftime('%Y-%m-%d %H:%M:%S'),
        '来源URL': source_url
    }


def display_feed_info(feed, last_update=None, url=None):
    """Print the new entries of one feed and hand them to the database writer.

    Entries whose publication time is not later than ``last_update`` are
    skipped. The remaining ones are printed, gathered into a DataFrame and
    passed to ``write_to_database``.

    Args:
        feed: Parsed feed exposing an ``entries`` sequence; a falsy value is
            reported and ignored.
        last_update: ``datetime`` cutoff from the previous run, or None to
            accept every entry.
        url: URL of the feed, used in the log output and stored per entry.

    Returns:
        The latest publication time among ``last_update`` and the accepted
        entries, or None when ``feed`` is falsy.
    """
    if not feed:
        print("无法显示信息：feed 为 None")
        return None

    print("=" * 80)
    print(f"处理 RSS 源: {url}")

    time_format = '%Y-%m-%d %H:%M:%S'
    newest_seen = last_update
    fresh_records = []

    # position counts every entry of the feed, including the skipped ones.
    for position, raw_entry in enumerate(feed.entries, start=1):
        record = process_feed_entry(raw_entry, url)
        published = datetime.strptime(record['发布时间'], time_format)

        already_collected = last_update and published <= last_update
        if already_collected:
            continue

        if newest_seen is None or published > newest_seen:
            newest_seen = published

        print(f"\n--- 条目 {position} ---")
        print(f"标题: {record['文章标题']}")
        print(f"链接: {record['文章链接']}")
        print(f"摘要: {record['文章摘要'][:100]}...")
        print(f"时间: {record['发布时间']}")

        fresh_records.append(record)

    if fresh_records:
        write_to_database(pd.DataFrame(fresh_records))

    return newest_seen


def write_to_database(df):
    """Insert the rows of ``df`` into the articles table.

    Rows are inserted one at a time with ``INSERT IGNORE``. A row that raises
    is reported and skipped; the transaction is committed once after the
    loop. Connection-level failures are printed, not raised.

    Args:
        df: DataFrame with the columns ``文章标题``, ``文章链接``, ``文章摘要``,
            ``发布时间`` and ``来源URL``. An empty frame is a no-op.
    """
    if df.empty:
        print("没有新数据需要写入")
        return

    print("\n准备写入数据库的数据样例:")
    print(df.iloc[0].to_dict())

    # Column order must match the placeholder order of the INSERT statement.
    column_order = ('文章标题', '文章链接', '文章摘要', '发布时间', '来源URL')

    conn = None
    try:
        conn = pymysql.connect(**local_DB_Config)
        with conn.cursor() as cursor:
            sql = f"""INSERT IGNORE INTO `{table_name}`
                     (`文章标题`, `文章链接`, `文章摘要`, `发布时间`, `来源URL`)
                     VALUES (%s, %s, %s, %s, %s)"""

            inserted = 0
            for _, record in df.iterrows():
                try:
                    params = tuple(record[name] for name in column_order)
                    cursor.execute(sql, params)
                    # Add the row count reported by the cursor for this statement.
                    inserted += cursor.rowcount
                except Exception as e:
                    print(f"插入记录时出错: {e}")
                    print(f"问题数据: {record.to_dict()}")

            conn.commit()
            print(f"成功写入 {inserted}/{len(df)} 条记录")

    except Exception as e:
        print("数据库操作失败:", e)
    finally:
        # conn stays None when connecting itself failed.
        if conn is not None:
            conn.close()


def main():
    """Run one collection cycle.

    Verifies the database, loads the cutoff stored by the previous run,
    fetches the configured RSS feeds concurrently, stores entries newer than
    the cutoff, and saves the new cutoff when it has advanced.
    """
    if not verify_database():
        print("数据库验证失败，程序终止")
        return

    rss_urls = [
        "https://www.chinanews.com.cn/rss/finance.xml",
        "https://www.chinanews.com.cn/rss/world.xml",
        "https://www.chinanews.com.cn/rss/china.xml",
        "https://www.chinanews.com.cn/rss/scroll-news.xml"
    ]

    last_update = load_last_update_time()
    if last_update:
        print(f"上次更新时间: {last_update.strftime('%Y-%m-%d %H:%M:%S')}")

    print("\n开始获取RSS源数据...")
    start_time = time.time()
    feeds = fetch_all_rss(rss_urls)
    print(f"获取完成，耗时: {time.time() - start_time:.2f}秒")

    # NOTE(review): one cutoff is shared by all feeds, so an entry that is
    # older than the newest entry seen in any feed is skipped on the next
    # run — confirm this is intended, or track a cutoff per feed.
    new_last_update = None
    for url, feed in feeds.items():
        current_last_update = display_feed_info(feed, last_update, url)
        if current_last_update and (new_last_update is None or current_last_update > new_last_update):
            new_last_update = current_last_update

    # display_feed_info hands back the unchanged cutoff when a feed has
    # nothing new, so only a value that differs from it means new content.
    if new_last_update and new_last_update != last_update:
        save_last_update_time(new_last_update)
        print(f"\n本次最新更新时间: {new_last_update.strftime('%Y-%m-%d %H:%M:%S')}")
    else:
        print("\n没有获取到新的内容")


# Run one collection cycle only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()