minio对象存储数据库链接

2025-09-16 17:35:53 +08:00
parent 8e92acf5d5
commit 9afa9d2e58
10 changed files with 7291 additions and 347 deletions
@@ -0,0 +1,277 @@
+import feedparser
+import requests
+from datetime import datetime
+import pandas as pd
+import os
+import pickle
+import time
+from concurrent.futures import ThreadPoolExecutor, as_completed
+import pymysql
+
+# Database connection settings.
+# Every value can be overridden through an environment variable so that real
+# credentials do not have to live in source control; the fallbacks are the
+# original local-development values, so behaviour is unchanged when the
+# variables are not set.
+local_DB_Config = {
+    'host': os.environ.get('RSS_DB_HOST', "localhost"),
+    'user': os.environ.get('RSS_DB_USER', "root"),
+    'password': os.environ.get('RSS_DB_PASSWORD', "123123"),
+    'database': os.environ.get('RSS_DB_NAME', "intelligence_system"),
+    'charset': 'utf8mb4'
+}
+
+# Table that receives the collected RSS entries.
+table_name = "collector_rss_subscriptions"
+
+
+def verify_database():
+    """Check that the database is reachable and the target table is usable.
+
+    Connects with ``local_DB_Config``, confirms that ``table_name`` exists,
+    prints its column names, and runs a probe INSERT that is rolled back.
+
+    Returns:
+        bool: True when every check succeeds; False otherwise. Errors are
+        printed, never raised.
+    """
+    # Sentinel so the finally clause knows whether connect() succeeded.
+    conn = None
+    try:
+        conn = pymysql.connect(**local_DB_Config)
+        with conn.cursor() as cursor:
+            # Does the table exist?
+            cursor.execute(f"SHOW TABLES LIKE '{table_name}'")
+            if not cursor.fetchone():
+                print(f"错误: 表 {table_name} 不存在!")
+                return False
+
+            # List the columns. The identifier is backtick-quoted, matching
+            # the INSERT statement below.
+            cursor.execute(f"DESCRIBE `{table_name}`")
+            columns = [col[0] for col in cursor.fetchall()]
+            print("表列名:", columns)
+
+            # Probe the INSERT privilege with a throw-away row, then undo it.
+            # NOTE(review): rollback only removes the row on a transactional
+            # storage engine — confirm the table's engine.
+            test_sql = f"""INSERT INTO `{table_name}` 
+                         (`文章标题`, `文章链接`, `文章摘要`, `发布时间`, `来源URL`)
+                         VALUES (%s, %s, %s, %s, %s)"""
+            cursor.execute(test_sql, ('测试标题', 'http://test.com', '测试内容', datetime.now(), '测试来源'))
+            conn.rollback()
+
+        print("数据库验证通过!")
+        return True
+    except Exception as e:
+        print("数据库验证失败:", e)
+        return False
+    finally:
+        if conn is not None:
+            conn.close()
+
+
+def load_last_update_time():
+    """Load the timestamp persisted by the previous run.
+
+    Reads ``output/last_update.pkl`` under the current working directory.
+
+    Returns:
+        datetime | None: the stored timestamp, or None when the cache file is
+        missing, unreadable, corrupted, or does not contain a ``datetime``.
+    """
+    cache_file = os.path.join(os.getcwd(), 'output', 'last_update.pkl')
+    if not os.path.exists(cache_file):
+        return None
+
+    try:
+        with open(cache_file, 'rb') as f:
+            # NOTE: unpickling can execute arbitrary code, so this file must
+            # only ever be produced by this program (save_last_update_time).
+            last_update = pickle.load(f)
+    except (OSError, EOFError, pickle.UnpicklingError, AttributeError,
+            ImportError, IndexError, ValueError) as e:
+        # A truncated or corrupt cache should not abort the whole run;
+        # behave as if there were no previous run.
+        print(f"警告: 无法读取上次更新时间缓存 {cache_file}: {e}")
+        return None
+
+    # Callers compare the value with datetimes and call .strftime() on it.
+    if not isinstance(last_update, datetime):
+        print(f"警告: 上次更新时间缓存内容无效，已忽略: {cache_file}")
+        return None
+
+    return last_update
+
+
+def save_last_update_time(last_update):
+    """Persist ``last_update`` so the next run can skip already-seen entries.
+
+    The value is pickled to ``output/last_update.pkl`` under the current
+    working directory; the ``output`` directory is created when missing.
+
+    Args:
+        last_update: the value to store (the caller passes a datetime).
+    """
+    output_dir = os.path.join(os.getcwd(), 'output')
+    target_path = os.path.join(output_dir, 'last_update.pkl')
+
+    # Ensure the directory exists before opening the file for writing.
+    os.makedirs(output_dir, exist_ok=True)
+
+    with open(target_path, 'wb') as handle:
+        pickle.dump(last_update, handle)
+
+
+def fetch_single_rss(url, timeout=15, max_attempts=3, backoff=5):
+    """Download and parse a single RSS feed, retrying on network errors.
+
+    Args:
+        url: address of the feed.
+        timeout: per-request timeout in seconds, passed to ``requests.get``.
+        max_attempts: how many times the download is tried before giving up.
+        backoff: base delay in seconds; the wait after the k-th failed
+            attempt is ``backoff * k``.
+
+    Returns:
+        The object returned by ``feedparser.parse`` on success, or None when
+        every attempt failed with a ``requests.RequestException``.
+    """
+    headers = {
+        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
+    }
+
+    for attempt in range(1, max_attempts + 1):
+        try:
+            response = requests.get(url, headers=headers, timeout=timeout)
+            response.raise_for_status()
+            # Decode with the encoding detected from the body itself.
+            response.encoding = response.apparent_encoding
+            feed = feedparser.parse(response.text)
+        except requests.RequestException as e:
+            print(f"第 {attempt} 次尝试获取 {url} 失败: {e}")
+            # No point sleeping after the final attempt.
+            if attempt < max_attempts:
+                time.sleep(backoff * attempt)
+            continue
+
+        # A set bozo flag is reported as a warning only; the feed is still
+        # returned to the caller.
+        if feed.bozo:
+            print(f"警告: 解析可能存在问题: {feed.bozo_exception}")
+
+        return feed
+
+    return None
+
+
+def fetch_all_rss(urls, timeout=15):
+    """Fetch several RSS feeds concurrently on a three-worker thread pool.
+
+    Args:
+        urls: iterable of feed URLs.
+        timeout: per-request timeout forwarded to ``fetch_single_rss``.
+
+    Returns:
+        dict: maps each URL to its parsed feed. URLs whose fetch raised or
+        returned a falsy result are left out.
+    """
+    results = {}
+    with ThreadPoolExecutor(max_workers=3) as pool:
+        # Remember which URL each submitted task belongs to.
+        pending = {}
+        for feed_url in urls:
+            task = pool.submit(fetch_single_rss, feed_url, timeout)
+            pending[task] = feed_url
+
+        # Collect results in completion order.
+        for finished in as_completed(pending):
+            origin = pending[finished]
+            try:
+                parsed = finished.result()
+                if not parsed:
+                    continue
+                results[origin] = parsed
+            except Exception as e:
+                print(f"获取 {origin} 时发生异常: {e}")
+
+    return results
+
+
+def process_feed_entry(entry, url):
+    """Normalise one RSS entry into the column dict stored in the database.
+
+    Args:
+        entry: mapping-like feed entry, read through ``.get`` (for example an
+            entry produced by feedparser).
+        url: URL of the feed the entry came from; may be empty or None.
+
+    Returns:
+        dict: keys ``文章标题``, ``文章链接``, ``文章摘要``, ``发布时间``
+        (formatted ``%Y-%m-%d %H:%M:%S``) and ``来源URL``.
+    """
+    # Local import: the module itself only imports ``datetime`` from datetime.
+    from datetime import timezone
+
+    # Title: placeholder when missing/empty, capped at 255 characters.
+    title = entry.get('title') or '无标题'
+    if len(title) > 255:
+        title = title[:252] + '...'
+
+    # Link: placeholder when missing/empty, capped at 1024 characters.
+    link = entry.get('link') or '无链接'
+    if len(link) > 1024:
+        link = link[:1021] + '...'
+
+    # Summary: prefer the entry's summary. An empty or None summary counts as
+    # missing, so the first content block is used as the fallback.
+    summary = entry.get('summary') or ''
+    if summary:
+        description = summary
+    else:
+        content_list = entry.get('content') or []
+        first = content_list[0] if content_list else None
+        if first is None:
+            content = ''
+        elif isinstance(first, dict):
+            content = first.get('value') or ''
+        else:
+            content = getattr(first, 'value', '') or ''
+        description = content[:200] + '...' if content else '无内容摘要'
+
+    # Publication time. feedparser documents its *_parsed tuples as UTC, so
+    # the fallbacks below are normalised to naive UTC as well; otherwise
+    # timestamps from different branches are not comparable.
+    published_parsed = entry.get('published_parsed') or entry.get('updated_parsed')
+    if published_parsed:
+        entry_time = datetime(*published_parsed[:6])
+    else:
+        pub_str = entry.get('published') or entry.get('updated') or ''
+        try:
+            aware = datetime.strptime(pub_str, '%a, %d %b %Y %H:%M:%S %z')
+            entry_time = aware.astimezone(timezone.utc).replace(tzinfo=None)
+        except (ValueError, TypeError):
+            # Missing or unparseable date string: fall back to "now".
+            entry_time = datetime.now(timezone.utc).replace(tzinfo=None)
+
+    # Source URL: placeholder when missing, capped at 1024 characters.
+    source_url = url or '未知来源'
+    if len(source_url) > 1024:
+        source_url = source_url[:1021] + '...'
+
+    return {
+        '文章标题': title,
+        '文章链接': link,
+        '文章摘要': description,
+        '发布时间': entry_time.strftime('%Y-%m-%d %H:%M:%S'),
+        '来源URL': source_url
+    }
+
+
+def display_feed_info(feed, last_update=None, url=None):
+    """Print the new entries of one feed and hand them to the database writer.
+
+    Args:
+        feed: parsed feed exposing ``.entries``; a falsy value is reported
+            and skipped.
+        last_update: entries published at or before this datetime are
+            ignored; a falsy value disables the filter.
+        url: source URL of the feed, used for display and stored per entry.
+
+    Returns:
+        The newest publication time seen among the accepted entries, or
+        ``last_update`` unchanged when nothing newer was found. None when
+        ``feed`` is falsy.
+    """
+    if not feed:
+        print("无法显示信息：feed 为 None")
+        return None
+
+    print("=" * 80)
+    print(f"处理 RSS 源: {url}")
+
+    newest_seen = last_update
+    fresh_rows = []
+    position = 0
+
+    for item in feed.entries:
+        position += 1  # 1-based index, counted over all entries
+        record = process_feed_entry(item, url)
+        published_at = datetime.strptime(record['发布时间'], '%Y-%m-%d %H:%M:%S')
+
+        # Keep the entry only if there is no watermark or it is strictly newer.
+        is_new = (not last_update) or published_at > last_update
+        if not is_new:
+            continue
+
+        if newest_seen is None or published_at > newest_seen:
+            newest_seen = published_at
+
+        report_lines = [
+            f"\n--- 条目 {position} ---",
+            f"标题: {record['文章标题']}",
+            f"链接: {record['文章链接']}",
+            f"摘要: {record['文章摘要'][:100]}...",
+            f"时间: {record['发布时间']}",
+        ]
+        print("\n".join(report_lines))
+
+        fresh_rows.append(record)
+
+    if fresh_rows:
+        write_to_database(pd.DataFrame(fresh_rows))
+
+    return newest_seen
+
+
+def write_to_database(df):
+    """Insert every row of ``df`` into the subscriptions table.
+
+    Rows are written one by one with ``INSERT IGNORE``. A row that fails is
+    reported and skipped. All successful rows are committed together at the
+    end. Connection-level failures are printed, not raised.
+
+    Args:
+        df: DataFrame with columns ``文章标题``, ``文章链接``, ``文章摘要``,
+            ``发布时间`` and ``来源URL``.
+    """
+    if df.empty:
+        print("没有新数据需要写入")
+        return
+
+    print("\n准备写入数据库的数据样例:")
+    print(df.iloc[0].to_dict())
+
+    # Order in which the row values are bound to the SQL placeholders.
+    field_order = ('文章标题', '文章链接', '文章摘要', '发布时间', '来源URL')
+
+    conn = None
+    try:
+        conn = pymysql.connect(**local_DB_Config)
+        with conn.cursor() as cur:
+            sql = f"""INSERT IGNORE INTO `{table_name}` 
+                     (`文章标题`, `文章链接`, `文章摘要`, `发布时间`, `来源URL`)
+                     VALUES (%s, %s, %s, %s, %s)"""
+
+            inserted = 0
+            for _, record in df.iterrows():
+                try:
+                    cur.execute(sql, tuple(record[name] for name in field_order))
+                    # rowcount is added as-is, so the total reflects rows
+                    # the statement actually affected.
+                    inserted += cur.rowcount
+                except Exception as e:
+                    print(f"插入记录时出错: {e}")
+                    print(f"问题数据: {record.to_dict()}")
+
+            conn.commit()
+            print(f"成功写入 {inserted}/{len(df)} 条记录")
+
+    except Exception as e:
+        print("数据库操作失败:", e)
+    finally:
+        if conn is not None:
+            conn.close()
+
+
+def main():
+    """Run one collection pass: verify the DB, fetch feeds, store new entries.
+
+    The newest publication time seen is persisted only when it is strictly
+    newer than the one loaded at start-up.
+    """
+    if not verify_database():
+        print("数据库验证失败，程序终止")
+        return
+
+    rss_urls = [
+        "https://www.chinanews.com.cn/rss/finance.xml",
+        "https://www.chinanews.com.cn/rss/world.xml",
+        "https://www.chinanews.com.cn/rss/china.xml",
+        "https://www.chinanews.com.cn/rss/scroll-news.xml"
+    ]
+
+    last_update = load_last_update_time()
+    if last_update:
+        print(f"上次更新时间: {last_update.strftime('%Y-%m-%d %H:%M:%S')}")
+
+    print("\n开始获取RSS源数据...")
+    start_time = time.time()
+    feeds = fetch_all_rss(rss_urls)
+    print(f"获取完成，耗时: {time.time() - start_time:.2f}秒")
+
+    # Track the newest timestamp reported by any feed.
+    new_last_update = None
+    for url, feed in feeds.items():
+        current_last_update = display_feed_info(feed, last_update, url)
+        if current_last_update and (new_last_update is None or current_last_update > new_last_update):
+            new_last_update = current_last_update
+
+    # display_feed_info returns the old watermark unchanged when a feed has
+    # nothing new, so only a strictly newer value means new content arrived.
+    has_new_content = bool(new_last_update) and (
+        not last_update or new_last_update > last_update
+    )
+    if has_new_content:
+        save_last_update_time(new_last_update)
+        print(f"\n本次最新更新时间: {new_last_update.strftime('%Y-%m-%d %H:%M:%S')}")
+    else:
+        print("\n没有获取到新的内容")
+
+
+# Run the collector only when this file is executed as a script.
+if __name__ == "__main__":
+    main()