Uploading the AI Crawler System: MindSpider
This commit is contained in:
@@ -0,0 +1,265 @@
|
||||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
MindSpider AI爬虫项目 - 数据库管理工具
|
||||
提供数据库状态查看、数据统计、清理等功能
|
||||
"""
|
||||
|
||||
import os
|
||||
import sys
|
||||
import pymysql
|
||||
import argparse
|
||||
from pathlib import Path
|
||||
from datetime import datetime, timedelta
|
||||
|
||||
# Add the project root (the parent of this script's directory) to the import
# path so that the project-level ``config`` module can be imported below.
project_root = Path(__file__).parent.parent
sys.path.append(str(project_root))

try:
    import config
except ImportError:
    # The database settings used by this tool are read from config.py, so
    # there is nothing useful to do without it: report and exit with status 1.
    print("错误: 无法导入config.py配置文件")
    sys.exit(1)
|
||||
|
||||
class DatabaseManager:
    """Inspect and prune the MindSpider MySQL database from the command line.

    A connection is opened on construction using the settings in ``config``
    (``DB_HOST``, ``DB_PORT``, ``DB_USER``, ``DB_PASSWORD``, ``DB_NAME``,
    ``DB_CHARSET``). The process exits with status 1 when the connection
    cannot be established. All reports are printed to standard output.
    """

    # Tables listed under the "MindSpider core" heading; every other table
    # returned by SHOW TABLES is listed under the MediaCrawler heading.
    CORE_TABLES = ("daily_news", "daily_topics", "topic_news_relation", "crawling_tasks")

    # Platform content tables mapped to the label printed in the statistics.
    PLATFORM_TABLES = {
        'xhs_note': '小红书',
        'douyin_aweme': '抖音',
        'kuaishou_video': '快手',
        'bilibili_video': 'B站',
        'weibo_note': '微博',
        'tieba_note': '贴吧',
        'zhihu_content': '知乎',
    }

    # (table, date column) pairs pruned by cleanup_old_data(). These are fixed
    # constants, so interpolating them into SQL text is safe; the cutoff date
    # itself is always passed as a query parameter.
    CLEANUP_TARGETS = (
        ("daily_news", "crawl_date"),
        ("daily_topics", "extract_date"),
        ("crawling_tasks", "scheduled_date"),
    )

    # MySQL server error code for "table doesn't exist" (ER_NO_SUCH_TABLE).
    _ER_NO_SUCH_TABLE = 1146

    def __init__(self):
        """Create the manager and immediately connect to the database."""
        self.connection = None
        self.connect()

    def connect(self):
        """Open the database connection, exiting the process on failure."""
        try:
            self.connection = pymysql.connect(
                host=config.DB_HOST,
                port=config.DB_PORT,
                user=config.DB_USER,
                password=config.DB_PASSWORD,
                database=config.DB_NAME,
                charset=config.DB_CHARSET,
                autocommit=True
            )
            print(f"成功连接到数据库: {config.DB_NAME}")
        except Exception as e:
            # Nothing in this tool works without a connection, so this is a
            # deliberate top-level boundary: report and stop.
            print(f"数据库连接失败: {e}")
            sys.exit(1)

    def close(self):
        """Close the database connection if one is open (safe to call twice)."""
        if self.connection:
            self.connection.close()
            # Forget the handle so a second close() is a no-op.
            self.connection = None

    @staticmethod
    def _quote_identifier(name):
        """Return *name* as a backtick-quoted MySQL identifier.

        Embedded backticks are doubled, so names that are reserved words or
        contain special characters cannot break the surrounding statement.
        """
        return "`" + str(name).replace("`", "``") + "`"

    def _count_rows(self, cursor, table):
        """Return ``COUNT(*)`` for *table* using the given open cursor."""
        cursor.execute(f"SELECT COUNT(*) FROM {self._quote_identifier(table)}")
        return cursor.fetchone()[0]

    def _print_table_count(self, cursor, table):
        """Print one table-listing line, or a failure marker if the count fails."""
        try:
            count = self._count_rows(cursor, table)
        except pymysql.MySQLError:
            print(f" - {table:<25} (查询失败)")
        else:
            print(f" - {table:<25} ({count:>6} 条记录)")

    def show_tables(self):
        """Print every table in the database with its row count.

        Tables are split into the MindSpider core tables and all remaining
        tables. A table whose count query fails is reported as failed instead
        of aborting the listing.
        """
        print("\n" + "=" * 60)
        print("数据库表列表")
        print("=" * 60)

        cursor = self.connection.cursor()
        try:
            cursor.execute("SHOW TABLES")
            tables = cursor.fetchall()

            if not tables:
                print("数据库中没有表")
                return

            names = [row[0] for row in tables]
            mindspider_tables = [n for n in names if n in self.CORE_TABLES]
            mediacrawler_tables = [n for n in names if n not in self.CORE_TABLES]

            print("MindSpider核心表:")
            for table in mindspider_tables:
                self._print_table_count(cursor, table)

            print("\nMediaCrawler平台表:")
            for table in mediacrawler_tables:
                self._print_table_count(cursor, table)
        finally:
            cursor.close()

    def show_statistics(self):
        """Print aggregate counts for news, topics, crawl tasks and platforms.

        A database error in the news/topic/task queries aborts the report
        with a message. A failure on one platform table only affects that
        platform's line.
        """
        print("\n" + "=" * 60)
        print("数据统计")
        print("=" * 60)

        cursor = self.connection.cursor()
        try:
            # News statistics
            news_count = self._count_rows(cursor, "daily_news")

            cursor.execute("SELECT COUNT(DISTINCT crawl_date) FROM daily_news")
            news_days = cursor.fetchone()[0]

            cursor.execute("SELECT COUNT(DISTINCT source_platform) FROM daily_news")
            platforms = cursor.fetchone()[0]

            print("新闻数据:")
            print(f" - 总新闻数: {news_count}")
            print(f" - 覆盖天数: {news_days}")
            print(f" - 新闻平台: {platforms}")

            # Topic statistics
            topic_count = self._count_rows(cursor, "daily_topics")

            cursor.execute("SELECT COUNT(DISTINCT extract_date) FROM daily_topics")
            topic_days = cursor.fetchone()[0]

            print("\n话题数据:")
            print(f" - 总话题数: {topic_count}")
            print(f" - 提取天数: {topic_days}")

            # Crawl-task statistics
            task_count = self._count_rows(cursor, "crawling_tasks")

            cursor.execute("SELECT task_status, COUNT(*) FROM crawling_tasks GROUP BY task_status")
            task_status = cursor.fetchall()

            print("\n爬取任务:")
            print(f" - 总任务数: {task_count}")
            for status, count in task_status:
                print(f" - {status}: {count}")

            # Per-platform content statistics
            print("\n平台内容统计:")
            for table, platform in self.PLATFORM_TABLES.items():
                try:
                    count = self._count_rows(cursor, table)
                except pymysql.MySQLError as exc:
                    # Only claim the table is missing when the server says
                    # so; any other failure is reported as a query error.
                    if exc.args and exc.args[0] == self._ER_NO_SUCH_TABLE:
                        print(f" - {platform}: 表不存在")
                    else:
                        print(f" - {platform}: 查询失败 ({exc})")
                else:
                    print(f" - {platform}: {count}")

        except pymysql.MySQLError as e:
            print(f"统计查询失败: {e}")
        finally:
            cursor.close()

    def show_recent_data(self, days=7):
        """Print per-day news and topic counts for the last *days* days.

        Args:
            days: Size of the look-back window in days, counted back from
                the database server's current date.
        """
        print("\n" + "=" * 60)
        print(f"最近{days}天的数据")
        print("=" * 60)

        cursor = self.connection.cursor()
        try:
            # Recent news
            cursor.execute("""
                SELECT crawl_date, COUNT(*) as news_count, COUNT(DISTINCT source_platform) as platforms
                FROM daily_news
                WHERE crawl_date >= DATE_SUB(CURDATE(), INTERVAL %s DAY)
                GROUP BY crawl_date
                ORDER BY crawl_date DESC
            """, (days,))

            news_data = cursor.fetchall()
            if news_data:
                print("每日新闻统计:")
                for date, count, platforms in news_data:
                    print(f" {date}: {count} 条新闻, {platforms} 个平台")

            # Recent topics
            cursor.execute("""
                SELECT extract_date, COUNT(*) as topic_count
                FROM daily_topics
                WHERE extract_date >= DATE_SUB(CURDATE(), INTERVAL %s DAY)
                GROUP BY extract_date
                ORDER BY extract_date DESC
            """, (days,))

            topic_data = cursor.fetchall()
            if topic_data:
                print("\n每日话题统计:")
                for date, count in topic_data:
                    print(f" {date}: {count} 个话题")
        finally:
            cursor.close()

    def cleanup_old_data(self, days=90, dry_run=True):
        """Report, and optionally delete, rows older than *days* days.

        Args:
            days: Rows whose date column is earlier than today minus this
                many days are affected. Negative values are rejected,
                because they would move the cutoff into the future and
                match every row.
            dry_run: When true (the default) only the number of matching
                rows is printed; nothing is deleted.
        """
        if days < 0:
            print("错误: 清理天数不能为负数")
            return

        print("\n" + "=" * 60)
        print(f"清理{days}天前的数据 ({'预览模式' if dry_run else '执行模式'})")
        print("=" * 60)

        cutoff = (datetime.now() - timedelta(days=days)).date()

        cursor = self.connection.cursor()
        try:
            for table, column in self.CLEANUP_TARGETS:
                cursor.execute(
                    f"SELECT COUNT(*) FROM {table} WHERE {column} < %s", (cutoff,)
                )
                count = cursor.fetchone()[0]

                if count > 0:
                    print(f" {table}: {count} 条记录将被删除")
                    if not dry_run:
                        cursor.execute(
                            f"DELETE FROM {table} WHERE {column} < %s", (cutoff,)
                        )
                        # Report what was actually removed, not the earlier
                        # estimate from the COUNT query.
                        print(f" 已删除 {cursor.rowcount} 条记录")
                else:
                    print(f" {table}: 无需清理")
        finally:
            cursor.close()

        if dry_run:
            print("\n这是预览模式,没有实际删除数据。使用 --execute 参数执行实际清理。")
|
||||
|
||||
def main():
    """Parse the command line and run the requested database reports.

    Options:
        --tables       list all tables with row counts
        --stats        print aggregate statistics
        --recent [N]   print per-day data for the last N days (7 if N is
                       omitted)
        --cleanup N    preview deletion of data older than N days
        --execute      together with --cleanup, actually delete

    When none of --tables, --stats, --recent or --cleanup is given, the
    table listing and the statistics are shown.
    """
    parser = argparse.ArgumentParser(description="MindSpider数据库管理工具")
    parser.add_argument("--tables", action="store_true", help="显示所有表")
    parser.add_argument("--stats", action="store_true", help="显示数据统计")
    # default=None lets an explicit "--recent 7" be told apart from the
    # option being absent; const=7 keeps 7 days as the default window when
    # the option is given without a value.
    parser.add_argument("--recent", type=int, nargs="?", const=7, default=None,
                        help="显示最近N天的数据 (默认7天)")
    parser.add_argument("--cleanup", type=int, help="清理N天前的数据")
    parser.add_argument("--execute", action="store_true", help="执行实际清理操作")

    args = parser.parse_args()

    # Compare against None rather than truthiness so that "--cleanup 0" and
    # "--recent 0" are honoured instead of being treated as "not given".
    show_recent = args.recent is not None
    run_cleanup = args.cleanup is not None

    if run_cleanup and args.cleanup < 0:
        # A negative age would put the cutoff in the future and match all rows.
        parser.error("--cleanup 必须为非负整数")

    # With no report selected, fall back to the table list and statistics.
    if not (args.tables or args.stats or show_recent or run_cleanup):
        args.tables = True
        args.stats = True

    db_manager = DatabaseManager()

    try:
        if args.tables:
            db_manager.show_tables()

        if args.stats:
            db_manager.show_statistics()

        if show_recent:
            db_manager.show_recent_data(args.recent)

        if run_cleanup:
            db_manager.cleanup_old_data(args.cleanup, dry_run=not args.execute)

    finally:
        db_manager.close()
|
||||
|
||||
# Run the command-line tool only when this file is executed as a script.
if __name__ == "__main__":
    main()
|
||||
@@ -0,0 +1,168 @@
|
||||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
MindSpider AI爬虫项目 - 数据库初始化脚本
|
||||
用于创建项目所需的所有数据库表
|
||||
"""
|
||||
|
||||
import os
|
||||
import sys
|
||||
import pymysql
|
||||
from pathlib import Path
|
||||
|
||||
# Add the project root (the parent of this script's directory) to the import
# path so that the project-level ``config`` module can be imported below.
project_root = Path(__file__).parent.parent
sys.path.append(str(project_root))

# Import the configuration
try:
    import config
except ImportError:
    # The connection settings used below are read from config.py, so the
    # script reports the problem and exits with status 1 when it is missing.
    print("错误: 无法导入config.py配置文件")
    print("请确保config.py文件存在于项目根目录")
    sys.exit(1)
|
||||
|
||||
def create_database_connection():
    """Connect to the MySQL server described by ``config``.

    No database is selected here; the connection is made at server level
    with autocommit enabled.

    Returns:
        The open connection, or ``None`` if anything went wrong (the error
        is printed).
    """
    try:
        open_connection = pymysql.connect
        settings = {
            "host": config.DB_HOST,
            "port": config.DB_PORT,
            "user": config.DB_USER,
            "password": config.DB_PASSWORD,
            "charset": config.DB_CHARSET,
            "autocommit": True,
        }
        server_connection = open_connection(**settings)
        print(f"成功连接到MySQL服务器: {config.DB_HOST}:{config.DB_PORT}")
        return server_connection
    except Exception as exc:
        print(f"连接数据库失败: {exc}")
        return None
|
||||
|
||||
def create_database(connection):
    """Create the configured database if needed and switch to it.

    Args:
        connection: An open server-level connection.

    Returns:
        ``True`` on success, ``False`` if any step raised (the error is
        printed).
    """
    try:
        db_cursor = connection.cursor()
        db_name = config.DB_NAME
        statements = (
            f"CREATE DATABASE IF NOT EXISTS `{db_name}` CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci",
            f"USE `{db_name}`",
        )
        for statement in statements:
            db_cursor.execute(statement)
        print(f"数据库 '{db_name}' 创建/选择成功")
        return True
    except Exception as exc:
        print(f"创建数据库失败: {exc}")
        return False
|
||||
|
||||
def _split_sql_statements(sql_text):
    """Split a SQL script into individual statements, dropping comments.

    Semicolons inside quoted strings, backtick identifiers and block
    comments do not terminate a statement. Line comments (``-- `` and ``#``)
    and plain block comments are removed; ``/*! ... */`` and ``/*+ ... */``
    blocks are kept because MySQL gives them meaning.
    """
    statements = []
    current = []
    quote = None
    i = 0
    length = len(sql_text)

    while i < length:
        ch = sql_text[i]
        nxt = sql_text[i + 1] if i + 1 < length else ""

        if quote:
            current.append(ch)
            # Backslash escapes the next character inside '...' and "...".
            if ch == "\\" and quote != "`" and nxt:
                current.append(nxt)
                i += 2
                continue
            if ch == quote:
                quote = None
            i += 1
            continue

        if ch in ("'", '"', "`"):
            quote = ch
            current.append(ch)
            i += 1
            continue

        # MySQL only treats "--" as a comment when whitespace follows it.
        dash_comment = (
            ch == "-" and nxt == "-"
            and (i + 2 >= length or sql_text[i + 2].isspace())
        )
        if ch == "#" or dash_comment:
            newline = sql_text.find("\n", i)
            i = length if newline == -1 else newline
            continue

        if ch == "/" and nxt == "*":
            end = sql_text.find("*/", i + 2)
            end = length if end == -1 else end + 2
            if sql_text[i + 2:i + 3] in ("!", "+"):
                current.append(sql_text[i:end])
            else:
                current.append(" ")
            i = end
            continue

        if ch == ";":
            statements.append("".join(current).strip())
            current = []
            i += 1
            continue

        current.append(ch)
        i += 1

    statements.append("".join(current).strip())
    return [stmt for stmt in statements if stmt]


def execute_sql_file(connection, sql_file_path, description=""):
    """Execute every statement of a SQL script, continuing past failures.

    Comments are stripped before the script is split. A statement that is
    merely preceded by comment lines is therefore still executed; skipping
    every chunk that starts with ``--`` would silently drop such statements,
    including any ``DROP TABLE`` the script contains.

    Args:
        connection: An open database connection.
        sql_file_path: Path of the UTF-8 encoded SQL script.
        description: Label used in the summary line.

    Returns:
        ``True`` when the file exists and every statement succeeded,
        ``False`` otherwise. Individual failures are printed and counted.
    """
    if not os.path.exists(sql_file_path):
        print(f"警告: SQL文件不存在: {sql_file_path}")
        return False

    try:
        with open(sql_file_path, 'r', encoding='utf-8') as f:
            sql_content = f.read()

        sql_statements = _split_sql_statements(sql_content)

        success_count = 0
        error_count = 0

        cursor = connection.cursor()
        try:
            for stmt in sql_statements:
                try:
                    cursor.execute(stmt)
                    success_count += 1
                except Exception as e:
                    # Best effort: record the failure and keep going so one
                    # bad statement does not block the rest of the script.
                    error_count += 1
                    print(f"执行SQL语句失败: {str(e)[:100]}...")
        finally:
            cursor.close()

        print(f"{description} - 成功执行: {success_count} 条语句, 失败: {error_count} 条语句")
        return error_count == 0

    except Exception as e:
        print(f"执行SQL文件失败 {sql_file_path}: {e}")
        return False
|
||||
|
||||
def main():
    """Initialise the MindSpider database and print what was created.

    Steps: print the configuration, connect to the server, create/select
    the database, run the MediaCrawler base schema (if the file is present)
    and the MindSpider schema, then list the resulting tables.

    Returns:
        ``False`` when the connection, the database creation or an
        unexpected error stops the run, or when the MindSpider schema file
        is missing; ``True`` otherwise. Individual SQL statement failures do
        not change the return value, but they are called out in the final
        messages instead of being reported as a clean success.
    """
    print("=" * 60)
    print("MindSpider AI爬虫项目 - 数据库初始化")
    print("=" * 60)

    # Show the configuration in use
    print("检查数据库配置...")
    print(f"数据库主机: {config.DB_HOST}")
    print(f"数据库端口: {config.DB_PORT}")
    print(f"数据库名称: {config.DB_NAME}")
    print(f"数据库用户: {config.DB_USER}")
    print(f"字符集: {config.DB_CHARSET}")
    print()

    # Connect to the server
    print("正在连接数据库...")
    connection = create_database_connection()
    if not connection:
        print("数据库初始化失败!")
        return False

    try:
        # Create/select the database
        print("正在创建/选择数据库...")
        if not create_database(connection):
            return False

        # Locate the SQL scripts relative to this file
        schema_dir = Path(__file__).parent
        mediacrawler_sql = schema_dir.parent / "DeepSentimentCrawling" / "MediaCrawler" / "schema" / "tables.sql"
        mindspider_sql = schema_dir / "mindspider_tables.sql"

        print()
        print("开始执行SQL脚本...")

        all_statements_ok = True

        # 1. MediaCrawler base schema (optional)
        if mediacrawler_sql.exists():
            print("1. 创建MediaCrawler基础表...")
            if not execute_sql_file(connection, str(mediacrawler_sql), "MediaCrawler基础表"):
                all_statements_ok = False
        else:
            print("警告: MediaCrawler SQL文件不存在,跳过基础表创建")

        # 2. MindSpider extension schema (required)
        print("2. 创建MindSpider扩展表...")
        if mindspider_sql.exists():
            if not execute_sql_file(connection, str(mindspider_sql), "MindSpider扩展表"):
                all_statements_ok = False
        else:
            print("错误: MindSpider SQL文件不存在")
            return False

        print()
        print("=" * 60)
        print("数据库初始化完成!")
        print("=" * 60)

        # List the tables that now exist
        cursor = connection.cursor()
        try:
            cursor.execute("SHOW TABLES")
            tables = cursor.fetchall()
        finally:
            cursor.close()

        print(f"数据库 '{config.DB_NAME}' 中共创建了 {len(tables)} 个表:")
        for table in tables:
            print(f" - {table[0]}")

        print()
        if all_statements_ok:
            print("数据库初始化成功完成!您现在可以开始使用MindSpider了。")
        else:
            # Do not claim a clean success when statements were rejected.
            print("警告: 部分SQL语句执行失败,请检查上面的错误信息后再使用MindSpider。")
        return True

    except Exception as e:
        print(f"数据库初始化过程中发生错误: {e}")
        return False

    finally:
        if connection:
            connection.close()
            print("数据库连接已关闭")
|
||||
|
||||
# When run as a script, turn main()'s boolean result into the process exit
# status: 0 for True, 1 for False.
if __name__ == "__main__":
    success = main()
    sys.exit(0 if success else 1)
|
||||
@@ -0,0 +1,201 @@
|
||||
-- MindSpider AI爬虫项目 - 数据库表结构
|
||||
-- 基于MediaCrawler表结构扩展,添加BroadTopicExtraction模块所需表
|
||||
|
||||
-- ===============================
|
||||
-- BroadTopicExtraction 模块表结构
|
||||
-- ===============================
|
||||
|
||||
-- ----------------------------
-- Table structure for daily_news
-- Daily news table: stores the hot news items fetched by get_today_news.py.
-- One row per (news_id, source_platform, crawl_date), enforced by the
-- unique key below.
-- NOTE(review): the table is dropped before being re-created, so existing
-- rows are discarded whenever this statement runs — confirm this is intended
-- for re-initialisation.
-- ----------------------------
DROP TABLE IF EXISTS `daily_news`;
CREATE TABLE `daily_news` (
    `id` int NOT NULL AUTO_INCREMENT COMMENT '自增ID',
    `news_id` varchar(128) NOT NULL COMMENT '新闻唯一ID',
    `source_platform` varchar(32) NOT NULL COMMENT '新闻源平台(weibo|zhihu|bilibili|toutiao|douyin等)',
    `title` varchar(500) NOT NULL COMMENT '新闻标题',
    `url` varchar(512) DEFAULT NULL COMMENT '新闻链接',
    `description` text COMMENT '新闻描述或摘要',
    `extra_info` text COMMENT '额外信息(JSON格式存储)',
    `crawl_date` date NOT NULL COMMENT '爬取日期',
    `rank_position` int DEFAULT NULL COMMENT '在热榜中的排名位置',
    `add_ts` bigint NOT NULL COMMENT '记录添加时间戳',
    `last_modify_ts` bigint NOT NULL COMMENT '记录最后修改时间戳',
    PRIMARY KEY (`id`),
    UNIQUE KEY `idx_daily_news_unique` (`news_id`, `source_platform`, `crawl_date`),
    KEY `idx_daily_news_date` (`crawl_date`),
    KEY `idx_daily_news_platform` (`source_platform`),
    KEY `idx_daily_news_rank` (`rank_position`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_0900_ai_ci COMMENT='每日热点新闻表';
|
||||
|
||||
-- ----------------------------
-- Table structure for daily_topics
-- Daily topics table: stores the topic information extracted by TopicGPT.
-- One row per (topic_id, extract_date), enforced by the unique key below.
-- ----------------------------
DROP TABLE IF EXISTS `daily_topics`;
CREATE TABLE `daily_topics` (
    `id` int NOT NULL AUTO_INCREMENT COMMENT '自增ID',
    `topic_id` varchar(64) NOT NULL COMMENT '话题唯一ID',
    `topic_name` varchar(255) NOT NULL COMMENT '话题名称',
    `topic_description` text COMMENT '话题描述',
    `keywords` text COMMENT '话题关键词(JSON格式存储)',
    `extract_date` date NOT NULL COMMENT '话题提取日期',
    `relevance_score` float DEFAULT NULL COMMENT '话题相关性得分',
    `news_count` int DEFAULT 0 COMMENT '关联的新闻数量',
    `processing_status` varchar(16) DEFAULT 'pending' COMMENT '处理状态(pending|processing|completed|failed)',
    `add_ts` bigint NOT NULL COMMENT '记录添加时间戳',
    `last_modify_ts` bigint NOT NULL COMMENT '记录最后修改时间戳',
    PRIMARY KEY (`id`),
    UNIQUE KEY `idx_daily_topics_unique` (`topic_id`, `extract_date`),
    KEY `idx_daily_topics_date` (`extract_date`),
    KEY `idx_daily_topics_status` (`processing_status`),
    KEY `idx_daily_topics_score` (`relevance_score`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_0900_ai_ci COMMENT='每日提取话题表';
|
||||
|
||||
-- ----------------------------
-- Table structure for topic_news_relation
-- Topic/news relation table: records which news items belong to which topic.
-- Rows are removed automatically (ON DELETE CASCADE) when the referenced
-- topic or news row is deleted.
-- NOTE(review): each foreign key references a single column (topic_id /
-- news_id) that is only the leading column of a composite unique key in its
-- parent table — confirm the target MySQL version accepts foreign keys on a
-- non-unique parent column.
-- ----------------------------
DROP TABLE IF EXISTS `topic_news_relation`;
CREATE TABLE `topic_news_relation` (
    `id` int NOT NULL AUTO_INCREMENT COMMENT '自增ID',
    `topic_id` varchar(64) NOT NULL COMMENT '话题ID',
    `news_id` varchar(128) NOT NULL COMMENT '新闻ID',
    `relation_score` float DEFAULT NULL COMMENT '关联度得分',
    `extract_date` date NOT NULL COMMENT '关联提取日期',
    `add_ts` bigint NOT NULL COMMENT '记录添加时间戳',
    PRIMARY KEY (`id`),
    UNIQUE KEY `idx_topic_news_unique` (`topic_id`, `news_id`, `extract_date`),
    KEY `idx_topic_news_topic` (`topic_id`),
    KEY `idx_topic_news_news` (`news_id`),
    KEY `idx_topic_news_date` (`extract_date`),
    FOREIGN KEY (`topic_id`) REFERENCES `daily_topics`(`topic_id`) ON DELETE CASCADE,
    FOREIGN KEY (`news_id`) REFERENCES `daily_news`(`news_id`) ON DELETE CASCADE
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_0900_ai_ci COMMENT='话题新闻关联表';
|
||||
|
||||
-- ----------------------------
-- Table structure for crawling_tasks
-- Crawling task table: records the per-platform crawl tasks derived from a
-- topic. task_id is unique; rows are removed automatically (ON DELETE
-- CASCADE) when the referenced topic row is deleted.
-- NOTE(review): the foreign key references daily_topics(topic_id), which is
-- only the leading column of a composite unique key there — confirm the
-- target MySQL version accepts this.
-- ----------------------------
DROP TABLE IF EXISTS `crawling_tasks`;
CREATE TABLE `crawling_tasks` (
    `id` int NOT NULL AUTO_INCREMENT COMMENT '自增ID',
    `task_id` varchar(64) NOT NULL COMMENT '任务唯一ID',
    `topic_id` varchar(64) NOT NULL COMMENT '关联的话题ID',
    `platform` varchar(32) NOT NULL COMMENT '目标平台(xhs|dy|ks|bili|wb|tieba|zhihu)',
    `search_keywords` text NOT NULL COMMENT '搜索关键词(JSON格式存储)',
    `task_status` varchar(16) DEFAULT 'pending' COMMENT '任务状态(pending|running|completed|failed|paused)',
    `start_time` bigint DEFAULT NULL COMMENT '任务开始时间戳',
    `end_time` bigint DEFAULT NULL COMMENT '任务结束时间戳',
    `total_crawled` int DEFAULT 0 COMMENT '已爬取内容数量',
    `success_count` int DEFAULT 0 COMMENT '成功爬取数量',
    `error_count` int DEFAULT 0 COMMENT '错误数量',
    `error_message` text COMMENT '错误信息',
    `config_params` text COMMENT '爬取配置参数(JSON格式)',
    `scheduled_date` date NOT NULL COMMENT '计划执行日期',
    `add_ts` bigint NOT NULL COMMENT '记录添加时间戳',
    `last_modify_ts` bigint NOT NULL COMMENT '记录最后修改时间戳',
    PRIMARY KEY (`id`),
    UNIQUE KEY `idx_crawling_tasks_unique` (`task_id`),
    KEY `idx_crawling_tasks_topic` (`topic_id`),
    KEY `idx_crawling_tasks_platform` (`platform`),
    KEY `idx_crawling_tasks_status` (`task_status`),
    KEY `idx_crawling_tasks_date` (`scheduled_date`),
    FOREIGN KEY (`topic_id`) REFERENCES `daily_topics`(`topic_id`) ON DELETE CASCADE
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_0900_ai_ci COMMENT='爬取任务表';
|
||||
|
||||
-- ===============================
|
||||
-- MediaCrawler表结构扩展字段
|
||||
-- ===============================
|
||||
|
||||
-- 为MediaCrawler现有表添加话题关联字段,支持MindSpider功能
|
||||
-- 注意:这些字段是可选的,不影响MediaCrawler原有功能
|
||||
|
||||
-- Each statement below adds two nullable columns to one platform table:
-- topic_id (the related topic) and crawling_task_id (the related crawl task).
-- NOTE(review): there is no guard against the columns already existing, and
-- the statements require the platform tables to be present — presumably a
-- second run, or a run without the MediaCrawler base schema, fails here;
-- confirm.

-- Add topic-link columns to the Xiaohongshu note table
ALTER TABLE `xhs_note`
    ADD COLUMN `topic_id` varchar(64) DEFAULT NULL COMMENT '关联的话题ID',
    ADD COLUMN `crawling_task_id` varchar(64) DEFAULT NULL COMMENT '关联的爬取任务ID';

-- Add topic-link columns to the Douyin video table
ALTER TABLE `douyin_aweme`
    ADD COLUMN `topic_id` varchar(64) DEFAULT NULL COMMENT '关联的话题ID',
    ADD COLUMN `crawling_task_id` varchar(64) DEFAULT NULL COMMENT '关联的爬取任务ID';

-- Add topic-link columns to the Kuaishou video table
ALTER TABLE `kuaishou_video`
    ADD COLUMN `topic_id` varchar(64) DEFAULT NULL COMMENT '关联的话题ID',
    ADD COLUMN `crawling_task_id` varchar(64) DEFAULT NULL COMMENT '关联的爬取任务ID';

-- Add topic-link columns to the Bilibili video table
ALTER TABLE `bilibili_video`
    ADD COLUMN `topic_id` varchar(64) DEFAULT NULL COMMENT '关联的话题ID',
    ADD COLUMN `crawling_task_id` varchar(64) DEFAULT NULL COMMENT '关联的爬取任务ID';

-- Add topic-link columns to the Weibo post table
ALTER TABLE `weibo_note`
    ADD COLUMN `topic_id` varchar(64) DEFAULT NULL COMMENT '关联的话题ID',
    ADD COLUMN `crawling_task_id` varchar(64) DEFAULT NULL COMMENT '关联的爬取任务ID';

-- Add topic-link columns to the Tieba post table
ALTER TABLE `tieba_note`
    ADD COLUMN `topic_id` varchar(64) DEFAULT NULL COMMENT '关联的话题ID',
    ADD COLUMN `crawling_task_id` varchar(64) DEFAULT NULL COMMENT '关联的爬取任务ID';

-- Add topic-link columns to the Zhihu content table
ALTER TABLE `zhihu_content`
    ADD COLUMN `topic_id` varchar(64) DEFAULT NULL COMMENT '关联的话题ID',
    ADD COLUMN `crawling_task_id` varchar(64) DEFAULT NULL COMMENT '关联的爬取任务ID';
|
||||
|
||||
-- ===============================
|
||||
-- 创建视图用于数据分析
|
||||
-- ===============================
|
||||
|
||||
-- Topic crawling statistics view: one row per topic row in daily_topics,
-- with task counts and crawl totals aggregated from crawling_tasks.
-- The LEFT JOIN keeps topics that have no tasks; for those the SUM(ct.*)
-- columns are NULL while the CASE-based counters are 0.
-- NOTE(review): the join is on topic_id only, so if the same topic_id occurs
-- on several extract_date values each of those rows presumably aggregates
-- the same tasks — confirm that is intended.
CREATE OR REPLACE VIEW `v_topic_crawling_stats` AS
SELECT
    dt.topic_id,
    dt.topic_name,
    dt.extract_date,
    dt.processing_status,
    COUNT(DISTINCT ct.task_id) as total_tasks,
    SUM(CASE WHEN ct.task_status = 'completed' THEN 1 ELSE 0 END) as completed_tasks,
    SUM(CASE WHEN ct.task_status = 'failed' THEN 1 ELSE 0 END) as failed_tasks,
    SUM(ct.total_crawled) as total_content_crawled,
    SUM(ct.success_count) as total_success_count,
    SUM(ct.error_count) as total_error_count
FROM daily_topics dt
LEFT JOIN crawling_tasks ct ON dt.topic_id = ct.topic_id
GROUP BY dt.topic_id, dt.topic_name, dt.extract_date, dt.processing_status;
|
||||
|
||||
-- Daily summary view: one row per crawl_date found in daily_news, with the
-- number of distinct news items and source platforms for that date, plus the
-- number of topics extracted and tasks scheduled on the same date.
-- Dates that have no rows in daily_news do not appear, because the view is
-- driven by daily_news.
CREATE OR REPLACE VIEW `v_daily_summary` AS
SELECT
    crawl_date,
    COUNT(DISTINCT news_id) as total_news,
    COUNT(DISTINCT source_platform) as platforms_covered,
    (SELECT COUNT(*) FROM daily_topics WHERE extract_date = dn.crawl_date) as topics_extracted,
    (SELECT COUNT(*) FROM crawling_tasks WHERE scheduled_date = dn.crawl_date) as tasks_created
FROM daily_news dn
GROUP BY crawl_date
ORDER BY crawl_date DESC;
|
||||
|
||||
-- ===============================
|
||||
-- 初始化索引优化
|
||||
-- ===============================
|
||||
|
||||
-- Composite indexes intended to speed up the join/filter queries.
-- NOTE(review): idx_topic_date_status and idx_news_date_platform both start
-- with a column that already has its own single-column index in the table
-- definitions (extract_date / crawl_date) — check whether the single-column
-- indexes are still needed.
CREATE INDEX `idx_topic_date_status` ON `daily_topics` (`extract_date`, `processing_status`);
CREATE INDEX `idx_task_topic_platform` ON `crawling_tasks` (`topic_id`, `platform`, `task_status`);
CREATE INDEX `idx_news_date_platform` ON `daily_news` (`crawl_date`, `source_platform`);
|
||||
|
||||
-- ===============================
|
||||
-- 数据库配置优化建议
|
||||
-- ===============================
|
||||
|
||||
-- 设置合适的字符集和排序规则
|
||||
-- ALTER DATABASE mindspider CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci;
|
||||
|
||||
-- 建议的数据保留策略(可选)
|
||||
-- 可以根据需要创建事件调度器来清理历史数据
|
||||
-- 例如:删除90天前的新闻数据,保留话题和爬取结果数据
|
||||
Reference in New Issue
Block a user