202 lines
9.6 KiB
SQL
202 lines
9.6 KiB
SQL
-- MindSpider AI爬虫项目 - 数据库表结构
|
|
-- 基于MediaCrawler表结构扩展,添加BroadTopicExtraction模块所需表
|
|
|
|
-- ===============================
|
|
-- BroadTopicExtraction 模块表结构
|
|
-- ===============================
|
|
|
|
-- ----------------------------
|
|
-- Table structure for daily_news
|
|
-- 每日新闻表:存储get_today_news.py获取的热点新闻
|
|
-- ----------------------------
|
|
DROP TABLE IF EXISTS `daily_news`;
|
|
CREATE TABLE `daily_news` (
|
|
`id` int NOT NULL AUTO_INCREMENT COMMENT '自增ID',
|
|
`news_id` varchar(128) NOT NULL COMMENT '新闻唯一ID',
|
|
`source_platform` varchar(32) NOT NULL COMMENT '新闻源平台(weibo|zhihu|bilibili|toutiao|douyin等)',
|
|
`title` varchar(500) NOT NULL COMMENT '新闻标题',
|
|
`url` varchar(512) DEFAULT NULL COMMENT '新闻链接',
|
|
`description` text COMMENT '新闻描述或摘要',
|
|
`extra_info` text COMMENT '额外信息(JSON格式存储)',
|
|
`crawl_date` date NOT NULL COMMENT '爬取日期',
|
|
`rank_position` int DEFAULT NULL COMMENT '在热榜中的排名位置',
|
|
`add_ts` bigint NOT NULL COMMENT '记录添加时间戳',
|
|
`last_modify_ts` bigint NOT NULL COMMENT '记录最后修改时间戳',
|
|
PRIMARY KEY (`id`),
|
|
UNIQUE KEY `idx_daily_news_unique` (`news_id`, `source_platform`, `crawl_date`),
|
|
KEY `idx_daily_news_date` (`crawl_date`),
|
|
KEY `idx_daily_news_platform` (`source_platform`),
|
|
KEY `idx_daily_news_rank` (`rank_position`)
|
|
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_0900_ai_ci COMMENT='每日热点新闻表';
|
|
|
|
-- ----------------------------
|
|
-- Table structure for daily_topics
|
|
-- 每日话题表:存储TopicGPT提取的话题信息
|
|
-- ----------------------------
|
|
DROP TABLE IF EXISTS `daily_topics`;
|
|
CREATE TABLE `daily_topics` (
|
|
`id` int NOT NULL AUTO_INCREMENT COMMENT '自增ID',
|
|
`topic_id` varchar(64) NOT NULL COMMENT '话题唯一ID',
|
|
`topic_name` varchar(255) NOT NULL COMMENT '话题名称',
|
|
`topic_description` text COMMENT '话题描述',
|
|
`keywords` text COMMENT '话题关键词(JSON格式存储)',
|
|
`extract_date` date NOT NULL COMMENT '话题提取日期',
|
|
`relevance_score` float DEFAULT NULL COMMENT '话题相关性得分',
|
|
`news_count` int DEFAULT 0 COMMENT '关联的新闻数量',
|
|
`processing_status` varchar(16) DEFAULT 'pending' COMMENT '处理状态(pending|processing|completed|failed)',
|
|
`add_ts` bigint NOT NULL COMMENT '记录添加时间戳',
|
|
`last_modify_ts` bigint NOT NULL COMMENT '记录最后修改时间戳',
|
|
PRIMARY KEY (`id`),
|
|
UNIQUE KEY `idx_daily_topics_unique` (`topic_id`, `extract_date`),
|
|
KEY `idx_daily_topics_date` (`extract_date`),
|
|
KEY `idx_daily_topics_status` (`processing_status`),
|
|
KEY `idx_daily_topics_score` (`relevance_score`)
|
|
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_0900_ai_ci COMMENT='每日提取话题表';
|
|
|
|
-- ----------------------------
|
|
-- Table structure for topic_news_relation
|
|
-- 话题新闻关联表:记录话题和新闻的关联关系
|
|
-- ----------------------------
|
|
DROP TABLE IF EXISTS `topic_news_relation`;
|
|
CREATE TABLE `topic_news_relation` (
|
|
`id` int NOT NULL AUTO_INCREMENT COMMENT '自增ID',
|
|
`topic_id` varchar(64) NOT NULL COMMENT '话题ID',
|
|
`news_id` varchar(128) NOT NULL COMMENT '新闻ID',
|
|
`relation_score` float DEFAULT NULL COMMENT '关联度得分',
|
|
`extract_date` date NOT NULL COMMENT '关联提取日期',
|
|
`add_ts` bigint NOT NULL COMMENT '记录添加时间戳',
|
|
PRIMARY KEY (`id`),
|
|
UNIQUE KEY `idx_topic_news_unique` (`topic_id`, `news_id`, `extract_date`),
|
|
KEY `idx_topic_news_topic` (`topic_id`),
|
|
KEY `idx_topic_news_news` (`news_id`),
|
|
KEY `idx_topic_news_date` (`extract_date`),
|
|
FOREIGN KEY (`topic_id`) REFERENCES `daily_topics`(`topic_id`) ON DELETE CASCADE,
|
|
FOREIGN KEY (`news_id`) REFERENCES `daily_news`(`news_id`) ON DELETE CASCADE
|
|
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_0900_ai_ci COMMENT='话题新闻关联表';
|
|
|
|
-- ----------------------------
|
|
-- Table structure for crawling_tasks
|
|
-- 爬取任务表:记录基于话题的平台爬取任务
|
|
-- ----------------------------
|
|
DROP TABLE IF EXISTS `crawling_tasks`;
|
|
CREATE TABLE `crawling_tasks` (
|
|
`id` int NOT NULL AUTO_INCREMENT COMMENT '自增ID',
|
|
`task_id` varchar(64) NOT NULL COMMENT '任务唯一ID',
|
|
`topic_id` varchar(64) NOT NULL COMMENT '关联的话题ID',
|
|
`platform` varchar(32) NOT NULL COMMENT '目标平台(xhs|dy|ks|bili|wb|tieba|zhihu)',
|
|
`search_keywords` text NOT NULL COMMENT '搜索关键词(JSON格式存储)',
|
|
`task_status` varchar(16) DEFAULT 'pending' COMMENT '任务状态(pending|running|completed|failed|paused)',
|
|
`start_time` bigint DEFAULT NULL COMMENT '任务开始时间戳',
|
|
`end_time` bigint DEFAULT NULL COMMENT '任务结束时间戳',
|
|
`total_crawled` int DEFAULT 0 COMMENT '已爬取内容数量',
|
|
`success_count` int DEFAULT 0 COMMENT '成功爬取数量',
|
|
`error_count` int DEFAULT 0 COMMENT '错误数量',
|
|
`error_message` text COMMENT '错误信息',
|
|
`config_params` text COMMENT '爬取配置参数(JSON格式)',
|
|
`scheduled_date` date NOT NULL COMMENT '计划执行日期',
|
|
`add_ts` bigint NOT NULL COMMENT '记录添加时间戳',
|
|
`last_modify_ts` bigint NOT NULL COMMENT '记录最后修改时间戳',
|
|
PRIMARY KEY (`id`),
|
|
UNIQUE KEY `idx_crawling_tasks_unique` (`task_id`),
|
|
KEY `idx_crawling_tasks_topic` (`topic_id`),
|
|
KEY `idx_crawling_tasks_platform` (`platform`),
|
|
KEY `idx_crawling_tasks_status` (`task_status`),
|
|
KEY `idx_crawling_tasks_date` (`scheduled_date`),
|
|
FOREIGN KEY (`topic_id`) REFERENCES `daily_topics`(`topic_id`) ON DELETE CASCADE
|
|
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_0900_ai_ci COMMENT='爬取任务表';
|
|
|
|
-- ===============================
|
|
-- MediaCrawler表结构扩展字段
|
|
-- ===============================
|
|
|
|
-- 为MediaCrawler现有表添加话题关联字段,支持MindSpider功能
|
|
-- 注意:这些字段是可选的,不影响MediaCrawler原有功能
|
|
|
|
-- 为小红书笔记表添加话题关联字段
|
|
ALTER TABLE `xhs_note`
|
|
ADD COLUMN `topic_id` varchar(64) DEFAULT NULL COMMENT '关联的话题ID',
|
|
ADD COLUMN `crawling_task_id` varchar(64) DEFAULT NULL COMMENT '关联的爬取任务ID';
|
|
|
|
-- 为抖音视频表添加话题关联字段
|
|
ALTER TABLE `douyin_aweme`
|
|
ADD COLUMN `topic_id` varchar(64) DEFAULT NULL COMMENT '关联的话题ID',
|
|
ADD COLUMN `crawling_task_id` varchar(64) DEFAULT NULL COMMENT '关联的爬取任务ID';
|
|
|
|
-- 为快手视频表添加话题关联字段
|
|
ALTER TABLE `kuaishou_video`
|
|
ADD COLUMN `topic_id` varchar(64) DEFAULT NULL COMMENT '关联的话题ID',
|
|
ADD COLUMN `crawling_task_id` varchar(64) DEFAULT NULL COMMENT '关联的爬取任务ID';
|
|
|
|
-- 为B站视频表添加话题关联字段
|
|
ALTER TABLE `bilibili_video`
|
|
ADD COLUMN `topic_id` varchar(64) DEFAULT NULL COMMENT '关联的话题ID',
|
|
ADD COLUMN `crawling_task_id` varchar(64) DEFAULT NULL COMMENT '关联的爬取任务ID';
|
|
|
|
-- 为微博帖子表添加话题关联字段
|
|
ALTER TABLE `weibo_note`
|
|
ADD COLUMN `topic_id` varchar(64) DEFAULT NULL COMMENT '关联的话题ID',
|
|
ADD COLUMN `crawling_task_id` varchar(64) DEFAULT NULL COMMENT '关联的爬取任务ID';
|
|
|
|
-- 为贴吧帖子表添加话题关联字段
|
|
ALTER TABLE `tieba_note`
|
|
ADD COLUMN `topic_id` varchar(64) DEFAULT NULL COMMENT '关联的话题ID',
|
|
ADD COLUMN `crawling_task_id` varchar(64) DEFAULT NULL COMMENT '关联的爬取任务ID';
|
|
|
|
-- 为知乎内容表添加话题关联字段
|
|
ALTER TABLE `zhihu_content`
|
|
ADD COLUMN `topic_id` varchar(64) DEFAULT NULL COMMENT '关联的话题ID',
|
|
ADD COLUMN `crawling_task_id` varchar(64) DEFAULT NULL COMMENT '关联的爬取任务ID';
|
|
|
|
-- ===============================
|
|
-- 创建视图用于数据分析
|
|
-- ===============================
|
|
|
|
-- 话题爬取统计视图
|
|
CREATE OR REPLACE VIEW `v_topic_crawling_stats` AS
|
|
SELECT
|
|
dt.topic_id,
|
|
dt.topic_name,
|
|
dt.extract_date,
|
|
dt.processing_status,
|
|
COUNT(DISTINCT ct.task_id) as total_tasks,
|
|
SUM(CASE WHEN ct.task_status = 'completed' THEN 1 ELSE 0 END) as completed_tasks,
|
|
SUM(CASE WHEN ct.task_status = 'failed' THEN 1 ELSE 0 END) as failed_tasks,
|
|
SUM(ct.total_crawled) as total_content_crawled,
|
|
SUM(ct.success_count) as total_success_count,
|
|
SUM(ct.error_count) as total_error_count
|
|
FROM daily_topics dt
|
|
LEFT JOIN crawling_tasks ct ON dt.topic_id = ct.topic_id
|
|
GROUP BY dt.topic_id, dt.topic_name, dt.extract_date, dt.processing_status;
|
|
|
|
-- 每日数据统计视图
|
|
CREATE OR REPLACE VIEW `v_daily_summary` AS
|
|
SELECT
|
|
crawl_date,
|
|
COUNT(DISTINCT news_id) as total_news,
|
|
COUNT(DISTINCT source_platform) as platforms_covered,
|
|
(SELECT COUNT(*) FROM daily_topics WHERE extract_date = dn.crawl_date) as topics_extracted,
|
|
(SELECT COUNT(*) FROM crawling_tasks WHERE scheduled_date = dn.crawl_date) as tasks_created
|
|
FROM daily_news dn
|
|
GROUP BY crawl_date
|
|
ORDER BY crawl_date DESC;
|
|
|
|
-- ===============================
|
|
-- 初始化索引优化
|
|
-- ===============================
|
|
|
|
-- 为关联查询优化添加复合索引
|
|
CREATE INDEX `idx_topic_date_status` ON `daily_topics` (`extract_date`, `processing_status`);
|
|
CREATE INDEX `idx_task_topic_platform` ON `crawling_tasks` (`topic_id`, `platform`, `task_status`);
|
|
CREATE INDEX `idx_news_date_platform` ON `daily_news` (`crawl_date`, `source_platform`);
|
|
|
|
-- ===============================
|
|
-- 数据库配置优化建议
|
|
-- ===============================
|
|
|
|
-- 设置合适的字符集和排序规则
|
|
-- ALTER DATABASE mindspider CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci;
|
|
|
|
-- 建议的数据保留策略(可选)
|
|
-- 可以根据需要创建事件调度器来清理历史数据
|
|
-- 例如:删除90天前的新闻数据,保留话题和爬取结果数据
|