rss订阅数据爬取及数据处理

This commit is contained in:
z66
2025-10-23 17:18:49 +08:00
parent fd67231866
commit e1db06dd79
8 changed files with 84042 additions and 229449 deletions
+80041 -229055
View File
File diff suppressed because it is too large Load Diff
+3366
View File
File diff suppressed because it is too large Load Diff
+44 -1
View File
@@ -23,11 +23,31 @@ class IntelligenceSystem:
self._setup_signal_handlers() self._setup_signal_handlers()
log.info("系统启动 - 运行在Cron调度模式") log.info("系统启动 - 运行在Cron调度模式")
# 时间追踪变量
last_status_print_time = time.time() # 上次打印状态的时间
last_hourly_report_time = time.time() # 上次小时统计的时间
status_print_interval = 60 # 每分钟打印一次状态(60秒)
hourly_report_interval = 3600 # 每小时统计一次(3600秒)
try: try:
# 主循环 - 仅负责定期检查任务 # 主循环 - 仅负责定期检查任务
while self._running: while self._running:
current_time = time.time()
# 判断是否需要打印状态(每分钟一次)
should_print_status = (current_time - last_status_print_time) >= status_print_interval
# 检查并执行到期任务 # 检查并执行到期任务
self.scheduler.check_and_run_tasks() self.scheduler.check_and_run_tasks(print_empty_status=should_print_status)
# 更新最后打印时间
if should_print_status:
last_status_print_time = current_time
# 检查是否需要进行小时统计(每小时一次)
if (current_time - last_hourly_report_time) >= hourly_report_interval:
self._print_hourly_stats()
last_hourly_report_time = current_time
# 短间隔轮询(每10秒检查一次,保证Cron时间精度) # 短间隔轮询(每10秒检查一次,保证Cron时间精度)
time.sleep(10) time.sleep(10)
@@ -48,6 +68,29 @@ class IntelligenceSystem:
log.info(f"收到关闭信号 {signum},开始关闭系统") log.info(f"收到关闭信号 {signum},开始关闭系统")
self._running = False self._running = False
def _print_hourly_stats(self):
    """Print the hourly task report and log the same totals.

    The counters come from ``self.scheduler.get_and_reset_hourly_stats()``,
    which also resets them, so each call reports only what accumulated
    since the previous call.
    """
    hourly = self.scheduler.get_and_reset_hourly_stats()
    timestamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    divider = '=' * 60

    # Human-readable banner on stdout.
    print(f"\n{divider}")
    print(f"📊 小时任务统计报告 - {timestamp}")
    print(divider)
    print(f" 总任务数: {hourly['总数']}")
    print(f" 成功: {hourly['成功']}")
    print(f" 失败: {hourly['失败']}")
    # The success rate is only shown when at least one task ran,
    # which also avoids dividing by zero.
    if hourly['总数'] > 0:
        rate = (hourly['成功'] / hourly['总数']) * 100
        print(f" 成功率: {rate:.1f}%")
    print(f"{divider}\n")

    # Same totals as structured fields on the log record.
    log.info("小时任务统计", 总任务数=hourly['总数'], 成功=hourly['成功'], 失败=hourly['失败'])
def shutdown(self): def shutdown(self):
"""优雅关闭系统""" """优雅关闭系统"""
log.info("开始优雅关闭系统") log.info("开始优雅关闭系统")
Binary file not shown.
+91 -99
View File
@@ -3,62 +3,57 @@
### 参考文档 ### 参考文档
https://alidocs.dingtalk.com/i/nodes/NZQYprEoWoexdo1ohPdxXvDbJ1waOeDk?utm_scene=team_space https://alidocs.dingtalk.com/i/nodes/NZQYprEoWoexdo1ohPdxXvDbJ1waOeDk?utm_scene=team_space
### 程序框架 ### 程序框架(当前实现)
```angular2html ```angular2html
intelligence_system/ intelligence_system/
├── collectors/ # 数据采集层 ├── collectors/ # 数据采集层
│ ├── weibo_spider.py # 黑猫爬虫 │ ├── complaint_spider.py # 投诉信息爬虫(结构化入库/附件走MinIO)
│ ├── rss_subscriptions.py # rss订阅 │ ├── rss_subscriptions.py # RSS 订阅抓取
── news_api.py # 新闻接口 ── internal/ # 内部数据收集(保留)
└── jian_dao_cloud.py # 简道云表单收集器(示例/占位)
│ └── internal/ # 内部数据收集
│ ├── jian_dao_cloud.py # 简道云表单收集器
├── processors/ # 数据处理层 ├── processors/ # 数据处理层
│ ├── data_cleaner.py # 数据清洗(去重/标准化) │ ├── processor_rss_data.py # RSS数据清洗、分词、过滤与入库
│ ├── schema_mapper.py # 数据结构转换器 │ ├── keywords.txt # 行业关键词(用于分词/过滤)
│ ├── text_parser.py # 文本解析(PDF/HTML等) │ ├── stopwords.txt # 停用词
── image_analyzer.py # 图像识别(OpenCV集成) ── ai_engine/
├── video_processor.py # 音视频分离分析 └── ai_proessor_rss_data # 预留(AI分析扩展占位)
│ │
│ └── ai_engine/ # AI分析核心
│ ├── nlp_processor.py # 自然语言处理引擎
│ ├── sentiment_analyzer.py # 情感分析模型
│ └── topic_modeler.py # LDA主题建模工具
├── services/ # 应用服务层 ├── services/ # 应用服务层(保留)
│ ├── monitoring/ # 舆情监控 │ ├── monitoring/ # 舆情监控
│ │ ├── opinion_monitor.py # 实时舆情追踪 │ │ ├── opinion_monitor.py # 实时舆情追踪(占位)
│ │ └── brand_reputation.py # 品牌口碑分析 │ │ └── brand_reputation.py # 品牌口碑分析(占位)
├── analysis/ # 竞品分析
│ ├── analysis/ # 竞品分析 │ ├── competitor_tracker.py # 竞品动态监控(占位)
│ │ ── competitor_tracker.py # 竞品动态监控 │ │ ── swot_generator.py # SWOT分析报告(占位)
│ └── swot_generator.py # SWOT分析报告 ├── reporting/ # 报告服务
│ │ │ │ ├── daily_reporter.py # 自动化日报生成(占位)
├── reporting/ # 报告服务 │ └── weekly_digest.py # 周报汇编系统(占位)
│ ├── daily_reporter.py # 自动化日报生成 └── alert/ # 预警服务
── weekly_digest.py # 周报汇编系统 ── alert_trigger.py # 动态阈值告警(占位)
└── notification_center.py # 邮件/短信通知(占位)
│ └── alert/ # 预警服务
│ ├── alert_trigger.py # 动态阈值告警
│ └── notification_center.py # 邮件/短信通知
├── system_management/ # 系统管理 ├── applications/ # 应用
│ ├── scheduler/ # 任务调度 │ ├── alert.py # 告警触发/通知(占位/实现中)
│ └── task_scheduler.py # 任务调度器 │ └── reporter/
├── daily.py # 日报生成
│ └── monitor/ # 系统监控 └── monthly.py # 月报生成
│ ├── health_monitor.py # 服务健康检测
│ └── performance_watcher.py # 资源占用监控
├── utils/ # 工具库 ├── system_management/ # 系统管理层
│ ├── file_handler.py # 通用文件操作 │ ├── scheduler/
│ ├── logger.py # 日志系统 │ ├── task_scheduler.py # 任务调度器(Cron表达式 + 线程池)
├── mysql_agent.py # MySQL读写管理器 │ └── task_management.py # 任务管理辅助
│ └── datetime_parser.py # 时间格式处理 │ └── monitor/ # 系统监控(目录占位)
├── config.py # 配置加载与管理 ├── utils/ # 工具库
└── main.py # 系统入口(启动所有服务) │ ├── file_handler.py # 通用文件操作
│ ├── logger.py # 跨平台日志系统(Loguru)
│ ├── mysql_agent.py # MySQL读写管理器
│ └── minio_agent.py # MinIO对象存储客户端
├── config.py # 配置加载与管理(含数据库/存储配置)
├── main.py # 系统入口(Cron轮询 + 调度执行)
└── requirements.txt # 依赖清单
``` ```
### 程序设计原则 ### 程序设计原则
@@ -67,23 +62,32 @@ intelligence_system/
3. 密钥等信息直接放在配置类中 3. 密钥等信息直接放在配置类中
4. 数据存储遵循"结构化存MySQL,非结构化存MinIO"原则,通过元数据关联 4. 数据存储遵循"结构化存MySQL,非结构化存MinIO"原则,通过元数据关联
### 主程序设计 ### 主程序与调度设计(已实现)
主程序需要一次启动,一直运行,启动时运行一次(在代码中可取消),之后每天定时生成一次报告 主程序以长运行进程方式启动,进入轻量轮询循环(每10秒)。调度器按Cron表达式在`main_task`表中拉取到期任务,使用线程池异步执行,并在每分钟输出运行状态、每小时汇总统计。
主程序包含爬虫/api调度器。该调度器通过查询mysql中任务调度情况按需执行,db文件中应包含任务名称、 - 调度器能力:
任务路径、任务执行频率(支持按天、按周,按分钟)、上次执行时间、下次执行时间等信息 - 基于`croniter`解析Cron表达式,支持时区(默认`Asia/Shanghai`
- 线程池并发执行,信号量限制最大并发(与`max_workers`一致)
- 任务入口动态解析:支持`package.module`、`package.module.ClassName.main`、`package.module.func` 等形式
- 成功/失败后自动计算`next_run_time`或设置15分钟后重试
- 关键字段自动更新:`is_running`、`last_run_time`、`last_run_status`、`run_count`、`next_run_time`
主程序应包含数据处理调度器,根据数据类别分别处理,如文本数据处理调度器、图片数据处理调度器等, - 主循环:
每天定时拉取db获取到的原始数据,分别进行处理,处理完成后将结果保存到mysql中 - 每10秒检查一次待运行任务
- 每分钟打印当前周期统计;每小时写入累计统计日志
- 支持`SIGINT/SIGTERM`优雅关闭,等待正在运行的任务完成
主程序应包含日报、周报等生成,根据时间定时生成报告,报告需要存储 ### 日志设计(已实现)
跨平台日志系统(Loguru)输出至`logs/`目录:
### 日志设计 - application.log:主日志,`rotation = 20MB`,达到阈值后压缩为`application.log.YYYYMMDD.zip``retention = 30天`
日志系统兼容Windows、Mac、Linux平台,以`log`文件形式存储,超过20MB自动压缩。新增存储相关日志内容: - errors.log:错误日志(ERROR及以上),`rotation = 10MB``retention = 90天`
- MySQL操作:批量插入行数、表结构变更、事务状态 - 结构化扩展字段:日志支持`extra`键值对,自动美化并对长字段(如`sql``params`)截断
- MinIO操作:文件上传/下载状态、路径、大小、耗时
- 关联日志:MySQL记录与MinIO对象的绑定关系(如"ID:123 关联文件: collector/images/xxx.jpg" 建议记录的业务事件:
- 异常日志:MySQL连接失败、MinIO上传超时、数据关联不一致等告警信息 - MySQL读写操作要点(表名、影响行数、事务状态)
- MinIO对象操作(对象路径、大小、耗时、状态)
- 任务执行上下文(task_id、task_name、module_path、耗时、状态)
### 存储系统设计(MinIO+MySQL ### 存储系统设计(MinIO+MySQL
#### 核心存储分工 #### 核心存储分工
@@ -113,44 +117,32 @@ intelligence_system/
- 系统类:如任务调度表等采用功能命名(如`main_task` - 系统类:如任务调度表等采用功能命名(如`main_task`
#### 核心表结构 #### 核心表结构(当前落地)
1. `collector_news_api`:新闻API采集数据表(存储新闻标题、内容等结构化数据 1. `main_task`:任务调度表(`task_name``task_type``module_path``cron_expression``time_zone``run_count``is_running``last_run_time``last_run_status``next_run_time``is_active`
2. `collector_complaint_spider`:投诉信息爬虫数据表(含投诉文本、附件MinIO路径`attachment_minio_path`等) 2. `collector_rss_subscriptions`RSS源采集数据(`文章标题``文章摘要``发布时间``来源URL``文章链接``是否已处理` 等)
3. `collector_image_source`:采集层图片元数据表(存储图片URL、MinIO路径、格式、大小等) 3. `processed_rss_data`RSS处理结果(`分词结果``是否汽车相关``处理时间` 等)
4. `processor_text_processor`:文本处理结果表(存储NLP分析结果、关联原文ID等) 4. `collector_complaint_spider`:投诉信息爬虫数据(含文本与附件MinIO路径`attachment_minio_path`等)
5. `processor_image_processor`:图片处理结果表(存储识别标签、特征向量、处理后图片MinIO路径`result_minio_path` 5. 可选:`storage_object_index`(建议用于统一索引MinIO对象元数据
6. `storage_object_index`:MinIO对象索引表(存储所有对象的MinIO路径、哈希值、创建时间、过期时间等)
7. `main_task`:任务调度表(存储任务名称、路径、执行频率、上次/下次执行时间等)
8. `application_reporter_daily`:日报数据表(存储日报结构化内容、报表文件MinIO路径等)
9. `application_reporter_monthly`:月报数据表(存储月报结构化内容、报表文件MinIO路径等)
#### 数据交互特性
1. **MySQL交互**
- 支持DataFrame直接读写,提供分块处理(`chunksize`)和批量插入能力
- 自动适配平台特性(如Windows小批次写入优化)
- 完善的事务机制确保结构化数据一致性
2. **MinIO交互**
- 支持大文件分片上传、断点续传
3. **联动机制**
- 非结构化数据存储时,先上传至MinIO获取路径,再将路径及元数据写入MySQL
- 读取非结构化数据时,先从MySQL获取MinIO路径,再通过路径从MinIO下载
- 日志同步记录MySQL操作和MinIO对象操作(如"上传文件至MinIO: {path},关联MySQL记录ID: {id}"
### 数据采集设计 ### 数据采集设计
1. 结构化数据(如新闻文本、投诉内容):直接写入对应`collector_`前缀表 1. 结构化数据(RSS、投诉文本):写入`collector_`前缀表
2. 非结构化数据(如爬取的图片、附件): 2. 非结构化数据(附件/图片等):
- `minio_agent.py`上传至对应存储桶 - 使`utils/minio_agent.py`上传至对应存储桶
-MinIO路径、文件大小、格式等元数据写入`collector_`前缀表或`storage_object_index` -对象路径与元数据写入业务表或`storage_object_index`
3. 每个采集模块(独立py文件,`main`方法入口)需同时处理MySQLMinIO交互,确保数据关联完整 3. 采集模块需同时处理MySQLMinIO交互,确保关联完整
### 数据处理设计(RSS流程已实现)
`processors/processor_rss_data.py`流程:
- 从`collector_rss_subscriptions`加载未处理数据(可配置`limit`)
- 加载停用词与行业关键词(`stopwords.txt` / `keywords.txt`),并动态注入`jieba`词典
- 标注词性并过滤停用词,仅保留与汽车后市场相关的词汇
- 标记与过滤:出现任一行业关键词即视为相关,进入保存
- 将结果写入`processed_rss_data`,并回写源表`是否已处理 = 1`
- 输出处理统计(总量、命中量、命中率、时间)
### 数据处理设计 ### 依赖与运行
1. 结构化数据处理:从MySQL读取原始数据,处理后写入`processor_`前缀表 - 依赖:见`requirements.txt`pandas、SQLAlchemy、PyMySQL、croniter、pytz、loguru、jieba、feedparser、beautifulsoup4、minio 等)
2. 非结构化数据处理: - 配置:在`config.py`中设置`MYSQL_CONFIG`与MinIO参数
- 从MySQL获取MinIO路径,通过`minio_agent.py`下载原始文件 - 运行:
- 处理后(如图片识别、视频帧提取)将结果文件上传至MinIO(处理层存储桶) - 启动主程序:`python main.py`
- 将处理结果的结构化信息(如识别标签)和处理后文件的MinIO路径写入`processor_`前缀表 - 添加任务:向`main_task`插入记录,`module_path`可指向如`processors.processor_rss_data.main`
3. 支持多表关联存储,通过`source_id`关联原始数据与处理结果
+1
View File
@@ -15,3 +15,4 @@ feedparser==6.0.11
Markdown==3.9 Markdown==3.9
openai==1.107.3 openai==1.107.3
tqdm==4.67.1 tqdm==4.67.1
jieba==0.42.1
+27 -5
View File
@@ -22,6 +22,9 @@ class TaskScheduler:
self.executor = ThreadPoolExecutor(max_workers=max_workers) self.executor = ThreadPoolExecutor(max_workers=max_workers)
# 并发容量控制:限制同时运行的后台任务不超过 max_workers # 并发容量控制:限制同时运行的后台任务不超过 max_workers
self._running_semaphore = threading.Semaphore(max_workers) self._running_semaphore = threading.Semaphore(max_workers)
# 任务统计
self.hourly_stats = {'成功': 0, '失败': 0, '总数': 0}
self.hourly_stats_lock = threading.Lock()
log.info(f"任务调度器已初始化,最大工作线程数: {max_workers}") log.info(f"任务调度器已初始化,最大工作线程数: {max_workers}")
def _resolve_callable(self, module_path: str): def _resolve_callable(self, module_path: str):
@@ -84,8 +87,12 @@ class TaskScheduler:
# 如果所有尝试均失败,则抛出最后的错误 # 如果所有尝试均失败,则抛出最后的错误
raise ImportError(f"模块 {module_path} 导入/解析失败: {str(last_import_error)}") raise ImportError(f"模块 {module_path} 导入/解析失败: {str(last_import_error)}")
def check_and_run_tasks(self) -> Dict[str, int]: def check_and_run_tasks(self, print_empty_status: bool = False) -> Dict[str, int]:
"""检查并执行所有到期的任务,优化空任务处理和异常容错""" """检查并执行所有到期的任务,优化空任务处理和异常容错
Args:
print_empty_status: 是否打印空任务状态(默认False,避免频繁输出)
"""
result = {'总任务数': 0, '成功': 0, '失败': 0} result = {'总任务数': 0, '成功': 0, '失败': 0}
try: try:
@@ -106,8 +113,9 @@ class TaskScheduler:
result['总任务数'] = len(tasks_df) result['总任务数'] = len(tasks_df)
if tasks_df.empty: if tasks_df.empty:
# 空任务时输出INFO级日志,明确提示状态 # 空任务时根据参数决定是否输出
print(f"当前没有到期的任务,等待新任务加入...{now.strftime('%Y-%m-%d %H:%M:%S')}") if print_empty_status:
print(f"当前没有到期的任务,等待新任务加入...{now.strftime('%Y-%m-%d %H:%M:%S')}")
return result return result
# 并发执行任务 # 并发执行任务
@@ -128,6 +136,12 @@ class TaskScheduler:
log.error(f"任务线程执行失败: {str(e)}", exc_info=True) log.error(f"任务线程执行失败: {str(e)}", exc_info=True)
result['失败'] += 1 result['失败'] += 1
# 更新小时统计
with self.hourly_stats_lock:
self.hourly_stats['成功'] += result['成功']
self.hourly_stats['失败'] += result['失败']
self.hourly_stats['总数'] += result['总任务数']
log.info( log.info(
"任务调度周期完成", "任务调度周期完成",
总任务数=result['总任务数'], 总任务数=result['总任务数'],
@@ -419,4 +433,12 @@ class TaskScheduler:
except Exception as e: except Exception as e:
log.error(f"查询待执行任务失败,将重试: {str(e)}", exc_info=True) log.error(f"查询待执行任务失败,将重试: {str(e)}", exc_info=True)
return [] return []
def get_and_reset_hourly_stats(self) -> Dict[str, int]:
    """Return a snapshot of the hourly counters and reset them to zero.

    The copy and the reset both happen while ``hourly_stats_lock`` is held,
    so they form a single step with respect to other code that takes the
    same lock.

    Returns:
        A new dict with the keys '成功', '失败' and '总数' holding the values
        accumulated since the previous reset.
    """
    zeroed = {'成功': 0, '失败': 0, '总数': 0}
    with self.hourly_stats_lock:
        # Hand back a copy; the live attribute is replaced by fresh counters.
        snapshot, self.hourly_stats = self.hourly_stats.copy(), zeroed
    return snapshot
+472 -289
View File
@@ -10,14 +10,25 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 1,
"id": "initial_id", "id": "initial_id",
"metadata": { "metadata": {
"collapsed": true,
"ExecuteTime": { "ExecuteTime": {
"end_time": "2025-10-17T05:43:18.381936Z", "end_time": "2025-10-17T05:43:18.381936Z",
"start_time": "2025-10-17T05:43:15.265036Z" "start_time": "2025-10-17T05:43:15.265036Z"
} },
"collapsed": true
}, },
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"PROJECT_ROOT = d:\\Idea Project\\intelligence_system\n",
"\u001b[32m2025-10-23 16:56:55\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mtask_scheduler\u001b[0m - \u001b[1m任务调度器已初始化,最大工作线程数: 5\u001b[0m\n"
]
}
],
"source": [ "source": [
"# 使 Notebook 可从项目根导入\n", "# 使 Notebook 可从项目根导入\n",
"import sys\n", "import sys\n",
@@ -116,6 +127,78 @@
" log.exception(\"手动执行任务失败\")\n", " log.exception(\"手动执行任务失败\")\n",
" return False\n", " return False\n",
"\n", "\n",
" def run_task_synchronously(self, task_id: int) -> dict:\n",
" \"\"\"同步执行任务并返回详细结果(用于Notebook中查看执行过程)\"\"\"\n",
" import time\n",
" import sys\n",
" from io import StringIO\n",
" \n",
" task = self.get_task_by_id(task_id)\n",
" if not task:\n",
" return {\n",
" 'success': False,\n",
" 'error': f'未找到任务ID: {task_id}',\n",
" 'output': ''\n",
" }\n",
" \n",
" # 捕获标准输出\n",
" old_stdout = sys.stdout\n",
" sys.stdout = output_buffer = StringIO()\n",
" \n",
" start_time = time.time()\n",
" success = False\n",
" error_msg = None\n",
" \n",
" try:\n",
" # 直接同步执行任务逻辑\n",
" self.scheduler._execute_task_logic(task)\n",
" success = True\n",
" \n",
" # 更新任务状态\n",
" next_run_time = self.scheduler._calculate_next_run_time(\n",
" cron_expr=task['cron_expression'],\n",
" time_zone=task.get('time_zone', 'Asia/Shanghai')\n",
" )\n",
" \n",
" self.scheduler._update_task_status(task['task_id'], {\n",
" 'last_run_status': 'success',\n",
" 'is_running': 0,\n",
" 'run_count': task['run_count'] + 1,\n",
" 'next_run_time': next_run_time\n",
" })\n",
" \n",
" except Exception as e:\n",
" success = False\n",
" error_msg = str(e)\n",
" log.exception(f\"任务执行失败: {task['task_name']}\")\n",
" \n",
" # 更新失败状态\n",
" try:\n",
" next_retry_time = datetime.now() + pd.Timedelta(minutes=15)\n",
" self.scheduler._update_task_status(task['task_id'], {\n",
" 'last_run_status': 'failed',\n",
" 'is_running': 0,\n",
" 'next_run_time': next_retry_time\n",
" })\n",
" except Exception:\n",
" pass\n",
" \n",
" finally:\n",
" # 恢复标准输出\n",
" sys.stdout = old_stdout\n",
" output_text = output_buffer.getvalue()\n",
" \n",
" execution_time = time.time() - start_time\n",
" \n",
" return {\n",
" 'success': success,\n",
" 'task_name': task['task_name'],\n",
" 'task_id': task['task_id'],\n",
" 'execution_time': execution_time,\n",
" 'output': output_text,\n",
" 'error': error_msg\n",
" }\n",
"\n",
"# 在这里创建 manager(供后续单元使用)\n", "# 在这里创建 manager(供后续单元使用)\n",
"manager = TaskManager(scheduler)\n", "manager = TaskManager(scheduler)\n",
"\n", "\n",
@@ -134,18 +217,7 @@
" except Exception:\n", " except Exception:\n",
" pass\n", " pass\n",
" return str(dt)" " return str(dt)"
], ]
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"PROJECT_ROOT = D:\\Idea Project\\intelligence_system\n",
"\u001B[32m2025-10-17 13:43:18\u001B[0m | \u001B[1mINFO \u001B[0m | \u001B[36mtask_scheduler\u001B[0m - \u001B[1m任务调度器已初始化,最大工作线程数: 5\u001B[0m\n"
]
}
],
"execution_count": 1
}, },
{ {
"cell_type": "markdown", "cell_type": "markdown",
@@ -157,6 +229,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 2,
"id": "7b020af55972643", "id": "7b020af55972643",
"metadata": { "metadata": {
"ExecuteTime": { "ExecuteTime": {
@@ -164,73 +237,28 @@
"start_time": "2025-10-17T05:43:18.394863Z" "start_time": "2025-10-17T05:43:18.394863Z"
} }
}, },
"source": [
"# 列出所有任务(包括已禁用的)\n",
"def list_tasks(active_only=True):\n",
" tasks = manager.get_all_tasks(active_only)\n",
" if not tasks:\n",
" display(Markdown(\"### 没有找到任务\"))\n",
" return None\n",
"\n",
" df = pd.DataFrame(tasks)\n",
"\n",
" # 格式化日期列\n",
" if 'last_run_time' in df.columns:\n",
" df['last_run_time'] = df['last_run_time'].apply(format_datetime)\n",
" if 'next_run_time' in df.columns:\n",
" df['next_run_time'] = df['next_run_time'].apply(format_datetime)\n",
"\n",
" # 重命名列名\n",
" df = df.rename(columns={\n",
" 'task_id': '任务ID',\n",
" 'task_name': '任务名称',\n",
" 'task_type': '任务类型',\n",
" 'module_path': '模块路径',\n",
" 'cron_expression': 'Cron表达式',\n",
" 'time_zone': '时区',\n",
" 'last_run_time': '最后运行时间',\n",
" 'next_run_time': '下次运行时间',\n",
" 'last_run_status': '运行状态',\n",
" 'is_active': '是否活跃',\n",
" 'run_count': '运行次数'\n",
" })\n",
"\n",
" display(Markdown(\"### 任务列表\"))\n",
" display(HTML(df.to_html(index=False)))\n",
" return df\n",
"\n",
"# 执行:列出所有任务(包括已禁用)\n",
"list_tasks(active_only=False)\n",
"\n",
"# 或者:只列出活跃任务\n",
"# list_tasks(active_only=True)"
],
"outputs": [ "outputs": [
{ {
"name": "stdout", "name": "stdout",
"output_type": "stream", "output_type": "stream",
"text": [ "text": [
"\u001B[32m2025-10-17 13:43:18\u001B[0m | \u001B[1mINFO \u001B[0m | \u001B[36mmysql_agent\u001B[0m - \u001B[1m查询执行成功\u001B[0m\n" "\u001b[32m2025-10-17 13:43:18\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mmysql_agent\u001b[0m - \u001b[1m查询执行成功\u001b[0m\n"
] ]
}, },
{ {
"data": { "data": {
"text/markdown": [
"### 任务列表"
],
"text/plain": [ "text/plain": [
"<IPython.core.display.Markdown object>" "<IPython.core.display.Markdown object>"
], ]
"text/markdown": "### 任务列表"
}, },
"metadata": {}, "metadata": {},
"output_type": "display_data", "output_type": "display_data"
"jetTransient": {
"display_id": null
}
}, },
{ {
"data": { "data": {
"text/plain": [
"<IPython.core.display.HTML object>"
],
"text/html": [ "text/html": [
"<table border=\"1\" class=\"dataframe\">\n", "<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n", " <thead>\n",
@@ -270,26 +298,16 @@
" </tr>\n", " </tr>\n",
" </tbody>\n", " </tbody>\n",
"</table>" "</table>"
],
"text/plain": [
"<IPython.core.display.HTML object>"
] ]
}, },
"metadata": {}, "metadata": {},
"output_type": "display_data", "output_type": "display_data"
"jetTransient": {
"display_id": null
}
}, },
{ {
"data": { "data": {
"text/plain": [
" 任务ID 任务名称 任务类型 模块路径 \\\n",
"0 1 RSS新闻订阅 collector collectors.rss_subscriptions.NewsAPIClient \n",
"\n",
" Cron表达式 时区 下次运行时间 最后运行时间 \\\n",
"0 5 0 * * * Asia/Shanghai 2025-10-18 00:05:00 2025-10-17 00:05:07 \n",
"\n",
" 运行状态 运行次数 是否活跃 is_running created_at updated_at \n",
"0 success 4 1 0 2025-10-16 15:47:34 2025-10-17 00:05:08 "
],
"text/html": [ "text/html": [
"<div>\n", "<div>\n",
"<style scoped>\n", "<style scoped>\n",
@@ -346,6 +364,16 @@
" </tbody>\n", " </tbody>\n",
"</table>\n", "</table>\n",
"</div>" "</div>"
],
"text/plain": [
" 任务ID 任务名称 任务类型 模块路径 \\\n",
"0 1 RSS新闻订阅 collector collectors.rss_subscriptions.NewsAPIClient \n",
"\n",
" Cron表达式 时区 下次运行时间 最后运行时间 \\\n",
"0 5 0 * * * Asia/Shanghai 2025-10-18 00:05:00 2025-10-17 00:05:07 \n",
"\n",
" 运行状态 运行次数 是否活跃 is_running created_at updated_at \n",
"0 success 4 1 0 2025-10-16 15:47:34 2025-10-17 00:05:08 "
] ]
}, },
"execution_count": 2, "execution_count": 2,
@@ -353,7 +381,47 @@
"output_type": "execute_result" "output_type": "execute_result"
} }
], ],
"execution_count": 2 "source": [
"# 列出所有任务(包括已禁用的)\n",
"def list_tasks(active_only=True):\n",
" tasks = manager.get_all_tasks(active_only)\n",
" if not tasks:\n",
" display(Markdown(\"### 没有找到任务\"))\n",
" return None\n",
"\n",
" df = pd.DataFrame(tasks)\n",
"\n",
" # 格式化日期列\n",
" if 'last_run_time' in df.columns:\n",
" df['last_run_time'] = df['last_run_time'].apply(format_datetime)\n",
" if 'next_run_time' in df.columns:\n",
" df['next_run_time'] = df['next_run_time'].apply(format_datetime)\n",
"\n",
" # 重命名列名\n",
" df = df.rename(columns={\n",
" 'task_id': '任务ID',\n",
" 'task_name': '任务名称',\n",
" 'task_type': '任务类型',\n",
" 'module_path': '模块路径',\n",
" 'cron_expression': 'Cron表达式',\n",
" 'time_zone': '时区',\n",
" 'last_run_time': '最后运行时间',\n",
" 'next_run_time': '下次运行时间',\n",
" 'last_run_status': '运行状态',\n",
" 'is_active': '是否活跃',\n",
" 'run_count': '运行次数'\n",
" })\n",
"\n",
" display(Markdown(\"### 任务列表\"))\n",
" display(HTML(df.to_html(index=False)))\n",
" return df\n",
"\n",
"# 执行:列出所有任务(包括已禁用)\n",
"list_tasks(active_only=False)\n",
"\n",
"# 或者:只列出活跃任务\n",
"# list_tasks(active_only=True)"
]
}, },
{ {
"cell_type": "markdown", "cell_type": "markdown",
@@ -365,6 +433,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 3,
"id": "eab90de72c35429e", "id": "eab90de72c35429e",
"metadata": { "metadata": {
"ExecuteTime": { "ExecuteTime": {
@@ -372,6 +441,62 @@
"start_time": "2025-10-17T05:43:26.071398Z" "start_time": "2025-10-17T05:43:26.071398Z"
} }
}, },
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\u001b[32m2025-10-17 13:43:26\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mmysql_agent\u001b[0m - \u001b[1m查询执行成功\u001b[0m\n"
]
},
{
"data": {
"text/markdown": [
"### 任务详情\n",
"**任务ID**: 1\n",
"**任务名称**: RSS新闻订阅\n",
"**任务类型**: collector\n",
"**模块路径**: collectors.rss_subscriptions.NewsAPIClient\n",
"**Cron表达式**: 5 0 * * *\n",
"**时区**: Asia/Shanghai\n",
"**最后运行时间**: 2025-10-17 00:05:07\n",
"**下次运行时间**: 2025-10-18 00:05:00\n",
"**运行状态**: success\n",
"**是否活跃**: 是\n",
"**运行次数**: 4\n",
"**创建时间**: 2025-10-16 15:47:34"
],
"text/plain": [
"<IPython.core.display.Markdown object>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"{'task_id': 1,\n",
" 'task_name': 'RSS新闻订阅',\n",
" 'task_type': 'collector',\n",
" 'module_path': 'collectors.rss_subscriptions.NewsAPIClient',\n",
" 'cron_expression': '5 0 * * *',\n",
" 'time_zone': 'Asia/Shanghai',\n",
" 'next_run_time': Timestamp('2025-10-18 00:05:00'),\n",
" 'last_run_time': Timestamp('2025-10-17 00:05:07'),\n",
" 'last_run_status': 'success',\n",
" 'run_count': 4,\n",
" 'is_active': 1,\n",
" 'is_running': 0,\n",
" 'created_at': Timestamp('2025-10-16 15:47:34'),\n",
" 'updated_at': Timestamp('2025-10-17 00:05:08')}"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [ "source": [
"# 查看指定任务的详情\n", "# 查看指定任务的详情\n",
"def show_task_details(task_id):\n", "def show_task_details(task_id):\n",
@@ -399,53 +524,7 @@
"\n", "\n",
"# 执行:查看任务ID为1的详情(替换为实际ID)\n", "# 执行:查看任务ID为1的详情(替换为实际ID)\n",
"show_task_details(1)" "show_task_details(1)"
], ]
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\u001B[32m2025-10-17 13:43:26\u001B[0m | \u001B[1mINFO \u001B[0m | \u001B[36mmysql_agent\u001B[0m - \u001B[1m查询执行成功\u001B[0m\n"
]
},
{
"data": {
"text/plain": [
"<IPython.core.display.Markdown object>"
],
"text/markdown": "### 任务详情\n**任务ID**: 1\n**任务名称**: RSS新闻订阅\n**任务类型**: collector\n**模块路径**: collectors.rss_subscriptions.NewsAPIClient\n**Cron表达式**: 5 0 * * *\n**时区**: Asia/Shanghai\n**最后运行时间**: 2025-10-17 00:05:07\n**下次运行时间**: 2025-10-18 00:05:00\n**运行状态**: success\n**是否活跃**: 是\n**运行次数**: 4\n**创建时间**: 2025-10-16 15:47:34"
},
"metadata": {},
"output_type": "display_data",
"jetTransient": {
"display_id": null
}
},
{
"data": {
"text/plain": [
"{'task_id': 1,\n",
" 'task_name': 'RSS新闻订阅',\n",
" 'task_type': 'collector',\n",
" 'module_path': 'collectors.rss_subscriptions.NewsAPIClient',\n",
" 'cron_expression': '5 0 * * *',\n",
" 'time_zone': 'Asia/Shanghai',\n",
" 'next_run_time': Timestamp('2025-10-18 00:05:00'),\n",
" 'last_run_time': Timestamp('2025-10-17 00:05:07'),\n",
" 'last_run_status': 'success',\n",
" 'run_count': 4,\n",
" 'is_active': 1,\n",
" 'is_running': 0,\n",
" 'created_at': Timestamp('2025-10-16 15:47:34'),\n",
" 'updated_at': Timestamp('2025-10-17 00:05:08')}"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"execution_count": 3
}, },
{ {
"cell_type": "markdown", "cell_type": "markdown",
@@ -473,8 +552,8 @@
"name": "stdout", "name": "stdout",
"output_type": "stream", "output_type": "stream",
"text": [ "text": [
"\u001B[32m2025-10-16 15:47:34\u001B[0m | \u001B[1mINFO \u001B[0m | \u001B[36mmysql_agent\u001B[0m - \u001B[1m查询执行成功\u001B[0m\n", "\u001b[32m2025-10-16 15:47:34\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mmysql_agent\u001b[0m - \u001b[1m查询执行成功\u001b[0m\n",
"\u001B[32m2025-10-16 15:47:34\u001B[0m | \u001B[1mINFO \u001B[0m | \u001B[36mtask_scheduler\u001B[0m - \u001B[1m新任务添加成功\u001B[0m\n" "\u001b[32m2025-10-16 15:47:34\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mtask_scheduler\u001b[0m - \u001b[1m新任务添加成功\u001b[0m\n"
] ]
}, },
{ {
@@ -550,6 +629,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 4,
"id": "c892fd8ad2f0dd9d", "id": "c892fd8ad2f0dd9d",
"metadata": { "metadata": {
"ExecuteTime": { "ExecuteTime": {
@@ -557,6 +637,61 @@
"start_time": "2025-10-17T05:44:18.980345Z" "start_time": "2025-10-17T05:44:18.980345Z"
} }
}, },
"outputs": [
{
"data": {
"text/markdown": [
"### 任务ID 1 更新成功"
],
"text/plain": [
"<IPython.core.display.Markdown object>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\u001b[32m2025-10-17 13:44:19\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mmysql_agent\u001b[0m - \u001b[1m查询执行成功\u001b[0m\n"
]
},
{
"data": {
"text/markdown": [
"### 任务详情\n",
"**任务ID**: 1\n",
"**任务名称**: RSS新闻订阅\n",
"**任务类型**: collector\n",
"**模块路径**: collectors.rss_subscriptions.NewsAPIClient\n",
"**Cron表达式**: 5 * * * *\n",
"**时区**: Asia/Shanghai\n",
"**最后运行时间**: 2025-10-17 00:05:07\n",
"**下次运行时间**: 2025-10-18 00:05:00\n",
"**运行状态**: success\n",
"**是否活跃**: 是\n",
"**运行次数**: 4\n",
"**创建时间**: 2025-10-16 15:47:34"
],
"text/plain": [
"<IPython.core.display.Markdown object>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"True"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [ "source": [
"# 更新任务属性\n", "# 更新任务属性\n",
"def update_task(task_id, **kwargs):\n", "def update_task(task_id, **kwargs):\n",
@@ -589,53 +724,7 @@
"\n", "\n",
"# 执行:同时更新多个属性(名称和Cron表达式)\n", "# 执行:同时更新多个属性(名称和Cron表达式)\n",
"# update_task(1, name=\"每日早间新闻采集\", cron=\"0 8 * * *\")" "# update_task(1, name=\"每日早间新闻采集\", cron=\"0 8 * * *\")"
], ]
"outputs": [
{
"data": {
"text/plain": [
"<IPython.core.display.Markdown object>"
],
"text/markdown": "### 任务ID 1 更新成功"
},
"metadata": {},
"output_type": "display_data",
"jetTransient": {
"display_id": null
}
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\u001B[32m2025-10-17 13:44:19\u001B[0m | \u001B[1mINFO \u001B[0m | \u001B[36mmysql_agent\u001B[0m - \u001B[1m查询执行成功\u001B[0m\n"
]
},
{
"data": {
"text/plain": [
"<IPython.core.display.Markdown object>"
],
"text/markdown": "### 任务详情\n**任务ID**: 1\n**任务名称**: RSS新闻订阅\n**任务类型**: collector\n**模块路径**: collectors.rss_subscriptions.NewsAPIClient\n**Cron表达式**: 5 * * * *\n**时区**: Asia/Shanghai\n**最后运行时间**: 2025-10-17 00:05:07\n**下次运行时间**: 2025-10-18 00:05:00\n**运行状态**: success\n**是否活跃**: 是\n**运行次数**: 4\n**创建时间**: 2025-10-16 15:47:34"
},
"metadata": {},
"output_type": "display_data",
"jetTransient": {
"display_id": null
}
},
{
"data": {
"text/plain": [
"True"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"execution_count": 4
}, },
{ {
"cell_type": "markdown", "cell_type": "markdown",
@@ -702,6 +791,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 2,
"id": "94892f4134316f8e", "id": "94892f4134316f8e",
"metadata": { "metadata": {
"ExecuteTime": { "ExecuteTime": {
@@ -709,8 +799,177 @@
"start_time": "2025-10-17T05:44:35.084369Z" "start_time": "2025-10-17T05:44:35.084369Z"
} }
}, },
"outputs": [
{
"data": {
"text/markdown": [
"### 开始执行任务ID 2"
],
"text/plain": [
"<IPython.core.display.Markdown object>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/markdown": [
"---"
],
"text/plain": [
"<IPython.core.display.Markdown object>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\u001b[32m2025-10-23 16:57:20\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mmysql_agent\u001b[0m - \u001b[1m查询执行成功\u001b[0m\n",
"\u001b[32m2025-10-23 16:57:20\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mprocessor_rss_data\u001b[0m - \u001b[1mRSS数据处理器初始化完成\u001b[0m\n",
"\u001b[32m2025-10-23 16:57:20\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mprocessor_rss_data\u001b[0m - \u001b[1m开始处理RSS数据...\u001b[0m\n",
"\u001b[32m2025-10-23 16:57:21\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mprocessor_rss_data\u001b[0m - \u001b[1m成功加载 8 条未处理的RSS数据\u001b[0m\n",
"\u001b[32m2025-10-23 16:57:21\u001b[0m | \u001b[33m\u001b[1mWARNING \u001b[0m | \u001b[36mprocessor_rss_data\u001b[0m - \u001b[33m\u001b[1m停用词文件不存在: processors/stopwords.txt,使用默认停用词\u001b[0m\n",
"\u001b[32m2025-10-23 16:57:21\u001b[0m | \u001b[33m\u001b[1mWARNING \u001b[0m | \u001b[36mprocessor_rss_data\u001b[0m - \u001b[33m\u001b[1m关键词文件不存在: processors/keywords.txt\u001b[0m\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Building prefix dict from the default dictionary ...\n",
"Loading model from cache C:\\Users\\zy187\\AppData\\Local\\Temp\\jieba.cache\n",
"Loading model cost 0.609 seconds.\n",
"Prefix dict has been built successfully.\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\u001b[32m2025-10-23 16:57:21\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mprocessor_rss_data\u001b[0m - \u001b[1m数据处理完成,共处理 8 条记录\u001b[0m\n",
"\u001b[32m2025-10-23 16:57:21\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mprocessor_rss_data\u001b[0m - \u001b[1m过滤出 1 条汽车后市场相关新闻\u001b[0m\n",
"\u001b[32m2025-10-23 16:57:21\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mmysql_agent\u001b[0m - \u001b[1m表 processed_rss_data 插入结果汇总\u001b[0m\n",
"\u001b[32m2025-10-23 16:57:21\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mprocessor_rss_data\u001b[0m - \u001b[1m成功保存 1 条处理结果到数据库\u001b[0m\n",
"\u001b[32m2025-10-23 16:57:21\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mprocessor_rss_data\u001b[0m - \u001b[1m成功标记 8 条数据为已处理\u001b[0m\n",
"\u001b[32m2025-10-23 16:57:21\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mprocessor_rss_data\u001b[0m - \u001b[1mRSS数据处理完成\u001b[0m\n",
"\u001b[32m2025-10-23 16:57:21\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mtask_scheduler\u001b[0m - \u001b[1m任务执行完成,耗时: 1.19秒\u001b[0m\n"
]
},
{
"data": {
"text/markdown": [
"**任务名称**: RSS基于规则数据处理"
],
"text/plain": [
"<IPython.core.display.Markdown object>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/markdown": [
"**任务ID**: 2"
],
"text/plain": [
"<IPython.core.display.Markdown object>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/markdown": [
"**执行时长**: 1.26 秒"
],
"text/plain": [
"<IPython.core.display.Markdown object>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/markdown": [
"---"
],
"text/plain": [
"<IPython.core.display.Markdown object>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/markdown": [
"### 📋 执行输出:"
],
"text/plain": [
"<IPython.core.display.Markdown object>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"RSS数据处理完成!\n",
"处理统计: {'total_articles': 8, 'filtered_articles': 1, 'filter_rate': 0.125, 'processing_time': '2025-10-23 16:57:21', 'save_success': True, 'mark_success': True}\n",
"\n"
]
},
{
"data": {
"text/markdown": [
"---"
],
"text/plain": [
"<IPython.core.display.Markdown object>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/markdown": [
"### ✅ 任务执行成功"
],
"text/plain": [
"<IPython.core.display.Markdown object>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"{'success': True,\n",
" 'task_name': 'RSS基于规则数据处理',\n",
" 'task_id': 2,\n",
" 'execution_time': 1.2610254287719727,\n",
" 'output': \"RSS数据处理完成!\\n处理统计: {'total_articles': 8, 'filtered_articles': 1, 'filter_rate': 0.125, 'processing_time': '2025-10-23 16:57:21', 'save_success': True, 'mark_success': True}\\n\",\n",
" 'error': None}"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [ "source": [
"# 手动执行任务\n", "# 手动执行任务(异步方式,快速返回)\n",
"def run_task_manually(task_id):\n", "def run_task_manually(task_id):\n",
" display(Markdown(f\"### 正在手动执行任务ID {task_id}...\"))\n", " display(Markdown(f\"### 正在手动执行任务ID {task_id}...\"))\n",
" success = manager.run_task_manually(task_id)\n", " success = manager.run_task_manually(task_id)\n",
@@ -720,118 +979,42 @@
" display(Markdown(f\"### 任务ID {task_id} 执行失败\"))\n", " display(Markdown(f\"### 任务ID {task_id} 执行失败\"))\n",
" return success\n", " return success\n",
"\n", "\n",
"# 执行:手动行任务ID为1的任务\n", "# 手动行任务(同步方式,显示详细执行过程)\n",
"run_task_manually(1)" "def run_task_with_details(task_id):\n",
], " display(Markdown(f\"### 开始执行任务ID {task_id}\"))\n",
"outputs": [ " display(Markdown(\"---\"))\n",
{ " \n",
"data": { " result = manager.run_task_synchronously(task_id)\n",
"text/plain": [ " \n",
"<IPython.core.display.Markdown object>" " if not result['success'] and result.get('error') and 'task_id' not in result:\n",
], " display(Markdown(f\"### ❌ 错误: {result['error']}\"))\n",
"text/markdown": "### 正在手动执行任务ID 1..." " return result\n",
}, " \n",
"metadata": {}, " # 显示任务基本信息\n",
"output_type": "display_data", " display(Markdown(f\"**任务名称**: {result['task_name']}\"))\n",
"jetTransient": { " display(Markdown(f\"**任务ID**: {result['task_id']}\"))\n",
"display_id": null " display(Markdown(f\"**执行时长**: {result['execution_time']:.2f} 秒\"))\n",
} " display(Markdown(\"---\"))\n",
}, " \n",
{ " # 显示执行输出\n",
"name": "stdout", " if result['output']:\n",
"output_type": "stream", " display(Markdown(\"### 📋 执行输出:\"))\n",
"text": [ " print(result['output'])\n",
"\u001B[32m2025-10-17 13:44:35\u001B[0m | \u001B[1mINFO \u001B[0m | \u001B[36mmysql_agent\u001B[0m - \u001B[1m查询执行成功\u001B[0m\n", " display(Markdown(\"---\"))\n",
"\u001B[32m2025-10-17 13:44:35\u001B[0m | \u001B[1mINFO \u001B[0m | \u001B[36mtask_scheduler\u001B[0m - \u001B[1m开始执行任务: RSS新闻订阅\u001B[0m\n" " \n",
] " # 显示执行结果\n",
}, " if result['success']:\n",
{ " display(Markdown(\"### ✅ 任务执行成功\"))\n",
"data": { " else:\n",
"text/plain": [ " display(Markdown(f\"### ❌ 任务执行失败\"))\n",
"<IPython.core.display.Markdown object>" " if result['error']:\n",
], " display(Markdown(f\"**错误信息**: {result['error']}\"))\n",
"text/markdown": "### 任务ID 1 执行成功" " \n",
}, " return result\n",
"metadata": {}, "\n",
"output_type": "display_data", "# 执行:手动运行任务ID为2的任务(显示详细执行过程)\n",
"jetTransient": { "run_task_with_details(2)"
"display_id": null ]
}
},
{
"data": {
"text/plain": [
"True"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\u001B[32m2025-10-17 13:44:37\u001B[0m | \u001B[1mINFO \u001B[0m | \u001B[36mrss_subscriptions\u001B[0m - \u001B[1m新闻API客户端初始化完成,已连接到数据库\u001B[0m\n",
"\u001B[32m2025-10-17 13:44:37\u001B[0m | \u001B[1mINFO \u001B[0m | \u001B[36mrss_subscriptions\u001B[0m - \u001B[1m数据库表结构验证通过,当前字段:['id', '文章标题', '文章链接', '文章摘要', '发布时间', '来源URL', '创建时间', '更新时间']\u001B[0m\n",
"\u001B[32m2025-10-17 13:44:37\u001B[0m | \u001B[1mINFO \u001B[0m | \u001B[36mrss_subscriptions\u001B[0m - \u001B[1m上次更新时间: 2025-10-16 08:11:07\u001B[0m\n",
"\u001B[32m2025-10-17 13:44:37\u001B[0m | \u001B[1mINFO \u001B[0m | \u001B[36mrss_subscriptions\u001B[0m - \u001B[1m开始获取RSS源数据...\u001B[0m\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"D:\\ProgramTools\\anaconda3\\envs\\intelligence_system\\Lib\\site-packages\\requests\\__init__.py:86: RequestsDependencyWarning: Unable to find acceptable character detection dependency (chardet or charset_normalizer).\n",
" warnings.warn(\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\u001B[32m2025-10-17 13:44:38\u001B[0m | \u001B[1mINFO \u001B[0m | \u001B[36mrss_subscriptions\u001B[0m - \u001B[1mRSS源获取完成,成功获取 4/4 个源\u001B[0m\n",
"\u001B[32m2025-10-17 13:44:38\u001B[0m | \u001B[1mINFO \u001B[0m | \u001B[36mrss_subscriptions\u001B[0m - \u001B[1m获取完成,耗时: 0.72秒\u001B[0m\n",
"\u001B[32m2025-10-17 13:44:38\u001B[0m | \u001B[1mINFO \u001B[0m | \u001B[36mrss_subscriptions\u001B[0m - \u001B[1m开始处理 RSS 源: https://www.chinanews.com.cn/rss/china.xml\u001B[0m\n",
"\u001B[32m2025-10-17 13:44:38\u001B[0m | \u001B[1mINFO \u001B[0m | \u001B[36mmysql_agent\u001B[0m - \u001B[1m表 collector_rss_subscriptions 插入结果汇总\u001B[0m\n",
"\u001B[32m2025-10-17 13:44:38\u001B[0m | \u001B[1mINFO \u001B[0m | \u001B[36mrss_subscriptions\u001B[0m - \u001B[1m成功写入 30/30 条记录\u001B[0m\n",
"\u001B[32m2025-10-17 13:44:38\u001B[0m | \u001B[1mINFO \u001B[0m | \u001B[36mrss_subscriptions\u001B[0m - \u001B[1m开始处理 RSS 源: https://www.chinanews.com.cn/rss/world.xml\u001B[0m\n",
"\u001B[32m2025-10-17 13:44:39\u001B[0m | \u001B[33m\u001B[1mWARNING \u001B[0m | \u001B[36mmysql_agent\u001B[0m - \u001B[33m\u001B[1m表 collector_rss_subscriptions 中跳过重复记录\u001B[0m\n",
"\u001B[32m2025-10-17 13:44:39\u001B[0m | \u001B[33m\u001B[1mWARNING \u001B[0m | \u001B[36mmysql_agent\u001B[0m - \u001B[33m\u001B[1m表 collector_rss_subscriptions 中跳过重复记录\u001B[0m\n",
"\u001B[32m2025-10-17 13:44:39\u001B[0m | \u001B[1mINFO \u001B[0m | \u001B[36mmysql_agent\u001B[0m - \u001B[1m表 collector_rss_subscriptions 插入结果汇总\u001B[0m\n",
"\u001B[32m2025-10-17 13:44:39\u001B[0m | \u001B[31m\u001B[1mERROR \u001B[0m | \u001B[36mmysql_agent\u001B[0m - \u001B[31m\u001B[1m表 collector_rss_subscriptions 插入失败记录详情\u001B[0m\n",
"\u001B[32m2025-10-17 13:44:39\u001B[0m | \u001B[1mINFO \u001B[0m | \u001B[36mrss_subscriptions\u001B[0m - \u001B[1m成功写入 28/30 条记录\u001B[0m\n",
"\u001B[32m2025-10-17 13:44:39\u001B[0m | \u001B[1mINFO \u001B[0m | \u001B[36mrss_subscriptions\u001B[0m - \u001B[1m开始处理 RSS 源: https://www.chinanews.com.cn/rss/finance.xml\u001B[0m\n",
"\u001B[32m2025-10-17 13:44:39\u001B[0m | \u001B[1mINFO \u001B[0m | \u001B[36mmysql_agent\u001B[0m - \u001B[1m表 collector_rss_subscriptions 插入结果汇总\u001B[0m\n",
"\u001B[32m2025-10-17 13:44:39\u001B[0m | \u001B[1mINFO \u001B[0m | \u001B[36mrss_subscriptions\u001B[0m - \u001B[1m成功写入 30/30 条记录\u001B[0m\n",
"\u001B[32m2025-10-17 13:44:39\u001B[0m | \u001B[1mINFO \u001B[0m | \u001B[36mrss_subscriptions\u001B[0m - \u001B[1m开始处理 RSS 源: https://www.chinanews.com.cn/rss/scroll-news.xml\u001B[0m\n",
"\u001B[32m2025-10-17 13:44:39\u001B[0m | \u001B[33m\u001B[1mWARNING \u001B[0m | \u001B[36mmysql_agent\u001B[0m - \u001B[33m\u001B[1m表 collector_rss_subscriptions 中跳过重复记录\u001B[0m\n",
"\u001B[32m2025-10-17 13:44:39\u001B[0m | \u001B[33m\u001B[1mWARNING \u001B[0m | \u001B[36mmysql_agent\u001B[0m - \u001B[33m\u001B[1m表 collector_rss_subscriptions 中跳过重复记录\u001B[0m\n",
"\u001B[32m2025-10-17 13:44:39\u001B[0m | \u001B[33m\u001B[1mWARNING \u001B[0m | \u001B[36mmysql_agent\u001B[0m - \u001B[33m\u001B[1m表 collector_rss_subscriptions 中跳过重复记录\u001B[0m\n",
"\u001B[32m2025-10-17 13:44:39\u001B[0m | \u001B[33m\u001B[1mWARNING \u001B[0m | \u001B[36mmysql_agent\u001B[0m - \u001B[33m\u001B[1m表 collector_rss_subscriptions 中跳过重复记录\u001B[0m\n",
"\u001B[32m2025-10-17 13:44:39\u001B[0m | \u001B[33m\u001B[1mWARNING \u001B[0m | \u001B[36mmysql_agent\u001B[0m - \u001B[33m\u001B[1m表 collector_rss_subscriptions 中跳过重复记录\u001B[0m\n",
"\u001B[32m2025-10-17 13:44:39\u001B[0m | \u001B[33m\u001B[1mWARNING \u001B[0m | \u001B[36mmysql_agent\u001B[0m - \u001B[33m\u001B[1m表 collector_rss_subscriptions 中跳过重复记录\u001B[0m\n",
"\u001B[32m2025-10-17 13:44:39\u001B[0m | \u001B[33m\u001B[1mWARNING \u001B[0m | \u001B[36mmysql_agent\u001B[0m - \u001B[33m\u001B[1m表 collector_rss_subscriptions 中跳过重复记录\u001B[0m\n",
"\u001B[32m2025-10-17 13:44:39\u001B[0m | \u001B[33m\u001B[1mWARNING \u001B[0m | \u001B[36mmysql_agent\u001B[0m - \u001B[33m\u001B[1m表 collector_rss_subscriptions 中跳过重复记录\u001B[0m\n",
"\u001B[32m2025-10-17 13:44:39\u001B[0m | \u001B[33m\u001B[1mWARNING \u001B[0m | \u001B[36mmysql_agent\u001B[0m - \u001B[33m\u001B[1m表 collector_rss_subscriptions 中跳过重复记录\u001B[0m\n",
"\u001B[32m2025-10-17 13:44:39\u001B[0m | \u001B[33m\u001B[1mWARNING \u001B[0m | \u001B[36mmysql_agent\u001B[0m - \u001B[33m\u001B[1m表 collector_rss_subscriptions 中跳过重复记录\u001B[0m\n",
"\u001B[32m2025-10-17 13:44:39\u001B[0m | \u001B[33m\u001B[1mWARNING \u001B[0m | \u001B[36mmysql_agent\u001B[0m - \u001B[33m\u001B[1m表 collector_rss_subscriptions 中跳过重复记录\u001B[0m\n",
"\u001B[32m2025-10-17 13:44:39\u001B[0m | \u001B[33m\u001B[1mWARNING \u001B[0m | \u001B[36mmysql_agent\u001B[0m - \u001B[33m\u001B[1m表 collector_rss_subscriptions 中跳过重复记录\u001B[0m\n",
"\u001B[32m2025-10-17 13:44:39\u001B[0m | \u001B[33m\u001B[1mWARNING \u001B[0m | \u001B[36mmysql_agent\u001B[0m - \u001B[33m\u001B[1m表 collector_rss_subscriptions 中跳过重复记录\u001B[0m\n",
"\u001B[32m2025-10-17 13:44:39\u001B[0m | \u001B[33m\u001B[1mWARNING \u001B[0m | \u001B[36mmysql_agent\u001B[0m - \u001B[33m\u001B[1m表 collector_rss_subscriptions 中跳过重复记录\u001B[0m\n",
"\u001B[32m2025-10-17 13:44:39\u001B[0m | \u001B[33m\u001B[1mWARNING \u001B[0m | \u001B[36mmysql_agent\u001B[0m - \u001B[33m\u001B[1m表 collector_rss_subscriptions 中跳过重复记录\u001B[0m\n",
"\u001B[32m2025-10-17 13:44:39\u001B[0m | \u001B[33m\u001B[1mWARNING \u001B[0m | \u001B[36mmysql_agent\u001B[0m - \u001B[33m\u001B[1m表 collector_rss_subscriptions 中跳过重复记录\u001B[0m\n",
"\u001B[32m2025-10-17 13:44:39\u001B[0m | \u001B[33m\u001B[1mWARNING \u001B[0m | \u001B[36mmysql_agent\u001B[0m - \u001B[33m\u001B[1m表 collector_rss_subscriptions 中跳过重复记录\u001B[0m\n",
"\u001B[32m2025-10-17 13:44:39\u001B[0m | \u001B[1mINFO \u001B[0m | \u001B[36mmysql_agent\u001B[0m - \u001B[1m表 collector_rss_subscriptions 插入结果汇总\u001B[0m\n",
"\u001B[32m2025-10-17 13:44:39\u001B[0m | \u001B[31m\u001B[1mERROR \u001B[0m | \u001B[36mmysql_agent\u001B[0m - \u001B[31m\u001B[1m表 collector_rss_subscriptions 插入失败记录详情\u001B[0m\n",
"\u001B[32m2025-10-17 13:44:39\u001B[0m | \u001B[1mINFO \u001B[0m | \u001B[36mrss_subscriptions\u001B[0m - \u001B[1m成功写入 13/30 条记录\u001B[0m\n",
"\u001B[32m2025-10-17 13:44:39\u001B[0m | \u001B[1mINFO \u001B[0m | \u001B[36mrss_subscriptions\u001B[0m - \u001B[1m本次最新更新时间: 2025-10-17 05:41:17\u001B[0m\n",
"\u001B[32m2025-10-17 13:44:39\u001B[0m | \u001B[1mINFO \u001B[0m | \u001B[36mtask_scheduler\u001B[0m - \u001B[1m任务执行完成,耗时: 1.85秒\u001B[0m\n",
"\u001B[32m2025-10-17 13:44:39\u001B[0m | \u001B[1mINFO \u001B[0m | \u001B[36mtask_scheduler\u001B[0m - \u001B[1m任务执行成功: RSS新闻订阅\u001B[0m\n"
]
}
],
"execution_count": 5
}, },
{ {
"cell_type": "markdown", "cell_type": "markdown",