rss订阅数据爬取及数据处理
This commit is contained in:
+80041
-229055
File diff suppressed because it is too large
Load Diff
+3366
File diff suppressed because it is too large
Load Diff
@@ -23,11 +23,31 @@ class IntelligenceSystem:
|
|||||||
self._setup_signal_handlers()
|
self._setup_signal_handlers()
|
||||||
log.info("系统启动 - 运行在Cron调度模式")
|
log.info("系统启动 - 运行在Cron调度模式")
|
||||||
|
|
||||||
|
# 时间追踪变量
|
||||||
|
last_status_print_time = time.time() # 上次打印状态的时间
|
||||||
|
last_hourly_report_time = time.time() # 上次小时统计的时间
|
||||||
|
status_print_interval = 60 # 每分钟打印一次状态(60秒)
|
||||||
|
hourly_report_interval = 3600 # 每小时统计一次(3600秒)
|
||||||
|
|
||||||
try:
|
try:
|
||||||
# 主循环 - 仅负责定期检查任务
|
# 主循环 - 仅负责定期检查任务
|
||||||
while self._running:
|
while self._running:
|
||||||
|
current_time = time.time()
|
||||||
|
|
||||||
|
# 判断是否需要打印状态(每分钟一次)
|
||||||
|
should_print_status = (current_time - last_status_print_time) >= status_print_interval
|
||||||
|
|
||||||
# 检查并执行到期任务
|
# 检查并执行到期任务
|
||||||
self.scheduler.check_and_run_tasks()
|
self.scheduler.check_and_run_tasks(print_empty_status=should_print_status)
|
||||||
|
|
||||||
|
# 更新最后打印时间
|
||||||
|
if should_print_status:
|
||||||
|
last_status_print_time = current_time
|
||||||
|
|
||||||
|
# 检查是否需要进行小时统计(每小时一次)
|
||||||
|
if (current_time - last_hourly_report_time) >= hourly_report_interval:
|
||||||
|
self._print_hourly_stats()
|
||||||
|
last_hourly_report_time = current_time
|
||||||
|
|
||||||
# 短间隔轮询(每10秒检查一次,保证Cron时间精度)
|
# 短间隔轮询(每10秒检查一次,保证Cron时间精度)
|
||||||
time.sleep(10)
|
time.sleep(10)
|
||||||
@@ -48,6 +68,29 @@ class IntelligenceSystem:
|
|||||||
log.info(f"收到关闭信号 {signum},开始关闭系统")
|
log.info(f"收到关闭信号 {signum},开始关闭系统")
|
||||||
self._running = False
|
self._running = False
|
||||||
|
|
||||||
|
def _print_hourly_stats(self):
    """Print the hourly task report to stdout and record it in the log.

    The counters come from ``self.scheduler.get_and_reset_hourly_stats()``,
    so fetching them for the report also resets them.
    """
    hourly = self.scheduler.get_and_reset_hourly_stats()
    timestamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    rule = '=' * 60

    # Human-readable report on stdout.
    print(f"\n{rule}")
    print(f"📊 小时任务统计报告 - {timestamp}")
    print(rule)
    print(f" 总任务数: {hourly['总数']}")
    print(f" 成功: {hourly['成功']}")
    print(f" 失败: {hourly['失败']}")
    total = hourly['总数']
    if total > 0:
        # The rate line is omitted when no task ran, which also avoids
        # dividing by zero.
        print(f" 成功率: {hourly['成功'] / total * 100:.1f}%")
    print(f"{rule}\n")

    # Same figures again through the logger, passed as keyword fields.
    log.info(
        "小时任务统计",
        总任务数=hourly['总数'],
        成功=hourly['成功'],
        失败=hourly['失败'],
    )
|
||||||
|
|
||||||
def shutdown(self):
|
def shutdown(self):
|
||||||
"""优雅关闭系统"""
|
"""优雅关闭系统"""
|
||||||
log.info("开始优雅关闭系统")
|
log.info("开始优雅关闭系统")
|
||||||
|
|||||||
Binary file not shown.
@@ -3,62 +3,57 @@
|
|||||||
### 参考文档
|
### 参考文档
|
||||||
https://alidocs.dingtalk.com/i/nodes/NZQYprEoWoexdo1ohPdxXvDbJ1waOeDk?utm_scene=team_space
|
https://alidocs.dingtalk.com/i/nodes/NZQYprEoWoexdo1ohPdxXvDbJ1waOeDk?utm_scene=team_space
|
||||||
|
|
||||||
### 程序框架
|
### 程序框架(当前实现)
|
||||||
```angular2html
|
```angular2html
|
||||||
intelligence_system/
|
intelligence_system/
|
||||||
├── collectors/ # 数据采集层
|
├── collectors/ # 数据采集层
|
||||||
│ ├── weibo_spider.py # 黑猫爬虫
|
│ ├── complaint_spider.py # 投诉信息爬虫(结构化入库/附件走MinIO)
|
||||||
│ ├── rss_subscriptions.py # rss订阅
|
│ ├── rss_subscriptions.py # RSS 订阅抓取
|
||||||
│ ├── news_api.py # 新闻接口
|
│ └── internal/ # 内部数据收集(保留)
|
||||||
│ │
|
│ └── jian_dao_cloud.py # 简道云表单收集器(示例/占位)
|
||||||
│ └── internal/ # 内部数据收集
|
|
||||||
│ ├── jian_dao_cloud.py # 简道云表单收集器
|
|
||||||
│
|
│
|
||||||
├── processors/ # 数据处理层
|
├── processors/ # 数据处理层
|
||||||
│ ├── data_cleaner.py # 数据清洗(去重/标准化)
|
│ ├── processor_rss_data.py # RSS数据清洗、分词、过滤与入库
|
||||||
│ ├── schema_mapper.py # 数据结构转换器
|
│ ├── keywords.txt # 行业关键词(用于分词/过滤)
|
||||||
│ ├── text_parser.py # 文本解析(PDF/HTML等)
|
│ ├── stopwords.txt # 停用词
|
||||||
│ ├── image_analyzer.py # 图像识别(OpenCV集成)
|
│ └── ai_engine/
|
||||||
│ ├── video_processor.py # 音视频分离分析
|
│ └── ai_proessor_rss_data # 预留(AI分析扩展占位)
|
||||||
│ │
|
|
||||||
│ └── ai_engine/ # AI分析核心
|
|
||||||
│ ├── nlp_processor.py # 自然语言处理引擎
|
|
||||||
│ ├── sentiment_analyzer.py # 情感分析模型
|
|
||||||
│ └── topic_modeler.py # LDA主题建模工具
|
|
||||||
│
|
│
|
||||||
├── services/ # 应用服务层
|
├── services/ # 应用服务层(保留)
|
||||||
│ ├── monitoring/ # 舆情监控
|
│ ├── monitoring/ # 舆情监控
|
||||||
│ │ ├── opinion_monitor.py # 实时舆情追踪
|
│ │ ├── opinion_monitor.py # 实时舆情追踪(占位)
|
||||||
│ │ └── brand_reputation.py # 品牌口碑分析
|
│ │ └── brand_reputation.py # 品牌口碑分析(占位)
|
||||||
│ │
|
|
||||||
│ ├── analysis/ # 竞品分析
|
│ ├── analysis/ # 竞品分析
|
||||||
│ │ ├── competitor_tracker.py # 竞品动态监控
|
│ │ ├── competitor_tracker.py # 竞品动态监控(占位)
|
||||||
│ │ └── swot_generator.py # SWOT分析报告
|
│ │ └── swot_generator.py # SWOT分析报告(占位)
|
||||||
│ │
|
|
||||||
│ ├── reporting/ # 报告服务
|
│ ├── reporting/ # 报告服务
|
||||||
│ │ ├── daily_reporter.py # 自动化日报生成
|
│ │ ├── daily_reporter.py # 自动化日报生成(占位)
|
||||||
│ │ └── weekly_digest.py # 周报汇编系统
|
│ │ └── weekly_digest.py # 周报汇编系统(占位)
|
||||||
│ │
|
|
||||||
│ └── alert/ # 预警服务
|
│ └── alert/ # 预警服务
|
||||||
│ ├── alert_trigger.py # 动态阈值告警
|
│ ├── alert_trigger.py # 动态阈值告警(占位)
|
||||||
│ └── notification_center.py # 邮件/短信通知
|
│ └── notification_center.py # 邮件/短信通知(占位)
|
||||||
|
│
|
||||||
|
├── applications/ # 应用层
|
||||||
|
│ ├── alert.py # 告警触发/通知(占位/实现中)
|
||||||
|
│ └── reporter/
|
||||||
|
│ ├── daily.py # 日报生成
|
||||||
|
│ └── monthly.py # 月报生成
|
||||||
│
|
│
|
||||||
├── system_management/ # 系统管理层
|
├── system_management/ # 系统管理层
|
||||||
│ ├── scheduler/ # 任务调度
|
│ ├── scheduler/
|
||||||
│ │ └── task_scheduler.py # 任务调度器
|
│ │ ├── task_scheduler.py # 任务调度器(Cron表达式 + 线程池)
|
||||||
│ │
|
│ │ └── task_management.py # 任务管理辅助
|
||||||
│ └── monitor/ # 系统监控
|
│ └── monitor/ # 系统监控(目录占位)
|
||||||
│ ├── health_monitor.py # 服务健康检测
|
|
||||||
│ └── performance_watcher.py # 资源占用监控
|
|
||||||
│
|
│
|
||||||
├── utils/ # 工具库
|
├── utils/ # 工具库
|
||||||
│ ├── file_handler.py # 通用文件操作
|
│ ├── file_handler.py # 通用文件操作
|
||||||
│ ├── logger.py # 日志系统
|
│ ├── logger.py # 跨平台日志系统(Loguru)
|
||||||
│ ├── mysql_agent.py # MySQL读写管理器
|
│ ├── mysql_agent.py # MySQL读写管理器
|
||||||
│ └── datetime_parser.py # 时间格式处理
|
│ └── minio_agent.py # MinIO对象存储客户端
|
||||||
│
|
│
|
||||||
├── config.py # 配置加载与管理
|
├── config.py # 配置加载与管理(含数据库/存储配置)
|
||||||
└── main.py # 系统入口(启动所有服务)
|
├── main.py # 系统入口(Cron轮询 + 调度执行)
|
||||||
|
└── requirements.txt # 依赖清单
|
||||||
```
|
```
|
||||||
|
|
||||||
### 程序设计原则
|
### 程序设计原则
|
||||||
@@ -67,23 +62,32 @@ intelligence_system/
|
|||||||
3. 密钥等信息直接放在配置类中
|
3. 密钥等信息直接放在配置类中
|
||||||
4. 数据存储遵循"结构化存MySQL,非结构化存MinIO"原则,通过元数据关联
|
4. 数据存储遵循"结构化存MySQL,非结构化存MinIO"原则,通过元数据关联
|
||||||
|
|
||||||
### 主程序设计
|
### 主程序与调度设计(已实现)
|
||||||
主程序需要一次启动,一直运行,启动时运行一次(在代码中可取消),之后每天定时生成一次报告
|
主程序以长运行进程方式启动,进入轻量轮询循环(每10秒)。调度器按Cron表达式在`main_task`表中拉取到期任务,使用线程池异步执行,并在每分钟输出运行状态、每小时汇总统计。
|
||||||
|
|
||||||
主程序包含爬虫/api调度器。该调度器通过查询mysql中任务调度情况按需执行,db文件中应包含任务名称、
|
- 调度器能力:
|
||||||
任务路径、任务执行频率(支持按天、按周,按分钟)、上次执行时间、下次执行时间等信息
|
- 基于`croniter`解析Cron表达式,支持时区(默认`Asia/Shanghai`)
|
||||||
|
- 线程池并发执行,信号量限制最大并发(与`max_workers`一致)
|
||||||
|
- 任务入口动态解析:支持`package.module`、`package.module.ClassName.main`、`package.module.func` 等形式
|
||||||
|
- 成功后按Cron表达式自动计算`next_run_time`;失败后将`next_run_time`设为15分钟后以便重试
|
||||||
|
- 关键字段自动更新:`is_running`、`last_run_time`、`last_run_status`、`run_count`、`next_run_time`
|
||||||
|
|
||||||
主程序应包含数据处理调度器,根据数据类别分别处理,如文本数据处理调度器、图片数据处理调度器等,
|
- 主循环:
|
||||||
每天定时拉取db获取到的原始数据,分别进行处理,处理完成后将结果保存到mysql中
|
- 每10秒检查一次待运行任务
|
||||||
|
- 无到期任务时至多每分钟打印一次空闲状态;每小时输出一次任务统计(总数/成功/失败/成功率)并重置计数
|
||||||
|
- 支持`SIGINT/SIGTERM`优雅关闭,等待正在运行的任务完成
|
||||||
|
|
||||||
主程序应包含日报、周报等生成,根据时间定时生成报告,报告需要存储
|
### 日志设计(已实现)
|
||||||
|
跨平台日志系统(Loguru)输出至`logs/`目录:
|
||||||
|
|
||||||
### 日志设计
|
- application.log:主日志,`rotation = 20MB`,达到阈值后压缩为`application.log.YYYYMMDD.zip`,`retention = 30天`
|
||||||
日志系统兼容Windows、Mac、Linux平台,以`log`文件形式存储,超过20MB自动压缩。新增存储相关日志内容:
|
- errors.log:错误日志(ERROR及以上),`rotation = 10MB`,`retention = 90天`
|
||||||
- MySQL操作:批量插入行数、表结构变更、事务状态
|
- 结构化扩展字段:日志支持`extra`键值对,自动美化并对长字段(如`sql`、`params`)截断
|
||||||
- MinIO操作:文件上传/下载状态、路径、大小、耗时
|
|
||||||
- 关联日志:MySQL记录与MinIO对象的绑定关系(如"ID:123 关联文件: collector/images/xxx.jpg")
|
建议记录的业务事件:
|
||||||
- 异常日志:MySQL连接失败、MinIO上传超时、数据关联不一致等告警信息
|
- MySQL读写操作要点(表名、影响行数、事务状态)
|
||||||
|
- MinIO对象操作(对象路径、大小、耗时、状态)
|
||||||
|
- 任务执行上下文(task_id、task_name、module_path、耗时、状态)
|
||||||
|
|
||||||
### 存储系统设计(MinIO+MySQL)
|
### 存储系统设计(MinIO+MySQL)
|
||||||
#### 核心存储分工
|
#### 核心存储分工
|
||||||
@@ -113,44 +117,32 @@ intelligence_system/
|
|||||||
- 系统类:如任务调度表等采用功能命名(如`main_task`)
|
- 系统类:如任务调度表等采用功能命名(如`main_task`)
|
||||||
|
|
||||||
|
|
||||||
#### 核心表结构
|
#### 核心表结构(当前落地)
|
||||||
1. `collector_news_api`:新闻API采集数据表(存储新闻标题、内容等结构化数据)
|
1. `main_task`:任务调度表(`task_name`、`task_type`、`module_path`、`cron_expression`、`time_zone`、`run_count`、`is_running`、`last_run_time`、`last_run_status`、`next_run_time`、`is_active` 等)
|
||||||
2. `collector_complaint_spider`:投诉信息爬虫数据表(含投诉文本、附件MinIO路径`attachment_minio_path`等)
|
2. `collector_rss_subscriptions`:RSS源采集数据(`文章标题`、`文章摘要`、`发布时间`、`来源URL`、`文章链接`、`是否已处理` 等)
|
||||||
3. `collector_image_source`:采集层图片元数据表(存储图片URL、MinIO路径、格式、大小等)
|
3. `processed_rss_data`:RSS处理结果(`分词结果`、`是否汽车相关`、`处理时间` 等)
|
||||||
4. `processor_text_processor`:文本处理结果表(存储NLP分析结果、关联原文ID等)
|
4. `collector_complaint_spider`:投诉信息爬虫数据(含文本与附件MinIO路径`attachment_minio_path`等)
|
||||||
5. `processor_image_processor`:图片处理结果表(存储识别标签、特征向量、处理后图片MinIO路径`result_minio_path`等)
|
5. 可选:`storage_object_index`(建议用于统一索引MinIO对象元数据)
|
||||||
6. `storage_object_index`:MinIO对象索引表(存储所有对象的MinIO路径、哈希值、创建时间、过期时间等)
|
|
||||||
7. `main_task`:任务调度表(存储任务名称、路径、执行频率、上次/下次执行时间等)
|
|
||||||
8. `application_reporter_daily`:日报数据表(存储日报结构化内容、报表文件MinIO路径等)
|
|
||||||
9. `application_reporter_monthly`:月报数据表(存储月报结构化内容、报表文件MinIO路径等)
|
|
||||||
|
|
||||||
|
|
||||||
#### 数据交互特性
|
|
||||||
1. **MySQL交互**
|
|
||||||
- 支持DataFrame直接读写,提供分块处理(`chunksize`)和批量插入能力
|
|
||||||
- 自动适配平台特性(如Windows小批次写入优化)
|
|
||||||
- 完善的事务机制确保结构化数据一致性
|
|
||||||
|
|
||||||
2. **MinIO交互**
|
|
||||||
- 支持大文件分片上传、断点续传
|
|
||||||
|
|
||||||
3. **联动机制**
|
|
||||||
- 非结构化数据存储时,先上传至MinIO获取路径,再将路径及元数据写入MySQL
|
|
||||||
- 读取非结构化数据时,先从MySQL获取MinIO路径,再通过路径从MinIO下载
|
|
||||||
- 日志同步记录MySQL操作和MinIO对象操作(如"上传文件至MinIO: {path},关联MySQL记录ID: {id}")
|
|
||||||
|
|
||||||
### 数据采集设计
|
### 数据采集设计
|
||||||
1. 结构化数据(如新闻文本、投诉内容):直接写入对应`collector_`前缀表
|
1. 结构化数据(RSS、投诉文本):写入`collector_`前缀表
|
||||||
2. 非结构化数据(如爬取的图片、附件):
|
2. 非结构化数据(附件/图片等):
|
||||||
- 调用`minio_agent.py`上传至对应存储桶
|
- 使用`utils/minio_agent.py`上传至对应存储桶
|
||||||
- 将MinIO路径、文件大小、格式等元数据写入`collector_`前缀表或`storage_object_index`表
|
- 将对象路径与元数据写入业务表或`storage_object_index`
|
||||||
3. 每个采集模块(独立py文件,`main`方法入口)需同时处理MySQL和MinIO交互,确保数据关联完整
|
3. 采集模块需同时处理MySQL与MinIO交互,确保关联完整
|
||||||
|
|
||||||
|
### 数据处理设计(RSS流程已实现)
|
||||||
|
`processors/processor_rss_data.py`流程:
|
||||||
|
- 从`collector_rss_subscriptions`加载未处理数据(可配置`limit`)
|
||||||
|
- 加载停用词与行业关键词(`stopwords.txt` / `keywords.txt`),并动态注入`jieba`词典
|
||||||
|
- 标注词性并过滤停用词,仅保留与汽车后市场相关的词汇
|
||||||
|
- 标记与过滤:出现任一行业关键词即视为相关,进入保存
|
||||||
|
- 将结果写入`processed_rss_data`,并回写源表`是否已处理 = 1`
|
||||||
|
- 输出处理统计(总量、命中量、命中率、时间)
|
||||||
|
|
||||||
### 数据处理设计
|
### 依赖与运行
|
||||||
1. 结构化数据处理:从MySQL读取原始数据,处理后写入`processor_`前缀表
|
- 依赖:见`requirements.txt`(pandas、SQLAlchemy、PyMySQL、croniter、pytz、loguru、jieba、feedparser、beautifulsoup4、minio 等)
|
||||||
2. 非结构化数据处理:
|
- 配置:在`config.py`中设置`MYSQL_CONFIG`与MinIO参数
|
||||||
- 从MySQL获取MinIO路径,通过`minio_agent.py`下载原始文件
|
- 运行:
|
||||||
- 处理后(如图片识别、视频帧提取)将结果文件上传至MinIO(处理层存储桶)
|
- 启动主程序:`python main.py`
|
||||||
- 将处理结果的结构化信息(如识别标签)和处理后文件的MinIO路径写入`processor_`前缀表
|
- 添加任务:向`main_task`插入记录,`module_path`可指向如`processors.processor_rss_data.main`
|
||||||
3. 支持多表关联存储,通过`source_id`关联原始数据与处理结果
|
|
||||||
|
|||||||
@@ -15,3 +15,4 @@ feedparser==6.0.11
|
|||||||
Markdown==3.9
|
Markdown==3.9
|
||||||
openai==1.107.3
|
openai==1.107.3
|
||||||
tqdm==4.67.1
|
tqdm==4.67.1
|
||||||
|
jieba==0.42.1
|
||||||
|
|||||||
@@ -22,6 +22,9 @@ class TaskScheduler:
|
|||||||
self.executor = ThreadPoolExecutor(max_workers=max_workers)
|
self.executor = ThreadPoolExecutor(max_workers=max_workers)
|
||||||
# 并发容量控制:限制同时运行的后台任务不超过 max_workers
|
# 并发容量控制:限制同时运行的后台任务不超过 max_workers
|
||||||
self._running_semaphore = threading.Semaphore(max_workers)
|
self._running_semaphore = threading.Semaphore(max_workers)
|
||||||
|
# 任务统计
|
||||||
|
self.hourly_stats = {'成功': 0, '失败': 0, '总数': 0}
|
||||||
|
self.hourly_stats_lock = threading.Lock()
|
||||||
log.info(f"任务调度器已初始化,最大工作线程数: {max_workers}")
|
log.info(f"任务调度器已初始化,最大工作线程数: {max_workers}")
|
||||||
|
|
||||||
def _resolve_callable(self, module_path: str):
|
def _resolve_callable(self, module_path: str):
|
||||||
@@ -84,8 +87,12 @@ class TaskScheduler:
|
|||||||
# 如果所有尝试均失败,则抛出最后的错误
|
# 如果所有尝试均失败,则抛出最后的错误
|
||||||
raise ImportError(f"模块 {module_path} 导入/解析失败: {str(last_import_error)}")
|
raise ImportError(f"模块 {module_path} 导入/解析失败: {str(last_import_error)}")
|
||||||
|
|
||||||
def check_and_run_tasks(self) -> Dict[str, int]:
|
def check_and_run_tasks(self, print_empty_status: bool = False) -> Dict[str, int]:
|
||||||
"""检查并执行所有到期的任务,优化空任务处理和异常容错"""
|
"""检查并执行所有到期的任务,优化空任务处理和异常容错
|
||||||
|
|
||||||
|
Args:
|
||||||
|
print_empty_status: 是否打印空任务状态(默认False,避免频繁输出)
|
||||||
|
"""
|
||||||
result = {'总任务数': 0, '成功': 0, '失败': 0}
|
result = {'总任务数': 0, '成功': 0, '失败': 0}
|
||||||
|
|
||||||
try:
|
try:
|
||||||
@@ -106,7 +113,8 @@ class TaskScheduler:
|
|||||||
|
|
||||||
result['总任务数'] = len(tasks_df)
|
result['总任务数'] = len(tasks_df)
|
||||||
if tasks_df.empty:
|
if tasks_df.empty:
|
||||||
# 空任务时输出INFO级日志,明确提示状态
|
# 空任务时根据参数决定是否输出
|
||||||
|
if print_empty_status:
|
||||||
print(f"当前没有到期的任务,等待新任务加入...{now.strftime('%Y-%m-%d %H:%M:%S')}")
|
print(f"当前没有到期的任务,等待新任务加入...{now.strftime('%Y-%m-%d %H:%M:%S')}")
|
||||||
return result
|
return result
|
||||||
|
|
||||||
@@ -128,6 +136,12 @@ class TaskScheduler:
|
|||||||
log.error(f"任务线程执行失败: {str(e)}", exc_info=True)
|
log.error(f"任务线程执行失败: {str(e)}", exc_info=True)
|
||||||
result['失败'] += 1
|
result['失败'] += 1
|
||||||
|
|
||||||
|
# 更新小时统计
|
||||||
|
with self.hourly_stats_lock:
|
||||||
|
self.hourly_stats['成功'] += result['成功']
|
||||||
|
self.hourly_stats['失败'] += result['失败']
|
||||||
|
self.hourly_stats['总数'] += result['总任务数']
|
||||||
|
|
||||||
log.info(
|
log.info(
|
||||||
"任务调度周期完成",
|
"任务调度周期完成",
|
||||||
总任务数=result['总任务数'],
|
总任务数=result['总任务数'],
|
||||||
@@ -420,3 +434,11 @@ class TaskScheduler:
|
|||||||
except Exception as e:
|
except Exception as e:
|
||||||
log.error(f"查询待执行任务失败,将重试: {str(e)}", exc_info=True)
|
log.error(f"查询待执行任务失败,将重试: {str(e)}", exc_info=True)
|
||||||
return []
|
return []
|
||||||
|
|
||||||
|
def get_and_reset_hourly_stats(self) -> Dict[str, int]:
    """Return the counters accumulated so far and reset them to zero.

    The read and the reset both happen while holding
    ``self.hourly_stats_lock``, the same lock used when the counters are
    incremented, so no increment can land between the two steps.

    Returns:
        The counter dict as it was before the reset.
    """
    with self.hourly_stats_lock:
        snapshot = self.hourly_stats
        # Build the zeroed dict from the live keys instead of repeating the
        # key literal, so the reset cannot drift from the set of counters
        # that is actually being tracked.
        self.hourly_stats = dict.fromkeys(snapshot, 0)
    return snapshot
|
||||||
+468
-285
@@ -10,14 +10,25 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
|
"execution_count": 1,
|
||||||
"id": "initial_id",
|
"id": "initial_id",
|
||||||
"metadata": {
|
"metadata": {
|
||||||
"collapsed": true,
|
|
||||||
"ExecuteTime": {
|
"ExecuteTime": {
|
||||||
"end_time": "2025-10-17T05:43:18.381936Z",
|
"end_time": "2025-10-17T05:43:18.381936Z",
|
||||||
"start_time": "2025-10-17T05:43:15.265036Z"
|
"start_time": "2025-10-17T05:43:15.265036Z"
|
||||||
}
|
|
||||||
},
|
},
|
||||||
|
"collapsed": true
|
||||||
|
},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"PROJECT_ROOT = d:\\Idea Project\\intelligence_system\n",
|
||||||
|
"\u001b[32m2025-10-23 16:56:55\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mtask_scheduler\u001b[0m - \u001b[1m任务调度器已初始化,最大工作线程数: 5\u001b[0m\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
"source": [
|
"source": [
|
||||||
"# 使 Notebook 可从项目根导入\n",
|
"# 使 Notebook 可从项目根导入\n",
|
||||||
"import sys\n",
|
"import sys\n",
|
||||||
@@ -116,6 +127,78 @@
|
|||||||
" log.exception(\"手动执行任务失败\")\n",
|
" log.exception(\"手动执行任务失败\")\n",
|
||||||
" return False\n",
|
" return False\n",
|
||||||
"\n",
|
"\n",
|
||||||
|
" def run_task_synchronously(self, task_id: int) -> dict:\n",
|
||||||
|
" \"\"\"同步执行任务并返回详细结果(用于Notebook中查看执行过程)\"\"\"\n",
|
||||||
|
" import time\n",
|
||||||
|
" import sys\n",
|
||||||
|
" from io import StringIO\n",
|
||||||
|
" \n",
|
||||||
|
" task = self.get_task_by_id(task_id)\n",
|
||||||
|
" if not task:\n",
|
||||||
|
" return {\n",
|
||||||
|
" 'success': False,\n",
|
||||||
|
" 'error': f'未找到任务ID: {task_id}',\n",
|
||||||
|
" 'output': ''\n",
|
||||||
|
" }\n",
|
||||||
|
" \n",
|
||||||
|
" # 捕获标准输出\n",
|
||||||
|
" old_stdout = sys.stdout\n",
|
||||||
|
" sys.stdout = output_buffer = StringIO()\n",
|
||||||
|
" \n",
|
||||||
|
" start_time = time.time()\n",
|
||||||
|
" success = False\n",
|
||||||
|
" error_msg = None\n",
|
||||||
|
" \n",
|
||||||
|
" try:\n",
|
||||||
|
" # 直接同步执行任务逻辑\n",
|
||||||
|
" self.scheduler._execute_task_logic(task)\n",
|
||||||
|
" success = True\n",
|
||||||
|
" \n",
|
||||||
|
" # 更新任务状态\n",
|
||||||
|
" next_run_time = self.scheduler._calculate_next_run_time(\n",
|
||||||
|
" cron_expr=task['cron_expression'],\n",
|
||||||
|
" time_zone=task.get('time_zone', 'Asia/Shanghai')\n",
|
||||||
|
" )\n",
|
||||||
|
" \n",
|
||||||
|
" self.scheduler._update_task_status(task['task_id'], {\n",
|
||||||
|
" 'last_run_status': 'success',\n",
|
||||||
|
" 'is_running': 0,\n",
|
||||||
|
" 'run_count': task['run_count'] + 1,\n",
|
||||||
|
" 'next_run_time': next_run_time\n",
|
||||||
|
" })\n",
|
||||||
|
" \n",
|
||||||
|
" except Exception as e:\n",
|
||||||
|
" success = False\n",
|
||||||
|
" error_msg = str(e)\n",
|
||||||
|
" log.exception(f\"任务执行失败: {task['task_name']}\")\n",
|
||||||
|
" \n",
|
||||||
|
" # 更新失败状态\n",
|
||||||
|
" try:\n",
|
||||||
|
" next_retry_time = datetime.now() + pd.Timedelta(minutes=15)\n",
|
||||||
|
" self.scheduler._update_task_status(task['task_id'], {\n",
|
||||||
|
" 'last_run_status': 'failed',\n",
|
||||||
|
" 'is_running': 0,\n",
|
||||||
|
" 'next_run_time': next_retry_time\n",
|
||||||
|
" })\n",
|
||||||
|
" except Exception:\n",
|
||||||
|
" pass\n",
|
||||||
|
" \n",
|
||||||
|
" finally:\n",
|
||||||
|
" # 恢复标准输出\n",
|
||||||
|
" sys.stdout = old_stdout\n",
|
||||||
|
" output_text = output_buffer.getvalue()\n",
|
||||||
|
" \n",
|
||||||
|
" execution_time = time.time() - start_time\n",
|
||||||
|
" \n",
|
||||||
|
" return {\n",
|
||||||
|
" 'success': success,\n",
|
||||||
|
" 'task_name': task['task_name'],\n",
|
||||||
|
" 'task_id': task['task_id'],\n",
|
||||||
|
" 'execution_time': execution_time,\n",
|
||||||
|
" 'output': output_text,\n",
|
||||||
|
" 'error': error_msg\n",
|
||||||
|
" }\n",
|
||||||
|
"\n",
|
||||||
"# 在这里创建 manager(供后续单元使用)\n",
|
"# 在这里创建 manager(供后续单元使用)\n",
|
||||||
"manager = TaskManager(scheduler)\n",
|
"manager = TaskManager(scheduler)\n",
|
||||||
"\n",
|
"\n",
|
||||||
@@ -134,18 +217,7 @@
|
|||||||
" except Exception:\n",
|
" except Exception:\n",
|
||||||
" pass\n",
|
" pass\n",
|
||||||
" return str(dt)"
|
" return str(dt)"
|
||||||
],
|
|
||||||
"outputs": [
|
|
||||||
{
|
|
||||||
"name": "stdout",
|
|
||||||
"output_type": "stream",
|
|
||||||
"text": [
|
|
||||||
"PROJECT_ROOT = D:\\Idea Project\\intelligence_system\n",
|
|
||||||
"\u001B[32m2025-10-17 13:43:18\u001B[0m | \u001B[1mINFO \u001B[0m | \u001B[36mtask_scheduler\u001B[0m - \u001B[1m任务调度器已初始化,最大工作线程数: 5\u001B[0m\n"
|
|
||||||
]
|
]
|
||||||
}
|
|
||||||
],
|
|
||||||
"execution_count": 1
|
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "markdown",
|
"cell_type": "markdown",
|
||||||
@@ -157,6 +229,7 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
|
"execution_count": 2,
|
||||||
"id": "7b020af55972643",
|
"id": "7b020af55972643",
|
||||||
"metadata": {
|
"metadata": {
|
||||||
"ExecuteTime": {
|
"ExecuteTime": {
|
||||||
@@ -164,73 +237,28 @@
|
|||||||
"start_time": "2025-10-17T05:43:18.394863Z"
|
"start_time": "2025-10-17T05:43:18.394863Z"
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
"source": [
|
|
||||||
"# 列出所有任务(包括已禁用的)\n",
|
|
||||||
"def list_tasks(active_only=True):\n",
|
|
||||||
" tasks = manager.get_all_tasks(active_only)\n",
|
|
||||||
" if not tasks:\n",
|
|
||||||
" display(Markdown(\"### 没有找到任务\"))\n",
|
|
||||||
" return None\n",
|
|
||||||
"\n",
|
|
||||||
" df = pd.DataFrame(tasks)\n",
|
|
||||||
"\n",
|
|
||||||
" # 格式化日期列\n",
|
|
||||||
" if 'last_run_time' in df.columns:\n",
|
|
||||||
" df['last_run_time'] = df['last_run_time'].apply(format_datetime)\n",
|
|
||||||
" if 'next_run_time' in df.columns:\n",
|
|
||||||
" df['next_run_time'] = df['next_run_time'].apply(format_datetime)\n",
|
|
||||||
"\n",
|
|
||||||
" # 重命名列名\n",
|
|
||||||
" df = df.rename(columns={\n",
|
|
||||||
" 'task_id': '任务ID',\n",
|
|
||||||
" 'task_name': '任务名称',\n",
|
|
||||||
" 'task_type': '任务类型',\n",
|
|
||||||
" 'module_path': '模块路径',\n",
|
|
||||||
" 'cron_expression': 'Cron表达式',\n",
|
|
||||||
" 'time_zone': '时区',\n",
|
|
||||||
" 'last_run_time': '最后运行时间',\n",
|
|
||||||
" 'next_run_time': '下次运行时间',\n",
|
|
||||||
" 'last_run_status': '运行状态',\n",
|
|
||||||
" 'is_active': '是否活跃',\n",
|
|
||||||
" 'run_count': '运行次数'\n",
|
|
||||||
" })\n",
|
|
||||||
"\n",
|
|
||||||
" display(Markdown(\"### 任务列表\"))\n",
|
|
||||||
" display(HTML(df.to_html(index=False)))\n",
|
|
||||||
" return df\n",
|
|
||||||
"\n",
|
|
||||||
"# 执行:列出所有任务(包括已禁用)\n",
|
|
||||||
"list_tasks(active_only=False)\n",
|
|
||||||
"\n",
|
|
||||||
"# 或者:只列出活跃任务\n",
|
|
||||||
"# list_tasks(active_only=True)"
|
|
||||||
],
|
|
||||||
"outputs": [
|
"outputs": [
|
||||||
{
|
{
|
||||||
"name": "stdout",
|
"name": "stdout",
|
||||||
"output_type": "stream",
|
"output_type": "stream",
|
||||||
"text": [
|
"text": [
|
||||||
"\u001B[32m2025-10-17 13:43:18\u001B[0m | \u001B[1mINFO \u001B[0m | \u001B[36mmysql_agent\u001B[0m - \u001B[1m查询执行成功\u001B[0m\n"
|
"\u001b[32m2025-10-17 13:43:18\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mmysql_agent\u001b[0m - \u001b[1m查询执行成功\u001b[0m\n"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"data": {
|
"data": {
|
||||||
|
"text/markdown": [
|
||||||
|
"### 任务列表"
|
||||||
|
],
|
||||||
"text/plain": [
|
"text/plain": [
|
||||||
"<IPython.core.display.Markdown object>"
|
"<IPython.core.display.Markdown object>"
|
||||||
],
|
]
|
||||||
"text/markdown": "### 任务列表"
|
|
||||||
},
|
},
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"output_type": "display_data",
|
"output_type": "display_data"
|
||||||
"jetTransient": {
|
|
||||||
"display_id": null
|
|
||||||
}
|
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"data": {
|
"data": {
|
||||||
"text/plain": [
|
|
||||||
"<IPython.core.display.HTML object>"
|
|
||||||
],
|
|
||||||
"text/html": [
|
"text/html": [
|
||||||
"<table border=\"1\" class=\"dataframe\">\n",
|
"<table border=\"1\" class=\"dataframe\">\n",
|
||||||
" <thead>\n",
|
" <thead>\n",
|
||||||
@@ -270,26 +298,16 @@
|
|||||||
" </tr>\n",
|
" </tr>\n",
|
||||||
" </tbody>\n",
|
" </tbody>\n",
|
||||||
"</table>"
|
"</table>"
|
||||||
|
],
|
||||||
|
"text/plain": [
|
||||||
|
"<IPython.core.display.HTML object>"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"output_type": "display_data",
|
"output_type": "display_data"
|
||||||
"jetTransient": {
|
|
||||||
"display_id": null
|
|
||||||
}
|
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"data": {
|
"data": {
|
||||||
"text/plain": [
|
|
||||||
" 任务ID 任务名称 任务类型 模块路径 \\\n",
|
|
||||||
"0 1 RSS新闻订阅 collector collectors.rss_subscriptions.NewsAPIClient \n",
|
|
||||||
"\n",
|
|
||||||
" Cron表达式 时区 下次运行时间 最后运行时间 \\\n",
|
|
||||||
"0 5 0 * * * Asia/Shanghai 2025-10-18 00:05:00 2025-10-17 00:05:07 \n",
|
|
||||||
"\n",
|
|
||||||
" 运行状态 运行次数 是否活跃 is_running created_at updated_at \n",
|
|
||||||
"0 success 4 1 0 2025-10-16 15:47:34 2025-10-17 00:05:08 "
|
|
||||||
],
|
|
||||||
"text/html": [
|
"text/html": [
|
||||||
"<div>\n",
|
"<div>\n",
|
||||||
"<style scoped>\n",
|
"<style scoped>\n",
|
||||||
@@ -346,6 +364,16 @@
|
|||||||
" </tbody>\n",
|
" </tbody>\n",
|
||||||
"</table>\n",
|
"</table>\n",
|
||||||
"</div>"
|
"</div>"
|
||||||
|
],
|
||||||
|
"text/plain": [
|
||||||
|
" 任务ID 任务名称 任务类型 模块路径 \\\n",
|
||||||
|
"0 1 RSS新闻订阅 collector collectors.rss_subscriptions.NewsAPIClient \n",
|
||||||
|
"\n",
|
||||||
|
" Cron表达式 时区 下次运行时间 最后运行时间 \\\n",
|
||||||
|
"0 5 0 * * * Asia/Shanghai 2025-10-18 00:05:00 2025-10-17 00:05:07 \n",
|
||||||
|
"\n",
|
||||||
|
" 运行状态 运行次数 是否活跃 is_running created_at updated_at \n",
|
||||||
|
"0 success 4 1 0 2025-10-16 15:47:34 2025-10-17 00:05:08 "
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
"execution_count": 2,
|
"execution_count": 2,
|
||||||
@@ -353,7 +381,47 @@
|
|||||||
"output_type": "execute_result"
|
"output_type": "execute_result"
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
"execution_count": 2
|
"source": [
|
||||||
|
"# 列出所有任务(包括已禁用的)\n",
|
||||||
|
"def list_tasks(active_only=True):\n",
|
||||||
|
" tasks = manager.get_all_tasks(active_only)\n",
|
||||||
|
" if not tasks:\n",
|
||||||
|
" display(Markdown(\"### 没有找到任务\"))\n",
|
||||||
|
" return None\n",
|
||||||
|
"\n",
|
||||||
|
" df = pd.DataFrame(tasks)\n",
|
||||||
|
"\n",
|
||||||
|
" # 格式化日期列\n",
|
||||||
|
" if 'last_run_time' in df.columns:\n",
|
||||||
|
" df['last_run_time'] = df['last_run_time'].apply(format_datetime)\n",
|
||||||
|
" if 'next_run_time' in df.columns:\n",
|
||||||
|
" df['next_run_time'] = df['next_run_time'].apply(format_datetime)\n",
|
||||||
|
"\n",
|
||||||
|
" # 重命名列名\n",
|
||||||
|
" df = df.rename(columns={\n",
|
||||||
|
" 'task_id': '任务ID',\n",
|
||||||
|
" 'task_name': '任务名称',\n",
|
||||||
|
" 'task_type': '任务类型',\n",
|
||||||
|
" 'module_path': '模块路径',\n",
|
||||||
|
" 'cron_expression': 'Cron表达式',\n",
|
||||||
|
" 'time_zone': '时区',\n",
|
||||||
|
" 'last_run_time': '最后运行时间',\n",
|
||||||
|
" 'next_run_time': '下次运行时间',\n",
|
||||||
|
" 'last_run_status': '运行状态',\n",
|
||||||
|
" 'is_active': '是否活跃',\n",
|
||||||
|
" 'run_count': '运行次数'\n",
|
||||||
|
" })\n",
|
||||||
|
"\n",
|
||||||
|
" display(Markdown(\"### 任务列表\"))\n",
|
||||||
|
" display(HTML(df.to_html(index=False)))\n",
|
||||||
|
" return df\n",
|
||||||
|
"\n",
|
||||||
|
"# 执行:列出所有任务(包括已禁用)\n",
|
||||||
|
"list_tasks(active_only=False)\n",
|
||||||
|
"\n",
|
||||||
|
"# 或者:只列出活跃任务\n",
|
||||||
|
"# list_tasks(active_only=True)"
|
||||||
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "markdown",
|
"cell_type": "markdown",
|
||||||
@@ -365,6 +433,7 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
|
"execution_count": 3,
|
||||||
"id": "eab90de72c35429e",
|
"id": "eab90de72c35429e",
|
||||||
"metadata": {
|
"metadata": {
|
||||||
"ExecuteTime": {
|
"ExecuteTime": {
|
||||||
@@ -372,6 +441,62 @@
|
|||||||
"start_time": "2025-10-17T05:43:26.071398Z"
|
"start_time": "2025-10-17T05:43:26.071398Z"
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"\u001b[32m2025-10-17 13:43:26\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mmysql_agent\u001b[0m - \u001b[1m查询执行成功\u001b[0m\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/markdown": [
|
||||||
|
"### 任务详情\n",
|
||||||
|
"**任务ID**: 1\n",
|
||||||
|
"**任务名称**: RSS新闻订阅\n",
|
||||||
|
"**任务类型**: collector\n",
|
||||||
|
"**模块路径**: collectors.rss_subscriptions.NewsAPIClient\n",
|
||||||
|
"**Cron表达式**: 5 0 * * *\n",
|
||||||
|
"**时区**: Asia/Shanghai\n",
|
||||||
|
"**最后运行时间**: 2025-10-17 00:05:07\n",
|
||||||
|
"**下次运行时间**: 2025-10-18 00:05:00\n",
|
||||||
|
"**运行状态**: success\n",
|
||||||
|
"**是否活跃**: 是\n",
|
||||||
|
"**运行次数**: 4\n",
|
||||||
|
"**创建时间**: 2025-10-16 15:47:34"
|
||||||
|
],
|
||||||
|
"text/plain": [
|
||||||
|
"<IPython.core.display.Markdown object>"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "display_data"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"{'task_id': 1,\n",
|
||||||
|
" 'task_name': 'RSS新闻订阅',\n",
|
||||||
|
" 'task_type': 'collector',\n",
|
||||||
|
" 'module_path': 'collectors.rss_subscriptions.NewsAPIClient',\n",
|
||||||
|
" 'cron_expression': '5 0 * * *',\n",
|
||||||
|
" 'time_zone': 'Asia/Shanghai',\n",
|
||||||
|
" 'next_run_time': Timestamp('2025-10-18 00:05:00'),\n",
|
||||||
|
" 'last_run_time': Timestamp('2025-10-17 00:05:07'),\n",
|
||||||
|
" 'last_run_status': 'success',\n",
|
||||||
|
" 'run_count': 4,\n",
|
||||||
|
" 'is_active': 1,\n",
|
||||||
|
" 'is_running': 0,\n",
|
||||||
|
" 'created_at': Timestamp('2025-10-16 15:47:34'),\n",
|
||||||
|
" 'updated_at': Timestamp('2025-10-17 00:05:08')}"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 3,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
"source": [
|
"source": [
|
||||||
"# 查看指定任务的详情\n",
|
"# 查看指定任务的详情\n",
|
||||||
"def show_task_details(task_id):\n",
|
"def show_task_details(task_id):\n",
|
||||||
@@ -399,54 +524,8 @@
|
|||||||
"\n",
|
"\n",
|
||||||
"# 执行:查看任务ID为1的详情(替换为实际ID)\n",
|
"# 执行:查看任务ID为1的详情(替换为实际ID)\n",
|
||||||
"show_task_details(1)"
|
"show_task_details(1)"
|
||||||
],
|
|
||||||
"outputs": [
|
|
||||||
{
|
|
||||||
"name": "stdout",
|
|
||||||
"output_type": "stream",
|
|
||||||
"text": [
|
|
||||||
"\u001B[32m2025-10-17 13:43:26\u001B[0m | \u001B[1mINFO \u001B[0m | \u001B[36mmysql_agent\u001B[0m - \u001B[1m查询执行成功\u001B[0m\n"
|
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
|
||||||
"data": {
|
|
||||||
"text/plain": [
|
|
||||||
"<IPython.core.display.Markdown object>"
|
|
||||||
],
|
|
||||||
"text/markdown": "### 任务详情\n**任务ID**: 1\n**任务名称**: RSS新闻订阅\n**任务类型**: collector\n**模块路径**: collectors.rss_subscriptions.NewsAPIClient\n**Cron表达式**: 5 0 * * *\n**时区**: Asia/Shanghai\n**最后运行时间**: 2025-10-17 00:05:07\n**下次运行时间**: 2025-10-18 00:05:00\n**运行状态**: success\n**是否活跃**: 是\n**运行次数**: 4\n**创建时间**: 2025-10-16 15:47:34"
|
|
||||||
},
|
|
||||||
"metadata": {},
|
|
||||||
"output_type": "display_data",
|
|
||||||
"jetTransient": {
|
|
||||||
"display_id": null
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"data": {
|
|
||||||
"text/plain": [
|
|
||||||
"{'task_id': 1,\n",
|
|
||||||
" 'task_name': 'RSS新闻订阅',\n",
|
|
||||||
" 'task_type': 'collector',\n",
|
|
||||||
" 'module_path': 'collectors.rss_subscriptions.NewsAPIClient',\n",
|
|
||||||
" 'cron_expression': '5 0 * * *',\n",
|
|
||||||
" 'time_zone': 'Asia/Shanghai',\n",
|
|
||||||
" 'next_run_time': Timestamp('2025-10-18 00:05:00'),\n",
|
|
||||||
" 'last_run_time': Timestamp('2025-10-17 00:05:07'),\n",
|
|
||||||
" 'last_run_status': 'success',\n",
|
|
||||||
" 'run_count': 4,\n",
|
|
||||||
" 'is_active': 1,\n",
|
|
||||||
" 'is_running': 0,\n",
|
|
||||||
" 'created_at': Timestamp('2025-10-16 15:47:34'),\n",
|
|
||||||
" 'updated_at': Timestamp('2025-10-17 00:05:08')}"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
"execution_count": 3,
|
|
||||||
"metadata": {},
|
|
||||||
"output_type": "execute_result"
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"execution_count": 3
|
|
||||||
},
|
|
||||||
{
|
{
|
||||||
"cell_type": "markdown",
|
"cell_type": "markdown",
|
||||||
"id": "a313f1524f5a54bc",
|
"id": "a313f1524f5a54bc",
|
||||||
@@ -473,8 +552,8 @@
|
|||||||
"name": "stdout",
|
"name": "stdout",
|
||||||
"output_type": "stream",
|
"output_type": "stream",
|
||||||
"text": [
|
"text": [
|
||||||
"\u001B[32m2025-10-16 15:47:34\u001B[0m | \u001B[1mINFO \u001B[0m | \u001B[36mmysql_agent\u001B[0m - \u001B[1m查询执行成功\u001B[0m\n",
|
"\u001b[32m2025-10-16 15:47:34\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mmysql_agent\u001b[0m - \u001b[1m查询执行成功\u001b[0m\n",
|
||||||
"\u001B[32m2025-10-16 15:47:34\u001B[0m | \u001B[1mINFO \u001B[0m | \u001B[36mtask_scheduler\u001B[0m - \u001B[1m新任务添加成功\u001B[0m\n"
|
"\u001b[32m2025-10-16 15:47:34\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mtask_scheduler\u001b[0m - \u001b[1m新任务添加成功\u001b[0m\n"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
@@ -550,6 +629,7 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
|
"execution_count": 4,
|
||||||
"id": "c892fd8ad2f0dd9d",
|
"id": "c892fd8ad2f0dd9d",
|
||||||
"metadata": {
|
"metadata": {
|
||||||
"ExecuteTime": {
|
"ExecuteTime": {
|
||||||
@@ -557,6 +637,61 @@
|
|||||||
"start_time": "2025-10-17T05:44:18.980345Z"
|
"start_time": "2025-10-17T05:44:18.980345Z"
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/markdown": [
|
||||||
|
"### 任务ID 1 更新成功"
|
||||||
|
],
|
||||||
|
"text/plain": [
|
||||||
|
"<IPython.core.display.Markdown object>"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "display_data"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"\u001b[32m2025-10-17 13:44:19\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mmysql_agent\u001b[0m - \u001b[1m查询执行成功\u001b[0m\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/markdown": [
|
||||||
|
"### 任务详情\n",
|
||||||
|
"**任务ID**: 1\n",
|
||||||
|
"**任务名称**: RSS新闻订阅\n",
|
||||||
|
"**任务类型**: collector\n",
|
||||||
|
"**模块路径**: collectors.rss_subscriptions.NewsAPIClient\n",
|
||||||
|
"**Cron表达式**: 5 * * * *\n",
|
||||||
|
"**时区**: Asia/Shanghai\n",
|
||||||
|
"**最后运行时间**: 2025-10-17 00:05:07\n",
|
||||||
|
"**下次运行时间**: 2025-10-18 00:05:00\n",
|
||||||
|
"**运行状态**: success\n",
|
||||||
|
"**是否活跃**: 是\n",
|
||||||
|
"**运行次数**: 4\n",
|
||||||
|
"**创建时间**: 2025-10-16 15:47:34"
|
||||||
|
],
|
||||||
|
"text/plain": [
|
||||||
|
"<IPython.core.display.Markdown object>"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "display_data"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"True"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 4,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
"source": [
|
"source": [
|
||||||
"# 更新任务属性\n",
|
"# 更新任务属性\n",
|
||||||
"def update_task(task_id, **kwargs):\n",
|
"def update_task(task_id, **kwargs):\n",
|
||||||
@@ -589,54 +724,8 @@
|
|||||||
"\n",
|
"\n",
|
||||||
"# 执行:同时更新多个属性(名称和Cron表达式)\n",
|
"# 执行:同时更新多个属性(名称和Cron表达式)\n",
|
||||||
"# update_task(1, name=\"每日早间新闻采集\", cron=\"0 8 * * *\")"
|
"# update_task(1, name=\"每日早间新闻采集\", cron=\"0 8 * * *\")"
|
||||||
],
|
|
||||||
"outputs": [
|
|
||||||
{
|
|
||||||
"data": {
|
|
||||||
"text/plain": [
|
|
||||||
"<IPython.core.display.Markdown object>"
|
|
||||||
],
|
|
||||||
"text/markdown": "### 任务ID 1 更新成功"
|
|
||||||
},
|
|
||||||
"metadata": {},
|
|
||||||
"output_type": "display_data",
|
|
||||||
"jetTransient": {
|
|
||||||
"display_id": null
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"name": "stdout",
|
|
||||||
"output_type": "stream",
|
|
||||||
"text": [
|
|
||||||
"\u001B[32m2025-10-17 13:44:19\u001B[0m | \u001B[1mINFO \u001B[0m | \u001B[36mmysql_agent\u001B[0m - \u001B[1m查询执行成功\u001B[0m\n"
|
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
|
||||||
"data": {
|
|
||||||
"text/plain": [
|
|
||||||
"<IPython.core.display.Markdown object>"
|
|
||||||
],
|
|
||||||
"text/markdown": "### 任务详情\n**任务ID**: 1\n**任务名称**: RSS新闻订阅\n**任务类型**: collector\n**模块路径**: collectors.rss_subscriptions.NewsAPIClient\n**Cron表达式**: 5 * * * *\n**时区**: Asia/Shanghai\n**最后运行时间**: 2025-10-17 00:05:07\n**下次运行时间**: 2025-10-18 00:05:00\n**运行状态**: success\n**是否活跃**: 是\n**运行次数**: 4\n**创建时间**: 2025-10-16 15:47:34"
|
|
||||||
},
|
|
||||||
"metadata": {},
|
|
||||||
"output_type": "display_data",
|
|
||||||
"jetTransient": {
|
|
||||||
"display_id": null
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"data": {
|
|
||||||
"text/plain": [
|
|
||||||
"True"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
"execution_count": 4,
|
|
||||||
"metadata": {},
|
|
||||||
"output_type": "execute_result"
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"execution_count": 4
|
|
||||||
},
|
|
||||||
{
|
{
|
||||||
"cell_type": "markdown",
|
"cell_type": "markdown",
|
||||||
"id": "37564011cf5aa501",
|
"id": "37564011cf5aa501",
|
||||||
@@ -702,6 +791,7 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
|
"execution_count": 2,
|
||||||
"id": "94892f4134316f8e",
|
"id": "94892f4134316f8e",
|
||||||
"metadata": {
|
"metadata": {
|
||||||
"ExecuteTime": {
|
"ExecuteTime": {
|
||||||
@@ -709,8 +799,177 @@
|
|||||||
"start_time": "2025-10-17T05:44:35.084369Z"
|
"start_time": "2025-10-17T05:44:35.084369Z"
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/markdown": [
|
||||||
|
"### 开始执行任务ID 2"
|
||||||
|
],
|
||||||
|
"text/plain": [
|
||||||
|
"<IPython.core.display.Markdown object>"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "display_data"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/markdown": [
|
||||||
|
"---"
|
||||||
|
],
|
||||||
|
"text/plain": [
|
||||||
|
"<IPython.core.display.Markdown object>"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "display_data"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"\u001b[32m2025-10-23 16:57:20\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mmysql_agent\u001b[0m - \u001b[1m查询执行成功\u001b[0m\n",
|
||||||
|
"\u001b[32m2025-10-23 16:57:20\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mprocessor_rss_data\u001b[0m - \u001b[1mRSS数据处理器初始化完成\u001b[0m\n",
|
||||||
|
"\u001b[32m2025-10-23 16:57:20\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mprocessor_rss_data\u001b[0m - \u001b[1m开始处理RSS数据...\u001b[0m\n",
|
||||||
|
"\u001b[32m2025-10-23 16:57:21\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mprocessor_rss_data\u001b[0m - \u001b[1m成功加载 8 条未处理的RSS数据\u001b[0m\n",
|
||||||
|
"\u001b[32m2025-10-23 16:57:21\u001b[0m | \u001b[33m\u001b[1mWARNING \u001b[0m | \u001b[36mprocessor_rss_data\u001b[0m - \u001b[33m\u001b[1m停用词文件不存在: processors/stopwords.txt,使用默认停用词\u001b[0m\n",
|
||||||
|
"\u001b[32m2025-10-23 16:57:21\u001b[0m | \u001b[33m\u001b[1mWARNING \u001b[0m | \u001b[36mprocessor_rss_data\u001b[0m - \u001b[33m\u001b[1m关键词文件不存在: processors/keywords.txt\u001b[0m\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "stderr",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"Building prefix dict from the default dictionary ...\n",
|
||||||
|
"Loading model from cache C:\\Users\\zy187\\AppData\\Local\\Temp\\jieba.cache\n",
|
||||||
|
"Loading model cost 0.609 seconds.\n",
|
||||||
|
"Prefix dict has been built successfully.\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"\u001b[32m2025-10-23 16:57:21\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mprocessor_rss_data\u001b[0m - \u001b[1m数据处理完成,共处理 8 条记录\u001b[0m\n",
|
||||||
|
"\u001b[32m2025-10-23 16:57:21\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mprocessor_rss_data\u001b[0m - \u001b[1m过滤出 1 条汽车后市场相关新闻\u001b[0m\n",
|
||||||
|
"\u001b[32m2025-10-23 16:57:21\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mmysql_agent\u001b[0m - \u001b[1m表 processed_rss_data 插入结果汇总\u001b[0m\n",
|
||||||
|
"\u001b[32m2025-10-23 16:57:21\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mprocessor_rss_data\u001b[0m - \u001b[1m成功保存 1 条处理结果到数据库\u001b[0m\n",
|
||||||
|
"\u001b[32m2025-10-23 16:57:21\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mprocessor_rss_data\u001b[0m - \u001b[1m成功标记 8 条数据为已处理\u001b[0m\n",
|
||||||
|
"\u001b[32m2025-10-23 16:57:21\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mprocessor_rss_data\u001b[0m - \u001b[1mRSS数据处理完成\u001b[0m\n",
|
||||||
|
"\u001b[32m2025-10-23 16:57:21\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mtask_scheduler\u001b[0m - \u001b[1m任务执行完成,耗时: 1.19秒\u001b[0m\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/markdown": [
|
||||||
|
"**任务名称**: RSS基于规则数据处理"
|
||||||
|
],
|
||||||
|
"text/plain": [
|
||||||
|
"<IPython.core.display.Markdown object>"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "display_data"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/markdown": [
|
||||||
|
"**任务ID**: 2"
|
||||||
|
],
|
||||||
|
"text/plain": [
|
||||||
|
"<IPython.core.display.Markdown object>"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "display_data"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/markdown": [
|
||||||
|
"**执行时长**: 1.26 秒"
|
||||||
|
],
|
||||||
|
"text/plain": [
|
||||||
|
"<IPython.core.display.Markdown object>"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "display_data"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/markdown": [
|
||||||
|
"---"
|
||||||
|
],
|
||||||
|
"text/plain": [
|
||||||
|
"<IPython.core.display.Markdown object>"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "display_data"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/markdown": [
|
||||||
|
"### 📋 执行输出:"
|
||||||
|
],
|
||||||
|
"text/plain": [
|
||||||
|
"<IPython.core.display.Markdown object>"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "display_data"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"RSS数据处理完成!\n",
|
||||||
|
"处理统计: {'total_articles': 8, 'filtered_articles': 1, 'filter_rate': 0.125, 'processing_time': '2025-10-23 16:57:21', 'save_success': True, 'mark_success': True}\n",
|
||||||
|
"\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/markdown": [
|
||||||
|
"---"
|
||||||
|
],
|
||||||
|
"text/plain": [
|
||||||
|
"<IPython.core.display.Markdown object>"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "display_data"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/markdown": [
|
||||||
|
"### ✅ 任务执行成功"
|
||||||
|
],
|
||||||
|
"text/plain": [
|
||||||
|
"<IPython.core.display.Markdown object>"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "display_data"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"{'success': True,\n",
|
||||||
|
" 'task_name': 'RSS基于规则数据处理',\n",
|
||||||
|
" 'task_id': 2,\n",
|
||||||
|
" 'execution_time': 1.2610254287719727,\n",
|
||||||
|
" 'output': \"RSS数据处理完成!\\n处理统计: {'total_articles': 8, 'filtered_articles': 1, 'filter_rate': 0.125, 'processing_time': '2025-10-23 16:57:21', 'save_success': True, 'mark_success': True}\\n\",\n",
|
||||||
|
" 'error': None}"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 2,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
"source": [
|
"source": [
|
||||||
"# 手动执行任务\n",
|
"# 手动执行任务(异步方式,快速返回)\n",
|
||||||
"def run_task_manually(task_id):\n",
|
"def run_task_manually(task_id):\n",
|
||||||
" display(Markdown(f\"### 正在手动执行任务ID {task_id}...\"))\n",
|
" display(Markdown(f\"### 正在手动执行任务ID {task_id}...\"))\n",
|
||||||
" success = manager.run_task_manually(task_id)\n",
|
" success = manager.run_task_manually(task_id)\n",
|
||||||
@@ -720,119 +979,43 @@
|
|||||||
" display(Markdown(f\"### 任务ID {task_id} 执行失败\"))\n",
|
" display(Markdown(f\"### 任务ID {task_id} 执行失败\"))\n",
|
||||||
" return success\n",
|
" return success\n",
|
||||||
"\n",
|
"\n",
|
||||||
"# 执行:手动运行任务ID为1的任务\n",
|
"# 手动执行任务(同步方式,显示详细执行过程)\n",
|
||||||
"run_task_manually(1)"
|
"def run_task_with_details(task_id):\n",
|
||||||
],
|
" display(Markdown(f\"### 开始执行任务ID {task_id}\"))\n",
|
||||||
"outputs": [
|
" display(Markdown(\"---\"))\n",
|
||||||
{
|
" \n",
|
||||||
"data": {
|
" result = manager.run_task_synchronously(task_id)\n",
|
||||||
"text/plain": [
|
" \n",
|
||||||
"<IPython.core.display.Markdown object>"
|
" if not result['success'] and result.get('error') and 'task_id' not in result:\n",
|
||||||
],
|
" display(Markdown(f\"### ❌ 错误: {result['error']}\"))\n",
|
||||||
"text/markdown": "### 正在手动执行任务ID 1..."
|
" return result\n",
|
||||||
},
|
" \n",
|
||||||
"metadata": {},
|
" # 显示任务基本信息\n",
|
||||||
"output_type": "display_data",
|
" display(Markdown(f\"**任务名称**: {result['task_name']}\"))\n",
|
||||||
"jetTransient": {
|
" display(Markdown(f\"**任务ID**: {result['task_id']}\"))\n",
|
||||||
"display_id": null
|
" display(Markdown(f\"**执行时长**: {result['execution_time']:.2f} 秒\"))\n",
|
||||||
}
|
" display(Markdown(\"---\"))\n",
|
||||||
},
|
" \n",
|
||||||
{
|
" # 显示执行输出\n",
|
||||||
"name": "stdout",
|
" if result['output']:\n",
|
||||||
"output_type": "stream",
|
" display(Markdown(\"### 📋 执行输出:\"))\n",
|
||||||
"text": [
|
" print(result['output'])\n",
|
||||||
"\u001B[32m2025-10-17 13:44:35\u001B[0m | \u001B[1mINFO \u001B[0m | \u001B[36mmysql_agent\u001B[0m - \u001B[1m查询执行成功\u001B[0m\n",
|
" display(Markdown(\"---\"))\n",
|
||||||
"\u001B[32m2025-10-17 13:44:35\u001B[0m | \u001B[1mINFO \u001B[0m | \u001B[36mtask_scheduler\u001B[0m - \u001B[1m开始执行任务: RSS新闻订阅\u001B[0m\n"
|
" \n",
|
||||||
|
" # 显示执行结果\n",
|
||||||
|
" if result['success']:\n",
|
||||||
|
" display(Markdown(\"### ✅ 任务执行成功\"))\n",
|
||||||
|
" else:\n",
|
||||||
|
" display(Markdown(f\"### ❌ 任务执行失败\"))\n",
|
||||||
|
" if result['error']:\n",
|
||||||
|
" display(Markdown(f\"**错误信息**: {result['error']}\"))\n",
|
||||||
|
" \n",
|
||||||
|
" return result\n",
|
||||||
|
"\n",
|
||||||
|
"# 执行:手动运行任务ID为2的任务(显示详细执行过程)\n",
|
||||||
|
"run_task_with_details(2)"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
|
||||||
"data": {
|
|
||||||
"text/plain": [
|
|
||||||
"<IPython.core.display.Markdown object>"
|
|
||||||
],
|
|
||||||
"text/markdown": "### 任务ID 1 执行成功"
|
|
||||||
},
|
|
||||||
"metadata": {},
|
|
||||||
"output_type": "display_data",
|
|
||||||
"jetTransient": {
|
|
||||||
"display_id": null
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"data": {
|
|
||||||
"text/plain": [
|
|
||||||
"True"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
"execution_count": 5,
|
|
||||||
"metadata": {},
|
|
||||||
"output_type": "execute_result"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"name": "stdout",
|
|
||||||
"output_type": "stream",
|
|
||||||
"text": [
|
|
||||||
"\u001B[32m2025-10-17 13:44:37\u001B[0m | \u001B[1mINFO \u001B[0m | \u001B[36mrss_subscriptions\u001B[0m - \u001B[1m新闻API客户端初始化完成,已连接到数据库\u001B[0m\n",
|
|
||||||
"\u001B[32m2025-10-17 13:44:37\u001B[0m | \u001B[1mINFO \u001B[0m | \u001B[36mrss_subscriptions\u001B[0m - \u001B[1m数据库表结构验证通过,当前字段:['id', '文章标题', '文章链接', '文章摘要', '发布时间', '来源URL', '创建时间', '更新时间']\u001B[0m\n",
|
|
||||||
"\u001B[32m2025-10-17 13:44:37\u001B[0m | \u001B[1mINFO \u001B[0m | \u001B[36mrss_subscriptions\u001B[0m - \u001B[1m上次更新时间: 2025-10-16 08:11:07\u001B[0m\n",
|
|
||||||
"\u001B[32m2025-10-17 13:44:37\u001B[0m | \u001B[1mINFO \u001B[0m | \u001B[36mrss_subscriptions\u001B[0m - \u001B[1m开始获取RSS源数据...\u001B[0m\n"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"name": "stderr",
|
|
||||||
"output_type": "stream",
|
|
||||||
"text": [
|
|
||||||
"D:\\ProgramTools\\anaconda3\\envs\\intelligence_system\\Lib\\site-packages\\requests\\__init__.py:86: RequestsDependencyWarning: Unable to find acceptable character detection dependency (chardet or charset_normalizer).\n",
|
|
||||||
" warnings.warn(\n"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"name": "stdout",
|
|
||||||
"output_type": "stream",
|
|
||||||
"text": [
|
|
||||||
"\u001B[32m2025-10-17 13:44:38\u001B[0m | \u001B[1mINFO \u001B[0m | \u001B[36mrss_subscriptions\u001B[0m - \u001B[1mRSS源获取完成,成功获取 4/4 个源\u001B[0m\n",
|
|
||||||
"\u001B[32m2025-10-17 13:44:38\u001B[0m | \u001B[1mINFO \u001B[0m | \u001B[36mrss_subscriptions\u001B[0m - \u001B[1m获取完成,耗时: 0.72秒\u001B[0m\n",
|
|
||||||
"\u001B[32m2025-10-17 13:44:38\u001B[0m | \u001B[1mINFO \u001B[0m | \u001B[36mrss_subscriptions\u001B[0m - \u001B[1m开始处理 RSS 源: https://www.chinanews.com.cn/rss/china.xml\u001B[0m\n",
|
|
||||||
"\u001B[32m2025-10-17 13:44:38\u001B[0m | \u001B[1mINFO \u001B[0m | \u001B[36mmysql_agent\u001B[0m - \u001B[1m表 collector_rss_subscriptions 插入结果汇总\u001B[0m\n",
|
|
||||||
"\u001B[32m2025-10-17 13:44:38\u001B[0m | \u001B[1mINFO \u001B[0m | \u001B[36mrss_subscriptions\u001B[0m - \u001B[1m成功写入 30/30 条记录\u001B[0m\n",
|
|
||||||
"\u001B[32m2025-10-17 13:44:38\u001B[0m | \u001B[1mINFO \u001B[0m | \u001B[36mrss_subscriptions\u001B[0m - \u001B[1m开始处理 RSS 源: https://www.chinanews.com.cn/rss/world.xml\u001B[0m\n",
|
|
||||||
"\u001B[32m2025-10-17 13:44:39\u001B[0m | \u001B[33m\u001B[1mWARNING \u001B[0m | \u001B[36mmysql_agent\u001B[0m - \u001B[33m\u001B[1m表 collector_rss_subscriptions 中跳过重复记录\u001B[0m\n",
|
|
||||||
"\u001B[32m2025-10-17 13:44:39\u001B[0m | \u001B[33m\u001B[1mWARNING \u001B[0m | \u001B[36mmysql_agent\u001B[0m - \u001B[33m\u001B[1m表 collector_rss_subscriptions 中跳过重复记录\u001B[0m\n",
|
|
||||||
"\u001B[32m2025-10-17 13:44:39\u001B[0m | \u001B[1mINFO \u001B[0m | \u001B[36mmysql_agent\u001B[0m - \u001B[1m表 collector_rss_subscriptions 插入结果汇总\u001B[0m\n",
|
|
||||||
"\u001B[32m2025-10-17 13:44:39\u001B[0m | \u001B[31m\u001B[1mERROR \u001B[0m | \u001B[36mmysql_agent\u001B[0m - \u001B[31m\u001B[1m表 collector_rss_subscriptions 插入失败记录详情\u001B[0m\n",
|
|
||||||
"\u001B[32m2025-10-17 13:44:39\u001B[0m | \u001B[1mINFO \u001B[0m | \u001B[36mrss_subscriptions\u001B[0m - \u001B[1m成功写入 28/30 条记录\u001B[0m\n",
|
|
||||||
"\u001B[32m2025-10-17 13:44:39\u001B[0m | \u001B[1mINFO \u001B[0m | \u001B[36mrss_subscriptions\u001B[0m - \u001B[1m开始处理 RSS 源: https://www.chinanews.com.cn/rss/finance.xml\u001B[0m\n",
|
|
||||||
"\u001B[32m2025-10-17 13:44:39\u001B[0m | \u001B[1mINFO \u001B[0m | \u001B[36mmysql_agent\u001B[0m - \u001B[1m表 collector_rss_subscriptions 插入结果汇总\u001B[0m\n",
|
|
||||||
"\u001B[32m2025-10-17 13:44:39\u001B[0m | \u001B[1mINFO \u001B[0m | \u001B[36mrss_subscriptions\u001B[0m - \u001B[1m成功写入 30/30 条记录\u001B[0m\n",
|
|
||||||
"\u001B[32m2025-10-17 13:44:39\u001B[0m | \u001B[1mINFO \u001B[0m | \u001B[36mrss_subscriptions\u001B[0m - \u001B[1m开始处理 RSS 源: https://www.chinanews.com.cn/rss/scroll-news.xml\u001B[0m\n",
|
|
||||||
"\u001B[32m2025-10-17 13:44:39\u001B[0m | \u001B[33m\u001B[1mWARNING \u001B[0m | \u001B[36mmysql_agent\u001B[0m - \u001B[33m\u001B[1m表 collector_rss_subscriptions 中跳过重复记录\u001B[0m\n",
|
|
||||||
"\u001B[32m2025-10-17 13:44:39\u001B[0m | \u001B[33m\u001B[1mWARNING \u001B[0m | \u001B[36mmysql_agent\u001B[0m - \u001B[33m\u001B[1m表 collector_rss_subscriptions 中跳过重复记录\u001B[0m\n",
|
|
||||||
"\u001B[32m2025-10-17 13:44:39\u001B[0m | \u001B[33m\u001B[1mWARNING \u001B[0m | \u001B[36mmysql_agent\u001B[0m - \u001B[33m\u001B[1m表 collector_rss_subscriptions 中跳过重复记录\u001B[0m\n",
|
|
||||||
"\u001B[32m2025-10-17 13:44:39\u001B[0m | \u001B[33m\u001B[1mWARNING \u001B[0m | \u001B[36mmysql_agent\u001B[0m - \u001B[33m\u001B[1m表 collector_rss_subscriptions 中跳过重复记录\u001B[0m\n",
|
|
||||||
"\u001B[32m2025-10-17 13:44:39\u001B[0m | \u001B[33m\u001B[1mWARNING \u001B[0m | \u001B[36mmysql_agent\u001B[0m - \u001B[33m\u001B[1m表 collector_rss_subscriptions 中跳过重复记录\u001B[0m\n",
|
|
||||||
"\u001B[32m2025-10-17 13:44:39\u001B[0m | \u001B[33m\u001B[1mWARNING \u001B[0m | \u001B[36mmysql_agent\u001B[0m - \u001B[33m\u001B[1m表 collector_rss_subscriptions 中跳过重复记录\u001B[0m\n",
|
|
||||||
"\u001B[32m2025-10-17 13:44:39\u001B[0m | \u001B[33m\u001B[1mWARNING \u001B[0m | \u001B[36mmysql_agent\u001B[0m - \u001B[33m\u001B[1m表 collector_rss_subscriptions 中跳过重复记录\u001B[0m\n",
|
|
||||||
"\u001B[32m2025-10-17 13:44:39\u001B[0m | \u001B[33m\u001B[1mWARNING \u001B[0m | \u001B[36mmysql_agent\u001B[0m - \u001B[33m\u001B[1m表 collector_rss_subscriptions 中跳过重复记录\u001B[0m\n",
|
|
||||||
"\u001B[32m2025-10-17 13:44:39\u001B[0m | \u001B[33m\u001B[1mWARNING \u001B[0m | \u001B[36mmysql_agent\u001B[0m - \u001B[33m\u001B[1m表 collector_rss_subscriptions 中跳过重复记录\u001B[0m\n",
|
|
||||||
"\u001B[32m2025-10-17 13:44:39\u001B[0m | \u001B[33m\u001B[1mWARNING \u001B[0m | \u001B[36mmysql_agent\u001B[0m - \u001B[33m\u001B[1m表 collector_rss_subscriptions 中跳过重复记录\u001B[0m\n",
|
|
||||||
"\u001B[32m2025-10-17 13:44:39\u001B[0m | \u001B[33m\u001B[1mWARNING \u001B[0m | \u001B[36mmysql_agent\u001B[0m - \u001B[33m\u001B[1m表 collector_rss_subscriptions 中跳过重复记录\u001B[0m\n",
|
|
||||||
"\u001B[32m2025-10-17 13:44:39\u001B[0m | \u001B[33m\u001B[1mWARNING \u001B[0m | \u001B[36mmysql_agent\u001B[0m - \u001B[33m\u001B[1m表 collector_rss_subscriptions 中跳过重复记录\u001B[0m\n",
|
|
||||||
"\u001B[32m2025-10-17 13:44:39\u001B[0m | \u001B[33m\u001B[1mWARNING \u001B[0m | \u001B[36mmysql_agent\u001B[0m - \u001B[33m\u001B[1m表 collector_rss_subscriptions 中跳过重复记录\u001B[0m\n",
|
|
||||||
"\u001B[32m2025-10-17 13:44:39\u001B[0m | \u001B[33m\u001B[1mWARNING \u001B[0m | \u001B[36mmysql_agent\u001B[0m - \u001B[33m\u001B[1m表 collector_rss_subscriptions 中跳过重复记录\u001B[0m\n",
|
|
||||||
"\u001B[32m2025-10-17 13:44:39\u001B[0m | \u001B[33m\u001B[1mWARNING \u001B[0m | \u001B[36mmysql_agent\u001B[0m - \u001B[33m\u001B[1m表 collector_rss_subscriptions 中跳过重复记录\u001B[0m\n",
|
|
||||||
"\u001B[32m2025-10-17 13:44:39\u001B[0m | \u001B[33m\u001B[1mWARNING \u001B[0m | \u001B[36mmysql_agent\u001B[0m - \u001B[33m\u001B[1m表 collector_rss_subscriptions 中跳过重复记录\u001B[0m\n",
|
|
||||||
"\u001B[32m2025-10-17 13:44:39\u001B[0m | \u001B[33m\u001B[1mWARNING \u001B[0m | \u001B[36mmysql_agent\u001B[0m - \u001B[33m\u001B[1m表 collector_rss_subscriptions 中跳过重复记录\u001B[0m\n",
|
|
||||||
"\u001B[32m2025-10-17 13:44:39\u001B[0m | \u001B[1mINFO \u001B[0m | \u001B[36mmysql_agent\u001B[0m - \u001B[1m表 collector_rss_subscriptions 插入结果汇总\u001B[0m\n",
|
|
||||||
"\u001B[32m2025-10-17 13:44:39\u001B[0m | \u001B[31m\u001B[1mERROR \u001B[0m | \u001B[36mmysql_agent\u001B[0m - \u001B[31m\u001B[1m表 collector_rss_subscriptions 插入失败记录详情\u001B[0m\n",
|
|
||||||
"\u001B[32m2025-10-17 13:44:39\u001B[0m | \u001B[1mINFO \u001B[0m | \u001B[36mrss_subscriptions\u001B[0m - \u001B[1m成功写入 13/30 条记录\u001B[0m\n",
|
|
||||||
"\u001B[32m2025-10-17 13:44:39\u001B[0m | \u001B[1mINFO \u001B[0m | \u001B[36mrss_subscriptions\u001B[0m - \u001B[1m本次最新更新时间: 2025-10-17 05:41:17\u001B[0m\n",
|
|
||||||
"\u001B[32m2025-10-17 13:44:39\u001B[0m | \u001B[1mINFO \u001B[0m | \u001B[36mtask_scheduler\u001B[0m - \u001B[1m任务执行完成,耗时: 1.85秒\u001B[0m\n",
|
|
||||||
"\u001B[32m2025-10-17 13:44:39\u001B[0m | \u001B[1mINFO \u001B[0m | \u001B[36mtask_scheduler\u001B[0m - \u001B[1m任务执行成功: RSS新闻订阅\u001B[0m\n"
|
|
||||||
]
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"execution_count": 5
|
|
||||||
},
|
|
||||||
{
|
{
|
||||||
"cell_type": "markdown",
|
"cell_type": "markdown",
|
||||||
"id": "c3492a1af7dbf2b1",
|
"id": "c3492a1af7dbf2b1",
|
||||||
|
|||||||
Reference in New Issue
Block a user