RSS订阅数据爬取及数据处理

This commit is contained in:
z66
2025-10-23 17:18:49 +08:00
parent fd67231866
commit e1db06dd79
8 changed files with 84042 additions and 229449 deletions
+472 -289
View File
@@ -10,14 +10,25 @@
},
{
"cell_type": "code",
"execution_count": 1,
"id": "initial_id",
"metadata": {
"collapsed": true,
"ExecuteTime": {
"end_time": "2025-10-17T05:43:18.381936Z",
"start_time": "2025-10-17T05:43:15.265036Z"
}
},
"collapsed": true
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"PROJECT_ROOT = d:\\Idea Project\\intelligence_system\n",
"\u001b[32m2025-10-23 16:56:55\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mtask_scheduler\u001b[0m - \u001b[1m任务调度器已初始化,最大工作线程数: 5\u001b[0m\n"
]
}
],
"source": [
"# 使 Notebook 可从项目根导入\n",
"import sys\n",
@@ -116,6 +127,78 @@
" log.exception(\"手动执行任务失败\")\n",
" return False\n",
"\n",
" def run_task_synchronously(self, task_id: int) -> dict:\n",
" \"\"\"同步执行任务并返回详细结果(用于Notebook中查看执行过程)\"\"\"\n",
" import time\n",
" import sys\n",
" from io import StringIO\n",
" \n",
" task = self.get_task_by_id(task_id)\n",
" if not task:\n",
" return {\n",
" 'success': False,\n",
" 'error': f'未找到任务ID: {task_id}',\n",
" 'output': ''\n",
" }\n",
" \n",
" # 捕获标准输出\n",
" old_stdout = sys.stdout\n",
" sys.stdout = output_buffer = StringIO()\n",
" \n",
" start_time = time.time()\n",
" success = False\n",
" error_msg = None\n",
" \n",
" try:\n",
" # 直接同步执行任务逻辑\n",
" self.scheduler._execute_task_logic(task)\n",
" success = True\n",
" \n",
" # 更新任务状态\n",
" next_run_time = self.scheduler._calculate_next_run_time(\n",
" cron_expr=task['cron_expression'],\n",
" time_zone=task.get('time_zone', 'Asia/Shanghai')\n",
" )\n",
" \n",
" self.scheduler._update_task_status(task['task_id'], {\n",
" 'last_run_status': 'success',\n",
" 'is_running': 0,\n",
" 'run_count': task['run_count'] + 1,\n",
" 'next_run_time': next_run_time\n",
" })\n",
" \n",
" except Exception as e:\n",
" success = False\n",
" error_msg = str(e)\n",
" log.exception(f\"任务执行失败: {task['task_name']}\")\n",
" \n",
" # 更新失败状态\n",
" try:\n",
" next_retry_time = datetime.now() + pd.Timedelta(minutes=15)\n",
" self.scheduler._update_task_status(task['task_id'], {\n",
" 'last_run_status': 'failed',\n",
" 'is_running': 0,\n",
" 'next_run_time': next_retry_time\n",
" })\n",
" except Exception:\n",
" pass\n",
" \n",
" finally:\n",
" # 恢复标准输出\n",
" sys.stdout = old_stdout\n",
" output_text = output_buffer.getvalue()\n",
" \n",
" execution_time = time.time() - start_time\n",
" \n",
" return {\n",
" 'success': success,\n",
" 'task_name': task['task_name'],\n",
" 'task_id': task['task_id'],\n",
" 'execution_time': execution_time,\n",
" 'output': output_text,\n",
" 'error': error_msg\n",
" }\n",
"\n",
"# 在这里创建 manager(供后续单元使用)\n",
"manager = TaskManager(scheduler)\n",
"\n",
@@ -134,18 +217,7 @@
" except Exception:\n",
" pass\n",
" return str(dt)"
],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"PROJECT_ROOT = D:\\Idea Project\\intelligence_system\n",
"\u001B[32m2025-10-17 13:43:18\u001B[0m | \u001B[1mINFO \u001B[0m | \u001B[36mtask_scheduler\u001B[0m - \u001B[1m任务调度器已初始化,最大工作线程数: 5\u001B[0m\n"
]
}
],
"execution_count": 1
]
},
{
"cell_type": "markdown",
@@ -157,6 +229,7 @@
},
{
"cell_type": "code",
"execution_count": 2,
"id": "7b020af55972643",
"metadata": {
"ExecuteTime": {
@@ -164,73 +237,28 @@
"start_time": "2025-10-17T05:43:18.394863Z"
}
},
"source": [
"# 列出所有任务(包括已禁用的)\n",
"def list_tasks(active_only=True):\n",
" tasks = manager.get_all_tasks(active_only)\n",
" if not tasks:\n",
" display(Markdown(\"### 没有找到任务\"))\n",
" return None\n",
"\n",
" df = pd.DataFrame(tasks)\n",
"\n",
" # 格式化日期列\n",
" if 'last_run_time' in df.columns:\n",
" df['last_run_time'] = df['last_run_time'].apply(format_datetime)\n",
" if 'next_run_time' in df.columns:\n",
" df['next_run_time'] = df['next_run_time'].apply(format_datetime)\n",
"\n",
" # 重命名列名\n",
" df = df.rename(columns={\n",
" 'task_id': '任务ID',\n",
" 'task_name': '任务名称',\n",
" 'task_type': '任务类型',\n",
" 'module_path': '模块路径',\n",
" 'cron_expression': 'Cron表达式',\n",
" 'time_zone': '时区',\n",
" 'last_run_time': '最后运行时间',\n",
" 'next_run_time': '下次运行时间',\n",
" 'last_run_status': '运行状态',\n",
" 'is_active': '是否活跃',\n",
" 'run_count': '运行次数'\n",
" })\n",
"\n",
" display(Markdown(\"### 任务列表\"))\n",
" display(HTML(df.to_html(index=False)))\n",
" return df\n",
"\n",
"# 执行:列出所有任务(包括已禁用)\n",
"list_tasks(active_only=False)\n",
"\n",
"# 或者:只列出活跃任务\n",
"# list_tasks(active_only=True)"
],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\u001B[32m2025-10-17 13:43:18\u001B[0m | \u001B[1mINFO \u001B[0m | \u001B[36mmysql_agent\u001B[0m - \u001B[1m查询执行成功\u001B[0m\n"
"\u001b[32m2025-10-17 13:43:18\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mmysql_agent\u001b[0m - \u001b[1m查询执行成功\u001b[0m\n"
]
},
{
"data": {
"text/markdown": [
"### 任务列表"
],
"text/plain": [
"<IPython.core.display.Markdown object>"
],
"text/markdown": "### 任务列表"
]
},
"metadata": {},
"output_type": "display_data",
"jetTransient": {
"display_id": null
}
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"<IPython.core.display.HTML object>"
],
"text/html": [
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
@@ -270,26 +298,16 @@
" </tr>\n",
" </tbody>\n",
"</table>"
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {},
"output_type": "display_data",
"jetTransient": {
"display_id": null
}
"output_type": "display_data"
},
{
"data": {
"text/plain": [
" 任务ID 任务名称 任务类型 模块路径 \\\n",
"0 1 RSS新闻订阅 collector collectors.rss_subscriptions.NewsAPIClient \n",
"\n",
" Cron表达式 时区 下次运行时间 最后运行时间 \\\n",
"0 5 0 * * * Asia/Shanghai 2025-10-18 00:05:00 2025-10-17 00:05:07 \n",
"\n",
" 运行状态 运行次数 是否活跃 is_running created_at updated_at \n",
"0 success 4 1 0 2025-10-16 15:47:34 2025-10-17 00:05:08 "
],
"text/html": [
"<div>\n",
"<style scoped>\n",
@@ -346,6 +364,16 @@
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" 任务ID 任务名称 任务类型 模块路径 \\\n",
"0 1 RSS新闻订阅 collector collectors.rss_subscriptions.NewsAPIClient \n",
"\n",
" Cron表达式 时区 下次运行时间 最后运行时间 \\\n",
"0 5 0 * * * Asia/Shanghai 2025-10-18 00:05:00 2025-10-17 00:05:07 \n",
"\n",
" 运行状态 运行次数 是否活跃 is_running created_at updated_at \n",
"0 success 4 1 0 2025-10-16 15:47:34 2025-10-17 00:05:08 "
]
},
"execution_count": 2,
@@ -353,7 +381,47 @@
"output_type": "execute_result"
}
],
"execution_count": 2
"source": [
"# 列出所有任务(包括已禁用的)\n",
"def list_tasks(active_only=True):\n",
" tasks = manager.get_all_tasks(active_only)\n",
" if not tasks:\n",
" display(Markdown(\"### 没有找到任务\"))\n",
" return None\n",
"\n",
" df = pd.DataFrame(tasks)\n",
"\n",
" # 格式化日期列\n",
" if 'last_run_time' in df.columns:\n",
" df['last_run_time'] = df['last_run_time'].apply(format_datetime)\n",
" if 'next_run_time' in df.columns:\n",
" df['next_run_time'] = df['next_run_time'].apply(format_datetime)\n",
"\n",
" # 重命名列名\n",
" df = df.rename(columns={\n",
" 'task_id': '任务ID',\n",
" 'task_name': '任务名称',\n",
" 'task_type': '任务类型',\n",
" 'module_path': '模块路径',\n",
" 'cron_expression': 'Cron表达式',\n",
" 'time_zone': '时区',\n",
" 'last_run_time': '最后运行时间',\n",
" 'next_run_time': '下次运行时间',\n",
" 'last_run_status': '运行状态',\n",
" 'is_active': '是否活跃',\n",
" 'run_count': '运行次数'\n",
" })\n",
"\n",
" display(Markdown(\"### 任务列表\"))\n",
" display(HTML(df.to_html(index=False)))\n",
" return df\n",
"\n",
"# 执行:列出所有任务(包括已禁用)\n",
"list_tasks(active_only=False)\n",
"\n",
"# 或者:只列出活跃任务\n",
"# list_tasks(active_only=True)"
]
},
{
"cell_type": "markdown",
@@ -365,6 +433,7 @@
},
{
"cell_type": "code",
"execution_count": 3,
"id": "eab90de72c35429e",
"metadata": {
"ExecuteTime": {
@@ -372,6 +441,62 @@
"start_time": "2025-10-17T05:43:26.071398Z"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\u001b[32m2025-10-17 13:43:26\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mmysql_agent\u001b[0m - \u001b[1m查询执行成功\u001b[0m\n"
]
},
{
"data": {
"text/markdown": [
"### 任务详情\n",
"**任务ID**: 1\n",
"**任务名称**: RSS新闻订阅\n",
"**任务类型**: collector\n",
"**模块路径**: collectors.rss_subscriptions.NewsAPIClient\n",
"**Cron表达式**: 5 0 * * *\n",
"**时区**: Asia/Shanghai\n",
"**最后运行时间**: 2025-10-17 00:05:07\n",
"**下次运行时间**: 2025-10-18 00:05:00\n",
"**运行状态**: success\n",
"**是否活跃**: 是\n",
"**运行次数**: 4\n",
"**创建时间**: 2025-10-16 15:47:34"
],
"text/plain": [
"<IPython.core.display.Markdown object>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"{'task_id': 1,\n",
" 'task_name': 'RSS新闻订阅',\n",
" 'task_type': 'collector',\n",
" 'module_path': 'collectors.rss_subscriptions.NewsAPIClient',\n",
" 'cron_expression': '5 0 * * *',\n",
" 'time_zone': 'Asia/Shanghai',\n",
" 'next_run_time': Timestamp('2025-10-18 00:05:00'),\n",
" 'last_run_time': Timestamp('2025-10-17 00:05:07'),\n",
" 'last_run_status': 'success',\n",
" 'run_count': 4,\n",
" 'is_active': 1,\n",
" 'is_running': 0,\n",
" 'created_at': Timestamp('2025-10-16 15:47:34'),\n",
" 'updated_at': Timestamp('2025-10-17 00:05:08')}"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# 查看指定任务的详情\n",
"def show_task_details(task_id):\n",
@@ -399,53 +524,7 @@
"\n",
"# 执行:查看任务ID为1的详情(替换为实际ID)\n",
"show_task_details(1)"
],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\u001B[32m2025-10-17 13:43:26\u001B[0m | \u001B[1mINFO \u001B[0m | \u001B[36mmysql_agent\u001B[0m - \u001B[1m查询执行成功\u001B[0m\n"
]
},
{
"data": {
"text/plain": [
"<IPython.core.display.Markdown object>"
],
"text/markdown": "### 任务详情\n**任务ID**: 1\n**任务名称**: RSS新闻订阅\n**任务类型**: collector\n**模块路径**: collectors.rss_subscriptions.NewsAPIClient\n**Cron表达式**: 5 0 * * *\n**时区**: Asia/Shanghai\n**最后运行时间**: 2025-10-17 00:05:07\n**下次运行时间**: 2025-10-18 00:05:00\n**运行状态**: success\n**是否活跃**: 是\n**运行次数**: 4\n**创建时间**: 2025-10-16 15:47:34"
},
"metadata": {},
"output_type": "display_data",
"jetTransient": {
"display_id": null
}
},
{
"data": {
"text/plain": [
"{'task_id': 1,\n",
" 'task_name': 'RSS新闻订阅',\n",
" 'task_type': 'collector',\n",
" 'module_path': 'collectors.rss_subscriptions.NewsAPIClient',\n",
" 'cron_expression': '5 0 * * *',\n",
" 'time_zone': 'Asia/Shanghai',\n",
" 'next_run_time': Timestamp('2025-10-18 00:05:00'),\n",
" 'last_run_time': Timestamp('2025-10-17 00:05:07'),\n",
" 'last_run_status': 'success',\n",
" 'run_count': 4,\n",
" 'is_active': 1,\n",
" 'is_running': 0,\n",
" 'created_at': Timestamp('2025-10-16 15:47:34'),\n",
" 'updated_at': Timestamp('2025-10-17 00:05:08')}"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"execution_count": 3
]
},
{
"cell_type": "markdown",
@@ -473,8 +552,8 @@
"name": "stdout",
"output_type": "stream",
"text": [
"\u001B[32m2025-10-16 15:47:34\u001B[0m | \u001B[1mINFO \u001B[0m | \u001B[36mmysql_agent\u001B[0m - \u001B[1m查询执行成功\u001B[0m\n",
"\u001B[32m2025-10-16 15:47:34\u001B[0m | \u001B[1mINFO \u001B[0m | \u001B[36mtask_scheduler\u001B[0m - \u001B[1m新任务添加成功\u001B[0m\n"
"\u001b[32m2025-10-16 15:47:34\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mmysql_agent\u001b[0m - \u001b[1m查询执行成功\u001b[0m\n",
"\u001b[32m2025-10-16 15:47:34\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mtask_scheduler\u001b[0m - \u001b[1m新任务添加成功\u001b[0m\n"
]
},
{
@@ -550,6 +629,7 @@
},
{
"cell_type": "code",
"execution_count": 4,
"id": "c892fd8ad2f0dd9d",
"metadata": {
"ExecuteTime": {
@@ -557,6 +637,61 @@
"start_time": "2025-10-17T05:44:18.980345Z"
}
},
"outputs": [
{
"data": {
"text/markdown": [
"### 任务ID 1 更新成功"
],
"text/plain": [
"<IPython.core.display.Markdown object>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\u001b[32m2025-10-17 13:44:19\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mmysql_agent\u001b[0m - \u001b[1m查询执行成功\u001b[0m\n"
]
},
{
"data": {
"text/markdown": [
"### 任务详情\n",
"**任务ID**: 1\n",
"**任务名称**: RSS新闻订阅\n",
"**任务类型**: collector\n",
"**模块路径**: collectors.rss_subscriptions.NewsAPIClient\n",
"**Cron表达式**: 5 * * * *\n",
"**时区**: Asia/Shanghai\n",
"**最后运行时间**: 2025-10-17 00:05:07\n",
"**下次运行时间**: 2025-10-18 00:05:00\n",
"**运行状态**: success\n",
"**是否活跃**: 是\n",
"**运行次数**: 4\n",
"**创建时间**: 2025-10-16 15:47:34"
],
"text/plain": [
"<IPython.core.display.Markdown object>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"True"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# 更新任务属性\n",
"def update_task(task_id, **kwargs):\n",
@@ -589,53 +724,7 @@
"\n",
"# 执行:同时更新多个属性(名称和Cron表达式)\n",
"# update_task(1, name=\"每日早间新闻采集\", cron=\"0 8 * * *\")"
],
"outputs": [
{
"data": {
"text/plain": [
"<IPython.core.display.Markdown object>"
],
"text/markdown": "### 任务ID 1 更新成功"
},
"metadata": {},
"output_type": "display_data",
"jetTransient": {
"display_id": null
}
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\u001B[32m2025-10-17 13:44:19\u001B[0m | \u001B[1mINFO \u001B[0m | \u001B[36mmysql_agent\u001B[0m - \u001B[1m查询执行成功\u001B[0m\n"
]
},
{
"data": {
"text/plain": [
"<IPython.core.display.Markdown object>"
],
"text/markdown": "### 任务详情\n**任务ID**: 1\n**任务名称**: RSS新闻订阅\n**任务类型**: collector\n**模块路径**: collectors.rss_subscriptions.NewsAPIClient\n**Cron表达式**: 5 * * * *\n**时区**: Asia/Shanghai\n**最后运行时间**: 2025-10-17 00:05:07\n**下次运行时间**: 2025-10-18 00:05:00\n**运行状态**: success\n**是否活跃**: 是\n**运行次数**: 4\n**创建时间**: 2025-10-16 15:47:34"
},
"metadata": {},
"output_type": "display_data",
"jetTransient": {
"display_id": null
}
},
{
"data": {
"text/plain": [
"True"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"execution_count": 4
]
},
{
"cell_type": "markdown",
@@ -702,6 +791,7 @@
},
{
"cell_type": "code",
"execution_count": 2,
"id": "94892f4134316f8e",
"metadata": {
"ExecuteTime": {
@@ -709,8 +799,177 @@
"start_time": "2025-10-17T05:44:35.084369Z"
}
},
"outputs": [
{
"data": {
"text/markdown": [
"### 开始执行任务ID 2"
],
"text/plain": [
"<IPython.core.display.Markdown object>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/markdown": [
"---"
],
"text/plain": [
"<IPython.core.display.Markdown object>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\u001b[32m2025-10-23 16:57:20\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mmysql_agent\u001b[0m - \u001b[1m查询执行成功\u001b[0m\n",
"\u001b[32m2025-10-23 16:57:20\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mprocessor_rss_data\u001b[0m - \u001b[1mRSS数据处理器初始化完成\u001b[0m\n",
"\u001b[32m2025-10-23 16:57:20\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mprocessor_rss_data\u001b[0m - \u001b[1m开始处理RSS数据...\u001b[0m\n",
"\u001b[32m2025-10-23 16:57:21\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mprocessor_rss_data\u001b[0m - \u001b[1m成功加载 8 条未处理的RSS数据\u001b[0m\n",
"\u001b[32m2025-10-23 16:57:21\u001b[0m | \u001b[33m\u001b[1mWARNING \u001b[0m | \u001b[36mprocessor_rss_data\u001b[0m - \u001b[33m\u001b[1m停用词文件不存在: processors/stopwords.txt,使用默认停用词\u001b[0m\n",
"\u001b[32m2025-10-23 16:57:21\u001b[0m | \u001b[33m\u001b[1mWARNING \u001b[0m | \u001b[36mprocessor_rss_data\u001b[0m - \u001b[33m\u001b[1m关键词文件不存在: processors/keywords.txt\u001b[0m\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Building prefix dict from the default dictionary ...\n",
"Loading model from cache C:\\Users\\zy187\\AppData\\Local\\Temp\\jieba.cache\n",
"Loading model cost 0.609 seconds.\n",
"Prefix dict has been built successfully.\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\u001b[32m2025-10-23 16:57:21\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mprocessor_rss_data\u001b[0m - \u001b[1m数据处理完成,共处理 8 条记录\u001b[0m\n",
"\u001b[32m2025-10-23 16:57:21\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mprocessor_rss_data\u001b[0m - \u001b[1m过滤出 1 条汽车后市场相关新闻\u001b[0m\n",
"\u001b[32m2025-10-23 16:57:21\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mmysql_agent\u001b[0m - \u001b[1m表 processed_rss_data 插入结果汇总\u001b[0m\n",
"\u001b[32m2025-10-23 16:57:21\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mprocessor_rss_data\u001b[0m - \u001b[1m成功保存 1 条处理结果到数据库\u001b[0m\n",
"\u001b[32m2025-10-23 16:57:21\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mprocessor_rss_data\u001b[0m - \u001b[1m成功标记 8 条数据为已处理\u001b[0m\n",
"\u001b[32m2025-10-23 16:57:21\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mprocessor_rss_data\u001b[0m - \u001b[1mRSS数据处理完成\u001b[0m\n",
"\u001b[32m2025-10-23 16:57:21\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mtask_scheduler\u001b[0m - \u001b[1m任务执行完成,耗时: 1.19秒\u001b[0m\n"
]
},
{
"data": {
"text/markdown": [
"**任务名称**: RSS基于规则数据处理"
],
"text/plain": [
"<IPython.core.display.Markdown object>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/markdown": [
"**任务ID**: 2"
],
"text/plain": [
"<IPython.core.display.Markdown object>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/markdown": [
"**执行时长**: 1.26 秒"
],
"text/plain": [
"<IPython.core.display.Markdown object>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/markdown": [
"---"
],
"text/plain": [
"<IPython.core.display.Markdown object>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/markdown": [
"### 📋 执行输出:"
],
"text/plain": [
"<IPython.core.display.Markdown object>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"RSS数据处理完成!\n",
"处理统计: {'total_articles': 8, 'filtered_articles': 1, 'filter_rate': 0.125, 'processing_time': '2025-10-23 16:57:21', 'save_success': True, 'mark_success': True}\n",
"\n"
]
},
{
"data": {
"text/markdown": [
"---"
],
"text/plain": [
"<IPython.core.display.Markdown object>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/markdown": [
"### ✅ 任务执行成功"
],
"text/plain": [
"<IPython.core.display.Markdown object>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"{'success': True,\n",
" 'task_name': 'RSS基于规则数据处理',\n",
" 'task_id': 2,\n",
" 'execution_time': 1.2610254287719727,\n",
" 'output': \"RSS数据处理完成!\\n处理统计: {'total_articles': 8, 'filtered_articles': 1, 'filter_rate': 0.125, 'processing_time': '2025-10-23 16:57:21', 'save_success': True, 'mark_success': True}\\n\",\n",
" 'error': None}"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# 手动执行任务\n",
"# 手动执行任务(异步方式,快速返回)\n",
"def run_task_manually(task_id):\n",
" display(Markdown(f\"### 正在手动执行任务ID {task_id}...\"))\n",
" success = manager.run_task_manually(task_id)\n",
@@ -720,118 +979,42 @@
" display(Markdown(f\"### 任务ID {task_id} 执行失败\"))\n",
" return success\n",
"\n",
"# 执行:手动行任务ID为1的任务\n",
"run_task_manually(1)"
],
"outputs": [
{
"data": {
"text/plain": [
"<IPython.core.display.Markdown object>"
],
"text/markdown": "### 正在手动执行任务ID 1..."
},
"metadata": {},
"output_type": "display_data",
"jetTransient": {
"display_id": null
}
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\u001B[32m2025-10-17 13:44:35\u001B[0m | \u001B[1mINFO \u001B[0m | \u001B[36mmysql_agent\u001B[0m - \u001B[1m查询执行成功\u001B[0m\n",
"\u001B[32m2025-10-17 13:44:35\u001B[0m | \u001B[1mINFO \u001B[0m | \u001B[36mtask_scheduler\u001B[0m - \u001B[1m开始执行任务: RSS新闻订阅\u001B[0m\n"
]
},
{
"data": {
"text/plain": [
"<IPython.core.display.Markdown object>"
],
"text/markdown": "### 任务ID 1 执行成功"
},
"metadata": {},
"output_type": "display_data",
"jetTransient": {
"display_id": null
}
},
{
"data": {
"text/plain": [
"True"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\u001B[32m2025-10-17 13:44:37\u001B[0m | \u001B[1mINFO \u001B[0m | \u001B[36mrss_subscriptions\u001B[0m - \u001B[1m新闻API客户端初始化完成,已连接到数据库\u001B[0m\n",
"\u001B[32m2025-10-17 13:44:37\u001B[0m | \u001B[1mINFO \u001B[0m | \u001B[36mrss_subscriptions\u001B[0m - \u001B[1m数据库表结构验证通过,当前字段:['id', '文章标题', '文章链接', '文章摘要', '发布时间', '来源URL', '创建时间', '更新时间']\u001B[0m\n",
"\u001B[32m2025-10-17 13:44:37\u001B[0m | \u001B[1mINFO \u001B[0m | \u001B[36mrss_subscriptions\u001B[0m - \u001B[1m上次更新时间: 2025-10-16 08:11:07\u001B[0m\n",
"\u001B[32m2025-10-17 13:44:37\u001B[0m | \u001B[1mINFO \u001B[0m | \u001B[36mrss_subscriptions\u001B[0m - \u001B[1m开始获取RSS源数据...\u001B[0m\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"D:\\ProgramTools\\anaconda3\\envs\\intelligence_system\\Lib\\site-packages\\requests\\__init__.py:86: RequestsDependencyWarning: Unable to find acceptable character detection dependency (chardet or charset_normalizer).\n",
" warnings.warn(\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\u001B[32m2025-10-17 13:44:38\u001B[0m | \u001B[1mINFO \u001B[0m | \u001B[36mrss_subscriptions\u001B[0m - \u001B[1mRSS源获取完成,成功获取 4/4 个源\u001B[0m\n",
"\u001B[32m2025-10-17 13:44:38\u001B[0m | \u001B[1mINFO \u001B[0m | \u001B[36mrss_subscriptions\u001B[0m - \u001B[1m获取完成,耗时: 0.72秒\u001B[0m\n",
"\u001B[32m2025-10-17 13:44:38\u001B[0m | \u001B[1mINFO \u001B[0m | \u001B[36mrss_subscriptions\u001B[0m - \u001B[1m开始处理 RSS 源: https://www.chinanews.com.cn/rss/china.xml\u001B[0m\n",
"\u001B[32m2025-10-17 13:44:38\u001B[0m | \u001B[1mINFO \u001B[0m | \u001B[36mmysql_agent\u001B[0m - \u001B[1m表 collector_rss_subscriptions 插入结果汇总\u001B[0m\n",
"\u001B[32m2025-10-17 13:44:38\u001B[0m | \u001B[1mINFO \u001B[0m | \u001B[36mrss_subscriptions\u001B[0m - \u001B[1m成功写入 30/30 条记录\u001B[0m\n",
"\u001B[32m2025-10-17 13:44:38\u001B[0m | \u001B[1mINFO \u001B[0m | \u001B[36mrss_subscriptions\u001B[0m - \u001B[1m开始处理 RSS 源: https://www.chinanews.com.cn/rss/world.xml\u001B[0m\n",
"\u001B[32m2025-10-17 13:44:39\u001B[0m | \u001B[33m\u001B[1mWARNING \u001B[0m | \u001B[36mmysql_agent\u001B[0m - \u001B[33m\u001B[1m表 collector_rss_subscriptions 中跳过重复记录\u001B[0m\n",
"\u001B[32m2025-10-17 13:44:39\u001B[0m | \u001B[33m\u001B[1mWARNING \u001B[0m | \u001B[36mmysql_agent\u001B[0m - \u001B[33m\u001B[1m表 collector_rss_subscriptions 中跳过重复记录\u001B[0m\n",
"\u001B[32m2025-10-17 13:44:39\u001B[0m | \u001B[1mINFO \u001B[0m | \u001B[36mmysql_agent\u001B[0m - \u001B[1m表 collector_rss_subscriptions 插入结果汇总\u001B[0m\n",
"\u001B[32m2025-10-17 13:44:39\u001B[0m | \u001B[31m\u001B[1mERROR \u001B[0m | \u001B[36mmysql_agent\u001B[0m - \u001B[31m\u001B[1m表 collector_rss_subscriptions 插入失败记录详情\u001B[0m\n",
"\u001B[32m2025-10-17 13:44:39\u001B[0m | \u001B[1mINFO \u001B[0m | \u001B[36mrss_subscriptions\u001B[0m - \u001B[1m成功写入 28/30 条记录\u001B[0m\n",
"\u001B[32m2025-10-17 13:44:39\u001B[0m | \u001B[1mINFO \u001B[0m | \u001B[36mrss_subscriptions\u001B[0m - \u001B[1m开始处理 RSS 源: https://www.chinanews.com.cn/rss/finance.xml\u001B[0m\n",
"\u001B[32m2025-10-17 13:44:39\u001B[0m | \u001B[1mINFO \u001B[0m | \u001B[36mmysql_agent\u001B[0m - \u001B[1m表 collector_rss_subscriptions 插入结果汇总\u001B[0m\n",
"\u001B[32m2025-10-17 13:44:39\u001B[0m | \u001B[1mINFO \u001B[0m | \u001B[36mrss_subscriptions\u001B[0m - \u001B[1m成功写入 30/30 条记录\u001B[0m\n",
"\u001B[32m2025-10-17 13:44:39\u001B[0m | \u001B[1mINFO \u001B[0m | \u001B[36mrss_subscriptions\u001B[0m - \u001B[1m开始处理 RSS 源: https://www.chinanews.com.cn/rss/scroll-news.xml\u001B[0m\n",
"\u001B[32m2025-10-17 13:44:39\u001B[0m | \u001B[33m\u001B[1mWARNING \u001B[0m | \u001B[36mmysql_agent\u001B[0m - \u001B[33m\u001B[1m表 collector_rss_subscriptions 中跳过重复记录\u001B[0m\n",
"\u001B[32m2025-10-17 13:44:39\u001B[0m | \u001B[33m\u001B[1mWARNING \u001B[0m | \u001B[36mmysql_agent\u001B[0m - \u001B[33m\u001B[1m表 collector_rss_subscriptions 中跳过重复记录\u001B[0m\n",
"\u001B[32m2025-10-17 13:44:39\u001B[0m | \u001B[33m\u001B[1mWARNING \u001B[0m | \u001B[36mmysql_agent\u001B[0m - \u001B[33m\u001B[1m表 collector_rss_subscriptions 中跳过重复记录\u001B[0m\n",
"\u001B[32m2025-10-17 13:44:39\u001B[0m | \u001B[33m\u001B[1mWARNING \u001B[0m | \u001B[36mmysql_agent\u001B[0m - \u001B[33m\u001B[1m表 collector_rss_subscriptions 中跳过重复记录\u001B[0m\n",
"\u001B[32m2025-10-17 13:44:39\u001B[0m | \u001B[33m\u001B[1mWARNING \u001B[0m | \u001B[36mmysql_agent\u001B[0m - \u001B[33m\u001B[1m表 collector_rss_subscriptions 中跳过重复记录\u001B[0m\n",
"\u001B[32m2025-10-17 13:44:39\u001B[0m | \u001B[33m\u001B[1mWARNING \u001B[0m | \u001B[36mmysql_agent\u001B[0m - \u001B[33m\u001B[1m表 collector_rss_subscriptions 中跳过重复记录\u001B[0m\n",
"\u001B[32m2025-10-17 13:44:39\u001B[0m | \u001B[33m\u001B[1mWARNING \u001B[0m | \u001B[36mmysql_agent\u001B[0m - \u001B[33m\u001B[1m表 collector_rss_subscriptions 中跳过重复记录\u001B[0m\n",
"\u001B[32m2025-10-17 13:44:39\u001B[0m | \u001B[33m\u001B[1mWARNING \u001B[0m | \u001B[36mmysql_agent\u001B[0m - \u001B[33m\u001B[1m表 collector_rss_subscriptions 中跳过重复记录\u001B[0m\n",
"\u001B[32m2025-10-17 13:44:39\u001B[0m | \u001B[33m\u001B[1mWARNING \u001B[0m | \u001B[36mmysql_agent\u001B[0m - \u001B[33m\u001B[1m表 collector_rss_subscriptions 中跳过重复记录\u001B[0m\n",
"\u001B[32m2025-10-17 13:44:39\u001B[0m | \u001B[33m\u001B[1mWARNING \u001B[0m | \u001B[36mmysql_agent\u001B[0m - \u001B[33m\u001B[1m表 collector_rss_subscriptions 中跳过重复记录\u001B[0m\n",
"\u001B[32m2025-10-17 13:44:39\u001B[0m | \u001B[33m\u001B[1mWARNING \u001B[0m | \u001B[36mmysql_agent\u001B[0m - \u001B[33m\u001B[1m表 collector_rss_subscriptions 中跳过重复记录\u001B[0m\n",
"\u001B[32m2025-10-17 13:44:39\u001B[0m | \u001B[33m\u001B[1mWARNING \u001B[0m | \u001B[36mmysql_agent\u001B[0m - \u001B[33m\u001B[1m表 collector_rss_subscriptions 中跳过重复记录\u001B[0m\n",
"\u001B[32m2025-10-17 13:44:39\u001B[0m | \u001B[33m\u001B[1mWARNING \u001B[0m | \u001B[36mmysql_agent\u001B[0m - \u001B[33m\u001B[1m表 collector_rss_subscriptions 中跳过重复记录\u001B[0m\n",
"\u001B[32m2025-10-17 13:44:39\u001B[0m | \u001B[33m\u001B[1mWARNING \u001B[0m | \u001B[36mmysql_agent\u001B[0m - \u001B[33m\u001B[1m表 collector_rss_subscriptions 中跳过重复记录\u001B[0m\n",
"\u001B[32m2025-10-17 13:44:39\u001B[0m | \u001B[33m\u001B[1mWARNING \u001B[0m | \u001B[36mmysql_agent\u001B[0m - \u001B[33m\u001B[1m表 collector_rss_subscriptions 中跳过重复记录\u001B[0m\n",
"\u001B[32m2025-10-17 13:44:39\u001B[0m | \u001B[33m\u001B[1mWARNING \u001B[0m | \u001B[36mmysql_agent\u001B[0m - \u001B[33m\u001B[1m表 collector_rss_subscriptions 中跳过重复记录\u001B[0m\n",
"\u001B[32m2025-10-17 13:44:39\u001B[0m | \u001B[33m\u001B[1mWARNING \u001B[0m | \u001B[36mmysql_agent\u001B[0m - \u001B[33m\u001B[1m表 collector_rss_subscriptions 中跳过重复记录\u001B[0m\n",
"\u001B[32m2025-10-17 13:44:39\u001B[0m | \u001B[1mINFO \u001B[0m | \u001B[36mmysql_agent\u001B[0m - \u001B[1m表 collector_rss_subscriptions 插入结果汇总\u001B[0m\n",
"\u001B[32m2025-10-17 13:44:39\u001B[0m | \u001B[31m\u001B[1mERROR \u001B[0m | \u001B[36mmysql_agent\u001B[0m - \u001B[31m\u001B[1m表 collector_rss_subscriptions 插入失败记录详情\u001B[0m\n",
"\u001B[32m2025-10-17 13:44:39\u001B[0m | \u001B[1mINFO \u001B[0m | \u001B[36mrss_subscriptions\u001B[0m - \u001B[1m成功写入 13/30 条记录\u001B[0m\n",
"\u001B[32m2025-10-17 13:44:39\u001B[0m | \u001B[1mINFO \u001B[0m | \u001B[36mrss_subscriptions\u001B[0m - \u001B[1m本次最新更新时间: 2025-10-17 05:41:17\u001B[0m\n",
"\u001B[32m2025-10-17 13:44:39\u001B[0m | \u001B[1mINFO \u001B[0m | \u001B[36mtask_scheduler\u001B[0m - \u001B[1m任务执行完成,耗时: 1.85秒\u001B[0m\n",
"\u001B[32m2025-10-17 13:44:39\u001B[0m | \u001B[1mINFO \u001B[0m | \u001B[36mtask_scheduler\u001B[0m - \u001B[1m任务执行成功: RSS新闻订阅\u001B[0m\n"
]
}
],
"execution_count": 5
"# 手动执行任务(同步方式,显示详细执行过程)\n",
"def run_task_with_details(task_id):\n",
" display(Markdown(f\"### 开始执行任务ID {task_id}\"))\n",
" display(Markdown(\"---\"))\n",
" \n",
" result = manager.run_task_synchronously(task_id)\n",
" \n",
" if not result['success'] and result.get('error') and 'task_id' not in result:\n",
" display(Markdown(f\"### ❌ 错误: {result['error']}\"))\n",
" return result\n",
" \n",
" # 显示任务基本信息\n",
" display(Markdown(f\"**任务名称**: {result['task_name']}\"))\n",
" display(Markdown(f\"**任务ID**: {result['task_id']}\"))\n",
" display(Markdown(f\"**执行时长**: {result['execution_time']:.2f} 秒\"))\n",
" display(Markdown(\"---\"))\n",
" \n",
" # 显示执行输出\n",
" if result['output']:\n",
" display(Markdown(\"### 📋 执行输出:\"))\n",
" print(result['output'])\n",
" display(Markdown(\"---\"))\n",
" \n",
" # 显示执行结果\n",
" if result['success']:\n",
" display(Markdown(\"### ✅ 任务执行成功\"))\n",
" else:\n",
" display(Markdown(f\"### ❌ 任务执行失败\"))\n",
" if result['error']:\n",
" display(Markdown(f\"**错误信息**: {result['error']}\"))\n",
" \n",
" return result\n",
"\n",
"# 执行:手动运行任务ID为2的任务(显示详细执行过程)\n",
"run_task_with_details(2)"
]
},
{
"cell_type": "markdown",