minio对象存储数据库链接

2025-09-16 17:35:53 +08:00
parent 8e92acf5d5
commit 9afa9d2e58
10 changed files with 7291 additions and 347 deletions
@@ -6,12 +6,14 @@ import croniter
 import pytz
 from concurrent.futures import ThreadPoolExecutor, as_completed
 import pandas as pd
+from sqlalchemy.exc import SQLAlchemyError
 from utils.mysql_agent import MySQLAgent
 from utils.logger import CrossPlatformLog

 # 初始化调度器日志
 log = CrossPlatformLog.get_logger("TaskScheduler")

+
 class TaskScheduler:
    def __init__(self, db_config: Optional[Dict] = None, max_workers: int = 5):
        """初始化任务调度器（基于Cron表达式）"""
@@ -20,31 +22,37 @@ class TaskScheduler:
        log.info(f"任务调度器已初始化，最大工作线程数: {max_workers}")

    def check_and_run_tasks(self) -> Dict[str, int]:
-        """检查并执行所有到期的任务"""
+        """检查并执行所有到期的任务，优化空任务处理和异常容错"""
        result = {'总任务数': 0, '成功': 0, '失败': 0}

        try:
-            # 获取当前时间（带时区）
-            now = datetime.now(pytz.timezone('Asia/Shanghai')).replace(tzinfo=None)
+            # 获取当前时间（带时区转换为本地时间）
+            tz = pytz.timezone('Asia/Shanghai')
+            now = datetime.now(tz).replace(tzinfo=None)  # 移除时区信息，与数据库存储一致
+            log.debug(f"当前检查时间: {now.strftime('%Y-%m-%d %H:%M:%S')}")

-            # 查询所有到期的活跃任务
+            # 查询所有到期的活跃任务（使用参数化查询防止注入）
            tasks_df = self.db.query_to_df("""
-                SELECT * FROM main_task 
-                WHERE is_active = 1 
-                  AND next_run_time <= %s 
-                  AND is_running = 0
-                ORDER BY next_run_time
-            """, params=(now,))
+                                           SELECT *
+                                           FROM main_task
+                                           WHERE is_active = 1
+                                             AND next_run_time <= %s
+                                             AND is_running = 0
+                                           ORDER BY next_run_time
+                                           """, params=(now,))

            result['总任务数'] = len(tasks_df)
            if tasks_df.empty:
-                log.debug("没有到期的任务需要执行")
+                # 空任务时输出INFO级日志，明确提示状态
+                log.info("当前没有到期的任务，等待新任务加入...")
                return result

            # 并发执行任务
            futures = []
            for _, task in tasks_df.iterrows():
-                futures.append(self.executor.submit(self._process_single_task, task))
+                # 传递任务字典的副本避免线程安全问题
+                task_copy = task.to_dict()
+                futures.append(self.executor.submit(self._process_single_task, task_copy))

            # 收集执行结果
            for future in as_completed(futures):
@@ -54,7 +62,7 @@ class TaskScheduler:
                    else:
                        result['失败'] += 1
                except Exception as e:
-                    log.error(f"任务线程执行失败: {str(e)}")
+                    log.error(f"任务线程执行失败: {str(e)}", exc_info=True)
                    result['失败'] += 1

            log.info(
@@ -65,21 +73,28 @@ class TaskScheduler:
            )
            return result

+        except SQLAlchemyError as e:  # 数据库异常处理优化
+            log.error(f"数据库操作失败，将在下次轮询重试: {str(e)}", exc_info=True)
+            return result  # 不中断，返回当前结果
        except Exception as e:
-            log.critical("调度器主循环执行失败", exc_info=True)
-            raise
+            log.error("调度器周期执行异常，将在下次轮询重试", exc_info=True)
+            return result  # 不中断主循环，允许下次重试

    def _process_single_task(self, task: Dict[str, Any]) -> bool:
        """处理单个任务（线程安全）"""
        task_id = task['task_id']
-        task_log = log.bind(task_id=task_id, task_name=task['task_name'])
-        task_log.info(f"开始执行任务: {task['task_name']}")
+        task_name = task['task_name']
+        task_log = log.bind(task_id=task_id, task_name=task_name)
+        task_log.info(f"开始执行任务: {task_name}")

        try:
-            # 标记任务为运行中
+            # 标记任务为运行中（使用当前时间的时区感知对象）
+            tz = pytz.timezone(task.get('time_zone', 'Asia/Shanghai'))
+            current_time = datetime.now(tz).replace(tzinfo=None)
+
            self._update_task_status(task_id, {
                'is_running': 1,
-                'last_run_time': datetime.now()
+                'last_run_time': current_time
            })

            # 执行任务逻辑
@@ -98,7 +113,7 @@ class TaskScheduler:
                'run_count': task['run_count'] + 1,
                'next_run_time': next_run_time
            })
-            task_log.info(f"任务执行成功: {task['task_name']}")
+            task_log.info(f"任务执行成功: {task_name}")
            return True

        except Exception as e:
@@ -107,23 +122,35 @@ class TaskScheduler:
            # 失败时计算下次重试时间（15分钟后）
            next_retry_time = datetime.now() + pd.Timedelta(minutes=15)

-            self._update_task_status(task_id, {
-                'last_run_status': 'failed',
-                'is_running': 0,
-                'next_run_time': next_retry_time
-            })
+            # 即使任务执行失败，也要确保状态更新
+            try:
+                self._update_task_status(task_id, {
+                    'last_run_status': 'failed',
+                    'is_running': 0,
+                    'next_run_time': next_retry_time
+                })
+            except Exception as update_err:
+                task_log.error(f"任务失败后状态更新失败: {str(update_err)}", exc_info=True)
+
            return False

    def _execute_task_logic(self, task: Dict[str, Any]) -> None:
        """执行任务的具体逻辑（动态导入模块）"""
        start_time = time.time()
-        task_log = log.bind(task_id=task['task_id'], module=task['module_path'])
+        task_id = task['task_id']
+        module_path = task['module_path']
+        task_log = log.bind(task_id=task_id, module=module_path)

        try:
-            # 动态导入任务模块
-            module = importlib.import_module(task['module_path'])
-            if not hasattr(module, 'main'):
-                raise ImportError(f"模块 {task['module_path']} 中未找到 main() 函数")
+            # 动态导入任务模块（增加模块存在性检查）
+            try:
+                module = importlib.import_module(module_path)
+            except ImportError as e:
+                raise ImportError(f"模块 {module_path} 导入失败: {str(e)}")
+
+            # 检查main函数是否存在
+            if not hasattr(module, 'main') or not callable(module.main):
+                raise AttributeError(f"模块 {module_path} 中未找到可调用的 main() 函数")

            task_log.debug("开始执行模块中的 main() 函数")
            module.main()  # 调用任务主函数
@@ -137,7 +164,7 @@ class TaskScheduler:
        """基于Cron表达式计算下次运行时间"""
        try:
            tz = pytz.timezone(time_zone)
-            now = datetime.now(tz)
+            now = datetime.now(tz)  # 使用任务指定时区的当前时间
            cron = croniter.croniter(cron_expr, now)
            next_run = cron.get_next(datetime)
            return next_run.replace(tzinfo=None)  # 移除时区信息，适应数据库存储
@@ -146,12 +173,26 @@ class TaskScheduler:
            raise ValueError(f"无效的Cron表达式: {cron_expr}")

    def _update_task_status(self, task_id: int, updates: Dict[str, Any]) -> None:
-        """更新任务状态到数据库"""
-        set_clause = ", ".join([f"{k}=%s" for k in updates.keys()])
+        """更新任务状态到数据库（适配SQLAlchemy的参数传递方式）"""
+        if not updates:
+            log.warning(f"任务ID {task_id} 未提供任何更新字段")
+            return
+
+        # 构建UPDATE语句（确保字段名安全）
+        valid_fields = {'is_running', 'last_run_time', 'last_run_status',
+                        'run_count', 'next_run_time', 'updated_at'}
+        filtered_updates = {k: v for k, v in updates.items() if k in valid_fields}
+
+        if not filtered_updates:
+            log.warning(f"任务ID {task_id} 没有有效的更新字段")
+            return
+
+        set_clause = ", ".join([f"{k}=%s" for k in filtered_updates.keys()])
        sql = f"UPDATE main_task SET {set_clause}, updated_at=NOW() WHERE task_id=%s"
-        params = list(updates.values()) + [task_id]
+        params = list(filtered_updates.values()) + [task_id]

        try:
+            # 执行更新并获取受影响的行数
            affected_rows = self.db.execute_sql(sql, params=params)
            if affected_rows != 1:
                log.warning(
@@ -160,51 +201,104 @@ class TaskScheduler:
                    预期影响行数=1,
                    实际影响行数=affected_rows
                )
+        except SQLAlchemyError as e:
+            log.error(f"任务状态更新失败（数据库错误），task_id: {task_id}", exc_info=True)
+            raise
        except Exception as e:
            log.error(f"任务状态更新失败，task_id: {task_id}", exc_info=True)
            raise

    def add_task(self,
-                task_name: str,
-                task_type: str,
-                module_path: str,
-                cron_expression: str,
-                time_zone: str = 'Asia/Shanghai') -> int:
+                 task_name: str,
+                 task_type: str,
+                 module_path: str,
+                 cron_expression: str,
+                 time_zone: str = 'Asia/Shanghai') -> int:
        """添加新的Cron任务"""
        if not cron_expression:
            raise ValueError("Cron表达式不能为空")

+        # 验证模块是否存在（提前检查，避免添加无效任务）
+        try:
+            importlib.import_module(module_path)
+        except ImportError as e:
+            raise ValueError(f"模块 {module_path} 不存在: {str(e)}")
+
        # 计算首次运行时间
        first_run_time = self._calculate_next_run_time(cron_expression, time_zone)

        # 插入数据库
        sql = """
-        INSERT INTO main_task 
-        (task_name, task_type, module_path, cron_expression, time_zone, 
-         next_run_time, is_active)
-        VALUES (%s, %s, %s, %s, %s, %s, 1)
-        """
+              INSERT INTO main_task
+              (task_name, task_type, module_path, cron_expression, time_zone,
+               next_run_time, is_active, created_at, updated_at)
+              VALUES (%s, %s, %s, %s, %s, %s, 1, NOW(), NOW()) \
+              """
        params = (task_name, task_type, module_path, cron_expression, time_zone, first_run_time)

        try:
            self.db.execute_sql(sql, params=params)
-            task_id = self.db.query_to_df("SELECT LAST_INSERT_ID() AS id").iloc[0]['id']
+            # 获取插入的任务ID
+            result_df = self.db.query_to_df("SELECT LAST_INSERT_ID() AS id")
+            if result_df.empty or 'id' not in result_df.columns:
+                raise ValueError("无法获取新添加任务的ID")
+
+            task_id = result_df.iloc[0]['id']
            log.info(
-                f"新任务添加成功",
+                "新任务添加成功",
                task_id=task_id,
                task_name=task_name,
                cron表达式=cron_expression,
-                首次运行时间=first_run_time
+                首次运行时间=first_run_time.strftime('%Y-%m-%d %H:%M:%S')
            )
            return task_id
+        except SQLAlchemyError as e:
+            log.error(f"添加任务失败（数据库错误）: {task_name}", exc_info=True)
+            raise
        except Exception as e:
            log.error(f"添加任务失败: {task_name}", exc_info=True)
            raise

    def get_pending_tasks_count(self) -> int:
-        """获取当前等待执行的任务数量"""
-        result = self.db.query_to_df("""
-            SELECT COUNT(*) AS count FROM main_task 
-            WHERE is_active = 1 AND next_run_time <= %s
-        """, params=(datetime.now(),))
-        return result.iloc[0]['count'] if not result.empty else 0
+        """获取待执行任务数量（用于优雅关闭）"""
+        try:
+            tz = pytz.timezone('Asia/Shanghai')
+            now = datetime.now(tz).replace(tzinfo=None)
+            sql = """
+                  SELECT COUNT(*) as cnt 
+                  FROM main_task 
+                  WHERE is_active = 1 
+                    AND next_run_time <= %s 
+                    AND is_running = 0
+                  """
+            df = self.db.query_to_df(sql, params=(now,))
+            return df['cnt'].iloc[0] if not df.empty else 0
+        except Exception as e:
+            log.error(f"查询待执行任务数量失败: {str(e)}", exc_info=True)
+            return 0  # 出错时返回0，避免影响关闭流程
+
+    def get_pending_tasks(self) -> List[Dict[str, Any]]:
+        """查询所有待执行任务（兼容原有逻辑）"""
+        try:
+            tz = pytz.timezone('Asia/Shanghai')
+            now = datetime.now(tz).replace(tzinfo=None)
+            sql = """
+                  SELECT * 
+                  FROM main_task 
+                  WHERE is_active = 1 
+                    AND next_run_time <= %s 
+                    AND is_running = 0
+                  ORDER BY next_run_time
+                  """
+            tasks_df = self.db.query_to_df(sql, params=(now,))
+
+            if tasks_df.empty:
+                log.info("当前任务列表为空，等待新任务加入...")
+                return []
+
+            log.info(f"查询到{len(tasks_df)}个待执行任务")
+            return tasks_df.to_dict('records')
+
+        except Exception as e:
+            log.error(f"查询待执行任务失败，将重试: {str(e)}", exc_info=True)
+            return []