import os import json import time import uuid import logging from datetime import datetime, timedelta from flask import Blueprint, request, jsonify, current_app from utils.db_manager import DatabaseManager from utils.sensitive_filter import filter_dict from utils.cache_manager import CacheManager workflow_bp = Blueprint('workflow', __name__, url_prefix='/api/workflow') logger = logging.getLogger('workflow_api') logger.setLevel(logging.INFO) # 缓存管理器 workflow_cache = CacheManager(name="workflows", memory_capacity=100, cache_duration=1) # 默认爬虫配置模板 DEFAULT_CRAWLER_TEMPLATES = [ { "id": "default_weibo", "name": "微博热门话题", "description": "抓取微博热门话题及相关评论", "icon": "fab fa-weibo", "config": { "source": "weibo", "crawlDepth": 2, "interval": 3600, "maxRetries": 3, "timeout": 30, "maxConcurrent": 2, "userAgent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36", "filters": { "minComments": 10, "minLikes": 50, "excludeKeywords": [] } } }, { "id": "weibo_trending", "name": "微博热搜榜", "description": "抓取微博热搜榜单内容", "icon": "fas fa-fire", "config": { "source": "weibo_trending", "crawlDepth": 1, "interval": 1800, "maxRetries": 3, "timeout": 20, "maxConcurrent": 1, "filters": { "topN": 50, "excludeKeywords": [] } } } ] # 默认分析流程模板 DEFAULT_ANALYSIS_TEMPLATES = [ { "id": "sentiment_analysis", "name": "情感分析流程", "description": "对文本进行情感分析", "icon": "fas fa-smile", "components": [ { "id": "data_source", "type": "data_source", "name": "数据源", "config": { "source_type": "database", "table": "comments", "filter": { "timeRange": "1d" } }, "position": {"x": 100, "y": 100} }, { "id": "text_preprocessing", "type": "preprocessing", "name": "文本预处理", "config": { "removeStopwords": True, "removeURLs": True, "removeEmojis": False }, "position": {"x": 300, "y": 100} }, { "id": "sentiment_model", "type": "model", "name": "情感分析模型", "config": { "model_type": "sentiment", "api": "openai", "optimize_for": "balanced" }, "position": {"x": 500, "y": 100} }, { "id": "visualization", "type": "visualization", "name": "可视化", "config": { "chart_type": "pie", "title": "情感分布" }, "position": {"x": 700, "y": 100} } ], "connections": [ {"source": "data_source", "target": "text_preprocessing"}, {"source": "text_preprocessing", "target": "sentiment_model"}, {"source": "sentiment_model", "target": "visualization"} ] }, { "id": "topic_analysis", "name": "话题分析流程", "description": "对文本进行话题分类和关键词提取", "icon": "fas fa-tasks", "components": [ { "id": "data_source", "type": "data_source", "name": "数据源", "config": { "source_type": "database", "table": "weibo_posts", "filter": { "timeRange": "7d" } }, "position": {"x": 100, "y": 100} }, { "id": "text_preprocessing", "type": "preprocessing", "name": "文本预处理", "config": { "removeStopwords": True, "removeURLs": True, "removeEmojis": True }, "position": {"x": 300, "y": 100} }, { "id": "topic_model", "type": "model", "name": "话题分类模型", "config": { "model_type": "topic_classification", "api": "deepseek", "optimize_for": "performance" }, "position": {"x": 500, "y": 50} }, { "id": "keyword_model", "type": "model", "name": "关键词提取模型", "config": { "model_type": "keyword_extraction", "api": "openai", "optimize_for": "balanced" }, "position": {"x": 500, "y": 150} }, { "id": "topic_viz", "type": "visualization", "name": "话题分布", "config": { "chart_type": "bar", "title": "话题分布" }, "position": {"x": 700, "y": 50} }, { "id": "keyword_viz", "type": "visualization", "name": "关键词云", "config": { "chart_type": "wordcloud", "title": "热门关键词" }, "position": {"x": 700, "y": 150} } ], "connections": [ {"source": "data_source", "target": "text_preprocessing"}, {"source": "text_preprocessing", "target": "topic_model"}, {"source": "text_preprocessing", "target": "keyword_model"}, {"source": "topic_model", "target": "topic_viz"}, {"source": "keyword_model", "target": "keyword_viz"} ] } ] # 默认可用组件 AVAILABLE_COMPONENTS = { "data_source": [ { "id": "database", "name": "数据库", "description": "从系统数据库获取数据", "config_schema": { "table": {"type": "string", "description": "数据表名", "required": True}, "filter": {"type": "object", "description": "数据过滤条件"} } }, { "id": "api", "name": "API接口", "description": "从外部API获取数据", "config_schema": { "url": {"type": "string", "description": "API URL", "required": True}, "method": {"type": "string", "description": "请求方法", "default": "GET"}, "headers": {"type": "object", "description": "请求头"}, "params": {"type": "object", "description": "请求参数"} } }, { "id": "csv", "name": "CSV文件", "description": "从CSV文件导入数据", "config_schema": { "file_path": {"type": "string", "description": "文件路径", "required": True}, "encoding": {"type": "string", "description": "文件编码", "default": "utf-8"}, "delimiter": {"type": "string", "description": "分隔符", "default": ","} } } ], "preprocessing": [ { "id": "text_preprocessing", "name": "文本预处理", "description": "清洗和规范化文本数据", "config_schema": { "removeStopwords": {"type": "boolean", "description": "去除停用词", "default": True}, "removeURLs": {"type": "boolean", "description": "去除URL", "default": True}, "removeEmojis": {"type": "boolean", "description": "去除表情符号", "default": False}, "lowercase": {"type": "boolean", "description": "转为小写", "default": True} } }, { "id": "tokenization", "name": "分词", "description": "将文本切分为词语或标记", "config_schema": { "method": {"type": "string", "description": "分词方法", "default": "jieba"}, "pos_tagging": {"type": "boolean", "description": "进行词性标注", "default": False} } }, { "id": "feature_extraction", "name": "特征提取", "description": "从文本提取数值特征", "config_schema": { "method": {"type": "string", "description": "特征提取方法", "default": "tfidf"}, "max_features": {"type": "integer", "description": "最大特征数", "default": 1000} } } ], "model": [ { "id": "sentiment", "name": "情感分析", "description": "分析文本情感倾向", "config_schema": { "api": {"type": "string", "description": "使用的API", "default": "openai"}, "model_type": {"type": "string", "description": "模型类型", "default": "sentiment_analysis"}, "optimize_for": {"type": "string", "description": "优化目标", "default": "balanced"} } }, { "id": "topic_classification", "name": "话题分类", "description": "对文本进行话题分类", "config_schema": { "api": {"type": "string", "description": "使用的API", "default": "deepseek"}, "model_type": {"type": "string", "description": "模型类型", "default": "topic_classification"}, "optimize_for": {"type": "string", "description": "优化目标", "default": "performance"} } }, { "id": "keyword_extraction", "name": "关键词提取", "description": "从文本中提取关键词", "config_schema": { "api": {"type": "string", "description": "使用的API", "default": "openai"}, "model_type": {"type": "string", "description": "模型类型", "default": "keyword_extraction"}, "optimize_for": {"type": "string", "description": "优化目标", "default": "balanced"} } }, { "id": "custom_ai", "name": "自定义AI模型", "description": "使用自定义AI模型进行分析", "config_schema": { "model_path": {"type": "string", "description": "模型路径", "required": True}, "model_type": {"type": "string", "description": "模型类型", "required": True} } } ], "visualization": [ { "id": "line_chart", "name": "折线图", "description": "展示数据随时间的变化趋势", "config_schema": { "title": {"type": "string", "description": "图表标题", "default": "时间趋势"}, "x_axis": {"type": "string", "description": "X轴字段", "default": "time"}, "y_axis": {"type": "string", "description": "Y轴字段", "default": "value"}, "color": {"type": "string", "description": "线条颜色", "default": "#1890ff"} } }, { "id": "bar_chart", "name": "柱状图", "description": "展示不同类别的数据对比", "config_schema": { "title": {"type": "string", "description": "图表标题", "default": "数据对比"}, "x_axis": {"type": "string", "description": "X轴字段", "default": "category"}, "y_axis": {"type": "string", "description": "Y轴字段", "default": "value"} } }, { "id": "pie_chart", "name": "饼图", "description": "展示数据的构成比例", "config_schema": { "title": {"type": "string", "description": "图表标题", "default": "比例分布"}, "value_field": {"type": "string", "description": "值字段", "default": "value"}, "label_field": {"type": "string", "description": "标签字段", "default": "label"} } }, { "id": "wordcloud", "name": "词云图", "description": "直观展示文本中的高频词", "config_schema": { "title": {"type": "string", "description": "图表标题", "default": "关键词云"}, "max_words": {"type": "integer", "description": "最大词数", "default": 100}, "color_scheme": {"type": "string", "description": "配色方案", "default": "viridis"} } }, { "id": "heatmap", "name": "热力图", "description": "展示数据的密度分布", "config_schema": { "title": {"type": "string", "description": "图表标题", "default": "热力分布"}, "x_axis": {"type": "string", "description": "X轴字段", "default": "x"}, "y_axis": {"type": "string", "description": "Y轴字段", "default": "y"}, "value_field": {"type": "string", "description": "值字段", "default": "value"} } } ] } @workflow_bp.route('/crawler-templates', methods=['GET']) def get_crawler_templates(): """获取爬虫配置模板列表""" # 从缓存获取 templates = workflow_cache.get('crawler_templates') if templates is None: # 从数据库获取用户定义的模板 db = DatabaseManager.get_connection() cursor = db.cursor() cursor.execute(""" SELECT id, name, description, icon, config FROM crawler_templates WHERE deleted = 0 ORDER BY created_at DESC """) user_templates = cursor.fetchall() cursor.close() # 结合默认模板 templates = DEFAULT_CRAWLER_TEMPLATES + list(user_templates) # 缓存结果 workflow_cache.set('crawler_templates', templates) return jsonify({ 'success': True, 'data': filter_dict(templates) }) @workflow_bp.route('/crawler-templates/', methods=['GET']) def get_crawler_template(template_id): """获取指定爬虫配置模板""" # 查找默认模板 for template in DEFAULT_CRAWLER_TEMPLATES: if template['id'] == template_id: return jsonify({ 'success': True, 'data': filter_dict(template) }) # 从数据库查找用户模板 db = DatabaseManager.get_connection() cursor = db.cursor() cursor.execute(""" SELECT id, name, description, icon, config FROM crawler_templates WHERE id = %s AND deleted = 0 """, (template_id,)) template = cursor.fetchone() cursor.close() if not template: return jsonify({ 'success': False, 'message': f"未找到模板: {template_id}" }), 404 return jsonify({ 'success': True, 'data': filter_dict(template) }) @workflow_bp.route('/crawler-templates', methods=['POST']) def create_crawler_template(): """创建爬虫配置模板""" data = request.json required_fields = ['name', 'description', 'config'] # 验证必要字段 for field in required_fields: if field not in data: return jsonify({ 'success': False, 'message': f"缺少必要字段: {field}" }), 400 # 生成ID template_id = f"template_{int(time.time())}_{uuid.uuid4().hex[:8]}" # 准备数据 template = { 'id': template_id, 'name': data['name'], 'description': data['description'], 'icon': data.get('icon', 'fas fa-spider'), 'config': data['config'], 'created_at': datetime.now().strftime('%Y-%m-%d %H:%M:%S'), 'updated_at': datetime.now().strftime('%Y-%m-%d %H:%M:%S'), 'deleted': 0 } # 保存到数据库 db = DatabaseManager.get_connection() cursor = db.cursor() try: cursor.execute(""" INSERT INTO crawler_templates (id, name, description, icon, config, created_at, updated_at, deleted) VALUES (%s, %s, %s, %s, %s, %s, %s, %s) """, ( template['id'], template['name'], template['description'], template['icon'], json.dumps(template['config']), template['created_at'], template['updated_at'], template['deleted'] )) db.commit() # 清除缓存 workflow_cache.invalidate('crawler_templates') return jsonify({ 'success': True, 'data': filter_dict(template) }), 201 except Exception as e: db.rollback() logger.error(f"创建爬虫模板失败: {e}") return jsonify({ 'success': False, 'message': f"创建模板失败: {str(e)}" }), 500 finally: cursor.close() @workflow_bp.route('/crawler-templates/', methods=['PUT']) def update_crawler_template(template_id): """更新爬虫配置模板""" data = request.json # 验证模板是否存在 db = DatabaseManager.get_connection() cursor = db.cursor() cursor.execute(""" SELECT id FROM crawler_templates WHERE id = %s AND deleted = 0 """, (template_id,)) exists = cursor.fetchone() if not exists: cursor.close() return jsonify({ 'success': False, 'message': f"未找到模板: {template_id}" }), 404 # 准备更新数据 update_data = {} if 'name' in data: update_data['name'] = data['name'] if 'description' in data: update_data['description'] = data['description'] if 'icon' in data: update_data['icon'] = data['icon'] if 'config' in data: update_data['config'] = json.dumps(data['config']) update_data['updated_at'] = datetime.now().strftime('%Y-%m-%d %H:%M:%S') # 构建SQL语句 sql = "UPDATE crawler_templates SET " sql += ", ".join([f"{key} = %s" for key in update_data.keys()]) sql += " WHERE id = %s" # 执行更新 try: cursor.execute(sql, list(update_data.values()) + [template_id]) db.commit() # 清除缓存 workflow_cache.invalidate('crawler_templates') return jsonify({ 'success': True, 'message': "模板更新成功" }) except Exception as e: db.rollback() logger.error(f"更新爬虫模板失败: {e}") return jsonify({ 'success': False, 'message': f"更新模板失败: {str(e)}" }), 500 finally: cursor.close() @workflow_bp.route('/crawler-templates/', methods=['DELETE']) def delete_crawler_template(template_id): """删除爬虫配置模板""" # 验证模板是否存在 db = DatabaseManager.get_connection() cursor = db.cursor() cursor.execute(""" SELECT id FROM crawler_templates WHERE id = %s AND deleted = 0 """, (template_id,)) exists = cursor.fetchone() if not exists: cursor.close() return jsonify({ 'success': False, 'message': f"未找到模板: {template_id}" }), 404 # 软删除 try: cursor.execute(""" UPDATE crawler_templates SET deleted = 1, updated_at = %s WHERE id = %s """, (datetime.now().strftime('%Y-%m-%d %H:%M:%S'), template_id)) db.commit() # 清除缓存 workflow_cache.invalidate('crawler_templates') return jsonify({ 'success': True, 'message': "模板删除成功" }) except Exception as e: db.rollback() logger.error(f"删除爬虫模板失败: {e}") return jsonify({ 'success': False, 'message': f"删除模板失败: {str(e)}" }), 500 finally: cursor.close() @workflow_bp.route('/analysis-templates', methods=['GET']) def get_analysis_templates(): """获取分析流程模板列表""" # 从缓存获取 templates = workflow_cache.get('analysis_templates') if templates is None: # 从数据库获取用户定义的模板 db = DatabaseManager.get_connection() cursor = db.cursor() cursor.execute(""" SELECT id, name, description, icon, components, connections FROM analysis_templates WHERE deleted = 0 ORDER BY created_at DESC """) user_templates = cursor.fetchall() cursor.close() # 结合默认模板 templates = DEFAULT_ANALYSIS_TEMPLATES + list(user_templates) # 缓存结果 workflow_cache.set('analysis_templates', templates) return jsonify({ 'success': True, 'data': filter_dict(templates) }) @workflow_bp.route('/analysis-templates/', methods=['GET']) def get_analysis_template(template_id): """获取指定分析流程模板""" # 查找默认模板 for template in DEFAULT_ANALYSIS_TEMPLATES: if template['id'] == template_id: return jsonify({ 'success': True, 'data': filter_dict(template) }) # 从数据库查找用户模板 db = DatabaseManager.get_connection() cursor = db.cursor() cursor.execute(""" SELECT id, name, description, icon, components, connections FROM analysis_templates WHERE id = %s AND deleted = 0 """, (template_id,)) template = cursor.fetchone() cursor.close() if not template: return jsonify({ 'success': False, 'message': f"未找到模板: {template_id}" }), 404 return jsonify({ 'success': True, 'data': filter_dict(template) }) @workflow_bp.route('/analysis-templates', methods=['POST']) def create_analysis_template(): """创建分析流程模板""" data = request.json required_fields = ['name', 'description', 'components', 'connections'] # 验证必要字段 for field in required_fields: if field not in data: return jsonify({ 'success': False, 'message': f"缺少必要字段: {field}" }), 400 # 生成ID template_id = f"template_{int(time.time())}_{uuid.uuid4().hex[:8]}" # 准备数据 template = { 'id': template_id, 'name': data['name'], 'description': data['description'], 'icon': data.get('icon', 'fas fa-chart-line'), 'components': data['components'], 'connections': data['connections'], 'created_at': datetime.now().strftime('%Y-%m-%d %H:%M:%S'), 'updated_at': datetime.now().strftime('%Y-%m-%d %H:%M:%S'), 'deleted': 0 } # 保存到数据库 db = DatabaseManager.get_connection() cursor = db.cursor() try: cursor.execute(""" INSERT INTO analysis_templates (id, name, description, icon, components, connections, created_at, updated_at, deleted) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s) """, ( template['id'], template['name'], template['description'], template['icon'], json.dumps(template['components']), json.dumps(template['connections']), template['created_at'], template['updated_at'], template['deleted'] )) db.commit() # 清除缓存 workflow_cache.invalidate('analysis_templates') return jsonify({ 'success': True, 'data': filter_dict(template) }), 201 except Exception as e: db.rollback() logger.error(f"创建分析模板失败: {e}") return jsonify({ 'success': False, 'message': f"创建模板失败: {str(e)}" }), 500 finally: cursor.close() @workflow_bp.route('/analysis-templates/', methods=['PUT']) def update_analysis_template(template_id): """更新分析流程模板""" data = request.json # 验证模板是否存在 db = DatabaseManager.get_connection() cursor = db.cursor() cursor.execute(""" SELECT id FROM analysis_templates WHERE id = %s AND deleted = 0 """, (template_id,)) exists = cursor.fetchone() if not exists: cursor.close() return jsonify({ 'success': False, 'message': f"未找到模板: {template_id}" }), 404 # 准备更新数据 update_data = {} if 'name' in data: update_data['name'] = data['name'] if 'description' in data: update_data['description'] = data['description'] if 'icon' in data: update_data['icon'] = data['icon'] if 'components' in data: update_data['components'] = json.dumps(data['components']) if 'connections' in data: update_data['connections'] = json.dumps(data['connections']) update_data['updated_at'] = datetime.now().strftime('%Y-%m-%d %H:%M:%S') # 构建SQL语句 sql = "UPDATE analysis_templates SET " sql += ", ".join([f"{key} = %s" for key in update_data.keys()]) sql += " WHERE id = %s" # 执行更新 try: cursor.execute(sql, list(update_data.values()) + [template_id]) db.commit() # 清除缓存 workflow_cache.invalidate('analysis_templates') return jsonify({ 'success': True, 'message': "模板更新成功" }) except Exception as e: db.rollback() logger.error(f"更新分析模板失败: {e}") return jsonify({ 'success': False, 'message': f"更新模板失败: {str(e)}" }), 500 finally: cursor.close() @workflow_bp.route('/analysis-templates/', methods=['DELETE']) def delete_analysis_template(template_id): """删除分析流程模板""" # 验证模板是否存在 db = DatabaseManager.get_connection() cursor = db.cursor() cursor.execute(""" SELECT id FROM analysis_templates WHERE id = %s AND deleted = 0 """, (template_id,)) exists = cursor.fetchone() if not exists: cursor.close() return jsonify({ 'success': False, 'message': f"未找到模板: {template_id}" }), 404 # 软删除 try: cursor.execute(""" UPDATE analysis_templates SET deleted = 1, updated_at = %s WHERE id = %s """, (datetime.now().strftime('%Y-%m-%d %H:%M:%S'), template_id)) db.commit() # 清除缓存 workflow_cache.invalidate('analysis_templates') return jsonify({ 'success': True, 'message': "模板删除成功" }) except Exception as e: db.rollback() logger.error(f"删除分析模板失败: {e}") return jsonify({ 'success': False, 'message': f"删除模板失败: {str(e)}" }), 500 finally: cursor.close() @workflow_bp.route('/components', methods=['GET']) def get_available_components(): """获取可用组件列表""" return jsonify({ 'success': True, 'data': filter_dict(AVAILABLE_COMPONENTS) }) @workflow_bp.route('/run-workflow', methods=['POST']) def run_workflow(): """执行工作流""" data = request.json # 验证必要字段 if 'components' not in data or 'connections' not in data: return jsonify({ 'success': False, 'message': "缺少必要字段: components 或 connections" }), 400 # 这里是执行工作流逻辑的占位符 # 实际实现需要根据组件类型和连接关系建立执行计划并执行 # 记录执行请求 logger.info(f"收到工作流执行请求,组件数量: {len(data['components'])}, 连接数量: {len(data['connections'])}") # 创建任务ID task_id = f"task_{int(time.time())}_{uuid.uuid4().hex[:8]}" # 返回任务ID return jsonify({ 'success': True, 'message': "工作流执行请求已提交", 'data': { 'task_id': task_id, 'status': 'pending' } }) @workflow_bp.route('/task-status/', methods=['GET']) def get_task_status(task_id): """获取任务执行状态""" # 这里是获取任务状态的占位符 # 实际实现需要查询任务执行状态 # 示例状态 status = { 'task_id': task_id, 'status': 'running', 'progress': 45, 'message': "正在执行数据预处理", 'started_at': (datetime.now() - timedelta(minutes=2)).strftime('%Y-%m-%d %H:%M:%S'), 'updated_at': datetime.now().strftime('%Y-%m-%d %H:%M:%S') } return jsonify({ 'success': True, 'data': status })