Files
bettafish-company/views/workflow_api.py
T

920 lines
30 KiB
Python

import os
import json
import time
import uuid
import logging
from datetime import datetime, timedelta
from flask import Blueprint, request, jsonify, current_app, render_template
from utils.db_manager import DatabaseManager
from utils.sensitive_filter import filter_dict
from utils.cache_manager import CacheManager
# 创建两个蓝图:一个用于页面渲染,一个用于API
workflow_bp = Blueprint('workflow', __name__, url_prefix='/workflow')
workflow_api_bp = Blueprint('workflow_api', __name__, url_prefix='/api/workflow')
logger = logging.getLogger('workflow_api')
logger.setLevel(logging.INFO)
# 缓存管理器
workflow_cache = CacheManager(name="workflows", memory_capacity=100, cache_duration=1)
# 添加工作流编辑器页面路由
@workflow_bp.route('/editor', methods=['GET'])
def workflow_editor():
"""渲染工作流编辑器页面"""
return render_template('workflow_editor.html')
# 默认爬虫配置模板
DEFAULT_CRAWLER_TEMPLATES = [
{
"id": "default_weibo",
"name": "微博热门话题",
"description": "抓取微博热门话题及相关评论",
"icon": "fab fa-weibo",
"config": {
"source": "weibo",
"crawlDepth": 2,
"interval": 3600,
"maxRetries": 3,
"timeout": 30,
"maxConcurrent": 2,
"userAgent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
"filters": {
"minComments": 10,
"minLikes": 50,
"excludeKeywords": []
}
}
},
{
"id": "weibo_trending",
"name": "微博热搜榜",
"description": "抓取微博热搜榜单内容",
"icon": "fas fa-fire",
"config": {
"source": "weibo_trending",
"crawlDepth": 1,
"interval": 1800,
"maxRetries": 3,
"timeout": 20,
"maxConcurrent": 1,
"filters": {
"topN": 50,
"excludeKeywords": []
}
}
}
]
# 默认分析流程模板
DEFAULT_ANALYSIS_TEMPLATES = [
{
"id": "sentiment_analysis",
"name": "情感分析流程",
"description": "对文本进行情感分析",
"icon": "fas fa-smile",
"components": [
{
"id": "data_source",
"type": "data_source",
"name": "数据源",
"config": {
"source_type": "database",
"table": "comments",
"filter": {
"timeRange": "1d"
}
},
"position": {"x": 100, "y": 100}
},
{
"id": "text_preprocessing",
"type": "preprocessing",
"name": "文本预处理",
"config": {
"removeStopwords": True,
"removeURLs": True,
"removeEmojis": False
},
"position": {"x": 300, "y": 100}
},
{
"id": "sentiment_model",
"type": "model",
"name": "情感分析模型",
"config": {
"model_type": "sentiment",
"api": "openai",
"optimize_for": "balanced"
},
"position": {"x": 500, "y": 100}
},
{
"id": "visualization",
"type": "visualization",
"name": "可视化",
"config": {
"chart_type": "pie",
"title": "情感分布"
},
"position": {"x": 700, "y": 100}
}
],
"connections": [
{"source": "data_source", "target": "text_preprocessing"},
{"source": "text_preprocessing", "target": "sentiment_model"},
{"source": "sentiment_model", "target": "visualization"}
]
},
{
"id": "topic_analysis",
"name": "话题分析流程",
"description": "对文本进行话题分类和关键词提取",
"icon": "fas fa-tasks",
"components": [
{
"id": "data_source",
"type": "data_source",
"name": "数据源",
"config": {
"source_type": "database",
"table": "weibo_posts",
"filter": {
"timeRange": "7d"
}
},
"position": {"x": 100, "y": 100}
},
{
"id": "text_preprocessing",
"type": "preprocessing",
"name": "文本预处理",
"config": {
"removeStopwords": True,
"removeURLs": True,
"removeEmojis": True
},
"position": {"x": 300, "y": 100}
},
{
"id": "topic_model",
"type": "model",
"name": "话题分类模型",
"config": {
"model_type": "topic_classification",
"api": "deepseek",
"optimize_for": "performance"
},
"position": {"x": 500, "y": 50}
},
{
"id": "keyword_model",
"type": "model",
"name": "关键词提取模型",
"config": {
"model_type": "keyword_extraction",
"api": "openai",
"optimize_for": "balanced"
},
"position": {"x": 500, "y": 150}
},
{
"id": "topic_viz",
"type": "visualization",
"name": "话题分布",
"config": {
"chart_type": "bar",
"title": "话题分布"
},
"position": {"x": 700, "y": 50}
},
{
"id": "keyword_viz",
"type": "visualization",
"name": "关键词云",
"config": {
"chart_type": "wordcloud",
"title": "热门关键词"
},
"position": {"x": 700, "y": 150}
}
],
"connections": [
{"source": "data_source", "target": "text_preprocessing"},
{"source": "text_preprocessing", "target": "topic_model"},
{"source": "text_preprocessing", "target": "keyword_model"},
{"source": "topic_model", "target": "topic_viz"},
{"source": "keyword_model", "target": "keyword_viz"}
]
}
]
# 默认可用组件
AVAILABLE_COMPONENTS = {
"data_source": [
{
"id": "database",
"name": "数据库",
"description": "从系统数据库获取数据",
"config_schema": {
"table": {"type": "string", "description": "数据表名", "required": True},
"filter": {"type": "object", "description": "数据过滤条件"}
}
},
{
"id": "api",
"name": "API接口",
"description": "从外部API获取数据",
"config_schema": {
"url": {"type": "string", "description": "API URL", "required": True},
"method": {"type": "string", "description": "请求方法", "default": "GET"},
"headers": {"type": "object", "description": "请求头"},
"params": {"type": "object", "description": "请求参数"}
}
},
{
"id": "csv",
"name": "CSV文件",
"description": "从CSV文件导入数据",
"config_schema": {
"file_path": {"type": "string", "description": "文件路径", "required": True},
"encoding": {"type": "string", "description": "文件编码", "default": "utf-8"},
"delimiter": {"type": "string", "description": "分隔符", "default": ","}
}
}
],
"preprocessing": [
{
"id": "text_preprocessing",
"name": "文本预处理",
"description": "清洗和规范化文本数据",
"config_schema": {
"removeStopwords": {"type": "boolean", "description": "去除停用词", "default": True},
"removeURLs": {"type": "boolean", "description": "去除URL", "default": True},
"removeEmojis": {"type": "boolean", "description": "去除表情符号", "default": False},
"lowercase": {"type": "boolean", "description": "转为小写", "default": True}
}
},
{
"id": "tokenization",
"name": "分词",
"description": "将文本切分为词语或标记",
"config_schema": {
"method": {"type": "string", "description": "分词方法", "default": "jieba"},
"pos_tagging": {"type": "boolean", "description": "进行词性标注", "default": False}
}
},
{
"id": "feature_extraction",
"name": "特征提取",
"description": "从文本提取数值特征",
"config_schema": {
"method": {"type": "string", "description": "特征提取方法", "default": "tfidf"},
"max_features": {"type": "integer", "description": "最大特征数", "default": 1000}
}
}
],
"model": [
{
"id": "sentiment",
"name": "情感分析",
"description": "分析文本情感倾向",
"config_schema": {
"api": {"type": "string", "description": "使用的API", "default": "openai"},
"model_type": {"type": "string", "description": "模型类型", "default": "sentiment_analysis"},
"optimize_for": {"type": "string", "description": "优化目标", "default": "balanced"}
}
},
{
"id": "topic_classification",
"name": "话题分类",
"description": "对文本进行话题分类",
"config_schema": {
"api": {"type": "string", "description": "使用的API", "default": "deepseek"},
"model_type": {"type": "string", "description": "模型类型", "default": "topic_classification"},
"optimize_for": {"type": "string", "description": "优化目标", "default": "performance"}
}
},
{
"id": "keyword_extraction",
"name": "关键词提取",
"description": "从文本中提取关键词",
"config_schema": {
"api": {"type": "string", "description": "使用的API", "default": "openai"},
"model_type": {"type": "string", "description": "模型类型", "default": "keyword_extraction"},
"optimize_for": {"type": "string", "description": "优化目标", "default": "balanced"}
}
},
{
"id": "custom_ai",
"name": "自定义AI模型",
"description": "使用自定义AI模型进行分析",
"config_schema": {
"model_path": {"type": "string", "description": "模型路径", "required": True},
"model_type": {"type": "string", "description": "模型类型", "required": True}
}
}
],
"visualization": [
{
"id": "line_chart",
"name": "折线图",
"description": "展示数据随时间的变化趋势",
"config_schema": {
"title": {"type": "string", "description": "图表标题", "default": "时间趋势"},
"x_axis": {"type": "string", "description": "X轴字段", "default": "time"},
"y_axis": {"type": "string", "description": "Y轴字段", "default": "value"},
"color": {"type": "string", "description": "线条颜色", "default": "#1890ff"}
}
},
{
"id": "bar_chart",
"name": "柱状图",
"description": "展示不同类别的数据对比",
"config_schema": {
"title": {"type": "string", "description": "图表标题", "default": "数据对比"},
"x_axis": {"type": "string", "description": "X轴字段", "default": "category"},
"y_axis": {"type": "string", "description": "Y轴字段", "default": "value"}
}
},
{
"id": "pie_chart",
"name": "饼图",
"description": "展示数据的构成比例",
"config_schema": {
"title": {"type": "string", "description": "图表标题", "default": "比例分布"},
"value_field": {"type": "string", "description": "值字段", "default": "value"},
"label_field": {"type": "string", "description": "标签字段", "default": "label"}
}
},
{
"id": "wordcloud",
"name": "词云图",
"description": "直观展示文本中的高频词",
"config_schema": {
"title": {"type": "string", "description": "图表标题", "default": "关键词云"},
"max_words": {"type": "integer", "description": "最大词数", "default": 100},
"color_scheme": {"type": "string", "description": "配色方案", "default": "viridis"}
}
},
{
"id": "heatmap",
"name": "热力图",
"description": "展示数据的密度分布",
"config_schema": {
"title": {"type": "string", "description": "图表标题", "default": "热力分布"},
"x_axis": {"type": "string", "description": "X轴字段", "default": "x"},
"y_axis": {"type": "string", "description": "Y轴字段", "default": "y"},
"value_field": {"type": "string", "description": "值字段", "default": "value"}
}
}
]
}
@workflow_api_bp.route('/crawler-templates', methods=['GET'])
def get_crawler_templates():
"""获取爬虫配置模板列表"""
# 从缓存获取
templates = workflow_cache.get('crawler_templates')
if templates is None:
# 从数据库获取用户定义的模板
db = DatabaseManager.get_connection()
cursor = db.cursor()
cursor.execute("""
SELECT id, name, description, icon, config
FROM crawler_templates
WHERE deleted = 0
ORDER BY created_at DESC
""")
user_templates = cursor.fetchall()
cursor.close()
# 结合默认模板
templates = DEFAULT_CRAWLER_TEMPLATES + list(user_templates)
# 缓存结果
workflow_cache.set('crawler_templates', templates)
return jsonify({
'success': True,
'data': filter_dict(templates)
})
@workflow_api_bp.route('/crawler-templates/<template_id>', methods=['GET'])
def get_crawler_template(template_id):
"""获取指定爬虫配置模板"""
# 查找默认模板
for template in DEFAULT_CRAWLER_TEMPLATES:
if template['id'] == template_id:
return jsonify({
'success': True,
'data': filter_dict(template)
})
# 从数据库查找用户模板
db = DatabaseManager.get_connection()
cursor = db.cursor()
cursor.execute("""
SELECT id, name, description, icon, config
FROM crawler_templates
WHERE id = %s AND deleted = 0
""", (template_id,))
template = cursor.fetchone()
cursor.close()
if not template:
return jsonify({
'success': False,
'message': f"未找到模板: {template_id}"
}), 404
return jsonify({
'success': True,
'data': filter_dict(template)
})
@workflow_api_bp.route('/crawler-templates', methods=['POST'])
def create_crawler_template():
"""创建爬虫配置模板"""
data = request.json
required_fields = ['name', 'description', 'config']
# 验证必要字段
for field in required_fields:
if field not in data:
return jsonify({
'success': False,
'message': f"缺少必要字段: {field}"
}), 400
# 生成ID
template_id = f"template_{int(time.time())}_{uuid.uuid4().hex[:8]}"
# 准备数据
template = {
'id': template_id,
'name': data['name'],
'description': data['description'],
'icon': data.get('icon', 'fas fa-spider'),
'config': data['config'],
'created_at': datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
'updated_at': datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
'deleted': 0
}
# 保存到数据库
db = DatabaseManager.get_connection()
cursor = db.cursor()
try:
cursor.execute("""
INSERT INTO crawler_templates
(id, name, description, icon, config, created_at, updated_at, deleted)
VALUES (%s, %s, %s, %s, %s, %s, %s, %s)
""", (
template['id'],
template['name'],
template['description'],
template['icon'],
json.dumps(template['config']),
template['created_at'],
template['updated_at'],
template['deleted']
))
db.commit()
# 清除缓存
workflow_cache.invalidate('crawler_templates')
return jsonify({
'success': True,
'data': filter_dict(template)
}), 201
except Exception as e:
db.rollback()
logger.error(f"创建爬虫模板失败: {e}")
return jsonify({
'success': False,
'message': f"创建模板失败: {str(e)}"
}), 500
finally:
cursor.close()
@workflow_api_bp.route('/crawler-templates/<template_id>', methods=['PUT'])
def update_crawler_template(template_id):
"""更新爬虫配置模板"""
data = request.json
# 验证模板是否存在
db = DatabaseManager.get_connection()
cursor = db.cursor()
cursor.execute("""
SELECT id FROM crawler_templates
WHERE id = %s AND deleted = 0
""", (template_id,))
exists = cursor.fetchone()
if not exists:
cursor.close()
return jsonify({
'success': False,
'message': f"未找到模板: {template_id}"
}), 404
# 准备更新数据
update_data = {}
if 'name' in data:
update_data['name'] = data['name']
if 'description' in data:
update_data['description'] = data['description']
if 'icon' in data:
update_data['icon'] = data['icon']
if 'config' in data:
update_data['config'] = json.dumps(data['config'])
update_data['updated_at'] = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
# 构建SQL语句
sql = "UPDATE crawler_templates SET "
sql += ", ".join([f"{key} = %s" for key in update_data.keys()])
sql += " WHERE id = %s"
# 执行更新
try:
cursor.execute(sql, list(update_data.values()) + [template_id])
db.commit()
# 清除缓存
workflow_cache.invalidate('crawler_templates')
return jsonify({
'success': True,
'message': "模板更新成功"
})
except Exception as e:
db.rollback()
logger.error(f"更新爬虫模板失败: {e}")
return jsonify({
'success': False,
'message': f"更新模板失败: {str(e)}"
}), 500
finally:
cursor.close()
@workflow_api_bp.route('/crawler-templates/<template_id>', methods=['DELETE'])
def delete_crawler_template(template_id):
"""删除爬虫配置模板"""
# 验证模板是否存在
db = DatabaseManager.get_connection()
cursor = db.cursor()
cursor.execute("""
SELECT id FROM crawler_templates
WHERE id = %s AND deleted = 0
""", (template_id,))
exists = cursor.fetchone()
if not exists:
cursor.close()
return jsonify({
'success': False,
'message': f"未找到模板: {template_id}"
}), 404
# 软删除
try:
cursor.execute("""
UPDATE crawler_templates
SET deleted = 1, updated_at = %s
WHERE id = %s
""", (datetime.now().strftime('%Y-%m-%d %H:%M:%S'), template_id))
db.commit()
# 清除缓存
workflow_cache.invalidate('crawler_templates')
return jsonify({
'success': True,
'message': "模板删除成功"
})
except Exception as e:
db.rollback()
logger.error(f"删除爬虫模板失败: {e}")
return jsonify({
'success': False,
'message': f"删除模板失败: {str(e)}"
}), 500
finally:
cursor.close()
@workflow_api_bp.route('/analysis-templates', methods=['GET'])
def get_analysis_templates():
"""获取分析流程模板列表"""
# 从缓存获取
templates = workflow_cache.get('analysis_templates')
if templates is None:
# 从数据库获取用户定义的模板
db = DatabaseManager.get_connection()
cursor = db.cursor()
cursor.execute("""
SELECT id, name, description, icon, components, connections
FROM analysis_templates
WHERE deleted = 0
ORDER BY created_at DESC
""")
user_templates = cursor.fetchall()
cursor.close()
# 结合默认模板
templates = DEFAULT_ANALYSIS_TEMPLATES + list(user_templates)
# 缓存结果
workflow_cache.set('analysis_templates', templates)
return jsonify({
'success': True,
'data': filter_dict(templates)
})
@workflow_api_bp.route('/analysis-templates/<template_id>', methods=['GET'])
def get_analysis_template(template_id):
"""获取指定分析流程模板"""
# 查找默认模板
for template in DEFAULT_ANALYSIS_TEMPLATES:
if template['id'] == template_id:
return jsonify({
'success': True,
'data': filter_dict(template)
})
# 从数据库查找用户模板
db = DatabaseManager.get_connection()
cursor = db.cursor()
cursor.execute("""
SELECT id, name, description, icon, components, connections
FROM analysis_templates
WHERE id = %s AND deleted = 0
""", (template_id,))
template = cursor.fetchone()
cursor.close()
if not template:
return jsonify({
'success': False,
'message': f"未找到模板: {template_id}"
}), 404
return jsonify({
'success': True,
'data': filter_dict(template)
})
@workflow_api_bp.route('/analysis-templates', methods=['POST'])
def create_analysis_template():
"""创建分析流程模板"""
data = request.json
required_fields = ['name', 'description', 'components', 'connections']
# 验证必要字段
for field in required_fields:
if field not in data:
return jsonify({
'success': False,
'message': f"缺少必要字段: {field}"
}), 400
# 生成ID
template_id = f"template_{int(time.time())}_{uuid.uuid4().hex[:8]}"
# 准备数据
template = {
'id': template_id,
'name': data['name'],
'description': data['description'],
'icon': data.get('icon', 'fas fa-chart-line'),
'components': data['components'],
'connections': data['connections'],
'created_at': datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
'updated_at': datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
'deleted': 0
}
# 保存到数据库
db = DatabaseManager.get_connection()
cursor = db.cursor()
try:
cursor.execute("""
INSERT INTO analysis_templates
(id, name, description, icon, components, connections, created_at, updated_at, deleted)
VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s)
""", (
template['id'],
template['name'],
template['description'],
template['icon'],
json.dumps(template['components']),
json.dumps(template['connections']),
template['created_at'],
template['updated_at'],
template['deleted']
))
db.commit()
# 清除缓存
workflow_cache.invalidate('analysis_templates')
return jsonify({
'success': True,
'data': filter_dict(template)
}), 201
except Exception as e:
db.rollback()
logger.error(f"创建分析模板失败: {e}")
return jsonify({
'success': False,
'message': f"创建模板失败: {str(e)}"
}), 500
finally:
cursor.close()
@workflow_api_bp.route('/analysis-templates/<template_id>', methods=['PUT'])
def update_analysis_template(template_id):
"""更新分析流程模板"""
data = request.json
# 验证模板是否存在
db = DatabaseManager.get_connection()
cursor = db.cursor()
cursor.execute("""
SELECT id FROM analysis_templates
WHERE id = %s AND deleted = 0
""", (template_id,))
exists = cursor.fetchone()
if not exists:
cursor.close()
return jsonify({
'success': False,
'message': f"未找到模板: {template_id}"
}), 404
# 准备更新数据
update_data = {}
if 'name' in data:
update_data['name'] = data['name']
if 'description' in data:
update_data['description'] = data['description']
if 'icon' in data:
update_data['icon'] = data['icon']
if 'components' in data:
update_data['components'] = json.dumps(data['components'])
if 'connections' in data:
update_data['connections'] = json.dumps(data['connections'])
update_data['updated_at'] = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
# 构建SQL语句
sql = "UPDATE analysis_templates SET "
sql += ", ".join([f"{key} = %s" for key in update_data.keys()])
sql += " WHERE id = %s"
# 执行更新
try:
cursor.execute(sql, list(update_data.values()) + [template_id])
db.commit()
# 清除缓存
workflow_cache.invalidate('analysis_templates')
return jsonify({
'success': True,
'message': "模板更新成功"
})
except Exception as e:
db.rollback()
logger.error(f"更新分析模板失败: {e}")
return jsonify({
'success': False,
'message': f"更新模板失败: {str(e)}"
}), 500
finally:
cursor.close()
@workflow_api_bp.route('/analysis-templates/<template_id>', methods=['DELETE'])
def delete_analysis_template(template_id):
"""删除分析流程模板"""
# 验证模板是否存在
db = DatabaseManager.get_connection()
cursor = db.cursor()
cursor.execute("""
SELECT id FROM analysis_templates
WHERE id = %s AND deleted = 0
""", (template_id,))
exists = cursor.fetchone()
if not exists:
cursor.close()
return jsonify({
'success': False,
'message': f"未找到模板: {template_id}"
}), 404
# 软删除
try:
cursor.execute("""
UPDATE analysis_templates
SET deleted = 1, updated_at = %s
WHERE id = %s
""", (datetime.now().strftime('%Y-%m-%d %H:%M:%S'), template_id))
db.commit()
# 清除缓存
workflow_cache.invalidate('analysis_templates')
return jsonify({
'success': True,
'message': "模板删除成功"
})
except Exception as e:
db.rollback()
logger.error(f"删除分析模板失败: {e}")
return jsonify({
'success': False,
'message': f"删除模板失败: {str(e)}"
}), 500
finally:
cursor.close()
@workflow_api_bp.route('/components', methods=['GET'])
def get_available_components():
"""获取可用组件列表"""
return jsonify({
'success': True,
'data': filter_dict(AVAILABLE_COMPONENTS)
})
@workflow_api_bp.route('/run-workflow', methods=['POST'])
def run_workflow():
"""执行工作流"""
data = request.json
# 验证必要字段
if 'components' not in data or 'connections' not in data:
return jsonify({
'success': False,
'message': "缺少必要字段: components 或 connections"
}), 400
# 这里是执行工作流逻辑的占位符
# 实际实现需要根据组件类型和连接关系建立执行计划并执行
# 记录执行请求
logger.info(f"收到工作流执行请求,组件数量: {len(data['components'])}, 连接数量: {len(data['connections'])}")
# 创建任务ID
task_id = f"task_{int(time.time())}_{uuid.uuid4().hex[:8]}"
# 返回任务ID
return jsonify({
'success': True,
'message': "工作流执行请求已提交",
'data': {
'task_id': task_id,
'status': 'pending'
}
})
@workflow_api_bp.route('/task-status/<task_id>', methods=['GET'])
def get_task_status(task_id):
"""获取任务执行状态"""
# 这里是获取任务状态的占位符
# 实际实现需要查询任务执行状态
# 示例状态
status = {
'task_id': task_id,
'status': 'running',
'progress': 45,
'message': "正在执行数据预处理",
'started_at': (datetime.now() - timedelta(minutes=2)).strftime('%Y-%m-%d %H:%M:%S'),
'updated_at': datetime.now().strftime('%Y-%m-%d %H:%M:%S')
}
return jsonify({
'success': True,
'data': status
})
@workflow_api_bp.route('/save', methods=['POST'])
def save_workflow():
"""保存工作流"""
# ... existing code ...
@workflow_api_bp.route('/<workflow_id>', methods=['GET'])
def get_workflow(workflow_id):
"""获取工作流"""
# ... existing code ...
@workflow_api_bp.route('/<workflow_id>', methods=['DELETE'])
def delete_workflow(workflow_id):
"""删除工作流"""
# ... existing code ...