🚀 Major Upgrade! Visual Workflow Orchestrator and AI-Powered Crawler Implemented. Added Model Arena Feature and Efficiency Optimizations (Two-Level Caching Architecture + End-to-End Performance Enhancements).

This commit is contained in:
戒酒的李白
2025-03-13 13:14:35 +08:00
parent ee5372941a
commit 0c6a40b869
12 changed files with 5688 additions and 78 deletions
+837
View File
@@ -0,0 +1,837 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import os
import json
import time
import uuid
import logging
import traceback
from datetime import datetime
from pathlib import Path
from concurrent.futures import ThreadPoolExecutor
from utils.db_manager import DatabaseManager
from utils.cache_manager import CacheManager
from utils.model_router import ModelRouter
from utils.sensitive_filter import SensitiveDataFilter
from spider.weibo_crawler import WeiboCrawler
from utils.ai_analyzer import AIAnalyzer
# 配置日志
from utils.logger import setup_logger
logger = setup_logger('workflow_engine', 'logs/workflow_engine.log')
class WorkflowEngine:
"""工作流引擎 - 负责执行数据爬取和分析工作流"""
_instance = None
_initialized = False
def __new__(cls):
if cls._instance is None:
cls._instance = super(WorkflowEngine, cls).__new__(cls)
return cls._instance
def __init__(self):
if self._initialized:
return
self.db = DatabaseManager()
self.cache = CacheManager(memory_capacity=50, cache_duration=3600)
self.model_router = ModelRouter()
self.sensitive_filter = SensitiveDataFilter()
self.executor = ThreadPoolExecutor(max_workers=5)
self.running_tasks = {}
# 创建必要的目录
self.data_dir = Path('data/workflow')
self.data_dir.mkdir(parents=True, exist_ok=True)
self._initialized = True
logger.info("工作流引擎初始化完成")
def execute_crawler_workflow(self, task_id, config):
"""
执行爬虫工作流
Args:
task_id: 任务ID
config: 爬虫配置
"""
logger.info(f"开始执行爬虫工作流: {task_id}")
try:
# 更新任务状态为运行中
self._update_task_status(task_id, 'running', 0)
# 创建爬虫实例
crawler = WeiboCrawler()
# 设置爬虫参数
source = config.get('source', 'hot_topics')
depth = config.get('crawl_depth', 1)
interval = config.get('interval', 5)
filters = config.get('filters', {})
# 执行爬取
result = crawler.crawl(
source=source,
depth=depth,
interval=interval,
filters=filters,
callback=lambda progress: self._update_task_progress(task_id, progress)
)
# 更新任务状态为已完成
self._update_task_status(task_id, 'completed', 100, result=result)
logger.info(f"爬虫工作流完成: {task_id}")
return result
except Exception as e:
logger.error(f"爬虫工作流出错: {str(e)}")
logger.error(traceback.format_exc())
self._update_task_status(task_id, 'failed', 0, error=str(e))
return None
def execute_analysis_workflow(self, task_id, workflow):
"""
执行分析工作流
Args:
task_id: 任务ID
workflow: 工作流配置
"""
logger.info(f"开始执行分析工作流: {task_id}")
try:
# 更新任务状态为运行中
self._update_task_status(task_id, 'running', 0)
components = workflow.get('components', [])
connections = workflow.get('connections', [])
# 验证工作流
if not components or not connections:
raise ValueError("工作流配置不完整,缺少组件或连接")
# 构建组件依赖图
component_map, dependency_graph = self._build_dependency_graph(components, connections)
# 进行拓扑排序
execution_order = self._topological_sort(dependency_graph)
# 执行组件
result_map = {}
total_components = len(execution_order)
for idx, component_id in enumerate(execution_order):
component = component_map.get(component_id)
if not component:
continue
# 计算总体进度
progress = int((idx / total_components) * 100)
self._update_task_progress(task_id, progress)
# 获取输入数据
input_data = self._get_component_input_data(component_id, connections, result_map)
# 执行组件
result = self._execute_component(component, input_data)
# 存储结果
result_map[component_id] = result
# 获取最终输出
final_outputs = self._get_final_outputs(dependency_graph, result_map)
# 应用敏感信息过滤
if final_outputs and self.sensitive_filter.is_enabled():
if isinstance(final_outputs, dict):
final_outputs = self.sensitive_filter.filter_dict(final_outputs)
elif isinstance(final_outputs, list):
final_outputs = self.sensitive_filter.filter_list(final_outputs)
# 更新任务状态为已完成
self._update_task_status(task_id, 'completed', 100, result=final_outputs)
logger.info(f"分析工作流完成: {task_id}")
return final_outputs
except Exception as e:
logger.error(f"分析工作流出错: {str(e)}")
logger.error(traceback.format_exc())
self._update_task_status(task_id, 'failed', 0, error=str(e))
return None
def start_workflow(self, workflow_type, config, template_id=None):
"""
异步启动工作流
Args:
workflow_type: 工作流类型 (crawler/analysis)
config: 工作流配置
template_id: 关联的模板ID
Returns:
task_id: 工作流任务ID
"""
# 生成任务ID
task_id = str(uuid.uuid4())
# 保存任务信息到数据库
conn = self.db.get_connection()
cursor = conn.cursor()
try:
now = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
cursor.execute(
"""
INSERT INTO workflow_tasks
(id, template_id, type, status, progress, config, created_at, updated_at)
VALUES (%s, %s, %s, %s, %s, %s, %s, %s)
""",
(
task_id,
template_id,
workflow_type,
'pending',
0,
json.dumps(config, ensure_ascii=False),
now,
now
)
)
conn.commit()
# 异步执行工作流
if workflow_type == 'crawler':
self.running_tasks[task_id] = self.executor.submit(
self.execute_crawler_workflow, task_id, config
)
elif workflow_type == 'analysis':
self.running_tasks[task_id] = self.executor.submit(
self.execute_analysis_workflow, task_id, config
)
else:
logger.error(f"未知的工作流类型: {workflow_type}")
return None
return task_id
except Exception as e:
logger.error(f"启动工作流失败: {str(e)}")
conn.rollback()
return None
finally:
cursor.close()
def get_task_status(self, task_id):
"""
获取任务状态
Args:
task_id: 任务ID
Returns:
task: 任务信息
"""
# 先检查缓存
cache_key = f"task_status:{task_id}"
cached_task = self.cache.get(cache_key)
if cached_task:
return cached_task
# 从数据库获取
conn = self.db.get_connection()
cursor = conn.cursor()
try:
cursor.execute(
"SELECT * FROM workflow_tasks WHERE id = %s",
(task_id,)
)
task = cursor.fetchone()
if task:
# 将JSON字符串转为Python对象
if task.get('config'):
task['config'] = json.loads(task['config'])
if task.get('result'):
task['result'] = json.loads(task['result'])
# 缓存结果
self.cache.set(cache_key, task)
return task
except Exception as e:
logger.error(f"获取任务状态失败: {str(e)}")
return None
finally:
cursor.close()
def cancel_task(self, task_id):
"""
取消任务
Args:
task_id: 任务ID
Returns:
success: 是否成功
"""
# 检查任务是否存在并正在运行
if task_id in self.running_tasks:
# 尝试取消任务
future = self.running_tasks[task_id]
if not future.done():
future.cancel()
# 从运行列表中移除
del self.running_tasks[task_id]
# 更新数据库状态
conn = self.db.get_connection()
cursor = conn.cursor()
try:
now = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
cursor.execute(
"""
UPDATE workflow_tasks
SET status = %s, updated_at = %s
WHERE id = %s
""",
('cancelled', now, task_id)
)
conn.commit()
# 清理缓存
cache_key = f"task_status:{task_id}"
self.cache.delete(cache_key)
return True
except Exception as e:
logger.error(f"取消任务失败: {str(e)}")
conn.rollback()
return False
finally:
cursor.close()
def _update_task_status(self, task_id, status, progress, result=None, error=None):
"""更新任务状态"""
conn = self.db.get_connection()
cursor = conn.cursor()
try:
now = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
update_fields = ["status = %s", "progress = %s", "updated_at = %s"]
params = [status, progress, now]
# 添加开始时间
if status == 'running' and progress == 0:
update_fields.append("started_at = %s")
params.append(now)
# 添加完成时间
if status in ['completed', 'failed']:
update_fields.append("completed_at = %s")
params.append(now)
# 添加结果
if result is not None:
update_fields.append("result = %s")
params.append(json.dumps(result, ensure_ascii=False))
# 添加错误
if error is not None:
update_fields.append("error = %s")
params.append(error)
# 构建SQL
sql = f"""
UPDATE workflow_tasks
SET {', '.join(update_fields)}
WHERE id = %s
"""
params.append(task_id)
cursor.execute(sql, tuple(params))
conn.commit()
# 清理缓存
cache_key = f"task_status:{task_id}"
self.cache.delete(cache_key)
except Exception as e:
logger.error(f"更新任务状态失败: {str(e)}")
conn.rollback()
finally:
cursor.close()
def _update_task_progress(self, task_id, progress):
"""更新任务进度"""
self._update_task_status(task_id, 'running', progress)
def _build_dependency_graph(self, components, connections):
"""构建组件依赖图"""
component_map = {comp['id']: comp for comp in components}
dependency_graph = {comp['id']: [] for comp in components}
# 构建依赖关系
for conn in connections:
source = conn.get('source')
target = conn.get('target')
if source and target and source in component_map and target in component_map:
dependency_graph[target].append(source)
return component_map, dependency_graph
def _topological_sort(self, graph):
"""拓扑排序,确定组件执行顺序"""
visited = set()
temp = set()
order = []
def visit(node):
if node in temp:
raise ValueError(f"工作流存在循环依赖: {node}")
if node in visited:
return
temp.add(node)
for neighbor in graph.get(node, []):
visit(neighbor)
temp.remove(node)
visited.add(node)
order.append(node)
for node in graph:
if node not in visited:
visit(node)
return list(reversed(order))
def _get_component_input_data(self, component_id, connections, result_map):
"""获取组件的输入数据"""
input_data = {}
for conn in connections:
if conn.get('target') == component_id:
source_id = conn.get('source')
if source_id in result_map:
input_name = conn.get('targetInput', 'default')
input_data[input_name] = result_map[source_id]
return input_data
def _execute_component(self, component, input_data):
"""执行单个组件"""
component_type = component.get('type')
config = component.get('config', {})
if component_type == 'data_source':
return self._execute_data_source(config, input_data)
elif component_type == 'preprocessing':
return self._execute_preprocessing(config, input_data)
elif component_type == 'model':
return self._execute_model(config, input_data)
elif component_type == 'visualization':
return self._execute_visualization(config, input_data)
else:
logger.warning(f"未知的组件类型: {component_type}")
return None
def _execute_data_source(self, config, input_data):
"""执行数据源组件"""
source_type = config.get('source_type')
if source_type == 'database':
# 从数据库获取数据
table = config.get('table')
filters = config.get('filters', {})
limit = config.get('limit', 1000)
query_conditions = []
query_params = []
for key, value in filters.items():
if value:
query_conditions.append(f"{key} = %s")
query_params.append(value)
where_clause = f"WHERE {' AND '.join(query_conditions)}" if query_conditions else ""
sql = f"SELECT * FROM {table} {where_clause} LIMIT {limit}"
conn = self.db.get_connection()
cursor = conn.cursor()
try:
cursor.execute(sql, tuple(query_params))
return cursor.fetchall()
except Exception as e:
logger.error(f"数据库查询出错: {str(e)}")
return []
finally:
cursor.close()
elif source_type == 'file':
# 从文件加载数据
file_path = config.get('file_path')
if not file_path or not os.path.exists(file_path):
return []
try:
with open(file_path, 'r', encoding='utf-8') as f:
if file_path.endswith('.json'):
return json.load(f)
else:
return f.read()
except Exception as e:
logger.error(f"文件读取出错: {str(e)}")
return []
elif source_type == 'api':
# 这里需要实现API调用逻辑
# 由于涉及复杂的HTTP请求,暂不实现
logger.warning("API数据源暂未实现")
return []
else:
logger.warning(f"未知的数据源类型: {source_type}")
return []
def _execute_preprocessing(self, config, input_data):
"""执行数据预处理组件"""
preprocessing_type = config.get('preprocessing_type')
data = input_data.get('default', [])
if not data:
return []
if preprocessing_type == 'filter':
# 过滤数据
field = config.get('field')
value = config.get('value')
operator = config.get('operator', 'eq')
if not field:
return data
result = []
for item in data:
if operator == 'eq' and item.get(field) == value:
result.append(item)
elif operator == 'neq' and item.get(field) != value:
result.append(item)
elif operator == 'contains' and value in str(item.get(field, '')):
result.append(item)
elif operator == 'not_contains' and value not in str(item.get(field, '')):
result.append(item)
return result
elif preprocessing_type == 'sort':
# 排序数据
field = config.get('field')
order = config.get('order', 'asc')
if not field:
return data
return sorted(
data,
key=lambda x: x.get(field, ''),
reverse=(order == 'desc')
)
elif preprocessing_type == 'aggregate':
# 聚合数据
group_by = config.get('group_by')
aggregate_field = config.get('aggregate_field')
aggregate_type = config.get('aggregate_type', 'count')
if not group_by:
return data
result = {}
for item in data:
key = item.get(group_by)
if key not in result:
result[key] = {
'count': 0,
'sum': 0,
'values': []
}
result[key]['count'] += 1
if aggregate_field:
value = item.get(aggregate_field, 0)
if isinstance(value, (int, float)):
result[key]['sum'] += value
result[key]['values'].append(value)
# 计算最终结果
final_result = []
for key, values in result.items():
item = {group_by: key}
if aggregate_type == 'count':
item['value'] = values['count']
elif aggregate_type == 'sum':
item['value'] = values['sum']
elif aggregate_type == 'avg':
item['value'] = values['sum'] / values['count'] if values['count'] > 0 else 0
final_result.append(item)
return final_result
else:
logger.warning(f"未知的预处理类型: {preprocessing_type}")
return data
def _execute_model(self, config, input_data):
"""执行模型组件"""
model_type = config.get('model_type')
data = input_data.get('default', [])
if not data:
return []
analyzer = AIAnalyzer()
if model_type == 'sentiment':
# 情感分析
texts = []
if isinstance(data, list):
# 如果是列表,从指定字段获取文本
field = config.get('text_field', 'content')
texts = [item.get(field, '') for item in data if item.get(field)]
elif isinstance(data, str):
# 如果是字符串,直接使用
texts = [data]
# 获取合适的模型
model = self.model_router.select_model_for_text(texts[0] if texts else "", "sentiment")
# 执行分析
results = []
for text in texts:
result = analyzer.analyze_sentiment(text, model=model)
results.append(result)
# 如果输入是列表,将结果合并回原始数据
if isinstance(data, list):
field = config.get('text_field', 'content')
for i, item in enumerate(data):
if i < len(results) and item.get(field):
item['sentiment'] = results[i]
return data
else:
return results[0] if results else None
elif model_type == 'topic':
# 主题分类
texts = []
if isinstance(data, list):
field = config.get('text_field', 'content')
texts = [item.get(field, '') for item in data if item.get(field)]
elif isinstance(data, str):
texts = [data]
# 获取合适的模型
model = self.model_router.select_model_for_text(texts[0] if texts else "", "topic")
# 执行分析
results = []
for text in texts:
result = analyzer.analyze_topic(text, model=model)
results.append(result)
# 如果输入是列表,将结果合并回原始数据
if isinstance(data, list):
field = config.get('text_field', 'content')
for i, item in enumerate(data):
if i < len(results) and item.get(field):
item['topic'] = results[i]
return data
else:
return results[0] if results else None
elif model_type == 'keywords':
# 关键词提取
texts = []
if isinstance(data, list):
field = config.get('text_field', 'content')
texts = [item.get(field, '') for item in data if item.get(field)]
elif isinstance(data, str):
texts = [data]
# 获取合适的模型
model = self.model_router.select_model_for_text(texts[0] if texts else "", "keyword")
# 执行分析
results = []
for text in texts:
result = analyzer.extract_keywords(text, model=model)
results.append(result)
# 如果输入是列表,将结果合并回原始数据
if isinstance(data, list):
field = config.get('text_field', 'content')
for i, item in enumerate(data):
if i < len(results) and item.get(field):
item['keywords'] = results[i]
return data
else:
return results[0] if results else None
elif model_type == 'summarize':
# 文本摘要
texts = []
if isinstance(data, list):
field = config.get('text_field', 'content')
texts = [item.get(field, '') for item in data if item.get(field)]
elif isinstance(data, str):
texts = [data]
# 获取合适的模型
model = self.model_router.select_model_for_text(texts[0] if texts else "", "summarization")
# 执行分析
results = []
for text in texts:
result = analyzer.summarize_text(text, model=model)
results.append(result)
# 如果输入是列表,将结果合并回原始数据
if isinstance(data, list):
field = config.get('text_field', 'content')
for i, item in enumerate(data):
if i < len(results) and item.get(field):
item['summary'] = results[i]
return data
else:
return results[0] if results else None
else:
logger.warning(f"未知的模型类型: {model_type}")
return data
def _execute_visualization(self, config, input_data):
"""执行可视化组件"""
visualization_type = config.get('visualization_type')
data = input_data.get('default', [])
if not data:
return {}
if visualization_type == 'chart':
# 图表可视化
chart_type = config.get('chart_type', 'bar')
x_field = config.get('x_field')
y_field = config.get('y_field')
title = config.get('title', '数据可视化')
if not x_field or not y_field:
return {'error': '缺少x或y字段'}
# 提取数据
chart_data = {
'type': chart_type,
'title': title,
'xAxis': {'type': 'category', 'data': []},
'yAxis': {'type': 'value'},
'series': [{'data': []}]
}
for item in data:
x_value = item.get(x_field)
y_value = item.get(y_field)
if x_value is not None and y_value is not None:
chart_data['xAxis']['data'].append(x_value)
chart_data['series'][0]['data'].append(y_value)
return chart_data
elif visualization_type == 'table':
# 表格可视化
columns = config.get('columns', [])
title = config.get('title', '数据表格')
# 如果没有指定列,使用数据中的所有字段
if not columns and isinstance(data, list) and data:
columns = list(data[0].keys())
# 构建表格数据
table_data = {
'type': 'table',
'title': title,
'columns': columns,
'data': data
}
return table_data
elif visualization_type == 'wordcloud':
# 词云可视化
word_field = config.get('word_field')
value_field = config.get('value_field')
title = config.get('title', '词云图')
if not word_field:
return {'error': '缺少词字段'}
# 构建词云数据
wordcloud_data = {
'type': 'wordcloud',
'title': title,
'data': []
}
for item in data:
word = item.get(word_field)
value = item.get(value_field, 1)
if word:
wordcloud_data['data'].append({
'name': word,
'value': value
})
return wordcloud_data
else:
logger.warning(f"未知的可视化类型: {visualization_type}")
return {}
def _get_final_outputs(self, dependency_graph, result_map):
"""获取最终输出结果"""
# 找出没有后继节点的叶子节点
leaf_nodes = []
all_targets = set()
for node, deps in dependency_graph.items():
all_targets.update(deps)
for node in dependency_graph:
if node not in all_targets:
leaf_nodes.append(node)
# 收集所有叶子节点的结果
outputs = {}
for node in leaf_nodes:
if node in result_map:
outputs[node] = result_map[node]
return outputs