🚀 Major Upgrade! Visual Workflow Orchestrator and AI-Powered Crawler Implemented. Added Model Arena Feature and Efficiency Optimizations (Two-Level Caching Architecture + End-to-End Performance Enhancements).
This commit is contained in:
+262
-77
@@ -1,116 +1,301 @@
|
||||
import json
|
||||
import os
|
||||
import time
|
||||
import shutil
|
||||
from datetime import datetime, timedelta
|
||||
import threading
|
||||
import queue
|
||||
from collections import OrderedDict
|
||||
import pickle
|
||||
import hashlib
|
||||
import logging
|
||||
|
||||
class PredictionCache:
|
||||
logger = logging.getLogger('cache_manager')
|
||||
logger.setLevel(logging.INFO)
|
||||
|
||||
class LRUCache:
|
||||
"""实现LRU (Least Recently Used) 缓存策略"""
|
||||
|
||||
def __init__(self, capacity):
|
||||
self.cache = OrderedDict()
|
||||
self.capacity = capacity
|
||||
|
||||
def get(self, key):
|
||||
if key not in self.cache:
|
||||
return None
|
||||
# 访问元素时,将其移至末尾,表示最近使用
|
||||
self.cache.move_to_end(key)
|
||||
return self.cache[key]
|
||||
|
||||
def put(self, key, value):
|
||||
# 如果键已存在,更新值并将其移至末尾
|
||||
if key in self.cache:
|
||||
self.cache[key] = value
|
||||
self.cache.move_to_end(key)
|
||||
return
|
||||
|
||||
# 如果缓存已满,删除最久未使用的项(OrderedDict 的首项)
|
||||
if len(self.cache) >= self.capacity:
|
||||
self.cache.popitem(last=False)
|
||||
|
||||
# 添加新项至末尾
|
||||
self.cache[key] = value
|
||||
|
||||
def remove(self, key):
|
||||
if key in self.cache:
|
||||
del self.cache[key]
|
||||
|
||||
def clear(self):
|
||||
self.cache.clear()
|
||||
|
||||
def __len__(self):
|
||||
return len(self.cache)
|
||||
|
||||
def get_all_keys(self):
|
||||
return list(self.cache.keys())
|
||||
|
||||
|
||||
class CacheManager:
|
||||
"""两级缓存系统:内存LRU缓存 + 磁盘持久化缓存"""
|
||||
|
||||
_instance = None
|
||||
_lock = threading.Lock()
|
||||
|
||||
def __new__(cls):
|
||||
def __new__(cls, *args, **kwargs):
|
||||
with cls._lock:
|
||||
if cls._instance is None:
|
||||
cls._instance = super(PredictionCache, cls).__new__(cls)
|
||||
cls._instance = super(CacheManager, cls).__new__(cls)
|
||||
return cls._instance
|
||||
|
||||
def __init__(self):
|
||||
if not hasattr(self, 'initialized'):
|
||||
self.cache_dir = 'cache/predictions'
|
||||
self.cache_duration = timedelta(hours=24) # 缓存24小时
|
||||
self.cache = {}
|
||||
self.cache_queue = queue.Queue()
|
||||
def __init__(self, name="default", memory_capacity=1000, cache_duration=24,
|
||||
disk_cache_dir="cache", flush_interval=5):
|
||||
if hasattr(self, 'initialized'):
|
||||
return
|
||||
|
||||
self.name = name
|
||||
self.memory_cache = LRUCache(memory_capacity)
|
||||
self.disk_cache_dir = os.path.join(disk_cache_dir, name)
|
||||
self.cache_duration = timedelta(hours=cache_duration)
|
||||
self.flush_interval = flush_interval # 定时将内存缓存刷新到磁盘的间隔(分钟)
|
||||
self.cache_stats = {"hits": 0, "misses": 0, "disk_hits": 0}
|
||||
self.disk_queue = queue.Queue()
|
||||
self.initialized = True
|
||||
|
||||
# 确保缓存目录存在
|
||||
os.makedirs(self.cache_dir, exist_ok=True)
|
||||
os.makedirs(self.disk_cache_dir, exist_ok=True)
|
||||
|
||||
# 启动缓存清理线程
|
||||
self.cleanup_thread = threading.Thread(target=self._cleanup_old_cache, daemon=True)
|
||||
# 启动缓存管理线程
|
||||
self.cleanup_thread = threading.Thread(target=self._cleanup_and_flush_task, daemon=True)
|
||||
self.cleanup_thread.start()
|
||||
|
||||
# 加载现有缓存
|
||||
self._load_cache()
|
||||
# 启动磁盘写入线程
|
||||
self.disk_writer_thread = threading.Thread(target=self._disk_writer_task, daemon=True)
|
||||
self.disk_writer_thread.start()
|
||||
|
||||
logger.info(f"初始化缓存管理器: {name},内存容量: {memory_capacity}项,缓存时间: {cache_duration}小时")
|
||||
|
||||
def _load_cache(self):
|
||||
"""加载磁盘上的缓存文件"""
|
||||
try:
|
||||
for filename in os.listdir(self.cache_dir):
|
||||
if filename.endswith('.json'):
|
||||
filepath = os.path.join(self.cache_dir, filename)
|
||||
with open(filepath, 'r', encoding='utf-8') as f:
|
||||
cache_data = json.load(f)
|
||||
# 检查缓存是否过期
|
||||
if self._is_cache_valid(cache_data['timestamp']):
|
||||
topic = filename[:-5] # 移除.json后缀
|
||||
self.cache[topic] = cache_data
|
||||
else:
|
||||
# 删除过期缓存文件
|
||||
os.remove(filepath)
|
||||
except Exception as e:
|
||||
print(f"加载缓存失败: {e}")
|
||||
def _get_cache_key(self, key):
|
||||
"""标准化缓存键"""
|
||||
if isinstance(key, str):
|
||||
return key
|
||||
return hashlib.md5(str(key).encode()).hexdigest()
|
||||
|
||||
def _cleanup_old_cache(self):
|
||||
"""定期清理过期缓存的后台线程"""
|
||||
while True:
|
||||
try:
|
||||
# 检查并清理内存缓存
|
||||
current_time = datetime.now()
|
||||
expired_topics = []
|
||||
|
||||
for topic, cache_data in self.cache.items():
|
||||
if not self._is_cache_valid(cache_data['timestamp']):
|
||||
expired_topics.append(topic)
|
||||
|
||||
# 删除过期缓存
|
||||
for topic in expired_topics:
|
||||
del self.cache[topic]
|
||||
cache_file = os.path.join(self.cache_dir, f"{topic}.json")
|
||||
if os.path.exists(cache_file):
|
||||
os.remove(cache_file)
|
||||
|
||||
# 休眠1小时后再次检查
|
||||
time.sleep(3600)
|
||||
except Exception as e:
|
||||
print(f"清理缓存时出错: {e}")
|
||||
time.sleep(3600) # 发生错误时也等待1小时
|
||||
def _get_disk_path(self, key):
|
||||
"""获取磁盘缓存路径"""
|
||||
safe_key = self._get_cache_key(key)
|
||||
return os.path.join(self.disk_cache_dir, f"{safe_key}.cache")
|
||||
|
||||
def _is_cache_valid(self, timestamp):
|
||||
"""检查缓存是否有效"""
|
||||
"""检查缓存是否过期"""
|
||||
cache_time = datetime.fromtimestamp(timestamp)
|
||||
return datetime.now() - cache_time < self.cache_duration
|
||||
|
||||
def get(self, topic):
|
||||
"""获取话题的预测缓存"""
|
||||
if topic in self.cache and self._is_cache_valid(self.cache[topic]['timestamp']):
|
||||
return self.cache[topic]['prediction']
|
||||
def get(self, key):
|
||||
"""获取缓存数据,首先检查内存,然后检查磁盘"""
|
||||
cache_key = self._get_cache_key(key)
|
||||
|
||||
# 1. 检查内存缓存
|
||||
cache_data = self.memory_cache.get(cache_key)
|
||||
if cache_data is not None:
|
||||
if self._is_cache_valid(cache_data['timestamp']):
|
||||
self.cache_stats["hits"] += 1
|
||||
logger.debug(f"内存缓存命中: {key}")
|
||||
return cache_data['data']
|
||||
else:
|
||||
# 过期缓存,从内存中删除
|
||||
self.memory_cache.remove(cache_key)
|
||||
|
||||
# 2. 检查磁盘缓存
|
||||
disk_path = self._get_disk_path(cache_key)
|
||||
if os.path.exists(disk_path):
|
||||
try:
|
||||
with open(disk_path, 'rb') as f:
|
||||
cache_data = pickle.load(f)
|
||||
|
||||
if self._is_cache_valid(cache_data['timestamp']):
|
||||
# 从磁盘加载后,放入内存缓存
|
||||
self.memory_cache.put(cache_key, cache_data)
|
||||
self.cache_stats["disk_hits"] += 1
|
||||
logger.debug(f"磁盘缓存命中: {key}")
|
||||
return cache_data['data']
|
||||
else:
|
||||
# 过期缓存,删除磁盘文件
|
||||
os.remove(disk_path)
|
||||
except Exception as e:
|
||||
logger.warning(f"读取磁盘缓存失败: {key}, 错误: {e}")
|
||||
|
||||
self.cache_stats["misses"] += 1
|
||||
logger.debug(f"缓存未命中: {key}")
|
||||
return None
|
||||
|
||||
def set(self, topic, prediction):
|
||||
"""设置话题的预测缓存"""
|
||||
def set(self, key, data, immediate_disk_write=False):
|
||||
"""设置缓存数据,同时更新内存和安排磁盘写入"""
|
||||
cache_key = self._get_cache_key(key)
|
||||
cache_data = {
|
||||
'prediction': prediction,
|
||||
'data': data,
|
||||
'timestamp': datetime.now().timestamp()
|
||||
}
|
||||
|
||||
# 更新内存缓存
|
||||
self.cache[topic] = cache_data
|
||||
self.memory_cache.put(cache_key, cache_data)
|
||||
|
||||
# 异步保存到磁盘
|
||||
self.cache_queue.put((topic, cache_data))
|
||||
threading.Thread(target=self._save_cache_to_disk, daemon=True).start()
|
||||
# 安排写入磁盘
|
||||
if immediate_disk_write:
|
||||
self._write_to_disk(cache_key, cache_data)
|
||||
else:
|
||||
self.disk_queue.put((cache_key, cache_data))
|
||||
|
||||
logger.debug(f"缓存已设置: {key}")
|
||||
return True
|
||||
|
||||
def _save_cache_to_disk(self):
|
||||
"""异步保存缓存到磁盘"""
|
||||
def invalidate(self, key):
|
||||
"""使指定键的缓存失效"""
|
||||
cache_key = self._get_cache_key(key)
|
||||
|
||||
# 从内存中删除
|
||||
self.memory_cache.remove(cache_key)
|
||||
|
||||
# 从磁盘中删除
|
||||
disk_path = self._get_disk_path(cache_key)
|
||||
if os.path.exists(disk_path):
|
||||
try:
|
||||
os.remove(disk_path)
|
||||
logger.debug(f"缓存已失效: {key}")
|
||||
except Exception as e:
|
||||
logger.warning(f"删除磁盘缓存失败: {key}, 错误: {e}")
|
||||
|
||||
return True
|
||||
|
||||
def clear_all(self):
|
||||
"""清除所有缓存"""
|
||||
# 清除内存缓存
|
||||
self.memory_cache.clear()
|
||||
|
||||
# 清除磁盘缓存
|
||||
try:
|
||||
while not self.cache_queue.empty():
|
||||
topic, cache_data = self.cache_queue.get()
|
||||
cache_file = os.path.join(self.cache_dir, f"{topic}.json")
|
||||
with open(cache_file, 'w', encoding='utf-8') as f:
|
||||
json.dump(cache_data, f, ensure_ascii=False, indent=2)
|
||||
shutil.rmtree(self.disk_cache_dir)
|
||||
os.makedirs(self.disk_cache_dir, exist_ok=True)
|
||||
logger.info(f"所有缓存已清除: {self.name}")
|
||||
except Exception as e:
|
||||
print(f"保存缓存到磁盘失败: {e}")
|
||||
logger.error(f"清除磁盘缓存失败: {e}")
|
||||
|
||||
# 重置统计信息
|
||||
self.cache_stats = {"hits": 0, "misses": 0, "disk_hits": 0}
|
||||
|
||||
return True
|
||||
|
||||
def get_stats(self):
|
||||
"""获取缓存统计信息"""
|
||||
total_requests = self.cache_stats["hits"] + self.cache_stats["misses"]
|
||||
hit_rate = (self.cache_stats["hits"] / total_requests * 100) if total_requests > 0 else 0
|
||||
total_hits = self.cache_stats["hits"] + self.cache_stats["disk_hits"]
|
||||
|
||||
memory_size = len(self.memory_cache)
|
||||
disk_size = len([f for f in os.listdir(self.disk_cache_dir) if f.endswith('.cache')])
|
||||
|
||||
return {
|
||||
"name": self.name,
|
||||
"memory_items": memory_size,
|
||||
"disk_items": disk_size,
|
||||
"memory_hits": self.cache_stats["hits"],
|
||||
"disk_hits": self.cache_stats["disk_hits"],
|
||||
"misses": self.cache_stats["misses"],
|
||||
"total_requests": total_requests,
|
||||
"hit_rate": hit_rate,
|
||||
"two_level_hit_rate": (total_hits / total_requests * 100) if total_requests > 0 else 0
|
||||
}
|
||||
|
||||
def _write_to_disk(self, cache_key, cache_data):
|
||||
"""将缓存写入磁盘"""
|
||||
disk_path = self._get_disk_path(cache_key)
|
||||
try:
|
||||
with open(disk_path, 'wb') as f:
|
||||
pickle.dump(cache_data, f)
|
||||
return True
|
||||
except Exception as e:
|
||||
logger.warning(f"写入磁盘缓存失败: {cache_key}, 错误: {e}")
|
||||
return False
|
||||
|
||||
def _disk_writer_task(self):
|
||||
"""后台线程,负责将缓存写入磁盘"""
|
||||
while True:
|
||||
try:
|
||||
# 尝试从队列获取条目,超时后继续循环
|
||||
try:
|
||||
cache_key, cache_data = self.disk_queue.get(timeout=1)
|
||||
self._write_to_disk(cache_key, cache_data)
|
||||
self.disk_queue.task_done()
|
||||
except queue.Empty:
|
||||
time.sleep(0.1)
|
||||
except Exception as e:
|
||||
logger.error(f"磁盘写入线程出错: {e}")
|
||||
time.sleep(5) # 发生错误时等待一段时间
|
||||
|
||||
def _cleanup_and_flush_task(self):
|
||||
"""后台线程,负责清理过期缓存和定期刷新内存缓存到磁盘"""
|
||||
while True:
|
||||
try:
|
||||
# 1. 清理过期的内存缓存
|
||||
current_time = datetime.now()
|
||||
for key in self.memory_cache.get_all_keys():
|
||||
cache_data = self.memory_cache.get(key)
|
||||
if not self._is_cache_valid(cache_data['timestamp']):
|
||||
self.memory_cache.remove(key)
|
||||
|
||||
# 2. 清理过期的磁盘缓存
|
||||
for filename in os.listdir(self.disk_cache_dir):
|
||||
if filename.endswith('.cache'):
|
||||
filepath = os.path.join(self.disk_cache_dir, filename)
|
||||
try:
|
||||
with open(filepath, 'rb') as f:
|
||||
cache_data = pickle.load(f)
|
||||
if not self._is_cache_valid(cache_data['timestamp']):
|
||||
os.remove(filepath)
|
||||
except Exception as e:
|
||||
# 清理损坏的缓存文件
|
||||
logger.warning(f"读取缓存文件失败,将删除: {filepath}, 错误: {e}")
|
||||
os.remove(filepath)
|
||||
|
||||
# 3. 将内存缓存刷新到磁盘
|
||||
# 注意:这会重写已经写入磁盘的缓存,但确保内存和磁盘保持同步
|
||||
for key in self.memory_cache.get_all_keys():
|
||||
cache_data = self.memory_cache.get(key)
|
||||
self._write_to_disk(key, cache_data)
|
||||
|
||||
# 每小时执行一次清理
|
||||
time.sleep(3600)
|
||||
except Exception as e:
|
||||
logger.error(f"缓存清理线程出错: {e}")
|
||||
time.sleep(3600) # 发生错误时也等待一段时间
|
||||
|
||||
# 创建全局缓存实例
|
||||
prediction_cache = PredictionCache()
|
||||
|
||||
# 创建不同领域的缓存实例
|
||||
prediction_cache = CacheManager(name="predictions", memory_capacity=500, cache_duration=24)
|
||||
sentiment_cache = CacheManager(name="sentiment", memory_capacity=1000, cache_duration=12)
|
||||
topic_cache = CacheManager(name="topics", memory_capacity=200, cache_duration=6)
|
||||
user_data_cache = CacheManager(name="user_data", memory_capacity=300, cache_duration=48)
|
||||
|
||||
# 向后兼容的别名
|
||||
PredictionCache = CacheManager
|
||||
# 为保持向后兼容,我们保留原来的prediction_cache
|
||||
prediction_cache_old = prediction_cache
|
||||
Reference in New Issue
Block a user