🚀 Major Upgrade! Visual Workflow Orchestrator and AI-Powered Crawler Implemented. Added Model Arena Feature and Efficiency Optimizations (Two-Level Caching Architecture + End-to-End Performance Enhancements).

This commit is contained in:
戒酒的李白
2025-03-13 13:14:35 +08:00
parent ee5372941a
commit 0c6a40b869
12 changed files with 5688 additions and 78 deletions
+262 -77
View File
@@ -1,116 +1,301 @@
import json
import os
import time
import shutil
from datetime import datetime, timedelta
import threading
import queue
from collections import OrderedDict
import pickle
import hashlib
import logging
class PredictionCache:
logger = logging.getLogger('cache_manager')
logger.setLevel(logging.INFO)
class LRUCache:
"""实现LRU (Least Recently Used) 缓存策略"""
def __init__(self, capacity):
self.cache = OrderedDict()
self.capacity = capacity
def get(self, key):
if key not in self.cache:
return None
# 访问元素时,将其移至末尾,表示最近使用
self.cache.move_to_end(key)
return self.cache[key]
def put(self, key, value):
# 如果键已存在,更新值并将其移至末尾
if key in self.cache:
self.cache[key] = value
self.cache.move_to_end(key)
return
# 如果缓存已满,删除最久未使用的项(OrderedDict 的首项)
if len(self.cache) >= self.capacity:
self.cache.popitem(last=False)
# 添加新项至末尾
self.cache[key] = value
def remove(self, key):
if key in self.cache:
del self.cache[key]
def clear(self):
self.cache.clear()
def __len__(self):
return len(self.cache)
def get_all_keys(self):
return list(self.cache.keys())
class CacheManager:
"""两级缓存系统:内存LRU缓存 + 磁盘持久化缓存"""
_instance = None
_lock = threading.Lock()
def __new__(cls):
def __new__(cls, *args, **kwargs):
with cls._lock:
if cls._instance is None:
cls._instance = super(PredictionCache, cls).__new__(cls)
cls._instance = super(CacheManager, cls).__new__(cls)
return cls._instance
def __init__(self):
if not hasattr(self, 'initialized'):
self.cache_dir = 'cache/predictions'
self.cache_duration = timedelta(hours=24) # 缓存24小时
self.cache = {}
self.cache_queue = queue.Queue()
def __init__(self, name="default", memory_capacity=1000, cache_duration=24,
disk_cache_dir="cache", flush_interval=5):
if hasattr(self, 'initialized'):
return
self.name = name
self.memory_cache = LRUCache(memory_capacity)
self.disk_cache_dir = os.path.join(disk_cache_dir, name)
self.cache_duration = timedelta(hours=cache_duration)
self.flush_interval = flush_interval # 定时将内存缓存刷新到磁盘的间隔(分钟)
self.cache_stats = {"hits": 0, "misses": 0, "disk_hits": 0}
self.disk_queue = queue.Queue()
self.initialized = True
# 确保缓存目录存在
os.makedirs(self.cache_dir, exist_ok=True)
os.makedirs(self.disk_cache_dir, exist_ok=True)
# 启动缓存理线程
self.cleanup_thread = threading.Thread(target=self._cleanup_old_cache, daemon=True)
# 启动缓存理线程
self.cleanup_thread = threading.Thread(target=self._cleanup_and_flush_task, daemon=True)
self.cleanup_thread.start()
# 加载现有缓存
self._load_cache()
# 启动磁盘写入线程
self.disk_writer_thread = threading.Thread(target=self._disk_writer_task, daemon=True)
self.disk_writer_thread.start()
logger.info(f"初始化缓存管理器: {name},内存容量: {memory_capacity}项,缓存时间: {cache_duration}小时")
def _load_cache(self):
"""加载磁盘上的缓存文件"""
try:
for filename in os.listdir(self.cache_dir):
if filename.endswith('.json'):
filepath = os.path.join(self.cache_dir, filename)
with open(filepath, 'r', encoding='utf-8') as f:
cache_data = json.load(f)
# 检查缓存是否过期
if self._is_cache_valid(cache_data['timestamp']):
topic = filename[:-5] # 移除.json后缀
self.cache[topic] = cache_data
else:
# 删除过期缓存文件
os.remove(filepath)
except Exception as e:
print(f"加载缓存失败: {e}")
def _get_cache_key(self, key):
"""标准化缓存键"""
if isinstance(key, str):
return key
return hashlib.md5(str(key).encode()).hexdigest()
def _cleanup_old_cache(self):
"""定期清理过期缓存的后台线程"""
while True:
try:
# 检查并清理内存缓存
current_time = datetime.now()
expired_topics = []
for topic, cache_data in self.cache.items():
if not self._is_cache_valid(cache_data['timestamp']):
expired_topics.append(topic)
# 删除过期缓存
for topic in expired_topics:
del self.cache[topic]
cache_file = os.path.join(self.cache_dir, f"{topic}.json")
if os.path.exists(cache_file):
os.remove(cache_file)
# 休眠1小时后再次检查
time.sleep(3600)
except Exception as e:
print(f"清理缓存时出错: {e}")
time.sleep(3600) # 发生错误时也等待1小时
def _get_disk_path(self, key):
"""获取磁盘缓存路径"""
safe_key = self._get_cache_key(key)
return os.path.join(self.disk_cache_dir, f"{safe_key}.cache")
def _is_cache_valid(self, timestamp):
"""检查缓存是否有效"""
"""检查缓存是否过期"""
cache_time = datetime.fromtimestamp(timestamp)
return datetime.now() - cache_time < self.cache_duration
def get(self, topic):
"""获取话题的预测缓存"""
if topic in self.cache and self._is_cache_valid(self.cache[topic]['timestamp']):
return self.cache[topic]['prediction']
def get(self, key):
"""获取缓存数据,首先检查内存,然后检查磁盘"""
cache_key = self._get_cache_key(key)
# 1. 检查内存缓存
cache_data = self.memory_cache.get(cache_key)
if cache_data is not None:
if self._is_cache_valid(cache_data['timestamp']):
self.cache_stats["hits"] += 1
logger.debug(f"内存缓存命中: {key}")
return cache_data['data']
else:
# 过期缓存,从内存中删除
self.memory_cache.remove(cache_key)
# 2. 检查磁盘缓存
disk_path = self._get_disk_path(cache_key)
if os.path.exists(disk_path):
try:
with open(disk_path, 'rb') as f:
cache_data = pickle.load(f)
if self._is_cache_valid(cache_data['timestamp']):
# 从磁盘加载后,放入内存缓存
self.memory_cache.put(cache_key, cache_data)
self.cache_stats["disk_hits"] += 1
logger.debug(f"磁盘缓存命中: {key}")
return cache_data['data']
else:
# 过期缓存,删除磁盘文件
os.remove(disk_path)
except Exception as e:
logger.warning(f"读取磁盘缓存失败: {key}, 错误: {e}")
self.cache_stats["misses"] += 1
logger.debug(f"缓存未命中: {key}")
return None
def set(self, topic, prediction):
"""设置话题的预测缓存"""
def set(self, key, data, immediate_disk_write=False):
"""设置缓存数据,同时更新内存和安排磁盘写入"""
cache_key = self._get_cache_key(key)
cache_data = {
'prediction': prediction,
'data': data,
'timestamp': datetime.now().timestamp()
}
# 更新内存缓存
self.cache[topic] = cache_data
self.memory_cache.put(cache_key, cache_data)
# 异步保存到磁盘
self.cache_queue.put((topic, cache_data))
threading.Thread(target=self._save_cache_to_disk, daemon=True).start()
# 安排写入磁盘
if immediate_disk_write:
self._write_to_disk(cache_key, cache_data)
else:
self.disk_queue.put((cache_key, cache_data))
logger.debug(f"缓存已设置: {key}")
return True
def _save_cache_to_disk(self):
"""异步保存缓存到磁盘"""
def invalidate(self, key):
"""使指定键的缓存失效"""
cache_key = self._get_cache_key(key)
# 从内存中删除
self.memory_cache.remove(cache_key)
# 从磁盘中删除
disk_path = self._get_disk_path(cache_key)
if os.path.exists(disk_path):
try:
os.remove(disk_path)
logger.debug(f"缓存已失效: {key}")
except Exception as e:
logger.warning(f"删除磁盘缓存失败: {key}, 错误: {e}")
return True
def clear_all(self):
"""清除所有缓存"""
# 清除内存缓存
self.memory_cache.clear()
# 清除磁盘缓存
try:
while not self.cache_queue.empty():
topic, cache_data = self.cache_queue.get()
cache_file = os.path.join(self.cache_dir, f"{topic}.json")
with open(cache_file, 'w', encoding='utf-8') as f:
json.dump(cache_data, f, ensure_ascii=False, indent=2)
shutil.rmtree(self.disk_cache_dir)
os.makedirs(self.disk_cache_dir, exist_ok=True)
logger.info(f"所有缓存已清除: {self.name}")
except Exception as e:
print(f"保存缓存到磁盘失败: {e}")
logger.error(f"清除磁盘缓存失败: {e}")
# 重置统计信息
self.cache_stats = {"hits": 0, "misses": 0, "disk_hits": 0}
return True
def get_stats(self):
"""获取缓存统计信息"""
total_requests = self.cache_stats["hits"] + self.cache_stats["misses"]
hit_rate = (self.cache_stats["hits"] / total_requests * 100) if total_requests > 0 else 0
total_hits = self.cache_stats["hits"] + self.cache_stats["disk_hits"]
memory_size = len(self.memory_cache)
disk_size = len([f for f in os.listdir(self.disk_cache_dir) if f.endswith('.cache')])
return {
"name": self.name,
"memory_items": memory_size,
"disk_items": disk_size,
"memory_hits": self.cache_stats["hits"],
"disk_hits": self.cache_stats["disk_hits"],
"misses": self.cache_stats["misses"],
"total_requests": total_requests,
"hit_rate": hit_rate,
"two_level_hit_rate": (total_hits / total_requests * 100) if total_requests > 0 else 0
}
def _write_to_disk(self, cache_key, cache_data):
"""将缓存写入磁盘"""
disk_path = self._get_disk_path(cache_key)
try:
with open(disk_path, 'wb') as f:
pickle.dump(cache_data, f)
return True
except Exception as e:
logger.warning(f"写入磁盘缓存失败: {cache_key}, 错误: {e}")
return False
def _disk_writer_task(self):
"""后台线程,负责将缓存写入磁盘"""
while True:
try:
# 尝试从队列获取条目,超时后继续循环
try:
cache_key, cache_data = self.disk_queue.get(timeout=1)
self._write_to_disk(cache_key, cache_data)
self.disk_queue.task_done()
except queue.Empty:
time.sleep(0.1)
except Exception as e:
logger.error(f"磁盘写入线程出错: {e}")
time.sleep(5) # 发生错误时等待一段时间
def _cleanup_and_flush_task(self):
"""后台线程,负责清理过期缓存和定期刷新内存缓存到磁盘"""
while True:
try:
# 1. 清理过期的内存缓存
current_time = datetime.now()
for key in self.memory_cache.get_all_keys():
cache_data = self.memory_cache.get(key)
if not self._is_cache_valid(cache_data['timestamp']):
self.memory_cache.remove(key)
# 2. 清理过期的磁盘缓存
for filename in os.listdir(self.disk_cache_dir):
if filename.endswith('.cache'):
filepath = os.path.join(self.disk_cache_dir, filename)
try:
with open(filepath, 'rb') as f:
cache_data = pickle.load(f)
if not self._is_cache_valid(cache_data['timestamp']):
os.remove(filepath)
except Exception as e:
# 清理损坏的缓存文件
logger.warning(f"读取缓存文件失败,将删除: {filepath}, 错误: {e}")
os.remove(filepath)
# 3. 将内存缓存刷新到磁盘
# 注意:这会重写已经写入磁盘的缓存,但确保内存和磁盘保持同步
for key in self.memory_cache.get_all_keys():
cache_data = self.memory_cache.get(key)
self._write_to_disk(key, cache_data)
# 每小时执行一次清理
time.sleep(3600)
except Exception as e:
logger.error(f"缓存清理线程出错: {e}")
time.sleep(3600) # 发生错误时也等待一段时间
# 创建全局缓存实例
prediction_cache = PredictionCache()
# 创建不同领域的缓存实例
prediction_cache = CacheManager(name="predictions", memory_capacity=500, cache_duration=24)
sentiment_cache = CacheManager(name="sentiment", memory_capacity=1000, cache_duration=12)
topic_cache = CacheManager(name="topics", memory_capacity=200, cache_duration=6)
user_data_cache = CacheManager(name="user_data", memory_capacity=300, cache_duration=48)
# 向后兼容的别名
PredictionCache = CacheManager
# 为保持向后兼容,我们保留原来的prediction_cache
prediction_cache_old = prediction_cache