91 lines
3.1 KiB
Python
91 lines
3.1 KiB
Python
from datetime import datetime
|
|
|
|
from scrapy.statscollectors import StatsCollector
|
|
|
|
from .connection import from_settings as redis_from_settings
|
|
from .defaults import SCHEDULER_PERSIST, STATS_KEY
|
|
from .utils import convert_bytes_to_str
|
|
|
|
|
|
class RedisStatsCollector(StatsCollector):
|
|
"""
|
|
Stats Collector based on Redis
|
|
"""
|
|
|
|
def __init__(self, crawler, spider=None):
|
|
super().__init__(crawler)
|
|
self.server = redis_from_settings(crawler.settings)
|
|
self.spider = spider
|
|
self.spider_name = spider.name if spider else crawler.spidercls.name
|
|
self.stats_key = crawler.settings.get("STATS_KEY", STATS_KEY)
|
|
self.persist = crawler.settings.get("SCHEDULER_PERSIST", SCHEDULER_PERSIST)
|
|
|
|
def _get_key(self, spider=None):
|
|
"""Return the hash name of stats"""
|
|
if spider:
|
|
return self.stats_key % {"spider": spider.name}
|
|
if self.spider:
|
|
return self.stats_key % {"spider": self.spider.name}
|
|
return self.stats_key % {"spider": self.spider_name or "scrapy"}
|
|
|
|
@classmethod
|
|
def from_crawler(cls, crawler):
|
|
return cls(crawler)
|
|
|
|
@classmethod
|
|
def from_spider(cls, spider):
|
|
return cls(spider.crawler)
|
|
|
|
def get_value(self, key, default=None, spider=None):
|
|
"""Return the value of hash stats"""
|
|
if self.server.hexists(self._get_key(spider), key):
|
|
return int(self.server.hget(self._get_key(spider), key))
|
|
else:
|
|
return default
|
|
|
|
def get_stats(self, spider=None):
|
|
"""Return the all of the values of hash stats"""
|
|
stats = self.server.hgetall(self._get_key(spider))
|
|
if stats:
|
|
return convert_bytes_to_str(stats)
|
|
return {}
|
|
|
|
def set_value(self, key, value, spider=None):
|
|
"""Set the value according to hash key of stats"""
|
|
if isinstance(value, datetime):
|
|
value = value.timestamp()
|
|
self.server.hset(self._get_key(spider), key, value)
|
|
|
|
def set_stats(self, stats, spider=None):
|
|
"""Set all the hash stats"""
|
|
self.server.hmset(self._get_key(spider), stats)
|
|
|
|
def inc_value(self, key, count=1, start=0, spider=None):
|
|
"""Set increment of value according to key"""
|
|
if not self.server.hexists(self._get_key(spider), key):
|
|
self.set_value(key, start)
|
|
self.server.hincrby(self._get_key(spider), key, count)
|
|
|
|
def max_value(self, key, value, spider=None):
|
|
"""Set max value between current and new value"""
|
|
self.set_value(key, max(self.get_value(key, value), value))
|
|
|
|
def min_value(self, key, value, spider=None):
|
|
"""Set min value between current and new value"""
|
|
self.set_value(key, min(self.get_value(key, value), value))
|
|
|
|
def clear_stats(self, spider=None):
|
|
"""Clear all the hash stats"""
|
|
self.server.delete(self._get_key(spider))
|
|
|
|
def open_spider(self, spider):
|
|
"""Set spider to self"""
|
|
if spider:
|
|
self.spider = spider
|
|
|
|
def close_spider(self, spider, reason):
|
|
"""Clear spider and clear stats"""
|
|
self.spider = None
|
|
if not self.persist:
|
|
self.clear_stats(spider)
|