from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import Rule
from scrapy_redis.spiders import RedisCrawlSpider


class MyCrawler(RedisCrawlSpider):
    """Spider that reads urls from redis queue (mycrawler:start_urls)."""

    name = "mycrawler_redis"
    redis_key = "mycrawler:start_urls"

    rules = (
        # Follow all links and hand every fetched page to ``parse_page``.
        Rule(LinkExtractor(), callback="parse_page", follow=True),
    )

    def __init__(self, *args, **kwargs):
        """Initialise the spider, deriving ``allowed_domains`` from ``domain``.

        Args:
            *args: Positional arguments forwarded to the parent constructor.
            **kwargs: Keyword arguments forwarded to the parent constructor.
                An optional ``domain`` keyword (a comma-separated string of
                domain names) is consumed here and not forwarded.
        """
        # Dynamically define the allowed domains list.
        domain = kwargs.pop("domain", "")
        # Build a real list: on Python 3 ``filter()`` returns a one-shot
        # iterator, so the attribute would look empty to anything iterating
        # it a second time. Entries are stripped so that input such as
        # "a.com, b.com" does not produce " b.com"; blank entries are dropped.
        self.allowed_domains = [
            entry.strip() for entry in domain.split(",") if entry.strip()
        ]
        super().__init__(*args, **kwargs)

    def parse_page(self, response):
        """Build an item from a crawled page.

        Args:
            response: The response object passed to this rule callback.

        Returns:
            A dict with ``"name"`` (the first text node matched by the
            ``title::text`` CSS selector, as returned by ``extract_first()``)
            and ``"url"`` (``response.url``).
        """
        return {
            "name": response.css("title::text").extract_first(),
            "url": response.url,
        }