Files
2025-08-05 09:19:34 +08:00

29 lines
831 B
Python

from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import Rule
from scrapy_redis.spiders import RedisCrawlSpider
class MyCrawler(RedisCrawlSpider):
    """Crawl spider that reads its start URLs from a Redis queue.

    Start URLs are taken from the Redis key named by ``redis_key``
    (``mycrawler:start_urls``). Every link extracted from a response is
    followed and the response is handed to :meth:`parse_page`.

    The set of allowed domains is not hard-coded; it is supplied at
    construction time through the ``domain`` keyword argument as a
    comma-separated string (e.g. ``domain="example.com,example.org"``).
    """

    name = "mycrawler_redis"
    redis_key = "mycrawler:start_urls"

    rules = (
        # Follow all links and run parse_page on every fetched page.
        Rule(LinkExtractor(), callback="parse_page", follow=True),
    )

    def __init__(self, *args, **kwargs) -> None:
        """Build the spider and derive ``allowed_domains`` from ``domain``.

        Args:
            *args: Forwarded unchanged to the parent constructor.
            **kwargs: Forwarded to the parent constructor after the optional
                ``domain`` entry has been removed. ``domain`` is a
                comma-separated string of domain names; it defaults to an
                empty string, which yields an empty ``allowed_domains`` list.
        """
        # Dynamically define the allowed domains list.
        domain = kwargs.pop("domain", "")
        # Materialize a real list: in Python 3 ``filter`` returns a one-shot
        # iterator, so the domains would vanish after the first iteration.
        # Surrounding whitespace is stripped so "a.com, b.com" works, and
        # empty entries (e.g. from a trailing comma) are dropped.
        self.allowed_domains = [
            entry.strip() for entry in domain.split(",") if entry.strip()
        ]
        super().__init__(*args, **kwargs)

    def parse_page(self, response) -> dict:
        """Extract the page title and URL from a crawled response.

        Args:
            response: The response object passed in by the crawl rule.

        Returns:
            A dict with ``"name"`` set to the first ``<title>`` text node
            selected from the response and ``"url"`` set to ``response.url``.
        """
        return {
            "name": response.css("title::text").extract_first(),
            "url": response.url,
        }