29 lines
831 B
Python
29 lines
831 B
Python
from scrapy.linkextractors import LinkExtractor
|
|
from scrapy.spiders import Rule
|
|
|
|
from scrapy_redis.spiders import RedisCrawlSpider
|
|
|
|
|
|
class MyCrawler(RedisCrawlSpider):
    """Spider that reads urls from redis queue (mycrawler:start_urls).

    Every link extracted by the single crawl rule is followed, and each
    fetched page is handed to :meth:`parse_page`.

    The set of allowed domains is not hard-coded; it is supplied at
    construction time through the ``domain`` keyword argument as a
    comma-separated string (e.g. ``domain="example.com,example.org"``).
    """

    name = "mycrawler_redis"
    # Key under which start URLs are looked up; keep the class docstring in
    # sync with this value.
    redis_key = "mycrawler:start_urls"

    rules = (
        # follow all links
        Rule(LinkExtractor(), callback="parse_page", follow=True),
    )

    def __init__(self, *args, **kwargs):
        """Initialise the spider and derive ``allowed_domains``.

        Args:
            *args: Positional arguments forwarded unchanged to the parent
                spider constructor.
            **kwargs: Keyword arguments forwarded to the parent spider
                constructor, except for ``domain`` which is consumed here.
                ``domain`` is a comma-separated string of domain names;
                empty entries and surrounding whitespace are discarded.
                A missing, empty, or ``None`` value yields an empty list.
        """
        # Dynamically define the allowed domains list.
        domain = kwargs.pop("domain", "") or ""
        # Build a concrete list rather than a lazy ``filter`` object: in
        # Python 3 a ``filter`` iterator is always truthy (so "no domains"
        # cannot be detected with a truthiness check) and is exhausted after
        # a single pass, leaving later readers with nothing.
        self.allowed_domains = [
            entry.strip() for entry in domain.split(",") if entry.strip()
        ]
        super().__init__(*args, **kwargs)

    def parse_page(self, response):
        """Build the scraped item for one crawled page.

        Args:
            response: The response object for the fetched page.

        Returns:
            A dict with ``"name"`` set to the first text node matched by the
            ``title::text`` CSS selector and ``"url"`` set to
            ``response.url``.
        """
        return {
            "name": response.css("title::text").extract_first(),
            "url": response.url,
        }
|