变更
This commit is contained in:
@@ -0,0 +1,28 @@
|
||||
from scrapy.linkextractors import LinkExtractor
|
||||
from scrapy.spiders import Rule
|
||||
|
||||
from scrapy_redis.spiders import RedisCrawlSpider
|
||||
|
||||
|
||||
class MyCrawler(RedisCrawlSpider):
    """Crawl spider that reads its start URLs from a Redis queue.

    The queue key is ``mycrawler:start_urls`` (see ``redis_key``). Every
    link found on a crawled page is followed, and each page is passed to
    :meth:`parse_page`.

    The set of allowed domains is supplied at construction time through
    the ``domain`` keyword argument, a comma-separated string such as
    ``"example.com,example.org"``.
    """

    name = "mycrawler_redis"
    redis_key = "mycrawler:start_urls"

    rules = (
        # Follow all links and hand every fetched page to ``parse_page``.
        Rule(LinkExtractor(), callback="parse_page", follow=True),
    )

    def __init__(self, *args, **kwargs):
        """Initialise the spider and build ``allowed_domains`` dynamically.

        Args:
            *args: Positional arguments forwarded to the parent spider.
            **kwargs: Keyword arguments forwarded to the parent spider.
                The optional ``domain`` entry is removed before
                forwarding; it is a comma-separated list of domains the
                crawl is restricted to. Missing or empty means no
                domains are listed.
        """
        # ``or ""`` tolerates an explicit ``domain=None``.
        domain = kwargs.pop("domain", "") or ""
        # Build a real list: on Python 3 ``filter()`` returns a one-shot
        # iterator, which would be empty after its first traversal and
        # does not support ``len()`` or indexing. Whitespace around each
        # entry is stripped so "a.com, b.com" yields two usable domains,
        # and empty entries are dropped.
        self.allowed_domains = [
            part.strip() for part in domain.split(",") if part.strip()
        ]
        super().__init__(*args, **kwargs)

    def parse_page(self, response):
        """Extract the page title and URL from a crawled response.

        Args:
            response: The response object for the fetched page.

        Returns:
            A dict with ``"name"`` (the text of the first ``<title>``
            match, or ``None`` when there is none) and ``"url"`` (the
            response URL).
        """
        return {
            "name": response.css("title::text").extract_first(),
            "url": response.url,
        }
|
||||
Reference in New Issue
Block a user