变更
This commit is contained in:
@@ -0,0 +1,24 @@
|
||||
# Define here the models for your scraped items
|
||||
#
|
||||
# See documentation in:
|
||||
# http://doc.scrapy.org/topics/items.html
|
||||
|
||||
from scrapy.item import Field, Item
|
||||
from scrapy.loader import ItemLoader
|
||||
from scrapy.loader.processors import Join, MapCompose, TakeFirst
|
||||
|
||||
|
||||
class ExampleItem(Item):
    """Container for one scraped record of the example project.

    Every attribute below is declared as a plain ``Field()`` with no
    per-field metadata.
    """

    # Values extracted from the crawled page.
    name = Field()
    description = Field()
    link = Field()

    # Bookkeeping values (crawl timestamp and originating spider name).
    crawled = Field()
    spider = Field()

    # Address of the page the record came from.
    url = Field()
|
||||
|
||||
|
||||
class ExampleLoader(ItemLoader):
    """Item loader that populates :class:`ExampleItem` instances.

    By default every input value has ``.strip()`` called on it and the
    output is reduced with ``TakeFirst()``; the ``description`` field
    overrides the output step with ``Join()``.
    """

    default_item_class = ExampleItem

    # Applied to each incoming value: remove surrounding whitespace.
    default_input_processor = MapCompose(lambda value: value.strip())

    # Applied when a field is read out, unless a field-specific
    # ``<field>_out`` processor is declared.
    default_output_processor = TakeFirst()

    # Field-specific output processor for ``description``.
    description_out = Join()
|
||||
@@ -0,0 +1,12 @@
|
||||
# Define your item pipelines here
|
||||
#
|
||||
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
|
||||
# See: http://doc.scrapy.org/topics/item-pipeline.html
|
||||
from datetime import datetime
|
||||
|
||||
|
||||
class ExamplePipeline:
    """Item pipeline that stamps each item with crawl metadata."""

    def process_item(self, item, spider):
        """Record when and by which spider ``item`` was produced.

        Args:
            item: Mutable mapping-like item; its ``"crawled"`` and
                ``"spider"`` keys are overwritten.
            spider: Object exposing a ``name`` attribute.

        Returns:
            The same ``item`` object, after mutation.
        """
        # Imported locally so this block does not depend on a change to the
        # module-level import line.
        from datetime import timezone

        # ``datetime.utcnow()`` is deprecated since Python 3.12. Read the
        # clock as an aware UTC value, then drop the tzinfo so the stored
        # timestamp stays a naive UTC datetime exactly as before.
        item["crawled"] = datetime.now(timezone.utc).replace(tzinfo=None)
        item["spider"] = spider.name
        return item
|
||||
@@ -0,0 +1,37 @@
|
||||
# Scrapy settings for example project
|
||||
#
|
||||
# For simplicity, this file contains only the most important settings by
|
||||
# default. All the other settings are documented here:
|
||||
#
|
||||
# http://doc.scrapy.org/topics/settings.html
|
||||
#
|
||||
SPIDER_MODULES = ["example.spiders"]
NEWSPIDER_MODULE = "example.spiders"

USER_AGENT = "scrapy-redis (+https://github.com/rolando/scrapy-redis)"

# Duplicate-filter class to use.
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
# Scheduler class to use; the scrapy_redis scheduler is able to interact
# with the database.
SCHEDULER = "scrapy_redis.scheduler.Scheduler"
# Whether to keep the de-duplication set and the task queue in the redis
# database when the spider finishes.
SCHEDULER_PERSIST = True
# SCHEDULER_QUEUE_CLASS = "scrapy_redis.queue.SpiderPriorityQueue"
# SCHEDULER_QUEUE_CLASS = "scrapy_redis.queue.SpiderQueue"
# SCHEDULER_QUEUE_CLASS = "scrapy_redis.queue.SpiderStack"

ITEM_PIPELINES = {
    "example.pipelines.ExamplePipeline": 300,
    # When this pipeline is enabled, it stores the data in the redis
    # database.
    "scrapy_redis.pipelines.RedisPipeline": 400,
}

# Redis database to connect to.
REDIS_URL = "redis://127.0.0.1:6379"

# Single, authoritative log level. This module used to assign LOG_LEVEL
# twice ("WARNING" first, then "DEBUG"); only the last assignment ever took
# effect, so that effective value is the one kept.
LOG_LEVEL = "DEBUG"

# Introduce an artificial delay to make use of parallelism to speed up the
# crawl.
DOWNLOAD_DELAY = 1
|
||||
@@ -0,0 +1,8 @@
|
||||
# This package will contain the spiders of your Scrapy project
|
||||
#
|
||||
# To create the first spider for your project use this command:
|
||||
#
|
||||
# scrapy genspider myspider myspider-domain.com
|
||||
#
|
||||
# For more info see:
|
||||
# http://doc.scrapy.org/topics/spiders.html
|
||||
@@ -0,0 +1,26 @@
|
||||
from scrapy.linkextractors import LinkExtractor
|
||||
from scrapy.spiders import CrawlSpider, Rule
|
||||
|
||||
|
||||
class DmozSpider(CrawlSpider):
    """Follow categories and extract links."""

    name = "dmoz"
    allowed_domains = ["dmoztools.net"]
    start_urls = ["http://www.dmoztools.net/"]

    rules = [
        # Follow links found inside the category containers and hand every
        # fetched page to ``parse_directory``.
        Rule(
            LinkExtractor(restrict_css=(".top-cat", ".sub-cat", ".cat-item")),
            callback="parse_directory",
            follow=True,
        ),
    ]

    def parse_directory(self, response):
        """Yield one dict per ``.title-and-desc`` element in ``response``.

        Each dict carries the keys ``"name"``, ``"description"`` and
        ``"link"``. ``"name"`` and ``"link"`` are ``None`` when the
        corresponding node is missing; ``"description"`` is the stripped
        text, or ``""`` when the entry has no description text.
        """
        for div in response.css(".title-and-desc"):
            # ``extract_first()`` returns None when nothing matches, which
            # made the former unconditional ``.strip()`` raise
            # AttributeError; an empty-string default keeps the crawl going.
            description = div.css(".site-descr::text").extract_first(default="")
            yield {
                "name": div.css(".site-title::text").extract_first(),
                "description": description.strip(),
                "link": div.css("a::attr(href)").extract_first(),
            }
|
||||
@@ -0,0 +1,28 @@
|
||||
from scrapy.linkextractors import LinkExtractor
|
||||
from scrapy.spiders import Rule
|
||||
|
||||
from scrapy_redis.spiders import RedisCrawlSpider
|
||||
|
||||
|
||||
class MyCrawler(RedisCrawlSpider):
    """Spider that reads urls from redis queue (mycrawler:start_urls)."""

    name = "mycrawler_redis"
    redis_key = "mycrawler:start_urls"

    rules = (
        # follow all links
        Rule(LinkExtractor(), callback="parse_page", follow=True),
    )

    def __init__(self, *args, **kwargs):
        """Initialise the spider, optionally restricting the crawl domains.

        Args:
            *args: Forwarded unchanged to the parent constructor.
            **kwargs: Forwarded to the parent constructor, except for
                ``domain``: a comma-separated string of domain names that
                is removed from ``kwargs`` and used to build
                ``allowed_domains``.
        """
        # Dynamically define the allowed domains list.
        domain = kwargs.pop("domain", "")
        # Build a real list: in Python 3 ``filter()`` returns a single-use
        # iterator that is always truthy, so it can be consumed only once
        # and never looks "empty". Blank entries (e.g. from "a,,b" or the
        # default "") are dropped.
        self.allowed_domains = [part for part in domain.split(",") if part]
        super().__init__(*args, **kwargs)

    def parse_page(self, response):
        """Return a dict with the page ``<title>`` text and the page URL."""
        return {
            "name": response.css("title::text").extract_first(),
            "url": response.url,
        }
|
||||
@@ -0,0 +1,20 @@
|
||||
from scrapy_redis.spiders import RedisSpider
|
||||
|
||||
|
||||
class MySpider(RedisSpider):
    """Spider that reads urls from redis queue (myspider:start_urls)."""

    name = "myspider_redis"
    redis_key = "myspider:start_urls"

    def __init__(self, *args, **kwargs):
        """Initialise the spider, optionally restricting the crawl domains.

        Args:
            *args: Forwarded unchanged to the parent constructor.
            **kwargs: Forwarded to the parent constructor, except for
                ``domain``: a comma-separated string of domain names that
                is removed from ``kwargs`` and used to build
                ``allowed_domains``.
        """
        # Dynamically define the allowed domains list.
        domain = kwargs.pop("domain", "")
        # Build a real list: in Python 3 ``filter()`` returns a single-use
        # iterator that is always truthy, so it can be consumed only once
        # and never looks "empty". Blank entries (e.g. from "a,,b" or the
        # default "") are dropped.
        self.allowed_domains = [part for part in domain.split(",") if part]
        super().__init__(*args, **kwargs)

    def parse(self, response):
        """Return a dict with the page ``<title>`` text and the page URL."""
        return {
            "name": response.css("title::text").extract_first(),
            "url": response.url,
        }
|
||||
Reference in New Issue
Block a user