python/python爬虫/scrapy-redis-0.9.1/example-project/example/settings.py

# Scrapy settings for example project
#
# For simplicity, this file contains only the most important settings by
# default. All the other settings are documented here:
#
#     https://docs.scrapy.org/en/latest/topics/settings.html
#
# Packages Scrapy searches for spiders, and where newly generated spiders go.
SPIDER_MODULES = ["example.spiders"]
NEWSPIDER_MODULE = "example.spiders"

# Single source of truth for the log level. This module used to assign
# LOG_LEVEL twice ("WARNING" first, "DEBUG" later); only the last assignment
# ever took effect, so that effective value is the one kept here.
LOG_LEVEL = "DEBUG"

USER_AGENT = "scrapy-redis (+https://github.com/rolando/scrapy-redis)"

# Duplicate-request filter class provided by scrapy_redis.
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
# Scheduler class provided by scrapy_redis; it is the component that talks to
# the Redis database.
SCHEDULER = "scrapy_redis.scheduler.Scheduler"
# Whether to keep the dedup set and the request queue in Redis after the
# spider finishes.
SCHEDULER_PERSIST = True
# Optional queue implementations; uncomment at most one to select it.
# NOTE(review): judging by the class names these are priority / FIFO / LIFO
# orderings -- confirm against the installed scrapy_redis version.
# SCHEDULER_QUEUE_CLASS = "scrapy_redis.queue.SpiderPriorityQueue"
# SCHEDULER_QUEUE_CLASS = "scrapy_redis.queue.SpiderQueue"
# SCHEDULER_QUEUE_CLASS = "scrapy_redis.queue.SpiderStack"

ITEM_PIPELINES = {
    "example.pipelines.ExamplePipeline": 300,
    # When enabled, this pipeline stores the scraped items in Redis.
    "scrapy_redis.pipelines.RedisPipeline": 400,
}

# Connection URL of the Redis server used by the components above.
REDIS_URL = "redis://127.0.0.1:6379"

# Introduce an artificial delay between downloads so that running several
# crawler processes in parallel is what speeds up the crawl.
DOWNLOAD_DELAY = 1