变更

2025-08-05 09:19:34 +08:00
commit 584548d006
1696 changed files with 53855 additions and 0 deletions
@@ -0,0 +1,37 @@
+# Scrapy settings for example project
+#
+# For simplicity, this file contains only the most important settings by
+# default. All the other settings are documented here:
+#
+#     http://doc.scrapy.org/topics/settings.html
+#
+SPIDER_MODULES = ["example.spiders"]
+NEWSPIDER_MODULE = "example.spiders"
+
+LOG_LEVEL = "WARNING"
+
+USER_AGENT = "scrapy-redis (+https://github.com/rolando/scrapy-redis)"
+
+#设置重复过滤器模块
+DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
+#设置调度器，scrapy_redis具备与数据库交互的功能
+SCHEDULER = "scrapy_redis.scheduler.Scheduler"
+#设置当爬虫结束时是否保持redis数据库中的去重集合与任务队列
+SCHEDULER_PERSIST = True
+# SCHEDULER_QUEUE_CLASS = "scrapy_redis.queue.SpiderPriorityQueue"
+# SCHEDULER_QUEUE_CLASS = "scrapy_redis.queue.SpiderQueue"
+# SCHEDULER_QUEUE_CLASS = "scrapy_redis.queue.SpiderStack"
+
+ITEM_PIPELINES = {
+    "example.pipelines.ExamplePipeline": 300,
+    #当开启该管道，该管道将会把数据存到redis数据库中
+    "scrapy_redis.pipelines.RedisPipeline": 400,
+}
+#设置redis数据库
+REDIS_URL = "redis://127.0.0.1:6379"
+
+LOG_LEVEL = "DEBUG"
+
+# Introduce an artifical delay to make use of parallelism. to speed up the
+# crawl.
+DOWNLOAD_DELAY = 1