变更

2025-08-05 09:19:34 +08:00
commit 584548d006
1696 changed files with 53855 additions and 0 deletions
@@ -0,0 +1,24 @@
+# Define here the models for your scraped items
+#
+# See documentation in:
+# http://doc.scrapy.org/topics/items.html
+
+from scrapy.item import Field, Item
+from scrapy.loader import ItemLoader
+from scrapy.loader.processors import Join, MapCompose, TakeFirst
+
+
+class ExampleItem(Item):
+    """Scrapy item declaring the fields used by the example project."""
+
+    # Scraped content fields.
+    name = Field()
+    description = Field()
+    link = Field()
+    # Bookkeeping fields; ExamplePipeline.process_item assigns "crawled" and
+    # "spider" on every item it handles.
+    crawled = Field()
+    spider = Field()
+    url = Field()
+
+
+class ExampleLoader(ItemLoader):
+    """Item loader that populates ExampleItem instances.
+
+    By default every input value is passed through ``strip()`` and the output
+    is reduced with ``TakeFirst``; the ``description`` field is instead
+    combined with ``Join``.
+    """
+
+    default_item_class = ExampleItem
+    # Applied to each incoming value of every field (values must offer .strip()).
+    default_input_processor = MapCompose(lambda s: s.strip())
+    default_output_processor = TakeFirst()
+    # Field-specific output processor: overrides the default for "description".
+    description_out = Join()
@@ -0,0 +1,12 @@
+# Define your item pipelines here
+#
+# Don't forget to add your pipeline to the ITEM_PIPELINES setting
+# See: http://doc.scrapy.org/topics/item-pipeline.html
+from datetime import datetime
+
+
+class ExamplePipeline:
+    """Item pipeline that stamps items with crawl metadata."""
+
+    def process_item(self, item, spider):
+        """Record when and by which spider *item* was scraped.
+
+        The item is modified in place: ``"crawled"`` receives the current
+        naive UTC ``datetime`` and ``"spider"`` receives ``spider.name``.
+        The same item object is returned so later pipelines can continue
+        processing it.
+        """
+        crawl_time = datetime.utcnow()
+        item["crawled"] = crawl_time
+        spider_name = spider.name
+        item["spider"] = spider_name
+        return item
@@ -0,0 +1,37 @@
+# Scrapy settings for example project
+#
+# For simplicity, this file contains only the most important settings by
+# default. All the other settings are documented here:
+#
+#     http://doc.scrapy.org/topics/settings.html
+#
+SPIDER_MODULES = ["example.spiders"]
+NEWSPIDER_MODULE = "example.spiders"
+
+LOG_LEVEL = "WARNING"  # NOTE(review): overridden by the LOG_LEVEL = "DEBUG" assignment below
+
+USER_AGENT = "scrapy-redis (+https://github.com/rolando/scrapy-redis)"
+
+# Duplicate-request filter implementation (provided by scrapy_redis).
+DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
+# Scheduler implementation; the scrapy_redis scheduler talks to the redis database.
+SCHEDULER = "scrapy_redis.scheduler.Scheduler"
+# Whether to keep the de-duplication set and the request queue in redis after the spider finishes.
+SCHEDULER_PERSIST = True
+# SCHEDULER_QUEUE_CLASS = "scrapy_redis.queue.SpiderPriorityQueue"
+# SCHEDULER_QUEUE_CLASS = "scrapy_redis.queue.SpiderQueue"
+# SCHEDULER_QUEUE_CLASS = "scrapy_redis.queue.SpiderStack"
+
+ITEM_PIPELINES = {
+    "example.pipelines.ExamplePipeline": 300,
+    # When enabled, this pipeline stores the scraped items in the redis database.
+    "scrapy_redis.pipelines.RedisPipeline": 400,
+}
+# Connection URL of the redis database.
+REDIS_URL = "redis://127.0.0.1:6379"
+
+LOG_LEVEL = "DEBUG"  # NOTE(review): this later assignment wins over "WARNING" above; confirm which level is intended
+
+# Introduce an artificial delay to make use of parallelism to speed up the
+# crawl.
+DOWNLOAD_DELAY = 1
@@ -0,0 +1,8 @@
+# This package will contain the spiders of your Scrapy project
+#
+# To create the first spider for your project use this command:
+#
+#   scrapy genspider myspider myspider-domain.com
+#
+# For more info see:
+# http://doc.scrapy.org/topics/spiders.html
@@ -0,0 +1,26 @@
+from scrapy.linkextractors import LinkExtractor
+from scrapy.spiders import CrawlSpider, Rule
+
+
+class DmozSpider(CrawlSpider):
+    """Follow categories and extract links."""
+
+    name = "dmoz"
+    allowed_domains = ["dmoztools.net"]
+    start_urls = ["http://www.dmoztools.net/"]
+
+    rules = [
+        Rule(
+            LinkExtractor(restrict_css=(".top-cat", ".sub-cat", ".cat-item")),
+            callback="parse_directory",
+            follow=True,
+        ),
+    ]
+
+    def parse_directory(self, response):
+        """Yield one dict per ``.title-and-desc`` entry found in *response*.
+
+        Each dict has the keys ``"name"``, ``"description"`` and ``"link"``.
+        ``"description"`` is whitespace-stripped and is ``""`` when the entry
+        has no ``.site-descr`` text; ``"name"`` and ``"link"`` are ``None``
+        when their selector matches nothing.
+        """
+        for div in response.css(".title-and-desc"):
+            # extract_first() returns None when nothing matches; fall back to
+            # an empty string so a missing description cannot crash strip().
+            description = div.css(".site-descr::text").extract_first() or ""
+            yield {
+                "name": div.css(".site-title::text").extract_first(),
+                "description": description.strip(),
+                "link": div.css("a::attr(href)").extract_first(),
+            }
@@ -0,0 +1,28 @@
+from scrapy.linkextractors import LinkExtractor
+from scrapy.spiders import Rule
+
+from scrapy_redis.spiders import RedisCrawlSpider
+
+
+class MyCrawler(RedisCrawlSpider):
+    """Spider that reads urls from redis queue (mycrawler:start_urls)."""
+
+    name = "mycrawler_redis"
+    redis_key = "mycrawler:start_urls"
+
+    rules = (
+        # follow all links
+        Rule(LinkExtractor(), callback="parse_page", follow=True),
+    )
+
+    def __init__(self, *args, **kwargs):
+        """Build the allowed-domains list from the optional ``domain`` argument.
+
+        ``domain`` is a comma-separated string (e.g. ``"a.com,b.com"``); it is
+        removed from *kwargs* before they are forwarded to the parent class.
+        Empty entries are dropped and surrounding whitespace is trimmed.
+        """
+        domain = kwargs.pop("domain", "")
+        # Store a real list: in Python 3 ``filter()`` returns a one-shot
+        # iterator that is always truthy, so it can be consumed only once and
+        # an empty value would not read as "no domains configured".
+        self.allowed_domains = [d.strip() for d in domain.split(",") if d.strip()]
+        super().__init__(*args, **kwargs)
+
+    def parse_page(self, response):
+        """Return the page title text and URL of *response* as a dict."""
+        return {
+            "name": response.css("title::text").extract_first(),
+            "url": response.url,
+        }
@@ -0,0 +1,20 @@
+from scrapy_redis.spiders import RedisSpider
+
+
+class MySpider(RedisSpider):
+    """Spider that reads urls from redis queue (myspider:start_urls)."""
+
+    name = "myspider_redis"
+    redis_key = "myspider:start_urls"
+
+    def __init__(self, *args, **kwargs):
+        """Build the allowed-domains list from the optional ``domain`` argument.
+
+        ``domain`` is a comma-separated string (e.g. ``"a.com,b.com"``); it is
+        removed from *kwargs* before they are forwarded to the parent class.
+        Empty entries are dropped and surrounding whitespace is trimmed.
+        """
+        domain = kwargs.pop("domain", "")
+        # Store a real list: in Python 3 ``filter()`` returns a one-shot
+        # iterator that is always truthy, so it can be consumed only once and
+        # an empty value would not read as "no domains configured".
+        self.allowed_domains = [d.strip() for d in domain.split(",") if d.strip()]
+        super().__init__(*args, **kwargs)
+
+    def parse(self, response):
+        """Return the page title text and URL of *response* as a dict."""
+        return {
+            "name": response.css("title::text").extract_first(),
+            "url": response.url,
+        }