This commit is contained in:
2025-08-05 09:19:34 +08:00
commit 584548d006
1696 changed files with 53855 additions and 0 deletions
@@ -0,0 +1,8 @@
# This package will contain the spiders of your Scrapy project
#
# To create the first spider for your project use this command:
#
# scrapy genspider myspider myspider-domain.com
#
# For more info see:
# https://docs.scrapy.org/en/latest/topics/spiders.html
@@ -0,0 +1,26 @@
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
class DmozSpider(CrawlSpider):
    """Follow directory categories on dmoztools.net and extract site links.

    Links found inside the ``.top-cat``, ``.sub-cat`` and ``.cat-item``
    elements are followed, and the responses for those links are handled
    by :meth:`parse_directory`.
    """

    name = "dmoz"
    allowed_domains = ["dmoztools.net"]
    start_urls = ["http://www.dmoztools.net/"]

    rules = [
        Rule(
            LinkExtractor(restrict_css=(".top-cat", ".sub-cat", ".cat-item")),
            callback="parse_directory",
            follow=True,
        ),
    ]

    def parse_directory(self, response):
        """Yield one item per ``.title-and-desc`` entry in *response*.

        Each item is a dict with the keys ``name``, ``description`` and
        ``link``. ``name`` and ``link`` are ``None`` when the matching
        node is absent; ``description`` is the whitespace-stripped text,
        or an empty string when the entry has no description text.
        """
        for entry in response.css(".title-and-desc"):
            # default="" guards against entries without a description:
            # extract_first() would otherwise return None and .strip()
            # would raise AttributeError, aborting the whole page.
            description = entry.css(".site-descr::text").extract_first(default="")
            yield {
                "name": entry.css(".site-title::text").extract_first(),
                "description": description.strip(),
                "link": entry.css("a::attr(href)").extract_first(),
            }
@@ -0,0 +1,28 @@
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import Rule
from scrapy_redis.spiders import RedisCrawlSpider
class MyCrawler(RedisCrawlSpider):
    """Crawl spider that reads urls from redis queue (mycrawler:start_urls)."""

    name = "mycrawler_redis"
    redis_key = "mycrawler:start_urls"

    rules = (
        # follow all links
        Rule(LinkExtractor(), callback="parse_page", follow=True),
    )

    def __init__(self, *args, **kwargs):
        """Initialise the spider, deriving ``allowed_domains`` from ``domain``.

        The optional ``domain`` keyword argument is a comma-separated
        string of domain names. It is removed from *kwargs* before the
        remaining arguments are forwarded to the parent constructor.
        """
        # Dynamically define the allowed domains list.
        domain = kwargs.pop("domain", "")
        # Build a real list: on Python 3 ``filter()`` returns a one-shot
        # iterator that is truthy even when empty, so an "is it empty?"
        # check or a second iteration over it would give wrong results.
        # Surrounding whitespace is stripped so "a.com, b.com" works.
        self.allowed_domains = [
            part.strip() for part in domain.split(",") if part.strip()
        ]
        super().__init__(*args, **kwargs)

    def parse_page(self, response):
        """Return a dict with the page ``<title>`` text and the response URL."""
        return {
            "name": response.css("title::text").extract_first(),
            "url": response.url,
        }
@@ -0,0 +1,20 @@
from scrapy_redis.spiders import RedisSpider
class MySpider(RedisSpider):
    """Spider that reads urls from redis queue (myspider:start_urls)."""

    name = "myspider_redis"
    redis_key = "myspider:start_urls"

    def __init__(self, *args, **kwargs):
        """Initialise the spider, deriving ``allowed_domains`` from ``domain``.

        The optional ``domain`` keyword argument is a comma-separated
        string of domain names. It is removed from *kwargs* before the
        remaining arguments are forwarded to the parent constructor.
        """
        # Dynamically define the allowed domains list.
        domain = kwargs.pop("domain", "")
        # Build a real list: on Python 3 ``filter()`` returns a one-shot
        # iterator that is truthy even when empty, so an "is it empty?"
        # check or a second iteration over it would give wrong results.
        # Surrounding whitespace is stripped so "a.com, b.com" works.
        self.allowed_domains = [
            part.strip() for part in domain.split(",") if part.strip()
        ]
        super().__init__(*args, **kwargs)

    def parse(self, response):
        """Return a dict with the page ``<title>`` text and the response URL."""
        return {
            "name": response.css("title::text").extract_first(),
            "url": response.url,
        }