Files
python/python爬虫/scrapy-redis-0.9.1/example-project/example/spiders/dmoz.py
T
2025-08-05 09:19:34 +08:00

27 lines
818 B
Python

from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
class DmozSpider(CrawlSpider):
    """Follow categories and extract links.

    Crawling starts at ``start_urls`` and is restricted to
    ``allowed_domains``.  The single rule extracts links only from the
    ``.top-cat``, ``.sub-cat`` and ``.cat-item`` page regions, keeps following
    links from the pages it reaches (``follow=True``), and hands each of
    those pages to :meth:`parse_directory`.
    """

    name = "dmoz"
    allowed_domains = ["dmoztools.net"]
    start_urls = ["http://www.dmoztools.net/"]

    rules = [
        Rule(
            LinkExtractor(restrict_css=(".top-cat", ".sub-cat", ".cat-item")),
            callback="parse_directory",
            follow=True,
        ),
    ]

    def parse_directory(self, response):
        """Yield one item per ``.title-and-desc`` element in *response*.

        Each item is a dict with the keys ``"name"``, ``"description"`` and
        ``"link"``.  A value is ``None`` when the corresponding selector
        matches nothing inside the element; a description that is present is
        returned with surrounding whitespace stripped.
        """
        for div in response.css(".title-and-desc"):
            # extract_first() returns None when nothing matches, so the
            # description must be checked before calling .strip(); otherwise
            # one entry without a description would raise AttributeError and
            # abort the remaining entries of the page.
            description = div.css(".site-descr::text").extract_first()
            if description is not None:
                description = description.strip()

            yield {
                "name": div.css(".site-title::text").extract_first(),
                "description": description,
                "link": div.css("a::attr(href)").extract_first(),
            }