27 lines
818 B
Python
27 lines
818 B
Python
from scrapy.linkextractors import LinkExtractor
|
|
from scrapy.spiders import CrawlSpider, Rule
|
|
|
|
|
|
class DmozSpider(CrawlSpider):
    """Follow categories and extract links.

    Starts from ``start_urls``, follows links found inside elements matching
    the ``.top-cat``, ``.sub-cat`` and ``.cat-item`` CSS selectors, and hands
    every followed page to :meth:`parse_directory`.
    """

    name = "dmoz"
    allowed_domains = ["dmoztools.net"]
    start_urls = ["http://www.dmoztools.net/"]

    rules = [
        Rule(
            # Only links located inside these CSS-selected regions are extracted.
            LinkExtractor(restrict_css=(".top-cat", ".sub-cat", ".cat-item")),
            callback="parse_directory",
            # Keep applying the rules to the pages reached through these links.
            follow=True,
        ),
    ]

    def parse_directory(self, response):
        """Yield one dict per ``.title-and-desc`` element found in *response*.

        Each dict has the keys ``"name"``, ``"description"`` and ``"link"``.
        A value is ``None`` when the matching selector finds nothing; the
        description is whitespace-stripped when present.
        """
        for div in response.css(".title-and-desc"):
            description = div.css(".site-descr::text").extract_first()
            # extract_first() returns None when nothing matches, so calling
            # .strip() unconditionally would raise AttributeError and abort
            # the remaining entries on the page.
            if description is not None:
                description = description.strip()
            yield {
                "name": div.css(".site-title::text").extract_first(),
                "description": description,
                "link": div.css("a::attr(href)").extract_first(),
            }
|