变更
This commit is contained in:
@@ -0,0 +1,26 @@
|
||||
from scrapy.linkextractors import LinkExtractor
|
||||
from scrapy.spiders import CrawlSpider, Rule
|
||||
|
||||
|
||||
class DmozSpider(CrawlSpider):
    """Follow categories and extract links.

    Starts at the dmoztools.net front page, follows links found inside the
    category containers selected by the rule below, and emits one item per
    ``.title-and-desc`` element on every page reached that way.
    """

    name = "dmoz"
    allowed_domains = ["dmoztools.net"]
    start_urls = ["http://www.dmoztools.net/"]

    rules = [
        Rule(
            # Only links located inside these CSS-selected regions are extracted.
            LinkExtractor(restrict_css=(".top-cat", ".sub-cat", ".cat-item")),
            callback="parse_directory",
            # Keep applying the rules to the pages reached through these links.
            follow=True,
        ),
    ]

    def parse_directory(self, response):
        """Yield one dict per ``.title-and-desc`` element in *response*.

        Each dict has the keys ``name``, ``description`` and ``link``. A field
        is ``None`` when its selector matches nothing; the description is
        stripped of surrounding whitespace when present.
        """
        for entry in response.css(".title-and-desc"):
            # extract_first() returns None when nothing matches, so guard the
            # strip() call: an unguarded call would raise AttributeError and
            # abort the generator, losing the rest of the page's items.
            description = entry.css(".site-descr::text").extract_first()
            if description is not None:
                description = description.strip()

            yield {
                "name": entry.css(".site-title::text").extract_first(),
                "description": description,
                "link": entry.css("a::attr(href)").extract_first(),
            }
|
||||
Reference in New Issue
Block a user