69 lines
2.0 KiB
Python
69 lines
2.0 KiB
Python
import asyncio
|
|
import json
|
|
from crawl4ai import AsyncWebCrawler
|
|
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
|
|
from pprint import pprint as pp
|
|
|
|
async def extract_books(*, pages=10, page_size=25, delay=2, output_path="books.json"):
    """Crawl the Douban book Top 250 listing and save the result as JSON.

    Each listing page is fetched with ``AsyncWebCrawler`` and parsed with a
    ``JsonCssExtractionStrategy`` that reads one record per ``tr.item`` row
    (title, url, info, rate, quote). The records from all pages are
    concatenated, written to ``output_path`` and returned.

    Args:
        pages: Number of listing pages to request.
        page_size: Step by which the ``start`` query parameter advances
            from one page to the next.
        delay: Seconds to sleep between two consecutive page requests.
        output_path: File that receives the combined records as UTF-8 JSON.

    Returns:
        The list of extracted book records from all pages.

    Raises:
        RuntimeError: If the crawler reports a failure for any page.
    """
    schema = {
        "name": "Douban Book 250",
        "baseSelector": "tr.item",
        "type": "list",
        "fields": [
            {
                "name": "title",
                "type": "text",
                "selector": ".pl2 > a",
            },
            {
                "name": "url",
                "type": "attribute",
                "selector": ".pl2 > a",
                "attribute": "href",
            },
            {
                "name": "info",
                "type": "text",
                "selector": ".pl",
            },
            {
                "name": "rate",
                "type": "text",
                "selector": ".rating_nums",
            },
            {
                "name": "quote",
                "type": "text",
                "selector": "span.inq",
            },
        ],
    }

    extraction_strategy = JsonCssExtractionStrategy(schema, verbose=True)
    all_books = []

    async with AsyncWebCrawler(verbose=True) as crawler:
        for i in range(pages):
            url = f"https://book.douban.com/top250?start={i * page_size}"
            result = await crawler.arun(
                url=url,
                extraction_strategy=extraction_strategy,
                bypass_cache=True,
            )
            # Raise explicitly instead of using `assert`: assertions are
            # stripped under `python -O`, which would let a failed page
            # slip through to the JSON parsing below.
            if not result.success:
                detail = getattr(result, "error_message", None)
                raise RuntimeError(
                    f"Failed to crawl the page: {url}"
                    + (f" ({detail})" if detail else "")
                )

            # A page without any extracted content counts as zero books
            # rather than crashing json.loads on None / "".
            books = json.loads(result.extracted_content or "[]")
            all_books.extend(books)
            print(f"成功提取第 {i + 1} 页的 {len(books)} 本图书")

            # Throttle requests; no need to wait after the final page.
            if i < pages - 1:
                await asyncio.sleep(delay)

    # Persist every collected record to a single file.
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(all_books, f, ensure_ascii=False, indent=2)

    print(f"\n总共提取了 {len(all_books)} 本图书")
    return all_books
|
|
|
|
if __name__ == "__main__":
    # Script entry point: create the crawl coroutine and run it to
    # completion on a fresh event loop.
    crawl_job = extract_books()
    asyncio.run(crawl_job)