import asyncio
import json
from pprint import pprint as pp

from crawl4ai import AsyncWebCrawler
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy


def _douban_book_schema():
    """Return the CSS extraction schema applied to every ``tr.item`` row."""
    return {
        "name": "Douban Book 250",
        "baseSelector": "tr.item",
        "type": "list",
        "fields": [
            {
                "name": "title",
                "type": "text",
                "selector": ".pl2 > a",
            },
            {
                "name": "url",
                "type": "attribute",
                "selector": ".pl2 > a",
                "attribute": "href",
            },
            {
                "name": "info",
                "type": "text",
                "selector": ".pl",
            },
            {
                "name": "rate",
                "type": "text",
                "selector": ".rating_nums",
            },
            {
                "name": "quote",
                "type": "text",
                "selector": "span.inq",
            },
        ],
    }


async def extract_books(*, pages=10, page_size=25, delay=2, output_path="books.json"):
    """Crawl the Douban book top-250 listing and dump the result as JSON.

    Each listing page is fetched from
    ``https://book.douban.com/top250?start=<offset>`` and parsed with a
    ``JsonCssExtractionStrategy``; every extracted record carries the
    ``title``, ``url``, ``info``, ``rate`` and ``quote`` fields declared in
    the schema.

    Args:
        pages: Number of listing pages to fetch (default 10).
        page_size: Offset step between consecutive pages (default 25).
        delay: Seconds to wait between two page requests (default 2).
        output_path: File the collected records are written to, UTF-8
            encoded (default ``"books.json"``).

    Returns:
        The list of all extracted book records, in page order.

    Raises:
        RuntimeError: If the crawler reports a page as not successful.
    """
    extraction_strategy = JsonCssExtractionStrategy(_douban_book_schema(), verbose=True)
    all_books = []

    async with AsyncWebCrawler(verbose=True) as crawler:
        for page in range(pages):
            url = f"https://book.douban.com/top250?start={page * page_size}"
            result = await crawler.arun(
                url=url,
                extraction_strategy=extraction_strategy,
                bypass_cache=True,
            )

            # Explicit check instead of `assert`: assertions are stripped
            # under `python -O`, which would let a failed crawl fall through
            # to json.loads().
            if not result.success:
                detail = getattr(result, "error_message", None)
                suffix = f" ({detail})" if detail else ""
                raise RuntimeError(f"Failed to crawl the page: {url}{suffix}")

            # Treat missing/empty extracted content as "no books on this page"
            # rather than crashing inside json.loads(None).
            books = json.loads(result.extracted_content or "[]")
            all_books.extend(books)
            print(f"成功提取第 {page + 1} 页的 {len(books)} 本图书")

            # Throttle requests; there is nothing to wait for after the
            # final page.
            if page < pages - 1:
                await asyncio.sleep(delay)

    # Persist every collected record to disk.
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(all_books, f, ensure_ascii=False, indent=2)

    print(f"\n总共提取了 {len(all_books)} 本图书")
    return all_books


if __name__ == "__main__":
    asyncio.run(extract_books())