# python/python爬虫/crawl4ai/03_生成json结果.py

import asyncio
import json
from crawl4ai import AsyncWebCrawler
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
from pprint import pprint as pp

async def extract_books(pages: int = 10, output_path: str = "books.json", delay: float = 2) -> list:
    """Crawl the Douban Top 250 book list and save the extracted records as JSON.

    Each list page is fetched in turn with ``AsyncWebCrawler`` and parsed
    with a ``JsonCssExtractionStrategy`` that reads one record per
    ``tr.item`` row (fields: title, url, info, rate, quote). The combined
    records are written to ``output_path`` and also returned.

    Args:
        pages: Number of list pages to fetch. The list is paged through the
            ``start`` query parameter in steps of 25, so the default of 10
            covers offsets 0 through 225.
        output_path: File the combined records are written to as UTF-8 JSON.
        delay: Seconds to wait between two consecutive page requests.

    Returns:
        The list of extracted book records from all fetched pages.

    Raises:
        RuntimeError: If the crawler reports a page as unsuccessful. Nothing
            is written to ``output_path`` in that case.
    """
    page_size = 25  # offset step used by the ``start`` query parameter

    schema = {
        "name": "Douban Book 250",
        "baseSelector": "tr.item",
        "type": "list",
        "fields": [
            {
                "name": "title",
                "type": "text",
                "selector": ".pl2 > a",
            },
            {
                "name": "url",
                "type": "attribute",
                "selector": ".pl2 > a",
                "attribute": "href",
            },
            {
                "name": "info",
                "type": "text",
                "selector": ".pl",
            },
            {
                "name": "rate",
                "type": "text",
                "selector": ".rating_nums",
            },
            {
                "name": "quote",
                "type": "text",
                "selector": "span.inq",
            },
        ],
    }

    extraction_strategy = JsonCssExtractionStrategy(schema, verbose=True)
    all_books = []

    async with AsyncWebCrawler(verbose=True) as crawler:
        for i in range(pages):
            url = f"https://book.douban.com/top250?start={i * page_size}"
            result = await crawler.arun(
                url=url,
                extraction_strategy=extraction_strategy,
                bypass_cache=True,
            )
            # Raise explicitly instead of asserting: assert statements are
            # removed under ``python -O``, which would let a failed crawl
            # fall through to json.loads below.
            if not result.success:
                detail = getattr(result, "error_message", None)
                raise RuntimeError(
                    f"Failed to crawl the page: {url}"
                    + (f" ({detail})" if detail else "")
                )

            books = json.loads(result.extracted_content)
            all_books.extend(books)
            print(f"成功提取第 {i + 1} 页的 {len(books)} 本图书")

            # Throttle between requests; no need to wait after the last page.
            if i < pages - 1:
                await asyncio.sleep(delay)

        # Save the records from all pages to a single file.
        with open(output_path, "w", encoding="utf-8") as f:
            json.dump(all_books, f, ensure_ascii=False, indent=2)

        print(f"\n总共提取了 {len(all_books)} 本图书")
        return all_books

if __name__ == "__main__":
    # Script entry point: build the crawl coroutine, then drive it to
    # completion on a fresh event loop.
    crawl_job = extract_books()
    asyncio.run(crawl_job)