变更

2025-08-05 09:19:34 +08:00
commit 584548d006
1696 changed files with 53855 additions and 0 deletions
@@ -0,0 +1,14 @@
+import asyncio
+from crawl4ai import AsyncWebCrawler
+
+
+async def main():
+    """Crawl https://example.com and print the start of its Markdown."""
+    async with AsyncWebCrawler() as crawler:
+        page = await crawler.arun("https://example.com")
+        preview = page.markdown[:300]  # keep only the first 300 characters
+        print(preview)
+
+
+if __name__ == "__main__":
+    asyncio.run(main())
@@ -0,0 +1,19 @@
+import asyncio
+from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode
+
+
+async def main():
+    """Crawl https://example.com with explicit configs and print the Markdown."""
+    # Headless browser; pass headless=False to watch the browser window.
+    browser_settings = BrowserConfig(headless=True)
+    # BYPASS fetches fresh content. The original note says the default is
+    # CacheMode.ENABLED -- NOTE(review): confirm for the installed crawl4ai.
+    run_settings = CrawlerRunConfig(cache_mode=CacheMode.BYPASS)
+
+    async with AsyncWebCrawler(config=browser_settings) as crawler:
+        outcome = await crawler.arun(url="https://example.com", config=run_settings)
+        print(outcome.markdown)
+
+
+if __name__ == "__main__":
+    asyncio.run(main())
@@ -0,0 +1,25 @@
+import asyncio
+from crawl4ai.content_filter_strategy import PruningContentFilter
+from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator
+from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode
+
+# Markdown generator with a pruning content filter (fixed threshold of 0.4).
+md_generator = DefaultMarkdownGenerator(
+    content_filter=PruningContentFilter(threshold_type="fixed", threshold=0.4),
+)
+
+# Run settings: skip the cache (re-fetch every time) and use the generator above.
+config = CrawlerRunConfig(markdown_generator=md_generator, cache_mode=CacheMode.BYPASS)
+
+
+async def main():
+    """Crawl Hacker News and print the raw and filtered Markdown lengths."""
+    async with AsyncWebCrawler() as crawler:
+        result = await crawler.arun("https://news.ycombinator.com", config=config)
+        markdown = result.markdown
+        print("原始 Markdown 长度:", len(markdown.raw_markdown))
+        print("过滤后 Markdown 长度:", len(markdown.fit_markdown))
+
+
+if __name__ == "__main__":
+    asyncio.run(main())
@@ -0,0 +1,96 @@
+import asyncio
+import json
+from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode
+from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
+from pprint import pprint as pp
+
+
+async def extract_books():
+    """Scrape the Douban Top 250 book list and save it to ``books.json``.
+
+    Crawls 10 listing pages (offsets advance in steps of 25), extracts the
+    title, URL, info, rating and quote of every ``tr.item`` row with CSS
+    selectors, and writes the combined list to ``books.json`` as UTF-8 JSON.
+
+    Returns:
+        list: The extracted book records from all pages.
+
+    Raises:
+        RuntimeError: If a page fails to crawl.
+    """
+    schema = {
+        "name": "Douban Book 250",
+        "baseSelector": "tr.item",
+        "type": "list",
+        "fields": [
+            {
+                "name": "title",
+                "type": "text",
+                "selector": ".pl2 > a",
+            },
+            {
+                "name": "url",
+                "type": "attribute",
+                "selector": ".pl2 > a",
+                "attribute": "href",
+            },
+            {
+                "name": "info",
+                "type": "text",
+                "selector": ".pl",
+            },
+            {
+                "name": "rate",
+                "type": "text",
+                "selector": ".rating_nums",
+            },
+            {
+                "name": "quote",
+                "type": "text",
+                "selector": "span.inq",
+            },
+        ],
+    }
+
+    # Supply the strategy and cache mode via CrawlerRunConfig, matching current
+    # crawl4ai usage; passing extraction_strategy=/bypass_cache= directly to
+    # arun() is the legacy calling style.
+    run_config = CrawlerRunConfig(
+        extraction_strategy=JsonCssExtractionStrategy(schema, verbose=True),
+        cache_mode=CacheMode.BYPASS,
+    )
+    page_count = 10
+    all_books = []
+
+    async with AsyncWebCrawler(verbose=True) as crawler:
+        for i in range(page_count):
+            result = await crawler.arun(
+                url=f"https://book.douban.com/top250?start={i * 25}",
+                config=run_config,
+            )
+            # Raise explicitly: an assert would be stripped under `python -O`.
+            if not result.success:
+                raise RuntimeError(
+                    f"Failed to crawl the page: {result.error_message}"
+                )
+
+            # Treat a missing extraction payload as an empty page instead of
+            # letting json.loads(None) raise TypeError.
+            books = json.loads(result.extracted_content or "[]")
+            all_books.extend(books)
+            print(f"成功提取第 {i + 1} 页的 {len(books)} 本图书")
+
+            # Throttle between requests; skip the wait after the final page.
+            if i < page_count - 1:
+                await asyncio.sleep(2)
+
+    # Save every extracted book to disk once the crawler has been released.
+    with open("books.json", "w", encoding="utf-8") as f:
+        json.dump(all_books, f, ensure_ascii=False, indent=2)
+
+    print(f"\n总共提取了 {len(all_books)} 本图书")
+    return all_books
+
+
+if __name__ == "__main__":
+    asyncio.run(extract_books())