"""Crawl Hacker News with crawl4ai and compare raw vs. filtered Markdown size."""

import asyncio

from crawl4ai import AsyncWebCrawler, CacheMode, CrawlerRunConfig
from crawl4ai.content_filter_strategy import PruningContentFilter
from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator

# Create the Markdown generator and attach a pruning content filter to it.
md_generator = DefaultMarkdownGenerator(
    content_filter=PruningContentFilter(threshold=0.4, threshold_type="fixed")
)

# Configure the crawler run parameters.
config = CrawlerRunConfig(
    cache_mode=CacheMode.BYPASS,  # Skip the cache and re-fetch the page every run.
    markdown_generator=md_generator,  # Use the custom Markdown generator above.
)


async def main():
    """Fetch the page once and print the raw and filtered Markdown lengths.

    The crawler is used as an async context manager, so this must be a
    coroutine; it is driven by ``asyncio.run`` in the entry guard below.
    If the crawl reports failure, the error message is printed and the
    length report is skipped.
    """
    async with AsyncWebCrawler() as crawler:
        result = await crawler.arun("https://news.ycombinator.com", config=config)

    # A failed crawl may carry no Markdown; bail out before dereferencing it.
    if not result.success:
        print("抓取失败:", result.error_message)
        return

    print("原始 Markdown 长度:", len(result.markdown.raw_markdown))
    print("过滤后 Markdown 长度:", len(result.markdown.fit_markdown))


# Start the async task when run as a script.
if __name__ == "__main__":
    asyncio.run(main())