# python/python爬虫/crawl4ai/03_生成Markdown结果.py

import asyncio
from crawl4ai.content_filter_strategy import PruningContentFilter
from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode

# Build the Markdown generator. Its content filter is a pruning filter
# configured with a fixed threshold of 0.4.
_pruning_filter = PruningContentFilter(threshold_type="fixed", threshold=0.4)
md_generator = DefaultMarkdownGenerator(content_filter=_pruning_filter)

# Crawler run settings:
#   - markdown_generator: use the custom Markdown generator defined above
#   - cache_mode: bypass the cache so the page is re-fetched on every run
config = CrawlerRunConfig(markdown_generator=md_generator, cache_mode=CacheMode.BYPASS)

# The entry point must be a coroutine (async def) because the crawl is awaited.
async def main():
    """Crawl Hacker News once and print the raw and filtered Markdown lengths.

    The crawl uses the module-level ``config``. When the crawl reports
    failure, the error message is printed and the function returns early
    instead of dereferencing a Markdown result that may not exist.
    """
    async with AsyncWebCrawler() as crawler:
        result = await crawler.arun("https://news.ycombinator.com", config=config)

        # A failed crawl is signalled through result.success; in that case
        # result.markdown cannot be relied on, so report and stop.
        if not result.success:
            print("抓取失败:", result.error_message)
            return

        markdown = result.markdown
        print("原始 Markdown 长度:", len(markdown.raw_markdown))
        # fit_markdown is optional in the result model; treat a missing
        # value as empty so len() cannot raise TypeError.
        print("过滤后 Markdown 长度:", len(markdown.fit_markdown or ""))

# Start the async task: when executed as a script, run main() via asyncio.run().
if __name__ == "__main__":
    asyncio.run(main())