Files
python/python爬虫/crawl4ai/03_生成Markdown结果.py
T
2025-08-05 09:19:34 +08:00

26 lines
1010 B
Python

import asyncio
from crawl4ai.content_filter_strategy import PruningContentFilter
from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode
# Build the content filter first, then hand it to the Markdown generator so the
# generator can produce a filtered ("fit") Markdown alongside the raw one.
_pruning_filter = PruningContentFilter(threshold=0.4, threshold_type="fixed")
md_generator = DefaultMarkdownGenerator(content_filter=_pruning_filter)
# Run-time settings for the crawl.
config = CrawlerRunConfig(
    # Bypass the cache so the page is fetched afresh on every run.
    cache_mode=CacheMode.BYPASS,
    # Use the custom Markdown generator defined above.
    markdown_generator=md_generator,
)
# The entry point must be declared with `async def`: it uses `async with` and
# awaits the crawl.
async def main(url: str = "https://news.ycombinator.com") -> None:
    """Crawl *url* and print the lengths of the raw and filtered Markdown.

    Args:
        url: Address of the page to crawl. Defaults to the Hacker News
            front page, which is what this script originally fetched.

    Raises:
        RuntimeError: If the crawler reports that the crawl did not succeed.
    """
    async with AsyncWebCrawler() as crawler:
        result = await crawler.arun(url, config=config)

        # Check the outcome before touching `result.markdown`; on a failed
        # crawl the Markdown may be missing, and the crawler's own error
        # message is far more useful than an attribute error.
        if not result.success:
            raise RuntimeError(f"Crawl failed for {url}: {result.error_message}")

        print("原始 Markdown 长度:", len(result.markdown.raw_markdown))
        print("过滤后 Markdown 长度:", len(result.markdown.fit_markdown))
# Script entry point: drive the async `main` coroutine to completion.
if __name__ == "__main__":
    asyncio.run(main())