# Crawl4AI example: compare the raw and pruning-filtered Markdown length of a page.
import asyncio
|
|
from crawl4ai.content_filter_strategy import PruningContentFilter
|
|
from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator
|
|
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode
|
|
|
|
# Build the content filter first, then hand it to the Markdown generator.
# The filter uses a fixed threshold of 0.4.
_pruning_filter = PruningContentFilter(threshold=0.4, threshold_type="fixed")
md_generator = DefaultMarkdownGenerator(content_filter=_pruning_filter)
|
|
|
|
# Run-time settings for the crawler.
config = CrawlerRunConfig(
    # Bypass the cache so every run fetches the page again.
    cache_mode=CacheMode.BYPASS,
    # Use the custom Markdown generator defined above in this module.
    markdown_generator=md_generator,
)
|
|
|
|
# The entry point must be a coroutine (async def) because the crawler is awaited.
async def main():
    """Crawl Hacker News once and print raw vs. filtered Markdown lengths.

    Opens an ``AsyncWebCrawler``, fetches https://news.ycombinator.com using
    the module-level ``config``, and prints the length of the raw Markdown
    and of the filtered ("fit") Markdown.

    If the crawl reports failure, the crawler's error message is printed and
    the function returns without reading the Markdown fields.
    """
    async with AsyncWebCrawler() as crawler:
        result = await crawler.arun("https://news.ycombinator.com", config=config)

        # A failed crawl may carry no Markdown payload; report the error
        # instead of crashing on a missing attribute below.
        if not result.success:
            print("抓取失败:", result.error_message)
            return

        markdown = result.markdown
        print("原始 Markdown 长度:", len(markdown.raw_markdown))
        # NOTE(review): fit_markdown appears to be optional in the result
        # model; treat an unset value as empty so len() cannot fail.
        print("过滤后 Markdown 长度:", len(markdown.fit_markdown or ""))
|
|
|
|
# Script entry point: drive the async main() to completion on a new event loop.
if __name__ == "__main__":
    asyncio.run(main())