变更
This commit is contained in:
@@ -0,0 +1,8 @@
|
||||
import asyncio
|
||||
from crawl4ai import AsyncWebCrawler
|
||||
async def main():
    """Crawl https://example.com and print the start of its Markdown output."""
    target_url = "https://example.com"
    async with AsyncWebCrawler() as crawler:
        page = await crawler.arun(target_url)
        # Only the first 300 characters of the Markdown are shown.
        preview = page.markdown[:300]
        print(preview)


if __name__ == "__main__":
    asyncio.run(main())
|
||||
@@ -0,0 +1,20 @@
|
||||
import asyncio
|
||||
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode
|
||||
|
||||
|
||||
async def main():
    """Crawl https://example.com with explicit browser and run settings.

    The full Markdown of the page is printed to stdout.
    """
    # Switch headless to False to watch the browser while it works.
    browser_settings = BrowserConfig(headless=True)

    # BYPASS is chosen here to fetch fresh content. The original note says the
    # default is CacheMode.ENABLED -- verify against the installed crawl4ai
    # version.
    run_settings = CrawlerRunConfig(cache_mode=CacheMode.BYPASS)

    async with AsyncWebCrawler(config=browser_settings) as crawler:
        page = await crawler.arun(url="https://example.com", config=run_settings)
        print(page.markdown)


if __name__ == "__main__":
    asyncio.run(main())
|
||||
@@ -0,0 +1,26 @@
|
||||
import asyncio
|
||||
from crawl4ai.content_filter_strategy import PruningContentFilter
|
||||
from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator
|
||||
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode
|
||||
|
||||
# Build the Markdown generator and attach a content filter to it.
md_generator = DefaultMarkdownGenerator(
    content_filter=PruningContentFilter(
        threshold_type="fixed",
        threshold=0.4,
    ),
)

# Run-level settings for the crawler.
config = CrawlerRunConfig(
    markdown_generator=md_generator,  # use the custom Markdown generator
    cache_mode=CacheMode.BYPASS,  # skip the cache and re-fetch the page each run
)
|
||||
|
||||
# The entry point has to be a coroutine (async def).
async def main():
    """Crawl Hacker News and print the raw and filtered Markdown lengths.

    Uses the module-level ``config`` for the run settings.
    """
    async with AsyncWebCrawler() as crawler:
        page = await crawler.arun("https://news.ycombinator.com", config=config)
        markdown = page.markdown
        raw_length = len(markdown.raw_markdown)
        print("原始 Markdown 长度:", raw_length)
        fit_length = len(markdown.fit_markdown)
        print("过滤后 Markdown 长度:", fit_length)


# Start the asynchronous task.
if __name__ == "__main__":
    asyncio.run(main())
|
||||
@@ -0,0 +1,69 @@
|
||||
import asyncio
|
||||
import json
|
||||
from crawl4ai import AsyncWebCrawler
|
||||
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
|
||||
from pprint import pprint as pp
|
||||
|
||||
async def extract_books():
    """Crawl the Douban Book Top 250 list and save the records to books.json.

    Ten list pages (25 entries each) are fetched one after another from
    ``https://book.douban.com/top250``. Every ``tr.item`` row is turned into
    a dict with the keys ``title``, ``url``, ``info``, ``rate`` and ``quote``
    by a CSS-based extraction schema. The combined list is written to
    ``books.json`` (UTF-8, pretty-printed) in the current directory.

    Returns:
        list: All extracted book records, in page order.

    Raises:
        AssertionError: If crawling any page does not succeed.
    """
    # Imported locally so the block stays self-contained; the package is
    # already a dependency of this script.
    from crawl4ai import CacheMode, CrawlerRunConfig

    page_count = 10  # number of list pages to walk
    page_size = 25  # step of the "start" query parameter between pages
    delay_seconds = 2  # pause between requests to avoid hammering the site

    schema = {
        "name": "Douban Book 250",
        "baseSelector": "tr.item",
        "type": "list",
        "fields": [
            {
                "name": "title",
                "type": "text",
                "selector": ".pl2 > a",
            },
            {
                "name": "url",
                "type": "attribute",
                "selector": ".pl2 > a",
                "attribute": "href",
            },
            {
                "name": "info",
                "type": "text",
                "selector": ".pl",
            },
            {
                "name": "rate",
                "type": "text",
                "selector": ".rating_nums",
            },
            {
                "name": "quote",
                "type": "text",
                "selector": "span.inq",
            },
        ],
    }

    extraction_strategy = JsonCssExtractionStrategy(schema, verbose=True)
    # Pass the strategy and cache policy through a run config rather than as
    # loose keyword arguments to arun(); NOTE(review): the loose form is the
    # legacy calling style and may be ignored by newer crawl4ai releases.
    run_config = CrawlerRunConfig(
        extraction_strategy=extraction_strategy,
        cache_mode=CacheMode.BYPASS,
    )
    all_books = []

    async with AsyncWebCrawler(verbose=True) as crawler:
        for i in range(page_count):
            result = await crawler.arun(
                url=f"https://book.douban.com/top250?start={i * page_size}",
                config=run_config,
            )
            # Raise explicitly instead of using an assert statement, which
            # would be stripped when Python runs with -O. The exception type
            # and message are unchanged.
            if not result.success:
                raise AssertionError("Failed to crawl the page")

            # Treat missing extracted content as an empty page rather than
            # letting json.loads fail on a non-string value.
            books = json.loads(result.extracted_content or "[]")
            all_books.extend(books)
            print(f"成功提取第 {i + 1} 页的 {len(books)} 本图书")

            # Throttle between requests; no need to wait after the last page.
            if i < page_count - 1:
                await asyncio.sleep(delay_seconds)

    # Persist every collected record to disk.
    with open("books.json", "w", encoding="utf-8") as f:
        json.dump(all_books, f, ensure_ascii=False, indent=2)

    print(f"\n总共提取了 {len(all_books)} 本图书")
    return all_books


if __name__ == "__main__":
    asyncio.run(extract_books())
|
||||
Reference in New Issue
Block a user