更新部分爬虫以兼容本地运行及数据库存储
This commit is contained in:
@@ -95,15 +95,25 @@ class DouYinClient(AbstractApiClient):
|
||||
params["a_bogus"] = a_bogus
|
||||
|
||||
async def request(self, method, url, **kwargs):
    """Send one HTTP request and return the decoded JSON body.

    A new ``httpx.AsyncClient`` (configured with ``self.proxy``) is opened
    for each call, and the request is sent with ``self.timeout``.

    Args:
        method: HTTP method name, passed through to ``client.request``.
        url: Target URL, passed through to ``client.request``.
        **kwargs: Extra keyword arguments forwarded to ``client.request``.

    Returns:
        The result of ``response.json()``.

    Raises:
        DataFetchError: If the request fails at the transport level
            (connection error, timeout, or any other exception raised while
            sending), if the body is empty or equals ``"blocked"``, or if
            the body cannot be decoded as JSON.
    """
    # Phase 1: transport. Only the network call lives in this try block, so
    # the handlers below never see errors raised while validating the body.
    try:
        async with httpx.AsyncClient(proxy=self.proxy) as client:
            response = await client.request(method, url, timeout=self.timeout, **kwargs)
    except (
        httpx.ConnectError,
        httpx.ConnectTimeout,
        httpx.ReadTimeout,
        httpx.WriteTimeout,
    ) as e:
        utils.logger.error(f"网络连接错误: {type(e).__name__}: {e}")
        raise DataFetchError(f"网络连接失败: {type(e).__name__}: {e}") from e
    except httpx.TimeoutException as e:
        # Remaining timeout kinds not listed in the tuple above.
        utils.logger.error(f"请求超时: {e}")
        raise DataFetchError(f"请求超时: {e}") from e
    except Exception as e:
        utils.logger.error(f"请求异常: {type(e).__name__}: {e}")
        raise DataFetchError(f"请求失败: {type(e).__name__}: {e}") from e

    # Phase 2: body validation and JSON decoding. Kept outside the transport
    # try block so this DataFetchError is not caught and re-wrapped as a
    # generic request failure.
    try:
        if response.text == "" or response.text == "blocked":
            utils.logger.error(f"request params incrr, response.text: {response.text}")
            # Raised and caught just below so the final message keeps the
            # "<reason>, <response text>" shape.
            raise Exception("account blocked")
        return response.json()
    except Exception as e:
        raise DataFetchError(f"{e}, {response.text}") from e
|
||||
|
||||
async def get(self, uri: str, params: Optional[Dict] = None, headers: Optional[Dict] = None):
|
||||
"""
|
||||
|
||||
@@ -121,6 +121,8 @@ class DouYinCrawler(AbstractCrawler):
|
||||
utils.logger.info(f"[DouYinCrawler.search] Skip {page}")
|
||||
page += 1
|
||||
continue
|
||||
posts_res = None
|
||||
retry_success = False
|
||||
try:
|
||||
utils.logger.info(f"[DouYinCrawler.search] search douyin keyword: {keyword}, page: {page}")
|
||||
posts_res = await self.dy_client.search_info_by_keyword(
|
||||
@@ -129,11 +131,36 @@ class DouYinCrawler(AbstractCrawler):
|
||||
publish_time=PublishTimeType(config.PUBLISH_TIME_TYPE),
|
||||
search_id=dy_search_id,
|
||||
)
|
||||
if posts_res.get("data") is None or posts_res.get("data") == []:
|
||||
utils.logger.info(f"[DouYinCrawler.search] search douyin keyword: {keyword}, page: {page} is empty,{posts_res.get('data')}`")
|
||||
retry_success = True
|
||||
except DataFetchError as e:
|
||||
utils.logger.error(f"[DouYinCrawler.search] search douyin keyword: {keyword} failed: {e}")
|
||||
# 如果是网络连接错误,等待后重试一次
|
||||
if "网络连接" in str(e) or "ConnectError" in str(e) or "超时" in str(e):
|
||||
utils.logger.warning(f"[DouYinCrawler.search] 网络错误,等待3秒后重试...")
|
||||
await asyncio.sleep(3)
|
||||
try:
|
||||
posts_res = await self.dy_client.search_info_by_keyword(
|
||||
keyword=keyword,
|
||||
offset=page * dy_limit_count - dy_limit_count,
|
||||
publish_time=PublishTimeType(config.PUBLISH_TIME_TYPE),
|
||||
search_id=dy_search_id,
|
||||
)
|
||||
retry_success = True
|
||||
except Exception as retry_e:
|
||||
utils.logger.error(f"[DouYinCrawler.search] 重试失败: {retry_e}")
|
||||
break
|
||||
else:
|
||||
break
|
||||
except DataFetchError:
|
||||
utils.logger.error(f"[DouYinCrawler.search] search douyin keyword: {keyword} failed")
|
||||
except Exception as e:
|
||||
utils.logger.error(f"[DouYinCrawler.search] search douyin keyword: {keyword} unexpected error: {type(e).__name__}: {e}")
|
||||
break
|
||||
|
||||
# 如果请求失败(包括重试失败),跳过后续处理
|
||||
if not retry_success or posts_res is None:
|
||||
break
|
||||
|
||||
if posts_res.get("data") is None or posts_res.get("data") == []:
|
||||
utils.logger.info(f"[DouYinCrawler.search] search douyin keyword: {keyword}, page: {page} is empty,{posts_res.get('data')}`")
|
||||
break
|
||||
|
||||
page += 1
|
||||
|
||||
Reference in New Issue
Block a user