更新部分爬虫以兼容本地运行及数据库存储

This commit is contained in:
z66
2025-12-16 10:56:56 +08:00
parent a9eda60493
commit ff1ce2a3ba
28 changed files with 1394 additions and 126 deletions
@@ -95,15 +95,25 @@ class DouYinClient(AbstractApiClient):
params["a_bogus"] = a_bogus
async def request(self, method, url, **kwargs):
    """Send one HTTP request and return the decoded JSON body.

    A fresh ``httpx.AsyncClient`` is created with ``self.proxy`` for the
    call, and ``self.timeout`` is applied to the request.

    Args:
        method: HTTP method name, passed through to httpx.
        url: Target URL.
        **kwargs: Extra keyword arguments forwarded to
            ``httpx.AsyncClient.request``.

    Returns:
        The value produced by ``response.json()``.

    Raises:
        DataFetchError: On connection failures, timeouts, an empty or
            ``"blocked"`` response body, a body that cannot be decoded as
            JSON, or any other unexpected error during the request.
    """
    try:
        async with httpx.AsyncClient(proxy=self.proxy) as client:
            response = await client.request(method, url, timeout=self.timeout, **kwargs)
            try:
                if response.text == "" or response.text == "blocked":
                    utils.logger.error(f"request params incrr, response.text: {response.text}")
                    raise Exception("account blocked")
                return response.json()
            except Exception as e:
                # Keep the raw body in the message so a blocked or malformed
                # response can be diagnosed from the log alone.
                raise DataFetchError(f"{e}, {response.text}") from e
    except DataFetchError:
        # Already the error type callers expect; let it through unchanged
        # rather than letting the generic handler below wrap it a second time.
        raise
    except (httpx.ConnectError, httpx.ConnectTimeout, httpx.ReadTimeout, httpx.WriteTimeout) as e:
        # The message wording is preserved exactly: code elsewhere in this
        # change inspects the text to decide whether to retry.
        utils.logger.error(f"网络连接错误: {type(e).__name__}: {e}")
        raise DataFetchError(f"网络连接失败: {type(e).__name__}: {e}") from e
    except httpx.TimeoutException as e:
        # Remaining timeout kinds not listed in the handler above.
        utils.logger.error(f"请求超时: {e}")
        raise DataFetchError(f"请求超时: {e}") from e
    except Exception as e:
        utils.logger.error(f"请求异常: {type(e).__name__}: {e}")
        raise DataFetchError(f"请求失败: {type(e).__name__}: {e}") from e
async def get(self, uri: str, params: Optional[Dict] = None, headers: Optional[Dict] = None):
"""
@@ -121,6 +121,8 @@ class DouYinCrawler(AbstractCrawler):
utils.logger.info(f"[DouYinCrawler.search] Skip {page}")
page += 1
continue
posts_res = None
retry_success = False
try:
utils.logger.info(f"[DouYinCrawler.search] search douyin keyword: {keyword}, page: {page}")
posts_res = await self.dy_client.search_info_by_keyword(
@@ -129,11 +131,36 @@ class DouYinCrawler(AbstractCrawler):
publish_time=PublishTimeType(config.PUBLISH_TIME_TYPE),
search_id=dy_search_id,
)
if posts_res.get("data") is None or posts_res.get("data") == []:
utils.logger.info(f"[DouYinCrawler.search] search douyin keyword: {keyword}, page: {page} is empty,{posts_res.get('data')}`")
retry_success = True
except DataFetchError as e:
utils.logger.error(f"[DouYinCrawler.search] search douyin keyword: {keyword} failed: {e}")
# 如果是网络连接错误,等待后重试一次
if "网络连接" in str(e) or "ConnectError" in str(e) or "超时" in str(e):
utils.logger.warning(f"[DouYinCrawler.search] 网络错误,等待3秒后重试...")
await asyncio.sleep(3)
try:
posts_res = await self.dy_client.search_info_by_keyword(
keyword=keyword,
offset=page * dy_limit_count - dy_limit_count,
publish_time=PublishTimeType(config.PUBLISH_TIME_TYPE),
search_id=dy_search_id,
)
retry_success = True
except Exception as retry_e:
utils.logger.error(f"[DouYinCrawler.search] 重试失败: {retry_e}")
break
else:
break
except DataFetchError:
utils.logger.error(f"[DouYinCrawler.search] search douyin keyword: {keyword} failed")
except Exception as e:
utils.logger.error(f"[DouYinCrawler.search] search douyin keyword: {keyword} unexpected error: {type(e).__name__}: {e}")
break
# 如果请求失败(包括重试失败),跳过后续处理
if not retry_success or posts_res is None:
break
if posts_res.get("data") is None or posts_res.get("data") == []:
utils.logger.info(f"[DouYinCrawler.search] search douyin keyword: {keyword}, page: {page} is empty,{posts_res.get('data')}`")
break
page += 1