更新部分爬虫以兼容本地运行及数据库存储
This commit is contained in:
@@ -45,13 +45,51 @@ class KuaiShouClient(AbstractApiClient):
|
||||
self.graphql = KuaiShouGraphQL()
|
||||
|
||||
async def request(self, method, url, **kwargs) -> Any:
    """Send an HTTP request with retry and proxy fallback; return the JSON ``data`` member.

    Up to three attempts are made. When ``self.proxy`` is set, the first
    attempt goes through it and every later attempt connects directly;
    otherwise all attempts connect directly. Only connection-level failures
    are retried, with a one-second pause between attempts.

    Args:
        method: HTTP method name, forwarded to ``httpx.AsyncClient.request``.
        url: Target URL.
        **kwargs: Extra keyword arguments forwarded to ``httpx.AsyncClient.request``.

    Returns:
        The value stored under ``"data"`` in the decoded JSON body, or ``{}``
        when that key is absent.

    Raises:
        DataFetchError: The decoded body carries a non-empty ``"errors"`` member.
        httpx.NetworkError, httpx.ConnectTimeout: Every attempt failed at the
            network level; the error of the final attempt is re-raised.
        Exception: Anything else (for example a body that is not valid JSON)
            is logged and re-raised immediately, without retrying.
    """
    max_retries = 3

    # Attempt order: the configured proxy first (if any), then a direct connection.
    # NOTE(review): the direct fallback bypasses the configured proxy - confirm
    # that exposing the local address is acceptable for this crawler.
    proxy_attempts: List[Optional[str]] = []
    if self.proxy:
        proxy_attempts.append(self.proxy)
    proxy_attempts.append(None)

    for attempt in range(max_retries):
        # Once the list is exhausted, keep reusing its last entry (direct connection).
        proxy_to_use = proxy_attempts[min(attempt, len(proxy_attempts) - 1)]
        try:
            async with httpx.AsyncClient(proxy=proxy_to_use) as client:
                response = await client.request(method, url, timeout=self.timeout, **kwargs)
            data: Dict = response.json()
            errors = data.get("errors")
            if errors:
                raise DataFetchError(errors)
            return data.get("data", {})
        except (httpx.ConnectError, httpx.ConnectTimeout, httpx.NetworkError) as e:
            utils.logger.warning(
                f"[KuaiShouClient.request] Network error (attempt {attempt+1}/{max_retries}) "
                f"proxy={proxy_to_use} url={url} err={e!r}"
            )
            if attempt < max_retries - 1:
                await asyncio.sleep(1)
                continue
            utils.logger.error(
                f"[KuaiShouClient.request] Network failed after {max_retries} attempts "
                f"proxy={proxy_to_use} url={url} err={e!r}"
            )
            raise
        except Exception as e:
            # Non-network failures (such as DataFetchError) are not transient: do not retry.
            utils.logger.error(
                f"[KuaiShouClient.request] Request failed proxy={proxy_to_use} url={url} err={e!r}"
            )
            raise
|
||||
|
||||
async def get(self, uri: str, params=None) -> Dict:
|
||||
final_uri = uri
|
||||
|
||||
@@ -83,7 +83,26 @@ class KuaishouCrawler(AbstractCrawler):
|
||||
|
||||
|
||||
self.context_page = await self.browser_context.new_page()
|
||||
await self.context_page.goto(f"{self.index_url}?isHome=1")
|
||||
# 添加重试机制处理网络连接错误
|
||||
max_retries = 3
|
||||
retry_count = 0
|
||||
while retry_count < max_retries:
|
||||
try:
|
||||
await self.context_page.goto(f"{self.index_url}?isHome=1", timeout=30000)
|
||||
break
|
||||
except Exception as e:
|
||||
retry_count += 1
|
||||
error_msg = str(e)
|
||||
if "ERR_CONNECTION_RESET" in error_msg or "net::" in error_msg or "Connection" in error_msg:
|
||||
if retry_count < max_retries:
|
||||
utils.logger.warning(f"[KuaishouCrawler] 网络连接错误,第 {retry_count} 次重试: {e}")
|
||||
await asyncio.sleep(2 * retry_count) # 递增等待时间
|
||||
else:
|
||||
utils.logger.error(f"[KuaishouCrawler] 网络连接失败,已重试 {max_retries} 次: {e}")
|
||||
raise
|
||||
else:
|
||||
# 非网络错误直接抛出
|
||||
raise
|
||||
|
||||
# Create a client to interact with the kuaishou website.
|
||||
self.ks_client = await self.create_ks_client(httpx_proxy_format)
|
||||
|
||||
@@ -49,6 +49,21 @@ class KuaishouLogin(AbstractLogin):
|
||||
else:
|
||||
raise ValueError("[KuaishouLogin.begin] Invalid Login Type Currently only supported qrcode or phone or cookie ...")
|
||||
|
||||
async def _quick_check_login_state(self) -> bool:
    """Check once, without retrying, whether the session looks logged in.

    Reads the cookies of ``self.browser_context`` and looks for a non-empty
    ``passToken`` entry.

    Returns:
        True when a non-empty ``passToken`` cookie is present. False when it
        is missing or when the cookies could not be read.
    """
    try:
        current_cookie = await self.browser_context.cookies()
        _, cookie_dict = utils.convert_cookies(current_cookie)
        return bool(cookie_dict.get("passToken"))
    except Exception as e:
        # Best-effort probe: a failure is reported as "not logged in", but it is
        # logged so a broken browser context is not mistaken for a logged-out one.
        utils.logger.warning(
            f"[KuaishouLogin._quick_check_login_state] Failed to read login cookies: {e!r}"
        )
        return False
|
||||
|
||||
@retry(stop=stop_after_attempt(600), wait=wait_fixed(1), retry=retry_if_result(lambda value: value is False))
|
||||
async def check_login_state(self) -> bool:
|
||||
"""
|
||||
@@ -67,11 +82,47 @@ class KuaishouLogin(AbstractLogin):
|
||||
"""login kuaishou website and keep webdriver login state"""
|
||||
utils.logger.info("[KuaishouLogin.login_by_qrcode] Begin login kuaishou by qrcode ...")
|
||||
|
||||
# click login button
|
||||
# Check if already logged in (quick check without retry)
|
||||
is_logged_in = await self._quick_check_login_state()
|
||||
if is_logged_in:
|
||||
utils.logger.info("[KuaishouLogin.login_by_qrcode] Already logged in, skipping login button click ...")
|
||||
return
|
||||
|
||||
# Check if login button exists (if not, might already be logged in)
|
||||
login_button_ele = self.context_page.locator(
|
||||
"xpath=//p[text()='登录']"
|
||||
)
|
||||
await login_button_ele.click()
|
||||
|
||||
try:
|
||||
# Wait for the element to be visible with a shorter timeout
|
||||
await login_button_ele.wait_for(state="visible", timeout=3000)
|
||||
utils.logger.info("[KuaishouLogin.login_by_qrcode] Login button found, attempting to click ...")
|
||||
|
||||
# Try normal click first
|
||||
await login_button_ele.click(timeout=5000)
|
||||
except Exception as e:
|
||||
# If login button is not found, might already be logged in
|
||||
if "timeout" in str(e).lower() or "waiting for" in str(e).lower():
|
||||
utils.logger.info("[KuaishouLogin.login_by_qrcode] Login button not found, checking if already logged in ...")
|
||||
# Double check login state (quick check)
|
||||
is_logged_in = await self._quick_check_login_state()
|
||||
if is_logged_in:
|
||||
utils.logger.info("[KuaishouLogin.login_by_qrcode] Already logged in, skipping login ...")
|
||||
return
|
||||
utils.logger.warning(f"[KuaishouLogin.login_by_qrcode] Login button not found and not logged in: {e}")
|
||||
raise
|
||||
else:
|
||||
utils.logger.warning(f"[KuaishouLogin.login_by_qrcode] Normal click failed: {e}, trying force click...")
|
||||
try:
|
||||
# If normal click fails, try force click to bypass overlay
|
||||
await login_button_ele.click(force=True, timeout=5000)
|
||||
except Exception as e2:
|
||||
utils.logger.warning(f"[KuaishouLogin.login_by_qrcode] Force click failed: {e2}, trying JavaScript click...")
|
||||
# If force click also fails, use JavaScript to click directly
|
||||
await login_button_ele.evaluate("element => element.click()")
|
||||
|
||||
# Wait a moment for the login modal to appear
|
||||
await asyncio.sleep(1)
|
||||
|
||||
# find login qrcode
|
||||
qrcode_img_selector = "//div[@class='qrcode-img']//img"
|
||||
|
||||
Reference in New Issue
Block a user