更新部分爬虫以兼容本地运行及数据库存储

This commit is contained in:
z66
2025-12-16 10:56:56 +08:00
parent a9eda60493
commit ff1ce2a3ba
28 changed files with 1394 additions and 126 deletions
@@ -45,13 +45,51 @@ class KuaiShouClient(AbstractApiClient):
self.graphql = KuaiShouGraphQL()
async def request(self, method, url, **kwargs) -> Any:
    """Send an HTTP request, retrying network failures with a proxy fallback.

    The first attempt goes through ``self.proxy`` when one is configured;
    every later attempt connects directly. At most three attempts are made,
    with a one-second pause between them. Only network-level failures are
    retried; anything else is logged and re-raised immediately.

    Args:
        method: HTTP method name forwarded to ``httpx.AsyncClient.request``.
        url: Target URL.
        **kwargs: Extra keyword arguments forwarded to
            ``httpx.AsyncClient.request``.

    Returns:
        The ``"data"`` member of the decoded JSON response, or ``{}`` when
        that member is absent.

    Raises:
        DataFetchError: The decoded response has a truthy ``"errors"`` member.
        httpx.NetworkError: Every attempt failed at the network level.
        httpx.ConnectTimeout: Every attempt timed out while connecting.
    """
    max_retries = 3
    # Build proxy attempts: try the proxy first (if set), then always allow
    # a direct attempt.
    proxy_attempts: List[Optional[str]] = []
    if self.proxy:
        proxy_attempts.append(self.proxy)
    proxy_attempts.append(None)

    for attempt in range(max_retries):
        # Once the attempt index runs past the list, keep using the last
        # entry (the direct connection).
        proxy_to_use = proxy_attempts[min(attempt, len(proxy_attempts) - 1)]
        try:
            async with httpx.AsyncClient(proxy=proxy_to_use) as client:
                response = await client.request(method, url, timeout=self.timeout, **kwargs)
                data: Dict = response.json()
                errors = data.get("errors")
                if errors:
                    raise DataFetchError(errors)
                return data.get("data", {})
        except (httpx.NetworkError, httpx.ConnectTimeout) as e:
            # httpx.ConnectError is a subclass of httpx.NetworkError, so it is
            # covered here without being listed separately.
            utils.logger.warning(
                f"[KuaiShouClient.request] Network error (attempt {attempt+1}/{max_retries}) "
                f"proxy={proxy_to_use} url={url} err={e!r}"
            )
            if attempt < max_retries - 1:
                await asyncio.sleep(1)
                continue
            utils.logger.error(
                f"[KuaiShouClient.request] Network failed after {max_retries} attempts "
                f"proxy={proxy_to_use} url={url} err={e!r}"
            )
            raise
        except Exception as e:
            # Other exceptions (like DataFetchError) are not transient: log
            # and re-raise without retrying.
            utils.logger.error(
                f"[KuaiShouClient.request] Request failed proxy={proxy_to_use} url={url} err={e!r}"
            )
            raise
    # Not reachable: each iteration above either returns or raises, and the
    # final attempt always re-raises its network error.
async def get(self, uri: str, params=None) -> Dict:
final_uri = uri
@@ -83,7 +83,26 @@ class KuaishouCrawler(AbstractCrawler):
self.context_page = await self.browser_context.new_page()
await self.context_page.goto(f"{self.index_url}?isHome=1")
# 添加重试机制处理网络连接错误
max_retries = 3
retry_count = 0
while retry_count < max_retries:
try:
await self.context_page.goto(f"{self.index_url}?isHome=1", timeout=30000)
break
except Exception as e:
retry_count += 1
error_msg = str(e)
if "ERR_CONNECTION_RESET" in error_msg or "net::" in error_msg or "Connection" in error_msg:
if retry_count < max_retries:
utils.logger.warning(f"[KuaishouCrawler] 网络连接错误,第 {retry_count} 次重试: {e}")
await asyncio.sleep(2 * retry_count) # 递增等待时间
else:
utils.logger.error(f"[KuaishouCrawler] 网络连接失败,已重试 {max_retries} 次: {e}")
raise
else:
# 非网络错误直接抛出
raise
# Create a client to interact with the kuaishou website.
self.ks_client = await self.create_ks_client(httpx_proxy_format)
@@ -49,6 +49,21 @@ class KuaishouLogin(AbstractLogin):
else:
raise ValueError("[KuaishouLogin.begin] Invalid Login Type Currently only supported qrcode or phone or cookie ...")
async def _quick_check_login_state(self) -> bool:
    """Check once, without retrying, whether a login cookie is present.

    Returns:
        True when the browser context's cookies contain a non-empty
        ``passToken`` entry; False when the entry is missing or empty, or
        when reading or converting the cookies raises.
    """
    try:
        raw_cookies = await self.browser_context.cookies()
        _cookie_str, cookie_map = utils.convert_cookies(raw_cookies)
        return bool(cookie_map.get("passToken"))
    except Exception:
        # Best-effort probe: any failure is reported as "not logged in".
        return False
@retry(stop=stop_after_attempt(600), wait=wait_fixed(1), retry=retry_if_result(lambda value: value is False))
async def check_login_state(self) -> bool:
"""
@@ -67,11 +82,47 @@ class KuaishouLogin(AbstractLogin):
"""login kuaishou website and keep webdriver login state"""
utils.logger.info("[KuaishouLogin.login_by_qrcode] Begin login kuaishou by qrcode ...")
# click login button
# Check if already logged in (quick check without retry)
is_logged_in = await self._quick_check_login_state()
if is_logged_in:
utils.logger.info("[KuaishouLogin.login_by_qrcode] Already logged in, skipping login button click ...")
return
# Check if login button exists (if not, might already be logged in)
login_button_ele = self.context_page.locator(
"xpath=//p[text()='登录']"
)
await login_button_ele.click()
try:
# Wait for the element to be visible with a shorter timeout
await login_button_ele.wait_for(state="visible", timeout=3000)
utils.logger.info("[KuaishouLogin.login_by_qrcode] Login button found, attempting to click ...")
# Try normal click first
await login_button_ele.click(timeout=5000)
except Exception as e:
# If login button is not found, might already be logged in
if "timeout" in str(e).lower() or "waiting for" in str(e).lower():
utils.logger.info("[KuaishouLogin.login_by_qrcode] Login button not found, checking if already logged in ...")
# Double check login state (quick check)
is_logged_in = await self._quick_check_login_state()
if is_logged_in:
utils.logger.info("[KuaishouLogin.login_by_qrcode] Already logged in, skipping login ...")
return
utils.logger.warning(f"[KuaishouLogin.login_by_qrcode] Login button not found and not logged in: {e}")
raise
else:
utils.logger.warning(f"[KuaishouLogin.login_by_qrcode] Normal click failed: {e}, trying force click...")
try:
# If normal click fails, try force click to bypass overlay
await login_button_ele.click(force=True, timeout=5000)
except Exception as e2:
utils.logger.warning(f"[KuaishouLogin.login_by_qrcode] Force click failed: {e2}, trying JavaScript click...")
# If force click also fails, use JavaScript to click directly
await login_button_ele.evaluate("element => element.click()")
# Wait a moment for the login modal to appear
await asyncio.sleep(1)
# find login qrcode
qrcode_img_selector = "//div[@class='qrcode-img']//img"