更新部分爬虫以兼容本地运行及数据库存储
This commit is contained in:
@@ -48,6 +48,8 @@ class BaiduTieBaClient(AbstractApiClient):
|
||||
self._page_extractor = TieBaExtractor()
|
||||
self.default_ip_proxy = default_ip_proxy
|
||||
self.playwright_page = playwright_page # Playwright页面对象
|
||||
self._last_captcha_check_time = 0 # 上次验证码检测时间
|
||||
self._captcha_verified_recently = False # 是否最近完成过验证码
|
||||
|
||||
def _sync_request(self, method, url, proxy=None, **kwargs):
|
||||
"""
|
||||
@@ -210,6 +212,287 @@ class BaiduTieBaClient(AbstractApiClient):
|
||||
self.headers["Cookie"] = cookie_str
|
||||
utils.logger.info("[BaiduTieBaClient.update_cookies] Cookie has been updated")
|
||||
|
||||
async def _wait_for_captcha_completion(self, max_wait_time: int = 300):
    """
    Detect a Baidu captcha on the current Playwright page and wait until the
    user has solved it manually.

    The method returns immediately when no Playwright page is attached, when a
    captcha was completed within the last few seconds, or when no captcha is
    detected. Otherwise it polls the page roughly once per second until the
    captcha widgets are no longer visible (or a success marker is visible),
    waits for the page to settle, and verifies that no new captcha appeared
    before returning.

    Args:
        max_wait_time: Maximum number of seconds to wait for the captcha to be
            solved. Defaults to 300. The time budget is restarted when a new
            captcha shows up right after a successful verification.

    Raises:
        TimeoutError: If the captcha is still unsolved after ``max_wait_time``
            seconds, so the caller can abort the current crawl.
    """
    if not self.playwright_page:
        return

    # Function-scope import, as in the original: the module-level import block
    # is not guaranteed to provide `time`.
    import time

    # Selectors of the captcha widgets themselves; used for detection and for
    # every later "is it still there?" re-check.
    captcha_selectors = [
        '.tang-pass-slider',
        '#captcha',
        '.vcode-img',
        '.pass-verify',
        '.tang-pass-verify',
        '.pass-verify-slider',
        'div[id*="captcha"]',
        'div[class*="verify"]',
        'div[class*="captcha"]',
    ]
    # Text-based hints, only consulted during the initial detection.
    captcha_text_selectors = [
        'text=安全验证',
        'text=请输入验证码',
        'text=拖动',
        'text=滑动',
    ]
    # Elements whose visibility signals that verification succeeded.
    success_selectors = [
        '.tang-pass-success',
        '.pass-verify-success',
        'div[class*="success"]',
    ]
    recent_window = 5  # seconds during which a fresh verification skips detection
    check_interval = 1  # seconds between two polls
    log_interval = 10  # seconds between two "still waiting" messages

    async def _first_visible(selectors):
        """Return the first selector matching a visible element, or None."""
        for selector in selectors:
            try:
                element = await self.playwright_page.query_selector(selector)
                if element and await element.is_visible():
                    return selector
            except Exception:
                # Best effort: a selector that cannot be evaluated (e.g. the
                # page is navigating) is treated as "not present".
                continue
        return None

    async def _detect_captcha() -> bool:
        """Detect a captcha via DOM/text selectors, URL keywords and page text."""
        if await _first_visible(captcha_selectors + captcha_text_selectors):
            return True

        # URL keywords
        url_lower = (self.playwright_page.url or "").lower()
        if any(key in url_lower for key in ["verify", "captcha", "wappass"]):
            return True

        # Page text keywords (truncated to keep the check cheap)
        try:
            page_text = (await self.playwright_page.content())[:4000]
        except Exception as e:
            utils.logger.debug(f"[BaiduTieBaClient] Failed to read page content for captcha detection: {e}")
            return False
        return any(
            kw in page_text
            for kw in ["安全验证", "请输入验证码", "完成验证", "滑块", "拖动完成验证"]
        )

    # Skip detection when a captcha was completed very recently, to avoid
    # re-detecting the same (already solved) captcha.
    if self._captcha_verified_recently:
        time_since_last_check = time.time() - self._last_captcha_check_time
        if time_since_last_check < recent_window:
            utils.logger.debug(
                f"[BaiduTieBaClient] 最近 {time_since_last_check:.1f} 秒内完成过验证码,跳过检测"
            )
            return
        self._captcha_verified_recently = False

    if not await _detect_captcha():
        return
    utils.logger.warning("[BaiduTieBaClient] 🔐 检测到验证码,请手动拖动完成验证...")

    # Remember the URL so that a redirect after verification can be noticed.
    initial_url = self.playwright_page.url
    utils.logger.info(f"[BaiduTieBaClient] 当前页面URL: {initial_url}")
    utils.logger.info(f"[BaiduTieBaClient] ⏳ 等待用户手动完成验证码(最多等待 {max_wait_time} 秒)...")

    start_time = time.time()
    last_log_time = 0

    while True:
        if time.time() - start_time >= max_wait_time:
            utils.logger.warning(
                f"[BaiduTieBaClient] ⏰ 等待验证码超时({max_wait_time}秒),跳过当前百度贴吧爬取任务"
            )
            # Abort the current Tieba crawl; the caller is expected to handle this.
            raise TimeoutError(
                f"Baidu captcha wait timeout ({max_wait_time}s), skip tieba crawling"
            )

        try:
            success_selector = await _first_visible(success_selectors)
            if success_selector:
                utils.logger.info(f"[BaiduTieBaClient] ✅ 检测到验证成功标识 (selector: {success_selector})")

            captcha_gone = (await _first_visible(captcha_selectors)) is None
            url_changed = self.playwright_page.url != initial_url

            # Two ways to become a "solved" candidate:
            #   - page_changed: a success marker is visible, or the captcha is
            #     gone and the page was redirected;
            #   - otherwise the captcha simply vanished on the same URL.
            page_changed = bool(success_selector) or (captcha_gone and url_changed)
            if page_changed or captcha_gone:
                if page_changed:
                    utils.logger.info("[BaiduTieBaClient] 🔍 验证码已消失,检测到页面变化,等待3秒确认验证完成...")
                else:
                    utils.logger.info("[BaiduTieBaClient] 🔍 验证码已消失,等待3秒确认验证完成...")
                await asyncio.sleep(3)

                # Confirm that the captcha is really gone after the delay.
                if (await _first_visible(captcha_selectors)) is None:
                    final_url = self.playwright_page.url
                    utils.logger.info("[BaiduTieBaClient] ✅ 验证码验证成功!")
                    if page_changed:
                        if url_changed:
                            utils.logger.info(f"[BaiduTieBaClient] 📍 页面已跳转: {initial_url} -> {final_url}")
                        else:
                            utils.logger.info("[BaiduTieBaClient] 📍 页面URL未变化,验证在当前页面完成")

                    # Let the page settle, then make sure the redirect did not
                    # land on yet another captcha page.
                    await asyncio.sleep(3)
                    utils.logger.info("[BaiduTieBaClient] 🔍 验证成功后,检查是否又出现验证码...")
                    await asyncio.sleep(2)

                    new_captcha = await _first_visible(captcha_selectors)
                    if new_captcha is None:
                        utils.logger.info("[BaiduTieBaClient] ✅ 确认验证成功,未出现新的验证码,继续执行...")
                        # Stamp the completion time here, at the moment we
                        # return: stamping before the settling sleeps above
                        # would make the recent-verification window expire
                        # before the next call could ever use it.
                        self._captcha_verified_recently = True
                        self._last_captcha_check_time = time.time()
                        break

                    utils.logger.warning(f"[BaiduTieBaClient] ⚠️ 验证成功后检测到新的验证码 (selector: {new_captcha}),继续等待...")
                    # A new captcha appeared: restart the wait from scratch.
                    utils.logger.warning("[BaiduTieBaClient] ⚠️ 检测到新的验证码,重置等待状态...")
                    initial_url = self.playwright_page.url
                    start_time = time.time()
                    last_log_time = 0
                    continue
                elif page_changed:
                    # The captcha is back: verification probably failed or the page refreshed.
                    utils.logger.warning("[BaiduTieBaClient] ⚠️ 验证码重新出现,可能验证失败,继续等待...")

        except Exception as e:
            # Detection errors (e.g. during navigation) must not abort the wait.
            utils.logger.debug(f"[BaiduTieBaClient] 验证码检测异常: {e}")

        await asyncio.sleep(check_interval)

        # Periodic progress message. Compare against the last logged second
        # instead of testing `elapsed % 10 == 0`, which is skipped whenever an
        # iteration does not land exactly on a multiple of ten.
        waited = int(time.time() - start_time)
        if waited - last_log_time >= log_interval:
            remaining_time = max(0, max_wait_time - waited)
            utils.logger.info(f"[BaiduTieBaClient] ⏳ 仍在等待验证码完成...(剩余 {remaining_time} 秒)")
            last_log_time = waited
|
||||
|
||||
async def get_notes_by_keyword(
|
||||
self,
|
||||
keyword: str,
|
||||
@@ -253,6 +536,9 @@ class BaiduTieBaClient(AbstractApiClient):
|
||||
# 使用Playwright访问搜索页面
|
||||
await self.playwright_page.goto(full_url, wait_until="domcontentloaded")
|
||||
|
||||
# 检测并等待验证码完成
|
||||
await self._wait_for_captcha_completion()
|
||||
|
||||
# 等待页面加载,使用配置文件中的延时设置
|
||||
await asyncio.sleep(config.CRAWLER_MAX_SLEEP_SEC)
|
||||
|
||||
@@ -290,6 +576,9 @@ class BaiduTieBaClient(AbstractApiClient):
|
||||
# 使用Playwright访问帖子详情页面
|
||||
await self.playwright_page.goto(note_url, wait_until="domcontentloaded")
|
||||
|
||||
# 检测并等待验证码完成
|
||||
await self._wait_for_captcha_completion()
|
||||
|
||||
# 等待页面加载,使用配置文件中的延时设置
|
||||
await asyncio.sleep(config.CRAWLER_MAX_SLEEP_SEC)
|
||||
|
||||
@@ -340,6 +629,9 @@ class BaiduTieBaClient(AbstractApiClient):
|
||||
# 使用Playwright访问评论页面
|
||||
await self.playwright_page.goto(comment_url, wait_until="domcontentloaded")
|
||||
|
||||
# 检测并等待验证码完成
|
||||
await self._wait_for_captcha_completion()
|
||||
|
||||
# 等待页面加载,使用配置文件中的延时设置
|
||||
await asyncio.sleep(config.CRAWLER_MAX_SLEEP_SEC)
|
||||
|
||||
@@ -448,6 +740,9 @@ class BaiduTieBaClient(AbstractApiClient):
|
||||
# 使用Playwright访问子评论页面
|
||||
await self.playwright_page.goto(sub_comment_url, wait_until="domcontentloaded")
|
||||
|
||||
# 检测并等待验证码完成
|
||||
await self._wait_for_captcha_completion()
|
||||
|
||||
# 等待页面加载,使用配置文件中的延时设置
|
||||
await asyncio.sleep(config.CRAWLER_MAX_SLEEP_SEC)
|
||||
|
||||
@@ -527,6 +822,9 @@ class BaiduTieBaClient(AbstractApiClient):
|
||||
# 使用Playwright访问贴吧页面
|
||||
await self.playwright_page.goto(tieba_url, wait_until="domcontentloaded")
|
||||
|
||||
# 检测并等待验证码完成
|
||||
await self._wait_for_captcha_completion()
|
||||
|
||||
# 等待页面加载,使用配置文件中的延时设置
|
||||
await asyncio.sleep(config.CRAWLER_MAX_SLEEP_SEC)
|
||||
|
||||
@@ -562,6 +860,9 @@ class BaiduTieBaClient(AbstractApiClient):
|
||||
# 使用Playwright访问创作者主页
|
||||
await self.playwright_page.goto(creator_url, wait_until="domcontentloaded")
|
||||
|
||||
# 检测并等待验证码完成
|
||||
await self._wait_for_captcha_completion()
|
||||
|
||||
# 等待页面加载,使用配置文件中的延时设置
|
||||
await asyncio.sleep(config.CRAWLER_MAX_SLEEP_SEC)
|
||||
|
||||
@@ -597,6 +898,9 @@ class BaiduTieBaClient(AbstractApiClient):
|
||||
# 使用Playwright访问创作者帖子列表页面
|
||||
await self.playwright_page.goto(creator_url, wait_until="domcontentloaded")
|
||||
|
||||
# 检测并等待验证码完成
|
||||
await self._wait_for_captcha_completion()
|
||||
|
||||
# 等待页面加载,使用配置文件中的延时设置
|
||||
await asyncio.sleep(config.CRAWLER_MAX_SLEEP_SEC)
|
||||
|
||||
|
||||
Reference in New Issue
Block a user