更新部分爬虫以兼容本地运行及数据库存储

This commit is contained in:
z66
2025-12-16 10:56:56 +08:00
parent a9eda60493
commit ff1ce2a3ba
28 changed files with 1394 additions and 126 deletions
@@ -48,6 +48,8 @@ class BaiduTieBaClient(AbstractApiClient):
self._page_extractor = TieBaExtractor()
self.default_ip_proxy = default_ip_proxy
self.playwright_page = playwright_page # Playwright页面对象
self._last_captcha_check_time = 0 # 上次验证码检测时间
self._captcha_verified_recently = False # 是否最近完成过验证码
def _sync_request(self, method, url, proxy=None, **kwargs):
"""
@@ -210,6 +212,287 @@ class BaiduTieBaClient(AbstractApiClient):
self.headers["Cookie"] = cookie_str
utils.logger.info("[BaiduTieBaClient.update_cookies] Cookie has been updated")
async def _wait_for_captcha_completion(self, max_wait_time: int = 300):
    """
    Detect a Baidu captcha on the current Playwright page and wait until it is solved.

    The captcha is solved manually by the user (e.g. dragging the slider); this
    coroutine only polls the page and returns once the captcha is gone and has
    not reappeared after a short settling period.

    The call is a no-op when there is no Playwright page, when a captcha was
    confirmed solved less than 5 seconds ago, or when no captcha is detected.

    Args:
        max_wait_time: Maximum number of seconds to wait for the user to solve
            the captcha. Defaults to 300. The countdown restarts whenever a new
            captcha shows up right after a successful verification.

    Raises:
        TimeoutError: If the captcha is still unsolved after ``max_wait_time``
            seconds; callers are expected to catch it and skip the crawl.
    """
    if not self.playwright_page:
        return

    import time

    # Containers that Baidu captcha widgets are rendered in. Used both for the
    # initial detection and for every re-check inside the wait loop.
    captcha_selectors = [
        '.tang-pass-slider',
        '#captcha',
        '.vcode-img',
        '.pass-verify',
        '.tang-pass-verify',
        '.pass-verify-slider',
        'div[id*="captcha"]',
        'div[class*="verify"]',
        'div[class*="captcha"]',
    ]
    # Text-based selectors, only used for the initial detection.
    captcha_text_selectors = [
        'text=安全验证',
        'text=请输入验证码',
        'text=拖动',
        'text=滑动',
    ]
    # NOTE(review): 'div[class*="success"]' is very broad and may match
    # unrelated page elements -- confirm against real captcha pages.
    success_selectors = [
        '.tang-pass-success',
        '.pass-verify-success',
        'div[class*="success"]',
    ]

    async def _first_visible(selectors):
        """Return the first selector matching a visible element, or None."""
        for selector in selectors:
            try:
                element = await self.playwright_page.query_selector(selector)
                if element and await element.is_visible():
                    return selector
            except Exception:
                # Best-effort probe: an invalid selector or a page that is
                # navigating must not abort the whole check.
                continue
        return None

    async def _detect_captcha() -> bool:
        """Broad captcha detection: DOM selectors, URL keywords, page text."""
        if await _first_visible(captcha_selectors + captcha_text_selectors):
            return True

        url_lower = (self.playwright_page.url or "").lower()
        if any(key in url_lower for key in ("verify", "captcha", "wappass")):
            return True

        try:
            # Only the head of the document is scanned to keep this cheap.
            page_text = (await self.playwright_page.content())[:4000]
        except Exception:
            return False
        return any(
            kw in page_text
            for kw in ("安全验证", "请输入验证码", "完成验证", "滑块", "拖动完成验证")
        )

    # Debounce: skip detection when a captcha was confirmed solved < 5 s ago.
    if self._captcha_verified_recently:
        time_since_last_check = time.time() - self._last_captcha_check_time
        if time_since_last_check < 5:
            utils.logger.debug(
                f"[BaiduTieBaClient] 最近 {time_since_last_check:.1f} 秒内完成过验证码,跳过检测"
            )
            return
        self._captcha_verified_recently = False

    if not await _detect_captcha():
        return
    utils.logger.warning("[BaiduTieBaClient] 🔐 检测到验证码,请手动拖动完成验证...")

    # NOTE(review): the initial detection above also matches on URL and page
    # text, while the loop below only re-checks the DOM selectors. A captcha
    # detected purely via URL/text is therefore treated as "gone" on the first
    # iteration -- confirm whether that is intended.

    # Remember the URL so a redirect after verification can be recognised.
    initial_url = self.playwright_page.url
    utils.logger.info(f"[BaiduTieBaClient] 当前页面URL: {initial_url}")
    utils.logger.info(f"[BaiduTieBaClient] ⏳ 等待用户手动完成验证码(最多等待 {max_wait_time} 秒)...")

    # monotonic() keeps the deadline immune to wall-clock adjustments.
    start_time = time.monotonic()
    last_progress_log = 0.0  # elapsed seconds at the last progress message
    check_interval = 1  # seconds between polls
    progress_interval = 10  # seconds between progress messages

    while True:
        elapsed_time = time.monotonic() - start_time
        if elapsed_time >= max_wait_time:
            utils.logger.warning(
                f"[BaiduTieBaClient] ⏰ 等待验证码超时({max_wait_time}秒),跳过当前百度贴吧爬取任务"
            )
            # Raised outside the try block below so it always reaches the caller.
            raise TimeoutError(
                f"Baidu captcha wait timeout ({max_wait_time}s), skip tieba crawling"
            )

        try:
            success_selector = await _first_visible(success_selectors)
            if success_selector:
                utils.logger.info(
                    f"[BaiduTieBaClient] ✅ 检测到验证成功标识 (selector: {success_selector})"
                )

            captcha_still_exists = bool(await _first_visible(captcha_selectors))
            url_changed = self.playwright_page.url != initial_url

            # "Page changed" = explicit success marker, or captcha gone after a redirect.
            page_changed = bool(success_selector) or (not captcha_still_exists and url_changed)

            if page_changed or not captcha_still_exists:
                if page_changed:
                    utils.logger.info("[BaiduTieBaClient] 🔍 验证码已消失,检测到页面变化,等待3秒确认验证完成...")
                else:
                    utils.logger.info("[BaiduTieBaClient] 🔍 验证码已消失,等待3秒确认验证完成...")
                await asyncio.sleep(3)

                # Re-check: the widget may come back if the attempt was rejected.
                if await _first_visible(captcha_selectors):
                    if page_changed:
                        utils.logger.warning("[BaiduTieBaClient] ⚠️ 验证码重新出现,可能验证失败,继续等待...")
                else:
                    utils.logger.info("[BaiduTieBaClient] ✅ 验证码验证成功!")
                    if page_changed:
                        final_url = self.playwright_page.url
                        if url_changed:
                            utils.logger.info(f"[BaiduTieBaClient] 📍 页面已跳转: {initial_url} -> {final_url}")
                        else:
                            utils.logger.info("[BaiduTieBaClient] 📍 页面URL未变化,验证在当前页面完成")

                    # Let the page settle, then make sure no follow-up captcha appeared.
                    await asyncio.sleep(3)
                    utils.logger.info("[BaiduTieBaClient] 🔍 验证成功后,检查是否又出现验证码...")
                    await asyncio.sleep(2)

                    reappeared_selector = await _first_visible(captcha_selectors)
                    if not reappeared_selector:
                        utils.logger.info("[BaiduTieBaClient] ✅ 确认验证成功,未出现新的验证码,继续执行...")
                        # Stamp the success only now: stamping before the 5 s of
                        # settling sleeps would make the 5 s debounce never apply.
                        self._captcha_verified_recently = True
                        self._last_captcha_check_time = time.time()
                        break

                    utils.logger.warning(
                        f"[BaiduTieBaClient] ⚠️ 验证成功后检测到新的验证码 (selector: {reappeared_selector}),继续等待..."
                    )
                    # A new captcha showed up: restart the wait from scratch.
                    utils.logger.warning("[BaiduTieBaClient] ⚠️ 检测到新的验证码,重置等待状态...")
                    initial_url = self.playwright_page.url
                    start_time = time.monotonic()
                    last_progress_log = 0.0
                    continue
        except Exception as e:
            # Probing a page that is mid-navigation can fail; keep waiting.
            utils.logger.debug(f"[BaiduTieBaClient] 验证码检测异常: {e}")

        await asyncio.sleep(check_interval)

        # Progress message roughly every `progress_interval` seconds. Comparing
        # elapsed time (instead of `int(elapsed) % 10 == 0`) means a slow
        # iteration cannot skip the message.
        elapsed_now = time.monotonic() - start_time
        if elapsed_now - last_progress_log >= progress_interval:
            remaining_time = max(0, int(max_wait_time - elapsed_now))
            utils.logger.info(f"[BaiduTieBaClient] ⏳ 仍在等待验证码完成...(剩余 {remaining_time} 秒)")
            last_progress_log = elapsed_now
async def get_notes_by_keyword(
self,
keyword: str,
@@ -253,6 +536,9 @@ class BaiduTieBaClient(AbstractApiClient):
# 使用Playwright访问搜索页面
await self.playwright_page.goto(full_url, wait_until="domcontentloaded")
# 检测并等待验证码完成
await self._wait_for_captcha_completion()
# 等待页面加载,使用配置文件中的延时设置
await asyncio.sleep(config.CRAWLER_MAX_SLEEP_SEC)
@@ -290,6 +576,9 @@ class BaiduTieBaClient(AbstractApiClient):
# 使用Playwright访问帖子详情页面
await self.playwright_page.goto(note_url, wait_until="domcontentloaded")
# 检测并等待验证码完成
await self._wait_for_captcha_completion()
# 等待页面加载,使用配置文件中的延时设置
await asyncio.sleep(config.CRAWLER_MAX_SLEEP_SEC)
@@ -340,6 +629,9 @@ class BaiduTieBaClient(AbstractApiClient):
# 使用Playwright访问评论页面
await self.playwright_page.goto(comment_url, wait_until="domcontentloaded")
# 检测并等待验证码完成
await self._wait_for_captcha_completion()
# 等待页面加载,使用配置文件中的延时设置
await asyncio.sleep(config.CRAWLER_MAX_SLEEP_SEC)
@@ -448,6 +740,9 @@ class BaiduTieBaClient(AbstractApiClient):
# 使用Playwright访问子评论页面
await self.playwright_page.goto(sub_comment_url, wait_until="domcontentloaded")
# 检测并等待验证码完成
await self._wait_for_captcha_completion()
# 等待页面加载,使用配置文件中的延时设置
await asyncio.sleep(config.CRAWLER_MAX_SLEEP_SEC)
@@ -527,6 +822,9 @@ class BaiduTieBaClient(AbstractApiClient):
# 使用Playwright访问贴吧页面
await self.playwright_page.goto(tieba_url, wait_until="domcontentloaded")
# 检测并等待验证码完成
await self._wait_for_captcha_completion()
# 等待页面加载,使用配置文件中的延时设置
await asyncio.sleep(config.CRAWLER_MAX_SLEEP_SEC)
@@ -562,6 +860,9 @@ class BaiduTieBaClient(AbstractApiClient):
# 使用Playwright访问创作者主页
await self.playwright_page.goto(creator_url, wait_until="domcontentloaded")
# 检测并等待验证码完成
await self._wait_for_captcha_completion()
# 等待页面加载,使用配置文件中的延时设置
await asyncio.sleep(config.CRAWLER_MAX_SLEEP_SEC)
@@ -597,6 +898,9 @@ class BaiduTieBaClient(AbstractApiClient):
# 使用Playwright访问创作者帖子列表页面
await self.playwright_page.goto(creator_url, wait_until="domcontentloaded")
# 检测并等待验证码完成
await self._wait_for_captcha_completion()
# 等待页面加载,使用配置文件中的延时设置
await asyncio.sleep(config.CRAWLER_MAX_SLEEP_SEC)