更新部分爬虫以兼容本地运行及数据库存储

This commit is contained in:
z66
2025-12-16 10:56:56 +08:00
parent a9eda60493
commit ff1ce2a3ba
28 changed files with 1394 additions and 126 deletions
@@ -9,8 +9,8 @@
# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。
# 基础配置
PLATFORM = "zhihu" # 平台,xhs | dy | ks | bili | wb | tieba | zhihu
KEYWORDS = "F6智慧门店,南京爱福路汽车科技有限公司,汽车后市场,汽修店,新康众" # 关键词搜索配置,以英文逗号分隔
PLATFORM = "ks" # 平台,xhs | dy | ks | bili | wb | tieba | zhihu
KEYWORDS = "F6智慧门店,F6智数,中国汽车后市场白皮书,南京爱福路汽车科技有限公司,汽车后市场,汽车修理厂,新康众,天猫养车,汽后,汽修厂,爱福路,康众" # 关键词搜索配置,以英文逗号分隔
LOGIN_TYPE = "qrcode" # qrcode or phone or cookie
COOKIES = ""
CRAWLER_TYPE = "search" # 爬取类型,search(关键词搜索) | detail(帖子详情)| creator(创作者主页数据)
@@ -30,6 +30,12 @@ IP_PROXY_PROVIDER_NAME = "kuaidaili" # kuaidaili | wandouhttp
# 抖音如果一直提示失败,打开浏览器看下是否扫码登录之后出现了手机号验证,如果出现了手动过一下再试。
HEADLESS = True
# HTTP/网络配置
# 如内网有自签名证书导致 TLS 失败,可临时置为 False
HTTPX_VERIFY = False
# 若需要指定上游代理(如 http://user:pass@host:port),填此值;留空使用系统/环境变量
HTTPX_PROXY = ""
# 是否保存登录状态
SAVE_LOGIN_STATE = True
@@ -13,10 +13,11 @@ _engines = {}
async def create_database_if_not_exists(db_type: str):
if db_type == "mysql" or db_type == "db":
# Connect to the server without a database
server_url = f"mysql+asyncmy://{mysql_db_config['user']}:{mysql_db_config['password']}@{mysql_db_config['host']}:{mysql_db_config['port']}"
server_url = f"mysql+asyncmy://{mysql_db_config['user']}:{mysql_db_config['password']}@{mysql_db_config['host']}:{mysql_db_config['port']}?charset=utf8mb4"
engine = create_async_engine(server_url, echo=False)
async with engine.connect() as conn:
await conn.execute(text(f"CREATE DATABASE IF NOT EXISTS {mysql_db_config['db_name']}"))
# 确保数据库使用utf8mb4字符集
await conn.execute(text(f"CREATE DATABASE IF NOT EXISTS {mysql_db_config['db_name']} CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci"))
await engine.dispose()
elif db_type == "postgresql":
# Connect to PostgreSQL default database (postgres) to create target database
@@ -48,7 +49,8 @@ def get_async_engine(db_type: str = None):
if db_type == "sqlite":
db_url = f"sqlite+aiosqlite:///{sqlite_db_config['db_path']}"
elif db_type == "mysql" or db_type == "db":
db_url = f"mysql+asyncmy://{mysql_db_config['user']}:{mysql_db_config['password']}@{mysql_db_config['host']}:{mysql_db_config['port']}/{mysql_db_config['db_name']}"
# 添加charset=utf8mb4以支持完整的UTF-8编码(包括emoji和中文)
db_url = f"mysql+asyncmy://{mysql_db_config['user']}:{mysql_db_config['password']}@{mysql_db_config['host']}:{mysql_db_config['port']}/{mysql_db_config['db_name']}?charset=utf8mb4"
elif db_type == "postgresql":
db_url = f"postgresql+asyncpg://{postgresql_db_config['user']}:{postgresql_db_config['password']}@{postgresql_db_config['host']}:{postgresql_db_config['port']}/{postgresql_db_config['db_name']}"
else:
@@ -11,42 +11,54 @@
import asyncio
import sys
from typing import Optional
from typing import Dict, Optional, Type
import importlib
import cmd_arg
import config
from database import db
from base.base_crawler import AbstractCrawler
from media_platform.bilibili import BilibiliCrawler
from media_platform.douyin import DouYinCrawler
from media_platform.kuaishou import KuaishouCrawler
from media_platform.tieba import TieBaCrawler
from media_platform.weibo import WeiboCrawler
from media_platform.xhs import XiaoHongShuCrawler
from media_platform.zhihu import ZhihuCrawler
from tools.async_file_writer import AsyncFileWriter
from var import crawler_type_var
class CrawlerFactory:
CRAWLERS = {
"xhs": XiaoHongShuCrawler,
"dy": DouYinCrawler,
"ks": KuaishouCrawler,
"bili": BilibiliCrawler,
"wb": WeiboCrawler,
"tieba": TieBaCrawler,
"zhihu": ZhihuCrawler,
_CRAWLER_PATHS = {
"xhs": "media_platform.xhs.XiaoHongShuCrawler",
"dy": "media_platform.douyin.DouYinCrawler",
"ks": "media_platform.kuaishou.KuaishouCrawler",
"bili": "media_platform.bilibili.BilibiliCrawler",
"wb": "media_platform.weibo.WeiboCrawler",
"tieba": "media_platform.tieba.TieBaCrawler",
"zhihu": "media_platform.zhihu.ZhihuCrawler",
}
_cache: Dict[str, Type[AbstractCrawler]] = {}
@staticmethod
def create_crawler(platform: str) -> AbstractCrawler:
crawler_class = CrawlerFactory.CRAWLERS.get(platform)
if not crawler_class:
path = CrawlerFactory._CRAWLER_PATHS.get(platform)
if not path:
raise ValueError(
"Invalid Media Platform Currently only supported xhs or dy or ks or bili ..."
)
return crawler_class()
if platform not in CrawlerFactory._cache:
module_name, class_name = path.rsplit(".", 1)
try:
module = importlib.import_module(module_name)
crawler_class = getattr(module, class_name)
except ModuleNotFoundError as exc:
hint = (
"Please install optional dependency 'xhshow' (pip install xhshow) "
"or disable the xhs platform."
if platform == "xhs" and exc.name == "xhshow"
else f"Missing dependency while importing {module_name}"
)
raise ModuleNotFoundError(f"{exc}: {hint}") from exc
CrawlerFactory._cache[platform] = crawler_class
return CrawlerFactory._cache[platform]()
crawler: Optional[AbstractCrawler] = None
@@ -60,6 +72,12 @@ async def main():
# Init crawler
global crawler
# 导入工具模块以初始化日志
from tools import utils
utils.logger.info("=" * 60)
utils.logger.info("MediaCrawler 启动")
utils.logger.info("=" * 60)
# parse cmd
args = await cmd_arg.parse_cmd()
@@ -49,8 +49,27 @@ class BilibiliClient(AbstractApiClient):
self.cookie_dict = cookie_dict
async def request(self, method, url, **kwargs) -> Any:
async with httpx.AsyncClient(proxy=self.proxy) as client:
response = await client.request(method, url, timeout=self.timeout, **kwargs)
"""
Basic HTTP request wrapper with retries for transient network errors.
"""
verify = getattr(config, "HTTPX_VERIFY", True)
# 优先使用传入 proxy,其次是 config.HTTPX_PROXY,最后走系统环境变量
proxy = self.proxy or getattr(config, "HTTPX_PROXY", "") or None
async with httpx.AsyncClient(proxy=proxy, timeout=self.timeout, verify=verify) as client:
# 简单重试,处理短暂的连接失败
last_exc: Optional[Exception] = None
for attempt in range(3):
try:
response = await client.request(method, url, **kwargs)
break
except httpx.HTTPError as e:
last_exc = e
if attempt == 2:
# 3rd failure -> give up
utils.logger.error(f"[BilibiliClient.request] Network error on {method} {url}: {repr(e)}")
raise DataFetchError(f"network error: {e}") from e
await asyncio.sleep(1)
try:
data: Dict = response.json()
except json.JSONDecodeError:
@@ -68,10 +68,23 @@ class BilibiliLogin(AbstractLogin):
return True
return False
async def _has_valid_login_cookie(self) -> bool:
"""
快速检查当前上下文是否已有登录态用于避免重复扫码
"""
current_cookie = await self.browser_context.cookies()
_, cookie_dict = utils.convert_cookies(current_cookie)
return bool(cookie_dict.get("SESSDATA") or cookie_dict.get("DedeUserID"))
async def login_by_qrcode(self):
"""login bilibili website and keep webdriver login state"""
utils.logger.info("[BilibiliLogin.login_by_qrcode] Begin login bilibili by qrcode ...")
# 如果已经登录则直接跳过扫码流程
if await self._has_valid_login_cookie():
utils.logger.info("[BilibiliLogin.login_by_qrcode] 已检测到有效登录态,跳过扫码登录")
return
# click login button
login_button_ele = self.context_page.locator(
"xpath=//div[@class='right-entry__outside go-login-btn']//div"
@@ -95,6 +95,7 @@ class DouYinClient(AbstractApiClient):
params["a_bogus"] = a_bogus
async def request(self, method, url, **kwargs):
try:
async with httpx.AsyncClient(proxy=self.proxy) as client:
response = await client.request(method, url, timeout=self.timeout, **kwargs)
try:
@@ -104,6 +105,15 @@ class DouYinClient(AbstractApiClient):
return response.json()
except Exception as e:
raise DataFetchError(f"{e}, {response.text}")
except (httpx.ConnectError, httpx.ConnectTimeout, httpx.ReadTimeout, httpx.WriteTimeout) as e:
utils.logger.error(f"网络连接错误: {type(e).__name__}: {e}")
raise DataFetchError(f"网络连接失败: {type(e).__name__}: {e}")
except httpx.TimeoutException as e:
utils.logger.error(f"请求超时: {e}")
raise DataFetchError(f"请求超时: {e}")
except Exception as e:
utils.logger.error(f"请求异常: {type(e).__name__}: {e}")
raise DataFetchError(f"请求失败: {type(e).__name__}: {e}")
async def get(self, uri: str, params: Optional[Dict] = None, headers: Optional[Dict] = None):
"""
@@ -121,6 +121,8 @@ class DouYinCrawler(AbstractCrawler):
utils.logger.info(f"[DouYinCrawler.search] Skip {page}")
page += 1
continue
posts_res = None
retry_success = False
try:
utils.logger.info(f"[DouYinCrawler.search] search douyin keyword: {keyword}, page: {page}")
posts_res = await self.dy_client.search_info_by_keyword(
@@ -129,12 +131,37 @@ class DouYinCrawler(AbstractCrawler):
publish_time=PublishTimeType(config.PUBLISH_TIME_TYPE),
search_id=dy_search_id,
)
retry_success = True
except DataFetchError as e:
utils.logger.error(f"[DouYinCrawler.search] search douyin keyword: {keyword} failed: {e}")
# 如果是网络连接错误,等待后重试一次
if "网络连接" in str(e) or "ConnectError" in str(e) or "超时" in str(e):
utils.logger.warning(f"[DouYinCrawler.search] 网络错误,等待3秒后重试...")
await asyncio.sleep(3)
try:
posts_res = await self.dy_client.search_info_by_keyword(
keyword=keyword,
offset=page * dy_limit_count - dy_limit_count,
publish_time=PublishTimeType(config.PUBLISH_TIME_TYPE),
search_id=dy_search_id,
)
retry_success = True
except Exception as retry_e:
utils.logger.error(f"[DouYinCrawler.search] 重试失败: {retry_e}")
break
else:
break
except Exception as e:
utils.logger.error(f"[DouYinCrawler.search] search douyin keyword: {keyword} unexpected error: {type(e).__name__}: {e}")
break
# 如果请求失败(包括重试失败),跳过后续处理
if not retry_success or posts_res is None:
break
if posts_res.get("data") is None or posts_res.get("data") == []:
utils.logger.info(f"[DouYinCrawler.search] search douyin keyword: {keyword}, page: {page} is empty,{posts_res.get('data')}`")
break
except DataFetchError:
utils.logger.error(f"[DouYinCrawler.search] search douyin keyword: {keyword} failed")
break
page += 1
if "data" not in posts_res:
@@ -45,13 +45,51 @@ class KuaiShouClient(AbstractApiClient):
self.graphql = KuaiShouGraphQL()
async def request(self, method, url, **kwargs) -> Any:
async with httpx.AsyncClient(proxy=self.proxy) as client:
"""Make HTTP request with retry and proxy fallback."""
max_retries = 3
# build proxy attempts: try proxy first (if set), then no-proxy
proxy_attempts: List[Optional[str]] = []
if self.proxy:
proxy_attempts.append(self.proxy)
proxy_attempts.append(None) # always allow a direct attempt
last_exc: Optional[Exception] = None
for attempt in range(max_retries):
proxy_to_use = proxy_attempts[min(attempt, len(proxy_attempts) - 1)]
try:
async with httpx.AsyncClient(proxy=proxy_to_use) as client:
response = await client.request(method, url, timeout=self.timeout, **kwargs)
data: Dict = response.json()
if data.get("errors"):
raise DataFetchError(data.get("errors", "unkonw error"))
else:
return data.get("data", {})
except (httpx.ConnectError, httpx.ConnectTimeout, httpx.NetworkError) as e:
last_exc = e
utils.logger.warning(
f"[KuaiShouClient.request] Network error (attempt {attempt+1}/{max_retries}) "
f"proxy={proxy_to_use} url={url} err={e!r}"
)
if attempt < max_retries - 1:
await asyncio.sleep(1)
continue
utils.logger.error(
f"[KuaiShouClient.request] Network failed after {max_retries} attempts "
f"proxy={proxy_to_use} url={url} err={e!r}"
)
raise
except Exception as e:
# For other exceptions (like DataFetchError), don't retry
last_exc = e
utils.logger.error(
f"[KuaiShouClient.request] Request failed proxy={proxy_to_use} url={url} err={e!r}"
)
raise
# If somehow we exit the loop without returning, raise last exception
if last_exc:
raise last_exc
async def get(self, uri: str, params=None) -> Dict:
final_uri = uri
@@ -83,7 +83,26 @@ class KuaishouCrawler(AbstractCrawler):
self.context_page = await self.browser_context.new_page()
await self.context_page.goto(f"{self.index_url}?isHome=1")
# 添加重试机制处理网络连接错误
max_retries = 3
retry_count = 0
while retry_count < max_retries:
try:
await self.context_page.goto(f"{self.index_url}?isHome=1", timeout=30000)
break
except Exception as e:
retry_count += 1
error_msg = str(e)
if "ERR_CONNECTION_RESET" in error_msg or "net::" in error_msg or "Connection" in error_msg:
if retry_count < max_retries:
utils.logger.warning(f"[KuaishouCrawler] 网络连接错误,第 {retry_count} 次重试: {e}")
await asyncio.sleep(2 * retry_count) # 递增等待时间
else:
utils.logger.error(f"[KuaishouCrawler] 网络连接失败,已重试 {max_retries} 次: {e}")
raise
else:
# 非网络错误直接抛出
raise
# Create a client to interact with the kuaishou website.
self.ks_client = await self.create_ks_client(httpx_proxy_format)
@@ -49,6 +49,21 @@ class KuaishouLogin(AbstractLogin):
else:
raise ValueError("[KuaishouLogin.begin] Invalid Login Type Currently only supported qrcode or phone or cookie ...")
async def _quick_check_login_state(self) -> bool:
"""
Quick check if the current login status is successful without retry
Returns True if logged in, False otherwise
"""
try:
current_cookie = await self.browser_context.cookies()
_, cookie_dict = utils.convert_cookies(current_cookie)
kuaishou_pass_token = cookie_dict.get("passToken")
if kuaishou_pass_token:
return True
return False
except Exception:
return False
@retry(stop=stop_after_attempt(600), wait=wait_fixed(1), retry=retry_if_result(lambda value: value is False))
async def check_login_state(self) -> bool:
"""
@@ -67,11 +82,47 @@ class KuaishouLogin(AbstractLogin):
"""login kuaishou website and keep webdriver login state"""
utils.logger.info("[KuaishouLogin.login_by_qrcode] Begin login kuaishou by qrcode ...")
# click login button
# Check if already logged in (quick check without retry)
is_logged_in = await self._quick_check_login_state()
if is_logged_in:
utils.logger.info("[KuaishouLogin.login_by_qrcode] Already logged in, skipping login button click ...")
return
# Check if login button exists (if not, might already be logged in)
login_button_ele = self.context_page.locator(
"xpath=//p[text()='登录']"
)
await login_button_ele.click()
try:
# Wait for the element to be visible with a shorter timeout
await login_button_ele.wait_for(state="visible", timeout=3000)
utils.logger.info("[KuaishouLogin.login_by_qrcode] Login button found, attempting to click ...")
# Try normal click first
await login_button_ele.click(timeout=5000)
except Exception as e:
# If login button is not found, might already be logged in
if "timeout" in str(e).lower() or "waiting for" in str(e).lower():
utils.logger.info("[KuaishouLogin.login_by_qrcode] Login button not found, checking if already logged in ...")
# Double check login state (quick check)
is_logged_in = await self._quick_check_login_state()
if is_logged_in:
utils.logger.info("[KuaishouLogin.login_by_qrcode] Already logged in, skipping login ...")
return
utils.logger.warning(f"[KuaishouLogin.login_by_qrcode] Login button not found and not logged in: {e}")
raise
else:
utils.logger.warning(f"[KuaishouLogin.login_by_qrcode] Normal click failed: {e}, trying force click...")
try:
# If normal click fails, try force click to bypass overlay
await login_button_ele.click(force=True, timeout=5000)
except Exception as e2:
utils.logger.warning(f"[KuaishouLogin.login_by_qrcode] Force click failed: {e2}, trying JavaScript click...")
# If force click also fails, use JavaScript to click directly
await login_button_ele.evaluate("element => element.click()")
# Wait a moment for the login modal to appear
await asyncio.sleep(1)
# find login qrcode
qrcode_img_selector = "//div[@class='qrcode-img']//img"
@@ -48,6 +48,8 @@ class BaiduTieBaClient(AbstractApiClient):
self._page_extractor = TieBaExtractor()
self.default_ip_proxy = default_ip_proxy
self.playwright_page = playwright_page # Playwright页面对象
self._last_captcha_check_time = 0 # 上次验证码检测时间
self._captcha_verified_recently = False # 是否最近完成过验证码
def _sync_request(self, method, url, proxy=None, **kwargs):
"""
@@ -210,6 +212,287 @@ class BaiduTieBaClient(AbstractApiClient):
self.headers["Cookie"] = cookie_str
utils.logger.info("[BaiduTieBaClient.update_cookies] Cookie has been updated")
async def _wait_for_captcha_completion(self, max_wait_time: int = 300):
"""
检测并等待百度验证码完成在爬虫过程中使用
等待用户手动拖动验证码验证成功后自动识别并继续
Args:
max_wait_time: 最大等待时间默认120秒
"""
if not self.playwright_page:
return
import time
async def _detect_captcha() -> bool:
"""更全面地检测验证码(包含文本、URL 及常见容器)"""
# DOM 选择器
selector_hits = [
'.tang-pass-slider',
'#captcha',
'.vcode-img',
'.pass-verify',
'.tang-pass-verify',
'.pass-verify-slider',
'div[id*="captcha"]',
'div[class*="verify"]',
'div[class*="captcha"]',
'text=安全验证',
'text=请输入验证码',
'text=拖动',
'text=滑动',
]
for selector in selector_hits:
try:
element = await self.playwright_page.query_selector(selector)
if element and await element.is_visible():
return True
except Exception:
continue
# URL 关键词
url_lower = (self.playwright_page.url or "").lower()
if any(key in url_lower for key in ["verify", "captcha", "wappass"]):
return True
# 页面文本关键词(截断以降低开销)
try:
page_text = (await self.playwright_page.content())[:4000]
if any(
kw in page_text
for kw in ["安全验证", "请输入验证码", "完成验证", "滑块", "拖动完成验证"]
):
return True
except Exception:
pass
return False
# 如果最近5秒内刚完成过验证码,跳过检测(避免重复检测)
if self._captcha_verified_recently:
time_since_last_check = time.time() - self._last_captcha_check_time
if time_since_last_check < 5:
utils.logger.debug(
f"[BaiduTieBaClient] 最近 {time_since_last_check:.1f} 秒内完成过验证码,跳过检测"
)
return
else:
self._captcha_verified_recently = False
# 基础选择器(用于后续反复检测)
captcha_selectors = [
'.tang-pass-slider',
'#captcha',
'.vcode-img',
'.pass-verify',
'.tang-pass-verify',
'.pass-verify-slider',
'div[id*="captcha"]',
'div[class*="verify"]',
'div[class*="captcha"]',
]
success_selectors = [
'.tang-pass-success',
'.pass-verify-success',
'div[class*="success"]',
]
# 检测验证码是否存在
captcha_found = await _detect_captcha()
if captcha_found:
utils.logger.warning("[BaiduTieBaClient] 🔐 检测到验证码,请手动拖动完成验证...")
if not captcha_found:
return
# 记录当前URL,用于检测页面跳转
initial_url = self.playwright_page.url
utils.logger.info(f"[BaiduTieBaClient] 当前页面URL: {initial_url}")
utils.logger.info(f"[BaiduTieBaClient] ⏳ 等待用户手动完成验证码(最多等待 {max_wait_time} 秒)...")
start_time = time.time()
last_log_time = 0
check_interval = 1 # 检查间隔改为1秒,更快响应
while True:
# 检查是否超时
elapsed_time = time.time() - start_time
if elapsed_time >= max_wait_time:
utils.logger.warning(
f"[BaiduTieBaClient] ⏰ 等待验证码超时({max_wait_time}秒),跳过当前百度贴吧爬取任务"
)
# 超时直接中断本次百度贴吧爬虫,交给上层捕获处理
raise TimeoutError(
f"Baidu captcha wait timeout ({max_wait_time}s), skip tieba crawling"
)
try:
# 检测验证成功的标识
verification_success = False
for selector in success_selectors:
try:
element = await self.playwright_page.query_selector(selector)
if element:
is_visible = await element.is_visible()
if is_visible:
verification_success = True
utils.logger.info(f"[BaiduTieBaClient] ✅ 检测到验证成功标识 (selector: {selector})")
break
except Exception:
continue
# 检测验证码是否还存在
captcha_still_exists = False
for selector in captcha_selectors:
try:
element = await self.playwright_page.query_selector(selector)
if element:
is_visible = await element.is_visible()
if is_visible:
captcha_still_exists = True
break
except Exception:
continue
# 检测页面URL是否变化(验证成功后可能会跳转)
current_url = self.playwright_page.url
url_changed = current_url != initial_url
# 判断验证是否成功
# 成功条件:1. 验证码消失 2. 或者检测到成功标识 3. 或者URL变化(且不是验证码页面)
if verification_success or (not captcha_still_exists and url_changed):
# 验证码消失且URL变化,可能是验证成功后的跳转
utils.logger.info("[BaiduTieBaClient] 🔍 验证码已消失,检测到页面变化,等待3秒确认验证完成...")
await asyncio.sleep(3)
# 再次确认验证码是否真的消失了
captcha_still_exists = False
for selector in captcha_selectors:
try:
element = await self.playwright_page.query_selector(selector)
if element:
is_visible = await element.is_visible()
if is_visible:
captcha_still_exists = True
break
except Exception:
continue
if not captcha_still_exists:
# 确认验证成功
final_url = self.playwright_page.url
utils.logger.info(f"[BaiduTieBaClient] ✅ 验证码验证成功!")
if url_changed:
utils.logger.info(f"[BaiduTieBaClient] 📍 页面已跳转: {initial_url} -> {final_url}")
else:
utils.logger.info(f"[BaiduTieBaClient] 📍 页面URL未变化,验证在当前页面完成")
# 标记最近完成过验证码,避免立即再次检测
self._captcha_verified_recently = True
import time
self._last_captcha_check_time = time.time()
# 等待页面稳定,避免立即再次检测验证码
await asyncio.sleep(3)
# 验证成功后,再次检查是否又出现了验证码(防止跳转到新的验证码页面)
utils.logger.info("[BaiduTieBaClient] 🔍 验证成功后,检查是否又出现验证码...")
await asyncio.sleep(2)
captcha_reappeared = False
for selector in captcha_selectors:
try:
element = await self.playwright_page.query_selector(selector)
if element:
is_visible = await element.is_visible()
if is_visible:
captcha_reappeared = True
utils.logger.warning(f"[BaiduTieBaClient] ⚠️ 验证成功后检测到新的验证码 (selector: {selector}),继续等待...")
break
except Exception:
continue
if not captcha_reappeared:
utils.logger.info("[BaiduTieBaClient] ✅ 确认验证成功,未出现新的验证码,继续执行...")
break
else:
# 如果又出现了验证码,重置状态继续等待
utils.logger.warning("[BaiduTieBaClient] ⚠️ 检测到新的验证码,重置等待状态...")
initial_url = self.playwright_page.url
start_time = time.time()
continue
else:
# 验证码又出现了,可能验证失败或页面刷新
utils.logger.warning("[BaiduTieBaClient] ⚠️ 验证码重新出现,可能验证失败,继续等待...")
elif not captcha_still_exists and not url_changed:
# 验证码消失但URL未变化,可能是验证成功但未跳转
utils.logger.info("[BaiduTieBaClient] 🔍 验证码已消失,等待3秒确认验证完成...")
await asyncio.sleep(3)
# 再次确认
captcha_still_exists = False
for selector in captcha_selectors:
try:
element = await self.playwright_page.query_selector(selector)
if element:
is_visible = await element.is_visible()
if is_visible:
captcha_still_exists = True
break
except Exception:
continue
if not captcha_still_exists:
utils.logger.info("[BaiduTieBaClient] ✅ 验证码验证成功!")
# 标记最近完成过验证码
self._captcha_verified_recently = True
import time
self._last_captcha_check_time = time.time()
# 等待页面稳定
await asyncio.sleep(3)
# 验证成功后,再次检查是否又出现了验证码
utils.logger.info("[BaiduTieBaClient] 🔍 验证成功后,检查是否又出现验证码...")
await asyncio.sleep(2)
captcha_reappeared = False
for selector in captcha_selectors:
try:
element = await self.playwright_page.query_selector(selector)
if element:
is_visible = await element.is_visible()
if is_visible:
captcha_reappeared = True
utils.logger.warning(f"[BaiduTieBaClient] ⚠️ 验证成功后检测到新的验证码 (selector: {selector}),继续等待...")
break
except Exception:
continue
if not captcha_reappeared:
utils.logger.info("[BaiduTieBaClient] ✅ 确认验证成功,未出现新的验证码,继续执行...")
break
else:
# 如果又出现了验证码,重置状态继续等待
utils.logger.warning("[BaiduTieBaClient] ⚠️ 检测到新的验证码,重置等待状态...")
initial_url = self.playwright_page.url
start_time = time.time()
continue
except Exception as e:
# 如果检测过程中出现异常,继续等待
utils.logger.debug(f"[BaiduTieBaClient] 验证码检测异常: {e}")
# 等待一段时间后再次检查
await asyncio.sleep(check_interval)
# 每10秒输出一次提示
current_time = int(elapsed_time)
if current_time != last_log_time and current_time % 10 == 0 and current_time > 0:
remaining_time = max_wait_time - current_time
utils.logger.info(f"[BaiduTieBaClient] ⏳ 仍在等待验证码完成...(剩余 {remaining_time} 秒)")
last_log_time = current_time
async def get_notes_by_keyword(
self,
keyword: str,
@@ -253,6 +536,9 @@ class BaiduTieBaClient(AbstractApiClient):
# 使用Playwright访问搜索页面
await self.playwright_page.goto(full_url, wait_until="domcontentloaded")
# 检测并等待验证码完成
await self._wait_for_captcha_completion()
# 等待页面加载,使用配置文件中的延时设置
await asyncio.sleep(config.CRAWLER_MAX_SLEEP_SEC)
@@ -290,6 +576,9 @@ class BaiduTieBaClient(AbstractApiClient):
# 使用Playwright访问帖子详情页面
await self.playwright_page.goto(note_url, wait_until="domcontentloaded")
# 检测并等待验证码完成
await self._wait_for_captcha_completion()
# 等待页面加载,使用配置文件中的延时设置
await asyncio.sleep(config.CRAWLER_MAX_SLEEP_SEC)
@@ -340,6 +629,9 @@ class BaiduTieBaClient(AbstractApiClient):
# 使用Playwright访问评论页面
await self.playwright_page.goto(comment_url, wait_until="domcontentloaded")
# 检测并等待验证码完成
await self._wait_for_captcha_completion()
# 等待页面加载,使用配置文件中的延时设置
await asyncio.sleep(config.CRAWLER_MAX_SLEEP_SEC)
@@ -448,6 +740,9 @@ class BaiduTieBaClient(AbstractApiClient):
# 使用Playwright访问子评论页面
await self.playwright_page.goto(sub_comment_url, wait_until="domcontentloaded")
# 检测并等待验证码完成
await self._wait_for_captcha_completion()
# 等待页面加载,使用配置文件中的延时设置
await asyncio.sleep(config.CRAWLER_MAX_SLEEP_SEC)
@@ -527,6 +822,9 @@ class BaiduTieBaClient(AbstractApiClient):
# 使用Playwright访问贴吧页面
await self.playwright_page.goto(tieba_url, wait_until="domcontentloaded")
# 检测并等待验证码完成
await self._wait_for_captcha_completion()
# 等待页面加载,使用配置文件中的延时设置
await asyncio.sleep(config.CRAWLER_MAX_SLEEP_SEC)
@@ -562,6 +860,9 @@ class BaiduTieBaClient(AbstractApiClient):
# 使用Playwright访问创作者主页
await self.playwright_page.goto(creator_url, wait_until="domcontentloaded")
# 检测并等待验证码完成
await self._wait_for_captcha_completion()
# 等待页面加载,使用配置文件中的延时设置
await asyncio.sleep(config.CRAWLER_MAX_SLEEP_SEC)
@@ -597,6 +898,9 @@ class BaiduTieBaClient(AbstractApiClient):
# 使用Playwright访问创作者帖子列表页面
await self.playwright_page.goto(creator_url, wait_until="domcontentloaded")
# 检测并等待验证码完成
await self._wait_for_captcha_completion()
# 等待页面加载,使用配置文件中的延时设置
await asyncio.sleep(config.CRAWLER_MAX_SLEEP_SEC)
@@ -58,11 +58,47 @@ class WeiboClient:
if enable_return_response:
return response
# 检查响应状态码
if response.status_code != 200:
error_msg = f"HTTP {response.status_code}: {response.text[:200]}"
utils.logger.error(f"[WeiboClient.request] request {method}:{url} failed with status {response.status_code}")
raise DataFetchError(error_msg)
# 检查响应内容类型
content_type = response.headers.get("content-type", "").lower()
if "application/json" not in content_type and "text/json" not in content_type:
# 可能是HTML响应(如登录页面)
response_text = response.text[:500]
utils.logger.warning(f"[WeiboClient.request] Unexpected content type: {content_type}, response preview: {response_text}")
# 如果看起来像是HTML,可能是需要登录
if "<html" in response_text.lower() or "<!doctype" in response_text.lower():
raise DataFetchError("Response is HTML, may need to login or cookie expired")
raise DataFetchError(f"Unexpected content type: {content_type}")
# 安全地解析JSON
try:
data: Dict = response.json()
except ValueError as e:
# JSON解析失败
response_text = response.text[:500]
utils.logger.error(f"[WeiboClient.request] JSON decode error for {method}:{url}")
utils.logger.error(f"[WeiboClient.request] Response text (first 500 chars): {response_text}")
raise DataFetchError(f"Failed to parse JSON response: {e}")
# 检查响应是否为空
if not data:
utils.logger.warning(f"[WeiboClient.request] Empty response for {method}:{url}")
return {"cards": []}
ok_code = data.get("ok")
if ok_code == 0: # response error
msg = data.get("msg", "response error")
# "这里还没有内容" 是正常情况,表示没有更多数据,不应该抛出异常
if msg == "这里还没有内容" or "还没有内容" in msg:
utils.logger.info(f"[WeiboClient.request] No more content available: {msg}")
return {"cards": []} # 返回空结果,而不是抛出异常
utils.logger.error(f"[WeiboClient.request] request {method}:{url} err, res:{data}")
raise DataFetchError(data.get("msg", "response error"))
raise DataFetchError(msg)
elif ok_code != 1: # unknown error
utils.logger.error(f"[WeiboClient.request] request {method}:{url} err, res:{data}")
raise DataFetchError(data.get("msg", "unknown error"))
@@ -15,6 +15,7 @@
import asyncio
import os
import re
# import random # Removed as we now use fixed config.CRAWLER_MAX_SLEEP_SEC intervals
from asyncio import Task
from typing import Dict, List, Optional, Tuple
@@ -56,6 +57,17 @@ class WeiboCrawler(AbstractCrawler):
self.cdp_manager = None
async def start(self):
# 初始化数据库表(如果需要)
if config.SAVE_DATA_OPTION in ["db", "sqlite", "postgresql"]:
try:
from database.db_session import create_tables
utils.logger.info(f"[WeiboCrawler.start] Initializing database tables for {config.SAVE_DATA_OPTION}...")
await create_tables(config.SAVE_DATA_OPTION)
utils.logger.info(f"[WeiboCrawler.start] Database tables initialized successfully")
except Exception as e:
utils.logger.error(f"[WeiboCrawler.start] Failed to initialize database tables: {e}", exc_info=True)
raise
playwright_proxy_format, httpx_proxy_format = None, None
if config.ENABLE_IP_PROXY:
ip_proxy_pool = await create_ip_pool(config.IP_PROXY_POOL_COUNT, enable_validate_ip=True)
@@ -151,16 +163,39 @@ class WeiboCrawler(AbstractCrawler):
page += 1
continue
utils.logger.info(f"[WeiboCrawler.search] search weibo keyword: {keyword}, page: {page}")
try:
search_res = await self.wb_client.get_note_by_keyword(keyword=keyword, page=page, search_type=search_type)
cards = search_res.get("cards", [])
utils.logger.info(f"[WeiboCrawler.search] Received {len(cards)} cards from search API")
# 如果没有更多内容,跳出循环
if len(cards) == 0:
utils.logger.info(f"[WeiboCrawler.search] No more content for keyword '{keyword}', stopping pagination")
break
except DataFetchError as e:
# 如果是"没有内容"的错误,正常结束
if "还没有内容" in str(e) or "没有内容" in str(e):
utils.logger.info(f"[WeiboCrawler.search] No more content for keyword '{keyword}': {e}")
break
# 其他错误继续抛出
raise
note_id_list: List[str] = []
note_list = filter_search_result_card(search_res.get("cards"))
note_list = filter_search_result_card(cards)
utils.logger.info(f"[WeiboCrawler.search] Filtered to {len(note_list)} notes (card_type=9)")
for note_item in note_list:
if note_item:
mblog: Dict = note_item.get("mblog")
if mblog:
note_id_list.append(mblog.get("id"))
note_id = mblog.get("id")
note_id_list.append(note_id)
try:
await weibo_store.update_weibo_note(note_item)
await self.get_note_images(mblog)
except Exception as e:
utils.logger.error(f"[WeiboCrawler.search] Failed to save note {note_id}: {e}", exc_info=True)
# 继续处理其他笔记,不中断整个流程
page += 1
@@ -34,7 +34,7 @@ class ZhiHuClient(AbstractApiClient):
def __init__(
self,
timeout=10,
timeout=30, # 增加超时时间到30秒,避免请求卡住
proxy=None,
*,
headers: Dict[str, str],
@@ -57,7 +57,8 @@ class ZhiHuClient(AbstractApiClient):
"""
d_c0 = self.cookie_dict.get("d_c0")
if not d_c0:
raise Exception("d_c0 not found in cookies")
utils.logger.error(f"[ZhiHuClient._pre_headers] d_c0 not found in cookies. Available cookies: {list(self.cookie_dict.keys())}")
raise Exception("d_c0 not found in cookies. Please make sure you have logged in and cookies are updated.")
sign_res = sign(url, self.default_headers["cookie"])
headers = self.default_headers.copy()
headers['x-zst-81'] = sign_res["x-zst-81"]
@@ -184,6 +185,7 @@ class ZhiHuClient(AbstractApiClient):
Returns:
"""
utils.logger.info(f"[ZhiHuClient.get_note_by_keyword] 开始搜索关键词: {keyword}, 页码: {page}")
uri = "/api/v4/search_v3"
params = {
"gk_version": "gz-gaokao",
@@ -200,9 +202,16 @@ class ZhiHuClient(AbstractApiClient):
"sort": sort.value,
"vertical": note_type.value,
}
try:
utils.logger.debug(f"[ZhiHuClient.get_note_by_keyword] 发送搜索请求: {uri}, params: {params}")
search_res = await self.get(uri, params)
utils.logger.info(f"[ZhiHuClient.get_note_by_keyword] Search result: {search_res}")
return self._extractor.extract_contents_from_search(search_res)
utils.logger.info(f"[ZhiHuClient.get_note_by_keyword] 搜索请求成功,开始解析结果")
contents = self._extractor.extract_contents_from_search(search_res)
utils.logger.info(f"[ZhiHuClient.get_note_by_keyword] 解析完成,找到 {len(contents)} 条内容")
return contents
except Exception as e:
utils.logger.error(f"[ZhiHuClient.get_note_by_keyword] 搜索失败: {e}", exc_info=True)
raise
async def get_root_comments(
self,
@@ -90,7 +90,9 @@ class ZhihuCrawler(AbstractCrawler):
await self.browser_context.add_init_script(path="libs/stealth.min.js")
self.context_page = await self.browser_context.new_page()
await self.context_page.goto(self.index_url, wait_until="domcontentloaded")
# 设置页面超时时间为30秒
self.context_page.set_default_timeout(30000)
await self.context_page.goto(self.index_url, wait_until="domcontentloaded", timeout=30000)
# Create a client to interact with the zhihu website.
self.zhihu_client = await self.create_zhihu_client(httpx_proxy_format)
@@ -103,38 +105,83 @@ class ZhihuCrawler(AbstractCrawler):
cookie_str=config.COOKIES,
)
await login_obj.begin()
# 登录后等待页面稳定
await asyncio.sleep(2)
await self.zhihu_client.update_cookies(
browser_context=self.browser_context
)
# 知乎的搜索接口需要打开搜索页面之后cookies才能访问API,单独的首页不行
# 使用用户配置的第一个关键词,如果没有关键词则使用默认的"test"
search_keyword = "test" # 默认关键词
if config.KEYWORDS and config.KEYWORDS.strip():
keywords_list = [k.strip() for k in config.KEYWORDS.split(",") if k.strip()]
if keywords_list:
search_keyword = keywords_list[0]
utils.logger.info(f"[ZhihuCrawler.start] 使用用户关键词 '{search_keyword}' 初始化搜索页面")
else:
utils.logger.warning(f"[ZhihuCrawler.start] 关键词配置为空,使用默认关键词 'test'")
else:
utils.logger.warning(f"[ZhihuCrawler.start] 未配置关键词,使用默认关键词 'test'")
utils.logger.info(
"[ZhihuCrawler.start] Zhihu跳转到搜索页面获取搜索页面的Cookies,该过程需要5秒左右"
f"[ZhihuCrawler.start] ========== 准备跳转到搜索页面获取Cookies =========="
)
utils.logger.info(
f"[ZhihuCrawler.start] 关键词: {search_keyword}"
)
try:
# 使用用户配置的关键词,而不是硬编码的python
from urllib.parse import quote
encoded_keyword = quote(search_keyword)
search_url = f"{self.index_url}/search?q={encoded_keyword}&search_source=Guess&utm_content=search_hot&type=content"
utils.logger.info(f"[ZhihuCrawler.start] 跳转到搜索页面: {search_url}")
# 添加超时时间,避免卡住
await self.context_page.goto(
f"{self.index_url}/search?q=python&search_source=Guess&utm_content=search_hot&type=content"
search_url,
wait_until="domcontentloaded",
timeout=30000
)
await asyncio.sleep(5)
utils.logger.info("[ZhihuCrawler.start] 页面跳转完成,等待页面稳定...")
# 等待页面基本加载完成,不等待networkidle(知乎页面可能一直有请求)
await asyncio.sleep(3)
utils.logger.info("[ZhihuCrawler.start] 搜索页面已加载,开始更新cookies")
await self.zhihu_client.update_cookies(browser_context=self.browser_context)
utils.logger.info("[ZhihuCrawler.start] ========== Cookies更新完成 ==========")
except Exception as e:
utils.logger.error(f"[ZhihuCrawler.start] 跳转到搜索页面失败: {e},尝试继续执行", exc_info=True)
# 即使跳转失败,也尝试更新cookies
try:
await self.zhihu_client.update_cookies(browser_context=self.browser_context)
utils.logger.info("[ZhihuCrawler.start] Cookies更新完成(跳转失败后)")
except Exception as cookie_error:
utils.logger.error(f"[ZhihuCrawler.start] 更新cookies失败: {cookie_error}", exc_info=True)
crawler_type_var.set(config.CRAWLER_TYPE)
utils.logger.info(f"[ZhihuCrawler.start] ========== 开始执行爬取任务 ==========")
utils.logger.info(f"[ZhihuCrawler.start] 爬取类型: {config.CRAWLER_TYPE}")
if config.CRAWLER_TYPE == "search":
# Search for notes and retrieve their comment information.
utils.logger.info("[ZhihuCrawler.start] 准备开始搜索关键词")
await self.search()
elif config.CRAWLER_TYPE == "detail":
# Get the information and comments of the specified post
utils.logger.info("[ZhihuCrawler.start] 准备开始获取指定帖子详情")
await self.get_specified_notes()
elif config.CRAWLER_TYPE == "creator":
# Get creator's information and their notes and comments
utils.logger.info("[ZhihuCrawler.start] 准备开始获取创作者信息")
await self.get_creators_and_notes()
else:
pass
utils.logger.warning(f"[ZhihuCrawler.start] 未知的爬取类型: {config.CRAWLER_TYPE}")
utils.logger.info("[ZhihuCrawler.start] Zhihu Crawler finished ...")
async def search(self) -> None:
"""Search for notes and retrieve their comment information."""
utils.logger.info("[ZhihuCrawler.search] Begin search zhihu keywords")
utils.logger.info("[ZhihuCrawler.search] ========== 开始搜索知乎关键词 ==========")
zhihu_limit_count = 20 # zhihu limit page fixed value
if config.CRAWLER_MAX_NOTES_COUNT < zhihu_limit_count:
config.CRAWLER_MAX_NOTES_COUNT = zhihu_limit_count
@@ -145,7 +192,19 @@ class ZhihuCrawler(AbstractCrawler):
total_failed_contents = 0
total_saved_comments = 0
for keyword in config.KEYWORDS.split(","):
# 安全地处理关键词列表
if not config.KEYWORDS or not config.KEYWORDS.strip():
utils.logger.error("[ZhihuCrawler.search] 关键词配置为空,无法执行搜索任务")
return
keywords_list = [k.strip() for k in config.KEYWORDS.split(",") if k.strip()]
if not keywords_list:
utils.logger.error("[ZhihuCrawler.search] 关键词列表为空,无法执行搜索任务")
return
utils.logger.info(f"[ZhihuCrawler.search] 关键词列表: {keywords_list}, 共 {len(keywords_list)} 个关键词")
for keyword in keywords_list:
source_keyword_var.set(keyword)
utils.logger.info(
f"[ZhihuCrawler.search] Current search keyword: {keyword}"
@@ -420,6 +479,18 @@ class ZhihuCrawler(AbstractCrawler):
cookie_str, cookie_dict = utils.convert_cookies(
await self.browser_context.cookies()
)
# 获取用户配置的关键词用于 referer,如果没有则使用默认值
referer_keyword = "test"
if config.KEYWORDS and config.KEYWORDS.strip():
keywords_list = [k.strip() for k in config.KEYWORDS.split(",") if k.strip()]
if keywords_list:
referer_keyword = keywords_list[0]
from urllib.parse import quote
encoded_referer_keyword = quote(referer_keyword)
referer_url = f"https://www.zhihu.com/search?q={encoded_referer_keyword}&time_interval=a_year&type=content"
zhihu_client_obj = ZhiHuClient(
proxy=httpx_proxy,
headers={
@@ -427,7 +498,7 @@ class ZhihuCrawler(AbstractCrawler):
"accept-language": "zh-CN,zh;q=0.9",
"cookie": cookie_str,
"priority": "u=1, i",
"referer": "https://www.zhihu.com/search?q=python&time_interval=a_year&type=content",
"referer": referer_url,
"user-agent": self.user_agent,
"x-api-version": "3.0.91",
"x-app-za": "OS=Web",
@@ -119,6 +119,32 @@ class BiliDbStoreImplement(AbstractStore):
content_item: content item dict
"""
video_id = content_item.get("video_id")
if not video_id:
return
# 关键词过滤:仅在落库时进行,仅对主贴/视频过滤,不过滤评论
# 支持精确匹配和模糊匹配两种模式
try:
import sys
from pathlib import Path
project_root = Path(__file__).resolve().parents[4]
if str(project_root) not in sys.path:
sys.path.insert(0, str(project_root))
from config import settings
title = content_item.get("title", "")
desc = content_item.get("desc", "")
content_text = title + " " + desc
strict_keywords = getattr(settings, 'STRICT_KEYWORDS', None)
fuzzy_keywords = getattr(settings, 'FUZZY_KEYWORDS', None)
if strict_keywords or fuzzy_keywords:
if not utils.check_keyword_match_with_modes(content_text, strict_keywords, fuzzy_keywords):
utils.logger.warning(f"[BilibiliDbStoreImplement.store_content] ❌ Filtered video {video_id} - content does not match any keyword")
return
except Exception as e:
utils.logger.debug(f"[BilibiliDbStoreImplement.store_content] Failed to load keyword config: {e}")
# 确保 video_id 为整数类型,匹配数据库 BigInteger 字段
if video_id is not None:
video_id = int(video_id) if not isinstance(video_id, int) else video_id
@@ -88,6 +88,30 @@ class DouyinDbStoreImplement(AbstractStore):
content_item: content item dict
"""
aweme_id = content_item.get("aweme_id")
if not aweme_id:
return
# 关键词过滤:仅在落库时进行,仅对主贴/视频过滤,不过滤评论
# 支持精确匹配和模糊匹配两种模式
try:
import sys
from pathlib import Path
project_root = Path(__file__).resolve().parents[4]
if str(project_root) not in sys.path:
sys.path.insert(0, str(project_root))
from config import settings
desc = content_item.get("desc", "")
strict_keywords = getattr(settings, 'STRICT_KEYWORDS', None)
fuzzy_keywords = getattr(settings, 'FUZZY_KEYWORDS', None)
if strict_keywords or fuzzy_keywords:
if not utils.check_keyword_match_with_modes(desc, strict_keywords, fuzzy_keywords):
utils.logger.warning(f"[DouyinDbStoreImplement.store_content] ❌ Filtered aweme {aweme_id} - content does not match any keyword")
return
except Exception as e:
utils.logger.debug(f"[DouyinDbStoreImplement.store_content] Failed to load keyword config: {e}")
async with get_session() as session:
result = await session.execute(select(DouyinAweme).where(DouyinAweme.aweme_id == aweme_id))
aweme_detail = result.scalar_one_or_none()
@@ -89,6 +89,30 @@ class KuaishouDbStoreImplement(AbstractStore):
content_item: content item dict
"""
video_id = content_item.get("video_id")
if not video_id:
return
# 关键词过滤:仅在落库时进行,仅对主贴/视频过滤,不过滤评论
# 支持精确匹配和模糊匹配两种模式
try:
import sys
from pathlib import Path
project_root = Path(__file__).resolve().parents[4]
if str(project_root) not in sys.path:
sys.path.insert(0, str(project_root))
from config import settings
caption = content_item.get("caption", "")
strict_keywords = getattr(settings, 'STRICT_KEYWORDS', None)
fuzzy_keywords = getattr(settings, 'FUZZY_KEYWORDS', None)
if strict_keywords or fuzzy_keywords:
if not utils.check_keyword_match_with_modes(caption, strict_keywords, fuzzy_keywords):
utils.logger.warning(f"[KuaishouDbStoreImplement.store_content] ❌ Filtered video {video_id} - content does not match any keyword")
return
except Exception as e:
utils.logger.debug(f"[KuaishouDbStoreImplement.store_content] Failed to load keyword config: {e}")
async with get_session() as session:
result = await session.execute(select(KuaishouVideo).where(KuaishouVideo.video_id == video_id))
video_detail = result.scalar_one_or_none()
@@ -95,6 +95,32 @@ class TieBaDbStoreImplement(AbstractStore):
content_item: content item dict
"""
note_id = content_item.get("note_id")
if not note_id:
return
# 关键词过滤:仅在落库时进行,仅对主贴/视频过滤,不过滤评论
# 支持精确匹配和模糊匹配两种模式
try:
import sys
from pathlib import Path
project_root = Path(__file__).resolve().parents[4]
if str(project_root) not in sys.path:
sys.path.insert(0, str(project_root))
from config import settings
title = content_item.get("title", "")
text = content_item.get("text", "")
content_text = title + " " + text
strict_keywords = getattr(settings, 'STRICT_KEYWORDS', None)
fuzzy_keywords = getattr(settings, 'FUZZY_KEYWORDS', None)
if strict_keywords or fuzzy_keywords:
if not utils.check_keyword_match_with_modes(content_text, strict_keywords, fuzzy_keywords):
utils.logger.warning(f"[TiebaDbStoreImplement.store_content] ❌ Filtered note {note_id} - content does not match any keyword")
return
except Exception as e:
utils.logger.debug(f"[TiebaDbStoreImplement.store_content] Failed to load keyword config: {e}")
async with get_session() as session:
stmt = select(TiebaNote).where(TiebaNote.note_id == note_id)
res = await session.execute(stmt)
@@ -93,7 +93,12 @@ async def update_weibo_note(note_item: Dict):
"source_keyword": source_keyword_var.get(),
}
utils.logger.info(f"[store.weibo.update_weibo_note] weibo note id:{note_id}, title:{save_content_item.get('content')[:24]} ...")
try:
await WeibostoreFactory.create_store().store_content(content_item=save_content_item)
utils.logger.debug(f"[store.weibo.update_weibo_note] Successfully saved note {note_id}")
except Exception as e:
utils.logger.error(f"[store.weibo.update_weibo_note] Failed to save note {note_id}: {e}", exc_info=True)
raise
async def batch_update_weibo_note_comments(note_id: str, comments: List[Dict]):
@@ -148,7 +153,12 @@ async def update_weibo_note_comment(note_id: str, comment_item: Dict):
"avatar": user_info.get("profile_image_url", ""),
}
utils.logger.info(f"[store.weibo.update_weibo_note_comment] Weibo note comment: {comment_id}, content: {save_comment_item.get('content', '')[:24]} ...")
try:
await WeibostoreFactory.create_store().store_comment(comment_item=save_comment_item)
utils.logger.debug(f"[store.weibo.update_weibo_note_comment] Successfully saved comment {comment_id}")
except Exception as e:
utils.logger.error(f"[store.weibo.update_weibo_note_comment] Failed to save comment {comment_id}: {e}", exc_info=True)
raise
async def update_weibo_note_image(picid: str, pic_content, extension_file_name):
@@ -21,7 +21,7 @@ import pathlib
from typing import Dict
import aiofiles
from sqlalchemy import select
from sqlalchemy import select, text
from sqlalchemy.ext.asyncio import AsyncSession
import config
@@ -29,7 +29,7 @@ from base.base_crawler import AbstractStore
from database.models import WeiboCreator, WeiboNote, WeiboNoteComment
from tools import utils, words
from tools.async_file_writer import AsyncFileWriter
from database.db_session import get_session
from database.db_session import get_session, get_async_engine
from var import crawler_type_var
@@ -89,6 +89,33 @@ class WeiboCsvStoreImplement(AbstractStore):
class WeiboDbStoreImplement(AbstractStore):
def __init__(self, **kwargs):
super().__init__(**kwargs)
async def _check_connection(self):
"""检查数据库连接是否正常(使用类变量缓存检查结果)"""
# 使用类变量缓存检查结果,避免重复检查
if not hasattr(WeiboDbStoreImplement, '_global_connection_checked'):
WeiboDbStoreImplement._global_connection_checked = False
if WeiboDbStoreImplement._global_connection_checked:
return True
try:
engine = get_async_engine(config.SAVE_DATA_OPTION)
if engine is None:
utils.logger.error(f"[WeiboDbStoreImplement._check_connection] Engine is None for SAVE_DATA_OPTION={config.SAVE_DATA_OPTION}")
return False
async with engine.connect() as conn:
await conn.execute(text("SELECT 1"))
WeiboDbStoreImplement._global_connection_checked = True
utils.logger.info(f"[WeiboDbStoreImplement._check_connection] Database connection verified")
return True
except Exception as e:
utils.logger.error(f"[WeiboDbStoreImplement._check_connection] Database connection failed: {e}", exc_info=True)
return False
async def store_content(self, content_item: Dict):
"""
Weibo content DB storage implementation
@@ -99,21 +126,62 @@ class WeiboDbStoreImplement(AbstractStore):
"""
note_id = content_item.get("note_id")
if not note_id:
utils.logger.error(f"[WeiboDbStoreImplement.store_content] note_id is missing in content_item: {content_item}")
return
# 关键词过滤:仅在落库时进行,仅对主贴/视频过滤,不过滤评论
# 支持精确匹配和模糊匹配两种模式
try:
import sys
from pathlib import Path
project_root = Path(__file__).resolve().parents[4]
if str(project_root) not in sys.path:
sys.path.insert(0, str(project_root))
from config import settings
content_text = content_item.get("content", "")
strict_keywords = getattr(settings, 'STRICT_KEYWORDS', None)
fuzzy_keywords = getattr(settings, 'FUZZY_KEYWORDS', None)
if strict_keywords or fuzzy_keywords:
if not utils.check_keyword_match_with_modes(content_text, strict_keywords, fuzzy_keywords):
utils.logger.warning(f"[WeiboDbStoreImplement.store_content] ❌ Filtered note {note_id} - content does not match any keyword")
return
except Exception as e:
utils.logger.debug(f"[WeiboDbStoreImplement.store_content] Failed to load keyword config: {e}")
# 检查数据库连接
if not await self._check_connection():
utils.logger.error(f"[WeiboDbStoreImplement.store_content] Database connection check failed, skipping save for note {note_id}")
return
try:
async with get_session() as session:
if session is None:
utils.logger.error(f"[WeiboDbStoreImplement.store_content] Database session is None, check SAVE_DATA_OPTION config")
return
stmt = select(WeiboNote).where(WeiboNote.note_id == note_id)
res = await session.execute(stmt)
db_note = res.scalar_one_or_none()
if db_note:
utils.logger.debug(f"[WeiboDbStoreImplement.store_content] Updating existing note {note_id}")
db_note.last_modify_ts = utils.get_current_timestamp()
for key, value in content_item.items():
if hasattr(db_note, key):
setattr(db_note, key, value)
else:
utils.logger.debug(f"[WeiboDbStoreImplement.store_content] Creating new note {note_id}")
content_item["add_ts"] = utils.get_current_timestamp()
content_item["last_modify_ts"] = utils.get_current_timestamp()
db_note = WeiboNote(**content_item)
session.add(db_note)
await session.commit()
utils.logger.debug(f"[WeiboDbStoreImplement.store_content] Successfully committed note {note_id} to database")
except Exception as e:
utils.logger.error(f"[WeiboDbStoreImplement.store_content] Database error saving note {note_id}: {e}", exc_info=True)
raise
async def store_comment(self, comment_item: Dict):
"""
@@ -125,21 +193,36 @@ class WeiboDbStoreImplement(AbstractStore):
"""
comment_id = comment_item.get("comment_id")
if not comment_id:
utils.logger.error(f"[WeiboDbStoreImplement.store_comment] comment_id is missing in comment_item: {comment_item}")
return
try:
async with get_session() as session:
if session is None:
utils.logger.error(f"[WeiboDbStoreImplement.store_comment] Database session is None, check SAVE_DATA_OPTION config")
return
stmt = select(WeiboNoteComment).where(WeiboNoteComment.comment_id == comment_id)
res = await session.execute(stmt)
db_comment = res.scalar_one_or_none()
if db_comment:
utils.logger.debug(f"[WeiboDbStoreImplement.store_comment] Updating existing comment {comment_id}")
db_comment.last_modify_ts = utils.get_current_timestamp()
for key, value in comment_item.items():
if hasattr(db_comment, key):
setattr(db_comment, key, value)
else:
utils.logger.debug(f"[WeiboDbStoreImplement.store_comment] Creating new comment {comment_id}")
comment_item["add_ts"] = utils.get_current_timestamp()
comment_item["last_modify_ts"] = utils.get_current_timestamp()
db_comment = WeiboNoteComment(**comment_item)
session.add(db_comment)
await session.commit()
utils.logger.debug(f"[WeiboDbStoreImplement.store_comment] Successfully committed comment {comment_id} to database")
except Exception as e:
utils.logger.error(f"[WeiboDbStoreImplement.store_comment] Database error saving comment {comment_id}: {e}", exc_info=True)
raise
async def store_creator(self, creator: Dict):
"""
@@ -89,6 +89,34 @@ class XhsDbStoreImplement(AbstractStore):
note_id = content_item.get("note_id")
if not note_id:
return
# 关键词过滤:仅在落库时进行,仅对主贴/视频过滤,不过滤评论
# 支持精确匹配和模糊匹配两种模式
try:
import sys
from pathlib import Path
# 添加项目根目录到路径,以便导入 MindSpider 的 config
project_root = Path(__file__).resolve().parents[4]
if str(project_root) not in sys.path:
sys.path.insert(0, str(project_root))
from config import settings
title = content_item.get("title", "")
desc = content_item.get("desc", "")
content_text = title + " " + desc
strict_keywords = getattr(settings, 'STRICT_KEYWORDS', None)
fuzzy_keywords = getattr(settings, 'FUZZY_KEYWORDS', None)
# 如果配置了关键词,进行匹配检查
if strict_keywords or fuzzy_keywords:
if not utils.check_keyword_match_with_modes(content_text, strict_keywords, fuzzy_keywords):
utils.logger.warning(f"[XhsDbStoreImplement.store_content] ❌ Filtered note {note_id} - content does not match any keyword")
return
except Exception as e:
# 如果配置读取失败,记录警告但不阻止保存(向后兼容)
utils.logger.debug(f"[XhsDbStoreImplement.store_content] Failed to load keyword config: {e}")
async with get_session() as session:
if await self.content_is_exist(session, note_id):
await self.update_content(session, content_item)
@@ -85,8 +85,21 @@ async def batch_update_zhihu_note_comments(comments: List[ZhihuComment]):
if not comments:
return
success_count = 0
error_count = 0
for comment_item in comments:
try:
await update_zhihu_content_comment(comment_item)
success_count += 1
except Exception as e:
error_count += 1
comment_id = getattr(comment_item, 'comment_id', 'unknown')
utils.logger.error(f"[store.zhihu.batch_update_zhihu_note_comments] 保存评论失败 (comment_id={comment_id}): {e}", exc_info=True)
if error_count > 0:
utils.logger.warning(f"[store.zhihu.batch_update_zhihu_note_comments] 批量保存完成: 成功 {success_count} 条, 失败 {error_count}")
else:
utils.logger.info(f"[store.zhihu.batch_update_zhihu_note_comments] 批量保存完成: 成功 {success_count}")
async def update_zhihu_content_comment(comment_item: ZhihuComment):
@@ -98,10 +111,17 @@ async def update_zhihu_content_comment(comment_item: ZhihuComment):
Returns:
"""
try:
local_db_item = comment_item.model_dump()
local_db_item.update({"last_modify_ts": utils.get_current_timestamp()})
utils.logger.info(f"[store.zhihu.update_zhihu_note_comment] zhihu content comment:{local_db_item}")
# 使用更安全的日志记录方式,避免编码问题导致日志输出异常
comment_id = local_db_item.get('comment_id', 'unknown')
utils.logger.debug(f"[store.zhihu.update_zhihu_note_comment] 准备保存评论: comment_id={comment_id}")
await ZhihuStoreFactory.create_store().store_comment(local_db_item)
except Exception as e:
comment_id = getattr(comment_item, 'comment_id', 'unknown')
utils.logger.error(f"[store.zhihu.update_zhihu_note_comment] 保存评论异常 (comment_id={comment_id}): {e}", exc_info=True)
raise
async def save_creator(creator: ZhihuCreator):
@@ -94,23 +94,71 @@ class ZhihuDbStoreImplement(AbstractStore):
content_item: content item dict
"""
content_id = content_item.get("content_id")
if not content_id:
return
# 关键词过滤:仅在落库时进行,仅对主贴/视频过滤,不过滤评论
# 支持精确匹配和模糊匹配两种模式
try:
import sys
from pathlib import Path
project_root = Path(__file__).resolve().parents[4]
if str(project_root) not in sys.path:
sys.path.insert(0, str(project_root))
from config import settings
title = content_item.get("title", "")
content = content_item.get("content", "")
content_text = title + " " + content
strict_keywords = getattr(settings, 'STRICT_KEYWORDS', None)
fuzzy_keywords = getattr(settings, 'FUZZY_KEYWORDS', None)
if strict_keywords or fuzzy_keywords:
if not utils.check_keyword_match_with_modes(content_text, strict_keywords, fuzzy_keywords):
utils.logger.warning(f"[ZhihuDbStoreImplement.store_content] ❌ Filtered content {content_id} - content does not match any keyword")
return
except Exception as e:
utils.logger.debug(f"[ZhihuDbStoreImplement.store_content] Failed to load keyword config: {e}")
try:
# 确保所有字符串值都是正确的UTF-8编码
cleaned_item = {}
for key, value in content_item.items():
if isinstance(value, bytes):
# 如果是bytes类型,尝试解码为UTF-8
try:
value = value.decode('utf-8')
except UnicodeDecodeError:
# 如果UTF-8解码失败,尝试其他编码
try:
value = value.decode('gbk', errors='replace')
except:
value = value.decode('utf-8', errors='replace')
elif isinstance(value, str):
# 确保字符串是有效的UTF-8
try:
value.encode('utf-8')
except UnicodeEncodeError:
# 如果编码失败,尝试修复
value = value.encode('utf-8', errors='replace').decode('utf-8', errors='replace')
cleaned_item[key] = value
async with get_session() as session:
stmt = select(ZhihuContent).where(ZhihuContent.content_id == content_id)
result = await session.execute(stmt)
existing_content = result.scalars().first()
if existing_content:
for key, value in content_item.items():
for key, value in cleaned_item.items():
setattr(existing_content, key, value)
utils.logger.debug(f"[ZhihuDbStore] 更新内容: {content_id}")
else:
new_content = ZhihuContent(**content_item)
new_content = ZhihuContent(**cleaned_item)
session.add(new_content)
utils.logger.debug(f"[ZhihuDbStore] 新增内容: {content_id}")
await session.commit()
utils.logger.info(f"[ZhihuDbStore] 成功保存内容到数据库: {content_id}")
except Exception as e:
utils.logger.error(f"[ZhihuDbStore] 保存内容失败 (content_id={content_id}): {e}")
utils.logger.error(f"[ZhihuDbStore] 保存内容失败 (content_id={content_id}): {e}", exc_info=True)
raise
async def store_comment(self, comment_item: Dict):
@@ -121,22 +169,44 @@ class ZhihuDbStoreImplement(AbstractStore):
"""
comment_id = comment_item.get("comment_id")
try:
# 确保所有字符串值都是正确的UTF-8编码
cleaned_item = {}
for key, value in comment_item.items():
if isinstance(value, bytes):
# 如果是bytes类型,尝试解码为UTF-8
try:
value = value.decode('utf-8')
except UnicodeDecodeError:
# 如果UTF-8解码失败,尝试其他编码
try:
value = value.decode('gbk', errors='replace')
except:
value = value.decode('utf-8', errors='replace')
elif isinstance(value, str):
# 确保字符串是有效的UTF-8
try:
value.encode('utf-8')
except UnicodeEncodeError:
# 如果编码失败,尝试修复
value = value.encode('utf-8', errors='replace').decode('utf-8', errors='replace')
cleaned_item[key] = value
async with get_session() as session:
stmt = select(ZhihuComment).where(ZhihuComment.comment_id == comment_id)
result = await session.execute(stmt)
existing_comment = result.scalars().first()
if existing_comment:
for key, value in comment_item.items():
for key, value in cleaned_item.items():
setattr(existing_comment, key, value)
utils.logger.debug(f"[ZhihuDbStore] 更新评论: {comment_id}")
else:
new_comment = ZhihuComment(**comment_item)
new_comment = ZhihuComment(**cleaned_item)
session.add(new_comment)
utils.logger.debug(f"[ZhihuDbStore] 新增评论: {comment_id}")
await session.commit()
utils.logger.info(f"[ZhihuDbStore] 成功保存评论到数据库: {comment_id}")
except Exception as e:
utils.logger.error(f"[ZhihuDbStore] 保存评论失败 (comment_id={comment_id}): {e}")
utils.logger.error(f"[ZhihuDbStore] 保存评论失败 (comment_id={comment_id}): {e}", exc_info=True)
raise
async def store_creator(self, creator: Dict):
@@ -11,6 +11,11 @@
import argparse
import logging
import os
import re
import sys
from logging.handlers import RotatingFileHandler
from pathlib import Path
from .crawler_util import *
from .slider_util import *
@@ -19,17 +24,80 @@ from .time_util import *
def init_loging_config():
level = logging.INFO
logging.basicConfig(
level=level,
format="%(asctime)s %(name)s %(levelname)s (%(filename)s:%(lineno)d) - %(message)s",
datefmt='%Y-%m-%d %H:%M:%S'
# 日志格式
log_format = "%(asctime)s %(name)s %(levelname)s (%(filename)s:%(lineno)d) - %(message)s"
date_format = '%Y-%m-%d %H:%M:%S'
# 创建日志目录(项目根目录的 logs 文件夹)
# 从当前文件位置向上查找,直到找到包含 logs 目录的项目根目录
current_file = Path(__file__).resolve()
project_root = None
# 方法1: 向上查找直到找到 logs 目录
for parent in current_file.parents:
logs_dir = parent / "logs"
if logs_dir.exists() or parent.name == "BettaFish-1.2.0":
project_root = parent
break
# 方法2: 如果没找到,使用当前工作目录
if project_root is None:
project_root = Path.cwd()
# 如果当前在 MediaCrawler 目录,向上查找
if project_root.name == "MediaCrawler":
project_root = project_root.parent.parent
log_dir = project_root / "logs"
log_dir.mkdir(exist_ok=True)
# 日志文件路径
log_file = log_dir / "mediacrawler.log"
# 配置根日志记录器
root_logger = logging.getLogger()
root_logger.setLevel(level)
# 清除已有的处理器,避免重复
root_logger.handlers.clear()
# 控制台处理器 - 明确使用 sys.stdout 确保输出到控制台
console_handler = logging.StreamHandler(sys.stdout)
console_handler.setLevel(level)
console_formatter = logging.Formatter(log_format, datefmt=date_format)
console_handler.setFormatter(console_formatter)
root_logger.addHandler(console_handler)
# 确保输出立即刷新
sys.stdout.flush()
sys.stderr.flush()
# 文件处理器(带轮转,最大10MB,保留5个备份)
try:
file_handler = RotatingFileHandler(
log_file,
maxBytes=10 * 1024 * 1024, # 10MB
backupCount=5,
encoding='utf-8'
)
file_handler.setLevel(level)
file_formatter = logging.Formatter(log_format, datefmt=date_format)
file_handler.setFormatter(file_formatter)
root_logger.addHandler(file_handler)
except Exception as e:
# 如果文件日志初始化失败,至少保证控制台日志可用
print(f"警告: 无法初始化文件日志: {e}")
# 创建 MediaCrawler 专用日志记录器
_logger = logging.getLogger("MediaCrawler")
_logger.setLevel(level)
# 关闭 httpx 的 INFO 日志
logging.getLogger("httpx").setLevel(logging.WARNING)
# 输出日志文件位置
_logger.info(f"日志文件: {log_file}")
return _logger
@@ -44,3 +112,101 @@ def str2bool(v):
return False
else:
raise argparse.ArgumentTypeError('Boolean value expected.')
def check_keyword_match_strict(content: str, keyword: str) -> bool:
"""
严格关键词匹配检查内容是否包含关键词严格模式
Args:
content: 要检查的内容文本
keyword: 关键词可以是单个关键词也可以是逗号分隔的多个关键词
Returns:
bool: 如果内容包含任意一个关键词返回True否则返回False
"""
if not content or not keyword:
return False
# 清理HTML标签
clean_content = re.sub(r"<.*?>", "", content)
# 转换为小写进行匹配
clean_content_lower = clean_content.lower()
# 支持多个关键词(逗号分隔),只要匹配任意一个即可
keywords = [k.strip().lower() for k in keyword.split(",") if k.strip()]
# 检查内容是否包含任意一个关键词
for kw in keywords:
if kw in clean_content_lower:
return True
return False
def check_keyword_match_fuzzy(content: str, keyword: str) -> bool:
"""
模糊关键词匹配检查内容是否包含关键词模糊模式支持部分匹配
Args:
content: 要检查的内容文本
keyword: 关键词可以是单个关键词也可以是逗号分隔的多个关键词
Returns:
bool: 如果内容包含任意一个关键词或关键词的部分返回True否则返回False
"""
if not content or not keyword:
return False
# 清理HTML标签
clean_content = re.sub(r"<.*?>", "", content)
# 转换为小写进行匹配
clean_content_lower = clean_content.lower()
# 支持多个关键词(逗号分隔),只要匹配任意一个即可
keywords = [k.strip().lower() for k in keyword.split(",") if k.strip()]
# 检查内容是否包含任意一个关键词(或关键词的部分)
for kw in keywords:
# 精确匹配
if kw in clean_content_lower:
return True
# 模糊匹配:如果关键词长度>=3,检查是否包含关键词的主要部分
if len(kw) >= 3:
# 去除空格后匹配
kw_no_space = kw.replace(" ", "")
content_no_space = clean_content_lower.replace(" ", "")
if kw_no_space in content_no_space:
return True
# 检查关键词的前半部分(至少2个字符)
if len(kw) >= 4:
half_kw = kw[:len(kw)//2]
if half_kw in clean_content_lower:
return True
return False
def check_keyword_match_with_modes(content: str, strict_keywords: str = None, fuzzy_keywords: str = None) -> bool:
"""
使用精确和模糊两种模式检查关键词匹配
Args:
content: 要检查的内容文本
strict_keywords: 精确匹配关键词逗号分隔
fuzzy_keywords: 模糊匹配关键词逗号分隔
Returns:
bool: 如果内容匹配任意一个关键词精确或模糊返回True否则返回False
"""
# 先检查精确匹配关键词
if strict_keywords:
if check_keyword_match_strict(content, strict_keywords):
return True
# 再检查模糊匹配关键词
if fuzzy_keywords:
if check_keyword_match_fuzzy(content, fuzzy_keywords):
return True
return False
@@ -251,7 +251,8 @@ class KeywordManager:
def _get_default_keywords(self) -> List[str]:
"""获取默认关键词列表"""
return [
"F6智慧门店","南京爱福路汽车科技有限公司","汽车后市场","修店","新康众"
"F6智慧门店","F6智数","中国汽车后市场白皮书","南京爱福路汽车科技有限公司","汽车后市场","车修理厂",
"新康众","天猫养车","汽后","汽修厂","爱福路","康众",
]
def get_all_keywords_for_platforms(self, platforms: List[str], target_date: date = None,
@@ -252,14 +252,63 @@ postgresql_db_config = {{
logger.info(f"执行命令: {' '.join(cmd)}")
# 切换到MediaCrawler目录并执行,捕获输出
result = subprocess.run(
# 使用utf-8编码,errors='surrogateescape'可以更好地处理编码问题
# 设置环境变量确保子进程使用UTF-8编码
env = os.environ.copy()
env['PYTHONIOENCODING'] = 'utf-8'
env['PYTHONUTF8'] = '1'
# 使用 Popen 实时输出日志,而不是等到结束才显示
import subprocess as sp
process = sp.Popen(
cmd,
cwd=self.mediacrawler_path,
timeout=3600, # 60分钟超时
capture_output=True,
stdout=sp.PIPE,
stderr=sp.STDOUT, # 将stderr合并到stdout
text=True,
encoding='utf-8',
errors='replace'
errors='surrogateescape',
env=env,
bufsize=1, # 行缓冲
universal_newlines=True
)
# 实时读取并输出日志
output_lines = []
error_lines = []
try:
for line in process.stdout:
line = line.rstrip()
if line:
output_lines.append(line)
# 实时输出到控制台
print(f"[{platform}] {line}", flush=True)
logger.info(f"[{platform}] {line}")
# 等待进程完成
return_code = process.wait(timeout=3600)
except sp.TimeoutExpired:
process.kill()
process.wait()
return_code = -1
logger.error(f"[{platform}] 爬取超时")
except Exception as e:
process.kill()
process.wait()
return_code = -1
logger.error(f"[{platform}] 执行异常: {e}", exc_info=True)
# 创建类似 subprocess.run 的 result 对象
class Result:
def __init__(self, returncode, stdout, stderr):
self.returncode = returncode
self.stdout = stdout
self.stderr = stderr
result = Result(
returncode=return_code,
stdout='\n'.join(output_lines),
stderr='\n'.join(error_lines)
)
end_time = datetime.now()
@@ -269,6 +318,19 @@ postgresql_db_config = {{
output_lines = result.stdout.split('\n') if result.stdout else []
error_lines = result.stderr.split('\n') if result.stderr else []
# 输出日志到控制台和日志文件
if output_lines:
logger.info(f"[{platform}] 爬虫标准输出:")
for line in output_lines:
if line.strip(): # 忽略空行
logger.info(f"[{platform}] {line}")
if error_lines:
logger.warning(f"[{platform}] 爬虫错误输出:")
for line in error_lines:
if line.strip(): # 忽略空行
logger.warning(f"[{platform}] {line}")
# 合并所有输出行用于解析
all_lines = output_lines + error_lines
@@ -329,10 +391,64 @@ postgresql_db_config = {{
# 合并所有行用于解析
all_lines = output_lines + error_lines
# 用于统计各平台的保存操作次数(通过日志关键字统计)
# 视频/内容保存操作的关键字
content_save_keywords = [
"[store.bilibili.update_bilibili_video]",
"update_bilibili_video",
"[store.douyin.update_dy_aweme]",
"update_dy_aweme",
"[store.kuaishou.update_kuaishou_video]",
"update_kuaishou_video",
"[store.xhs.update_xhs_note]",
"update_xhs_note",
"[store.weibo.update_weibo_note]",
"update_weibo_note",
"[store.tieba.update_tieba_note]",
"update_tieba_note",
"[store.zhihu.update_zhihu_content]",
"update_zhihu_content",
]
# 评论保存操作的关键字
comment_save_keywords = [
"[store.bilibili.update_bilibili_video_comment]",
"update_bilibili_video_comment",
"[store.douyin.update_dy_aweme_comment]",
"update_dy_aweme_comment",
"[store.kuaishou.update_ks_video_comment]",
"update_ks_video_comment",
"[store.xhs.update_xhs_note_comment]",
"update_xhs_note_comment",
"[store.weibo.update_weibo_note_comment]",
"update_weibo_note_comment",
"[store.tieba.update_tieba_note_comment]",
"update_tieba_note_comment",
"[store.zhihu.update_zhihu_content_comment]",
"update_zhihu_note_comment",
"update_zhihu_content_comment",
]
# 先统计日志关键字出现的次数(用于bilibili等没有汇总信息的平台)
log_keyword_content_count = 0
log_keyword_comment_count = 0
# 解析输出行,查找各种可能的数据保存信息
for line in all_lines:
line_lower = line.lower()
# 统计视频/内容保存操作(通过日志关键字)
for keyword in content_save_keywords:
if keyword in line or keyword.lower() in line_lower:
log_keyword_content_count += 1
break # 避免重复计数
# 统计评论保存操作(通过日志关键字)
for keyword in comment_save_keywords:
if keyword in line or keyword.lower() in line_lower:
log_keyword_comment_count += 1
break # 避免重复计数
# 查找保存的内容数量(多种可能的格式)
# 例如:"保存了 10 条笔记"、"成功保存 5 条内容"、"inserted 3 records"等
if any(keyword in line_lower for keyword in ["条笔记", "条内容", "条视频", "条帖子", "条回答"]):
@@ -367,10 +483,18 @@ postgresql_db_config = {{
if any(keyword in line_lower for keyword in ["登录", "扫码", "login", "需要登录"]):
stats["login_required"] = True
# 如果通过汇总信息没有找到保存数量,使用日志关键字统计的结果
# 这样可以支持bilibili等没有输出汇总信息的平台
if stats["notes_count"] == 0 and log_keyword_content_count > 0:
stats["notes_count"] = log_keyword_content_count
if stats["comments_count"] == 0 and log_keyword_comment_count > 0:
stats["comments_count"] = log_keyword_comment_count
# 如果没有找到明确的保存数量,尝试从数据库操作日志中提取
if stats["notes_count"] == 0 and stats["comments_count"] == 0:
# 查找可能的数据库插入信息
for line in all_lines:
line_lower = line.lower()
# 查找类似 "insert into" 或 "保存到" 的信息
if "insert" in line_lower or "保存到" in line_lower:
try:
+8
View File
@@ -34,6 +34,14 @@ class Settings(BaseSettings):
True,
description="开启后运行基于关键词的爬取流程,关闭则完全跳过关键词搜索"
)
STRICT_KEYWORDS: Optional[str] = Field(
None,
description="精确匹配关键词(逗号分隔),内容必须完整包含这些关键词才能落库"
)
FUZZY_KEYWORDS: Optional[str] = Field(
None,
description="模糊匹配关键词(逗号分隔),内容包含这些关键词的部分即可落库"
)
MINDSPIDER_API_KEY: Optional[str] = Field(None, description="MINDSPIDER API密钥")
MINDSPIDER_BASE_URL: Optional[str] = Field("https://api.deepseek.com", description="MINDSPIDER API基础URL,推荐deepseek-chat模型使用https://api.deepseek.com")
MINDSPIDER_MODEL_NAME: Optional[str] = Field("deepseek-chat", description="MINDSPIDER API模型名称, 推荐deepseek-chat")