feat(xhs): 集成xhshow库优化签名生成与请求参数 (#330)
* feat(xhs): 集成xhshow库优化签名生成与请求参数
  - 引入xhshow库用于小红书API签名生成
  - 替换原有的seccore_signv2_playwright签名校验方式
  - 支持GET和POST请求的差异化签名处理
  - 增加对b1值从localStorage获取的容错处理
  - 更新x-t时间戳为毫秒级精度
  - 在获取博主笔记接口中增加xsec_token和xsec_source参数
  - 支持通过配置传递验证token和渠道来源
  - 更新依赖文件引入xhshow库
  - 调整配置示例适配新的token参数要求
* Delete MindSpider/DeepSentimentCrawling/MediaCrawler/config/xhs_config.py (移除配置文件)
* Add xhs_config.py for Xiaohongshu platform settings (恢复错误删除的文件)
---------
Co-authored-by: gehongbin <gehongbin@autohome.com.cn>
Co-authored-by: Doiiars <doiiars@qq.com>
This commit is contained in:
@@ -17,6 +17,7 @@ from urllib.parse import urlencode
|
|||||||
import httpx
|
import httpx
|
||||||
from playwright.async_api import BrowserContext, Page
|
from playwright.async_api import BrowserContext, Page
|
||||||
from tenacity import retry, stop_after_attempt, wait_fixed
|
from tenacity import retry, stop_after_attempt, wait_fixed
|
||||||
|
from xhshow import Xhshow
|
||||||
|
|
||||||
import config
|
import config
|
||||||
from base.base_crawler import AbstractApiClient
|
from base.base_crawler import AbstractApiClient
|
||||||
@@ -27,7 +28,6 @@ from .exception import DataFetchError, IPBlockError
|
|||||||
from .field import SearchNoteType, SearchSortType
|
from .field import SearchNoteType, SearchSortType
|
||||||
from .help import get_search_id, sign
|
from .help import get_search_id, sign
|
||||||
from .extractor import XiaoHongShuExtractor
|
from .extractor import XiaoHongShuExtractor
|
||||||
from .secsign import seccore_signv2_playwright
|
|
||||||
|
|
||||||
|
|
||||||
class XiaoHongShuClient(AbstractApiClient):
|
class XiaoHongShuClient(AbstractApiClient):
|
||||||
@@ -53,24 +53,51 @@ class XiaoHongShuClient(AbstractApiClient):
|
|||||||
self.playwright_page = playwright_page
|
self.playwright_page = playwright_page
|
||||||
self.cookie_dict = cookie_dict
|
self.cookie_dict = cookie_dict
|
||||||
self._extractor = XiaoHongShuExtractor()
|
self._extractor = XiaoHongShuExtractor()
|
||||||
|
# 初始化 xhshow 客户端用于签名生成
|
||||||
|
self._xhshow_client = Xhshow()
|
||||||
|
|
||||||
async def _pre_headers(self, url: str, data=None) -> Dict:
|
async def _pre_headers(self, url: str, data=None) -> Dict:
|
||||||
"""
|
"""
|
||||||
请求头参数签名
|
请求头参数签名,使用 xhshow 库生成签名
|
||||||
Args:
|
Args:
|
||||||
url:
|
url: 完整的 URI(GET 请求包含查询参数)
|
||||||
data:
|
data: POST 请求的请求体数据
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
|
|
||||||
"""
|
"""
|
||||||
x_s = await seccore_signv2_playwright(self.playwright_page, url, data)
|
# 获取 a1 cookie 值
|
||||||
local_storage = await self.playwright_page.evaluate("() => window.localStorage")
|
a1_value = self.cookie_dict.get("a1", "")
|
||||||
|
|
||||||
|
# 根据请求类型使用不同的签名方法
|
||||||
|
if data is None:
|
||||||
|
# GET 请求:从 url 中提取参数
|
||||||
|
from urllib.parse import urlparse, parse_qs
|
||||||
|
parsed = urlparse(url)
|
||||||
|
params = {k: v[0] if len(v) == 1 else v for k, v in parse_qs(parsed.query).items()}
|
||||||
|
# 使用完整的 URL(包含 host)
|
||||||
|
full_url = f"{self._host}{url}"
|
||||||
|
x_s = self._xhshow_client.sign_xs_get(uri=full_url, a1_value=a1_value, params=params)
|
||||||
|
else:
|
||||||
|
# POST 请求:使用 data 作为 payload
|
||||||
|
full_url = f"{self._host}{url}"
|
||||||
|
x_s = self._xhshow_client.sign_xs_post(uri=full_url, a1_value=a1_value, payload=data)
|
||||||
|
|
||||||
|
# 尝试获取 b1 值(从 localStorage),如果获取失败则使用空字符串
|
||||||
|
b1_value = ""
|
||||||
|
try:
|
||||||
|
if self.playwright_page:
|
||||||
|
local_storage = await self.playwright_page.evaluate("() => window.localStorage")
|
||||||
|
b1_value = local_storage.get("b1", "")
|
||||||
|
except Exception as e:
|
||||||
|
utils.logger.warning(f"[XiaoHongShuClient._pre_headers] Failed to get b1 from localStorage: {e}, using empty string")
|
||||||
|
|
||||||
|
# 使用 sign 函数生成其他签名头
|
||||||
signs = sign(
|
signs = sign(
|
||||||
a1=self.cookie_dict.get("a1", ""),
|
a1=a1_value,
|
||||||
b1=local_storage.get("b1", ""),
|
b1=b1_value,
|
||||||
x_s=x_s,
|
x_s=x_s,
|
||||||
x_t=str(int(time.time())),
|
x_t=str(int(time.time() * 1000)), # x-t 使用毫秒时间戳
|
||||||
)
|
)
|
||||||
|
|
||||||
headers = {
|
headers = {
|
||||||
@@ -115,7 +142,8 @@ class XiaoHongShuClient(AbstractApiClient):
|
|||||||
elif data["code"] == self.IP_ERROR_CODE:
|
elif data["code"] == self.IP_ERROR_CODE:
|
||||||
raise IPBlockError(self.IP_ERROR_STR)
|
raise IPBlockError(self.IP_ERROR_STR)
|
||||||
else:
|
else:
|
||||||
raise DataFetchError(data.get("msg", None))
|
err_msg = data.get("msg", None) or f"{response.text}"
|
||||||
|
raise DataFetchError(err_msg)
|
||||||
|
|
||||||
async def get(self, uri: str, params=None) -> Dict:
|
async def get(self, uri: str, params=None) -> Dict:
|
||||||
"""
|
"""
|
||||||
@@ -480,6 +508,8 @@ class XiaoHongShuClient(AbstractApiClient):
|
|||||||
creator: str,
|
creator: str,
|
||||||
cursor: str,
|
cursor: str,
|
||||||
page_size: int = 30,
|
page_size: int = 30,
|
||||||
|
xsec_token: str = "",
|
||||||
|
xsec_source: str = "pc_feed",
|
||||||
) -> Dict:
|
) -> Dict:
|
||||||
"""
|
"""
|
||||||
获取博主的笔记
|
获取博主的笔记
|
||||||
@@ -487,24 +517,22 @@ class XiaoHongShuClient(AbstractApiClient):
|
|||||||
creator: 博主ID
|
creator: 博主ID
|
||||||
cursor: 上一页最后一条笔记的ID
|
cursor: 上一页最后一条笔记的ID
|
||||||
page_size: 分页数据长度
|
page_size: 分页数据长度
|
||||||
|
xsec_token: 验证token
|
||||||
|
xsec_source: 渠道来源
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
|
|
||||||
"""
|
"""
|
||||||
uri = "/api/sns/web/v1/user_posted"
|
uri = f"/api/sns/web/v1/user_posted?num={page_size}&cursor={cursor}&user_id={creator}&xsec_token={xsec_token}&xsec_source={xsec_source}"
|
||||||
data = {
|
return await self.get(uri)
|
||||||
"user_id": creator,
|
|
||||||
"cursor": cursor,
|
|
||||||
"num": page_size,
|
|
||||||
"image_formats": "jpg,webp,avif",
|
|
||||||
}
|
|
||||||
return await self.get(uri, data)
|
|
||||||
|
|
||||||
async def get_all_notes_by_creator(
|
async def get_all_notes_by_creator(
|
||||||
self,
|
self,
|
||||||
user_id: str,
|
user_id: str,
|
||||||
crawl_interval: float = 1.0,
|
crawl_interval: float = 1.0,
|
||||||
callback: Optional[Callable] = None,
|
callback: Optional[Callable] = None,
|
||||||
|
xsec_token: str = "",
|
||||||
|
xsec_source: str = "pc_feed",
|
||||||
) -> List[Dict]:
|
) -> List[Dict]:
|
||||||
"""
|
"""
|
||||||
获取指定用户下的所有发过的帖子,该方法会一直查找一个用户下的所有帖子信息
|
获取指定用户下的所有发过的帖子,该方法会一直查找一个用户下的所有帖子信息
|
||||||
@@ -512,6 +540,8 @@ class XiaoHongShuClient(AbstractApiClient):
|
|||||||
user_id: 用户ID
|
user_id: 用户ID
|
||||||
crawl_interval: 爬取一次的延迟单位(秒)
|
crawl_interval: 爬取一次的延迟单位(秒)
|
||||||
callback: 一次分页爬取结束后的更新回调函数
|
callback: 一次分页爬取结束后的更新回调函数
|
||||||
|
xsec_token: 验证token
|
||||||
|
xsec_source: 渠道来源
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
|
|
||||||
@@ -520,7 +550,7 @@ class XiaoHongShuClient(AbstractApiClient):
|
|||||||
notes_has_more = True
|
notes_has_more = True
|
||||||
notes_cursor = ""
|
notes_cursor = ""
|
||||||
while notes_has_more and len(result) < config.CRAWLER_MAX_NOTES_COUNT:
|
while notes_has_more and len(result) < config.CRAWLER_MAX_NOTES_COUNT:
|
||||||
notes_res = await self.get_notes_by_creator(user_id, notes_cursor)
|
notes_res = await self.get_notes_by_creator(user_id, notes_cursor, xsec_token=xsec_token, xsec_source=xsec_source)
|
||||||
if not notes_res:
|
if not notes_res:
|
||||||
utils.logger.error(
|
utils.logger.error(
|
||||||
f"[XiaoHongShuClient.get_notes_by_creator] The current creator may have been banned by xhs, so they cannot access the data."
|
f"[XiaoHongShuClient.get_notes_by_creator] The current creator may have been banned by xhs, so they cannot access the data."
|
||||||
|
|||||||
@@ -201,6 +201,8 @@ class XiaoHongShuCrawler(AbstractCrawler):
|
|||||||
user_id=user_id,
|
user_id=user_id,
|
||||||
crawl_interval=crawl_interval,
|
crawl_interval=crawl_interval,
|
||||||
callback=self.fetch_creator_notes_detail,
|
callback=self.fetch_creator_notes_detail,
|
||||||
|
xsec_token=creator_info.xsec_token,
|
||||||
|
xsec_source=creator_info.xsec_source,
|
||||||
)
|
)
|
||||||
|
|
||||||
note_ids = []
|
note_ids = []
|
||||||
@@ -279,12 +281,19 @@ class XiaoHongShuCrawler(AbstractCrawler):
|
|||||||
Dict: note detail
|
Dict: note detail
|
||||||
"""
|
"""
|
||||||
note_detail = None
|
note_detail = None
|
||||||
|
utils.logger.info(f"[get_note_detail_async_task] Begin get note detail, note_id: {note_id}")
|
||||||
async with semaphore:
|
async with semaphore:
|
||||||
try:
|
try:
|
||||||
utils.logger.info(f"[get_note_detail_async_task] Begin get note detail, note_id: {note_id}")
|
try:
|
||||||
note_detail = await self.xhs_client.get_note_by_id_from_html(note_id, xsec_source, xsec_token, enable_cookie=True)
|
note_detail = await self.xhs_client.get_note_by_id(note_id, xsec_source, xsec_token)
|
||||||
|
except RetryError:
|
||||||
|
pass
|
||||||
|
|
||||||
if not note_detail:
|
if not note_detail:
|
||||||
raise Exception(f"[get_note_detail_async_task] Failed to get note detail, Id: {note_id}")
|
note_detail = await self.xhs_client.get_note_by_id_from_html(note_id, xsec_source, xsec_token,
|
||||||
|
enable_cookie=True)
|
||||||
|
if not note_detail:
|
||||||
|
raise Exception(f"[get_note_detail_async_task] Failed to get note detail, Id: {note_id}")
|
||||||
|
|
||||||
note_detail.update({"xsec_token": xsec_token, "xsec_source": xsec_source})
|
note_detail.update({"xsec_token": xsec_token, "xsec_source": xsec_source})
|
||||||
|
|
||||||
|
|||||||
@@ -24,3 +24,4 @@ cryptography>=45.0.7
|
|||||||
alembic>=1.16.5
|
alembic>=1.16.5
|
||||||
asyncmy>=0.2.10
|
asyncmy>=0.2.10
|
||||||
sqlalchemy>=2.0.43
|
sqlalchemy>=2.0.43
|
||||||
|
xhshow>=0.1.3
|
||||||
|
|||||||
@@ -49,6 +49,7 @@ parsel==1.9.1
|
|||||||
pyexecjs==1.5.1
|
pyexecjs==1.5.1
|
||||||
typer>=0.12.3
|
typer>=0.12.3
|
||||||
pyhumps==3.8.0
|
pyhumps==3.8.0
|
||||||
|
xhshow>=0.1.3
|
||||||
|
|
||||||
# ===============================
|
# ===============================
|
||||||
# 工具包
|
# 工具包
|
||||||
|
|||||||
@@ -48,6 +48,7 @@ beautifulsoup4>=4.12.0
|
|||||||
lxml>=4.9.0
|
lxml>=4.9.0
|
||||||
parsel==1.9.1
|
parsel==1.9.1
|
||||||
pyexecjs==1.5.1
|
pyexecjs==1.5.1
|
||||||
|
xhshow>=0.1.3
|
||||||
|
|
||||||
# ===== 可视化 =====
|
# ===== 可视化 =====
|
||||||
plotly>=5.17.0
|
plotly>=5.17.0
|
||||||
|
|||||||
Reference in New Issue
Block a user