1. 同步MediaCrawler为最新版本
2. 修复数据库not null错误 3. 支持PG数据库 4. 规范环境变量及配置使用 5. 规范为uv安装 6. 使用loguru
This commit is contained in:
@@ -15,7 +15,7 @@ import random
|
||||
import time
|
||||
import urllib.parse
|
||||
|
||||
from model.m_xiaohongshu import NoteUrlInfo
|
||||
from model.m_xiaohongshu import NoteUrlInfo, CreatorUrlInfo
|
||||
from tools.crawler_util import extract_url_params_to_dict
|
||||
|
||||
|
||||
@@ -27,16 +27,17 @@ def sign(a1="", b1="", x_s="", x_t=""):
|
||||
"s0": 3, # getPlatformCode
|
||||
"s1": "",
|
||||
"x0": "1", # localStorage.getItem("b1b1")
|
||||
"x1": "3.7.8-2", # version
|
||||
"x1": "4.2.2", # version
|
||||
"x2": "Mac OS",
|
||||
"x3": "xhs-pc-web",
|
||||
"x4": "4.27.2",
|
||||
"x4": "4.74.0",
|
||||
"x5": a1, # cookie of a1
|
||||
"x6": x_t,
|
||||
"x7": x_s,
|
||||
"x8": b1, # localStorage.getItem("b1")
|
||||
"x9": mrc(x_t + x_s + b1),
|
||||
"x10": 154, # getSigCount
|
||||
"x11": "normal"
|
||||
}
|
||||
encode_str = encodeUtf8(json.dumps(common, separators=(',', ':')))
|
||||
x_s_common = b64Encode(encode_str)
|
||||
@@ -306,6 +307,37 @@ def parse_note_info_from_note_url(url: str) -> NoteUrlInfo:
|
||||
return NoteUrlInfo(note_id=note_id, xsec_token=xsec_token, xsec_source=xsec_source)
|
||||
|
||||
|
||||
def parse_creator_info_from_url(url: str) -> CreatorUrlInfo:
    """
    Parse creator information from a Xiaohongshu creator profile URL.

    Two input formats are supported:
        1. Full URL: "https://www.xiaohongshu.com/user/profile/5eb8e1d400000000010075ae?xsec_token=AB1nWBKCo1vE2HEkfoJUOi5B6BE5n7wVrbdpHoWIj5xHw=&xsec_source=pc_feed"
        2. Bare ID: "5eb8e1d400000000010075ae"

    Args:
        url: Creator profile URL or bare user_id. Leading/trailing
            whitespace is ignored.

    Returns:
        CreatorUrlInfo: object built with user_id, xsec_token and xsec_source.
        The two xsec fields are empty strings when the input is a bare ID or
        the URL does not carry them.

    Raises:
        ValueError: if no user_id can be extracted from the input.
    """
    import re

    # Inputs often come from config files or pasted lists, so tolerate
    # surrounding whitespace / a trailing newline.
    candidate = url.strip()

    # Bare-ID form: exactly 24 lowercase hexadecimal characters.
    if re.fullmatch(r"[0-9a-f]{24}", candidate):
        return CreatorUrlInfo(user_id=candidate, xsec_token="", xsec_source="")

    # URL form: the user_id is the path segment after /user/profile/.
    # '#' is excluded as well so a URL fragment never leaks into the ID.
    match = re.search(r'/user/profile/([^/?#]+)', candidate)
    if match is None:
        raise ValueError(f"无法从URL中解析出创作者信息: {url}")

    # xsec_token / xsec_source are optional query parameters.
    params = extract_url_params_to_dict(candidate)
    return CreatorUrlInfo(
        user_id=match.group(1),
        xsec_token=params.get("xsec_token", ""),
        xsec_source=params.get("xsec_source", ""),
    )
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
_img_url = "https://sns-img-bd.xhscdn.com/7a3abfaf-90c1-a828-5de7-022c80b92aa3"
|
||||
# 获取一个图片地址在多个cdn下的url地址
|
||||
@@ -313,4 +345,19 @@ if __name__ == '__main__':
|
||||
final_img_url = get_img_url_by_trace_id(get_trace_id(_img_url))
|
||||
print(final_img_url)
|
||||
|
||||
# 测试创作者URL解析
|
||||
print("\n=== 创作者URL解析测试 ===")
|
||||
test_creator_urls = [
|
||||
"https://www.xiaohongshu.com/user/profile/5eb8e1d400000000010075ae?xsec_token=AB1nWBKCo1vE2HEkfoJUOi5B6BE5n7wVrbdpHoWIj5xHw=&xsec_source=pc_feed",
|
||||
"5eb8e1d400000000010075ae",
|
||||
]
|
||||
for url in test_creator_urls:
|
||||
try:
|
||||
result = parse_creator_info_from_url(url)
|
||||
print(f"✓ URL: {url[:80]}...")
|
||||
print(f" 结果: {result}\n")
|
||||
except Exception as e:
|
||||
print(f"✗ URL: {url}")
|
||||
print(f" 错误: {e}\n")
|
||||
|
||||
|
||||
|
||||
Reference in New Issue
Block a user