Uploading the AI Crawler System: MindSpider
This commit is contained in:
@@ -0,0 +1,3 @@
|
||||
*.js linguist-language=python
|
||||
*.css linguist-language=python
|
||||
*.html linguist-language=python
|
||||
@@ -0,0 +1,176 @@
|
||||
# Byte-compiled / optimized / DLL files
|
||||
__pycache__/
|
||||
*.py[cod]
|
||||
*$py.class
|
||||
|
||||
# C extensions
|
||||
*.so
|
||||
|
||||
# Distribution / packaging
|
||||
.Python
|
||||
build/
|
||||
develop-eggs/
|
||||
dist/
|
||||
downloads/
|
||||
eggs/
|
||||
.eggs/
|
||||
lib/
|
||||
lib64/
|
||||
parts/
|
||||
sdist/
|
||||
var/
|
||||
wheels/
|
||||
share/python-wheels/
|
||||
*.egg-info/
|
||||
.installed.cfg
|
||||
*.egg
|
||||
MANIFEST
|
||||
|
||||
# PyInstaller
|
||||
# Usually these files are written by a python script from a template
|
||||
# before PyInstaller builds the exe, so as to inject date/other infos into it.
|
||||
*.manifest
|
||||
*.spec
|
||||
|
||||
# Installer logs
|
||||
pip-log.txt
|
||||
pip-delete-this-directory.txt
|
||||
|
||||
# Unit test / coverage reports
|
||||
htmlcov/
|
||||
.tox/
|
||||
.nox/
|
||||
.coverage
|
||||
.coverage.*
|
||||
.cache
|
||||
nosetests.xml
|
||||
coverage.xml
|
||||
*.cover
|
||||
*.py,cover
|
||||
.hypothesis/
|
||||
.pytest_cache/
|
||||
cover/
|
||||
|
||||
# Translations
|
||||
*.mo
|
||||
*.pot
|
||||
|
||||
# Django stuff:
|
||||
*.log
|
||||
local_settings.py
|
||||
db.sqlite3
|
||||
db.sqlite3-journal
|
||||
|
||||
# Flask stuff:
|
||||
instance/
|
||||
.webassets-cache
|
||||
|
||||
# Scrapy stuff:
|
||||
.scrapy
|
||||
|
||||
# Sphinx documentation
|
||||
docs/_build/
|
||||
|
||||
# PyBuilder
|
||||
.pybuilder/
|
||||
target/
|
||||
|
||||
# Jupyter Notebook
|
||||
.ipynb_checkpoints
|
||||
|
||||
# IPython
|
||||
profile_default/
|
||||
ipython_config.py
|
||||
|
||||
# pyenv
|
||||
# For a library or package, you might want to ignore these files since the code is
|
||||
# intended to run in multiple environments; otherwise, check them in:
|
||||
# .python-version
|
||||
|
||||
# pipenv
|
||||
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
|
||||
# However, in case of collaboration, if having platform-specific dependencies or dependencies
|
||||
# having no cross-platform support, pipenv may install dependencies that don't work, or not
|
||||
# install all needed dependencies.
|
||||
#Pipfile.lock
|
||||
|
||||
# poetry
|
||||
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
|
||||
# This is especially recommended for binary packages to ensure reproducibility, and is more
|
||||
# commonly ignored for libraries.
|
||||
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
|
||||
#poetry.lock
|
||||
|
||||
# pdm
|
||||
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
|
||||
#pdm.lock
|
||||
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
|
||||
# in version control.
|
||||
# https://pdm.fming.dev/#use-with-ide
|
||||
.pdm.toml
|
||||
|
||||
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
|
||||
__pypackages__/
|
||||
|
||||
# Celery stuff
|
||||
celerybeat-schedule
|
||||
celerybeat.pid
|
||||
|
||||
# SageMath parsed files
|
||||
*.sage.py
|
||||
|
||||
# Environments
|
||||
.env
|
||||
.venv
|
||||
env/
|
||||
venv/
|
||||
ENV/
|
||||
env.bak/
|
||||
venv.bak/
|
||||
|
||||
# Spyder project settings
|
||||
.spyderproject
|
||||
.spyproject
|
||||
|
||||
# Rope project settings
|
||||
.ropeproject
|
||||
|
||||
# mkdocs documentation
|
||||
/site
|
||||
|
||||
# mypy
|
||||
.mypy_cache/
|
||||
.dmypy.json
|
||||
dmypy.json
|
||||
|
||||
# Pyre type checker
|
||||
.pyre/
|
||||
|
||||
# pytype static type analyzer
|
||||
.pytype/
|
||||
|
||||
# Cython debug symbols
|
||||
cython_debug/
|
||||
|
||||
# PyCharm
|
||||
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
|
||||
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
|
||||
# and can be added to the global gitignore or merged into this file. For a more nuclear
|
||||
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
|
||||
#.idea/
|
||||
|
||||
*.xml
|
||||
*.iml
|
||||
.idea
|
||||
/temp_image/
|
||||
/browser_data/
|
||||
/data/
|
||||
|
||||
*/.DS_Store
|
||||
.vscode
|
||||
/node_modules
|
||||
docs/.vitepress/cache
|
||||
|
||||
# other gitignore
|
||||
.venv
|
||||
.refer
|
||||
@@ -0,0 +1 @@
|
||||
3.9
|
||||
@@ -0,0 +1,107 @@
|
||||
# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则:
|
||||
# 1. 不得用于任何商业用途。
|
||||
# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。
|
||||
# 3. 不得进行大规模爬取或对平台造成运营干扰。
|
||||
# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。
|
||||
# 5. 不得用于任何非法或不当的用途。
|
||||
#
|
||||
# 详细许可条款请参阅项目根目录下的LICENSE文件。
|
||||
# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。
|
||||
|
||||
|
||||
# -*- coding: utf-8 -*-
|
||||
# @Author : relakkes@gmail.com
|
||||
# @Time : 2024/4/6 14:21
|
||||
# @Desc : 异步Aiomysql的增删改查封装
|
||||
from typing import Any, Dict, List, Union
|
||||
|
||||
import aiomysql
|
||||
|
||||
|
||||
class AsyncMysqlDB:
|
||||
def __init__(self, pool: aiomysql.Pool) -> None:
|
||||
self.__pool = pool
|
||||
|
||||
async def query(self, sql: str, *args: Union[str, int]) -> List[Dict[str, Any]]:
|
||||
"""
|
||||
从给定的 SQL 中查询记录,返回的是一个列表
|
||||
:param sql: 查询的sql
|
||||
:param args: sql中传递动态参数列表
|
||||
:return:
|
||||
"""
|
||||
async with self.__pool.acquire() as conn:
|
||||
async with conn.cursor(aiomysql.DictCursor) as cur:
|
||||
await cur.execute(sql, args)
|
||||
data = await cur.fetchall()
|
||||
return data or []
|
||||
|
||||
async def get_first(self, sql: str, *args: Union[str, int]) -> Union[Dict[str, Any], None]:
|
||||
"""
|
||||
从给定的 SQL 中查询记录,返回的是符合条件的第一个结果
|
||||
:param sql: 查询的sql
|
||||
:param args:sql中传递动态参数列表
|
||||
:return:
|
||||
"""
|
||||
async with self.__pool.acquire() as conn:
|
||||
async with conn.cursor(aiomysql.DictCursor) as cur:
|
||||
await cur.execute(sql, args)
|
||||
data = await cur.fetchone()
|
||||
return data
|
||||
|
||||
async def item_to_table(self, table_name: str, item: Dict[str, Any]) -> int:
|
||||
"""
|
||||
表中插入数据
|
||||
:param table_name: 表名
|
||||
:param item: 一条记录的字典信息
|
||||
:return:
|
||||
"""
|
||||
fields = list(item.keys())
|
||||
values = list(item.values())
|
||||
fields = [f'`{field}`' for field in fields]
|
||||
fieldstr = ','.join(fields)
|
||||
valstr = ','.join(['%s'] * len(item))
|
||||
sql = "INSERT INTO %s (%s) VALUES(%s)" % (table_name, fieldstr, valstr)
|
||||
async with self.__pool.acquire() as conn:
|
||||
async with conn.cursor(aiomysql.DictCursor) as cur:
|
||||
await cur.execute(sql, values)
|
||||
lastrowid = cur.lastrowid
|
||||
return lastrowid
|
||||
|
||||
async def update_table(self, table_name: str, updates: Dict[str, Any], field_where: str,
|
||||
value_where: Union[str, int, float]) -> int:
|
||||
"""
|
||||
更新指定表的记录
|
||||
:param table_name: 表名
|
||||
:param updates: 需要更新的字段和值的 key - value 映射
|
||||
:param field_where: update 语句 where 条件中的字段名
|
||||
:param value_where: update 语句 where 条件中的字段值
|
||||
:return:
|
||||
"""
|
||||
upsets = []
|
||||
values = []
|
||||
for k, v in updates.items():
|
||||
s = '`%s`=%%s' % k
|
||||
upsets.append(s)
|
||||
values.append(v)
|
||||
upsets = ','.join(upsets)
|
||||
sql = 'UPDATE %s SET %s WHERE %s="%s"' % (
|
||||
table_name,
|
||||
upsets,
|
||||
field_where, value_where,
|
||||
)
|
||||
async with self.__pool.acquire() as conn:
|
||||
async with conn.cursor() as cur:
|
||||
rows = await cur.execute(sql, values)
|
||||
return rows
|
||||
|
||||
async def execute(self, sql: str, *args: Union[str, int]) -> int:
|
||||
"""
|
||||
需要更新、写入等操作的 excute 执行语句
|
||||
:param sql:
|
||||
:param args:
|
||||
:return:
|
||||
"""
|
||||
async with self.__pool.acquire() as conn:
|
||||
async with conn.cursor() as cur:
|
||||
rows = await cur.execute(sql, args)
|
||||
return rows
|
||||
@@ -0,0 +1,111 @@
|
||||
# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则:
|
||||
# 1. 不得用于任何商业用途。
|
||||
# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。
|
||||
# 3. 不得进行大规模爬取或对平台造成运营干扰。
|
||||
# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。
|
||||
# 5. 不得用于任何非法或不当的用途。
|
||||
#
|
||||
# 详细许可条款请参阅项目根目录下的LICENSE文件。
|
||||
# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。
|
||||
|
||||
|
||||
# -*- coding: utf-8 -*-
|
||||
# @Author : relakkes@gmail.com
|
||||
# @Time : 2024/4/6 14:21
|
||||
# @Desc : 异步SQLite的增删改查封装
|
||||
from typing import Any, Dict, List, Union
|
||||
|
||||
import aiosqlite
|
||||
|
||||
|
||||
class AsyncSqliteDB:
|
||||
def __init__(self, db_path: str) -> None:
|
||||
self.__db_path = db_path
|
||||
|
||||
async def query(self, sql: str, *args: Union[str, int]) -> List[Dict[str, Any]]:
|
||||
"""
|
||||
从给定的 SQL 中查询记录,返回的是一个列表
|
||||
:param sql: 查询的sql
|
||||
:param args: sql中传递动态参数列表
|
||||
:return:
|
||||
"""
|
||||
async with aiosqlite.connect(self.__db_path) as conn:
|
||||
conn.row_factory = aiosqlite.Row
|
||||
async with conn.execute(sql, args) as cursor:
|
||||
rows = await cursor.fetchall()
|
||||
return [dict(row) for row in rows] if rows else []
|
||||
|
||||
async def get_first(self, sql: str, *args: Union[str, int]) -> Union[Dict[str, Any], None]:
|
||||
"""
|
||||
从给定的 SQL 中查询记录,返回的是符合条件的第一个结果
|
||||
:param sql: 查询的sql
|
||||
:param args:sql中传递动态参数列表
|
||||
:return:
|
||||
"""
|
||||
async with aiosqlite.connect(self.__db_path) as conn:
|
||||
conn.row_factory = aiosqlite.Row
|
||||
async with conn.execute(sql, args) as cursor:
|
||||
row = await cursor.fetchone()
|
||||
return dict(row) if row else None
|
||||
|
||||
async def item_to_table(self, table_name: str, item: Dict[str, Any]) -> int:
|
||||
"""
|
||||
表中插入数据
|
||||
:param table_name: 表名
|
||||
:param item: 一条记录的字典信息
|
||||
:return:
|
||||
"""
|
||||
fields = list(item.keys())
|
||||
values = list(item.values())
|
||||
fieldstr = ','.join(fields)
|
||||
valstr = ','.join(['?'] * len(item))
|
||||
sql = f"INSERT INTO {table_name} ({fieldstr}) VALUES({valstr})"
|
||||
async with aiosqlite.connect(self.__db_path) as conn:
|
||||
async with conn.execute(sql, values) as cursor:
|
||||
await conn.commit()
|
||||
return cursor.lastrowid
|
||||
|
||||
async def update_table(self, table_name: str, updates: Dict[str, Any], field_where: str,
|
||||
value_where: Union[str, int, float]) -> int:
|
||||
"""
|
||||
更新指定表的记录
|
||||
:param table_name: 表名
|
||||
:param updates: 需要更新的字段和值的 key - value 映射
|
||||
:param field_where: update 语句 where 条件中的字段名
|
||||
:param value_where: update 语句 where 条件中的字段值
|
||||
:return:
|
||||
"""
|
||||
upsets = []
|
||||
values = []
|
||||
for k, v in updates.items():
|
||||
upsets.append(f'{k}=?')
|
||||
values.append(v)
|
||||
upsets_str = ','.join(upsets)
|
||||
values.append(value_where)
|
||||
sql = f'UPDATE {table_name} SET {upsets_str} WHERE {field_where}=?'
|
||||
async with aiosqlite.connect(self.__db_path) as conn:
|
||||
async with conn.execute(sql, values) as cursor:
|
||||
await conn.commit()
|
||||
return cursor.rowcount
|
||||
|
||||
async def execute(self, sql: str, *args: Union[str, int]) -> int:
|
||||
"""
|
||||
需要更新、写入等操作的 excute 执行语句
|
||||
:param sql:
|
||||
:param args:
|
||||
:return:
|
||||
"""
|
||||
async with aiosqlite.connect(self.__db_path) as conn:
|
||||
async with conn.execute(sql, args) as cursor:
|
||||
await conn.commit()
|
||||
return cursor.rowcount
|
||||
|
||||
async def executescript(self, sql_script: str) -> None:
|
||||
"""
|
||||
执行SQL脚本,用于初始化数据库表结构
|
||||
:param sql_script: SQL脚本内容
|
||||
:return:
|
||||
"""
|
||||
async with aiosqlite.connect(self.__db_path) as conn:
|
||||
await conn.executescript(sql_script)
|
||||
await conn.commit()
|
||||
@@ -0,0 +1,11 @@
|
||||
# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则:
|
||||
# 1. 不得用于任何商业用途。
|
||||
# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。
|
||||
# 3. 不得进行大规模爬取或对平台造成运营干扰。
|
||||
# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。
|
||||
# 5. 不得用于任何非法或不当的用途。
|
||||
#
|
||||
# 详细许可条款请参阅项目根目录下的LICENSE文件。
|
||||
# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。
|
||||
|
||||
|
||||
@@ -0,0 +1,118 @@
|
||||
# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则:
|
||||
# 1. 不得用于任何商业用途。
|
||||
# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。
|
||||
# 3. 不得进行大规模爬取或对平台造成运营干扰。
|
||||
# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。
|
||||
# 5. 不得用于任何非法或不当的用途。
|
||||
#
|
||||
# 详细许可条款请参阅项目根目录下的LICENSE文件。
|
||||
# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。
|
||||
|
||||
from abc import ABC, abstractmethod
|
||||
from typing import Dict, Optional
|
||||
|
||||
from playwright.async_api import BrowserContext, BrowserType, Playwright
|
||||
|
||||
|
||||
class AbstractCrawler(ABC):
|
||||
|
||||
@abstractmethod
|
||||
async def start(self):
|
||||
"""
|
||||
start crawler
|
||||
"""
|
||||
pass
|
||||
|
||||
@abstractmethod
|
||||
async def search(self):
|
||||
"""
|
||||
search
|
||||
"""
|
||||
pass
|
||||
|
||||
@abstractmethod
|
||||
async def launch_browser(self, chromium: BrowserType, playwright_proxy: Optional[Dict], user_agent: Optional[str], headless: bool = True) -> BrowserContext:
|
||||
"""
|
||||
launch browser
|
||||
:param chromium: chromium browser
|
||||
:param playwright_proxy: playwright proxy
|
||||
:param user_agent: user agent
|
||||
:param headless: headless mode
|
||||
:return: browser context
|
||||
"""
|
||||
pass
|
||||
|
||||
async def launch_browser_with_cdp(self, playwright: Playwright, playwright_proxy: Optional[Dict], user_agent: Optional[str], headless: bool = True) -> BrowserContext:
|
||||
"""
|
||||
使用CDP模式启动浏览器(可选实现)
|
||||
:param playwright: playwright实例
|
||||
:param playwright_proxy: playwright代理配置
|
||||
:param user_agent: 用户代理
|
||||
:param headless: 无头模式
|
||||
:return: 浏览器上下文
|
||||
"""
|
||||
# 默认实现:回退到标准模式
|
||||
return await self.launch_browser(playwright.chromium, playwright_proxy, user_agent, headless)
|
||||
|
||||
|
||||
class AbstractLogin(ABC):
|
||||
|
||||
@abstractmethod
|
||||
async def begin(self):
|
||||
pass
|
||||
|
||||
@abstractmethod
|
||||
async def login_by_qrcode(self):
|
||||
pass
|
||||
|
||||
@abstractmethod
|
||||
async def login_by_mobile(self):
|
||||
pass
|
||||
|
||||
@abstractmethod
|
||||
async def login_by_cookies(self):
|
||||
pass
|
||||
|
||||
|
||||
class AbstractStore(ABC):
|
||||
|
||||
@abstractmethod
|
||||
async def store_content(self, content_item: Dict):
|
||||
pass
|
||||
|
||||
@abstractmethod
|
||||
async def store_comment(self, comment_item: Dict):
|
||||
pass
|
||||
|
||||
# TODO support all platform
|
||||
# only xhs is supported, so @abstractmethod is commented
|
||||
@abstractmethod
|
||||
async def store_creator(self, creator: Dict):
|
||||
pass
|
||||
|
||||
|
||||
class AbstractStoreImage(ABC):
|
||||
# TODO: support all platform
|
||||
# only weibo is supported
|
||||
# @abstractmethod
|
||||
async def store_image(self, image_content_item: Dict):
|
||||
pass
|
||||
|
||||
|
||||
class AbstractStoreVideo(ABC):
|
||||
# TODO: support all platform
|
||||
# only weibo is supported
|
||||
# @abstractmethod
|
||||
async def store_video(self, video_content_item: Dict):
|
||||
pass
|
||||
|
||||
|
||||
class AbstractApiClient(ABC):
|
||||
|
||||
@abstractmethod
|
||||
async def request(self, method, url, **kwargs):
|
||||
pass
|
||||
|
||||
@abstractmethod
|
||||
async def update_cookies(self, browser_context: BrowserContext):
|
||||
pass
|
||||
@@ -0,0 +1,11 @@
|
||||
# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则:
|
||||
# 1. 不得用于任何商业用途。
|
||||
# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。
|
||||
# 3. 不得进行大规模爬取或对平台造成运营干扰。
|
||||
# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。
|
||||
# 5. 不得用于任何非法或不当的用途。
|
||||
#
|
||||
# 详细许可条款请参阅项目根目录下的LICENSE文件。
|
||||
# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。
|
||||
|
||||
|
||||
@@ -0,0 +1,53 @@
|
||||
# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则:
|
||||
# 1. 不得用于任何商业用途。
|
||||
# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。
|
||||
# 3. 不得进行大规模爬取或对平台造成运营干扰。
|
||||
# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。
|
||||
# 5. 不得用于任何非法或不当的用途。
|
||||
#
|
||||
# 详细许可条款请参阅项目根目录下的LICENSE文件。
|
||||
# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。
|
||||
|
||||
|
||||
# -*- coding: utf-8 -*-
|
||||
# @Author : relakkes@gmail.com
|
||||
# @Name : 程序员阿江-Relakkes
|
||||
# @Time : 2024/6/2 11:06
|
||||
# @Desc : 抽象类
|
||||
|
||||
from abc import ABC, abstractmethod
|
||||
from typing import Any, List, Optional
|
||||
|
||||
|
||||
class AbstractCache(ABC):
|
||||
|
||||
@abstractmethod
|
||||
def get(self, key: str) -> Optional[Any]:
|
||||
"""
|
||||
从缓存中获取键的值。
|
||||
这是一个抽象方法。子类必须实现这个方法。
|
||||
:param key: 键
|
||||
:return:
|
||||
"""
|
||||
raise NotImplementedError
|
||||
|
||||
@abstractmethod
|
||||
def set(self, key: str, value: Any, expire_time: int) -> None:
|
||||
"""
|
||||
将键的值设置到缓存中。
|
||||
这是一个抽象方法。子类必须实现这个方法。
|
||||
:param key: 键
|
||||
:param value: 值
|
||||
:param expire_time: 过期时间
|
||||
:return:
|
||||
"""
|
||||
raise NotImplementedError
|
||||
|
||||
@abstractmethod
|
||||
def keys(self, pattern: str) -> List[str]:
|
||||
"""
|
||||
获取所有符合pattern的key
|
||||
:param pattern: 匹配模式
|
||||
:return:
|
||||
"""
|
||||
raise NotImplementedError
|
||||
@@ -0,0 +1,40 @@
|
||||
# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则:
|
||||
# 1. 不得用于任何商业用途。
|
||||
# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。
|
||||
# 3. 不得进行大规模爬取或对平台造成运营干扰。
|
||||
# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。
|
||||
# 5. 不得用于任何非法或不当的用途。
|
||||
#
|
||||
# 详细许可条款请参阅项目根目录下的LICENSE文件。
|
||||
# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。
|
||||
|
||||
|
||||
# -*- coding: utf-8 -*-
|
||||
# @Author : relakkes@gmail.com
|
||||
# @Name : 程序员阿江-Relakkes
|
||||
# @Time : 2024/6/2 11:23
|
||||
# @Desc :
|
||||
|
||||
|
||||
class CacheFactory:
|
||||
"""
|
||||
缓存工厂类
|
||||
"""
|
||||
|
||||
@staticmethod
|
||||
def create_cache(cache_type: str, *args, **kwargs):
|
||||
"""
|
||||
创建缓存对象
|
||||
:param cache_type: 缓存类型
|
||||
:param args: 参数
|
||||
:param kwargs: 关键字参数
|
||||
:return:
|
||||
"""
|
||||
if cache_type == 'memory':
|
||||
from .local_cache import ExpiringLocalCache
|
||||
return ExpiringLocalCache(*args, **kwargs)
|
||||
elif cache_type == 'redis':
|
||||
from .redis_cache import RedisCache
|
||||
return RedisCache()
|
||||
else:
|
||||
raise ValueError(f'Unknown cache type: {cache_type}')
|
||||
@@ -0,0 +1,131 @@
|
||||
# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则:
|
||||
# 1. 不得用于任何商业用途。
|
||||
# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。
|
||||
# 3. 不得进行大规模爬取或对平台造成运营干扰。
|
||||
# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。
|
||||
# 5. 不得用于任何非法或不当的用途。
|
||||
#
|
||||
# 详细许可条款请参阅项目根目录下的LICENSE文件。
|
||||
# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。
|
||||
|
||||
|
||||
# -*- coding: utf-8 -*-
|
||||
# @Author : relakkes@gmail.com
|
||||
# @Name : 程序员阿江-Relakkes
|
||||
# @Time : 2024/6/2 11:05
|
||||
# @Desc : 本地缓存
|
||||
|
||||
import asyncio
|
||||
import time
|
||||
from typing import Any, Dict, List, Optional, Tuple
|
||||
|
||||
from cache.abs_cache import AbstractCache
|
||||
|
||||
|
||||
class ExpiringLocalCache(AbstractCache):
|
||||
|
||||
def __init__(self, cron_interval: int = 10):
|
||||
"""
|
||||
初始化本地缓存
|
||||
:param cron_interval: 定时清楚cache的时间间隔
|
||||
:return:
|
||||
"""
|
||||
self._cron_interval = cron_interval
|
||||
self._cache_container: Dict[str, Tuple[Any, float]] = {}
|
||||
self._cron_task: Optional[asyncio.Task] = None
|
||||
# 开启定时清理任务
|
||||
self._schedule_clear()
|
||||
|
||||
def __del__(self):
|
||||
"""
|
||||
析构函数,清理定时任务
|
||||
:return:
|
||||
"""
|
||||
if self._cron_task is not None:
|
||||
self._cron_task.cancel()
|
||||
|
||||
def get(self, key: str) -> Optional[Any]:
|
||||
"""
|
||||
从缓存中获取键的值
|
||||
:param key:
|
||||
:return:
|
||||
"""
|
||||
value, expire_time = self._cache_container.get(key, (None, 0))
|
||||
if value is None:
|
||||
return None
|
||||
|
||||
# 如果键已过期,则删除键并返回None
|
||||
if expire_time < time.time():
|
||||
del self._cache_container[key]
|
||||
return None
|
||||
|
||||
return value
|
||||
|
||||
def set(self, key: str, value: Any, expire_time: int) -> None:
|
||||
"""
|
||||
将键的值设置到缓存中
|
||||
:param key:
|
||||
:param value:
|
||||
:param expire_time:
|
||||
:return:
|
||||
"""
|
||||
self._cache_container[key] = (value, time.time() + expire_time)
|
||||
|
||||
def keys(self, pattern: str) -> List[str]:
|
||||
"""
|
||||
获取所有符合pattern的key
|
||||
:param pattern: 匹配模式
|
||||
:return:
|
||||
"""
|
||||
if pattern == '*':
|
||||
return list(self._cache_container.keys())
|
||||
|
||||
# 本地缓存通配符暂时将*替换为空
|
||||
if '*' in pattern:
|
||||
pattern = pattern.replace('*', '')
|
||||
|
||||
return [key for key in self._cache_container.keys() if pattern in key]
|
||||
|
||||
def _schedule_clear(self):
|
||||
"""
|
||||
开启定时清理任务,
|
||||
:return:
|
||||
"""
|
||||
|
||||
try:
|
||||
loop = asyncio.get_event_loop()
|
||||
except RuntimeError:
|
||||
loop = asyncio.new_event_loop()
|
||||
asyncio.set_event_loop(loop)
|
||||
|
||||
self._cron_task = loop.create_task(self._start_clear_cron())
|
||||
|
||||
def _clear(self):
|
||||
"""
|
||||
根据过期时间清理缓存
|
||||
:return:
|
||||
"""
|
||||
for key, (value, expire_time) in self._cache_container.items():
|
||||
if expire_time < time.time():
|
||||
del self._cache_container[key]
|
||||
|
||||
async def _start_clear_cron(self):
|
||||
"""
|
||||
开启定时清理任务
|
||||
:return:
|
||||
"""
|
||||
while True:
|
||||
self._clear()
|
||||
await asyncio.sleep(self._cron_interval)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
cache = ExpiringLocalCache(cron_interval=2)
|
||||
cache.set('name', '程序员阿江-Relakkes', 3)
|
||||
print(cache.get('key'))
|
||||
print(cache.keys("*"))
|
||||
time.sleep(4)
|
||||
print(cache.get('key'))
|
||||
del cache
|
||||
time.sleep(1)
|
||||
print("done")
|
||||
@@ -0,0 +1,87 @@
|
||||
# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则:
|
||||
# 1. 不得用于任何商业用途。
|
||||
# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。
|
||||
# 3. 不得进行大规模爬取或对平台造成运营干扰。
|
||||
# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。
|
||||
# 5. 不得用于任何非法或不当的用途。
|
||||
#
|
||||
# 详细许可条款请参阅项目根目录下的LICENSE文件。
|
||||
# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。
|
||||
|
||||
|
||||
# -*- coding: utf-8 -*-
|
||||
# @Author : relakkes@gmail.com
|
||||
# @Name : 程序员阿江-Relakkes
|
||||
# @Time : 2024/5/29 22:57
|
||||
# @Desc : RedisCache实现
|
||||
import pickle
|
||||
import time
|
||||
from typing import Any, List
|
||||
|
||||
from redis import Redis
|
||||
|
||||
from cache.abs_cache import AbstractCache
|
||||
from config import db_config
|
||||
|
||||
|
||||
class RedisCache(AbstractCache):
|
||||
|
||||
def __init__(self) -> None:
|
||||
# 连接redis, 返回redis客户端
|
||||
self._redis_client = self._connet_redis()
|
||||
|
||||
@staticmethod
|
||||
def _connet_redis() -> Redis:
|
||||
"""
|
||||
连接redis, 返回redis客户端, 这里按需配置redis连接信息
|
||||
:return:
|
||||
"""
|
||||
return Redis(
|
||||
host=db_config.REDIS_DB_HOST,
|
||||
port=db_config.REDIS_DB_PORT,
|
||||
db=db_config.REDIS_DB_NUM,
|
||||
password=db_config.REDIS_DB_PWD,
|
||||
)
|
||||
|
||||
def get(self, key: str) -> Any:
|
||||
"""
|
||||
从缓存中获取键的值, 并且反序列化
|
||||
:param key:
|
||||
:return:
|
||||
"""
|
||||
value = self._redis_client.get(key)
|
||||
if value is None:
|
||||
return None
|
||||
return pickle.loads(value)
|
||||
|
||||
def set(self, key: str, value: Any, expire_time: int) -> None:
|
||||
"""
|
||||
将键的值设置到缓存中, 并且序列化
|
||||
:param key:
|
||||
:param value:
|
||||
:param expire_time:
|
||||
:return:
|
||||
"""
|
||||
self._redis_client.set(key, pickle.dumps(value), ex=expire_time)
|
||||
|
||||
def keys(self, pattern: str) -> List[str]:
|
||||
"""
|
||||
获取所有符合pattern的key
|
||||
"""
|
||||
return [key.decode() for key in self._redis_client.keys(pattern)]
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
redis_cache = RedisCache()
|
||||
# basic usage
|
||||
redis_cache.set("name", "程序员阿江-Relakkes", 1)
|
||||
print(redis_cache.get("name")) # Relakkes
|
||||
print(redis_cache.keys("*")) # ['name']
|
||||
time.sleep(2)
|
||||
print(redis_cache.get("name")) # None
|
||||
|
||||
# special python type usage
|
||||
# list
|
||||
redis_cache.set("list", [1, 2, 3], 10)
|
||||
_value = redis_cache.get("list")
|
||||
print(_value, f"value type:{type(_value)}") # [1, 2, 3]
|
||||
@@ -0,0 +1,12 @@
|
||||
# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则:
|
||||
# 1. 不得用于任何商业用途。
|
||||
# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。
|
||||
# 3. 不得进行大规模爬取或对平台造成运营干扰。
|
||||
# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。
|
||||
# 5. 不得用于任何非法或不当的用途。
|
||||
#
|
||||
# 详细许可条款请参阅项目根目录下的LICENSE文件。
|
||||
# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。
|
||||
|
||||
|
||||
from .arg import *
|
||||
@@ -0,0 +1,55 @@
|
||||
# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则:
|
||||
# 1. 不得用于任何商业用途。
|
||||
# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。
|
||||
# 3. 不得进行大规模爬取或对平台造成运营干扰。
|
||||
# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。
|
||||
# 5. 不得用于任何非法或不当的用途。
|
||||
#
|
||||
# 详细许可条款请参阅项目根目录下的LICENSE文件。
|
||||
# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。
|
||||
|
||||
|
||||
import argparse
|
||||
|
||||
import config
|
||||
from tools.utils import str2bool
|
||||
|
||||
|
||||
async def parse_cmd():
|
||||
# 读取command arg
|
||||
parser = argparse.ArgumentParser(description='Media crawler program. / 媒体爬虫程序')
|
||||
parser.add_argument('--platform', type=str,
|
||||
help='Media platform select / 选择媒体平台 (xhs=小红书 | dy=抖音 | ks=快手 | bili=哔哩哔哩 | wb=微博 | tieba=百度贴吧 | zhihu=知乎)',
|
||||
choices=["xhs", "dy", "ks", "bili", "wb", "tieba", "zhihu"], default=config.PLATFORM)
|
||||
parser.add_argument('--lt', type=str,
|
||||
help='Login type / 登录方式 (qrcode=二维码 | phone=手机号 | cookie=Cookie)',
|
||||
choices=["qrcode", "phone", "cookie"], default=config.LOGIN_TYPE)
|
||||
parser.add_argument('--type', type=str,
|
||||
help='Crawler type / 爬取类型 (search=搜索 | detail=详情 | creator=创作者)',
|
||||
choices=["search", "detail", "creator"], default=config.CRAWLER_TYPE)
|
||||
parser.add_argument('--start', type=int,
|
||||
help='Number of start page / 起始页码', default=config.START_PAGE)
|
||||
parser.add_argument('--keywords', type=str,
|
||||
help='Please input keywords / 请输入关键词', default=config.KEYWORDS)
|
||||
parser.add_argument('--get_comment', type=str2bool,
|
||||
help='''Whether to crawl level one comment / 是否爬取一级评论, supported values case insensitive / 支持的值(不区分大小写) ('yes', 'true', 't', 'y', '1', 'no', 'false', 'f', 'n', '0')''', default=config.ENABLE_GET_COMMENTS)
|
||||
parser.add_argument('--get_sub_comment', type=str2bool,
|
||||
help=''''Whether to crawl level two comment / 是否爬取二级评论, supported values case insensitive / 支持的值(不区分大小写) ('yes', 'true', 't', 'y', '1', 'no', 'false', 'f', 'n', '0')''', default=config.ENABLE_GET_SUB_COMMENTS)
|
||||
parser.add_argument('--save_data_option', type=str,
|
||||
help='Where to save the data / 数据保存方式 (csv=CSV文件 | db=MySQL数据库 | json=JSON文件 | sqlite=SQLite数据库)',
|
||||
choices=['csv', 'db', 'json', 'sqlite'], default=config.SAVE_DATA_OPTION)
|
||||
parser.add_argument('--cookies', type=str,
|
||||
help='Cookies used for cookie login type / Cookie登录方式使用的Cookie值', default=config.COOKIES)
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
# override config
|
||||
config.PLATFORM = args.platform
|
||||
config.LOGIN_TYPE = args.lt
|
||||
config.CRAWLER_TYPE = args.type
|
||||
config.START_PAGE = args.start
|
||||
config.KEYWORDS = args.keywords
|
||||
config.ENABLE_GET_COMMENTS = args.get_comment
|
||||
config.ENABLE_GET_SUB_COMMENTS = args.get_sub_comment
|
||||
config.SAVE_DATA_OPTION = args.save_data_option
|
||||
config.COOKIES = args.cookies
|
||||
@@ -0,0 +1,14 @@
|
||||
# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则:
|
||||
# 1. 不得用于任何商业用途。
|
||||
# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。
|
||||
# 3. 不得进行大规模爬取或对平台造成运营干扰。
|
||||
# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。
|
||||
# 5. 不得用于任何非法或不当的用途。
|
||||
#
|
||||
# 详细许可条款请参阅项目根目录下的LICENSE文件。
|
||||
# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。
|
||||
|
||||
|
||||
from .base_config import *
|
||||
from .db_config import *
|
||||
from .tieba_config import *
|
||||
@@ -0,0 +1,115 @@
|
||||
# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则:
|
||||
# 1. 不得用于任何商业用途。
|
||||
# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。
|
||||
# 3. 不得进行大规模爬取或对平台造成运营干扰。
|
||||
# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。
|
||||
# 5. 不得用于任何非法或不当的用途。
|
||||
#
|
||||
# 详细许可条款请参阅项目根目录下的LICENSE文件。
|
||||
# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。
|
||||
|
||||
# 基础配置
|
||||
PLATFORM = "xhs" # 平台,xhs | dy | ks | bili | wb | tieba | zhihu
|
||||
KEYWORDS = "黑神话钟馗,九三阅兵,种地吧,董璇,非亲生,医美风险,游戏科学,阅兵准备,热巴,醉驾判无罪" # 关键词搜索配置,以英文逗号分隔
|
||||
LOGIN_TYPE = "qrcode" # qrcode or phone or cookie
|
||||
COOKIES = ""
|
||||
CRAWLER_TYPE = "search" # 爬取类型,search(关键词搜索) | detail(帖子详情)| creator(创作者主页数据)
|
||||
# 是否开启 IP 代理
|
||||
ENABLE_IP_PROXY = False
|
||||
|
||||
# 代理IP池数量
|
||||
IP_PROXY_POOL_COUNT = 2
|
||||
|
||||
# 代理IP提供商名称
|
||||
IP_PROXY_PROVIDER_NAME = "kuaidaili" # kuaidaili | wandouhttp
|
||||
|
||||
# 设置为True不会打开浏览器(无头浏览器)
|
||||
# 设置False会打开一个浏览器
|
||||
# 小红书如果一直扫码登录不通过,打开浏览器手动过一下滑动验证码
|
||||
# 抖音如果一直提示失败,打开浏览器看下是否扫码登录之后出现了手机号验证,如果出现了手动过一下再试。
|
||||
HEADLESS = True
|
||||
|
||||
# 是否保存登录状态
|
||||
SAVE_LOGIN_STATE = True
|
||||
|
||||
# ==================== CDP (Chrome DevTools Protocol) 配置 ====================
|
||||
# 是否启用CDP模式 - 使用用户现有的Chrome/Edge浏览器进行爬取,提供更好的反检测能力
|
||||
# 启用后将自动检测并启动用户的Chrome/Edge浏览器,通过CDP协议进行控制
|
||||
# 这种方式使用真实的浏览器环境,包括用户的扩展、Cookie和设置,大大降低被检测的风险
|
||||
ENABLE_CDP_MODE = False
|
||||
|
||||
# CDP调试端口,用于与浏览器通信
|
||||
# 如果端口被占用,系统会自动尝试下一个可用端口
|
||||
CDP_DEBUG_PORT = 9222
|
||||
|
||||
# 自定义浏览器路径(可选)
|
||||
# 如果为空,系统会自动检测Chrome/Edge的安装路径
|
||||
# Windows示例: "C:\\Program Files\\Google\\Chrome\\Application\\chrome.exe"
|
||||
# macOS示例: "/Applications/Google Chrome.app/Contents/MacOS/Google Chrome"
|
||||
CUSTOM_BROWSER_PATH = ""
|
||||
|
||||
# CDP模式下是否启用无头模式
|
||||
# 注意:即使设置为True,某些反检测功能在无头模式下可能效果不佳
|
||||
CDP_HEADLESS = False
|
||||
|
||||
# 浏览器启动超时时间(秒)
|
||||
BROWSER_LAUNCH_TIMEOUT = 30
|
||||
|
||||
# 是否在程序结束时自动关闭浏览器
|
||||
# 设置为False可以保持浏览器运行,便于调试
|
||||
AUTO_CLOSE_BROWSER = True
|
||||
|
||||
# 数据保存类型选项配置,支持四种类型:csv、db、json、sqlite, 最好保存到DB,有排重的功能。
|
||||
SAVE_DATA_OPTION = "db" # csv or db or json or sqlite
|
||||
|
||||
# 用户浏览器缓存的浏览器文件配置
|
||||
USER_DATA_DIR = "%s_user_data_dir" # %s will be replaced by platform name
|
||||
|
||||
# 爬取开始页数 默认从第一页开始
|
||||
START_PAGE = 1
|
||||
|
||||
# 爬取视频/帖子的数量控制
|
||||
CRAWLER_MAX_NOTES_COUNT = 10
|
||||
|
||||
# 并发爬虫数量控制
|
||||
MAX_CONCURRENCY_NUM = 1
|
||||
|
||||
# 是否开启爬媒体模式(包含图片或视频资源),默认不开启爬媒体
|
||||
ENABLE_GET_MEIDAS = False
|
||||
|
||||
# 是否开启爬评论模式, 默认开启爬评论
|
||||
ENABLE_GET_COMMENTS = True
|
||||
|
||||
# 爬取一级评论的数量控制(单视频/帖子)
|
||||
CRAWLER_MAX_COMMENTS_COUNT_SINGLENOTES = 20
|
||||
|
||||
# 是否开启爬二级评论模式, 默认不开启爬二级评论
|
||||
# 老版本项目使用了 db, 则需参考 schema/tables.sql line 287 增加表字段
|
||||
ENABLE_GET_SUB_COMMENTS = False
|
||||
|
||||
# 词云相关
|
||||
# 是否开启生成评论词云图
|
||||
ENABLE_GET_WORDCLOUD = False
|
||||
# 自定义词语及其分组
|
||||
# 添加规则:xx:yy 其中xx为自定义添加的词组,yy为将xx该词组分到的组名。
|
||||
CUSTOM_WORDS = {
|
||||
"零几": "年份", # 将“零几”识别为一个整体
|
||||
"高频词": "专业术语", # 示例自定义词
|
||||
}
|
||||
|
||||
# 停用(禁用)词文件路径
|
||||
STOP_WORDS_FILE = "./docs/hit_stopwords.txt"
|
||||
|
||||
# 中文字体文件路径
|
||||
FONT_PATH = "./docs/STZHONGS.TTF"
|
||||
|
||||
# 爬取间隔时间
|
||||
CRAWLER_MAX_SLEEP_SEC = 2
|
||||
|
||||
from .bilibili_config import *
|
||||
from .xhs_config import *
|
||||
from .dy_config import *
|
||||
from .ks_config import *
|
||||
from .weibo_config import *
|
||||
from .tieba_config import *
|
||||
from .zhihu_config import *
|
||||
@@ -0,0 +1,47 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则:
|
||||
# 1. 不得用于任何商业用途。
|
||||
# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。
|
||||
# 3. 不得进行大规模爬取或对平台造成运营干扰。
|
||||
# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。
|
||||
# 5. 不得用于任何非法或不当的用途。
|
||||
#
|
||||
# 详细许可条款请参阅项目根目录下的LICENSE文件。
|
||||
# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。
|
||||
# bilili 平台配置
|
||||
|
||||
# 每天爬取视频/帖子的数量控制
|
||||
MAX_NOTES_PER_DAY = 1
|
||||
|
||||
# 指定B站视频ID列表
|
||||
BILI_SPECIFIED_ID_LIST = [
|
||||
"BV1d54y1g7db",
|
||||
"BV1Sz4y1U77N",
|
||||
"BV14Q4y1n7jz",
|
||||
# ........................
|
||||
]
|
||||
|
||||
# 指定B站用户ID列表
|
||||
BILI_CREATOR_ID_LIST = [
|
||||
"20813884",
|
||||
# ........................
|
||||
]
|
||||
|
||||
# 指定时间范围
|
||||
START_DAY = "2024-01-01"
|
||||
END_DAY = "2024-01-01"
|
||||
|
||||
# 搜索模式
|
||||
BILI_SEARCH_MODE = "normal"
|
||||
|
||||
# 是否爬取用户信息
|
||||
CREATOR_MODE = True
|
||||
|
||||
# 开始爬取用户信息页码
|
||||
START_CONTACTS_PAGE = 1
|
||||
|
||||
# 单个视频/帖子最大爬取评论数
|
||||
CRAWLER_MAX_CONTACTS_COUNT_SINGLENOTES = 100
|
||||
|
||||
# 单个视频/帖子最大爬取动态数
|
||||
CRAWLER_MAX_DYNAMICS_COUNT_SINGLENOTES = 50
|
||||
@@ -0,0 +1,33 @@
|
||||
# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则:
|
||||
# 1. 不得用于任何商业用途。
|
||||
# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。
|
||||
# 3. 不得进行大规模爬取或对平台造成运营干扰。
|
||||
# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。
|
||||
# 5. 不得用于任何非法或不当的用途。
|
||||
#
|
||||
# 详细许可条款请参阅项目根目录下的LICENSE文件。
|
||||
# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。
|
||||
|
||||
|
||||
import os
|
||||
|
||||
# mysql config - 使用MindSpider的数据库配置
|
||||
MYSQL_DB_PWD = "mneDccc7sHHANtFk"
|
||||
MYSQL_DB_USER = "root"
|
||||
MYSQL_DB_HOST = "rm-2zeib6b13f6tt9kncoo.mysql.rds.aliyuncs.com"
|
||||
MYSQL_DB_PORT = 3306
|
||||
MYSQL_DB_NAME = "mindspider"
|
||||
|
||||
|
||||
# redis config
|
||||
REDIS_DB_HOST = "127.0.0.1" # your redis host
|
||||
REDIS_DB_PWD = os.getenv("REDIS_DB_PWD", "123456") # your redis password
|
||||
REDIS_DB_PORT = os.getenv("REDIS_DB_PORT", 6379) # your redis port
|
||||
REDIS_DB_NUM = os.getenv("REDIS_DB_NUM", 0) # your redis db num
|
||||
|
||||
# cache type
|
||||
CACHE_TYPE_REDIS = "redis"
|
||||
CACHE_TYPE_MEMORY = "memory"
|
||||
|
||||
# sqlite config
|
||||
SQLITE_DB_PATH = os.path.join(os.path.dirname(os.path.dirname(__file__)), "schema", "sqlite_tables.db")
|
||||
@@ -0,0 +1,25 @@
|
||||
# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则:
|
||||
# 1. 不得用于任何商业用途。
|
||||
# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。
|
||||
# 3. 不得进行大规模爬取或对平台造成运营干扰。
|
||||
# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。
|
||||
# 5. 不得用于任何非法或不当的用途。
|
||||
#
|
||||
# 详细许可条款请参阅项目根目录下的LICENSE文件。
|
||||
# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。
|
||||
|
||||
# 抖音平台配置
|
||||
PUBLISH_TIME_TYPE = 0
|
||||
|
||||
# 指定DY视频ID列表
|
||||
DY_SPECIFIED_ID_LIST = [
|
||||
"7280854932641664319",
|
||||
"7202432992642387233",
|
||||
# ........................
|
||||
]
|
||||
|
||||
# 指定DY用户ID列表
|
||||
DY_CREATOR_ID_LIST = [
|
||||
"MS4wLjABAAAATJPY7LAlaa5X-c8uNdWkvz0jUGgpw4eeXIwu_8BhvqE",
|
||||
# ........................
|
||||
]
|
||||
@@ -0,0 +1,20 @@
|
||||
# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则:
|
||||
# 1. 不得用于任何商业用途。
|
||||
# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。
|
||||
# 3. 不得进行大规模爬取或对平台造成运营干扰。
|
||||
# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。
|
||||
# 5. 不得用于任何非法或不当的用途。
|
||||
#
|
||||
# 详细许可条款请参阅项目根目录下的LICENSE文件。
|
||||
# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。
|
||||
|
||||
# 快手平台配置
|
||||
|
||||
# 指定快手视频ID列表
|
||||
KS_SPECIFIED_ID_LIST = ["3xf8enb8dbj6uig", "3x6zz972bchmvqe"]
|
||||
|
||||
# 指定快手用户ID列表
|
||||
KS_CREATOR_ID_LIST = [
|
||||
"3x4sm73aye7jq7i",
|
||||
# ........................
|
||||
]
|
||||
@@ -0,0 +1,25 @@
|
||||
# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则:
|
||||
# 1. 不得用于任何商业用途。
|
||||
# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。
|
||||
# 3. 不得进行大规模爬取或对平台造成运营干扰。
|
||||
# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。
|
||||
# 5. 不得用于任何非法或不当的用途。
|
||||
#
|
||||
# 详细许可条款请参阅项目根目录下的LICENSE文件。
|
||||
# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。
|
||||
|
||||
# 贴吧平台配置
|
||||
|
||||
# 指定贴吧ID列表
|
||||
TIEBA_SPECIFIED_ID_LIST = []
|
||||
|
||||
# 指定贴吧名称列表
|
||||
TIEBA_NAME_LIST = [
|
||||
# "盗墓笔记"
|
||||
]
|
||||
|
||||
# 指定贴吧用户URL列表
|
||||
TIEBA_CREATOR_URL_LIST = [
|
||||
"https://tieba.baidu.com/home/main/?id=tb.1.7f139e2e.6CyEwxu3VJruH_-QqpCi6g&fr=frs",
|
||||
# ........................
|
||||
]
|
||||
@@ -0,0 +1,27 @@
|
||||
# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则:
|
||||
# 1. 不得用于任何商业用途。
|
||||
# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。
|
||||
# 3. 不得进行大规模爬取或对平台造成运营干扰。
|
||||
# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。
|
||||
# 5. 不得用于任何非法或不当的用途。
|
||||
#
|
||||
# 详细许可条款请参阅项目根目录下的LICENSE文件。
|
||||
# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。
|
||||
|
||||
|
||||
# 微博平台配置
|
||||
|
||||
# 搜索类型,具体的枚举值在media_platform/weibo/field.py中
|
||||
WEIBO_SEARCH_TYPE = "popular"
|
||||
|
||||
# 指定微博ID列表
|
||||
WEIBO_SPECIFIED_ID_LIST = [
|
||||
"4982041758140155",
|
||||
# ........................
|
||||
]
|
||||
|
||||
# 指定微博用户ID列表
|
||||
WEIBO_CREATOR_ID_LIST = [
|
||||
"5533390220",
|
||||
# ........................
|
||||
]
|
||||
@@ -0,0 +1,28 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则:
|
||||
# 1. 不得用于任何商业用途。
|
||||
# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。
|
||||
# 3. 不得进行大规模爬取或对平台造成运营干扰。
|
||||
# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。
|
||||
# 5. 不得用于任何非法或不当的用途。
|
||||
#
|
||||
# 详细许可条款请参阅项目根目录下的LICENSE文件。
|
||||
# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。
|
||||
|
||||
|
||||
# 小红书平台配置
|
||||
|
||||
# 排序方式,具体的枚举值在media_platform/xhs/field.py中
|
||||
SORT_TYPE = "popularity_descending"
|
||||
|
||||
# 指定笔记URL列表, 必须要携带xsec_token参数
|
||||
XHS_SPECIFIED_NOTE_URL_LIST = [
|
||||
"https://www.xiaohongshu.com/explore/66fad51c000000001b0224b8?xsec_token=AB3rO-QopW5sgrJ41GwN01WCXh6yWPxjSoFI9D5JIMgKw=&xsec_source=pc_search"
|
||||
# ........................
|
||||
]
|
||||
|
||||
# 指定用户ID列表
|
||||
XHS_CREATOR_ID_LIST = [
|
||||
"63e36c9a000000002703502b",
|
||||
# ........................
|
||||
]
|
||||
@@ -0,0 +1,25 @@
|
||||
# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则:
|
||||
# 1. 不得用于任何商业用途。
|
||||
# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。
|
||||
# 3. 不得进行大规模爬取或对平台造成运营干扰。
|
||||
# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。
|
||||
# 5. 不得用于任何非法或不当的用途。
|
||||
#
|
||||
# 详细许可条款请参阅项目根目录下的LICENSE文件。
|
||||
# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。
|
||||
|
||||
|
||||
# 知乎平台配置
|
||||
|
||||
# 指定知乎用户URL列表
|
||||
ZHIHU_CREATOR_URL_LIST = [
|
||||
"https://www.zhihu.com/people/yd1234567",
|
||||
# ........................
|
||||
]
|
||||
|
||||
# 指定知乎ID列表
|
||||
ZHIHU_SPECIFIED_ID_LIST = [
|
||||
"https://www.zhihu.com/question/826896610/answer/4885821440", # 回答
|
||||
"https://zhuanlan.zhihu.com/p/673461588", # 文章
|
||||
"https://www.zhihu.com/zvideo/1539542068422144000", # 视频
|
||||
]
|
||||
@@ -0,0 +1,12 @@
|
||||
# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则:
|
||||
# 1. 不得用于任何商业用途。
|
||||
# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。
|
||||
# 3. 不得进行大规模爬取或对平台造成运营干扰。
|
||||
# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。
|
||||
# 5. 不得用于任何非法或不当的用途。
|
||||
#
|
||||
# 详细许可条款请参阅项目根目录下的LICENSE文件。
|
||||
# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。
|
||||
|
||||
|
||||
# -*- coding: utf-8 -*-
|
||||
@@ -0,0 +1,14 @@
|
||||
# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则:
|
||||
# 1. 不得用于任何商业用途。
|
||||
# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。
|
||||
# 3. 不得进行大规模爬取或对平台造成运营干扰。
|
||||
# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。
|
||||
# 5. 不得用于任何非法或不当的用途。
|
||||
#
|
||||
# 详细许可条款请参阅项目根目录下的LICENSE文件。
|
||||
# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。
|
||||
|
||||
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
TIEBA_URL = 'https://tieba.baidu.com'
|
||||
@@ -0,0 +1,19 @@
|
||||
# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则:
|
||||
# 1. 不得用于任何商业用途。
|
||||
# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。
|
||||
# 3. 不得进行大规模爬取或对平台造成运营干扰。
|
||||
# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。
|
||||
# 5. 不得用于任何非法或不当的用途。
|
||||
#
|
||||
# 详细许可条款请参阅项目根目录下的LICENSE文件。
|
||||
# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。
|
||||
|
||||
|
||||
# -*- coding: utf-8 -*-
|
||||
ZHIHU_URL = "https://www.zhihu.com"
|
||||
ZHIHU_ZHUANLAN_URL = "https://zhuanlan.zhihu.com"
|
||||
|
||||
ANSWER_NAME = "answer"
|
||||
ARTICLE_NAME = "article"
|
||||
VIDEO_NAME = "zvideo"
|
||||
|
||||
@@ -0,0 +1,209 @@
|
||||
# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则:
|
||||
# 1. 不得用于任何商业用途。
|
||||
# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。
|
||||
# 3. 不得进行大规模爬取或对平台造成运营干扰。
|
||||
# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。
|
||||
# 5. 不得用于任何非法或不当的用途。
|
||||
#
|
||||
# 详细许可条款请参阅项目根目录下的LICENSE文件。
|
||||
# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。
|
||||
|
||||
|
||||
# -*- coding: utf-8 -*-
|
||||
# @Author : relakkes@gmail.com
|
||||
# @Time : 2024/4/6 14:54
|
||||
# @Desc : mediacrawler db 管理
|
||||
import asyncio
|
||||
from typing import Dict
|
||||
from urllib.parse import urlparse
|
||||
|
||||
import aiofiles
|
||||
import aiomysql
|
||||
|
||||
import config
|
||||
from async_db import AsyncMysqlDB
|
||||
from async_sqlite_db import AsyncSqliteDB
|
||||
from tools import utils
|
||||
from var import db_conn_pool_var, media_crawler_db_var
|
||||
|
||||
|
||||
async def init_mediacrawler_db():
|
||||
"""
|
||||
初始化数据库链接池对象,并将该对象塞给media_crawler_db_var上下文变量
|
||||
Returns:
|
||||
|
||||
"""
|
||||
pool = await aiomysql.create_pool(
|
||||
host=config.MYSQL_DB_HOST,
|
||||
port=config.MYSQL_DB_PORT,
|
||||
user=config.MYSQL_DB_USER,
|
||||
password=config.MYSQL_DB_PWD,
|
||||
db=config.MYSQL_DB_NAME,
|
||||
autocommit=True,
|
||||
)
|
||||
async_db_obj = AsyncMysqlDB(pool)
|
||||
|
||||
# 将连接池对象和封装的CRUD sql接口对象放到上下文变量中
|
||||
db_conn_pool_var.set(pool)
|
||||
media_crawler_db_var.set(async_db_obj)
|
||||
|
||||
|
||||
async def init_sqlite_db():
|
||||
"""
|
||||
初始化SQLite数据库对象,并将该对象塞给media_crawler_db_var上下文变量
|
||||
Returns:
|
||||
|
||||
"""
|
||||
async_db_obj = AsyncSqliteDB(config.SQLITE_DB_PATH)
|
||||
|
||||
# 将SQLite数据库对象放到上下文变量中
|
||||
media_crawler_db_var.set(async_db_obj)
|
||||
|
||||
|
||||
async def init_db():
|
||||
"""
|
||||
初始化db连接池
|
||||
Returns:
|
||||
|
||||
"""
|
||||
utils.logger.info("[init_db] start init mediacrawler db connect object")
|
||||
if config.SAVE_DATA_OPTION == "sqlite":
|
||||
await init_sqlite_db()
|
||||
utils.logger.info("[init_db] end init sqlite db connect object")
|
||||
else:
|
||||
await init_mediacrawler_db()
|
||||
utils.logger.info("[init_db] end init mysql db connect object")
|
||||
|
||||
|
||||
async def close():
|
||||
"""
|
||||
关闭数据库连接
|
||||
Returns:
|
||||
|
||||
"""
|
||||
utils.logger.info("[close] close mediacrawler db connection")
|
||||
if config.SAVE_DATA_OPTION == "sqlite":
|
||||
# SQLite数据库连接会在AsyncSqliteDB对象销毁时自动关闭
|
||||
utils.logger.info("[close] sqlite db connection will be closed automatically")
|
||||
else:
|
||||
# MySQL连接池关闭
|
||||
db_pool: aiomysql.Pool = db_conn_pool_var.get()
|
||||
if db_pool is not None:
|
||||
db_pool.close()
|
||||
utils.logger.info("[close] mysql db pool closed")
|
||||
|
||||
|
||||
async def init_table_schema(db_type: str = None):
|
||||
"""
|
||||
用来初始化数据库表结构,请在第一次需要创建表结构的时候使用,多次执行该函数会将已有的表以及数据全部删除
|
||||
Args:
|
||||
db_type: 数据库类型,可选值为 'sqlite' 或 'mysql',如果不指定则使用配置文件中的设置
|
||||
Returns:
|
||||
|
||||
"""
|
||||
# 如果没有指定数据库类型,则使用配置文件中的设置
|
||||
if db_type is None:
|
||||
db_type = config.SAVE_DATA_OPTION
|
||||
|
||||
if db_type == "sqlite":
|
||||
utils.logger.info("[init_table_schema] begin init sqlite table schema ...")
|
||||
|
||||
# 检查并删除可能存在的损坏数据库文件
|
||||
import os
|
||||
if os.path.exists(config.SQLITE_DB_PATH):
|
||||
try:
|
||||
# 尝试删除现有的数据库文件
|
||||
os.remove(config.SQLITE_DB_PATH)
|
||||
utils.logger.info(f"[init_table_schema] removed existing sqlite db file: {config.SQLITE_DB_PATH}")
|
||||
except Exception as e:
|
||||
utils.logger.warning(f"[init_table_schema] failed to remove existing sqlite db file: {e}")
|
||||
# 如果删除失败,尝试重命名文件
|
||||
try:
|
||||
backup_path = f"{config.SQLITE_DB_PATH}.backup_{utils.get_current_timestamp()}"
|
||||
os.rename(config.SQLITE_DB_PATH, backup_path)
|
||||
utils.logger.info(f"[init_table_schema] renamed existing sqlite db file to: {backup_path}")
|
||||
except Exception as rename_e:
|
||||
utils.logger.error(f"[init_table_schema] failed to rename existing sqlite db file: {rename_e}")
|
||||
raise rename_e
|
||||
|
||||
await init_sqlite_db()
|
||||
async_db_obj: AsyncSqliteDB = media_crawler_db_var.get()
|
||||
async with aiofiles.open("schema/sqlite_tables.sql", mode="r", encoding="utf-8") as f:
|
||||
schema_sql = await f.read()
|
||||
await async_db_obj.executescript(schema_sql)
|
||||
utils.logger.info("[init_table_schema] sqlite table schema init successful")
|
||||
elif db_type == "mysql":
|
||||
utils.logger.info("[init_table_schema] begin init mysql table schema ...")
|
||||
await init_mediacrawler_db()
|
||||
async_db_obj: AsyncMysqlDB = media_crawler_db_var.get()
|
||||
async with aiofiles.open("schema/tables.sql", mode="r", encoding="utf-8") as f:
|
||||
schema_sql = await f.read()
|
||||
await async_db_obj.execute(schema_sql)
|
||||
utils.logger.info("[init_table_schema] mysql table schema init successful")
|
||||
await close()
|
||||
else:
|
||||
utils.logger.error(f"[init_table_schema] 不支持的数据库类型: {db_type}")
|
||||
raise ValueError(f"不支持的数据库类型: {db_type},支持的类型: sqlite, mysql")
|
||||
|
||||
|
||||
def show_database_options():
|
||||
"""
|
||||
显示支持的数据库选项
|
||||
"""
|
||||
print("\n=== MediaCrawler 数据库初始化工具 ===")
|
||||
print("支持的数据库类型:")
|
||||
print("1. sqlite - SQLite 数据库 (轻量级,无需额外配置)")
|
||||
print("2. mysql - MySQL 数据库 (需要配置数据库连接信息)")
|
||||
print("3. config - 使用配置文件中的设置")
|
||||
print("4. exit - 退出程序")
|
||||
print("="*50)
|
||||
|
||||
|
||||
def get_user_choice():
|
||||
"""
|
||||
获取用户选择的数据库类型
|
||||
Returns:
|
||||
str: 用户选择的数据库类型
|
||||
"""
|
||||
while True:
|
||||
choice = input("请输入数据库类型 (sqlite/mysql/config/exit): ").strip().lower()
|
||||
|
||||
if choice in ['sqlite', 'mysql', 'config', 'exit']:
|
||||
return choice
|
||||
else:
|
||||
print("❌ 无效的选择,请输入: sqlite, mysql, config 或 exit")
|
||||
|
||||
|
||||
async def main():
|
||||
"""
|
||||
主函数,处理用户交互和数据库初始化
|
||||
"""
|
||||
try:
|
||||
show_database_options()
|
||||
|
||||
while True:
|
||||
choice = get_user_choice()
|
||||
|
||||
if choice == 'exit':
|
||||
print("👋 程序已退出")
|
||||
break
|
||||
elif choice == 'config':
|
||||
print(f"📋 使用配置文件中的设置: {config.SAVE_DATA_OPTION}")
|
||||
await init_table_schema()
|
||||
print("✅ 数据库表结构初始化完成!")
|
||||
break
|
||||
else:
|
||||
print(f"🚀 开始初始化 {choice.upper()} 数据库...")
|
||||
await init_table_schema(choice)
|
||||
print("✅ 数据库表结构初始化完成!")
|
||||
break
|
||||
|
||||
except KeyboardInterrupt:
|
||||
print("\n\n⚠️ 用户中断操作")
|
||||
except Exception as e:
|
||||
print(f"\n❌ 初始化失败: {str(e)}")
|
||||
utils.logger.error(f"[main] 数据库初始化失败: {str(e)}")
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
asyncio.get_event_loop().run_until_complete(main())
|
||||
@@ -0,0 +1,435 @@
|
||||
// All the content in this article is only for learning and communication use, not for any other purpose, strictly prohibited for commercial use and illegal use, otherwise all the consequences are irrelevant to the author!
|
||||
// copy from https://github.com/ShilongLee/Crawler/tree/main/lib/js thanks for ShilongLee
|
||||
function rc4_encrypt(plaintext, key) {
|
||||
var s = [];
|
||||
for (var i = 0; i < 256; i++) {
|
||||
s[i] = i;
|
||||
}
|
||||
var j = 0;
|
||||
for (var i = 0; i < 256; i++) {
|
||||
j = (j + s[i] + key.charCodeAt(i % key.length)) % 256;
|
||||
var temp = s[i];
|
||||
s[i] = s[j];
|
||||
s[j] = temp;
|
||||
}
|
||||
|
||||
var i = 0;
|
||||
var j = 0;
|
||||
var cipher = [];
|
||||
for (var k = 0; k < plaintext.length; k++) {
|
||||
i = (i + 1) % 256;
|
||||
j = (j + s[i]) % 256;
|
||||
var temp = s[i];
|
||||
s[i] = s[j];
|
||||
s[j] = temp;
|
||||
var t = (s[i] + s[j]) % 256;
|
||||
cipher.push(String.fromCharCode(s[t] ^ plaintext.charCodeAt(k)));
|
||||
}
|
||||
return cipher.join('');
|
||||
}
|
||||
|
||||
function le(e, r) {
|
||||
return (e << (r %= 32) | e >>> 32 - r) >>> 0
|
||||
}
|
||||
|
||||
function de(e) {
|
||||
return 0 <= e && e < 16 ? 2043430169 : 16 <= e && e < 64 ? 2055708042 : void console['error']("invalid j for constant Tj")
|
||||
}
|
||||
|
||||
function pe(e, r, t, n) {
|
||||
return 0 <= e && e < 16 ? (r ^ t ^ n) >>> 0 : 16 <= e && e < 64 ? (r & t | r & n | t & n) >>> 0 : (console['error']('invalid j for bool function FF'),
|
||||
0)
|
||||
}
|
||||
|
||||
function he(e, r, t, n) {
|
||||
return 0 <= e && e < 16 ? (r ^ t ^ n) >>> 0 : 16 <= e && e < 64 ? (r & t | ~r & n) >>> 0 : (console['error']('invalid j for bool function GG'),
|
||||
0)
|
||||
}
|
||||
|
||||
function reset() {
|
||||
this.reg[0] = 1937774191,
|
||||
this.reg[1] = 1226093241,
|
||||
this.reg[2] = 388252375,
|
||||
this.reg[3] = 3666478592,
|
||||
this.reg[4] = 2842636476,
|
||||
this.reg[5] = 372324522,
|
||||
this.reg[6] = 3817729613,
|
||||
this.reg[7] = 2969243214,
|
||||
this["chunk"] = [],
|
||||
this["size"] = 0
|
||||
}
|
||||
|
||||
function write(e) {
|
||||
var a = "string" == typeof e ? function (e) {
|
||||
n = encodeURIComponent(e)['replace'](/%([0-9A-F]{2})/g, (function (e, r) {
|
||||
return String['fromCharCode']("0x" + r)
|
||||
}
|
||||
))
|
||||
, a = new Array(n['length']);
|
||||
return Array['prototype']['forEach']['call'](n, (function (e, r) {
|
||||
a[r] = e.charCodeAt(0)
|
||||
}
|
||||
)),
|
||||
a
|
||||
}(e) : e;
|
||||
this.size += a.length;
|
||||
var f = 64 - this['chunk']['length'];
|
||||
if (a['length'] < f)
|
||||
this['chunk'] = this['chunk'].concat(a);
|
||||
else
|
||||
for (this['chunk'] = this['chunk'].concat(a.slice(0, f)); this['chunk'].length >= 64;)
|
||||
this['_compress'](this['chunk']),
|
||||
f < a['length'] ? this['chunk'] = a['slice'](f, Math['min'](f + 64, a['length'])) : this['chunk'] = [],
|
||||
f += 64
|
||||
}
|
||||
|
||||
function sum(e, t) {
|
||||
e && (this['reset'](),
|
||||
this['write'](e)),
|
||||
this['_fill']();
|
||||
for (var f = 0; f < this.chunk['length']; f += 64)
|
||||
this._compress(this['chunk']['slice'](f, f + 64));
|
||||
var i = null;
|
||||
if (t == 'hex') {
|
||||
i = "";
|
||||
for (f = 0; f < 8; f++)
|
||||
i += se(this['reg'][f]['toString'](16), 8, "0")
|
||||
} else
|
||||
for (i = new Array(32),
|
||||
f = 0; f < 8; f++) {
|
||||
var c = this.reg[f];
|
||||
i[4 * f + 3] = (255 & c) >>> 0,
|
||||
c >>>= 8,
|
||||
i[4 * f + 2] = (255 & c) >>> 0,
|
||||
c >>>= 8,
|
||||
i[4 * f + 1] = (255 & c) >>> 0,
|
||||
c >>>= 8,
|
||||
i[4 * f] = (255 & c) >>> 0
|
||||
}
|
||||
return this['reset'](),
|
||||
i
|
||||
}
|
||||
|
||||
function _compress(t) {
|
||||
if (t < 64)
|
||||
console.error("compress error: not enough data");
|
||||
else {
|
||||
for (var f = function (e) {
|
||||
for (var r = new Array(132), t = 0; t < 16; t++)
|
||||
r[t] = e[4 * t] << 24,
|
||||
r[t] |= e[4 * t + 1] << 16,
|
||||
r[t] |= e[4 * t + 2] << 8,
|
||||
r[t] |= e[4 * t + 3],
|
||||
r[t] >>>= 0;
|
||||
for (var n = 16; n < 68; n++) {
|
||||
var a = r[n - 16] ^ r[n - 9] ^ le(r[n - 3], 15);
|
||||
a = a ^ le(a, 15) ^ le(a, 23),
|
||||
r[n] = (a ^ le(r[n - 13], 7) ^ r[n - 6]) >>> 0
|
||||
}
|
||||
for (n = 0; n < 64; n++)
|
||||
r[n + 68] = (r[n] ^ r[n + 4]) >>> 0;
|
||||
return r
|
||||
}(t), i = this['reg'].slice(0), c = 0; c < 64; c++) {
|
||||
var o = le(i[0], 12) + i[4] + le(de(c), c)
|
||||
, s = ((o = le(o = (4294967295 & o) >>> 0, 7)) ^ le(i[0], 12)) >>> 0
|
||||
, u = pe(c, i[0], i[1], i[2]);
|
||||
u = (4294967295 & (u = u + i[3] + s + f[c + 68])) >>> 0;
|
||||
var b = he(c, i[4], i[5], i[6]);
|
||||
b = (4294967295 & (b = b + i[7] + o + f[c])) >>> 0,
|
||||
i[3] = i[2],
|
||||
i[2] = le(i[1], 9),
|
||||
i[1] = i[0],
|
||||
i[0] = u,
|
||||
i[7] = i[6],
|
||||
i[6] = le(i[5], 19),
|
||||
i[5] = i[4],
|
||||
i[4] = (b ^ le(b, 9) ^ le(b, 17)) >>> 0
|
||||
}
|
||||
for (var l = 0; l < 8; l++)
|
||||
this['reg'][l] = (this['reg'][l] ^ i[l]) >>> 0
|
||||
}
|
||||
}
|
||||
|
||||
function _fill() {
|
||||
var a = 8 * this['size']
|
||||
, f = this['chunk']['push'](128) % 64;
|
||||
for (64 - f < 8 && (f -= 64); f < 56; f++)
|
||||
this.chunk['push'](0);
|
||||
for (var i = 0; i < 4; i++) {
|
||||
var c = Math['floor'](a / 4294967296);
|
||||
this['chunk'].push(c >>> 8 * (3 - i) & 255)
|
||||
}
|
||||
for (i = 0; i < 4; i++)
|
||||
this['chunk']['push'](a >>> 8 * (3 - i) & 255)
|
||||
|
||||
}
|
||||
|
||||
function SM3() {
|
||||
this.reg = [];
|
||||
this.chunk = [];
|
||||
this.size = 0;
|
||||
this.reset()
|
||||
}
|
||||
SM3.prototype.reset = reset;
|
||||
SM3.prototype.write = write;
|
||||
SM3.prototype.sum = sum;
|
||||
SM3.prototype._compress = _compress;
|
||||
SM3.prototype._fill = _fill;
|
||||
|
||||
function result_encrypt(long_str, num = null) {
|
||||
let s_obj = {
|
||||
"s0": "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/=",
|
||||
"s1": "Dkdpgh4ZKsQB80/Mfvw36XI1R25+WUAlEi7NLboqYTOPuzmFjJnryx9HVGcaStCe=",
|
||||
"s2": "Dkdpgh4ZKsQB80/Mfvw36XI1R25-WUAlEi7NLboqYTOPuzmFjJnryx9HVGcaStCe=",
|
||||
"s3": "ckdp1h4ZKsUB80/Mfvw36XIgR25+WQAlEi7NLboqYTOPuzmFjJnryx9HVGDaStCe",
|
||||
"s4": "Dkdpgh2ZmsQB80/MfvV36XI1R45-WUAlEixNLwoqYTOPuzKFjJnry79HbGcaStCe"
|
||||
}
|
||||
let constant = {
|
||||
"0": 16515072,
|
||||
"1": 258048,
|
||||
"2": 4032,
|
||||
"str": s_obj[num],
|
||||
}
|
||||
|
||||
let result = "";
|
||||
let lound = 0;
|
||||
let long_int = get_long_int(lound, long_str);
|
||||
for (let i = 0; i < long_str.length / 3 * 4; i++) {
|
||||
if (Math.floor(i / 4) !== lound) {
|
||||
lound += 1;
|
||||
long_int = get_long_int(lound, long_str);
|
||||
}
|
||||
let key = i % 4;
|
||||
switch (key) {
|
||||
case 0:
|
||||
temp_int = (long_int & constant["0"]) >> 18;
|
||||
result += constant["str"].charAt(temp_int);
|
||||
break;
|
||||
case 1:
|
||||
temp_int = (long_int & constant["1"]) >> 12;
|
||||
result += constant["str"].charAt(temp_int);
|
||||
break;
|
||||
case 2:
|
||||
temp_int = (long_int & constant["2"]) >> 6;
|
||||
result += constant["str"].charAt(temp_int);
|
||||
break;
|
||||
case 3:
|
||||
temp_int = long_int & 63;
|
||||
result += constant["str"].charAt(temp_int);
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
function get_long_int(round, long_str) {
|
||||
round = round * 3;
|
||||
return (long_str.charCodeAt(round) << 16) | (long_str.charCodeAt(round + 1) << 8) | (long_str.charCodeAt(round + 2));
|
||||
}
|
||||
|
||||
function gener_random(random, option) {
|
||||
return [
|
||||
(random & 255 & 170) | option[0] & 85, // 163
|
||||
(random & 255 & 85) | option[0] & 170, //87
|
||||
(random >> 8 & 255 & 170) | option[1] & 85, //37
|
||||
(random >> 8 & 255 & 85) | option[1] & 170, //41
|
||||
]
|
||||
}
|
||||
|
||||
//////////////////////////////////////////////
|
||||
function generate_rc4_bb_str(url_search_params, user_agent, window_env_str, suffix = "cus", Arguments = [0, 1, 14]) {
|
||||
let sm3 = new SM3()
|
||||
let start_time = Date.now()
|
||||
/**
|
||||
* 进行3次加密处理
|
||||
* 1: url_search_params两次sm3之的结果
|
||||
* 2: 对后缀两次sm3之的结果
|
||||
* 3: 对ua处理之后的结果
|
||||
*/
|
||||
// url_search_params两次sm3之的结果
|
||||
let url_search_params_list = sm3.sum(sm3.sum(url_search_params + suffix))
|
||||
// 对后缀两次sm3之的结果
|
||||
let cus = sm3.sum(sm3.sum(suffix))
|
||||
// 对ua处理之后的结果
|
||||
let ua = sm3.sum(result_encrypt(rc4_encrypt(user_agent, String.fromCharCode.apply(null, [0.00390625, 1, Arguments[2]])), "s3"))
|
||||
//
|
||||
let end_time = Date.now()
|
||||
// b
|
||||
let b = {
|
||||
8: 3, // 固定
|
||||
10: end_time, //3次加密结束时间
|
||||
15: {
|
||||
"aid": 6383,
|
||||
"pageId": 6241,
|
||||
"boe": false,
|
||||
"ddrt": 7,
|
||||
"paths": {
|
||||
"include": [
|
||||
{},
|
||||
{},
|
||||
{},
|
||||
{},
|
||||
{},
|
||||
{},
|
||||
{}
|
||||
],
|
||||
"exclude": []
|
||||
},
|
||||
"track": {
|
||||
"mode": 0,
|
||||
"delay": 300,
|
||||
"paths": []
|
||||
},
|
||||
"dump": true,
|
||||
"rpU": ""
|
||||
},
|
||||
16: start_time, //3次加密开始时间
|
||||
18: 44, //固定
|
||||
19: [1, 0, 1, 5],
|
||||
}
|
||||
|
||||
//3次加密开始时间
|
||||
b[20] = (b[16] >> 24) & 255
|
||||
b[21] = (b[16] >> 16) & 255
|
||||
b[22] = (b[16] >> 8) & 255
|
||||
b[23] = b[16] & 255
|
||||
b[24] = (b[16] / 256 / 256 / 256 / 256) >> 0
|
||||
b[25] = (b[16] / 256 / 256 / 256 / 256 / 256) >> 0
|
||||
|
||||
// 参数Arguments [0, 1, 14, ...]
|
||||
// let Arguments = [0, 1, 14]
|
||||
b[26] = (Arguments[0] >> 24) & 255
|
||||
b[27] = (Arguments[0] >> 16) & 255
|
||||
b[28] = (Arguments[0] >> 8) & 255
|
||||
b[29] = Arguments[0] & 255
|
||||
|
||||
b[30] = (Arguments[1] / 256) & 255
|
||||
b[31] = (Arguments[1] % 256) & 255
|
||||
b[32] = (Arguments[1] >> 24) & 255
|
||||
b[33] = (Arguments[1] >> 16) & 255
|
||||
|
||||
b[34] = (Arguments[2] >> 24) & 255
|
||||
b[35] = (Arguments[2] >> 16) & 255
|
||||
b[36] = (Arguments[2] >> 8) & 255
|
||||
b[37] = Arguments[2] & 255
|
||||
|
||||
// (url_search_params + "cus") 两次sm3之的结果
|
||||
/**let url_search_params_list = [
|
||||
91, 186, 35, 86, 143, 253, 6, 76,
|
||||
34, 21, 167, 148, 7, 42, 192, 219,
|
||||
188, 20, 182, 85, 213, 74, 213, 147,
|
||||
37, 155, 93, 139, 85, 118, 228, 213
|
||||
]*/
|
||||
b[38] = url_search_params_list[21]
|
||||
b[39] = url_search_params_list[22]
|
||||
|
||||
// ("cus") 对后缀两次sm3之的结果
|
||||
/**
|
||||
* let cus = [
|
||||
136, 101, 114, 147, 58, 77, 207, 201,
|
||||
215, 162, 154, 93, 248, 13, 142, 160,
|
||||
105, 73, 215, 241, 83, 58, 51, 43,
|
||||
255, 38, 168, 141, 216, 194, 35, 236
|
||||
]*/
|
||||
b[40] = cus[21]
|
||||
b[41] = cus[22]
|
||||
|
||||
// 对ua处理之后的结果
|
||||
/**
|
||||
* let ua = [
|
||||
129, 190, 70, 186, 86, 196, 199, 53,
|
||||
99, 38, 29, 209, 243, 17, 157, 69,
|
||||
147, 104, 53, 23, 114, 126, 66, 228,
|
||||
135, 30, 168, 185, 109, 156, 251, 88
|
||||
]*/
|
||||
b[42] = ua[23]
|
||||
b[43] = ua[24]
|
||||
|
||||
//3次加密结束时间
|
||||
b[44] = (b[10] >> 24) & 255
|
||||
b[45] = (b[10] >> 16) & 255
|
||||
b[46] = (b[10] >> 8) & 255
|
||||
b[47] = b[10] & 255
|
||||
b[48] = b[8]
|
||||
b[49] = (b[10] / 256 / 256 / 256 / 256) >> 0
|
||||
b[50] = (b[10] / 256 / 256 / 256 / 256 / 256) >> 0
|
||||
|
||||
|
||||
// object配置项
|
||||
b[51] = b[15]['pageId']
|
||||
b[52] = (b[15]['pageId'] >> 24) & 255
|
||||
b[53] = (b[15]['pageId'] >> 16) & 255
|
||||
b[54] = (b[15]['pageId'] >> 8) & 255
|
||||
b[55] = b[15]['pageId'] & 255
|
||||
|
||||
b[56] = b[15]['aid']
|
||||
b[57] = b[15]['aid'] & 255
|
||||
b[58] = (b[15]['aid'] >> 8) & 255
|
||||
b[59] = (b[15]['aid'] >> 16) & 255
|
||||
b[60] = (b[15]['aid'] >> 24) & 255
|
||||
|
||||
// 中间进行了环境检测
|
||||
// 代码索引: 2496 索引值: 17 (索引64关键条件)
|
||||
// '1536|747|1536|834|0|30|0|0|1536|834|1536|864|1525|747|24|24|Win32'.charCodeAt()得到65位数组
|
||||
/**
|
||||
* let window_env_list = [49, 53, 51, 54, 124, 55, 52, 55, 124, 49, 53, 51, 54, 124, 56, 51, 52, 124, 48, 124, 51,
|
||||
* 48, 124, 48, 124, 48, 124, 49, 53, 51, 54, 124, 56, 51, 52, 124, 49, 53, 51, 54, 124, 56,
|
||||
* 54, 52, 124, 49, 53, 50, 53, 124, 55, 52, 55, 124, 50, 52, 124, 50, 52, 124, 87, 105, 110,
|
||||
* 51, 50]
|
||||
*/
|
||||
let window_env_list = [];
|
||||
for (let index = 0; index < window_env_str.length; index++) {
|
||||
window_env_list.push(window_env_str.charCodeAt(index))
|
||||
}
|
||||
b[64] = window_env_list.length
|
||||
b[65] = b[64] & 255
|
||||
b[66] = (b[64] >> 8) & 255
|
||||
|
||||
b[69] = [].length
|
||||
b[70] = b[69] & 255
|
||||
b[71] = (b[69] >> 8) & 255
|
||||
|
||||
b[72] = b[18] ^ b[20] ^ b[26] ^ b[30] ^ b[38] ^ b[40] ^ b[42] ^ b[21] ^ b[27] ^ b[31] ^ b[35] ^ b[39] ^ b[41] ^ b[43] ^ b[22] ^
|
||||
b[28] ^ b[32] ^ b[36] ^ b[23] ^ b[29] ^ b[33] ^ b[37] ^ b[44] ^ b[45] ^ b[46] ^ b[47] ^ b[48] ^ b[49] ^ b[50] ^ b[24] ^
|
||||
b[25] ^ b[52] ^ b[53] ^ b[54] ^ b[55] ^ b[57] ^ b[58] ^ b[59] ^ b[60] ^ b[65] ^ b[66] ^ b[70] ^ b[71]
|
||||
let bb = [
|
||||
b[18], b[20], b[52], b[26], b[30], b[34], b[58], b[38], b[40], b[53], b[42], b[21], b[27], b[54], b[55], b[31],
|
||||
b[35], b[57], b[39], b[41], b[43], b[22], b[28], b[32], b[60], b[36], b[23], b[29], b[33], b[37], b[44], b[45],
|
||||
b[59], b[46], b[47], b[48], b[49], b[50], b[24], b[25], b[65], b[66], b[70], b[71]
|
||||
]
|
||||
bb = bb.concat(window_env_list).concat(b[72])
|
||||
return rc4_encrypt(String.fromCharCode.apply(null, bb), String.fromCharCode.apply(null, [121]));
|
||||
}
|
||||
|
||||
function generate_random_str() {
|
||||
let random_str_list = []
|
||||
random_str_list = random_str_list.concat(gener_random(Math.random() * 10000, [3, 45]))
|
||||
random_str_list = random_str_list.concat(gener_random(Math.random() * 10000, [1, 0]))
|
||||
random_str_list = random_str_list.concat(gener_random(Math.random() * 10000, [1, 5]))
|
||||
return String.fromCharCode.apply(null, random_str_list)
|
||||
}
|
||||
|
||||
function sign(url_search_params, user_agent, arguments) {
|
||||
/**
|
||||
* url_search_params:"device_platform=webapp&aid=6383&channel=channel_pc_web&update_version_code=170400&pc_client_type=1&version_code=170400&version_name=17.4.0&cookie_enabled=true&screen_width=1536&screen_height=864&browser_language=zh-CN&browser_platform=Win32&browser_name=Chrome&browser_version=123.0.0.0&browser_online=true&engine_name=Blink&engine_version=123.0.0.0&os_name=Windows&os_version=10&cpu_core_num=16&device_memory=8&platform=PC&downlink=10&effective_type=4g&round_trip_time=50&webid=7362810250930783783&msToken=VkDUvz1y24CppXSl80iFPr6ez-3FiizcwD7fI1OqBt6IICq9RWG7nCvxKb8IVi55mFd-wnqoNkXGnxHrikQb4PuKob5Q-YhDp5Um215JzlBszkUyiEvR"
|
||||
* user_agent:"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36"
|
||||
*/
|
||||
let result_str = generate_random_str() + generate_rc4_bb_str(
|
||||
url_search_params,
|
||||
user_agent,
|
||||
"1536|747|1536|834|0|30|0|0|1536|834|1536|864|1525|747|24|24|Win32",
|
||||
"cus",
|
||||
arguments
|
||||
);
|
||||
return result_encrypt(result_str, "s4") + "=";
|
||||
}
|
||||
|
||||
function sign_datail(params, userAgent) {
|
||||
return sign(params, userAgent, [0, 1, 14])
|
||||
}
|
||||
|
||||
function sign_reply(params, userAgent) {
|
||||
return sign(params, userAgent, [0, 1, 8])
|
||||
}
|
||||
File diff suppressed because one or more lines are too long
@@ -0,0 +1,166 @@
|
||||
// copy from https://github.com/tiam-bloom/zhihuQuestionAnswer/blob/main/zhihuvmp.js thanks to tiam-bloom
|
||||
// 仅供学习交流使用,严禁用于商业用途,也不要滥用,否则后果自负
|
||||
// modified by relakkes
|
||||
|
||||
const crypto = require('crypto'); // 导入加密模块
|
||||
|
||||
|
||||
let init_str = "6fpLRqJO8M/c3jnYxFkUVC4ZIG12SiH=5v0mXDazWBTsuw7QetbKdoPyAl+hN9rgE";
|
||||
var h = {
|
||||
zk: [1170614578, 1024848638, 1413669199, -343334464, -766094290, -1373058082, -143119608, -297228157, 1933479194, -971186181, -406453910, 460404854, -547427574, -1891326262, -1679095901, 2119585428, -2029270069, 2035090028, -1521520070, -5587175, -77751101, -2094365853, -1243052806, 1579901135, 1321810770, 456816404, -1391643889, -229302305, 330002838, -788960546, 363569021, -1947871109],
|
||||
zb: [20, 223, 245, 7, 248, 2, 194, 209, 87, 6, 227, 253, 240, 128, 222, 91, 237, 9, 125, 157, 230, 93, 252, 205, 90, 79, 144, 199, 159, 197, 186, 167, 39, 37, 156, 198, 38, 42, 43, 168, 217, 153, 15, 103, 80, 189, 71, 191, 97, 84, 247, 95, 36, 69, 14, 35, 12, 171, 28, 114, 178, 148, 86, 182, 32, 83, 158, 109, 22, 255, 94, 238, 151, 85, 77, 124, 254, 18, 4, 26, 123, 176, 232, 193, 131, 172, 143, 142, 150, 30, 10, 146, 162, 62, 224, 218, 196, 229, 1, 192, 213, 27, 110, 56, 231, 180, 138, 107, 242, 187, 54, 120, 19, 44, 117, 228, 215, 203, 53, 239, 251, 127, 81, 11, 133, 96, 204, 132, 41, 115, 73, 55, 249, 147, 102, 48, 122, 145, 106, 118, 74, 190, 29, 16, 174, 5, 177, 129, 63, 113, 99, 31, 161, 76, 246, 34, 211, 13, 60, 68, 207, 160, 65, 111, 82, 165, 67, 169, 225, 57, 112, 244, 155, 51, 236, 200, 233, 58, 61, 47, 100, 137, 185, 64, 17, 70, 234, 163, 219, 108, 170, 166, 59, 149, 52, 105, 24, 212, 78, 173, 45, 0, 116, 226, 119, 136, 206, 135, 175, 195, 25, 92, 121, 208, 126, 139, 3, 75, 141, 21, 130, 98, 241, 40, 154, 66, 184, 49, 181, 46, 243, 88, 101, 183, 8, 23, 72, 188, 104, 179, 210, 134, 250, 201, 164, 89, 216, 202, 220, 50, 221, 152, 140, 33, 235, 214]
|
||||
|
||||
};
|
||||
|
||||
function i(e, t, n) {
|
||||
t[n] = 255 & e >>> 24,
|
||||
t[n + 1] = 255 & e >>> 16,
|
||||
t[n + 2] = 255 & e >>> 8,
|
||||
t[n + 3] = 255 & e
|
||||
}
|
||||
|
||||
function Q(e, t) {
|
||||
return (4294967295 & e) << t | e >>> 32 - t
|
||||
}
|
||||
|
||||
function B(e, t) {
|
||||
return (255 & e[t]) << 24 | (255 & e[t + 1]) << 16 | (255 & e[t + 2]) << 8 | 255 & e[t + 3]
|
||||
}
|
||||
|
||||
function G(e) {
|
||||
var t = new Array(4)
|
||||
, n = new Array(4);
|
||||
i(e, t, 0),
|
||||
n[0] = h.zb[255 & t[0]],
|
||||
n[1] = h.zb[255 & t[1]],
|
||||
n[2] = h.zb[255 & t[2]],
|
||||
n[3] = h.zb[255 & t[3]];
|
||||
|
||||
var r = B(n, 0);
|
||||
return r ^ Q(r, 2) ^ Q(r, 10) ^ Q(r, 18) ^ Q(r, 24)
|
||||
}
|
||||
|
||||
function array_0_16_offset(e) {
|
||||
var t = new Array(16)
|
||||
, n = new Array(36);
|
||||
n[0] = B(e, 0),
|
||||
n[1] = B(e, 4),
|
||||
n[2] = B(e, 8),
|
||||
n[3] = B(e, 12);
|
||||
for (var r = 0; r < 32; r++) {
|
||||
var o = G(n[r + 1] ^ n[r + 2] ^ n[r + 3] ^ h.zk[r]);
|
||||
n[r + 4] = n[r] ^ o
|
||||
}
|
||||
return i(n[35], t, 0),
|
||||
i(n[34], t, 4),
|
||||
i(n[33], t, 8),
|
||||
i(n[32], t, 12),
|
||||
t
|
||||
|
||||
}
|
||||
|
||||
function array_16_48_offset(e, t) {
|
||||
for (var n = [], r = e.length, i = 0; 0 < r; r -= 16) {
|
||||
for (var o = e.slice(16 * i, 16 * (i + 1)), a = new Array(16), c = 0; c < 16; c++)
|
||||
a[c] = o[c] ^ t[c];
|
||||
t = array_0_16_offset(a),
|
||||
n = n.concat(t),
|
||||
i++
|
||||
}
|
||||
return n
|
||||
}
|
||||
|
||||
function encode_0_16(array_0_16) {
|
||||
let result = [];
|
||||
let array_offset = [48, 53, 57, 48, 53, 51, 102, 55, 100, 49, 53, 101, 48, 49, 100, 55];
|
||||
for (let i = 0; i < array_0_16.length; i++) {
|
||||
let a = array_0_16[i] ^ array_offset[i],
|
||||
b = a ^ 42;
|
||||
result.push(b)
|
||||
}
|
||||
return array_0_16_offset(result)
|
||||
}
|
||||
|
||||
function encode(ar) {
|
||||
let b = ar[1] << 8,
|
||||
c = ar[0] | b,
|
||||
d = ar[2] << 16,
|
||||
e = c | d,
|
||||
result_array = [],
|
||||
x6 = 6;
|
||||
result_array.push(e & 63);
|
||||
while (result_array.length < 4) {
|
||||
let a = e >>> x6;
|
||||
result_array.push(a & 63);
|
||||
x6 += 6;
|
||||
}
|
||||
return result_array
|
||||
}
|
||||
|
||||
function get_init_array(encode_md5) {
|
||||
let init_array = []
|
||||
for (let i = 0; i < encode_md5.length; i++) {
|
||||
init_array.push(encode_md5.charCodeAt(i))
|
||||
}
|
||||
init_array.unshift(0)
|
||||
init_array.unshift(Math.floor(Math.random() * 127))
|
||||
while (init_array.length < 48) {
|
||||
init_array.push(14)
|
||||
}
|
||||
let array_0_16 = encode_0_16(init_array.slice(0, 16)),
|
||||
array_16_48 = array_16_48_offset(init_array.slice(16, 48), array_0_16),
|
||||
array_result = array_0_16.concat(array_16_48);
|
||||
return array_result
|
||||
}
|
||||
|
||||
function get_zse_96(encode_md5) {
|
||||
let result_array = [],
|
||||
init_array = get_init_array(encode_md5),
|
||||
result = "";
|
||||
for (let i = 47; i >= 0; i -= 4) {
|
||||
init_array[i] ^= 58
|
||||
}
|
||||
init_array.reverse()
|
||||
for (let j = 3; j <= init_array.length; j += 3) {
|
||||
let ar = init_array.slice(j - 3, j);
|
||||
result_array = result_array.concat(encode(ar))
|
||||
}
|
||||
for (let index = 0; index < result_array.length; index++) {
|
||||
result += init_str.charAt(result_array[index])
|
||||
}
|
||||
result = '2.0_' + result
|
||||
return result
|
||||
}
|
||||
|
||||
/***********************relakkes modify*******************************************************/
|
||||
|
||||
/**
|
||||
* 从cookies中提取dc0的值
|
||||
* @param cookies
|
||||
* @returns {string}
|
||||
*/
|
||||
const extract_dc0_value_from_cookies = function (cookies) {
|
||||
const t9 = RegExp("d_c0=([^;]+)")
|
||||
const tt = t9.exec(cookies);
|
||||
const dc0 = tt && tt[1]
|
||||
return tt && tt[1]
|
||||
}
|
||||
|
||||
/**
|
||||
* 获取zhihu sign value 对python暴漏的接口
|
||||
* @param url 请求的路由参数
|
||||
* @param cookies 请求的cookies,需要包含dc0这个key
|
||||
* @returns {*}
|
||||
*/
|
||||
function get_sign(url, cookies) {
|
||||
const ta = "101_3_3.0"
|
||||
const dc0 = extract_dc0_value_from_cookies(cookies)
|
||||
const tc = "3_2.0aR_sn77yn6O92wOB8hPZnQr0EMYxc4f18wNBUgpTQ6nxERFZfTY0-4Lm-h3_tufIwJS8gcxTgJS_AuPZNcXCTwxI78YxEM20s4PGDwN8gGcYAupMWufIoLVqr4gxrRPOI0cY7HL8qun9g93mFukyigcmebS_FwOYPRP0E4rZUrN9DDom3hnynAUMnAVPF_PhaueTFH9fQL39OCCqYTxfb0rfi9wfPhSM6vxGDJo_rBHpQGNmBBLqPJHK2_w8C9eTVMO9Z9NOrMtfhGH_DgpM-BNM1DOxScLG3gg1Hre1FCXKQcXKkrSL1r9GWDXMk8wqBLNmbRH96BtOFqVZ7UYG3gC8D9cMS7Y9UrHLVCLZPJO8_CL_6GNCOg_zhJS8PbXmGTcBpgxfkieOPhNfthtf2gC_qD3YOce8nCwG2uwBOqeMoML9NBC1xb9yk6SuJhHLK7SM6LVfCve_3vLKlqcL6TxL_UosDvHLxrHmWgxBQ8Xs"
|
||||
const params_join_str = [ta, url, dc0, tc].join("+")
|
||||
const params_md5_value = crypto.createHash('md5').update(params_join_str).digest('hex')
|
||||
|
||||
return {
|
||||
"x-zst-81": tc,
|
||||
"x-zse-96": get_zse_96(params_md5_value),
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,80 @@
|
||||
# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则:
|
||||
# 1. 不得用于任何商业用途。
|
||||
# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。
|
||||
# 3. 不得进行大规模爬取或对平台造成运营干扰。
|
||||
# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。
|
||||
# 5. 不得用于任何非法或不当的用途。
|
||||
#
|
||||
# 详细许可条款请参阅项目根目录下的LICENSE文件。
|
||||
# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。
|
||||
|
||||
|
||||
import asyncio
|
||||
import sys
|
||||
from typing import Optional
|
||||
|
||||
import cmd_arg
|
||||
import config
|
||||
import db
|
||||
from base.base_crawler import AbstractCrawler
|
||||
from media_platform.bilibili import BilibiliCrawler
|
||||
from media_platform.douyin import DouYinCrawler
|
||||
from media_platform.kuaishou import KuaishouCrawler
|
||||
from media_platform.tieba import TieBaCrawler
|
||||
from media_platform.weibo import WeiboCrawler
|
||||
from media_platform.xhs import XiaoHongShuCrawler
|
||||
from media_platform.zhihu import ZhihuCrawler
|
||||
|
||||
|
||||
class CrawlerFactory:
|
||||
CRAWLERS = {
|
||||
"xhs": XiaoHongShuCrawler,
|
||||
"dy": DouYinCrawler,
|
||||
"ks": KuaishouCrawler,
|
||||
"bili": BilibiliCrawler,
|
||||
"wb": WeiboCrawler,
|
||||
"tieba": TieBaCrawler,
|
||||
"zhihu": ZhihuCrawler,
|
||||
}
|
||||
|
||||
@staticmethod
|
||||
def create_crawler(platform: str) -> AbstractCrawler:
|
||||
crawler_class = CrawlerFactory.CRAWLERS.get(platform)
|
||||
if not crawler_class:
|
||||
raise ValueError(
|
||||
"Invalid Media Platform Currently only supported xhs or dy or ks or bili ..."
|
||||
)
|
||||
return crawler_class()
|
||||
|
||||
|
||||
crawler: Optional[AbstractCrawler] = None
|
||||
|
||||
|
||||
async def main():
|
||||
# Init crawler
|
||||
global crawler
|
||||
|
||||
# parse cmd
|
||||
await cmd_arg.parse_cmd()
|
||||
|
||||
# init db
|
||||
if config.SAVE_DATA_OPTION in ["db", "sqlite"]:
|
||||
await db.init_db()
|
||||
|
||||
crawler = CrawlerFactory.create_crawler(platform=config.PLATFORM)
|
||||
await crawler.start()
|
||||
|
||||
|
||||
def cleanup():
|
||||
if crawler:
|
||||
# asyncio.run(crawler.close())
|
||||
pass
|
||||
if config.SAVE_DATA_OPTION in ["db", "sqlite"]:
|
||||
asyncio.run(db.close())
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
try:
|
||||
asyncio.get_event_loop().run_until_complete(main())
|
||||
finally:
|
||||
cleanup()
|
||||
@@ -0,0 +1,11 @@
|
||||
# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则:
|
||||
# 1. 不得用于任何商业用途。
|
||||
# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。
|
||||
# 3. 不得进行大规模爬取或对平台造成运营干扰。
|
||||
# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。
|
||||
# 5. 不得用于任何非法或不当的用途。
|
||||
#
|
||||
# 详细许可条款请参阅项目根目录下的LICENSE文件。
|
||||
# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。
|
||||
|
||||
|
||||
@@ -0,0 +1,17 @@
|
||||
# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则:
|
||||
# 1. 不得用于任何商业用途。
|
||||
# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。
|
||||
# 3. 不得进行大规模爬取或对平台造成运营干扰。
|
||||
# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。
|
||||
# 5. 不得用于任何非法或不当的用途。
|
||||
#
|
||||
# 详细许可条款请参阅项目根目录下的LICENSE文件。
|
||||
# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。
|
||||
|
||||
|
||||
# -*- coding: utf-8 -*-
|
||||
# @Author : relakkes@gmail.com
|
||||
# @Time : 2023/12/2 18:36
|
||||
# @Desc :
|
||||
|
||||
from .core import *
|
||||
@@ -0,0 +1,553 @@
|
||||
# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则:
|
||||
# 1. 不得用于任何商业用途。
|
||||
# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。
|
||||
# 3. 不得进行大规模爬取或对平台造成运营干扰。
|
||||
# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。
|
||||
# 5. 不得用于任何非法或不当的用途。
|
||||
#
|
||||
# 详细许可条款请参阅项目根目录下的LICENSE文件。
|
||||
# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。
|
||||
|
||||
# -*- coding: utf-8 -*-
|
||||
# @Author : relakkes@gmail.com
|
||||
# @Time : 2023/12/2 18:44
|
||||
# @Desc : bilibili 请求客户端
|
||||
import asyncio
|
||||
import json
|
||||
import random
|
||||
from typing import Any, Callable, Dict, List, Optional, Tuple, Union
|
||||
from urllib.parse import urlencode
|
||||
|
||||
import httpx
|
||||
from playwright.async_api import BrowserContext, Page
|
||||
|
||||
import config
|
||||
from base.base_crawler import AbstractApiClient
|
||||
from tools import utils
|
||||
|
||||
from .exception import DataFetchError
|
||||
from .field import CommentOrderType, SearchOrderType
|
||||
from .help import BilibiliSign
|
||||
|
||||
|
||||
class BilibiliClient(AbstractApiClient):
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
timeout=60, # 若开启爬取媒体选项,b 站的长视频需要更久的超时时间
|
||||
proxy=None,
|
||||
*,
|
||||
headers: Dict[str, str],
|
||||
playwright_page: Page,
|
||||
cookie_dict: Dict[str, str],
|
||||
):
|
||||
self.proxy = proxy
|
||||
self.timeout = timeout
|
||||
self.headers = headers
|
||||
self._host = "https://api.bilibili.com"
|
||||
self.playwright_page = playwright_page
|
||||
self.cookie_dict = cookie_dict
|
||||
|
||||
async def request(self, method, url, **kwargs) -> Any:
|
||||
async with httpx.AsyncClient(proxy=self.proxy) as client:
|
||||
response = await client.request(method, url, timeout=self.timeout, **kwargs)
|
||||
try:
|
||||
data: Dict = response.json()
|
||||
except json.JSONDecodeError:
|
||||
utils.logger.error(f"[BilibiliClient.request] Failed to decode JSON from response. status_code: {response.status_code}, response_text: {response.text}")
|
||||
raise DataFetchError(f"Failed to decode JSON, content: {response.text}")
|
||||
if data.get("code") != 0:
|
||||
raise DataFetchError(data.get("message", "unkonw error"))
|
||||
else:
|
||||
return data.get("data", {})
|
||||
|
||||
async def pre_request_data(self, req_data: Dict) -> Dict:
|
||||
"""
|
||||
发送请求进行请求参数签名
|
||||
需要从 localStorage 拿 wbi_img_urls 这参数,值如下:
|
||||
https://i0.hdslb.com/bfs/wbi/7cd084941338484aae1ad9425b84077c.png-https://i0.hdslb.com/bfs/wbi/4932caff0ff746eab6f01bf08b70ac45.png
|
||||
:param req_data:
|
||||
:return:
|
||||
"""
|
||||
if not req_data:
|
||||
return {}
|
||||
img_key, sub_key = await self.get_wbi_keys()
|
||||
return BilibiliSign(img_key, sub_key).sign(req_data)
|
||||
|
||||
async def get_wbi_keys(self) -> Tuple[str, str]:
|
||||
"""
|
||||
获取最新的 img_key 和 sub_key
|
||||
:return:
|
||||
"""
|
||||
local_storage = await self.playwright_page.evaluate("() => window.localStorage")
|
||||
wbi_img_urls = local_storage.get("wbi_img_urls", "")
|
||||
if not wbi_img_urls:
|
||||
img_url_from_storage = local_storage.get("wbi_img_url")
|
||||
sub_url_from_storage = local_storage.get("wbi_sub_url")
|
||||
if img_url_from_storage and sub_url_from_storage:
|
||||
wbi_img_urls = f"{img_url_from_storage}-{sub_url_from_storage}"
|
||||
if wbi_img_urls and "-" in wbi_img_urls:
|
||||
img_url, sub_url = wbi_img_urls.split("-")
|
||||
else:
|
||||
resp = await self.request(method="GET", url=self._host + "/x/web-interface/nav")
|
||||
img_url: str = resp['wbi_img']['img_url']
|
||||
sub_url: str = resp['wbi_img']['sub_url']
|
||||
img_key = img_url.rsplit('/', 1)[1].split('.')[0]
|
||||
sub_key = sub_url.rsplit('/', 1)[1].split('.')[0]
|
||||
return img_key, sub_key
|
||||
|
||||
async def get(self, uri: str, params=None, enable_params_sign: bool = True) -> Dict:
|
||||
final_uri = uri
|
||||
if enable_params_sign:
|
||||
params = await self.pre_request_data(params)
|
||||
if isinstance(params, dict):
|
||||
final_uri = (f"{uri}?"
|
||||
f"{urlencode(params)}")
|
||||
return await self.request(method="GET", url=f"{self._host}{final_uri}", headers=self.headers)
|
||||
|
||||
async def post(self, uri: str, data: dict) -> Dict:
|
||||
data = await self.pre_request_data(data)
|
||||
json_str = json.dumps(data, separators=(',', ':'), ensure_ascii=False)
|
||||
return await self.request(method="POST", url=f"{self._host}{uri}", data=json_str, headers=self.headers)
|
||||
|
||||
async def pong(self) -> bool:
|
||||
"""get a note to check if login state is ok"""
|
||||
utils.logger.info("[BilibiliClient.pong] Begin pong bilibili...")
|
||||
ping_flag = False
|
||||
try:
|
||||
check_login_uri = "/x/web-interface/nav"
|
||||
response = await self.get(check_login_uri)
|
||||
if response.get("isLogin"):
|
||||
utils.logger.info("[BilibiliClient.pong] Use cache login state get web interface successfull!")
|
||||
ping_flag = True
|
||||
except Exception as e:
|
||||
utils.logger.error(f"[BilibiliClient.pong] Pong bilibili failed: {e}, and try to login again...")
|
||||
ping_flag = False
|
||||
return ping_flag
|
||||
|
||||
async def update_cookies(self, browser_context: BrowserContext):
|
||||
cookie_str, cookie_dict = utils.convert_cookies(await browser_context.cookies())
|
||||
self.headers["Cookie"] = cookie_str
|
||||
self.cookie_dict = cookie_dict
|
||||
|
||||
async def search_video_by_keyword(
|
||||
self,
|
||||
keyword: str,
|
||||
page: int = 1,
|
||||
page_size: int = 20,
|
||||
order: SearchOrderType = SearchOrderType.DEFAULT,
|
||||
pubtime_begin_s: int = 0,
|
||||
pubtime_end_s: int = 0,
|
||||
) -> Dict:
|
||||
"""
|
||||
KuaiShou web search api
|
||||
:param keyword: 搜索关键词
|
||||
:param page: 分页参数具体第几页
|
||||
:param page_size: 每一页参数的数量
|
||||
:param order: 搜索结果排序,默认位综合排序
|
||||
:param pubtime_begin_s: 发布时间开始时间戳
|
||||
:param pubtime_end_s: 发布时间结束时间戳
|
||||
:return:
|
||||
"""
|
||||
uri = "/x/web-interface/wbi/search/type"
|
||||
post_data = {
|
||||
"search_type": "video",
|
||||
"keyword": keyword,
|
||||
"page": page,
|
||||
"page_size": page_size,
|
||||
"order": order.value,
|
||||
"pubtime_begin_s": pubtime_begin_s,
|
||||
"pubtime_end_s": pubtime_end_s
|
||||
}
|
||||
return await self.get(uri, post_data)
|
||||
|
||||
async def get_video_info(self, aid: Union[int, None] = None, bvid: Union[str, None] = None) -> Dict:
|
||||
"""
|
||||
Bilibli web video detail api, aid 和 bvid任选一个参数
|
||||
:param aid: 稿件avid
|
||||
:param bvid: 稿件bvid
|
||||
:return:
|
||||
"""
|
||||
if not aid and not bvid:
|
||||
raise ValueError("请提供 aid 或 bvid 中的至少一个参数")
|
||||
|
||||
uri = "/x/web-interface/view/detail"
|
||||
params = dict()
|
||||
if aid:
|
||||
params.update({"aid": aid})
|
||||
else:
|
||||
params.update({"bvid": bvid})
|
||||
return await self.get(uri, params, enable_params_sign=False)
|
||||
|
||||
async def get_video_play_url(self, aid: int, cid: int) -> Dict:
|
||||
"""
|
||||
Bilibli web video play url api
|
||||
:param aid: 稿件avid
|
||||
:param cid: cid
|
||||
:return:
|
||||
"""
|
||||
if not aid or not cid or aid <= 0 or cid <= 0:
|
||||
raise ValueError("aid 和 cid 必须存在")
|
||||
uri = "/x/player/wbi/playurl"
|
||||
params = {
|
||||
"avid": aid,
|
||||
"cid": cid,
|
||||
"qn": 80,
|
||||
"fourk": 1,
|
||||
"fnval": 1,
|
||||
"platform": "pc",
|
||||
}
|
||||
|
||||
return await self.get(uri, params, enable_params_sign=True)
|
||||
|
||||
async def get_video_media(self, url: str) -> Union[bytes, None]:
|
||||
async with httpx.AsyncClient(proxy=self.proxy) as client:
|
||||
try:
|
||||
response = await client.request("GET", url, timeout=self.timeout, headers=self.headers)
|
||||
response.raise_for_status()
|
||||
if not response.reason_phrase == "OK":
|
||||
utils.logger.error(f"[BilibiliClient.get_video_media] request {url} err, res:{response.text}")
|
||||
return None
|
||||
else:
|
||||
return response.content
|
||||
except httpx.HTTPError as exc: # some wrong when call httpx.request method, such as connection error, client error, server error or response status code is not 2xx
|
||||
utils.logger.error(f"[BilibiliClient.get_video_media] {exc.__class__.__name__} for {exc.request.url} - {exc}") # 保留原始异常类型名称,以便开发者调试
|
||||
return None
|
||||
|
||||
async def get_video_comments(
|
||||
self,
|
||||
video_id: str,
|
||||
order_mode: CommentOrderType = CommentOrderType.DEFAULT,
|
||||
next: int = 0,
|
||||
) -> Dict:
|
||||
"""get video comments
|
||||
:param video_id: 视频 ID
|
||||
:param order_mode: 排序方式
|
||||
:param next: 评论页选择
|
||||
:return:
|
||||
"""
|
||||
uri = "/x/v2/reply/wbi/main"
|
||||
post_data = {"oid": video_id, "mode": order_mode.value, "type": 1, "ps": 20, "next": next}
|
||||
return await self.get(uri, post_data)
|
||||
|
||||
async def get_video_all_comments(
|
||||
self,
|
||||
video_id: str,
|
||||
crawl_interval: float = 1.0,
|
||||
is_fetch_sub_comments=False,
|
||||
callback: Optional[Callable] = None,
|
||||
max_count: int = 10,
|
||||
):
|
||||
"""
|
||||
get video all comments include sub comments
|
||||
:param video_id:
|
||||
:param crawl_interval:
|
||||
:param is_fetch_sub_comments:
|
||||
:param callback:
|
||||
max_count: 一次笔记爬取的最大评论数量
|
||||
|
||||
:return:
|
||||
"""
|
||||
result = []
|
||||
is_end = False
|
||||
next_page = 0
|
||||
max_retries = 3
|
||||
while not is_end and len(result) < max_count:
|
||||
comments_res = None
|
||||
for attempt in range(max_retries):
|
||||
try:
|
||||
comments_res = await self.get_video_comments(video_id, CommentOrderType.DEFAULT, next_page)
|
||||
break # Success
|
||||
except DataFetchError as e:
|
||||
if attempt < max_retries - 1:
|
||||
delay = 5 * (2**attempt) + random.uniform(0, 1)
|
||||
utils.logger.warning(f"[BilibiliClient.get_video_all_comments] Retrying video_id {video_id} in {delay:.2f}s... (Attempt {attempt + 1}/{max_retries})")
|
||||
await asyncio.sleep(delay)
|
||||
else:
|
||||
utils.logger.error(f"[BilibiliClient.get_video_all_comments] Max retries reached for video_id: {video_id}. Skipping comments. Error: {e}")
|
||||
is_end = True
|
||||
break
|
||||
if not comments_res:
|
||||
break
|
||||
|
||||
cursor_info: Dict = comments_res.get("cursor")
|
||||
if not cursor_info:
|
||||
utils.logger.warning(f"[BilibiliClient.get_video_all_comments] Could not find 'cursor' in response for video_id: {video_id}. Skipping.")
|
||||
break
|
||||
|
||||
comment_list: List[Dict] = comments_res.get("replies", [])
|
||||
|
||||
# 检查 is_end 和 next 是否存在
|
||||
if "is_end" not in cursor_info or "next" not in cursor_info:
|
||||
utils.logger.warning(f"[BilibiliClient.get_video_all_comments] 'is_end' or 'next' not in cursor for video_id: {video_id}. Assuming end of comments.")
|
||||
is_end = True
|
||||
else:
|
||||
is_end = cursor_info.get("is_end")
|
||||
next_page = cursor_info.get("next")
|
||||
|
||||
if not isinstance(is_end, bool):
|
||||
utils.logger.warning(f"[BilibiliClient.get_video_all_comments] 'is_end' is not a boolean for video_id: {video_id}. Assuming end of comments.")
|
||||
is_end = True
|
||||
if is_fetch_sub_comments:
|
||||
for comment in comment_list:
|
||||
comment_id = comment['rpid']
|
||||
if (comment.get("rcount", 0) > 0):
|
||||
{await self.get_video_all_level_two_comments(video_id, comment_id, CommentOrderType.DEFAULT, 10, crawl_interval, callback)}
|
||||
if len(result) + len(comment_list) > max_count:
|
||||
comment_list = comment_list[:max_count - len(result)]
|
||||
if callback: # 如果有回调函数,就执行回调函数
|
||||
await callback(video_id, comment_list)
|
||||
await asyncio.sleep(crawl_interval)
|
||||
if not is_fetch_sub_comments:
|
||||
result.extend(comment_list)
|
||||
continue
|
||||
return result
|
||||
|
||||
async def get_video_all_level_two_comments(
|
||||
self,
|
||||
video_id: str,
|
||||
level_one_comment_id: int,
|
||||
order_mode: CommentOrderType,
|
||||
ps: int = 10,
|
||||
crawl_interval: float = 1.0,
|
||||
callback: Optional[Callable] = None,
|
||||
) -> Dict:
|
||||
"""
|
||||
get video all level two comments for a level one comment
|
||||
:param video_id: 视频 ID
|
||||
:param level_one_comment_id: 一级评论 ID
|
||||
:param order_mode:
|
||||
:param ps: 一页评论数
|
||||
:param crawl_interval:
|
||||
:param callback:
|
||||
:return:
|
||||
"""
|
||||
|
||||
pn = 1
|
||||
while True:
|
||||
result = await self.get_video_level_two_comments(video_id, level_one_comment_id, pn, ps, order_mode)
|
||||
comment_list: List[Dict] = result.get("replies", [])
|
||||
if callback: # 如果有回调函数,就执行回调函数
|
||||
await callback(video_id, comment_list)
|
||||
await asyncio.sleep(crawl_interval)
|
||||
if (int(result["page"]["count"]) <= pn * ps):
|
||||
break
|
||||
|
||||
pn += 1
|
||||
|
||||
async def get_video_level_two_comments(
|
||||
self,
|
||||
video_id: str,
|
||||
level_one_comment_id: int,
|
||||
pn: int,
|
||||
ps: int,
|
||||
order_mode: CommentOrderType,
|
||||
) -> Dict:
|
||||
"""get video level two comments
|
||||
:param video_id: 视频 ID
|
||||
:param level_one_comment_id: 一级评论 ID
|
||||
:param order_mode: 排序方式
|
||||
|
||||
:return:
|
||||
"""
|
||||
uri = "/x/v2/reply/reply"
|
||||
post_data = {
|
||||
"oid": video_id,
|
||||
"mode": order_mode.value,
|
||||
"type": 1,
|
||||
"ps": ps,
|
||||
"pn": pn,
|
||||
"root": level_one_comment_id,
|
||||
}
|
||||
result = await self.get(uri, post_data)
|
||||
return result
|
||||
|
||||
async def get_creator_videos(self, creator_id: str, pn: int, ps: int = 30, order_mode: SearchOrderType = SearchOrderType.LAST_PUBLISH) -> Dict:
|
||||
"""get all videos for a creator
|
||||
:param creator_id: 创作者 ID
|
||||
:param pn: 页数
|
||||
:param ps: 一页视频数
|
||||
:param order_mode: 排序方式
|
||||
|
||||
:return:
|
||||
"""
|
||||
uri = "/x/space/wbi/arc/search"
|
||||
post_data = {
|
||||
"mid": creator_id,
|
||||
"pn": pn,
|
||||
"ps": ps,
|
||||
"order": order_mode,
|
||||
}
|
||||
return await self.get(uri, post_data)
|
||||
|
||||
async def get_creator_info(self, creator_id: int) -> Dict:
|
||||
"""
|
||||
get creator info
|
||||
:param creator_id: 作者 ID
|
||||
"""
|
||||
uri = "/x/space/wbi/acc/info"
|
||||
post_data = {
|
||||
"mid": creator_id,
|
||||
}
|
||||
return await self.get(uri, post_data)
|
||||
|
||||
async def get_creator_fans(
|
||||
self,
|
||||
creator_id: int,
|
||||
pn: int,
|
||||
ps: int = 24,
|
||||
) -> Dict:
|
||||
"""
|
||||
get creator fans
|
||||
:param creator_id: 创作者 ID
|
||||
:param pn: 开始页数
|
||||
:param ps: 每页数量
|
||||
:return:
|
||||
"""
|
||||
uri = "/x/relation/fans"
|
||||
post_data = {
|
||||
'vmid': creator_id,
|
||||
"pn": pn,
|
||||
"ps": ps,
|
||||
"gaia_source": "main_web",
|
||||
}
|
||||
return await self.get(uri, post_data)
|
||||
|
||||
async def get_creator_followings(
|
||||
self,
|
||||
creator_id: int,
|
||||
pn: int,
|
||||
ps: int = 24,
|
||||
) -> Dict:
|
||||
"""
|
||||
get creator followings
|
||||
:param creator_id: 创作者 ID
|
||||
:param pn: 开始页数
|
||||
:param ps: 每页数量
|
||||
:return:
|
||||
"""
|
||||
uri = "/x/relation/followings"
|
||||
post_data = {
|
||||
"vmid": creator_id,
|
||||
"pn": pn,
|
||||
"ps": ps,
|
||||
"gaia_source": "main_web",
|
||||
}
|
||||
return await self.get(uri, post_data)
|
||||
|
||||
async def get_creator_dynamics(self, creator_id: int, offset: str = ""):
|
||||
"""
|
||||
get creator comments
|
||||
:param creator_id: 创作者 ID
|
||||
:param offset: 发送请求所需参数
|
||||
:return:
|
||||
"""
|
||||
uri = "/x/polymer/web-dynamic/v1/feed/space"
|
||||
post_data = {
|
||||
"offset": offset,
|
||||
"host_mid": creator_id,
|
||||
"platform": "web",
|
||||
}
|
||||
|
||||
return await self.get(uri, post_data)
|
||||
|
||||
async def get_creator_all_fans(
|
||||
self,
|
||||
creator_info: Dict,
|
||||
crawl_interval: float = 1.0,
|
||||
callback: Optional[Callable] = None,
|
||||
max_count: int = 100,
|
||||
) -> List:
|
||||
"""
|
||||
get creator all fans
|
||||
:param creator_info:
|
||||
:param crawl_interval:
|
||||
:param callback:
|
||||
:param max_count: 一个up主爬取的最大粉丝数量
|
||||
|
||||
:return: up主粉丝数列表
|
||||
"""
|
||||
creator_id = creator_info["id"]
|
||||
result = []
|
||||
pn = config.START_CONTACTS_PAGE
|
||||
while len(result) < max_count:
|
||||
fans_res: Dict = await self.get_creator_fans(creator_id, pn=pn)
|
||||
fans_list: List[Dict] = fans_res.get("list", [])
|
||||
|
||||
pn += 1
|
||||
if len(result) + len(fans_list) > max_count:
|
||||
fans_list = fans_list[:max_count - len(result)]
|
||||
if callback: # 如果有回调函数,就执行回调函数
|
||||
await callback(creator_info, fans_list)
|
||||
await asyncio.sleep(crawl_interval)
|
||||
if not fans_list:
|
||||
break
|
||||
result.extend(fans_list)
|
||||
return result
|
||||
|
||||
async def get_creator_all_followings(
|
||||
self,
|
||||
creator_info: Dict,
|
||||
crawl_interval: float = 1.0,
|
||||
callback: Optional[Callable] = None,
|
||||
max_count: int = 100,
|
||||
) -> List:
|
||||
"""
|
||||
get creator all followings
|
||||
:param creator_info:
|
||||
:param crawl_interval:
|
||||
:param callback:
|
||||
:param max_count: 一个up主爬取的最大关注者数量
|
||||
|
||||
:return: up主关注者列表
|
||||
"""
|
||||
creator_id = creator_info["id"]
|
||||
result = []
|
||||
pn = config.START_CONTACTS_PAGE
|
||||
while len(result) < max_count:
|
||||
followings_res: Dict = await self.get_creator_followings(creator_id, pn=pn)
|
||||
followings_list: List[Dict] = followings_res.get("list", [])
|
||||
|
||||
pn += 1
|
||||
if len(result) + len(followings_list) > max_count:
|
||||
followings_list = followings_list[:max_count - len(result)]
|
||||
if callback: # 如果有回调函数,就执行回调函数
|
||||
await callback(creator_info, followings_list)
|
||||
await asyncio.sleep(crawl_interval)
|
||||
if not followings_list:
|
||||
break
|
||||
result.extend(followings_list)
|
||||
return result
|
||||
|
||||
async def get_creator_all_dynamics(
|
||||
self,
|
||||
creator_info: Dict,
|
||||
crawl_interval: float = 1.0,
|
||||
callback: Optional[Callable] = None,
|
||||
max_count: int = 20,
|
||||
) -> List:
|
||||
"""
|
||||
get creator all followings
|
||||
:param creator_info:
|
||||
:param crawl_interval:
|
||||
:param callback:
|
||||
:param max_count: 一个up主爬取的最大动态数量
|
||||
|
||||
:return: up主关注者列表
|
||||
"""
|
||||
creator_id = creator_info["id"]
|
||||
result = []
|
||||
offset = ""
|
||||
has_more = True
|
||||
while has_more and len(result) < max_count:
|
||||
dynamics_res = await self.get_creator_dynamics(creator_id, offset)
|
||||
dynamics_list: List[Dict] = dynamics_res["items"]
|
||||
has_more = dynamics_res["has_more"]
|
||||
offset = dynamics_res["offset"]
|
||||
if len(result) + len(dynamics_list) > max_count:
|
||||
dynamics_list = dynamics_list[:max_count - len(result)]
|
||||
if callback:
|
||||
await callback(creator_info, dynamics_list)
|
||||
await asyncio.sleep(crawl_interval)
|
||||
result.extend(dynamics_list)
|
||||
return result
|
||||
@@ -0,0 +1,657 @@
|
||||
# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则:
|
||||
# 1. 不得用于任何商业用途。
|
||||
# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。
|
||||
# 3. 不得进行大规模爬取或对平台造成运营干扰。
|
||||
# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。
|
||||
# 5. 不得用于任何非法或不当的用途。
|
||||
#
|
||||
# 详细许可条款请参阅项目根目录下的LICENSE文件。
|
||||
# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。
|
||||
|
||||
# -*- coding: utf-8 -*-
|
||||
# @Author : relakkes@gmail.com
|
||||
# @Time : 2023/12/2 18:44
|
||||
# @Desc : B站爬虫
|
||||
|
||||
import asyncio
|
||||
import os
|
||||
import random
|
||||
from asyncio import Task
|
||||
from typing import Dict, List, Optional, Tuple, Union
|
||||
from datetime import datetime, timedelta
|
||||
import pandas as pd
|
||||
|
||||
from playwright.async_api import (
|
||||
BrowserContext,
|
||||
BrowserType,
|
||||
Page,
|
||||
Playwright,
|
||||
async_playwright,
|
||||
)
|
||||
from playwright._impl._errors import TargetClosedError
|
||||
|
||||
import config
|
||||
from base.base_crawler import AbstractCrawler
|
||||
from proxy.proxy_ip_pool import IpInfoModel, create_ip_pool
|
||||
from store import bilibili as bilibili_store
|
||||
from tools import utils
|
||||
from tools.cdp_browser import CDPBrowserManager
|
||||
from var import crawler_type_var, source_keyword_var
|
||||
|
||||
from .client import BilibiliClient
|
||||
from .exception import DataFetchError
|
||||
from .field import SearchOrderType
|
||||
from .login import BilibiliLogin
|
||||
|
||||
|
||||
class BilibiliCrawler(AbstractCrawler):
|
||||
context_page: Page
|
||||
bili_client: BilibiliClient
|
||||
browser_context: BrowserContext
|
||||
cdp_manager: Optional[CDPBrowserManager]
|
||||
|
||||
def __init__(self):
|
||||
self.index_url = "https://www.bilibili.com"
|
||||
self.user_agent = utils.get_user_agent()
|
||||
self.cdp_manager = None
|
||||
|
||||
async def start(self):
|
||||
playwright_proxy_format, httpx_proxy_format = None, None
|
||||
if config.ENABLE_IP_PROXY:
|
||||
ip_proxy_pool = await create_ip_pool(config.IP_PROXY_POOL_COUNT, enable_validate_ip=True)
|
||||
ip_proxy_info: IpInfoModel = await ip_proxy_pool.get_proxy()
|
||||
playwright_proxy_format, httpx_proxy_format = utils.format_proxy_info(ip_proxy_info)
|
||||
|
||||
async with async_playwright() as playwright:
|
||||
# 根据配置选择启动模式
|
||||
if config.ENABLE_CDP_MODE:
|
||||
utils.logger.info("[BilibiliCrawler] 使用CDP模式启动浏览器")
|
||||
self.browser_context = await self.launch_browser_with_cdp(
|
||||
playwright,
|
||||
playwright_proxy_format,
|
||||
self.user_agent,
|
||||
headless=config.CDP_HEADLESS,
|
||||
)
|
||||
else:
|
||||
utils.logger.info("[BilibiliCrawler] 使用标准模式启动浏览器")
|
||||
# Launch a browser context.
|
||||
chromium = playwright.chromium
|
||||
self.browser_context = await self.launch_browser(chromium, None, self.user_agent, headless=config.HEADLESS)
|
||||
# stealth.min.js is a js script to prevent the website from detecting the crawler.
|
||||
await self.browser_context.add_init_script(path="libs/stealth.min.js")
|
||||
self.context_page = await self.browser_context.new_page()
|
||||
await self.context_page.goto(self.index_url)
|
||||
|
||||
# Create a client to interact with the xiaohongshu website.
|
||||
self.bili_client = await self.create_bilibili_client(httpx_proxy_format)
|
||||
if not await self.bili_client.pong():
|
||||
login_obj = BilibiliLogin(
|
||||
login_type=config.LOGIN_TYPE,
|
||||
login_phone="", # your phone number
|
||||
browser_context=self.browser_context,
|
||||
context_page=self.context_page,
|
||||
cookie_str=config.COOKIES,
|
||||
)
|
||||
await login_obj.begin()
|
||||
await self.bili_client.update_cookies(browser_context=self.browser_context)
|
||||
|
||||
crawler_type_var.set(config.CRAWLER_TYPE)
|
||||
if config.CRAWLER_TYPE == "search":
|
||||
await self.search()
|
||||
elif config.CRAWLER_TYPE == "detail":
|
||||
# Get the information and comments of the specified post
|
||||
await self.get_specified_videos(config.BILI_SPECIFIED_ID_LIST)
|
||||
elif config.CRAWLER_TYPE == "creator":
|
||||
if config.CREATOR_MODE:
|
||||
for creator_id in config.BILI_CREATOR_ID_LIST:
|
||||
await self.get_creator_videos(int(creator_id))
|
||||
else:
|
||||
await self.get_all_creator_details(config.BILI_CREATOR_ID_LIST)
|
||||
else:
|
||||
pass
|
||||
utils.logger.info("[BilibiliCrawler.start] Bilibili Crawler finished ...")
|
||||
|
||||
async def search(self):
|
||||
"""
|
||||
search bilibili video
|
||||
"""
|
||||
# Search for video and retrieve their comment information.
|
||||
if config.BILI_SEARCH_MODE == "normal":
|
||||
await self.search_by_keywords()
|
||||
elif config.BILI_SEARCH_MODE == "all_in_time_range":
|
||||
await self.search_by_keywords_in_time_range(daily_limit=False)
|
||||
elif config.BILI_SEARCH_MODE == "daily_limit_in_time_range":
|
||||
await self.search_by_keywords_in_time_range(daily_limit=True)
|
||||
else:
|
||||
utils.logger.warning(f"Unknown BILI_SEARCH_MODE: {config.BILI_SEARCH_MODE}")
|
||||
|
||||
@staticmethod
|
||||
async def get_pubtime_datetime(
|
||||
start: str = config.START_DAY,
|
||||
end: str = config.END_DAY,
|
||||
) -> Tuple[str, str]:
|
||||
"""
|
||||
获取 bilibili 作品发布日期起始时间戳 pubtime_begin_s 与发布日期结束时间戳 pubtime_end_s
|
||||
---
|
||||
:param start: 发布日期起始时间,YYYY-MM-DD
|
||||
:param end: 发布日期结束时间,YYYY-MM-DD
|
||||
|
||||
Note
|
||||
---
|
||||
- 搜索的时间范围为 start 至 end,包含 start 和 end
|
||||
- 若要搜索同一天的内容,为了包含 start 当天的搜索内容,则 pubtime_end_s 的值应该为 pubtime_begin_s 的值加上一天再减去一秒,即 start 当天的最后一秒
|
||||
- 如仅搜索 2024-01-05 的内容,pubtime_begin_s = 1704384000,pubtime_end_s = 1704470399
|
||||
转换为可读的 datetime 对象:pubtime_begin_s = datetime.datetime(2024, 1, 5, 0, 0),pubtime_end_s = datetime.datetime(2024, 1, 5, 23, 59, 59)
|
||||
- 若要搜索 start 至 end 的内容,为了包含 end 当天的搜索内容,则 pubtime_end_s 的值应该为 pubtime_end_s 的值加上一天再减去一秒,即 end 当天的最后一秒
|
||||
- 如搜索 2024-01-05 - 2024-01-06 的内容,pubtime_begin_s = 1704384000,pubtime_end_s = 1704556799
|
||||
转换为可读的 datetime 对象:pubtime_begin_s = datetime.datetime(2024, 1, 5, 0, 0),pubtime_end_s = datetime.datetime(2024, 1, 6, 23, 59, 59)
|
||||
"""
|
||||
# 转换 start 与 end 为 datetime 对象
|
||||
start_day: datetime = datetime.strptime(start, "%Y-%m-%d")
|
||||
end_day: datetime = datetime.strptime(end, "%Y-%m-%d")
|
||||
if start_day > end_day:
|
||||
raise ValueError("Wrong time range, please check your start and end argument, to ensure that the start cannot exceed end")
|
||||
elif start_day == end_day: # 搜索同一天的内容
|
||||
end_day = (start_day + timedelta(days=1) - timedelta(seconds=1)) # 则将 end_day 设置为 start_day + 1 day - 1 second
|
||||
else: # 搜索 start 至 end
|
||||
end_day = (end_day + timedelta(days=1) - timedelta(seconds=1)) # 则将 end_day 设置为 end_day + 1 day - 1 second
|
||||
# 将其重新转换为时间戳
|
||||
return str(int(start_day.timestamp())), str(int(end_day.timestamp()))
|
||||
|
||||
async def search_by_keywords(self):
|
||||
"""
|
||||
search bilibili video with keywords in normal mode
|
||||
:return:
|
||||
"""
|
||||
utils.logger.info("[BilibiliCrawler.search_by_keywords] Begin search bilibli keywords")
|
||||
bili_limit_count = 20 # bilibili limit page fixed value
|
||||
if config.CRAWLER_MAX_NOTES_COUNT < bili_limit_count:
|
||||
config.CRAWLER_MAX_NOTES_COUNT = bili_limit_count
|
||||
start_page = config.START_PAGE # start page number
|
||||
for keyword in config.KEYWORDS.split(","):
|
||||
source_keyword_var.set(keyword)
|
||||
utils.logger.info(f"[BilibiliCrawler.search_by_keywords] Current search keyword: {keyword}")
|
||||
page = 1
|
||||
while (page - start_page + 1) * bili_limit_count <= config.CRAWLER_MAX_NOTES_COUNT:
|
||||
if page < start_page:
|
||||
utils.logger.info(f"[BilibiliCrawler.search_by_keywords] Skip page: {page}")
|
||||
page += 1
|
||||
continue
|
||||
|
||||
utils.logger.info(f"[BilibiliCrawler.search_by_keywords] search bilibili keyword: {keyword}, page: {page}")
|
||||
video_id_list: List[str] = []
|
||||
videos_res = await self.bili_client.search_video_by_keyword(
|
||||
keyword=keyword,
|
||||
page=page,
|
||||
page_size=bili_limit_count,
|
||||
order=SearchOrderType.DEFAULT,
|
||||
pubtime_begin_s=0, # 作品发布日期起始时间戳
|
||||
pubtime_end_s=0, # 作品发布日期结束日期时间戳
|
||||
)
|
||||
video_list: List[Dict] = videos_res.get("result")
|
||||
|
||||
if not video_list:
|
||||
utils.logger.info(f"[BilibiliCrawler.search_by_keywords] No more videos for '{keyword}', moving to next keyword.")
|
||||
break
|
||||
|
||||
semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
|
||||
task_list = []
|
||||
try:
|
||||
task_list = [self.get_video_info_task(aid=video_item.get("aid"), bvid="", semaphore=semaphore) for video_item in video_list]
|
||||
except Exception as e:
|
||||
utils.logger.warning(f"[BilibiliCrawler.search_by_keywords] error in the task list. The video for this page will not be included. {e}")
|
||||
video_items = await asyncio.gather(*task_list)
|
||||
for video_item in video_items:
|
||||
if video_item:
|
||||
video_id_list.append(video_item.get("View").get("aid"))
|
||||
await bilibili_store.update_bilibili_video(video_item)
|
||||
await bilibili_store.update_up_info(video_item)
|
||||
await self.get_bilibili_video(video_item, semaphore)
|
||||
page += 1
|
||||
await self.batch_get_video_comments(video_id_list)
|
||||
|
||||
async def search_by_keywords_in_time_range(self, daily_limit: bool):
|
||||
"""
|
||||
Search bilibili video with keywords in a given time range.
|
||||
:param daily_limit: if True, strictly limit the number of notes per day and total.
|
||||
"""
|
||||
utils.logger.info(f"[BilibiliCrawler.search_by_keywords_in_time_range] Begin search with daily_limit={daily_limit}")
|
||||
bili_limit_count = 20
|
||||
start_page = config.START_PAGE
|
||||
|
||||
for keyword in config.KEYWORDS.split(","):
|
||||
source_keyword_var.set(keyword)
|
||||
utils.logger.info(f"[BilibiliCrawler.search_by_keywords_in_time_range] Current search keyword: {keyword}")
|
||||
total_notes_crawled_for_keyword = 0
|
||||
|
||||
for day in pd.date_range(start=config.START_DAY, end=config.END_DAY, freq="D"):
|
||||
if (daily_limit and total_notes_crawled_for_keyword >= config.CRAWLER_MAX_NOTES_COUNT):
|
||||
utils.logger.info(f"[BilibiliCrawler.search] Reached CRAWLER_MAX_NOTES_COUNT limit for keyword '{keyword}', skipping remaining days.")
|
||||
break
|
||||
|
||||
if (not daily_limit and total_notes_crawled_for_keyword >= config.CRAWLER_MAX_NOTES_COUNT):
|
||||
utils.logger.info(f"[BilibiliCrawler.search] Reached CRAWLER_MAX_NOTES_COUNT limit for keyword '{keyword}', skipping remaining days.")
|
||||
break
|
||||
|
||||
pubtime_begin_s, pubtime_end_s = await self.get_pubtime_datetime(start=day.strftime("%Y-%m-%d"), end=day.strftime("%Y-%m-%d"))
|
||||
page = 1
|
||||
notes_count_this_day = 0
|
||||
|
||||
while True:
|
||||
if notes_count_this_day >= config.MAX_NOTES_PER_DAY:
|
||||
utils.logger.info(f"[BilibiliCrawler.search] Reached MAX_NOTES_PER_DAY limit for {day.ctime()}.")
|
||||
break
|
||||
if (daily_limit and total_notes_crawled_for_keyword >= config.CRAWLER_MAX_NOTES_COUNT):
|
||||
utils.logger.info(f"[BilibiliCrawler.search] Reached CRAWLER_MAX_NOTES_COUNT limit for keyword '{keyword}'.")
|
||||
break
|
||||
if (not daily_limit and total_notes_crawled_for_keyword >= config.CRAWLER_MAX_NOTES_COUNT):
|
||||
break
|
||||
|
||||
try:
|
||||
utils.logger.info(f"[BilibiliCrawler.search] search bilibili keyword: {keyword}, date: {day.ctime()}, page: {page}")
|
||||
video_id_list: List[str] = []
|
||||
videos_res = await self.bili_client.search_video_by_keyword(
|
||||
keyword=keyword,
|
||||
page=page,
|
||||
page_size=bili_limit_count,
|
||||
order=SearchOrderType.DEFAULT,
|
||||
pubtime_begin_s=pubtime_begin_s,
|
||||
pubtime_end_s=pubtime_end_s,
|
||||
)
|
||||
video_list: List[Dict] = videos_res.get("result")
|
||||
|
||||
if not video_list:
|
||||
utils.logger.info(f"[BilibiliCrawler.search] No more videos for '{keyword}' on {day.ctime()}, moving to next day.")
|
||||
break
|
||||
|
||||
semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
|
||||
task_list = [self.get_video_info_task(aid=video_item.get("aid"), bvid="", semaphore=semaphore) for video_item in video_list]
|
||||
video_items = await asyncio.gather(*task_list)
|
||||
|
||||
for video_item in video_items:
|
||||
if video_item:
|
||||
if (daily_limit and total_notes_crawled_for_keyword >= config.CRAWLER_MAX_NOTES_COUNT):
|
||||
break
|
||||
if (not daily_limit and total_notes_crawled_for_keyword >= config.CRAWLER_MAX_NOTES_COUNT):
|
||||
break
|
||||
if notes_count_this_day >= config.MAX_NOTES_PER_DAY:
|
||||
break
|
||||
notes_count_this_day += 1
|
||||
total_notes_crawled_for_keyword += 1
|
||||
video_id_list.append(video_item.get("View").get("aid"))
|
||||
await bilibili_store.update_bilibili_video(video_item)
|
||||
await bilibili_store.update_up_info(video_item)
|
||||
await self.get_bilibili_video(video_item, semaphore)
|
||||
|
||||
page += 1
|
||||
await self.batch_get_video_comments(video_id_list)
|
||||
|
||||
except Exception as e:
|
||||
utils.logger.error(f"[BilibiliCrawler.search] Error searching on {day.ctime()}: {e}")
|
||||
break
|
||||
|
||||
async def batch_get_video_comments(self, video_id_list: List[str]):
|
||||
"""
|
||||
batch get video comments
|
||||
:param video_id_list:
|
||||
:return:
|
||||
"""
|
||||
if not config.ENABLE_GET_COMMENTS:
|
||||
utils.logger.info(f"[BilibiliCrawler.batch_get_note_comments] Crawling comment mode is not enabled")
|
||||
return
|
||||
|
||||
utils.logger.info(f"[BilibiliCrawler.batch_get_video_comments] video ids:{video_id_list}")
|
||||
semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
|
||||
task_list: List[Task] = []
|
||||
for video_id in video_id_list:
|
||||
task = asyncio.create_task(self.get_comments(video_id, semaphore), name=video_id)
|
||||
task_list.append(task)
|
||||
await asyncio.gather(*task_list)
|
||||
|
||||
async def get_comments(self, video_id: str, semaphore: asyncio.Semaphore):
|
||||
"""
|
||||
get comment for video id
|
||||
:param video_id:
|
||||
:param semaphore:
|
||||
:return:
|
||||
"""
|
||||
async with semaphore:
|
||||
try:
|
||||
utils.logger.info(f"[BilibiliCrawler.get_comments] begin get video_id: {video_id} comments ...")
|
||||
await asyncio.sleep(random.uniform(0.5, 1.5))
|
||||
await self.bili_client.get_video_all_comments(
|
||||
video_id=video_id,
|
||||
crawl_interval=random.random(),
|
||||
is_fetch_sub_comments=config.ENABLE_GET_SUB_COMMENTS,
|
||||
callback=bilibili_store.batch_update_bilibili_video_comments,
|
||||
max_count=config.CRAWLER_MAX_COMMENTS_COUNT_SINGLENOTES,
|
||||
)
|
||||
|
||||
except DataFetchError as ex:
|
||||
utils.logger.error(f"[BilibiliCrawler.get_comments] get video_id: {video_id} comment error: {ex}")
|
||||
except Exception as e:
|
||||
utils.logger.error(f"[BilibiliCrawler.get_comments] may be been blocked, err:{e}")
|
||||
# Propagate the exception to be caught by the main loop
|
||||
raise
|
||||
|
||||
async def get_creator_videos(self, creator_id: int):
|
||||
"""
|
||||
get videos for a creator
|
||||
:return:
|
||||
"""
|
||||
ps = 30
|
||||
pn = 1
|
||||
while True:
|
||||
result = await self.bili_client.get_creator_videos(creator_id, pn, ps)
|
||||
video_bvids_list = [video["bvid"] for video in result["list"]["vlist"]]
|
||||
await self.get_specified_videos(video_bvids_list)
|
||||
if int(result["page"]["count"]) <= pn * ps:
|
||||
break
|
||||
await asyncio.sleep(random.random())
|
||||
pn += 1
|
||||
|
||||
async def get_specified_videos(self, bvids_list: List[str]):
|
||||
"""
|
||||
get specified videos info
|
||||
:return:
|
||||
"""
|
||||
semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
|
||||
task_list = [self.get_video_info_task(aid=0, bvid=video_id, semaphore=semaphore) for video_id in bvids_list]
|
||||
video_details = await asyncio.gather(*task_list)
|
||||
video_aids_list = []
|
||||
for video_detail in video_details:
|
||||
if video_detail is not None:
|
||||
video_item_view: Dict = video_detail.get("View")
|
||||
video_aid: str = video_item_view.get("aid")
|
||||
if video_aid:
|
||||
video_aids_list.append(video_aid)
|
||||
await bilibili_store.update_bilibili_video(video_detail)
|
||||
await bilibili_store.update_up_info(video_detail)
|
||||
await self.get_bilibili_video(video_detail, semaphore)
|
||||
await self.batch_get_video_comments(video_aids_list)
|
||||
|
||||
async def get_video_info_task(self, aid: int, bvid: str, semaphore: asyncio.Semaphore) -> Optional[Dict]:
|
||||
"""
|
||||
Get video detail task
|
||||
:param aid:
|
||||
:param bvid:
|
||||
:param semaphore:
|
||||
:return:
|
||||
"""
|
||||
async with semaphore:
|
||||
try:
|
||||
result = await self.bili_client.get_video_info(aid=aid, bvid=bvid)
|
||||
return result
|
||||
except DataFetchError as ex:
|
||||
utils.logger.error(f"[BilibiliCrawler.get_video_info_task] Get video detail error: {ex}")
|
||||
return None
|
||||
except KeyError as ex:
|
||||
utils.logger.error(f"[BilibiliCrawler.get_video_info_task] have not fund note detail video_id:{bvid}, err: {ex}")
|
||||
return None
|
||||
|
||||
async def get_video_play_url_task(self, aid: int, cid: int, semaphore: asyncio.Semaphore) -> Union[Dict, None]:
|
||||
"""
|
||||
Get video play url
|
||||
:param aid:
|
||||
:param cid:
|
||||
:param semaphore:
|
||||
:return:
|
||||
"""
|
||||
async with semaphore:
|
||||
try:
|
||||
result = await self.bili_client.get_video_play_url(aid=aid, cid=cid)
|
||||
return result
|
||||
except DataFetchError as ex:
|
||||
utils.logger.error(f"[BilibiliCrawler.get_video_play_url_task] Get video play url error: {ex}")
|
||||
return None
|
||||
except KeyError as ex:
|
||||
utils.logger.error(f"[BilibiliCrawler.get_video_play_url_task] have not fund play url from :{aid}|{cid}, err: {ex}")
|
||||
return None
|
||||
|
||||
async def create_bilibili_client(self, httpx_proxy: Optional[str]) -> BilibiliClient:
|
||||
"""
|
||||
create bilibili client
|
||||
:param httpx_proxy: httpx proxy
|
||||
:return: bilibili client
|
||||
"""
|
||||
utils.logger.info("[BilibiliCrawler.create_bilibili_client] Begin create bilibili API client ...")
|
||||
cookie_str, cookie_dict = utils.convert_cookies(await self.browser_context.cookies())
|
||||
bilibili_client_obj = BilibiliClient(
|
||||
proxy=httpx_proxy,
|
||||
headers={
|
||||
"User-Agent": self.user_agent,
|
||||
"Cookie": cookie_str,
|
||||
"Origin": "https://www.bilibili.com",
|
||||
"Referer": "https://www.bilibili.com",
|
||||
"Content-Type": "application/json;charset=UTF-8",
|
||||
},
|
||||
playwright_page=self.context_page,
|
||||
cookie_dict=cookie_dict,
|
||||
)
|
||||
return bilibili_client_obj
|
||||
|
||||
async def launch_browser(
|
||||
self,
|
||||
chromium: BrowserType,
|
||||
playwright_proxy: Optional[Dict],
|
||||
user_agent: Optional[str],
|
||||
headless: bool = True,
|
||||
) -> BrowserContext:
|
||||
"""
|
||||
launch browser and create browser context
|
||||
:param chromium: chromium browser
|
||||
:param playwright_proxy: playwright proxy
|
||||
:param user_agent: user agent
|
||||
:param headless: headless mode
|
||||
:return: browser context
|
||||
"""
|
||||
utils.logger.info("[BilibiliCrawler.launch_browser] Begin create browser context ...")
|
||||
if config.SAVE_LOGIN_STATE:
|
||||
# feat issue #14
|
||||
# we will save login state to avoid login every time
|
||||
user_data_dir = os.path.join(os.getcwd(), "browser_data", config.USER_DATA_DIR % config.PLATFORM) # type: ignore
|
||||
browser_context = await chromium.launch_persistent_context(
|
||||
user_data_dir=user_data_dir,
|
||||
accept_downloads=True,
|
||||
headless=headless,
|
||||
proxy=playwright_proxy, # type: ignore
|
||||
viewport={
|
||||
"width": 1920,
|
||||
"height": 1080
|
||||
},
|
||||
user_agent=user_agent,
|
||||
)
|
||||
return browser_context
|
||||
else:
|
||||
# type: ignore
|
||||
browser = await chromium.launch(headless=headless, proxy=playwright_proxy)
|
||||
browser_context = await browser.new_context(viewport={"width": 1920, "height": 1080}, user_agent=user_agent)
|
||||
return browser_context
|
||||
|
||||
async def launch_browser_with_cdp(
|
||||
self,
|
||||
playwright: Playwright,
|
||||
playwright_proxy: Optional[Dict],
|
||||
user_agent: Optional[str],
|
||||
headless: bool = True,
|
||||
) -> BrowserContext:
|
||||
"""
|
||||
使用CDP模式启动浏览器
|
||||
"""
|
||||
try:
|
||||
self.cdp_manager = CDPBrowserManager()
|
||||
browser_context = await self.cdp_manager.launch_and_connect(
|
||||
playwright=playwright,
|
||||
playwright_proxy=playwright_proxy,
|
||||
user_agent=user_agent,
|
||||
headless=headless,
|
||||
)
|
||||
|
||||
# 显示浏览器信息
|
||||
browser_info = await self.cdp_manager.get_browser_info()
|
||||
utils.logger.info(f"[BilibiliCrawler] CDP浏览器信息: {browser_info}")
|
||||
|
||||
return browser_context
|
||||
|
||||
except Exception as e:
|
||||
utils.logger.error(f"[BilibiliCrawler] CDP模式启动失败,回退到标准模式: {e}")
|
||||
# 回退到标准模式
|
||||
chromium = playwright.chromium
|
||||
return await self.launch_browser(chromium, playwright_proxy, user_agent, headless)
|
||||
|
||||
async def close(self):
|
||||
"""Close browser context"""
|
||||
try:
|
||||
# 如果使用CDP模式,需要特殊处理
|
||||
if self.cdp_manager:
|
||||
await self.cdp_manager.cleanup()
|
||||
self.cdp_manager = None
|
||||
elif self.browser_context:
|
||||
await self.browser_context.close()
|
||||
utils.logger.info("[BilibiliCrawler.close] Browser context closed ...")
|
||||
except TargetClosedError:
|
||||
utils.logger.warning("[BilibiliCrawler.close] Browser context was already closed.")
|
||||
except Exception as e:
|
||||
utils.logger.error(f"[BilibiliCrawler.close] An error occurred during close: {e}")
|
||||
|
||||
async def get_bilibili_video(self, video_item: Dict, semaphore: asyncio.Semaphore):
|
||||
"""
|
||||
download bilibili video
|
||||
:param video_item:
|
||||
:param semaphore:
|
||||
:return:
|
||||
"""
|
||||
if not config.ENABLE_GET_MEIDAS:
|
||||
utils.logger.info(f"[BilibiliCrawler.get_bilibili_video] Crawling image mode is not enabled")
|
||||
return
|
||||
video_item_view: Dict = video_item.get("View")
|
||||
aid = video_item_view.get("aid")
|
||||
cid = video_item_view.get("cid")
|
||||
result = await self.get_video_play_url_task(aid, cid, semaphore)
|
||||
if result is None:
|
||||
utils.logger.info("[BilibiliCrawler.get_bilibili_video] get video play url failed")
|
||||
return
|
||||
durl_list = result.get("durl")
|
||||
max_size = -1
|
||||
video_url = ""
|
||||
for durl in durl_list:
|
||||
size = durl.get("size")
|
||||
if size > max_size:
|
||||
max_size = size
|
||||
video_url = durl.get("url")
|
||||
if video_url == "":
|
||||
utils.logger.info("[BilibiliCrawler.get_bilibili_video] get video url failed")
|
||||
return
|
||||
|
||||
content = await self.bili_client.get_video_media(video_url)
|
||||
await asyncio.sleep(random.random())
|
||||
if content is None:
|
||||
return
|
||||
extension_file_name = f"video.mp4"
|
||||
await bilibili_store.store_video(aid, content, extension_file_name)
|
||||
|
||||
async def get_all_creator_details(self, creator_id_list: List[int]):
|
||||
"""
|
||||
creator_id_list: get details for creator from creator_id_list
|
||||
"""
|
||||
utils.logger.info(f"[BilibiliCrawler.get_creator_details] Crawling the detalis of creator")
|
||||
utils.logger.info(f"[BilibiliCrawler.get_creator_details] creator ids:{creator_id_list}")
|
||||
|
||||
semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
|
||||
task_list: List[Task] = []
|
||||
try:
|
||||
for creator_id in creator_id_list:
|
||||
task = asyncio.create_task(self.get_creator_details(creator_id, semaphore), name=creator_id)
|
||||
task_list.append(task)
|
||||
except Exception as e:
|
||||
utils.logger.warning(f"[BilibiliCrawler.get_all_creator_details] error in the task list. The creator will not be included. {e}")
|
||||
|
||||
await asyncio.gather(*task_list)
|
||||
|
||||
async def get_creator_details(self, creator_id: int, semaphore: asyncio.Semaphore):
|
||||
"""
|
||||
get details for creator id
|
||||
:param creator_id:
|
||||
:param semaphore:
|
||||
:return:
|
||||
"""
|
||||
async with semaphore:
|
||||
creator_unhandled_info: Dict = await self.bili_client.get_creator_info(creator_id)
|
||||
creator_info: Dict = {
|
||||
"id": creator_id,
|
||||
"name": creator_unhandled_info.get("name"),
|
||||
"sign": creator_unhandled_info.get("sign"),
|
||||
"avatar": creator_unhandled_info.get("face"),
|
||||
}
|
||||
await self.get_fans(creator_info, semaphore)
|
||||
await self.get_followings(creator_info, semaphore)
|
||||
await self.get_dynamics(creator_info, semaphore)
|
||||
|
||||
async def get_fans(self, creator_info: Dict, semaphore: asyncio.Semaphore):
|
||||
"""
|
||||
get fans for creator id
|
||||
:param creator_info:
|
||||
:param semaphore:
|
||||
:return:
|
||||
"""
|
||||
creator_id = creator_info["id"]
|
||||
async with semaphore:
|
||||
try:
|
||||
utils.logger.info(f"[BilibiliCrawler.get_fans] begin get creator_id: {creator_id} fans ...")
|
||||
await self.bili_client.get_creator_all_fans(
|
||||
creator_info=creator_info,
|
||||
crawl_interval=random.random(),
|
||||
callback=bilibili_store.batch_update_bilibili_creator_fans,
|
||||
max_count=config.CRAWLER_MAX_CONTACTS_COUNT_SINGLENOTES,
|
||||
)
|
||||
|
||||
except DataFetchError as ex:
|
||||
utils.logger.error(f"[BilibiliCrawler.get_fans] get creator_id: {creator_id} fans error: {ex}")
|
||||
except Exception as e:
|
||||
utils.logger.error(f"[BilibiliCrawler.get_fans] may be been blocked, err:{e}")
|
||||
|
||||
async def get_followings(self, creator_info: Dict, semaphore: asyncio.Semaphore):
|
||||
"""
|
||||
get followings for creator id
|
||||
:param creator_info:
|
||||
:param semaphore:
|
||||
:return:
|
||||
"""
|
||||
creator_id = creator_info["id"]
|
||||
async with semaphore:
|
||||
try:
|
||||
utils.logger.info(f"[BilibiliCrawler.get_followings] begin get creator_id: {creator_id} followings ...")
|
||||
await self.bili_client.get_creator_all_followings(
|
||||
creator_info=creator_info,
|
||||
crawl_interval=random.random(),
|
||||
callback=bilibili_store.batch_update_bilibili_creator_followings,
|
||||
max_count=config.CRAWLER_MAX_CONTACTS_COUNT_SINGLENOTES,
|
||||
)
|
||||
|
||||
except DataFetchError as ex:
|
||||
utils.logger.error(f"[BilibiliCrawler.get_followings] get creator_id: {creator_id} followings error: {ex}")
|
||||
except Exception as e:
|
||||
utils.logger.error(f"[BilibiliCrawler.get_followings] may be been blocked, err:{e}")
|
||||
|
||||
async def get_dynamics(self, creator_info: Dict, semaphore: asyncio.Semaphore):
|
||||
"""
|
||||
get dynamics for creator id
|
||||
:param creator_info:
|
||||
:param semaphore:
|
||||
:return:
|
||||
"""
|
||||
creator_id = creator_info["id"]
|
||||
async with semaphore:
|
||||
try:
|
||||
utils.logger.info(f"[BilibiliCrawler.get_dynamics] begin get creator_id: {creator_id} dynamics ...")
|
||||
await self.bili_client.get_creator_all_dynamics(
|
||||
creator_info=creator_info,
|
||||
crawl_interval=random.random(),
|
||||
callback=bilibili_store.batch_update_bilibili_creator_dynamics,
|
||||
max_count=config.CRAWLER_MAX_DYNAMICS_COUNT_SINGLENOTES,
|
||||
)
|
||||
|
||||
except DataFetchError as ex:
|
||||
utils.logger.error(f"[BilibiliCrawler.get_dynamics] get creator_id: {creator_id} dynamics error: {ex}")
|
||||
except Exception as e:
|
||||
utils.logger.error(f"[BilibiliCrawler.get_dynamics] may be been blocked, err:{e}")
|
||||
@@ -0,0 +1,25 @@
|
||||
# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则:
|
||||
# 1. 不得用于任何商业用途。
|
||||
# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。
|
||||
# 3. 不得进行大规模爬取或对平台造成运营干扰。
|
||||
# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。
|
||||
# 5. 不得用于任何非法或不当的用途。
|
||||
#
|
||||
# 详细许可条款请参阅项目根目录下的LICENSE文件。
|
||||
# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。
|
||||
|
||||
|
||||
# -*- coding: utf-8 -*-
|
||||
# @Author : relakkes@gmail.com
|
||||
# @Time : 2023/12/2 18:44
|
||||
# @Desc :
|
||||
|
||||
from httpx import RequestError
|
||||
|
||||
|
||||
class DataFetchError(RequestError):
|
||||
"""something error when fetch"""
|
||||
|
||||
|
||||
class IPBlockError(RequestError):
|
||||
"""fetch so fast that the server block us ip"""
|
||||
@@ -0,0 +1,45 @@
|
||||
# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则:
|
||||
# 1. 不得用于任何商业用途。
|
||||
# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。
|
||||
# 3. 不得进行大规模爬取或对平台造成运营干扰。
|
||||
# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。
|
||||
# 5. 不得用于任何非法或不当的用途。
|
||||
#
|
||||
# 详细许可条款请参阅项目根目录下的LICENSE文件。
|
||||
# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。
|
||||
|
||||
|
||||
# -*- coding: utf-8 -*-
|
||||
# @Author : relakkes@gmail.com
|
||||
# @Time : 2023/12/3 16:20
|
||||
# @Desc :
|
||||
|
||||
from enum import Enum
|
||||
|
||||
|
||||
class SearchOrderType(Enum):
|
||||
# 综合排序
|
||||
DEFAULT = ""
|
||||
|
||||
# 最多点击
|
||||
MOST_CLICK = "click"
|
||||
|
||||
# 最新发布
|
||||
LAST_PUBLISH = "pubdate"
|
||||
|
||||
# 最多弹幕
|
||||
MOST_DANMU = "dm"
|
||||
|
||||
# 最多收藏
|
||||
MOST_MARK = "stow"
|
||||
|
||||
|
||||
class CommentOrderType(Enum):
|
||||
# 仅按热度
|
||||
DEFAULT = 0
|
||||
|
||||
# 按热度+按时间
|
||||
MIXED = 1
|
||||
|
||||
# 按时间
|
||||
TIME = 2
|
||||
@@ -0,0 +1,81 @@
|
||||
# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则:
|
||||
# 1. 不得用于任何商业用途。
|
||||
# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。
|
||||
# 3. 不得进行大规模爬取或对平台造成运营干扰。
|
||||
# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。
|
||||
# 5. 不得用于任何非法或不当的用途。
|
||||
#
|
||||
# 详细许可条款请参阅项目根目录下的LICENSE文件。
|
||||
# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。
|
||||
|
||||
|
||||
# -*- coding: utf-8 -*-
|
||||
# @Author : relakkes@gmail.com
|
||||
# @Time : 2023/12/2 23:26
|
||||
# @Desc : bilibili 请求参数签名
|
||||
# 逆向实现参考:https://socialsisteryi.github.io/bilibili-API-collect/docs/misc/sign/wbi.html#wbi%E7%AD%BE%E5%90%8D%E7%AE%97%E6%B3%95
|
||||
import urllib.parse
|
||||
from hashlib import md5
|
||||
from typing import Dict
|
||||
|
||||
from tools import utils
|
||||
|
||||
|
||||
class BilibiliSign:
|
||||
def __init__(self, img_key: str, sub_key: str):
|
||||
self.img_key = img_key
|
||||
self.sub_key = sub_key
|
||||
self.map_table = [
|
||||
46, 47, 18, 2, 53, 8, 23, 32, 15, 50, 10, 31, 58, 3, 45, 35, 27, 43, 5, 49,
|
||||
33, 9, 42, 19, 29, 28, 14, 39, 12, 38, 41, 13, 37, 48, 7, 16, 24, 55, 40,
|
||||
61, 26, 17, 0, 1, 60, 51, 30, 4, 22, 25, 54, 21, 56, 59, 6, 63, 57, 62, 11,
|
||||
36, 20, 34, 44, 52
|
||||
]
|
||||
|
||||
def get_salt(self) -> str:
|
||||
"""
|
||||
获取加盐的 key
|
||||
:return:
|
||||
"""
|
||||
salt = ""
|
||||
mixin_key = self.img_key + self.sub_key
|
||||
for mt in self.map_table:
|
||||
salt += mixin_key[mt]
|
||||
return salt[:32]
|
||||
|
||||
def sign(self, req_data: Dict) -> Dict:
|
||||
"""
|
||||
请求参数中加上当前时间戳对请求参数中的key进行字典序排序
|
||||
再将请求参数进行 url 编码集合 salt 进行 md5 就可以生成w_rid参数了
|
||||
:param req_data:
|
||||
:return:
|
||||
"""
|
||||
current_ts = utils.get_unix_timestamp()
|
||||
req_data.update({"wts": current_ts})
|
||||
req_data = dict(sorted(req_data.items()))
|
||||
req_data = {
|
||||
# 过滤 value 中的 "!'()*" 字符
|
||||
k: ''.join(filter(lambda ch: ch not in "!'()*", str(v)))
|
||||
for k, v
|
||||
in req_data.items()
|
||||
}
|
||||
query = urllib.parse.urlencode(req_data)
|
||||
salt = self.get_salt()
|
||||
wbi_sign = md5((query + salt).encode()).hexdigest() # 计算 w_rid
|
||||
req_data['w_rid'] = wbi_sign
|
||||
return req_data
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
_img_key = "7cd084941338484aae1ad9425b84077c"
|
||||
_sub_key = "4932caff0ff746eab6f01bf08b70ac45"
|
||||
_search_url = "__refresh__=true&_extra=&ad_resource=5654&category_id=&context=&dynamic_offset=0&from_source=&from_spmid=333.337&gaia_vtoken=&highlight=1&keyword=python&order=click&page=1&page_size=20&platform=pc&qv_id=OQ8f2qtgYdBV1UoEnqXUNUl8LEDAdzsD&search_type=video&single_column=0&source_tag=3&web_location=1430654"
|
||||
_req_data = dict()
|
||||
for params in _search_url.split("&"):
|
||||
kvalues = params.split("=")
|
||||
key = kvalues[0]
|
||||
value = kvalues[1]
|
||||
_req_data[key] = value
|
||||
print("pre req_data", _req_data)
|
||||
_req_data = BilibiliSign(img_key=_img_key, sub_key=_sub_key).sign(req_data={"aid":170001})
|
||||
print(_req_data)
|
||||
@@ -0,0 +1,118 @@
|
||||
# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则:
|
||||
# 1. 不得用于任何商业用途。
|
||||
# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。
|
||||
# 3. 不得进行大规模爬取或对平台造成运营干扰。
|
||||
# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。
|
||||
# 5. 不得用于任何非法或不当的用途。
|
||||
#
|
||||
# 详细许可条款请参阅项目根目录下的LICENSE文件。
|
||||
# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。
|
||||
|
||||
|
||||
# -*- coding: utf-8 -*-
|
||||
# @Author : relakkes@gmail.com
|
||||
# @Time : 2023/12/2 18:44
|
||||
# @Desc : bilibli登录实现类
|
||||
|
||||
import asyncio
|
||||
import functools
|
||||
import sys
|
||||
from typing import Optional
|
||||
|
||||
from playwright.async_api import BrowserContext, Page
|
||||
from tenacity import (RetryError, retry, retry_if_result, stop_after_attempt,
|
||||
wait_fixed)
|
||||
|
||||
import config
|
||||
from base.base_crawler import AbstractLogin
|
||||
from tools import utils
|
||||
|
||||
|
||||
class BilibiliLogin(AbstractLogin):
|
||||
def __init__(self,
|
||||
login_type: str,
|
||||
browser_context: BrowserContext,
|
||||
context_page: Page,
|
||||
login_phone: Optional[str] = "",
|
||||
cookie_str: str = ""
|
||||
):
|
||||
config.LOGIN_TYPE = login_type
|
||||
self.browser_context = browser_context
|
||||
self.context_page = context_page
|
||||
self.login_phone = login_phone
|
||||
self.cookie_str = cookie_str
|
||||
|
||||
async def begin(self):
|
||||
"""Start login bilibili"""
|
||||
utils.logger.info("[BilibiliLogin.begin] Begin login Bilibili ...")
|
||||
if config.LOGIN_TYPE == "qrcode":
|
||||
await self.login_by_qrcode()
|
||||
elif config.LOGIN_TYPE == "phone":
|
||||
await self.login_by_mobile()
|
||||
elif config.LOGIN_TYPE == "cookie":
|
||||
await self.login_by_cookies()
|
||||
else:
|
||||
raise ValueError(
|
||||
"[BilibiliLogin.begin] Invalid Login Type Currently only supported qrcode or phone or cookie ...")
|
||||
|
||||
@retry(stop=stop_after_attempt(600), wait=wait_fixed(1), retry=retry_if_result(lambda value: value is False))
|
||||
async def check_login_state(self) -> bool:
|
||||
"""
|
||||
Check if the current login status is successful and return True otherwise return False
|
||||
retry decorator will retry 20 times if the return value is False, and the retry interval is 1 second
|
||||
if max retry times reached, raise RetryError
|
||||
"""
|
||||
current_cookie = await self.browser_context.cookies()
|
||||
_, cookie_dict = utils.convert_cookies(current_cookie)
|
||||
if cookie_dict.get("SESSDATA", "") or cookie_dict.get("DedeUserID"):
|
||||
return True
|
||||
return False
|
||||
|
||||
async def login_by_qrcode(self):
|
||||
"""login bilibili website and keep webdriver login state"""
|
||||
utils.logger.info("[BilibiliLogin.login_by_qrcode] Begin login bilibili by qrcode ...")
|
||||
|
||||
# click login button
|
||||
login_button_ele = self.context_page.locator(
|
||||
"xpath=//div[@class='right-entry__outside go-login-btn']//div"
|
||||
)
|
||||
await login_button_ele.click()
|
||||
await asyncio.sleep(1)
|
||||
# find login qrcode
|
||||
qrcode_img_selector = "//div[@class='login-scan-box']//img"
|
||||
base64_qrcode_img = await utils.find_login_qrcode(
|
||||
self.context_page,
|
||||
selector=qrcode_img_selector
|
||||
)
|
||||
if not base64_qrcode_img:
|
||||
utils.logger.info("[BilibiliLogin.login_by_qrcode] login failed , have not found qrcode please check ....")
|
||||
sys.exit()
|
||||
|
||||
# show login qrcode
|
||||
partial_show_qrcode = functools.partial(utils.show_qrcode, base64_qrcode_img)
|
||||
asyncio.get_running_loop().run_in_executor(executor=None, func=partial_show_qrcode)
|
||||
|
||||
utils.logger.info(f"[BilibiliLogin.login_by_qrcode] Waiting for scan code login, remaining time is 20s")
|
||||
try:
|
||||
await self.check_login_state()
|
||||
except RetryError:
|
||||
utils.logger.info("[BilibiliLogin.login_by_qrcode] Login bilibili failed by qrcode login method ...")
|
||||
sys.exit()
|
||||
|
||||
wait_redirect_seconds = 5
|
||||
utils.logger.info(
|
||||
f"[BilibiliLogin.login_by_qrcode] Login successful then wait for {wait_redirect_seconds} seconds redirect ...")
|
||||
await asyncio.sleep(wait_redirect_seconds)
|
||||
|
||||
async def login_by_mobile(self):
|
||||
pass
|
||||
|
||||
async def login_by_cookies(self):
|
||||
utils.logger.info("[BilibiliLogin.login_by_qrcode] Begin login bilibili by cookie ...")
|
||||
for key, value in utils.convert_str_cookie_to_dict(self.cookie_str).items():
|
||||
await self.browser_context.add_cookies([{
|
||||
'name': key,
|
||||
'value': value,
|
||||
'domain': ".bilibili.com",
|
||||
'path': "/"
|
||||
}])
|
||||
@@ -0,0 +1,12 @@
|
||||
# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则:
|
||||
# 1. 不得用于任何商业用途。
|
||||
# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。
|
||||
# 3. 不得进行大规模爬取或对平台造成运营干扰。
|
||||
# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。
|
||||
# 5. 不得用于任何非法或不当的用途。
|
||||
#
|
||||
# 详细许可条款请参阅项目根目录下的LICENSE文件。
|
||||
# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。
|
||||
|
||||
|
||||
from .core import DouYinCrawler
|
||||
@@ -0,0 +1,326 @@
|
||||
# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则:
|
||||
# 1. 不得用于任何商业用途。
|
||||
# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。
|
||||
# 3. 不得进行大规模爬取或对平台造成运营干扰。
|
||||
# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。
|
||||
# 5. 不得用于任何非法或不当的用途。
|
||||
#
|
||||
# 详细许可条款请参阅项目根目录下的LICENSE文件。
|
||||
# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。
|
||||
|
||||
import asyncio
|
||||
import copy
|
||||
import json
|
||||
import urllib.parse
|
||||
from typing import Any, Callable, Dict, Union, Optional
|
||||
|
||||
import httpx
|
||||
from playwright.async_api import BrowserContext
|
||||
|
||||
from base.base_crawler import AbstractApiClient
|
||||
from tools import utils
|
||||
from var import request_keyword_var
|
||||
|
||||
from .exception import *
|
||||
from .field import *
|
||||
from .help import *
|
||||
|
||||
|
||||
class DouYinClient(AbstractApiClient):
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
timeout=60, # 若开启爬取媒体选项,抖音的短视频需要更久的超时时间
|
||||
proxy=None,
|
||||
*,
|
||||
headers: Dict,
|
||||
playwright_page: Optional[Page],
|
||||
cookie_dict: Dict,
|
||||
):
|
||||
self.proxy = proxy
|
||||
self.timeout = timeout
|
||||
self.headers = headers
|
||||
self._host = "https://www.douyin.com"
|
||||
self.playwright_page = playwright_page
|
||||
self.cookie_dict = cookie_dict
|
||||
|
||||
async def __process_req_params(
|
||||
self,
|
||||
uri: str,
|
||||
params: Optional[Dict] = None,
|
||||
headers: Optional[Dict] = None,
|
||||
request_method="GET",
|
||||
):
|
||||
|
||||
if not params:
|
||||
return
|
||||
headers = headers or self.headers
|
||||
local_storage: Dict = await self.playwright_page.evaluate("() => window.localStorage") # type: ignore
|
||||
common_params = {
|
||||
"device_platform": "webapp",
|
||||
"aid": "6383",
|
||||
"channel": "channel_pc_web",
|
||||
"version_code": "190600",
|
||||
"version_name": "19.6.0",
|
||||
"update_version_code": "170400",
|
||||
"pc_client_type": "1",
|
||||
"cookie_enabled": "true",
|
||||
"browser_language": "zh-CN",
|
||||
"browser_platform": "MacIntel",
|
||||
"browser_name": "Chrome",
|
||||
"browser_version": "125.0.0.0",
|
||||
"browser_online": "true",
|
||||
"engine_name": "Blink",
|
||||
"os_name": "Mac OS",
|
||||
"os_version": "10.15.7",
|
||||
"cpu_core_num": "8",
|
||||
"device_memory": "8",
|
||||
"engine_version": "109.0",
|
||||
"platform": "PC",
|
||||
"screen_width": "2560",
|
||||
"screen_height": "1440",
|
||||
'effective_type': '4g',
|
||||
"round_trip_time": "50",
|
||||
"webid": get_web_id(),
|
||||
"msToken": local_storage.get("xmst"),
|
||||
}
|
||||
params.update(common_params)
|
||||
query_string = urllib.parse.urlencode(params)
|
||||
|
||||
# 20240927 a-bogus更新(JS版本)
|
||||
post_data = {}
|
||||
if request_method == "POST":
|
||||
post_data = params
|
||||
a_bogus = await get_a_bogus(uri, query_string, post_data, headers["User-Agent"], self.playwright_page)
|
||||
params["a_bogus"] = a_bogus
|
||||
|
||||
async def request(self, method, url, **kwargs):
|
||||
async with httpx.AsyncClient(proxy=self.proxy) as client:
|
||||
response = await client.request(method, url, timeout=self.timeout, **kwargs)
|
||||
try:
|
||||
if response.text == "" or response.text == "blocked":
|
||||
utils.logger.error(f"request params incrr, response.text: {response.text}")
|
||||
raise Exception("account blocked")
|
||||
return response.json()
|
||||
except Exception as e:
|
||||
raise DataFetchError(f"{e}, {response.text}")
|
||||
|
||||
async def get(self, uri: str, params: Optional[Dict] = None, headers: Optional[Dict] = None):
|
||||
"""
|
||||
GET请求
|
||||
"""
|
||||
await self.__process_req_params(uri, params, headers)
|
||||
headers = headers or self.headers
|
||||
return await self.request(method="GET", url=f"{self._host}{uri}", params=params, headers=headers)
|
||||
|
||||
async def post(self, uri: str, data: dict, headers: Optional[Dict] = None):
|
||||
await self.__process_req_params(uri, data, headers)
|
||||
headers = headers or self.headers
|
||||
return await self.request(method="POST", url=f"{self._host}{uri}", data=data, headers=headers)
|
||||
|
||||
async def pong(self, browser_context: BrowserContext) -> bool:
|
||||
local_storage = await self.playwright_page.evaluate("() => window.localStorage")
|
||||
if local_storage.get("HasUserLogin", "") == "1":
|
||||
return True
|
||||
|
||||
_, cookie_dict = utils.convert_cookies(await browser_context.cookies())
|
||||
return cookie_dict.get("LOGIN_STATUS") == "1"
|
||||
|
||||
async def update_cookies(self, browser_context: BrowserContext):
|
||||
cookie_str, cookie_dict = utils.convert_cookies(await browser_context.cookies())
|
||||
self.headers["Cookie"] = cookie_str
|
||||
self.cookie_dict = cookie_dict
|
||||
|
||||
async def search_info_by_keyword(
|
||||
self,
|
||||
keyword: str,
|
||||
offset: int = 0,
|
||||
search_channel: SearchChannelType = SearchChannelType.GENERAL,
|
||||
sort_type: SearchSortType = SearchSortType.GENERAL,
|
||||
publish_time: PublishTimeType = PublishTimeType.UNLIMITED,
|
||||
search_id: str = "",
|
||||
):
|
||||
"""
|
||||
DouYin Web Search API
|
||||
:param keyword:
|
||||
:param offset:
|
||||
:param search_channel:
|
||||
:param sort_type:
|
||||
:param publish_time: ·
|
||||
:param search_id: ·
|
||||
:return:
|
||||
"""
|
||||
query_params = {
|
||||
'search_channel': search_channel.value,
|
||||
'enable_history': '1',
|
||||
'keyword': keyword,
|
||||
'search_source': 'tab_search',
|
||||
'query_correct_type': '1',
|
||||
'is_filter_search': '0',
|
||||
'from_group_id': '7378810571505847586',
|
||||
'offset': offset,
|
||||
'count': '15',
|
||||
'need_filter_settings': '1',
|
||||
'list_type': 'multi',
|
||||
'search_id': search_id,
|
||||
}
|
||||
if sort_type.value != SearchSortType.GENERAL.value or publish_time.value != PublishTimeType.UNLIMITED.value:
|
||||
query_params["filter_selected"] = json.dumps({"sort_type": str(sort_type.value), "publish_time": str(publish_time.value)})
|
||||
query_params["is_filter_search"] = 1
|
||||
query_params["search_source"] = "tab_search"
|
||||
referer_url = f"https://www.douyin.com/search/{keyword}?aid=f594bbd9-a0e2-4651-9319-ebe3cb6298c1&type=general"
|
||||
headers = copy.copy(self.headers)
|
||||
headers["Referer"] = urllib.parse.quote(referer_url, safe=':/')
|
||||
return await self.get("/aweme/v1/web/general/search/single/", query_params, headers=headers)
|
||||
|
||||
async def get_video_by_id(self, aweme_id: str) -> Any:
|
||||
"""
|
||||
DouYin Video Detail API
|
||||
:param aweme_id:
|
||||
:return:
|
||||
"""
|
||||
params = {"aweme_id": aweme_id}
|
||||
headers = copy.copy(self.headers)
|
||||
del headers["Origin"]
|
||||
res = await self.get("/aweme/v1/web/aweme/detail/", params, headers)
|
||||
return res.get("aweme_detail", {})
|
||||
|
||||
async def get_aweme_comments(self, aweme_id: str, cursor: int = 0):
|
||||
"""get note comments
|
||||
|
||||
"""
|
||||
uri = "/aweme/v1/web/comment/list/"
|
||||
params = {"aweme_id": aweme_id, "cursor": cursor, "count": 20, "item_type": 0}
|
||||
keywords = request_keyword_var.get()
|
||||
referer_url = "https://www.douyin.com/search/" + keywords + '?aid=3a3cec5a-9e27-4040-b6aa-ef548c2c1138&publish_time=0&sort_type=0&source=search_history&type=general'
|
||||
headers = copy.copy(self.headers)
|
||||
headers["Referer"] = urllib.parse.quote(referer_url, safe=':/')
|
||||
return await self.get(uri, params)
|
||||
|
||||
async def get_sub_comments(self, aweme_id: str, comment_id: str, cursor: int = 0):
|
||||
"""
|
||||
获取子评论
|
||||
"""
|
||||
uri = "/aweme/v1/web/comment/list/reply/"
|
||||
params = {
|
||||
'comment_id': comment_id,
|
||||
"cursor": cursor,
|
||||
"count": 20,
|
||||
"item_type": 0,
|
||||
"item_id": aweme_id,
|
||||
}
|
||||
keywords = request_keyword_var.get()
|
||||
referer_url = "https://www.douyin.com/search/" + keywords + '?aid=3a3cec5a-9e27-4040-b6aa-ef548c2c1138&publish_time=0&sort_type=0&source=search_history&type=general'
|
||||
headers = copy.copy(self.headers)
|
||||
headers["Referer"] = urllib.parse.quote(referer_url, safe=':/')
|
||||
return await self.get(uri, params)
|
||||
|
||||
async def get_aweme_all_comments(
|
||||
self,
|
||||
aweme_id: str,
|
||||
crawl_interval: float = 1.0,
|
||||
is_fetch_sub_comments=False,
|
||||
callback: Optional[Callable] = None,
|
||||
max_count: int = 10,
|
||||
):
|
||||
"""
|
||||
获取帖子的所有评论,包括子评论
|
||||
:param aweme_id: 帖子ID
|
||||
:param crawl_interval: 抓取间隔
|
||||
:param is_fetch_sub_comments: 是否抓取子评论
|
||||
:param callback: 回调函数,用于处理抓取到的评论
|
||||
:param max_count: 一次帖子爬取的最大评论数量
|
||||
:return: 评论列表
|
||||
"""
|
||||
result = []
|
||||
comments_has_more = 1
|
||||
comments_cursor = 0
|
||||
while comments_has_more and len(result) < max_count:
|
||||
comments_res = await self.get_aweme_comments(aweme_id, comments_cursor)
|
||||
comments_has_more = comments_res.get("has_more", 0)
|
||||
comments_cursor = comments_res.get("cursor", 0)
|
||||
comments = comments_res.get("comments", [])
|
||||
if not comments:
|
||||
continue
|
||||
if len(result) + len(comments) > max_count:
|
||||
comments = comments[:max_count - len(result)]
|
||||
result.extend(comments)
|
||||
if callback: # 如果有回调函数,就执行回调函数
|
||||
await callback(aweme_id, comments)
|
||||
|
||||
await asyncio.sleep(crawl_interval)
|
||||
if not is_fetch_sub_comments:
|
||||
continue
|
||||
# 获取二级评论
|
||||
for comment in comments:
|
||||
reply_comment_total = comment.get("reply_comment_total")
|
||||
|
||||
if reply_comment_total > 0:
|
||||
comment_id = comment.get("cid")
|
||||
sub_comments_has_more = 1
|
||||
sub_comments_cursor = 0
|
||||
|
||||
while sub_comments_has_more:
|
||||
sub_comments_res = await self.get_sub_comments(aweme_id, comment_id, sub_comments_cursor)
|
||||
sub_comments_has_more = sub_comments_res.get("has_more", 0)
|
||||
sub_comments_cursor = sub_comments_res.get("cursor", 0)
|
||||
sub_comments = sub_comments_res.get("comments", [])
|
||||
|
||||
if not sub_comments:
|
||||
continue
|
||||
result.extend(sub_comments)
|
||||
if callback: # 如果有回调函数,就执行回调函数
|
||||
await callback(aweme_id, sub_comments)
|
||||
await asyncio.sleep(crawl_interval)
|
||||
return result
|
||||
|
||||
async def get_user_info(self, sec_user_id: str):
|
||||
uri = "/aweme/v1/web/user/profile/other/"
|
||||
params = {
|
||||
"sec_user_id": sec_user_id,
|
||||
"publish_video_strategy_type": 2,
|
||||
"personal_center_strategy": 1,
|
||||
}
|
||||
return await self.get(uri, params)
|
||||
|
||||
async def get_user_aweme_posts(self, sec_user_id: str, max_cursor: str = "") -> Dict:
|
||||
uri = "/aweme/v1/web/aweme/post/"
|
||||
params = {
|
||||
"sec_user_id": sec_user_id,
|
||||
"count": 18,
|
||||
"max_cursor": max_cursor,
|
||||
"locate_query": "false",
|
||||
"publish_video_strategy_type": 2,
|
||||
'verifyFp': 'verify_ma3hrt8n_q2q2HyYA_uLyO_4N6D_BLvX_E2LgoGmkA1BU',
|
||||
'fp': 'verify_ma3hrt8n_q2q2HyYA_uLyO_4N6D_BLvX_E2LgoGmkA1BU'
|
||||
}
|
||||
return await self.get(uri, params)
|
||||
|
||||
async def get_all_user_aweme_posts(self, sec_user_id: str, callback: Optional[Callable] = None):
|
||||
posts_has_more = 1
|
||||
max_cursor = ""
|
||||
result = []
|
||||
while posts_has_more == 1:
|
||||
aweme_post_res = await self.get_user_aweme_posts(sec_user_id, max_cursor)
|
||||
posts_has_more = aweme_post_res.get("has_more", 0)
|
||||
max_cursor = aweme_post_res.get("max_cursor")
|
||||
aweme_list = aweme_post_res.get("aweme_list") if aweme_post_res.get("aweme_list") else []
|
||||
utils.logger.info(f"[DouYinClient.get_all_user_aweme_posts] get sec_user_id:{sec_user_id} video len : {len(aweme_list)}")
|
||||
if callback:
|
||||
await callback(aweme_list)
|
||||
result.extend(aweme_list)
|
||||
return result
|
||||
|
||||
async def get_aweme_media(self, url: str) -> Union[bytes, None]:
|
||||
async with httpx.AsyncClient(proxy=self.proxy) as client:
|
||||
try:
|
||||
response = await client.request("GET", url, timeout=self.timeout, follow_redirects=True)
|
||||
response.raise_for_status()
|
||||
if not response.reason_phrase == "OK":
|
||||
utils.logger.error(f"[DouYinClient.get_aweme_media] request {url} err, res:{response.text}")
|
||||
return None
|
||||
else:
|
||||
return response.content
|
||||
except httpx.HTTPError as exc: # some wrong when call httpx.request method, such as connection error, client error, server error or response status code is not 2xx
|
||||
utils.logger.error(f"[DouYinClient.get_aweme_media] {exc.__class__.__name__} for {exc.request.url} - {exc}") # 保留原始异常类型名称,以便开发者调试
|
||||
return None
|
||||
@@ -0,0 +1,393 @@
|
||||
# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则:
|
||||
# 1. 不得用于任何商业用途。
|
||||
# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。
|
||||
# 3. 不得进行大规模爬取或对平台造成运营干扰。
|
||||
# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。
|
||||
# 5. 不得用于任何非法或不当的用途。
|
||||
#
|
||||
# 详细许可条款请参阅项目根目录下的LICENSE文件。
|
||||
# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。
|
||||
|
||||
import asyncio
|
||||
import os
|
||||
import random
|
||||
from asyncio import Task
|
||||
from typing import Any, Dict, List, Optional, Tuple
|
||||
|
||||
from playwright.async_api import (
|
||||
BrowserContext,
|
||||
BrowserType,
|
||||
Page,
|
||||
Playwright,
|
||||
async_playwright,
|
||||
)
|
||||
|
||||
import config
|
||||
from base.base_crawler import AbstractCrawler
|
||||
from proxy.proxy_ip_pool import IpInfoModel, create_ip_pool
|
||||
from store import douyin as douyin_store
|
||||
from tools import utils
|
||||
from tools.cdp_browser import CDPBrowserManager
|
||||
from var import crawler_type_var, source_keyword_var
|
||||
|
||||
from .client import DouYinClient
|
||||
from .exception import DataFetchError
|
||||
from .field import PublishTimeType
|
||||
from .login import DouYinLogin
|
||||
|
||||
|
||||
class DouYinCrawler(AbstractCrawler):
|
||||
context_page: Page
|
||||
dy_client: DouYinClient
|
||||
browser_context: BrowserContext
|
||||
cdp_manager: Optional[CDPBrowserManager]
|
||||
|
||||
def __init__(self) -> None:
|
||||
self.index_url = "https://www.douyin.com"
|
||||
self.cdp_manager = None
|
||||
|
||||
async def start(self) -> None:
|
||||
playwright_proxy_format, httpx_proxy_format = None, None
|
||||
if config.ENABLE_IP_PROXY:
|
||||
ip_proxy_pool = await create_ip_pool(config.IP_PROXY_POOL_COUNT, enable_validate_ip=True)
|
||||
ip_proxy_info: IpInfoModel = await ip_proxy_pool.get_proxy()
|
||||
playwright_proxy_format, httpx_proxy_format = utils.format_proxy_info(ip_proxy_info)
|
||||
|
||||
async with async_playwright() as playwright:
|
||||
# 根据配置选择启动模式
|
||||
if config.ENABLE_CDP_MODE:
|
||||
utils.logger.info("[DouYinCrawler] 使用CDP模式启动浏览器")
|
||||
self.browser_context = await self.launch_browser_with_cdp(
|
||||
playwright,
|
||||
playwright_proxy_format,
|
||||
None,
|
||||
headless=config.CDP_HEADLESS,
|
||||
)
|
||||
else:
|
||||
utils.logger.info("[DouYinCrawler] 使用标准模式启动浏览器")
|
||||
# Launch a browser context.
|
||||
chromium = playwright.chromium
|
||||
self.browser_context = await self.launch_browser(
|
||||
chromium,
|
||||
playwright_proxy_format,
|
||||
user_agent=None,
|
||||
headless=config.HEADLESS,
|
||||
)
|
||||
# stealth.min.js is a js script to prevent the website from detecting the crawler.
|
||||
await self.browser_context.add_init_script(path="libs/stealth.min.js")
|
||||
self.context_page = await self.browser_context.new_page()
|
||||
await self.context_page.goto(self.index_url)
|
||||
|
||||
self.dy_client = await self.create_douyin_client(httpx_proxy_format)
|
||||
if not await self.dy_client.pong(browser_context=self.browser_context):
|
||||
login_obj = DouYinLogin(
|
||||
login_type=config.LOGIN_TYPE,
|
||||
login_phone="", # you phone number
|
||||
browser_context=self.browser_context,
|
||||
context_page=self.context_page,
|
||||
cookie_str=config.COOKIES,
|
||||
)
|
||||
await login_obj.begin()
|
||||
await self.dy_client.update_cookies(browser_context=self.browser_context)
|
||||
crawler_type_var.set(config.CRAWLER_TYPE)
|
||||
if config.CRAWLER_TYPE == "search":
|
||||
# Search for notes and retrieve their comment information.
|
||||
await self.search()
|
||||
elif config.CRAWLER_TYPE == "detail":
|
||||
# Get the information and comments of the specified post
|
||||
await self.get_specified_awemes()
|
||||
elif config.CRAWLER_TYPE == "creator":
|
||||
# Get the information and comments of the specified creator
|
||||
await self.get_creators_and_videos()
|
||||
|
||||
utils.logger.info("[DouYinCrawler.start] Douyin Crawler finished ...")
|
||||
|
||||
async def search(self) -> None:
|
||||
utils.logger.info("[DouYinCrawler.search] Begin search douyin keywords")
|
||||
dy_limit_count = 10 # douyin limit page fixed value
|
||||
if config.CRAWLER_MAX_NOTES_COUNT < dy_limit_count:
|
||||
config.CRAWLER_MAX_NOTES_COUNT = dy_limit_count
|
||||
start_page = config.START_PAGE # start page number
|
||||
for keyword in config.KEYWORDS.split(","):
|
||||
source_keyword_var.set(keyword)
|
||||
utils.logger.info(f"[DouYinCrawler.search] Current keyword: {keyword}")
|
||||
aweme_list: List[str] = []
|
||||
page = 0
|
||||
dy_search_id = ""
|
||||
while (page - start_page + 1) * dy_limit_count <= config.CRAWLER_MAX_NOTES_COUNT:
|
||||
if page < start_page:
|
||||
utils.logger.info(f"[DouYinCrawler.search] Skip {page}")
|
||||
page += 1
|
||||
continue
|
||||
try:
|
||||
utils.logger.info(f"[DouYinCrawler.search] search douyin keyword: {keyword}, page: {page}")
|
||||
posts_res = await self.dy_client.search_info_by_keyword(
|
||||
keyword=keyword,
|
||||
offset=page * dy_limit_count - dy_limit_count,
|
||||
publish_time=PublishTimeType(config.PUBLISH_TIME_TYPE),
|
||||
search_id=dy_search_id,
|
||||
)
|
||||
if posts_res.get("data") is None or posts_res.get("data") == []:
|
||||
utils.logger.info(f"[DouYinCrawler.search] search douyin keyword: {keyword}, page: {page} is empty,{posts_res.get('data')}`")
|
||||
break
|
||||
except DataFetchError:
|
||||
utils.logger.error(f"[DouYinCrawler.search] search douyin keyword: {keyword} failed")
|
||||
break
|
||||
|
||||
page += 1
|
||||
if "data" not in posts_res:
|
||||
utils.logger.error(f"[DouYinCrawler.search] search douyin keyword: {keyword} failed,账号也许被风控了。")
|
||||
break
|
||||
dy_search_id = posts_res.get("extra", {}).get("logid", "")
|
||||
for post_item in posts_res.get("data"):
|
||||
try:
|
||||
aweme_info: Dict = (post_item.get("aweme_info") or post_item.get("aweme_mix_info", {}).get("mix_items")[0])
|
||||
except TypeError:
|
||||
continue
|
||||
aweme_list.append(aweme_info.get("aweme_id", ""))
|
||||
await douyin_store.update_douyin_aweme(aweme_item=aweme_info)
|
||||
await self.get_aweme_media(aweme_item=aweme_info)
|
||||
utils.logger.info(f"[DouYinCrawler.search] keyword:{keyword}, aweme_list:{aweme_list}")
|
||||
await self.batch_get_note_comments(aweme_list)
|
||||
|
||||
async def get_specified_awemes(self):
|
||||
"""Get the information and comments of the specified post"""
|
||||
semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
|
||||
task_list = [self.get_aweme_detail(aweme_id=aweme_id, semaphore=semaphore) for aweme_id in config.DY_SPECIFIED_ID_LIST]
|
||||
aweme_details = await asyncio.gather(*task_list)
|
||||
for aweme_detail in aweme_details:
|
||||
if aweme_detail is not None:
|
||||
await douyin_store.update_douyin_aweme(aweme_item=aweme_detail)
|
||||
await self.get_aweme_media(aweme_item=aweme_detail)
|
||||
await self.batch_get_note_comments(config.DY_SPECIFIED_ID_LIST)
|
||||
|
||||
async def get_aweme_detail(self, aweme_id: str, semaphore: asyncio.Semaphore) -> Any:
|
||||
"""Get note detail"""
|
||||
async with semaphore:
|
||||
try:
|
||||
return await self.dy_client.get_video_by_id(aweme_id)
|
||||
except DataFetchError as ex:
|
||||
utils.logger.error(f"[DouYinCrawler.get_aweme_detail] Get aweme detail error: {ex}")
|
||||
return None
|
||||
except KeyError as ex:
|
||||
utils.logger.error(f"[DouYinCrawler.get_aweme_detail] have not fund note detail aweme_id:{aweme_id}, err: {ex}")
|
||||
return None
|
||||
|
||||
async def batch_get_note_comments(self, aweme_list: List[str]) -> None:
|
||||
"""
|
||||
Batch get note comments
|
||||
"""
|
||||
if not config.ENABLE_GET_COMMENTS:
|
||||
utils.logger.info(f"[DouYinCrawler.batch_get_note_comments] Crawling comment mode is not enabled")
|
||||
return
|
||||
|
||||
task_list: List[Task] = []
|
||||
semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
|
||||
for aweme_id in aweme_list:
|
||||
task = asyncio.create_task(self.get_comments(aweme_id, semaphore), name=aweme_id)
|
||||
task_list.append(task)
|
||||
if len(task_list) > 0:
|
||||
await asyncio.wait(task_list)
|
||||
|
||||
async def get_comments(self, aweme_id: str, semaphore: asyncio.Semaphore) -> None:
|
||||
async with semaphore:
|
||||
try:
|
||||
# 将关键词列表传递给 get_aweme_all_comments 方法
|
||||
await self.dy_client.get_aweme_all_comments(
|
||||
aweme_id=aweme_id,
|
||||
crawl_interval=random.random(),
|
||||
is_fetch_sub_comments=config.ENABLE_GET_SUB_COMMENTS,
|
||||
callback=douyin_store.batch_update_dy_aweme_comments,
|
||||
max_count=config.CRAWLER_MAX_COMMENTS_COUNT_SINGLENOTES,
|
||||
)
|
||||
utils.logger.info(f"[DouYinCrawler.get_comments] aweme_id: {aweme_id} comments have all been obtained and filtered ...")
|
||||
except DataFetchError as e:
|
||||
utils.logger.error(f"[DouYinCrawler.get_comments] aweme_id: {aweme_id} get comments failed, error: {e}")
|
||||
|
||||
async def get_creators_and_videos(self) -> None:
|
||||
"""
|
||||
Get the information and videos of the specified creator
|
||||
"""
|
||||
utils.logger.info("[DouYinCrawler.get_creators_and_videos] Begin get douyin creators")
|
||||
for user_id in config.DY_CREATOR_ID_LIST:
|
||||
creator_info: Dict = await self.dy_client.get_user_info(user_id)
|
||||
if creator_info:
|
||||
await douyin_store.save_creator(user_id, creator=creator_info)
|
||||
|
||||
# Get all video information of the creator
|
||||
all_video_list = await self.dy_client.get_all_user_aweme_posts(sec_user_id=user_id, callback=self.fetch_creator_video_detail)
|
||||
|
||||
video_ids = [video_item.get("aweme_id") for video_item in all_video_list]
|
||||
await self.batch_get_note_comments(video_ids)
|
||||
|
||||
async def fetch_creator_video_detail(self, video_list: List[Dict]):
|
||||
"""
|
||||
Concurrently obtain the specified post list and save the data
|
||||
"""
|
||||
semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
|
||||
task_list = [self.get_aweme_detail(post_item.get("aweme_id"), semaphore) for post_item in video_list]
|
||||
|
||||
note_details = await asyncio.gather(*task_list)
|
||||
for aweme_item in note_details:
|
||||
if aweme_item is not None:
|
||||
await douyin_store.update_douyin_aweme(aweme_item=aweme_item)
|
||||
await self.get_aweme_media(aweme_item=aweme_item)
|
||||
|
||||
async def create_douyin_client(self, httpx_proxy: Optional[str]) -> DouYinClient:
|
||||
"""Create douyin client"""
|
||||
cookie_str, cookie_dict = utils.convert_cookies(await self.browser_context.cookies()) # type: ignore
|
||||
douyin_client = DouYinClient(
|
||||
proxy=httpx_proxy,
|
||||
headers={
|
||||
"User-Agent": await self.context_page.evaluate("() => navigator.userAgent"),
|
||||
"Cookie": cookie_str,
|
||||
"Host": "www.douyin.com",
|
||||
"Origin": "https://www.douyin.com/",
|
||||
"Referer": "https://www.douyin.com/",
|
||||
"Content-Type": "application/json;charset=UTF-8",
|
||||
},
|
||||
playwright_page=self.context_page,
|
||||
cookie_dict=cookie_dict,
|
||||
)
|
||||
return douyin_client
|
||||
|
||||
async def launch_browser(
|
||||
self,
|
||||
chromium: BrowserType,
|
||||
playwright_proxy: Optional[Dict],
|
||||
user_agent: Optional[str],
|
||||
headless: bool = True,
|
||||
) -> BrowserContext:
|
||||
"""Launch browser and create browser context"""
|
||||
if config.SAVE_LOGIN_STATE:
|
||||
user_data_dir = os.path.join(os.getcwd(), "browser_data", config.USER_DATA_DIR % config.PLATFORM) # type: ignore
|
||||
browser_context = await chromium.launch_persistent_context(
|
||||
user_data_dir=user_data_dir,
|
||||
accept_downloads=True,
|
||||
headless=headless,
|
||||
proxy=playwright_proxy, # type: ignore
|
||||
viewport={
|
||||
"width": 1920,
|
||||
"height": 1080
|
||||
},
|
||||
user_agent=user_agent,
|
||||
) # type: ignore
|
||||
return browser_context
|
||||
else:
|
||||
browser = await chromium.launch(headless=headless, proxy=playwright_proxy) # type: ignore
|
||||
browser_context = await browser.new_context(viewport={"width": 1920, "height": 1080}, user_agent=user_agent)
|
||||
return browser_context
|
||||
|
||||
async def launch_browser_with_cdp(
|
||||
self,
|
||||
playwright: Playwright,
|
||||
playwright_proxy: Optional[Dict],
|
||||
user_agent: Optional[str],
|
||||
headless: bool = True,
|
||||
) -> BrowserContext:
|
||||
"""
|
||||
使用CDP模式启动浏览器
|
||||
"""
|
||||
try:
|
||||
self.cdp_manager = CDPBrowserManager()
|
||||
browser_context = await self.cdp_manager.launch_and_connect(
|
||||
playwright=playwright,
|
||||
playwright_proxy=playwright_proxy,
|
||||
user_agent=user_agent,
|
||||
headless=headless,
|
||||
)
|
||||
|
||||
# 添加反检测脚本
|
||||
await self.cdp_manager.add_stealth_script()
|
||||
|
||||
# 显示浏览器信息
|
||||
browser_info = await self.cdp_manager.get_browser_info()
|
||||
utils.logger.info(f"[DouYinCrawler] CDP浏览器信息: {browser_info}")
|
||||
|
||||
return browser_context
|
||||
|
||||
except Exception as e:
|
||||
utils.logger.error(f"[DouYinCrawler] CDP模式启动失败,回退到标准模式: {e}")
|
||||
# 回退到标准模式
|
||||
chromium = playwright.chromium
|
||||
return await self.launch_browser(chromium, playwright_proxy, user_agent, headless)
|
||||
|
||||
async def close(self) -> None:
|
||||
"""Close browser context"""
|
||||
# 如果使用CDP模式,需要特殊处理
|
||||
if self.cdp_manager:
|
||||
await self.cdp_manager.cleanup()
|
||||
self.cdp_manager = None
|
||||
else:
|
||||
await self.browser_context.close()
|
||||
utils.logger.info("[DouYinCrawler.close] Browser context closed ...")
|
||||
|
||||
async def get_aweme_media(self, aweme_item: Dict):
|
||||
"""
|
||||
获取抖音媒体,自动判断媒体类型是短视频还是帖子图片并下载
|
||||
|
||||
Args:
|
||||
aweme_item (Dict): 抖音作品详情
|
||||
"""
|
||||
if not config.ENABLE_GET_MEIDAS:
|
||||
utils.logger.info(f"[DouYinCrawler.get_aweme_media] Crawling image mode is not enabled")
|
||||
return
|
||||
# 笔记 urls 列表,若为短视频类型则返回为空列表
|
||||
note_download_url: List[str] = douyin_store._extract_note_image_list(aweme_item)
|
||||
# 视频 url,永远存在,但为短视频类型时的文件其实是音频文件
|
||||
video_download_url: str = douyin_store._extract_video_download_url(aweme_item)
|
||||
# TODO: 抖音并没采用音视频分离的策略,故音频可从原视频中分离,暂不提取
|
||||
if note_download_url:
|
||||
await self.get_aweme_images(aweme_item)
|
||||
else:
|
||||
await self.get_aweme_video(aweme_item)
|
||||
|
||||
async def get_aweme_images(self, aweme_item: Dict):
|
||||
"""
|
||||
get aweme images. please use get_aweme_media
|
||||
|
||||
Args:
|
||||
aweme_item (Dict): 抖音作品详情
|
||||
"""
|
||||
if not config.ENABLE_GET_MEIDAS:
|
||||
return
|
||||
aweme_id = aweme_item.get("aweme_id")
|
||||
# 笔记 urls 列表,若为短视频类型则返回为空列表
|
||||
note_download_url: List[str] = douyin_store._extract_note_image_list(aweme_item)
|
||||
|
||||
if not note_download_url:
|
||||
return
|
||||
picNum = 0
|
||||
for url in note_download_url:
|
||||
if not url:
|
||||
continue
|
||||
content = await self.dy_client.get_aweme_media(url)
|
||||
await asyncio.sleep(random.random())
|
||||
if content is None:
|
||||
continue
|
||||
extension_file_name = f"{picNum:>03d}.jpeg"
|
||||
picNum += 1
|
||||
await douyin_store.update_dy_aweme_image(aweme_id, content, extension_file_name)
|
||||
|
||||
async def get_aweme_video(self, aweme_item: Dict):
|
||||
"""
|
||||
get aweme videos. please use get_aweme_media
|
||||
|
||||
Args:
|
||||
aweme_item (Dict): 抖音作品详情
|
||||
"""
|
||||
if not config.ENABLE_GET_MEIDAS:
|
||||
return
|
||||
aweme_id = aweme_item.get("aweme_id")
|
||||
|
||||
# 视频 url,永远存在,但为短视频类型时的文件其实是音频文件
|
||||
video_download_url: str = douyin_store._extract_video_download_url(aweme_item)
|
||||
|
||||
if not video_download_url:
|
||||
return
|
||||
content = await self.dy_client.get_aweme_media(video_download_url)
|
||||
await asyncio.sleep(random.random())
|
||||
if content is None:
|
||||
return
|
||||
extension_file_name = f"video.mp4"
|
||||
await douyin_store.update_dy_aweme_video(aweme_id, content, extension_file_name)
|
||||
@@ -0,0 +1,20 @@
|
||||
# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则:
|
||||
# 1. 不得用于任何商业用途。
|
||||
# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。
|
||||
# 3. 不得进行大规模爬取或对平台造成运营干扰。
|
||||
# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。
|
||||
# 5. 不得用于任何非法或不当的用途。
|
||||
#
|
||||
# 详细许可条款请参阅项目根目录下的LICENSE文件。
|
||||
# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。
|
||||
|
||||
|
||||
from httpx import RequestError
|
||||
|
||||
|
||||
class DataFetchError(RequestError):
|
||||
"""something error when fetch"""
|
||||
|
||||
|
||||
class IPBlockError(RequestError):
|
||||
"""fetch so fast that the server block us ip"""
|
||||
@@ -0,0 +1,34 @@
|
||||
# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则:
|
||||
# 1. 不得用于任何商业用途。
|
||||
# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。
|
||||
# 3. 不得进行大规模爬取或对平台造成运营干扰。
|
||||
# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。
|
||||
# 5. 不得用于任何非法或不当的用途。
|
||||
#
|
||||
# 详细许可条款请参阅项目根目录下的LICENSE文件。
|
||||
# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。
|
||||
|
||||
|
||||
from enum import Enum
|
||||
|
||||
|
||||
class SearchChannelType(Enum):
|
||||
"""search channel type"""
|
||||
GENERAL = "aweme_general" # 综合
|
||||
VIDEO = "aweme_video_web" # 视频
|
||||
USER = "aweme_user_web" # 用户
|
||||
LIVE = "aweme_live" # 直播
|
||||
|
||||
|
||||
class SearchSortType(Enum):
|
||||
"""search sort type"""
|
||||
GENERAL = 0 # 综合排序
|
||||
MOST_LIKE = 1 # 最多点赞
|
||||
LATEST = 2 # 最新发布
|
||||
|
||||
class PublishTimeType(Enum):
|
||||
"""publish time type"""
|
||||
UNLIMITED = 0 # 不限
|
||||
ONE_DAY = 1 # 一天内
|
||||
ONE_WEEK = 7 # 一周内
|
||||
SIX_MONTH = 180 # 半年内
|
||||
@@ -0,0 +1,85 @@
|
||||
# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则:
|
||||
# 1. 不得用于任何商业用途。
|
||||
# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。
|
||||
# 3. 不得进行大规模爬取或对平台造成运营干扰。
|
||||
# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。
|
||||
# 5. 不得用于任何非法或不当的用途。
|
||||
#
|
||||
# 详细许可条款请参阅项目根目录下的LICENSE文件。
|
||||
# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。
|
||||
|
||||
|
||||
# -*- coding: utf-8 -*-
|
||||
# @Author : relakkes@gmail.com
|
||||
# @Name : 程序员阿江-Relakkes
|
||||
# @Time : 2024/6/10 02:24
|
||||
# @Desc : 获取 a_bogus 参数, 学习交流使用,请勿用作商业用途,侵权联系作者删除
|
||||
|
||||
import random
|
||||
|
||||
import execjs
|
||||
from playwright.async_api import Page
|
||||
|
||||
douyin_sign_obj = execjs.compile(open('libs/douyin.js', encoding='utf-8-sig').read())
|
||||
|
||||
def get_web_id():
|
||||
"""
|
||||
生成随机的webid
|
||||
Returns:
|
||||
|
||||
"""
|
||||
|
||||
def e(t):
|
||||
if t is not None:
|
||||
return str(t ^ (int(16 * random.random()) >> (t // 4)))
|
||||
else:
|
||||
return ''.join(
|
||||
[str(int(1e7)), '-', str(int(1e3)), '-', str(int(4e3)), '-', str(int(8e3)), '-', str(int(1e11))]
|
||||
)
|
||||
|
||||
web_id = ''.join(
|
||||
e(int(x)) if x in '018' else x for x in e(None)
|
||||
)
|
||||
return web_id.replace('-', '')[:19]
|
||||
|
||||
|
||||
|
||||
async def get_a_bogus(url: str, params: str, post_data: dict, user_agent: str, page: Page = None):
|
||||
"""
|
||||
获取 a_bogus 参数, 目前不支持post请求类型的签名
|
||||
"""
|
||||
return get_a_bogus_from_js(url, params, user_agent)
|
||||
|
||||
def get_a_bogus_from_js(url: str, params: str, user_agent: str):
|
||||
"""
|
||||
通过js获取 a_bogus 参数
|
||||
Args:
|
||||
url:
|
||||
params:
|
||||
user_agent:
|
||||
|
||||
Returns:
|
||||
|
||||
"""
|
||||
sign_js_name = "sign_datail"
|
||||
if "/reply" in url:
|
||||
sign_js_name = "sign_reply"
|
||||
return douyin_sign_obj.call(sign_js_name, params, user_agent)
|
||||
|
||||
|
||||
|
||||
async def get_a_bogus_from_playright(params: str, post_data: dict, user_agent: str, page: Page):
|
||||
"""
|
||||
通过playright获取 a_bogus 参数
|
||||
playwright版本已失效
|
||||
Returns:
|
||||
|
||||
"""
|
||||
if not post_data:
|
||||
post_data = ""
|
||||
a_bogus = await page.evaluate(
|
||||
"([params, post_data, ua]) => window.bdms.init._v[2].p[42].apply(null, [0, 1, 8, params, post_data, ua])",
|
||||
[params, post_data, user_agent])
|
||||
|
||||
return a_bogus
|
||||
|
||||
@@ -0,0 +1,265 @@
|
||||
# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则:
|
||||
# 1. 不得用于任何商业用途。
|
||||
# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。
|
||||
# 3. 不得进行大规模爬取或对平台造成运营干扰。
|
||||
# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。
|
||||
# 5. 不得用于任何非法或不当的用途。
|
||||
#
|
||||
# 详细许可条款请参阅项目根目录下的LICENSE文件。
|
||||
# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。
|
||||
|
||||
|
||||
import asyncio
|
||||
import functools
|
||||
import sys
|
||||
from typing import Optional
|
||||
|
||||
from playwright.async_api import BrowserContext, Page
|
||||
from playwright.async_api import TimeoutError as PlaywrightTimeoutError
|
||||
from tenacity import (RetryError, retry, retry_if_result, stop_after_attempt,
|
||||
wait_fixed)
|
||||
|
||||
import config
|
||||
from base.base_crawler import AbstractLogin
|
||||
from cache.cache_factory import CacheFactory
|
||||
from tools import utils
|
||||
|
||||
|
||||
class DouYinLogin(AbstractLogin):
|
||||
|
||||
def __init__(self,
|
||||
login_type: str,
|
||||
browser_context: BrowserContext, # type: ignore
|
||||
context_page: Page, # type: ignore
|
||||
login_phone: Optional[str] = "",
|
||||
cookie_str: Optional[str] = ""
|
||||
):
|
||||
config.LOGIN_TYPE = login_type
|
||||
self.browser_context = browser_context
|
||||
self.context_page = context_page
|
||||
self.login_phone = login_phone
|
||||
self.scan_qrcode_time = 60
|
||||
self.cookie_str = cookie_str
|
||||
|
||||
async def begin(self):
|
||||
"""
|
||||
Start login douyin website
|
||||
滑块中间页面的验证准确率不太OK... 如果没有特俗要求,建议不开抖音登录,或者使用cookies登录
|
||||
"""
|
||||
|
||||
# popup login dialog
|
||||
await self.popup_login_dialog()
|
||||
|
||||
# select login type
|
||||
if config.LOGIN_TYPE == "qrcode":
|
||||
await self.login_by_qrcode()
|
||||
elif config.LOGIN_TYPE == "phone":
|
||||
await self.login_by_mobile()
|
||||
elif config.LOGIN_TYPE == "cookie":
|
||||
await self.login_by_cookies()
|
||||
else:
|
||||
raise ValueError("[DouYinLogin.begin] Invalid Login Type Currently only supported qrcode or phone or cookie ...")
|
||||
|
||||
# 如果页面重定向到滑动验证码页面,需要再次滑动滑块
|
||||
await asyncio.sleep(6)
|
||||
current_page_title = await self.context_page.title()
|
||||
if "验证码中间页" in current_page_title:
|
||||
await self.check_page_display_slider(move_step=3, slider_level="hard")
|
||||
|
||||
# check login state
|
||||
utils.logger.info(f"[DouYinLogin.begin] login finished then check login state ...")
|
||||
try:
|
||||
await self.check_login_state()
|
||||
except RetryError:
|
||||
utils.logger.info("[DouYinLogin.begin] login failed please confirm ...")
|
||||
sys.exit()
|
||||
|
||||
# wait for redirect
|
||||
wait_redirect_seconds = 5
|
||||
utils.logger.info(f"[DouYinLogin.begin] Login successful then wait for {wait_redirect_seconds} seconds redirect ...")
|
||||
await asyncio.sleep(wait_redirect_seconds)
|
||||
|
||||
@retry(stop=stop_after_attempt(600), wait=wait_fixed(1), retry=retry_if_result(lambda value: value is False))
|
||||
async def check_login_state(self):
|
||||
"""Check if the current login status is successful and return True otherwise return False"""
|
||||
current_cookie = await self.browser_context.cookies()
|
||||
_, cookie_dict = utils.convert_cookies(current_cookie)
|
||||
|
||||
for page in self.browser_context.pages:
|
||||
try:
|
||||
local_storage = await page.evaluate("() => window.localStorage")
|
||||
if local_storage.get("HasUserLogin", "") == "1":
|
||||
return True
|
||||
except Exception as e:
|
||||
# utils.logger.warn(f"[DouYinLogin] check_login_state waring: {e}")
|
||||
await asyncio.sleep(0.1)
|
||||
|
||||
if cookie_dict.get("LOGIN_STATUS") == "1":
|
||||
return True
|
||||
|
||||
return False
|
||||
|
||||
async def popup_login_dialog(self):
|
||||
"""If the login dialog box does not pop up automatically, we will manually click the login button"""
|
||||
dialog_selector = "xpath=//div[@id='login-panel-new']"
|
||||
try:
|
||||
# check dialog box is auto popup and wait for 10 seconds
|
||||
await self.context_page.wait_for_selector(dialog_selector, timeout=1000 * 10)
|
||||
except Exception as e:
|
||||
utils.logger.error(f"[DouYinLogin.popup_login_dialog] login dialog box does not pop up automatically, error: {e}")
|
||||
utils.logger.info("[DouYinLogin.popup_login_dialog] login dialog box does not pop up automatically, we will manually click the login button")
|
||||
login_button_ele = self.context_page.locator("xpath=//p[text() = '登录']")
|
||||
await login_button_ele.click()
|
||||
await asyncio.sleep(0.5)
|
||||
|
||||
async def login_by_qrcode(self):
|
||||
utils.logger.info("[DouYinLogin.login_by_qrcode] Begin login douyin by qrcode...")
|
||||
qrcode_img_selector = "xpath=//div[@id='animate_qrcode_container']//img"
|
||||
base64_qrcode_img = await utils.find_login_qrcode(
|
||||
self.context_page,
|
||||
selector=qrcode_img_selector
|
||||
)
|
||||
if not base64_qrcode_img:
|
||||
utils.logger.info("[DouYinLogin.login_by_qrcode] login qrcode not found please confirm ...")
|
||||
sys.exit()
|
||||
|
||||
partial_show_qrcode = functools.partial(utils.show_qrcode, base64_qrcode_img)
|
||||
asyncio.get_running_loop().run_in_executor(executor=None, func=partial_show_qrcode)
|
||||
await asyncio.sleep(2)
|
||||
|
||||
async def login_by_mobile(self):
|
||||
utils.logger.info("[DouYinLogin.login_by_mobile] Begin login douyin by mobile ...")
|
||||
mobile_tap_ele = self.context_page.locator("xpath=//li[text() = '验证码登录']")
|
||||
await mobile_tap_ele.click()
|
||||
await self.context_page.wait_for_selector("xpath=//article[@class='web-login-mobile-code']")
|
||||
mobile_input_ele = self.context_page.locator("xpath=//input[@placeholder='手机号']")
|
||||
await mobile_input_ele.fill(self.login_phone)
|
||||
await asyncio.sleep(0.5)
|
||||
send_sms_code_btn = self.context_page.locator("xpath=//span[text() = '获取验证码']")
|
||||
await send_sms_code_btn.click()
|
||||
|
||||
# 检查是否有滑动验证码
|
||||
await self.check_page_display_slider(move_step=10, slider_level="easy")
|
||||
cache_client = CacheFactory.create_cache(config.CACHE_TYPE_MEMORY)
|
||||
max_get_sms_code_time = 60 * 2 # 最长获取验证码的时间为2分钟
|
||||
while max_get_sms_code_time > 0:
|
||||
utils.logger.info(f"[DouYinLogin.login_by_mobile] get douyin sms code from redis remaining time {max_get_sms_code_time}s ...")
|
||||
await asyncio.sleep(1)
|
||||
sms_code_key = f"dy_{self.login_phone}"
|
||||
sms_code_value = cache_client.get(sms_code_key)
|
||||
if not sms_code_value:
|
||||
max_get_sms_code_time -= 1
|
||||
continue
|
||||
|
||||
sms_code_input_ele = self.context_page.locator("xpath=//input[@placeholder='请输入验证码']")
|
||||
await sms_code_input_ele.fill(value=sms_code_value.decode())
|
||||
await asyncio.sleep(0.5)
|
||||
submit_btn_ele = self.context_page.locator("xpath=//button[@class='web-login-button']")
|
||||
await submit_btn_ele.click() # 点击登录
|
||||
# todo ... 应该还需要检查验证码的正确性有可能输入的验证码不正确
|
||||
break
|
||||
|
||||
async def check_page_display_slider(self, move_step: int = 10, slider_level: str = "easy"):
|
||||
"""
|
||||
检查页面是否出现滑动验证码
|
||||
:return:
|
||||
"""
|
||||
# 等待滑动验证码的出现
|
||||
back_selector = "#captcha-verify-image"
|
||||
try:
|
||||
await self.context_page.wait_for_selector(selector=back_selector, state="visible", timeout=30 * 1000)
|
||||
except PlaywrightTimeoutError: # 没有滑动验证码,直接返回
|
||||
return
|
||||
|
||||
gap_selector = 'xpath=//*[@id="captcha_container"]/div/div[2]/img[2]'
|
||||
max_slider_try_times = 20
|
||||
slider_verify_success = False
|
||||
while not slider_verify_success:
|
||||
if max_slider_try_times <= 0:
|
||||
utils.logger.error("[DouYinLogin.check_page_display_slider] slider verify failed ...")
|
||||
sys.exit()
|
||||
try:
|
||||
await self.move_slider(back_selector, gap_selector, move_step, slider_level)
|
||||
await asyncio.sleep(1)
|
||||
|
||||
# 如果滑块滑动慢了,或者验证失败了,会提示操作过慢,这里点一下刷新按钮
|
||||
page_content = await self.context_page.content()
|
||||
if "操作过慢" in page_content or "提示重新操作" in page_content:
|
||||
utils.logger.info("[DouYinLogin.check_page_display_slider] slider verify failed, retry ...")
|
||||
await self.context_page.click(selector="//a[contains(@class, 'secsdk_captcha_refresh')]")
|
||||
continue
|
||||
|
||||
# 滑动成功后,等待滑块消失
|
||||
await self.context_page.wait_for_selector(selector=back_selector, state="hidden", timeout=1000)
|
||||
# 如果滑块消失了,说明验证成功了,跳出循环,如果没有消失,说明验证失败了,上面这一行代码会抛出异常被捕获后继续循环滑动验证码
|
||||
utils.logger.info("[DouYinLogin.check_page_display_slider] slider verify success ...")
|
||||
slider_verify_success = True
|
||||
except Exception as e:
|
||||
utils.logger.error(f"[DouYinLogin.check_page_display_slider] slider verify failed, error: {e}")
|
||||
await asyncio.sleep(1)
|
||||
max_slider_try_times -= 1
|
||||
utils.logger.info(f"[DouYinLogin.check_page_display_slider] remaining slider try times: {max_slider_try_times}")
|
||||
continue
|
||||
|
||||
async def move_slider(self, back_selector: str, gap_selector: str, move_step: int = 10, slider_level="easy"):
|
||||
"""
|
||||
Move the slider to the right to complete the verification
|
||||
:param back_selector: 滑动验证码背景图片的选择器
|
||||
:param gap_selector: 滑动验证码的滑块选择器
|
||||
:param move_step: 是控制单次移动速度的比例是1/10 默认是1 相当于 传入的这个距离不管多远0.1秒钟移动完 越大越慢
|
||||
:param slider_level: 滑块难度 easy hard,分别对应手机验证码的滑块和验证码中间的滑块
|
||||
:return:
|
||||
"""
|
||||
|
||||
# get slider background image
|
||||
slider_back_elements = await self.context_page.wait_for_selector(
|
||||
selector=back_selector,
|
||||
timeout=1000 * 10, # wait 10 seconds
|
||||
)
|
||||
slide_back = str(await slider_back_elements.get_property("src")) # type: ignore
|
||||
|
||||
# get slider gap image
|
||||
gap_elements = await self.context_page.wait_for_selector(
|
||||
selector=gap_selector,
|
||||
timeout=1000 * 10, # wait 10 seconds
|
||||
)
|
||||
gap_src = str(await gap_elements.get_property("src")) # type: ignore
|
||||
|
||||
# 识别滑块位置
|
||||
slide_app = utils.Slide(gap=gap_src, bg=slide_back)
|
||||
distance = slide_app.discern()
|
||||
|
||||
# 获取移动轨迹
|
||||
tracks = utils.get_tracks(distance, slider_level)
|
||||
new_1 = tracks[-1] - (sum(tracks) - distance)
|
||||
tracks.pop()
|
||||
tracks.append(new_1)
|
||||
|
||||
# 根据轨迹拖拽滑块到指定位置
|
||||
element = await self.context_page.query_selector(gap_selector)
|
||||
bounding_box = await element.bounding_box() # type: ignore
|
||||
|
||||
await self.context_page.mouse.move(bounding_box["x"] + bounding_box["width"] / 2, # type: ignore
|
||||
bounding_box["y"] + bounding_box["height"] / 2) # type: ignore
|
||||
# 这里获取到x坐标中心点位置
|
||||
x = bounding_box["x"] + bounding_box["width"] / 2 # type: ignore
|
||||
# 模拟滑动操作
|
||||
await element.hover() # type: ignore
|
||||
await self.context_page.mouse.down()
|
||||
|
||||
for track in tracks:
|
||||
# 循环鼠标按照轨迹移动
|
||||
# steps 是控制单次移动速度的比例是1/10 默认是1 相当于 传入的这个距离不管多远0.1秒钟移动完 越大越慢
|
||||
await self.context_page.mouse.move(x + track, 0, steps=move_step)
|
||||
x += track
|
||||
await self.context_page.mouse.up()
|
||||
|
||||
async def login_by_cookies(self):
|
||||
utils.logger.info("[DouYinLogin.login_by_cookies] Begin login douyin by cookie ...")
|
||||
for key, value in utils.convert_str_cookie_to_dict(self.cookie_str).items():
|
||||
await self.browser_context.add_cookies([{
|
||||
'name': key,
|
||||
'value': value,
|
||||
'domain': ".douyin.com",
|
||||
'path': "/"
|
||||
}])
|
||||
@@ -0,0 +1,13 @@
|
||||
# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则:
|
||||
# 1. 不得用于任何商业用途。
|
||||
# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。
|
||||
# 3. 不得进行大规模爬取或对平台造成运营干扰。
|
||||
# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。
|
||||
# 5. 不得用于任何非法或不当的用途。
|
||||
#
|
||||
# 详细许可条款请参阅项目根目录下的LICENSE文件。
|
||||
# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。
|
||||
|
||||
|
||||
# -*- coding: utf-8 -*-
|
||||
from .core import KuaishouCrawler
|
||||
@@ -0,0 +1,313 @@
|
||||
# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则:
|
||||
# 1. 不得用于任何商业用途。
|
||||
# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。
|
||||
# 3. 不得进行大规模爬取或对平台造成运营干扰。
|
||||
# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。
|
||||
# 5. 不得用于任何非法或不当的用途。
|
||||
#
|
||||
# 详细许可条款请参阅项目根目录下的LICENSE文件。
|
||||
# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。
|
||||
|
||||
|
||||
# -*- coding: utf-8 -*-
|
||||
import asyncio
|
||||
import json
|
||||
from typing import Any, Callable, Dict, List, Optional
|
||||
from urllib.parse import urlencode
|
||||
|
||||
import httpx
|
||||
from playwright.async_api import BrowserContext, Page
|
||||
|
||||
import config
|
||||
from base.base_crawler import AbstractApiClient
|
||||
from tools import utils
|
||||
|
||||
from .exception import DataFetchError
|
||||
from .graphql import KuaiShouGraphQL
|
||||
|
||||
|
||||
class KuaiShouClient(AbstractApiClient):
|
||||
def __init__(
|
||||
self,
|
||||
timeout=10,
|
||||
proxy=None,
|
||||
*,
|
||||
headers: Dict[str, str],
|
||||
playwright_page: Page,
|
||||
cookie_dict: Dict[str, str],
|
||||
):
|
||||
self.proxy = proxy
|
||||
self.timeout = timeout
|
||||
self.headers = headers
|
||||
self._host = "https://www.kuaishou.com/graphql"
|
||||
self.playwright_page = playwright_page
|
||||
self.cookie_dict = cookie_dict
|
||||
self.graphql = KuaiShouGraphQL()
|
||||
|
||||
async def request(self, method, url, **kwargs) -> Any:
|
||||
async with httpx.AsyncClient(proxy=self.proxy) as client:
|
||||
response = await client.request(method, url, timeout=self.timeout, **kwargs)
|
||||
data: Dict = response.json()
|
||||
if data.get("errors"):
|
||||
raise DataFetchError(data.get("errors", "unkonw error"))
|
||||
else:
|
||||
return data.get("data", {})
|
||||
|
||||
async def get(self, uri: str, params=None) -> Dict:
|
||||
final_uri = uri
|
||||
if isinstance(params, dict):
|
||||
final_uri = f"{uri}?" f"{urlencode(params)}"
|
||||
return await self.request(
|
||||
method="GET", url=f"{self._host}{final_uri}", headers=self.headers
|
||||
)
|
||||
|
||||
async def post(self, uri: str, data: dict) -> Dict:
|
||||
json_str = json.dumps(data, separators=(",", ":"), ensure_ascii=False)
|
||||
return await self.request(
|
||||
method="POST", url=f"{self._host}{uri}", data=json_str, headers=self.headers
|
||||
)
|
||||
|
||||
async def pong(self) -> bool:
|
||||
"""get a note to check if login state is ok"""
|
||||
utils.logger.info("[KuaiShouClient.pong] Begin pong kuaishou...")
|
||||
ping_flag = False
|
||||
try:
|
||||
post_data = {
|
||||
"operationName": "visionProfileUserList",
|
||||
"variables": {
|
||||
"ftype": 1,
|
||||
},
|
||||
"query": self.graphql.get("vision_profile_user_list"),
|
||||
}
|
||||
res = await self.post("", post_data)
|
||||
if res.get("visionProfileUserList", {}).get("result") == 1:
|
||||
ping_flag = True
|
||||
except Exception as e:
|
||||
utils.logger.error(
|
||||
f"[KuaiShouClient.pong] Pong kuaishou failed: {e}, and try to login again..."
|
||||
)
|
||||
ping_flag = False
|
||||
return ping_flag
|
||||
|
||||
async def update_cookies(self, browser_context: BrowserContext):
|
||||
cookie_str, cookie_dict = utils.convert_cookies(await browser_context.cookies())
|
||||
self.headers["Cookie"] = cookie_str
|
||||
self.cookie_dict = cookie_dict
|
||||
|
||||
async def search_info_by_keyword(
|
||||
self, keyword: str, pcursor: str, search_session_id: str = ""
|
||||
):
|
||||
"""
|
||||
KuaiShou web search api
|
||||
:param keyword: search keyword
|
||||
:param pcursor: limite page curson
|
||||
:param search_session_id: search session id
|
||||
:return:
|
||||
"""
|
||||
post_data = {
|
||||
"operationName": "visionSearchPhoto",
|
||||
"variables": {
|
||||
"keyword": keyword,
|
||||
"pcursor": pcursor,
|
||||
"page": "search",
|
||||
"searchSessionId": search_session_id,
|
||||
},
|
||||
"query": self.graphql.get("search_query"),
|
||||
}
|
||||
return await self.post("", post_data)
|
||||
|
||||
async def get_video_info(self, photo_id: str) -> Dict:
|
||||
"""
|
||||
Kuaishou web video detail api
|
||||
:param photo_id:
|
||||
:return:
|
||||
"""
|
||||
post_data = {
|
||||
"operationName": "visionVideoDetail",
|
||||
"variables": {"photoId": photo_id, "page": "search"},
|
||||
"query": self.graphql.get("video_detail"),
|
||||
}
|
||||
return await self.post("", post_data)
|
||||
|
||||
async def get_video_comments(self, photo_id: str, pcursor: str = "") -> Dict:
|
||||
"""get video comments
|
||||
:param photo_id: photo id you want to fetch
|
||||
:param pcursor: last you get pcursor, defaults to ""
|
||||
:return:
|
||||
"""
|
||||
post_data = {
|
||||
"operationName": "commentListQuery",
|
||||
"variables": {"photoId": photo_id, "pcursor": pcursor},
|
||||
"query": self.graphql.get("comment_list"),
|
||||
}
|
||||
return await self.post("", post_data)
|
||||
|
||||
async def get_video_sub_comments(
|
||||
self, photo_id: str, rootCommentId: str, pcursor: str = ""
|
||||
) -> Dict:
|
||||
"""get video sub comments
|
||||
:param photo_id: photo id you want to fetch
|
||||
:param pcursor: last you get pcursor, defaults to ""
|
||||
:return:
|
||||
"""
|
||||
post_data = {
|
||||
"operationName": "visionSubCommentList",
|
||||
"variables": {
|
||||
"photoId": photo_id,
|
||||
"pcursor": pcursor,
|
||||
"rootCommentId": rootCommentId,
|
||||
},
|
||||
"query": self.graphql.get("vision_sub_comment_list"),
|
||||
}
|
||||
return await self.post("", post_data)
|
||||
|
||||
async def get_creator_profile(self, userId: str) -> Dict:
|
||||
post_data = {
|
||||
"operationName": "visionProfile",
|
||||
"variables": {"userId": userId},
|
||||
"query": self.graphql.get("vision_profile"),
|
||||
}
|
||||
return await self.post("", post_data)
|
||||
|
||||
async def get_video_by_creater(self, userId: str, pcursor: str = "") -> Dict:
|
||||
post_data = {
|
||||
"operationName": "visionProfilePhotoList",
|
||||
"variables": {"page": "profile", "pcursor": pcursor, "userId": userId},
|
||||
"query": self.graphql.get("vision_profile_photo_list"),
|
||||
}
|
||||
return await self.post("", post_data)
|
||||
|
||||
async def get_video_all_comments(
|
||||
self,
|
||||
photo_id: str,
|
||||
crawl_interval: float = 1.0,
|
||||
callback: Optional[Callable] = None,
|
||||
max_count: int = 10,
|
||||
):
|
||||
"""
|
||||
get video all comments include sub comments
|
||||
:param photo_id:
|
||||
:param crawl_interval:
|
||||
:param callback:
|
||||
:param max_count:
|
||||
:return:
|
||||
"""
|
||||
|
||||
result = []
|
||||
pcursor = ""
|
||||
|
||||
while pcursor != "no_more" and len(result) < max_count:
|
||||
comments_res = await self.get_video_comments(photo_id, pcursor)
|
||||
vision_commen_list = comments_res.get("visionCommentList", {})
|
||||
pcursor = vision_commen_list.get("pcursor", "")
|
||||
comments = vision_commen_list.get("rootComments", [])
|
||||
if len(result) + len(comments) > max_count:
|
||||
comments = comments[: max_count - len(result)]
|
||||
if callback: # 如果有回调函数,就执行回调函数
|
||||
await callback(photo_id, comments)
|
||||
result.extend(comments)
|
||||
await asyncio.sleep(crawl_interval)
|
||||
sub_comments = await self.get_comments_all_sub_comments(
|
||||
comments, photo_id, crawl_interval, callback
|
||||
)
|
||||
result.extend(sub_comments)
|
||||
return result
|
||||
|
||||
async def get_comments_all_sub_comments(
|
||||
self,
|
||||
comments: List[Dict],
|
||||
photo_id,
|
||||
crawl_interval: float = 1.0,
|
||||
callback: Optional[Callable] = None,
|
||||
) -> List[Dict]:
|
||||
"""
|
||||
获取指定一级评论下的所有二级评论, 该方法会一直查找一级评论下的所有二级评论信息
|
||||
Args:
|
||||
comments: 评论列表
|
||||
photo_id: 视频id
|
||||
crawl_interval: 爬取一次评论的延迟单位(秒)
|
||||
callback: 一次评论爬取结束后
|
||||
Returns:
|
||||
|
||||
"""
|
||||
if not config.ENABLE_GET_SUB_COMMENTS:
|
||||
utils.logger.info(
|
||||
f"[KuaiShouClient.get_comments_all_sub_comments] Crawling sub_comment mode is not enabled"
|
||||
)
|
||||
return []
|
||||
|
||||
result = []
|
||||
for comment in comments:
|
||||
sub_comments = comment.get("subComments")
|
||||
if sub_comments and callback:
|
||||
await callback(photo_id, sub_comments)
|
||||
|
||||
sub_comment_pcursor = comment.get("subCommentsPcursor")
|
||||
if sub_comment_pcursor == "no_more":
|
||||
continue
|
||||
|
||||
root_comment_id = comment.get("commentId")
|
||||
sub_comment_pcursor = ""
|
||||
|
||||
while sub_comment_pcursor != "no_more":
|
||||
comments_res = await self.get_video_sub_comments(
|
||||
photo_id, root_comment_id, sub_comment_pcursor
|
||||
)
|
||||
vision_sub_comment_list = comments_res.get("visionSubCommentList", {})
|
||||
sub_comment_pcursor = vision_sub_comment_list.get("pcursor", "no_more")
|
||||
|
||||
comments = vision_sub_comment_list.get("subComments", {})
|
||||
if callback:
|
||||
await callback(photo_id, comments)
|
||||
await asyncio.sleep(crawl_interval)
|
||||
result.extend(comments)
|
||||
return result
|
||||
|
||||
async def get_creator_info(self, user_id: str) -> Dict:
|
||||
"""
|
||||
eg: https://www.kuaishou.com/profile/3x4jtnbfter525a
|
||||
快手用户主页
|
||||
"""
|
||||
|
||||
visionProfile = await self.get_creator_profile(user_id)
|
||||
return visionProfile.get("userProfile")
|
||||
|
||||
async def get_all_videos_by_creator(
|
||||
self,
|
||||
user_id: str,
|
||||
crawl_interval: float = 1.0,
|
||||
callback: Optional[Callable] = None,
|
||||
) -> List[Dict]:
|
||||
"""
|
||||
获取指定用户下的所有发过的帖子,该方法会一直查找一个用户下的所有帖子信息
|
||||
Args:
|
||||
user_id: 用户ID
|
||||
crawl_interval: 爬取一次的延迟单位(秒)
|
||||
callback: 一次分页爬取结束后的更新回调函数
|
||||
Returns:
|
||||
|
||||
"""
|
||||
result = []
|
||||
pcursor = ""
|
||||
|
||||
while pcursor != "no_more":
|
||||
videos_res = await self.get_video_by_creater(user_id, pcursor)
|
||||
if not videos_res:
|
||||
utils.logger.error(
|
||||
f"[KuaiShouClient.get_all_videos_by_creator] The current creator may have been banned by ks, so they cannot access the data."
|
||||
)
|
||||
break
|
||||
|
||||
vision_profile_photo_list = videos_res.get("visionProfilePhotoList", {})
|
||||
pcursor = vision_profile_photo_list.get("pcursor", "")
|
||||
|
||||
videos = vision_profile_photo_list.get("feeds", [])
|
||||
utils.logger.info(
|
||||
f"[KuaiShouClient.get_all_videos_by_creator] got user_id:{user_id} videos len : {len(videos)}"
|
||||
)
|
||||
|
||||
if callback:
|
||||
await callback(videos)
|
||||
await asyncio.sleep(crawl_interval)
|
||||
result.extend(videos)
|
||||
return result
|
||||
@@ -0,0 +1,396 @@
|
||||
# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则:
|
||||
# 1. 不得用于任何商业用途。
|
||||
# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。
|
||||
# 3. 不得进行大规模爬取或对平台造成运营干扰。
|
||||
# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。
|
||||
# 5. 不得用于任何非法或不当的用途。
|
||||
#
|
||||
# 详细许可条款请参阅项目根目录下的LICENSE文件。
|
||||
# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。
|
||||
|
||||
|
||||
import asyncio
|
||||
import os
|
||||
import random
|
||||
import time
|
||||
from asyncio import Task
|
||||
from typing import Dict, List, Optional, Tuple
|
||||
|
||||
from playwright.async_api import (
|
||||
BrowserContext,
|
||||
BrowserType,
|
||||
Page,
|
||||
Playwright,
|
||||
async_playwright,
|
||||
)
|
||||
|
||||
import config
|
||||
from base.base_crawler import AbstractCrawler
|
||||
from proxy.proxy_ip_pool import IpInfoModel, create_ip_pool
|
||||
from store import kuaishou as kuaishou_store
|
||||
from tools import utils
|
||||
from tools.cdp_browser import CDPBrowserManager
|
||||
from var import comment_tasks_var, crawler_type_var, source_keyword_var
|
||||
|
||||
from .client import KuaiShouClient
|
||||
from .exception import DataFetchError
|
||||
from .login import KuaishouLogin
|
||||
|
||||
|
||||
class KuaishouCrawler(AbstractCrawler):
|
||||
context_page: Page
|
||||
ks_client: KuaiShouClient
|
||||
browser_context: BrowserContext
|
||||
cdp_manager: Optional[CDPBrowserManager]
|
||||
|
||||
def __init__(self):
|
||||
self.index_url = "https://www.kuaishou.com"
|
||||
self.user_agent = utils.get_user_agent()
|
||||
self.cdp_manager = None
|
||||
|
||||
async def start(self):
|
||||
playwright_proxy_format, httpx_proxy_format = None, None
|
||||
if config.ENABLE_IP_PROXY:
|
||||
ip_proxy_pool = await create_ip_pool(
|
||||
config.IP_PROXY_POOL_COUNT, enable_validate_ip=True
|
||||
)
|
||||
ip_proxy_info: IpInfoModel = await ip_proxy_pool.get_proxy()
|
||||
playwright_proxy_format, httpx_proxy_format = utils.format_proxy_info(
|
||||
ip_proxy_info
|
||||
)
|
||||
|
||||
async with async_playwright() as playwright:
|
||||
# 根据配置选择启动模式
|
||||
if config.ENABLE_CDP_MODE:
|
||||
utils.logger.info("[KuaishouCrawler] 使用CDP模式启动浏览器")
|
||||
self.browser_context = await self.launch_browser_with_cdp(
|
||||
playwright,
|
||||
playwright_proxy_format,
|
||||
self.user_agent,
|
||||
headless=config.CDP_HEADLESS,
|
||||
)
|
||||
else:
|
||||
utils.logger.info("[KuaishouCrawler] 使用标准模式启动浏览器")
|
||||
# Launch a browser context.
|
||||
chromium = playwright.chromium
|
||||
self.browser_context = await self.launch_browser(
|
||||
chromium, None, self.user_agent, headless=config.HEADLESS
|
||||
)
|
||||
# stealth.min.js is a js script to prevent the website from detecting the crawler.
|
||||
await self.browser_context.add_init_script(path="libs/stealth.min.js")
|
||||
self.context_page = await self.browser_context.new_page()
|
||||
await self.context_page.goto(f"{self.index_url}?isHome=1")
|
||||
|
||||
# Create a client to interact with the kuaishou website.
|
||||
self.ks_client = await self.create_ks_client(httpx_proxy_format)
|
||||
if not await self.ks_client.pong():
|
||||
login_obj = KuaishouLogin(
|
||||
login_type=config.LOGIN_TYPE,
|
||||
login_phone=httpx_proxy_format,
|
||||
browser_context=self.browser_context,
|
||||
context_page=self.context_page,
|
||||
cookie_str=config.COOKIES,
|
||||
)
|
||||
await login_obj.begin()
|
||||
await self.ks_client.update_cookies(
|
||||
browser_context=self.browser_context
|
||||
)
|
||||
|
||||
crawler_type_var.set(config.CRAWLER_TYPE)
|
||||
if config.CRAWLER_TYPE == "search":
|
||||
# Search for videos and retrieve their comment information.
|
||||
await self.search()
|
||||
elif config.CRAWLER_TYPE == "detail":
|
||||
# Get the information and comments of the specified post
|
||||
await self.get_specified_videos()
|
||||
elif config.CRAWLER_TYPE == "creator":
|
||||
# Get creator's information and their videos and comments
|
||||
await self.get_creators_and_videos()
|
||||
else:
|
||||
pass
|
||||
|
||||
utils.logger.info("[KuaishouCrawler.start] Kuaishou Crawler finished ...")
|
||||
|
||||
async def search(self):
|
||||
utils.logger.info("[KuaishouCrawler.search] Begin search kuaishou keywords")
|
||||
ks_limit_count = 20 # kuaishou limit page fixed value
|
||||
if config.CRAWLER_MAX_NOTES_COUNT < ks_limit_count:
|
||||
config.CRAWLER_MAX_NOTES_COUNT = ks_limit_count
|
||||
start_page = config.START_PAGE
|
||||
for keyword in config.KEYWORDS.split(","):
|
||||
search_session_id = ""
|
||||
source_keyword_var.set(keyword)
|
||||
utils.logger.info(
|
||||
f"[KuaishouCrawler.search] Current search keyword: {keyword}"
|
||||
)
|
||||
page = 1
|
||||
while (
|
||||
page - start_page + 1
|
||||
) * ks_limit_count <= config.CRAWLER_MAX_NOTES_COUNT:
|
||||
if page < start_page:
|
||||
utils.logger.info(f"[KuaishouCrawler.search] Skip page: {page}")
|
||||
page += 1
|
||||
continue
|
||||
utils.logger.info(
|
||||
f"[KuaishouCrawler.search] search kuaishou keyword: {keyword}, page: {page}"
|
||||
)
|
||||
video_id_list: List[str] = []
|
||||
videos_res = await self.ks_client.search_info_by_keyword(
|
||||
keyword=keyword,
|
||||
pcursor=str(page),
|
||||
search_session_id=search_session_id,
|
||||
)
|
||||
if not videos_res:
|
||||
utils.logger.error(
|
||||
f"[KuaishouCrawler.search] search info by keyword:{keyword} not found data"
|
||||
)
|
||||
continue
|
||||
|
||||
vision_search_photo: Dict = videos_res.get("visionSearchPhoto")
|
||||
if vision_search_photo.get("result") != 1:
|
||||
utils.logger.error(
|
||||
f"[KuaishouCrawler.search] search info by keyword:{keyword} not found data "
|
||||
)
|
||||
continue
|
||||
search_session_id = vision_search_photo.get("searchSessionId", "")
|
||||
for video_detail in vision_search_photo.get("feeds"):
|
||||
video_id_list.append(video_detail.get("photo", {}).get("id"))
|
||||
await kuaishou_store.update_kuaishou_video(video_item=video_detail)
|
||||
|
||||
# batch fetch video comments
|
||||
page += 1
|
||||
await self.batch_get_video_comments(video_id_list)
|
||||
|
||||
async def get_specified_videos(self):
|
||||
"""Get the information and comments of the specified post"""
|
||||
semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
|
||||
task_list = [
|
||||
self.get_video_info_task(video_id=video_id, semaphore=semaphore)
|
||||
for video_id in config.KS_SPECIFIED_ID_LIST
|
||||
]
|
||||
video_details = await asyncio.gather(*task_list)
|
||||
for video_detail in video_details:
|
||||
if video_detail is not None:
|
||||
await kuaishou_store.update_kuaishou_video(video_detail)
|
||||
await self.batch_get_video_comments(config.KS_SPECIFIED_ID_LIST)
|
||||
|
||||
async def get_video_info_task(
|
||||
self, video_id: str, semaphore: asyncio.Semaphore
|
||||
) -> Optional[Dict]:
|
||||
"""Get video detail task"""
|
||||
async with semaphore:
|
||||
try:
|
||||
result = await self.ks_client.get_video_info(video_id)
|
||||
utils.logger.info(
|
||||
f"[KuaishouCrawler.get_video_info_task] Get video_id:{video_id} info result: {result} ..."
|
||||
)
|
||||
return result.get("visionVideoDetail")
|
||||
except DataFetchError as ex:
|
||||
utils.logger.error(
|
||||
f"[KuaishouCrawler.get_video_info_task] Get video detail error: {ex}"
|
||||
)
|
||||
return None
|
||||
except KeyError as ex:
|
||||
utils.logger.error(
|
||||
f"[KuaishouCrawler.get_video_info_task] have not fund video detail video_id:{video_id}, err: {ex}"
|
||||
)
|
||||
return None
|
||||
|
||||
async def batch_get_video_comments(self, video_id_list: List[str]):
|
||||
"""
|
||||
batch get video comments
|
||||
:param video_id_list:
|
||||
:return:
|
||||
"""
|
||||
if not config.ENABLE_GET_COMMENTS:
|
||||
utils.logger.info(
|
||||
f"[KuaishouCrawler.batch_get_video_comments] Crawling comment mode is not enabled"
|
||||
)
|
||||
return
|
||||
|
||||
utils.logger.info(
|
||||
f"[KuaishouCrawler.batch_get_video_comments] video ids:{video_id_list}"
|
||||
)
|
||||
semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
|
||||
task_list: List[Task] = []
|
||||
for video_id in video_id_list:
|
||||
task = asyncio.create_task(
|
||||
self.get_comments(video_id, semaphore), name=video_id
|
||||
)
|
||||
task_list.append(task)
|
||||
|
||||
comment_tasks_var.set(task_list)
|
||||
await asyncio.gather(*task_list)
|
||||
|
||||
async def get_comments(self, video_id: str, semaphore: asyncio.Semaphore):
|
||||
"""
|
||||
get comment for video id
|
||||
:param video_id:
|
||||
:param semaphore:
|
||||
:return:
|
||||
"""
|
||||
async with semaphore:
|
||||
try:
|
||||
utils.logger.info(
|
||||
f"[KuaishouCrawler.get_comments] begin get video_id: {video_id} comments ..."
|
||||
)
|
||||
await self.ks_client.get_video_all_comments(
|
||||
photo_id=video_id,
|
||||
crawl_interval=random.random(),
|
||||
callback=kuaishou_store.batch_update_ks_video_comments,
|
||||
max_count=config.CRAWLER_MAX_COMMENTS_COUNT_SINGLENOTES,
|
||||
)
|
||||
except DataFetchError as ex:
|
||||
utils.logger.error(
|
||||
f"[KuaishouCrawler.get_comments] get video_id: {video_id} comment error: {ex}"
|
||||
)
|
||||
except Exception as e:
|
||||
utils.logger.error(
|
||||
f"[KuaishouCrawler.get_comments] may be been blocked, err:{e}"
|
||||
)
|
||||
# use time.sleeep block main coroutine instead of asyncio.sleep and cacel running comment task
|
||||
# maybe kuaishou block our request, we will take a nap and update the cookie again
|
||||
current_running_tasks = comment_tasks_var.get()
|
||||
for task in current_running_tasks:
|
||||
task.cancel()
|
||||
time.sleep(20)
|
||||
await self.context_page.goto(f"{self.index_url}?isHome=1")
|
||||
await self.ks_client.update_cookies(
|
||||
browser_context=self.browser_context
|
||||
)
|
||||
|
||||
async def create_ks_client(self, httpx_proxy: Optional[str]) -> KuaiShouClient:
|
||||
"""Create ks client"""
|
||||
utils.logger.info(
|
||||
"[KuaishouCrawler.create_ks_client] Begin create kuaishou API client ..."
|
||||
)
|
||||
cookie_str, cookie_dict = utils.convert_cookies(
|
||||
await self.browser_context.cookies()
|
||||
)
|
||||
ks_client_obj = KuaiShouClient(
|
||||
proxy=httpx_proxy,
|
||||
headers={
|
||||
"User-Agent": self.user_agent,
|
||||
"Cookie": cookie_str,
|
||||
"Origin": self.index_url,
|
||||
"Referer": self.index_url,
|
||||
"Content-Type": "application/json;charset=UTF-8",
|
||||
},
|
||||
playwright_page=self.context_page,
|
||||
cookie_dict=cookie_dict,
|
||||
)
|
||||
return ks_client_obj
|
||||
|
||||
async def launch_browser(
|
||||
self,
|
||||
chromium: BrowserType,
|
||||
playwright_proxy: Optional[Dict],
|
||||
user_agent: Optional[str],
|
||||
headless: bool = True,
|
||||
) -> BrowserContext:
|
||||
"""Launch browser and create browser context"""
|
||||
utils.logger.info(
|
||||
"[KuaishouCrawler.launch_browser] Begin create browser context ..."
|
||||
)
|
||||
if config.SAVE_LOGIN_STATE:
|
||||
user_data_dir = os.path.join(
|
||||
os.getcwd(), "browser_data", config.USER_DATA_DIR % config.PLATFORM
|
||||
) # type: ignore
|
||||
browser_context = await chromium.launch_persistent_context(
|
||||
user_data_dir=user_data_dir,
|
||||
accept_downloads=True,
|
||||
headless=headless,
|
||||
proxy=playwright_proxy, # type: ignore
|
||||
viewport={"width": 1920, "height": 1080},
|
||||
user_agent=user_agent,
|
||||
)
|
||||
return browser_context
|
||||
else:
|
||||
browser = await chromium.launch(headless=headless, proxy=playwright_proxy) # type: ignore
|
||||
browser_context = await browser.new_context(
|
||||
viewport={"width": 1920, "height": 1080}, user_agent=user_agent
|
||||
)
|
||||
return browser_context
|
||||
|
||||
async def launch_browser_with_cdp(
|
||||
self,
|
||||
playwright: Playwright,
|
||||
playwright_proxy: Optional[Dict],
|
||||
user_agent: Optional[str],
|
||||
headless: bool = True,
|
||||
) -> BrowserContext:
|
||||
"""
|
||||
使用CDP模式启动浏览器
|
||||
"""
|
||||
try:
|
||||
self.cdp_manager = CDPBrowserManager()
|
||||
browser_context = await self.cdp_manager.launch_and_connect(
|
||||
playwright=playwright,
|
||||
playwright_proxy=playwright_proxy,
|
||||
user_agent=user_agent,
|
||||
headless=headless,
|
||||
)
|
||||
|
||||
# 显示浏览器信息
|
||||
browser_info = await self.cdp_manager.get_browser_info()
|
||||
utils.logger.info(f"[KuaishouCrawler] CDP浏览器信息: {browser_info}")
|
||||
|
||||
return browser_context
|
||||
|
||||
except Exception as e:
|
||||
utils.logger.error(
|
||||
f"[KuaishouCrawler] CDP模式启动失败,回退到标准模式: {e}"
|
||||
)
|
||||
# 回退到标准模式
|
||||
chromium = playwright.chromium
|
||||
return await self.launch_browser(
|
||||
chromium, playwright_proxy, user_agent, headless
|
||||
)
|
||||
|
||||
async def get_creators_and_videos(self) -> None:
|
||||
"""Get creator's videos and retrieve their comment information."""
|
||||
utils.logger.info(
|
||||
"[KuaiShouCrawler.get_creators_and_videos] Begin get kuaishou creators"
|
||||
)
|
||||
for user_id in config.KS_CREATOR_ID_LIST:
|
||||
# get creator detail info from web html content
|
||||
createor_info: Dict = await self.ks_client.get_creator_info(user_id=user_id)
|
||||
if createor_info:
|
||||
await kuaishou_store.save_creator(user_id, creator=createor_info)
|
||||
|
||||
# Get all video information of the creator
|
||||
all_video_list = await self.ks_client.get_all_videos_by_creator(
|
||||
user_id=user_id,
|
||||
crawl_interval=random.random(),
|
||||
callback=self.fetch_creator_video_detail,
|
||||
)
|
||||
|
||||
video_ids = [
|
||||
video_item.get("photo", {}).get("id") for video_item in all_video_list
|
||||
]
|
||||
await self.batch_get_video_comments(video_ids)
|
||||
|
||||
async def fetch_creator_video_detail(self, video_list: List[Dict]):
|
||||
"""
|
||||
Concurrently obtain the specified post list and save the data
|
||||
"""
|
||||
semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
|
||||
task_list = [
|
||||
self.get_video_info_task(post_item.get("photo", {}).get("id"), semaphore)
|
||||
for post_item in video_list
|
||||
]
|
||||
|
||||
video_details = await asyncio.gather(*task_list)
|
||||
for video_detail in video_details:
|
||||
if video_detail is not None:
|
||||
await kuaishou_store.update_kuaishou_video(video_detail)
|
||||
|
||||
async def close(self):
|
||||
"""Close browser context"""
|
||||
# 如果使用CDP模式,需要特殊处理
|
||||
if self.cdp_manager:
|
||||
await self.cdp_manager.cleanup()
|
||||
self.cdp_manager = None
|
||||
else:
|
||||
await self.browser_context.close()
|
||||
utils.logger.info("[KuaishouCrawler.close] Browser context closed ...")
|
||||
@@ -0,0 +1,20 @@
|
||||
# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则:
|
||||
# 1. 不得用于任何商业用途。
|
||||
# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。
|
||||
# 3. 不得进行大规模爬取或对平台造成运营干扰。
|
||||
# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。
|
||||
# 5. 不得用于任何非法或不当的用途。
|
||||
#
|
||||
# 详细许可条款请参阅项目根目录下的LICENSE文件。
|
||||
# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。
|
||||
|
||||
|
||||
from httpx import RequestError
|
||||
|
||||
|
||||
class DataFetchError(RequestError):
|
||||
"""something error when fetch"""
|
||||
|
||||
|
||||
class IPBlockError(RequestError):
|
||||
"""fetch so fast that the server block us ip"""
|
||||
@@ -0,0 +1,12 @@
|
||||
# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则:
|
||||
# 1. 不得用于任何商业用途。
|
||||
# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。
|
||||
# 3. 不得进行大规模爬取或对平台造成运营干扰。
|
||||
# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。
|
||||
# 5. 不得用于任何非法或不当的用途。
|
||||
#
|
||||
# 详细许可条款请参阅项目根目录下的LICENSE文件。
|
||||
# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。
|
||||
|
||||
|
||||
# -*- coding: utf-8 -*-
|
||||
@@ -0,0 +1,33 @@
|
||||
# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则:
|
||||
# 1. 不得用于任何商业用途。
|
||||
# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。
|
||||
# 3. 不得进行大规模爬取或对平台造成运营干扰。
|
||||
# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。
|
||||
# 5. 不得用于任何非法或不当的用途。
|
||||
#
|
||||
# 详细许可条款请参阅项目根目录下的LICENSE文件。
|
||||
# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。
|
||||
|
||||
|
||||
# 快手的数据传输是基于GraphQL实现的
|
||||
# 这个类负责获取一些GraphQL的schema
|
||||
from typing import Dict
|
||||
|
||||
|
||||
class KuaiShouGraphQL:
|
||||
graphql_queries: Dict[str, str]= {}
|
||||
|
||||
def __init__(self):
|
||||
self.graphql_dir = "media_platform/kuaishou/graphql/"
|
||||
self.load_graphql_queries()
|
||||
|
||||
def load_graphql_queries(self):
|
||||
graphql_files = ["search_query.graphql", "video_detail.graphql", "comment_list.graphql", "vision_profile.graphql","vision_profile_photo_list.graphql","vision_profile_user_list.graphql","vision_sub_comment_list.graphql"]
|
||||
|
||||
for file in graphql_files:
|
||||
with open(self.graphql_dir + file, mode="r") as f:
|
||||
query_name = file.split(".")[0]
|
||||
self.graphql_queries[query_name] = f.read()
|
||||
|
||||
def get(self, query_name: str) -> str:
|
||||
return self.graphql_queries.get(query_name, "Query not found")
|
||||
+39
@@ -0,0 +1,39 @@
|
||||
query commentListQuery($photoId: String, $pcursor: String) {
|
||||
visionCommentList(photoId: $photoId, pcursor: $pcursor) {
|
||||
commentCount
|
||||
pcursor
|
||||
rootComments {
|
||||
commentId
|
||||
authorId
|
||||
authorName
|
||||
content
|
||||
headurl
|
||||
timestamp
|
||||
likedCount
|
||||
realLikedCount
|
||||
liked
|
||||
status
|
||||
authorLiked
|
||||
subCommentCount
|
||||
subCommentsPcursor
|
||||
subComments {
|
||||
commentId
|
||||
authorId
|
||||
authorName
|
||||
content
|
||||
headurl
|
||||
timestamp
|
||||
likedCount
|
||||
realLikedCount
|
||||
liked
|
||||
status
|
||||
authorLiked
|
||||
replyToUserName
|
||||
replyTo
|
||||
__typename
|
||||
}
|
||||
__typename
|
||||
}
|
||||
__typename
|
||||
}
|
||||
}
|
||||
+111
@@ -0,0 +1,111 @@
|
||||
fragment photoContent on PhotoEntity {
|
||||
__typename
|
||||
id
|
||||
duration
|
||||
caption
|
||||
originCaption
|
||||
likeCount
|
||||
viewCount
|
||||
commentCount
|
||||
realLikeCount
|
||||
coverUrl
|
||||
photoUrl
|
||||
photoH265Url
|
||||
manifest
|
||||
manifestH265
|
||||
videoResource
|
||||
coverUrls {
|
||||
url
|
||||
__typename
|
||||
}
|
||||
timestamp
|
||||
expTag
|
||||
animatedCoverUrl
|
||||
distance
|
||||
videoRatio
|
||||
liked
|
||||
stereoType
|
||||
profileUserTopPhoto
|
||||
musicBlocked
|
||||
}
|
||||
|
||||
fragment recoPhotoFragment on recoPhotoEntity {
|
||||
__typename
|
||||
id
|
||||
duration
|
||||
caption
|
||||
originCaption
|
||||
likeCount
|
||||
viewCount
|
||||
commentCount
|
||||
realLikeCount
|
||||
coverUrl
|
||||
photoUrl
|
||||
photoH265Url
|
||||
manifest
|
||||
manifestH265
|
||||
videoResource
|
||||
coverUrls {
|
||||
url
|
||||
__typename
|
||||
}
|
||||
timestamp
|
||||
expTag
|
||||
animatedCoverUrl
|
||||
distance
|
||||
videoRatio
|
||||
liked
|
||||
stereoType
|
||||
profileUserTopPhoto
|
||||
musicBlocked
|
||||
}
|
||||
|
||||
fragment feedContent on Feed {
|
||||
type
|
||||
author {
|
||||
id
|
||||
name
|
||||
headerUrl
|
||||
following
|
||||
headerUrls {
|
||||
url
|
||||
__typename
|
||||
}
|
||||
__typename
|
||||
}
|
||||
photo {
|
||||
...photoContent
|
||||
...recoPhotoFragment
|
||||
__typename
|
||||
}
|
||||
canAddComment
|
||||
llsid
|
||||
status
|
||||
currentPcursor
|
||||
tags {
|
||||
type
|
||||
name
|
||||
__typename
|
||||
}
|
||||
__typename
|
||||
}
|
||||
|
||||
query visionSearchPhoto($keyword: String, $pcursor: String, $searchSessionId: String, $page: String, $webPageArea: String) {
|
||||
visionSearchPhoto(keyword: $keyword, pcursor: $pcursor, searchSessionId: $searchSessionId, page: $page, webPageArea: $webPageArea) {
|
||||
result
|
||||
llsid
|
||||
webPageArea
|
||||
feeds {
|
||||
...feedContent
|
||||
__typename
|
||||
}
|
||||
searchSessionId
|
||||
pcursor
|
||||
aladdinBanner {
|
||||
imgUrl
|
||||
link
|
||||
__typename
|
||||
}
|
||||
__typename
|
||||
}
|
||||
}
|
||||
+80
@@ -0,0 +1,80 @@
|
||||
query visionVideoDetail($photoId: String, $type: String, $page: String, $webPageArea: String) {
|
||||
visionVideoDetail(photoId: $photoId, type: $type, page: $page, webPageArea: $webPageArea) {
|
||||
status
|
||||
type
|
||||
author {
|
||||
id
|
||||
name
|
||||
following
|
||||
headerUrl
|
||||
__typename
|
||||
}
|
||||
photo {
|
||||
id
|
||||
duration
|
||||
caption
|
||||
likeCount
|
||||
realLikeCount
|
||||
coverUrl
|
||||
photoUrl
|
||||
liked
|
||||
timestamp
|
||||
expTag
|
||||
llsid
|
||||
viewCount
|
||||
videoRatio
|
||||
stereoType
|
||||
musicBlocked
|
||||
manifest {
|
||||
mediaType
|
||||
businessType
|
||||
version
|
||||
adaptationSet {
|
||||
id
|
||||
duration
|
||||
representation {
|
||||
id
|
||||
defaultSelect
|
||||
backupUrl
|
||||
codecs
|
||||
url
|
||||
height
|
||||
width
|
||||
avgBitrate
|
||||
maxBitrate
|
||||
m3u8Slice
|
||||
qualityType
|
||||
qualityLabel
|
||||
frameRate
|
||||
featureP2sp
|
||||
hidden
|
||||
disableAdaptive
|
||||
__typename
|
||||
}
|
||||
__typename
|
||||
}
|
||||
__typename
|
||||
}
|
||||
manifestH265
|
||||
photoH265Url
|
||||
coronaCropManifest
|
||||
coronaCropManifestH265
|
||||
croppedPhotoH265Url
|
||||
croppedPhotoUrl
|
||||
videoResource
|
||||
__typename
|
||||
}
|
||||
tags {
|
||||
type
|
||||
name
|
||||
__typename
|
||||
}
|
||||
commentLimit {
|
||||
canAddComment
|
||||
__typename
|
||||
}
|
||||
llsid
|
||||
danmakuSwitch
|
||||
__typename
|
||||
}
|
||||
}
|
||||
+27
@@ -0,0 +1,27 @@
|
||||
query visionProfile($userId: String) {
|
||||
visionProfile(userId: $userId) {
|
||||
result
|
||||
hostName
|
||||
userProfile {
|
||||
ownerCount {
|
||||
fan
|
||||
photo
|
||||
follow
|
||||
photo_public
|
||||
__typename
|
||||
}
|
||||
profile {
|
||||
gender
|
||||
user_name
|
||||
user_id
|
||||
headurl
|
||||
user_text
|
||||
user_profile_bg_url
|
||||
__typename
|
||||
}
|
||||
isFollowing
|
||||
__typename
|
||||
}
|
||||
__typename
|
||||
}
|
||||
}
|
||||
+110
@@ -0,0 +1,110 @@
|
||||
fragment photoContent on PhotoEntity {
|
||||
__typename
|
||||
id
|
||||
duration
|
||||
caption
|
||||
originCaption
|
||||
likeCount
|
||||
viewCount
|
||||
commentCount
|
||||
realLikeCount
|
||||
coverUrl
|
||||
photoUrl
|
||||
photoH265Url
|
||||
manifest
|
||||
manifestH265
|
||||
videoResource
|
||||
coverUrls {
|
||||
url
|
||||
__typename
|
||||
}
|
||||
timestamp
|
||||
expTag
|
||||
animatedCoverUrl
|
||||
distance
|
||||
videoRatio
|
||||
liked
|
||||
stereoType
|
||||
profileUserTopPhoto
|
||||
musicBlocked
|
||||
riskTagContent
|
||||
riskTagUrl
|
||||
}
|
||||
|
||||
fragment recoPhotoFragment on recoPhotoEntity {
|
||||
__typename
|
||||
id
|
||||
duration
|
||||
caption
|
||||
originCaption
|
||||
likeCount
|
||||
viewCount
|
||||
commentCount
|
||||
realLikeCount
|
||||
coverUrl
|
||||
photoUrl
|
||||
photoH265Url
|
||||
manifest
|
||||
manifestH265
|
||||
videoResource
|
||||
coverUrls {
|
||||
url
|
||||
__typename
|
||||
}
|
||||
timestamp
|
||||
expTag
|
||||
animatedCoverUrl
|
||||
distance
|
||||
videoRatio
|
||||
liked
|
||||
stereoType
|
||||
profileUserTopPhoto
|
||||
musicBlocked
|
||||
riskTagContent
|
||||
riskTagUrl
|
||||
}
|
||||
|
||||
fragment feedContent on Feed {
|
||||
type
|
||||
author {
|
||||
id
|
||||
name
|
||||
headerUrl
|
||||
following
|
||||
headerUrls {
|
||||
url
|
||||
__typename
|
||||
}
|
||||
__typename
|
||||
}
|
||||
photo {
|
||||
...photoContent
|
||||
...recoPhotoFragment
|
||||
__typename
|
||||
}
|
||||
canAddComment
|
||||
llsid
|
||||
status
|
||||
currentPcursor
|
||||
tags {
|
||||
type
|
||||
name
|
||||
__typename
|
||||
}
|
||||
__typename
|
||||
}
|
||||
|
||||
query visionProfilePhotoList($pcursor: String, $userId: String, $page: String, $webPageArea: String) {
|
||||
visionProfilePhotoList(pcursor: $pcursor, userId: $userId, page: $page, webPageArea: $webPageArea) {
|
||||
result
|
||||
llsid
|
||||
webPageArea
|
||||
feeds {
|
||||
...feedContent
|
||||
__typename
|
||||
}
|
||||
hostName
|
||||
pcursor
|
||||
__typename
|
||||
}
|
||||
}
|
||||
+16
@@ -0,0 +1,16 @@
|
||||
query visionProfileUserList($pcursor: String, $ftype: Int) {
|
||||
visionProfileUserList(pcursor: $pcursor, ftype: $ftype) {
|
||||
result
|
||||
fols {
|
||||
user_name
|
||||
headurl
|
||||
user_text
|
||||
isFollowing
|
||||
user_id
|
||||
__typename
|
||||
}
|
||||
hostName
|
||||
pcursor
|
||||
__typename
|
||||
}
|
||||
}
|
||||
+22
@@ -0,0 +1,22 @@
|
||||
mutation visionSubCommentList($photoId: String, $rootCommentId: String, $pcursor: String) {
|
||||
visionSubCommentList(photoId: $photoId, rootCommentId: $rootCommentId, pcursor: $pcursor) {
|
||||
pcursor
|
||||
subComments {
|
||||
commentId
|
||||
authorId
|
||||
authorName
|
||||
content
|
||||
headurl
|
||||
timestamp
|
||||
likedCount
|
||||
realLikedCount
|
||||
liked
|
||||
status
|
||||
authorLiked
|
||||
replyToUserName
|
||||
replyTo
|
||||
__typename
|
||||
}
|
||||
__typename
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,113 @@
|
||||
# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则:
|
||||
# 1. 不得用于任何商业用途。
|
||||
# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。
|
||||
# 3. 不得进行大规模爬取或对平台造成运营干扰。
|
||||
# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。
|
||||
# 5. 不得用于任何非法或不当的用途。
|
||||
#
|
||||
# 详细许可条款请参阅项目根目录下的LICENSE文件。
|
||||
# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。
|
||||
|
||||
|
||||
import asyncio
|
||||
import functools
|
||||
import sys
|
||||
from typing import Optional
|
||||
|
||||
from playwright.async_api import BrowserContext, Page
|
||||
from tenacity import (RetryError, retry, retry_if_result, stop_after_attempt,
|
||||
wait_fixed)
|
||||
|
||||
import config
|
||||
from base.base_crawler import AbstractLogin
|
||||
from tools import utils
|
||||
|
||||
|
||||
class KuaishouLogin(AbstractLogin):
|
||||
def __init__(self,
|
||||
login_type: str,
|
||||
browser_context: BrowserContext,
|
||||
context_page: Page,
|
||||
login_phone: Optional[str] = "",
|
||||
cookie_str: str = ""
|
||||
):
|
||||
config.LOGIN_TYPE = login_type
|
||||
self.browser_context = browser_context
|
||||
self.context_page = context_page
|
||||
self.login_phone = login_phone
|
||||
self.cookie_str = cookie_str
|
||||
|
||||
async def begin(self):
|
||||
"""Start login xiaohongshu"""
|
||||
utils.logger.info("[KuaishouLogin.begin] Begin login kuaishou ...")
|
||||
if config.LOGIN_TYPE == "qrcode":
|
||||
await self.login_by_qrcode()
|
||||
elif config.LOGIN_TYPE == "phone":
|
||||
await self.login_by_mobile()
|
||||
elif config.LOGIN_TYPE == "cookie":
|
||||
await self.login_by_cookies()
|
||||
else:
|
||||
raise ValueError("[KuaishouLogin.begin] Invalid Login Type Currently only supported qrcode or phone or cookie ...")
|
||||
|
||||
@retry(stop=stop_after_attempt(600), wait=wait_fixed(1), retry=retry_if_result(lambda value: value is False))
|
||||
async def check_login_state(self) -> bool:
|
||||
"""
|
||||
Check if the current login status is successful and return True otherwise return False
|
||||
retry decorator will retry 20 times if the return value is False, and the retry interval is 1 second
|
||||
if max retry times reached, raise RetryError
|
||||
"""
|
||||
current_cookie = await self.browser_context.cookies()
|
||||
_, cookie_dict = utils.convert_cookies(current_cookie)
|
||||
kuaishou_pass_token = cookie_dict.get("passToken")
|
||||
if kuaishou_pass_token:
|
||||
return True
|
||||
return False
|
||||
|
||||
async def login_by_qrcode(self):
|
||||
"""login kuaishou website and keep webdriver login state"""
|
||||
utils.logger.info("[KuaishouLogin.login_by_qrcode] Begin login kuaishou by qrcode ...")
|
||||
|
||||
# click login button
|
||||
login_button_ele = self.context_page.locator(
|
||||
"xpath=//p[text()='登录']"
|
||||
)
|
||||
await login_button_ele.click()
|
||||
|
||||
# find login qrcode
|
||||
qrcode_img_selector = "//div[@class='qrcode-img']//img"
|
||||
base64_qrcode_img = await utils.find_login_qrcode(
|
||||
self.context_page,
|
||||
selector=qrcode_img_selector
|
||||
)
|
||||
if not base64_qrcode_img:
|
||||
utils.logger.info("[KuaishouLogin.login_by_qrcode] login failed , have not found qrcode please check ....")
|
||||
sys.exit()
|
||||
|
||||
|
||||
# show login qrcode
|
||||
partial_show_qrcode = functools.partial(utils.show_qrcode, base64_qrcode_img)
|
||||
asyncio.get_running_loop().run_in_executor(executor=None, func=partial_show_qrcode)
|
||||
|
||||
utils.logger.info(f"[KuaishouLogin.login_by_qrcode] waiting for scan code login, remaining time is 20s")
|
||||
try:
|
||||
await self.check_login_state()
|
||||
except RetryError:
|
||||
utils.logger.info("[KuaishouLogin.login_by_qrcode] Login kuaishou failed by qrcode login method ...")
|
||||
sys.exit()
|
||||
|
||||
wait_redirect_seconds = 5
|
||||
utils.logger.info(f"[KuaishouLogin.login_by_qrcode] Login successful then wait for {wait_redirect_seconds} seconds redirect ...")
|
||||
await asyncio.sleep(wait_redirect_seconds)
|
||||
|
||||
async def login_by_mobile(self):
|
||||
pass
|
||||
|
||||
async def login_by_cookies(self):
|
||||
utils.logger.info("[KuaishouLogin.login_by_cookies] Begin login kuaishou by cookie ...")
|
||||
for key, value in utils.convert_str_cookie_to_dict(self.cookie_str).items():
|
||||
await self.browser_context.add_cookies([{
|
||||
'name': key,
|
||||
'value': value,
|
||||
'domain': ".kuaishou.com",
|
||||
'path': "/"
|
||||
}])
|
||||
@@ -0,0 +1,13 @@
|
||||
# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则:
|
||||
# 1. 不得用于任何商业用途。
|
||||
# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。
|
||||
# 3. 不得进行大规模爬取或对平台造成运营干扰。
|
||||
# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。
|
||||
# 5. 不得用于任何非法或不当的用途。
|
||||
#
|
||||
# 详细许可条款请参阅项目根目录下的LICENSE文件。
|
||||
# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。
|
||||
|
||||
|
||||
# -*- coding: utf-8 -*-
|
||||
from .core import TieBaCrawler
|
||||
@@ -0,0 +1,385 @@
|
||||
# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则:
|
||||
# 1. 不得用于任何商业用途。
|
||||
# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。
|
||||
# 3. 不得进行大规模爬取或对平台造成运营干扰。
|
||||
# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。
|
||||
# 5. 不得用于任何非法或不当的用途。
|
||||
#
|
||||
# 详细许可条款请参阅项目根目录下的LICENSE文件。
|
||||
# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。
|
||||
|
||||
import asyncio
|
||||
import json
|
||||
from typing import Any, Callable, Dict, List, Optional, Union
|
||||
from urllib.parse import urlencode
|
||||
|
||||
import httpx
|
||||
from playwright.async_api import BrowserContext
|
||||
from tenacity import RetryError, retry, stop_after_attempt, wait_fixed
|
||||
|
||||
import config
|
||||
from base.base_crawler import AbstractApiClient
|
||||
from model.m_baidu_tieba import TiebaComment, TiebaCreator, TiebaNote
|
||||
from proxy.proxy_ip_pool import ProxyIpPool
|
||||
from tools import utils
|
||||
|
||||
from .field import SearchNoteType, SearchSortType
|
||||
from .help import TieBaExtractor
|
||||
|
||||
|
||||
class BaiduTieBaClient(AbstractApiClient):
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
timeout=10,
|
||||
ip_pool=None,
|
||||
default_ip_proxy=None,
|
||||
):
|
||||
self.ip_pool: Optional[ProxyIpPool] = ip_pool
|
||||
self.timeout = timeout
|
||||
self.headers = {
|
||||
"User-Agent": utils.get_user_agent(),
|
||||
"Cookies": "",
|
||||
}
|
||||
self._host = "https://tieba.baidu.com"
|
||||
self._page_extractor = TieBaExtractor()
|
||||
self.default_ip_proxy = default_ip_proxy
|
||||
|
||||
@retry(stop=stop_after_attempt(3), wait=wait_fixed(1))
|
||||
async def request(self, method, url, return_ori_content=False, proxy=None, **kwargs) -> Union[str, Any]:
|
||||
"""
|
||||
封装httpx的公共请求方法,对请求响应做一些处理
|
||||
Args:
|
||||
method: 请求方法
|
||||
url: 请求的URL
|
||||
return_ori_content: 是否返回原始内容
|
||||
proxies: 代理IP
|
||||
**kwargs: 其他请求参数,例如请求头、请求体等
|
||||
|
||||
Returns:
|
||||
|
||||
"""
|
||||
actual_proxy = proxy if proxy else self.default_ip_proxy
|
||||
async with httpx.AsyncClient(proxy=actual_proxy) as client:
|
||||
response = await client.request(method, url, timeout=self.timeout, headers=self.headers, **kwargs)
|
||||
|
||||
if response.status_code != 200:
|
||||
utils.logger.error(f"Request failed, method: {method}, url: {url}, status code: {response.status_code}")
|
||||
utils.logger.error(f"Request failed, response: {response.text}")
|
||||
raise Exception(f"Request failed, method: {method}, url: {url}, status code: {response.status_code}")
|
||||
|
||||
if response.text == "" or response.text == "blocked":
|
||||
utils.logger.error(f"request params incrr, response.text: {response.text}")
|
||||
raise Exception("account blocked")
|
||||
|
||||
if return_ori_content:
|
||||
return response.text
|
||||
|
||||
return response.json()
|
||||
|
||||
async def get(self, uri: str, params=None, return_ori_content=False, **kwargs) -> Any:
|
||||
"""
|
||||
GET请求,对请求头签名
|
||||
Args:
|
||||
uri: 请求路由
|
||||
params: 请求参数
|
||||
return_ori_content: 是否返回原始内容
|
||||
|
||||
Returns:
|
||||
|
||||
"""
|
||||
final_uri = uri
|
||||
if isinstance(params, dict):
|
||||
final_uri = (f"{uri}?"
|
||||
f"{urlencode(params)}")
|
||||
try:
|
||||
res = await self.request(method="GET", url=f"{self._host}{final_uri}", return_ori_content=return_ori_content, **kwargs)
|
||||
return res
|
||||
except RetryError as e:
|
||||
if self.ip_pool:
|
||||
proxie_model = await self.ip_pool.get_proxy()
|
||||
_, proxy = utils.format_proxy_info(proxie_model)
|
||||
res = await self.request(method="GET", url=f"{self._host}{final_uri}", return_ori_content=return_ori_content, proxy=proxy, **kwargs)
|
||||
self.default_ip_proxy = proxy
|
||||
return res
|
||||
|
||||
utils.logger.error(f"[BaiduTieBaClient.get] 达到了最大重试次数,IP已经被Block,请尝试更换新的IP代理: {e}")
|
||||
raise Exception(f"[BaiduTieBaClient.get] 达到了最大重试次数,IP已经被Block,请尝试更换新的IP代理: {e}")
|
||||
|
||||
async def post(self, uri: str, data: dict, **kwargs) -> Dict:
|
||||
"""
|
||||
POST请求,对请求头签名
|
||||
Args:
|
||||
uri: 请求路由
|
||||
data: 请求体参数
|
||||
|
||||
Returns:
|
||||
|
||||
"""
|
||||
json_str = json.dumps(data, separators=(',', ':'), ensure_ascii=False)
|
||||
return await self.request(method="POST", url=f"{self._host}{uri}", data=json_str, **kwargs)
|
||||
|
||||
async def pong(self) -> bool:
|
||||
"""
|
||||
用于检查登录态是否失效了
|
||||
Returns:
|
||||
|
||||
"""
|
||||
utils.logger.info("[BaiduTieBaClient.pong] Begin to pong tieba...")
|
||||
try:
|
||||
uri = "/mo/q/sync"
|
||||
res: Dict = await self.get(uri)
|
||||
utils.logger.info(f"[BaiduTieBaClient.pong] res: {res}")
|
||||
if res and res.get("no") == 0:
|
||||
ping_flag = True
|
||||
else:
|
||||
utils.logger.info(f"[BaiduTieBaClient.pong] user not login, will try to login again...")
|
||||
ping_flag = False
|
||||
except Exception as e:
|
||||
utils.logger.error(f"[BaiduTieBaClient.pong] Ping tieba failed: {e}, and try to login again...")
|
||||
ping_flag = False
|
||||
return ping_flag
|
||||
|
||||
async def update_cookies(self, browser_context: BrowserContext):
|
||||
"""
|
||||
API客户端提供的更新cookies方法,一般情况下登录成功后会调用此方法
|
||||
Args:
|
||||
browser_context: 浏览器上下文对象
|
||||
|
||||
Returns:
|
||||
|
||||
"""
|
||||
pass
|
||||
|
||||
async def get_notes_by_keyword(
|
||||
self,
|
||||
keyword: str,
|
||||
page: int = 1,
|
||||
page_size: int = 10,
|
||||
sort: SearchSortType = SearchSortType.TIME_DESC,
|
||||
note_type: SearchNoteType = SearchNoteType.FIXED_THREAD,
|
||||
) -> List[TiebaNote]:
|
||||
"""
|
||||
根据关键词搜索贴吧帖子
|
||||
Args:
|
||||
keyword: 关键词
|
||||
page: 分页第几页
|
||||
page_size: 每页大小
|
||||
sort: 结果排序方式
|
||||
note_type: 帖子类型(主题贴|主题+回复混合模式)
|
||||
Returns:
|
||||
|
||||
"""
|
||||
uri = "/f/search/res"
|
||||
params = {
|
||||
"isnew": 1,
|
||||
"qw": keyword,
|
||||
"rn": page_size,
|
||||
"pn": page,
|
||||
"sm": sort.value,
|
||||
"only_thread": note_type.value,
|
||||
}
|
||||
page_content = await self.get(uri, params=params, return_ori_content=True)
|
||||
return self._page_extractor.extract_search_note_list(page_content)
|
||||
|
||||
async def get_note_by_id(self, note_id: str) -> TiebaNote:
|
||||
"""
|
||||
根据帖子ID获取帖子详情
|
||||
Args:
|
||||
note_id:
|
||||
|
||||
Returns:
|
||||
|
||||
"""
|
||||
uri = f"/p/{note_id}"
|
||||
page_content = await self.get(uri, return_ori_content=True)
|
||||
return self._page_extractor.extract_note_detail(page_content)
|
||||
|
||||
async def get_note_all_comments(
|
||||
self,
|
||||
note_detail: TiebaNote,
|
||||
crawl_interval: float = 1.0,
|
||||
callback: Optional[Callable] = None,
|
||||
max_count: int = 10,
|
||||
) -> List[TiebaComment]:
|
||||
"""
|
||||
获取指定帖子下的所有一级评论,该方法会一直查找一个帖子下的所有评论信息
|
||||
Args:
|
||||
note_detail: 帖子详情对象
|
||||
crawl_interval: 爬取一次笔记的延迟单位(秒)
|
||||
callback: 一次笔记爬取结束后
|
||||
max_count: 一次帖子爬取的最大评论数量
|
||||
Returns:
|
||||
|
||||
"""
|
||||
uri = f"/p/{note_detail.note_id}"
|
||||
result: List[TiebaComment] = []
|
||||
current_page = 1
|
||||
while note_detail.total_replay_page >= current_page and len(result) < max_count:
|
||||
params = {
|
||||
"pn": current_page,
|
||||
}
|
||||
page_content = await self.get(uri, params=params, return_ori_content=True)
|
||||
comments = self._page_extractor.extract_tieba_note_parment_comments(page_content, note_id=note_detail.note_id)
|
||||
if not comments:
|
||||
break
|
||||
if len(result) + len(comments) > max_count:
|
||||
comments = comments[:max_count - len(result)]
|
||||
if callback:
|
||||
await callback(note_detail.note_id, comments)
|
||||
result.extend(comments)
|
||||
# 获取所有子评论
|
||||
await self.get_comments_all_sub_comments(comments, crawl_interval=crawl_interval, callback=callback)
|
||||
await asyncio.sleep(crawl_interval)
|
||||
current_page += 1
|
||||
return result
|
||||
|
||||
async def get_comments_all_sub_comments(
|
||||
self,
|
||||
comments: List[TiebaComment],
|
||||
crawl_interval: float = 1.0,
|
||||
callback: Optional[Callable] = None,
|
||||
) -> List[TiebaComment]:
|
||||
"""
|
||||
获取指定评论下的所有子评论
|
||||
Args:
|
||||
comments: 评论列表
|
||||
crawl_interval: 爬取一次笔记的延迟单位(秒)
|
||||
callback: 一次笔记爬取结束后
|
||||
|
||||
Returns:
|
||||
|
||||
"""
|
||||
uri = "/p/comment"
|
||||
if not config.ENABLE_GET_SUB_COMMENTS:
|
||||
return []
|
||||
|
||||
# # 贴吧获取所有子评论需要登录态
|
||||
# if self.headers.get("Cookies") == "" or not self.pong():
|
||||
# raise Exception(f"[BaiduTieBaClient.pong] Cookies is empty, please login first...")
|
||||
|
||||
all_sub_comments: List[TiebaComment] = []
|
||||
for parment_comment in comments:
|
||||
if parment_comment.sub_comment_count == 0:
|
||||
continue
|
||||
|
||||
current_page = 1
|
||||
max_sub_page_num = parment_comment.sub_comment_count // 10 + 1
|
||||
while max_sub_page_num >= current_page:
|
||||
params = {
|
||||
"tid": parment_comment.note_id, # 帖子ID
|
||||
"pid": parment_comment.comment_id, # 父级评论ID
|
||||
"fid": parment_comment.tieba_id, # 贴吧ID
|
||||
"pn": current_page # 页码
|
||||
}
|
||||
page_content = await self.get(uri, params=params, return_ori_content=True)
|
||||
sub_comments = self._page_extractor.extract_tieba_note_sub_comments(page_content, parent_comment=parment_comment)
|
||||
|
||||
if not sub_comments:
|
||||
break
|
||||
if callback:
|
||||
await callback(parment_comment.note_id, sub_comments)
|
||||
all_sub_comments.extend(sub_comments)
|
||||
await asyncio.sleep(crawl_interval)
|
||||
current_page += 1
|
||||
return all_sub_comments
|
||||
|
||||
async def get_notes_by_tieba_name(self, tieba_name: str, page_num: int) -> List[TiebaNote]:
|
||||
"""
|
||||
根据贴吧名称获取帖子列表
|
||||
Args:
|
||||
tieba_name: 贴吧名称
|
||||
page_num: 分页数量
|
||||
|
||||
Returns:
|
||||
|
||||
"""
|
||||
uri = f"/f?kw={tieba_name}&pn={page_num}"
|
||||
page_content = await self.get(uri, return_ori_content=True)
|
||||
return self._page_extractor.extract_tieba_note_list(page_content)
|
||||
|
||||
async def get_creator_info_by_url(self, creator_url: str) -> str:
|
||||
"""
|
||||
根据创作者ID获取创作者信息
|
||||
Args:
|
||||
creator_url: 创作者主页URL
|
||||
|
||||
Returns:
|
||||
|
||||
"""
|
||||
page_content = await self.request(method="GET", url=creator_url, return_ori_content=True)
|
||||
return page_content
|
||||
|
||||
async def get_notes_by_creator(self, user_name: str, page_number: int) -> Dict:
|
||||
"""
|
||||
根据创作者获取创作者的所有帖子
|
||||
Args:
|
||||
user_name:
|
||||
page_number:
|
||||
|
||||
Returns:
|
||||
|
||||
"""
|
||||
uri = f"/home/get/getthread"
|
||||
params = {
|
||||
"un": user_name,
|
||||
"pn": page_number,
|
||||
"id": "utf-8",
|
||||
"_": utils.get_current_timestamp(),
|
||||
}
|
||||
return await self.get(uri, params=params)
|
||||
|
||||
async def get_all_notes_by_creator_user_name(
|
||||
self,
|
||||
user_name: str,
|
||||
crawl_interval: float = 1.0,
|
||||
callback: Optional[Callable] = None,
|
||||
max_note_count: int = 0,
|
||||
creator_page_html_content: str = None,
|
||||
) -> List[TiebaNote]:
|
||||
"""
|
||||
根据创作者用户名获取创作者所有帖子
|
||||
Args:
|
||||
user_name: 创作者用户名
|
||||
crawl_interval: 爬取一次笔记的延迟单位(秒)
|
||||
callback: 一次笔记爬取结束后的回调函数,是一个awaitable类型的函数
|
||||
max_note_count: 帖子最大获取数量,如果为0则获取所有
|
||||
creator_page_html_content: 创作者主页HTML内容
|
||||
|
||||
Returns:
|
||||
|
||||
"""
|
||||
# 百度贴吧比较特殊一些,前10个帖子是直接展示在主页上的,要单独处理,通过API获取不到
|
||||
result: List[TiebaNote] = []
|
||||
if creator_page_html_content:
|
||||
thread_id_list = (self._page_extractor.extract_tieba_thread_id_list_from_creator_page(creator_page_html_content))
|
||||
utils.logger.info(f"[BaiduTieBaClient.get_all_notes_by_creator] got user_name:{user_name} thread_id_list len : {len(thread_id_list)}")
|
||||
note_detail_task = [self.get_note_by_id(thread_id) for thread_id in thread_id_list]
|
||||
notes = await asyncio.gather(*note_detail_task)
|
||||
if callback:
|
||||
await callback(notes)
|
||||
result.extend(notes)
|
||||
|
||||
notes_has_more = 1
|
||||
page_number = 1
|
||||
page_per_count = 20
|
||||
total_get_count = 0
|
||||
while notes_has_more == 1 and (max_note_count == 0 or total_get_count < max_note_count):
|
||||
notes_res = await self.get_notes_by_creator(user_name, page_number)
|
||||
if not notes_res or notes_res.get("no") != 0:
|
||||
utils.logger.error(f"[WeiboClient.get_notes_by_creator] got user_name:{user_name} notes failed, notes_res: {notes_res}")
|
||||
break
|
||||
notes_data = notes_res.get("data")
|
||||
notes_has_more = notes_data.get("has_more")
|
||||
notes = notes_data["thread_list"]
|
||||
utils.logger.info(f"[WeiboClient.get_all_notes_by_creator] got user_name:{user_name} notes len : {len(notes)}")
|
||||
|
||||
note_detail_task = [self.get_note_by_id(note['thread_id']) for note in notes]
|
||||
notes = await asyncio.gather(*note_detail_task)
|
||||
if callback:
|
||||
await callback(notes)
|
||||
await asyncio.sleep(crawl_interval)
|
||||
result.extend(notes)
|
||||
page_number += 1
|
||||
total_get_count += page_per_count
|
||||
return result
|
||||
@@ -0,0 +1,418 @@
|
||||
# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则:
|
||||
# 1. 不得用于任何商业用途。
|
||||
# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。
|
||||
# 3. 不得进行大规模爬取或对平台造成运营干扰。
|
||||
# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。
|
||||
# 5. 不得用于任何非法或不当的用途。
|
||||
#
|
||||
# 详细许可条款请参阅项目根目录下的LICENSE文件。
|
||||
# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。
|
||||
|
||||
|
||||
import asyncio
|
||||
import os
|
||||
import random
|
||||
from asyncio import Task
|
||||
from typing import Dict, List, Optional, Tuple
|
||||
|
||||
from playwright.async_api import (
|
||||
BrowserContext,
|
||||
BrowserType,
|
||||
Page,
|
||||
Playwright,
|
||||
async_playwright,
|
||||
)
|
||||
|
||||
import config
|
||||
from base.base_crawler import AbstractCrawler
|
||||
from model.m_baidu_tieba import TiebaCreator, TiebaNote
|
||||
from proxy.proxy_ip_pool import IpInfoModel, create_ip_pool
|
||||
from store import tieba as tieba_store
|
||||
from tools import utils
|
||||
from tools.cdp_browser import CDPBrowserManager
|
||||
from var import crawler_type_var, source_keyword_var
|
||||
|
||||
from .client import BaiduTieBaClient
|
||||
from .field import SearchNoteType, SearchSortType
|
||||
from .help import TieBaExtractor
|
||||
from .login import BaiduTieBaLogin
|
||||
|
||||
|
||||
class TieBaCrawler(AbstractCrawler):
|
||||
context_page: Page
|
||||
tieba_client: BaiduTieBaClient
|
||||
browser_context: BrowserContext
|
||||
cdp_manager: Optional[CDPBrowserManager]
|
||||
|
||||
def __init__(self) -> None:
|
||||
self.index_url = "https://tieba.baidu.com"
|
||||
self.user_agent = utils.get_user_agent()
|
||||
self._page_extractor = TieBaExtractor()
|
||||
self.cdp_manager = None
|
||||
|
||||
async def start(self) -> None:
|
||||
"""
|
||||
Start the crawler
|
||||
Returns:
|
||||
|
||||
"""
|
||||
ip_proxy_pool, httpx_proxy_format = None, None
|
||||
if config.ENABLE_IP_PROXY:
|
||||
utils.logger.info(
|
||||
"[BaiduTieBaCrawler.start] Begin create ip proxy pool ..."
|
||||
)
|
||||
ip_proxy_pool = await create_ip_pool(
|
||||
config.IP_PROXY_POOL_COUNT, enable_validate_ip=True
|
||||
)
|
||||
ip_proxy_info: IpInfoModel = await ip_proxy_pool.get_proxy()
|
||||
_, httpx_proxy_format = utils.format_proxy_info(ip_proxy_info)
|
||||
utils.logger.info(
|
||||
f"[BaiduTieBaCrawler.start] Init default ip proxy, value: {httpx_proxy_format}"
|
||||
)
|
||||
|
||||
# Create a client to interact with the baidutieba website.
|
||||
self.tieba_client = BaiduTieBaClient(
|
||||
ip_pool=ip_proxy_pool,
|
||||
default_ip_proxy=httpx_proxy_format,
|
||||
)
|
||||
crawler_type_var.set(config.CRAWLER_TYPE)
|
||||
if config.CRAWLER_TYPE == "search":
|
||||
# Search for notes and retrieve their comment information.
|
||||
await self.search()
|
||||
await self.get_specified_tieba_notes()
|
||||
elif config.CRAWLER_TYPE == "detail":
|
||||
# Get the information and comments of the specified post
|
||||
await self.get_specified_notes()
|
||||
elif config.CRAWLER_TYPE == "creator":
|
||||
# Get creator's information and their notes and comments
|
||||
await self.get_creators_and_notes()
|
||||
else:
|
||||
pass
|
||||
|
||||
utils.logger.info("[BaiduTieBaCrawler.start] Tieba Crawler finished ...")
|
||||
|
||||
async def search(self) -> None:
|
||||
"""
|
||||
Search for notes and retrieve their comment information.
|
||||
Returns:
|
||||
|
||||
"""
|
||||
utils.logger.info(
|
||||
"[BaiduTieBaCrawler.search] Begin search baidu tieba keywords"
|
||||
)
|
||||
tieba_limit_count = 10 # tieba limit page fixed value
|
||||
if config.CRAWLER_MAX_NOTES_COUNT < tieba_limit_count:
|
||||
config.CRAWLER_MAX_NOTES_COUNT = tieba_limit_count
|
||||
start_page = config.START_PAGE
|
||||
for keyword in config.KEYWORDS.split(","):
|
||||
source_keyword_var.set(keyword)
|
||||
utils.logger.info(
|
||||
f"[BaiduTieBaCrawler.search] Current search keyword: {keyword}"
|
||||
)
|
||||
page = 1
|
||||
while (
|
||||
page - start_page + 1
|
||||
) * tieba_limit_count <= config.CRAWLER_MAX_NOTES_COUNT:
|
||||
if page < start_page:
|
||||
utils.logger.info(f"[BaiduTieBaCrawler.search] Skip page {page}")
|
||||
page += 1
|
||||
continue
|
||||
try:
|
||||
utils.logger.info(
|
||||
f"[BaiduTieBaCrawler.search] search tieba keyword: {keyword}, page: {page}"
|
||||
)
|
||||
notes_list: List[TiebaNote] = (
|
||||
await self.tieba_client.get_notes_by_keyword(
|
||||
keyword=keyword,
|
||||
page=page,
|
||||
page_size=tieba_limit_count,
|
||||
sort=SearchSortType.TIME_DESC,
|
||||
note_type=SearchNoteType.FIXED_THREAD,
|
||||
)
|
||||
)
|
||||
if not notes_list:
|
||||
utils.logger.info(
|
||||
f"[BaiduTieBaCrawler.search] Search note list is empty"
|
||||
)
|
||||
break
|
||||
utils.logger.info(
|
||||
f"[BaiduTieBaCrawler.search] Note list len: {len(notes_list)}"
|
||||
)
|
||||
await self.get_specified_notes(
|
||||
note_id_list=[note_detail.note_id for note_detail in notes_list]
|
||||
)
|
||||
page += 1
|
||||
except Exception as ex:
|
||||
utils.logger.error(
|
||||
f"[BaiduTieBaCrawler.search] Search keywords error, current page: {page}, current keyword: {keyword}, err: {ex}"
|
||||
)
|
||||
break
|
||||
|
||||
async def get_specified_tieba_notes(self):
|
||||
"""
|
||||
Get the information and comments of the specified post by tieba name
|
||||
Returns:
|
||||
|
||||
"""
|
||||
tieba_limit_count = 50
|
||||
if config.CRAWLER_MAX_NOTES_COUNT < tieba_limit_count:
|
||||
config.CRAWLER_MAX_NOTES_COUNT = tieba_limit_count
|
||||
for tieba_name in config.TIEBA_NAME_LIST:
|
||||
utils.logger.info(
|
||||
f"[BaiduTieBaCrawler.get_specified_tieba_notes] Begin get tieba name: {tieba_name}"
|
||||
)
|
||||
page_number = 0
|
||||
while page_number <= config.CRAWLER_MAX_NOTES_COUNT:
|
||||
note_list: List[TiebaNote] = (
|
||||
await self.tieba_client.get_notes_by_tieba_name(
|
||||
tieba_name=tieba_name, page_num=page_number
|
||||
)
|
||||
)
|
||||
if not note_list:
|
||||
utils.logger.info(
|
||||
f"[BaiduTieBaCrawler.get_specified_tieba_notes] Get note list is empty"
|
||||
)
|
||||
break
|
||||
|
||||
utils.logger.info(
|
||||
f"[BaiduTieBaCrawler.get_specified_tieba_notes] tieba name: {tieba_name} note list len: {len(note_list)}"
|
||||
)
|
||||
await self.get_specified_notes([note.note_id for note in note_list])
|
||||
page_number += tieba_limit_count
|
||||
|
||||
async def get_specified_notes(
|
||||
self, note_id_list: List[str] = config.TIEBA_SPECIFIED_ID_LIST
|
||||
):
|
||||
"""
|
||||
Get the information and comments of the specified post
|
||||
Args:
|
||||
note_id_list:
|
||||
|
||||
Returns:
|
||||
|
||||
"""
|
||||
semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
|
||||
task_list = [
|
||||
self.get_note_detail_async_task(note_id=note_id, semaphore=semaphore)
|
||||
for note_id in note_id_list
|
||||
]
|
||||
note_details = await asyncio.gather(*task_list)
|
||||
note_details_model: List[TiebaNote] = []
|
||||
for note_detail in note_details:
|
||||
if note_detail is not None:
|
||||
note_details_model.append(note_detail)
|
||||
await tieba_store.update_tieba_note(note_detail)
|
||||
await self.batch_get_note_comments(note_details_model)
|
||||
|
||||
async def get_note_detail_async_task(
|
||||
self, note_id: str, semaphore: asyncio.Semaphore
|
||||
) -> Optional[TiebaNote]:
|
||||
"""
|
||||
Get note detail
|
||||
Args:
|
||||
note_id: baidu tieba note id
|
||||
semaphore: asyncio semaphore
|
||||
|
||||
Returns:
|
||||
|
||||
"""
|
||||
async with semaphore:
|
||||
try:
|
||||
utils.logger.info(
|
||||
f"[BaiduTieBaCrawler.get_note_detail] Begin get note detail, note_id: {note_id}"
|
||||
)
|
||||
note_detail: TiebaNote = await self.tieba_client.get_note_by_id(note_id)
|
||||
if not note_detail:
|
||||
utils.logger.error(
|
||||
f"[BaiduTieBaCrawler.get_note_detail] Get note detail error, note_id: {note_id}"
|
||||
)
|
||||
return None
|
||||
return note_detail
|
||||
except Exception as ex:
|
||||
utils.logger.error(
|
||||
f"[BaiduTieBaCrawler.get_note_detail] Get note detail error: {ex}"
|
||||
)
|
||||
return None
|
||||
except KeyError as ex:
|
||||
utils.logger.error(
|
||||
f"[BaiduTieBaCrawler.get_note_detail] have not fund note detail note_id:{note_id}, err: {ex}"
|
||||
)
|
||||
return None
|
||||
|
||||
async def batch_get_note_comments(self, note_detail_list: List[TiebaNote]):
|
||||
"""
|
||||
Batch get note comments
|
||||
Args:
|
||||
note_detail_list:
|
||||
|
||||
Returns:
|
||||
|
||||
"""
|
||||
if not config.ENABLE_GET_COMMENTS:
|
||||
return
|
||||
|
||||
semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
|
||||
task_list: List[Task] = []
|
||||
for note_detail in note_detail_list:
|
||||
task = asyncio.create_task(
|
||||
self.get_comments_async_task(note_detail, semaphore),
|
||||
name=note_detail.note_id,
|
||||
)
|
||||
task_list.append(task)
|
||||
await asyncio.gather(*task_list)
|
||||
|
||||
async def get_comments_async_task(
|
||||
self, note_detail: TiebaNote, semaphore: asyncio.Semaphore
|
||||
):
|
||||
"""
|
||||
Get comments async task
|
||||
Args:
|
||||
note_detail:
|
||||
semaphore:
|
||||
|
||||
Returns:
|
||||
|
||||
"""
|
||||
async with semaphore:
|
||||
utils.logger.info(
|
||||
f"[BaiduTieBaCrawler.get_comments] Begin get note id comments {note_detail.note_id}"
|
||||
)
|
||||
await self.tieba_client.get_note_all_comments(
|
||||
note_detail=note_detail,
|
||||
crawl_interval=random.random(),
|
||||
callback=tieba_store.batch_update_tieba_note_comments,
|
||||
max_count=config.CRAWLER_MAX_COMMENTS_COUNT_SINGLENOTES,
|
||||
)
|
||||
|
||||
async def get_creators_and_notes(self) -> None:
|
||||
"""
|
||||
Get creator's information and their notes and comments
|
||||
Returns:
|
||||
|
||||
"""
|
||||
utils.logger.info(
|
||||
"[WeiboCrawler.get_creators_and_notes] Begin get weibo creators"
|
||||
)
|
||||
for creator_url in config.TIEBA_CREATOR_URL_LIST:
|
||||
creator_page_html_content = await self.tieba_client.get_creator_info_by_url(
|
||||
creator_url=creator_url
|
||||
)
|
||||
creator_info: TiebaCreator = self._page_extractor.extract_creator_info(
|
||||
creator_page_html_content
|
||||
)
|
||||
if creator_info:
|
||||
utils.logger.info(
|
||||
f"[WeiboCrawler.get_creators_and_notes] creator info: {creator_info}"
|
||||
)
|
||||
if not creator_info:
|
||||
raise Exception("Get creator info error")
|
||||
|
||||
await tieba_store.save_creator(user_info=creator_info)
|
||||
|
||||
# Get all note information of the creator
|
||||
all_notes_list = (
|
||||
await self.tieba_client.get_all_notes_by_creator_user_name(
|
||||
user_name=creator_info.user_name,
|
||||
crawl_interval=0,
|
||||
callback=tieba_store.batch_update_tieba_notes,
|
||||
max_note_count=config.CRAWLER_MAX_NOTES_COUNT,
|
||||
creator_page_html_content=creator_page_html_content,
|
||||
)
|
||||
)
|
||||
|
||||
await self.batch_get_note_comments(all_notes_list)
|
||||
|
||||
else:
|
||||
utils.logger.error(
|
||||
f"[WeiboCrawler.get_creators_and_notes] get creator info error, creator_url:{creator_url}"
|
||||
)
|
||||
|
||||
async def launch_browser(
|
||||
self,
|
||||
chromium: BrowserType,
|
||||
playwright_proxy: Optional[Dict],
|
||||
user_agent: Optional[str],
|
||||
headless: bool = True,
|
||||
) -> BrowserContext:
|
||||
"""
|
||||
Launch browser and create browser
|
||||
Args:
|
||||
chromium:
|
||||
playwright_proxy:
|
||||
user_agent:
|
||||
headless:
|
||||
|
||||
Returns:
|
||||
|
||||
"""
|
||||
utils.logger.info(
|
||||
"[BaiduTieBaCrawler.launch_browser] Begin create browser context ..."
|
||||
)
|
||||
if config.SAVE_LOGIN_STATE:
|
||||
# feat issue #14
|
||||
# we will save login state to avoid login every time
|
||||
user_data_dir = os.path.join(
|
||||
os.getcwd(), "browser_data", config.USER_DATA_DIR % config.PLATFORM
|
||||
) # type: ignore
|
||||
browser_context = await chromium.launch_persistent_context(
|
||||
user_data_dir=user_data_dir,
|
||||
accept_downloads=True,
|
||||
headless=headless,
|
||||
proxy=playwright_proxy, # type: ignore
|
||||
viewport={"width": 1920, "height": 1080},
|
||||
user_agent=user_agent,
|
||||
)
|
||||
return browser_context
|
||||
else:
|
||||
browser = await chromium.launch(headless=headless, proxy=playwright_proxy) # type: ignore
|
||||
browser_context = await browser.new_context(
|
||||
viewport={"width": 1920, "height": 1080}, user_agent=user_agent
|
||||
)
|
||||
return browser_context
|
||||
|
||||
async def launch_browser_with_cdp(
|
||||
self,
|
||||
playwright: Playwright,
|
||||
playwright_proxy: Optional[Dict],
|
||||
user_agent: Optional[str],
|
||||
headless: bool = True,
|
||||
) -> BrowserContext:
|
||||
"""
|
||||
使用CDP模式启动浏览器
|
||||
"""
|
||||
try:
|
||||
self.cdp_manager = CDPBrowserManager()
|
||||
browser_context = await self.cdp_manager.launch_and_connect(
|
||||
playwright=playwright,
|
||||
playwright_proxy=playwright_proxy,
|
||||
user_agent=user_agent,
|
||||
headless=headless,
|
||||
)
|
||||
|
||||
# 显示浏览器信息
|
||||
browser_info = await self.cdp_manager.get_browser_info()
|
||||
utils.logger.info(f"[TieBaCrawler] CDP浏览器信息: {browser_info}")
|
||||
|
||||
return browser_context
|
||||
|
||||
except Exception as e:
|
||||
utils.logger.error(f"[TieBaCrawler] CDP模式启动失败,回退到标准模式: {e}")
|
||||
# 回退到标准模式
|
||||
chromium = playwright.chromium
|
||||
return await self.launch_browser(
|
||||
chromium, playwright_proxy, user_agent, headless
|
||||
)
|
||||
|
||||
async def close(self):
|
||||
"""
|
||||
Close browser context
|
||||
Returns:
|
||||
|
||||
"""
|
||||
# 如果使用CDP模式,需要特殊处理
|
||||
if self.cdp_manager:
|
||||
await self.cdp_manager.cleanup()
|
||||
self.cdp_manager = None
|
||||
else:
|
||||
await self.browser_context.close()
|
||||
utils.logger.info("[BaiduTieBaCrawler.close] Browser context closed ...")
|
||||
@@ -0,0 +1,29 @@
|
||||
# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则:
|
||||
# 1. 不得用于任何商业用途。
|
||||
# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。
|
||||
# 3. 不得进行大规模爬取或对平台造成运营干扰。
|
||||
# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。
|
||||
# 5. 不得用于任何非法或不当的用途。
|
||||
#
|
||||
# 详细许可条款请参阅项目根目录下的LICENSE文件。
|
||||
# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。
|
||||
|
||||
|
||||
from enum import Enum
|
||||
|
||||
|
||||
class SearchSortType(Enum):
|
||||
"""search sort type"""
|
||||
# 按时间倒序
|
||||
TIME_DESC = "1"
|
||||
# 按时间顺序
|
||||
TIME_ASC = "0"
|
||||
# 按相关性顺序
|
||||
RELEVANCE_ORDER = "2"
|
||||
|
||||
|
||||
class SearchNoteType(Enum):
|
||||
# 只看主题贴
|
||||
MAIN_THREAD = "1"
|
||||
# 混合模式(帖子+回复)
|
||||
FIXED_THREAD = "0"
|
||||
@@ -0,0 +1,418 @@
|
||||
# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则:
|
||||
# 1. 不得用于任何商业用途。
|
||||
# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。
|
||||
# 3. 不得进行大规模爬取或对平台造成运营干扰。
|
||||
# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。
|
||||
# 5. 不得用于任何非法或不当的用途。
|
||||
#
|
||||
# 详细许可条款请参阅项目根目录下的LICENSE文件。
|
||||
# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。
|
||||
|
||||
|
||||
# -*- coding: utf-8 -*-
|
||||
import html
|
||||
import json
|
||||
import re
|
||||
from typing import Dict, List, Tuple
|
||||
from urllib.parse import parse_qs, unquote
|
||||
|
||||
from parsel import Selector
|
||||
|
||||
from constant import baidu_tieba as const
|
||||
from model.m_baidu_tieba import TiebaComment, TiebaCreator, TiebaNote
|
||||
from tools import utils
|
||||
|
||||
GENDER_MALE = "sex_male"
|
||||
GENDER_FEMALE = "sex_female"
|
||||
|
||||
|
||||
class TieBaExtractor:
|
||||
def __init__(self):
|
||||
pass
|
||||
|
||||
@staticmethod
|
||||
def extract_search_note_list(page_content: str) -> List[TiebaNote]:
|
||||
"""
|
||||
提取贴吧帖子列表,这里提取的关键词搜索结果页的数据,还缺少帖子的回复数和回复页等数据
|
||||
Args:
|
||||
page_content: 页面内容的HTML字符串
|
||||
|
||||
Returns:
|
||||
包含帖子信息的字典列表
|
||||
"""
|
||||
xpath_selector = "//div[@class='s_post']"
|
||||
post_list = Selector(text=page_content).xpath(xpath_selector)
|
||||
result: List[TiebaNote] = []
|
||||
for post in post_list:
|
||||
tieba_note = TiebaNote(note_id=post.xpath(".//span[@class='p_title']/a/@data-tid").get(default='').strip(),
|
||||
title=post.xpath(".//span[@class='p_title']/a/text()").get(default='').strip(),
|
||||
desc=post.xpath(".//div[@class='p_content']/text()").get(default='').strip(),
|
||||
note_url=const.TIEBA_URL + post.xpath(".//span[@class='p_title']/a/@href").get(
|
||||
default=''),
|
||||
user_nickname=post.xpath(".//a[starts-with(@href, '/home/main')]/font/text()").get(
|
||||
default='').strip(), user_link=const.TIEBA_URL + post.xpath(
|
||||
".//a[starts-with(@href, '/home/main')]/@href").get(default=''),
|
||||
tieba_name=post.xpath(".//a[@class='p_forum']/font/text()").get(default='').strip(),
|
||||
tieba_link=const.TIEBA_URL + post.xpath(".//a[@class='p_forum']/@href").get(
|
||||
default=''),
|
||||
publish_time=post.xpath(".//font[@class='p_green p_date']/text()").get(
|
||||
default='').strip(), )
|
||||
result.append(tieba_note)
|
||||
return result
|
||||
|
||||
def extract_tieba_note_list(self, page_content: str) -> List[TiebaNote]:
|
||||
"""
|
||||
提取贴吧帖子列表
|
||||
Args:
|
||||
page_content:
|
||||
|
||||
Returns:
|
||||
|
||||
"""
|
||||
page_content = page_content.replace('<!--', "")
|
||||
content_selector = Selector(text=page_content)
|
||||
xpath_selector = "//ul[@id='thread_list']/li"
|
||||
post_list = content_selector.xpath(xpath_selector)
|
||||
result: List[TiebaNote] = []
|
||||
for post_selector in post_list:
|
||||
post_field_value: Dict = self.extract_data_field_value(post_selector)
|
||||
if not post_field_value:
|
||||
continue
|
||||
note_id = str(post_field_value.get("id"))
|
||||
tieba_note = TiebaNote(note_id=note_id,
|
||||
title=post_selector.xpath(".//a[@class='j_th_tit ']/text()").get(default='').strip(),
|
||||
desc=post_selector.xpath(
|
||||
".//div[@class='threadlist_abs threadlist_abs_onlyline ']/text()").get(
|
||||
default='').strip(), note_url=const.TIEBA_URL + f"/p/{note_id}",
|
||||
user_link=const.TIEBA_URL + post_selector.xpath(
|
||||
".//a[@class='frs-author-name j_user_card ']/@href").get(default='').strip(),
|
||||
user_nickname=post_field_value.get("authoer_nickname") or post_field_value.get(
|
||||
"author_name"),
|
||||
tieba_name=content_selector.xpath("//a[@class='card_title_fname']/text()").get(
|
||||
default='').strip(), tieba_link=const.TIEBA_URL + content_selector.xpath(
|
||||
"//a[@class='card_title_fname']/@href").get(default=''),
|
||||
total_replay_num=post_field_value.get("reply_num", 0))
|
||||
result.append(tieba_note)
|
||||
return result
|
||||
|
||||
def extract_note_detail(self, page_content: str) -> TiebaNote:
|
||||
"""
|
||||
提取贴吧帖子详情
|
||||
Args:
|
||||
page_content:
|
||||
|
||||
Returns:
|
||||
|
||||
"""
|
||||
content_selector = Selector(text=page_content)
|
||||
first_floor_selector = content_selector.xpath("//div[@class='p_postlist'][1]")
|
||||
only_view_author_link = content_selector.xpath("//*[@id='lzonly_cntn']/@href").get(default='').strip()
|
||||
note_id = only_view_author_link.split("?")[0].split("/")[-1]
|
||||
# 帖子回复数、回复页数
|
||||
thread_num_infos = content_selector.xpath(
|
||||
"//div[@id='thread_theme_5']//li[@class='l_reply_num']//span[@class='red']")
|
||||
# IP地理位置、发表时间
|
||||
other_info_content = content_selector.xpath(".//div[@class='post-tail-wrap']").get(default="").strip()
|
||||
ip_location, publish_time = self.extract_ip_and_pub_time(other_info_content)
|
||||
note = TiebaNote(note_id=note_id, title=content_selector.xpath("//title/text()").get(default='').strip(),
|
||||
desc=content_selector.xpath("//meta[@name='description']/@content").get(default='').strip(),
|
||||
note_url=const.TIEBA_URL + f"/p/{note_id}",
|
||||
user_link=const.TIEBA_URL + first_floor_selector.xpath(
|
||||
".//a[@class='p_author_face ']/@href").get(default='').strip(),
|
||||
user_nickname=first_floor_selector.xpath(
|
||||
".//a[@class='p_author_name j_user_card']/text()").get(default='').strip(),
|
||||
user_avatar=first_floor_selector.xpath(".//a[@class='p_author_face ']/img/@src").get(
|
||||
default='').strip(),
|
||||
tieba_name=content_selector.xpath("//a[@class='card_title_fname']/text()").get(
|
||||
default='').strip(), tieba_link=const.TIEBA_URL + content_selector.xpath(
|
||||
"//a[@class='card_title_fname']/@href").get(default=''), ip_location=ip_location,
|
||||
publish_time=publish_time,
|
||||
total_replay_num=thread_num_infos[0].xpath("./text()").get(default='').strip(),
|
||||
total_replay_page=thread_num_infos[1].xpath("./text()").get(default='').strip(), )
|
||||
note.title = note.title.replace(f"【{note.tieba_name}】_百度贴吧", "")
|
||||
return note
|
||||
|
||||
def extract_tieba_note_parment_comments(self, page_content: str, note_id: str) -> List[TiebaComment]:
|
||||
"""
|
||||
提取贴吧帖子一级评论
|
||||
Args:
|
||||
page_content:
|
||||
note_id:
|
||||
|
||||
Returns:
|
||||
|
||||
"""
|
||||
xpath_selector = "//div[@class='l_post l_post_bright j_l_post clearfix ']"
|
||||
comment_list = Selector(text=page_content).xpath(xpath_selector)
|
||||
result: List[TiebaComment] = []
|
||||
for comment_selector in comment_list:
|
||||
comment_field_value: Dict = self.extract_data_field_value(comment_selector)
|
||||
if not comment_field_value:
|
||||
continue
|
||||
tieba_name = comment_selector.xpath("//a[@class='card_title_fname']/text()").get(default='').strip()
|
||||
other_info_content = comment_selector.xpath(".//div[@class='post-tail-wrap']").get(default="").strip()
|
||||
ip_location, publish_time = self.extract_ip_and_pub_time(other_info_content)
|
||||
tieba_comment = TiebaComment(comment_id=str(comment_field_value.get("content").get("post_id")),
|
||||
sub_comment_count=comment_field_value.get("content").get("comment_num"),
|
||||
content=utils.extract_text_from_html(
|
||||
comment_field_value.get("content").get("content")),
|
||||
note_url=const.TIEBA_URL + f"/p/{note_id}",
|
||||
user_link=const.TIEBA_URL + comment_selector.xpath(
|
||||
".//a[@class='p_author_face ']/@href").get(default='').strip(),
|
||||
user_nickname=comment_selector.xpath(
|
||||
".//a[@class='p_author_name j_user_card']/text()").get(default='').strip(),
|
||||
user_avatar=comment_selector.xpath(
|
||||
".//a[@class='p_author_face ']/img/@src").get(default='').strip(),
|
||||
tieba_id=str(comment_field_value.get("content").get("forum_id", "")),
|
||||
tieba_name=tieba_name, tieba_link=f"https://tieba.baidu.com/f?kw={tieba_name}",
|
||||
ip_location=ip_location, publish_time=publish_time, note_id=note_id, )
|
||||
result.append(tieba_comment)
|
||||
return result
|
||||
|
||||
def extract_tieba_note_sub_comments(self, page_content: str, parent_comment: TiebaComment) -> List[TiebaComment]:
|
||||
"""
|
||||
提取贴吧帖子二级评论
|
||||
Args:
|
||||
page_content:
|
||||
parent_comment:
|
||||
|
||||
Returns:
|
||||
|
||||
"""
|
||||
selector = Selector(page_content)
|
||||
comments = []
|
||||
comment_ele_list = selector.xpath("//li[@class='lzl_single_post j_lzl_s_p first_no_border']")
|
||||
comment_ele_list.extend(selector.xpath("//li[@class='lzl_single_post j_lzl_s_p ']"))
|
||||
for comment_ele in comment_ele_list:
|
||||
comment_value = self.extract_data_field_value(comment_ele)
|
||||
if not comment_value:
|
||||
continue
|
||||
comment_user_a_selector = comment_ele.xpath("./a[@class='j_user_card lzl_p_p']")[0]
|
||||
content = utils.extract_text_from_html(
|
||||
comment_ele.xpath(".//span[@class='lzl_content_main']").get(default=""))
|
||||
comment = TiebaComment(
|
||||
comment_id=str(comment_value.get("spid")), content=content,
|
||||
user_link=comment_user_a_selector.xpath("./@href").get(default=""),
|
||||
user_nickname=comment_value.get("showname"),
|
||||
user_avatar=comment_user_a_selector.xpath("./img/@src").get(default=""),
|
||||
publish_time=comment_ele.xpath(".//span[@class='lzl_time']/text()").get(default="").strip(),
|
||||
parent_comment_id=parent_comment.comment_id,
|
||||
note_id=parent_comment.note_id, note_url=parent_comment.note_url,
|
||||
tieba_id=parent_comment.tieba_id, tieba_name=parent_comment.tieba_name,
|
||||
tieba_link=parent_comment.tieba_link)
|
||||
comments.append(comment)
|
||||
|
||||
return comments
|
||||
|
||||
def extract_creator_info(self, html_content: str) -> TiebaCreator:
|
||||
"""
|
||||
提取贴吧创作者信息
|
||||
Args:
|
||||
html_content:
|
||||
|
||||
Returns:
|
||||
|
||||
"""
|
||||
selector = Selector(text=html_content)
|
||||
user_link_selector = selector.xpath("//p[@class='space']/a")
|
||||
user_link: str = user_link_selector.xpath("./@href").get(default='')
|
||||
user_link_params: Dict = parse_qs(unquote(user_link.split("?")[-1]))
|
||||
user_name = user_link_params.get("un")[0] if user_link_params.get("un") else ""
|
||||
user_id = user_link_params.get("id")[0] if user_link_params.get("id") else ""
|
||||
userinfo_userdata_selector = selector.xpath("//div[@class='userinfo_userdata']")
|
||||
follow_fans_selector = selector.xpath("//span[@class='concern_num']")
|
||||
follows, fans = 0, 0
|
||||
if len(follow_fans_selector) == 2:
|
||||
follows, fans = self.extract_follow_and_fans(follow_fans_selector)
|
||||
user_content = userinfo_userdata_selector.get(default='')
|
||||
return TiebaCreator(user_id=user_id, user_name=user_name,
|
||||
nickname=selector.xpath(".//span[@class='userinfo_username ']/text()").get(
|
||||
default='').strip(),
|
||||
avatar=selector.xpath(".//div[@class='userinfo_left_head']//img/@src").get(
|
||||
default='').strip(),
|
||||
gender=self.extract_gender(user_content),
|
||||
ip_location=self.extract_ip(user_content),
|
||||
follows=follows,
|
||||
fans=fans,
|
||||
registration_duration=self.extract_registration_duration(user_content)
|
||||
)
|
||||
|
||||
@staticmethod
|
||||
def extract_tieba_thread_id_list_from_creator_page(
|
||||
html_content: str
|
||||
) -> List[str]:
|
||||
"""
|
||||
提取贴吧创作者主页的帖子列表
|
||||
Args:
|
||||
html_content:
|
||||
|
||||
Returns:
|
||||
|
||||
"""
|
||||
selector = Selector(text=html_content)
|
||||
thread_id_list = []
|
||||
xpath_selector = (
|
||||
"//ul[@class='new_list clearfix']//div[@class='thread_name']/a[1]/@href"
|
||||
)
|
||||
thread_url_list = selector.xpath(xpath_selector).getall()
|
||||
for thread_url in thread_url_list:
|
||||
thread_id = thread_url.split("?")[0].split("/")[-1]
|
||||
thread_id_list.append(thread_id)
|
||||
return thread_id_list
|
||||
|
||||
def extract_ip_and_pub_time(self, html_content: str) -> Tuple[str, str]:
|
||||
"""
|
||||
提取IP位置和发布时间
|
||||
Args:
|
||||
html_content:
|
||||
|
||||
Returns:
|
||||
|
||||
"""
|
||||
pattern_pub_time = re.compile(r'<span class="tail-info">(\d{4}-\d{2}-\d{2} \d{2}:\d{2})</span>')
|
||||
time_match = pattern_pub_time.search(html_content)
|
||||
pub_time = time_match.group(1) if time_match else ""
|
||||
return self.extract_ip(html_content), pub_time
|
||||
|
||||
@staticmethod
|
||||
def extract_ip(html_content: str) -> str:
|
||||
"""
|
||||
提取IP
|
||||
Args:
|
||||
html_content:
|
||||
|
||||
Returns:
|
||||
|
||||
"""
|
||||
pattern_ip = re.compile(r'IP属地:(\S+)</span>')
|
||||
ip_match = pattern_ip.search(html_content)
|
||||
ip = ip_match.group(1) if ip_match else ""
|
||||
return ip
|
||||
|
||||
@staticmethod
|
||||
def extract_gender(html_content: str) -> str:
|
||||
"""
|
||||
提取性别
|
||||
Args:
|
||||
html_content:
|
||||
|
||||
Returns:
|
||||
|
||||
"""
|
||||
if GENDER_MALE in html_content:
|
||||
return '男'
|
||||
elif GENDER_FEMALE in html_content:
|
||||
return '女'
|
||||
return '未知'
|
||||
|
||||
@staticmethod
|
||||
def extract_follow_and_fans(selectors: List[Selector]) -> Tuple[str, str]:
|
||||
"""
|
||||
提取关注数和粉丝数
|
||||
Args:
|
||||
selectors:
|
||||
|
||||
Returns:
|
||||
|
||||
"""
|
||||
pattern = re.compile(r'<span class="concern_num">\(<a[^>]*>(\d+)</a>\)</span>')
|
||||
follow_match = pattern.findall(selectors[0].get())
|
||||
fans_match = pattern.findall(selectors[1].get())
|
||||
follows = follow_match[0] if follow_match else 0
|
||||
fans = fans_match[0] if fans_match else 0
|
||||
return follows, fans
|
||||
|
||||
@staticmethod
|
||||
def extract_registration_duration(html_content: str) -> str:
|
||||
"""
|
||||
"<span>吧龄:1.9年</span>"
|
||||
Returns: 1.9年
|
||||
|
||||
"""
|
||||
pattern = re.compile(r'<span>吧龄:(\S+)</span>')
|
||||
match = pattern.search(html_content)
|
||||
return match.group(1) if match else ""
|
||||
|
||||
@staticmethod
|
||||
def extract_data_field_value(selector: Selector) -> Dict:
|
||||
"""
|
||||
提取data-field的值
|
||||
Args:
|
||||
selector:
|
||||
|
||||
Returns:
|
||||
|
||||
"""
|
||||
data_field_value = selector.xpath("./@data-field").get(default='').strip()
|
||||
if not data_field_value or data_field_value == "{}":
|
||||
return {}
|
||||
try:
|
||||
# 先使用 html.unescape 处理转义字符 再json.loads 将 JSON 字符串转换为 Python 字典
|
||||
unescaped_json_str = html.unescape(data_field_value)
|
||||
data_field_dict_value = json.loads(unescaped_json_str)
|
||||
except Exception as ex:
|
||||
print(f"extract_data_field_value,错误信息:{ex}, 尝试使用其他方式解析")
|
||||
data_field_dict_value = {}
|
||||
return data_field_dict_value
|
||||
|
||||
|
||||
def test_extract_search_note_list():
|
||||
with open("test_data/search_keyword_notes.html", "r", encoding="utf-8") as f:
|
||||
content = f.read()
|
||||
extractor = TieBaExtractor()
|
||||
result = extractor.extract_search_note_list(content)
|
||||
print(result)
|
||||
|
||||
|
||||
def test_extract_note_detail():
|
||||
with open("test_data/note_detail.html", "r", encoding="utf-8") as f:
|
||||
content = f.read()
|
||||
extractor = TieBaExtractor()
|
||||
result = extractor.extract_note_detail(content)
|
||||
print(result.model_dump())
|
||||
|
||||
|
||||
def test_extract_tieba_note_parment_comments():
|
||||
with open("test_data/note_comments.html", "r", encoding="utf-8") as f:
|
||||
content = f.read()
|
||||
extractor = TieBaExtractor()
|
||||
result = extractor.extract_tieba_note_parment_comments(content, "123456")
|
||||
print(result)
|
||||
|
||||
|
||||
def test_extract_tieba_note_sub_comments():
|
||||
with open("test_data/note_sub_comments.html", "r", encoding="utf-8") as f:
|
||||
content = f.read()
|
||||
extractor = TieBaExtractor()
|
||||
fake_parment_comment = TiebaComment(comment_id="123456", content="content", user_link="user_link",
|
||||
user_nickname="user_nickname", user_avatar="user_avatar",
|
||||
publish_time="publish_time", parent_comment_id="parent_comment_id",
|
||||
note_id="note_id", note_url="note_url", tieba_id="tieba_id",
|
||||
tieba_name="tieba_name", )
|
||||
result = extractor.extract_tieba_note_sub_comments(content, fake_parment_comment)
|
||||
print(result)
|
||||
|
||||
|
||||
def test_extract_tieba_note_list():
|
||||
with open("test_data/tieba_note_list.html", "r", encoding="utf-8") as f:
|
||||
content = f.read()
|
||||
extractor = TieBaExtractor()
|
||||
result = extractor.extract_tieba_note_list(content)
|
||||
print(result)
|
||||
pass
|
||||
|
||||
|
||||
def test_extract_creator_info():
|
||||
with open("test_data/creator_info.html", "r", encoding="utf-8") as f:
|
||||
content = f.read()
|
||||
extractor = TieBaExtractor()
|
||||
result = extractor.extract_creator_info(content)
|
||||
print(result.model_dump_json())
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
# test_extract_search_note_list()
|
||||
# test_extract_note_detail()
|
||||
# test_extract_tieba_note_parment_comments()
|
||||
# test_extract_tieba_note_list()
|
||||
test_extract_creator_info()
|
||||
@@ -0,0 +1,123 @@
|
||||
# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则:
|
||||
# 1. 不得用于任何商业用途。
|
||||
# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。
|
||||
# 3. 不得进行大规模爬取或对平台造成运营干扰。
|
||||
# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。
|
||||
# 5. 不得用于任何非法或不当的用途。
|
||||
#
|
||||
# 详细许可条款请参阅项目根目录下的LICENSE文件。
|
||||
# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。
|
||||
|
||||
|
||||
import asyncio
|
||||
import functools
|
||||
import sys
|
||||
from typing import Optional
|
||||
|
||||
from playwright.async_api import BrowserContext, Page
|
||||
from tenacity import (RetryError, retry, retry_if_result, stop_after_attempt,
|
||||
wait_fixed)
|
||||
|
||||
import config
|
||||
from base.base_crawler import AbstractLogin
|
||||
from tools import utils
|
||||
|
||||
|
||||
class BaiduTieBaLogin(AbstractLogin):
|
||||
|
||||
def __init__(self,
|
||||
login_type: str,
|
||||
browser_context: BrowserContext,
|
||||
context_page: Page,
|
||||
login_phone: Optional[str] = "",
|
||||
cookie_str: str = ""
|
||||
):
|
||||
config.LOGIN_TYPE = login_type
|
||||
self.browser_context = browser_context
|
||||
self.context_page = context_page
|
||||
self.login_phone = login_phone
|
||||
self.cookie_str = cookie_str
|
||||
|
||||
@retry(stop=stop_after_attempt(600), wait=wait_fixed(1), retry=retry_if_result(lambda value: value is False))
|
||||
async def check_login_state(self) -> bool:
|
||||
"""
|
||||
轮训检查登录状态是否成功,成功返回True否则返回False
|
||||
|
||||
Returns:
|
||||
|
||||
"""
|
||||
current_cookie = await self.browser_context.cookies()
|
||||
_, cookie_dict = utils.convert_cookies(current_cookie)
|
||||
stoken = cookie_dict.get("STOKEN")
|
||||
ptoken = cookie_dict.get("PTOKEN")
|
||||
if stoken or ptoken:
|
||||
return True
|
||||
return False
|
||||
|
||||
async def begin(self):
|
||||
"""Start login baidutieba"""
|
||||
utils.logger.info("[BaiduTieBaLogin.begin] Begin login baidutieba ...")
|
||||
if config.LOGIN_TYPE == "qrcode":
|
||||
await self.login_by_qrcode()
|
||||
elif config.LOGIN_TYPE == "phone":
|
||||
await self.login_by_mobile()
|
||||
elif config.LOGIN_TYPE == "cookie":
|
||||
await self.login_by_cookies()
|
||||
else:
|
||||
raise ValueError("[BaiduTieBaLogin.begin]Invalid Login Type Currently only supported qrcode or phone or cookies ...")
|
||||
|
||||
async def login_by_mobile(self):
|
||||
"""Login baidutieba by mobile"""
|
||||
pass
|
||||
|
||||
async def login_by_qrcode(self):
|
||||
"""login baidutieba website and keep webdriver login state"""
|
||||
utils.logger.info("[BaiduTieBaLogin.login_by_qrcode] Begin login baidutieba by qrcode ...")
|
||||
qrcode_img_selector = "xpath=//img[@class='tang-pass-qrcode-img']"
|
||||
# find login qrcode
|
||||
base64_qrcode_img = await utils.find_login_qrcode(
|
||||
self.context_page,
|
||||
selector=qrcode_img_selector
|
||||
)
|
||||
if not base64_qrcode_img:
|
||||
utils.logger.info("[BaiduTieBaLogin.login_by_qrcode] login failed , have not found qrcode please check ....")
|
||||
# if this website does not automatically popup login dialog box, we will manual click login button
|
||||
await asyncio.sleep(0.5)
|
||||
login_button_ele = self.context_page.locator("xpath=//li[@class='u_login']")
|
||||
await login_button_ele.click()
|
||||
base64_qrcode_img = await utils.find_login_qrcode(
|
||||
self.context_page,
|
||||
selector=qrcode_img_selector
|
||||
)
|
||||
if not base64_qrcode_img:
|
||||
utils.logger.info("[BaiduTieBaLogin.login_by_qrcode] login failed , have not found qrcode please check ....")
|
||||
sys.exit()
|
||||
|
||||
# show login qrcode
|
||||
# fix issue #12
|
||||
# we need to use partial function to call show_qrcode function and run in executor
|
||||
# then current asyncio event loop will not be blocked
|
||||
partial_show_qrcode = functools.partial(utils.show_qrcode, base64_qrcode_img)
|
||||
asyncio.get_running_loop().run_in_executor(executor=None, func=partial_show_qrcode)
|
||||
|
||||
utils.logger.info(f"[BaiduTieBaLogin.login_by_qrcode] waiting for scan code login, remaining time is 120s")
|
||||
try:
|
||||
await self.check_login_state()
|
||||
except RetryError:
|
||||
utils.logger.info("[BaiduTieBaLogin.login_by_qrcode] Login baidutieba failed by qrcode login method ...")
|
||||
sys.exit()
|
||||
|
||||
wait_redirect_seconds = 5
|
||||
utils.logger.info(f"[BaiduTieBaLogin.login_by_qrcode] Login successful then wait for {wait_redirect_seconds} seconds redirect ...")
|
||||
await asyncio.sleep(wait_redirect_seconds)
|
||||
|
||||
async def login_by_cookies(self):
|
||||
"""login baidutieba website by cookies"""
|
||||
utils.logger.info("[BaiduTieBaLogin.login_by_cookies] Begin login baidutieba by cookie ...")
|
||||
for key, value in utils.convert_str_cookie_to_dict(self.cookie_str).items():
|
||||
await self.browser_context.add_cookies([{
|
||||
'name': key,
|
||||
'value': value,
|
||||
'domain': ".baidu.com",
|
||||
'path': "/"
|
||||
}])
|
||||
+874
File diff suppressed because one or more lines are too long
+839
File diff suppressed because one or more lines are too long
+189
@@ -0,0 +1,189 @@
|
||||
<li class="lzl_single_post j_lzl_s_p first_no_border" data-field='{"spid":150726504693,"showname":"heinzfrentzen","user_name":"heinzfrentzen","portrait":"tb.1.b08d8f12.IR-tbLlZ2GkD6ARA-mfGOA"}'>
|
||||
<a rel="noopener" name="150726504693"></a>
|
||||
<a rel="noopener" data-field='{"un":"heinzfrentzen","id":"tb.1.b08d8f12.IR-tbLlZ2GkD6ARA-mfGOA"}' target="_blank" class="j_user_card lzl_p_p" href="/home/main?id=tb.1.b08d8f12.IR-tbLlZ2GkD6ARA-mfGOA&fr=pb" username="heinzfrentzen">
|
||||
<img src="https://gss0.bdstatic.com/6LZ1dD3d1sgCo2Kml5_Y_D3/sys/portrait/item/tb.1.b08d8f12.IR-tbLlZ2GkD6ARA-mfGOA"/>
|
||||
</a>
|
||||
<div class="lzl_cnt" data-field='{"iconArr":[],"free_flag":null}'>
|
||||
<a rel="noopener" class="at j_user_card " data-field='{"un":"heinzfrentzen","id":"tb.1.b08d8f12.IR-tbLlZ2GkD6ARA-mfGOA"}' href="/home/main?id=tb.1.b08d8f12.IR-tbLlZ2GkD6ARA-mfGOA&ie=utf-8&fr=pb" target="_blank" username="heinzfrentzen">heinzfrentzen</a>
|
||||
:
|
||||
<span class="lzl_content_main" data-username="">
|
||||
<img class="BDE_Smiley" width="30" height="30" changedsize="false" src="https://gsp0.baidu.com/5aAHeD3nKhI2p27j8IqW0jdnxx1xbK/tb/editor/images/client/image_emoticon25.png">
|
||||
<img class="BDE_Smiley" width="30" height="30" changedsize="false" src="https://gsp0.baidu.com/5aAHeD3nKhI2p27j8IqW0jdnxx1xbK/tb/editor/images/client/image_emoticon25.png">
|
||||
</span>
|
||||
<div class="lzl_content_reply">
|
||||
<span class="lzl_jb" style="display:none;"></span>
|
||||
<span class="lzl_op_list j_lzl_o_l" style="display:none;"></span>
|
||||
<span class="lzl_time">2024-8-6 22:11</span>
|
||||
<a rel="noopener" href="#" class="lzl_s_r">回复</a>
|
||||
</div>
|
||||
</div>
|
||||
</li>
|
||||
<li class="lzl_single_post j_lzl_s_p " data-field='{"spid":150726506822,"showname":"\u53ef\u7231\u7684\u642c\u8fd0\u5de594","user_name":"\u53ef\u7231\u7684\u642c\u8fd0\u5de594","portrait":"tb.1.f1b47a84.Rixjf6fMP-PfH8fnS1CgRA"}'>
|
||||
<a rel="noopener" name="150726506822"></a>
|
||||
<a rel="noopener" data-field='{"un":"\u53ef\u7231\u7684\u642c\u8fd0\u5de594","id":"tb.1.f1b47a84.Rixjf6fMP-PfH8fnS1CgRA"}' target="_blank" class="j_user_card lzl_p_p" href="/home/main?id=tb.1.f1b47a84.Rixjf6fMP-PfH8fnS1CgRA&fr=pb" username="可爱的搬运工94">
|
||||
<img src="https://gss0.bdstatic.com/6LZ1dD3d1sgCo2Kml5_Y_D3/sys/portrait/item/tb.1.f1b47a84.Rixjf6fMP-PfH8fnS1CgRA"/>
|
||||
</a>
|
||||
<div class="lzl_cnt" data-field='{"iconArr":[],"free_flag":null}'>
|
||||
<a rel="noopener" class="at j_user_card " data-field='{"un":"\u53ef\u7231\u7684\u642c\u8fd0\u5de594","id":"tb.1.f1b47a84.Rixjf6fMP-PfH8fnS1CgRA"}' href="/home/main?id=tb.1.f1b47a84.Rixjf6fMP-PfH8fnS1CgRA&ie=utf-8&fr=pb" target="_blank" username="可爱的搬运工94">可爱的搬运工94</a>
|
||||
:<span class="lzl_content_main" data-username="">陈芋汐水花也不小 </span>
|
||||
<div class="lzl_content_reply">
|
||||
<span class="lzl_jb" style="display:none;"></span>
|
||||
<span class="lzl_op_list j_lzl_o_l" style="display:none;"></span>
|
||||
<span class="lzl_time">2024-8-6 22:12</span>
|
||||
<a rel="noopener" href="#" class="lzl_s_r">回复</a>
|
||||
</div>
|
||||
</div>
|
||||
</li>
|
||||
<li class="lzl_single_post j_lzl_s_p " data-field='{"spid":150726508024,"showname":"\u56fd\u9645\u4f53\u575b\u5de8\u661f\u9752\u6912\u8089\u4e1d","user_name":"\u8682\u8681\u96c5\u864e\u54c8\u54c8","portrait":"tb.1.c5c485ab.Cf5aDgd1NxLxZlej8r4LWg"}'>
|
||||
<a rel="noopener" name="150726508024"></a>
|
||||
<a rel="noopener" data-field='{"un":"\u8682\u8681\u96c5\u864e\u54c8\u54c8","id":"tb.1.c5c485ab.Cf5aDgd1NxLxZlej8r4LWg"}' target="_blank" class="j_user_card lzl_p_p" href="/home/main?id=tb.1.c5c485ab.Cf5aDgd1NxLxZlej8r4LWg&fr=pb" username="蚂蚁雅虎哈哈">
|
||||
<img src="https://gss0.bdstatic.com/6LZ1dD3d1sgCo2Kml5_Y_D3/sys/portrait/item/tb.1.c5c485ab.Cf5aDgd1NxLxZlej8r4LWg"/>
|
||||
</a>
|
||||
<div class="lzl_cnt" data-field='{"iconArr":[],"free_flag":null}'>
|
||||
<a rel="noopener" class="at j_user_card " data-field='{"un":"\u8682\u8681\u96c5\u864e\u54c8\u54c8","id":"tb.1.c5c485ab.Cf5aDgd1NxLxZlej8r4LWg"}' href="/home/main?id=tb.1.c5c485ab.Cf5aDgd1NxLxZlej8r4LWg&ie=utf-8&fr=pb" target="_blank" username="蚂蚁雅虎哈哈">国际体坛巨星青椒肉丝</a>
|
||||
:<span class="lzl_content_main" data-username="">你怀孕了吗 老是呕吐 </span>
|
||||
<div class="lzl_content_reply">
|
||||
<span class="lzl_jb" style="display:none;"></span>
|
||||
<span class="lzl_op_list j_lzl_o_l" style="display:none;"></span>
|
||||
<span class="lzl_time">2024-8-6 22:12</span>
|
||||
<a rel="noopener" href="#" class="lzl_s_r">回复</a>
|
||||
</div>
|
||||
</div>
|
||||
</li>
|
||||
<li class="lzl_single_post j_lzl_s_p " data-field='{"spid":150726509762,"showname":"\u8317\u82b1\u5c11\u5e05","user_name":"\u8317\u82b1\u5c11\u5e05","portrait":"tb.1.a0b6ca3c.54TCKizU2c9oSYWqNF7NqA"}'>
|
||||
<a rel="noopener" name="150726509762"></a>
|
||||
<a rel="noopener" data-field='{"un":"\u8317\u82b1\u5c11\u5e05","id":"tb.1.a0b6ca3c.54TCKizU2c9oSYWqNF7NqA"}' target="_blank" class="j_user_card lzl_p_p" href="/home/main?id=tb.1.a0b6ca3c.54TCKizU2c9oSYWqNF7NqA&fr=pb" username="茗花少帅">
|
||||
<img src="https://gss0.bdstatic.com/6LZ1dD3d1sgCo2Kml5_Y_D3/sys/portrait/item/tb.1.a0b6ca3c.54TCKizU2c9oSYWqNF7NqA"/>
|
||||
</a>
|
||||
<div class="lzl_cnt" data-field='{"iconArr":{"all_level":{"2":{"end_time":"1421248220","level":2,"pic_url":"http:\/\/imgsrc.baidu.com\/forum\/pic\/item\/6afa80cb39dbb6fdf9de234d0b24ab18962b37f0.jpg","score_limit":8000}},"level":{"end_time":"1421248220","pic_url":"http:\/\/imgsrc.baidu.com\/forum\/pic\/item\/6afa80cb39dbb6fdf9de234d0b24ab18962b37f0.jpg","props_id":2}},"free_flag":null}'>
|
||||
<a rel="noopener" class="at j_user_card " data-field='{"un":"\u8317\u82b1\u5c11\u5e05","id":"tb.1.a0b6ca3c.54TCKizU2c9oSYWqNF7NqA"}' href="/home/main?id=tb.1.a0b6ca3c.54TCKizU2c9oSYWqNF7NqA&ie=utf-8&fr=pb" target="_blank" username="茗花少帅">茗花少帅</a>
|
||||
:<span class="lzl_content_main" data-username="">你就只看水花,不看空中姿态吗 </span>
|
||||
<div class="lzl_content_reply">
|
||||
<span class="lzl_jb" style="display:none;"></span>
|
||||
<span class="lzl_op_list j_lzl_o_l" style="display:none;"></span>
|
||||
<span class="lzl_time">2024-8-6 22:12</span>
|
||||
<a rel="noopener" href="#" class="lzl_s_r">回复</a>
|
||||
</div>
|
||||
</div>
|
||||
</li>
|
||||
<li class="lzl_single_post j_lzl_s_p " data-field='{"spid":150726510645,"showname":"\u4e1c\u534e\u6b66\u5170","user_name":"\u897f\u5b89\u4ea4\u5927\u524d\u4e00\u767e","portrait":"tb.1.774316af.RqsfwTN2w3AJQFmXAO_MHw"}'>
|
||||
<a rel="noopener" name="150726510645"></a>
|
||||
<a rel="noopener" data-field='{"un":"\u897f\u5b89\u4ea4\u5927\u524d\u4e00\u767e","id":"tb.1.774316af.RqsfwTN2w3AJQFmXAO_MHw"}' target="_blank" class="j_user_card lzl_p_p" href="/home/main?id=tb.1.774316af.RqsfwTN2w3AJQFmXAO_MHw&fr=pb" username="西安交大前一百">
|
||||
<img src="https://gss0.bdstatic.com/6LZ1dD3d1sgCo2Kml5_Y_D3/sys/portrait/item/tb.1.774316af.RqsfwTN2w3AJQFmXAO_MHw"/>
|
||||
</a>
|
||||
<div class="lzl_cnt" data-field='{"iconArr":{"all_level":{"2":{"end_time":"1644033630","level":2,"pic_url":"http:\/\/imgsrc.baidu.com\/forum\/pic\/item\/6afa80cb39dbb6fdf9de234d0b24ab18962b37f0.jpg","score_limit":8000}},"level":{"end_time":"1644033630","pic_url":"http:\/\/imgsrc.baidu.com\/forum\/pic\/item\/6afa80cb39dbb6fdf9de234d0b24ab18962b37f0.jpg","props_id":2}},"free_flag":null}'>
|
||||
<a rel="noopener" class="at j_user_card " data-field='{"un":"\u897f\u5b89\u4ea4\u5927\u524d\u4e00\u767e","id":"tb.1.774316af.RqsfwTN2w3AJQFmXAO_MHw"}' href="/home/main?id=tb.1.774316af.RqsfwTN2w3AJQFmXAO_MHw&ie=utf-8&fr=pb" target="_blank" username="西安交大前一百">东华武兰</a>
|
||||
:<span class="lzl_content_main" data-username="">经典只看水花 </span>
|
||||
<div class="lzl_content_reply">
|
||||
<span class="lzl_jb" style="display:none;"></span>
|
||||
<span class="lzl_op_list j_lzl_o_l" style="display:none;"></span>
|
||||
<span class="lzl_time">2024-8-6 22:12</span>
|
||||
<a rel="noopener" href="#" class="lzl_s_r">回复</a>
|
||||
</div>
|
||||
</div>
|
||||
</li>
|
||||
<li class="lzl_single_post j_lzl_s_p " data-field='{"spid":150726514057,"showname":"\u4e0a\u4e0b\u73ed\u8981\u6ce8\u610f","user_name":"\u4e0a\u4e0b\u73ed\u8981\u6ce8\u610f","portrait":"tb.1.bcab9641.aHxSViAprkm6E0KQWrw3pg"}'>
|
||||
<a rel="noopener" name="150726514057"></a>
|
||||
<a rel="noopener" data-field='{"un":"\u4e0a\u4e0b\u73ed\u8981\u6ce8\u610f","id":"tb.1.bcab9641.aHxSViAprkm6E0KQWrw3pg"}' target="_blank" class="j_user_card lzl_p_p" href="/home/main?id=tb.1.bcab9641.aHxSViAprkm6E0KQWrw3pg&fr=pb" username="上下班要注意">
|
||||
<img src="https://gss0.bdstatic.com/6LZ1dD3d1sgCo2Kml5_Y_D3/sys/portrait/item/tb.1.bcab9641.aHxSViAprkm6E0KQWrw3pg"/>
|
||||
</a>
|
||||
<div class="lzl_cnt" data-field='{"iconArr":[],"free_flag":null}'>
|
||||
<a rel="noopener" class="at j_user_card " data-field='{"un":"\u4e0a\u4e0b\u73ed\u8981\u6ce8\u610f","id":"tb.1.bcab9641.aHxSViAprkm6E0KQWrw3pg"}' href="/home/main?id=tb.1.bcab9641.aHxSViAprkm6E0KQWrw3pg&ie=utf-8&fr=pb" target="_blank" username="上下班要注意">上下班要注意</a>
|
||||
:<span class="lzl_content_main" data-username="">额,分数正常吧 </span>
|
||||
<div class="lzl_content_reply">
|
||||
<span class="lzl_jb" style="display:none;"></span>
|
||||
<span class="lzl_op_list j_lzl_o_l" style="display:none;"></span>
|
||||
<span class="lzl_time">2024-8-6 22:13</span>
|
||||
<a rel="noopener" href="#" class="lzl_s_r">回复</a>
|
||||
</div>
|
||||
</div>
|
||||
</li>
|
||||
<li class="lzl_single_post j_lzl_s_p " data-field='{"spid":150726520372,"showname":"\u9759\u770b\u8682\u8681\u4e0a\u6811","user_name":"\u9759\u770b\u8682\u8681\u4e0a\u6811","portrait":"tb.1.7ea539b2.dHz6uxKdbItmtGkwZeV6oQ"}'>
|
||||
<a rel="noopener" name="150726520372"></a>
|
||||
<a rel="noopener" data-field='{"un":"\u9759\u770b\u8682\u8681\u4e0a\u6811","id":"tb.1.7ea539b2.dHz6uxKdbItmtGkwZeV6oQ"}' target="_blank" class="j_user_card lzl_p_p" href="/home/main?id=tb.1.7ea539b2.dHz6uxKdbItmtGkwZeV6oQ&fr=pb" username="静看蚂蚁上树">
|
||||
<img src="https://gss0.bdstatic.com/6LZ1dD3d1sgCo2Kml5_Y_D3/sys/portrait/item/tb.1.7ea539b2.dHz6uxKdbItmtGkwZeV6oQ"/>
|
||||
</a>
|
||||
<div class="lzl_cnt" data-field='{"iconArr":[],"free_flag":null}'>
|
||||
<a rel="noopener" class="at j_user_card " data-field='{"un":"\u9759\u770b\u8682\u8681\u4e0a\u6811","id":"tb.1.7ea539b2.dHz6uxKdbItmtGkwZeV6oQ"}' href="/home/main?id=tb.1.7ea539b2.dHz6uxKdbItmtGkwZeV6oQ&ie=utf-8&fr=pb" target="_blank" username="静看蚂蚁上树">静看蚂蚁上树</a>
|
||||
:
|
||||
<span class="lzl_content_main" data-username="">
|
||||
回复 <a href="http://tieba.baidu.com/i/sys/jump?un= " onclick="Stats.sendRequest('fr=tb0_forum&st_mod=pb&st_value=atlink');" onmouseover="showattip(this)" onmouseout="hideattip(this)" username=" " portrait="tb.1.c5c485ab.Cf5aDgd1NxLxZlej8r4LWg" target="_blank" class="at">国际体坛巨星青椒肉丝</a>
|
||||
:吃酸黄瓜吃多了<img class="BDE_Smiley" width="30" height="30" changedsize="false" src="https://gsp0.baidu.com/5aAHeD3nKhI2p27j8IqW0jdnxx1xbK/tb/editor/images/client/image_emoticon22.png">
|
||||
<img class="BDE_Smiley" width="30" height="30" changedsize="false" src="https://gsp0.baidu.com/5aAHeD3nKhI2p27j8IqW0jdnxx1xbK/tb/editor/images/client/image_emoticon22.png">
|
||||
<img class="BDE_Smiley" width="30" height="30" changedsize="false" src="https://gsp0.baidu.com/5aAHeD3nKhI2p27j8IqW0jdnxx1xbK/tb/editor/images/client/image_emoticon22.png">
|
||||
</span>
|
||||
<div class="lzl_content_reply">
|
||||
<span class="lzl_jb" style="display:none;"></span>
|
||||
<span class="lzl_op_list j_lzl_o_l" style="display:none;"></span>
|
||||
<span class="lzl_time">2024-8-6 22:14</span>
|
||||
<a rel="noopener" href="#" class="lzl_s_r">回复</a>
|
||||
</div>
|
||||
</div>
|
||||
</li>
|
||||
<li class="lzl_single_post j_lzl_s_p " data-field='{"spid":150726524963,"showname":"\u4e0d\u61c2\u53d6\u5565\u540d\u5b57\ud83d\ude1c","user_name":"\u9ec4\u5c0f\u6e2forz","portrait":"tb.1.e74fa44d.lLp46IIhj8NhhHk12z_qRA"}'>
|
||||
<a rel="noopener" name="150726524963"></a>
|
||||
<a rel="noopener" data-field='{"un":"\u9ec4\u5c0f\u6e2forz","id":"tb.1.e74fa44d.lLp46IIhj8NhhHk12z_qRA"}' target="_blank" class="j_user_card lzl_p_p" href="/home/main?id=tb.1.e74fa44d.lLp46IIhj8NhhHk12z_qRA&fr=pb" username="黄小港orz">
|
||||
<img src="https://gss0.bdstatic.com/6LZ1dD3d1sgCo2Kml5_Y_D3/sys/portrait/item/tb.1.e74fa44d.lLp46IIhj8NhhHk12z_qRA"/>
|
||||
</a>
|
||||
<div class="lzl_cnt" data-field='{"iconArr":[],"free_flag":null}'>
|
||||
<a rel="noopener" class="at j_user_card " data-field='{"un":"\u9ec4\u5c0f\u6e2forz","id":"tb.1.e74fa44d.lLp46IIhj8NhhHk12z_qRA"}' href="/home/main?id=tb.1.e74fa44d.lLp46IIhj8NhhHk12z_qRA&ie=utf-8&fr=pb" target="_blank" username="黄小港orz">不懂取啥名字😜</a>
|
||||
:
|
||||
<span class="lzl_content_main" data-username="">
|
||||
请你去跟国际泳联投诉<img class="BDE_Smiley" width="30" height="30" changedsize="false" src="https://gsp0.baidu.com/5aAHeD3nKhI2p27j8IqW0jdnxx1xbK/tb/editor/images/client/image_emoticon22.png">
|
||||
</span>
|
||||
<div class="lzl_content_reply">
|
||||
<span class="lzl_jb" style="display:none;"></span>
|
||||
<span class="lzl_op_list j_lzl_o_l" style="display:none;"></span>
|
||||
<span class="lzl_time">2024-8-6 22:15</span>
|
||||
<a rel="noopener" href="#" class="lzl_s_r">回复</a>
|
||||
</div>
|
||||
</div>
|
||||
</li>
|
||||
<li class="lzl_single_post j_lzl_s_p " data-field='{"spid":150726535666,"showname":"\ud83d\udcab\u6cfd\u8d6b\u62c9\ud83d\udcaf","user_name":"\u5feb\u770b\u5361\u5361\u5361\u5361","portrait":"tb.1.5f510507.B4GLS91flqmWc5QXoaRCoQ"}'>
|
||||
<a rel="noopener" name="150726535666"></a>
|
||||
<a rel="noopener" data-field='{"un":"\u5feb\u770b\u5361\u5361\u5361\u5361","id":"tb.1.5f510507.B4GLS91flqmWc5QXoaRCoQ"}' target="_blank" class="j_user_card lzl_p_p" href="/home/main?id=tb.1.5f510507.B4GLS91flqmWc5QXoaRCoQ&fr=pb" username="快看卡卡卡卡">
|
||||
<img src="https://gss0.bdstatic.com/6LZ1dD3d1sgCo2Kml5_Y_D3/sys/portrait/item/tb.1.5f510507.B4GLS91flqmWc5QXoaRCoQ"/>
|
||||
</a>
|
||||
<div class="lzl_cnt" data-field='{"iconArr":{"all_level":{"2":{"end_time":"1539783937","level":2,"pic_url":"http:\/\/imgsrc.baidu.com\/forum\/pic\/item\/6afa80cb39dbb6fdf9de234d0b24ab18962b37f0.jpg","score_limit":8000}},"level":{"end_time":"1539783937","pic_url":"http:\/\/imgsrc.baidu.com\/forum\/pic\/item\/6afa80cb39dbb6fdf9de234d0b24ab18962b37f0.jpg","props_id":2}},"free_flag":null}'>
|
||||
<a rel="noopener" class="at j_user_card " data-field='{"un":"\u5feb\u770b\u5361\u5361\u5361\u5361","id":"tb.1.5f510507.B4GLS91flqmWc5QXoaRCoQ"}' href="/home/main?id=tb.1.5f510507.B4GLS91flqmWc5QXoaRCoQ&ie=utf-8&fr=pb" target="_blank" username="快看卡卡卡卡">💫泽赫拉💯</a>
|
||||
:<span class="lzl_content_main" data-username="">第五跳陈空中分腿了,空中姿态明显全红婵更好 </span>
|
||||
<div class="lzl_content_reply">
|
||||
<span class="lzl_jb" style="display:none;"></span>
|
||||
<span class="lzl_op_list j_lzl_o_l" style="display:none;"></span>
|
||||
<span class="lzl_time">2024-8-6 22:17</span>
|
||||
<a rel="noopener" href="#" class="lzl_s_r">回复</a>
|
||||
</div>
|
||||
</div>
|
||||
</li>
|
||||
<li class="lzl_single_post j_lzl_s_p " data-field='{"spid":150726536076,"showname":"\u55ef\u55ef\u54e6\u54e6\u554a\u554a\ud83d\udc36","user_name":"\u55ef\u55ef\u54e6\u54e6\u554a\u554a\u54fc","portrait":"tb.1.ba071e03._M1o8S5FX4p57pZBJa91CQ"}'>
|
||||
<a rel="noopener" name="150726536076"></a>
|
||||
<a rel="noopener" data-field='{"un":"\u55ef\u55ef\u54e6\u54e6\u554a\u554a\u54fc","id":"tb.1.ba071e03._M1o8S5FX4p57pZBJa91CQ"}' target="_blank" class="j_user_card lzl_p_p" href="/home/main?id=tb.1.ba071e03._M1o8S5FX4p57pZBJa91CQ&fr=pb" username="嗯嗯哦哦啊啊哼">
|
||||
<img src="https://gss0.bdstatic.com/6LZ1dD3d1sgCo2Kml5_Y_D3/sys/portrait/item/tb.1.ba071e03._M1o8S5FX4p57pZBJa91CQ"/>
|
||||
</a>
|
||||
<div class="lzl_cnt" data-field='{"iconArr":null,"free_flag":null}'>
|
||||
<a rel="noopener" class="at j_user_card " data-field='{"un":"\u55ef\u55ef\u54e6\u54e6\u554a\u554a\u54fc","id":"tb.1.ba071e03._M1o8S5FX4p57pZBJa91CQ"}' href="/home/main?id=tb.1.ba071e03._M1o8S5FX4p57pZBJa91CQ&ie=utf-8&fr=pb" target="_blank" username="嗯嗯哦哦啊啊哼">嗯嗯哦哦啊啊🐶</a>
|
||||
:
|
||||
<span class="lzl_content_main" data-username="">
|
||||
回复 <a href="http://tieba.baidu.com/i/sys/jump?un= " onclick="Stats.sendRequest('fr=tb0_forum&st_mod=pb&st_value=atlink');" onmouseover="showattip(this)" onmouseout="hideattip(this)" username=" " portrait="tb.1.84497425.b5GLK5lGm90mTB2BhjrgpA" target="_blank" class="at">美味蟹黄堡💞</a>
|
||||
:你不会看起跳高度和空中姿态?
|
||||
</span>
|
||||
<div class="lzl_content_reply">
|
||||
<span class="lzl_jb" style="display:none;"></span>
|
||||
<span class="lzl_op_list j_lzl_o_l" style="display:none;"></span>
|
||||
<span class="lzl_time">2024-8-6 22:17</span>
|
||||
<a rel="noopener" href="#" class="lzl_s_r">回复</a>
|
||||
</div>
|
||||
</div>
|
||||
</li>
|
||||
<li class="lzl_li_pager j_lzl_l_p lzl_li_pager_s" data-field='{"total_num":16,"total_page":2}'>
|
||||
<a rel="noopener" class="j_lzl_p btn-sub btn-small pull-right" href="##">
|
||||
<i class="icon-reply"></i>
|
||||
我也说一句
|
||||
</a>
|
||||
<p class="j_pager l_pager pager_theme_2">
|
||||
<span class="tP">1</span>
|
||||
<a href="#2">2</a>
|
||||
<a href="#2">下一页</a>
|
||||
<a href="#2">尾页</a>
|
||||
</p>
|
||||
</li>
|
||||
+96
@@ -0,0 +1,96 @@
|
||||
<div class="s_post_list">
|
||||
<div class="s_post"><span class="p_title"><a data-tid="9117888152" data-fid="26976424" class="bluelink"
|
||||
href="/p/9117888152?pid=150718967291&cid=0#150718967291"
|
||||
target="_blank">武汉交互空间科技:富士康10亿加码中国大陆,印度为何逐渐“失宠</a></span>
|
||||
<div class="p_content">
|
||||
全球知名的电子制造服务巨头富士康的母公司鸿海精密工业股份有限公司正式对外发布了一则重大投资公告,富士康将在郑州投资建设新事业总部大楼,承载新事业总部功能。这一战略举措不仅彰显了富士康对中国市场持续深化的承诺与信心,也预示着该集团业务版图的新一轮扩张与升级。
|
||||
项目一期选址位于郑东新区,建筑面积约700公亩,总投资约10亿元人民币。主要建设总部管理中心、研发中心和工程中心、战略产业发展中心、战略产业金融平台、
|
||||
</div>
|
||||
贴吧:<a data-fid="26976424" class="p_forum" href="/f?kw=%CE%E4%BA%BA%BD%BB%BB%A5%BF%D5%BC%E4"
|
||||
target="_blank"><font class="p_violet">武汉交互空间</font></a>作者:<a
|
||||
href="/home/main?un=VR%D0%E9%C4%E2%B4%EF%C8%CB" target="_blank"><font class="p_violet">VR虚拟达人</font></a>
|
||||
<font class="p_green p_date">2024-08-05 16:45</font></div>
|
||||
<div class="s_post"><span class="p_title"><a data-tid="9114743782" data-fid="90367" class="bluelink"
|
||||
href="/p/9114743782?pid=150705176739&cid=0#150705176739"
|
||||
target="_blank">请各位急用玛尼的小心,骗子最多</a></span>
|
||||
<div class="p_content">
|
||||
这里面到处是骗子,大家小心。特别那些叫出村背货的,基本是卖园区,天下没有那么好的事。就是有这好事,我们在边境上的人,比你们最清楚,轮不到你们,边境上比你们胆子大的人大把,你一不熟悉小路,为什么叫你带货。东南亚带货的集结地,一般在南宁,防城港,昆明,西双版纳,临沧然后师机接了走小路出去,南宁,防城港坐船出去。好多都是二十几手的中介,之前卖园区一个三十万,现在不知道行情,但好多园区不收
|
||||
</div>
|
||||
贴吧:<a data-fid="90367" class="p_forum" href="/f?kw=%B1%B3%B0%FC%BF%CD" target="_blank"><font class="p_violet">背包客</font></a>作者:<a
|
||||
href="/home/main?un=%CC%F9%B0%C9%D3%C3%BB%A7_GC64AUS" target="_blank"><font class="p_violet">贴吧用户_GC64AUS</font></a>
|
||||
<font class="p_green p_date">2024-08-03 07:35</font></div>
|
||||
<div class="s_post"><span class="p_title"><a data-tid="9095684158" data-fid="1388265" class="bluelink"
|
||||
href="/p/9095684158?pid=150616716870&cid=0#150616716870"
|
||||
target="_blank">*2025泰国冷链制冷运输展*东南亚外贸出口</a></span>
|
||||
<div class="p_content">**2025泰国曼谷国际冷库、空调制冷、仓储暨冷链运输展 *2025泰国冷链制冷运输展*东南亚外贸出口-观展游览考察
|
||||
展出时间:2025-7月(具体时间待定) 展出地点:泰国曼谷会展中心 展会周期:一年一届 组展单位:北京励航国际商务会展有限公司
|
||||
人员跟团观展补贴!为您节省成本,寻找适合您的市场:
|
||||
本公司为您提供观展考察机会,让您在大型展会上获得世界同行**科技的资料同时,感受异域文化气息。展会现场走展考察→→当地游览→→当地相关市
|
||||
</div>
|
||||
贴吧:<a data-fid="1388265" class="p_forum" href="/f?kw=%B9%FA%BC%CA%D5%B9%BB%E1" target="_blank"><font
|
||||
class="p_violet">国际展会</font></a>作者:<a href="/home/main?un=zhaot_188" target="_blank"><font
|
||||
class="p_violet">zhaot_188</font></a> <font class="p_green p_date">2024-07-19 15:44</font></div>
|
||||
<div class="s_post"><span class="p_title"><a data-tid="9093564752" data-fid="27984246" class="bluelink"
|
||||
href="/p/9093564752?pid=150606964195&cid=0#150606964195"
|
||||
target="_blank">京湘楼创始人肖鑫:创立于北京,植根长沙,百年美食传承</a></span>
|
||||
<div class="p_content">来源标题:京湘楼创始人肖鑫:创立于北京,植根长沙,百年美食传承 京湘楼(KING HERO)品牌创始人:肖鑫
|
||||
京湘楼,KING
|
||||
HERO,集酱板鸭、肥肠、鸭头、鸭脖、鸭肠、小龙虾、牛蛙、捆鸡、鸡爪、鱼嘴巴、鱼尾、鱿鱼、牛肉、猪头肉等特色食品卤制,加工、包装与生产经营。2022年3月在北京朝阳区双井开设了第一家“京湘楼·鲜卤集市”卤味熟食快餐店,2023年5月在湖南省长沙市开福区注册成立了“长沙京湘楼品牌管理有限公司”,以“京湘楼”作为品
|
||||
</div>
|
||||
贴吧:<a data-fid="27984246" class="p_forum" href="/f?kw=%BE%A9%CF%E6%C2%A5" target="_blank"><font
|
||||
class="p_violet">京湘楼</font></a>作者:<a href="/home/main?un=%CC%EC%C9%F1%B6%C9%B3%BE" target="_blank"><font
|
||||
class="p_violet">天神渡尘</font></a> <font class="p_green p_date">2024-07-17 23:43</font></div>
|
||||
<div class="s_post"><span class="p_title"><a data-tid="9088419293" data-fid="310" class="bluelink"
|
||||
href="/p/9088419293?pid=150582471307&cid=0#150582471307"
|
||||
target="_blank">广州能争取到迪士尼与环球落户吗?</a></span>
|
||||
<div class="p_content">
|
||||
不是二选一,而是全都要。上一组数据,上海迪士尼2016年开业就接待游客超过1.2亿人次,香港迪士尼2023全年游客人数才640万人次,约等于无,这么低的入园人次已经引来迪士尼方面的不悦。
|
||||
美国有两个迪士尼,说实话迪士尼的门票并不高,普通人都去的起,中国完全有能力建两到三个迪士尼,欧洲只有第一个迪士尼,因为它的人口只有中国的一半,假设中国人一年吃一包盐,一年就是14包,那么欧洲就是七亿包盐,盐再便宜,欧洲人也不可能一人吃
|
||||
</div>
|
||||
贴吧:<a data-fid="310" class="p_forum" href="/f?kw=%B5%D8%C0%ED" target="_blank"><font
|
||||
class="p_violet">地理</font></a>作者:<a href="/home/main?un=SeaRoutes" target="_blank"><font
|
||||
class="p_violet">SeaRoutes</font></a> <font class="p_green p_date">2024-07-13 20:17</font></div>
|
||||
<div class="s_post"><span class="p_title"><a data-tid="9088416365" data-fid="7561034" class="bluelink"
|
||||
href="/p/9088416365?pid=150582456551&cid=0#150582456551"
|
||||
target="_blank">#城市GDP#广州应该全力去争取迪士尼和环球影城</a></span>
|
||||
<div class="p_content">
|
||||
不是二选一,而是全都要。上一组数据,上海迪士尼2016年开业就接待游客超过1.2亿人次,香港迪士尼2023全年游客人数才640万人次,约等于无,这么低的入园人次已经引来迪士尼方面的不悦。
|
||||
美国有两个迪士尼,说实话迪士尼的门票并不高,普通人都去的起,中国完全有能力建两到三个迪士尼,欧洲只有第一个迪士尼,因为它的人口只有中国的一半,假设中国人一年吃一包盐,一年就是14包,那么欧洲就是七亿包盐,盐再便宜,欧洲人也不可能一人吃
|
||||
</div>
|
||||
贴吧:<a data-fid="7561034" class="p_forum" href="/f?kw=%B3%C7%CA%D0gdp" target="_blank"><font class="p_violet">城市gdp</font></a>作者:<a
|
||||
href="/home/main?un=SeaRoutes" target="_blank"><font class="p_violet">SeaRoutes</font></a> <font
|
||||
class="p_green p_date">2024-07-13 20:14</font></div>
|
||||
<div class="s_post"><span class="p_title"><a data-tid="9087419039" data-fid="46374" class="bluelink"
|
||||
href="/p/9087419039?pid=150577861626&cid=0#150577861626"
|
||||
target="_blank">云南省首批《云南日报》昆明新闻头条聚焦阳宗海省级物流枢纽建设</a></span>
|
||||
<div class="p_content">
|
||||
7月11日《云南日报》昆明新闻头条刊发文章《阳宗海风景名胜区立足“衔接西部陆海新通道与中老铁路”优势——加速28个物流枢纽设施建设》聚焦昆明阳宗海风景名胜区系统推进省级物流枢纽建设和功能提升深挖比较优势壮大物流产业据云南省发展和改革委员会在昆明召开的新闻发布会上公布,今年全省共有5地纳入云南省第一批省级物流枢纽和省级骨干冷链物流基地建设名单,其中,昆明市有两家获批,阳宗海物流枢纽上榜!一起来看近日,云南省
|
||||
</div>
|
||||
贴吧:<a data-fid="46374" class="p_forum" href="/f?kw=%C0%A5%C3%F7" target="_blank"><font
|
||||
class="p_violet">昆明</font></a>作者:<a href="/home/main?un=%8F%EC" target="_blank"><font
|
||||
class="p_violet">忟</font></a> <font class="p_green p_date">2024-07-12 23:04</font></div>
|
||||
<div class="s_post"><span class="p_title"><a data-tid="9085102046" data-fid="348713" class="bluelink"
|
||||
href="/p/9085102046?pid=150567555367&cid=0#150567555367"
|
||||
target="_blank">寻找弟弟,很久没跟家里联系</a></span>
|
||||
<div class="p_content">Kk四期世纪园区,寻找弟弟,外号大佐,F3 2楼,公司cj集团</div>
|
||||
贴吧:<a data-fid="348713" class="p_forum" href="/f?kw=%B6%AB%C4%CF%D1%C7" target="_blank"><font
|
||||
class="p_violet">东南亚</font></a>作者:<a href="/home/main?un=%CC%F9%B0%C9%D3%C3%BB%A7_GC2CtRa"
|
||||
target="_blank"><font class="p_violet">贴吧用户_GC2CtRa</font></a>
|
||||
<font class="p_green p_date">2024-07-11 07:53</font></div>
|
||||
<div class="s_post"><span class="p_title"><a data-tid="9083888071" data-fid="30" class="bluelink"
|
||||
href="/p/9083888071?pid=150562129935&cid=0#150562129935"
|
||||
target="_blank">拉美 非洲 东南亚 南亚等发展中国家不太可能普及八小时双休吧?</a></span>
|
||||
<div class="p_content">拉美 和 东南亚的泰国 之类的连毒枭和黑色产业都管不好感觉普及八小时双休不太可能 缅甸和非洲军阀林立
|
||||
跟军阀谈八小时双休那么不开玩笑?缅北诈骗园区就能看出来。
|
||||
</div>
|
||||
贴吧:<a data-fid="30" class="p_forum" href="/f?kw=%C0%FA%CA%B7" target="_blank"><font
|
||||
class="p_violet">历史</font></a>作者:<a href="/home/main?un=yoursagain" target="_blank"><font
|
||||
class="p_violet">yoursagain</font></a> <font class="p_green p_date">2024-07-10 09:00</font></div>
|
||||
<div class="s_post"><span class="p_title"><a data-tid="9071937582" data-fid="8103241" class="bluelink"
|
||||
href="/p/9071937582?pid=150510120873&cid=0#150510120873"
|
||||
target="_blank">东南亚,园区【 工 价 低 】</a></span>
|
||||
<div class="p_content"></div>
|
||||
贴吧:<a data-fid="8103241" class="p_forum" href="/f?kw=%D4%B0%C7%F8%D5%D0%C9%CC" target="_blank"><font
|
||||
class="p_violet">园区招商</font></a>作者:<a href="/home/main?un=QQ59052966" target="_blank"><font
|
||||
class="p_violet">QQ59052966</font></a> <font class="p_green p_date">2024-06-30 12:09</font></div>
|
||||
</div>
|
||||
+3627
File diff suppressed because one or more lines are too long
@@ -0,0 +1,18 @@
|
||||
# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则:
|
||||
# 1. 不得用于任何商业用途。
|
||||
# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。
|
||||
# 3. 不得进行大规模爬取或对平台造成运营干扰。
|
||||
# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。
|
||||
# 5. 不得用于任何非法或不当的用途。
|
||||
#
|
||||
# 详细许可条款请参阅项目根目录下的LICENSE文件。
|
||||
# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。
|
||||
|
||||
|
||||
# -*- coding: utf-8 -*-
|
||||
# @Author : relakkes@gmail.com
|
||||
# @Time : 2023/12/23 15:40
|
||||
# @Desc :
|
||||
from .client import WeiboClient
|
||||
from .core import WeiboCrawler
|
||||
from .login import WeiboLogin
|
||||
@@ -0,0 +1,381 @@
|
||||
# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则:
|
||||
# 1. 不得用于任何商业用途。
|
||||
# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。
|
||||
# 3. 不得进行大规模爬取或对平台造成运营干扰。
|
||||
# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。
|
||||
# 5. 不得用于任何非法或不当的用途。
|
||||
#
|
||||
# 详细许可条款请参阅项目根目录下的LICENSE文件。
|
||||
# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。
|
||||
|
||||
# -*- coding: utf-8 -*-
|
||||
# @Author : relakkes@gmail.com
|
||||
# @Time : 2023/12/23 15:40
|
||||
# @Desc : 微博爬虫 API 请求 client
|
||||
|
||||
import asyncio
|
||||
import copy
|
||||
import json
|
||||
import re
|
||||
from typing import Callable, Dict, List, Optional, Union
|
||||
from urllib.parse import parse_qs, unquote, urlencode
|
||||
|
||||
import httpx
|
||||
from httpx import Response
|
||||
from playwright.async_api import BrowserContext, Page
|
||||
|
||||
import config
|
||||
from tools import utils
|
||||
|
||||
from .exception import DataFetchError
|
||||
from .field import SearchType
|
||||
|
||||
|
||||
class WeiboClient:
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
timeout=60, # 若开启爬取媒体选项,weibo 的图片需要更久的超时时间
|
||||
proxy=None,
|
||||
*,
|
||||
headers: Dict[str, str],
|
||||
playwright_page: Page,
|
||||
cookie_dict: Dict[str, str],
|
||||
):
|
||||
self.proxy = proxy
|
||||
self.timeout = timeout
|
||||
self.headers = headers
|
||||
self._host = "https://m.weibo.cn"
|
||||
self.playwright_page = playwright_page
|
||||
self.cookie_dict = cookie_dict
|
||||
self._image_agent_host = "https://i1.wp.com/"
|
||||
|
||||
async def request(self, method, url, **kwargs) -> Union[Response, Dict]:
|
||||
enable_return_response = kwargs.pop("return_response", False)
|
||||
async with httpx.AsyncClient(proxy=self.proxy) as client:
|
||||
response = await client.request(method, url, timeout=self.timeout, **kwargs)
|
||||
|
||||
if enable_return_response:
|
||||
return response
|
||||
|
||||
data: Dict = response.json()
|
||||
ok_code = data.get("ok")
|
||||
if ok_code == 0: # response error
|
||||
utils.logger.error(f"[WeiboClient.request] request {method}:{url} err, res:{data}")
|
||||
raise DataFetchError(data.get("msg", "response error"))
|
||||
elif ok_code != 1: # unknown error
|
||||
utils.logger.error(f"[WeiboClient.request] request {method}:{url} err, res:{data}")
|
||||
raise DataFetchError(data.get("msg", "unknown error"))
|
||||
else: # response right
|
||||
return data.get("data", {})
|
||||
|
||||
async def get(self, uri: str, params=None, headers=None, **kwargs) -> Union[Response, Dict]:
|
||||
final_uri = uri
|
||||
if isinstance(params, dict):
|
||||
final_uri = (f"{uri}?"
|
||||
f"{urlencode(params)}")
|
||||
|
||||
if headers is None:
|
||||
headers = self.headers
|
||||
return await self.request(method="GET", url=f"{self._host}{final_uri}", headers=headers, **kwargs)
|
||||
|
||||
async def post(self, uri: str, data: dict) -> Dict:
|
||||
json_str = json.dumps(data, separators=(',', ':'), ensure_ascii=False)
|
||||
return await self.request(method="POST", url=f"{self._host}{uri}", data=json_str, headers=self.headers)
|
||||
|
||||
async def pong(self) -> bool:
|
||||
"""get a note to check if login state is ok"""
|
||||
utils.logger.info("[WeiboClient.pong] Begin pong weibo...")
|
||||
ping_flag = False
|
||||
try:
|
||||
uri = "/api/config"
|
||||
resp_data: Dict = await self.request(method="GET", url=f"{self._host}{uri}", headers=self.headers)
|
||||
if resp_data.get("login"):
|
||||
ping_flag = True
|
||||
else:
|
||||
utils.logger.error(f"[WeiboClient.pong] cookie may be invalid and again login...")
|
||||
except Exception as e:
|
||||
utils.logger.error(f"[WeiboClient.pong] Pong weibo failed: {e}, and try to login again...")
|
||||
ping_flag = False
|
||||
return ping_flag
|
||||
|
||||
async def update_cookies(self, browser_context: BrowserContext):
|
||||
cookie_str, cookie_dict = utils.convert_cookies(await browser_context.cookies())
|
||||
self.headers["Cookie"] = cookie_str
|
||||
self.cookie_dict = cookie_dict
|
||||
|
||||
async def get_note_by_keyword(
|
||||
self,
|
||||
keyword: str,
|
||||
page: int = 1,
|
||||
search_type: SearchType = SearchType.DEFAULT,
|
||||
) -> Dict:
|
||||
"""
|
||||
search note by keyword
|
||||
:param keyword: 微博搜搜的关键词
|
||||
:param page: 分页参数 -当前页码
|
||||
:param search_type: 搜索的类型,见 weibo/filed.py 中的枚举SearchType
|
||||
:return:
|
||||
"""
|
||||
uri = "/api/container/getIndex"
|
||||
containerid = f"100103type={search_type.value}&q={keyword}"
|
||||
params = {
|
||||
"containerid": containerid,
|
||||
"page_type": "searchall",
|
||||
"page": page,
|
||||
}
|
||||
return await self.get(uri, params)
|
||||
|
||||
async def get_note_comments(self, mid_id: str, max_id: int, max_id_type: int = 0) -> Dict:
|
||||
"""get notes comments
|
||||
:param mid_id: 微博ID
|
||||
:param max_id: 分页参数ID
|
||||
:param max_id_type: 分页参数ID类型
|
||||
:return:
|
||||
"""
|
||||
uri = "/comments/hotflow"
|
||||
params = {
|
||||
"id": mid_id,
|
||||
"mid": mid_id,
|
||||
"max_id_type": max_id_type,
|
||||
}
|
||||
if max_id > 0:
|
||||
params.update({"max_id": max_id})
|
||||
referer_url = f"https://m.weibo.cn/detail/{mid_id}"
|
||||
headers = copy.copy(self.headers)
|
||||
headers["Referer"] = referer_url
|
||||
|
||||
return await self.get(uri, params, headers=headers)
|
||||
|
||||
async def get_note_all_comments(
|
||||
self,
|
||||
note_id: str,
|
||||
crawl_interval: float = 1.0,
|
||||
callback: Optional[Callable] = None,
|
||||
max_count: int = 10,
|
||||
):
|
||||
"""
|
||||
get note all comments include sub comments
|
||||
:param note_id:
|
||||
:param crawl_interval:
|
||||
:param callback:
|
||||
:param max_count:
|
||||
:return:
|
||||
"""
|
||||
result = []
|
||||
is_end = False
|
||||
max_id = -1
|
||||
max_id_type = 0
|
||||
while not is_end and len(result) < max_count:
|
||||
comments_res = await self.get_note_comments(note_id, max_id, max_id_type)
|
||||
max_id: int = comments_res.get("max_id")
|
||||
max_id_type: int = comments_res.get("max_id_type")
|
||||
comment_list: List[Dict] = comments_res.get("data", [])
|
||||
is_end = max_id == 0
|
||||
if len(result) + len(comment_list) > max_count:
|
||||
comment_list = comment_list[:max_count - len(result)]
|
||||
if callback: # 如果有回调函数,就执行回调函数
|
||||
await callback(note_id, comment_list)
|
||||
await asyncio.sleep(crawl_interval)
|
||||
result.extend(comment_list)
|
||||
sub_comment_result = await self.get_comments_all_sub_comments(note_id, comment_list, callback)
|
||||
result.extend(sub_comment_result)
|
||||
return result
|
||||
|
||||
@staticmethod
|
||||
async def get_comments_all_sub_comments(
|
||||
note_id: str,
|
||||
comment_list: List[Dict],
|
||||
callback: Optional[Callable] = None,
|
||||
) -> List[Dict]:
|
||||
"""
|
||||
获取评论的所有子评论
|
||||
Args:
|
||||
note_id:
|
||||
comment_list:
|
||||
callback:
|
||||
|
||||
Returns:
|
||||
|
||||
"""
|
||||
if not config.ENABLE_GET_SUB_COMMENTS:
|
||||
utils.logger.info(f"[WeiboClient.get_comments_all_sub_comments] Crawling sub_comment mode is not enabled")
|
||||
return []
|
||||
|
||||
res_sub_comments = []
|
||||
for comment in comment_list:
|
||||
sub_comments = comment.get("comments")
|
||||
if sub_comments and isinstance(sub_comments, list):
|
||||
await callback(note_id, sub_comments)
|
||||
res_sub_comments.extend(sub_comments)
|
||||
return res_sub_comments
|
||||
|
||||
async def get_note_info_by_id(self, note_id: str) -> Dict:
|
||||
"""
|
||||
根据帖子ID获取详情
|
||||
:param note_id:
|
||||
:return:
|
||||
"""
|
||||
url = f"{self._host}/detail/{note_id}"
|
||||
async with httpx.AsyncClient(proxy=self.proxy) as client:
|
||||
response = await client.request("GET", url, timeout=self.timeout, headers=self.headers)
|
||||
if response.status_code != 200:
|
||||
raise DataFetchError(f"get weibo detail err: {response.text}")
|
||||
match = re.search(r'var \$render_data = (\[.*?\])\[0\]', response.text, re.DOTALL)
|
||||
if match:
|
||||
render_data_json = match.group(1)
|
||||
render_data_dict = json.loads(render_data_json)
|
||||
note_detail = render_data_dict[0].get("status")
|
||||
note_item = {"mblog": note_detail}
|
||||
return note_item
|
||||
else:
|
||||
utils.logger.info(f"[WeiboClient.get_note_info_by_id] 未找到$render_data的值")
|
||||
return dict()
|
||||
|
||||
async def get_note_image(self, image_url: str) -> bytes:
|
||||
image_url = image_url[8:] # 去掉 https://
|
||||
sub_url = image_url.split("/")
|
||||
image_url = ""
|
||||
for i in range(len(sub_url)):
|
||||
if i == 1:
|
||||
image_url += "large/" # 都获取高清大图
|
||||
elif i == len(sub_url) - 1:
|
||||
image_url += sub_url[i]
|
||||
else:
|
||||
image_url += sub_url[i] + "/"
|
||||
# 微博图床对外存在防盗链,所以需要代理访问
|
||||
# 由于微博图片是通过 i1.wp.com 来访问的,所以需要拼接一下
|
||||
final_uri = (f"{self._image_agent_host}"
|
||||
f"{image_url}")
|
||||
async with httpx.AsyncClient(proxy=self.proxy) as client:
|
||||
try:
|
||||
response = await client.request("GET", final_uri, timeout=self.timeout)
|
||||
response.raise_for_status()
|
||||
if not response.reason_phrase == "OK":
|
||||
utils.logger.error(f"[WeiboClient.get_note_image] request {final_uri} err, res:{response.text}")
|
||||
return None
|
||||
else:
|
||||
return response.content
|
||||
except httpx.HTTPError as exc: # some wrong when call httpx.request method, such as connection error, client error, server error or response status code is not 2xx
|
||||
utils.logger.error(f"[DouYinClient.get_aweme_media] {exc.__class__.__name__} for {exc.request.url} - {exc}") # 保留原始异常类型名称,以便开发者调试
|
||||
return None
|
||||
|
||||
async def get_creator_container_info(self, creator_id: str) -> Dict:
|
||||
"""
|
||||
获取用户的容器ID, 容器信息代表着真实请求的API路径
|
||||
fid_container_id:用户的微博详情API的容器ID
|
||||
lfid_container_id:用户的微博列表API的容器ID
|
||||
Args:
|
||||
creator_id:
|
||||
|
||||
Returns: {
|
||||
|
||||
"""
|
||||
response = await self.get(f"/u/{creator_id}", return_response=True)
|
||||
m_weibocn_params = response.cookies.get("M_WEIBOCN_PARAMS")
|
||||
if not m_weibocn_params:
|
||||
raise DataFetchError("get containerid failed")
|
||||
m_weibocn_params_dict = parse_qs(unquote(m_weibocn_params))
|
||||
return {"fid_container_id": m_weibocn_params_dict.get("fid", [""])[0], "lfid_container_id": m_weibocn_params_dict.get("lfid", [""])[0]}
|
||||
|
||||
async def get_creator_info_by_id(self, creator_id: str) -> Dict:
|
||||
"""
|
||||
根据用户ID获取用户详情
|
||||
Args:
|
||||
creator_id:
|
||||
|
||||
Returns:
|
||||
|
||||
"""
|
||||
uri = "/api/container/getIndex"
|
||||
container_info = await self.get_creator_container_info(creator_id)
|
||||
if container_info.get("fid_container_id") == "" or container_info.get("lfid_container_id") == "":
|
||||
utils.logger.error(f"[WeiboClient.get_creator_info_by_id] get containerid failed")
|
||||
raise DataFetchError("get containerid failed")
|
||||
params = {
|
||||
"jumpfrom": "weibocom",
|
||||
"type": "uid",
|
||||
"value": creator_id,
|
||||
"containerid": container_info["fid_container_id"],
|
||||
}
|
||||
|
||||
user_res = await self.get(uri, params)
|
||||
|
||||
if user_res.get("tabsInfo"):
|
||||
tabs: List[Dict] = user_res.get("tabsInfo", {}).get("tabs", [])
|
||||
for tab in tabs:
|
||||
if tab.get("tabKey") == "weibo":
|
||||
container_info["lfid_container_id"] = tab.get("containerid")
|
||||
break
|
||||
|
||||
user_res.update(container_info)
|
||||
return user_res
|
||||
|
||||
async def get_notes_by_creator(
|
||||
self,
|
||||
creator: str,
|
||||
container_id: str,
|
||||
since_id: str = "0",
|
||||
) -> Dict:
|
||||
"""
|
||||
获取博主的笔记
|
||||
Args:
|
||||
creator: 博主ID
|
||||
container_id: 容器ID
|
||||
since_id: 上一页最后一条笔记的ID
|
||||
Returns:
|
||||
|
||||
"""
|
||||
|
||||
uri = "/api/container/getIndex"
|
||||
params = {
|
||||
"jumpfrom": "weibocom",
|
||||
"type": "uid",
|
||||
"value": creator,
|
||||
"containerid": container_id,
|
||||
"since_id": since_id,
|
||||
}
|
||||
return await self.get(uri, params)
|
||||
|
||||
async def get_all_notes_by_creator_id(
|
||||
self,
|
||||
creator_id: str,
|
||||
container_id: str,
|
||||
crawl_interval: float = 1.0,
|
||||
callback: Optional[Callable] = None,
|
||||
) -> List[Dict]:
|
||||
"""
|
||||
获取指定用户下的所有发过的帖子,该方法会一直查找一个用户下的所有帖子信息
|
||||
Args:
|
||||
creator_id:
|
||||
container_id:
|
||||
crawl_interval:
|
||||
callback:
|
||||
|
||||
Returns:
|
||||
|
||||
"""
|
||||
result = []
|
||||
notes_has_more = True
|
||||
since_id = ""
|
||||
crawler_total_count = 0
|
||||
while notes_has_more:
|
||||
notes_res = await self.get_notes_by_creator(creator_id, container_id, since_id)
|
||||
if not notes_res:
|
||||
utils.logger.error(f"[WeiboClient.get_notes_by_creator] The current creator may have been banned by xhs, so they cannot access the data.")
|
||||
break
|
||||
since_id = notes_res.get("cardlistInfo", {}).get("since_id", "0")
|
||||
if "cards" not in notes_res:
|
||||
utils.logger.info(f"[WeiboClient.get_all_notes_by_creator] No 'notes' key found in response: {notes_res}")
|
||||
break
|
||||
|
||||
notes = notes_res["cards"]
|
||||
utils.logger.info(f"[WeiboClient.get_all_notes_by_creator] got user_id:{creator_id} notes len : {len(notes)}")
|
||||
notes = [note for note in notes if note.get("card_type") == 9]
|
||||
if callback:
|
||||
await callback(notes)
|
||||
await asyncio.sleep(crawl_interval)
|
||||
result.extend(notes)
|
||||
crawler_total_count += 10
|
||||
notes_has_more = notes_res.get("cardlistInfo", {}).get("total", 0) > crawler_total_count
|
||||
return result
|
||||
@@ -0,0 +1,373 @@
|
||||
# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则:
|
||||
# 1. 不得用于任何商业用途。
|
||||
# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。
|
||||
# 3. 不得进行大规模爬取或对平台造成运营干扰。
|
||||
# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。
|
||||
# 5. 不得用于任何非法或不当的用途。
|
||||
#
|
||||
# 详细许可条款请参阅项目根目录下的LICENSE文件。
|
||||
# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。
|
||||
|
||||
# -*- coding: utf-8 -*-
|
||||
# @Author : relakkes@gmail.com
|
||||
# @Time : 2023/12/23 15:41
|
||||
# @Desc : 微博爬虫主流程代码
|
||||
|
||||
import asyncio
|
||||
import os
|
||||
import random
|
||||
from asyncio import Task
|
||||
from typing import Dict, List, Optional, Tuple
|
||||
|
||||
from playwright.async_api import (
|
||||
BrowserContext,
|
||||
BrowserType,
|
||||
Page,
|
||||
Playwright,
|
||||
async_playwright,
|
||||
)
|
||||
|
||||
import config
|
||||
from base.base_crawler import AbstractCrawler
|
||||
from proxy.proxy_ip_pool import IpInfoModel, create_ip_pool
|
||||
from store import weibo as weibo_store
|
||||
from tools import utils
|
||||
from tools.cdp_browser import CDPBrowserManager
|
||||
from var import crawler_type_var, source_keyword_var
|
||||
|
||||
from .client import WeiboClient
|
||||
from .exception import DataFetchError
|
||||
from .field import SearchType
|
||||
from .help import filter_search_result_card
|
||||
from .login import WeiboLogin
|
||||
|
||||
|
||||
class WeiboCrawler(AbstractCrawler):
|
||||
context_page: Page
|
||||
wb_client: WeiboClient
|
||||
browser_context: BrowserContext
|
||||
cdp_manager: Optional[CDPBrowserManager]
|
||||
|
||||
def __init__(self):
|
||||
self.index_url = "https://www.weibo.com"
|
||||
self.mobile_index_url = "https://m.weibo.cn"
|
||||
self.user_agent = utils.get_user_agent()
|
||||
self.mobile_user_agent = utils.get_mobile_user_agent()
|
||||
self.cdp_manager = None
|
||||
|
||||
async def start(self):
|
||||
playwright_proxy_format, httpx_proxy_format = None, None
|
||||
if config.ENABLE_IP_PROXY:
|
||||
ip_proxy_pool = await create_ip_pool(config.IP_PROXY_POOL_COUNT, enable_validate_ip=True)
|
||||
ip_proxy_info: IpInfoModel = await ip_proxy_pool.get_proxy()
|
||||
playwright_proxy_format, httpx_proxy_format = utils.format_proxy_info(ip_proxy_info)
|
||||
|
||||
async with async_playwright() as playwright:
|
||||
# 根据配置选择启动模式
|
||||
if config.ENABLE_CDP_MODE:
|
||||
utils.logger.info("[WeiboCrawler] 使用CDP模式启动浏览器")
|
||||
self.browser_context = await self.launch_browser_with_cdp(
|
||||
playwright,
|
||||
playwright_proxy_format,
|
||||
self.mobile_user_agent,
|
||||
headless=config.CDP_HEADLESS,
|
||||
)
|
||||
else:
|
||||
utils.logger.info("[WeiboCrawler] 使用标准模式启动浏览器")
|
||||
# Launch a browser context.
|
||||
chromium = playwright.chromium
|
||||
self.browser_context = await self.launch_browser(chromium, None, self.mobile_user_agent, headless=config.HEADLESS)
|
||||
# stealth.min.js is a js script to prevent the website from detecting the crawler.
|
||||
await self.browser_context.add_init_script(path="libs/stealth.min.js")
|
||||
self.context_page = await self.browser_context.new_page()
|
||||
await self.context_page.goto(self.mobile_index_url)
|
||||
|
||||
# Create a client to interact with the xiaohongshu website.
|
||||
self.wb_client = await self.create_weibo_client(httpx_proxy_format)
|
||||
if not await self.wb_client.pong():
|
||||
login_obj = WeiboLogin(
|
||||
login_type=config.LOGIN_TYPE,
|
||||
login_phone="", # your phone number
|
||||
browser_context=self.browser_context,
|
||||
context_page=self.context_page,
|
||||
cookie_str=config.COOKIES,
|
||||
)
|
||||
await login_obj.begin()
|
||||
|
||||
# 登录成功后重定向到手机端的网站,再更新手机端登录成功的cookie
|
||||
utils.logger.info("[WeiboCrawler.start] redirect weibo mobile homepage and update cookies on mobile platform")
|
||||
await self.context_page.goto(self.mobile_index_url)
|
||||
await asyncio.sleep(2)
|
||||
await self.wb_client.update_cookies(browser_context=self.browser_context)
|
||||
|
||||
crawler_type_var.set(config.CRAWLER_TYPE)
|
||||
if config.CRAWLER_TYPE == "search":
|
||||
# Search for video and retrieve their comment information.
|
||||
await self.search()
|
||||
elif config.CRAWLER_TYPE == "detail":
|
||||
# Get the information and comments of the specified post
|
||||
await self.get_specified_notes()
|
||||
elif config.CRAWLER_TYPE == "creator":
|
||||
# Get creator's information and their notes and comments
|
||||
await self.get_creators_and_notes()
|
||||
else:
|
||||
pass
|
||||
utils.logger.info("[WeiboCrawler.start] Weibo Crawler finished ...")
|
||||
|
||||
async def search(self):
|
||||
"""
|
||||
search weibo note with keywords
|
||||
:return:
|
||||
"""
|
||||
utils.logger.info("[WeiboCrawler.search] Begin search weibo keywords")
|
||||
weibo_limit_count = 10 # weibo limit page fixed value
|
||||
if config.CRAWLER_MAX_NOTES_COUNT < weibo_limit_count:
|
||||
config.CRAWLER_MAX_NOTES_COUNT = weibo_limit_count
|
||||
start_page = config.START_PAGE
|
||||
|
||||
# Set the search type based on the configuration for weibo
|
||||
if config.WEIBO_SEARCH_TYPE == "default":
|
||||
search_type = SearchType.DEFAULT
|
||||
elif config.WEIBO_SEARCH_TYPE == "real_time":
|
||||
search_type = SearchType.REAL_TIME
|
||||
elif config.WEIBO_SEARCH_TYPE == "popular":
|
||||
search_type = SearchType.POPULAR
|
||||
elif config.WEIBO_SEARCH_TYPE == "video":
|
||||
search_type = SearchType.VIDEO
|
||||
else:
|
||||
utils.logger.error(f"[WeiboCrawler.search] Invalid WEIBO_SEARCH_TYPE: {config.WEIBO_SEARCH_TYPE}")
|
||||
return
|
||||
|
||||
for keyword in config.KEYWORDS.split(","):
|
||||
source_keyword_var.set(keyword)
|
||||
utils.logger.info(f"[WeiboCrawler.search] Current search keyword: {keyword}")
|
||||
page = 1
|
||||
while (page - start_page + 1) * weibo_limit_count <= config.CRAWLER_MAX_NOTES_COUNT:
|
||||
if page < start_page:
|
||||
utils.logger.info(f"[WeiboCrawler.search] Skip page: {page}")
|
||||
page += 1
|
||||
continue
|
||||
utils.logger.info(f"[WeiboCrawler.search] search weibo keyword: {keyword}, page: {page}")
|
||||
search_res = await self.wb_client.get_note_by_keyword(keyword=keyword, page=page, search_type=search_type)
|
||||
note_id_list: List[str] = []
|
||||
note_list = filter_search_result_card(search_res.get("cards"))
|
||||
for note_item in note_list:
|
||||
if note_item:
|
||||
mblog: Dict = note_item.get("mblog")
|
||||
if mblog:
|
||||
note_id_list.append(mblog.get("id"))
|
||||
await weibo_store.update_weibo_note(note_item)
|
||||
await self.get_note_images(mblog)
|
||||
|
||||
page += 1
|
||||
await self.batch_get_notes_comments(note_id_list)
|
||||
|
||||
async def get_specified_notes(self):
|
||||
"""
|
||||
get specified notes info
|
||||
:return:
|
||||
"""
|
||||
semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
|
||||
task_list = [self.get_note_info_task(note_id=note_id, semaphore=semaphore) for note_id in config.WEIBO_SPECIFIED_ID_LIST]
|
||||
video_details = await asyncio.gather(*task_list)
|
||||
for note_item in video_details:
|
||||
if note_item:
|
||||
await weibo_store.update_weibo_note(note_item)
|
||||
await self.batch_get_notes_comments(config.WEIBO_SPECIFIED_ID_LIST)
|
||||
|
||||
async def get_note_info_task(self, note_id: str, semaphore: asyncio.Semaphore) -> Optional[Dict]:
|
||||
"""
|
||||
Get note detail task
|
||||
:param note_id:
|
||||
:param semaphore:
|
||||
:return:
|
||||
"""
|
||||
async with semaphore:
|
||||
try:
|
||||
result = await self.wb_client.get_note_info_by_id(note_id)
|
||||
return result
|
||||
except DataFetchError as ex:
|
||||
utils.logger.error(f"[WeiboCrawler.get_note_info_task] Get note detail error: {ex}")
|
||||
return None
|
||||
except KeyError as ex:
|
||||
utils.logger.error(f"[WeiboCrawler.get_note_info_task] have not fund note detail note_id:{note_id}, err: {ex}")
|
||||
return None
|
||||
|
||||
async def batch_get_notes_comments(self, note_id_list: List[str]):
|
||||
"""
|
||||
batch get notes comments
|
||||
:param note_id_list:
|
||||
:return:
|
||||
"""
|
||||
if not config.ENABLE_GET_COMMENTS:
|
||||
utils.logger.info(f"[WeiboCrawler.batch_get_note_comments] Crawling comment mode is not enabled")
|
||||
return
|
||||
|
||||
utils.logger.info(f"[WeiboCrawler.batch_get_notes_comments] note ids:{note_id_list}")
|
||||
semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
|
||||
task_list: List[Task] = []
|
||||
for note_id in note_id_list:
|
||||
task = asyncio.create_task(self.get_note_comments(note_id, semaphore), name=note_id)
|
||||
task_list.append(task)
|
||||
await asyncio.gather(*task_list)
|
||||
|
||||
async def get_note_comments(self, note_id: str, semaphore: asyncio.Semaphore):
|
||||
"""
|
||||
get comment for note id
|
||||
:param note_id:
|
||||
:param semaphore:
|
||||
:return:
|
||||
"""
|
||||
async with semaphore:
|
||||
try:
|
||||
utils.logger.info(f"[WeiboCrawler.get_note_comments] begin get note_id: {note_id} comments ...")
|
||||
await self.wb_client.get_note_all_comments(
|
||||
note_id=note_id,
|
||||
crawl_interval=random.randint(1, 3), # 微博对API的限流比较严重,所以延时提高一些
|
||||
callback=weibo_store.batch_update_weibo_note_comments,
|
||||
max_count=config.CRAWLER_MAX_COMMENTS_COUNT_SINGLENOTES,
|
||||
)
|
||||
except DataFetchError as ex:
|
||||
utils.logger.error(f"[WeiboCrawler.get_note_comments] get note_id: {note_id} comment error: {ex}")
|
||||
except Exception as e:
|
||||
utils.logger.error(f"[WeiboCrawler.get_note_comments] may be been blocked, err:{e}")
|
||||
|
||||
async def get_note_images(self, mblog: Dict):
|
||||
"""
|
||||
get note images
|
||||
:param mblog:
|
||||
:return:
|
||||
"""
|
||||
if not config.ENABLE_GET_MEIDAS:
|
||||
utils.logger.info(f"[WeiboCrawler.get_note_images] Crawling image mode is not enabled")
|
||||
return
|
||||
|
||||
pics: Dict = mblog.get("pics")
|
||||
if not pics:
|
||||
return
|
||||
for pic in pics:
|
||||
url = pic.get("url")
|
||||
if not url:
|
||||
continue
|
||||
content = await self.wb_client.get_note_image(url)
|
||||
await asyncio.sleep(random.random())
|
||||
if content != None:
|
||||
extension_file_name = url.split(".")[-1]
|
||||
await weibo_store.update_weibo_note_image(pic["pid"], content, extension_file_name)
|
||||
|
||||
async def get_creators_and_notes(self) -> None:
|
||||
"""
|
||||
Get creator's information and their notes and comments
|
||||
Returns:
|
||||
|
||||
"""
|
||||
utils.logger.info("[WeiboCrawler.get_creators_and_notes] Begin get weibo creators")
|
||||
for user_id in config.WEIBO_CREATOR_ID_LIST:
|
||||
createor_info_res: Dict = await self.wb_client.get_creator_info_by_id(creator_id=user_id)
|
||||
if createor_info_res:
|
||||
createor_info: Dict = createor_info_res.get("userInfo", {})
|
||||
utils.logger.info(f"[WeiboCrawler.get_creators_and_notes] creator info: {createor_info}")
|
||||
if not createor_info:
|
||||
raise DataFetchError("Get creator info error")
|
||||
await weibo_store.save_creator(user_id, user_info=createor_info)
|
||||
|
||||
# Get all note information of the creator
|
||||
all_notes_list = await self.wb_client.get_all_notes_by_creator_id(
|
||||
creator_id=user_id,
|
||||
container_id=createor_info_res.get("lfid_container_id"),
|
||||
crawl_interval=0,
|
||||
callback=weibo_store.batch_update_weibo_notes,
|
||||
)
|
||||
|
||||
note_ids = [note_item.get("mblog", {}).get("id") for note_item in all_notes_list if note_item.get("mblog", {}).get("id")]
|
||||
await self.batch_get_notes_comments(note_ids)
|
||||
|
||||
else:
|
||||
utils.logger.error(f"[WeiboCrawler.get_creators_and_notes] get creator info error, creator_id:{user_id}")
|
||||
|
||||
async def create_weibo_client(self, httpx_proxy: Optional[str]) -> WeiboClient:
|
||||
"""Create xhs client"""
|
||||
utils.logger.info("[WeiboCrawler.create_weibo_client] Begin create weibo API client ...")
|
||||
cookie_str, cookie_dict = utils.convert_cookies(await self.browser_context.cookies())
|
||||
weibo_client_obj = WeiboClient(
|
||||
proxy=httpx_proxy,
|
||||
headers={
|
||||
"User-Agent": utils.get_mobile_user_agent(),
|
||||
"Cookie": cookie_str,
|
||||
"Origin": "https://m.weibo.cn",
|
||||
"Referer": "https://m.weibo.cn",
|
||||
"Content-Type": "application/json;charset=UTF-8",
|
||||
},
|
||||
playwright_page=self.context_page,
|
||||
cookie_dict=cookie_dict,
|
||||
)
|
||||
return weibo_client_obj
|
||||
|
||||
async def launch_browser(
|
||||
self,
|
||||
chromium: BrowserType,
|
||||
playwright_proxy: Optional[Dict],
|
||||
user_agent: Optional[str],
|
||||
headless: bool = True,
|
||||
) -> BrowserContext:
|
||||
"""Launch browser and create browser context"""
|
||||
utils.logger.info("[WeiboCrawler.launch_browser] Begin create browser context ...")
|
||||
if config.SAVE_LOGIN_STATE:
|
||||
user_data_dir = os.path.join(os.getcwd(), "browser_data", config.USER_DATA_DIR % config.PLATFORM) # type: ignore
|
||||
browser_context = await chromium.launch_persistent_context(
|
||||
user_data_dir=user_data_dir,
|
||||
accept_downloads=True,
|
||||
headless=headless,
|
||||
proxy=playwright_proxy, # type: ignore
|
||||
viewport={
|
||||
"width": 1920,
|
||||
"height": 1080
|
||||
},
|
||||
user_agent=user_agent,
|
||||
)
|
||||
return browser_context
|
||||
else:
|
||||
browser = await chromium.launch(headless=headless, proxy=playwright_proxy) # type: ignore
|
||||
browser_context = await browser.new_context(viewport={"width": 1920, "height": 1080}, user_agent=user_agent)
|
||||
return browser_context
|
||||
|
||||
async def launch_browser_with_cdp(
|
||||
self,
|
||||
playwright: Playwright,
|
||||
playwright_proxy: Optional[Dict],
|
||||
user_agent: Optional[str],
|
||||
headless: bool = True,
|
||||
) -> BrowserContext:
|
||||
"""
|
||||
使用CDP模式启动浏览器
|
||||
"""
|
||||
try:
|
||||
self.cdp_manager = CDPBrowserManager()
|
||||
browser_context = await self.cdp_manager.launch_and_connect(
|
||||
playwright=playwright,
|
||||
playwright_proxy=playwright_proxy,
|
||||
user_agent=user_agent,
|
||||
headless=headless,
|
||||
)
|
||||
|
||||
# 显示浏览器信息
|
||||
browser_info = await self.cdp_manager.get_browser_info()
|
||||
utils.logger.info(f"[WeiboCrawler] CDP浏览器信息: {browser_info}")
|
||||
|
||||
return browser_context
|
||||
|
||||
except Exception as e:
|
||||
utils.logger.error(f"[WeiboCrawler] CDP模式启动失败,回退到标准模式: {e}")
|
||||
# 回退到标准模式
|
||||
chromium = playwright.chromium
|
||||
return await self.launch_browser(chromium, playwright_proxy, user_agent, headless)
|
||||
|
||||
async def close(self):
|
||||
"""Close browser context"""
|
||||
# 如果使用CDP模式,需要特殊处理
|
||||
if self.cdp_manager:
|
||||
await self.cdp_manager.cleanup()
|
||||
self.cdp_manager = None
|
||||
else:
|
||||
await self.browser_context.close()
|
||||
utils.logger.info("[WeiboCrawler.close] Browser context closed ...")
|
||||
@@ -0,0 +1,25 @@
|
||||
# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则:
|
||||
# 1. 不得用于任何商业用途。
|
||||
# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。
|
||||
# 3. 不得进行大规模爬取或对平台造成运营干扰。
|
||||
# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。
|
||||
# 5. 不得用于任何非法或不当的用途。
|
||||
#
|
||||
# 详细许可条款请参阅项目根目录下的LICENSE文件。
|
||||
# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。
|
||||
|
||||
|
||||
# -*- coding: utf-8 -*-
|
||||
# @Author : relakkes@gmail.com
|
||||
# @Time : 2023/12/2 18:44
|
||||
# @Desc :
|
||||
|
||||
from httpx import RequestError
|
||||
|
||||
|
||||
class DataFetchError(RequestError):
|
||||
"""something error when fetch"""
|
||||
|
||||
|
||||
class IPBlockError(RequestError):
|
||||
"""fetch so fast that the server block us ip"""
|
||||
@@ -0,0 +1,30 @@
|
||||
# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则:
|
||||
# 1. 不得用于任何商业用途。
|
||||
# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。
|
||||
# 3. 不得进行大规模爬取或对平台造成运营干扰。
|
||||
# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。
|
||||
# 5. 不得用于任何非法或不当的用途。
|
||||
#
|
||||
# 详细许可条款请参阅项目根目录下的LICENSE文件。
|
||||
# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。
|
||||
|
||||
|
||||
# -*- coding: utf-8 -*-
|
||||
# @Author : relakkes@gmail.com
|
||||
# @Time : 2023/12/23 15:41
|
||||
# @Desc :
|
||||
from enum import Enum
|
||||
|
||||
|
||||
class SearchType(Enum):
|
||||
# 综合
|
||||
DEFAULT = "1"
|
||||
|
||||
# 实时
|
||||
REAL_TIME = "61"
|
||||
|
||||
# 热门
|
||||
POPULAR = "60"
|
||||
|
||||
# 视频
|
||||
VIDEO = "64"
|
||||
@@ -0,0 +1,36 @@
|
||||
# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则:
|
||||
# 1. 不得用于任何商业用途。
|
||||
# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。
|
||||
# 3. 不得进行大规模爬取或对平台造成运营干扰。
|
||||
# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。
|
||||
# 5. 不得用于任何非法或不当的用途。
|
||||
#
|
||||
# 详细许可条款请参阅项目根目录下的LICENSE文件。
|
||||
# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。
|
||||
|
||||
|
||||
# -*- coding: utf-8 -*-
|
||||
# @Author : relakkes@gmail.com
|
||||
# @Time : 2023/12/24 17:37
|
||||
# @Desc :
|
||||
|
||||
from typing import Dict, List
|
||||
|
||||
|
||||
def filter_search_result_card(card_list: List[Dict]) -> List[Dict]:
|
||||
"""
|
||||
过滤微博搜索的结果,只保留card_type为9类型的数据
|
||||
:param card_list:
|
||||
:return:
|
||||
"""
|
||||
note_list: List[Dict] = []
|
||||
for card_item in card_list:
|
||||
if card_item.get("card_type") == 9:
|
||||
note_list.append(card_item)
|
||||
if len(card_item.get("card_group", [])) > 0:
|
||||
card_group = card_item.get("card_group")
|
||||
for card_group_item in card_group:
|
||||
if card_group_item.get("card_type") == 9:
|
||||
note_list.append(card_group_item)
|
||||
|
||||
return note_list
|
||||
@@ -0,0 +1,123 @@
|
||||
# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则:
|
||||
# 1. 不得用于任何商业用途。
|
||||
# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。
|
||||
# 3. 不得进行大规模爬取或对平台造成运营干扰。
|
||||
# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。
|
||||
# 5. 不得用于任何非法或不当的用途。
|
||||
#
|
||||
# 详细许可条款请参阅项目根目录下的LICENSE文件。
|
||||
# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。
|
||||
|
||||
|
||||
# -*- coding: utf-8 -*-
|
||||
# @Author : relakkes@gmail.com
|
||||
# @Time : 2023/12/23 15:42
|
||||
# @Desc : 微博登录实现
|
||||
|
||||
import asyncio
|
||||
import functools
|
||||
import sys
|
||||
from typing import Optional
|
||||
|
||||
from playwright.async_api import BrowserContext, Page
|
||||
from tenacity import (RetryError, retry, retry_if_result, stop_after_attempt,
|
||||
wait_fixed)
|
||||
|
||||
import config
|
||||
from base.base_crawler import AbstractLogin
|
||||
from tools import utils
|
||||
|
||||
|
||||
class WeiboLogin(AbstractLogin):
|
||||
def __init__(self,
|
||||
login_type: str,
|
||||
browser_context: BrowserContext,
|
||||
context_page: Page,
|
||||
login_phone: Optional[str] = "",
|
||||
cookie_str: str = ""
|
||||
):
|
||||
config.LOGIN_TYPE = login_type
|
||||
self.browser_context = browser_context
|
||||
self.context_page = context_page
|
||||
self.login_phone = login_phone
|
||||
self.cookie_str = cookie_str
|
||||
self.weibo_sso_login_url = "https://passport.weibo.com/sso/signin?entry=miniblog&source=miniblog"
|
||||
|
||||
async def begin(self):
|
||||
"""Start login weibo"""
|
||||
utils.logger.info("[WeiboLogin.begin] Begin login weibo ...")
|
||||
if config.LOGIN_TYPE == "qrcode":
|
||||
await self.login_by_qrcode()
|
||||
elif config.LOGIN_TYPE == "phone":
|
||||
await self.login_by_mobile()
|
||||
elif config.LOGIN_TYPE == "cookie":
|
||||
await self.login_by_cookies()
|
||||
else:
|
||||
raise ValueError(
|
||||
"[WeiboLogin.begin] Invalid Login Type Currently only supported qrcode or phone or cookie ...")
|
||||
|
||||
|
||||
@retry(stop=stop_after_attempt(600), wait=wait_fixed(1), retry=retry_if_result(lambda value: value is False))
|
||||
async def check_login_state(self, no_logged_in_session: str) -> bool:
|
||||
"""
|
||||
Check if the current login status is successful and return True otherwise return False
|
||||
retry decorator will retry 20 times if the return value is False, and the retry interval is 1 second
|
||||
if max retry times reached, raise RetryError
|
||||
"""
|
||||
current_cookie = await self.browser_context.cookies()
|
||||
_, cookie_dict = utils.convert_cookies(current_cookie)
|
||||
if cookie_dict.get("SSOLoginState"):
|
||||
return True
|
||||
current_web_session = cookie_dict.get("WBPSESS")
|
||||
if current_web_session != no_logged_in_session:
|
||||
return True
|
||||
return False
|
||||
|
||||
async def login_by_qrcode(self):
|
||||
"""login weibo website and keep webdriver login state"""
|
||||
utils.logger.info("[WeiboLogin.login_by_qrcode] Begin login weibo by qrcode ...")
|
||||
await self.context_page.goto(self.weibo_sso_login_url)
|
||||
# find login qrcode
|
||||
qrcode_img_selector = "xpath=//img[@class='w-full h-full']"
|
||||
base64_qrcode_img = await utils.find_login_qrcode(
|
||||
self.context_page,
|
||||
selector=qrcode_img_selector
|
||||
)
|
||||
if not base64_qrcode_img:
|
||||
utils.logger.info("[WeiboLogin.login_by_qrcode] login failed , have not found qrcode please check ....")
|
||||
sys.exit()
|
||||
|
||||
# show login qrcode
|
||||
partial_show_qrcode = functools.partial(utils.show_qrcode, base64_qrcode_img)
|
||||
asyncio.get_running_loop().run_in_executor(executor=None, func=partial_show_qrcode)
|
||||
|
||||
utils.logger.info(f"[WeiboLogin.login_by_qrcode] Waiting for scan code login, remaining time is 20s")
|
||||
|
||||
# get not logged session
|
||||
current_cookie = await self.browser_context.cookies()
|
||||
_, cookie_dict = utils.convert_cookies(current_cookie)
|
||||
no_logged_in_session = cookie_dict.get("WBPSESS")
|
||||
|
||||
try:
|
||||
await self.check_login_state(no_logged_in_session)
|
||||
except RetryError:
|
||||
utils.logger.info("[WeiboLogin.login_by_qrcode] Login weibo failed by qrcode login method ...")
|
||||
sys.exit()
|
||||
|
||||
wait_redirect_seconds = 5
|
||||
utils.logger.info(
|
||||
f"[WeiboLogin.login_by_qrcode] Login successful then wait for {wait_redirect_seconds} seconds redirect ...")
|
||||
await asyncio.sleep(wait_redirect_seconds)
|
||||
|
||||
async def login_by_mobile(self):
|
||||
pass
|
||||
|
||||
async def login_by_cookies(self):
|
||||
utils.logger.info("[WeiboLogin.login_by_qrcode] Begin login weibo by cookie ...")
|
||||
for key, value in utils.convert_str_cookie_to_dict(self.cookie_str).items():
|
||||
await self.browser_context.add_cookies([{
|
||||
'name': key,
|
||||
'value': value,
|
||||
'domain': ".weibo.cn",
|
||||
'path': "/"
|
||||
}])
|
||||
@@ -0,0 +1,13 @@
|
||||
# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则:
|
||||
# 1. 不得用于任何商业用途。
|
||||
# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。
|
||||
# 3. 不得进行大规模爬取或对平台造成运营干扰。
|
||||
# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。
|
||||
# 5. 不得用于任何非法或不当的用途。
|
||||
#
|
||||
# 详细许可条款请参阅项目根目录下的LICENSE文件。
|
||||
# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。
|
||||
|
||||
|
||||
from .core import XiaoHongShuCrawler
|
||||
from .field import *
|
||||
@@ -0,0 +1,592 @@
|
||||
# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则:
|
||||
# 1. 不得用于任何商业用途。
|
||||
# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。
|
||||
# 3. 不得进行大规模爬取或对平台造成运营干扰。
|
||||
# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。
|
||||
# 5. 不得用于任何非法或不当的用途。
|
||||
#
|
||||
# 详细许可条款请参阅项目根目录下的LICENSE文件。
|
||||
# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。
|
||||
|
||||
import asyncio
|
||||
import json
|
||||
import re
|
||||
from typing import Any, Callable, Dict, List, Optional, Union
|
||||
from urllib.parse import urlencode
|
||||
|
||||
import httpx
|
||||
from playwright.async_api import BrowserContext, Page
|
||||
from tenacity import retry, stop_after_attempt, wait_fixed, retry_if_result
|
||||
|
||||
import config
|
||||
from base.base_crawler import AbstractApiClient
|
||||
from tools import utils
|
||||
from html import unescape
|
||||
|
||||
from .exception import DataFetchError, IPBlockError
|
||||
from .field import SearchNoteType, SearchSortType
|
||||
from .help import get_search_id, sign
|
||||
|
||||
|
||||
class XiaoHongShuClient(AbstractApiClient):
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
timeout=60, # 若开启爬取媒体选项,xhs 的长视频需要更久的超时时间
|
||||
proxy=None,
|
||||
*,
|
||||
headers: Dict[str, str],
|
||||
playwright_page: Page,
|
||||
cookie_dict: Dict[str, str],
|
||||
):
|
||||
self.proxy = proxy
|
||||
self.timeout = timeout
|
||||
self.headers = headers
|
||||
self._host = "https://edith.xiaohongshu.com"
|
||||
self._domain = "https://www.xiaohongshu.com"
|
||||
self.IP_ERROR_STR = "网络连接异常,请检查网络设置或重启试试"
|
||||
self.IP_ERROR_CODE = 300012
|
||||
self.NOTE_ABNORMAL_STR = "笔记状态异常,请稍后查看"
|
||||
self.NOTE_ABNORMAL_CODE = -510001
|
||||
self.playwright_page = playwright_page
|
||||
self.cookie_dict = cookie_dict
|
||||
|
||||
async def _pre_headers(self, url: str, data=None) -> Dict:
|
||||
"""
|
||||
请求头参数签名
|
||||
Args:
|
||||
url:
|
||||
data:
|
||||
|
||||
Returns:
|
||||
|
||||
"""
|
||||
encrypt_params = await self.playwright_page.evaluate("([url, data]) => window._webmsxyw(url,data)", [url, data])
|
||||
local_storage = await self.playwright_page.evaluate("() => window.localStorage")
|
||||
signs = sign(
|
||||
a1=self.cookie_dict.get("a1", ""),
|
||||
b1=local_storage.get("b1", ""),
|
||||
x_s=encrypt_params.get("X-s", ""),
|
||||
x_t=str(encrypt_params.get("X-t", "")),
|
||||
)
|
||||
|
||||
headers = {
|
||||
"X-S": signs["x-s"],
|
||||
"X-T": signs["x-t"],
|
||||
"x-S-Common": signs["x-s-common"],
|
||||
"X-B3-Traceid": signs["x-b3-traceid"],
|
||||
}
|
||||
self.headers.update(headers)
|
||||
return self.headers
|
||||
|
||||
@retry(stop=stop_after_attempt(3), wait=wait_fixed(1))
|
||||
async def request(self, method, url, **kwargs) -> Union[str, Any]:
|
||||
"""
|
||||
封装httpx的公共请求方法,对请求响应做一些处理
|
||||
Args:
|
||||
method: 请求方法
|
||||
url: 请求的URL
|
||||
**kwargs: 其他请求参数,例如请求头、请求体等
|
||||
|
||||
Returns:
|
||||
|
||||
"""
|
||||
# return response.text
|
||||
return_response = kwargs.pop("return_response", False)
|
||||
async with httpx.AsyncClient(proxy=self.proxy) as client:
|
||||
response = await client.request(method, url, timeout=self.timeout, **kwargs)
|
||||
|
||||
if response.status_code == 471 or response.status_code == 461:
|
||||
# someday someone maybe will bypass captcha
|
||||
verify_type = response.headers["Verifytype"]
|
||||
verify_uuid = response.headers["Verifyuuid"]
|
||||
msg = f"出现验证码,请求失败,Verifytype: {verify_type},Verifyuuid: {verify_uuid}, Response: {response}"
|
||||
utils.logger.error(msg)
|
||||
raise Exception(msg)
|
||||
|
||||
if return_response:
|
||||
return response.text
|
||||
data: Dict = response.json()
|
||||
if data["success"]:
|
||||
return data.get("data", data.get("success", {}))
|
||||
elif data["code"] == self.IP_ERROR_CODE:
|
||||
raise IPBlockError(self.IP_ERROR_STR)
|
||||
else:
|
||||
raise DataFetchError(data.get("msg", None))
|
||||
|
||||
async def get(self, uri: str, params=None) -> Dict:
|
||||
"""
|
||||
GET请求,对请求头签名
|
||||
Args:
|
||||
uri: 请求路由
|
||||
params: 请求参数
|
||||
|
||||
Returns:
|
||||
|
||||
"""
|
||||
final_uri = uri
|
||||
if isinstance(params, dict):
|
||||
final_uri = f"{uri}?" f"{urlencode(params)}"
|
||||
headers = await self._pre_headers(final_uri)
|
||||
return await self.request(method="GET", url=f"{self._host}{final_uri}", headers=headers)
|
||||
|
||||
async def post(self, uri: str, data: dict, **kwargs) -> Dict:
|
||||
"""
|
||||
POST请求,对请求头签名
|
||||
Args:
|
||||
uri: 请求路由
|
||||
data: 请求体参数
|
||||
|
||||
Returns:
|
||||
|
||||
"""
|
||||
headers = await self._pre_headers(uri, data)
|
||||
json_str = json.dumps(data, separators=(",", ":"), ensure_ascii=False)
|
||||
return await self.request(
|
||||
method="POST",
|
||||
url=f"{self._host}{uri}",
|
||||
data=json_str,
|
||||
headers=headers,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
async def get_note_media(self, url: str) -> Union[bytes, None]:
|
||||
async with httpx.AsyncClient(proxy=self.proxy) as client:
|
||||
try:
|
||||
response = await client.request("GET", url, timeout=self.timeout)
|
||||
response.raise_for_status()
|
||||
if not response.reason_phrase == "OK":
|
||||
utils.logger.error(f"[XiaoHongShuClient.get_note_media] request {url} err, res:{response.text}")
|
||||
return None
|
||||
else:
|
||||
return response.content
|
||||
except httpx.HTTPError as exc: # some wrong when call httpx.request method, such as connection error, client error, server error or response status code is not 2xx
|
||||
utils.logger.error(f"[DouYinClient.get_aweme_media] {exc.__class__.__name__} for {exc.request.url} - {exc}") # 保留原始异常类型名称,以便开发者调试
|
||||
return None
|
||||
|
||||
async def pong(self) -> bool:
|
||||
"""
|
||||
用于检查登录态是否失效了
|
||||
Returns:
|
||||
|
||||
"""
|
||||
"""get a note to check if login state is ok"""
|
||||
utils.logger.info("[XiaoHongShuClient.pong] Begin to pong xhs...")
|
||||
ping_flag = False
|
||||
try:
|
||||
note_card: Dict = await self.get_note_by_keyword(keyword="小红书")
|
||||
if note_card.get("items"):
|
||||
ping_flag = True
|
||||
except Exception as e:
|
||||
utils.logger.error(f"[XiaoHongShuClient.pong] Ping xhs failed: {e}, and try to login again...")
|
||||
ping_flag = False
|
||||
return ping_flag
|
||||
|
||||
async def update_cookies(self, browser_context: BrowserContext):
|
||||
"""
|
||||
API客户端提供的更新cookies方法,一般情况下登录成功后会调用此方法
|
||||
Args:
|
||||
browser_context: 浏览器上下文对象
|
||||
|
||||
Returns:
|
||||
|
||||
"""
|
||||
cookie_str, cookie_dict = utils.convert_cookies(await browser_context.cookies())
|
||||
self.headers["Cookie"] = cookie_str
|
||||
self.cookie_dict = cookie_dict
|
||||
|
||||
async def get_note_by_keyword(
|
||||
self,
|
||||
keyword: str,
|
||||
search_id: str = get_search_id(),
|
||||
page: int = 1,
|
||||
page_size: int = 20,
|
||||
sort: SearchSortType = SearchSortType.GENERAL,
|
||||
note_type: SearchNoteType = SearchNoteType.ALL,
|
||||
) -> Dict:
|
||||
"""
|
||||
根据关键词搜索笔记
|
||||
Args:
|
||||
keyword: 关键词参数
|
||||
page: 分页第几页
|
||||
page_size: 分页数据长度
|
||||
sort: 搜索结果排序指定
|
||||
note_type: 搜索的笔记类型
|
||||
|
||||
Returns:
|
||||
|
||||
"""
|
||||
uri = "/api/sns/web/v1/search/notes"
|
||||
data = {
|
||||
"keyword": keyword,
|
||||
"page": page,
|
||||
"page_size": page_size,
|
||||
"search_id": search_id,
|
||||
"sort": sort.value,
|
||||
"note_type": note_type.value,
|
||||
}
|
||||
return await self.post(uri, data)
|
||||
|
||||
async def get_note_by_id(
|
||||
self,
|
||||
note_id: str,
|
||||
xsec_source: str,
|
||||
xsec_token: str,
|
||||
) -> Dict:
|
||||
"""
|
||||
获取笔记详情API
|
||||
Args:
|
||||
note_id:笔记ID
|
||||
xsec_source: 渠道来源
|
||||
xsec_token: 搜索关键字之后返回的比较列表中返回的token
|
||||
|
||||
Returns:
|
||||
|
||||
"""
|
||||
if xsec_source == "":
|
||||
xsec_source = "pc_search"
|
||||
|
||||
data = {
|
||||
"source_note_id": note_id,
|
||||
"image_formats": ["jpg", "webp", "avif"],
|
||||
"extra": {
|
||||
"need_body_topic": 1
|
||||
},
|
||||
"xsec_source": xsec_source,
|
||||
"xsec_token": xsec_token,
|
||||
}
|
||||
uri = "/api/sns/web/v1/feed"
|
||||
res = await self.post(uri, data)
|
||||
if res and res.get("items"):
|
||||
res_dict: Dict = res["items"][0]["note_card"]
|
||||
return res_dict
|
||||
# 爬取频繁了可能会出现有的笔记能有结果有的没有
|
||||
utils.logger.error(f"[XiaoHongShuClient.get_note_by_id] get note id:{note_id} empty and res:{res}")
|
||||
return dict()
|
||||
|
||||
async def get_note_comments(
|
||||
self,
|
||||
note_id: str,
|
||||
xsec_token: str,
|
||||
cursor: str = "",
|
||||
) -> Dict:
|
||||
"""
|
||||
获取一级评论的API
|
||||
Args:
|
||||
note_id: 笔记ID
|
||||
xsec_token: 验证token
|
||||
cursor: 分页游标
|
||||
|
||||
Returns:
|
||||
|
||||
"""
|
||||
uri = "/api/sns/web/v2/comment/page"
|
||||
params = {
|
||||
"note_id": note_id,
|
||||
"cursor": cursor,
|
||||
"top_comment_id": "",
|
||||
"image_formats": "jpg,webp,avif",
|
||||
"xsec_token": xsec_token,
|
||||
}
|
||||
return await self.get(uri, params)
|
||||
|
||||
async def get_note_sub_comments(
|
||||
self,
|
||||
note_id: str,
|
||||
root_comment_id: str,
|
||||
xsec_token: str,
|
||||
num: int = 10,
|
||||
cursor: str = "",
|
||||
):
|
||||
"""
|
||||
获取指定父评论下的子评论的API
|
||||
Args:
|
||||
note_id: 子评论的帖子ID
|
||||
root_comment_id: 根评论ID
|
||||
xsec_token: 验证token
|
||||
num: 分页数量
|
||||
cursor: 分页游标
|
||||
|
||||
Returns:
|
||||
|
||||
"""
|
||||
uri = "/api/sns/web/v2/comment/sub/page"
|
||||
params = {
|
||||
"note_id": note_id,
|
||||
"root_comment_id": root_comment_id,
|
||||
"num": num,
|
||||
"cursor": cursor,
|
||||
"image_formats": "jpg,webp,avif",
|
||||
"top_comment_id": "",
|
||||
"xsec_token": xsec_token,
|
||||
}
|
||||
return await self.get(uri, params)
|
||||
|
||||
async def get_note_all_comments(
|
||||
self,
|
||||
note_id: str,
|
||||
xsec_token: str,
|
||||
crawl_interval: float = 1.0,
|
||||
callback: Optional[Callable] = None,
|
||||
max_count: int = 10,
|
||||
) -> List[Dict]:
|
||||
"""
|
||||
获取指定笔记下的所有一级评论,该方法会一直查找一个帖子下的所有评论信息
|
||||
Args:
|
||||
note_id: 笔记ID
|
||||
xsec_token: 验证token
|
||||
crawl_interval: 爬取一次笔记的延迟单位(秒)
|
||||
callback: 一次笔记爬取结束后
|
||||
max_count: 一次笔记爬取的最大评论数量
|
||||
Returns:
|
||||
|
||||
"""
|
||||
result = []
|
||||
comments_has_more = True
|
||||
comments_cursor = ""
|
||||
while comments_has_more and len(result) < max_count:
|
||||
comments_res = await self.get_note_comments(note_id=note_id, xsec_token=xsec_token, cursor=comments_cursor)
|
||||
comments_has_more = comments_res.get("has_more", False)
|
||||
comments_cursor = comments_res.get("cursor", "")
|
||||
if "comments" not in comments_res:
|
||||
utils.logger.info(f"[XiaoHongShuClient.get_note_all_comments] No 'comments' key found in response: {comments_res}")
|
||||
break
|
||||
comments = comments_res["comments"]
|
||||
if len(result) + len(comments) > max_count:
|
||||
comments = comments[:max_count - len(result)]
|
||||
if callback:
|
||||
await callback(note_id, comments)
|
||||
await asyncio.sleep(crawl_interval)
|
||||
result.extend(comments)
|
||||
sub_comments = await self.get_comments_all_sub_comments(
|
||||
comments=comments,
|
||||
xsec_token=xsec_token,
|
||||
crawl_interval=crawl_interval,
|
||||
callback=callback,
|
||||
)
|
||||
result.extend(sub_comments)
|
||||
return result
|
||||
|
||||
async def get_comments_all_sub_comments(
|
||||
self,
|
||||
comments: List[Dict],
|
||||
xsec_token: str,
|
||||
crawl_interval: float = 1.0,
|
||||
callback: Optional[Callable] = None,
|
||||
) -> List[Dict]:
|
||||
"""
|
||||
获取指定一级评论下的所有二级评论, 该方法会一直查找一级评论下的所有二级评论信息
|
||||
Args:
|
||||
comments: 评论列表
|
||||
xsec_token: 验证token
|
||||
crawl_interval: 爬取一次评论的延迟单位(秒)
|
||||
callback: 一次评论爬取结束后
|
||||
|
||||
Returns:
|
||||
|
||||
"""
|
||||
if not config.ENABLE_GET_SUB_COMMENTS:
|
||||
utils.logger.info(f"[XiaoHongShuCrawler.get_comments_all_sub_comments] Crawling sub_comment mode is not enabled")
|
||||
return []
|
||||
|
||||
result = []
|
||||
for comment in comments:
|
||||
note_id = comment.get("note_id")
|
||||
sub_comments = comment.get("sub_comments")
|
||||
if sub_comments and callback:
|
||||
await callback(note_id, sub_comments)
|
||||
|
||||
sub_comment_has_more = comment.get("sub_comment_has_more")
|
||||
if not sub_comment_has_more:
|
||||
continue
|
||||
|
||||
root_comment_id = comment.get("id")
|
||||
sub_comment_cursor = comment.get("sub_comment_cursor")
|
||||
|
||||
while sub_comment_has_more:
|
||||
comments_res = await self.get_note_sub_comments(
|
||||
note_id=note_id,
|
||||
root_comment_id=root_comment_id,
|
||||
xsec_token=xsec_token,
|
||||
num=10,
|
||||
cursor=sub_comment_cursor,
|
||||
)
|
||||
|
||||
if comments_res is None:
|
||||
utils.logger.info(f"[XiaoHongShuClient.get_comments_all_sub_comments] No response found for note_id: {note_id}")
|
||||
continue
|
||||
sub_comment_has_more = comments_res.get("has_more", False)
|
||||
sub_comment_cursor = comments_res.get("cursor", "")
|
||||
if "comments" not in comments_res:
|
||||
utils.logger.info(f"[XiaoHongShuClient.get_comments_all_sub_comments] No 'comments' key found in response: {comments_res}")
|
||||
break
|
||||
comments = comments_res["comments"]
|
||||
if callback:
|
||||
await callback(note_id, comments)
|
||||
await asyncio.sleep(crawl_interval)
|
||||
result.extend(comments)
|
||||
return result
|
||||
|
||||
async def get_creator_info(self, user_id: str) -> Dict:
|
||||
"""
|
||||
通过解析网页版的用户主页HTML,获取用户个人简要信息
|
||||
PC端用户主页的网页存在window.__INITIAL_STATE__这个变量上的,解析它即可
|
||||
eg: https://www.xiaohongshu.com/user/profile/59d8cb33de5fb4696bf17217
|
||||
"""
|
||||
uri = f"/user/profile/{user_id}"
|
||||
html_content = await self.request("GET", self._domain + uri, return_response=True, headers=self.headers)
|
||||
match = re.search(r"<script>window.__INITIAL_STATE__=(.+)<\/script>", html_content, re.M)
|
||||
|
||||
if match is None:
|
||||
return {}
|
||||
|
||||
info = json.loads(match.group(1).replace(":undefined", ":null"), strict=False)
|
||||
if info is None:
|
||||
return {}
|
||||
return info.get("user").get("userPageData")
|
||||
|
||||
async def get_notes_by_creator(
|
||||
self,
|
||||
creator: str,
|
||||
cursor: str,
|
||||
page_size: int = 30,
|
||||
) -> Dict:
|
||||
"""
|
||||
获取博主的笔记
|
||||
Args:
|
||||
creator: 博主ID
|
||||
cursor: 上一页最后一条笔记的ID
|
||||
page_size: 分页数据长度
|
||||
|
||||
Returns:
|
||||
|
||||
"""
|
||||
uri = "/api/sns/web/v1/user_posted"
|
||||
data = {
|
||||
"user_id": creator,
|
||||
"cursor": cursor,
|
||||
"num": page_size,
|
||||
"image_formats": "jpg,webp,avif",
|
||||
}
|
||||
return await self.get(uri, data)
|
||||
|
||||
async def get_all_notes_by_creator(
|
||||
self,
|
||||
user_id: str,
|
||||
crawl_interval: float = 1.0,
|
||||
callback: Optional[Callable] = None,
|
||||
) -> List[Dict]:
|
||||
"""
|
||||
获取指定用户下的所有发过的帖子,该方法会一直查找一个用户下的所有帖子信息
|
||||
Args:
|
||||
user_id: 用户ID
|
||||
crawl_interval: 爬取一次的延迟单位(秒)
|
||||
callback: 一次分页爬取结束后的更新回调函数
|
||||
|
||||
Returns:
|
||||
|
||||
"""
|
||||
result = []
|
||||
notes_has_more = True
|
||||
notes_cursor = ""
|
||||
while notes_has_more and len(result) < config.CRAWLER_MAX_NOTES_COUNT:
|
||||
notes_res = await self.get_notes_by_creator(user_id, notes_cursor)
|
||||
if not notes_res:
|
||||
utils.logger.error(f"[XiaoHongShuClient.get_notes_by_creator] The current creator may have been banned by xhs, so they cannot access the data.")
|
||||
break
|
||||
|
||||
notes_has_more = notes_res.get("has_more", False)
|
||||
notes_cursor = notes_res.get("cursor", "")
|
||||
if "notes" not in notes_res:
|
||||
utils.logger.info(f"[XiaoHongShuClient.get_all_notes_by_creator] No 'notes' key found in response: {notes_res}")
|
||||
break
|
||||
|
||||
notes = notes_res["notes"]
|
||||
utils.logger.info(f"[XiaoHongShuClient.get_all_notes_by_creator] got user_id:{user_id} notes len : {len(notes)}")
|
||||
|
||||
remaining = config.CRAWLER_MAX_NOTES_COUNT - len(result)
|
||||
if remaining <= 0:
|
||||
break
|
||||
|
||||
notes_to_add = notes[:remaining]
|
||||
if callback:
|
||||
await callback(notes_to_add)
|
||||
|
||||
result.extend(notes_to_add)
|
||||
await asyncio.sleep(crawl_interval)
|
||||
|
||||
utils.logger.info(f"[XiaoHongShuClient.get_all_notes_by_creator] Finished getting notes for user {user_id}, total: {len(result)}")
|
||||
return result
|
||||
|
||||
async def get_note_short_url(self, note_id: str) -> Dict:
|
||||
"""
|
||||
获取笔记的短链接
|
||||
Args:
|
||||
note_id: 笔记ID
|
||||
|
||||
Returns:
|
||||
|
||||
"""
|
||||
uri = f"/api/sns/web/short_url"
|
||||
data = {"original_url": f"{self._domain}/discovery/item/{note_id}"}
|
||||
return await self.post(uri, data=data, return_response=True)
|
||||
|
||||
@retry(stop=stop_after_attempt(3), wait=wait_fixed(1))
|
||||
async def get_note_by_id_from_html(
|
||||
self,
|
||||
note_id: str,
|
||||
xsec_source: str,
|
||||
xsec_token: str,
|
||||
enable_cookie: bool = False,
|
||||
) -> Optional[Dict]:
|
||||
"""
|
||||
通过解析网页版的笔记详情页HTML,获取笔记详情, 该接口可能会出现失败的情况,这里尝试重试3次
|
||||
copy from https://github.com/ReaJason/xhs/blob/eb1c5a0213f6fbb592f0a2897ee552847c69ea2d/xhs/core.py#L217-L259
|
||||
thanks for ReaJason
|
||||
Args:
|
||||
note_id:
|
||||
xsec_source:
|
||||
xsec_token:
|
||||
enable_cookie:
|
||||
|
||||
Returns:
|
||||
|
||||
"""
|
||||
|
||||
def camel_to_underscore(key):
|
||||
return re.sub(r"(?<!^)(?=[A-Z])", "_", key).lower()
|
||||
|
||||
def transform_json_keys(json_data):
|
||||
data_dict = json.loads(json_data)
|
||||
dict_new = {}
|
||||
for key, value in data_dict.items():
|
||||
new_key = camel_to_underscore(key)
|
||||
if not value:
|
||||
dict_new[new_key] = value
|
||||
elif isinstance(value, dict):
|
||||
dict_new[new_key] = transform_json_keys(json.dumps(value))
|
||||
elif isinstance(value, list):
|
||||
dict_new[new_key] = [(transform_json_keys(json.dumps(item)) if (item and isinstance(item, dict)) else item) for item in value]
|
||||
else:
|
||||
dict_new[new_key] = value
|
||||
return dict_new
|
||||
|
||||
url = ("https://www.xiaohongshu.com/explore/" + note_id + f"?xsec_token={xsec_token}&xsec_source={xsec_source}")
|
||||
copy_headers = self.headers.copy()
|
||||
if not enable_cookie:
|
||||
del copy_headers["Cookie"]
|
||||
|
||||
html = await self.request(method="GET", url=url, return_response=True, headers=copy_headers)
|
||||
|
||||
def get_note_dict(html):
|
||||
state = re.findall(r"window.__INITIAL_STATE__=({.*})</script>", html)[0].replace("undefined", '""')
|
||||
|
||||
if state != "{}":
|
||||
note_dict = transform_json_keys(state)
|
||||
return note_dict["note"]["note_detail_map"][note_id]["note"]
|
||||
return {}
|
||||
|
||||
try:
|
||||
return get_note_dict(html)
|
||||
except:
|
||||
return None
|
||||
@@ -0,0 +1,485 @@
|
||||
# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则:
|
||||
# 1. 不得用于任何商业用途。
|
||||
# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。
|
||||
# 3. 不得进行大规模爬取或对平台造成运营干扰。
|
||||
# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。
|
||||
# 5. 不得用于任何非法或不当的用途。
|
||||
#
|
||||
# 详细许可条款请参阅项目根目录下的LICENSE文件。
|
||||
# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。
|
||||
|
||||
import asyncio
|
||||
import os
|
||||
import random
|
||||
import time
|
||||
from asyncio import Task
|
||||
from typing import Dict, List, Optional, Tuple
|
||||
|
||||
from playwright.async_api import (
|
||||
BrowserContext,
|
||||
BrowserType,
|
||||
Page,
|
||||
Playwright,
|
||||
async_playwright,
|
||||
)
|
||||
from tenacity import RetryError
|
||||
|
||||
import config
|
||||
from base.base_crawler import AbstractCrawler
|
||||
from config import CRAWLER_MAX_COMMENTS_COUNT_SINGLENOTES
|
||||
from model.m_xiaohongshu import NoteUrlInfo
|
||||
from proxy.proxy_ip_pool import IpInfoModel, create_ip_pool
|
||||
from store import xhs as xhs_store
|
||||
from tools import utils
|
||||
from tools.cdp_browser import CDPBrowserManager
|
||||
from var import crawler_type_var, source_keyword_var
|
||||
|
||||
from .client import XiaoHongShuClient
|
||||
from .exception import DataFetchError
|
||||
from .field import SearchSortType
|
||||
from .help import parse_note_info_from_note_url, get_search_id
|
||||
from .login import XiaoHongShuLogin
|
||||
|
||||
|
||||
class XiaoHongShuCrawler(AbstractCrawler):
|
||||
context_page: Page
|
||||
xhs_client: XiaoHongShuClient
|
||||
browser_context: BrowserContext
|
||||
cdp_manager: Optional[CDPBrowserManager]
|
||||
|
||||
def __init__(self) -> None:
|
||||
self.index_url = "https://www.xiaohongshu.com"
|
||||
# self.user_agent = utils.get_user_agent()
|
||||
self.user_agent = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36"
|
||||
self.cdp_manager = None
|
||||
|
||||
async def start(self) -> None:
|
||||
playwright_proxy_format, httpx_proxy_format = None, None
|
||||
if config.ENABLE_IP_PROXY:
|
||||
ip_proxy_pool = await create_ip_pool(config.IP_PROXY_POOL_COUNT, enable_validate_ip=True)
|
||||
ip_proxy_info: IpInfoModel = await ip_proxy_pool.get_proxy()
|
||||
playwright_proxy_format, httpx_proxy_format = utils.format_proxy_info(ip_proxy_info)
|
||||
|
||||
async with async_playwright() as playwright:
|
||||
# 根据配置选择启动模式
|
||||
if config.ENABLE_CDP_MODE:
|
||||
utils.logger.info("[XiaoHongShuCrawler] 使用CDP模式启动浏览器")
|
||||
self.browser_context = await self.launch_browser_with_cdp(
|
||||
playwright,
|
||||
playwright_proxy_format,
|
||||
self.user_agent,
|
||||
headless=config.CDP_HEADLESS,
|
||||
)
|
||||
else:
|
||||
utils.logger.info("[XiaoHongShuCrawler] 使用标准模式启动浏览器")
|
||||
# Launch a browser context.
|
||||
chromium = playwright.chromium
|
||||
self.browser_context = await self.launch_browser(
|
||||
chromium,
|
||||
playwright_proxy_format,
|
||||
self.user_agent,
|
||||
headless=config.HEADLESS,
|
||||
)
|
||||
# stealth.min.js is a js script to prevent the website from detecting the crawler.
|
||||
await self.browser_context.add_init_script(path="libs/stealth.min.js")
|
||||
self.context_page = await self.browser_context.new_page()
|
||||
await self.context_page.goto(self.index_url)
|
||||
|
||||
# Create a client to interact with the xiaohongshu website.
|
||||
self.xhs_client = await self.create_xhs_client(httpx_proxy_format)
|
||||
if not await self.xhs_client.pong():
|
||||
login_obj = XiaoHongShuLogin(
|
||||
login_type=config.LOGIN_TYPE,
|
||||
login_phone="", # input your phone number
|
||||
browser_context=self.browser_context,
|
||||
context_page=self.context_page,
|
||||
cookie_str=config.COOKIES,
|
||||
)
|
||||
await login_obj.begin()
|
||||
await self.xhs_client.update_cookies(browser_context=self.browser_context)
|
||||
|
||||
crawler_type_var.set(config.CRAWLER_TYPE)
|
||||
if config.CRAWLER_TYPE == "search":
|
||||
# Search for notes and retrieve their comment information.
|
||||
await self.search()
|
||||
elif config.CRAWLER_TYPE == "detail":
|
||||
# Get the information and comments of the specified post
|
||||
await self.get_specified_notes()
|
||||
elif config.CRAWLER_TYPE == "creator":
|
||||
# Get creator's information and their notes and comments
|
||||
await self.get_creators_and_notes()
|
||||
else:
|
||||
pass
|
||||
|
||||
utils.logger.info("[XiaoHongShuCrawler.start] Xhs Crawler finished ...")
|
||||
|
||||
async def search(self) -> None:
|
||||
"""Search for notes and retrieve their comment information."""
|
||||
utils.logger.info("[XiaoHongShuCrawler.search] Begin search xiaohongshu keywords")
|
||||
xhs_limit_count = 20 # xhs limit page fixed value
|
||||
if config.CRAWLER_MAX_NOTES_COUNT < xhs_limit_count:
|
||||
config.CRAWLER_MAX_NOTES_COUNT = xhs_limit_count
|
||||
start_page = config.START_PAGE
|
||||
for keyword in config.KEYWORDS.split(","):
|
||||
source_keyword_var.set(keyword)
|
||||
utils.logger.info(f"[XiaoHongShuCrawler.search] Current search keyword: {keyword}")
|
||||
page = 1
|
||||
search_id = get_search_id()
|
||||
while (page - start_page + 1) * xhs_limit_count <= config.CRAWLER_MAX_NOTES_COUNT:
|
||||
if page < start_page:
|
||||
utils.logger.info(f"[XiaoHongShuCrawler.search] Skip page {page}")
|
||||
page += 1
|
||||
continue
|
||||
|
||||
try:
|
||||
utils.logger.info(f"[XiaoHongShuCrawler.search] search xhs keyword: {keyword}, page: {page}")
|
||||
note_ids: List[str] = []
|
||||
xsec_tokens: List[str] = []
|
||||
notes_res = await self.xhs_client.get_note_by_keyword(
|
||||
keyword=keyword,
|
||||
search_id=search_id,
|
||||
page=page,
|
||||
sort=(SearchSortType(config.SORT_TYPE) if config.SORT_TYPE != "" else SearchSortType.GENERAL),
|
||||
)
|
||||
utils.logger.info(f"[XiaoHongShuCrawler.search] Search notes res:{notes_res}")
|
||||
if not notes_res or not notes_res.get("has_more", False):
|
||||
utils.logger.info("No more content!")
|
||||
break
|
||||
semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
|
||||
task_list = [
|
||||
self.get_note_detail_async_task(
|
||||
note_id=post_item.get("id"),
|
||||
xsec_source=post_item.get("xsec_source"),
|
||||
xsec_token=post_item.get("xsec_token"),
|
||||
semaphore=semaphore,
|
||||
) for post_item in notes_res.get("items", {}) if post_item.get("model_type") not in ("rec_query", "hot_query")
|
||||
]
|
||||
note_details = await asyncio.gather(*task_list)
|
||||
for note_detail in note_details:
|
||||
if note_detail:
|
||||
await xhs_store.update_xhs_note(note_detail)
|
||||
await self.get_notice_media(note_detail)
|
||||
note_ids.append(note_detail.get("note_id"))
|
||||
xsec_tokens.append(note_detail.get("xsec_token"))
|
||||
page += 1
|
||||
utils.logger.info(f"[XiaoHongShuCrawler.search] Note details: {note_details}")
|
||||
await self.batch_get_note_comments(note_ids, xsec_tokens)
|
||||
except DataFetchError:
|
||||
utils.logger.error("[XiaoHongShuCrawler.search] Get note detail error")
|
||||
break
|
||||
|
||||
async def get_creators_and_notes(self) -> None:
|
||||
"""Get creator's notes and retrieve their comment information."""
|
||||
utils.logger.info("[XiaoHongShuCrawler.get_creators_and_notes] Begin get xiaohongshu creators")
|
||||
for user_id in config.XHS_CREATOR_ID_LIST:
|
||||
# get creator detail info from web html content
|
||||
createor_info: Dict = await self.xhs_client.get_creator_info(user_id=user_id)
|
||||
if createor_info:
|
||||
await xhs_store.save_creator(user_id, creator=createor_info)
|
||||
|
||||
# When proxy is not enabled, increase the crawling interval
|
||||
if config.ENABLE_IP_PROXY:
|
||||
crawl_interval = random.random()
|
||||
else:
|
||||
crawl_interval = random.uniform(1, config.CRAWLER_MAX_SLEEP_SEC)
|
||||
# Get all note information of the creator
|
||||
all_notes_list = await self.xhs_client.get_all_notes_by_creator(
|
||||
user_id=user_id,
|
||||
crawl_interval=crawl_interval,
|
||||
callback=self.fetch_creator_notes_detail,
|
||||
)
|
||||
|
||||
note_ids = []
|
||||
xsec_tokens = []
|
||||
for note_item in all_notes_list:
|
||||
note_ids.append(note_item.get("note_id"))
|
||||
xsec_tokens.append(note_item.get("xsec_token"))
|
||||
await self.batch_get_note_comments(note_ids, xsec_tokens)
|
||||
|
||||
async def fetch_creator_notes_detail(self, note_list: List[Dict]):
|
||||
"""
|
||||
Concurrently obtain the specified post list and save the data
|
||||
"""
|
||||
semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
|
||||
task_list = [
|
||||
self.get_note_detail_async_task(
|
||||
note_id=post_item.get("note_id"),
|
||||
xsec_source=post_item.get("xsec_source"),
|
||||
xsec_token=post_item.get("xsec_token"),
|
||||
semaphore=semaphore,
|
||||
) for post_item in note_list
|
||||
]
|
||||
|
||||
note_details = await asyncio.gather(*task_list)
|
||||
for note_detail in note_details:
|
||||
if note_detail:
|
||||
await xhs_store.update_xhs_note(note_detail)
|
||||
await self.get_notice_media(note_detail)
|
||||
|
||||
async def get_specified_notes(self):
|
||||
"""
|
||||
Get the information and comments of the specified post
|
||||
must be specified note_id, xsec_source, xsec_token⚠️⚠️⚠️
|
||||
Returns:
|
||||
|
||||
"""
|
||||
get_note_detail_task_list = []
|
||||
for full_note_url in config.XHS_SPECIFIED_NOTE_URL_LIST:
|
||||
note_url_info: NoteUrlInfo = parse_note_info_from_note_url(full_note_url)
|
||||
utils.logger.info(f"[XiaoHongShuCrawler.get_specified_notes] Parse note url info: {note_url_info}")
|
||||
crawler_task = self.get_note_detail_async_task(
|
||||
note_id=note_url_info.note_id,
|
||||
xsec_source=note_url_info.xsec_source,
|
||||
xsec_token=note_url_info.xsec_token,
|
||||
semaphore=asyncio.Semaphore(config.MAX_CONCURRENCY_NUM),
|
||||
)
|
||||
get_note_detail_task_list.append(crawler_task)
|
||||
|
||||
need_get_comment_note_ids = []
|
||||
xsec_tokens = []
|
||||
note_details = await asyncio.gather(*get_note_detail_task_list)
|
||||
for note_detail in note_details:
|
||||
if note_detail:
|
||||
need_get_comment_note_ids.append(note_detail.get("note_id", ""))
|
||||
xsec_tokens.append(note_detail.get("xsec_token", ""))
|
||||
await xhs_store.update_xhs_note(note_detail)
|
||||
await self.get_notice_media(note_detail)
|
||||
await self.batch_get_note_comments(need_get_comment_note_ids, xsec_tokens)
|
||||
|
||||
async def get_note_detail_async_task(
|
||||
self,
|
||||
note_id: str,
|
||||
xsec_source: str,
|
||||
xsec_token: str,
|
||||
semaphore: asyncio.Semaphore,
|
||||
) -> Optional[Dict]:
|
||||
"""Get note detail
|
||||
|
||||
Args:
|
||||
note_id:
|
||||
xsec_source:
|
||||
xsec_token:
|
||||
semaphore:
|
||||
|
||||
Returns:
|
||||
Dict: note detail
|
||||
"""
|
||||
note_detail = None
|
||||
async with semaphore:
|
||||
try:
|
||||
utils.logger.info(f"[get_note_detail_async_task] Begin get note detail, note_id: {note_id}")
|
||||
|
||||
try:
|
||||
note_detail = await self.xhs_client.get_note_by_id(note_id, xsec_source, xsec_token)
|
||||
except RetryError as e:
|
||||
pass
|
||||
|
||||
if not note_detail:
|
||||
note_detail = await self.xhs_client.get_note_by_id_from_html(note_id, xsec_source, xsec_token, enable_cookie=True)
|
||||
if not note_detail:
|
||||
raise Exception(f"[get_note_detail_async_task] Failed to get note detail, Id: {note_id}")
|
||||
|
||||
note_detail.update({"xsec_token": xsec_token, "xsec_source": xsec_source})
|
||||
return note_detail
|
||||
|
||||
except DataFetchError as ex:
|
||||
utils.logger.error(f"[XiaoHongShuCrawler.get_note_detail_async_task] Get note detail error: {ex}")
|
||||
return None
|
||||
except KeyError as ex:
|
||||
utils.logger.error(f"[XiaoHongShuCrawler.get_note_detail_async_task] have not fund note detail note_id:{note_id}, err: {ex}")
|
||||
return None
|
||||
|
||||
async def batch_get_note_comments(self, note_list: List[str], xsec_tokens: List[str]):
|
||||
"""Batch get note comments"""
|
||||
if not config.ENABLE_GET_COMMENTS:
|
||||
utils.logger.info(f"[XiaoHongShuCrawler.batch_get_note_comments] Crawling comment mode is not enabled")
|
||||
return
|
||||
|
||||
utils.logger.info(f"[XiaoHongShuCrawler.batch_get_note_comments] Begin batch get note comments, note list: {note_list}")
|
||||
semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
|
||||
task_list: List[Task] = []
|
||||
for index, note_id in enumerate(note_list):
|
||||
task = asyncio.create_task(
|
||||
self.get_comments(note_id=note_id, xsec_token=xsec_tokens[index], semaphore=semaphore),
|
||||
name=note_id,
|
||||
)
|
||||
task_list.append(task)
|
||||
await asyncio.gather(*task_list)
|
||||
|
||||
async def get_comments(self, note_id: str, xsec_token: str, semaphore: asyncio.Semaphore):
|
||||
"""Get note comments with keyword filtering and quantity limitation"""
|
||||
async with semaphore:
|
||||
utils.logger.info(f"[XiaoHongShuCrawler.get_comments] Begin get note id comments {note_id}")
|
||||
# When proxy is not enabled, increase the crawling interval
|
||||
if config.ENABLE_IP_PROXY:
|
||||
crawl_interval = random.random()
|
||||
else:
|
||||
crawl_interval = random.uniform(1, config.CRAWLER_MAX_SLEEP_SEC)
|
||||
await self.xhs_client.get_note_all_comments(
|
||||
note_id=note_id,
|
||||
xsec_token=xsec_token,
|
||||
crawl_interval=crawl_interval,
|
||||
callback=xhs_store.batch_update_xhs_note_comments,
|
||||
max_count=CRAWLER_MAX_COMMENTS_COUNT_SINGLENOTES,
|
||||
)
|
||||
|
||||
async def create_xhs_client(self, httpx_proxy: Optional[str]) -> XiaoHongShuClient:
|
||||
"""Create xhs client"""
|
||||
utils.logger.info("[XiaoHongShuCrawler.create_xhs_client] Begin create xiaohongshu API client ...")
|
||||
cookie_str, cookie_dict = utils.convert_cookies(await self.browser_context.cookies())
|
||||
xhs_client_obj = XiaoHongShuClient(
|
||||
proxy=httpx_proxy,
|
||||
headers={
|
||||
"accept": "application/json, text/plain, */*",
|
||||
"accept-language": "zh-CN,zh;q=0.9",
|
||||
"cache-control": "no-cache",
|
||||
"content-type": "application/json;charset=UTF-8",
|
||||
"origin": "https://www.xiaohongshu.com",
|
||||
"pragma": "no-cache",
|
||||
"priority": "u=1, i",
|
||||
"referer": "https://www.xiaohongshu.com/",
|
||||
"sec-ch-ua": '"Chromium";v="136", "Google Chrome";v="136", "Not.A/Brand";v="99"',
|
||||
"sec-ch-ua-mobile": "?0",
|
||||
"sec-ch-ua-platform": '"Windows"',
|
||||
"sec-fetch-dest": "empty",
|
||||
"sec-fetch-mode": "cors",
|
||||
"sec-fetch-site": "same-site",
|
||||
"user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/137.0.0.0 Safari/537.36",
|
||||
"Cookie": cookie_str,
|
||||
},
|
||||
playwright_page=self.context_page,
|
||||
cookie_dict=cookie_dict,
|
||||
)
|
||||
return xhs_client_obj
|
||||
|
||||
async def launch_browser(
|
||||
self,
|
||||
chromium: BrowserType,
|
||||
playwright_proxy: Optional[Dict],
|
||||
user_agent: Optional[str],
|
||||
headless: bool = True,
|
||||
) -> BrowserContext:
|
||||
"""Launch browser and create browser context"""
|
||||
utils.logger.info("[XiaoHongShuCrawler.launch_browser] Begin create browser context ...")
|
||||
if config.SAVE_LOGIN_STATE:
|
||||
# feat issue #14
|
||||
# we will save login state to avoid login every time
|
||||
user_data_dir = os.path.join(os.getcwd(), "browser_data", config.USER_DATA_DIR % config.PLATFORM) # type: ignore
|
||||
browser_context = await chromium.launch_persistent_context(
|
||||
user_data_dir=user_data_dir,
|
||||
accept_downloads=True,
|
||||
headless=headless,
|
||||
proxy=playwright_proxy, # type: ignore
|
||||
viewport={
|
||||
"width": 1920,
|
||||
"height": 1080
|
||||
},
|
||||
user_agent=user_agent,
|
||||
)
|
||||
return browser_context
|
||||
else:
|
||||
browser = await chromium.launch(headless=headless, proxy=playwright_proxy) # type: ignore
|
||||
browser_context = await browser.new_context(viewport={"width": 1920, "height": 1080}, user_agent=user_agent)
|
||||
return browser_context
|
||||
|
||||
async def launch_browser_with_cdp(
|
||||
self,
|
||||
playwright: Playwright,
|
||||
playwright_proxy: Optional[Dict],
|
||||
user_agent: Optional[str],
|
||||
headless: bool = True,
|
||||
) -> BrowserContext:
|
||||
"""
|
||||
使用CDP模式启动浏览器
|
||||
"""
|
||||
try:
|
||||
self.cdp_manager = CDPBrowserManager()
|
||||
browser_context = await self.cdp_manager.launch_and_connect(
|
||||
playwright=playwright,
|
||||
playwright_proxy=playwright_proxy,
|
||||
user_agent=user_agent,
|
||||
headless=headless,
|
||||
)
|
||||
|
||||
# 显示浏览器信息
|
||||
browser_info = await self.cdp_manager.get_browser_info()
|
||||
utils.logger.info(f"[XiaoHongShuCrawler] CDP浏览器信息: {browser_info}")
|
||||
|
||||
return browser_context
|
||||
|
||||
except Exception as e:
|
||||
utils.logger.error(f"[XiaoHongShuCrawler] CDP模式启动失败,回退到标准模式: {e}")
|
||||
# 回退到标准模式
|
||||
chromium = playwright.chromium
|
||||
return await self.launch_browser(chromium, playwright_proxy, user_agent, headless)
|
||||
|
||||
async def close(self):
|
||||
"""Close browser context"""
|
||||
# 如果使用CDP模式,需要特殊处理
|
||||
if self.cdp_manager:
|
||||
await self.cdp_manager.cleanup()
|
||||
self.cdp_manager = None
|
||||
else:
|
||||
await self.browser_context.close()
|
||||
utils.logger.info("[XiaoHongShuCrawler.close] Browser context closed ...")
|
||||
|
||||
async def get_notice_media(self, note_detail: Dict):
|
||||
if not config.ENABLE_GET_MEIDAS:
|
||||
utils.logger.info(f"[XiaoHongShuCrawler.get_notice_media] Crawling image mode is not enabled")
|
||||
return
|
||||
await self.get_note_images(note_detail)
|
||||
await self.get_notice_video(note_detail)
|
||||
|
||||
async def get_note_images(self, note_item: Dict):
|
||||
"""
|
||||
get note images. please use get_notice_media
|
||||
:param note_item:
|
||||
:return:
|
||||
"""
|
||||
if not config.ENABLE_GET_MEIDAS:
|
||||
return
|
||||
note_id = note_item.get("note_id")
|
||||
image_list: List[Dict] = note_item.get("image_list", [])
|
||||
|
||||
for img in image_list:
|
||||
if img.get("url_default") != "":
|
||||
img.update({"url": img.get("url_default")})
|
||||
|
||||
if not image_list:
|
||||
return
|
||||
picNum = 0
|
||||
for pic in image_list:
|
||||
url = pic.get("url")
|
||||
if not url:
|
||||
continue
|
||||
content = await self.xhs_client.get_note_media(url)
|
||||
await asyncio.sleep(random.random())
|
||||
if content is None:
|
||||
continue
|
||||
extension_file_name = f"{picNum}.jpg"
|
||||
picNum += 1
|
||||
await xhs_store.update_xhs_note_image(note_id, content, extension_file_name)
|
||||
|
||||
async def get_notice_video(self, note_item: Dict):
|
||||
"""
|
||||
get note videos. please use get_notice_media
|
||||
:param note_item:
|
||||
:return:
|
||||
"""
|
||||
if not config.ENABLE_GET_MEIDAS:
|
||||
return
|
||||
note_id = note_item.get("note_id")
|
||||
|
||||
videos = xhs_store.get_video_url_arr(note_item)
|
||||
|
||||
if not videos:
|
||||
return
|
||||
videoNum = 0
|
||||
for url in videos:
|
||||
content = await self.xhs_client.get_note_media(url)
|
||||
await asyncio.sleep(random.random())
|
||||
if content is None:
|
||||
continue
|
||||
extension_file_name = f"{videoNum}.mp4"
|
||||
videoNum += 1
|
||||
await xhs_store.update_xhs_note_video(note_id, content, extension_file_name)
|
||||
@@ -0,0 +1,20 @@
|
||||
# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则:
|
||||
# 1. 不得用于任何商业用途。
|
||||
# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。
|
||||
# 3. 不得进行大规模爬取或对平台造成运营干扰。
|
||||
# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。
|
||||
# 5. 不得用于任何非法或不当的用途。
|
||||
#
|
||||
# 详细许可条款请参阅项目根目录下的LICENSE文件。
|
||||
# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。
|
||||
|
||||
|
||||
from httpx import RequestError
|
||||
|
||||
|
||||
class DataFetchError(RequestError):
|
||||
"""something error when fetch"""
|
||||
|
||||
|
||||
class IPBlockError(RequestError):
|
||||
"""fetch so fast that the server block us ip"""
|
||||
@@ -0,0 +1,83 @@
|
||||
# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则:
|
||||
# 1. 不得用于任何商业用途。
|
||||
# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。
|
||||
# 3. 不得进行大规模爬取或对平台造成运营干扰。
|
||||
# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。
|
||||
# 5. 不得用于任何非法或不当的用途。
|
||||
#
|
||||
# 详细许可条款请参阅项目根目录下的LICENSE文件。
|
||||
# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。
|
||||
|
||||
|
||||
from enum import Enum
|
||||
from typing import NamedTuple
|
||||
|
||||
|
||||
class FeedType(Enum):
|
||||
# 推荐
|
||||
RECOMMEND = "homefeed_recommend"
|
||||
# 穿搭
|
||||
FASION = "homefeed.fashion_v3"
|
||||
# 美食
|
||||
FOOD = "homefeed.food_v3"
|
||||
# 彩妆
|
||||
COSMETICS = "homefeed.cosmetics_v3"
|
||||
# 影视
|
||||
MOVIE = "homefeed.movie_and_tv_v3"
|
||||
# 职场
|
||||
CAREER = "homefeed.career_v3"
|
||||
# 情感
|
||||
EMOTION = "homefeed.love_v3"
|
||||
# 家居
|
||||
HOURSE = "homefeed.household_product_v3"
|
||||
# 游戏
|
||||
GAME = "homefeed.gaming_v3"
|
||||
# 旅行
|
||||
TRAVEL = "homefeed.travel_v3"
|
||||
# 健身
|
||||
FITNESS = "homefeed.fitness_v3"
|
||||
|
||||
|
||||
class NoteType(Enum):
|
||||
NORMAL = "normal"
|
||||
VIDEO = "video"
|
||||
|
||||
|
||||
class SearchSortType(Enum):
|
||||
"""search sort type"""
|
||||
# default
|
||||
GENERAL = "general"
|
||||
# most popular
|
||||
MOST_POPULAR = "popularity_descending"
|
||||
# Latest
|
||||
LATEST = "time_descending"
|
||||
|
||||
|
||||
class SearchNoteType(Enum):
|
||||
"""search note type
|
||||
"""
|
||||
# default
|
||||
ALL = 0
|
||||
# only video
|
||||
VIDEO = 1
|
||||
# only image
|
||||
IMAGE = 2
|
||||
|
||||
|
||||
class Note(NamedTuple):
|
||||
"""note tuple"""
|
||||
note_id: str
|
||||
title: str
|
||||
desc: str
|
||||
type: str
|
||||
user: dict
|
||||
img_urls: list
|
||||
video_url: str
|
||||
tag_list: list
|
||||
at_user_list: list
|
||||
collected_count: str
|
||||
comment_count: str
|
||||
liked_count: str
|
||||
share_count: str
|
||||
time: int
|
||||
last_update_time: int
|
||||
@@ -0,0 +1,316 @@
|
||||
# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则:
|
||||
# 1. 不得用于任何商业用途。
|
||||
# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。
|
||||
# 3. 不得进行大规模爬取或对平台造成运营干扰。
|
||||
# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。
|
||||
# 5. 不得用于任何非法或不当的用途。
|
||||
#
|
||||
# 详细许可条款请参阅项目根目录下的LICENSE文件。
|
||||
# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。
|
||||
|
||||
|
||||
import ctypes
|
||||
import json
|
||||
import random
|
||||
import time
|
||||
import urllib.parse
|
||||
|
||||
from model.m_xiaohongshu import NoteUrlInfo
|
||||
from tools.crawler_util import extract_url_params_to_dict
|
||||
|
||||
|
||||
def sign(a1="", b1="", x_s="", x_t=""):
|
||||
"""
|
||||
takes in a URI (uniform resource identifier), an optional data dictionary, and an optional ctime parameter. It returns a dictionary containing two keys: "x-s" and "x-t".
|
||||
"""
|
||||
common = {
|
||||
"s0": 3, # getPlatformCode
|
||||
"s1": "",
|
||||
"x0": "1", # localStorage.getItem("b1b1")
|
||||
"x1": "3.7.8-2", # version
|
||||
"x2": "Mac OS",
|
||||
"x3": "xhs-pc-web",
|
||||
"x4": "4.27.2",
|
||||
"x5": a1, # cookie of a1
|
||||
"x6": x_t,
|
||||
"x7": x_s,
|
||||
"x8": b1, # localStorage.getItem("b1")
|
||||
"x9": mrc(x_t + x_s + b1),
|
||||
"x10": 154, # getSigCount
|
||||
}
|
||||
encode_str = encodeUtf8(json.dumps(common, separators=(',', ':')))
|
||||
x_s_common = b64Encode(encode_str)
|
||||
x_b3_traceid = get_b3_trace_id()
|
||||
return {
|
||||
"x-s": x_s,
|
||||
"x-t": x_t,
|
||||
"x-s-common": x_s_common,
|
||||
"x-b3-traceid": x_b3_traceid
|
||||
}
|
||||
|
||||
|
||||
def get_b3_trace_id():
|
||||
re = "abcdef0123456789"
|
||||
je = 16
|
||||
e = ""
|
||||
for t in range(16):
|
||||
e += re[random.randint(0, je - 1)]
|
||||
return e
|
||||
|
||||
|
||||
def mrc(e):
|
||||
ie = [
|
||||
0, 1996959894, 3993919788, 2567524794, 124634137, 1886057615, 3915621685,
|
||||
2657392035, 249268274, 2044508324, 3772115230, 2547177864, 162941995,
|
||||
2125561021, 3887607047, 2428444049, 498536548, 1789927666, 4089016648,
|
||||
2227061214, 450548861, 1843258603, 4107580753, 2211677639, 325883990,
|
||||
1684777152, 4251122042, 2321926636, 335633487, 1661365465, 4195302755,
|
||||
2366115317, 997073096, 1281953886, 3579855332, 2724688242, 1006888145,
|
||||
1258607687, 3524101629, 2768942443, 901097722, 1119000684, 3686517206,
|
||||
2898065728, 853044451, 1172266101, 3705015759, 2882616665, 651767980,
|
||||
1373503546, 3369554304, 3218104598, 565507253, 1454621731, 3485111705,
|
||||
3099436303, 671266974, 1594198024, 3322730930, 2970347812, 795835527,
|
||||
1483230225, 3244367275, 3060149565, 1994146192, 31158534, 2563907772,
|
||||
4023717930, 1907459465, 112637215, 2680153253, 3904427059, 2013776290,
|
||||
251722036, 2517215374, 3775830040, 2137656763, 141376813, 2439277719,
|
||||
3865271297, 1802195444, 476864866, 2238001368, 4066508878, 1812370925,
|
||||
453092731, 2181625025, 4111451223, 1706088902, 314042704, 2344532202,
|
||||
4240017532, 1658658271, 366619977, 2362670323, 4224994405, 1303535960,
|
||||
984961486, 2747007092, 3569037538, 1256170817, 1037604311, 2765210733,
|
||||
3554079995, 1131014506, 879679996, 2909243462, 3663771856, 1141124467,
|
||||
855842277, 2852801631, 3708648649, 1342533948, 654459306, 3188396048,
|
||||
3373015174, 1466479909, 544179635, 3110523913, 3462522015, 1591671054,
|
||||
702138776, 2966460450, 3352799412, 1504918807, 783551873, 3082640443,
|
||||
3233442989, 3988292384, 2596254646, 62317068, 1957810842, 3939845945,
|
||||
2647816111, 81470997, 1943803523, 3814918930, 2489596804, 225274430,
|
||||
2053790376, 3826175755, 2466906013, 167816743, 2097651377, 4027552580,
|
||||
2265490386, 503444072, 1762050814, 4150417245, 2154129355, 426522225,
|
||||
1852507879, 4275313526, 2312317920, 282753626, 1742555852, 4189708143,
|
||||
2394877945, 397917763, 1622183637, 3604390888, 2714866558, 953729732,
|
||||
1340076626, 3518719985, 2797360999, 1068828381, 1219638859, 3624741850,
|
||||
2936675148, 906185462, 1090812512, 3747672003, 2825379669, 829329135,
|
||||
1181335161, 3412177804, 3160834842, 628085408, 1382605366, 3423369109,
|
||||
3138078467, 570562233, 1426400815, 3317316542, 2998733608, 733239954,
|
||||
1555261956, 3268935591, 3050360625, 752459403, 1541320221, 2607071920,
|
||||
3965973030, 1969922972, 40735498, 2617837225, 3943577151, 1913087877,
|
||||
83908371, 2512341634, 3803740692, 2075208622, 213261112, 2463272603,
|
||||
3855990285, 2094854071, 198958881, 2262029012, 4057260610, 1759359992,
|
||||
534414190, 2176718541, 4139329115, 1873836001, 414664567, 2282248934,
|
||||
4279200368, 1711684554, 285281116, 2405801727, 4167216745, 1634467795,
|
||||
376229701, 2685067896, 3608007406, 1308918612, 956543938, 2808555105,
|
||||
3495958263, 1231636301, 1047427035, 2932959818, 3654703836, 1088359270,
|
||||
936918000, 2847714899, 3736837829, 1202900863, 817233897, 3183342108,
|
||||
3401237130, 1404277552, 615818150, 3134207493, 3453421203, 1423857449,
|
||||
601450431, 3009837614, 3294710456, 1567103746, 711928724, 3020668471,
|
||||
3272380065, 1510334235, 755167117,
|
||||
]
|
||||
o = -1
|
||||
|
||||
def right_without_sign(num: int, bit: int=0) -> int:
|
||||
val = ctypes.c_uint32(num).value >> bit
|
||||
MAX32INT = 4294967295
|
||||
return (val + (MAX32INT + 1)) % (2 * (MAX32INT + 1)) - MAX32INT - 1
|
||||
|
||||
for n in range(57):
|
||||
o = ie[(o & 255) ^ ord(e[n])] ^ right_without_sign(o, 8)
|
||||
return o ^ -1 ^ 3988292384
|
||||
|
||||
|
||||
lookup = [
|
||||
"Z",
|
||||
"m",
|
||||
"s",
|
||||
"e",
|
||||
"r",
|
||||
"b",
|
||||
"B",
|
||||
"o",
|
||||
"H",
|
||||
"Q",
|
||||
"t",
|
||||
"N",
|
||||
"P",
|
||||
"+",
|
||||
"w",
|
||||
"O",
|
||||
"c",
|
||||
"z",
|
||||
"a",
|
||||
"/",
|
||||
"L",
|
||||
"p",
|
||||
"n",
|
||||
"g",
|
||||
"G",
|
||||
"8",
|
||||
"y",
|
||||
"J",
|
||||
"q",
|
||||
"4",
|
||||
"2",
|
||||
"K",
|
||||
"W",
|
||||
"Y",
|
||||
"j",
|
||||
"0",
|
||||
"D",
|
||||
"S",
|
||||
"f",
|
||||
"d",
|
||||
"i",
|
||||
"k",
|
||||
"x",
|
||||
"3",
|
||||
"V",
|
||||
"T",
|
||||
"1",
|
||||
"6",
|
||||
"I",
|
||||
"l",
|
||||
"U",
|
||||
"A",
|
||||
"F",
|
||||
"M",
|
||||
"9",
|
||||
"7",
|
||||
"h",
|
||||
"E",
|
||||
"C",
|
||||
"v",
|
||||
"u",
|
||||
"R",
|
||||
"X",
|
||||
"5",
|
||||
]
|
||||
|
||||
|
||||
def tripletToBase64(e):
|
||||
return (
|
||||
lookup[63 & (e >> 18)] +
|
||||
lookup[63 & (e >> 12)] +
|
||||
lookup[(e >> 6) & 63] +
|
||||
lookup[e & 63]
|
||||
)
|
||||
|
||||
|
||||
def encodeChunk(e, t, r):
|
||||
m = []
|
||||
for b in range(t, r, 3):
|
||||
n = (16711680 & (e[b] << 16)) + \
|
||||
((e[b + 1] << 8) & 65280) + (e[b + 2] & 255)
|
||||
m.append(tripletToBase64(n))
|
||||
return ''.join(m)
|
||||
|
||||
|
||||
def b64Encode(e):
|
||||
P = len(e)
|
||||
W = P % 3
|
||||
U = []
|
||||
z = 16383
|
||||
H = 0
|
||||
Z = P - W
|
||||
while H < Z:
|
||||
U.append(encodeChunk(e, H, Z if H + z > Z else H + z))
|
||||
H += z
|
||||
if 1 == W:
|
||||
F = e[P - 1]
|
||||
U.append(lookup[F >> 2] + lookup[(F << 4) & 63] + "==")
|
||||
elif 2 == W:
|
||||
F = (e[P - 2] << 8) + e[P - 1]
|
||||
U.append(lookup[F >> 10] + lookup[63 & (F >> 4)] +
|
||||
lookup[(F << 2) & 63] + "=")
|
||||
return "".join(U)
|
||||
|
||||
|
||||
def encodeUtf8(e):
|
||||
b = []
|
||||
m = urllib.parse.quote(e, safe='~()*!.\'')
|
||||
w = 0
|
||||
while w < len(m):
|
||||
T = m[w]
|
||||
if T == "%":
|
||||
E = m[w + 1] + m[w + 2]
|
||||
S = int(E, 16)
|
||||
b.append(S)
|
||||
w += 2
|
||||
else:
|
||||
b.append(ord(T[0]))
|
||||
w += 1
|
||||
return b
|
||||
|
||||
|
||||
def base36encode(number, alphabet='0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ'):
|
||||
"""Converts an integer to a base36 string."""
|
||||
if not isinstance(number, int):
|
||||
raise TypeError('number must be an integer')
|
||||
|
||||
base36 = ''
|
||||
sign = ''
|
||||
|
||||
if number < 0:
|
||||
sign = '-'
|
||||
number = -number
|
||||
|
||||
if 0 <= number < len(alphabet):
|
||||
return sign + alphabet[number]
|
||||
|
||||
while number != 0:
|
||||
number, i = divmod(number, len(alphabet))
|
||||
base36 = alphabet[i] + base36
|
||||
|
||||
return sign + base36
|
||||
|
||||
|
||||
def base36decode(number):
|
||||
return int(number, 36)
|
||||
|
||||
|
||||
def get_search_id():
|
||||
e = int(time.time() * 1000) << 64
|
||||
t = int(random.uniform(0, 2147483646))
|
||||
return base36encode((e + t))
|
||||
|
||||
|
||||
img_cdns = [
|
||||
"https://sns-img-qc.xhscdn.com",
|
||||
"https://sns-img-hw.xhscdn.com",
|
||||
"https://sns-img-bd.xhscdn.com",
|
||||
"https://sns-img-qn.xhscdn.com",
|
||||
]
|
||||
|
||||
def get_img_url_by_trace_id(trace_id: str, format_type: str = "png"):
|
||||
return f"{random.choice(img_cdns)}/{trace_id}?imageView2/format/{format_type}"
|
||||
|
||||
|
||||
def get_img_urls_by_trace_id(trace_id: str, format_type: str = "png"):
|
||||
return [f"{cdn}/{trace_id}?imageView2/format/{format_type}" for cdn in img_cdns]
|
||||
|
||||
|
||||
def get_trace_id(img_url: str):
|
||||
# 浏览器端上传的图片多了 /spectrum/ 这个路径
|
||||
return f"spectrum/{img_url.split('/')[-1]}" if img_url.find("spectrum") != -1 else img_url.split("/")[-1]
|
||||
|
||||
|
||||
def parse_note_info_from_note_url(url: str) -> NoteUrlInfo:
|
||||
"""
|
||||
从小红书笔记url中解析出笔记信息
|
||||
Args:
|
||||
url: "https://www.xiaohongshu.com/explore/66fad51c000000001b0224b8?xsec_token=AB3rO-QopW5sgrJ41GwN01WCXh6yWPxjSoFI9D5JIMgKw=&xsec_source=pc_search"
|
||||
Returns:
|
||||
|
||||
"""
|
||||
note_id = url.split("/")[-1].split("?")[0]
|
||||
params = extract_url_params_to_dict(url)
|
||||
xsec_token = params.get("xsec_token", "")
|
||||
xsec_source = params.get("xsec_source", "")
|
||||
return NoteUrlInfo(note_id=note_id, xsec_token=xsec_token, xsec_source=xsec_source)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
_img_url = "https://sns-img-bd.xhscdn.com/7a3abfaf-90c1-a828-5de7-022c80b92aa3"
|
||||
# 获取一个图片地址在多个cdn下的url地址
|
||||
# final_img_urls = get_img_urls_by_trace_id(get_trace_id(_img_url))
|
||||
final_img_url = get_img_url_by_trace_id(get_trace_id(_img_url))
|
||||
print(final_img_url)
|
||||
|
||||
|
||||
@@ -0,0 +1,197 @@
|
||||
# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则:
|
||||
# 1. 不得用于任何商业用途。
|
||||
# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。
|
||||
# 3. 不得进行大规模爬取或对平台造成运营干扰。
|
||||
# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。
|
||||
# 5. 不得用于任何非法或不当的用途。
|
||||
#
|
||||
# 详细许可条款请参阅项目根目录下的LICENSE文件。
|
||||
# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。
|
||||
|
||||
|
||||
import asyncio
|
||||
import functools
|
||||
import sys
|
||||
from typing import Optional
|
||||
|
||||
from playwright.async_api import BrowserContext, Page
|
||||
from tenacity import (RetryError, retry, retry_if_result, stop_after_attempt,
|
||||
wait_fixed)
|
||||
|
||||
import config
|
||||
from base.base_crawler import AbstractLogin
|
||||
from cache.cache_factory import CacheFactory
|
||||
from tools import utils
|
||||
|
||||
|
||||
class XiaoHongShuLogin(AbstractLogin):
|
||||
|
||||
def __init__(self,
|
||||
login_type: str,
|
||||
browser_context: BrowserContext,
|
||||
context_page: Page,
|
||||
login_phone: Optional[str] = "",
|
||||
cookie_str: str = ""
|
||||
):
|
||||
config.LOGIN_TYPE = login_type
|
||||
self.browser_context = browser_context
|
||||
self.context_page = context_page
|
||||
self.login_phone = login_phone
|
||||
self.cookie_str = cookie_str
|
||||
|
||||
@retry(stop=stop_after_attempt(600), wait=wait_fixed(1), retry=retry_if_result(lambda value: value is False))
|
||||
async def check_login_state(self, no_logged_in_session: str) -> bool:
|
||||
"""
|
||||
Check if the current login status is successful and return True otherwise return False
|
||||
retry decorator will retry 20 times if the return value is False, and the retry interval is 1 second
|
||||
if max retry times reached, raise RetryError
|
||||
"""
|
||||
|
||||
if "请通过验证" in await self.context_page.content():
|
||||
utils.logger.info("[XiaoHongShuLogin.check_login_state] 登录过程中出现验证码,请手动验证")
|
||||
|
||||
current_cookie = await self.browser_context.cookies()
|
||||
_, cookie_dict = utils.convert_cookies(current_cookie)
|
||||
current_web_session = cookie_dict.get("web_session")
|
||||
if current_web_session != no_logged_in_session:
|
||||
return True
|
||||
return False
|
||||
|
||||
async def begin(self):
|
||||
"""Start login xiaohongshu"""
|
||||
utils.logger.info("[XiaoHongShuLogin.begin] Begin login xiaohongshu ...")
|
||||
if config.LOGIN_TYPE == "qrcode":
|
||||
await self.login_by_qrcode()
|
||||
elif config.LOGIN_TYPE == "phone":
|
||||
await self.login_by_mobile()
|
||||
elif config.LOGIN_TYPE == "cookie":
|
||||
await self.login_by_cookies()
|
||||
else:
|
||||
raise ValueError("[XiaoHongShuLogin.begin]I nvalid Login Type Currently only supported qrcode or phone or cookies ...")
|
||||
|
||||
async def login_by_mobile(self):
|
||||
"""Login xiaohongshu by mobile"""
|
||||
utils.logger.info("[XiaoHongShuLogin.login_by_mobile] Begin login xiaohongshu by mobile ...")
|
||||
await asyncio.sleep(1)
|
||||
try:
|
||||
# 小红书进入首页后,有可能不会自动弹出登录框,需要手动点击登录按钮
|
||||
login_button_ele = await self.context_page.wait_for_selector(
|
||||
selector="xpath=//*[@id='app']/div[1]/div[2]/div[1]/ul/div[1]/button",
|
||||
timeout=5000
|
||||
)
|
||||
await login_button_ele.click()
|
||||
# 弹窗的登录对话框也有两种形态,一种是直接可以看到手机号和验证码的
|
||||
# 另一种是需要点击切换到手机登录的
|
||||
element = await self.context_page.wait_for_selector(
|
||||
selector='xpath=//div[@class="login-container"]//div[@class="other-method"]/div[1]',
|
||||
timeout=5000
|
||||
)
|
||||
await element.click()
|
||||
except Exception as e:
|
||||
utils.logger.info("[XiaoHongShuLogin.login_by_mobile] have not found mobile button icon and keep going ...")
|
||||
|
||||
await asyncio.sleep(1)
|
||||
login_container_ele = await self.context_page.wait_for_selector("div.login-container")
|
||||
input_ele = await login_container_ele.query_selector("label.phone > input")
|
||||
await input_ele.fill(self.login_phone)
|
||||
await asyncio.sleep(0.5)
|
||||
|
||||
send_btn_ele = await login_container_ele.query_selector("label.auth-code > span")
|
||||
await send_btn_ele.click() # 点击发送验证码
|
||||
sms_code_input_ele = await login_container_ele.query_selector("label.auth-code > input")
|
||||
submit_btn_ele = await login_container_ele.query_selector("div.input-container > button")
|
||||
cache_client = CacheFactory.create_cache(config.CACHE_TYPE_MEMORY)
|
||||
max_get_sms_code_time = 60 * 2 # 最长获取验证码的时间为2分钟
|
||||
no_logged_in_session = ""
|
||||
while max_get_sms_code_time > 0:
|
||||
utils.logger.info(f"[XiaoHongShuLogin.login_by_mobile] get sms code from redis remaining time {max_get_sms_code_time}s ...")
|
||||
await asyncio.sleep(1)
|
||||
sms_code_key = f"xhs_{self.login_phone}"
|
||||
sms_code_value = cache_client.get(sms_code_key)
|
||||
if not sms_code_value:
|
||||
max_get_sms_code_time -= 1
|
||||
continue
|
||||
|
||||
current_cookie = await self.browser_context.cookies()
|
||||
_, cookie_dict = utils.convert_cookies(current_cookie)
|
||||
no_logged_in_session = cookie_dict.get("web_session")
|
||||
|
||||
await sms_code_input_ele.fill(value=sms_code_value.decode()) # 输入短信验证码
|
||||
await asyncio.sleep(0.5)
|
||||
agree_privacy_ele = self.context_page.locator("xpath=//div[@class='agreements']//*[local-name()='svg']")
|
||||
await agree_privacy_ele.click() # 点击同意隐私协议
|
||||
await asyncio.sleep(0.5)
|
||||
|
||||
await submit_btn_ele.click() # 点击登录
|
||||
|
||||
# todo ... 应该还需要检查验证码的正确性有可能输入的验证码不正确
|
||||
break
|
||||
|
||||
try:
|
||||
await self.check_login_state(no_logged_in_session)
|
||||
except RetryError:
|
||||
utils.logger.info("[XiaoHongShuLogin.login_by_mobile] Login xiaohongshu failed by mobile login method ...")
|
||||
sys.exit()
|
||||
|
||||
wait_redirect_seconds = 5
|
||||
utils.logger.info(f"[XiaoHongShuLogin.login_by_mobile] Login successful then wait for {wait_redirect_seconds} seconds redirect ...")
|
||||
await asyncio.sleep(wait_redirect_seconds)
|
||||
|
||||
async def login_by_qrcode(self):
|
||||
"""login xiaohongshu website and keep webdriver login state"""
|
||||
utils.logger.info("[XiaoHongShuLogin.login_by_qrcode] Begin login xiaohongshu by qrcode ...")
|
||||
# login_selector = "div.login-container > div.left > div.qrcode > img"
|
||||
qrcode_img_selector = "xpath=//img[@class='qrcode-img']"
|
||||
# find login qrcode
|
||||
base64_qrcode_img = await utils.find_login_qrcode(
|
||||
self.context_page,
|
||||
selector=qrcode_img_selector
|
||||
)
|
||||
if not base64_qrcode_img:
|
||||
utils.logger.info("[XiaoHongShuLogin.login_by_qrcode] login failed , have not found qrcode please check ....")
|
||||
# if this website does not automatically popup login dialog box, we will manual click login button
|
||||
await asyncio.sleep(0.5)
|
||||
login_button_ele = self.context_page.locator("xpath=//*[@id='app']/div[1]/div[2]/div[1]/ul/div[1]/button")
|
||||
await login_button_ele.click()
|
||||
base64_qrcode_img = await utils.find_login_qrcode(
|
||||
self.context_page,
|
||||
selector=qrcode_img_selector
|
||||
)
|
||||
if not base64_qrcode_img:
|
||||
sys.exit()
|
||||
|
||||
# get not logged session
|
||||
current_cookie = await self.browser_context.cookies()
|
||||
_, cookie_dict = utils.convert_cookies(current_cookie)
|
||||
no_logged_in_session = cookie_dict.get("web_session")
|
||||
|
||||
# show login qrcode
|
||||
# fix issue #12
|
||||
# we need to use partial function to call show_qrcode function and run in executor
|
||||
# then current asyncio event loop will not be blocked
|
||||
partial_show_qrcode = functools.partial(utils.show_qrcode, base64_qrcode_img)
|
||||
asyncio.get_running_loop().run_in_executor(executor=None, func=partial_show_qrcode)
|
||||
|
||||
utils.logger.info(f"[XiaoHongShuLogin.login_by_qrcode] waiting for scan code login, remaining time is 120s")
|
||||
try:
|
||||
await self.check_login_state(no_logged_in_session)
|
||||
except RetryError:
|
||||
utils.logger.info("[XiaoHongShuLogin.login_by_qrcode] Login xiaohongshu failed by qrcode login method ...")
|
||||
sys.exit()
|
||||
|
||||
wait_redirect_seconds = 5
|
||||
utils.logger.info(f"[XiaoHongShuLogin.login_by_qrcode] Login successful then wait for {wait_redirect_seconds} seconds redirect ...")
|
||||
await asyncio.sleep(wait_redirect_seconds)
|
||||
|
||||
async def login_by_cookies(self):
|
||||
"""login xiaohongshu website by cookies"""
|
||||
utils.logger.info("[XiaoHongShuLogin.login_by_cookies] Begin login xiaohongshu by cookie ...")
|
||||
for key, value in utils.convert_str_cookie_to_dict(self.cookie_str).items():
|
||||
if key != "web_session": # only set web_session cookie attr
|
||||
continue
|
||||
await self.browser_context.add_cookies([{
|
||||
'name': key,
|
||||
'value': value,
|
||||
'domain': ".xiaohongshu.com",
|
||||
'path': "/"
|
||||
}])
|
||||
@@ -0,0 +1,13 @@
|
||||
# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则:
|
||||
# 1. 不得用于任何商业用途。
|
||||
# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。
|
||||
# 3. 不得进行大规模爬取或对平台造成运营干扰。
|
||||
# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。
|
||||
# 5. 不得用于任何非法或不当的用途。
|
||||
#
|
||||
# 详细许可条款请参阅项目根目录下的LICENSE文件。
|
||||
# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。
|
||||
|
||||
|
||||
# -*- coding: utf-8 -*-
|
||||
from .core import ZhihuCrawler
|
||||
@@ -0,0 +1,568 @@
|
||||
# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则:
|
||||
# 1. 不得用于任何商业用途。
|
||||
# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。
|
||||
# 3. 不得进行大规模爬取或对平台造成运营干扰。
|
||||
# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。
|
||||
# 5. 不得用于任何非法或不当的用途。
|
||||
#
|
||||
# 详细许可条款请参阅项目根目录下的LICENSE文件。
|
||||
# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。
|
||||
|
||||
# -*- coding: utf-8 -*-
|
||||
import asyncio
|
||||
import json
|
||||
from typing import Any, Callable, Dict, List, Optional, Union
|
||||
from urllib.parse import urlencode
|
||||
|
||||
import httpx
|
||||
from httpx import Response
|
||||
from playwright.async_api import BrowserContext, Page
|
||||
from tenacity import retry, stop_after_attempt, wait_fixed
|
||||
|
||||
import config
|
||||
from base.base_crawler import AbstractApiClient
|
||||
from constant import zhihu as zhihu_constant
|
||||
from model.m_zhihu import ZhihuComment, ZhihuContent, ZhihuCreator
|
||||
from tools import utils
|
||||
|
||||
from .exception import DataFetchError, ForbiddenError
|
||||
from .field import SearchSort, SearchTime, SearchType
|
||||
from .help import ZhihuExtractor, sign
|
||||
|
||||
|
||||
class ZhiHuClient(AbstractApiClient):
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
timeout=10,
|
||||
proxy=None,
|
||||
*,
|
||||
headers: Dict[str, str],
|
||||
playwright_page: Page,
|
||||
cookie_dict: Dict[str, str],
|
||||
):
|
||||
self.proxy = proxy
|
||||
self.timeout = timeout
|
||||
self.default_headers = headers
|
||||
self.cookie_dict = cookie_dict
|
||||
self._extractor = ZhihuExtractor()
|
||||
|
||||
async def _pre_headers(self, url: str) -> Dict:
|
||||
"""
|
||||
请求头参数签名
|
||||
Args:
|
||||
url: 请求的URL需要包含请求的参数
|
||||
Returns:
|
||||
|
||||
"""
|
||||
d_c0 = self.cookie_dict.get("d_c0")
|
||||
if not d_c0:
|
||||
raise Exception("d_c0 not found in cookies")
|
||||
sign_res = sign(url, self.default_headers["cookie"])
|
||||
headers = self.default_headers.copy()
|
||||
headers['x-zst-81'] = sign_res["x-zst-81"]
|
||||
headers['x-zse-96'] = sign_res["x-zse-96"]
|
||||
return headers
|
||||
|
||||
@retry(stop=stop_after_attempt(3), wait=wait_fixed(1))
|
||||
async def request(self, method, url, **kwargs) -> Union[str, Any]:
|
||||
"""
|
||||
封装httpx的公共请求方法,对请求响应做一些处理
|
||||
Args:
|
||||
method: 请求方法
|
||||
url: 请求的URL
|
||||
**kwargs: 其他请求参数,例如请求头、请求体等
|
||||
|
||||
Returns:
|
||||
|
||||
"""
|
||||
# return response.text
|
||||
return_response = kwargs.pop('return_response', False)
|
||||
|
||||
async with httpx.AsyncClient(proxy=self.proxy) as client:
|
||||
response = await client.request(method, url, timeout=self.timeout, **kwargs)
|
||||
|
||||
if response.status_code != 200:
|
||||
utils.logger.error(f"[ZhiHuClient.request] Requset Url: {url}, Request error: {response.text}")
|
||||
if response.status_code == 403:
|
||||
raise ForbiddenError(response.text)
|
||||
elif response.status_code == 404: # 如果一个content没有评论也是404
|
||||
return {}
|
||||
|
||||
raise DataFetchError(response.text)
|
||||
|
||||
if return_response:
|
||||
return response.text
|
||||
try:
|
||||
data: Dict = response.json()
|
||||
if data.get("error"):
|
||||
utils.logger.error(f"[ZhiHuClient.request] Request error: {data}")
|
||||
raise DataFetchError(data.get("error", {}).get("message"))
|
||||
return data
|
||||
except json.JSONDecodeError:
|
||||
utils.logger.error(f"[ZhiHuClient.request] Request error: {response.text}")
|
||||
raise DataFetchError(response.text)
|
||||
|
||||
async def get(self, uri: str, params=None, **kwargs) -> Union[Response, Dict, str]:
|
||||
"""
|
||||
GET请求,对请求头签名
|
||||
Args:
|
||||
uri: 请求路由
|
||||
params: 请求参数
|
||||
|
||||
Returns:
|
||||
|
||||
"""
|
||||
final_uri = uri
|
||||
if isinstance(params, dict):
|
||||
final_uri += '?' + urlencode(params)
|
||||
headers = await self._pre_headers(final_uri)
|
||||
base_url = (zhihu_constant.ZHIHU_URL if "/p/" not in uri else zhihu_constant.ZHIHU_ZHUANLAN_URL)
|
||||
return await self.request(method="GET", url=base_url + final_uri, headers=headers, **kwargs)
|
||||
|
||||
async def pong(self) -> bool:
|
||||
"""
|
||||
用于检查登录态是否失效了
|
||||
Returns:
|
||||
|
||||
"""
|
||||
utils.logger.info("[ZhiHuClient.pong] Begin to pong zhihu...")
|
||||
ping_flag = False
|
||||
try:
|
||||
res = await self.get_current_user_info()
|
||||
if res.get("uid") and res.get("name"):
|
||||
ping_flag = True
|
||||
utils.logger.info("[ZhiHuClient.pong] Ping zhihu successfully")
|
||||
else:
|
||||
utils.logger.error(f"[ZhiHuClient.pong] Ping zhihu failed, response data: {res}")
|
||||
except Exception as e:
|
||||
utils.logger.error(f"[ZhiHuClient.pong] Ping zhihu failed: {e}, and try to login again...")
|
||||
ping_flag = False
|
||||
return ping_flag
|
||||
|
||||
async def update_cookies(self, browser_context: BrowserContext):
|
||||
"""
|
||||
API客户端提供的更新cookies方法,一般情况下登录成功后会调用此方法
|
||||
Args:
|
||||
browser_context: 浏览器上下文对象
|
||||
|
||||
Returns:
|
||||
|
||||
"""
|
||||
cookie_str, cookie_dict = utils.convert_cookies(await browser_context.cookies())
|
||||
self.default_headers["cookie"] = cookie_str
|
||||
self.cookie_dict = cookie_dict
|
||||
|
||||
async def get_current_user_info(self) -> Dict:
|
||||
"""
|
||||
获取当前登录用户信息
|
||||
Returns:
|
||||
|
||||
"""
|
||||
params = {"include": "email,is_active,is_bind_phone"}
|
||||
return await self.get("/api/v4/me", params)
|
||||
|
||||
async def get_note_by_keyword(
|
||||
self,
|
||||
keyword: str,
|
||||
page: int = 1,
|
||||
page_size: int = 20,
|
||||
sort: SearchSort = SearchSort.DEFAULT,
|
||||
note_type: SearchType = SearchType.DEFAULT,
|
||||
search_time: SearchTime = SearchTime.DEFAULT,
|
||||
) -> List[ZhihuContent]:
|
||||
"""
|
||||
根据关键词搜索
|
||||
Args:
|
||||
keyword: 关键词
|
||||
page: 第几页
|
||||
page_size: 分页size
|
||||
sort: 排序
|
||||
note_type: 搜索结果类型
|
||||
search_time: 搜索多久时间的结果
|
||||
|
||||
Returns:
|
||||
|
||||
"""
|
||||
uri = "/api/v4/search_v3"
|
||||
params = {
|
||||
"gk_version": "gz-gaokao",
|
||||
"t": "general",
|
||||
"q": keyword,
|
||||
"correction": 1,
|
||||
"offset": (page - 1) * page_size,
|
||||
"limit": page_size,
|
||||
"filter_fields": "",
|
||||
"lc_idx": (page - 1) * page_size,
|
||||
"show_all_topics": 0,
|
||||
"search_source": "Filter",
|
||||
"time_interval": search_time.value,
|
||||
"sort": sort.value,
|
||||
"vertical": note_type.value,
|
||||
}
|
||||
search_res = await self.get(uri, params)
|
||||
utils.logger.info(f"[ZhiHuClient.get_note_by_keyword] Search result: {search_res}")
|
||||
return self._extractor.extract_contents_from_search(search_res)
|
||||
|
||||
async def get_root_comments(
|
||||
self,
|
||||
content_id: str,
|
||||
content_type: str,
|
||||
offset: str = "",
|
||||
limit: int = 10,
|
||||
order_by: str = "score",
|
||||
) -> Dict:
|
||||
"""
|
||||
获取内容的一级评论
|
||||
Args:
|
||||
content_id: 内容ID
|
||||
content_type: 内容类型(answer, article, zvideo)
|
||||
offset:
|
||||
limit:
|
||||
order_by:
|
||||
|
||||
Returns:
|
||||
|
||||
"""
|
||||
uri = f"/api/v4/comment_v5/{content_type}s/{content_id}/root_comment"
|
||||
params = {"order": order_by, "offset": offset, "limit": limit}
|
||||
return await self.get(uri, params)
|
||||
# uri = f"/api/v4/{content_type}s/{content_id}/root_comments"
|
||||
# params = {
|
||||
# "order": order_by,
|
||||
# "offset": offset,
|
||||
# "limit": limit
|
||||
# }
|
||||
# return await self.get(uri, params)
|
||||
|
||||
async def get_child_comments(
|
||||
self,
|
||||
root_comment_id: str,
|
||||
offset: str = "",
|
||||
limit: int = 10,
|
||||
order_by: str = "sort",
|
||||
) -> Dict:
|
||||
"""
|
||||
获取一级评论下的子评论
|
||||
Args:
|
||||
root_comment_id:
|
||||
offset:
|
||||
limit:
|
||||
order_by:
|
||||
|
||||
Returns:
|
||||
|
||||
"""
|
||||
uri = f"/api/v4/comment_v5/comment/{root_comment_id}/child_comment"
|
||||
params = {
|
||||
"order": order_by,
|
||||
"offset": offset,
|
||||
"limit": limit,
|
||||
}
|
||||
return await self.get(uri, params)
|
||||
|
||||
async def get_note_all_comments(
|
||||
self,
|
||||
content: ZhihuContent,
|
||||
crawl_interval: float = 1.0,
|
||||
callback: Optional[Callable] = None,
|
||||
) -> List[ZhihuComment]:
|
||||
"""
|
||||
获取指定帖子下的所有一级评论,该方法会一直查找一个帖子下的所有评论信息
|
||||
Args:
|
||||
content: 内容详情对象(问题|文章|视频)
|
||||
crawl_interval: 爬取一次笔记的延迟单位(秒)
|
||||
callback: 一次笔记爬取结束后
|
||||
|
||||
Returns:
|
||||
|
||||
"""
|
||||
result: List[ZhihuComment] = []
|
||||
is_end: bool = False
|
||||
offset: str = ""
|
||||
limit: int = 10
|
||||
while not is_end:
|
||||
root_comment_res = await self.get_root_comments(content.content_id, content.content_type, offset, limit)
|
||||
if not root_comment_res:
|
||||
break
|
||||
paging_info = root_comment_res.get("paging", {})
|
||||
is_end = paging_info.get("is_end")
|
||||
offset = self._extractor.extract_offset(paging_info)
|
||||
comments = self._extractor.extract_comments(content, root_comment_res.get("data"))
|
||||
|
||||
if not comments:
|
||||
break
|
||||
|
||||
if callback:
|
||||
await callback(comments)
|
||||
|
||||
result.extend(comments)
|
||||
await self.get_comments_all_sub_comments(content, comments, crawl_interval=crawl_interval, callback=callback)
|
||||
await asyncio.sleep(crawl_interval)
|
||||
return result
|
||||
|
||||
async def get_comments_all_sub_comments(
|
||||
self,
|
||||
content: ZhihuContent,
|
||||
comments: List[ZhihuComment],
|
||||
crawl_interval: float = 1.0,
|
||||
callback: Optional[Callable] = None,
|
||||
) -> List[ZhihuComment]:
|
||||
"""
|
||||
获取指定评论下的所有子评论
|
||||
Args:
|
||||
content: 内容详情对象(问题|文章|视频)
|
||||
comments: 评论列表
|
||||
crawl_interval: 爬取一次笔记的延迟单位(秒)
|
||||
callback: 一次笔记爬取结束后
|
||||
|
||||
Returns:
|
||||
|
||||
"""
|
||||
if not config.ENABLE_GET_SUB_COMMENTS:
|
||||
return []
|
||||
|
||||
all_sub_comments: List[ZhihuComment] = []
|
||||
for parment_comment in comments:
|
||||
if parment_comment.sub_comment_count == 0:
|
||||
continue
|
||||
|
||||
is_end: bool = False
|
||||
offset: str = ""
|
||||
limit: int = 10
|
||||
while not is_end:
|
||||
child_comment_res = await self.get_child_comments(parment_comment.comment_id, offset, limit)
|
||||
if not child_comment_res:
|
||||
break
|
||||
paging_info = child_comment_res.get("paging", {})
|
||||
is_end = paging_info.get("is_end")
|
||||
offset = self._extractor.extract_offset(paging_info)
|
||||
sub_comments = self._extractor.extract_comments(content, child_comment_res.get("data"))
|
||||
|
||||
if not sub_comments:
|
||||
break
|
||||
|
||||
if callback:
|
||||
await callback(sub_comments)
|
||||
|
||||
all_sub_comments.extend(sub_comments)
|
||||
await asyncio.sleep(crawl_interval)
|
||||
return all_sub_comments
|
||||
|
||||
async def get_creator_info(self, url_token: str) -> Optional[ZhihuCreator]:
|
||||
"""
|
||||
获取创作者信息
|
||||
Args:
|
||||
url_token:
|
||||
|
||||
Returns:
|
||||
|
||||
"""
|
||||
uri = f"/people/{url_token}"
|
||||
html_content: str = await self.get(uri, return_response=True)
|
||||
return self._extractor.extract_creator(url_token, html_content)
|
||||
|
||||
async def get_creator_answers(self, url_token: str, offset: int = 0, limit: int = 20) -> Dict:
|
||||
"""
|
||||
获取创作者的回答
|
||||
Args:
|
||||
url_token:
|
||||
offset:
|
||||
limit:
|
||||
|
||||
Returns:
|
||||
|
||||
|
||||
"""
|
||||
uri = f"/api/v4/members/{url_token}/answers"
|
||||
params = {
|
||||
"include":
|
||||
"data[*].is_normal,admin_closed_comment,reward_info,is_collapsed,annotation_action,annotation_detail,collapse_reason,collapsed_by,suggest_edit,comment_count,can_comment,content,editable_content,attachment,voteup_count,reshipment_settings,comment_permission,created_time,updated_time,review_info,excerpt,paid_info,reaction_instruction,is_labeled,label_info,relationship.is_authorized,voting,is_author,is_thanked,is_nothelp;data[*].vessay_info;data[*].author.badge[?(type=best_answerer)].topics;data[*].author.vip_info;data[*].question.has_publishing_draft,relationship",
|
||||
"offset": offset,
|
||||
"limit": limit,
|
||||
"order_by": "created"
|
||||
}
|
||||
return await self.get(uri, params)
|
||||
|
||||
async def get_creator_articles(self, url_token: str, offset: int = 0, limit: int = 20) -> Dict:
|
||||
"""
|
||||
获取创作者的文章
|
||||
Args:
|
||||
url_token:
|
||||
offset:
|
||||
limit:
|
||||
|
||||
Returns:
|
||||
|
||||
"""
|
||||
uri = f"/api/v4/members/{url_token}/articles"
|
||||
params = {
|
||||
"include":
|
||||
"data[*].comment_count,suggest_edit,is_normal,thumbnail_extra_info,thumbnail,can_comment,comment_permission,admin_closed_comment,content,voteup_count,created,updated,upvoted_followees,voting,review_info,reaction_instruction,is_labeled,label_info;data[*].vessay_info;data[*].author.badge[?(type=best_answerer)].topics;data[*].author.vip_info;",
|
||||
"offset": offset,
|
||||
"limit": limit,
|
||||
"order_by": "created"
|
||||
}
|
||||
return await self.get(uri, params)
|
||||
|
||||
async def get_creator_videos(self, url_token: str, offset: int = 0, limit: int = 20) -> Dict:
|
||||
"""
|
||||
获取创作者的视频
|
||||
Args:
|
||||
url_token:
|
||||
offset:
|
||||
limit:
|
||||
|
||||
Returns:
|
||||
|
||||
"""
|
||||
uri = f"/api/v4/members/{url_token}/zvideos"
|
||||
params = {
|
||||
"include": "similar_zvideo,creation_relationship,reaction_instruction",
|
||||
"offset": offset,
|
||||
"limit": limit,
|
||||
"similar_aggregation": "true",
|
||||
}
|
||||
return await self.get(uri, params)
|
||||
|
||||
async def get_all_anwser_by_creator(self, creator: ZhihuCreator, crawl_interval: float = 1.0, callback: Optional[Callable] = None) -> List[ZhihuContent]:
|
||||
"""
|
||||
获取创作者的所有回答
|
||||
Args:
|
||||
creator: 创作者信息
|
||||
crawl_interval: 爬取一次笔记的延迟单位(秒)
|
||||
callback: 一次笔记爬取结束后
|
||||
|
||||
Returns:
|
||||
|
||||
"""
|
||||
all_contents: List[ZhihuContent] = []
|
||||
is_end: bool = False
|
||||
offset: int = 0
|
||||
limit: int = 20
|
||||
while not is_end:
|
||||
res = await self.get_creator_answers(creator.url_token, offset, limit)
|
||||
if not res:
|
||||
break
|
||||
utils.logger.info(f"[ZhiHuClient.get_all_anwser_by_creator] Get creator {creator.url_token} answers: {res}")
|
||||
paging_info = res.get("paging", {})
|
||||
is_end = paging_info.get("is_end")
|
||||
contents = self._extractor.extract_content_list_from_creator(res.get("data"))
|
||||
if callback:
|
||||
await callback(contents)
|
||||
all_contents.extend(contents)
|
||||
offset += limit
|
||||
await asyncio.sleep(crawl_interval)
|
||||
return all_contents
|
||||
|
||||
async def get_all_articles_by_creator(
|
||||
self,
|
||||
creator: ZhihuCreator,
|
||||
crawl_interval: float = 1.0,
|
||||
callback: Optional[Callable] = None,
|
||||
) -> List[ZhihuContent]:
|
||||
"""
|
||||
获取创作者的所有文章
|
||||
Args:
|
||||
creator:
|
||||
crawl_interval:
|
||||
callback:
|
||||
|
||||
Returns:
|
||||
|
||||
"""
|
||||
all_contents: List[ZhihuContent] = []
|
||||
is_end: bool = False
|
||||
offset: int = 0
|
||||
limit: int = 20
|
||||
while not is_end:
|
||||
res = await self.get_creator_articles(creator.url_token, offset, limit)
|
||||
if not res:
|
||||
break
|
||||
paging_info = res.get("paging", {})
|
||||
is_end = paging_info.get("is_end")
|
||||
contents = self._extractor.extract_content_list_from_creator(res.get("data"))
|
||||
if callback:
|
||||
await callback(contents)
|
||||
all_contents.extend(contents)
|
||||
offset += limit
|
||||
await asyncio.sleep(crawl_interval)
|
||||
return all_contents
|
||||
|
||||
async def get_all_videos_by_creator(
|
||||
self,
|
||||
creator: ZhihuCreator,
|
||||
crawl_interval: float = 1.0,
|
||||
callback: Optional[Callable] = None,
|
||||
) -> List[ZhihuContent]:
|
||||
"""
|
||||
获取创作者的所有视频
|
||||
Args:
|
||||
creator:
|
||||
crawl_interval:
|
||||
callback:
|
||||
|
||||
Returns:
|
||||
|
||||
"""
|
||||
all_contents: List[ZhihuContent] = []
|
||||
is_end: bool = False
|
||||
offset: int = 0
|
||||
limit: int = 20
|
||||
while not is_end:
|
||||
res = await self.get_creator_videos(creator.url_token, offset, limit)
|
||||
if not res:
|
||||
break
|
||||
paging_info = res.get("paging", {})
|
||||
is_end = paging_info.get("is_end")
|
||||
contents = self._extractor.extract_content_list_from_creator(res.get("data"))
|
||||
if callback:
|
||||
await callback(contents)
|
||||
all_contents.extend(contents)
|
||||
offset += limit
|
||||
await asyncio.sleep(crawl_interval)
|
||||
return all_contents
|
||||
|
||||
async def get_answer_info(
|
||||
self,
|
||||
question_id: str,
|
||||
answer_id: str,
|
||||
) -> Optional[ZhihuContent]:
|
||||
"""
|
||||
获取回答信息
|
||||
Args:
|
||||
question_id:
|
||||
answer_id:
|
||||
|
||||
Returns:
|
||||
|
||||
"""
|
||||
uri = f"/question/{question_id}/answer/{answer_id}"
|
||||
response_html = await self.get(uri, return_response=True)
|
||||
return self._extractor.extract_answer_content_from_html(response_html)
|
||||
|
||||
async def get_article_info(self, article_id: str) -> Optional[ZhihuContent]:
|
||||
"""
|
||||
获取文章信息
|
||||
Args:
|
||||
article_id:
|
||||
|
||||
Returns:
|
||||
|
||||
"""
|
||||
uri = f"/p/{article_id}"
|
||||
response_html = await self.get(uri, return_response=True)
|
||||
return self._extractor.extract_article_content_from_html(response_html)
|
||||
|
||||
async def get_video_info(self, video_id: str) -> Optional[ZhihuContent]:
|
||||
"""
|
||||
获取视频信息
|
||||
Args:
|
||||
video_id:
|
||||
|
||||
Returns:
|
||||
|
||||
"""
|
||||
uri = f"/zvideo/{video_id}"
|
||||
response_html = await self.get(uri, return_response=True)
|
||||
return self._extractor.extract_zvideo_content_from_html(response_html)
|
||||
@@ -0,0 +1,455 @@
|
||||
# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则:
|
||||
# 1. 不得用于任何商业用途。
|
||||
# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。
|
||||
# 3. 不得进行大规模爬取或对平台造成运营干扰。
|
||||
# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。
|
||||
# 5. 不得用于任何非法或不当的用途。
|
||||
#
|
||||
# 详细许可条款请参阅项目根目录下的LICENSE文件。
|
||||
# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。
|
||||
|
||||
|
||||
# -*- coding: utf-8 -*-
|
||||
import asyncio
|
||||
import os
|
||||
import random
|
||||
from asyncio import Task
|
||||
from typing import Dict, List, Optional, Tuple, cast
|
||||
|
||||
from playwright.async_api import (
|
||||
BrowserContext,
|
||||
BrowserType,
|
||||
Page,
|
||||
Playwright,
|
||||
async_playwright,
|
||||
)
|
||||
|
||||
import config
|
||||
from constant import zhihu as constant
|
||||
from base.base_crawler import AbstractCrawler
|
||||
from model.m_zhihu import ZhihuContent, ZhihuCreator
|
||||
from proxy.proxy_ip_pool import IpInfoModel, create_ip_pool
|
||||
from store import zhihu as zhihu_store
|
||||
from tools import utils
|
||||
from tools.cdp_browser import CDPBrowserManager
|
||||
from var import crawler_type_var, source_keyword_var
|
||||
|
||||
from .client import ZhiHuClient
|
||||
from .exception import DataFetchError
|
||||
from .help import ZhihuExtractor, judge_zhihu_url
|
||||
from .login import ZhiHuLogin
|
||||
|
||||
|
||||
class ZhihuCrawler(AbstractCrawler):
|
||||
context_page: Page
|
||||
zhihu_client: ZhiHuClient
|
||||
browser_context: BrowserContext
|
||||
cdp_manager: Optional[CDPBrowserManager]
|
||||
|
||||
def __init__(self) -> None:
|
||||
self.index_url = "https://www.zhihu.com"
|
||||
# self.user_agent = utils.get_user_agent()
|
||||
self.user_agent = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36"
|
||||
self._extractor = ZhihuExtractor()
|
||||
self.cdp_manager = None
|
||||
|
||||
async def start(self) -> None:
|
||||
"""
|
||||
Start the crawler
|
||||
Returns:
|
||||
|
||||
"""
|
||||
playwright_proxy_format, httpx_proxy_format = None, None
|
||||
if config.ENABLE_IP_PROXY:
|
||||
ip_proxy_pool = await create_ip_pool(
|
||||
config.IP_PROXY_POOL_COUNT, enable_validate_ip=True
|
||||
)
|
||||
ip_proxy_info: IpInfoModel = await ip_proxy_pool.get_proxy()
|
||||
playwright_proxy_format, httpx_proxy_format = utils.format_proxy_info(
|
||||
ip_proxy_info
|
||||
)
|
||||
|
||||
async with async_playwright() as playwright:
|
||||
# 根据配置选择启动模式
|
||||
if config.ENABLE_CDP_MODE:
|
||||
utils.logger.info("[ZhihuCrawler] 使用CDP模式启动浏览器")
|
||||
self.browser_context = await self.launch_browser_with_cdp(
|
||||
playwright,
|
||||
playwright_proxy_format,
|
||||
self.user_agent,
|
||||
headless=config.CDP_HEADLESS,
|
||||
)
|
||||
else:
|
||||
utils.logger.info("[ZhihuCrawler] 使用标准模式启动浏览器")
|
||||
# Launch a browser context.
|
||||
chromium = playwright.chromium
|
||||
self.browser_context = await self.launch_browser(
|
||||
chromium, None, self.user_agent, headless=config.HEADLESS
|
||||
)
|
||||
# stealth.min.js is a js script to prevent the website from detecting the crawler.
|
||||
await self.browser_context.add_init_script(path="libs/stealth.min.js")
|
||||
|
||||
self.context_page = await self.browser_context.new_page()
|
||||
await self.context_page.goto(self.index_url, wait_until="domcontentloaded")
|
||||
|
||||
# Create a client to interact with the zhihu website.
|
||||
self.zhihu_client = await self.create_zhihu_client(httpx_proxy_format)
|
||||
if not await self.zhihu_client.pong():
|
||||
login_obj = ZhiHuLogin(
|
||||
login_type=config.LOGIN_TYPE,
|
||||
login_phone="", # input your phone number
|
||||
browser_context=self.browser_context,
|
||||
context_page=self.context_page,
|
||||
cookie_str=config.COOKIES,
|
||||
)
|
||||
await login_obj.begin()
|
||||
await self.zhihu_client.update_cookies(
|
||||
browser_context=self.browser_context
|
||||
)
|
||||
|
||||
# 知乎的搜索接口需要打开搜索页面之后cookies才能访问API,单独的首页不行
|
||||
utils.logger.info(
|
||||
"[ZhihuCrawler.start] Zhihu跳转到搜索页面获取搜索页面的Cookies,该过程需要5秒左右"
|
||||
)
|
||||
await self.context_page.goto(
|
||||
f"{self.index_url}/search?q=python&search_source=Guess&utm_content=search_hot&type=content"
|
||||
)
|
||||
await asyncio.sleep(5)
|
||||
await self.zhihu_client.update_cookies(browser_context=self.browser_context)
|
||||
|
||||
crawler_type_var.set(config.CRAWLER_TYPE)
|
||||
if config.CRAWLER_TYPE == "search":
|
||||
# Search for notes and retrieve their comment information.
|
||||
await self.search()
|
||||
elif config.CRAWLER_TYPE == "detail":
|
||||
# Get the information and comments of the specified post
|
||||
await self.get_specified_notes()
|
||||
elif config.CRAWLER_TYPE == "creator":
|
||||
# Get creator's information and their notes and comments
|
||||
await self.get_creators_and_notes()
|
||||
else:
|
||||
pass
|
||||
|
||||
utils.logger.info("[ZhihuCrawler.start] Zhihu Crawler finished ...")
|
||||
|
||||
async def search(self) -> None:
|
||||
"""Search for notes and retrieve their comment information."""
|
||||
utils.logger.info("[ZhihuCrawler.search] Begin search zhihu keywords")
|
||||
zhihu_limit_count = 20 # zhihu limit page fixed value
|
||||
if config.CRAWLER_MAX_NOTES_COUNT < zhihu_limit_count:
|
||||
config.CRAWLER_MAX_NOTES_COUNT = zhihu_limit_count
|
||||
start_page = config.START_PAGE
|
||||
for keyword in config.KEYWORDS.split(","):
|
||||
source_keyword_var.set(keyword)
|
||||
utils.logger.info(
|
||||
f"[ZhihuCrawler.search] Current search keyword: {keyword}"
|
||||
)
|
||||
page = 1
|
||||
while (
|
||||
page - start_page + 1
|
||||
) * zhihu_limit_count <= config.CRAWLER_MAX_NOTES_COUNT:
|
||||
if page < start_page:
|
||||
utils.logger.info(f"[ZhihuCrawler.search] Skip page {page}")
|
||||
page += 1
|
||||
continue
|
||||
|
||||
try:
|
||||
utils.logger.info(
|
||||
f"[ZhihuCrawler.search] search zhihu keyword: {keyword}, page: {page}"
|
||||
)
|
||||
content_list: List[ZhihuContent] = (
|
||||
await self.zhihu_client.get_note_by_keyword(
|
||||
keyword=keyword,
|
||||
page=page,
|
||||
)
|
||||
)
|
||||
utils.logger.info(
|
||||
f"[ZhihuCrawler.search] Search contents :{content_list}"
|
||||
)
|
||||
if not content_list:
|
||||
utils.logger.info("No more content!")
|
||||
break
|
||||
|
||||
page += 1
|
||||
for content in content_list:
|
||||
await zhihu_store.update_zhihu_content(content)
|
||||
|
||||
await self.batch_get_content_comments(content_list)
|
||||
except DataFetchError:
|
||||
utils.logger.error("[ZhihuCrawler.search] Search content error")
|
||||
return
|
||||
|
||||
async def batch_get_content_comments(self, content_list: List[ZhihuContent]):
|
||||
"""
|
||||
Batch get content comments
|
||||
Args:
|
||||
content_list:
|
||||
|
||||
Returns:
|
||||
|
||||
"""
|
||||
if not config.ENABLE_GET_COMMENTS:
|
||||
utils.logger.info(
|
||||
f"[ZhihuCrawler.batch_get_content_comments] Crawling comment mode is not enabled"
|
||||
)
|
||||
return
|
||||
|
||||
semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
|
||||
task_list: List[Task] = []
|
||||
for content_item in content_list:
|
||||
task = asyncio.create_task(
|
||||
self.get_comments(content_item, semaphore), name=content_item.content_id
|
||||
)
|
||||
task_list.append(task)
|
||||
await asyncio.gather(*task_list)
|
||||
|
||||
async def get_comments(
|
||||
self, content_item: ZhihuContent, semaphore: asyncio.Semaphore
|
||||
):
|
||||
"""
|
||||
Get note comments with keyword filtering and quantity limitation
|
||||
Args:
|
||||
content_item:
|
||||
semaphore:
|
||||
|
||||
Returns:
|
||||
|
||||
"""
|
||||
async with semaphore:
|
||||
utils.logger.info(
|
||||
f"[ZhihuCrawler.get_comments] Begin get note id comments {content_item.content_id}"
|
||||
)
|
||||
await self.zhihu_client.get_note_all_comments(
|
||||
content=content_item,
|
||||
crawl_interval=random.random(),
|
||||
callback=zhihu_store.batch_update_zhihu_note_comments,
|
||||
)
|
||||
|
||||
async def get_creators_and_notes(self) -> None:
|
||||
"""
|
||||
Get creator's information and their notes and comments
|
||||
Returns:
|
||||
|
||||
"""
|
||||
utils.logger.info(
|
||||
"[ZhihuCrawler.get_creators_and_notes] Begin get xiaohongshu creators"
|
||||
)
|
||||
for user_link in config.ZHIHU_CREATOR_URL_LIST:
|
||||
utils.logger.info(
|
||||
f"[ZhihuCrawler.get_creators_and_notes] Begin get creator {user_link}"
|
||||
)
|
||||
user_url_token = user_link.split("/")[-1]
|
||||
# get creator detail info from web html content
|
||||
createor_info: ZhihuCreator = await self.zhihu_client.get_creator_info(
|
||||
url_token=user_url_token
|
||||
)
|
||||
if not createor_info:
|
||||
utils.logger.info(
|
||||
f"[ZhihuCrawler.get_creators_and_notes] Creator {user_url_token} not found"
|
||||
)
|
||||
continue
|
||||
|
||||
utils.logger.info(
|
||||
f"[ZhihuCrawler.get_creators_and_notes] Creator info: {createor_info}"
|
||||
)
|
||||
await zhihu_store.save_creator(creator=createor_info)
|
||||
|
||||
# 默认只提取回答信息,如果需要文章和视频,把下面的注释打开即可
|
||||
|
||||
# Get all anwser information of the creator
|
||||
all_content_list = await self.zhihu_client.get_all_anwser_by_creator(
|
||||
creator=createor_info,
|
||||
crawl_interval=random.random(),
|
||||
callback=zhihu_store.batch_update_zhihu_contents,
|
||||
)
|
||||
|
||||
# Get all articles of the creator's contents
|
||||
# all_content_list = await self.zhihu_client.get_all_articles_by_creator(
|
||||
# creator=createor_info,
|
||||
# crawl_interval=random.random(),
|
||||
# callback=zhihu_store.batch_update_zhihu_contents
|
||||
# )
|
||||
|
||||
# Get all videos of the creator's contents
|
||||
# all_content_list = await self.zhihu_client.get_all_videos_by_creator(
|
||||
# creator=createor_info,
|
||||
# crawl_interval=random.random(),
|
||||
# callback=zhihu_store.batch_update_zhihu_contents
|
||||
# )
|
||||
|
||||
# Get all comments of the creator's contents
|
||||
await self.batch_get_content_comments(all_content_list)
|
||||
|
||||
async def get_note_detail(
|
||||
self, full_note_url: str, semaphore: asyncio.Semaphore
|
||||
) -> Optional[ZhihuContent]:
|
||||
"""
|
||||
Get note detail
|
||||
Args:
|
||||
full_note_url: str
|
||||
semaphore:
|
||||
|
||||
Returns:
|
||||
|
||||
"""
|
||||
async with semaphore:
|
||||
utils.logger.info(
|
||||
f"[ZhihuCrawler.get_specified_notes] Begin get specified note {full_note_url}"
|
||||
)
|
||||
# judge note type
|
||||
note_type: str = judge_zhihu_url(full_note_url)
|
||||
if note_type == constant.ANSWER_NAME:
|
||||
question_id = full_note_url.split("/")[-3]
|
||||
answer_id = full_note_url.split("/")[-1]
|
||||
utils.logger.info(
|
||||
f"[ZhihuCrawler.get_specified_notes] Get answer info, question_id: {question_id}, answer_id: {answer_id}"
|
||||
)
|
||||
return await self.zhihu_client.get_answer_info(question_id, answer_id)
|
||||
|
||||
elif note_type == constant.ARTICLE_NAME:
|
||||
article_id = full_note_url.split("/")[-1]
|
||||
utils.logger.info(
|
||||
f"[ZhihuCrawler.get_specified_notes] Get article info, article_id: {article_id}"
|
||||
)
|
||||
return await self.zhihu_client.get_article_info(article_id)
|
||||
|
||||
elif note_type == constant.VIDEO_NAME:
|
||||
video_id = full_note_url.split("/")[-1]
|
||||
utils.logger.info(
|
||||
f"[ZhihuCrawler.get_specified_notes] Get video info, video_id: {video_id}"
|
||||
)
|
||||
return await self.zhihu_client.get_video_info(video_id)
|
||||
|
||||
async def get_specified_notes(self):
|
||||
"""
|
||||
Get the information and comments of the specified post
|
||||
Returns:
|
||||
|
||||
"""
|
||||
get_note_detail_task_list = []
|
||||
for full_note_url in config.ZHIHU_SPECIFIED_ID_LIST:
|
||||
# remove query params
|
||||
full_note_url = full_note_url.split("?")[0]
|
||||
crawler_task = self.get_note_detail(
|
||||
full_note_url=full_note_url,
|
||||
semaphore=asyncio.Semaphore(config.MAX_CONCURRENCY_NUM),
|
||||
)
|
||||
get_note_detail_task_list.append(crawler_task)
|
||||
|
||||
need_get_comment_notes: List[ZhihuContent] = []
|
||||
note_details = await asyncio.gather(*get_note_detail_task_list)
|
||||
for index, note_detail in enumerate(note_details):
|
||||
if not note_detail:
|
||||
utils.logger.info(
|
||||
f"[ZhihuCrawler.get_specified_notes] Note {config.ZHIHU_SPECIFIED_ID_LIST[index]} not found"
|
||||
)
|
||||
continue
|
||||
|
||||
note_detail = cast(ZhihuContent, note_detail) # only for type check
|
||||
need_get_comment_notes.append(note_detail)
|
||||
await zhihu_store.update_zhihu_content(note_detail)
|
||||
|
||||
await self.batch_get_content_comments(need_get_comment_notes)
|
||||
|
||||
async def create_zhihu_client(self, httpx_proxy: Optional[str]) -> ZhiHuClient:
|
||||
"""Create zhihu client"""
|
||||
utils.logger.info(
|
||||
"[ZhihuCrawler.create_zhihu_client] Begin create zhihu API client ..."
|
||||
)
|
||||
cookie_str, cookie_dict = utils.convert_cookies(
|
||||
await self.browser_context.cookies()
|
||||
)
|
||||
zhihu_client_obj = ZhiHuClient(
|
||||
proxy=httpx_proxy,
|
||||
headers={
|
||||
"accept": "*/*",
|
||||
"accept-language": "zh-CN,zh;q=0.9",
|
||||
"cookie": cookie_str,
|
||||
"priority": "u=1, i",
|
||||
"referer": "https://www.zhihu.com/search?q=python&time_interval=a_year&type=content",
|
||||
"user-agent": self.user_agent,
|
||||
"x-api-version": "3.0.91",
|
||||
"x-app-za": "OS=Web",
|
||||
"x-requested-with": "fetch",
|
||||
"x-zse-93": "101_3_3.0",
|
||||
},
|
||||
playwright_page=self.context_page,
|
||||
cookie_dict=cookie_dict,
|
||||
)
|
||||
return zhihu_client_obj
|
||||
|
||||
async def launch_browser(
|
||||
self,
|
||||
chromium: BrowserType,
|
||||
playwright_proxy: Optional[Dict],
|
||||
user_agent: Optional[str],
|
||||
headless: bool = True,
|
||||
) -> BrowserContext:
|
||||
"""Launch browser and create browser context"""
|
||||
utils.logger.info(
|
||||
"[ZhihuCrawler.launch_browser] Begin create browser context ..."
|
||||
)
|
||||
if config.SAVE_LOGIN_STATE:
|
||||
# feat issue #14
|
||||
# we will save login state to avoid login every time
|
||||
user_data_dir = os.path.join(
|
||||
os.getcwd(), "browser_data", config.USER_DATA_DIR % config.PLATFORM
|
||||
) # type: ignore
|
||||
browser_context = await chromium.launch_persistent_context(
|
||||
user_data_dir=user_data_dir,
|
||||
accept_downloads=True,
|
||||
headless=headless,
|
||||
proxy=playwright_proxy, # type: ignore
|
||||
viewport={"width": 1920, "height": 1080},
|
||||
user_agent=user_agent,
|
||||
)
|
||||
return browser_context
|
||||
else:
|
||||
browser = await chromium.launch(headless=headless, proxy=playwright_proxy) # type: ignore
|
||||
browser_context = await browser.new_context(
|
||||
viewport={"width": 1920, "height": 1080}, user_agent=user_agent
|
||||
)
|
||||
return browser_context
|
||||
|
||||
async def launch_browser_with_cdp(
|
||||
self,
|
||||
playwright: Playwright,
|
||||
playwright_proxy: Optional[Dict],
|
||||
user_agent: Optional[str],
|
||||
headless: bool = True,
|
||||
) -> BrowserContext:
|
||||
"""
|
||||
使用CDP模式启动浏览器
|
||||
"""
|
||||
try:
|
||||
self.cdp_manager = CDPBrowserManager()
|
||||
browser_context = await self.cdp_manager.launch_and_connect(
|
||||
playwright=playwright,
|
||||
playwright_proxy=playwright_proxy,
|
||||
user_agent=user_agent,
|
||||
headless=headless,
|
||||
)
|
||||
|
||||
# 显示浏览器信息
|
||||
browser_info = await self.cdp_manager.get_browser_info()
|
||||
utils.logger.info(f"[ZhihuCrawler] CDP浏览器信息: {browser_info}")
|
||||
|
||||
return browser_context
|
||||
|
||||
except Exception as e:
|
||||
utils.logger.error(f"[ZhihuCrawler] CDP模式启动失败,回退到标准模式: {e}")
|
||||
# 回退到标准模式
|
||||
chromium = playwright.chromium
|
||||
return await self.launch_browser(
|
||||
chromium, playwright_proxy, user_agent, headless
|
||||
)
|
||||
|
||||
async def close(self):
|
||||
"""Close browser context"""
|
||||
# 如果使用CDP模式,需要特殊处理
|
||||
if self.cdp_manager:
|
||||
await self.cdp_manager.cleanup()
|
||||
self.cdp_manager = None
|
||||
else:
|
||||
await self.browser_context.close()
|
||||
utils.logger.info("[ZhihuCrawler.close] Browser context closed ...")
|
||||
@@ -0,0 +1,23 @@
|
||||
# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则:
|
||||
# 1. 不得用于任何商业用途。
|
||||
# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。
|
||||
# 3. 不得进行大规模爬取或对平台造成运营干扰。
|
||||
# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。
|
||||
# 5. 不得用于任何非法或不当的用途。
|
||||
#
|
||||
# 详细许可条款请参阅项目根目录下的LICENSE文件。
|
||||
# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。
|
||||
|
||||
|
||||
from httpx import RequestError
|
||||
|
||||
|
||||
class DataFetchError(RequestError):
|
||||
"""something error when fetch"""
|
||||
|
||||
|
||||
class IPBlockError(RequestError):
|
||||
"""fetch so fast that the server block us ip"""
|
||||
|
||||
class ForbiddenError(RequestError):
|
||||
"""Forbidden"""
|
||||
@@ -0,0 +1,47 @@
|
||||
# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则:
|
||||
# 1. 不得用于任何商业用途。
|
||||
# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。
|
||||
# 3. 不得进行大规模爬取或对平台造成运营干扰。
|
||||
# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。
|
||||
# 5. 不得用于任何非法或不当的用途。
|
||||
#
|
||||
# 详细许可条款请参阅项目根目录下的LICENSE文件。
|
||||
# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。
|
||||
|
||||
|
||||
from enum import Enum
|
||||
from typing import NamedTuple
|
||||
|
||||
from constant import zhihu as zhihu_constant
|
||||
|
||||
|
||||
class SearchTime(Enum):
|
||||
"""
|
||||
搜索时间范围
|
||||
"""
|
||||
DEFAULT = "" # 不限时间
|
||||
ONE_DAY = "a_day" # 一天内
|
||||
ONE_WEEK = "a_week" # 一周内
|
||||
ONE_MONTH = "a_month" # 一个月内
|
||||
THREE_MONTH = "three_months" # 三个月内
|
||||
HALF_YEAR = "half_a_year" # 半年内
|
||||
ONE_YEAR = "a_year" # 一年内
|
||||
|
||||
|
||||
class SearchType(Enum):
|
||||
"""
|
||||
搜索结果类型
|
||||
"""
|
||||
DEFAULT = "" # 不限类型
|
||||
ANSWER = zhihu_constant.ANSWER_NAME # 只看回答
|
||||
ARTICLE = zhihu_constant.ARTICLE_NAME # 只看文章
|
||||
VIDEO = zhihu_constant.VIDEO_NAME # 只看视频
|
||||
|
||||
|
||||
class SearchSort(Enum):
|
||||
"""
|
||||
搜索结果排序
|
||||
"""
|
||||
DEFAULT = "" # 综合排序
|
||||
UPVOTED_COUNT = "upvoted_count" # 最多赞同
|
||||
CREATE_TIME = "created_time" # 最新发布
|
||||
@@ -0,0 +1,467 @@
|
||||
# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则:
|
||||
# 1. 不得用于任何商业用途。
|
||||
# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。
|
||||
# 3. 不得进行大规模爬取或对平台造成运营干扰。
|
||||
# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。
|
||||
# 5. 不得用于任何非法或不当的用途。
|
||||
#
|
||||
# 详细许可条款请参阅项目根目录下的LICENSE文件。
|
||||
# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。
|
||||
|
||||
|
||||
# -*- coding: utf-8 -*-
|
||||
import json
|
||||
from typing import Dict, List, Optional
|
||||
from urllib.parse import parse_qs, urlparse
|
||||
|
||||
import execjs
|
||||
from parsel import Selector
|
||||
|
||||
from constant import zhihu as zhihu_constant
|
||||
from model.m_zhihu import ZhihuComment, ZhihuContent, ZhihuCreator
|
||||
from tools import utils
|
||||
from tools.crawler_util import extract_text_from_html
|
||||
|
||||
ZHIHU_SGIN_JS = None
|
||||
|
||||
|
||||
def sign(url: str, cookies: str) -> Dict:
|
||||
"""
|
||||
zhihu sign algorithm
|
||||
Args:
|
||||
url: request url with query string
|
||||
cookies: request cookies with d_c0 key
|
||||
|
||||
Returns:
|
||||
|
||||
"""
|
||||
global ZHIHU_SGIN_JS
|
||||
if not ZHIHU_SGIN_JS:
|
||||
with open("libs/zhihu.js", mode="r", encoding="utf-8-sig") as f:
|
||||
ZHIHU_SGIN_JS = execjs.compile(f.read())
|
||||
|
||||
return ZHIHU_SGIN_JS.call("get_sign", url, cookies)
|
||||
|
||||
|
||||
class ZhihuExtractor:
|
||||
def __init__(self):
|
||||
pass
|
||||
|
||||
def extract_contents_from_search(self, json_data: Dict) -> List[ZhihuContent]:
|
||||
"""
|
||||
extract zhihu contents
|
||||
Args:
|
||||
json_data: zhihu json data
|
||||
|
||||
Returns:
|
||||
|
||||
"""
|
||||
if not json_data:
|
||||
return []
|
||||
|
||||
search_result: List[Dict] = json_data.get("data", [])
|
||||
search_result = [s_item for s_item in search_result if s_item.get("type") in ['search_result', 'zvideo']]
|
||||
return self._extract_content_list([sr_item.get("object") for sr_item in search_result if sr_item.get("object")])
|
||||
|
||||
|
||||
def _extract_content_list(self, content_list: List[Dict]) -> List[ZhihuContent]:
|
||||
"""
|
||||
extract zhihu content list
|
||||
Args:
|
||||
content_list:
|
||||
|
||||
Returns:
|
||||
|
||||
"""
|
||||
if not content_list:
|
||||
return []
|
||||
|
||||
res: List[ZhihuContent] = []
|
||||
for content in content_list:
|
||||
if content.get("type") == zhihu_constant.ANSWER_NAME:
|
||||
res.append(self._extract_answer_content(content))
|
||||
elif content.get("type") == zhihu_constant.ARTICLE_NAME:
|
||||
res.append(self._extract_article_content(content))
|
||||
elif content.get("type") == zhihu_constant.VIDEO_NAME:
|
||||
res.append(self._extract_zvideo_content(content))
|
||||
else:
|
||||
continue
|
||||
return res
|
||||
|
||||
def _extract_answer_content(self, answer: Dict) -> ZhihuContent:
|
||||
"""
|
||||
extract zhihu answer content
|
||||
Args:
|
||||
answer: zhihu answer
|
||||
|
||||
Returns:
|
||||
"""
|
||||
res = ZhihuContent()
|
||||
res.content_id = answer.get("id")
|
||||
res.content_type = answer.get("type")
|
||||
res.content_text = extract_text_from_html(answer.get("content", ""))
|
||||
res.question_id = answer.get("question").get("id")
|
||||
res.content_url = f"{zhihu_constant.ZHIHU_URL}/question/{res.question_id}/answer/{res.content_id}"
|
||||
res.title = extract_text_from_html(answer.get("title", ""))
|
||||
res.desc = extract_text_from_html(answer.get("description", "") or answer.get("excerpt", ""))
|
||||
res.created_time = answer.get("created_time")
|
||||
res.updated_time = answer.get("updated_time")
|
||||
res.voteup_count = answer.get("voteup_count", 0)
|
||||
res.comment_count = answer.get("comment_count", 0)
|
||||
|
||||
# extract author info
|
||||
author_info = self._extract_content_or_comment_author(answer.get("author"))
|
||||
res.user_id = author_info.user_id
|
||||
res.user_link = author_info.user_link
|
||||
res.user_nickname = author_info.user_nickname
|
||||
res.user_avatar = author_info.user_avatar
|
||||
res.user_url_token = author_info.url_token
|
||||
return res
|
||||
|
||||
def _extract_article_content(self, article: Dict) -> ZhihuContent:
|
||||
"""
|
||||
extract zhihu article content
|
||||
Args:
|
||||
article: zhihu article
|
||||
|
||||
Returns:
|
||||
|
||||
"""
|
||||
res = ZhihuContent()
|
||||
res.content_id = article.get("id")
|
||||
res.content_type = article.get("type")
|
||||
res.content_text = extract_text_from_html(article.get("content"))
|
||||
res.content_url = f"{zhihu_constant.ZHIHU_ZHUANLAN_URL}/p/{res.content_id}"
|
||||
res.title = extract_text_from_html(article.get("title"))
|
||||
res.desc = extract_text_from_html(article.get("excerpt"))
|
||||
res.created_time = article.get("created_time", 0) or article.get("created", 0)
|
||||
res.updated_time = article.get("updated_time", 0) or article.get("updated", 0)
|
||||
res.voteup_count = article.get("voteup_count", 0)
|
||||
res.comment_count = article.get("comment_count", 0)
|
||||
|
||||
# extract author info
|
||||
author_info = self._extract_content_or_comment_author(article.get("author"))
|
||||
res.user_id = author_info.user_id
|
||||
res.user_link = author_info.user_link
|
||||
res.user_nickname = author_info.user_nickname
|
||||
res.user_avatar = author_info.user_avatar
|
||||
res.user_url_token = author_info.url_token
|
||||
return res
|
||||
|
||||
def _extract_zvideo_content(self, zvideo: Dict) -> ZhihuContent:
|
||||
"""
|
||||
extract zhihu zvideo content
|
||||
Args:
|
||||
zvideo:
|
||||
|
||||
Returns:
|
||||
|
||||
"""
|
||||
res = ZhihuContent()
|
||||
|
||||
if "video" in zvideo and isinstance(zvideo.get("video"), dict): # 说明是从创作者主页的视频列表接口来的
|
||||
res.content_url = f"{zhihu_constant.ZHIHU_URL}/zvideo/{res.content_id}"
|
||||
res.created_time = zvideo.get("published_at")
|
||||
res.updated_time = zvideo.get("updated_at")
|
||||
else:
|
||||
res.content_url = zvideo.get("video_url")
|
||||
res.created_time = zvideo.get("created_at")
|
||||
res.content_id = zvideo.get("id")
|
||||
res.content_type = zvideo.get("type")
|
||||
res.title = extract_text_from_html(zvideo.get("title"))
|
||||
res.desc = extract_text_from_html(zvideo.get("description"))
|
||||
res.voteup_count = zvideo.get("voteup_count")
|
||||
res.comment_count = zvideo.get("comment_count")
|
||||
|
||||
# extract author info
|
||||
author_info = self._extract_content_or_comment_author(zvideo.get("author"))
|
||||
res.user_id = author_info.user_id
|
||||
res.user_link = author_info.user_link
|
||||
res.user_nickname = author_info.user_nickname
|
||||
res.user_avatar = author_info.user_avatar
|
||||
res.user_url_token = author_info.url_token
|
||||
return res
|
||||
|
||||
@staticmethod
|
||||
def _extract_content_or_comment_author(author: Dict) -> ZhihuCreator:
|
||||
"""
|
||||
extract zhihu author
|
||||
Args:
|
||||
author:
|
||||
|
||||
Returns:
|
||||
|
||||
"""
|
||||
res = ZhihuCreator()
|
||||
try:
|
||||
if not author:
|
||||
return res
|
||||
if not author.get("id"):
|
||||
author = author.get("member")
|
||||
res.user_id = author.get("id")
|
||||
res.user_link = f"{zhihu_constant.ZHIHU_URL}/people/{author.get('url_token')}"
|
||||
res.user_nickname = author.get("name")
|
||||
res.user_avatar = author.get("avatar_url")
|
||||
res.url_token = author.get("url_token")
|
||||
|
||||
except Exception as e :
|
||||
utils.logger.warning(
|
||||
f"[ZhihuExtractor._extract_content_or_comment_author] User Maybe Blocked. {e}"
|
||||
)
|
||||
return res
|
||||
|
||||
def extract_comments(self, page_content: ZhihuContent, comments: List[Dict]) -> List[ZhihuComment]:
|
||||
"""
|
||||
extract zhihu comments
|
||||
Args:
|
||||
page_content: zhihu content object
|
||||
comments: zhihu comments
|
||||
|
||||
Returns:
|
||||
|
||||
"""
|
||||
if not comments:
|
||||
return []
|
||||
res: List[ZhihuComment] = []
|
||||
for comment in comments:
|
||||
if comment.get("type") != "comment":
|
||||
continue
|
||||
res.append(self._extract_comment(page_content, comment))
|
||||
return res
|
||||
|
||||
def _extract_comment(self, page_content: ZhihuContent, comment: Dict) -> ZhihuComment:
|
||||
"""
|
||||
extract zhihu comment
|
||||
Args:
|
||||
page_content: comment with content object
|
||||
comment: zhihu comment
|
||||
|
||||
Returns:
|
||||
|
||||
"""
|
||||
res = ZhihuComment()
|
||||
res.comment_id = str(comment.get("id", ""))
|
||||
res.parent_comment_id = comment.get("reply_comment_id")
|
||||
res.content = extract_text_from_html(comment.get("content"))
|
||||
res.publish_time = comment.get("created_time")
|
||||
res.ip_location = self._extract_comment_ip_location(comment.get("comment_tag", []))
|
||||
res.sub_comment_count = comment.get("child_comment_count")
|
||||
res.like_count = comment.get("like_count") if comment.get("like_count") else 0
|
||||
res.dislike_count = comment.get("dislike_count") if comment.get("dislike_count") else 0
|
||||
res.content_id = page_content.content_id
|
||||
res.content_type = page_content.content_type
|
||||
|
||||
# extract author info
|
||||
author_info = self._extract_content_or_comment_author(comment.get("author"))
|
||||
res.user_id = author_info.user_id
|
||||
res.user_link = author_info.user_link
|
||||
res.user_nickname = author_info.user_nickname
|
||||
res.user_avatar = author_info.user_avatar
|
||||
return res
|
||||
|
||||
@staticmethod
|
||||
def _extract_comment_ip_location(comment_tags: List[Dict]) -> str:
|
||||
"""
|
||||
extract comment ip location
|
||||
Args:
|
||||
comment_tags:
|
||||
|
||||
Returns:
|
||||
|
||||
"""
|
||||
if not comment_tags:
|
||||
return ""
|
||||
|
||||
for ct in comment_tags:
|
||||
if ct.get("type") == "ip_info":
|
||||
return ct.get("text")
|
||||
|
||||
return ""
|
||||
|
||||
@staticmethod
|
||||
def extract_offset(paging_info: Dict) -> str:
|
||||
"""
|
||||
extract offset
|
||||
Args:
|
||||
paging_info:
|
||||
|
||||
Returns:
|
||||
|
||||
"""
|
||||
# https://www.zhihu.com/api/v4/comment_v5/zvideos/1424368906836807681/root_comment?limit=10&offset=456770961_10125996085_0&order_by=score
|
||||
next_url = paging_info.get("next")
|
||||
if not next_url:
|
||||
return ""
|
||||
|
||||
parsed_url = urlparse(next_url)
|
||||
query_params = parse_qs(parsed_url.query)
|
||||
offset = query_params.get('offset', [""])[0]
|
||||
return offset
|
||||
|
||||
@staticmethod
|
||||
def _foramt_gender_text(gender: int) -> str:
|
||||
"""
|
||||
format gender text
|
||||
Args:
|
||||
gender:
|
||||
|
||||
Returns:
|
||||
|
||||
"""
|
||||
if gender == 1:
|
||||
return "男"
|
||||
elif gender == 0:
|
||||
return "女"
|
||||
else:
|
||||
return "未知"
|
||||
|
||||
|
||||
def extract_creator(self, user_url_token: str, html_content: str) -> Optional[ZhihuCreator]:
|
||||
"""
|
||||
extract zhihu creator
|
||||
Args:
|
||||
user_url_token : zhihu creator url token
|
||||
html_content: zhihu creator html content
|
||||
|
||||
Returns:
|
||||
|
||||
"""
|
||||
if not html_content:
|
||||
return None
|
||||
|
||||
js_init_data = Selector(text=html_content).xpath("//script[@id='js-initialData']/text()").get(default="").strip()
|
||||
if not js_init_data:
|
||||
return None
|
||||
|
||||
js_init_data_dict: Dict = json.loads(js_init_data)
|
||||
users_info: Dict = js_init_data_dict.get("initialState", {}).get("entities", {}).get("users", {})
|
||||
if not users_info:
|
||||
return None
|
||||
|
||||
creator_info: Dict = users_info.get(user_url_token)
|
||||
if not creator_info:
|
||||
return None
|
||||
|
||||
res = ZhihuCreator()
|
||||
res.user_id = creator_info.get("id")
|
||||
res.user_link = f"{zhihu_constant.ZHIHU_URL}/people/{user_url_token}"
|
||||
res.user_nickname = creator_info.get("name")
|
||||
res.user_avatar = creator_info.get("avatarUrl")
|
||||
res.url_token = creator_info.get("urlToken") or user_url_token
|
||||
res.gender = self._foramt_gender_text(creator_info.get("gender"))
|
||||
res.ip_location = creator_info.get("ipInfo")
|
||||
res.follows = creator_info.get("followingCount")
|
||||
res.fans = creator_info.get("followerCount")
|
||||
res.anwser_count = creator_info.get("answerCount")
|
||||
res.video_count = creator_info.get("zvideoCount")
|
||||
res.question_count = creator_info.get("questionCount")
|
||||
res.article_count = creator_info.get("articlesCount")
|
||||
res.column_count = creator_info.get("columnsCount")
|
||||
res.get_voteup_count = creator_info.get("voteupCount")
|
||||
return res
|
||||
|
||||
|
||||
def extract_content_list_from_creator(self, anwser_list: List[Dict]) -> List[ZhihuContent]:
|
||||
"""
|
||||
extract content list from creator
|
||||
Args:
|
||||
anwser_list:
|
||||
|
||||
Returns:
|
||||
|
||||
"""
|
||||
if not anwser_list:
|
||||
return []
|
||||
|
||||
return self._extract_content_list(anwser_list)
|
||||
|
||||
|
||||
|
||||
|
||||
def extract_answer_content_from_html(self, html_content: str) -> Optional[ZhihuContent]:
|
||||
"""
|
||||
extract zhihu answer content from html
|
||||
Args:
|
||||
html_content:
|
||||
|
||||
Returns:
|
||||
|
||||
"""
|
||||
js_init_data: str = Selector(text=html_content).xpath("//script[@id='js-initialData']/text()").get(default="")
|
||||
if not js_init_data:
|
||||
return None
|
||||
json_data: Dict = json.loads(js_init_data)
|
||||
answer_info: Dict = json_data.get("initialState", {}).get("entities", {}).get("answers", {})
|
||||
if not answer_info:
|
||||
return None
|
||||
|
||||
return self._extract_answer_content(answer_info.get(list(answer_info.keys())[0]))
|
||||
|
||||
def extract_article_content_from_html(self, html_content: str) -> Optional[ZhihuContent]:
|
||||
"""
|
||||
extract zhihu article content from html
|
||||
Args:
|
||||
html_content:
|
||||
|
||||
Returns:
|
||||
|
||||
"""
|
||||
js_init_data: str = Selector(text=html_content).xpath("//script[@id='js-initialData']/text()").get(default="")
|
||||
if not js_init_data:
|
||||
return None
|
||||
json_data: Dict = json.loads(js_init_data)
|
||||
article_info: Dict = json_data.get("initialState", {}).get("entities", {}).get("articles", {})
|
||||
if not article_info:
|
||||
return None
|
||||
|
||||
return self._extract_article_content(article_info.get(list(article_info.keys())[0]))
|
||||
|
||||
def extract_zvideo_content_from_html(self, html_content: str) -> Optional[ZhihuContent]:
|
||||
"""
|
||||
extract zhihu zvideo content from html
|
||||
Args:
|
||||
html_content:
|
||||
|
||||
Returns:
|
||||
|
||||
"""
|
||||
js_init_data: str = Selector(text=html_content).xpath("//script[@id='js-initialData']/text()").get(default="")
|
||||
if not js_init_data:
|
||||
return None
|
||||
json_data: Dict = json.loads(js_init_data)
|
||||
zvideo_info: Dict = json_data.get("initialState", {}).get("entities", {}).get("zvideos", {})
|
||||
users: Dict = json_data.get("initialState", {}).get("entities", {}).get("users", {})
|
||||
if not zvideo_info:
|
||||
return None
|
||||
|
||||
# handler user info and video info
|
||||
video_detail_info: Dict = zvideo_info.get(list(zvideo_info.keys())[0])
|
||||
if not video_detail_info:
|
||||
return None
|
||||
if isinstance(video_detail_info.get("author"), str):
|
||||
author_name: str = video_detail_info.get("author")
|
||||
video_detail_info["author"] = users.get(author_name)
|
||||
|
||||
return self._extract_zvideo_content(video_detail_info)
|
||||
|
||||
|
||||
def judge_zhihu_url(note_detail_url: str) -> str:
|
||||
"""
|
||||
judge zhihu url type
|
||||
Args:
|
||||
note_detail_url:
|
||||
eg1: https://www.zhihu.com/question/123456789/answer/123456789 # answer
|
||||
eg2: https://www.zhihu.com/p/123456789 # article
|
||||
eg3: https://www.zhihu.com/zvideo/123456789 # zvideo
|
||||
|
||||
Returns:
|
||||
|
||||
"""
|
||||
if "/answer/" in note_detail_url:
|
||||
return zhihu_constant.ANSWER_NAME
|
||||
elif "/p/" in note_detail_url:
|
||||
return zhihu_constant.ARTICLE_NAME
|
||||
elif "/zvideo/" in note_detail_url:
|
||||
return zhihu_constant.VIDEO_NAME
|
||||
else:
|
||||
return ""
|
||||
@@ -0,0 +1,115 @@
|
||||
# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则:
|
||||
# 1. 不得用于任何商业用途。
|
||||
# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。
|
||||
# 3. 不得进行大规模爬取或对平台造成运营干扰。
|
||||
# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。
|
||||
# 5. 不得用于任何非法或不当的用途。
|
||||
#
|
||||
# 详细许可条款请参阅项目根目录下的LICENSE文件。
|
||||
# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。
|
||||
|
||||
|
||||
# -*- coding: utf-8 -*-
|
||||
import asyncio
|
||||
import functools
|
||||
import sys
|
||||
from typing import Optional
|
||||
|
||||
from playwright.async_api import BrowserContext, Page
|
||||
from tenacity import (RetryError, retry, retry_if_result, stop_after_attempt,
|
||||
wait_fixed)
|
||||
|
||||
import config
|
||||
from base.base_crawler import AbstractLogin
|
||||
from tools import utils
|
||||
|
||||
|
||||
class ZhiHuLogin(AbstractLogin):
|
||||
|
||||
def __init__(self,
|
||||
login_type: str,
|
||||
browser_context: BrowserContext,
|
||||
context_page: Page,
|
||||
login_phone: Optional[str] = "",
|
||||
cookie_str: str = ""
|
||||
):
|
||||
config.LOGIN_TYPE = login_type
|
||||
self.browser_context = browser_context
|
||||
self.context_page = context_page
|
||||
self.login_phone = login_phone
|
||||
self.cookie_str = cookie_str
|
||||
|
||||
@retry(stop=stop_after_attempt(600), wait=wait_fixed(1), retry=retry_if_result(lambda value: value is False))
|
||||
async def check_login_state(self) -> bool:
|
||||
"""
|
||||
Check if the current login status is successful and return True otherwise return False
|
||||
Returns:
|
||||
|
||||
"""
|
||||
current_cookie = await self.browser_context.cookies()
|
||||
_, cookie_dict = utils.convert_cookies(current_cookie)
|
||||
current_web_session = cookie_dict.get("z_c0")
|
||||
if current_web_session:
|
||||
return True
|
||||
return False
|
||||
|
||||
async def begin(self):
|
||||
"""Start login zhihu"""
|
||||
utils.logger.info("[ZhiHu.begin] Begin login zhihu ...")
|
||||
if config.LOGIN_TYPE == "qrcode":
|
||||
await self.login_by_qrcode()
|
||||
elif config.LOGIN_TYPE == "phone":
|
||||
await self.login_by_mobile()
|
||||
elif config.LOGIN_TYPE == "cookie":
|
||||
await self.login_by_cookies()
|
||||
else:
|
||||
raise ValueError("[ZhiHu.begin]I nvalid Login Type Currently only supported qrcode or phone or cookies ...")
|
||||
|
||||
async def login_by_mobile(self):
|
||||
"""Login zhihu by mobile"""
|
||||
# todo implement login by mobile
|
||||
|
||||
async def login_by_qrcode(self):
|
||||
"""login zhihu website and keep webdriver login state"""
|
||||
utils.logger.info("[ZhiHu.login_by_qrcode] Begin login zhihu by qrcode ...")
|
||||
qrcode_img_selector = "canvas.Qrcode-qrcode"
|
||||
# find login qrcode
|
||||
base64_qrcode_img = await utils.find_qrcode_img_from_canvas(
|
||||
self.context_page,
|
||||
canvas_selector=qrcode_img_selector
|
||||
)
|
||||
if not base64_qrcode_img:
|
||||
utils.logger.info("[ZhiHu.login_by_qrcode] login failed , have not found qrcode please check ....")
|
||||
if not base64_qrcode_img:
|
||||
sys.exit()
|
||||
|
||||
# show login qrcode
|
||||
# fix issue #12
|
||||
# we need to use partial function to call show_qrcode function and run in executor
|
||||
# then current asyncio event loop will not be blocked
|
||||
partial_show_qrcode = functools.partial(utils.show_qrcode, base64_qrcode_img)
|
||||
asyncio.get_running_loop().run_in_executor(executor=None, func=partial_show_qrcode)
|
||||
|
||||
utils.logger.info(f"[ZhiHu.login_by_qrcode] waiting for scan code login, remaining time is 120s")
|
||||
try:
|
||||
await self.check_login_state()
|
||||
|
||||
except RetryError:
|
||||
utils.logger.info("[ZhiHu.login_by_qrcode] Login zhihu failed by qrcode login method ...")
|
||||
sys.exit()
|
||||
|
||||
wait_redirect_seconds = 5
|
||||
utils.logger.info(
|
||||
f"[ZhiHu.login_by_qrcode] Login successful then wait for {wait_redirect_seconds} seconds redirect ...")
|
||||
await asyncio.sleep(wait_redirect_seconds)
|
||||
|
||||
async def login_by_cookies(self):
|
||||
"""login zhihu website by cookies"""
|
||||
utils.logger.info("[ZhiHu.login_by_cookies] Begin login zhihu by cookie ...")
|
||||
for key, value in utils.convert_str_cookie_to_dict(self.cookie_str).items():
|
||||
await self.browser_context.add_cookies([{
|
||||
'name': key,
|
||||
'value': value,
|
||||
'domain': ".zhihu.com",
|
||||
'path': "/"
|
||||
}])
|
||||
@@ -0,0 +1,12 @@
|
||||
# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则:
|
||||
# 1. 不得用于任何商业用途。
|
||||
# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。
|
||||
# 3. 不得进行大规模爬取或对平台造成运营干扰。
|
||||
# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。
|
||||
# 5. 不得用于任何非法或不当的用途。
|
||||
#
|
||||
# 详细许可条款请参阅项目根目录下的LICENSE文件。
|
||||
# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。
|
||||
|
||||
|
||||
# -*- coding: utf-8 -*-
|
||||
@@ -0,0 +1,71 @@
|
||||
# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则:
|
||||
# 1. 不得用于任何商业用途。
|
||||
# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。
|
||||
# 3. 不得进行大规模爬取或对平台造成运营干扰。
|
||||
# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。
|
||||
# 5. 不得用于任何非法或不当的用途。
|
||||
#
|
||||
# 详细许可条款请参阅项目根目录下的LICENSE文件。
|
||||
# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。
|
||||
|
||||
|
||||
# -*- coding: utf-8 -*-
|
||||
from typing import Optional
|
||||
|
||||
from pydantic import BaseModel, Field
|
||||
|
||||
|
||||
class TiebaNote(BaseModel):
|
||||
"""
|
||||
百度贴吧帖子
|
||||
"""
|
||||
note_id: str = Field(..., description="帖子ID")
|
||||
title: str = Field(..., description="帖子标题")
|
||||
desc: str = Field(default="", description="帖子描述")
|
||||
note_url: str = Field(..., description="帖子链接")
|
||||
publish_time: str = Field(default="", description="发布时间")
|
||||
user_link: str = Field(default="", description="用户主页链接")
|
||||
user_nickname: str = Field(default="", description="用户昵称")
|
||||
user_avatar: str = Field(default="", description="用户头像地址")
|
||||
tieba_name: str = Field(..., description="贴吧名称")
|
||||
tieba_link: str = Field(..., description="贴吧链接")
|
||||
total_replay_num: int = Field(default=0, description="回复总数")
|
||||
total_replay_page: int = Field(default=0, description="回复总页数")
|
||||
ip_location: Optional[str] = Field(default="", description="IP地理位置")
|
||||
source_keyword: str = Field(default="", description="来源关键词")
|
||||
|
||||
|
||||
class TiebaComment(BaseModel):
|
||||
"""
|
||||
百度贴吧评论
|
||||
"""
|
||||
|
||||
comment_id: str = Field(..., description="评论ID")
|
||||
parent_comment_id: str = Field(default="", description="父评论ID")
|
||||
content: str = Field(..., description="评论内容")
|
||||
user_link: str = Field(default="", description="用户主页链接")
|
||||
user_nickname: str = Field(default="", description="用户昵称")
|
||||
user_avatar: str = Field(default="", description="用户头像地址")
|
||||
publish_time: str = Field(default="", description="发布时间")
|
||||
ip_location: Optional[str] = Field(default="", description="IP地理位置")
|
||||
sub_comment_count: int = Field(default=0, description="子评论数")
|
||||
note_id: str = Field(..., description="帖子ID")
|
||||
note_url: str = Field(..., description="帖子链接")
|
||||
tieba_id: str = Field(..., description="所属的贴吧ID")
|
||||
tieba_name: str = Field(..., description="所属的贴吧名称")
|
||||
tieba_link: str = Field(..., description="贴吧链接")
|
||||
|
||||
|
||||
class TiebaCreator(BaseModel):
|
||||
"""
|
||||
百度贴吧创作者
|
||||
"""
|
||||
user_id: str = Field(..., description="用户ID")
|
||||
user_name: str = Field(..., description="用户名")
|
||||
nickname: str = Field(..., description="用户昵称")
|
||||
gender: str = Field(default="", description="用户性别")
|
||||
avatar: str = Field(..., description="用户头像地址")
|
||||
ip_location: Optional[str] = Field(default="", description="IP地理位置")
|
||||
follows: int = Field(default=0, description="关注数")
|
||||
fans: int = Field(default=0, description="粉丝数")
|
||||
registration_duration: str = Field(default="", description="注册时长")
|
||||
@@ -0,0 +1,12 @@
|
||||
# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则:
|
||||
# 1. 不得用于任何商业用途。
|
||||
# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。
|
||||
# 3. 不得进行大规模爬取或对平台造成运营干扰。
|
||||
# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。
|
||||
# 5. 不得用于任何非法或不当的用途。
|
||||
#
|
||||
# 详细许可条款请参阅项目根目录下的LICENSE文件。
|
||||
# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。
|
||||
|
||||
|
||||
# -*- coding: utf-8 -*-
|
||||
@@ -0,0 +1,12 @@
|
||||
# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则:
|
||||
# 1. 不得用于任何商业用途。
|
||||
# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。
|
||||
# 3. 不得进行大规模爬取或对平台造成运营干扰。
|
||||
# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。
|
||||
# 5. 不得用于任何非法或不当的用途。
|
||||
#
|
||||
# 详细许可条款请参阅项目根目录下的LICENSE文件。
|
||||
# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。
|
||||
|
||||
|
||||
# -*- coding: utf-8 -*-
|
||||
@@ -0,0 +1,12 @@
|
||||
# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则:
|
||||
# 1. 不得用于任何商业用途。
|
||||
# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。
|
||||
# 3. 不得进行大规模爬取或对平台造成运营干扰。
|
||||
# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。
|
||||
# 5. 不得用于任何非法或不当的用途。
|
||||
#
|
||||
# 详细许可条款请参阅项目根目录下的LICENSE文件。
|
||||
# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。
|
||||
|
||||
|
||||
# -*- coding: utf-8 -*-
|
||||
@@ -0,0 +1,21 @@
|
||||
# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则:
|
||||
# 1. 不得用于任何商业用途。
|
||||
# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。
|
||||
# 3. 不得进行大规模爬取或对平台造成运营干扰。
|
||||
# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。
|
||||
# 5. 不得用于任何非法或不当的用途。
|
||||
#
|
||||
# 详细许可条款请参阅项目根目录下的LICENSE文件。
|
||||
# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。
|
||||
|
||||
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
|
||||
from pydantic import BaseModel, Field
|
||||
|
||||
|
||||
class NoteUrlInfo(BaseModel):
|
||||
note_id: str = Field(title="note id")
|
||||
xsec_token: str = Field(title="xsec token")
|
||||
xsec_source: str = Field(title="xsec source")
|
||||
@@ -0,0 +1,83 @@
|
||||
# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则:
|
||||
# 1. 不得用于任何商业用途。
|
||||
# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。
|
||||
# 3. 不得进行大规模爬取或对平台造成运营干扰。
|
||||
# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。
|
||||
# 5. 不得用于任何非法或不当的用途。
|
||||
#
|
||||
# 详细许可条款请参阅项目根目录下的LICENSE文件。
|
||||
# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。
|
||||
|
||||
|
||||
# -*- coding: utf-8 -*-
|
||||
from typing import Optional
|
||||
|
||||
from pydantic import BaseModel, Field
|
||||
|
||||
|
||||
class ZhihuContent(BaseModel):
|
||||
"""
|
||||
知乎内容(回答、文章、视频)
|
||||
"""
|
||||
content_id: str = Field(default="", description="内容ID")
|
||||
content_type: str = Field(default="", description="内容类型(article | answer | zvideo)")
|
||||
content_text: str = Field(default="", description="内容文本, 如果是视频类型这里为空")
|
||||
content_url: str = Field(default="", description="内容落地链接")
|
||||
question_id: str = Field(default="", description="问题ID, type为answer时有值")
|
||||
title: str = Field(default="", description="内容标题")
|
||||
desc: str = Field(default="", description="内容描述")
|
||||
created_time: int = Field(default=0, description="创建时间")
|
||||
updated_time: int = Field(default=0, description="更新时间")
|
||||
voteup_count: int = Field(default=0, description="赞同人数")
|
||||
comment_count: int = Field(default=0, description="评论数量")
|
||||
source_keyword: str = Field(default="", description="来源关键词")
|
||||
|
||||
user_id: str = Field(default="", description="用户ID")
|
||||
user_link: str = Field(default="", description="用户主页链接")
|
||||
user_nickname: str = Field(default="", description="用户昵称")
|
||||
user_avatar: str = Field(default="", description="用户头像地址")
|
||||
user_url_token: str = Field(default="", description="用户url_token")
|
||||
|
||||
|
||||
class ZhihuComment(BaseModel):
|
||||
"""
|
||||
知乎评论
|
||||
"""
|
||||
|
||||
comment_id: str = Field(default="", description="评论ID")
|
||||
parent_comment_id: str = Field(default="", description="父评论ID")
|
||||
content: str = Field(default="", description="评论内容")
|
||||
publish_time: int = Field(default=0, description="发布时间")
|
||||
ip_location: Optional[str] = Field(default="", description="IP地理位置")
|
||||
sub_comment_count: int = Field(default=0, description="子评论数")
|
||||
like_count: int = Field(default=0, description="点赞数")
|
||||
dislike_count: int = Field(default=0, description="踩数")
|
||||
content_id: str = Field(default="", description="内容ID")
|
||||
content_type: str = Field(default="", description="内容类型(article | answer | zvideo)")
|
||||
|
||||
user_id: str = Field(default="", description="用户ID")
|
||||
user_link: str = Field(default="", description="用户主页链接")
|
||||
user_nickname: str = Field(default="", description="用户昵称")
|
||||
user_avatar: str = Field(default="", description="用户头像地址")
|
||||
|
||||
|
||||
class ZhihuCreator(BaseModel):
|
||||
"""
|
||||
知乎创作者
|
||||
"""
|
||||
user_id: str = Field(default="", description="用户ID")
|
||||
user_link: str = Field(default="", description="用户主页链接")
|
||||
user_nickname: str = Field(default="", description="用户昵称")
|
||||
user_avatar: str = Field(default="", description="用户头像地址")
|
||||
url_token: str = Field(default="", description="用户url_token")
|
||||
gender: str = Field(default="", description="用户性别")
|
||||
ip_location: Optional[str] = Field(default="", description="IP地理位置")
|
||||
follows: int = Field(default=0, description="关注数")
|
||||
fans: int = Field(default=0, description="粉丝数")
|
||||
anwser_count: int = Field(default=0, description="回答数")
|
||||
video_count: int = Field(default=0, description="视频数")
|
||||
question_count: int = Field(default=0, description="提问数")
|
||||
article_count: int = Field(default=0, description="文章数")
|
||||
column_count: int = Field(default=0, description="专栏数")
|
||||
get_voteup_count: int = Field(default=0, description="获得的赞同数")
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user