diff --git a/.env.example b/.env.example new file mode 100644 index 0000000..6046351 --- /dev/null +++ b/.env.example @@ -0,0 +1,67 @@ +# ====================== 数据库配置 ====================== +# 数据库主机,例如localhost 或 127.0.0.1 +DB_HOST=your_db_host +# 数据库端口号,默认为3306 +DB_PORT=3306 +# 数据库用户名 +DB_USER=your_db_user +# 数据库密码 +DB_PASSWORD=your_db_password +# 数据库名称 +DB_NAME=your_db_name +# 数据库字符集,推荐utf8mb4,兼容emoji +DB_CHARSET=utf8mb4 +# 数据库类型mysql或postgresql +DB_DIALECT=postgresql + +# ======================= LLM 相关 ======================= +# Insight Agent(推荐Kimi,https://platform.moonshot.cn/)API密钥,用于主LLM +INSIGHT_ENGINE_API_KEY= +# Insight Agent LLM接口BaseUrl,可自定义厂商API +INSIGHT_ENGINE_BASE_URL= +# Insight Agent LLM模型名称,如kimi-k2-0711-preview +INSIGHT_ENGINE_MODEL_NAME= +# Media Agent(推荐Gemini,可用中转厂商 https://www.chataiapi.com/)API密钥 +MEDIA_ENGINE_API_KEY= +# Media Agent LLM接口BaseUrl +MEDIA_ENGINE_BASE_URL= +# Media Agent LLM模型名称,如gemini-2.5-pro +MEDIA_ENGINE_MODEL_NAME= + +# MindSpider API密钥(推荐Deepseek) +MINDSPIDER_API_KEY= +# MindSpider LLM接口BaseUrl +MINDSPIDER_BASE_URL= +# MindSpider LLM模型名称,如deepseek-chat +MINDSPIDER_MODEL_NAME= + +# Query Agent(推荐DeepSeek,https://www.deepseek.com/)API密钥 +QUERY_ENGINE_API_KEY= +# Query Agent LLM接口BaseUrl +QUERY_ENGINE_BASE_URL= +# Query Agent LLM模型,如deepseek-reasoner +QUERY_ENGINE_MODEL_NAME= +# Report Agent(推荐Gemini,可用中转厂商 https://www.chataiapi.com/)API密钥 +REPORT_ENGINE_API_KEY= +# Report Agent LLM接口BaseUrl +REPORT_ENGINE_BASE_URL= +# Report Agent LLM模型,如gemini-2.5-pro +REPORT_ENGINE_MODEL_NAME= +# Forum Host LLM API密钥,Qwen3最新模型,推荐 https://cloud.siliconflow.cn/ +FORUM_HOST_API_KEY= +# Forum Host LLM BaseUrl +FORUM_HOST_BASE_URL= +# Forum Host LLM模型名,如Qwen/Qwen3-235B-A22B-Instruct-2507 +FORUM_HOST_MODEL_NAME= +# SQL Keyword Optimizer LLM密钥,小参数Qwen3模型 https://cloud.siliconflow.cn/ +KEYWORD_OPTIMIZER_API_KEY= +# Keyword Optimizer BaseUrl +KEYWORD_OPTIMIZER_BASE_URL= +# Keyword Optimizer LLM模型名称,如deepseek-chat +KEYWORD_OPTIMIZER_MODEL_NAME= + 
+# ================== 网络工具配置 ==================== +# Tavily API密钥,用于Tavily网络搜索。注册地址:https://www.tavily.com/ +TAVILY_API_KEY= +# Bocha Web Search API密钥,用于Bocha搜索。注册地址:https://open.bochaai.com/ +BOCHA_WEB_SEARCH_API_KEY= \ No newline at end of file diff --git a/MindSpider/DeepSentimentCrawling/MediaCrawler/.github/ISSUE_TEMPLATE/bug_report.md b/MindSpider/DeepSentimentCrawling/MediaCrawler/.github/ISSUE_TEMPLATE/bug_report.md new file mode 100644 index 0000000..cee4270 --- /dev/null +++ b/MindSpider/DeepSentimentCrawling/MediaCrawler/.github/ISSUE_TEMPLATE/bug_report.md @@ -0,0 +1,39 @@ +--- +name: MediaCrawler Bug反馈 +about: 创建一个问题Bug以帮助MediaCrawler开源项目改进 +title: '[BUG] ' +labels: bug +assignees: '' +--- + +## 🔍 问题检查清单 + + +- [ ] 我已经仔细阅读了项目使用过程中的[常见问题汇总](https://nanmicoder.github.io/MediaCrawler/%E5%B8%B8%E8%A7%81%E9%97%AE%E9%A2%98.html) +- [ ] 我已经搜索并查看了[已关闭的issues](https://github.com/NanmiCoder/MediaCrawler/issues?q=is%3Aissue+is%3Aclosed) +- [ ] 我确认这不是由于滑块验证码、Cookie过期、Cookie提取错误、平台风控等常见原因导致的问题 + +## 🐛 问题描述 + + + +## 📝 复现步骤 +1. +2. +3. 
+ +## 💻 运行环境 +- 操作系统: +- Python版本: +- 是否使用IP代理: +- 是否使用VPN翻墙软件: +- 目标平台(抖音/小红书/微博等): + +## 📋 错误日志 + +```shell +在此粘贴错误日志 +``` + +## 📷 错误截图 + diff --git a/MindSpider/DeepSentimentCrawling/MediaCrawler/.github/ISSUE_TEMPLATE/quesiton.md b/MindSpider/DeepSentimentCrawling/MediaCrawler/.github/ISSUE_TEMPLATE/quesiton.md new file mode 100644 index 0000000..649c263 --- /dev/null +++ b/MindSpider/DeepSentimentCrawling/MediaCrawler/.github/ISSUE_TEMPLATE/quesiton.md @@ -0,0 +1,36 @@ +--- +name: MediaCrawler使用问题咨询 +about: 提交使用过程中遇到的问题 +title: '[问题] ' +labels: question +assignees: '' +--- + +## ⚠️ 提交前确认 + +- [ ] 我已经仔细阅读了项目使用过程中的[常见问题汇总](https://nanmicoder.github.io/MediaCrawler/%E5%B8%B8%E8%A7%81%E9%97%AE%E9%A2%98.html) +- [ ] 我已经搜索并查看了[已关闭的issues](https://github.com/NanmiCoder/MediaCrawler/issues?q=is%3Aissue+is%3Aclosed) +- [ ] 我确认这不是由于滑块验证码、Cookie过期、Cookie提取错误、平台风控等常见原因导致的问题 + +## ❓ 问题描述 + + +## 🔍 使用场景 + +- 目标平台: (如:小红书/抖音/微博等) +- 使用功能: (如:关键词搜索/用户主页爬取等) + +## 💻 环境信息 +- 操作系统: +- Python版本: +- 是否使用IP代理: +- 是否使用VPN翻墙软件: +- 目标平台(抖音/小红书/微博等): + +## 📋 错误日志 +```shell +在此粘贴完整的错误日志 +``` + +## 📷 错误截图 + diff --git a/MindSpider/DeepSentimentCrawling/MediaCrawler/.github/workflows/deploy.yml b/MindSpider/DeepSentimentCrawling/MediaCrawler/.github/workflows/deploy.yml new file mode 100644 index 0000000..eece8af --- /dev/null +++ b/MindSpider/DeepSentimentCrawling/MediaCrawler/.github/workflows/deploy.yml @@ -0,0 +1,64 @@ +# 构建 VitePress 站点并将其部署到 GitHub Pages 的示例工作流程 +# +name: Deploy VitePress site to Pages + +on: + # 在针对 `main` 分支的推送上运行。如果你 + # 使用 `master` 分支作为默认分支,请将其更改为 `master` + push: + branches: [main] + + # 允许你从 Actions 选项卡手动运行此工作流程 + workflow_dispatch: + +# 设置 GITHUB_TOKEN 的权限,以允许部署到 GitHub Pages +permissions: + contents: read + pages: write + id-token: write + +# 只允许同时进行一次部署,跳过正在运行和最新队列之间的运行队列 +# 但是,不要取消正在进行的运行,因为我们希望允许这些生产部署完成 +concurrency: + group: pages + cancel-in-progress: false + +jobs: + # 构建工作 + build: + runs-on: ubuntu-latest + steps: + - name: Checkout + uses: 
actions/checkout@v4 + with: + fetch-depth: 0 # 如果未启用 lastUpdated,则不需要 + # - uses: pnpm/action-setup@v3 # 如果使用 pnpm,请取消注释 + # - uses: oven-sh/setup-bun@v1 # 如果使用 Bun,请取消注释 + - name: Setup Node + uses: actions/setup-node@v4 + with: + node-version: 20 + cache: npm # 或 pnpm / yarn + - name: Setup Pages + uses: actions/configure-pages@v4 + - name: Install dependencies + run: npm ci # 或 pnpm install / yarn install / bun install + - name: Build with VitePress + run: npm run docs:build # 或 pnpm docs:build / yarn docs:build / bun run docs:build + - name: Upload artifact + uses: actions/upload-pages-artifact@v3 + with: + path: docs/.vitepress/dist + + # 部署工作 + deploy: + environment: + name: github-pages + url: ${{ steps.deployment.outputs.page_url }} + needs: build + runs-on: ubuntu-latest + name: Deploy + steps: + - name: Deploy to GitHub Pages + id: deployment + uses: actions/deploy-pages@v4 \ No newline at end of file diff --git a/MindSpider/DeepSentimentCrawling/MediaCrawler/.gitignore b/MindSpider/DeepSentimentCrawling/MediaCrawler/.gitignore index c9a9ac8..dedbdfa 100644 --- a/MindSpider/DeepSentimentCrawling/MediaCrawler/.gitignore +++ b/MindSpider/DeepSentimentCrawling/MediaCrawler/.gitignore @@ -173,4 +173,9 @@ docs/.vitepress/cache # other gitignore .venv -.refer \ No newline at end of file +.refer + +agent_zone +debug_tools + +database/*.db \ No newline at end of file diff --git a/MindSpider/DeepSentimentCrawling/MediaCrawler/.python-version b/MindSpider/DeepSentimentCrawling/MediaCrawler/.python-version index bd28b9c..2c07333 100644 --- a/MindSpider/DeepSentimentCrawling/MediaCrawler/.python-version +++ b/MindSpider/DeepSentimentCrawling/MediaCrawler/.python-version @@ -1 +1 @@ -3.9 +3.11 diff --git a/MindSpider/DeepSentimentCrawling/MediaCrawler/LICENSE b/MindSpider/DeepSentimentCrawling/MediaCrawler/LICENSE new file mode 100644 index 0000000..78408f6 --- /dev/null +++ b/MindSpider/DeepSentimentCrawling/MediaCrawler/LICENSE @@ -0,0 +1,58 @@ +NON-COMMERCIAL 
LEARNING LICENSE 1.1 + +Copyright (c) [2024] [relakkes@gmail.com] + +WHEREAS: +1. The copyright owner owns and controls the copyright of this software and related documentation files (hereinafter referred to as the "Software"); +2. The user wishes to use the Software for learning purposes; +3. The copyright owner is willing to authorize the user to use the Software under the conditions stated in this license; + +NOW, THEREFORE, the parties, in compliance with relevant laws and regulations, agree to the following terms: + +SCOPE OF AUTHORIZATION: +1. The copyright owner hereby grants any natural person or legal entity (hereinafter referred to as the "User") accepting this license a free, non-exclusive, non-transferable right to use, copy, modify, and merge the Software for non-commercial learning purposes, subject to the following conditions. + +CONDITIONS: +1. The User must include the above copyright notice and this license statement in all reasonably prominent locations of the Software and its copies. +2. The Software is limited to learning and research purposes only, and may not be used for large-scale crawling or activities that disrupt platform operations. +3. Without the written consent of the copyright owner, the Software may not be used for any commercial purposes or to cause improper influence on third parties. + +DISCLAIMER: +1. The Software is provided "AS IS," without warranty of any kind, express or implied, including but not limited to the warranties of merchantability, fitness for a particular purpose, and non-infringement. +2. 
In no event shall the copyright owner be liable for any direct, indirect, incidental, special, exemplary, or consequential damages (including, but not limited to, procurement of substitute goods or services; loss of use, data, or profits; or business interruption) however caused and on any theory of liability, whether in contract, strict liability, or tort (including negligence or otherwise) arising in any way out of the use of this Software, even if advised of the possibility of such damage. + +APPLICABLE LAW: +1. The interpretation and enforcement of this license shall comply with local laws and regulations. +2. Any disputes arising from or related to this license shall be resolved through friendly negotiation between the parties; if negotiation fails, either party may submit the dispute to the people's court where the copyright owner is located for resolution. + +This license constitutes the entire agreement between the parties regarding the Software, superseding and merging all prior discussions, communications, and agreements, whether oral or written. + + +非商业学习使用许可证 1.1 + +版权所有 (c) [2024] [relakkes@gmail.com] + +鉴于: +1. 版权所有者拥有和控制本软件和相关文档文件(以下简称“软件”)的版权; +2. 使用者希望使用该软件进行学习; +3. 版权所有者愿意在本许可证所述的条件下授权使用者使用该软件; + +现因此,双方遵循相关法律法规,同意如下条款: + +授权范围: +1. 版权所有者特此免费授予接受本许可证的任何自然人或法人(以下简称“使用者”)非独占的、不可转让的权利,在非商业学习目的下使用、复制、修改、合并本软件,前提是遵守以下条件。 + +条件: +1. 使用者必须在软件及其副本的所有合理显著位置包含上述版权声明和本许可证声明。 +2. 本软件仅限用于学习和研究目的,不得用于大规模爬虫或对平台造成运营干扰的行为。 +3. 未经版权所有者书面同意,不得将本软件用于任何商业用途或对第三方造成不当影响。 + +免责声明: +1. 本软件按“现状”提供,不提供任何形式的明示或暗示保证,包括但不限于对适销性、特定用途的适用性和非侵权的保证。 +2. 在任何情况下,版权所有者均不对因使用本软件而产生的,或在任何方式上与本软件有关的任何直接、间接、偶然、特殊、示例性或后果性损害负责(包括但不限于采购替代品或服务;使用、数据或利润的损失;或业务中断),无论这些损害是如何引起的,以及无论是通过合同、严格责任还是侵权行为(包括疏忽或其他方式)产生的,即使已被告知此类损害的可能性。 + +适用法律: +1. 本许可证的解释和执行应遵循当地法律法规。 +2. 
因本许可证引起的或与之相关的任何争议,双方应友好协商解决;协商不成时,任何一方可将争议提交至版权所有者所在地的人民法院诉讼解决。 + +本许可证构成双方之间关于本软件的完整协议,取代并合并以前的讨论、交流和协议,无论是口头还是书面的。 diff --git a/MindSpider/DeepSentimentCrawling/MediaCrawler/README.md b/MindSpider/DeepSentimentCrawling/MediaCrawler/README.md new file mode 100644 index 0000000..ef01d7c --- /dev/null +++ b/MindSpider/DeepSentimentCrawling/MediaCrawler/README.md @@ -0,0 +1,342 @@ +# 🔥 MediaCrawler - 自媒体平台爬虫 🕷️ + +
+
+
+### [Warp is built for coding with multiple AI agents](https://go.warp.dev/MediaCrawler)
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+### [Warp is built for coding with multiple AI agents](https://go.warp.dev/MediaCrawler)
+
+
+
+
+
diff --git a/MindSpider/DeepSentimentCrawling/MediaCrawler/README_es.md b/MindSpider/DeepSentimentCrawling/MediaCrawler/README_es.md
new file mode 100644
index 0000000..61e7783
--- /dev/null
+++ b/MindSpider/DeepSentimentCrawling/MediaCrawler/README_es.md
@@ -0,0 +1,327 @@
+
+
+
+### [Warp is built for coding with multiple AI agents](https://go.warp.dev/MediaCrawler)
+
+
+
+
+
diff --git a/MindSpider/DeepSentimentCrawling/MediaCrawler/async_db.py b/MindSpider/DeepSentimentCrawling/MediaCrawler/async_db.py
deleted file mode 100644
index 33859fa..0000000
--- a/MindSpider/DeepSentimentCrawling/MediaCrawler/async_db.py
+++ /dev/null
@@ -1,107 +0,0 @@
-# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则:
-# 1. 不得用于任何商业用途。
-# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。
-# 3. 不得进行大规模爬取或对平台造成运营干扰。
-# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。
-# 5. 不得用于任何非法或不当的用途。
-#
-# 详细许可条款请参阅项目根目录下的LICENSE文件。
-# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。
-
-
-# -*- coding: utf-8 -*-
-# @Author : relakkes@gmail.com
-# @Time : 2024/4/6 14:21
-# @Desc : 异步Aiomysql的增删改查封装
-from typing import Any, Dict, List, Union
-
-import aiomysql
-
-
-class AsyncMysqlDB:
- def __init__(self, pool: aiomysql.Pool) -> None:
- self.__pool = pool
-
- async def query(self, sql: str, *args: Union[str, int]) -> List[Dict[str, Any]]:
- """
- 从给定的 SQL 中查询记录,返回的是一个列表
- :param sql: 查询的sql
- :param args: sql中传递动态参数列表
- :return:
- """
- async with self.__pool.acquire() as conn:
- async with conn.cursor(aiomysql.DictCursor) as cur:
- await cur.execute(sql, args)
- data = await cur.fetchall()
- return data or []
-
- async def get_first(self, sql: str, *args: Union[str, int]) -> Union[Dict[str, Any], None]:
- """
- 从给定的 SQL 中查询记录,返回的是符合条件的第一个结果
- :param sql: 查询的sql
- :param args:sql中传递动态参数列表
- :return:
- """
- async with self.__pool.acquire() as conn:
- async with conn.cursor(aiomysql.DictCursor) as cur:
- await cur.execute(sql, args)
- data = await cur.fetchone()
- return data
-
- async def item_to_table(self, table_name: str, item: Dict[str, Any]) -> int:
- """
- 表中插入数据
- :param table_name: 表名
- :param item: 一条记录的字典信息
- :return:
- """
- fields = list(item.keys())
- values = list(item.values())
- fields = [f'`{field}`' for field in fields]
- fieldstr = ','.join(fields)
- valstr = ','.join(['%s'] * len(item))
- sql = "INSERT INTO %s (%s) VALUES(%s)" % (table_name, fieldstr, valstr)
- async with self.__pool.acquire() as conn:
- async with conn.cursor(aiomysql.DictCursor) as cur:
- await cur.execute(sql, values)
- lastrowid = cur.lastrowid
- return lastrowid
-
- async def update_table(self, table_name: str, updates: Dict[str, Any], field_where: str,
- value_where: Union[str, int, float]) -> int:
- """
- 更新指定表的记录
- :param table_name: 表名
- :param updates: 需要更新的字段和值的 key - value 映射
- :param field_where: update 语句 where 条件中的字段名
- :param value_where: update 语句 where 条件中的字段值
- :return:
- """
- upsets = []
- values = []
- for k, v in updates.items():
- s = '`%s`=%%s' % k
- upsets.append(s)
- values.append(v)
- upsets = ','.join(upsets)
- sql = 'UPDATE %s SET %s WHERE %s="%s"' % (
- table_name,
- upsets,
- field_where, value_where,
- )
- async with self.__pool.acquire() as conn:
- async with conn.cursor() as cur:
- rows = await cur.execute(sql, values)
- return rows
-
- async def execute(self, sql: str, *args: Union[str, int]) -> int:
- """
- 需要更新、写入等操作的 excute 执行语句
- :param sql:
- :param args:
- :return:
- """
- async with self.__pool.acquire() as conn:
- async with conn.cursor() as cur:
- rows = await cur.execute(sql, args)
- return rows
diff --git a/MindSpider/DeepSentimentCrawling/MediaCrawler/async_sqlite_db.py b/MindSpider/DeepSentimentCrawling/MediaCrawler/async_sqlite_db.py
deleted file mode 100644
index d9409bd..0000000
--- a/MindSpider/DeepSentimentCrawling/MediaCrawler/async_sqlite_db.py
+++ /dev/null
@@ -1,111 +0,0 @@
-# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则:
-# 1. 不得用于任何商业用途。
-# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。
-# 3. 不得进行大规模爬取或对平台造成运营干扰。
-# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。
-# 5. 不得用于任何非法或不当的用途。
-#
-# 详细许可条款请参阅项目根目录下的LICENSE文件。
-# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。
-
-
-# -*- coding: utf-8 -*-
-# @Author : relakkes@gmail.com
-# @Time : 2024/4/6 14:21
-# @Desc : 异步SQLite的增删改查封装
-from typing import Any, Dict, List, Union
-
-import aiosqlite
-
-
-class AsyncSqliteDB:
- def __init__(self, db_path: str) -> None:
- self.__db_path = db_path
-
- async def query(self, sql: str, *args: Union[str, int]) -> List[Dict[str, Any]]:
- """
- 从给定的 SQL 中查询记录,返回的是一个列表
- :param sql: 查询的sql
- :param args: sql中传递动态参数列表
- :return:
- """
- async with aiosqlite.connect(self.__db_path) as conn:
- conn.row_factory = aiosqlite.Row
- async with conn.execute(sql, args) as cursor:
- rows = await cursor.fetchall()
- return [dict(row) for row in rows] if rows else []
-
- async def get_first(self, sql: str, *args: Union[str, int]) -> Union[Dict[str, Any], None]:
- """
- 从给定的 SQL 中查询记录,返回的是符合条件的第一个结果
- :param sql: 查询的sql
- :param args:sql中传递动态参数列表
- :return:
- """
- async with aiosqlite.connect(self.__db_path) as conn:
- conn.row_factory = aiosqlite.Row
- async with conn.execute(sql, args) as cursor:
- row = await cursor.fetchone()
- return dict(row) if row else None
-
- async def item_to_table(self, table_name: str, item: Dict[str, Any]) -> int:
- """
- 表中插入数据
- :param table_name: 表名
- :param item: 一条记录的字典信息
- :return:
- """
- fields = list(item.keys())
- values = list(item.values())
- fieldstr = ','.join(fields)
- valstr = ','.join(['?'] * len(item))
- sql = f"INSERT INTO {table_name} ({fieldstr}) VALUES({valstr})"
- async with aiosqlite.connect(self.__db_path) as conn:
- async with conn.execute(sql, values) as cursor:
- await conn.commit()
- return cursor.lastrowid
-
- async def update_table(self, table_name: str, updates: Dict[str, Any], field_where: str,
- value_where: Union[str, int, float]) -> int:
- """
- 更新指定表的记录
- :param table_name: 表名
- :param updates: 需要更新的字段和值的 key - value 映射
- :param field_where: update 语句 where 条件中的字段名
- :param value_where: update 语句 where 条件中的字段值
- :return:
- """
- upsets = []
- values = []
- for k, v in updates.items():
- upsets.append(f'{k}=?')
- values.append(v)
- upsets_str = ','.join(upsets)
- values.append(value_where)
- sql = f'UPDATE {table_name} SET {upsets_str} WHERE {field_where}=?'
- async with aiosqlite.connect(self.__db_path) as conn:
- async with conn.execute(sql, values) as cursor:
- await conn.commit()
- return cursor.rowcount
-
- async def execute(self, sql: str, *args: Union[str, int]) -> int:
- """
- 需要更新、写入等操作的 excute 执行语句
- :param sql:
- :param args:
- :return:
- """
- async with aiosqlite.connect(self.__db_path) as conn:
- async with conn.execute(sql, args) as cursor:
- await conn.commit()
- return cursor.rowcount
-
- async def executescript(self, sql_script: str) -> None:
- """
- 执行SQL脚本,用于初始化数据库表结构
- :param sql_script: SQL脚本内容
- :return:
- """
- async with aiosqlite.connect(self.__db_path) as conn:
- await conn.executescript(sql_script)
- await conn.commit()
\ No newline at end of file
diff --git a/MindSpider/DeepSentimentCrawling/MediaCrawler/cmd_arg/arg.py b/MindSpider/DeepSentimentCrawling/MediaCrawler/cmd_arg/arg.py
index 12643ee..0fa375b 100644
--- a/MindSpider/DeepSentimentCrawling/MediaCrawler/cmd_arg/arg.py
+++ b/MindSpider/DeepSentimentCrawling/MediaCrawler/cmd_arg/arg.py
@@ -1,55 +1,259 @@
-# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则:
-# 1. 不得用于任何商业用途。
-# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。
-# 3. 不得进行大规模爬取或对平台造成运营干扰。
-# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。
+# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则:
+# 1. 不得用于任何商业用途。
+# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。
+# 3. 不得进行大规模爬取或对平台造成运营干扰。
+# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。
# 5. 不得用于任何非法或不当的用途。
-#
-# 详细许可条款请参阅项目根目录下的LICENSE文件。
-# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。
+#
+# 详细许可条款请参阅项目根目录下的LICENSE文件。
+# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。
-import argparse
+from __future__ import annotations
+
+
+import sys
+from enum import Enum
+from types import SimpleNamespace
+from typing import Iterable, Optional, Sequence, Type, TypeVar
+
+import typer
+from typing_extensions import Annotated
import config
from tools.utils import str2bool
-async def parse_cmd():
- # 读取command arg
- parser = argparse.ArgumentParser(description='Media crawler program. / 媒体爬虫程序')
- parser.add_argument('--platform', type=str,
- help='Media platform select / 选择媒体平台 (xhs=小红书 | dy=抖音 | ks=快手 | bili=哔哩哔哩 | wb=微博 | tieba=百度贴吧 | zhihu=知乎)',
- choices=["xhs", "dy", "ks", "bili", "wb", "tieba", "zhihu"], default=config.PLATFORM)
- parser.add_argument('--lt', type=str,
- help='Login type / 登录方式 (qrcode=二维码 | phone=手机号 | cookie=Cookie)',
- choices=["qrcode", "phone", "cookie"], default=config.LOGIN_TYPE)
- parser.add_argument('--type', type=str,
- help='Crawler type / 爬取类型 (search=搜索 | detail=详情 | creator=创作者)',
- choices=["search", "detail", "creator"], default=config.CRAWLER_TYPE)
- parser.add_argument('--start', type=int,
- help='Number of start page / 起始页码', default=config.START_PAGE)
- parser.add_argument('--keywords', type=str,
- help='Please input keywords / 请输入关键词', default=config.KEYWORDS)
- parser.add_argument('--get_comment', type=str2bool,
- help='''Whether to crawl level one comment / 是否爬取一级评论, supported values case insensitive / 支持的值(不区分大小写) ('yes', 'true', 't', 'y', '1', 'no', 'false', 'f', 'n', '0')''', default=config.ENABLE_GET_COMMENTS)
- parser.add_argument('--get_sub_comment', type=str2bool,
- help=''''Whether to crawl level two comment / 是否爬取二级评论, supported values case insensitive / 支持的值(不区分大小写) ('yes', 'true', 't', 'y', '1', 'no', 'false', 'f', 'n', '0')''', default=config.ENABLE_GET_SUB_COMMENTS)
- parser.add_argument('--save_data_option', type=str,
- help='Where to save the data / 数据保存方式 (csv=CSV文件 | db=MySQL数据库 | json=JSON文件 | sqlite=SQLite数据库)',
- choices=['csv', 'db', 'json', 'sqlite'], default=config.SAVE_DATA_OPTION)
- parser.add_argument('--cookies', type=str,
- help='Cookies used for cookie login type / Cookie登录方式使用的Cookie值', default=config.COOKIES)
+EnumT = TypeVar("EnumT", bound=Enum)
- args = parser.parse_args()
- # override config
- config.PLATFORM = args.platform
- config.LOGIN_TYPE = args.lt
- config.CRAWLER_TYPE = args.type
- config.START_PAGE = args.start
- config.KEYWORDS = args.keywords
- config.ENABLE_GET_COMMENTS = args.get_comment
- config.ENABLE_GET_SUB_COMMENTS = args.get_sub_comment
- config.SAVE_DATA_OPTION = args.save_data_option
- config.COOKIES = args.cookies
+class PlatformEnum(str, Enum):
+ """支持的媒体平台枚举"""
+
+ XHS = "xhs"
+ DOUYIN = "dy"
+ KUAISHOU = "ks"
+ BILIBILI = "bili"
+ WEIBO = "wb"
+ TIEBA = "tieba"
+ ZHIHU = "zhihu"
+
+
+class LoginTypeEnum(str, Enum):
+ """登录方式枚举"""
+
+ QRCODE = "qrcode"
+ PHONE = "phone"
+ COOKIE = "cookie"
+
+
+class CrawlerTypeEnum(str, Enum):
+ """爬虫类型枚举"""
+
+ SEARCH = "search"
+ DETAIL = "detail"
+ CREATOR = "creator"
+
+
+class SaveDataOptionEnum(str, Enum):
+ """数据保存方式枚举"""
+
+ CSV = "csv"
+ DB = "db"
+ JSON = "json"
+ SQLITE = "sqlite"
+ POSTGRESQL = "postgresql"
+
+
+class InitDbOptionEnum(str, Enum):
+ """数据库初始化选项"""
+
+ SQLITE = "sqlite"
+ MYSQL = "mysql"
+ POSTGRESQL = "postgresql"
+
+
+def _to_bool(value: bool | str) -> bool:
+ if isinstance(value, bool):
+ return value
+ return str2bool(value)
+
+
+def _coerce_enum(
+ enum_cls: Type[EnumT],
+ value: EnumT | str,
+ default: EnumT,
+) -> EnumT:
+ """Safely convert a raw config value to an enum member."""
+
+ if isinstance(value, enum_cls):
+ return value
+
+ try:
+ return enum_cls(value)
+ except ValueError:
+ typer.secho(
+ f"⚠️ 配置值 '{value}' 不在 {enum_cls.__name__} 支持的范围内,已回退到默认值 '{default.value}'.",
+ fg=typer.colors.YELLOW,
+ )
+ return default
+
+
+def _normalize_argv(argv: Optional[Sequence[str]]) -> Iterable[str]:
+ if argv is None:
+ return list(sys.argv[1:])
+ return list(argv)
+
+
+def _inject_init_db_default(args: Sequence[str]) -> list[str]:
+ """Ensure bare --init_db defaults to sqlite for backward compatibility."""
+
+ normalized: list[str] = []
+ i = 0
+ while i < len(args):
+ arg = args[i]
+ normalized.append(arg)
+
+ if arg == "--init_db":
+ next_arg = args[i + 1] if i + 1 < len(args) else None
+ if not next_arg or next_arg.startswith("-"):
+ normalized.append(InitDbOptionEnum.SQLITE.value)
+ i += 1
+
+ return normalized
+
+
+async def parse_cmd(argv: Optional[Sequence[str]] = None):
+ """使用 Typer 解析命令行参数。"""
+
+ app = typer.Typer(add_completion=False)
+
+ @app.callback(invoke_without_command=True)
+ def main(
+ platform: Annotated[
+ PlatformEnum,
+ typer.Option(
+ "--platform",
+ help="媒体平台选择 (xhs=小红书 | dy=抖音 | ks=快手 | bili=哔哩哔哩 | wb=微博 | tieba=百度贴吧 | zhihu=知乎)",
+ rich_help_panel="基础配置",
+ ),
+ ] = _coerce_enum(PlatformEnum, config.PLATFORM, PlatformEnum.XHS),
+ lt: Annotated[
+ LoginTypeEnum,
+ typer.Option(
+ "--lt",
+ help="登录方式 (qrcode=二维码 | phone=手机号 | cookie=Cookie)",
+ rich_help_panel="账号配置",
+ ),
+ ] = _coerce_enum(LoginTypeEnum, config.LOGIN_TYPE, LoginTypeEnum.QRCODE),
+ crawler_type: Annotated[
+ CrawlerTypeEnum,
+ typer.Option(
+ "--type",
+ help="爬取类型 (search=搜索 | detail=详情 | creator=创作者)",
+ rich_help_panel="基础配置",
+ ),
+ ] = _coerce_enum(CrawlerTypeEnum, config.CRAWLER_TYPE, CrawlerTypeEnum.SEARCH),
+ start: Annotated[
+ int,
+ typer.Option(
+ "--start",
+ help="起始页码",
+ rich_help_panel="基础配置",
+ ),
+ ] = config.START_PAGE,
+ keywords: Annotated[
+ str,
+ typer.Option(
+ "--keywords",
+ help="请输入关键词,多个关键词用逗号分隔",
+ rich_help_panel="基础配置",
+ ),
+ ] = config.KEYWORDS,
+ get_comment: Annotated[
+ str,
+ typer.Option(
+ "--get_comment",
+ help="是否爬取一级评论,支持 yes/true/t/y/1 或 no/false/f/n/0",
+ rich_help_panel="评论配置",
+ show_default=True,
+ ),
+ ] = str(config.ENABLE_GET_COMMENTS),
+ get_sub_comment: Annotated[
+ str,
+ typer.Option(
+ "--get_sub_comment",
+ help="是否爬取二级评论,支持 yes/true/t/y/1 或 no/false/f/n/0",
+ rich_help_panel="评论配置",
+ show_default=True,
+ ),
+ ] = str(config.ENABLE_GET_SUB_COMMENTS),
+ save_data_option: Annotated[
+ SaveDataOptionEnum,
+ typer.Option(
+ "--save_data_option",
+ help="数据保存方式 (csv=CSV文件 | db=MySQL数据库 | json=JSON文件 | sqlite=SQLite数据库 | postgresql=PostgreSQL数据库)",
+ rich_help_panel="存储配置",
+ ),
+ ] = _coerce_enum(
+ SaveDataOptionEnum, config.SAVE_DATA_OPTION, SaveDataOptionEnum.JSON
+ ),
+ init_db: Annotated[
+ Optional[InitDbOptionEnum],
+ typer.Option(
+ "--init_db",
+ help="初始化数据库表结构 (sqlite | mysql | postgresql)",
+ rich_help_panel="存储配置",
+ ),
+ ] = None,
+ cookies: Annotated[
+ str,
+ typer.Option(
+ "--cookies",
+ help="Cookie 登录方式使用的 Cookie 值",
+ rich_help_panel="账号配置",
+ ),
+ ] = config.COOKIES,
+ ) -> SimpleNamespace:
+ """MediaCrawler 命令行入口"""
+
+ enable_comment = _to_bool(get_comment)
+ enable_sub_comment = _to_bool(get_sub_comment)
+ init_db_value = init_db.value if init_db else None
+
+ # override global config
+ config.PLATFORM = platform.value
+ config.LOGIN_TYPE = lt.value
+ config.CRAWLER_TYPE = crawler_type.value
+ config.START_PAGE = start
+ config.KEYWORDS = keywords
+ config.ENABLE_GET_COMMENTS = enable_comment
+ config.ENABLE_GET_SUB_COMMENTS = enable_sub_comment
+ config.SAVE_DATA_OPTION = save_data_option.value
+ config.COOKIES = cookies
+
+ return SimpleNamespace(
+ platform=config.PLATFORM,
+ lt=config.LOGIN_TYPE,
+ type=config.CRAWLER_TYPE,
+ start=config.START_PAGE,
+ keywords=config.KEYWORDS,
+ get_comment=config.ENABLE_GET_COMMENTS,
+ get_sub_comment=config.ENABLE_GET_SUB_COMMENTS,
+ save_data_option=config.SAVE_DATA_OPTION,
+ init_db=init_db_value,
+ cookies=config.COOKIES,
+ )
+
+ command = typer.main.get_command(app)
+
+ cli_args = _normalize_argv(argv)
+ cli_args = _inject_init_db_default(cli_args)
+
+ try:
+ result = command.main(args=cli_args, standalone_mode=False)
+ if isinstance(result, int): # help/options handled by Typer; propagate exit code
+ raise SystemExit(result)
+ return result
+ except typer.Exit as exc: # pragma: no cover - CLI exit paths
+ raise SystemExit(exc.exit_code) from exc
diff --git a/MindSpider/DeepSentimentCrawling/MediaCrawler/config/__init__.py b/MindSpider/DeepSentimentCrawling/MediaCrawler/config/__init__.py
index eb3f161..1c1e97c 100644
--- a/MindSpider/DeepSentimentCrawling/MediaCrawler/config/__init__.py
+++ b/MindSpider/DeepSentimentCrawling/MediaCrawler/config/__init__.py
@@ -10,5 +10,4 @@
from .base_config import *
-from .db_config import *
-from .tieba_config import *
\ No newline at end of file
+from .db_config import *
\ No newline at end of file
diff --git a/MindSpider/DeepSentimentCrawling/MediaCrawler/config/base_config.py b/MindSpider/DeepSentimentCrawling/MediaCrawler/config/base_config.py
index 70665b4..dbea153 100644
--- a/MindSpider/DeepSentimentCrawling/MediaCrawler/config/base_config.py
+++ b/MindSpider/DeepSentimentCrawling/MediaCrawler/config/base_config.py
@@ -9,11 +9,12 @@
# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。
# 基础配置
-PLATFORM = "xhs" # 平台,xhs | dy | ks | bili | wb | tieba | zhihu
-KEYWORDS = "黑神话钟馗,九三阅兵,种地吧,董璇,非亲生,医美风险,游戏科学,阅兵准备,热巴,醉驾判无罪" # 关键词搜索配置,以英文逗号分隔
+PLATFORM = "bili" # 平台,xhs | dy | ks | bili | wb | tieba | zhihu
+KEYWORDS = "电影鬼灭之刃,亲属想侵吞3姐妹亡父赔偿款,网警斩断侵害未成年人网络黑色产业链,2007年后出生的人不能在马尔代夫吸烟,沈月,是公主也是自己的骑士,以军虐囚视频,唐朝诡事录,广州地铁回应APP乘车码频繁弹窗广告,全红婵的减肥计划精确到克" # 关键词搜索配置,以英文逗号分隔
LOGIN_TYPE = "qrcode" # qrcode or phone or cookie
COOKIES = ""
CRAWLER_TYPE = "search" # 爬取类型,search(关键词搜索) | detail(帖子详情)| creator(创作者主页数据)
+
# 是否开启 IP 代理
ENABLE_IP_PROXY = False
@@ -36,7 +37,7 @@ SAVE_LOGIN_STATE = True
# 是否启用CDP模式 - 使用用户现有的Chrome/Edge浏览器进行爬取,提供更好的反检测能力
# 启用后将自动检测并启动用户的Chrome/Edge浏览器,通过CDP协议进行控制
# 这种方式使用真实的浏览器环境,包括用户的扩展、Cookie和设置,大大降低被检测的风险
-ENABLE_CDP_MODE = False
+ENABLE_CDP_MODE = True
# CDP调试端口,用于与浏览器通信
# 如果端口被占用,系统会自动尝试下一个可用端口
@@ -59,8 +60,8 @@ BROWSER_LAUNCH_TIMEOUT = 30
# 设置为False可以保持浏览器运行,便于调试
AUTO_CLOSE_BROWSER = True
-# 数据保存类型选项配置,支持四种类型:csv、db、json、sqlite, 最好保存到DB,有排重的功能。
-SAVE_DATA_OPTION = "db" # csv or db or json or sqlite
+# 数据保存类型选项配置,支持五种类型:csv、db、json、sqlite、postgresql, 最好保存到DB,有排重的功能。
+SAVE_DATA_OPTION = "postgresql" # csv or db or json or sqlite or postgresql
# 用户浏览器缓存的浏览器文件配置
USER_DATA_DIR = "%s_user_data_dir" # %s will be replaced by platform name
@@ -69,7 +70,7 @@ USER_DATA_DIR = "%s_user_data_dir" # %s will be replaced by platform name
START_PAGE = 1
# 爬取视频/帖子的数量控制
-CRAWLER_MAX_NOTES_COUNT = 10
+CRAWLER_MAX_NOTES_COUNT = 5
# 并发爬虫数量控制
MAX_CONCURRENCY_NUM = 1
diff --git a/MindSpider/DeepSentimentCrawling/MediaCrawler/config/bilibili_config.py b/MindSpider/DeepSentimentCrawling/MediaCrawler/config/bilibili_config.py
index 2b516b4..779ab75 100644
--- a/MindSpider/DeepSentimentCrawling/MediaCrawler/config/bilibili_config.py
+++ b/MindSpider/DeepSentimentCrawling/MediaCrawler/config/bilibili_config.py
@@ -13,16 +13,23 @@
# 每天爬取视频/帖子的数量控制
MAX_NOTES_PER_DAY = 1
-# 指定B站视频ID列表
+# 指定B站视频URL列表 (支持完整URL或BV号)
+# 示例:
+# - 完整URL: "https://www.bilibili.com/video/BV1dwuKzmE26/?spm_id_from=333.1387.homepage.video_card.click"
+# - BV号: "BV1d54y1g7db"
BILI_SPECIFIED_ID_LIST = [
- "BV1d54y1g7db",
+ "https://www.bilibili.com/video/BV1dwuKzmE26/?spm_id_from=333.1387.homepage.video_card.click",
"BV1Sz4y1U77N",
"BV14Q4y1n7jz",
# ........................
]
-# 指定B站用户ID列表
+# 指定B站创作者URL列表 (支持完整URL或UID)
+# 示例:
+# - 完整URL: "https://space.bilibili.com/434377496?spm_id_from=333.1007.0.0"
+# - UID: "20813884"
BILI_CREATOR_ID_LIST = [
+ "https://space.bilibili.com/434377496?spm_id_from=333.1007.0.0",
"20813884",
# ........................
]
@@ -34,6 +41,11 @@ END_DAY = "2024-01-01"
# 搜索模式
BILI_SEARCH_MODE = "normal"
+# 视频清晰度(qn)配置,常见取值:
+# 16=360p, 32=480p, 64=720p, 80=1080p, 112=1080p高码率, 116=1080p60, 120=4K
+# 注意:更高清晰度需要账号/视频本身支持
+BILI_QN = 80
+
# 是否爬取用户信息
CREATOR_MODE = True
diff --git a/MindSpider/DeepSentimentCrawling/MediaCrawler/config/db_config.py b/MindSpider/DeepSentimentCrawling/MediaCrawler/config/db_config.py
index fd85c35..0b6d45b 100644
--- a/MindSpider/DeepSentimentCrawling/MediaCrawler/config/db_config.py
+++ b/MindSpider/DeepSentimentCrawling/MediaCrawler/config/db_config.py
@@ -12,11 +12,19 @@
import os
# mysql config - 使用MindSpider的数据库配置
-MYSQL_DB_PWD = "mneDccc7sHHANtFk"
-MYSQL_DB_USER = "root"
-MYSQL_DB_HOST = "rm-2zeib6b13f6tt9kncoo.mysql.rds.aliyuncs.com"
-MYSQL_DB_PORT = 3306
-MYSQL_DB_NAME = "mindspider"
+MYSQL_DB_PWD = "bettafish"
+MYSQL_DB_USER = "bettafish"
+MYSQL_DB_HOST = "127.0.0.1"
+MYSQL_DB_PORT = 5444
+MYSQL_DB_NAME = "bettafish"
+
+mysql_db_config = {
+ "user": MYSQL_DB_USER,
+ "password": MYSQL_DB_PWD,
+ "host": MYSQL_DB_HOST,
+ "port": MYSQL_DB_PORT,
+ "db_name": MYSQL_DB_NAME,
+}
# redis config
@@ -30,4 +38,24 @@ CACHE_TYPE_REDIS = "redis"
CACHE_TYPE_MEMORY = "memory"
# sqlite config
-SQLITE_DB_PATH = os.path.join(os.path.dirname(os.path.dirname(__file__)), "schema", "sqlite_tables.db")
\ No newline at end of file
+SQLITE_DB_PATH = os.path.join(os.path.dirname(os.path.dirname(__file__)), "database", "sqlite_tables.db")
+
+sqlite_db_config = {
+ "db_path": SQLITE_DB_PATH
+}
+
+# postgresql config - each value is read from its POSTGRESQL_DB_* environment variable, falling back to the default given below
+POSTGRESQL_DB_PWD = os.getenv("POSTGRESQL_DB_PWD", "bettafish")
+POSTGRESQL_DB_USER = os.getenv("POSTGRESQL_DB_USER", "bettafish")
+POSTGRESQL_DB_HOST = os.getenv("POSTGRESQL_DB_HOST", "127.0.0.1")
+POSTGRESQL_DB_PORT = os.getenv("POSTGRESQL_DB_PORT", "5444")
+POSTGRESQL_DB_NAME = os.getenv("POSTGRESQL_DB_NAME", "bettafish")
+
+postgresql_db_config = {
+ "user": POSTGRESQL_DB_USER,
+ "password": POSTGRESQL_DB_PWD,
+ "host": POSTGRESQL_DB_HOST,
+ "port": POSTGRESQL_DB_PORT,
+ "db_name": POSTGRESQL_DB_NAME,
+}
+
diff --git a/MindSpider/DeepSentimentCrawling/MediaCrawler/config/dy_config.py b/MindSpider/DeepSentimentCrawling/MediaCrawler/config/dy_config.py
index b974dca..cd36065 100644
--- a/MindSpider/DeepSentimentCrawling/MediaCrawler/config/dy_config.py
+++ b/MindSpider/DeepSentimentCrawling/MediaCrawler/config/dy_config.py
@@ -11,15 +11,27 @@
# 抖音平台配置
PUBLISH_TIME_TYPE = 0
-# 指定DY视频ID列表
+# 指定DY视频URL列表 (支持多种格式)
+# 支持格式:
+# 1. 完整视频URL: "https://www.douyin.com/video/7525538910311632128"
+# 2. 带modal_id的URL: "https://www.douyin.com/user/xxx?modal_id=7525538910311632128"
+# 3. 搜索页带modal_id: "https://www.douyin.com/root/search/python?modal_id=7525538910311632128"
+# 4. 短链接: "https://v.douyin.com/drIPtQ_WPWY/"
+# 5. 纯视频ID: "7280854932641664319"
DY_SPECIFIED_ID_LIST = [
- "7280854932641664319",
- "7202432992642387233",
+ "https://www.douyin.com/video/7525538910311632128",
+ "https://v.douyin.com/drIPtQ_WPWY/",
+ "https://www.douyin.com/user/MS4wLjABAAAATJPY7LAlaa5X-c8uNdWkvz0jUGgpw4eeXIwu_8BhvqE?from_tab_name=main&modal_id=7525538910311632128",
+ "7202432992642387233",
# ........................
]
-# 指定DY用户ID列表
+# 指定DY创作者URL列表 (支持完整URL或sec_user_id)
+# 支持格式:
+# 1. 完整创作者主页URL: "https://www.douyin.com/user/MS4wLjABAAAATJPY7LAlaa5X-c8uNdWkvz0jUGgpw4eeXIwu_8BhvqE?from_tab_name=main"
+# 2. sec_user_id: "MS4wLjABAAAATJPY7LAlaa5X-c8uNdWkvz0jUGgpw4eeXIwu_8BhvqE"
DY_CREATOR_ID_LIST = [
- "MS4wLjABAAAATJPY7LAlaa5X-c8uNdWkvz0jUGgpw4eeXIwu_8BhvqE",
+ "https://www.douyin.com/user/MS4wLjABAAAATJPY7LAlaa5X-c8uNdWkvz0jUGgpw4eeXIwu_8BhvqE?from_tab_name=main",
+ "MS4wLjABAAAATJPY7LAlaa5X-c8uNdWkvz0jUGgpw4eeXIwu_8BhvqE"
# ........................
]
diff --git a/MindSpider/DeepSentimentCrawling/MediaCrawler/config/ks_config.py b/MindSpider/DeepSentimentCrawling/MediaCrawler/config/ks_config.py
index 962b457..d84d4a7 100644
--- a/MindSpider/DeepSentimentCrawling/MediaCrawler/config/ks_config.py
+++ b/MindSpider/DeepSentimentCrawling/MediaCrawler/config/ks_config.py
@@ -10,11 +10,22 @@
# 快手平台配置
-# 指定快手视频ID列表
-KS_SPECIFIED_ID_LIST = ["3xf8enb8dbj6uig", "3x6zz972bchmvqe"]
+# 指定快手视频URL列表 (支持完整URL或纯ID)
+# 支持格式:
+# 1. 完整视频URL: "https://www.kuaishou.com/short-video/3x3zxz4mjrsc8ke?authorId=3x84qugg4ch9zhs&streamSource=search"
+# 2. 纯视频ID: "3xf8enb8dbj6uig"
+KS_SPECIFIED_ID_LIST = [
+ "https://www.kuaishou.com/short-video/3x3zxz4mjrsc8ke?authorId=3x84qugg4ch9zhs&streamSource=search&area=searchxxnull&searchKey=python",
+ "3xf8enb8dbj6uig",
+ # ........................
+]
-# 指定快手用户ID列表
+# 指定快手创作者URL列表 (支持完整URL或纯ID)
+# 支持格式:
+# 1. 创作者主页URL: "https://www.kuaishou.com/profile/3x84qugg4ch9zhs"
+# 2. 纯user_id: "3x4sm73aye7jq7i"
KS_CREATOR_ID_LIST = [
+ "https://www.kuaishou.com/profile/3x84qugg4ch9zhs",
"3x4sm73aye7jq7i",
# ........................
]
diff --git a/MindSpider/DeepSentimentCrawling/MediaCrawler/config/xhs_config.py b/MindSpider/DeepSentimentCrawling/MediaCrawler/config/xhs_config.py
index 485277a..2359b96 100644
--- a/MindSpider/DeepSentimentCrawling/MediaCrawler/config/xhs_config.py
+++ b/MindSpider/DeepSentimentCrawling/MediaCrawler/config/xhs_config.py
@@ -17,12 +17,16 @@ SORT_TYPE = "popularity_descending"
# 指定笔记URL列表, 必须要携带xsec_token参数
XHS_SPECIFIED_NOTE_URL_LIST = [
- "https://www.xiaohongshu.com/explore/66fad51c000000001b0224b8?xsec_token=AB3rO-QopW5sgrJ41GwN01WCXh6yWPxjSoFI9D5JIMgKw=&xsec_source=pc_search"
+ "https://www.xiaohongshu.com/explore/68f99f6d0000000007033fcf?xsec_token=ABZEzjuN2fPjKF9EcMsCCxfbt3IBRsFZldGFoCJbdDmXI=&xsec_source=pc_feed"
# ........................
]
-# 指定用户ID列表
+# 指定创作者URL列表 (支持完整URL或纯ID)
+# 支持格式:
+# 1. 完整创作者主页URL (带xsec_token和xsec_source参数): "https://www.xiaohongshu.com/user/profile/5eb8e1d400000000010075ae?xsec_token=AB1nWBKCo1vE2HEkfoJUOi5B6BE5n7wVrbdpHoWIj5xHw=&xsec_source=pc_feed"
+# 2. 纯user_id: "63e36c9a000000002703502b"
XHS_CREATOR_ID_LIST = [
- "63e36c9a000000002703502b",
+ "https://www.xiaohongshu.com/user/profile/5eb8e1d400000000010075ae?xsec_token=AB1nWBKCo1vE2HEkfoJUOi5B6BE5n7wVrbdpHoWIj5xHw=&xsec_source=pc_feed",
+ "63e36c9a000000002703502b",
# ........................
]
diff --git a/MindSpider/DeepSentimentCrawling/MediaCrawler/database/__init__.py b/MindSpider/DeepSentimentCrawling/MediaCrawler/database/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/MindSpider/DeepSentimentCrawling/MediaCrawler/database/db.py b/MindSpider/DeepSentimentCrawling/MediaCrawler/database/db.py
new file mode 100644
index 0000000..68a651b
--- /dev/null
+++ b/MindSpider/DeepSentimentCrawling/MediaCrawler/database/db.py
@@ -0,0 +1,35 @@
+# persist-1
+ 微信赞赏+
+ |
+
+ 支付宝赞赏+
+ |
+
标签或直接是JSON)
+ try:
+ # 尝试从页面中提取JSON
+ json_text = await self.playwright_page.evaluate("() => document.body.innerText")
+ result = json.loads(json_text)
+ utils.logger.info(f"[BaiduTieBaClient.get_notes_by_creator] 成功获取创作者帖子数据")
+ return result
+ except json.JSONDecodeError as e:
+ utils.logger.error(f"[BaiduTieBaClient.get_notes_by_creator] JSON解析失败: {e}")
+ utils.logger.error(f"[BaiduTieBaClient.get_notes_by_creator] 页面内容: {page_content[:500]}")
+ raise Exception(f"Failed to parse JSON from creator notes page: {e}")
+
+ except Exception as e:
+ utils.logger.error(f"[BaiduTieBaClient.get_notes_by_creator] 获取创作者帖子列表失败: {e}")
+ raise
async def get_all_notes_by_creator_user_name(
self,
diff --git a/MindSpider/DeepSentimentCrawling/MediaCrawler/media_platform/tieba/core.py b/MindSpider/DeepSentimentCrawling/MediaCrawler/media_platform/tieba/core.py
index 8635104..268cf26 100644
--- a/MindSpider/DeepSentimentCrawling/MediaCrawler/media_platform/tieba/core.py
+++ b/MindSpider/DeepSentimentCrawling/MediaCrawler/media_platform/tieba/core.py
@@ -11,7 +11,6 @@
import asyncio
import os
-import random
from asyncio import Task
from typing import Dict, List, Optional, Tuple
@@ -26,7 +25,7 @@ from playwright.async_api import (
import config
from base.base_crawler import AbstractCrawler
from model.m_baidu_tieba import TiebaCreator, TiebaNote
-from proxy.proxy_ip_pool import IpInfoModel, create_ip_pool
+from proxy.proxy_ip_pool import IpInfoModel, ProxyIpPool, create_ip_pool
from store import tieba as tieba_store
from tools import utils
from tools.cdp_browser import CDPBrowserManager
@@ -56,7 +55,7 @@ class TieBaCrawler(AbstractCrawler):
Returns:
"""
- ip_proxy_pool, httpx_proxy_format = None, None
+ playwright_proxy_format, httpx_proxy_format = None, None
if config.ENABLE_IP_PROXY:
utils.logger.info(
"[BaiduTieBaCrawler.start] Begin create ip proxy pool ..."
@@ -65,31 +64,73 @@ class TieBaCrawler(AbstractCrawler):
config.IP_PROXY_POOL_COUNT, enable_validate_ip=True
)
ip_proxy_info: IpInfoModel = await ip_proxy_pool.get_proxy()
- _, httpx_proxy_format = utils.format_proxy_info(ip_proxy_info)
+ playwright_proxy_format, httpx_proxy_format = utils.format_proxy_info(ip_proxy_info)
utils.logger.info(
f"[BaiduTieBaCrawler.start] Init default ip proxy, value: {httpx_proxy_format}"
)
- # Create a client to interact with the baidutieba website.
- self.tieba_client = BaiduTieBaClient(
- ip_pool=ip_proxy_pool,
- default_ip_proxy=httpx_proxy_format,
- )
- crawler_type_var.set(config.CRAWLER_TYPE)
- if config.CRAWLER_TYPE == "search":
- # Search for notes and retrieve their comment information.
- await self.search()
- await self.get_specified_tieba_notes()
- elif config.CRAWLER_TYPE == "detail":
- # Get the information and comments of the specified post
- await self.get_specified_notes()
- elif config.CRAWLER_TYPE == "creator":
- # Get creator's information and their notes and comments
- await self.get_creators_and_notes()
- else:
- pass
+ async with async_playwright() as playwright:
+ # 根据配置选择启动模式
+ if config.ENABLE_CDP_MODE:
+ utils.logger.info("[BaiduTieBaCrawler] 使用CDP模式启动浏览器")
+ self.browser_context = await self.launch_browser_with_cdp(
+ playwright,
+ playwright_proxy_format,
+ self.user_agent,
+ headless=config.CDP_HEADLESS,
+ )
+ else:
+ utils.logger.info("[BaiduTieBaCrawler] 使用标准模式启动浏览器")
+ # Launch a browser context.
+ chromium = playwright.chromium
+ self.browser_context = await self.launch_browser(
+ chromium,
+ playwright_proxy_format,
+ self.user_agent,
+ headless=config.HEADLESS,
+ )
- utils.logger.info("[BaiduTieBaCrawler.start] Tieba Crawler finished ...")
+ # 注入反检测脚本 - 针对百度的特殊检测
+ await self._inject_anti_detection_scripts()
+
+ self.context_page = await self.browser_context.new_page()
+
+ # 先访问百度首页,再点击贴吧链接,避免触发安全验证
+ await self._navigate_to_tieba_via_baidu()
+
+ # Create a client to interact with the baidutieba website.
+ self.tieba_client = await self.create_tieba_client(
+ httpx_proxy_format,
+ ip_proxy_pool if config.ENABLE_IP_PROXY else None
+ )
+
+ # Check login status and perform login if necessary
+ if not await self.tieba_client.pong(browser_context=self.browser_context):
+ login_obj = BaiduTieBaLogin(
+ login_type=config.LOGIN_TYPE,
+ login_phone="", # your phone number
+ browser_context=self.browser_context,
+ context_page=self.context_page,
+ cookie_str=config.COOKIES,
+ )
+ await login_obj.begin()
+ await self.tieba_client.update_cookies(browser_context=self.browser_context)
+
+ crawler_type_var.set(config.CRAWLER_TYPE)
+ if config.CRAWLER_TYPE == "search":
+ # Search for notes and retrieve their comment information.
+ await self.search()
+ await self.get_specified_tieba_notes()
+ elif config.CRAWLER_TYPE == "detail":
+ # Get the information and comments of the specified post
+ await self.get_specified_notes()
+ elif config.CRAWLER_TYPE == "creator":
+ # Get creator's information and their notes and comments
+ await self.get_creators_and_notes()
+ else:
+ pass
+
+ utils.logger.info("[BaiduTieBaCrawler.start] Tieba Crawler finished ...")
async def search(self) -> None:
"""
@@ -141,6 +182,11 @@ class TieBaCrawler(AbstractCrawler):
await self.get_specified_notes(
note_id_list=[note_detail.note_id for note_detail in notes_list]
)
+
+ # Sleep after page navigation
+ await asyncio.sleep(config.CRAWLER_MAX_SLEEP_SEC)
+ utils.logger.info(f"[TieBaCrawler.search] Sleeping for {config.CRAWLER_MAX_SLEEP_SEC} seconds after page {page}")
+
page += 1
except Exception as ex:
utils.logger.error(
@@ -178,6 +224,11 @@ class TieBaCrawler(AbstractCrawler):
f"[BaiduTieBaCrawler.get_specified_tieba_notes] tieba name: {tieba_name} note list len: {len(note_list)}"
)
await self.get_specified_notes([note.note_id for note in note_list])
+
+ # Sleep after processing notes
+ await asyncio.sleep(config.CRAWLER_MAX_SLEEP_SEC)
+ utils.logger.info(f"[TieBaCrawler.get_specified_tieba_notes] Sleeping for {config.CRAWLER_MAX_SLEEP_SEC} seconds after processing notes from page {page_number}")
+
page_number += tieba_limit_count
async def get_specified_notes(
@@ -222,6 +273,11 @@ class TieBaCrawler(AbstractCrawler):
f"[BaiduTieBaCrawler.get_note_detail] Begin get note detail, note_id: {note_id}"
)
note_detail: TiebaNote = await self.tieba_client.get_note_by_id(note_id)
+
+ # Sleep after fetching note details
+ await asyncio.sleep(config.CRAWLER_MAX_SLEEP_SEC)
+ utils.logger.info(f"[TieBaCrawler.get_note_detail_async_task] Sleeping for {config.CRAWLER_MAX_SLEEP_SEC} seconds after fetching note details {note_id}")
+
if not note_detail:
utils.logger.error(
f"[BaiduTieBaCrawler.get_note_detail] Get note detail error, note_id: {note_id}"
@@ -277,9 +333,14 @@ class TieBaCrawler(AbstractCrawler):
utils.logger.info(
f"[BaiduTieBaCrawler.get_comments] Begin get note id comments {note_detail.note_id}"
)
+
+ # Sleep before fetching comments
+ await asyncio.sleep(config.CRAWLER_MAX_SLEEP_SEC)
+ utils.logger.info(f"[TieBaCrawler.get_comments_async_task] Sleeping for {config.CRAWLER_MAX_SLEEP_SEC} seconds before fetching comments for note {note_detail.note_id}")
+
await self.tieba_client.get_note_all_comments(
note_detail=note_detail,
- crawl_interval=random.random(),
+ crawl_interval=config.CRAWLER_MAX_SLEEP_SEC,
callback=tieba_store.batch_update_tieba_note_comments,
max_count=config.CRAWLER_MAX_COMMENTS_COUNT_SINGLENOTES,
)
@@ -327,6 +388,198 @@ class TieBaCrawler(AbstractCrawler):
f"[WeiboCrawler.get_creators_and_notes] get creator info error, creator_url:{creator_url}"
)
+ async def _navigate_to_tieba_via_baidu(self):
+ """
+ Reach Tieba along the path a human visitor would take (intended to avoid Baidu's security check):
+ 1. Open the Baidu home page (https://www.baidu.com/)
+ 2. Pause for config.CRAWLER_MAX_SLEEP_SEC seconds
+ 3. Find and click the "贴吧" (Tieba) link on that page
+ 4. Land on Tieba, switching self.context_page to the new tab if the link opens one
+
+ Falls back to loading self.index_url directly when no link is found or any step raises.
+ """
+ utils.logger.info("[TieBaCrawler] 模拟真实用户访问路径...")
+
+ try:
+ # Step 1: open the Baidu home page
+ utils.logger.info("[TieBaCrawler] Step 1: 访问百度首页 https://www.baidu.com/")
+ await self.context_page.goto("https://www.baidu.com/", wait_until="domcontentloaded")
+
+ # Step 2: pause for the delay configured in config.CRAWLER_MAX_SLEEP_SEC
+ utils.logger.info(f"[TieBaCrawler] Step 2: 等待 {config.CRAWLER_MAX_SLEEP_SEC}秒 模拟用户浏览...")
+ await asyncio.sleep(config.CRAWLER_MAX_SLEEP_SEC)
+
+ # Step 3: locate the "贴吧" (Tieba) link
+ utils.logger.info("[TieBaCrawler] Step 3: 查找并点击'贴吧'链接...")
+
+ # Try several selectors in order and keep the first one that resolves
+ tieba_selectors = [
+ 'a[href="http://tieba.baidu.com/"]',
+ 'a[href="https://tieba.baidu.com/"]',
+ 'a.mnav:has-text("贴吧")',
+ 'text=贴吧',
+ ]
+
+ tieba_link = None
+ for selector in tieba_selectors:
+ try:
+ tieba_link = await self.context_page.wait_for_selector(selector, timeout=5000)
+ if tieba_link:
+ utils.logger.info(f"[TieBaCrawler] 找到贴吧链接 (selector: {selector})")
+ break
+ except Exception:
+ continue
+
+ if not tieba_link:
+ utils.logger.warning("[TieBaCrawler] 未找到贴吧链接,直接访问贴吧首页")
+ await self.context_page.goto(self.index_url, wait_until="domcontentloaded")
+ return
+
+ # Step 4: click the link, handling the case where it opens a new tab
+ utils.logger.info("[TieBaCrawler] Step 4: 点击贴吧链接...")
+
+ # Read the link's target attribute to see where it will open
+ target_attr = await tieba_link.get_attribute("target")
+ utils.logger.info(f"[TieBaCrawler] 链接target属性: {target_attr}")
+
+ if target_attr == "_blank":
+ # target="_blank": wait for the new page and switch to it
+ utils.logger.info("[TieBaCrawler] 链接会在新标签页打开,等待新页面...")
+
+ async with self.browser_context.expect_page() as new_page_info:
+ await tieba_link.click()
+
+ # Grab the newly opened page
+ new_page = await new_page_info.value
+ await new_page.wait_for_load_state("domcontentloaded")
+
+ # Close the old Baidu home page tab
+ await self.context_page.close()
+
+ # Use the new Tieba page from now on
+ self.context_page = new_page
+ utils.logger.info("[TieBaCrawler] ✅ 已切换到新标签页 (贴吧页面)")
+ else:
+ # Same-tab link: wait for the navigation triggered by the click
+ utils.logger.info("[TieBaCrawler] 链接在当前标签页跳转...")
+ async with self.context_page.expect_navigation(wait_until="domcontentloaded"):
+ await tieba_link.click()
+
+ # Step 5: pause again for config.CRAWLER_MAX_SLEEP_SEC before continuing
+ utils.logger.info(f"[TieBaCrawler] Step 5: 页面加载完成,等待 {config.CRAWLER_MAX_SLEEP_SEC}秒...")
+ await asyncio.sleep(config.CRAWLER_MAX_SLEEP_SEC)
+
+ current_url = self.context_page.url
+ utils.logger.info(f"[TieBaCrawler] ✅ 成功通过百度首页进入贴吧! 当前URL: {current_url}")
+
+ except Exception as e:
+ utils.logger.error(f"[TieBaCrawler] 通过百度首页访问贴吧失败: {e}")
+ utils.logger.info("[TieBaCrawler] 回退:直接访问贴吧首页")
+ await self.context_page.goto(self.index_url, wait_until="domcontentloaded")
+
+ async def _inject_anti_detection_scripts(self):
+ """
+ Register an init script on self.browser_context that overrides common automation fingerprints.
+ The saved permissions.query is bound to navigator.permissions so delegated calls do not throw "Illegal invocation".
+ """
+ utils.logger.info("[TieBaCrawler] Injecting anti-detection scripts...")
+
+ # Lightweight script: overrides webdriver, chrome, permissions, plugins, languages and cdc_ leftovers only
+ anti_detection_js = """
+ // 覆盖 navigator.webdriver
+ Object.defineProperty(navigator, 'webdriver', {
+ get: () => undefined,
+ configurable: true
+ });
+
+ // 覆盖 window.navigator.chrome
+ if (!window.navigator.chrome) {
+ window.navigator.chrome = {
+ runtime: {},
+ loadTimes: function() {},
+ csi: function() {},
+ app: {}
+ };
+ }
+
+ // 覆盖 Permissions API
+ const originalQuery = window.navigator.permissions.query.bind(window.navigator.permissions);
+ window.navigator.permissions.query = (parameters) => (
+ parameters.name === 'notifications' ?
+ Promise.resolve({ state: Notification.permission }) :
+ originalQuery(parameters)
+ );
+
+ // 覆盖 plugins 长度(让它看起来有插件)
+ Object.defineProperty(navigator, 'plugins', {
+ get: () => [1, 2, 3, 4, 5],
+ configurable: true
+ });
+
+ // 覆盖 languages
+ Object.defineProperty(navigator, 'languages', {
+ get: () => ['zh-CN', 'zh', 'en'],
+ configurable: true
+ });
+
+ // 移除 window.cdc_ 等 ChromeDriver 残留
+ delete window.cdc_adoQpoasnfa76pfcZLmcfl_Array;
+ delete window.cdc_adoQpoasnfa76pfcZLmcfl_Promise;
+ delete window.cdc_adoQpoasnfa76pfcZLmcfl_Symbol;
+
+ console.log('[Anti-Detection] Scripts injected successfully');
+ """
+
+ await self.browser_context.add_init_script(anti_detection_js)
+ utils.logger.info("[TieBaCrawler] Anti-detection scripts injected")
+
+ async def create_tieba_client(
+ self, httpx_proxy: Optional[str], ip_pool: Optional[ProxyIpPool] = None
+ ) -> BaiduTieBaClient:
+ """
+ Create tieba client with real browser User-Agent and complete headers
+ Args:
+ httpx_proxy: proxy handed to the client as default_ip_proxy
+ ip_pool: proxy IP pool handed to the client, or None
+
+ Returns:
+ A BaiduTieBaClient instance
+ """
+ utils.logger.info("[TieBaCrawler.create_tieba_client] Begin create tieba API client...")
+
+ # Read the User-Agent from the live browser page so the client's header matches it
+ user_agent = await self.context_page.evaluate("() => navigator.userAgent")
+ utils.logger.info(f"[TieBaCrawler.create_tieba_client] Extracted User-Agent from browser: {user_agent}")
+
+ cookie_str, _ = utils.convert_cookies(await self.browser_context.cookies())
+
+ # Build a full set of browser-like request headers
+ tieba_client = BaiduTieBaClient(
+ timeout=10,
+ ip_pool=ip_pool,
+ default_ip_proxy=httpx_proxy,
+ headers={
+ "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
+ "Accept-Language": "zh-CN,zh;q=0.9",
+ "Accept-Encoding": "gzip, deflate, br",
+ "Connection": "keep-alive",
+ "User-Agent": user_agent, # UA read from the browser above
+ "Cookie": cookie_str,
+ "Host": "tieba.baidu.com",
+ "Referer": "https://tieba.baidu.com/",
+ "Sec-Fetch-Dest": "document",
+ "Sec-Fetch-Mode": "navigate",
+ "Sec-Fetch-Site": "same-origin",
+ "Sec-Fetch-User": "?1",
+ "Upgrade-Insecure-Requests": "1",
+ "sec-ch-ua": '"Google Chrome";v="141", "Not?A_Brand";v="8", "Chromium";v="141"', # NOTE(review): sec-ch-ua* values are hard-coded and may not match the extracted User-Agent - confirm
+ "sec-ch-ua-mobile": "?0",
+ "sec-ch-ua-platform": '"macOS"',
+ },
+ playwright_page=self.context_page, # hand the Playwright page to the client
+ )
+ return tieba_client
+
async def launch_browser(
self,
chromium: BrowserType,
diff --git a/MindSpider/DeepSentimentCrawling/MediaCrawler/media_platform/weibo/core.py b/MindSpider/DeepSentimentCrawling/MediaCrawler/media_platform/weibo/core.py
index 552801f..e78a212 100644
--- a/MindSpider/DeepSentimentCrawling/MediaCrawler/media_platform/weibo/core.py
+++ b/MindSpider/DeepSentimentCrawling/MediaCrawler/media_platform/weibo/core.py
@@ -15,7 +15,7 @@
import asyncio
import os
-import random
+# NOTE: "random" is no longer imported here; crawl delays use the fixed config.CRAWLER_MAX_SLEEP_SEC interval
from asyncio import Task
from typing import Dict, List, Optional, Tuple
@@ -77,8 +77,11 @@ class WeiboCrawler(AbstractCrawler):
# Launch a browser context.
chromium = playwright.chromium
self.browser_context = await self.launch_browser(chromium, None, self.mobile_user_agent, headless=config.HEADLESS)
- # stealth.min.js is a js script to prevent the website from detecting the crawler.
- await self.browser_context.add_init_script(path="libs/stealth.min.js")
+
+ # stealth.min.js is a js script to prevent the website from detecting the crawler.
+ await self.browser_context.add_init_script(path="libs/stealth.min.js")
+
+
self.context_page = await self.browser_context.new_page()
await self.context_page.goto(self.mobile_index_url)
@@ -160,6 +163,11 @@ class WeiboCrawler(AbstractCrawler):
await self.get_note_images(mblog)
page += 1
+
+ # Sleep after page navigation
+ await asyncio.sleep(config.CRAWLER_MAX_SLEEP_SEC)
+ utils.logger.info(f"[WeiboCrawler.search] Sleeping for {config.CRAWLER_MAX_SLEEP_SEC} seconds after page {page-1}")
+
await self.batch_get_notes_comments(note_id_list)
async def get_specified_notes(self):
@@ -185,6 +193,11 @@ class WeiboCrawler(AbstractCrawler):
async with semaphore:
try:
result = await self.wb_client.get_note_info_by_id(note_id)
+
+ # Sleep after fetching note details
+ await asyncio.sleep(config.CRAWLER_MAX_SLEEP_SEC)
+ utils.logger.info(f"[WeiboCrawler.get_note_info_task] Sleeping for {config.CRAWLER_MAX_SLEEP_SEC} seconds after fetching note details {note_id}")
+
return result
except DataFetchError as ex:
utils.logger.error(f"[WeiboCrawler.get_note_info_task] Get note detail error: {ex}")
@@ -221,9 +234,14 @@ class WeiboCrawler(AbstractCrawler):
async with semaphore:
try:
utils.logger.info(f"[WeiboCrawler.get_note_comments] begin get note_id: {note_id} comments ...")
+
+ # Sleep before fetching comments
+ await asyncio.sleep(config.CRAWLER_MAX_SLEEP_SEC)
+ utils.logger.info(f"[WeiboCrawler.get_note_comments] Sleeping for {config.CRAWLER_MAX_SLEEP_SEC} seconds before fetching comments for note {note_id}")
+
await self.wb_client.get_note_all_comments(
note_id=note_id,
- crawl_interval=random.randint(1, 3), # 微博对API的限流比较严重,所以延时提高一些
+ crawl_interval=config.CRAWLER_MAX_SLEEP_SEC, # Use fixed interval instead of random
callback=weibo_store.batch_update_weibo_note_comments,
max_count=config.CRAWLER_MAX_COMMENTS_COUNT_SINGLENOTES,
)
@@ -250,7 +268,8 @@ class WeiboCrawler(AbstractCrawler):
if not url:
continue
content = await self.wb_client.get_note_image(url)
- await asyncio.sleep(random.random())
+ await asyncio.sleep(config.CRAWLER_MAX_SLEEP_SEC)
+ utils.logger.info(f"[WeiboCrawler.get_note_images] Sleeping for {config.CRAWLER_MAX_SLEEP_SEC} seconds after fetching image")
if content != None:
extension_file_name = url.split(".")[-1]
await weibo_store.update_weibo_note_image(pic["pid"], content, extension_file_name)
diff --git a/MindSpider/DeepSentimentCrawling/MediaCrawler/media_platform/xhs/client.py b/MindSpider/DeepSentimentCrawling/MediaCrawler/media_platform/xhs/client.py
index 982373a..652667f 100644
--- a/MindSpider/DeepSentimentCrawling/MediaCrawler/media_platform/xhs/client.py
+++ b/MindSpider/DeepSentimentCrawling/MediaCrawler/media_platform/xhs/client.py
@@ -10,22 +10,24 @@
import asyncio
import json
-import re
+import time
from typing import Any, Callable, Dict, List, Optional, Union
from urllib.parse import urlencode
import httpx
from playwright.async_api import BrowserContext, Page
-from tenacity import retry, stop_after_attempt, wait_fixed, retry_if_result
+from tenacity import retry, stop_after_attempt, wait_fixed
import config
from base.base_crawler import AbstractApiClient
from tools import utils
-from html import unescape
+
from .exception import DataFetchError, IPBlockError
from .field import SearchNoteType, SearchSortType
from .help import get_search_id, sign
+from .extractor import XiaoHongShuExtractor
+from .secsign import seccore_signv2_playwright
class XiaoHongShuClient(AbstractApiClient):
@@ -50,6 +52,7 @@ class XiaoHongShuClient(AbstractApiClient):
self.NOTE_ABNORMAL_CODE = -510001
self.playwright_page = playwright_page
self.cookie_dict = cookie_dict
+ self._extractor = XiaoHongShuExtractor()
async def _pre_headers(self, url: str, data=None) -> Dict:
"""
@@ -61,13 +64,13 @@ class XiaoHongShuClient(AbstractApiClient):
Returns:
"""
- encrypt_params = await self.playwright_page.evaluate("([url, data]) => window._webmsxyw(url,data)", [url, data])
+ x_s = await seccore_signv2_playwright(self.playwright_page, url, data)
local_storage = await self.playwright_page.evaluate("() => window.localStorage")
signs = sign(
a1=self.cookie_dict.get("a1", ""),
b1=local_storage.get("b1", ""),
- x_s=encrypt_params.get("X-s", ""),
- x_t=str(encrypt_params.get("X-t", "")),
+ x_s=x_s,
+ x_t=str(int(time.time())),
)
headers = {
@@ -128,7 +131,9 @@ class XiaoHongShuClient(AbstractApiClient):
if isinstance(params, dict):
final_uri = f"{uri}?" f"{urlencode(params)}"
headers = await self._pre_headers(final_uri)
- return await self.request(method="GET", url=f"{self._host}{final_uri}", headers=headers)
+ return await self.request(
+ method="GET", url=f"{self._host}{final_uri}", headers=headers
+ )
async def post(self, uri: str, data: dict, **kwargs) -> Dict:
"""
@@ -156,12 +161,18 @@ class XiaoHongShuClient(AbstractApiClient):
response = await client.request("GET", url, timeout=self.timeout)
response.raise_for_status()
if not response.reason_phrase == "OK":
- utils.logger.error(f"[XiaoHongShuClient.get_note_media] request {url} err, res:{response.text}")
+ utils.logger.error(
+ f"[XiaoHongShuClient.get_note_media] request {url} err, res:{response.text}"
+ )
return None
else:
return response.content
- except httpx.HTTPError as exc: # some wrong when call httpx.request method, such as connection error, client error, server error or response status code is not 2xx
- utils.logger.error(f"[DouYinClient.get_aweme_media] {exc.__class__.__name__} for {exc.request.url} - {exc}") # 保留原始异常类型名称,以便开发者调试
+ except (
+ httpx.HTTPError
+ ) as exc: # some wrong when call httpx.request method, such as connection error, client error, server error or response status code is not 2xx
+ utils.logger.error(
+ f"[XiaoHongShuClient.get_aweme_media] {exc.__class__.__name__} for {exc.request.url} - {exc}"
+ ) # 保留原始异常类型名称,以便开发者调试
return None
async def pong(self) -> bool:
@@ -178,7 +189,9 @@ class XiaoHongShuClient(AbstractApiClient):
if note_card.get("items"):
ping_flag = True
except Exception as e:
- utils.logger.error(f"[XiaoHongShuClient.pong] Ping xhs failed: {e}, and try to login again...")
+ utils.logger.error(
+ f"[XiaoHongShuClient.pong] Ping xhs failed: {e}, and try to login again..."
+ )
ping_flag = False
return ping_flag
@@ -249,9 +262,7 @@ class XiaoHongShuClient(AbstractApiClient):
data = {
"source_note_id": note_id,
"image_formats": ["jpg", "webp", "avif"],
- "extra": {
- "need_body_topic": 1
- },
+ "extra": {"need_body_topic": 1},
"xsec_source": xsec_source,
"xsec_token": xsec_token,
}
@@ -261,7 +272,9 @@ class XiaoHongShuClient(AbstractApiClient):
res_dict: Dict = res["items"][0]["note_card"]
return res_dict
# 爬取频繁了可能会出现有的笔记能有结果有的没有
- utils.logger.error(f"[XiaoHongShuClient.get_note_by_id] get note id:{note_id} empty and res:{res}")
+ utils.logger.error(
+ f"[XiaoHongShuClient.get_note_by_id] get note id:{note_id} empty and res:{res}"
+ )
return dict()
async def get_note_comments(
@@ -345,15 +358,19 @@ class XiaoHongShuClient(AbstractApiClient):
comments_has_more = True
comments_cursor = ""
while comments_has_more and len(result) < max_count:
- comments_res = await self.get_note_comments(note_id=note_id, xsec_token=xsec_token, cursor=comments_cursor)
+ comments_res = await self.get_note_comments(
+ note_id=note_id, xsec_token=xsec_token, cursor=comments_cursor
+ )
comments_has_more = comments_res.get("has_more", False)
comments_cursor = comments_res.get("cursor", "")
if "comments" not in comments_res:
- utils.logger.info(f"[XiaoHongShuClient.get_note_all_comments] No 'comments' key found in response: {comments_res}")
+ utils.logger.info(
+ f"[XiaoHongShuClient.get_note_all_comments] No 'comments' key found in response: {comments_res}"
+ )
break
comments = comments_res["comments"]
if len(result) + len(comments) > max_count:
- comments = comments[:max_count - len(result)]
+ comments = comments[: max_count - len(result)]
if callback:
await callback(note_id, comments)
await asyncio.sleep(crawl_interval)
@@ -386,7 +403,9 @@ class XiaoHongShuClient(AbstractApiClient):
"""
if not config.ENABLE_GET_SUB_COMMENTS:
- utils.logger.info(f"[XiaoHongShuCrawler.get_comments_all_sub_comments] Crawling sub_comment mode is not enabled")
+ utils.logger.info(
+ f"[XiaoHongShuCrawler.get_comments_all_sub_comments] Crawling sub_comment mode is not enabled"
+ )
return []
result = []
@@ -413,12 +432,16 @@ class XiaoHongShuClient(AbstractApiClient):
)
if comments_res is None:
- utils.logger.info(f"[XiaoHongShuClient.get_comments_all_sub_comments] No response found for note_id: {note_id}")
+ utils.logger.info(
+ f"[XiaoHongShuClient.get_comments_all_sub_comments] No response found for note_id: {note_id}"
+ )
continue
sub_comment_has_more = comments_res.get("has_more", False)
sub_comment_cursor = comments_res.get("cursor", "")
if "comments" not in comments_res:
- utils.logger.info(f"[XiaoHongShuClient.get_comments_all_sub_comments] No 'comments' key found in response: {comments_res}")
+ utils.logger.info(
+ f"[XiaoHongShuClient.get_comments_all_sub_comments] No 'comments' key found in response: {comments_res}"
+ )
break
comments = comments_res["comments"]
if callback:
@@ -427,23 +450,30 @@ class XiaoHongShuClient(AbstractApiClient):
result.extend(comments)
return result
- async def get_creator_info(self, user_id: str) -> Dict:
+ async def get_creator_info(
+ self, user_id: str, xsec_token: str = "", xsec_source: str = ""
+ ) -> Dict:
"""
通过解析网页版的用户主页HTML,获取用户个人简要信息
PC端用户主页的网页存在window.__INITIAL_STATE__这个变量上的,解析它即可
- eg: https://www.xiaohongshu.com/user/profile/59d8cb33de5fb4696bf17217
+
+ Args:
+ user_id: 用户ID
+ xsec_token: 验证token (可选,如果URL中包含此参数则传入)
+ xsec_source: 渠道来源 (可选,如果URL中包含此参数则传入)
+
+ Returns:
+ Dict: 创作者信息
"""
+ # 构建URI,如果有xsec参数则添加到URL中
uri = f"/user/profile/{user_id}"
- html_content = await self.request("GET", self._domain + uri, return_response=True, headers=self.headers)
- match = re.search(r"", html)[0].replace("undefined", '""')
-
- if state != "{}":
- note_dict = transform_json_keys(state)
- return note_dict["note"]["note_detail_map"][note_id]["note"]
- return {}
-
- try:
- return get_note_dict(html)
- except:
- return None
+ return self._extractor.extract_note_detail_from_html(note_id, html)
diff --git a/MindSpider/DeepSentimentCrawling/MediaCrawler/media_platform/xhs/core.py b/MindSpider/DeepSentimentCrawling/MediaCrawler/media_platform/xhs/core.py
index 9c88f1c..68d2139 100644
--- a/MindSpider/DeepSentimentCrawling/MediaCrawler/media_platform/xhs/core.py
+++ b/MindSpider/DeepSentimentCrawling/MediaCrawler/media_platform/xhs/core.py
@@ -11,9 +11,8 @@
import asyncio
import os
import random
-import time
from asyncio import Task
-from typing import Dict, List, Optional, Tuple
+from typing import Dict, List, Optional
from playwright.async_api import (
BrowserContext,
@@ -27,7 +26,7 @@ from tenacity import RetryError
import config
from base.base_crawler import AbstractCrawler
from config import CRAWLER_MAX_COMMENTS_COUNT_SINGLENOTES
-from model.m_xiaohongshu import NoteUrlInfo
+from model.m_xiaohongshu import NoteUrlInfo, CreatorUrlInfo
from proxy.proxy_ip_pool import IpInfoModel, create_ip_pool
from store import xhs as xhs_store
from tools import utils
@@ -37,7 +36,7 @@ from var import crawler_type_var, source_keyword_var
from .client import XiaoHongShuClient
from .exception import DataFetchError
from .field import SearchSortType
-from .help import parse_note_info_from_note_url, get_search_id
+from .help import parse_note_info_from_note_url, parse_creator_info_from_url, get_search_id
from .login import XiaoHongShuLogin
@@ -80,8 +79,9 @@ class XiaoHongShuCrawler(AbstractCrawler):
self.user_agent,
headless=config.HEADLESS,
)
- # stealth.min.js is a js script to prevent the website from detecting the crawler.
- await self.browser_context.add_init_script(path="libs/stealth.min.js")
+ # stealth.min.js is a js script to prevent the website from detecting the crawler.
+ await self.browser_context.add_init_script(path="libs/stealth.min.js")
+
self.context_page = await self.browser_context.new_page()
await self.context_page.goto(self.index_url)
@@ -164,6 +164,10 @@ class XiaoHongShuCrawler(AbstractCrawler):
page += 1
utils.logger.info(f"[XiaoHongShuCrawler.search] Note details: {note_details}")
await self.batch_get_note_comments(note_ids, xsec_tokens)
+
+ # Sleep after each page navigation
+ await asyncio.sleep(config.CRAWLER_MAX_SLEEP_SEC)
+ utils.logger.info(f"[XiaoHongShuCrawler.search] Sleeping for {config.CRAWLER_MAX_SLEEP_SEC} seconds after page {page-1}")
except DataFetchError:
utils.logger.error("[XiaoHongShuCrawler.search] Get note detail error")
break
@@ -171,17 +175,27 @@ class XiaoHongShuCrawler(AbstractCrawler):
async def get_creators_and_notes(self) -> None:
"""Get creator's notes and retrieve their comment information."""
utils.logger.info("[XiaoHongShuCrawler.get_creators_and_notes] Begin get xiaohongshu creators")
- for user_id in config.XHS_CREATOR_ID_LIST:
- # get creator detail info from web html content
- createor_info: Dict = await self.xhs_client.get_creator_info(user_id=user_id)
- if createor_info:
- await xhs_store.save_creator(user_id, creator=createor_info)
+ for creator_url in config.XHS_CREATOR_ID_LIST:
+ try:
+ # Parse creator URL to get user_id and security tokens
+ creator_info: CreatorUrlInfo = parse_creator_info_from_url(creator_url)
+ utils.logger.info(f"[XiaoHongShuCrawler.get_creators_and_notes] Parse creator URL info: {creator_info}")
+ user_id = creator_info.user_id
- # When proxy is not enabled, increase the crawling interval
- if config.ENABLE_IP_PROXY:
- crawl_interval = random.random()
- else:
- crawl_interval = random.uniform(1, config.CRAWLER_MAX_SLEEP_SEC)
+ # get creator detail info from web html content
+ createor_info: Dict = await self.xhs_client.get_creator_info(
+ user_id=user_id,
+ xsec_token=creator_info.xsec_token,
+ xsec_source=creator_info.xsec_source
+ )
+ if createor_info:
+ await xhs_store.save_creator(user_id, creator=createor_info)
+ except ValueError as e:
+ utils.logger.error(f"[XiaoHongShuCrawler.get_creators_and_notes] Failed to parse creator URL: {e}")
+ continue
+
+ # Use fixed crawling interval
+ crawl_interval = config.CRAWLER_MAX_SLEEP_SEC
# Get all note information of the creator
all_notes_list = await self.xhs_client.get_all_notes_by_creator(
user_id=user_id,
@@ -268,18 +282,16 @@ class XiaoHongShuCrawler(AbstractCrawler):
async with semaphore:
try:
utils.logger.info(f"[get_note_detail_async_task] Begin get note detail, note_id: {note_id}")
-
- try:
- note_detail = await self.xhs_client.get_note_by_id(note_id, xsec_source, xsec_token)
- except RetryError as e:
- pass
-
+ note_detail = await self.xhs_client.get_note_by_id_from_html(note_id, xsec_source, xsec_token, enable_cookie=True)
if not note_detail:
- note_detail = await self.xhs_client.get_note_by_id_from_html(note_id, xsec_source, xsec_token, enable_cookie=True)
- if not note_detail:
- raise Exception(f"[get_note_detail_async_task] Failed to get note detail, Id: {note_id}")
+ raise Exception(f"[get_note_detail_async_task] Failed to get note detail, Id: {note_id}")
note_detail.update({"xsec_token": xsec_token, "xsec_source": xsec_source})
+
+ # Sleep after fetching note detail
+ await asyncio.sleep(config.CRAWLER_MAX_SLEEP_SEC)
+ utils.logger.info(f"[get_note_detail_async_task] Sleeping for {config.CRAWLER_MAX_SLEEP_SEC} seconds after fetching note {note_id}")
+
return note_detail
except DataFetchError as ex:
@@ -310,11 +322,8 @@ class XiaoHongShuCrawler(AbstractCrawler):
"""Get note comments with keyword filtering and quantity limitation"""
async with semaphore:
utils.logger.info(f"[XiaoHongShuCrawler.get_comments] Begin get note id comments {note_id}")
- # When proxy is not enabled, increase the crawling interval
- if config.ENABLE_IP_PROXY:
- crawl_interval = random.random()
- else:
- crawl_interval = random.uniform(1, config.CRAWLER_MAX_SLEEP_SEC)
+ # Use fixed crawling interval
+ crawl_interval = config.CRAWLER_MAX_SLEEP_SEC
await self.xhs_client.get_note_all_comments(
note_id=note_id,
xsec_token=xsec_token,
@@ -322,6 +331,10 @@ class XiaoHongShuCrawler(AbstractCrawler):
callback=xhs_store.batch_update_xhs_note_comments,
max_count=CRAWLER_MAX_COMMENTS_COUNT_SINGLENOTES,
)
+
+ # Sleep after fetching comments
+ await asyncio.sleep(crawl_interval)
+ utils.logger.info(f"[XiaoHongShuCrawler.get_comments] Sleeping for {crawl_interval} seconds after fetching comments for note {note_id}")
async def create_xhs_client(self, httpx_proxy: Optional[str]) -> XiaoHongShuClient:
"""Create xhs client"""
diff --git a/MindSpider/DeepSentimentCrawling/MediaCrawler/media_platform/xhs/extractor.py b/MindSpider/DeepSentimentCrawling/MediaCrawler/media_platform/xhs/extractor.py
new file mode 100644
index 0000000..b8d7540
--- /dev/null
+++ b/MindSpider/DeepSentimentCrawling/MediaCrawler/media_platform/xhs/extractor.py
@@ -0,0 +1,60 @@
+# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则:
+# 1. 不得用于任何商业用途。
+# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。
+# 3. 不得进行大规模爬取或对平台造成运营干扰。
+# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。
+# 5. 不得用于任何非法或不当的用途。
+#
+# 详细许可条款请参阅项目根目录下的LICENSE文件。
+# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。
+
+import json
+import re
+from typing import Dict, Optional
+
+import humps
+
+
+class XiaoHongShuExtractor:
+    def __init__(self):
+        """Create an extractor; no instance attributes are initialised."""
+
+    def extract_note_detail_from_html(self, note_id: str, html: str) -> Optional[Dict]:
+        """Extract one note's detail dict from a note page's HTML.
+
+        Args:
+            note_id (str): key looked up in the page state's note-detail map
+            html (str): raw HTML of the note page
+
+        Returns:
+            Optional[Dict]: decamelized note detail; None when the state or the note is absent
+        """
+        # No "noteDetailMap" marker: per the original author's note, a captcha page or a missing note.
+        if "noteDetailMap" not in html:
+            return None
+        # re.search instead of findall(...)[0]: a page without the assignment yields None, not IndexError.
+        found = re.search(r"window\.__INITIAL_STATE__=({.*})", html)
+        state = found.group(1).replace("undefined", '""') if found else "{}"  # JS undefined is not JSON
+        if state == "{}":
+            return None
+        detail_map = humps.decamelize(json.loads(state)).get("note", {}).get("note_detail_map", {})
+        return detail_map.get(note_id, {}).get("note")
+
+ def extract_creator_info_from_html(self, html: str) -> Optional[Dict]:
+ """从html中提取用户信息
+
+ Args:
+ html (str): html字符串
+
+ Returns:
+ Dict: 用户信息字典
+ """
+ match = re.search(
+ r"