Uploading the AI Crawler System: MindSpider
This commit is contained in:
@@ -0,0 +1,13 @@
|
||||
# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则:
|
||||
# 1. 不得用于任何商业用途。
|
||||
# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。
|
||||
# 3. 不得进行大规模爬取或对平台造成运营干扰。
|
||||
# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。
|
||||
# 5. 不得用于任何非法或不当的用途。
|
||||
#
|
||||
# 详细许可条款请参阅项目根目录下的LICENSE文件。
|
||||
# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。
|
||||
|
||||
|
||||
# -*- coding: utf-8 -*-
|
||||
from .core import TieBaCrawler
|
||||
@@ -0,0 +1,385 @@
|
||||
# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则:
|
||||
# 1. 不得用于任何商业用途。
|
||||
# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。
|
||||
# 3. 不得进行大规模爬取或对平台造成运营干扰。
|
||||
# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。
|
||||
# 5. 不得用于任何非法或不当的用途。
|
||||
#
|
||||
# 详细许可条款请参阅项目根目录下的LICENSE文件。
|
||||
# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。
|
||||
|
||||
import asyncio
|
||||
import json
|
||||
from typing import Any, Callable, Dict, List, Optional, Union
|
||||
from urllib.parse import urlencode
|
||||
|
||||
import httpx
|
||||
from playwright.async_api import BrowserContext
|
||||
from tenacity import RetryError, retry, stop_after_attempt, wait_fixed
|
||||
|
||||
import config
|
||||
from base.base_crawler import AbstractApiClient
|
||||
from model.m_baidu_tieba import TiebaComment, TiebaCreator, TiebaNote
|
||||
from proxy.proxy_ip_pool import ProxyIpPool
|
||||
from tools import utils
|
||||
|
||||
from .field import SearchNoteType, SearchSortType
|
||||
from .help import TieBaExtractor
|
||||
|
||||
|
||||
class BaiduTieBaClient(AbstractApiClient):
|
||||
|
||||
def __init__(
    self,
    timeout=10,
    ip_pool=None,
    default_ip_proxy=None,
):
    """
    Set up the Tieba HTTP client state.

    Args:
        timeout: timeout value passed to every httpx request
        ip_pool: optional proxy pool; ``get`` asks it for a new proxy when retries are exhausted
        default_ip_proxy: proxy used when a request does not specify one
    """
    # Connection settings.
    self.timeout = timeout
    self.ip_pool: Optional[ProxyIpPool] = ip_pool
    self.default_ip_proxy = default_ip_proxy

    # Fixed target host; every relative uri is appended to it.
    self._host = "https://tieba.baidu.com"

    # Default headers sent with every request. The cookie value starts out empty.
    default_headers = {"User-Agent": utils.get_user_agent(), "Cookies": ""}
    self.headers = default_headers

    # HTML parser used to turn page content into model objects.
    self._page_extractor = TieBaExtractor()
|
||||
|
||||
@retry(stop=stop_after_attempt(3), wait=wait_fixed(1))
async def request(self, method, url, return_ori_content=False, proxy=None, **kwargs) -> Union[str, Any]:
    """
    Shared httpx request wrapper that validates the response before returning it.

    The tenacity decorator re-runs this coroutine when it raises: at most three
    attempts, with a fixed one second wait between them.

    Args:
        method: HTTP method
        url: absolute URL to request
        return_ori_content: when True return the raw response text instead of parsed JSON
        proxy: proxy for this call; falls back to ``self.default_ip_proxy`` when falsy
        **kwargs: extra arguments forwarded to ``httpx.AsyncClient.request``.
            ``headers`` are merged over the client's default headers and
            ``timeout`` overrides ``self.timeout``.

    Returns:
        The response text when ``return_ori_content`` is True, otherwise the decoded JSON body.

    Raises:
        Exception: when the status code is not 200, or the body is empty or equals "blocked".
    """
    actual_proxy = proxy if proxy else self.default_ip_proxy

    # Take these out of kwargs first: forwarding them next to the explicit
    # keywords below would raise "got multiple values for keyword argument".
    extra_headers = kwargs.pop("headers", None) or {}
    headers = {**self.headers, **extra_headers}
    timeout = kwargs.pop("timeout", self.timeout)

    async with httpx.AsyncClient(proxy=actual_proxy) as client:
        response = await client.request(method, url, timeout=timeout, headers=headers, **kwargs)

    if response.status_code != 200:
        utils.logger.error(f"Request failed, method: {method}, url: {url}, status code: {response.status_code}")
        utils.logger.error(f"Request failed, response: {response.text}")
        raise Exception(f"Request failed, method: {method}, url: {url}, status code: {response.status_code}")

    # An empty body or the literal text "blocked" is treated as a blocked account.
    if response.text == "" or response.text == "blocked":
        utils.logger.error(f"request params incrr, response.text: {response.text}")
        raise Exception("account blocked")

    if return_ori_content:
        return response.text

    return response.json()
|
||||
|
||||
async def get(self, uri: str, params=None, return_ori_content=False, **kwargs) -> Any:
    """
    Send a GET request to ``self._host + uri``.

    When ``request`` exhausts its retries and a proxy pool is configured, one
    more round is made through a freshly fetched proxy; that proxy becomes the
    new default only if the round succeeds.

    Args:
        uri: request path, appended to the host
        params: optional dict, url-encoded and appended as the query string
        return_ori_content: when True return the raw response text instead of parsed JSON
        **kwargs: forwarded to ``request``

    Returns:
        Whatever ``request`` returns.

    Raises:
        Exception: when retries are exhausted and no proxy pool is available.
    """
    final_uri = uri
    if isinstance(params, dict):
        final_uri = f"{uri}?{urlencode(params)}"
    url = f"{self._host}{final_uri}"

    try:
        return await self.request(method="GET", url=url, return_ori_content=return_ori_content, **kwargs)
    except RetryError as e:
        if self.ip_pool:
            proxie_model = await self.ip_pool.get_proxy()
            _, proxy = utils.format_proxy_info(proxie_model)
            # Override any caller-supplied proxy instead of passing the keyword twice.
            retry_kwargs = {**kwargs, "proxy": proxy}
            res = await self.request(method="GET", url=url, return_ori_content=return_ori_content, **retry_kwargs)
            self.default_ip_proxy = proxy
            return res

        utils.logger.error(f"[BaiduTieBaClient.get] 达到了最大重试次数,IP已经被Block,请尝试更换新的IP代理: {e}")
        # Chain explicitly so the original RetryError stays attached as the cause.
        raise Exception(f"[BaiduTieBaClient.get] 达到了最大重试次数,IP已经被Block,请尝试更换新的IP代理: {e}") from e
|
||||
|
||||
async def post(self, uri: str, data: dict, **kwargs) -> Dict:
    """
    Send a POST request whose body is ``data`` serialized as compact JSON.

    Args:
        uri: request path, appended to the host
        data: request body, serialized without extra whitespace and without ASCII escaping
        **kwargs: forwarded to ``request``

    Returns:
        Whatever ``request`` returns (decoded JSON unless the caller asks for raw content).
    """
    json_str = json.dumps(data, separators=(',', ':'), ensure_ascii=False)
    # httpx takes raw text/bytes bodies through ``content=``; passing a string
    # as ``data=`` is deprecated and emits a DeprecationWarning.
    return await self.request(method="POST", url=f"{self._host}{uri}", content=json_str, **kwargs)
|
||||
|
||||
async def pong(self) -> bool:
    """
    Check whether the login state is still valid.

    Returns:
        True when the sync endpoint answers with ``no == 0``; False for any
        other answer or when the request raises.
    """
    utils.logger.info("[BaiduTieBaClient.pong] Begin to pong tieba...")
    try:
        res: Dict = await self.get("/mo/q/sync")
        utils.logger.info(f"[BaiduTieBaClient.pong] res: {res}")
        if res and res.get("no") == 0:
            return True
        utils.logger.info("[BaiduTieBaClient.pong] user not login, will try to login again...")
        return False
    except Exception as e:
        # Any failure is reported as "not logged in" rather than propagated.
        utils.logger.error(f"[BaiduTieBaClient.pong] Ping tieba failed: {e}, and try to login again...")
        return False
|
||||
|
||||
async def update_cookies(self, browser_context: BrowserContext):
    """
    Hook for refreshing the client's cookies, normally called after a successful login.

    This implementation is a placeholder: it does nothing with ``browser_context``.

    Args:
        browser_context: browser context object

    Returns:
        None
    """
    return None
|
||||
|
||||
async def get_notes_by_keyword(
    self,
    keyword: str,
    page: int = 1,
    page_size: int = 10,
    sort: SearchSortType = SearchSortType.TIME_DESC,
    note_type: SearchNoteType = SearchNoteType.FIXED_THREAD,
) -> List[TiebaNote]:
    """
    Search Tieba posts by keyword.

    Args:
        keyword: search keyword
        page: page number
        page_size: number of results per page
        sort: result ordering
        note_type: post type (threads only, or threads and replies mixed)

    Returns:
        Notes parsed from the search result page.
    """
    query = dict(
        isnew=1,
        qw=keyword,
        rn=page_size,
        pn=page,
        sm=sort.value,
        only_thread=note_type.value,
    )
    # The search endpoint returns HTML, so request the raw text and parse it.
    html_text = await self.get("/f/search/res", params=query, return_ori_content=True)
    return self._page_extractor.extract_search_note_list(html_text)
|
||||
|
||||
async def get_note_by_id(self, note_id: str) -> TiebaNote:
    """
    Fetch a post's detail page by post id and parse it.

    Args:
        note_id: post id, used as the ``/p/<id>`` path segment

    Returns:
        The note produced by the page extractor.
    """
    html_text = await self.get(f"/p/{note_id}", return_ori_content=True)
    note = self._page_extractor.extract_note_detail(html_text)
    return note
|
||||
|
||||
async def get_note_all_comments(
    self,
    note_detail: TiebaNote,
    crawl_interval: float = 1.0,
    callback: Optional[Callable] = None,
    max_count: int = 10,
) -> List[TiebaComment]:
    """
    Collect the first-level comments of a post, page by page.

    Paging stops when the post's last reply page has been read, a page yields no
    comments, or ``max_count`` comments have been gathered. Sub-comments of each
    page are fetched as well; they reach ``callback`` but are not part of the
    returned list.

    Args:
        note_detail: post detail object
        crawl_interval: delay in seconds after each page
        callback: awaited with ``(note_id, comments)`` for every batch
        max_count: maximum number of first-level comments to collect for the post

    Returns:
        The collected first-level comments.
    """
    uri = f"/p/{note_detail.note_id}"
    collected: List[TiebaComment] = []
    page_no = 1
    while page_no <= note_detail.total_replay_page and len(collected) < max_count:
        html_text = await self.get(uri, params={"pn": page_no}, return_ori_content=True)
        comments = self._page_extractor.extract_tieba_note_parment_comments(html_text, note_id=note_detail.note_id)
        if not comments:
            break

        # Trim the page so the total never exceeds max_count.
        remaining = max_count - len(collected)
        if len(comments) > remaining:
            comments = comments[:remaining]

        if callback:
            await callback(note_detail.note_id, comments)
        collected.extend(comments)

        # Fetch all sub-comments of this page's comments.
        await self.get_comments_all_sub_comments(comments, crawl_interval=crawl_interval, callback=callback)
        await asyncio.sleep(crawl_interval)
        page_no += 1
    return collected
|
||||
|
||||
async def get_comments_all_sub_comments(
    self,
    comments: List[TiebaComment],
    crawl_interval: float = 1.0,
    callback: Optional[Callable] = None,
) -> List[TiebaComment]:
    """
    Collect the sub-comments of every comment in ``comments``.

    Args:
        comments: parent comment list
        crawl_interval: delay in seconds after each fetched page
        callback: awaited with ``(note_id, sub_comments)`` for every page

    Returns:
        All sub-comments found, or an empty list when
        ``config.ENABLE_GET_SUB_COMMENTS`` is off.
    """
    if not config.ENABLE_GET_SUB_COMMENTS:
        return []

    # NOTE: the original author remarked that fetching sub-comments needs a
    # logged-in session; that check was left disabled in the source.

    uri = "/p/comment"
    collected: List[TiebaComment] = []
    for parent in comments:
        if parent.sub_comment_count == 0:
            continue

        # Pages are assumed to hold 10 sub-comments each; the "+ 1" covers a partial last page.
        last_page = parent.sub_comment_count // 10 + 1
        for page_no in range(1, last_page + 1):
            query = {
                "tid": parent.note_id,  # post id
                "pid": parent.comment_id,  # parent comment id
                "fid": parent.tieba_id,  # tieba (forum) id
                "pn": page_no,  # page number
            }
            html_text = await self.get(uri, params=query, return_ori_content=True)
            sub_comments = self._page_extractor.extract_tieba_note_sub_comments(html_text, parent_comment=parent)

            if not sub_comments:
                break
            if callback:
                await callback(parent.note_id, sub_comments)
            collected.extend(sub_comments)
            await asyncio.sleep(crawl_interval)
    return collected
|
||||
|
||||
async def get_notes_by_tieba_name(self, tieba_name: str, page_num: int) -> List[TiebaNote]:
    """
    Fetch one page of a forum's thread list by forum name.

    Args:
        tieba_name: forum (tieba) name, sent as the ``kw`` query parameter
        page_num: value sent as the ``pn`` query parameter
            (presumably a post offset rather than a page index; verify against the caller)

    Returns:
        Notes parsed from the thread list page.
    """
    # Pass the values through ``params`` so ``get`` url-encodes them. Interpolating
    # the raw name into the query string breaks on names containing "&", "#", "+" or spaces.
    params = {"kw": tieba_name, "pn": page_num}
    page_content = await self.get("/f", params=params, return_ori_content=True)
    return self._page_extractor.extract_tieba_note_list(page_content)
|
||||
|
||||
async def get_creator_info_by_url(self, creator_url: str) -> str:
    """
    Download a creator's home page.

    The URL is requested as given (no host is prepended) and goes straight
    through ``request``.

    Args:
        creator_url: creator home page URL

    Returns:
        The raw page content.
    """
    return await self.request(method="GET", url=creator_url, return_ori_content=True)
|
||||
|
||||
async def get_notes_by_creator(self, user_name: str, page_number: int) -> Dict:
    """
    Fetch one page of a creator's threads from the JSON endpoint.

    Args:
        user_name: creator user name, sent as ``un``
        page_number: page number, sent as ``pn``

    Returns:
        The decoded JSON response.
    """
    query = {}
    query["un"] = user_name
    query["pn"] = page_number
    # NOTE(review): "id" may have been meant as "ie" (input encoding) - confirm against the API.
    query["id"] = "utf-8"
    # Current timestamp sent as the "_" parameter.
    query["_"] = utils.get_current_timestamp()
    return await self.get("/home/get/getthread", params=query)
|
||||
|
||||
async def get_all_notes_by_creator_user_name(
    self,
    user_name: str,
    crawl_interval: float = 1.0,
    callback: Optional[Callable] = None,
    max_note_count: int = 0,
    creator_page_html_content: Optional[str] = None,
) -> List[TiebaNote]:
    """
    Collect all posts of a creator by user name.

    Posts come from two places: thread ids embedded in the creator's home page
    (when its HTML is supplied) and the paged thread API.

    Args:
        user_name: creator user name
        crawl_interval: delay in seconds after each API page
        callback: awaitable called with each batch of fetched notes
        max_note_count: cap on posts taken from the paged API, 0 means no cap.
            It is checked once per page in steps of 20, and home-page posts are
            not counted, so the result can be longer than this value.
        creator_page_html_content: HTML of the creator's home page

    Returns:
        All fetched note details.
    """
    # Tieba is a special case: the first 10 posts are rendered directly on the
    # home page and cannot be fetched through the API, so handle them separately.
    result: List[TiebaNote] = []
    if creator_page_html_content:
        thread_id_list = self._page_extractor.extract_tieba_thread_id_list_from_creator_page(creator_page_html_content)
        utils.logger.info(f"[BaiduTieBaClient.get_all_notes_by_creator] got user_name:{user_name} thread_id_list len : {len(thread_id_list)}")
        notes = await asyncio.gather(*[self.get_note_by_id(thread_id) for thread_id in thread_id_list])
        if callback:
            await callback(notes)
        result.extend(notes)

    notes_has_more = 1
    page_number = 1
    page_per_count = 20
    total_get_count = 0
    while notes_has_more == 1 and (max_note_count == 0 or total_get_count < max_note_count):
        notes_res = await self.get_notes_by_creator(user_name, page_number)
        if not notes_res or notes_res.get("no") != 0:
            utils.logger.error(f"[BaiduTieBaClient.get_notes_by_creator] got user_name:{user_name} notes failed, notes_res: {notes_res}")
            break

        notes_data = notes_res.get("data")
        if not notes_data:
            # A successful status without a payload would otherwise crash on ``.get`` below.
            utils.logger.error(f"[BaiduTieBaClient.get_notes_by_creator] got user_name:{user_name} empty data, notes_res: {notes_res}")
            break

        notes_has_more = notes_data.get("has_more")
        notes = notes_data["thread_list"]
        utils.logger.info(f"[BaiduTieBaClient.get_all_notes_by_creator] got user_name:{user_name} notes len : {len(notes)}")

        # The API only lists threads; fetch every detail page concurrently.
        notes = await asyncio.gather(*[self.get_note_by_id(note['thread_id']) for note in notes])
        if callback:
            await callback(notes)
        await asyncio.sleep(crawl_interval)
        result.extend(notes)
        page_number += 1
        total_get_count += page_per_count
    return result
|
||||
@@ -0,0 +1,418 @@
|
||||
# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则:
|
||||
# 1. 不得用于任何商业用途。
|
||||
# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。
|
||||
# 3. 不得进行大规模爬取或对平台造成运营干扰。
|
||||
# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。
|
||||
# 5. 不得用于任何非法或不当的用途。
|
||||
#
|
||||
# 详细许可条款请参阅项目根目录下的LICENSE文件。
|
||||
# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。
|
||||
|
||||
|
||||
import asyncio
|
||||
import os
|
||||
import random
|
||||
from asyncio import Task
|
||||
from typing import Dict, List, Optional, Tuple
|
||||
|
||||
from playwright.async_api import (
|
||||
BrowserContext,
|
||||
BrowserType,
|
||||
Page,
|
||||
Playwright,
|
||||
async_playwright,
|
||||
)
|
||||
|
||||
import config
|
||||
from base.base_crawler import AbstractCrawler
|
||||
from model.m_baidu_tieba import TiebaCreator, TiebaNote
|
||||
from proxy.proxy_ip_pool import IpInfoModel, create_ip_pool
|
||||
from store import tieba as tieba_store
|
||||
from tools import utils
|
||||
from tools.cdp_browser import CDPBrowserManager
|
||||
from var import crawler_type_var, source_keyword_var
|
||||
|
||||
from .client import BaiduTieBaClient
|
||||
from .field import SearchNoteType, SearchSortType
|
||||
from .help import TieBaExtractor
|
||||
from .login import BaiduTieBaLogin
|
||||
|
||||
|
||||
class TieBaCrawler(AbstractCrawler):
|
||||
context_page: Page
|
||||
tieba_client: BaiduTieBaClient
|
||||
browser_context: BrowserContext
|
||||
cdp_manager: Optional[CDPBrowserManager]
|
||||
|
||||
def __init__(self) -> None:
    """Initialise crawler state; the API client is created later in ``start``."""
    # No CDP browser manager until ``launch_browser_with_cdp`` creates one.
    self.cdp_manager = None
    self.index_url = "https://tieba.baidu.com"
    self.user_agent = utils.get_user_agent()
    # HTML parser used to extract creator info from downloaded pages.
    self._page_extractor = TieBaExtractor()
|
||||
|
||||
async def start(self) -> None:
    """
    Start the crawler.

    Optionally builds a proxy pool, creates the Tieba API client, then runs the
    flow selected by ``config.CRAWLER_TYPE`` ("search", "detail" or "creator").
    Any other value runs nothing.

    Returns:
        None
    """
    proxy_pool = None
    default_proxy = None
    if config.ENABLE_IP_PROXY:
        utils.logger.info("[BaiduTieBaCrawler.start] Begin create ip proxy pool ...")
        proxy_pool = await create_ip_pool(config.IP_PROXY_POOL_COUNT, enable_validate_ip=True)
        first_proxy: IpInfoModel = await proxy_pool.get_proxy()
        # Only the second element (the httpx-style proxy) is needed here.
        _, default_proxy = utils.format_proxy_info(first_proxy)
        utils.logger.info(f"[BaiduTieBaCrawler.start] Init default ip proxy, value: {default_proxy}")

    # Create a client to interact with the baidutieba website.
    self.tieba_client = BaiduTieBaClient(ip_pool=proxy_pool, default_ip_proxy=default_proxy)
    crawler_type_var.set(config.CRAWLER_TYPE)

    crawler_type = config.CRAWLER_TYPE
    if crawler_type == "search":
        # Search for notes and retrieve their comment information,
        # then crawl the configured forums by name.
        await self.search()
        await self.get_specified_tieba_notes()
    elif crawler_type == "detail":
        # Get the information and comments of the specified post
        await self.get_specified_notes()
    elif crawler_type == "creator":
        # Get creator's information and their notes and comments
        await self.get_creators_and_notes()

    utils.logger.info("[BaiduTieBaCrawler.start] Tieba Crawler finished ...")
|
||||
|
||||
async def search(self) -> None:
    """
    Search for notes by each configured keyword and retrieve their comments.

    ``config.CRAWLER_MAX_NOTES_COUNT`` is raised to at least one page (10) as a
    side effect. Pages before ``config.START_PAGE`` are skipped. A keyword stops
    on the first empty page or the first error.

    Returns:
        None
    """
    utils.logger.info("[BaiduTieBaCrawler.search] Begin search baidu tieba keywords")
    per_page = 10  # tieba limit page fixed value
    if config.CRAWLER_MAX_NOTES_COUNT < per_page:
        config.CRAWLER_MAX_NOTES_COUNT = per_page
    first_page = config.START_PAGE

    for keyword in config.KEYWORDS.split(","):
        source_keyword_var.set(keyword)
        utils.logger.info(f"[BaiduTieBaCrawler.search] Current search keyword: {keyword}")
        page = 1
        while (page - first_page + 1) * per_page <= config.CRAWLER_MAX_NOTES_COUNT:
            if page < first_page:
                utils.logger.info(f"[BaiduTieBaCrawler.search] Skip page {page}")
                page += 1
                continue

            try:
                utils.logger.info(f"[BaiduTieBaCrawler.search] search tieba keyword: {keyword}, page: {page}")
                found: List[TiebaNote] = await self.tieba_client.get_notes_by_keyword(
                    keyword=keyword,
                    page=page,
                    page_size=per_page,
                    sort=SearchSortType.TIME_DESC,
                    note_type=SearchNoteType.FIXED_THREAD,
                )
                if not found:
                    utils.logger.info("[BaiduTieBaCrawler.search] Search note list is empty")
                    break
                utils.logger.info(f"[BaiduTieBaCrawler.search] Note list len: {len(found)}")
                await self.get_specified_notes(note_id_list=[item.note_id for item in found])
            except Exception as ex:
                utils.logger.error(
                    f"[BaiduTieBaCrawler.search] Search keywords error, current page: {page}, current keyword: {keyword}, err: {ex}"
                )
                break
            # Only reached when the page was processed without a break.
            page += 1
|
||||
|
||||
async def get_specified_tieba_notes(self):
    """
    Crawl posts (and their comments) of every forum in ``config.TIEBA_NAME_LIST``.

    ``config.CRAWLER_MAX_NOTES_COUNT`` is raised to at least one page (50) as a
    side effect. Each forum is read in steps of 50 until an empty page is
    returned or the configured maximum is reached.

    Returns:
        None
    """
    tieba_limit_count = 50
    if config.CRAWLER_MAX_NOTES_COUNT < tieba_limit_count:
        config.CRAWLER_MAX_NOTES_COUNT = tieba_limit_count
    for tieba_name in config.TIEBA_NAME_LIST:
        utils.logger.info(
            f"[BaiduTieBaCrawler.get_specified_tieba_notes] Begin get tieba name: {tieba_name}"
        )
        page_number = 0
        # Strict "<": page_number is the offset of the next batch, so an offset
        # equal to the maximum would fetch one batch beyond the configured limit.
        while page_number < config.CRAWLER_MAX_NOTES_COUNT:
            note_list: List[TiebaNote] = (
                await self.tieba_client.get_notes_by_tieba_name(
                    tieba_name=tieba_name, page_num=page_number
                )
            )
            if not note_list:
                utils.logger.info(
                    "[BaiduTieBaCrawler.get_specified_tieba_notes] Get note list is empty"
                )
                break

            utils.logger.info(
                f"[BaiduTieBaCrawler.get_specified_tieba_notes] tieba name: {tieba_name} note list len: {len(note_list)}"
            )
            await self.get_specified_notes([note.note_id for note in note_list])
            page_number += tieba_limit_count
|
||||
|
||||
async def get_specified_notes(
    self, note_id_list: Optional[List[str]] = None
):
    """
    Fetch, store and collect comments for the given posts.

    Args:
        note_id_list: post ids to crawl. When omitted, ``config.TIEBA_SPECIFIED_ID_LIST``
            is read at call time, so later changes to the config are honoured.

    Returns:
        None
    """
    # Resolve the default here, not in the signature: a default expression is
    # evaluated once at definition time and would pin the list object that
    # existed when the module was imported.
    if note_id_list is None:
        note_id_list = config.TIEBA_SPECIFIED_ID_LIST

    semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
    task_list = [
        self.get_note_detail_async_task(note_id=note_id, semaphore=semaphore)
        for note_id in note_id_list
    ]
    note_details = await asyncio.gather(*task_list)

    note_details_model: List[TiebaNote] = []
    for note_detail in note_details:
        # Failed fetches come back as None and are skipped.
        if note_detail is not None:
            note_details_model.append(note_detail)
            await tieba_store.update_tieba_note(note_detail)
    await self.batch_get_note_comments(note_details_model)
|
||||
|
||||
async def get_note_detail_async_task(
    self, note_id: str, semaphore: asyncio.Semaphore
) -> Optional[TiebaNote]:
    """
    Fetch one post's detail under the concurrency semaphore.

    Errors are logged and swallowed so one failing post does not abort the batch.

    Args:
        note_id: baidu tieba note id
        semaphore: asyncio semaphore limiting concurrent fetches

    Returns:
        The note detail, or None when it could not be fetched.
    """
    async with semaphore:
        try:
            utils.logger.info(
                f"[BaiduTieBaCrawler.get_note_detail] Begin get note detail, note_id: {note_id}"
            )
            note_detail: TiebaNote = await self.tieba_client.get_note_by_id(note_id)
            if not note_detail:
                utils.logger.error(
                    f"[BaiduTieBaCrawler.get_note_detail] Get note detail error, note_id: {note_id}"
                )
                return None
            return note_detail
        # KeyError must come before the generic handler: it is a subclass of
        # Exception, so in the opposite order this branch could never run.
        except KeyError as ex:
            utils.logger.error(
                f"[BaiduTieBaCrawler.get_note_detail] have not fund note detail note_id:{note_id}, err: {ex}"
            )
            return None
        except Exception as ex:
            utils.logger.error(
                f"[BaiduTieBaCrawler.get_note_detail] Get note detail error: {ex}"
            )
            return None
|
||||
|
||||
async def batch_get_note_comments(self, note_detail_list: List[TiebaNote]):
    """
    Fetch comments for a batch of posts concurrently.

    Does nothing when ``config.ENABLE_GET_COMMENTS`` is off.

    Args:
        note_detail_list: posts whose comments should be crawled

    Returns:
        None
    """
    if config.ENABLE_GET_COMMENTS:
        semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
        # One task per post, named after the post id; the shared semaphore caps concurrency.
        pending: List[Task] = [
            asyncio.create_task(
                self.get_comments_async_task(note_detail, semaphore),
                name=note_detail.note_id,
            )
            for note_detail in note_detail_list
        ]
        await asyncio.gather(*pending)
|
||||
|
||||
async def get_comments_async_task(
    self, note_detail: TiebaNote, semaphore: asyncio.Semaphore
):
    """
    Crawl the comments of one post under the concurrency semaphore.

    Args:
        note_detail: post whose comments are fetched
        semaphore: asyncio semaphore limiting concurrent crawls

    Returns:
        None
    """
    async with semaphore:
        utils.logger.info(
            f"[BaiduTieBaCrawler.get_comments] Begin get note id comments {note_detail.note_id}"
        )
        fetch_options = dict(
            note_detail=note_detail,
            # Random delay in [0, 1) seconds between pages.
            crawl_interval=random.random(),
            # Each batch of comments is handed to the store layer.
            callback=tieba_store.batch_update_tieba_note_comments,
            max_count=config.CRAWLER_MAX_COMMENTS_COUNT_SINGLENOTES,
        )
        await self.tieba_client.get_note_all_comments(**fetch_options)
|
||||
|
||||
async def get_creators_and_notes(self) -> None:
    """
    Get each configured creator's information, their notes and the notes' comments.

    Creators whose home page cannot be parsed are logged and skipped.

    Returns:
        None
    """
    utils.logger.info(
        "[BaiduTieBaCrawler.get_creators_and_notes] Begin get tieba creators"
    )
    for creator_url in config.TIEBA_CREATOR_URL_LIST:
        creator_page_html_content = await self.tieba_client.get_creator_info_by_url(
            creator_url=creator_url
        )
        creator_info: TiebaCreator = self._page_extractor.extract_creator_info(
            creator_page_html_content
        )
        if not creator_info:
            utils.logger.error(
                f"[BaiduTieBaCrawler.get_creators_and_notes] get creator info error, creator_url:{creator_url}"
            )
            continue

        utils.logger.info(
            f"[BaiduTieBaCrawler.get_creators_and_notes] creator info: {creator_info}"
        )
        await tieba_store.save_creator(user_info=creator_info)

        # Get all note information of the creator. The home page HTML is passed
        # along so the posts embedded in it are picked up too.
        all_notes_list = (
            await self.tieba_client.get_all_notes_by_creator_user_name(
                user_name=creator_info.user_name,
                crawl_interval=0,
                callback=tieba_store.batch_update_tieba_notes,
                max_note_count=config.CRAWLER_MAX_NOTES_COUNT,
                creator_page_html_content=creator_page_html_content,
            )
        )

        await self.batch_get_note_comments(all_notes_list)
|
||||
|
||||
async def launch_browser(
    self,
    chromium: BrowserType,
    playwright_proxy: Optional[Dict],
    user_agent: Optional[str],
    headless: bool = True,
) -> BrowserContext:
    """
    Launch a browser and create a browser context.

    With ``config.SAVE_LOGIN_STATE`` on, a persistent context backed by a user
    data directory under ``<cwd>/browser_data`` is used; otherwise a fresh
    browser and context are created.

    Args:
        chromium: Playwright browser type used to launch
        playwright_proxy: proxy settings passed to Playwright, or None
        user_agent: user agent string for the context
        headless: whether to run without a visible window

    Returns:
        The created browser context.
    """
    utils.logger.info(
        "[BaiduTieBaCrawler.launch_browser] Begin create browser context ..."
    )
    viewport = {"width": 1920, "height": 1080}

    if not config.SAVE_LOGIN_STATE:
        browser = await chromium.launch(headless=headless, proxy=playwright_proxy)  # type: ignore
        return await browser.new_context(viewport=viewport, user_agent=user_agent)

    # feat issue #14
    # we will save login state to avoid login every time
    user_data_dir = os.path.join(
        os.getcwd(), "browser_data", config.USER_DATA_DIR % config.PLATFORM
    )  # type: ignore
    return await chromium.launch_persistent_context(
        user_data_dir=user_data_dir,
        accept_downloads=True,
        headless=headless,
        proxy=playwright_proxy,  # type: ignore
        viewport=viewport,
        user_agent=user_agent,
    )
|
||||
|
||||
async def launch_browser_with_cdp(
    self,
    playwright: Playwright,
    playwright_proxy: Optional[Dict],
    user_agent: Optional[str],
    headless: bool = True,
) -> BrowserContext:
    """
    Launch the browser in CDP mode, falling back to the standard launch on any error.

    Args:
        playwright: Playwright instance
        playwright_proxy: proxy settings passed to Playwright, or None
        user_agent: user agent string for the context
        headless: whether to run without a visible window

    Returns:
        The browser context from CDP mode, or from ``launch_browser`` after a failure.
    """
    try:
        manager = CDPBrowserManager()
        self.cdp_manager = manager
        cdp_context = await manager.launch_and_connect(
            playwright=playwright,
            playwright_proxy=playwright_proxy,
            user_agent=user_agent,
            headless=headless,
        )

        # Log the browser information.
        browser_info = await manager.get_browser_info()
        utils.logger.info(f"[TieBaCrawler] CDP浏览器信息: {browser_info}")
    except Exception as e:
        utils.logger.error(f"[TieBaCrawler] CDP模式启动失败,回退到标准模式: {e}")
        # Fall back to the standard mode.
        return await self.launch_browser(
            playwright.chromium, playwright_proxy, user_agent, headless
        )
    else:
        return cdp_context
|
||||
|
||||
async def close(self):
    """
    Close the browser context, or clean up the CDP manager when CDP mode was used.

    Returns:
        None
    """
    # CDP mode needs its own cleanup path.
    if self.cdp_manager:
        await self.cdp_manager.cleanup()
        self.cdp_manager = None
    else:
        # ``browser_context`` is only declared as a class-level annotation and no
        # method of this class assigns it, so it may not exist on the instance.
        browser_context = getattr(self, "browser_context", None)
        if browser_context is not None:
            await browser_context.close()
    utils.logger.info("[BaiduTieBaCrawler.close] Browser context closed ...")
|
||||
@@ -0,0 +1,29 @@
|
||||
# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则:
|
||||
# 1. 不得用于任何商业用途。
|
||||
# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。
|
||||
# 3. 不得进行大规模爬取或对平台造成运营干扰。
|
||||
# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。
|
||||
# 5. 不得用于任何非法或不当的用途。
|
||||
#
|
||||
# 详细许可条款请参阅项目根目录下的LICENSE文件。
|
||||
# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。
|
||||
|
||||
|
||||
from enum import Enum
|
||||
|
||||
|
||||
class SearchSortType(Enum):
    """Sort order for search results; each value is the string sent to the search endpoint."""

    TIME_DESC = "1"  # newest first
    TIME_ASC = "0"  # oldest first
    RELEVANCE_ORDER = "2"  # ordered by relevance
|
||||
|
||||
|
||||
class SearchNoteType(Enum):
    """Which kind of posts a search returns; each value is the string sent to the search endpoint."""

    MAIN_THREAD = "1"  # main threads only
    FIXED_THREAD = "0"  # mixed mode (threads + replies)
|
||||
@@ -0,0 +1,418 @@
|
||||
# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则:
|
||||
# 1. 不得用于任何商业用途。
|
||||
# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。
|
||||
# 3. 不得进行大规模爬取或对平台造成运营干扰。
|
||||
# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。
|
||||
# 5. 不得用于任何非法或不当的用途。
|
||||
#
|
||||
# 详细许可条款请参阅项目根目录下的LICENSE文件。
|
||||
# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。
|
||||
|
||||
|
||||
# -*- coding: utf-8 -*-
|
||||
import html
|
||||
import json
|
||||
import re
|
||||
from typing import Dict, List, Tuple
|
||||
from urllib.parse import parse_qs, unquote
|
||||
|
||||
from parsel import Selector
|
||||
|
||||
from constant import baidu_tieba as const
|
||||
from model.m_baidu_tieba import TiebaComment, TiebaCreator, TiebaNote
|
||||
from tools import utils
|
||||
|
||||
GENDER_MALE = "sex_male"
|
||||
GENDER_FEMALE = "sex_female"
|
||||
|
||||
|
||||
class TieBaExtractor:
|
||||
def __init__(self):
    """Create an extractor; it holds no state."""
    return None
|
||||
|
||||
@staticmethod
def extract_search_note_list(page_content: str) -> List[TiebaNote]:
    """
    Extract the post list from a keyword search result page.

    Reply counts and reply page counts are not available on this page, so the
    returned notes lack them.

    Args:
        page_content: HTML string of the page

    Returns:
        One note per search result entry.
    """

    def first(node, query: str) -> str:
        # First match of the XPath query, or an empty string when nothing matches.
        return node.xpath(query).get(default='')

    notes: List[TiebaNote] = []
    for post in Selector(text=page_content).xpath("//div[@class='s_post']"):
        # Text fields are stripped; href values are appended to the site URL as they are.
        note_id = first(post, ".//span[@class='p_title']/a/@data-tid").strip()
        title = first(post, ".//span[@class='p_title']/a/text()").strip()
        desc = first(post, ".//div[@class='p_content']/text()").strip()
        note_url = const.TIEBA_URL + first(post, ".//span[@class='p_title']/a/@href")
        user_nickname = first(post, ".//a[starts-with(@href, '/home/main')]/font/text()").strip()
        user_link = const.TIEBA_URL + first(post, ".//a[starts-with(@href, '/home/main')]/@href")
        tieba_name = first(post, ".//a[@class='p_forum']/font/text()").strip()
        tieba_link = const.TIEBA_URL + first(post, ".//a[@class='p_forum']/@href")
        publish_time = first(post, ".//font[@class='p_green p_date']/text()").strip()

        notes.append(
            TiebaNote(
                note_id=note_id,
                title=title,
                desc=desc,
                note_url=note_url,
                user_nickname=user_nickname,
                user_link=user_link,
                tieba_name=tieba_name,
                tieba_link=tieba_link,
                publish_time=publish_time,
            )
        )
    return notes
|
||||
|
||||
def extract_tieba_note_list(self, page_content: str) -> List[TiebaNote]:
    """
    Extract the post list from a forum's thread list page.

    Args:
        page_content: HTML string of the page

    Returns:
        One note per thread entry that carries parsable ``data-field`` metadata.
    """
    # Strip the comment openers so markup inside HTML comments becomes visible
    # to the parser (presumably the thread list is delivered that way).
    page_content = page_content.replace('<!--', "")
    content_selector = Selector(text=page_content)

    # Forum name and link are page-level values: look them up once, not per post.
    tieba_name = content_selector.xpath("//a[@class='card_title_fname']/text()").get(default='').strip()
    tieba_link = const.TIEBA_URL + content_selector.xpath("//a[@class='card_title_fname']/@href").get(default='')

    result: List[TiebaNote] = []
    for post_selector in content_selector.xpath("//ul[@id='thread_list']/li"):
        post_field_value: Dict = self.extract_data_field_value(post_selector)
        if not post_field_value:
            continue

        note_id = str(post_field_value.get("id"))
        # NOTE(review): the original looked up only the misspelled key
        # "authoer_nickname"; the payload key is presumably "author_nickname".
        # The misspelled key stays as a fallback so no lookup that worked before is lost.
        user_nickname = (
            post_field_value.get("author_nickname")
            or post_field_value.get("authoer_nickname")
            or post_field_value.get("author_name")
        )
        tieba_note = TiebaNote(
            note_id=note_id,
            title=post_selector.xpath(".//a[@class='j_th_tit ']/text()").get(default='').strip(),
            desc=post_selector.xpath(
                ".//div[@class='threadlist_abs threadlist_abs_onlyline ']/text()").get(default='').strip(),
            note_url=const.TIEBA_URL + f"/p/{note_id}",
            user_link=const.TIEBA_URL + post_selector.xpath(
                ".//a[@class='frs-author-name j_user_card ']/@href").get(default='').strip(),
            user_nickname=user_nickname,
            tieba_name=tieba_name,
            tieba_link=tieba_link,
            total_replay_num=post_field_value.get("reply_num", 0),
        )
        result.append(tieba_note)
    return result
|
||||
|
||||
def extract_note_detail(self, page_content: str) -> TiebaNote:
    """
    Extract the detail of a single thread from its page HTML.

    Args:
        page_content: HTML of the thread detail page.

    Returns:
        A TiebaNote populated from the page; the forum suffix
        ``【<tieba_name>】_百度贴吧`` is removed from its title.

    Raises:
        IndexError: when fewer than two reply counters are found under
            ``div#thread_theme_5`` (they are indexed without a guard).
    """
    page = Selector(text=page_content)

    def stripped(scope, query: str) -> str:
        # First XPath hit as stripped text, '' when nothing matches.
        return scope.xpath(query).get(default='').strip()

    first_floor = page.xpath("//div[@class='p_postlist'][1]")

    # The thread id is the last path segment of the "only view author" link.
    author_only_href = stripped(page, "//*[@id='lzonly_cntn']/@href")
    note_id = author_only_href.split("?")[0].split("/")[-1]

    # Reply count and number of reply pages, in that order.
    reply_counters = page.xpath(
        "//div[@id='thread_theme_5']//li[@class='l_reply_num']//span[@class='red']")

    # IP location and publish time are parsed from the first post tail.
    tail_html = stripped(page, ".//div[@class='post-tail-wrap']")
    ip_location, publish_time = self.extract_ip_and_pub_time(tail_html)

    forum_link = const.TIEBA_URL + page.xpath("//a[@class='card_title_fname']/@href").get(default='')

    note = TiebaNote(
        note_id=note_id,
        title=stripped(page, "//title/text()"),
        desc=stripped(page, "//meta[@name='description']/@content"),
        note_url=const.TIEBA_URL + f"/p/{note_id}",
        user_link=const.TIEBA_URL + stripped(first_floor, ".//a[@class='p_author_face ']/@href"),
        user_nickname=stripped(first_floor, ".//a[@class='p_author_name j_user_card']/text()"),
        user_avatar=stripped(first_floor, ".//a[@class='p_author_face ']/img/@src"),
        tieba_name=stripped(page, "//a[@class='card_title_fname']/text()"),
        tieba_link=forum_link,
        ip_location=ip_location,
        publish_time=publish_time,
        total_replay_num=stripped(reply_counters[0], "./text()"),
        total_replay_page=stripped(reply_counters[1], "./text()"),
    )
    forum_suffix = f"【{note.tieba_name}】_百度贴吧"
    note.title = note.title.replace(forum_suffix, "")
    return note
|
||||
|
||||
def extract_tieba_note_parment_comments(self, page_content: str, note_id: str) -> List[TiebaComment]:
    """
    Extract the first-level comments (floors) of a thread page.

    Args:
        page_content: HTML of the thread page.
        note_id: Id of the thread the comments belong to.

    Returns:
        One TiebaComment per post container whose ``data-field`` attribute
        parses to a non-empty value and carries a ``content`` object; every
        other container is skipped.
    """
    page_selector = Selector(text=page_content)

    # Page-level values: the original re-ran this document-wide XPath for
    # every comment (an absolute "//" path ignores the comment node it is
    # called on), so resolve them once up front.
    tieba_name = page_selector.xpath("//a[@class='card_title_fname']/text()").get(default='').strip()
    tieba_link = f"https://tieba.baidu.com/f?kw={tieba_name}"
    note_url = const.TIEBA_URL + f"/p/{note_id}"

    result: List[TiebaComment] = []
    for comment_selector in page_selector.xpath("//div[@class='l_post l_post_bright j_l_post clearfix ']"):
        comment_field_value: Dict = self.extract_data_field_value(comment_selector)
        if not comment_field_value:
            continue
        # Without the "content" object there is no id or body to build a
        # comment from; skip instead of failing on None.get(...).
        content_info = comment_field_value.get("content")
        if not isinstance(content_info, dict):
            continue

        other_info_content = comment_selector.xpath(".//div[@class='post-tail-wrap']").get(default="").strip()
        ip_location, publish_time = self.extract_ip_and_pub_time(other_info_content)
        tieba_comment = TiebaComment(
            comment_id=str(content_info.get("post_id")),
            sub_comment_count=content_info.get("comment_num"),
            content=utils.extract_text_from_html(content_info.get("content")),
            note_url=note_url,
            user_link=const.TIEBA_URL + comment_selector.xpath(
                ".//a[@class='p_author_face ']/@href").get(default='').strip(),
            user_nickname=comment_selector.xpath(
                ".//a[@class='p_author_name j_user_card']/text()").get(default='').strip(),
            user_avatar=comment_selector.xpath(
                ".//a[@class='p_author_face ']/img/@src").get(default='').strip(),
            tieba_id=str(content_info.get("forum_id", "")),
            tieba_name=tieba_name,
            tieba_link=tieba_link,
            ip_location=ip_location,
            publish_time=publish_time,
            note_id=note_id,
        )
        result.append(tieba_comment)
    return result
|
||||
|
||||
def extract_tieba_note_sub_comments(self, page_content: str, parent_comment: TiebaComment) -> List[TiebaComment]:
    """
    Extract the second-level comments (replies to a floor) from HTML.

    Args:
        page_content: HTML fragment containing the reply ``<li>`` items.
        parent_comment: First-level comment the replies belong to; its
            note and forum fields are copied onto every reply.

    Returns:
        One TiebaComment per reply item whose ``data-field`` attribute
        parses to a non-empty value. Items with the ``first_no_border``
        class come first, followed by the remaining reply items.
    """
    selector = Selector(page_content)
    comment_ele_list = selector.xpath("//li[@class='lzl_single_post j_lzl_s_p first_no_border']")
    comment_ele_list.extend(selector.xpath("//li[@class='lzl_single_post j_lzl_s_p ']"))

    comments = []
    for comment_ele in comment_ele_list:
        comment_value = self.extract_data_field_value(comment_ele)
        if not comment_value:
            continue
        # Keep at most the first user-card anchor. The slice yields an empty
        # selection when the anchor is missing, so the link and avatar fall
        # back to "" like every other field here, instead of raising
        # IndexError as the former "[0]" did.
        user_anchor = comment_ele.xpath("./a[@class='j_user_card lzl_p_p']")[:1]
        content = utils.extract_text_from_html(
            comment_ele.xpath(".//span[@class='lzl_content_main']").get(default=""))
        comment = TiebaComment(
            comment_id=str(comment_value.get("spid")),
            content=content,
            user_link=user_anchor.xpath("./@href").get(default=""),
            user_nickname=comment_value.get("showname"),
            user_avatar=user_anchor.xpath("./img/@src").get(default=""),
            publish_time=comment_ele.xpath(".//span[@class='lzl_time']/text()").get(default="").strip(),
            parent_comment_id=parent_comment.comment_id,
            note_id=parent_comment.note_id,
            note_url=parent_comment.note_url,
            tieba_id=parent_comment.tieba_id,
            tieba_name=parent_comment.tieba_name,
            tieba_link=parent_comment.tieba_link,
        )
        comments.append(comment)

    return comments
|
||||
|
||||
def extract_creator_info(self, html_content: str) -> TiebaCreator:
    """
    Extract a creator's profile information from the profile page HTML.

    Args:
        html_content: HTML of the creator's home page.

    Returns:
        A TiebaCreator built from the page. ``user_name`` / ``user_id`` come
        from the ``un`` / ``id`` query parameters of the ``p.space`` link and
        are "" when absent; follows/fans stay 0 unless exactly two
        ``concern_num`` spans are found.
    """
    page = Selector(text=html_content)

    # Identity comes from the query string of the profile link.
    profile_href: str = page.xpath("//p[@class='space']/a").xpath("./@href").get(default='')
    query_params: Dict = parse_qs(unquote(profile_href.split("?")[-1]))
    un_values = query_params.get("un")
    id_values = query_params.get("id")
    user_name = un_values[0] if un_values else ""
    user_id = id_values[0] if id_values else ""

    # Follow/fan counters are only trusted when both spans are present.
    concern_nodes = page.xpath("//span[@class='concern_num']")
    if len(concern_nodes) == 2:
        follows, fans = self.extract_follow_and_fans(concern_nodes)
    else:
        follows, fans = 0, 0

    # Gender, IP location and registration duration are all parsed from the
    # raw markup of the user-data block.
    user_content = page.xpath("//div[@class='userinfo_userdata']").get(default='')
    nickname = page.xpath(".//span[@class='userinfo_username ']/text()").get(default='').strip()
    avatar = page.xpath(".//div[@class='userinfo_left_head']//img/@src").get(default='').strip()

    return TiebaCreator(
        user_id=user_id,
        user_name=user_name,
        nickname=nickname,
        avatar=avatar,
        gender=self.extract_gender(user_content),
        ip_location=self.extract_ip(user_content),
        follows=follows,
        fans=fans,
        registration_duration=self.extract_registration_duration(user_content),
    )
|
||||
|
||||
@staticmethod
def extract_tieba_thread_id_list_from_creator_page(
    html_content: str
) -> List[str]:
    """
    Extract the thread ids listed on a creator's home page.

    Args:
        html_content: HTML of the creator's home page.

    Returns:
        For every thread link found, the last path segment of its URL with
        the query string removed, in document order.
    """
    thread_links = Selector(text=html_content).xpath(
        "//ul[@class='new_list clearfix']//div[@class='thread_name']/a[1]/@href"
    ).getall()
    # ".../p/<id>?..." -> "<id>"
    return [link.split("?")[0].split("/")[-1] for link in thread_links]
|
||||
|
||||
def extract_ip_and_pub_time(self, html_content: str) -> Tuple[str, str]:
    """
    Extract the IP location and the publish time from post-tail HTML.

    Args:
        html_content: HTML that may contain the post tail spans.

    Returns:
        ``(ip_location, publish_time)``. The publish time is the first
        ``YYYY-MM-DD HH:MM`` value wrapped in a ``tail-info`` span, or ""
        when none is present; the IP location is whatever ``extract_ip``
        returns for the same HTML.
    """
    found = re.search(
        r'<span class="tail-info">(\d{4}-\d{2}-\d{2} \d{2}:\d{2})</span>',
        html_content,
    )
    publish_time = "" if found is None else found.group(1)
    return self.extract_ip(html_content), publish_time
|
||||
|
||||
@staticmethod
|
||||
def extract_ip(html_content: str) -> str:
|
||||
"""
|
||||
提取IP
|
||||
Args:
|
||||
html_content:
|
||||
|
||||
Returns:
|
||||
|
||||
"""
|
||||
pattern_ip = re.compile(r'IP属地:(\S+)</span>')
|
||||
ip_match = pattern_ip.search(html_content)
|
||||
ip = ip_match.group(1) if ip_match else ""
|
||||
return ip
|
||||
|
||||
@staticmethod
def extract_gender(html_content: str) -> str:
    """
    Determine the gender shown in profile HTML.

    Args:
        html_content: HTML of the user-data block.

    Returns:
        '男' when the male marker occurs in the HTML, otherwise '女' when the
        female marker occurs, otherwise '未知'.
    """
    # The male marker is tested first, so it wins if both are present.
    if GENDER_MALE in html_content:
        gender = '男'
    elif GENDER_FEMALE in html_content:
        gender = '女'
    else:
        gender = '未知'
    return gender
|
||||
|
||||
@staticmethod
def extract_follow_and_fans(selectors: List[Selector]) -> Tuple[str, str]:
    """
    Extract the follow count and the fan count.

    Args:
        selectors: Two ``concern_num`` span selectors; index 0 is read as
            the follow counter and index 1 as the fan counter.

    Returns:
        ``(follows, fans)``. Each value is the digit string found in the
        span's markup, or the integer 0 when the markup does not match.
    """
    count_re = re.compile(r'<span class="concern_num">\(<a[^>]*>(\d+)</a>\)</span>')

    def first_count(node):
        # First captured number in the node's markup, 0 when there is none.
        hit = count_re.search(node.get())
        return hit.group(1) if hit else 0

    follows = first_count(selectors[0])
    fans = first_count(selectors[1])
    return follows, fans
|
||||
|
||||
@staticmethod
|
||||
def extract_registration_duration(html_content: str) -> str:
|
||||
"""
|
||||
"<span>吧龄:1.9年</span>"
|
||||
Returns: 1.9年
|
||||
|
||||
"""
|
||||
pattern = re.compile(r'<span>吧龄:(\S+)</span>')
|
||||
match = pattern.search(html_content)
|
||||
return match.group(1) if match else ""
|
||||
|
||||
@staticmethod
def extract_data_field_value(selector: Selector) -> Dict:
    """
    Read and decode the ``data-field`` attribute of an element.

    Args:
        selector: Selector positioned on the element carrying ``data-field``.

    Returns:
        The decoded JSON object, or ``{}`` when the attribute is missing,
        empty, equal to "{}", not valid JSON, or valid JSON that is not an
        object.
    """
    raw_value = selector.xpath("./@data-field").get(default='').strip()
    if not raw_value or raw_value == "{}":
        return {}
    try:
        # Undo HTML entity escaping first, then decode the JSON text.
        parsed = json.loads(html.unescape(raw_value))
    except Exception as ex:
        # Best effort by design: a broken attribute must not abort the page.
        print(f"extract_data_field_value,错误信息:{ex}, 尝试使用其他方式解析")
        return {}
    # Callers use .get() on the result, so anything that is not a JSON
    # object (a list, number, string, null) is reported as "no data".
    return parsed if isinstance(parsed, dict) else {}
|
||||
|
||||
|
||||
def test_extract_search_note_list():
    """Run the search-result extractor on the saved fixture and print the notes."""
    with open("test_data/search_keyword_notes.html", "r", encoding="utf-8") as fixture:
        page_html = fixture.read()
    notes = TieBaExtractor().extract_search_note_list(page_html)
    print(notes)
|
||||
|
||||
|
||||
def test_extract_note_detail():
    """Run the thread-detail extractor on the saved fixture and print the dumped note."""
    with open("test_data/note_detail.html", "r", encoding="utf-8") as fixture:
        page_html = fixture.read()
    note = TieBaExtractor().extract_note_detail(page_html)
    print(note.model_dump())
|
||||
|
||||
|
||||
def test_extract_tieba_note_parment_comments():
    """Run the first-level comment extractor on the saved fixture and print the comments."""
    with open("test_data/note_comments.html", "r", encoding="utf-8") as fixture:
        page_html = fixture.read()
    parent_comments = TieBaExtractor().extract_tieba_note_parment_comments(page_html, "123456")
    print(parent_comments)
|
||||
|
||||
|
||||
def test_extract_tieba_note_sub_comments():
    """Run the second-level comment extractor on the saved fixture with a placeholder parent."""
    with open("test_data/note_sub_comments.html", "r", encoding="utf-8") as fixture:
        page_html = fixture.read()
    # Placeholder parent: only supplies the fields copied onto each reply.
    fake_parment_comment = TiebaComment(
        comment_id="123456",
        content="content",
        user_link="user_link",
        user_nickname="user_nickname",
        user_avatar="user_avatar",
        publish_time="publish_time",
        parent_comment_id="parent_comment_id",
        note_id="note_id",
        note_url="note_url",
        tieba_id="tieba_id",
        tieba_name="tieba_name",
    )
    sub_comments = TieBaExtractor().extract_tieba_note_sub_comments(page_html, fake_parment_comment)
    print(sub_comments)
|
||||
|
||||
|
||||
def test_extract_tieba_note_list():
    """Run the forum thread-list extractor on the saved fixture and print the notes."""
    with open("test_data/tieba_note_list.html", "r", encoding="utf-8") as fixture:
        page_html = fixture.read()
    notes = TieBaExtractor().extract_tieba_note_list(page_html)
    print(notes)
|
||||
|
||||
|
||||
def test_extract_creator_info():
    """Run the creator-profile extractor on the saved fixture and print it as JSON."""
    with open("test_data/creator_info.html", "r", encoding="utf-8") as fixture:
        page_html = fixture.read()
    creator = TieBaExtractor().extract_creator_info(page_html)
    print(creator.model_dump_json())
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # Manual entry point: runs one fixture-based check at a time. Swap the
    # active call with one of the commented ones to exercise another extractor.
    # test_extract_search_note_list()
    # test_extract_note_detail()
    # test_extract_tieba_note_parment_comments()
    # test_extract_tieba_note_list()
    test_extract_creator_info()
|
||||
@@ -0,0 +1,123 @@
|
||||
# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则:
|
||||
# 1. 不得用于任何商业用途。
|
||||
# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。
|
||||
# 3. 不得进行大规模爬取或对平台造成运营干扰。
|
||||
# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。
|
||||
# 5. 不得用于任何非法或不当的用途。
|
||||
#
|
||||
# 详细许可条款请参阅项目根目录下的LICENSE文件。
|
||||
# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。
|
||||
|
||||
|
||||
import asyncio
|
||||
import functools
|
||||
import sys
|
||||
from typing import Optional
|
||||
|
||||
from playwright.async_api import BrowserContext, Page
|
||||
from tenacity import (RetryError, retry, retry_if_result, stop_after_attempt,
|
||||
wait_fixed)
|
||||
|
||||
import config
|
||||
from base.base_crawler import AbstractLogin
|
||||
from tools import utils
|
||||
|
||||
|
||||
class BaiduTieBaLogin(AbstractLogin):
    """
    Drive the Baidu Tieba login flow inside a Playwright browser context.

    ``begin`` dispatches on ``config.LOGIN_TYPE``: "qrcode" shows the login
    QR code and polls the cookies until login succeeds, "phone" is a
    placeholder that does nothing, and "cookie" injects a cookie string into
    the browser context.
    """

    # Polling budget for check_login_state. The retry decorator and the
    # "remaining time" log line both derive from these two values, so the
    # announced timeout always matches the real one (the message used to
    # claim 120s while 600 one-second attempts were configured).
    _LOGIN_POLL_ATTEMPTS = 600
    _LOGIN_POLL_INTERVAL_SECONDS = 1

    def __init__(self,
                 login_type: str,
                 browser_context: BrowserContext,
                 context_page: Page,
                 login_phone: Optional[str] = "",
                 cookie_str: str = ""
                 ):
        """
        Args:
            login_type: "qrcode", "phone" or "cookie"; written to the global
                ``config.LOGIN_TYPE``, which ``begin`` reads.
            browser_context: Playwright context whose cookies are read/written.
            context_page: Page on which the QR code is looked up.
            login_phone: Stored only; not used by any method in this class.
            cookie_str: Cookie string injected by ``login_by_cookies``.
        """
        config.LOGIN_TYPE = login_type
        self.browser_context = browser_context
        self.context_page = context_page
        self.login_phone = login_phone
        self.cookie_str = cookie_str

    @retry(stop=stop_after_attempt(_LOGIN_POLL_ATTEMPTS), wait=wait_fixed(_LOGIN_POLL_INTERVAL_SECONDS),
           retry=retry_if_result(lambda value: value is False))
    async def check_login_state(self) -> bool:
        """
        Poll the browser cookies until the user is logged in.

        A single check returns True when the ``STOKEN`` or ``PTOKEN`` cookie
        has a non-empty value and False otherwise; the retry decorator
        repeats the check while it returns False.

        Returns:
            True once login is detected.

        Raises:
            RetryError: when every attempt returned False.
        """
        current_cookie = await self.browser_context.cookies()
        _, cookie_dict = utils.convert_cookies(current_cookie)
        return bool(cookie_dict.get("STOKEN") or cookie_dict.get("PTOKEN"))

    async def begin(self):
        """
        Start the login flow selected by ``config.LOGIN_TYPE``.

        Raises:
            ValueError: when the login type is not "qrcode", "phone" or "cookie".
        """
        utils.logger.info("[BaiduTieBaLogin.begin] Begin login baidutieba ...")
        if config.LOGIN_TYPE == "qrcode":
            await self.login_by_qrcode()
        elif config.LOGIN_TYPE == "phone":
            await self.login_by_mobile()
        elif config.LOGIN_TYPE == "cookie":
            await self.login_by_cookies()
        else:
            # The message lists the values that are actually accepted above
            # (it used to say "cookies", which is rejected).
            raise ValueError("[BaiduTieBaLogin.begin]Invalid Login Type Currently only supported qrcode or phone or cookie ...")

    async def login_by_mobile(self):
        """Login baidutieba by mobile (not implemented: does nothing)."""
        pass

    async def login_by_qrcode(self):
        """
        Log in by QR code and keep the browser context logged in.

        Looks for the QR code image on the page; if it is not there, clicks
        the login button and looks again. The code is then displayed on a
        worker thread while the cookies are polled for a successful login.
        Exits the process when the QR code cannot be found or the polling
        budget runs out.
        """
        utils.logger.info("[BaiduTieBaLogin.login_by_qrcode] Begin login baidutieba by qrcode ...")
        qrcode_img_selector = "xpath=//img[@class='tang-pass-qrcode-img']"
        # find login qrcode
        base64_qrcode_img = await utils.find_login_qrcode(
            self.context_page,
            selector=qrcode_img_selector
        )
        if not base64_qrcode_img:
            utils.logger.info("[BaiduTieBaLogin.login_by_qrcode] login failed , have not found qrcode please check ....")
            # if this website does not automatically popup login dialog box, we will manual click login button
            await asyncio.sleep(0.5)
            login_button_ele = self.context_page.locator("xpath=//li[@class='u_login']")
            await login_button_ele.click()
            base64_qrcode_img = await utils.find_login_qrcode(
                self.context_page,
                selector=qrcode_img_selector
            )
            if not base64_qrcode_img:
                utils.logger.info("[BaiduTieBaLogin.login_by_qrcode] login failed , have not found qrcode please check ....")
                sys.exit()

        # show login qrcode
        # fix issue #12
        # we need to use partial function to call show_qrcode function and run in executor
        # then current asyncio event loop will not be blocked
        partial_show_qrcode = functools.partial(utils.show_qrcode, base64_qrcode_img)
        asyncio.get_running_loop().run_in_executor(executor=None, func=partial_show_qrcode)

        total_wait_seconds = self._LOGIN_POLL_ATTEMPTS * self._LOGIN_POLL_INTERVAL_SECONDS
        utils.logger.info(f"[BaiduTieBaLogin.login_by_qrcode] waiting for scan code login, remaining time is {total_wait_seconds}s")
        try:
            await self.check_login_state()
        except RetryError:
            utils.logger.info("[BaiduTieBaLogin.login_by_qrcode] Login baidutieba failed by qrcode login method ...")
            sys.exit()

        wait_redirect_seconds = 5
        utils.logger.info(f"[BaiduTieBaLogin.login_by_qrcode] Login successful then wait for {wait_redirect_seconds} seconds redirect ...")
        await asyncio.sleep(wait_redirect_seconds)

    async def login_by_cookies(self):
        """
        Log in by injecting ``self.cookie_str`` into the browser context.

        Every name/value pair parsed from the cookie string is registered
        for domain ".baidu.com" and path "/".
        """
        utils.logger.info("[BaiduTieBaLogin.login_by_cookies] Begin login baidutieba by cookie ...")
        cookies = [
            {
                'name': key,
                'value': value,
                'domain': ".baidu.com",
                'path': "/"
            }
            for key, value in utils.convert_str_cookie_to_dict(self.cookie_str).items()
        ]
        # One round-trip for all cookies instead of one awaited call each;
        # nothing is sent when the cookie string yields no pairs.
        if cookies:
            await self.browser_context.add_cookies(cookies)
|
||||
+874
File diff suppressed because one or more lines are too long
+839
File diff suppressed because one or more lines are too long
+189
@@ -0,0 +1,189 @@
|
||||
<li class="lzl_single_post j_lzl_s_p first_no_border" data-field='{"spid":150726504693,"showname":"heinzfrentzen","user_name":"heinzfrentzen","portrait":"tb.1.b08d8f12.IR-tbLlZ2GkD6ARA-mfGOA"}'>
|
||||
<a rel="noopener" name="150726504693"></a>
|
||||
<a rel="noopener" data-field='{"un":"heinzfrentzen","id":"tb.1.b08d8f12.IR-tbLlZ2GkD6ARA-mfGOA"}' target="_blank" class="j_user_card lzl_p_p" href="/home/main?id=tb.1.b08d8f12.IR-tbLlZ2GkD6ARA-mfGOA&fr=pb" username="heinzfrentzen">
|
||||
<img src="https://gss0.bdstatic.com/6LZ1dD3d1sgCo2Kml5_Y_D3/sys/portrait/item/tb.1.b08d8f12.IR-tbLlZ2GkD6ARA-mfGOA"/>
|
||||
</a>
|
||||
<div class="lzl_cnt" data-field='{"iconArr":[],"free_flag":null}'>
|
||||
<a rel="noopener" class="at j_user_card " data-field='{"un":"heinzfrentzen","id":"tb.1.b08d8f12.IR-tbLlZ2GkD6ARA-mfGOA"}' href="/home/main?id=tb.1.b08d8f12.IR-tbLlZ2GkD6ARA-mfGOA&ie=utf-8&fr=pb" target="_blank" username="heinzfrentzen">heinzfrentzen</a>
|
||||
:
|
||||
<span class="lzl_content_main" data-username="">
|
||||
<img class="BDE_Smiley" width="30" height="30" changedsize="false" src="https://gsp0.baidu.com/5aAHeD3nKhI2p27j8IqW0jdnxx1xbK/tb/editor/images/client/image_emoticon25.png">
|
||||
<img class="BDE_Smiley" width="30" height="30" changedsize="false" src="https://gsp0.baidu.com/5aAHeD3nKhI2p27j8IqW0jdnxx1xbK/tb/editor/images/client/image_emoticon25.png">
|
||||
</span>
|
||||
<div class="lzl_content_reply">
|
||||
<span class="lzl_jb" style="display:none;"></span>
|
||||
<span class="lzl_op_list j_lzl_o_l" style="display:none;"></span>
|
||||
<span class="lzl_time">2024-8-6 22:11</span>
|
||||
<a rel="noopener" href="#" class="lzl_s_r">回复</a>
|
||||
</div>
|
||||
</div>
|
||||
</li>
|
||||
<li class="lzl_single_post j_lzl_s_p " data-field='{"spid":150726506822,"showname":"\u53ef\u7231\u7684\u642c\u8fd0\u5de594","user_name":"\u53ef\u7231\u7684\u642c\u8fd0\u5de594","portrait":"tb.1.f1b47a84.Rixjf6fMP-PfH8fnS1CgRA"}'>
|
||||
<a rel="noopener" name="150726506822"></a>
|
||||
<a rel="noopener" data-field='{"un":"\u53ef\u7231\u7684\u642c\u8fd0\u5de594","id":"tb.1.f1b47a84.Rixjf6fMP-PfH8fnS1CgRA"}' target="_blank" class="j_user_card lzl_p_p" href="/home/main?id=tb.1.f1b47a84.Rixjf6fMP-PfH8fnS1CgRA&fr=pb" username="可爱的搬运工94">
|
||||
<img src="https://gss0.bdstatic.com/6LZ1dD3d1sgCo2Kml5_Y_D3/sys/portrait/item/tb.1.f1b47a84.Rixjf6fMP-PfH8fnS1CgRA"/>
|
||||
</a>
|
||||
<div class="lzl_cnt" data-field='{"iconArr":[],"free_flag":null}'>
|
||||
<a rel="noopener" class="at j_user_card " data-field='{"un":"\u53ef\u7231\u7684\u642c\u8fd0\u5de594","id":"tb.1.f1b47a84.Rixjf6fMP-PfH8fnS1CgRA"}' href="/home/main?id=tb.1.f1b47a84.Rixjf6fMP-PfH8fnS1CgRA&ie=utf-8&fr=pb" target="_blank" username="可爱的搬运工94">可爱的搬运工94</a>
|
||||
:<span class="lzl_content_main" data-username="">陈芋汐水花也不小 </span>
|
||||
<div class="lzl_content_reply">
|
||||
<span class="lzl_jb" style="display:none;"></span>
|
||||
<span class="lzl_op_list j_lzl_o_l" style="display:none;"></span>
|
||||
<span class="lzl_time">2024-8-6 22:12</span>
|
||||
<a rel="noopener" href="#" class="lzl_s_r">回复</a>
|
||||
</div>
|
||||
</div>
|
||||
</li>
|
||||
<li class="lzl_single_post j_lzl_s_p " data-field='{"spid":150726508024,"showname":"\u56fd\u9645\u4f53\u575b\u5de8\u661f\u9752\u6912\u8089\u4e1d","user_name":"\u8682\u8681\u96c5\u864e\u54c8\u54c8","portrait":"tb.1.c5c485ab.Cf5aDgd1NxLxZlej8r4LWg"}'>
|
||||
<a rel="noopener" name="150726508024"></a>
|
||||
<a rel="noopener" data-field='{"un":"\u8682\u8681\u96c5\u864e\u54c8\u54c8","id":"tb.1.c5c485ab.Cf5aDgd1NxLxZlej8r4LWg"}' target="_blank" class="j_user_card lzl_p_p" href="/home/main?id=tb.1.c5c485ab.Cf5aDgd1NxLxZlej8r4LWg&fr=pb" username="蚂蚁雅虎哈哈">
|
||||
<img src="https://gss0.bdstatic.com/6LZ1dD3d1sgCo2Kml5_Y_D3/sys/portrait/item/tb.1.c5c485ab.Cf5aDgd1NxLxZlej8r4LWg"/>
|
||||
</a>
|
||||
<div class="lzl_cnt" data-field='{"iconArr":[],"free_flag":null}'>
|
||||
<a rel="noopener" class="at j_user_card " data-field='{"un":"\u8682\u8681\u96c5\u864e\u54c8\u54c8","id":"tb.1.c5c485ab.Cf5aDgd1NxLxZlej8r4LWg"}' href="/home/main?id=tb.1.c5c485ab.Cf5aDgd1NxLxZlej8r4LWg&ie=utf-8&fr=pb" target="_blank" username="蚂蚁雅虎哈哈">国际体坛巨星青椒肉丝</a>
|
||||
:<span class="lzl_content_main" data-username="">你怀孕了吗 老是呕吐 </span>
|
||||
<div class="lzl_content_reply">
|
||||
<span class="lzl_jb" style="display:none;"></span>
|
||||
<span class="lzl_op_list j_lzl_o_l" style="display:none;"></span>
|
||||
<span class="lzl_time">2024-8-6 22:12</span>
|
||||
<a rel="noopener" href="#" class="lzl_s_r">回复</a>
|
||||
</div>
|
||||
</div>
|
||||
</li>
|
||||
<li class="lzl_single_post j_lzl_s_p " data-field='{"spid":150726509762,"showname":"\u8317\u82b1\u5c11\u5e05","user_name":"\u8317\u82b1\u5c11\u5e05","portrait":"tb.1.a0b6ca3c.54TCKizU2c9oSYWqNF7NqA"}'>
|
||||
<a rel="noopener" name="150726509762"></a>
|
||||
<a rel="noopener" data-field='{"un":"\u8317\u82b1\u5c11\u5e05","id":"tb.1.a0b6ca3c.54TCKizU2c9oSYWqNF7NqA"}' target="_blank" class="j_user_card lzl_p_p" href="/home/main?id=tb.1.a0b6ca3c.54TCKizU2c9oSYWqNF7NqA&fr=pb" username="茗花少帅">
|
||||
<img src="https://gss0.bdstatic.com/6LZ1dD3d1sgCo2Kml5_Y_D3/sys/portrait/item/tb.1.a0b6ca3c.54TCKizU2c9oSYWqNF7NqA"/>
|
||||
</a>
|
||||
<div class="lzl_cnt" data-field='{"iconArr":{"all_level":{"2":{"end_time":"1421248220","level":2,"pic_url":"http:\/\/imgsrc.baidu.com\/forum\/pic\/item\/6afa80cb39dbb6fdf9de234d0b24ab18962b37f0.jpg","score_limit":8000}},"level":{"end_time":"1421248220","pic_url":"http:\/\/imgsrc.baidu.com\/forum\/pic\/item\/6afa80cb39dbb6fdf9de234d0b24ab18962b37f0.jpg","props_id":2}},"free_flag":null}'>
|
||||
<a rel="noopener" class="at j_user_card " data-field='{"un":"\u8317\u82b1\u5c11\u5e05","id":"tb.1.a0b6ca3c.54TCKizU2c9oSYWqNF7NqA"}' href="/home/main?id=tb.1.a0b6ca3c.54TCKizU2c9oSYWqNF7NqA&ie=utf-8&fr=pb" target="_blank" username="茗花少帅">茗花少帅</a>
|
||||
:<span class="lzl_content_main" data-username="">你就只看水花,不看空中姿态吗 </span>
|
||||
<div class="lzl_content_reply">
|
||||
<span class="lzl_jb" style="display:none;"></span>
|
||||
<span class="lzl_op_list j_lzl_o_l" style="display:none;"></span>
|
||||
<span class="lzl_time">2024-8-6 22:12</span>
|
||||
<a rel="noopener" href="#" class="lzl_s_r">回复</a>
|
||||
</div>
|
||||
</div>
|
||||
</li>
|
||||
<li class="lzl_single_post j_lzl_s_p " data-field='{"spid":150726510645,"showname":"\u4e1c\u534e\u6b66\u5170","user_name":"\u897f\u5b89\u4ea4\u5927\u524d\u4e00\u767e","portrait":"tb.1.774316af.RqsfwTN2w3AJQFmXAO_MHw"}'>
|
||||
<a rel="noopener" name="150726510645"></a>
|
||||
<a rel="noopener" data-field='{"un":"\u897f\u5b89\u4ea4\u5927\u524d\u4e00\u767e","id":"tb.1.774316af.RqsfwTN2w3AJQFmXAO_MHw"}' target="_blank" class="j_user_card lzl_p_p" href="/home/main?id=tb.1.774316af.RqsfwTN2w3AJQFmXAO_MHw&fr=pb" username="西安交大前一百">
|
||||
<img src="https://gss0.bdstatic.com/6LZ1dD3d1sgCo2Kml5_Y_D3/sys/portrait/item/tb.1.774316af.RqsfwTN2w3AJQFmXAO_MHw"/>
|
||||
</a>
|
||||
<div class="lzl_cnt" data-field='{"iconArr":{"all_level":{"2":{"end_time":"1644033630","level":2,"pic_url":"http:\/\/imgsrc.baidu.com\/forum\/pic\/item\/6afa80cb39dbb6fdf9de234d0b24ab18962b37f0.jpg","score_limit":8000}},"level":{"end_time":"1644033630","pic_url":"http:\/\/imgsrc.baidu.com\/forum\/pic\/item\/6afa80cb39dbb6fdf9de234d0b24ab18962b37f0.jpg","props_id":2}},"free_flag":null}'>
|
||||
<a rel="noopener" class="at j_user_card " data-field='{"un":"\u897f\u5b89\u4ea4\u5927\u524d\u4e00\u767e","id":"tb.1.774316af.RqsfwTN2w3AJQFmXAO_MHw"}' href="/home/main?id=tb.1.774316af.RqsfwTN2w3AJQFmXAO_MHw&ie=utf-8&fr=pb" target="_blank" username="西安交大前一百">东华武兰</a>
|
||||
:<span class="lzl_content_main" data-username="">经典只看水花 </span>
|
||||
<div class="lzl_content_reply">
|
||||
<span class="lzl_jb" style="display:none;"></span>
|
||||
<span class="lzl_op_list j_lzl_o_l" style="display:none;"></span>
|
||||
<span class="lzl_time">2024-8-6 22:12</span>
|
||||
<a rel="noopener" href="#" class="lzl_s_r">回复</a>
|
||||
</div>
|
||||
</div>
|
||||
</li>
|
||||
<li class="lzl_single_post j_lzl_s_p " data-field='{"spid":150726514057,"showname":"\u4e0a\u4e0b\u73ed\u8981\u6ce8\u610f","user_name":"\u4e0a\u4e0b\u73ed\u8981\u6ce8\u610f","portrait":"tb.1.bcab9641.aHxSViAprkm6E0KQWrw3pg"}'>
|
||||
<a rel="noopener" name="150726514057"></a>
|
||||
<a rel="noopener" data-field='{"un":"\u4e0a\u4e0b\u73ed\u8981\u6ce8\u610f","id":"tb.1.bcab9641.aHxSViAprkm6E0KQWrw3pg"}' target="_blank" class="j_user_card lzl_p_p" href="/home/main?id=tb.1.bcab9641.aHxSViAprkm6E0KQWrw3pg&fr=pb" username="上下班要注意">
|
||||
<img src="https://gss0.bdstatic.com/6LZ1dD3d1sgCo2Kml5_Y_D3/sys/portrait/item/tb.1.bcab9641.aHxSViAprkm6E0KQWrw3pg"/>
|
||||
</a>
|
||||
<div class="lzl_cnt" data-field='{"iconArr":[],"free_flag":null}'>
|
||||
<a rel="noopener" class="at j_user_card " data-field='{"un":"\u4e0a\u4e0b\u73ed\u8981\u6ce8\u610f","id":"tb.1.bcab9641.aHxSViAprkm6E0KQWrw3pg"}' href="/home/main?id=tb.1.bcab9641.aHxSViAprkm6E0KQWrw3pg&ie=utf-8&fr=pb" target="_blank" username="上下班要注意">上下班要注意</a>
|
||||
:<span class="lzl_content_main" data-username="">额,分数正常吧 </span>
|
||||
<div class="lzl_content_reply">
|
||||
<span class="lzl_jb" style="display:none;"></span>
|
||||
<span class="lzl_op_list j_lzl_o_l" style="display:none;"></span>
|
||||
<span class="lzl_time">2024-8-6 22:13</span>
|
||||
<a rel="noopener" href="#" class="lzl_s_r">回复</a>
|
||||
</div>
|
||||
</div>
|
||||
</li>
|
||||
<li class="lzl_single_post j_lzl_s_p " data-field='{"spid":150726520372,"showname":"\u9759\u770b\u8682\u8681\u4e0a\u6811","user_name":"\u9759\u770b\u8682\u8681\u4e0a\u6811","portrait":"tb.1.7ea539b2.dHz6uxKdbItmtGkwZeV6oQ"}'>
|
||||
<a rel="noopener" name="150726520372"></a>
|
||||
<a rel="noopener" data-field='{"un":"\u9759\u770b\u8682\u8681\u4e0a\u6811","id":"tb.1.7ea539b2.dHz6uxKdbItmtGkwZeV6oQ"}' target="_blank" class="j_user_card lzl_p_p" href="/home/main?id=tb.1.7ea539b2.dHz6uxKdbItmtGkwZeV6oQ&fr=pb" username="静看蚂蚁上树">
|
||||
<img src="https://gss0.bdstatic.com/6LZ1dD3d1sgCo2Kml5_Y_D3/sys/portrait/item/tb.1.7ea539b2.dHz6uxKdbItmtGkwZeV6oQ"/>
|
||||
</a>
|
||||
<div class="lzl_cnt" data-field='{"iconArr":[],"free_flag":null}'>
|
||||
<a rel="noopener" class="at j_user_card " data-field='{"un":"\u9759\u770b\u8682\u8681\u4e0a\u6811","id":"tb.1.7ea539b2.dHz6uxKdbItmtGkwZeV6oQ"}' href="/home/main?id=tb.1.7ea539b2.dHz6uxKdbItmtGkwZeV6oQ&ie=utf-8&fr=pb" target="_blank" username="静看蚂蚁上树">静看蚂蚁上树</a>
|
||||
:
|
||||
<span class="lzl_content_main" data-username="">
|
||||
回复 <a href="http://tieba.baidu.com/i/sys/jump?un= " onclick="Stats.sendRequest('fr=tb0_forum&st_mod=pb&st_value=atlink');" onmouseover="showattip(this)" onmouseout="hideattip(this)" username=" " portrait="tb.1.c5c485ab.Cf5aDgd1NxLxZlej8r4LWg" target="_blank" class="at">国际体坛巨星青椒肉丝</a>
|
||||
:吃酸黄瓜吃多了<img class="BDE_Smiley" width="30" height="30" changedsize="false" src="https://gsp0.baidu.com/5aAHeD3nKhI2p27j8IqW0jdnxx1xbK/tb/editor/images/client/image_emoticon22.png">
|
||||
<img class="BDE_Smiley" width="30" height="30" changedsize="false" src="https://gsp0.baidu.com/5aAHeD3nKhI2p27j8IqW0jdnxx1xbK/tb/editor/images/client/image_emoticon22.png">
|
||||
<img class="BDE_Smiley" width="30" height="30" changedsize="false" src="https://gsp0.baidu.com/5aAHeD3nKhI2p27j8IqW0jdnxx1xbK/tb/editor/images/client/image_emoticon22.png">
|
||||
</span>
|
||||
<div class="lzl_content_reply">
|
||||
<span class="lzl_jb" style="display:none;"></span>
|
||||
<span class="lzl_op_list j_lzl_o_l" style="display:none;"></span>
|
||||
<span class="lzl_time">2024-8-6 22:14</span>
|
||||
<a rel="noopener" href="#" class="lzl_s_r">回复</a>
|
||||
</div>
|
||||
</div>
|
||||
</li>
|
||||
<li class="lzl_single_post j_lzl_s_p " data-field='{"spid":150726524963,"showname":"\u4e0d\u61c2\u53d6\u5565\u540d\u5b57\ud83d\ude1c","user_name":"\u9ec4\u5c0f\u6e2forz","portrait":"tb.1.e74fa44d.lLp46IIhj8NhhHk12z_qRA"}'>
|
||||
<a rel="noopener" name="150726524963"></a>
|
||||
<a rel="noopener" data-field='{"un":"\u9ec4\u5c0f\u6e2forz","id":"tb.1.e74fa44d.lLp46IIhj8NhhHk12z_qRA"}' target="_blank" class="j_user_card lzl_p_p" href="/home/main?id=tb.1.e74fa44d.lLp46IIhj8NhhHk12z_qRA&fr=pb" username="黄小港orz">
|
||||
<img src="https://gss0.bdstatic.com/6LZ1dD3d1sgCo2Kml5_Y_D3/sys/portrait/item/tb.1.e74fa44d.lLp46IIhj8NhhHk12z_qRA"/>
|
||||
</a>
|
||||
<div class="lzl_cnt" data-field='{"iconArr":[],"free_flag":null}'>
|
||||
<a rel="noopener" class="at j_user_card " data-field='{"un":"\u9ec4\u5c0f\u6e2forz","id":"tb.1.e74fa44d.lLp46IIhj8NhhHk12z_qRA"}' href="/home/main?id=tb.1.e74fa44d.lLp46IIhj8NhhHk12z_qRA&ie=utf-8&fr=pb" target="_blank" username="黄小港orz">不懂取啥名字😜</a>
|
||||
:
|
||||
<span class="lzl_content_main" data-username="">
|
||||
请你去跟国际泳联投诉<img class="BDE_Smiley" width="30" height="30" changedsize="false" src="https://gsp0.baidu.com/5aAHeD3nKhI2p27j8IqW0jdnxx1xbK/tb/editor/images/client/image_emoticon22.png">
|
||||
</span>
|
||||
<div class="lzl_content_reply">
|
||||
<span class="lzl_jb" style="display:none;"></span>
|
||||
<span class="lzl_op_list j_lzl_o_l" style="display:none;"></span>
|
||||
<span class="lzl_time">2024-8-6 22:15</span>
|
||||
<a rel="noopener" href="#" class="lzl_s_r">回复</a>
|
||||
</div>
|
||||
</div>
|
||||
</li>
|
||||
<li class="lzl_single_post j_lzl_s_p " data-field='{"spid":150726535666,"showname":"\ud83d\udcab\u6cfd\u8d6b\u62c9\ud83d\udcaf","user_name":"\u5feb\u770b\u5361\u5361\u5361\u5361","portrait":"tb.1.5f510507.B4GLS91flqmWc5QXoaRCoQ"}'>
|
||||
<a rel="noopener" name="150726535666"></a>
|
||||
<a rel="noopener" data-field='{"un":"\u5feb\u770b\u5361\u5361\u5361\u5361","id":"tb.1.5f510507.B4GLS91flqmWc5QXoaRCoQ"}' target="_blank" class="j_user_card lzl_p_p" href="/home/main?id=tb.1.5f510507.B4GLS91flqmWc5QXoaRCoQ&fr=pb" username="快看卡卡卡卡">
|
||||
<img src="https://gss0.bdstatic.com/6LZ1dD3d1sgCo2Kml5_Y_D3/sys/portrait/item/tb.1.5f510507.B4GLS91flqmWc5QXoaRCoQ"/>
|
||||
</a>
|
||||
<div class="lzl_cnt" data-field='{"iconArr":{"all_level":{"2":{"end_time":"1539783937","level":2,"pic_url":"http:\/\/imgsrc.baidu.com\/forum\/pic\/item\/6afa80cb39dbb6fdf9de234d0b24ab18962b37f0.jpg","score_limit":8000}},"level":{"end_time":"1539783937","pic_url":"http:\/\/imgsrc.baidu.com\/forum\/pic\/item\/6afa80cb39dbb6fdf9de234d0b24ab18962b37f0.jpg","props_id":2}},"free_flag":null}'>
|
||||
<a rel="noopener" class="at j_user_card " data-field='{"un":"\u5feb\u770b\u5361\u5361\u5361\u5361","id":"tb.1.5f510507.B4GLS91flqmWc5QXoaRCoQ"}' href="/home/main?id=tb.1.5f510507.B4GLS91flqmWc5QXoaRCoQ&ie=utf-8&fr=pb" target="_blank" username="快看卡卡卡卡">💫泽赫拉💯</a>
|
||||
:<span class="lzl_content_main" data-username="">第五跳陈空中分腿了,空中姿态明显全红婵更好 </span>
|
||||
<div class="lzl_content_reply">
|
||||
<span class="lzl_jb" style="display:none;"></span>
|
||||
<span class="lzl_op_list j_lzl_o_l" style="display:none;"></span>
|
||||
<span class="lzl_time">2024-8-6 22:17</span>
|
||||
<a rel="noopener" href="#" class="lzl_s_r">回复</a>
|
||||
</div>
|
||||
</div>
|
||||
</li>
|
||||
<li class="lzl_single_post j_lzl_s_p " data-field='{"spid":150726536076,"showname":"\u55ef\u55ef\u54e6\u54e6\u554a\u554a\ud83d\udc36","user_name":"\u55ef\u55ef\u54e6\u54e6\u554a\u554a\u54fc","portrait":"tb.1.ba071e03._M1o8S5FX4p57pZBJa91CQ"}'>
|
||||
<a rel="noopener" name="150726536076"></a>
|
||||
<a rel="noopener" data-field='{"un":"\u55ef\u55ef\u54e6\u54e6\u554a\u554a\u54fc","id":"tb.1.ba071e03._M1o8S5FX4p57pZBJa91CQ"}' target="_blank" class="j_user_card lzl_p_p" href="/home/main?id=tb.1.ba071e03._M1o8S5FX4p57pZBJa91CQ&fr=pb" username="嗯嗯哦哦啊啊哼">
|
||||
<img src="https://gss0.bdstatic.com/6LZ1dD3d1sgCo2Kml5_Y_D3/sys/portrait/item/tb.1.ba071e03._M1o8S5FX4p57pZBJa91CQ"/>
|
||||
</a>
|
||||
<div class="lzl_cnt" data-field='{"iconArr":null,"free_flag":null}'>
|
||||
<a rel="noopener" class="at j_user_card " data-field='{"un":"\u55ef\u55ef\u54e6\u54e6\u554a\u554a\u54fc","id":"tb.1.ba071e03._M1o8S5FX4p57pZBJa91CQ"}' href="/home/main?id=tb.1.ba071e03._M1o8S5FX4p57pZBJa91CQ&ie=utf-8&fr=pb" target="_blank" username="嗯嗯哦哦啊啊哼">嗯嗯哦哦啊啊🐶</a>
|
||||
:
|
||||
<span class="lzl_content_main" data-username="">
|
||||
回复 <a href="http://tieba.baidu.com/i/sys/jump?un= " onclick="Stats.sendRequest('fr=tb0_forum&st_mod=pb&st_value=atlink');" onmouseover="showattip(this)" onmouseout="hideattip(this)" username=" " portrait="tb.1.84497425.b5GLK5lGm90mTB2BhjrgpA" target="_blank" class="at">美味蟹黄堡💞</a>
|
||||
:你不会看起跳高度和空中姿态?
|
||||
</span>
|
||||
<div class="lzl_content_reply">
|
||||
<span class="lzl_jb" style="display:none;"></span>
|
||||
<span class="lzl_op_list j_lzl_o_l" style="display:none;"></span>
|
||||
<span class="lzl_time">2024-8-6 22:17</span>
|
||||
<a rel="noopener" href="#" class="lzl_s_r">回复</a>
|
||||
</div>
|
||||
</div>
|
||||
</li>
|
||||
<li class="lzl_li_pager j_lzl_l_p lzl_li_pager_s" data-field='{"total_num":16,"total_page":2}'>
|
||||
<a rel="noopener" class="j_lzl_p btn-sub btn-small pull-right" href="##">
|
||||
<i class="icon-reply"></i>
|
||||
我也说一句
|
||||
</a>
|
||||
<p class="j_pager l_pager pager_theme_2">
|
||||
<span class="tP">1</span>
|
||||
<a href="#2">2</a>
|
||||
<a href="#2">下一页</a>
|
||||
<a href="#2">尾页</a>
|
||||
</p>
|
||||
</li>
|
||||
+96
@@ -0,0 +1,96 @@
|
||||
<div class="s_post_list">
|
||||
<div class="s_post"><span class="p_title"><a data-tid="9117888152" data-fid="26976424" class="bluelink"
|
||||
href="/p/9117888152?pid=150718967291&cid=0#150718967291"
|
||||
target="_blank">武汉交互空间科技:富士康10亿加码中国大陆,印度为何逐渐“失宠</a></span>
|
||||
<div class="p_content">
|
||||
全球知名的电子制造服务巨头富士康的母公司鸿海精密工业股份有限公司正式对外发布了一则重大投资公告,富士康将在郑州投资建设新事业总部大楼,承载新事业总部功能。这一战略举措不仅彰显了富士康对中国市场持续深化的承诺与信心,也预示着该集团业务版图的新一轮扩张与升级。
|
||||
项目一期选址位于郑东新区,建筑面积约700公亩,总投资约10亿元人民币。主要建设总部管理中心、研发中心和工程中心、战略产业发展中心、战略产业金融平台、
|
||||
</div>
|
||||
贴吧:<a data-fid="26976424" class="p_forum" href="/f?kw=%CE%E4%BA%BA%BD%BB%BB%A5%BF%D5%BC%E4"
|
||||
target="_blank"><font class="p_violet">武汉交互空间</font></a>作者:<a
|
||||
href="/home/main?un=VR%D0%E9%C4%E2%B4%EF%C8%CB" target="_blank"><font class="p_violet">VR虚拟达人</font></a>
|
||||
<font class="p_green p_date">2024-08-05 16:45</font></div>
|
||||
<div class="s_post"><span class="p_title"><a data-tid="9114743782" data-fid="90367" class="bluelink"
|
||||
href="/p/9114743782?pid=150705176739&cid=0#150705176739"
|
||||
target="_blank">请各位急用玛尼的小心,骗子最多</a></span>
|
||||
<div class="p_content">
|
||||
这里面到处是骗子,大家小心。特别那些叫出村背货的,基本是卖园区,天下没有那么好的事。就是有这好事,我们在边境上的人,比你们最清楚,轮不到你们,边境上比你们胆子大的人大把,你一不熟悉小路,为什么叫你带货。东南亚带货的集结地,一般在南宁,防城港,昆明,西双版纳,临沧然后师机接了走小路出去,南宁,防城港坐船出去。好多都是二十几手的中介,之前卖园区一个三十万,现在不知道行情,但好多园区不收
|
||||
</div>
|
||||
贴吧:<a data-fid="90367" class="p_forum" href="/f?kw=%B1%B3%B0%FC%BF%CD" target="_blank"><font class="p_violet">背包客</font></a>作者:<a
|
||||
href="/home/main?un=%CC%F9%B0%C9%D3%C3%BB%A7_GC64AUS" target="_blank"><font class="p_violet">贴吧用户_GC64AUS</font></a>
|
||||
<font class="p_green p_date">2024-08-03 07:35</font></div>
|
||||
<div class="s_post"><span class="p_title"><a data-tid="9095684158" data-fid="1388265" class="bluelink"
|
||||
href="/p/9095684158?pid=150616716870&cid=0#150616716870"
|
||||
target="_blank">*2025泰国冷链制冷运输展*东南亚外贸出口</a></span>
|
||||
<div class="p_content">**2025泰国曼谷国际冷库、空调制冷、仓储暨冷链运输展 *2025泰国冷链制冷运输展*东南亚外贸出口-观展游览考察
|
||||
展出时间:2025-7月(具体时间待定) 展出地点:泰国曼谷会展中心 展会周期:一年一届 组展单位:北京励航国际商务会展有限公司
|
||||
人员跟团观展补贴!为您节省成本,寻找适合您的市场:
|
||||
本公司为您提供观展考察机会,让您在大型展会上获得世界同行**科技的资料同时,感受异域文化气息。展会现场走展考察→→当地游览→→当地相关市
|
||||
</div>
|
||||
贴吧:<a data-fid="1388265" class="p_forum" href="/f?kw=%B9%FA%BC%CA%D5%B9%BB%E1" target="_blank"><font
|
||||
class="p_violet">国际展会</font></a>作者:<a href="/home/main?un=zhaot_188" target="_blank"><font
|
||||
class="p_violet">zhaot_188</font></a> <font class="p_green p_date">2024-07-19 15:44</font></div>
|
||||
<div class="s_post"><span class="p_title"><a data-tid="9093564752" data-fid="27984246" class="bluelink"
|
||||
href="/p/9093564752?pid=150606964195&cid=0#150606964195"
|
||||
target="_blank">京湘楼创始人肖鑫:创立于北京,植根长沙,百年美食传承</a></span>
|
||||
<div class="p_content">来源标题:京湘楼创始人肖鑫:创立于北京,植根长沙,百年美食传承 京湘楼(KING HERO)品牌创始人:肖鑫
|
||||
京湘楼,KING
|
||||
HERO,集酱板鸭、肥肠、鸭头、鸭脖、鸭肠、小龙虾、牛蛙、捆鸡、鸡爪、鱼嘴巴、鱼尾、鱿鱼、牛肉、猪头肉等特色食品卤制,加工、包装与生产经营。2022年3月在北京朝阳区双井开设了第一家“京湘楼·鲜卤集市”卤味熟食快餐店,2023年5月在湖南省长沙市开福区注册成立了“长沙京湘楼品牌管理有限公司”,以“京湘楼”作为品
|
||||
</div>
|
||||
贴吧:<a data-fid="27984246" class="p_forum" href="/f?kw=%BE%A9%CF%E6%C2%A5" target="_blank"><font
|
||||
class="p_violet">京湘楼</font></a>作者:<a href="/home/main?un=%CC%EC%C9%F1%B6%C9%B3%BE" target="_blank"><font
|
||||
class="p_violet">天神渡尘</font></a> <font class="p_green p_date">2024-07-17 23:43</font></div>
|
||||
<div class="s_post"><span class="p_title"><a data-tid="9088419293" data-fid="310" class="bluelink"
|
||||
href="/p/9088419293?pid=150582471307&cid=0#150582471307"
|
||||
target="_blank">广州能争取到迪士尼与环球落户吗?</a></span>
|
||||
<div class="p_content">
|
||||
不是二选一,而是全都要。上一组数据,上海迪士尼2016年开业就接待游客超过1.2亿人次,香港迪士尼2023全年游客人数才640万人次,约等于无,这么低的入园人次已经引来迪士尼方面的不悦。
|
||||
美国有两个迪士尼,说实话迪士尼的门票并不高,普通人都去的起,中国完全有能力建两到三个迪士尼,欧洲只有第一个迪士尼,因为它的人口只有中国的一半,假设中国人一年吃一包盐,一年就是14包,那么欧洲就是七亿包盐,盐再便宜,欧洲人也不可能一人吃
|
||||
</div>
|
||||
贴吧:<a data-fid="310" class="p_forum" href="/f?kw=%B5%D8%C0%ED" target="_blank"><font
|
||||
class="p_violet">地理</font></a>作者:<a href="/home/main?un=SeaRoutes" target="_blank"><font
|
||||
class="p_violet">SeaRoutes</font></a> <font class="p_green p_date">2024-07-13 20:17</font></div>
|
||||
<div class="s_post"><span class="p_title"><a data-tid="9088416365" data-fid="7561034" class="bluelink"
|
||||
href="/p/9088416365?pid=150582456551&cid=0#150582456551"
|
||||
target="_blank">#城市GDP#广州应该全力去争取迪士尼和环球影城</a></span>
|
||||
<div class="p_content">
|
||||
不是二选一,而是全都要。上一组数据,上海迪士尼2016年开业就接待游客超过1.2亿人次,香港迪士尼2023全年游客人数才640万人次,约等于无,这么低的入园人次已经引来迪士尼方面的不悦。
|
||||
美国有两个迪士尼,说实话迪士尼的门票并不高,普通人都去的起,中国完全有能力建两到三个迪士尼,欧洲只有第一个迪士尼,因为它的人口只有中国的一半,假设中国人一年吃一包盐,一年就是14包,那么欧洲就是七亿包盐,盐再便宜,欧洲人也不可能一人吃
|
||||
</div>
|
||||
贴吧:<a data-fid="7561034" class="p_forum" href="/f?kw=%B3%C7%CA%D0gdp" target="_blank"><font class="p_violet">城市gdp</font></a>作者:<a
|
||||
href="/home/main?un=SeaRoutes" target="_blank"><font class="p_violet">SeaRoutes</font></a> <font
|
||||
class="p_green p_date">2024-07-13 20:14</font></div>
|
||||
<div class="s_post"><span class="p_title"><a data-tid="9087419039" data-fid="46374" class="bluelink"
|
||||
href="/p/9087419039?pid=150577861626&cid=0#150577861626"
|
||||
target="_blank">云南省首批《云南日报》昆明新闻头条聚焦阳宗海省级物流枢纽建设</a></span>
|
||||
<div class="p_content">
|
||||
7月11日《云南日报》昆明新闻头条刊发文章《阳宗海风景名胜区立足“衔接西部陆海新通道与中老铁路”优势——加速28个物流枢纽设施建设》聚焦昆明阳宗海风景名胜区系统推进省级物流枢纽建设和功能提升深挖比较优势壮大物流产业据云南省发展和改革委员会在昆明召开的新闻发布会上公布,今年全省共有5地纳入云南省第一批省级物流枢纽和省级骨干冷链物流基地建设名单,其中,昆明市有两家获批,阳宗海物流枢纽上榜!一起来看近日,云南省
|
||||
</div>
|
||||
贴吧:<a data-fid="46374" class="p_forum" href="/f?kw=%C0%A5%C3%F7" target="_blank"><font
|
||||
class="p_violet">昆明</font></a>作者:<a href="/home/main?un=%8F%EC" target="_blank"><font
|
||||
class="p_violet">忟</font></a> <font class="p_green p_date">2024-07-12 23:04</font></div>
|
||||
<div class="s_post"><span class="p_title"><a data-tid="9085102046" data-fid="348713" class="bluelink"
|
||||
href="/p/9085102046?pid=150567555367&cid=0#150567555367"
|
||||
target="_blank">寻找弟弟,很久没跟家里联系</a></span>
|
||||
<div class="p_content">Kk四期世纪园区,寻找弟弟,外号大佐,F3 2楼,公司cj集团</div>
|
||||
贴吧:<a data-fid="348713" class="p_forum" href="/f?kw=%B6%AB%C4%CF%D1%C7" target="_blank"><font
|
||||
class="p_violet">东南亚</font></a>作者:<a href="/home/main?un=%CC%F9%B0%C9%D3%C3%BB%A7_GC2CtRa"
|
||||
target="_blank"><font class="p_violet">贴吧用户_GC2CtRa</font></a>
|
||||
<font class="p_green p_date">2024-07-11 07:53</font></div>
|
||||
<div class="s_post"><span class="p_title"><a data-tid="9083888071" data-fid="30" class="bluelink"
|
||||
href="/p/9083888071?pid=150562129935&cid=0#150562129935"
|
||||
target="_blank">拉美 非洲 东南亚 南亚等发展中国家不太可能普及八小时双休吧?</a></span>
|
||||
<div class="p_content">拉美 和 东南亚的泰国 之类的连毒枭和黑色产业都管不好感觉普及八小时双休不太可能 缅甸和非洲军阀林立
|
||||
跟军阀谈八小时双休那么不开玩笑?缅北诈骗园区就能看出来。
|
||||
</div>
|
||||
贴吧:<a data-fid="30" class="p_forum" href="/f?kw=%C0%FA%CA%B7" target="_blank"><font
|
||||
class="p_violet">历史</font></a>作者:<a href="/home/main?un=yoursagain" target="_blank"><font
|
||||
class="p_violet">yoursagain</font></a> <font class="p_green p_date">2024-07-10 09:00</font></div>
|
||||
<div class="s_post"><span class="p_title"><a data-tid="9071937582" data-fid="8103241" class="bluelink"
|
||||
href="/p/9071937582?pid=150510120873&cid=0#150510120873"
|
||||
target="_blank">东南亚,园区【 工 价 低 】</a></span>
|
||||
<div class="p_content"></div>
|
||||
贴吧:<a data-fid="8103241" class="p_forum" href="/f?kw=%D4%B0%C7%F8%D5%D0%C9%CC" target="_blank"><font
|
||||
class="p_violet">园区招商</font></a>作者:<a href="/home/main?un=QQ59052966" target="_blank"><font
|
||||
class="p_violet">QQ59052966</font></a> <font class="p_green p_date">2024-06-30 12:09</font></div>
|
||||
</div>
|
||||
+3627
File diff suppressed because one or more lines are too long
Reference in New Issue
Block a user