131 lines
4.5 KiB
Python
131 lines
4.5 KiB
Python
from selenium import webdriver
|
|
from selenium.webdriver.chrome.service import Service
|
|
from selenium.webdriver.chrome.options import Options
|
|
from selenium.webdriver.common.by import By
|
|
import time
|
|
import os
|
|
import pickle
|
|
import json
|
|
from datetime import datetime, timedelta
|
|
|
|
# ==================== Configuration ====================
# Name of the file the session cookies are saved to.
COOKIE_FILE = "sina_cookies.pkl"
LOGIN_URL = "https://tousu.sina.com.cn/"
DOMAIN = "tousu.sina.com.cn"
# Maximum validity period of the saved cookies, in days.
MAX_COOKIE_AGE_DAYS = 7
|
|
|
|
def is_cookie_valid():
    """Report whether the saved cookie file can be reused for login.

    The file at ``COOKIE_FILE`` is accepted only when all of these hold:

    * it exists and was last written at most ``MAX_COOKIE_AGE_DAYS`` days ago;
    * it unpickles to a non-empty list;
    * no cookie that carries an ``'expiry'`` timestamp has already expired.

    Returns:
        bool: True when the saved cookies look usable, False otherwise.
    """
    if not os.path.exists(COOKIE_FILE):
        return False

    # Cookies without an 'expiry' field never fail the per-cookie check
    # below, so the age of the file itself is bounded as well.
    max_age_seconds = timedelta(days=MAX_COOKIE_AGE_DAYS).total_seconds()
    if time.time() - os.path.getmtime(COOKIE_FILE) > max_age_seconds:
        print("Cookies 已过期")
        return False

    # NOTE(security): pickle.load can execute arbitrary code while loading,
    # so COOKIE_FILE must only ever contain data written by this script.
    try:
        with open(COOKIE_FILE, "rb") as f:
            cookies = pickle.load(f)
    except (OSError, EOFError, pickle.UnpicklingError,
            AttributeError, ImportError, IndexError):
        # An empty, truncated or otherwise unreadable file means
        # "log in again", not a crash.
        return False

    if not isinstance(cookies, list) or not cookies:
        return False

    now = time.time()
    for cookie in cookies:
        # Compare raw epoch seconds: datetime.fromtimestamp() can raise
        # OverflowError/OSError for values outside the platform's range.
        if 'expiry' in cookie and cookie['expiry'] < now:
            print("Cookies 已过期")
            return False
    return True
|
|
|
|
def save_cookies(driver):
    """Pickle the cookies of the current browser session into ``COOKIE_FILE``.

    Args:
        driver: Browser driver whose ``get_cookies()`` result is stored.
    """
    # Fetch the cookies before the file is opened (and thereby truncated).
    session_cookies = driver.get_cookies()
    with open(COOKIE_FILE, "wb") as cookie_file:
        pickle.dump(session_cookies, cookie_file)
    print("Cookies 已保存")
|
|
|
|
def load_cookies(driver):
    """Add the cookies stored in ``COOKIE_FILE`` to the browser session.

    ``LOGIN_URL`` is opened first because the target site has to be visited
    once before cookies can be set for it.

    Args:
        driver: Browser driver; its ``get()`` and ``add_cookie()`` methods
            are called.
    """
    driver.get(LOGIN_URL)

    # NOTE(security): pickle.load can execute arbitrary code while loading,
    # so COOKIE_FILE must only ever contain data written by this script.
    with open(COOKIE_FILE, "rb") as f:
        cookies = pickle.load(f)

    for cookie in cookies:
        # Strip fields that are not needed when re-adding the cookie.
        cookie.pop('sameSite', None)
        # An expiry that already lies in the past is removed, so the cookie
        # is handed to the driver without any expiry.
        if 'expiry' in cookie and cookie['expiry'] < time.time():
            del cookie['expiry']
        driver.add_cookie(cookie)
    print("Cookies 加载成功")
|
|
|
|
def check_login_status(driver):
    """Check whether the current page shows the logged-in user element.

    Waits up to 10 seconds for ``#SI_User > div.ac-login.ac-logined`` to be
    present in the page.

    Args:
        driver: Selenium WebDriver whose current page is inspected.

    Returns:
        bool: True if the element appeared, False if the wait timed out.
        Any other error (for example a closed browser) propagates to the
        caller instead of being reported as "not logged in".
    """
    # Imported here because the module-level imports do not provide these
    # names; without them the wait below raised NameError on every call.
    from selenium.common.exceptions import TimeoutException
    from selenium.webdriver.support import expected_conditions as EC
    from selenium.webdriver.support.ui import WebDriverWait

    logged_in_marker = (By.CSS_SELECTOR, "#SI_User > div.ac-login.ac-logined")
    try:
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located(logged_in_marker)
        )
    except TimeoutException:
        print("未登录或登录状态失效")
        return False
    print("检测到已登录状态")
    return True
|
|
|
|
def collect_complaints(driver, keyword, max_items=1000, *,
                       scroll_pause=2, max_idle_rounds=3):
    """Collect complaint titles from the search results for ``keyword``.

    The search page is opened and scrolled to the bottom repeatedly; after
    each scroll the visible result titles are read. Collection stops once
    ``max_items`` distinct titles are known or ``max_idle_rounds``
    consecutive scrolls produced no new title.

    Args:
        driver: Selenium WebDriver used to load and scroll the page.
        keyword: Search term; it is percent-encoded before being placed in
            the URL.
        max_items: Upper bound on the number of titles returned.
        scroll_pause: Seconds to sleep after each scroll before reading the
            page.
        max_idle_rounds: Number of consecutive scrolls without a new title
            after which collection stops.

    Returns:
        list[str]: Distinct, non-empty titles in the order they were first
        seen, at most ``max_items`` of them.
    """
    from urllib.parse import quote

    # Encode the keyword so characters such as '&', '#' or spaces cannot
    # break the query string.
    encoded_keyword = quote(keyword, safe="")
    search_url = f"https://tousu.sina.com.cn/index/search/?keywords={encoded_keyword}&t=1"
    driver.get(search_url)

    title_selector = "#search_tab > div.blackcat-container > div.tab-con.tousu-list > div > a > h1"
    collected = {}  # dict keys serve as an insertion-ordered set
    idle_rounds = 0

    while len(collected) < max_items and idle_rounds < max_idle_rounds:
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(scroll_pause)

        count_before = len(collected)
        for element in driver.find_elements(By.CSS_SELECTOR, title_selector):
            text = element.text.strip()
            if not text:
                continue
            collected.setdefault(text, None)
            if len(collected) >= max_items:
                # Never return more than the caller asked for.
                break

        if len(collected) == count_before:
            idle_rounds += 1
        else:
            idle_rounds = 0

    return list(collected)
|
|
|
|
def main():
    """Run the interactive complaint-collection workflow.

    Steps:
        1. Start Chrome with the configured options and driver path.
        2. Log in, either from the saved cookies (falling back to a manual
           login when they are rejected) or manually, then save the cookies.
        3. Ask for a search keyword and collect the matching complaints.
        4. Write one complaint per line to ``tousu.txt``.

    The browser is always closed, even when a step raises.
    """
    # Configure the Chrome browser options.
    chrome_options = Options()
    chrome_options.add_argument("--disable-blink-features=AutomationControlled")
    chrome_options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0 Safari/537.36")

    # Start the browser.
    chrome_service = Service(executable_path='D:\\ProgramTools\\chromedriver-win64\\chromedriver.exe')
    driver = webdriver.Chrome(service=chrome_service, options=chrome_options)

    try:
        # Reuse the saved cookies when they are still considered valid.
        if is_cookie_valid():
            print("使用保存的 Cookies 登录...")
            load_cookies(driver)
            # Reload so the page is rendered with the cookies applied.
            driver.get(LOGIN_URL)
            if not check_login_status(driver):
                print("Cookies 无效,需要手动登录")
                input("请手动登录后按回车继续...")
                save_cookies(driver)
        else:
            print("首次登录,请手动登录")
            driver.get(LOGIN_URL)
            input("请手动登录后按回车继续...")
            save_cookies(driver)

        # Ask for the search keyword.
        keyword = input("请输入搜索关键词(例如:F6汽车科技): ")

        # Collect the complaint titles.
        complaints = collect_complaints(driver, keyword)

        # Save the results, one complaint per line.
        with open("tousu.txt", "w", encoding="utf-8") as f:
            f.writelines(f"{item}\n" for item in complaints)

        print(f"成功收集到{len(complaints)}条投诉信息,已保存到tousu.txt")
    finally:
        # Close the browser even if a step above failed, so no Chrome or
        # chromedriver process is left running.
        driver.quit()
|
|
|
|
# Run the workflow only when this file is executed as a script.
if __name__ == "__main__":
    main()