a0845a8169
会员卡不限制车辆使用
165 lines
6.1 KiB
Python
165 lines
6.1 KiB
Python
from playwright.sync_api import sync_playwright, TimeoutError as PlaywrightTimeoutError
|
|
import time
|
|
import pandas as pd
|
|
from tqdm import tqdm
|
|
import logging
|
|
|
|
# Logging setup: pages that get skipped are recorded in this log file.
# Only WARNING and above are written; the file is opened as UTF-8 because the
# messages contain Chinese text.
logging.basicConfig(
    encoding="utf-8",
    format="%(asctime)s - %(levelname)s - %(message)s",
    level=logging.WARNING,
    filename=r"D:\Idea Project\F6+宜搭+其它(1)\张阳脚本\文件输出\skipped_pages.log",
)
|
|
|
|
def extract_table_data(page):
    """Extract the data rows of the ``table.dg`` table on the current page.

    A row is skipped when it contains a ``th`` cell, when its text contains
    "合计" (the totals row), or when it has fewer than 12 ``td`` cells.

    Args:
        page: Object exposing ``query_selector_all``; in this script it is
            the Playwright page showing the package list.

    Returns:
        A list with one dict per data row. Keys are "ID" followed by the
        ten exported column names, in table order. Every value is a stripped
        string; a row without a readable ``input[title]`` element, or a cell
        whose text is ``None``, yields an empty string instead of raising.
    """
    # (cell index, output key) for each exported column. Cells 0 and 1 are
    # not exported.
    columns = (
        (2, "车牌"),
        (3, "姓名"),
        (4, "卡号"),
        (5, "卡类型"),
        (6, "套餐项目"),
        (7, "总次数"),
        (8, "消费"),
        (9, "剩余"),
        (10, "剩余成本"),
        (11, "到期日期"),
    )

    data = []
    for row in page.query_selector_all("table.dg tbody tr"):
        # Header rows carry <th> cells.
        if row.query_selector("th"):
            continue
        # text_content() may be None; treat that as "no text" so a single odd
        # row cannot abort extraction of the whole page.
        if "合计" in (row.text_content() or ""):
            continue

        cells = row.query_selector_all("td")
        if len(cells) < 12:
            continue

        # The record ID is the "title" attribute of an input inside the row.
        # A missing input is an expected case and is checked explicitly; any
        # other lookup failure stays best-effort so the row is still exported.
        try:
            id_input = row.query_selector("input[title]")
            if id_input is not None:
                record_id = id_input.get_attribute("title") or ""
            else:
                record_id = ""
        except Exception:
            record_id = ""

        record = {"ID": record_id}
        for index, key in columns:
            record[key] = (cells[index].text_content() or "").strip()
        data.append(record)

    return data
|
|
|
|
|
|
# ====== Cookie configuration (keep unchanged) ======
# NOTE(review): this is a session cookie string embedded directly in the
# source, including credential-looking fields. Consider loading it from
# outside the repository; it is left as-is here so the script keeps working.
cookie_str = "td_cookie=628629794; td_cookie=627897944; ASP.NET_SessionId=54barjh2gsquceps2flqvlwy; ztrjnew@4db97b96-12af-45b0-b232-fd1e9b7a672e@=PassWord=wZn2IuvdWeE=&RememberPwd=RXv90LpPskw=&UserId=nzK31b3ZYVQ=&CSID=VjfeyHPOjnU=&UserName=fDfTOArNJXHmbGEeaShOsw==&SID=nkNRF6dD83c=&RoleId=1X5bqQAfxQY=&GroupId=KUxCDdt69t4="

# Parse "name=value; name=value" into a dict. Only the first "=" separates
# name from value (values themselves contain "="), fragments without "=" are
# ignored, and a repeated name keeps its last value. Built with a generator
# so no loop temporaries are left behind as module globals.
cookies_dict = dict(
    fragment.strip().split("=", 1)
    for fragment in cookie_str.split(";")
    if "=" in fragment
)

# Every cookie is attached to the same host and path.
domain = "crm.zhongtukj.com"
path = "/"
new_cookies = [
    {"name": cookie_name, "value": cookie_value, "domain": domain, "path": path}
    for cookie_name, cookie_value in cookies_dict.items()
]
|
|
|
|
|
|
def navigate_to_page(page, target_page: int, max_retries: int = 3, *,
                     retry_delay: float = 2.0, timeout_ms: int = 50000):
    """Bring the pager to ``target_page`` and wait for the table, with retries.

    Page 1 is assumed to be already displayed by the initial load, so only
    the table rows are awaited. For any other page the ASP.NET pager postback
    is triggered through ``__doPostBack`` and the function waits for the
    network to go idle and for table rows to be present.

    Args:
        page: Object exposing ``evaluate``, ``wait_for_load_state`` and
            ``wait_for_selector`` (the Playwright page in this script).
        target_page: 1-based page number to display.
        max_retries: Maximum number of attempts.
        retry_delay: Seconds to sleep between failed attempts.
        timeout_ms: Timeout in milliseconds passed to each wait call.

    Returns:
        True once the table rows are present. False when every attempt
        failed (a warning is then written to the log), or when
        ``max_retries`` is less than 1 and no attempt was made.
    """
    for attempt in range(1, max_retries + 1):
        try:
            if target_page != 1:
                # Trigger the pager postback for the requested page.
                page.evaluate(f"() => __doPostBack('AspNetPager', '{target_page}')")
                page.wait_for_load_state("networkidle", timeout=timeout_ms)
            # NOTE(review): this selector may also match rows of the
            # previously displayed page, so on its own it might not prove
            # that the postback has finished - confirm against the site.
            page.wait_for_selector("table.dg tbody tr", timeout=timeout_ms)
            return True
        except PlaywrightTimeoutError as e:
            print(f" ⚠️ 第 {target_page} 页加载超时(第 {attempt}/{max_retries} 次尝试): {str(e)[:100]}...")
            skip_reason = "加载超时"
        except Exception as e:
            print(f" ❌ 第 {target_page} 页发生未知错误(第 {attempt}/{max_retries} 次): {e}")
            skip_reason = f"未知错误 - {str(e)}"

        # Shared failure handling: pause before the next attempt, or give up
        # and record the skipped page after the final one.
        if attempt < max_retries:
            time.sleep(retry_delay)
        else:
            logging.warning("跳过页面 %s: %s", target_page, skip_reason)
            return False

    return False
|
|
|
|
|
|
def main(start_page: int = 1, end_page: int = 1532):
    """Scrape the package-card list page by page and save it to an Excel file.

    Opens a visible Chromium window with the configured cookies, loads the
    list page, selects option "0" of ``#Drop_Group`` (all stores), then
    visits every page from ``start_page`` to ``end_page`` inclusive. Pages
    that cannot be loaded or parsed are skipped and logged. The collected
    rows are written to an ``.xlsx`` file when at least one row was found.

    Args:
        start_page: First page number to process.
        end_page: Last page number to process (inclusive).
    """
    all_data = []

    with sync_playwright() as p:
        browser = p.chromium.launch(headless=False, slow_mo=300)
        try:
            context = browser.new_context()
            context.add_cookies(new_cookies)
            page = context.new_page()
            page.set_default_timeout(50000)  # default timeout: 50000 ms (50 s)

            # Initial load of the first page.
            print("正在加载初始页面...")
            try:
                page.goto("http://crm.zhongtukj.com/Boss/Customer/CustomerPackageList.aspx", timeout=50000)
                page.wait_for_load_state("networkidle")
                # Select "all stores".
                page.select_option("#Drop_Group", value="0")
                page.wait_for_load_state("networkidle")
            except Exception as e:
                print(f"❌ 初始页面加载失败: {e}")
                return

            # Main loop: process one page at a time.
            for current_page in tqdm(range(start_page, end_page + 1), desc="处理页面"):
                if not navigate_to_page(page, current_page, max_retries=3):
                    continue  # page could not be loaded; already logged

                try:
                    all_data.extend(extract_table_data(page))
                except Exception as e:
                    print(f" ❌ 第 {current_page} 页数据提取失败: {e}")
                    logging.warning(f"第 {current_page} 页数据提取异常: {e}")
        finally:
            # Close the browser on every exit path, including unexpected
            # exceptions raised while setting up the context or page.
            browser.close()

    # Save the results.
    if all_data:
        df = pd.DataFrame(all_data)
        filename = rf"D:\Idea Project\F6+宜搭+其它(1)\张阳脚本\文件输出\套餐卡_第{start_page}至{end_page}页.xlsx"
        df.to_excel(filename, index=False)
        print(f"\n✅ 共提取 {len(all_data)} 条记录,已保存到 '{filename}'")
        print("⚠️ 跳过的页面已记录到 skipped_pages.log")
    else:
        print("⚠️ 未提取到任何数据")
|
|
|
|
|
|
# Script entry point: start the scrape only when this file is executed
# directly, not when it is imported.
if __name__ == "__main__":
    main()