"""Scrape the paginated customer package list from the CRM site and export it to Excel."""

import logging
import time

import pandas as pd
from playwright.sync_api import sync_playwright, TimeoutError as PlaywrightTimeoutError
from tqdm import tqdm

# Timeout, in milliseconds, shared by every Playwright wait in this script (50 s).
DEFAULT_TIMEOUT_MS = 50000

# Logging setup: pages that get skipped are recorded in this file.
logging.basicConfig(
    filename=r"D:\Idea Project\F6+宜搭+其它(1)\张阳脚本\文件输出\skipped_pages.log",
    level=logging.WARNING,
    format="%(asctime)s - %(levelname)s - %(message)s",
    encoding='utf-8'
)

# Output column name -> index of the <td> cell the value is read from.
_CELL_COLUMNS = (
    ("车牌", 2),
    ("姓名", 3),
    ("卡号", 4),
    ("卡类型", 5),
    ("套餐项目", 6),
    ("总次数", 7),
    ("消费", 8),
    ("剩余", 9),
    ("剩余成本", 10),
    ("到期日期", 11),
)


def _element_text(element) -> str:
    """Return the element's text with surrounding whitespace stripped.

    ``text_content()`` may return ``None``; that case yields an empty string
    instead of raising ``AttributeError``.
    """
    return (element.text_content() or "").strip()


def extract_table_data(page) -> list[dict[str, str]]:
    """Extract the data rows of the ``table.dg`` table on the current page.

    Rows that contain a ``<th>``, rows whose text contains "合计" and rows with
    fewer than 12 ``<td>`` cells are skipped.

    Args:
        page: Playwright page that currently shows the list.

    Returns:
        One dict per data row, keyed by the output column names ("ID" plus
        the names in ``_CELL_COLUMNS``). All values are strings.
    """
    data = []
    for row in page.query_selector_all("table.dg tbody tr"):
        # Guard against text_content() returning None so one odd row cannot
        # abort extraction of the whole page.
        if row.query_selector("th") or "合计" in (row.text_content() or ""):
            continue

        cells = row.query_selector_all("td")
        if len(cells) < 12:
            continue

        # The record id is the title attribute of the row's <input title=...>.
        # It is best-effort: a missing input or a lookup failure gives "".
        try:
            id_input = row.query_selector("input[title]")
            record_id = (id_input.get_attribute("title") or "") if id_input else ""
        except Exception:
            record_id = ""

        record = {"ID": record_id}
        for column, index in _CELL_COLUMNS:
            record[column] = _element_text(cells[index])
        data.append(record)
    return data


# ====== Cookie configuration (keep unchanged) ======
# SECURITY NOTE(review): this is a hard-coded session cookie that appears to
# carry login credentials. Consider loading it from an environment variable or
# an untracked local file instead of committing it with the script.
cookie_str = "td_cookie=628629794; td_cookie=627897944; ASP.NET_SessionId=54barjh2gsquceps2flqvlwy; ztrjnew@4db97b96-12af-45b0-b232-fd1e9b7a672e@=PassWord=wZn2IuvdWeE=&RememberPwd=RXv90LpPskw=&UserId=nzK31b3ZYVQ=&CSID=VjfeyHPOjnU=&UserName=fDfTOArNJXHmbGEeaShOsw==&SID=nkNRF6dD83c=&RoleId=1X5bqQAfxQY=&GroupId=KUxCDdt69t4="

# Split the header into name/value pairs. Only the first "=" separates the
# name from the value; a repeated name keeps the last value seen.
cookies_dict = {}
for part in cookie_str.split(";"):
    part = part.strip()
    if "=" in part:
        name, value = part.split("=", 1)
        cookies_dict[name] = value

domain = "crm.zhongtukj.com"
path = "/"
new_cookies = [
    {"name": name, "value": value, "domain": domain, "path": path}
    for name, value in cookies_dict.items()
]


def navigate_to_page(page, target_page: int, max_retries: int = 3) -> bool:
    """Jump to the given page number of the list, retrying on failure.

    Page 1 is expected to be loaded already, so only the table rows are waited
    for. Any other page is requested through the ``__doPostBack`` pager call.

    Args:
        page: Playwright page showing the list.
        target_page: Page number to display.
        max_retries: Maximum number of attempts before giving up.

    Returns:
        True once the table rows are present; False when every attempt failed
        (the skipped page is then written to the log).
    """
    for attempt in range(1, max_retries + 1):
        try:
            if target_page != 1:
                # Trigger the pager postback for the requested page.
                # NOTE(review): if the postback has not started by the time the
                # waits below run, they could be satisfied by the previous
                # page's table - confirm the rows read really belong to
                # target_page.
                page.evaluate(f"() => __doPostBack('AspNetPager', '{target_page}')")
                page.wait_for_load_state("networkidle", timeout=DEFAULT_TIMEOUT_MS)
            page.wait_for_selector("table.dg tbody tr", timeout=DEFAULT_TIMEOUT_MS)
            return True
        except PlaywrightTimeoutError as e:
            print(f" ⚠️ 第 {target_page} 页加载超时(第 {attempt}/{max_retries} 次尝试): {str(e)[:100]}...")
            if attempt < max_retries:
                # Short pause before retrying. Reloading the page here is an
                # option if the site hangs badly.
                time.sleep(2)
            else:
                logging.warning(f"跳过页面 {target_page}: 加载超时")
                return False
        except Exception as e:
            print(f" ❌ 第 {target_page} 页发生未知错误(第 {attempt}/{max_retries} 次): {e}")
            if attempt < max_retries:
                time.sleep(2)
            else:
                logging.warning(f"跳过页面 {target_page}: 未知错误 - {str(e)}")
                return False
    # Only reached when max_retries < 1, i.e. no attempt was made.
    return False


def main(start_page: int = 1, end_page: int = 1532) -> None:
    """Scrape pages ``start_page``..``end_page`` and save the rows to Excel.

    Args:
        start_page: First page number to process (inclusive).
        end_page: Last page number to process (inclusive).
    """
    all_data = []

    with sync_playwright() as p:
        browser = p.chromium.launch(headless=False, slow_mo=300)
        try:
            context = browser.new_context()
            context.add_cookies(new_cookies)
            page = context.new_page()
            page.set_default_timeout(DEFAULT_TIMEOUT_MS)  # global 50-second timeout

            # Initial load of the first page.
            print("正在加载初始页面...")
            try:
                page.goto(
                    "http://crm.zhongtukj.com/Boss/Customer/CustomerPackageList.aspx",
                    timeout=DEFAULT_TIMEOUT_MS,
                )
                page.wait_for_load_state("networkidle")

                # Select the "all stores" option in the store drop-down.
                page.select_option("#Drop_Group", value="0")
                page.wait_for_load_state("networkidle")
            except Exception as e:
                print(f"❌ 初始页面加载失败: {e}")
                return

            # Main loop: process the pages one by one.
            for current_page in tqdm(range(start_page, end_page + 1), desc="处理页面"):
                if not navigate_to_page(page, current_page, max_retries=3):
                    continue  # skip this page; it has already been logged

                try:
                    all_data.extend(extract_table_data(page))
                except Exception as e:
                    print(f" ❌ 第 {current_page} 页数据提取失败: {e}")
                    logging.warning(f"第 {current_page} 页数据提取异常: {e}")
        finally:
            # Close the browser on every exit path, including unexpected errors.
            browser.close()

    # Save the result.
    if all_data:
        df = pd.DataFrame(all_data)
        filename = rf"D:\Idea Project\F6+宜搭+其它(1)\张阳脚本\文件输出\套餐卡_第{start_page}至{end_page}页.xlsx"
        df.to_excel(filename, index=False)
        print(f"\n✅ 共提取 {len(all_data)} 条记录,已保存到 '{filename}'")
        print("⚠️ 跳过的页面已记录到 skipped_pages.log")
    else:
        print("⚠️ 未提取到任何数据")


if __name__ == "__main__":
    main()