众途脚本爬取

会员卡不限制车辆使用
2026-02-02 10:59:00 +08:00
parent f1831c31b4
commit a0845a8169
5 changed files with 321 additions and 126 deletions
@@ -12,7 +12,7 @@ from tqdm import tqdm

 # 配置 WebDriver

-chrom_dirverpath = "D:\ProgramTools\chromedriver-win64\chromedriver.exe"
+chrom_dirverpath = r"D:\Program Files\chromedriver-win64\chromedriver.exe"
 # chrome_options = Options()
 # chrome_options.add_argument("--headless")
 service = Service(executable_path=f'{chrom_dirverpath}')
@@ -47,11 +47,12 @@ def open_page(driver, carId):
    time.sleep(1)


-name = "18919515707"
-password = 'Gtyc123456'
-path = r"C:\Users\Administrator.DESKTOP-7IC2USJ\Downloads\会员卡信息 (5).xlsx"
-df = pd.read_excel(path, engine='openpyxl', sheet_name='会员卡详情', dtype='string')
+name = "15726209669"
+password = 'Zhou123'
+path = r"C:\Users\hp_z66\Desktop\钉钉文件\一号车库需修改不限车辆使用套餐卡（宗川涵）.xlsx"
+df = pd.read_excel(path, engine='openpyxl', sheet_name=1, dtype='string')
 print(df)
+
 if not df.empty:
    carIds = df["卡实体id"]
    first_time = datetime.now()
@@ -71,13 +72,9 @@ if not df.empty:
    WebDriverWait(driver, 3).until(
        EC.presence_of_element_located((By.XPATH, '//*[@id="subMain"]/div/div[3]/div[1]/div[1]/div/span[2]'))).click()

-
-
-
    for carId in tqdm(carIds):
        try:
            open_page(driver, carId)
-            time.sleep(1)
        except:
            print(f"{carId},无法打印")
    driver.close()
@@ -0,0 +1,126 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "id": "initial_id",
+   "metadata": {
+    "collapsed": true,
+    "ExecuteTime": {
+     "end_time": "2026-01-30T09:27:59.557746200Z",
+     "start_time": "2026-01-30T09:27:59.437881100Z"
+    }
+   },
+   "source": [
+    "import pandas as pd\n",
+    "\n",
+    "# 假设你的 DataFrame 名为 df，包含以下列：\n",
+    "# 'material_code': 材料编码\n",
+    "# 'in_qty': 入库数量\n",
+    "# 'in_cost': 入库成本（总金额，不是单价）\n",
+    "df = pd.read_excel(fr\"C:\\Users\\hp_z66\\OneDrive\\Desktop\\材料成本明细表核对.xlsx\",sheet_name='Sheet1')\n",
+    "# 1. 按材料编码分组，计算总入库数量和总入库成本\n",
+    "summary = df.groupby('材料编码').agg(\n",
+    "    total_in_qty=('数量', 'sum'),\n",
+    "    total_in_cost=(' 除税成本', 'sum')\n",
+    ").reset_index()\n",
+    "\n",
+    "print(summary)\n",
+    "summary.to_csv(fr\"C:\\Users\\hp_z66\\OneDrive\\Desktop\\材料成本明细表核对sheet1.csv\")"
+   ],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "          材料编码  total_in_qty  total_in_cost\n",
+      "0    CL0003899             8         184.07\n",
+      "1    CL0004029           300          92.92\n",
+      "2    CL0004193           100         176.99\n",
+      "3    CL0005552            -1       -4250.04\n",
+      "4    CL0005554             2        2268.78\n",
+      "..         ...           ...            ...\n",
+      "461  CL0007466             1         800.00\n",
+      "462  CL0007467             1         800.00\n",
+      "463  CL0007468             1         500.00\n",
+      "464  CL0007469             1         500.00\n",
+      "465  CL0007470             1         500.00\n",
+      "\n",
+      "[466 rows x 3 columns]\n"
+     ]
+    }
+   ],
+   "execution_count": 3
+  },
+  {
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2026-01-30T09:28:27.291489100Z",
+     "start_time": "2026-01-30T09:28:27.188359600Z"
+    }
+   },
+   "cell_type": "code",
+   "source": [
+    "import pandas as pd\n",
+    "\n",
+    "# 假设你的 DataFrame 名为 df，包含以下列：\n",
+    "# 'material_code': 材料编码\n",
+    "# 'in_qty': 入库数量\n",
+    "# 'in_cost': 入库成本（总金额，不是单价）\n",
+    "df = pd.read_excel(fr\"C:\\Users\\hp_z66\\OneDrive\\Desktop\\材料成本明细表核对.xlsx\",sheet_name='Sheet2')\n",
+    "# 1. 按材料编码分组，计算总入库数量和总入库成本\n",
+    "summary = df.groupby('材料编码').agg(\n",
+    "    total_in_qty=('采购入库数量', 'sum'),\n",
+    "    total_in_cost=('采购入库成本（除税）', 'sum')\n",
+    ").reset_index()\n",
+    "\n",
+    "print(summary)\n",
+    "summary.to_csv(fr\"C:\\Users\\hp_z66\\OneDrive\\Desktop\\材料成本明细表核对sheet2.csv\")"
+   ],
+   "id": "fcb775d7ed25bd85",
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "          材料编码  total_in_qty  total_in_cost\n",
+      "0    CL0003899             8     184.070800\n",
+      "1    CL0004029           300      92.920500\n",
+      "2    CL0004193           100     176.991200\n",
+      "3    CL0005552            -1   -4250.044248\n",
+      "4    CL0005554             2    2268.778762\n",
+      "..         ...           ...            ...\n",
+      "459  CL0007466             1     800.000000\n",
+      "460  CL0007467             1     800.000000\n",
+      "461  CL0007468             1     500.000000\n",
+      "462  CL0007469             1     500.000000\n",
+      "463  CL0007470             1     500.000000\n",
+      "\n",
+      "[464 rows x 3 columns]\n"
+     ]
+    }
+   ],
+   "execution_count": 5
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 2
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython2",
+   "version": "2.7.6"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
@@ -0,0 +1,165 @@
+from playwright.sync_api import sync_playwright, TimeoutError as PlaywrightTimeoutError
+import time
+import pandas as pd
+from tqdm import tqdm
+import logging
+
+# Configure logging: record the pages that were skipped
+logging.basicConfig(
+    filename=r"D:\Idea Project\F6+宜搭+其它(1)\张阳脚本\文件输出\skipped_pages.log",
+    level=logging.WARNING,
+    format="%(asctime)s - %(levelname)s - %(message)s",
+    encoding='utf-8'
+)
+
+def extract_table_data(page):
+    """Extract the table rows of the current page as a list of dicts, skipping header rows, "合计" (total) rows and rows with fewer than 12 cells."""
+    rows = page.query_selector_all("table.dg tbody tr")
+    data = []
+    for row in rows:
+        if row.query_selector("th") or "合计" in row.text_content():
+            continue
+        cells = row.query_selector_all("td")
+        if len(cells) < 12:
+            continue
+
+        try:
+            record_id = row.query_selector("input[title]").get_attribute("title") or ""
+        except Exception:
+            record_id = ""
+
+        car_no = cells[2].text_content().strip()
+        name = cells[3].text_content().strip()
+        card_no = cells[4].text_content().strip()
+        card_type = cells[5].text_content().strip()
+        package = cells[6].text_content().strip()
+        total_times = cells[7].text_content().strip()
+        consumed = cells[8].text_content().strip()
+        remaining = cells[9].text_content().strip()
+        remaining_cost = cells[10].text_content().strip()
+        expire_date = cells[11].text_content().strip()
+
+        data.append({
+            "ID": record_id,
+            "车牌": car_no,
+            "姓名": name,
+            "卡号": card_no,
+            "卡类型": card_type,
+            "套餐项目": package,
+            "总次数": total_times,
+            "消费": consumed,
+            "剩余": remaining,
+            "剩余成本": remaining_cost,
+            "到期日期": expire_date
+        })
+    return data
+
+
+# ====== Cookie configuration (kept unchanged) ======
+cookie_str = "td_cookie=628629794; td_cookie=627897944; ASP.NET_SessionId=54barjh2gsquceps2flqvlwy; ztrjnew@4db97b96-12af-45b0-b232-fd1e9b7a672e@=PassWord=wZn2IuvdWeE=&RememberPwd=RXv90LpPskw=&UserId=nzK31b3ZYVQ=&CSID=VjfeyHPOjnU=&UserName=fDfTOArNJXHmbGEeaShOsw==&SID=nkNRF6dD83c=&RoleId=1X5bqQAfxQY=&GroupId=KUxCDdt69t4="
+
+cookies_dict = {}
+for part in cookie_str.split(";"):
+    part = part.strip()
+    if "=" in part:
+        name, value = part.split("=", 1)  # split on the first "=" only; cookie values contain "=" themselves
+        cookies_dict[name] = value  # a repeated name (td_cookie appears twice) keeps only its last value
+
+domain = "crm.zhongtukj.com"
+path = "/"
+new_cookies = [
+    {"name": name, "value": value, "domain": domain, "path": path}
+    for name, value in cookies_dict.items()
+]
+
+
+def navigate_to_page(page, target_page: int, max_retries: int = 3):
+    """
+    Jump to the given page number with retries; return True once table rows are present, False (after logging a warning) when all max_retries attempts fail.
+    """
+    for attempt in range(1, max_retries + 1):
+        try:
+            if target_page == 1:
+                # Page 1 is already loaded by the initial navigation; just wait for the table
+                page.wait_for_selector("table.dg tbody tr", timeout=50000)
+                return True
+            else:
+                # Trigger the pager postback to jump to the target page
+                page.evaluate(f"() => __doPostBack('AspNetPager', '{target_page}')")
+                page.wait_for_load_state("networkidle", timeout=50000)
+                page.wait_for_selector("table.dg tbody tr", timeout=50000)
+                return True
+        except PlaywrightTimeoutError as e:
+            print(f"  ⚠️ 第 {target_page} 页加载超时（第 {attempt}/{max_retries} 次尝试）: {str(e)[:100]}...")
+            if attempt < max_retries:
+                time.sleep(2)
+                # Optional: reload the page before retrying (for a badly hung page)
+                # page.reload()
+            else:
+                logging.warning(f"跳过页面 {target_page}: 加载超时")
+                return False
+        except Exception as e:
+            print(f"  ❌ 第 {target_page} 页发生未知错误（第 {attempt}/{max_retries} 次）: {e}")
+            if attempt < max_retries:
+                time.sleep(2)
+            else:
+                logging.warning(f"跳过页面 {target_page}: 未知错误 - {str(e)}")
+                return False
+    return False
+
+
+def main():  # Scrape pages start_page..end_page of the package list and save the collected rows to an Excel file
+    start_page = 1
+    end_page = 1532
+    all_data = []
+
+    with sync_playwright() as p:
+        browser = p.chromium.launch(headless=False, slow_mo=300)
+        context = browser.new_context()
+        context.add_cookies(new_cookies)
+        page = context.new_page()
+        page.set_default_timeout(50000)  # global default timeout: 50 seconds (value is in milliseconds)
+
+        # Initial load of the first page
+        print("正在加载初始页面...")
+        try:
+            page.goto("http://crm.zhongtukj.com/Boss/Customer/CustomerPackageList.aspx", timeout=50000)
+            page.wait_for_load_state("networkidle")
+            # Select "all stores" (option value "0" of #Drop_Group)
+            page.select_option("#Drop_Group", value="0")
+            page.wait_for_load_state("networkidle")
+        except Exception as e:
+            print(f"❌ 初始页面加载失败: {e}")
+            browser.close()
+            return
+
+        # Main loop: process the pages one by one
+        for current_page in tqdm(range(start_page, end_page + 1), desc="处理页面"):
+            success = navigate_to_page(page, current_page, max_retries=3)
+            if not success:
+                continue  # skip this page
+
+            try:
+                data = extract_table_data(page)
+                all_data.extend(data)
+                # print(f"  第 {current_page} 页提取 {len(data)} 条记录")
+            except Exception as e:
+                print(f"  ❌ 第 {current_page} 页数据提取失败: {e}")
+                logging.warning(f"第 {current_page} 页数据提取异常: {e}")
+                continue
+
+        browser.close()
+
+        # Save the results
+        if all_data:
+            df = pd.DataFrame(all_data)
+            filename = rf"D:\Idea Project\F6+宜搭+其它(1)\张阳脚本\文件输出\套餐卡_第{start_page}至{end_page}页.xlsx"
+            df.to_excel(filename, index=False)
+            print(f"\n✅ 共提取 {len(all_data)} 条记录，已保存到 '{filename}'")
+            print(f"⚠️ 跳过的页面已记录到 skipped_pages.log")
+        else:
+            print("⚠️ 未提取到任何数据")
+
+
+if __name__ == "__main__":
+    main()
@@ -12,8 +12,8 @@
   "metadata": {
    "collapsed": true,
    "ExecuteTime": {
-     "end_time": "2026-01-27T02:00:26.466693100Z",
-     "start_time": "2026-01-27T02:00:16.549976600Z"
+     "end_time": "2026-01-31T11:05:57.587138200Z",
+     "start_time": "2026-01-31T11:04:17.031226100Z"
    }
   },
   "source": [
@@ -21,28 +21,38 @@
    "\n",
    "import requests\n",
    "import pandas as pd\n",
+    "\n",
    "cookies = {\n",
-    "    'td_cookie': '628168942',\n",
+    "    'td_cookie': '628629794',\n",
    "    'td_cookie': '627897944',\n",
-    "    'ASP.NET_SessionId': 'q5qzer2z51b4uzsxrhterzop',\n",
+    "    'ASP.NET_SessionId': '54barjh2gsquceps2flqvlwy',\n",
    "    'ztrjnew@4db97b96-12af-45b0-b232-fd1e9b7a672e@': 'PassWord=wZn2IuvdWeE=&RememberPwd=RXv90LpPskw=&UserId=nzK31b3ZYVQ=&CSID=VjfeyHPOjnU=&UserName=fDfTOArNJXHmbGEeaShOsw==&SID=nkNRF6dD83c=&RoleId=1X5bqQAfxQY=&GroupId=KUxCDdt69t4=',\n",
    "}\n",
    "\n",
    "headers = {\n",
    "    'Accept': 'application/json, text/javascript, */*; q=0.01',\n",
    "    'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',\n",
-    "    'Connection': 'keep-alive',\n",
+    "    'Proxy-Connection': 'keep-alive',\n",
    "    'Referer': 'http://crm.zhongtukj.com/Boss/Customer/CustomerCardListMem.aspx',\n",
    "    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/144.0.0.0 Safari/537.36 Edg/144.0.0.0',\n",
    "    'X-Requested-With': 'XMLHttpRequest',\n",
-    "    # 'Cookie': 'td_cookie=628168942; td_cookie=627897944; ASP.NET_SessionId=q5qzer2z51b4uzsxrhterzop; ztrjnew@4db97b96-12af-45b0-b232-fd1e9b7a672e@=PassWord=wZn2IuvdWeE=&RememberPwd=RXv90LpPskw=&UserId=nzK31b3ZYVQ=&CSID=VjfeyHPOjnU=&UserName=fDfTOArNJXHmbGEeaShOsw==&SID=nkNRF6dD83c=&RoleId=1X5bqQAfxQY=&GroupId=KUxCDdt69t4=',\n",
+    "    # 'Cookie': 'td_cookie=628629794; td_cookie=627897944; ASP.NET_SessionId=54barjh2gsquceps2flqvlwy; ztrjnew@4db97b96-12af-45b0-b232-fd1e9b7a672e@=PassWord=wZn2IuvdWeE=&RememberPwd=RXv90LpPskw=&UserId=nzK31b3ZYVQ=&CSID=VjfeyHPOjnU=&UserName=fDfTOArNJXHmbGEeaShOsw==&SID=nkNRF6dD83c=&RoleId=1X5bqQAfxQY=&GroupId=KUxCDdt69t4=',\n",
    "}\n",
-    "all_data= []\n",
-    "for i in range(1,4):\n",
+    "\n",
+    "all_data = []\n",
+    "for i in range(1, 78):\n",
    "    params = {\n",
    "        'action': 'GetList',\n",
-    "        'groupId': '9',\n",
+    "        'groupId': '0',\n",
    "        'keyword': '',\n",
+    "        'managerid': '-1',\n",
+    "        'isweixin': '',\n",
+    "        'cardid': '',\n",
+    "        'saleid': '',\n",
+    "        'openCardTime1': '',\n",
+    "        'openCardTime2': '',\n",
+    "        'deadline1': '',\n",
+    "        'deadline2': '',\n",
    "        'page': i,\n",
    "        'rows': '20',\n",
    "        'sort': 'ID',\n",
@@ -61,17 +71,17 @@
    "    all_data.extend(rows)\n",
    "\n",
    "df = pd.DataFrame(all_data)\n",
-    "df.to_excel(\"D:\\Idea Project\\F6+宜搭+其它(1)\\张阳脚本\\文件输出\\谷途有效会员卡.xlsx\",index=False)"
+    "df.to_excel(\"D:\\Idea Project\\F6+宜搭+其它(1)\\张阳脚本\\文件输出\\谷途有效会员卡.xlsx\", index=False)"
   ],
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
-      "<>:42: SyntaxWarning: invalid escape sequence '\\I'\n",
-      "<>:42: SyntaxWarning: invalid escape sequence '\\I'\n",
-      "C:\\Users\\hp_z66\\AppData\\Local\\Temp\\ipykernel_27516\\3060641384.py:42: SyntaxWarning: invalid escape sequence '\\I'\n",
-      "  df.to_excel(\"D:\\Idea Project\\F6+宜搭+其它(1)\\张阳脚本\\文件输出\\谷途有效会员卡.xlsx\",index=False)\n"
+      "<>:55: SyntaxWarning: invalid escape sequence '\\I'\n",
+      "<>:55: SyntaxWarning: invalid escape sequence '\\I'\n",
+      "C:\\Users\\hp_z66\\AppData\\Local\\Temp\\ipykernel_16640\\1148863384.py:55: SyntaxWarning: invalid escape sequence '\\I'\n",
+      "  df.to_excel(\"D:\\Idea Project\\F6+宜搭+其它(1)\\张阳脚本\\文件输出\\谷途有效会员卡.xlsx\", index=False)\n"
     ]
    }
   ],
@@ -1,103 +0,0 @@
-from playwright.sync_api import sync_playwright
-import time
-import pandas as pd
-
-
-def extract_table_data(page):
-    """从当前页面提取表格数据"""
-    rows = page.query_selector_all("table.dg tbody tr")
-    data = []
-    for row in rows:
-        # 跳过表头和合计行
-        if row.query_selector("th") or "合计" in row.text_content():
-            continue
-
-        cells = row.query_selector_all("td")
-        if len(cells) < 12:
-            continue  # 非数据行
-
-        record_id = row.query_selector("input[title]").get_attribute("title") or ""
-        car_no = cells[2].text_content().strip()
-        name = cells[3].text_content().strip()
-        card_no = cells[4].text_content().strip()
-        card_type = cells[5].text_content().strip()
-        package = cells[6].text_content().strip()
-        total_times = cells[7].text_content().strip()
-        consumed = cells[8].text_content().strip()
-        remaining = cells[9].text_content().strip()
-        remaining_cost = cells[10].text_content().strip()
-        expire_date = cells[11].text_content().strip()
-
-        data.append({
-            "ID": record_id,
-            "车牌": car_no,
-            "姓名": name,
-            "卡号": card_no,
-            "卡类型": card_type,
-            "套餐项目": package,
-            "总次数": total_times,
-            "消费": consumed,
-            "剩余": remaining,
-            "剩余成本": remaining_cost,
-            "到期日期": expire_date
-        })
-    return data
-
-
-def main():
-    # ====== 手动设置页码范围 ======
-    start_page = 1  # 起始页（包含）
-    end_page = 5  # 结束页（包含）
-    # ============================
-
-    with sync_playwright() as p:
-        browser = p.chromium.launch(headless=False, slow_mo=300)
-        context = browser.new_context()
-
-        # 设置你的 Cookie（请根据实际情况更新）
-        context.add_cookies([
-            {"name": "ASP.NET_SessionId", "value": "knhs0hxsbmolk20gidmlis3j", "domain": "crm.zhongtukj.com",
-             "path": "/"},
-            {"name": "ztrjnew@4db97b96-12af-45b0-b232-fd1e9b7a672e@",
-             "value": "PassWord=wZn2IuvdWeE=&RememberPwd=RXv90LpPskw=&UserId=nzK31b3ZYVQ=&CSID=VjfeyHPOjnU=&UserName=fDfTOArNJXHmbGEeaShOsw==&SID=nkNRF6dD83c=&RoleId=1X5bqQAfxQY=&GroupId=KUxCDdt69t4=",
-             "domain": "crm.zhongtukj.com", "path": "/"}
-        ])
-
-        page = context.new_page()
-        print(f"正在加载第 {start_page} 页（初始页）...")
-        page.goto("http://crm.zhongtukj.com/Boss/Customer/CustomerPackageList.aspx")
-        page.wait_for_load_state("networkidle")
-        time.sleep(2)
-
-        all_data = []
-
-        for current_page in range(start_page, end_page + 1):
-            if current_page == 1:
-                # 第 1 页已加载，直接提取
-                pass
-            else:
-                # 跳转到指定页码
-                print(f"正在跳转到第 {current_page} 页...")
-                page.evaluate(f"() => __doPostBack('AspNetPager', '{current_page}')")
-                page.wait_for_load_state("networkidle")
-                time.sleep(2)  # 稳定等待
-
-            # 提取当前页数据
-            data = extract_table_data(page)
-            all_data.extend(data)
-            print(f"  第 {current_page} 页提取 {len(data)} 条记录")
-
-        browser.close()
-
-        # 保存结果
-        if all_data:
-            df = pd.DataFrame(all_data)
-            filename = f"D:\Idea Project\F6+宜搭+其它(1)\张阳脚本\文件输出\套餐卡_第{start_page}至{end_page}页.xlsx"
-            df.to_excel(filename, index=False)
-            print(f"\n✅ 共提取 {len(all_data)} 条记录，已保存到 '{filename}'")
-        else:
-            print("⚠️ 未提取到任何数据")
-
-
-if __name__ == "__main__":
-    main()