F6--/张阳脚本/竞品系统数据导出/H1车店数据导出.ipynb

{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "2be2e0c2",
   "metadata": {},
   "source": [
    "# 车辆信息"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "70a8b0da",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2026-03-25T03:51:53.352551400Z",
     "start_time": "2026-03-25T03:51:31.198595700Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "正在获取总页数...\n",
      "总页数：34 页\n",
      "正在爬取第 1/34 页...\n",
      "正在爬取第 2/34 页...\n",
      "正在爬取第 3/34 页...\n",
      "正在爬取第 4/34 页...\n",
      "正在爬取第 5/34 页...\n",
      "正在爬取第 6/34 页...\n",
      "正在爬取第 7/34 页...\n",
      "正在爬取第 8/34 页...\n",
      "正在爬取第 9/34 页...\n",
      "正在爬取第 10/34 页...\n",
      "正在爬取第 11/34 页...\n",
      "正在爬取第 12/34 页...\n",
      "正在爬取第 13/34 页...\n",
      "正在爬取第 14/34 页...\n",
      "正在爬取第 15/34 页...\n",
      "正在爬取第 16/34 页...\n",
      "正在爬取第 17/34 页...\n",
      "正在爬取第 18/34 页...\n",
      "正在爬取第 19/34 页...\n",
      "正在爬取第 20/34 页...\n",
      "正在爬取第 21/34 页...\n",
      "正在爬取第 22/34 页...\n",
      "正在爬取第 23/34 页...\n",
      "正在爬取第 24/34 页...\n",
      "正在爬取第 25/34 页...\n",
      "正在爬取第 26/34 页...\n",
      "正在爬取第 27/34 页...\n",
      "正在爬取第 28/34 页...\n",
      "正在爬取第 29/34 页...\n",
      "正在爬取第 30/34 页...\n",
      "正在爬取第 31/34 页...\n",
      "正在爬取第 32/34 页...\n",
      "正在爬取第 33/34 页...\n",
      "正在爬取第 34/34 页...\n",
      "\n",
      "========== 爬取完成 ==========\n",
      "总计数据：666 行\n",
      "\n",
      "✅ 文件已保存到桌面：\n",
      "📊 Excel文件：C:\\Users\\hp_z66\\Desktop\\车辆数据_已拆分_20260325_1151531.csv\n",
      "📄 文本文件：C:\\Users\\hp_z66\\Desktop\\车辆数据_已拆分_20260325_115153.txt\n",
      "\n",
      "前5行数据预览：\n",
      "1 ['1', '豫NA477R', '卢忠厚', '', '', '', '/', '', '118933km', '', '', '消费记录 编辑 迁移 删除']\n",
      "2 ['2', '豫NF3722', '刘建利', '', '', '', '/', '', '198609km', '', '', '消费记录 编辑 迁移 删除']\n",
      "3 ['3', '豫N13B58', '石', '15090629992', '', '', '/', '', '22462km', '', '', '消费记录 编辑 迁移 删除']\n",
      "4 ['4', '京PYB297', '科迪黄青春', '', '', '', '/', '', '119584km', '', '', '消费记录 编辑 迁移 删除']\n",
      "5 ['5', '豫NN982M', '大众', '', '', '', '/', '', '197504km', '', '', '消费记录 编辑 迁移 删除']\n"
     ]
    }
   ],
   "source": [
    "import requests\n",
    "from bs4 import BeautifulSoup\n",
    "import re\n",
    "import os\n",
    "import csv\n",
    "from datetime import datetime\n",
    "\n",
    "# ===================== 配置区 =====================\n",
    "# 已替换为你curl中的最新cookies\n",
    "COOKIES = (\n",
    "    'showSmsActivity=1; '\n",
    "    'showEasyMoney=1; '\n",
    "    'LOGIN_URL=https%3A%2F%2Fscrm.h1cd.com%2Flogin-h1cd.html; '\n",
    "    'adminun=18530760062; '\n",
    "    'uid=10407; '\n",
    "    'PHPSESSID=7v127mqdfnqa7rgcrlifksrt3t'\n",
    ")\n",
    "\n",
    "BASE_URL = \"https://scrm.h1cd.com/admin/members/carlist\"\n",
    "HEADERS = {\n",
    "    \"Accept\": \"text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7\",\n",
    "    \"Accept-Language\": \"zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6\",\n",
    "    \"Connection\": \"keep-alive\",\n",
    "    \"Referer\": \"https://scrm.h1cd.com/admin/members/carlist.html\",\n",
    "    \"User-Agent\": \"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/146.0.0.0 Safari/537.36 Edg/146.0.0.0\",\n",
    "    \"sec-ch-ua\": '\"Chromium\";v=\"146\", \"Not-A.Brand\";v=\"24\", \"Microsoft Edge\";v=\"146\"',\n",
    "    \"sec-ch-ua-mobile\": \"?0\",\n",
    "    \"sec-ch-ua-platform\": '\"Windows\"',\n",
    "    \"Sec-Fetch-Dest\": \"iframe\",\n",
    "    \"Sec-Fetch-Mode\": \"navigate\",\n",
    "    \"Sec-Fetch-Site\": \"same-origin\",\n",
    "    \"Sec-Fetch-User\": \"?1\",\n",
    "    \"Upgrade-Insecure-Requests\": \"1\"\n",
    "}\n",
    "\n",
    "# 解析Cookie\n",
    "def parse_cookies(cookie_str):\n",
    "    cookie_dict = {}\n",
    "    for item in cookie_str.split(\"; \"):\n",
    "        if \"=\" in item:\n",
    "            key, value = item.split(\"=\", 1)\n",
    "            cookie_dict[key] = value\n",
    "    return cookie_dict\n",
    "\n",
    "# ===================== 核心：数据拆分处理（已修复 br 换行） =====================\n",
    "def process_row(row):\n",
    "    if len(row) < 5:\n",
    "        return row\n",
    "    \n",
    "    new_row = row.copy()\n",
    "\n",
    "    # ========== 1. 处理 C列：客户名称 + 手机号（按 <br> 拆分，已修复） ==========\n",
    "    c_text = new_row.pop(2)\n",
    "    \n",
    "    # 处理换行：统一空格/换行/空白符，提取 名称 + 手机号\n",
    "    # 先把所有空白（包括HTML换行产生的空格）替换成统一分隔符\n",
    "    c_text = re.sub(r'\\s+', ' ', c_text).strip()\n",
    "    \n",
    "    name, phone = \"\", \"\"\n",
    "    # 匹配手机号（11位数字），自动分割\n",
    "    phone_match = re.search(r'1[3-9]\\d{9}', c_text)\n",
    "    if phone_match:\n",
    "        phone = phone_match.group()\n",
    "        name = c_text.replace(phone, '').strip()\n",
    "    else:\n",
    "        name = c_text.strip()\n",
    "\n",
    "    # ========== 2. 处理 E列：颜色/发动机/车架号（已去除前缀） ==========\n",
    "    e_text = new_row.pop(3)\n",
    "    color, engine, vin = \"\", \"\", \"\"\n",
    "\n",
    "    # 去掉所有中文前缀\n",
    "    e_text = re.sub(r'颜\\s*色\\s*：', '', e_text)\n",
    "    e_text = re.sub(r'发动机\\s*：', '', e_text)\n",
    "    e_text = re.sub(r'车架号\\s*：', '', e_text)\n",
    "    \n",
    "    # 按 / 拆分\n",
    "    if \"/\" in e_text:\n",
    "        parts = e_text.split(\"/\", 2)\n",
    "        color = parts[0].strip()\n",
    "        engine = parts[1].strip() if len(parts) > 1 else \"\"\n",
    "        vin = parts[2].strip() if len(parts) > 2 else \"\"\n",
    "    else:\n",
    "        color = e_text.strip()\n",
    "\n",
    "    # 插入拆分后字段\n",
    "    new_row.insert(2, name)\n",
    "    new_row.insert(3, phone)\n",
    "    new_row.insert(4, color)\n",
    "    new_row.insert(5, engine)\n",
    "    new_row.insert(6, vin)\n",
    "\n",
    "    return new_row\n",
    "\n",
    "# 获取单页表格数据 + 自动拆分（保留HTML内换行，解决<br>问题）\n",
    "def get_page_data(page_num):\n",
    "    if page_num == 1:\n",
    "        url = f\"{BASE_URL}.html\"\n",
    "    else:\n",
    "        url = f\"{BASE_URL}_{page_num}.html\"\n",
    "\n",
    "    try:\n",
    "        resp = requests.get(url, headers=HEADERS, cookies=parse_cookies(COOKIES), timeout=15)\n",
    "        resp.raise_for_status()\n",
    "        soup = BeautifulSoup(resp.text, \"html.parser\")\n",
    "        table = soup.find(\"table\")\n",
    "        if not table:\n",
    "            return []\n",
    "\n",
    "        rows = table.find_all(\"tr\")\n",
    "        data = []\n",
    "        for tr in rows:\n",
    "            tds = tr.find_all(\"td\")\n",
    "            cols = []\n",
    "            for td in tds:\n",
    "                # 关键：保留 <br> 产生的换行，不直接压缩\n",
    "                text = td.get_text(separator=\" \", strip=True)\n",
    "                cols.append(text)\n",
    "            \n",
    "            if cols:\n",
    "                processed_cols = process_row(cols)\n",
    "                data.append(processed_cols)\n",
    "        return data\n",
    "\n",
    "    except Exception as e:\n",
    "        print(f\"第{page_num}页请求失败：{e}\")\n",
    "        return []\n",
    "\n",
    "# 获取总页数\n",
    "def get_total_pages():\n",
    "    try:\n",
    "        resp = requests.get(f\"{BASE_URL}.html\", headers=HEADERS, cookies=parse_cookies(COOKIES), timeout=10)\n",
    "        soup = BeautifulSoup(resp.text, \"html.parser\")\n",
    "        page_text = soup.get_text()\n",
    "        match = re.search(r\"共\\s*(\\d+)\\s*页\", page_text)\n",
    "        if match:\n",
    "            return int(match.group(1))\n",
    "\n",
    "        page_links = soup.find_all(\"a\", href=re.compile(r\"carlist_\\d+\\.html\"))\n",
    "        max_page = 1\n",
    "        for a in page_links:\n",
    "            num_match = re.search(r\"carlist_(\\d+)\\.html\", a[\"href\"])\n",
    "            if num_match:\n",
    "                max_page = max(max_page, int(num_match.group(1)))\n",
    "        return max_page\n",
    "    except:\n",
    "        return 1\n",
    "\n",
    "# ===================== 保存到桌面（自动加表头） =====================\n",
    "def save_to_desktop(all_data):\n",
    "    desktop_path = os.path.join(os.path.expanduser(\"~\"), \"Desktop\")\n",
    "    time_str = datetime.now().strftime(\"%Y%m%d_%H%M%S\")\n",
    "    \n",
    "    # 表头（自动添加）\n",
    "    header = [\n",
    "        \"序号\", \"车牌号\", \"客户名称\", \"客户手机号\",\n",
    "        \"颜色\", \"发动机号\", \"车架号\", \"里程数\", \"其他列1\", \"其他列2\"\n",
    "    ]\n",
    "    \n",
    "    csv_file = os.path.join(desktop_path, f\"车辆数据_已拆分_{time_str}1.csv\")\n",
    "    with open(csv_file, \"w\", newline=\"\", encoding=\"utf-8-sig\") as f:\n",
    "        writer = csv.writer(f)\n",
    "        writer.writerow(header)  # 写入标题\n",
    "        writer.writerows(all_data)\n",
    "    \n",
    "    txt_file = os.path.join(desktop_path, f\"车辆数据_已拆分_{time_str}.txt\")\n",
    "    with open(txt_file, \"w\", encoding=\"utf-8\") as f:\n",
    "        f.write(\" | \".join(header) + \"\\n\")\n",
    "        for row in all_data:\n",
    "            f.write(\" | \".join(row) + \"\\n\")\n",
    "    \n",
    "    print(f\"\\n✅ 文件已保存到桌面：\")\n",
    "    print(f\"📊 Excel文件：{csv_file}\")\n",
    "    print(f\"📄 文本文件：{txt_file}\")\n",
    "\n",
    "# ===================== 主程序 =====================\n",
    "if __name__ == \"__main__\":\n",
    "    print(\"正在获取总页数...\")\n",
    "    total_pages = get_total_pages()\n",
    "    # total_pages = 1\n",
    "    print(f\"总页数：{total_pages} 页\")\n",
    "\n",
    "    all_data = []\n",
    "    for page in range(1, total_pages + 1):\n",
    "        print(f\"正在爬取第 {page}/{total_pages} 页...\")\n",
    "        page_data = get_page_data(page)\n",
    "        if page_data:\n",
    "            all_data.extend(page_data)\n",
    "\n",
    "    print(f\"\\n========== 爬取完成 ==========\")\n",
    "    print(f\"总计数据：{len(all_data)} 行\")\n",
    "\n",
    "    save_to_desktop(all_data)\n",
    "\n",
    "    print(\"\\n前5行数据预览：\")\n",
    "    for i, row in enumerate(all_data[:5]):\n",
    "        print(i+1, row)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "6c370235",
   "metadata": {},
   "source": [
    "# 库存信息"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "5392bfc0",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2026-03-25T03:53:43.600296300Z",
     "start_time": "2026-03-25T03:53:18.688209100Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "==================================================\n",
      "开始爬取库存数据...\n",
      "当前 StoreID: 13435\n",
      "当前 HouseID: 9079\n",
      "==================================================\n",
      "✅ 成功获取最大页数：40\n",
      "正在爬取第 1/40 页...\n",
      "正在爬取第 2/40 页...\n",
      "正在爬取第 3/40 页...\n",
      "正在爬取第 4/40 页...\n",
      "正在爬取第 5/40 页...\n",
      "正在爬取第 6/40 页...\n",
      "正在爬取第 7/40 页...\n",
      "正在爬取第 8/40 页...\n",
      "正在爬取第 9/40 页...\n",
      "正在爬取第 10/40 页...\n",
      "正在爬取第 11/40 页...\n",
      "正在爬取第 12/40 页...\n",
      "正在爬取第 13/40 页...\n",
      "正在爬取第 14/40 页...\n",
      "正在爬取第 15/40 页...\n",
      "正在爬取第 16/40 页...\n",
      "正在爬取第 17/40 页...\n",
      "正在爬取第 18/40 页...\n",
      "正在爬取第 19/40 页...\n",
      "正在爬取第 20/40 页...\n",
      "正在爬取第 21/40 页...\n",
      "正在爬取第 22/40 页...\n",
      "正在爬取第 23/40 页...\n",
      "正在爬取第 24/40 页...\n",
      "正在爬取第 25/40 页...\n",
      "正在爬取第 26/40 页...\n",
      "正在爬取第 27/40 页...\n",
      "正在爬取第 28/40 页...\n",
      "正在爬取第 29/40 页...\n",
      "正在爬取第 30/40 页...\n",
      "正在爬取第 31/40 页...\n",
      "正在爬取第 32/40 页...\n",
      "正在爬取第 33/40 页...\n",
      "正在爬取第 34/40 页...\n",
      "正在爬取第 35/40 页...\n",
      "正在爬取第 36/40 页...\n",
      "正在爬取第 37/40 页...\n",
      "正在爬取第 38/40 页...\n",
      "正在爬取第 39/40 页...\n",
      "正在爬取第 40/40 页...\n",
      "\n",
      "🔍 去重完成 (基于列: 配件编码)：原始 782 条 → 去重后 782 条\n",
      "==================================================\n",
      "✅ 爬取 + 去重 完成！\n",
      "📊 最终有效条数：782\n",
      "📁 已保存到桌面：库存数据_13435_去重版1.xlsx\n",
      "==================================================\n"
     ]
    }
   ],
   "source": [
    "import requests\n",
    "import pandas as pd\n",
    "from bs4 import BeautifulSoup\n",
    "import os\n",
    "import re\n",
    "\n",
    "# ===================== 【配置区】 =====================\n",
    "# 【已更新】根据 curl 命令中的 -b 参数替换为最新 Cookie\n",
    "COOKIE = (\n",
    "    'showSmsActivity=1; '\n",
    "    'showEasyMoney=1; '\n",
    "    'LOGIN_URL=https%3A%2F%2Fscrm.h1cd.com%2Flogin-h1cd.html; '\n",
    "    'adminun=18530760062; '\n",
    "    'uid=10407; '\n",
    "    'PHPSESSID=7v127mqdfnqa7rgcrlifksrt3t'\n",
    ")\n",
    "\n",
    "# 【已更新】根据 curl 命令中的 URL 参数更新筛选条件\n",
    "BASE_PARAMS = {\n",
    "    'storeId': '13435',\n",
    "    'house_id': '9079',\n",
    "    'repositoryName': '',\n",
    "    'first_type': '',\n",
    "    'product_type': '',\n",
    "    'status': '',\n",
    "    'searchType': '1',\n",
    "    'product_name': ''\n",
    "}\n",
    "\n",
    "# 分页格式保持不动\n",
    "BASE_URL = \"https://scrm.h1cd.com/admin/billings/stores-search__{}.html\"\n",
    "# ======================================================\n",
    "\n",
    "HEADERS = {\n",
    "    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',\n",
    "    'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',\n",
    "    'Connection': 'keep-alive',\n",
    "    'Cookie': COOKIE,\n",
    "    'Referer': 'https://scrm.h1cd.com/admin/billings/stores-search.html?storeId=13435&house_id=9079&repositoryName=&first_type=&product_type=&status=&searchType=1&product_name=',\n",
    "    'Sec-Fetch-Dest': 'iframe',\n",
    "    'Sec-Fetch-Mode': 'navigate',\n",
    "    'Sec-Fetch-Site': 'same-origin',\n",
    "    'Sec-Fetch-User': '?1',\n",
    "    'Upgrade-Insecure-Requests': '1',\n",
    "    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/146.0.0.0 Safari/537.36 Edg/146.0.0.0',\n",
    "    'sec-ch-ua': '\"Chromium\";v=\"146\", \"Not-A.Brand\";v=\"24\", \"Microsoft Edge\";v=\"146\"',\n",
    "    'sec-ch-ua-mobile': '?0',\n",
    "    'sec-ch-ua-platform': '\"Windows\"',\n",
    "}\n",
    "\n",
    "\n",
    "def get_page_html(page_num):\n",
    "    \"\"\"获取单页HTML\"\"\"\n",
    "    try:\n",
    "        if page_num == 1:\n",
    "            url = \"https://scrm.h1cd.com/admin/billings/stores-search.html\"\n",
    "        else:\n",
    "            url = BASE_URL.format(page_num)\n",
    "\n",
    "        res = requests.get(url, headers=HEADERS, params=BASE_PARAMS, timeout=15)\n",
    "        res.raise_for_status()\n",
    "\n",
    "        if 'login.html' in res.url:\n",
    "            print(f\"⚠️ 第{page_num}页检测到跳转登录，Cookie可能已失效。\")\n",
    "            return None\n",
    "\n",
    "        return res.text\n",
    "    except Exception as e:\n",
    "        print(f\"第{page_num}页请求失败：{e}\")\n",
    "        return None\n",
    "\n",
    "\n",
    "def parse_table(html):\n",
    "    \"\"\"解析表格数据\"\"\"\n",
    "    soup = BeautifulSoup(html, 'html.parser')\n",
    "    table = soup.find('table', class_='table-hover')\n",
    "    if not table:\n",
    "        table = soup.find('table')\n",
    "\n",
    "    if not table:\n",
    "        return []\n",
    "\n",
    "    rows = []\n",
    "    thead = table.find('thead')\n",
    "    if thead:\n",
    "        headers = [th.get_text(strip=True) for th in thead.find_all('th')]\n",
    "        if headers:\n",
    "            rows.append(headers)\n",
    "\n",
    "    tbody = table.find('tbody')\n",
    "    target_rows = tbody.find_all('tr') if tbody else table.find_all('tr')\n",
    "\n",
    "    for tr in target_rows:\n",
    "        tds = tr.find_all('td')\n",
    "        if not tds:\n",
    "            continue\n",
    "        row = [td.get_text(strip=True) for td in tds]\n",
    "        if any(row):\n",
    "            rows.append(row)\n",
    "\n",
    "    return rows\n",
    "\n",
    "\n",
    "def get_max_page():\n",
    "    \"\"\"从页面提取最大页数\"\"\"\n",
    "    html = get_page_html(1)\n",
    "    if not html:\n",
    "        return 1\n",
    "\n",
    "    soup = BeautifulSoup(html, 'html.parser')\n",
    "    page_info = soup.find('div', class_='dataTables_paginate')\n",
    "    if not page_info:\n",
    "        text = soup.get_text()\n",
    "        match = re.search(r'页\\s*1/(\\d+)', text)\n",
    "        if match:\n",
    "            return int(match.group(1))\n",
    "        return 1\n",
    "\n",
    "    text = page_info.get_text()\n",
    "    match = re.search(r'页\\s*1/(\\d+)', text)\n",
    "    if match:\n",
    "        return int(match.group(1))\n",
    "\n",
    "    match = re.search(r'1\\s*/\\s*(\\d+)', text)\n",
    "    if match:\n",
    "        return int(match.group(1))\n",
    "\n",
    "    return 1\n",
    "\n",
    "\n",
    "def main():\n",
    "    print(\"=\" * 50)\n",
    "    print(\"开始爬取库存数据...\")\n",
    "    print(f\"当前 StoreID: {BASE_PARAMS['storeId']}\")\n",
    "    print(f\"当前 HouseID: {BASE_PARAMS['house_id']}\")\n",
    "    print(\"=\" * 50)\n",
    "\n",
    "    max_page = get_max_page()\n",
    "    if max_page == 1:\n",
    "        print(\"⚠️ 仅检测到 1 页，可能是解析失败或确实只有一页。\")\n",
    "\n",
    "    print(f\"✅ 成功获取最大页数：{max_page}\")\n",
    "\n",
    "    all_data = []\n",
    "    for page in range(1, max_page + 1):\n",
    "        print(f\"正在爬取第 {page}/{max_page} 页...\")\n",
    "        html = get_page_html(page)\n",
    "        if not html:\n",
    "            print(f\"❌ 第 {page} 页获取失败，跳过。\")\n",
    "            continue\n",
    "\n",
    "        rows = parse_table(html)\n",
    "        if not rows:\n",
    "            print(f\"⚠️ 第 {page} 页未解析到表格数据。\")\n",
    "            continue\n",
    "\n",
    "        if page == 1:\n",
    "            all_data.extend(rows)\n",
    "        else:\n",
    "            if len(rows) > 0 and rows[0] == all_data[0]:\n",
    "                all_data.extend(rows[1:])\n",
    "            else:\n",
    "                all_data.extend(rows)\n",
    "\n",
    "    if not all_data:\n",
    "        print(\"\\n❌ 未获取到任何数据，请检查 Cookie 或网络。\")\n",
    "        return\n",
    "\n",
    "    desktop = os.path.join(os.path.expanduser(\"~\"), \"Desktop\")\n",
    "    df = pd.DataFrame(all_data[1:], columns=all_data[0])\n",
    "\n",
    "    # 按配件编码去重\n",
    "    target_col = None\n",
    "    for col in df.columns:\n",
    "        if '配件编码' in col or '编码' in col:\n",
    "            target_col = col\n",
    "            break\n",
    "\n",
    "    if target_col:\n",
    "        total_before = len(df)\n",
    "        df = df.drop_duplicates(subset=[target_col], keep='first')\n",
    "        total_after = len(df)\n",
    "        print(f\"\\n🔍 去重完成 (基于列: {target_col})：原始 {total_before} 条 → 去重后 {total_after} 条\")\n",
    "    else:\n",
    "        print(f\"\\n⚠️ 未找到包含【配件编码】的列，跳过去重。当前列名：{list(df.columns)}\")\n",
    "\n",
    "    filename = f\"库存数据_{BASE_PARAMS['storeId']}_去重版1.xlsx\"\n",
    "    path = os.path.join(desktop, filename)\n",
    "\n",
    "    try:\n",
    "        df.to_excel(path, index=False)\n",
    "        print(\"=\" * 50)\n",
    "        print(f\"✅ 爬取 + 去重 完成！\")\n",
    "        print(f\"📊 最终有效条数：{len(df)}\")\n",
    "        print(f\"📁 已保存到桌面：{filename}\")\n",
    "        print(\"=\" * 50)\n",
    "    except Exception as e:\n",
    "        print(f\"❌ 保存文件失败：{e}\")\n",
    "        csv_path = os.path.join(desktop, filename.replace('.xlsx', '.csv'))\n",
    "        df.to_csv(csv_path, index=False, encoding='utf-8-sig')\n",
    "        print(f\"💡 已尝试转为 CSV 保存至：{csv_path}\")\n",
    "\n",
    "\n",
    "if __name__ == '__main__':\n",
    "    main()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "4b11e6fa",
   "metadata": {},
   "source": [
    "# 历史维修记录\n",
    "开单管理"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "cbd4eeb0a30b3e15",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2026-03-25T03:58:48.443601700Z",
     "start_time": "2026-03-25T03:56:48.226330400Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "🔧 开始导出维修记录...\n",
      "📄 正在获取第 1 页以分析页数...\n",
      "   正在请求: https://scrm.h1cd.com/admin/billings/Lists.html\n",
      "📊 预估总页数: 53\n",
      "🔄 正在处理第 1/53 页...\n",
      "   ✅ 本页提取 20 条记录\n",
      "🔄 正在处理第 2/53 页...\n",
      "   正在请求: https://scrm.h1cd.com/admin/billings/Lists_2.html\n",
      "   ✅ 本页提取 20 条记录\n",
      "🔄 正在处理第 3/53 页...\n",
      "   正在请求: https://scrm.h1cd.com/admin/billings/Lists_3.html\n",
      "   ✅ 本页提取 20 条记录\n",
      "🔄 正在处理第 4/53 页...\n",
      "   正在请求: https://scrm.h1cd.com/admin/billings/Lists_4.html\n",
      "   ✅ 本页提取 20 条记录\n",
      "🔄 正在处理第 5/53 页...\n",
      "   正在请求: https://scrm.h1cd.com/admin/billings/Lists_5.html\n",
      "   ✅ 本页提取 20 条记录\n",
      "🔄 正在处理第 6/53 页...\n",
      "   正在请求: https://scrm.h1cd.com/admin/billings/Lists_6.html\n",
      "   ✅ 本页提取 20 条记录\n",
      "🔄 正在处理第 7/53 页...\n",
      "   正在请求: https://scrm.h1cd.com/admin/billings/Lists_7.html\n",
      "   ✅ 本页提取 20 条记录\n",
      "🔄 正在处理第 8/53 页...\n",
      "   正在请求: https://scrm.h1cd.com/admin/billings/Lists_8.html\n",
      "   ✅ 本页提取 20 条记录\n",
      "🔄 正在处理第 9/53 页...\n",
      "   正在请求: https://scrm.h1cd.com/admin/billings/Lists_9.html\n",
      "   ✅ 本页提取 20 条记录\n",
      "🔄 正在处理第 10/53 页...\n",
      "   正在请求: https://scrm.h1cd.com/admin/billings/Lists_10.html\n",
      "   ✅ 本页提取 20 条记录\n",
      "🔄 正在处理第 11/53 页...\n",
      "   正在请求: https://scrm.h1cd.com/admin/billings/Lists_11.html\n",
      "   ✅ 本页提取 20 条记录\n",
      "🔄 正在处理第 12/53 页...\n",
      "   正在请求: https://scrm.h1cd.com/admin/billings/Lists_12.html\n",
      "   ✅ 本页提取 20 条记录\n",
      "🔄 正在处理第 13/53 页...\n",
      "   正在请求: https://scrm.h1cd.com/admin/billings/Lists_13.html\n",
      "   ✅ 本页提取 20 条记录\n",
      "🔄 正在处理第 14/53 页...\n",
      "   正在请求: https://scrm.h1cd.com/admin/billings/Lists_14.html\n",
      "   ✅ 本页提取 20 条记录\n",
      "🔄 正在处理第 15/53 页...\n",
      "   正在请求: https://scrm.h1cd.com/admin/billings/Lists_15.html\n",
      "   ✅ 本页提取 20 条记录\n",
      "🔄 正在处理第 16/53 页...\n",
      "   正在请求: https://scrm.h1cd.com/admin/billings/Lists_16.html\n",
      "   ✅ 本页提取 20 条记录\n",
      "🔄 正在处理第 17/53 页...\n",
      "   正在请求: https://scrm.h1cd.com/admin/billings/Lists_17.html\n",
      "   ✅ 本页提取 20 条记录\n",
      "🔄 正在处理第 18/53 页...\n",
      "   正在请求: https://scrm.h1cd.com/admin/billings/Lists_18.html\n",
      "   ✅ 本页提取 20 条记录\n",
      "🔄 正在处理第 19/53 页...\n",
      "   正在请求: https://scrm.h1cd.com/admin/billings/Lists_19.html\n",
      "   ✅ 本页提取 20 条记录\n",
      "🔄 正在处理第 20/53 页...\n",
      "   正在请求: https://scrm.h1cd.com/admin/billings/Lists_20.html\n",
      "   ✅ 本页提取 20 条记录\n",
      "🔄 正在处理第 21/53 页...\n",
      "   正在请求: https://scrm.h1cd.com/admin/billings/Lists_21.html\n",
      "   ✅ 本页提取 20 条记录\n",
      "🔄 正在处理第 22/53 页...\n",
      "   正在请求: https://scrm.h1cd.com/admin/billings/Lists_22.html\n",
      "   ✅ 本页提取 20 条记录\n",
      "🔄 正在处理第 23/53 页...\n",
      "   正在请求: https://scrm.h1cd.com/admin/billings/Lists_23.html\n",
      "   ✅ 本页提取 20 条记录\n",
      "🔄 正在处理第 24/53 页...\n",
      "   正在请求: https://scrm.h1cd.com/admin/billings/Lists_24.html\n",
      "   ✅ 本页提取 20 条记录\n",
      "🔄 正在处理第 25/53 页...\n",
      "   正在请求: https://scrm.h1cd.com/admin/billings/Lists_25.html\n",
      "   ✅ 本页提取 20 条记录\n",
      "🔄 正在处理第 26/53 页...\n",
      "   正在请求: https://scrm.h1cd.com/admin/billings/Lists_26.html\n",
      "   ✅ 本页提取 20 条记录\n",
      "🔄 正在处理第 27/53 页...\n",
      "   正在请求: https://scrm.h1cd.com/admin/billings/Lists_27.html\n",
      "   ✅ 本页提取 20 条记录\n",
      "🔄 正在处理第 28/53 页...\n",
      "   正在请求: https://scrm.h1cd.com/admin/billings/Lists_28.html\n",
      "   ✅ 本页提取 20 条记录\n",
      "🔄 正在处理第 29/53 页...\n",
      "   正在请求: https://scrm.h1cd.com/admin/billings/Lists_29.html\n",
      "   ✅ 本页提取 20 条记录\n",
      "🔄 正在处理第 30/53 页...\n",
      "   正在请求: https://scrm.h1cd.com/admin/billings/Lists_30.html\n",
      "   ✅ 本页提取 20 条记录\n",
      "🔄 正在处理第 31/53 页...\n",
      "   正在请求: https://scrm.h1cd.com/admin/billings/Lists_31.html\n",
      "   ✅ 本页提取 20 条记录\n",
      "🔄 正在处理第 32/53 页...\n",
      "   正在请求: https://scrm.h1cd.com/admin/billings/Lists_32.html\n",
      "   ✅ 本页提取 20 条记录\n",
      "🔄 正在处理第 33/53 页...\n",
      "   正在请求: https://scrm.h1cd.com/admin/billings/Lists_33.html\n",
      "   ✅ 本页提取 20 条记录\n",
      "🔄 正在处理第 34/53 页...\n",
      "   正在请求: https://scrm.h1cd.com/admin/billings/Lists_34.html\n",
      "   ✅ 本页提取 20 条记录\n",
      "🔄 正在处理第 35/53 页...\n",
      "   正在请求: https://scrm.h1cd.com/admin/billings/Lists_35.html\n",
      "   ✅ 本页提取 20 条记录\n",
      "🔄 正在处理第 36/53 页...\n",
      "   正在请求: https://scrm.h1cd.com/admin/billings/Lists_36.html\n",
      "   ✅ 本页提取 20 条记录\n",
      "🔄 正在处理第 37/53 页...\n",
      "   正在请求: https://scrm.h1cd.com/admin/billings/Lists_37.html\n",
      "   ✅ 本页提取 20 条记录\n",
      "🔄 正在处理第 38/53 页...\n",
      "   正在请求: https://scrm.h1cd.com/admin/billings/Lists_38.html\n",
      "   ✅ 本页提取 20 条记录\n",
      "🔄 正在处理第 39/53 页...\n",
      "   正在请求: https://scrm.h1cd.com/admin/billings/Lists_39.html\n",
      "   ✅ 本页提取 20 条记录\n",
      "🔄 正在处理第 40/53 页...\n",
      "   正在请求: https://scrm.h1cd.com/admin/billings/Lists_40.html\n",
      "   ✅ 本页提取 20 条记录\n",
      "🔄 正在处理第 41/53 页...\n",
      "   正在请求: https://scrm.h1cd.com/admin/billings/Lists_41.html\n",
      "   ✅ 本页提取 20 条记录\n",
      "🔄 正在处理第 42/53 页...\n",
      "   正在请求: https://scrm.h1cd.com/admin/billings/Lists_42.html\n",
      "   ✅ 本页提取 20 条记录\n",
      "🔄 正在处理第 43/53 页...\n",
      "   正在请求: https://scrm.h1cd.com/admin/billings/Lists_43.html\n",
      "   ✅ 本页提取 20 条记录\n",
      "🔄 正在处理第 44/53 页...\n",
      "   正在请求: https://scrm.h1cd.com/admin/billings/Lists_44.html\n",
      "   ✅ 本页提取 20 条记录\n",
      "🔄 正在处理第 45/53 页...\n",
      "   正在请求: https://scrm.h1cd.com/admin/billings/Lists_45.html\n",
      "   ✅ 本页提取 20 条记录\n",
      "🔄 正在处理第 46/53 页...\n",
      "   正在请求: https://scrm.h1cd.com/admin/billings/Lists_46.html\n",
      "   ✅ 本页提取 20 条记录\n",
      "🔄 正在处理第 47/53 页...\n",
      "   正在请求: https://scrm.h1cd.com/admin/billings/Lists_47.html\n",
      "   ✅ 本页提取 20 条记录\n",
      "🔄 正在处理第 48/53 页...\n",
      "   正在请求: https://scrm.h1cd.com/admin/billings/Lists_48.html\n",
      "   ✅ 本页提取 20 条记录\n",
      "🔄 正在处理第 49/53 页...\n",
      "   正在请求: https://scrm.h1cd.com/admin/billings/Lists_49.html\n",
      "   ✅ 本页提取 20 条记录\n",
      "🔄 正在处理第 50/53 页...\n",
      "   正在请求: https://scrm.h1cd.com/admin/billings/Lists_50.html\n",
      "   ✅ 本页提取 20 条记录\n",
      "🔄 正在处理第 51/53 页...\n",
      "   正在请求: https://scrm.h1cd.com/admin/billings/Lists_51.html\n",
      "   ✅ 本页提取 20 条记录\n",
      "🔄 正在处理第 52/53 页...\n",
      "   正在请求: https://scrm.h1cd.com/admin/billings/Lists_52.html\n",
      "   ✅ 本页提取 20 条记录\n",
      "🔄 正在处理第 53/53 页...\n",
      "   正在请求: https://scrm.h1cd.com/admin/billings/Lists_53.html\n",
      "   ✅ 本页提取 8 条记录\n",
      "\n",
      "==============================\n",
      "✅ 导出成功！\n",
      "📁 文件路径: D:\\Idea Project\\F6+宜搭+其它(1)\\张阳脚本\\文件输出\\维修记录_完美导出版.xlsx\n",
      "📈 总记录数: 1048\n",
      "==============================\n"
     ]
    }
   ],
   "source": [
    "import requests\n",
    "import pandas as pd\n",
    "from bs4 import BeautifulSoup\n",
    "import os\n",
    "import re\n",
    "import time\n",
    "\n",
    "# ===================== 固定配置 =====================\n",
    "# 注意：URL 中的 'Lists' 首字母大写，需与服务器严格一致\n",
    "BASE_URL = \"https://scrm.h1cd.com/admin/billings/Lists.html\"\n",
    "# 假设分页是通过 URL 参数或路径变化，这里根据你的代码逻辑保留路径变化模式\n",
    "# 如果实际是分页参数 (如 ?page=2)，请修改 get_html 函数\n",
    "BASE_URL_PATTERN = \"https://scrm.h1cd.com/admin/billings/Lists_{}.html\"\n",
    "\n",
    "OUTPUT_DIR = r\"D:\\Idea Project\\F6+宜搭+其它(1)\\张阳脚本\\文件输出\"\n",
    "OUTPUT_FILE = os.path.join(OUTPUT_DIR, \"维修记录_完美导出版.xlsx\")\n",
    "\n",
    "# 请求头 (完全同步你的 curl)\n",
    "HEADERS = {\n",
    "    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',\n",
    "    'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',\n",
    "    'Connection': 'keep-alive',\n",
    "    # Referer 已更新\n",
    "    'Referer': 'https://scrm.h1cd.com/admin/billings/Lists.html?storeId=0&type=&receive_by=&is_out=&is_end=&timeStart=2022-02-01&timeEnd=&search=&status=0',\n",
    "    'Sec-Fetch-Dest': 'iframe',\n",
    "    'Sec-Fetch-Mode': 'navigate',\n",
    "    'Sec-Fetch-Site': 'same-origin',\n",
    "    'Sec-Fetch-User': '?1',\n",
    "    'Upgrade-Insecure-Requests': '1',\n",
    "    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/146.0.0.0 Safari/537.36 Edg/146.0.0.0',\n",
    "    'sec-ch-ua': '\"Chromium\";v=\"146\", \"Not-A.Brand\";v=\"24\", \"Microsoft Edge\";v=\"146\"',\n",
    "    'sec-ch-ua-mobile': '?0',\n",
    "    'sec-ch-ua-platform': '\"Windows\"',\n",
    "}\n",
    "\n",
    "# Cookies (直接从 curl 提取，字典格式)\n",
    "COOKIES = {\n",
    "    'showSmsActivity': '1',\n",
    "    'showEasyMoney': '1',\n",
    "    'LOGIN_URL': 'https%3A%2F%2Fscrm.h1cd.com%2Flogin-h1cd.html',\n",
    "    'adminpd': 'BNB%2Bpx2I%2B92MtZBN1vVyPt7A%2B3eKA3uAnIzdLP%2FD%2FBw%3D',\n",
    "    'adminun': '18530760062',\n",
    "    'uid': '10407',\n",
    "    'PHPSESSID': '7v127mqdfnqa7rgcrlifksrt3t'\n",
    "}\n",
    "\n",
    "# 查询参数 (已同步 curl 中的 timeStart=2022-02-01)\n",
    "PARAMS = {\n",
    "    'storeId': '0',\n",
    "    'type': '',\n",
    "    'receive_by': '',\n",
    "    'is_out': '',\n",
    "    'is_end': '',\n",
    "    'timeStart': '2022-02-01',\n",
    "    'timeEnd': '',\n",
    "    'search': '',\n",
    "    'status': '0'\n",
    "}\n",
    "\n",
    "# 定义表头 (根据你代码中的逻辑，共23列)\n",
    "HEADERS_LIST = [\n",
    "    '勾选框', '序号', '工单号', '工单内容', '车辆信息', '车主信息',\n",
    "    '工单金额', '开单时间', '接车人', '施工人员', '销售人员',\n",
    "    '预收款', '开单备注', '结算信息', '预计交车时间', '车架号',\n",
    "    '进厂公里', '下次保养公里', '下次保养时间', '是否出库',\n",
    "    '是否完工', '状态', '操作'\n",
    "]\n",
    "\n",
    "# =====================================================\n",
    "\n",
    "def get_html(page):\n",
    "    \"\"\"获取指定页面的HTML内容\"\"\"\n",
    "    try:\n",
    "        if page == 1:\n",
    "            url = BASE_URL\n",
    "        else:\n",
    "            # 适配分页 URL 格式：Lists_2.html, Lists_3.html ...\n",
    "            url = BASE_URL_PATTERN.format(page)\n",
    "\n",
    "        print(f\"   正在请求: {url}\")\n",
    "\n",
    "        r = requests.get(\n",
    "            url,\n",
    "            headers=HEADERS,\n",
    "            cookies=COOKIES,\n",
    "            params=PARAMS, # 参数会自动拼接到 URL 后\n",
    "            timeout=30,\n",
    "            verify=True # 默认验证 SSL，如果证书有问题可改为 False\n",
    "        )\n",
    "\n",
    "        # 检查是否被重定向到登录页 (通过检查 URL 或 内容)\n",
    "        if 'login' in r.url.lower() or '登录' in r.text[:2000]:\n",
    "            print(\"   ⚠️ 检测到可能已退出登录或 Cookie 过期！\")\n",
    "            return None\n",
    "\n",
    "        r.raise_for_status()\n",
    "        r.encoding = 'utf-8'\n",
    "        return r.text\n",
    "    except Exception as e:\n",
    "        print(f\"   ❌ 获取第{page}页失败: {str(e)}\")\n",
    "        return None\n",
    "\n",
    "def parse_table(html):\n",
    "    \"\"\"解析HTML表格，提取所有字段\"\"\"\n",
    "    soup = BeautifulSoup(html, 'html.parser')\n",
    "\n",
    "    # 尝试寻找表格，增加对 tbody 的兼容\n",
    "    table = soup.find('table', class_='table')\n",
    "    if not table:\n",
    "        # 尝试没有 class 的 table 或者 id\n",
    "        table = soup.find('table')\n",
    "\n",
    "    if not table:\n",
    "        return []\n",
    "\n",
    "    result = []\n",
    "\n",
    "    # 查找数据行，类名可能是 'order_item' 或在 tbody 中\n",
    "    # 先找 tbody，再找 tr\n",
    "    tbody = table.find('tbody')\n",
    "    if tbody:\n",
    "        rows = tbody.find_all('tr', class_='order_item')\n",
    "        if not rows:\n",
    "            rows = tbody.find_all('tr') # 如果没有特定类名，取所有行\n",
    "    else:\n",
    "        rows = table.find_all('tr', class_='order_item')\n",
    "        if not rows:\n",
    "            rows = table.find_all('tr')\n",
    "\n",
    "    # 过滤掉表头行 (如果包含 th 标签)\n",
    "    data_rows = []\n",
    "    for row in rows:\n",
    "        if row.find('th'):\n",
    "            continue\n",
    "        data_rows.append(row)\n",
    "\n",
    "    for row in data_rows:\n",
    "        tds = row.find_all('td')\n",
    "\n",
    "        # 动态判断列数，如果列数过少则跳过 (可能是空行)\n",
    "        if len(tds) < 5:\n",
    "            continue\n",
    "\n",
    "        row_data = []\n",
    "        for i, td in enumerate(tds):\n",
    "            text = td.get_text(strip=True, separator='\\n')\n",
    "\n",
    "            # 特殊处理逻辑\n",
    "            if i == 4:  # 车辆信息：提取车牌号\n",
    "                car_match = re.search(r'([京津沪渝冀豫云辽黑湘皖鲁新苏浙赣鄂桂甘晋蒙陕吉闽贵粤青藏川宁琼][A-Z][A-Z0-9]{5,6})', text)\n",
    "                if car_match:\n",
    "                    row_data.append(car_match.group(1))\n",
    "                else:\n",
    "                    row_data.append(text)\n",
    "            elif i == 2:  # 工单号：尝试提取纯数字\n",
    "                order_match = re.search(r'(\\d{8,})', text) # 放宽数字长度限制\n",
    "                if order_match:\n",
    "                    row_data.append(order_match.group(1))\n",
    "                else:\n",
    "                    row_data.append(text.split('\\n')[0].strip())\n",
    "            else:\n",
    "                clean_text = text.replace('\\n', ' | ').replace('\\r', '')\n",
    "                row_data.append(clean_text)\n",
    "\n",
    "        # 如果列数少于表头，补齐空字符串；如果多于表头，截断\n",
    "        if len(row_data) < len(HEADERS_LIST):\n",
    "            row_data.extend([''] * (len(HEADERS_LIST) - len(row_data)))\n",
    "        elif len(row_data) > len(HEADERS_LIST):\n",
    "            row_data = row_data[:len(HEADERS_LIST)]\n",
    "\n",
    "        result.append(row_data)\n",
    "\n",
    "    return result\n",
    "\n",
    "def get_total_pages(html):\n",
    "    \"\"\"从第一页 HTML 中分析总页数\"\"\"\n",
    "    if not html:\n",
    "        return 1\n",
    "\n",
    "    # 常见分页文本模式\n",
    "    patterns = [\n",
    "        r'共\\s*(\\d+)\\s*页',\n",
    "        r'共\\s*(\\d+)\\s*条.*?(\\d+)\\s*页', # 共XX条 XX页\n",
    "        r'页码\\s*\\d+/(\\d+)',\n",
    "        r'1/(\\d+)',\n",
    "        r'of\\s+(\\d+)\\s*pages' # 英文模式\n",
    "    ]\n",
    "\n",
    "    for pattern in patterns:\n",
    "        match = re.search(pattern, html)\n",
    "        if match:\n",
    "            # 取最后一个匹配组作为页数 (针对第二条正则)\n",
    "            page_num = match.group(match.lastindex)\n",
    "            try:\n",
    "                return int(page_num)\n",
    "            except ValueError:\n",
    "                continue\n",
    "\n",
    "    # 如果正则没匹配到，尝试找分页按钮数量估算 (保守估计)\n",
    "    soup = BeautifulSoup(html, 'html.parser')\n",
    "    pagination = soup.find('div', class_='dataTables_wrapper') or soup.find('ul', class_='pagination')\n",
    "    if pagination:\n",
    "        # 简单策略：如果找不到具体数字，先假设只有1页，避免报错，或者你可以手动设置一个最大值\n",
    "        # 这里返回 1，并在主循环中增加“如果下一页为空则停止”的逻辑更安全\n",
    "        print(\"   ⚠️ 未明确检测到总页数，将尝试逐页爬取直到无数据。\")\n",
    "        return 999 # 设置一个较大的上限，依靠内容为空来停止\n",
    "\n",
    "    return 1\n",
    "\n",
    "def main():\n",
    "    print(\"🔧 开始导出维修记录...\")\n",
    "\n",
    "    # 确保输出目录存在\n",
    "    if not os.path.exists(OUTPUT_DIR):\n",
    "        try:\n",
    "            os.makedirs(OUTPUT_DIR)\n",
    "            print(f\"✅ 创建输出目录: {OUTPUT_DIR}\")\n",
    "        except Exception as e:\n",
    "            print(f\"❌ 无法创建目录: {e}\")\n",
    "            return\n",
    "\n",
    "    # 1. 获取第一页以确定总页数\n",
    "    print(\"📄 正在获取第 1 页以分析页数...\")\n",
    "    first_html = get_html(1)\n",
    "    if not first_html:\n",
    "        print(\"❌ 无法获取第一页，请检查 Cookie 是否过期或网络设置。\")\n",
    "        return\n",
    "\n",
    "    total_pages = get_total_pages(first_html)\n",
    "    print(f\"📊 预估总页数: {total_pages}\")\n",
    "\n",
    "    all_data = []\n",
    "\n",
    "    # 2. 循环爬取\n",
    "    current_page = 1\n",
    "    while current_page <= total_pages:\n",
    "        time.sleep(0.5)\n",
    "        print(f\"🔄 正在处理第 {current_page}/{total_pages} 页...\")\n",
    "\n",
    "        if current_page == 1:\n",
    "            html = first_html\n",
    "        else:\n",
    "            html = get_html(current_page)\n",
    "\n",
    "        if not html:\n",
    "            print(f\"⚠️ 第{current_page}页获取失败或为空，停止爬取。\")\n",
    "            break\n",
    "\n",
    "        page_data = parse_table(html)\n",
    "\n",
    "        if not page_data:\n",
    "            print(f\"⚠️ 第{current_page}页解析不到数据，可能已到达最后一页。\")\n",
    "            break\n",
    "\n",
    "        all_data.extend(page_data)\n",
    "        print(f\"   ✅ 本页提取 {len(page_data)} 条记录\")\n",
    "\n",
    "        # 简单的反爬延时\n",
    "        time.sleep(1)\n",
    "\n",
    "        current_page += 1\n",
    "\n",
    "    if not all_data:\n",
    "        print(\"❌ 未获取到任何有效数据。\")\n",
    "        return\n",
    "\n",
    "    # 3. 保存数据\n",
    "    try:\n",
    "        df = pd.DataFrame(all_data, columns=HEADERS_LIST)\n",
    "\n",
    "        # 保存 Excel\n",
    "        df.to_excel(OUTPUT_FILE, index=False, engine='openpyxl')\n",
    "\n",
    "        print(\"\\n\" + \"=\"*30)\n",
    "        print(\"✅ 导出成功！\")\n",
    "        print(f\"📁 文件路径: {OUTPUT_FILE}\")\n",
    "        print(f\"📈 总记录数: {len(df)}\")\n",
    "        print(\"=\"*30)\n",
    "\n",
    "    except Exception as e:\n",
    "        print(f\"❌ 保存 Excel 失败: {e}\")\n",
    "        # 降级保存 CSV\n",
    "        csv_file = OUTPUT_FILE.replace('.xlsx', '.csv')\n",
    "        try:\n",
    "            df.to_csv(csv_file, index=False, encoding='utf-8-sig')\n",
    "            print(f\"📌 已降级保存为 CSV: {csv_file}\")\n",
    "        except Exception as ce:\n",
    "            print(f\"❌ 保存 CSV 也失败: {ce}\")\n",
    "\n",
    "if __name__ == '__main__':\n",
    "    # 依赖检查\n",
    "    required_packages = ['requests', 'pandas', 'bs4', 'openpyxl']\n",
    "    missing = []\n",
    "    for pkg in required_packages:\n",
    "        try:\n",
    "            __import__(pkg)\n",
    "        except ImportError:\n",
    "            missing.append(pkg)\n",
    "\n",
    "    if missing:\n",
    "        print(f\"❌ 缺少必要的库: {', '.join(missing)}\")\n",
    "        print(\"💡 请运行以下命令安装: pip install \" + \" \".join(missing))\n",
    "    else:\n",
    "        main()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "b3decf1d",
   "metadata": {},
   "source": [
    "# 会员卡信息导出\n",
    "\n",
    "从 H1 系统导出会员卡信息（储值卡、套餐卡等），自动分页爬取并做数据规范化处理。\n",
    "\n",
    "> ⚠️ **注意**：H1系统导出的原始数据格式不规范（如姓名和手机号混在同一字段、操作列包含按钮文本等），脚本已内置清洗逻辑。\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9ab86773",
   "metadata": {},
   "outputs": [],
   "source": [
    "import requests\n",
    "import pandas as pd\n",
    "from bs4 import BeautifulSoup\n",
    "import os\n",
    "import re\n",
    "import time\n",
    "from datetime import datetime\n",
    "\n",
    "# ===================== 【配置区】 =====================\n",
    "# Cookie（请根据实际情况更新，登录后从浏览器DevTools复制）\n",
    "COOKIES = {\n",
    "    'showSmsActivity': '1',\n",
    "    'showEasyMoney': '1',\n",
    "    'LOGIN_URL': 'https%3A%2F%2Fscrm.h1cd.com%2Flogin-h1cd.html',\n",
    "    'adminpd': 'jVISiRrtcJplFhLoCuUIxK9XG5ekdfwzq%2B0y482ZKxE%3D',\n",
    "    'adminun': '15224781773',\n",
    "    'uid': '10291',\n",
    "    'PHPSESSID': 'nbn58laakng0rv5iqln82a6qpu',\n",
    "}\n",
    "\n",
    "HEADERS = {\n",
    "    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',\n",
    "    'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',\n",
    "    'Connection': 'keep-alive',\n",
    "    'Referer': 'https://scrm.h1cd.com/admin/members/cards.html',\n",
    "    'Sec-Fetch-Dest': 'iframe',\n",
    "    'Sec-Fetch-Mode': 'navigate',\n",
    "    'Sec-Fetch-Site': 'same-origin',\n",
    "    'Sec-Fetch-User': '?1',\n",
    "    'Upgrade-Insecure-Requests': '1',\n",
    "    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/146.0.0.0 Safari/537.36 Edg/146.0.0.0',\n",
    "    'sec-ch-ua': '\"Chromium\";v=\"146\", \"Not-A.Brand\";v=\"24\", \"Microsoft Edge\";v=\"146\"',\n",
    "    'sec-ch-ua-mobile': '?0',\n",
    "    'sec-ch-ua-platform': '\"Windows\"',\n",
    "}\n",
    "\n",
    "# 查询参数\n",
    "PARAMS = {\n",
    "    'type': '',\n",
    "    'expired': '',\n",
    "    'storeId': '0',\n",
    "    'search': '',\n",
    "}\n",
    "\n",
    "# 输出目录\n",
    "OUTPUT_DIR = r\"D:\\Idea Project\\F6+宜搭+其它(1)\\张阳脚本\\文件输出\"\n",
    "# =====================================================\n",
    "\n",
    "\n",
    "def get_page_html(page_num):\n",
    "    \"\"\"获取指定页面的HTML内容\"\"\"\n",
    "    try:\n",
    "        if page_num == 1:\n",
    "            url = \"https://scrm.h1cd.com/admin/members/cards.html\"\n",
    "        else:\n",
    "            url = f\"https://scrm.h1cd.com/admin/members/cards_{page_num}.html\"\n",
    "\n",
    "        r = requests.get(url, headers=HEADERS, cookies=COOKIES, params=PARAMS, timeout=30)\n",
    "\n",
    "        # 检查是否被重定向到登录页\n",
    "        if 'login' in r.url.lower() or '登录' in r.text[:2000]:\n",
    "            print(f\"   ⚠️ 第{page_num}页检测到跳转登录，Cookie可能已失效。\")\n",
    "            return None\n",
    "\n",
    "        r.raise_for_status()\n",
    "        r.encoding = 'utf-8'\n",
    "        return r.text\n",
    "    except Exception as e:\n",
    "        print(f\"   ❌ 第{page_num}页请求失败: {str(e)}\")\n",
    "        return None\n",
    "\n",
    "\n",
    "def parse_cards_table(html):\n",
    "    \"\"\"\n",
    "    解析会员卡HTML表格，提取数据。\n",
    "    \n",
    "    H1系统会员卡页面特点：\n",
    "    - 部分单元格包含多行信息（用<br>分隔），如姓名和手机号在同一格\n",
    "    - 操作列包含按钮文本需要过滤\n",
    "    \"\"\"\n",
    "    soup = BeautifulSoup(html, 'html.parser')\n",
    "    table = soup.find('table', class_='table')\n",
    "    if not table:\n",
    "        table = soup.find(\"table\")\n",
    "    if not table:\n",
    "        return [], []\n",
    "\n",
    "    # 提取表头\n",
    "    header = []\n",
    "    thead = table.find(\"thead\")\n",
    "    if thead:\n",
    "        ths = thead.find_all('th')\n",
    "        header = [th.get_text(strip=True) for th in ths]\n",
    "\n",
    "    if not header:\n",
    "        first_tr = table.find(\"tr\")\n",
    "        if first_tr:\n",
    "            ths = first_tr.find_all('th')\n",
    "            if ths:\n",
    "                header = [th.get_text(strip=True) for th in ths]\n",
    "\n",
    "    # 提取数据行\n",
    "    tbody = table.find(\"tbody\")\n",
    "    rows = tbody.find_all(\"tr\") if tbody else table.find_all(\"tr\")\n",
    "\n",
    "    data_rows = []\n",
    "    for tr in rows:\n",
    "        if tr.find(\"th\"):\n",
    "            continue\n",
    "        tds = tr.find_all('td')\n",
    "        if not tds or len(tds) < 3:\n",
    "            continue\n",
    "\n",
    "        row_data = []\n",
    "        for td in tds:\n",
    "            text = td.get_text(separator='|', strip=True)\n",
    "            text = re.sub(r'\\s+', ' ', text)\n",
    "            row_data.append(text.strip())\n",
    "\n",
    "        if any(row_data):\n",
    "            data_rows.append(row_data)\n",
    "\n",
    "    return header, data_rows\n",
    "\n",
    "\n",
    "def normalize_dataframe(df):\n",
    "    \"\"\"\n",
    "    对整个DataFrame进行规范化处理。\n",
    "    处理H1系统导出数据不规范的情况：\n",
    "    1. 去重\n",
    "    2. 拆分姓名+手机号合并字段\n",
    "    3. 清理数值列\n",
    "    4. 去除操作列和按钮文本残留\n",
    "    \"\"\"\n",
    "    # 去除完全重复的行\n",
    "    before_count = len(df)\n",
    "    df = df.drop_duplicates()\n",
    "    after_count = len(df)\n",
    "    if before_count != after_count:\n",
    "        print(f\"   🔍 去重：{before_count} 条 → {after_count} 条（去除 {before_count - after_count} 条重复）\")\n",
    "\n",
    "    # 拆分合并列（如\"会员名\"列中同时包含姓名和手机号）\n",
    "    for col in df.columns:\n",
    "        if any(kw in col for kw in [\"会员名\", \"姓名\", \"客户名称\", \"车主\"]):\n",
    "            # 检测该列是否同时包含姓名和手机号\n",
    "            sample = df[col].astype(str).head(20)\n",
    "            has_phone = sample.apply(lambda x: bool(re.search(r'1[3-9]\\d{9}', x))).any()\n",
    "            if has_phone and '手机号' not in df.columns:\n",
    "                df[\"客户名称\"] = df[col].apply(\n",
    "                    lambda x: re.sub(r\"1[3-9]\\d{9}\", \"\", str(x)).replace(\"|\", \"\").strip()\n",
    "                )\n",
    "                df[\"手机号\"] = df[col].apply(\n",
    "                    lambda x: (re.search(r\"1[3-9]\\d{9}\", str(x)).group() if re.search(r\"1[3-9]\\d{9}\", str(x)) else \"\")\n",
    "                )\n",
    "\n",
    "    # 清理数值列\n",
    "    for col in df.columns:\n",
    "        if any(kw in col for kw in [\"余额\", \"充值\", \"消费\", \"金额\"]):\n",
    "            df[col] = df[col].astype(str).apply(\n",
    "                lambda x: (re.search(r\"[\\d.]+\", str(x).replace(\",\", \"\")).group() if re.search(r\"[\\d.]+\", str(x).replace(\",\", \"\")) else x)\n",
    "            )\n",
    "\n",
    "    # 清理操作列\n",
    "    cols_to_drop = [col for col in df.columns if any(kw in col for kw in [\"操作\", \"选择\", \"勾选\"])]\n",
    "    if cols_to_drop:\n",
    "        df = df.drop(columns=cols_to_drop)\n",
    "\n",
    "    # 清理所有列中的按钮文本残留\n",
    "    btn_patterns = r\"(查看详情|编辑|删除|充值记录|消费记录|详情|迁移|查看)\"\n",
    "    for col in df.columns:\n",
    "        df[col] = df[col].astype(str).apply(\n",
    "            lambda x: re.sub(btn_patterns, \"\", str(x)).strip()\n",
    "        )\n",
    "        df[col] = df[col].replace({'nan': '', 'None': ''})\n",
    "\n",
    "    return df\n",
    "\n",
    "\n",
    "def get_max_page(html):\n",
    "    \"\"\"从页面中提取最大页数\"\"\"\n",
    "    if not html:\n",
    "        return 1\n",
    "\n",
    "    soup = BeautifulSoup(html, 'html.parser')\n",
    "    text = soup.get_text()\n",
    "\n",
    "    match = re.search(r'共\\s*(\\d+)\\s*页', text)\n",
    "    if match:\n",
    "        return int(match.group(1))\n",
    "\n",
    "    match = re.search(r'页\\s*1/(\\d+)', text)\n",
    "    if match:\n",
    "        return int(match.group(1))\n",
    "\n",
    "    page_links = soup.find_all('a', href=re.compile(r'cards_\\d+\\.html'))\n",
    "    if page_links:\n",
    "        max_page = 1\n",
    "        for a in page_links:\n",
    "            num_match = re.search(r'cards_(\\d+)\\.html', a.get('href', ''))\n",
    "            if num_match:\n",
    "                max_page = max(max_page, int(num_match.group(1)))\n",
    "        return max_page\n",
    "\n",
    "    return 1\n",
    "\n",
    "\n",
    "def main():\n",
    "    print(\"=\" * 50)\n",
    "    print(\"开始爬取 H1系统 会员卡信息...\")\n",
    "    print(f\"当前 StoreID: {PARAMS['storeId']}\")\n",
    "    print(\"=\" * 50)\n",
    "\n",
    "    # 获取第一页，确定总页数\n",
    "    print(\"正在获取总页数...\")\n",
    "    first_html = get_page_html(1)\n",
    "    if not first_html:\n",
    "        print(\"❌ 无法获取第一页数据，请检查 Cookie 或网络。\")\n",
    "        return\n",
    "\n",
    "    max_page = get_max_page(first_html)\n",
    "    print(f\"✅ 成功获取最大页数：{max_page}\")\n",
    "\n",
    "    # 爬取所有页面\n",
    "    all_data = []\n",
    "    merged_header = []\n",
    "\n",
    "    for page in range(1, max_page + 1):\n",
    "        print(f\"正在爬取第 {page}/{max_page} 页...\")\n",
    "\n",
    "        if page == 1:\n",
    "            html = first_html\n",
    "        else:\n",
    "            html = get_page_html(page)\n",
    "            if not html:\n",
    "                print(f\"❌ 第 {page} 页获取失败，跳过。\")\n",
    "                continue\n",
    "\n",
    "        header, rows = parse_cards_table(html)\n",
    "\n",
    "        if not header and not rows:\n",
    "            print(f\"⚠️ 第 {page} 页未解析到表格数据。\")\n",
    "            continue\n",
    "\n",
    "        # 合并表头（不同页的表头可能略有差异）\n",
    "        if header:\n",
    "            for h in header:\n",
    "                if h not in merged_header:\n",
    "                    merged_header.append(h)\n",
    "\n",
    "        all_data.extend(rows)\n",
    "\n",
    "        # 请求间隔\n",
    "        if page < max_page:\n",
    "            time.sleep(0.3)\n",
    "\n",
    "    if not all_data:\n",
    "        print(\"\\n❌ 未获取到任何数据，请检查 Cookie 或网络。\")\n",
    "        return\n",
    "\n",
    "    print(f\"\\n✅ 爬取完成，共获取 {len(all_data)} 条原始记录\")\n",
    "\n",
    "    # 构建DataFrame\n",
    "    if merged_header:\n",
    "        normalized_rows = []\n",
    "        width = len(merged_header)\n",
    "        for row in all_data:\n",
    "            if len(row) < width:\n",
    "                row = row + [\"\"] * (width - len(row))\n",
    "            elif len(row) > width:\n",
    "                row = row[:width]\n",
    "            normalized_rows.append(row)\n",
    "        df = pd.DataFrame(normalized_rows, columns=merged_header)\n",
    "    else:\n",
    "        df = pd.DataFrame(all_data)\n",
    "\n",
    "    print(f\"📋 原始列名：{list(df.columns)}\")\n",
    "    print(f\"📋 原始数据前3行：\")\n",
    "    print(df.head(3).to_string())\n",
    "\n",
    "    # 数据规范化处理\n",
    "    print(\"\\n开始数据规范化处理...\")\n",
    "    df = normalize_dataframe(df)\n",
    "\n",
    "    # 保存结果\n",
    "    os.makedirs(OUTPUT_DIR, exist_ok=True)\n",
    "    time_str = datetime.now().strftime(\"%Y%m%d_%H%M%S\")\n",
    "    filename = f\"H1会员卡信息_{time_str}.xlsx\"\n",
    "    filepath = os.path.join(OUTPUT_DIR, filename)\n",
    "\n",
    "    try:\n",
    "        df.to_excel(filepath, index=False)\n",
    "        print(\"=\" * 50)\n",
    "        print(\"✅ 导出完成！\")\n",
    "        print(f\"📊 最终有效条数：{len(df)}\")\n",
    "        print(f\"📁 已保存到：{filepath}\")\n",
    "        print(\"=\" * 50)\n",
    "    except Exception as e:\n",
    "        print(f\"❌ 保存Excel失败: {e}\")\n",
    "        csv_path = filepath.replace(\".xlsx\", \".csv\")\n",
    "        df.to_csv(csv_path, index=False, encoding=\"utf-8-sig\")\n",
    "        print(f\"💡 已转为 CSV 保存至：{csv_path}\")\n",
    "\n",
    "\n",
    "if __name__ == '__main__':\n",
    "    main()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "4c658267",
   "metadata": {},
   "outputs": [],
   "source": [
    "# 运行导出\n",
    "main()\n"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "F6+宜搭+其它",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.13.11"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}