{ "cells": [ { "cell_type": "markdown", "id": "2be2e0c2", "metadata": {}, "source": [ "# 车辆信息" ] }, { "cell_type": "code", "id": "70a8b0da", "metadata": { "ExecuteTime": { "end_time": "2026-03-25T03:51:53.352551400Z", "start_time": "2026-03-25T03:51:31.198595700Z" } }, "source": [ "import requests\n", "from bs4 import BeautifulSoup\n", "import re\n", "import os\n", "import csv\n", "from datetime import datetime\n", "\n", "# ===================== 配置区 =====================\n", "# 已替换为你curl中的最新cookies\n", "COOKIES = (\n", " 'showSmsActivity=1; '\n", " 'showEasyMoney=1; '\n", " 'LOGIN_URL=https%3A%2F%2Fscrm.h1cd.com%2Flogin-h1cd.html; '\n", " 'adminun=18530760062; '\n", " 'uid=10407; '\n", " 'PHPSESSID=7v127mqdfnqa7rgcrlifksrt3t'\n", ")\n", "\n", "BASE_URL = \"https://scrm.h1cd.com/admin/members/carlist\"\n", "HEADERS = {\n", " \"Accept\": \"text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7\",\n", " \"Accept-Language\": \"zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6\",\n", " \"Connection\": \"keep-alive\",\n", " \"Referer\": \"https://scrm.h1cd.com/admin/members/carlist.html\",\n", " \"User-Agent\": \"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/146.0.0.0 Safari/537.36 Edg/146.0.0.0\",\n", " \"sec-ch-ua\": '\"Chromium\";v=\"146\", \"Not-A.Brand\";v=\"24\", \"Microsoft Edge\";v=\"146\"',\n", " \"sec-ch-ua-mobile\": \"?0\",\n", " \"sec-ch-ua-platform\": '\"Windows\"',\n", " \"Sec-Fetch-Dest\": \"iframe\",\n", " \"Sec-Fetch-Mode\": \"navigate\",\n", " \"Sec-Fetch-Site\": \"same-origin\",\n", " \"Sec-Fetch-User\": \"?1\",\n", " \"Upgrade-Insecure-Requests\": \"1\"\n", "}\n", "\n", "# 解析Cookie\n", "def parse_cookies(cookie_str):\n", " cookie_dict = {}\n", " for item in cookie_str.split(\"; \"):\n", " if \"=\" in item:\n", " key, value = item.split(\"=\", 1)\n", " cookie_dict[key] = value\n", " return cookie_dict\n", "\n", "# ===================== 核心:数据拆分处理(已修复 br 换行) =====================\n", "def process_row(row):\n", " if len(row) < 5:\n", " return row\n", " \n", " new_row = row.copy()\n", "\n", " # ========== 1. 处理 C列:客户名称 + 手机号(按
拆分,已修复) ==========\n", " c_text = new_row.pop(2)\n", " \n", " # 处理换行:统一空格/换行/空白符,提取 名称 + 手机号\n", " # 先把所有空白(包括HTML换行产生的空格)替换成统一分隔符\n", " c_text = re.sub(r'\\s+', ' ', c_text).strip()\n", " \n", " name, phone = \"\", \"\"\n", " # 匹配手机号(11位数字),自动分割\n", " phone_match = re.search(r'1[3-9]\\d{9}', c_text)\n", " if phone_match:\n", " phone = phone_match.group()\n", " name = c_text.replace(phone, '').strip()\n", " else:\n", " name = c_text.strip()\n", "\n", " # ========== 2. 处理 E列:颜色/发动机/车架号(已去除前缀) ==========\n", " e_text = new_row.pop(3)\n", " color, engine, vin = \"\", \"\", \"\"\n", "\n", " # 去掉所有中文前缀\n", " e_text = re.sub(r'颜\\s*色\\s*:', '', e_text)\n", " e_text = re.sub(r'发动机\\s*:', '', e_text)\n", " e_text = re.sub(r'车架号\\s*:', '', e_text)\n", " \n", " # 按 / 拆分\n", " if \"/\" in e_text:\n", " parts = e_text.split(\"/\", 2)\n", " color = parts[0].strip()\n", " engine = parts[1].strip() if len(parts) > 1 else \"\"\n", " vin = parts[2].strip() if len(parts) > 2 else \"\"\n", " else:\n", " color = e_text.strip()\n", "\n", " # 插入拆分后字段\n", " new_row.insert(2, name)\n", " new_row.insert(3, phone)\n", " new_row.insert(4, color)\n", " new_row.insert(5, engine)\n", " new_row.insert(6, vin)\n", "\n", " return new_row\n", "\n", "# 获取单页表格数据 + 自动拆分(保留HTML内换行,解决
问题)\n", "def get_page_data(page_num):\n", " if page_num == 1:\n", " url = f\"{BASE_URL}.html\"\n", " else:\n", " url = f\"{BASE_URL}_{page_num}.html\"\n", "\n", " try:\n", " resp = requests.get(url, headers=HEADERS, cookies=parse_cookies(COOKIES), timeout=15)\n", " resp.raise_for_status()\n", " soup = BeautifulSoup(resp.text, \"html.parser\")\n", " table = soup.find(\"table\")\n", " if not table:\n", " return []\n", "\n", " rows = table.find_all(\"tr\")\n", " data = []\n", " for tr in rows:\n", " tds = tr.find_all(\"td\")\n", " cols = []\n", " for td in tds:\n", " # 关键:保留
产生的换行,不直接压缩\n", " text = td.get_text(separator=\" \", strip=True)\n", " cols.append(text)\n", " \n", " if cols:\n", " processed_cols = process_row(cols)\n", " data.append(processed_cols)\n", " return data\n", "\n", " except Exception as e:\n", " print(f\"第{page_num}页请求失败:{e}\")\n", " return []\n", "\n", "# 获取总页数\n", "def get_total_pages():\n", " try:\n", " resp = requests.get(f\"{BASE_URL}.html\", headers=HEADERS, cookies=parse_cookies(COOKIES), timeout=10)\n", " soup = BeautifulSoup(resp.text, \"html.parser\")\n", " page_text = soup.get_text()\n", " match = re.search(r\"共\\s*(\\d+)\\s*页\", page_text)\n", " if match:\n", " return int(match.group(1))\n", "\n", " page_links = soup.find_all(\"a\", href=re.compile(r\"carlist_\\d+\\.html\"))\n", " max_page = 1\n", " for a in page_links:\n", " num_match = re.search(r\"carlist_(\\d+)\\.html\", a[\"href\"])\n", " if num_match:\n", " max_page = max(max_page, int(num_match.group(1)))\n", " return max_page\n", " except:\n", " return 1\n", "\n", "# ===================== 保存到桌面(自动加表头) =====================\n", "def save_to_desktop(all_data):\n", " desktop_path = os.path.join(os.path.expanduser(\"~\"), \"Desktop\")\n", " time_str = datetime.now().strftime(\"%Y%m%d_%H%M%S\")\n", " \n", " # 表头(自动添加)\n", " header = [\n", " \"序号\", \"车牌号\", \"客户名称\", \"客户手机号\",\n", " \"颜色\", \"发动机号\", \"车架号\", \"里程数\", \"其他列1\", \"其他列2\"\n", " ]\n", " \n", " csv_file = os.path.join(desktop_path, f\"车辆数据_已拆分_{time_str}1.csv\")\n", " with open(csv_file, \"w\", newline=\"\", encoding=\"utf-8-sig\") as f:\n", " writer = csv.writer(f)\n", " writer.writerow(header) # 写入标题\n", " writer.writerows(all_data)\n", " \n", " txt_file = os.path.join(desktop_path, f\"车辆数据_已拆分_{time_str}.txt\")\n", " with open(txt_file, \"w\", encoding=\"utf-8\") as f:\n", " f.write(\" | \".join(header) + \"\\n\")\n", " for row in all_data:\n", " f.write(\" | \".join(row) + \"\\n\")\n", " \n", " print(f\"\\n✅ 文件已保存到桌面:\")\n", " print(f\"📊 Excel文件:{csv_file}\")\n", " print(f\"📄 文本文件:{txt_file}\")\n", "\n", "# ===================== 主程序 =====================\n", "if __name__ == \"__main__\":\n", " print(\"正在获取总页数...\")\n", " total_pages = get_total_pages()\n", " # total_pages = 1\n", " print(f\"总页数:{total_pages} 页\")\n", "\n", " all_data = []\n", " for page in range(1, total_pages + 1):\n", " print(f\"正在爬取第 {page}/{total_pages} 页...\")\n", " page_data = get_page_data(page)\n", " if page_data:\n", " all_data.extend(page_data)\n", "\n", " print(f\"\\n========== 爬取完成 ==========\")\n", " print(f\"总计数据:{len(all_data)} 行\")\n", "\n", " save_to_desktop(all_data)\n", "\n", " print(\"\\n前5行数据预览:\")\n", " for i, row in enumerate(all_data[:5]):\n", " print(i+1, row)" ], "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "正在获取总页数...\n", "总页数:34 页\n", "正在爬取第 1/34 页...\n", "正在爬取第 2/34 页...\n", "正在爬取第 3/34 页...\n", "正在爬取第 4/34 页...\n", "正在爬取第 5/34 页...\n", "正在爬取第 6/34 页...\n", "正在爬取第 7/34 页...\n", "正在爬取第 8/34 页...\n", "正在爬取第 9/34 页...\n", "正在爬取第 10/34 页...\n", "正在爬取第 11/34 页...\n", "正在爬取第 12/34 页...\n", "正在爬取第 13/34 页...\n", "正在爬取第 14/34 页...\n", "正在爬取第 15/34 页...\n", "正在爬取第 16/34 页...\n", "正在爬取第 17/34 页...\n", "正在爬取第 18/34 页...\n", "正在爬取第 19/34 页...\n", "正在爬取第 20/34 页...\n", "正在爬取第 21/34 页...\n", "正在爬取第 22/34 页...\n", "正在爬取第 23/34 页...\n", "正在爬取第 24/34 页...\n", "正在爬取第 25/34 页...\n", "正在爬取第 26/34 页...\n", "正在爬取第 27/34 页...\n", "正在爬取第 28/34 页...\n", "正在爬取第 29/34 页...\n", "正在爬取第 30/34 页...\n", "正在爬取第 31/34 页...\n", "正在爬取第 32/34 页...\n", "正在爬取第 33/34 页...\n", "正在爬取第 34/34 页...\n", "\n", "========== 爬取完成 ==========\n", "总计数据:666 行\n", "\n", "✅ 文件已保存到桌面:\n", "📊 Excel文件:C:\\Users\\hp_z66\\Desktop\\车辆数据_已拆分_20260325_1151531.csv\n", "📄 文本文件:C:\\Users\\hp_z66\\Desktop\\车辆数据_已拆分_20260325_115153.txt\n", "\n", "前5行数据预览:\n", "1 ['1', '豫NA477R', '卢忠厚', '', '', '', '/', '', '118933km', '', '', '消费记录 编辑 迁移 删除']\n", "2 ['2', '豫NF3722', '刘建利', '', '', '', '/', '', '198609km', '', '', '消费记录 编辑 迁移 删除']\n", "3 ['3', '豫N13B58', '石', '15090629992', '', '', '/', '', '22462km', '', '', '消费记录 编辑 迁移 删除']\n", "4 ['4', '京PYB297', '科迪黄青春', '', '', '', '/', '', '119584km', '', '', '消费记录 编辑 迁移 删除']\n", "5 ['5', '豫NN982M', '大众', '', '', '', '/', '', '197504km', '', '', '消费记录 编辑 迁移 删除']\n" ] } ], "execution_count": 1 }, { "cell_type": "markdown", "id": "6c370235", "metadata": {}, "source": [ "# 库存信息" ] }, { "cell_type": "code", "id": "5392bfc0", "metadata": { "ExecuteTime": { "end_time": "2026-03-25T03:53:43.600296300Z", "start_time": "2026-03-25T03:53:18.688209100Z" } }, "source": [ "import requests\n", "import pandas as pd\n", "from bs4 import BeautifulSoup\n", "import os\n", "import re\n", "\n", "# ===================== 【配置区】 =====================\n", "# 【已更新】根据 curl 命令中的 -b 参数替换为最新 Cookie\n", "COOKIE = (\n", " 'showSmsActivity=1; '\n", " 'showEasyMoney=1; '\n", " 'LOGIN_URL=https%3A%2F%2Fscrm.h1cd.com%2Flogin-h1cd.html; '\n", " 'adminun=18530760062; '\n", " 'uid=10407; '\n", " 'PHPSESSID=7v127mqdfnqa7rgcrlifksrt3t'\n", ")\n", "\n", "# 【已更新】根据 curl 命令中的 URL 参数更新筛选条件\n", "BASE_PARAMS = {\n", " 'storeId': '13435',\n", " 'house_id': '9079',\n", " 'repositoryName': '',\n", " 'first_type': '',\n", " 'product_type': '',\n", " 'status': '',\n", " 'searchType': '1',\n", " 'product_name': ''\n", "}\n", "\n", "# 分页格式保持不动\n", "BASE_URL = \"https://scrm.h1cd.com/admin/billings/stores-search__{}.html\"\n", "# ======================================================\n", "\n", "HEADERS = {\n", " 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',\n", " 'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',\n", " 'Connection': 'keep-alive',\n", " 'Cookie': COOKIE,\n", " 'Referer': 'https://scrm.h1cd.com/admin/billings/stores-search.html?storeId=13435&house_id=9079&repositoryName=&first_type=&product_type=&status=&searchType=1&product_name=',\n", " 'Sec-Fetch-Dest': 'iframe',\n", " 'Sec-Fetch-Mode': 'navigate',\n", " 'Sec-Fetch-Site': 'same-origin',\n", " 'Sec-Fetch-User': '?1',\n", " 'Upgrade-Insecure-Requests': '1',\n", " 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/146.0.0.0 Safari/537.36 Edg/146.0.0.0',\n", " 'sec-ch-ua': '\"Chromium\";v=\"146\", \"Not-A.Brand\";v=\"24\", \"Microsoft Edge\";v=\"146\"',\n", " 'sec-ch-ua-mobile': '?0',\n", " 'sec-ch-ua-platform': '\"Windows\"',\n", "}\n", "\n", "\n", "def get_page_html(page_num):\n", " \"\"\"获取单页HTML\"\"\"\n", " try:\n", " if page_num == 1:\n", " url = \"https://scrm.h1cd.com/admin/billings/stores-search.html\"\n", " else:\n", " url = BASE_URL.format(page_num)\n", "\n", " res = requests.get(url, headers=HEADERS, params=BASE_PARAMS, timeout=15)\n", " res.raise_for_status()\n", "\n", " if 'login.html' in res.url:\n", " print(f\"⚠️ 第{page_num}页检测到跳转登录,Cookie可能已失效。\")\n", " return None\n", "\n", " return res.text\n", " except Exception as e:\n", " print(f\"第{page_num}页请求失败:{e}\")\n", " return None\n", "\n", "\n", "def parse_table(html):\n", " \"\"\"解析表格数据\"\"\"\n", " soup = BeautifulSoup(html, 'html.parser')\n", " table = soup.find('table', class_='table-hover')\n", " if not table:\n", " table = soup.find('table')\n", "\n", " if not table:\n", " return []\n", "\n", " rows = []\n", " thead = table.find('thead')\n", " if thead:\n", " headers = [th.get_text(strip=True) for th in thead.find_all('th')]\n", " if headers:\n", " rows.append(headers)\n", "\n", " tbody = table.find('tbody')\n", " target_rows = tbody.find_all('tr') if tbody else table.find_all('tr')\n", "\n", " for tr in target_rows:\n", " tds = tr.find_all('td')\n", " if not tds:\n", " continue\n", " row = [td.get_text(strip=True) for td in tds]\n", " if any(row):\n", " rows.append(row)\n", "\n", " return rows\n", "\n", "\n", "def get_max_page():\n", " \"\"\"从页面提取最大页数\"\"\"\n", " html = get_page_html(1)\n", " if not html:\n", " return 1\n", "\n", " soup = BeautifulSoup(html, 'html.parser')\n", " page_info = soup.find('div', class_='dataTables_paginate')\n", " if not page_info:\n", " text = soup.get_text()\n", " match = re.search(r'页\\s*1/(\\d+)', text)\n", " if match:\n", " return int(match.group(1))\n", " return 1\n", "\n", " text = page_info.get_text()\n", " match = re.search(r'页\\s*1/(\\d+)', text)\n", " if match:\n", " return int(match.group(1))\n", "\n", " match = re.search(r'1\\s*/\\s*(\\d+)', text)\n", " if match:\n", " return int(match.group(1))\n", "\n", " return 1\n", "\n", "\n", "def main():\n", " print(\"=\" * 50)\n", " print(\"开始爬取库存数据...\")\n", " print(f\"当前 StoreID: {BASE_PARAMS['storeId']}\")\n", " print(f\"当前 HouseID: {BASE_PARAMS['house_id']}\")\n", " print(\"=\" * 50)\n", "\n", " max_page = get_max_page()\n", " if max_page == 1:\n", " print(\"⚠️ 仅检测到 1 页,可能是解析失败或确实只有一页。\")\n", "\n", " print(f\"✅ 成功获取最大页数:{max_page}\")\n", "\n", " all_data = []\n", " for page in range(1, max_page + 1):\n", " print(f\"正在爬取第 {page}/{max_page} 页...\")\n", " html = get_page_html(page)\n", " if not html:\n", " print(f\"❌ 第 {page} 页获取失败,跳过。\")\n", " continue\n", "\n", " rows = parse_table(html)\n", " if not rows:\n", " print(f\"⚠️ 第 {page} 页未解析到表格数据。\")\n", " continue\n", "\n", " if page == 1:\n", " all_data.extend(rows)\n", " else:\n", " if len(rows) > 0 and rows[0] == all_data[0]:\n", " all_data.extend(rows[1:])\n", " else:\n", " all_data.extend(rows)\n", "\n", " if not all_data:\n", " print(\"\\n❌ 未获取到任何数据,请检查 Cookie 或网络。\")\n", " return\n", "\n", " desktop = os.path.join(os.path.expanduser(\"~\"), \"Desktop\")\n", " df = pd.DataFrame(all_data[1:], columns=all_data[0])\n", "\n", " # 按配件编码去重\n", " target_col = None\n", " for col in df.columns:\n", " if '配件编码' in col or '编码' in col:\n", " target_col = col\n", " break\n", "\n", " if target_col:\n", " total_before = len(df)\n", " df = df.drop_duplicates(subset=[target_col], keep='first')\n", " total_after = len(df)\n", " print(f\"\\n🔍 去重完成 (基于列: {target_col}):原始 {total_before} 条 → 去重后 {total_after} 条\")\n", " else:\n", " print(f\"\\n⚠️ 未找到包含【配件编码】的列,跳过去重。当前列名:{list(df.columns)}\")\n", "\n", " filename = f\"库存数据_{BASE_PARAMS['storeId']}_去重版1.xlsx\"\n", " path = os.path.join(desktop, filename)\n", "\n", " try:\n", " df.to_excel(path, index=False)\n", " print(\"=\" * 50)\n", " print(f\"✅ 爬取 + 去重 完成!\")\n", " print(f\"📊 最终有效条数:{len(df)}\")\n", " print(f\"📁 已保存到桌面:{filename}\")\n", " print(\"=\" * 50)\n", " except Exception as e:\n", " print(f\"❌ 保存文件失败:{e}\")\n", " csv_path = os.path.join(desktop, filename.replace('.xlsx', '.csv'))\n", " df.to_csv(csv_path, index=False, encoding='utf-8-sig')\n", " print(f\"💡 已尝试转为 CSV 保存至:{csv_path}\")\n", "\n", "\n", "if __name__ == '__main__':\n", " main()" ], "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "==================================================\n", "开始爬取库存数据...\n", "当前 StoreID: 13435\n", "当前 HouseID: 9079\n", "==================================================\n", "✅ 成功获取最大页数:40\n", "正在爬取第 1/40 页...\n", "正在爬取第 2/40 页...\n", "正在爬取第 3/40 页...\n", "正在爬取第 4/40 页...\n", "正在爬取第 5/40 页...\n", "正在爬取第 6/40 页...\n", "正在爬取第 7/40 页...\n", "正在爬取第 8/40 页...\n", "正在爬取第 9/40 页...\n", "正在爬取第 10/40 页...\n", "正在爬取第 11/40 页...\n", "正在爬取第 12/40 页...\n", "正在爬取第 13/40 页...\n", "正在爬取第 14/40 页...\n", "正在爬取第 15/40 页...\n", "正在爬取第 16/40 页...\n", "正在爬取第 17/40 页...\n", "正在爬取第 18/40 页...\n", "正在爬取第 19/40 页...\n", "正在爬取第 20/40 页...\n", "正在爬取第 21/40 页...\n", "正在爬取第 22/40 页...\n", "正在爬取第 23/40 页...\n", "正在爬取第 24/40 页...\n", "正在爬取第 25/40 页...\n", "正在爬取第 26/40 页...\n", "正在爬取第 27/40 页...\n", "正在爬取第 28/40 页...\n", "正在爬取第 29/40 页...\n", "正在爬取第 30/40 页...\n", "正在爬取第 31/40 页...\n", "正在爬取第 32/40 页...\n", "正在爬取第 33/40 页...\n", "正在爬取第 34/40 页...\n", "正在爬取第 35/40 页...\n", "正在爬取第 36/40 页...\n", "正在爬取第 37/40 页...\n", "正在爬取第 38/40 页...\n", "正在爬取第 39/40 页...\n", "正在爬取第 40/40 页...\n", "\n", "🔍 去重完成 (基于列: 配件编码):原始 782 条 → 去重后 782 条\n", "==================================================\n", "✅ 爬取 + 去重 完成!\n", "📊 最终有效条数:782\n", "📁 已保存到桌面:库存数据_13435_去重版1.xlsx\n", "==================================================\n" ] } ], "execution_count": 2 }, { "cell_type": "markdown", "id": "4b11e6fa", "metadata": {}, "source": [ "# 历史维修记录\n", "开单管理" ] }, { "metadata": { "ExecuteTime": { "end_time": "2026-03-25T03:58:48.443601700Z", "start_time": "2026-03-25T03:56:48.226330400Z" } }, "cell_type": "code", "source": [ "import requests\n", "import pandas as pd\n", "from bs4 import BeautifulSoup\n", "import os\n", "import re\n", "import time\n", "\n", "# ===================== 固定配置 =====================\n", "# 注意:URL 中的 'Lists' 首字母大写,需与服务器严格一致\n", "BASE_URL = \"https://scrm.h1cd.com/admin/billings/Lists.html\"\n", "# 假设分页是通过 URL 参数或路径变化,这里根据你的代码逻辑保留路径变化模式\n", "# 如果实际是分页参数 (如 ?page=2),请修改 get_html 函数\n", "BASE_URL_PATTERN = \"https://scrm.h1cd.com/admin/billings/Lists_{}.html\"\n", "\n", "OUTPUT_DIR = r\"D:\\Idea Project\\F6+宜搭+其它(1)\\张阳脚本\\文件输出\"\n", "OUTPUT_FILE = os.path.join(OUTPUT_DIR, \"维修记录_完美导出版.xlsx\")\n", "\n", "# 请求头 (完全同步你的 curl)\n", "HEADERS = {\n", " 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',\n", " 'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',\n", " 'Connection': 'keep-alive',\n", " # Referer 已更新\n", " 'Referer': 'https://scrm.h1cd.com/admin/billings/Lists.html?storeId=0&type=&receive_by=&is_out=&is_end=&timeStart=2022-02-01&timeEnd=&search=&status=0',\n", " 'Sec-Fetch-Dest': 'iframe',\n", " 'Sec-Fetch-Mode': 'navigate',\n", " 'Sec-Fetch-Site': 'same-origin',\n", " 'Sec-Fetch-User': '?1',\n", " 'Upgrade-Insecure-Requests': '1',\n", " 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/146.0.0.0 Safari/537.36 Edg/146.0.0.0',\n", " 'sec-ch-ua': '\"Chromium\";v=\"146\", \"Not-A.Brand\";v=\"24\", \"Microsoft Edge\";v=\"146\"',\n", " 'sec-ch-ua-mobile': '?0',\n", " 'sec-ch-ua-platform': '\"Windows\"',\n", "}\n", "\n", "# Cookies (直接从 curl 提取,字典格式)\n", "COOKIES = {\n", " 'showSmsActivity': '1',\n", " 'showEasyMoney': '1',\n", " 'LOGIN_URL': 'https%3A%2F%2Fscrm.h1cd.com%2Flogin-h1cd.html',\n", " 'adminpd': 'BNB%2Bpx2I%2B92MtZBN1vVyPt7A%2B3eKA3uAnIzdLP%2FD%2FBw%3D',\n", " 'adminun': '18530760062',\n", " 'uid': '10407',\n", " 'PHPSESSID': '7v127mqdfnqa7rgcrlifksrt3t'\n", "}\n", "\n", "# 查询参数 (已同步 curl 中的 timeStart=2022-02-01)\n", "PARAMS = {\n", " 'storeId': '0',\n", " 'type': '',\n", " 'receive_by': '',\n", " 'is_out': '',\n", " 'is_end': '',\n", " 'timeStart': '2022-02-01',\n", " 'timeEnd': '',\n", " 'search': '',\n", " 'status': '0'\n", "}\n", "\n", "# 定义表头 (根据你代码中的逻辑,共23列)\n", "HEADERS_LIST = [\n", " '勾选框', '序号', '工单号', '工单内容', '车辆信息', '车主信息',\n", " '工单金额', '开单时间', '接车人', '施工人员', '销售人员',\n", " '预收款', '开单备注', '结算信息', '预计交车时间', '车架号',\n", " '进厂公里', '下次保养公里', '下次保养时间', '是否出库',\n", " '是否完工', '状态', '操作'\n", "]\n", "\n", "# =====================================================\n", "\n", "def get_html(page):\n", " \"\"\"获取指定页面的HTML内容\"\"\"\n", " try:\n", " if page == 1:\n", " url = BASE_URL\n", " else:\n", " # 适配分页 URL 格式:Lists_2.html, Lists_3.html ...\n", " url = BASE_URL_PATTERN.format(page)\n", "\n", " print(f\" 正在请求: {url}\")\n", "\n", " r = requests.get(\n", " url,\n", " headers=HEADERS,\n", " cookies=COOKIES,\n", " params=PARAMS, # 参数会自动拼接到 URL 后\n", " timeout=30,\n", " verify=True # 默认验证 SSL,如果证书有问题可改为 False\n", " )\n", "\n", " # 检查是否被重定向到登录页 (通过检查 URL 或 内容)\n", " if 'login' in r.url.lower() or '登录' in r.text[:2000]:\n", " print(\" ⚠️ 检测到可能已退出登录或 Cookie 过期!\")\n", " return None\n", "\n", " r.raise_for_status()\n", " r.encoding = 'utf-8'\n", " return r.text\n", " except Exception as e:\n", " print(f\" ❌ 获取第{page}页失败: {str(e)}\")\n", " return None\n", "\n", "def parse_table(html):\n", " \"\"\"解析HTML表格,提取所有字段\"\"\"\n", " soup = BeautifulSoup(html, 'html.parser')\n", "\n", " # 尝试寻找表格,增加对 tbody 的兼容\n", " table = soup.find('table', class_='table')\n", " if not table:\n", " # 尝试没有 class 的 table 或者 id\n", " table = soup.find('table')\n", "\n", " if not table:\n", " return []\n", "\n", " result = []\n", "\n", " # 查找数据行,类名可能是 'order_item' 或在 tbody 中\n", " # 先找 tbody,再找 tr\n", " tbody = table.find('tbody')\n", " if tbody:\n", " rows = tbody.find_all('tr', class_='order_item')\n", " if not rows:\n", " rows = tbody.find_all('tr') # 如果没有特定类名,取所有行\n", " else:\n", " rows = table.find_all('tr', class_='order_item')\n", " if not rows:\n", " rows = table.find_all('tr')\n", "\n", " # 过滤掉表头行 (如果包含 th 标签)\n", " data_rows = []\n", " for row in rows:\n", " if row.find('th'):\n", " continue\n", " data_rows.append(row)\n", "\n", " for row in data_rows:\n", " tds = row.find_all('td')\n", "\n", " # 动态判断列数,如果列数过少则跳过 (可能是空行)\n", " if len(tds) < 5:\n", " continue\n", "\n", " row_data = []\n", " for i, td in enumerate(tds):\n", " text = td.get_text(strip=True, separator='\\n')\n", "\n", " # 特殊处理逻辑\n", " if i == 4: # 车辆信息:提取车牌号\n", " car_match = re.search(r'([京津沪渝冀豫云辽黑湘皖鲁新苏浙赣鄂桂甘晋蒙陕吉闽贵粤青藏川宁琼][A-Z][A-Z0-9]{5,6})', text)\n", " if car_match:\n", " row_data.append(car_match.group(1))\n", " else:\n", " row_data.append(text)\n", " elif i == 2: # 工单号:尝试提取纯数字\n", " order_match = re.search(r'(\\d{8,})', text) # 放宽数字长度限制\n", " if order_match:\n", " row_data.append(order_match.group(1))\n", " else:\n", " row_data.append(text.split('\\n')[0].strip())\n", " else:\n", " clean_text = text.replace('\\n', ' | ').replace('\\r', '')\n", " row_data.append(clean_text)\n", "\n", " # 如果列数少于表头,补齐空字符串;如果多于表头,截断\n", " if len(row_data) < len(HEADERS_LIST):\n", " row_data.extend([''] * (len(HEADERS_LIST) - len(row_data)))\n", " elif len(row_data) > len(HEADERS_LIST):\n", " row_data = row_data[:len(HEADERS_LIST)]\n", "\n", " result.append(row_data)\n", "\n", " return result\n", "\n", "def get_total_pages(html):\n", " \"\"\"从第一页 HTML 中分析总页数\"\"\"\n", " if not html:\n", " return 1\n", "\n", " # 常见分页文本模式\n", " patterns = [\n", " r'共\\s*(\\d+)\\s*页',\n", " r'共\\s*(\\d+)\\s*条.*?(\\d+)\\s*页', # 共XX条 XX页\n", " r'页码\\s*\\d+/(\\d+)',\n", " r'1/(\\d+)',\n", " r'of\\s+(\\d+)\\s*pages' # 英文模式\n", " ]\n", "\n", " for pattern in patterns:\n", " match = re.search(pattern, html)\n", " if match:\n", " # 取最后一个匹配组作为页数 (针对第二条正则)\n", " page_num = match.group(match.lastindex)\n", " try:\n", " return int(page_num)\n", " except ValueError:\n", " continue\n", "\n", " # 如果正则没匹配到,尝试找分页按钮数量估算 (保守估计)\n", " soup = BeautifulSoup(html, 'html.parser')\n", " pagination = soup.find('div', class_='dataTables_wrapper') or soup.find('ul', class_='pagination')\n", " if pagination:\n", " # 简单策略:如果找不到具体数字,先假设只有1页,避免报错,或者你可以手动设置一个最大值\n", " # 这里返回 1,并在主循环中增加“如果下一页为空则停止”的逻辑更安全\n", " print(\" ⚠️ 未明确检测到总页数,将尝试逐页爬取直到无数据。\")\n", " return 999 # 设置一个较大的上限,依靠内容为空来停止\n", "\n", " return 1\n", "\n", "def main():\n", " print(\"🔧 开始导出维修记录...\")\n", "\n", " # 确保输出目录存在\n", " if not os.path.exists(OUTPUT_DIR):\n", " try:\n", " os.makedirs(OUTPUT_DIR)\n", " print(f\"✅ 创建输出目录: {OUTPUT_DIR}\")\n", " except Exception as e:\n", " print(f\"❌ 无法创建目录: {e}\")\n", " return\n", "\n", " # 1. 获取第一页以确定总页数\n", " print(\"📄 正在获取第 1 页以分析页数...\")\n", " first_html = get_html(1)\n", " if not first_html:\n", " print(\"❌ 无法获取第一页,请检查 Cookie 是否过期或网络设置。\")\n", " return\n", "\n", " total_pages = get_total_pages(first_html)\n", " print(f\"📊 预估总页数: {total_pages}\")\n", "\n", " all_data = []\n", "\n", " # 2. 循环爬取\n", " current_page = 1\n", " while current_page <= total_pages:\n", " time.sleep(0.5)\n", " print(f\"🔄 正在处理第 {current_page}/{total_pages} 页...\")\n", "\n", " if current_page == 1:\n", " html = first_html\n", " else:\n", " html = get_html(current_page)\n", "\n", " if not html:\n", " print(f\"⚠️ 第{current_page}页获取失败或为空,停止爬取。\")\n", " break\n", "\n", " page_data = parse_table(html)\n", "\n", " if not page_data:\n", " print(f\"⚠️ 第{current_page}页解析不到数据,可能已到达最后一页。\")\n", " break\n", "\n", " all_data.extend(page_data)\n", " print(f\" ✅ 本页提取 {len(page_data)} 条记录\")\n", "\n", " # 简单的反爬延时\n", " time.sleep(1)\n", "\n", " current_page += 1\n", "\n", " if not all_data:\n", " print(\"❌ 未获取到任何有效数据。\")\n", " return\n", "\n", " # 3. 保存数据\n", " try:\n", " df = pd.DataFrame(all_data, columns=HEADERS_LIST)\n", "\n", " # 保存 Excel\n", " df.to_excel(OUTPUT_FILE, index=False, engine='openpyxl')\n", "\n", " print(\"\\n\" + \"=\"*30)\n", " print(\"✅ 导出成功!\")\n", " print(f\"📁 文件路径: {OUTPUT_FILE}\")\n", " print(f\"📈 总记录数: {len(df)}\")\n", " print(\"=\"*30)\n", "\n", " except Exception as e:\n", " print(f\"❌ 保存 Excel 失败: {e}\")\n", " # 降级保存 CSV\n", " csv_file = OUTPUT_FILE.replace('.xlsx', '.csv')\n", " try:\n", " df.to_csv(csv_file, index=False, encoding='utf-8-sig')\n", " print(f\"📌 已降级保存为 CSV: {csv_file}\")\n", " except Exception as ce:\n", " print(f\"❌ 保存 CSV 也失败: {ce}\")\n", "\n", "if __name__ == '__main__':\n", " # 依赖检查\n", " required_packages = ['requests', 'pandas', 'bs4', 'openpyxl']\n", " missing = []\n", " for pkg in required_packages:\n", " try:\n", " __import__(pkg)\n", " except ImportError:\n", " missing.append(pkg)\n", "\n", " if missing:\n", " print(f\"❌ 缺少必要的库: {', '.join(missing)}\")\n", " print(\"💡 请运行以下命令安装: pip install \" + \" \".join(missing))\n", " else:\n", " main()" ], "id": "cbd4eeb0a30b3e15", "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "🔧 开始导出维修记录...\n", "📄 正在获取第 1 页以分析页数...\n", " 正在请求: https://scrm.h1cd.com/admin/billings/Lists.html\n", "📊 预估总页数: 53\n", "🔄 正在处理第 1/53 页...\n", " ✅ 本页提取 20 条记录\n", "🔄 正在处理第 2/53 页...\n", " 正在请求: https://scrm.h1cd.com/admin/billings/Lists_2.html\n", " ✅ 本页提取 20 条记录\n", "🔄 正在处理第 3/53 页...\n", " 正在请求: https://scrm.h1cd.com/admin/billings/Lists_3.html\n", " ✅ 本页提取 20 条记录\n", "🔄 正在处理第 4/53 页...\n", " 正在请求: https://scrm.h1cd.com/admin/billings/Lists_4.html\n", " ✅ 本页提取 20 条记录\n", "🔄 正在处理第 5/53 页...\n", " 正在请求: https://scrm.h1cd.com/admin/billings/Lists_5.html\n", " ✅ 本页提取 20 条记录\n", "🔄 正在处理第 6/53 页...\n", " 正在请求: https://scrm.h1cd.com/admin/billings/Lists_6.html\n", " ✅ 本页提取 20 条记录\n", "🔄 正在处理第 7/53 页...\n", " 正在请求: https://scrm.h1cd.com/admin/billings/Lists_7.html\n", " ✅ 本页提取 20 条记录\n", "🔄 正在处理第 8/53 页...\n", " 正在请求: https://scrm.h1cd.com/admin/billings/Lists_8.html\n", " ✅ 本页提取 20 条记录\n", "🔄 正在处理第 9/53 页...\n", " 正在请求: https://scrm.h1cd.com/admin/billings/Lists_9.html\n", " ✅ 本页提取 20 条记录\n", "🔄 正在处理第 10/53 页...\n", " 正在请求: https://scrm.h1cd.com/admin/billings/Lists_10.html\n", " ✅ 本页提取 20 条记录\n", "🔄 正在处理第 11/53 页...\n", " 正在请求: https://scrm.h1cd.com/admin/billings/Lists_11.html\n", " ✅ 本页提取 20 条记录\n", "🔄 正在处理第 12/53 页...\n", " 正在请求: https://scrm.h1cd.com/admin/billings/Lists_12.html\n", " ✅ 本页提取 20 条记录\n", "🔄 正在处理第 13/53 页...\n", " 正在请求: https://scrm.h1cd.com/admin/billings/Lists_13.html\n", " ✅ 本页提取 20 条记录\n", "🔄 正在处理第 14/53 页...\n", " 正在请求: https://scrm.h1cd.com/admin/billings/Lists_14.html\n", " ✅ 本页提取 20 条记录\n", "🔄 正在处理第 15/53 页...\n", " 正在请求: https://scrm.h1cd.com/admin/billings/Lists_15.html\n", " ✅ 本页提取 20 条记录\n", "🔄 正在处理第 16/53 页...\n", " 正在请求: https://scrm.h1cd.com/admin/billings/Lists_16.html\n", " ✅ 本页提取 20 条记录\n", "🔄 正在处理第 17/53 页...\n", " 正在请求: https://scrm.h1cd.com/admin/billings/Lists_17.html\n", " ✅ 本页提取 20 条记录\n", "🔄 正在处理第 18/53 页...\n", " 正在请求: https://scrm.h1cd.com/admin/billings/Lists_18.html\n", " ✅ 本页提取 20 条记录\n", "🔄 正在处理第 19/53 页...\n", " 正在请求: https://scrm.h1cd.com/admin/billings/Lists_19.html\n", " ✅ 本页提取 20 条记录\n", "🔄 正在处理第 20/53 页...\n", " 正在请求: https://scrm.h1cd.com/admin/billings/Lists_20.html\n", " ✅ 本页提取 20 条记录\n", "🔄 正在处理第 21/53 页...\n", " 正在请求: https://scrm.h1cd.com/admin/billings/Lists_21.html\n", " ✅ 本页提取 20 条记录\n", "🔄 正在处理第 22/53 页...\n", " 正在请求: https://scrm.h1cd.com/admin/billings/Lists_22.html\n", " ✅ 本页提取 20 条记录\n", "🔄 正在处理第 23/53 页...\n", " 正在请求: https://scrm.h1cd.com/admin/billings/Lists_23.html\n", " ✅ 本页提取 20 条记录\n", "🔄 正在处理第 24/53 页...\n", " 正在请求: https://scrm.h1cd.com/admin/billings/Lists_24.html\n", " ✅ 本页提取 20 条记录\n", "🔄 正在处理第 25/53 页...\n", " 正在请求: https://scrm.h1cd.com/admin/billings/Lists_25.html\n", " ✅ 本页提取 20 条记录\n", "🔄 正在处理第 26/53 页...\n", " 正在请求: https://scrm.h1cd.com/admin/billings/Lists_26.html\n", " ✅ 本页提取 20 条记录\n", "🔄 正在处理第 27/53 页...\n", " 正在请求: https://scrm.h1cd.com/admin/billings/Lists_27.html\n", " ✅ 本页提取 20 条记录\n", "🔄 正在处理第 28/53 页...\n", " 正在请求: https://scrm.h1cd.com/admin/billings/Lists_28.html\n", " ✅ 本页提取 20 条记录\n", "🔄 正在处理第 29/53 页...\n", " 正在请求: https://scrm.h1cd.com/admin/billings/Lists_29.html\n", " ✅ 本页提取 20 条记录\n", "🔄 正在处理第 30/53 页...\n", " 正在请求: https://scrm.h1cd.com/admin/billings/Lists_30.html\n", " ✅ 本页提取 20 条记录\n", "🔄 正在处理第 31/53 页...\n", " 正在请求: https://scrm.h1cd.com/admin/billings/Lists_31.html\n", " ✅ 本页提取 20 条记录\n", "🔄 正在处理第 32/53 页...\n", " 正在请求: https://scrm.h1cd.com/admin/billings/Lists_32.html\n", " ✅ 本页提取 20 条记录\n", "🔄 正在处理第 33/53 页...\n", " 正在请求: https://scrm.h1cd.com/admin/billings/Lists_33.html\n", " ✅ 本页提取 20 条记录\n", "🔄 正在处理第 34/53 页...\n", " 正在请求: https://scrm.h1cd.com/admin/billings/Lists_34.html\n", " ✅ 本页提取 20 条记录\n", "🔄 正在处理第 35/53 页...\n", " 正在请求: https://scrm.h1cd.com/admin/billings/Lists_35.html\n", " ✅ 本页提取 20 条记录\n", "🔄 正在处理第 36/53 页...\n", " 正在请求: https://scrm.h1cd.com/admin/billings/Lists_36.html\n", " ✅ 本页提取 20 条记录\n", "🔄 正在处理第 37/53 页...\n", " 正在请求: https://scrm.h1cd.com/admin/billings/Lists_37.html\n", " ✅ 本页提取 20 条记录\n", "🔄 正在处理第 38/53 页...\n", " 正在请求: https://scrm.h1cd.com/admin/billings/Lists_38.html\n", " ✅ 本页提取 20 条记录\n", "🔄 正在处理第 39/53 页...\n", " 正在请求: https://scrm.h1cd.com/admin/billings/Lists_39.html\n", " ✅ 本页提取 20 条记录\n", "🔄 正在处理第 40/53 页...\n", " 正在请求: https://scrm.h1cd.com/admin/billings/Lists_40.html\n", " ✅ 本页提取 20 条记录\n", "🔄 正在处理第 41/53 页...\n", " 正在请求: https://scrm.h1cd.com/admin/billings/Lists_41.html\n", " ✅ 本页提取 20 条记录\n", "🔄 正在处理第 42/53 页...\n", " 正在请求: https://scrm.h1cd.com/admin/billings/Lists_42.html\n", " ✅ 本页提取 20 条记录\n", "🔄 正在处理第 43/53 页...\n", " 正在请求: https://scrm.h1cd.com/admin/billings/Lists_43.html\n", " ✅ 本页提取 20 条记录\n", "🔄 正在处理第 44/53 页...\n", " 正在请求: https://scrm.h1cd.com/admin/billings/Lists_44.html\n", " ✅ 本页提取 20 条记录\n", "🔄 正在处理第 45/53 页...\n", " 正在请求: https://scrm.h1cd.com/admin/billings/Lists_45.html\n", " ✅ 本页提取 20 条记录\n", "🔄 正在处理第 46/53 页...\n", " 正在请求: https://scrm.h1cd.com/admin/billings/Lists_46.html\n", " ✅ 本页提取 20 条记录\n", "🔄 正在处理第 47/53 页...\n", " 正在请求: https://scrm.h1cd.com/admin/billings/Lists_47.html\n", " ✅ 本页提取 20 条记录\n", "🔄 正在处理第 48/53 页...\n", " 正在请求: https://scrm.h1cd.com/admin/billings/Lists_48.html\n", " ✅ 本页提取 20 条记录\n", "🔄 正在处理第 49/53 页...\n", " 正在请求: https://scrm.h1cd.com/admin/billings/Lists_49.html\n", " ✅ 本页提取 20 条记录\n", "🔄 正在处理第 50/53 页...\n", " 正在请求: https://scrm.h1cd.com/admin/billings/Lists_50.html\n", " ✅ 本页提取 20 条记录\n", "🔄 正在处理第 51/53 页...\n", " 正在请求: https://scrm.h1cd.com/admin/billings/Lists_51.html\n", " ✅ 本页提取 20 条记录\n", "🔄 正在处理第 52/53 页...\n", " 正在请求: https://scrm.h1cd.com/admin/billings/Lists_52.html\n", " ✅ 本页提取 20 条记录\n", "🔄 正在处理第 53/53 页...\n", " 正在请求: https://scrm.h1cd.com/admin/billings/Lists_53.html\n", " ✅ 本页提取 8 条记录\n", "\n", "==============================\n", "✅ 导出成功!\n", "📁 文件路径: D:\\Idea Project\\F6+宜搭+其它(1)\\张阳脚本\\文件输出\\维修记录_完美导出版.xlsx\n", "📈 总记录数: 1048\n", "==============================\n" ] } ], "execution_count": 3 } ], "metadata": { "kernelspec": { "display_name": "F6+宜搭+其它", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.13.11" } }, "nbformat": 4, "nbformat_minor": 5 }