Files
F6--/张阳脚本/竞品系统数据导出/H1车店数据导出.ipynb
T
2026-04-09 09:37:04 +08:00

806 lines
31 KiB
Plaintext
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
{
"cells": [
{
"cell_type": "markdown",
"id": "2be2e0c2",
"metadata": {},
"source": [
"# 车辆信息"
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "70a8b0da",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"正在获取总页数...\n",
"总页数:88 页\n",
"正在爬取第 1/88 页...\n",
"正在爬取第 2/88 页...\n",
"正在爬取第 3/88 页...\n",
"正在爬取第 4/88 页...\n",
"正在爬取第 5/88 页...\n",
"正在爬取第 6/88 页...\n",
"正在爬取第 7/88 页...\n",
"正在爬取第 8/88 页...\n",
"正在爬取第 9/88 页...\n",
"正在爬取第 10/88 页...\n",
"正在爬取第 11/88 页...\n",
"正在爬取第 12/88 页...\n",
"正在爬取第 13/88 页...\n",
"正在爬取第 14/88 页...\n",
"正在爬取第 15/88 页...\n",
"正在爬取第 16/88 页...\n",
"正在爬取第 17/88 页...\n",
"正在爬取第 18/88 页...\n",
"正在爬取第 19/88 页...\n",
"正在爬取第 20/88 页...\n",
"正在爬取第 21/88 页...\n",
"正在爬取第 22/88 页...\n",
"正在爬取第 23/88 页...\n",
"正在爬取第 24/88 页...\n",
"正在爬取第 25/88 页...\n",
"正在爬取第 26/88 页...\n",
"正在爬取第 27/88 页...\n",
"正在爬取第 28/88 页...\n",
"正在爬取第 29/88 页...\n",
"正在爬取第 30/88 页...\n",
"正在爬取第 31/88 页...\n",
"正在爬取第 32/88 页...\n",
"正在爬取第 33/88 页...\n",
"正在爬取第 34/88 页...\n",
"正在爬取第 35/88 页...\n",
"正在爬取第 36/88 页...\n",
"正在爬取第 37/88 页...\n",
"正在爬取第 38/88 页...\n",
"正在爬取第 39/88 页...\n",
"正在爬取第 40/88 页...\n",
"正在爬取第 41/88 页...\n",
"正在爬取第 42/88 页...\n",
"正在爬取第 43/88 页...\n",
"正在爬取第 44/88 页...\n",
"正在爬取第 45/88 页...\n",
"正在爬取第 46/88 页...\n",
"正在爬取第 47/88 页...\n",
"正在爬取第 48/88 页...\n",
"正在爬取第 49/88 页...\n",
"正在爬取第 50/88 页...\n",
"正在爬取第 51/88 页...\n",
"正在爬取第 52/88 页...\n",
"正在爬取第 53/88 页...\n",
"正在爬取第 54/88 页...\n",
"正在爬取第 55/88 页...\n",
"正在爬取第 56/88 页...\n",
"正在爬取第 57/88 页...\n",
"正在爬取第 58/88 页...\n",
"正在爬取第 59/88 页...\n",
"正在爬取第 60/88 页...\n",
"正在爬取第 61/88 页...\n",
"正在爬取第 62/88 页...\n",
"正在爬取第 63/88 页...\n",
"正在爬取第 64/88 页...\n",
"正在爬取第 65/88 页...\n",
"正在爬取第 66/88 页...\n",
"正在爬取第 67/88 页...\n",
"正在爬取第 68/88 页...\n",
"正在爬取第 69/88 页...\n",
"正在爬取第 70/88 页...\n",
"正在爬取第 71/88 页...\n",
"正在爬取第 72/88 页...\n",
"正在爬取第 73/88 页...\n",
"正在爬取第 74/88 页...\n",
"正在爬取第 75/88 页...\n",
"正在爬取第 76/88 页...\n",
"正在爬取第 77/88 页...\n",
"正在爬取第 78/88 页...\n",
"正在爬取第 79/88 页...\n",
"正在爬取第 80/88 页...\n",
"正在爬取第 81/88 页...\n",
"正在爬取第 82/88 页...\n",
"正在爬取第 83/88 页...\n",
"正在爬取第 84/88 页...\n",
"正在爬取第 85/88 页...\n",
"正在爬取第 86/88 页...\n",
"正在爬取第 87/88 页...\n",
"正在爬取第 88/88 页...\n",
"\n",
"========== 爬取完成 ==========\n",
"总计数据:1745 行\n",
"\n",
"✅ 文件已保存到桌面:\n",
"📊 Excel文件:C:\\Users\\CW\\Desktop\\车辆数据_已拆分_20260316_151600.csv\n",
"📄 文本文件:C:\\Users\\CW\\Desktop\\车辆数据_已拆分_20260316_151600.txt\n",
"\n",
"前5行数据预览:\n",
"1 ['1', '陕KF39335', '杜源', '', '', '', '/', '', '', '', '', '消费记录 编辑 迁移 删除']\n",
"2 ['2', '陕KB772U', '张子龙', '15686644333', '', '', '1F1FW1R68KFB41218', '', '', '', '', '消费记录 编辑 迁移 删除']\n",
"3 ['3', '陕KUY513', '高', '13474443858', '', '', 'LMGAT1L85P1320223', '', '24048km', '', '', '消费记录 编辑 迁移 删除']\n",
"4 ['4', '陕K55489', '刘永强', '15229629888', '', '', 'JTEJU9FJ5DK058155', '丰田普拉多(进口) > 2013款 4.0 自动 中东版', '', '', '', '消费记录 编辑 迁移 删除']\n",
"5 ['5', '陕K139B8', '郝波', '15929825811', '', '', 'LSVAA4182A2355722', '', '', '', '', '消费记录 编辑 迁移 删除']\n"
]
}
],
"source": [
"import requests\n",
"from bs4 import BeautifulSoup\n",
"import re\n",
"import os\n",
"import csv\n",
"from datetime import datetime\n",
"\n",
"# ===================== Configuration =====================\n",
"# Raw Cookie header of a logged-in browser session, e.g.\n",
"# \"PHPSESSID=...; adminun=...; uid=...\". It is read from the H1CD_COOKIES\n",
"# environment variable so that no credential is stored in the notebook.\n",
"COOKIES = os.environ.get(\"H1CD_COOKIES\", \"\")\n",
"if not COOKIES:\n",
"    print(\"⚠️ 未设置环境变量 H1CD_COOKIES,请求将不带登录会话\")\n",
"\n",
"# Listing URL without extension; page n (n > 1) is \"<BASE_URL>_<n>.html\".\n",
"BASE_URL = \"https://scrm.h1cd.com/admin/members/carlist\"\n",
"# Browser-like request headers sent with every request.\n",
"HEADERS = {\n",
"    \"Accept\": \"text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7\",\n",
"    \"Accept-Language\": \"zh-CN,zh;q=0.9\",\n",
"    \"Connection\": \"keep-alive\",\n",
"    \"Referer\": \"https://scrm.h1cd.com/admin/members/carlist.html\",\n",
"    \"User-Agent\": \"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/133.0.0.0 Safari/537.36\",\n",
"    \"sec-ch-ua\": '\"Not(A:Brand\";v=\"99\", \"Google Chrome\";v=\"133\", \"Chromium\";v=\"133\"',\n",
"    \"sec-ch-ua-mobile\": \"?0\",\n",
"    \"sec-ch-ua-platform\": '\"Windows\"',\n",
"    \"Sec-Fetch-Dest\": \"iframe\",\n",
"    \"Sec-Fetch-Mode\": \"navigate\",\n",
"    \"Sec-Fetch-Site\": \"same-origin\",\n",
"    \"Sec-Fetch-User\": \"?1\",\n",
"    \"Upgrade-Insecure-Requests\": \"1\"\n",
"}\n",
"\n",
"# Parse a raw Cookie header into a dict\n",
"def parse_cookies(cookie_str):\n",
"    \"\"\"Convert a raw Cookie header string into a {name: value} dict.\n",
"\n",
"    Pairs are separated by ';' with optional surrounding whitespace, so both\n",
"    \"a=1; b=2\" and \"a=1;b=2\" are accepted. Only the first '=' of a pair\n",
"    separates name from value. Fragments without '=' or without a name are\n",
"    ignored. None or an empty string yields an empty dict.\n",
"    \"\"\"\n",
"    cookie_dict = {}\n",
"    for item in (cookie_str or \"\").split(\";\"):\n",
"        key, sep, value = item.strip().partition(\"=\")\n",
"        key = key.strip()\n",
"        if sep and key:\n",
"            cookie_dict[key] = value\n",
"    return cookie_dict\n",
"\n",
"# ===================== Core: split combined cells into separate fields =====================\n",
"def process_row(row):\n",
"    \"\"\"Split the combined customer cell and vehicle cell of one table row.\n",
"\n",
"    Rows with fewer than 5 cells are returned unchanged (the same object).\n",
"    Otherwise cell 2 is split into name and phone, cell 4 into colour,\n",
"    engine number and VIN, and a new list is returned laid out as\n",
"    [row[0], row[1], name, phone, colour, engine, vin, row[3], *row[5:]].\n",
"    \"\"\"\n",
"    if len(row) < 5:\n",
"        return row\n",
"\n",
"    # ---- 1. Customer cell: name + mobile number ----\n",
"    # Collapse every whitespace run (including gaps left by line breaks) to one space.\n",
"    customer = re.sub(r'\\s+', ' ', row[2]).strip()\n",
"    found = re.search(r'1[3-9]\\d{9}', customer)\n",
"    # The first 11-digit mobile number is the phone; what remains is the name.\n",
"    phone = found.group() if found else \"\"\n",
"    name = customer.replace(phone, '').strip() if found else customer.strip()\n",
"\n",
"    # ---- 2. Vehicle cell: colour / engine number / VIN ----\n",
"    vehicle = row[4]\n",
"    # Remove the Chinese field labels so that only the values remain.\n",
"    for label in (r'颜\\s*色\\s*', r'发动机\\s*', r'车架号\\s*'):\n",
"        vehicle = re.sub(label, '', vehicle)\n",
"\n",
"    # At most three '/'-separated parts; missing trailing parts become \"\".\n",
"    pieces = [part.strip() for part in vehicle.split(\"/\", 2)]\n",
"    color, engine, vin = (pieces + [\"\", \"\"])[:3]\n",
"\n",
"    # Reassemble: the two combined cells are replaced by the five split fields.\n",
"    return row[:2] + [name, phone, color, engine, vin, row[3]] + row[5:]\n",
"\n",
"# Fetch one listing page and return its processed table rows\n",
"def get_page_data(page_num):\n",
"    \"\"\"Download listing page ``page_num`` and return its rows after ``process_row``.\n",
"\n",
"    Page 1 is \"<BASE_URL>.html\"; any other page is \"<BASE_URL>_<n>.html\".\n",
"    Returns an empty list when the page has no table, or when any exception\n",
"    occurs (the exception is printed, not raised).\n",
"    \"\"\"\n",
"    suffix = \"\" if page_num == 1 else f\"_{page_num}\"\n",
"    url = f\"{BASE_URL}{suffix}.html\"\n",
"\n",
"    try:\n",
"        resp = requests.get(url, headers=HEADERS, cookies=parse_cookies(COOKIES), timeout=15)\n",
"        resp.raise_for_status()\n",
"        table = BeautifulSoup(resp.text, \"html.parser\").find(\"table\")\n",
"        if not table:\n",
"            return []\n",
"\n",
"        data = []\n",
"        for tr in table.find_all(\"tr\"):\n",
"            # Text fragments inside a cell (e.g. around line breaks) are joined with one space.\n",
"            cols = [td.get_text(separator=\" \", strip=True) for td in tr.find_all(\"td\")]\n",
"            # Rows without td cells (such as header rows) are skipped.\n",
"            if cols:\n",
"                data.append(process_row(cols))\n",
"        return data\n",
"\n",
"    except Exception as e:\n",
"        print(f\"第{page_num}页请求失败:{e}\")\n",
"        return []\n",
"\n",
"# Determine the total number of listing pages\n",
"def get_total_pages():\n",
"    \"\"\"Return the number of pages of the vehicle listing.\n",
"\n",
"    The first page is fetched and searched for the text \"共 N 页\". If that\n",
"    text is absent, the largest page number found in \"carlist_<n>.html\"\n",
"    links is used instead. On any request or parsing error the problem is\n",
"    printed and 1 is returned, so the caller still exports the first page.\n",
"    \"\"\"\n",
"    try:\n",
"        resp = requests.get(f\"{BASE_URL}.html\", headers=HEADERS, cookies=parse_cookies(COOKIES), timeout=10)\n",
"        resp.raise_for_status()\n",
"        soup = BeautifulSoup(resp.text, \"html.parser\")\n",
"        match = re.search(r\"共\\s*(\\d+)\\s*页\", soup.get_text())\n",
"        if match:\n",
"            return int(match.group(1))\n",
"\n",
"        # Fallback: largest page number among the pagination links.\n",
"        max_page = 1\n",
"        for a in soup.find_all(\"a\", href=re.compile(r\"carlist_\\d+\\.html\")):\n",
"            num_match = re.search(r\"carlist_(\\d+)\\.html\", a[\"href\"])\n",
"            if num_match:\n",
"                max_page = max(max_page, int(num_match.group(1)))\n",
"        return max_page\n",
"    except Exception as e:\n",
"        # Catch Exception (not a bare except) so Ctrl+C still interrupts the run.\n",
"        print(f\"获取总页数失败,默认按 1 页处理:{e}\")\n",
"        return 1\n",
"\n",
"# ===================== Save to the desktop (header row added automatically) =====================\n",
"def save_to_desktop(all_data):\n",
"    \"\"\"Write ``all_data`` to a timestamped CSV file and TXT file on the Desktop.\n",
"\n",
"    ``all_data`` is a list of rows (lists of cell values). A header row is\n",
"    written first. The header is extended with \"其他列N\" titles when rows are\n",
"    wider than the fixed titles, so every column gets a title. The Desktop\n",
"    folder is created if it does not exist. The CSV uses UTF-8 with BOM; the\n",
"    TXT file joins cells with \" | \". Both paths are printed.\n",
"    \"\"\"\n",
"    desktop_path = os.path.join(os.path.expanduser(\"~\"), \"Desktop\")\n",
"    os.makedirs(desktop_path, exist_ok=True)\n",
"    time_str = datetime.now().strftime(\"%Y%m%d_%H%M%S\")\n",
"\n",
"    header = [\n",
"        \"序号\", \"车牌号\", \"客户名称\", \"客户手机号\",\n",
"        \"颜色\", \"发动机号\", \"车架号\", \"其他列1\", \"其他列2\", \"其他列3\"\n",
"    ]\n",
"    # Rows may carry more cells than the fixed titles; continue the\n",
"    # \"其他列\" numbering so that no column is left without a title.\n",
"    widest = max((len(row) for row in all_data), default=0)\n",
"    extra = widest - len(header)\n",
"    header += [f\"其他列{n}\" for n in range(4, 4 + extra)]\n",
"\n",
"    csv_file = os.path.join(desktop_path, f\"车辆数据_已拆分_{time_str}.csv\")\n",
"    with open(csv_file, \"w\", newline=\"\", encoding=\"utf-8-sig\") as f:\n",
"        writer = csv.writer(f)\n",
"        writer.writerow(header)\n",
"        writer.writerows(all_data)\n",
"\n",
"    txt_file = os.path.join(desktop_path, f\"车辆数据_已拆分_{time_str}.txt\")\n",
"    with open(txt_file, \"w\", encoding=\"utf-8\") as f:\n",
"        f.write(\" | \".join(header) + \"\\n\")\n",
"        for row in all_data:\n",
"            # str() keeps the join safe if a caller passes non-string cells.\n",
"            f.write(\" | \".join(map(str, row)) + \"\\n\")\n",
"\n",
"    print(f\"\\n✅ 文件已保存到桌面:\")\n",
"    print(f\"📊 Excel文件:{csv_file}\")\n",
"    print(f\"📄 文本文件:{txt_file}\")\n",
"\n",
"# ===================== Main program =====================\n",
"# Crawl every listing page in order, save the combined rows, then preview the first five.\n",
"if __name__ == \"__main__\":\n",
"    print(\"正在获取总页数...\")\n",
"    total_pages = get_total_pages()\n",
"    print(f\"总页数:{total_pages} 页\")\n",
"\n",
"    all_data = []\n",
"    for page in range(1, total_pages + 1):\n",
"        print(f\"正在爬取第 {page}/{total_pages} 页...\")\n",
"        page_data = get_page_data(page)\n",
"        # A failed or empty page yields an empty list and contributes nothing.\n",
"        if page_data:\n",
"            all_data.extend(page_data)\n",
"\n",
"    print(f\"\\n========== 爬取完成 ==========\")\n",
"    print(f\"总计数据:{len(all_data)} 行\")\n",
"\n",
"    save_to_desktop(all_data)\n",
"\n",
"    print(\"\\n前5行数据预览:\")\n",
"    for position, row in enumerate(all_data[:5], start=1):\n",
"        print(position, row)"
]
},
{
"cell_type": "markdown",
"id": "6c370235",
"metadata": {},
"source": [
"# 库存信息"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "5392bfc0",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"==================================================\n",
"开始爬取库存数据...\n",
"==================================================\n",
"✅ 成功获取最大页数:93\n",
"正在爬取第 1/93 页...\n",
"正在爬取第 2/93 页...\n",
"正在爬取第 3/93 页...\n",
"正在爬取第 4/93 页...\n",
"正在爬取第 5/93 页...\n",
"正在爬取第 6/93 页...\n",
"正在爬取第 7/93 页...\n",
"正在爬取第 8/93 页...\n",
"正在爬取第 9/93 页...\n",
"正在爬取第 10/93 页...\n",
"正在爬取第 11/93 页...\n",
"正在爬取第 12/93 页...\n",
"正在爬取第 13/93 页...\n",
"正在爬取第 14/93 页...\n",
"正在爬取第 15/93 页...\n",
"正在爬取第 16/93 页...\n",
"正在爬取第 17/93 页...\n",
"正在爬取第 18/93 页...\n",
"正在爬取第 19/93 页...\n",
"正在爬取第 20/93 页...\n",
"正在爬取第 21/93 页...\n",
"正在爬取第 22/93 页...\n",
"正在爬取第 23/93 页...\n",
"正在爬取第 24/93 页...\n",
"正在爬取第 25/93 页...\n",
"正在爬取第 26/93 页...\n",
"正在爬取第 27/93 页...\n",
"正在爬取第 28/93 页...\n",
"正在爬取第 29/93 页...\n",
"正在爬取第 30/93 页...\n",
"正在爬取第 31/93 页...\n",
"正在爬取第 32/93 页...\n",
"正在爬取第 33/93 页...\n",
"正在爬取第 34/93 页...\n",
"正在爬取第 35/93 页...\n",
"正在爬取第 36/93 页...\n",
"正在爬取第 37/93 页...\n",
"正在爬取第 38/93 页...\n",
"正在爬取第 39/93 页...\n",
"正在爬取第 40/93 页...\n",
"正在爬取第 41/93 页...\n",
"正在爬取第 42/93 页...\n",
"正在爬取第 43/93 页...\n",
"正在爬取第 44/93 页...\n",
"正在爬取第 45/93 页...\n",
"正在爬取第 46/93 页...\n",
"正在爬取第 47/93 页...\n",
"正在爬取第 48/93 页...\n",
"正在爬取第 49/93 页...\n",
"正在爬取第 50/93 页...\n",
"正在爬取第 51/93 页...\n",
"正在爬取第 52/93 页...\n",
"正在爬取第 53/93 页...\n",
"正在爬取第 54/93 页...\n",
"正在爬取第 55/93 页...\n",
"正在爬取第 56/93 页...\n",
"正在爬取第 57/93 页...\n",
"正在爬取第 58/93 页...\n",
"正在爬取第 59/93 页...\n",
"正在爬取第 60/93 页...\n",
"正在爬取第 61/93 页...\n",
"正在爬取第 62/93 页...\n",
"正在爬取第 63/93 页...\n",
"正在爬取第 64/93 页...\n",
"正在爬取第 65/93 页...\n",
"正在爬取第 66/93 页...\n",
"正在爬取第 67/93 页...\n",
"正在爬取第 68/93 页...\n",
"正在爬取第 69/93 页...\n",
"正在爬取第 70/93 页...\n",
"正在爬取第 71/93 页...\n",
"正在爬取第 72/93 页...\n",
"正在爬取第 73/93 页...\n",
"正在爬取第 74/93 页...\n",
"正在爬取第 75/93 页...\n",
"正在爬取第 76/93 页...\n",
"正在爬取第 77/93 页...\n",
"正在爬取第 78/93 页...\n",
"正在爬取第 79/93 页...\n",
"正在爬取第 80/93 页...\n",
"正在爬取第 81/93 页...\n",
"正在爬取第 82/93 页...\n",
"正在爬取第 83/93 页...\n",
"正在爬取第 84/93 页...\n",
"正在爬取第 85/93 页...\n",
"正在爬取第 86/93 页...\n",
"正在爬取第 87/93 页...\n",
"正在爬取第 88/93 页...\n",
"正在爬取第 89/93 页...\n",
"正在爬取第 90/93 页...\n",
"正在爬取第 91/93 页...\n",
"正在爬取第 92/93 页...\n",
"正在爬取第 93/93 页...\n",
"\n",
"🔍 去重完成:原始 1846 条 → 去重后 1823 条\n",
"==================================================\n",
"✅ 爬取 + 去重 完成!\n",
"📊 最终有效条数:1823\n",
"📁 已保存到桌面:库存数据_去重版.xlsx\n",
"==================================================\n"
]
}
],
"source": [
"import requests\n",
"import pandas as pd\n",
"from bs4 import BeautifulSoup\n",
"import os\n",
"from urllib.parse import urlencode\n",
"import re\n",
"\n",
"# ===================== [Configuration] =====================\n",
"# Raw Cookie header of a logged-in browser session. It is read from the\n",
"# H1CD_COOKIES environment variable so that no credential is stored in the notebook.\n",
"COOKIE = os.environ.get(\"H1CD_COOKIES\", \"\")\n",
"if not COOKIE:\n",
"    print(\"⚠️ 未设置环境变量 H1CD_COOKIES,请求将不带登录会话\")\n",
"\n",
"# Query-string filters sent with every inventory request.\n",
"BASE_PARAMS = {\n",
"    'storeId': '12521',\n",
"    'house_id': '8484',\n",
"    'repositoryName': '',\n",
"    'first_type': '',\n",
"    'product_type': '',\n",
"    'status': '',\n",
"    'searchType': '1',\n",
"    'product_name': ''\n",
"}\n",
"\n",
"# Pagination URL pattern: pages after the first are \"stores-search__<n>.html\".\n",
"BASE_URL = \"https://scrm.h1cd.com/admin/billings/stores-search__{}.html\"\n",
"# ======================================================\n",
"\n",
"# Browser-like request headers; the session cookie travels in the Cookie header.\n",
"HEADERS = {\n",
"    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',\n",
"    'Accept-Language': 'zh-CN,zh;q=0.9',\n",
"    'Connection': 'keep-alive',\n",
"    'Cookie': COOKIE,\n",
"    'Referer': 'https://scrm.h1cd.com/admin/billings/stores-search.html',\n",
"    'Sec-Fetch-Dest': 'iframe',\n",
"    'Sec-Fetch-Mode': 'navigate',\n",
"    'Sec-Fetch-Site': 'same-origin',\n",
"    'Sec-Fetch-User': '?1',\n",
"    'Upgrade-Insecure-Requests': '1',\n",
"    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/133.0.0.0 Safari/537.36',\n",
"    'sec-ch-ua': '\"Not(A:Brand\";v=\"99\", \"Google Chrome\";v=\"133\", \"Chromium\";v=\"133\"',\n",
"    'sec-ch-ua-mobile': '?0',\n",
"    'sec-ch-ua-platform': '\"Windows\"',\n",
"}\n",
"\n",
"\n",
"def get_page_html(page_num):\n",
"    \"\"\"Return the HTML text of inventory page ``page_num``, or None on failure.\n",
"\n",
"    Page 1 uses the plain \"stores-search.html\" URL; other pages use BASE_URL\n",
"    formatted with the page number. Any exception is printed and turned into None.\n",
"    \"\"\"\n",
"    first_page_url = \"https://scrm.h1cd.com/admin/billings/stores-search.html\"\n",
"    try:\n",
"        url = first_page_url if page_num == 1 else BASE_URL.format(page_num)\n",
"        res = requests.get(url, headers=HEADERS, params=BASE_PARAMS, timeout=10)\n",
"        res.raise_for_status()\n",
"        return res.text\n",
"    except Exception as e:\n",
"        print(f\"第{page_num}页请求失败:{e}\")\n",
"        return None\n",
"\n",
"\n",
"def parse_table(html):\n",
"    \"\"\"Extract the inventory table from ``html`` as a list of rows.\n",
"\n",
"    Returns an empty list when no table with class \"table-hover\" exists.\n",
"    Otherwise the first element is the list of header titles (thead th cells)\n",
"    and the remaining elements are the tbody rows that contain at least one\n",
"    non-empty cell.\n",
"    \"\"\"\n",
"    table = BeautifulSoup(html, 'html.parser').find('table', class_='table-hover')\n",
"    if not table:\n",
"        return []\n",
"\n",
"    header_cells = [th.get_text(strip=True) for th in table.select('thead th')]\n",
"    body_rows = (\n",
"        [td.get_text(strip=True) for td in tr.find_all('td')]\n",
"        for tr in table.select('tbody tr')\n",
"    )\n",
"    # Rows whose cells are all empty are dropped.\n",
"    return [header_cells] + [cells for cells in body_rows if any(cells)]\n",
"\n",
"\n",
"def get_max_page():\n",
"    \"\"\"Read the page count from the paginator text of page 1 (e.g. \"页 1/93\").\n",
"\n",
"    Returns 1 when the page cannot be fetched, when the element with class\n",
"    \"dataTables_paginate\" is missing, or when its text does not match.\n",
"    \"\"\"\n",
"    html = get_page_html(1)\n",
"    if html:\n",
"        paginator = BeautifulSoup(html, 'html.parser').find('div', class_='dataTables_paginate')\n",
"        if paginator:\n",
"            # The number after \"1/\" is the total page count.\n",
"            found = re.search(r'页\\s*1/(\\d+)', paginator.get_text())\n",
"            if found:\n",
"                return int(found.group(1))\n",
"    return 1\n",
"\n",
"\n",
"def main():\n",
"    \"\"\"Crawl all inventory pages, de-duplicate by part code and save to Excel.\n",
"\n",
"    The column titles come from the first page that parses successfully, so a\n",
"    failed first page no longer turns a data row into the header. If no page\n",
"    yields a table, a message is printed and nothing is written. The result\n",
"    is saved as \"库存数据_去重版.xlsx\" on the Desktop.\n",
"    \"\"\"\n",
"    print(\"=\" * 50)\n",
"    print(\"开始爬取库存数据...\")\n",
"    print(\"=\" * 50)\n",
"\n",
"    # NOTE(review): the page count reported by the site is doubled here with no\n",
"    # explanation in the notebook; kept as-is, the reason needs to be confirmed.\n",
"    max_page = get_max_page() * 2\n",
"    print(f\"✅ 成功获取最大页数:{max_page}\")\n",
"\n",
"    header = None\n",
"    records = []\n",
"    for page in range(1, max_page + 1):\n",
"        print(f\"正在爬取第 {page}/{max_page} 页...\")\n",
"        html = get_page_html(page)\n",
"        if not html:\n",
"            continue\n",
"\n",
"        rows = parse_table(html)\n",
"        if not rows:\n",
"            continue\n",
"        # rows[0] is always the header row of that page.\n",
"        if header is None:\n",
"            header = rows[0]\n",
"        records.extend(rows[1:])\n",
"\n",
"    if header is None:\n",
"        print(\"❌ 未获取到数据,请检查Cookie是否过期\")\n",
"        return\n",
"\n",
"    desktop = os.path.join(os.path.expanduser(\"~\"), \"Desktop\")\n",
"\n",
"    # ===================== De-duplicate by part code =====================\n",
"    df = pd.DataFrame(records, columns=header)\n",
"\n",
"    # Keep the first row for each value of the part-code column.\n",
"    if '配件编码' in df.columns:\n",
"        total_before = len(df)\n",
"        df = df.drop_duplicates(subset=['配件编码'], keep='first')\n",
"        total_after = len(df)\n",
"        print(f\"\\n🔍 去重完成:原始 {total_before} 条 → 去重后 {total_after} 条\")\n",
"    else:\n",
"        print(\"\\n⚠️ 未找到【配件编码】列,跳过去重\")\n",
"    # ==================================================================\n",
"\n",
"    path = os.path.join(desktop, \"库存数据_去重版.xlsx\")\n",
"    df.to_excel(path, index=False)\n",
"\n",
"    print(\"=\" * 50)\n",
"    print(f\"✅ 爬取 + 去重 完成!\")\n",
"    print(f\"📊 最终有效条数:{len(df)}\")\n",
"    print(f\"📁 已保存到桌面:库存数据_去重版.xlsx\")\n",
"    print(\"=\" * 50)\n",
"\n",
"\n",
"if __name__ == '__main__':\n",
"    main()"
]
},
{
"cell_type": "markdown",
"id": "4b11e6fa",
"metadata": {},
"source": [
"# 历史维修记录"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "667edbdc",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"🔧 开始导出(100%匹配页面结构)...\n",
"正在读取第 1/1 页\n",
"\n",
"✅ 导出完成!桌面文件:维修记录_完美导出版.xlsx\n"
]
}
],
"source": [
"import requests\n",
"import pandas as pd\n",
"from bs4 import BeautifulSoup\n",
"import os\n",
"import re\n",
"\n",
"# ===================== Fixed configuration =====================\n",
"# First page and page-n URL of the order cost detail report.\n",
"BASE_URL_1 = \"https://scrm.h1cd.com/admin/reports/orderCostDetail.html\"\n",
"BASE_URL_N = \"https://scrm.h1cd.com/admin/reports/orderCostDetail_{}.html\"\n",
"DESKTOP = os.path.join(os.path.expanduser(\"~\"), \"Desktop\")\n",
"OUTPUT_FILE = os.path.join(DESKTOP, \"维修记录_完美导出版.xlsx\")\n",
"\n",
"HEADERS = {\n",
"    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8',\n",
"    'Accept-Language': 'zh-CN,zh;q=0.9',\n",
"    'Connection': 'keep-alive',\n",
"    'Referer': 'https://scrm.h1cd.com/admin/reports/orderCostDetail.html?storeId=0&cost_time_type=1&timeStart=2021-03-01&timeEnd=&type=&search=&_action=',\n",
"    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/133.0.0.0 Safari/537.36',\n",
"}\n",
"\n",
"# Login cookies as a {name: value} dict, built from the raw Cookie header in\n",
"# the H1CD_COOKIES environment variable (\"name=value; name=value; ...\") so\n",
"# that no credential is stored in the notebook.\n",
"COOKIES = dict(\n",
"    pair.strip().split(\"=\", 1)\n",
"    for pair in os.environ.get(\"H1CD_COOKIES\", \"\").split(\";\")\n",
"    if \"=\" in pair\n",
")\n",
"if not COOKIES:\n",
"    print(\"⚠️ 未设置环境变量 H1CD_COOKIES,请求将不带登录会话\")\n",
"\n",
"# Report filters sent as the query string of every request.\n",
"PARAMS = {\n",
"    'storeId': '0',\n",
"    'cost_time_type': '1',\n",
"    'timeStart': '2021-03-01',\n",
"    'timeEnd': '',\n",
"    'type': '',\n",
"    'search': '',\n",
"    '_action': ''\n",
"}\n",
"\n",
"# =====================================================\n",
"\n",
"def get_html(page):\n",
"    \"\"\"Return the HTML text of report page ``page``, or None if the request fails.\n",
"\n",
"    Page 1 is fetched from BASE_URL_1, any other page from BASE_URL_N\n",
"    formatted with the page number. Failures are printed with the page\n",
"    number so that skipped pages are visible in the output.\n",
"    \"\"\"\n",
"    url = BASE_URL_1 if page == 1 else BASE_URL_N.format(page)\n",
"    try:\n",
"        r = requests.get(url, headers=HEADERS, cookies=COOKIES, params=PARAMS, timeout=10)\n",
"        r.raise_for_status()\n",
"        return r.text\n",
"    except Exception as e:\n",
"        # Catch Exception (not a bare except) so Ctrl+C still interrupts the run.\n",
"        print(f\"第{page}页请求失败:{e}\")\n",
"        return None\n",
"\n",
"def parse_real_table(html):\n",
"    \"\"\"Flatten the merged-cell report table into rows of 18 columns.\n",
"\n",
"    Returns an empty list when no table with class \"table-hover\" exists;\n",
"    otherwise a list whose first element is the fixed header, followed by\n",
"    one 18-cell list per table row.\n",
"\n",
"    - Main rows (15 or more cells) are truncated or padded to 18 cells.\n",
"    - Child rows (fewer cells) supply columns 5-11 (category to profit).\n",
"      Their first four columns are inherited from the latest main row and\n",
"      their remaining columns are left empty.\n",
"    - Rows made only of th cells are skipped, because the fixed header\n",
"      replaces the page's own header row.\n",
"    - Child rows with fewer than 7 cells are padded instead of raising\n",
"      IndexError.\n",
"    \"\"\"\n",
"    table = BeautifulSoup(html, 'html.parser').find('table', class_='table-hover')\n",
"    if table is None:\n",
"        return []\n",
"\n",
"    # Fixed header; the notebook states that it mirrors the page's columns.\n",
"    headers = [\n",
"        '序号', '工单号', '车牌', '车主信息', '所属分类', '工单内容',\n",
"        '金额', '折扣', '折后金额', '成本', '利润', '工单金额',\n",
"        '优惠金额', '成本总计', '工单毛利', '毛利率', '结算时间', '接车人'\n",
"    ]\n",
"    width = len(headers)\n",
"\n",
"    result = []\n",
"    # No / order number / plate / owner of the most recent main row.\n",
"    inherited = ['', '', '', '']\n",
"\n",
"    for tr in table.find_all('tr'):\n",
"        cells = tr.find_all(['td', 'th'])\n",
"        if not cells:\n",
"            continue\n",
"        if all(cell.name == 'th' for cell in cells):\n",
"            continue\n",
"\n",
"        row = [cell.get_text(strip=True) for cell in cells]\n",
"\n",
"        if len(row) >= 15:\n",
"            # Main row: remember the order-level columns for its child rows.\n",
"            inherited = row[:4]\n",
"            new_row = row[:width]\n",
"        else:\n",
"            # Child row: only the seven detail columns are present.\n",
"            detail = (row + [''] * 7)[:7]\n",
"            new_row = inherited + detail\n",
"\n",
"        # Pad on the right so every row has exactly 18 cells.\n",
"        new_row += [''] * (width - len(new_row))\n",
"        result.append(new_row)\n",
"\n",
"    return [headers] + result\n",
"\n",
"def get_total_pages():\n",
"    \"\"\"Return the page count found in the text \"页 1/N\" of report page 1.\n",
"\n",
"    Falls back to 1 when the page cannot be fetched or the text is absent.\n",
"    \"\"\"\n",
"    html = get_html(1)\n",
"    found = re.search(r'页\\s*1/(\\d+)', html) if html else None\n",
"    return int(found.group(1)) if found else 1\n",
"\n",
"def main():\n",
"    \"\"\"Export every page of the repair/cost report into one Excel file.\n",
"\n",
"    Pages that fail to download or contain no table are skipped. The header\n",
"    row is taken from the first page that yields rows. If nothing was\n",
"    collected, a hint about an expired cookie is printed and no file is written.\n",
"    \"\"\"\n",
"    print(\"🔧 开始导出(100%匹配页面结构)...\")\n",
"    total = get_total_pages()\n",
"    all_data = []\n",
"\n",
"    for p in range(1, total + 1):\n",
"        print(f\"正在读取第 {p}/{total} 页\")\n",
"        html = get_html(p)\n",
"        rows = parse_real_table(html) if html else []\n",
"        if not rows:\n",
"            continue\n",
"        # Only the first contributing page keeps its header row (rows[0]).\n",
"        all_data.extend(rows if not all_data else rows[1:])\n",
"\n",
"    if not all_data:\n",
"        print(\"❌ 未获取到数据,请检查Cookie是否过期\")\n",
"        return\n",
"\n",
"    header, *records = all_data\n",
"    pd.DataFrame(records, columns=header).to_excel(OUTPUT_FILE, index=False)\n",
"    print(f\"\\n✅ 导出完成!桌面文件:维修记录_完美导出版.xlsx\")\n",
"\n",
"if __name__ == '__main__':\n",
"    main()"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.13.12"
}
},
"nbformat": 4,
"nbformat_minor": 5
}