238 lines
9.1 KiB
Plaintext
238 lines
9.1 KiB
Plaintext
{
|
|
"cells": [
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 1,
|
|
"id": "a84cbd49a7363225",
|
|
"metadata": {
|
|
"ExecuteTime": {
|
|
"end_time": "2026-03-31T01:58:35.248297Z",
|
|
"start_time": "2026-03-31T01:47:30.843891Z"
|
|
}
|
|
},
|
|
"outputs": [
|
|
{
|
|
"name": "stderr",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"正在抓取数据: 100%|██████████| 65/65 [15:41<00:00, 14.49s/it]\n"
|
|
]
|
|
},
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"✅ 全部完成!共抓取 23540 条数据,已保存至 D:\\Idea Project\\F6+宜搭+其它(1)\\张阳脚本\\文件输出\\神汽链10年历史数据1.xlsx\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"import requests\n",
|
|
"from tqdm import tqdm\n",
|
|
"import pandas as pd\n",
|
|
"from datetime import datetime, timedelta\n",
|
|
"import time\n",
|
|
"import os\n",
|
|
"\n",
|
|
"# 配置部分\n",
|
|
"# Session cookie: read from the environment so that no credential is stored in the notebook.\n",
"# Export SQZONE_JSESSIONID (the JSESSIONID value of a logged-in browser session) before running.\n",
"_session_id = os.environ.get('SQZONE_JSESSIONID', '').strip()\n",
"if not _session_id:\n",
"    raise RuntimeError('SQZONE_JSESSIONID is not set; export a valid JSESSIONID cookie value first.')\n",
"\n",
"COOKIES = {\n",
"    'JSESSIONID': _session_id,\n",
"}\n",
"\n",
"# Browser-like headers sent with every request. The session cookie is passed\n",
"# separately through COOKIES, so no Cookie header is set here.\n",
"HEADERS = {\n",
"    'Accept': 'application/json, text/plain, */*',\n",
"    'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',\n",
"    'Connection': 'keep-alive',\n",
"    'Content-Type': 'application/json;charset=UTF-8',\n",
"    'Origin': 'https://www.sqzone.com',\n",
"    'Referer': 'https://www.sqzone.com/launa/web/workOrder/woManage',\n",
"    'Sec-Fetch-Dest': 'empty',\n",
"    'Sec-Fetch-Mode': 'cors',\n",
"    'Sec-Fetch-Site': 'same-origin',\n",
"    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/146.0.0.0 Safari/537.36 Edg/146.0.0.0',\n",
"    'X-Requested-With': 'XMLHttpRequest',\n",
"    'appName': 'SQLINK',\n",
"    'sec-ch-ua': '\"Chromium\";v=\"146\", \"Not-A.Brand\";v=\"24\", \"Microsoft Edge\";v=\"146\"',\n",
"    'sec-ch-ua-mobile': '?0',\n",
"    'sec-ch-ua-platform': '\"Windows\"',\n",
"}\n",
|
|
"\n",
|
|
"\n",
|
|
"# Endpoint that every page request is posted to.\n",
"URL = 'https://www.sqzone.com/launa/pc/dataCenter/queryShopTurnoverInfo'\n",
"\n",
"# Output workbook. Set the SQZONE_OUTPUT_FILE environment variable to write somewhere else;\n",
"# the default is the original location.\n",
"OUTPUT_FILE = os.environ.get(\n",
"    'SQZONE_OUTPUT_FILE',\n",
"    r'D:\\Idea Project\\F6+宜搭+其它(1)\\张阳脚本\\文件输出\\神汽链10年历史数据1.xlsx',\n",
")\n",
"\n",
"# Make sure the output directory exists. A path without a directory part yields '' here,\n",
"# and os.makedirs('') raises FileNotFoundError, so only create a non-empty directory.\n",
"_output_dir = os.path.dirname(OUTPUT_FILE)\n",
"if _output_dir:\n",
"    os.makedirs(_output_dir, exist_ok=True)\n",
"\n",
"# Rows collected by the crawl loop; reset every time this cell runs.\n",
"all_data = []\n",
|
|
"\n",
|
|
"def get_month_range(start_year, start_month, end_year, end_month):\n",
"    \"\"\"Return (first_day, last_day) string pairs for each month in the inclusive span.\n",
"\n",
"    Dates are formatted as YYYY-MM-DD. The result is an empty list when the\n",
"    start month lies after the end month.\n",
"    \"\"\"\n",
"    day_fmt = '%Y-%m-%d'\n",
"    one_day = timedelta(days=1)\n",
"    cursor = datetime(start_year, start_month, 1)\n",
"    final_month = datetime(end_year, end_month, 1)\n",
"    ranges = []\n",
"    while not cursor > final_month:\n",
"        # Day 28 exists in every month, and 28 + 4 days always lands in the following month.\n",
"        following = cursor.replace(day=28) + timedelta(days=4)\n",
"        following = following.replace(day=1)\n",
"        month_end = following - one_day\n",
"        ranges.append((cursor.strftime(day_fmt), month_end.strftime(day_fmt)))\n",
"        cursor = following\n",
"    return ranges\n",
|
|
"\n",
|
|
"def fetch_page_with_retry(start_date, end_date, page_num, max_retries=3):\n",
"    \"\"\"Request a single result page, retrying when an attempt fails.\n",
"\n",
"    Args:\n",
"        start_date: Start of the query window, sent as 'startTime'.\n",
"        end_date: End of the query window, sent as 'endTime'.\n",
"        page_num: Page index, sent as 'curPage'.\n",
"        max_retries: Maximum number of attempts.\n",
"\n",
"    Returns:\n",
"        The list stored under data.contents (an empty list when that key is\n",
"        missing or null), or None when every attempt failed.\n",
"    \"\"\"\n",
"    json_data = {\n",
"        'keyword': '',\n",
"        'pageSize': 50,\n",
"        'curPage': page_num,\n",
"        'shopId': '',\n",
"        'payId': [],\n",
"        'payStatus': '',\n",
"        'startTime': start_date,\n",
"        'endTime': end_date,\n",
"    }\n",
"\n",
"    for attempt in range(max_retries):\n",
"        try:\n",
"            response = requests.post(\n",
"                URL,\n",
"                cookies=COOKIES,\n",
"                headers=HEADERS,\n",
"                json=json_data,\n",
"                timeout=15,  # never wait indefinitely on a stalled connection\n",
"            )\n",
"\n",
"            if response.status_code == 200:\n",
"                res_json = response.json()\n",
"                # Only a non-empty dict under 'data' counts as a successful business response.\n",
"                data = res_json.get('data') if isinstance(res_json, dict) else None\n",
"                if isinstance(data, dict) and data:\n",
"                    # A null 'contents' is an empty page, not a failed request, so it must\n",
"                    # not be reported as None (which callers read as \"all retries failed\").\n",
"                    return data.get('contents') or []\n",
"                # Business-level failure: log it and fall through to the retry logic.\n",
"                print(f\"第{page_num}页业务返回异常: {res_json}\")\n",
"            else:\n",
"                print(f\"第{page_num}页HTTP错误: {response.status_code}\")\n",
"\n",
"        except (requests.exceptions.RequestException, ValueError) as e:\n",
"            # ValueError covers a 200 response whose body is not valid JSON on requests\n",
"            # versions where that error is not a RequestException subclass.\n",
"            print(f\"第{page_num}页网络请求失败 (尝试 {attempt+1}/{max_retries}): {e}\")\n",
"\n",
"        # Linear backoff before the next attempt: 2 s, 4 s, 6 s, ...\n",
"        if attempt < max_retries - 1:\n",
"            wait_time = (attempt + 1) * 2\n",
"            print(f\" -> 等待 {wait_time} 秒后重试...\")\n",
"            time.sleep(wait_time)\n",
"\n",
"    # Every attempt failed.\n",
"    return None\n",
|
|
"\n",
|
|
"# Months to crawl, both ends inclusive.\n",
"month_ranges = get_month_range(2020, 11, 2026, 3)\n",
"\n",
"CHECKPOINT_EVERY = 5000   # new rows required before another temporary save\n",
"last_checkpoint_rows = 0  # len(all_data) at the time of the last temporary save\n",
"failed_ranges = []        # (start_date, end_date, page) for every month that was abandoned\n",
"\n",
"for start_date, end_date in tqdm(month_ranges, desc=\"正在抓取数据\"):\n",
"    page = 1\n",
"    while True:\n",
"        data_list = fetch_page_with_retry(start_date, end_date, page, max_retries=3)\n",
"\n",
"        # None means every retry failed. The rest of this month is abandoned rather than\n",
"        # skipping a single page, and the gap is recorded so it can be re-fetched later.\n",
"        if data_list is None:\n",
"            failed_ranges.append((start_date, end_date, page))\n",
"            print(f\"⚠️ 严重警告: {start_date}-{end_date} 第{page}页 多次重试失败,已中止该月份剩余页面的抓取。\")\n",
"            break\n",
"\n",
"        # An empty page marks the normal end of the month.\n",
"        if not data_list:\n",
"            break\n",
"\n",
"        for data in data_list:\n",
"            # 'partsViews' may be missing or null; either way the record yields no rows.\n",
"            parts = data.get('partsViews') or []\n",
"            customer_info = {k: v for k, v in data.items() if k != 'partsViews'}\n",
"            # One output row per part; on a key collision the part's value wins.\n",
"            all_data.extend({**customer_info, **part} for part in parts)\n",
"\n",
"        page += 1\n",
"        # Pause between pages.\n",
"        time.sleep(1)\n",
"\n",
"    # Checkpoint after a month once enough new rows have accumulated, so that a crash\n",
"    # late in the run does not lose everything. Comparing against the previous checkpoint\n",
"    # is needed because len(all_data) almost never lands on an exact multiple of 5000.\n",
"    if len(all_data) - last_checkpoint_rows >= CHECKPOINT_EVERY:\n",
"        temp_df = pd.DataFrame(all_data)\n",
"        temp_df.to_excel(OUTPUT_FILE.replace('.xlsx', '_temp.xlsx'), index=False)\n",
"        last_checkpoint_rows = len(all_data)\n",
"        print(\"临时保存已完成\")\n",
"\n",
"if failed_ranges:\n",
"    print(f\"⚠️ 以下月份抓取不完整 (起始日期, 结束日期, 失败页码): {failed_ranges}\")\n",
|
|
"\n",
|
|
"# Final save: write every collected row to the output workbook in one go.\n",
"if not all_data:\n",
"    print(\"❌ 未抓取到任何数据。\")\n",
"else:\n",
"    ndf = pd.DataFrame(all_data)\n",
"    ndf.to_excel(OUTPUT_FILE, index=False)\n",
"    print(f\"✅ 全部完成!共抓取 {len(all_data)} 条数据,已保存至 {OUTPUT_FILE}\")"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 3,
|
|
"id": "d2abfb70a61d82a0",
|
|
"metadata": {
|
|
"ExecuteTime": {
|
|
"end_time": "2026-03-31T03:05:38.539504Z",
|
|
"start_time": "2026-03-31T03:05:27.394303900Z"
|
|
}
|
|
},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"✅ 全部完成!共抓取 15906 条数据,已保存至 D:\\Idea Project\\F6+宜搭+其它(1)\\张阳脚本\\文件输出\\神汽链10年历史数据1.xlsx\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"# Re-run of the final save. This cell defines nothing itself: it uses pd, all_data and\n",
"# OUTPUT_FILE from the kernel, and it overwrites whatever is already stored at OUTPUT_FILE.\n",
"if not all_data:\n",
"    print(\"❌ 未抓取到任何数据。\")\n",
"else:\n",
"    ndf = pd.DataFrame(all_data)\n",
"    ndf.to_excel(OUTPUT_FILE, index=False)\n",
"    print(f\"✅ 全部完成!共抓取 {len(all_data)} 条数据,已保存至 {OUTPUT_FILE}\")"
|
|
]
|
|
}
|
|
],
|
|
"metadata": {
|
|
"kernelspec": {
|
|
"display_name": "F6+宜搭+其它",
|
|
"language": "python",
|
|
"name": "python3"
|
|
},
|
|
"language_info": {
|
|
"codemirror_mode": {
|
|
"name": "ipython",
|
|
"version": 3
|
|
},
|
|
"file_extension": ".py",
|
|
"mimetype": "text/x-python",
|
|
"name": "python",
|
|
"nbconvert_exporter": "python",
|
|
"pygments_lexer": "ipython3",
|
|
"version": "3.13.11"
|
|
}
|
|
},
|
|
"nbformat": 4,
|
|
"nbformat_minor": 5
|
|
}
|