4.18备份
This commit is contained in:
@@ -0,0 +1,281 @@
|
||||
import argparse
|
||||
import csv
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
import time
|
||||
from html import unescape
|
||||
from typing import Dict, Iterable, List, Optional, Sequence, Tuple
|
||||
from urllib.parse import urlencode
|
||||
|
||||
import requests
|
||||
|
||||
# Root of the H1 SCRM admin site; page URLs in this module are built from it.
BASE_URL = "https://scrm.h1cd.com"

# Cookies used when the H1_COOKIES_JSON environment variable is not set.
# SECURITY NOTE(review): these look like real login/session values committed to
# the repository -- presumably they should be rotated and supplied through
# H1_COOKIES_JSON instead; confirm with the account owner.
DEFAULT_COOKIES = {
    "showSmsActivity": "1",
    "showEasyMoney": "1",
    "LOGIN_URL": "https%3A%2F%2Fscrm.h1cd.com%2Flogin-h1cd.html",
    "adminpd": "jVISiRrtcJplFhLoCuUIxK9XG5ekdfwzq%2B0y482ZKxE%3D",
    "adminun": "15224781773",
    "uid": "10291",
    "PHPSESSID": "nbn58laakng0rv5iqln82a6qpu",
}

# Browser-like request headers sent with every page request
# (the Referer header is added per request by the fetch helper).
DEFAULT_HEADERS = {
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
    "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6",
    "Connection": "keep-alive",
    "Sec-Fetch-Dest": "iframe",
    "Sec-Fetch-Mode": "navigate",
    "Sec-Fetch-Site": "same-origin",
    "Sec-Fetch-User": "?1",
    "Upgrade-Insecure-Requests": "1",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/146.0.0.0 Safari/537.36 Edg/146.0.0.0",
    "sec-ch-ua": '"Chromium";v="146", "Not-A.Brand";v="24", "Microsoft Edge";v="146"',
    "sec-ch-ua-mobile": "?0",
    "sec-ch-ua-platform": '"Windows"',
}

# Default query-string parameters for the card list page; each one can be
# overridden from the command line.
# NOTE(review): storeId "0" presumably means "all stores" -- confirm.
DEFAULT_PARAMS = {
    "type": "",
    "expired": "",
    "storeId": "0",
    "search": "",
}
|
||||
|
||||
# One <table>...</table> element; non-greedy, so a nested table ends the match
# at the first closing tag.
_TABLE_RE = re.compile(r"<table\b[^>]*>.*?</table>", re.IGNORECASE | re.DOTALL)
# One <tr>...</tr> row inside a table.
_TR_RE = re.compile(r"<tr\b[^>]*>.*?</tr>", re.IGNORECASE | re.DOTALL)
# One <th> or <td> cell; group 1 is the cell's inner HTML.
_CELL_RE = re.compile(r"<t[hd]\b[^>]*>(.*?)</t[hd]>", re.IGNORECASE | re.DOTALL)
# Any remaining HTML tag, removed when flattening a cell to text.
_TAG_RE = re.compile(r"<[^>]+>", re.DOTALL)
# <br>, <br/>, < br / > ... -- converted to a newline before tags are stripped.
_BR_RE = re.compile(r"<\s*br\s*/?\s*>", re.IGNORECASE)
# Runs of non-breaking / plain spaces, collapsed to a single space.
# NOTE(review): the second alternative shows up as a plain space here; it may
# originally have been the literal "&nbsp;" entity -- confirm against the repo.
_NBSP_RE = re.compile(r"(\xa0| )+", re.IGNORECASE)
|
||||
|
||||
|
||||
def _clean_html_text(raw: str) -> str:
    """Flatten the inner HTML of a table cell into normalized plain text.

    ``<br>`` becomes a newline, every other tag is removed, HTML entities are
    decoded, and whitespace is tidied: no carriage returns, no trailing
    blanks before a newline, at most one blank line in a row, and no runs of
    spaces/tabs.
    """
    # Tags are stripped before entities are decoded so that escaped markup
    # such as "&lt;b&gt;" survives as literal text.
    text = _TAG_RE.sub("", _BR_RE.sub("\n", raw))
    text = _NBSP_RE.sub(" ", unescape(text))
    text = text.replace("\r", "").strip()

    whitespace_rules = (
        (r"[ \t]+\n", "\n"),   # trailing blanks before a line break
        (r"\n{3,}", "\n\n"),   # more than one empty line
        (r"[ \t]{2,}", " "),   # runs of spaces / tabs
    )
    for pattern, replacement in whitespace_rules:
        text = re.sub(pattern, replacement, text)
    return text.strip()
|
||||
|
||||
|
||||
def build_cards_url(page: int) -> str:
    """Return the card-list URL for ``page``.

    Page 1 (and anything below it) maps to ``cards.html``; later pages map to
    ``cards_<page>.html``.
    """
    suffix = "" if page <= 1 else f"_{page}"
    return f"{BASE_URL}/admin/members/cards{suffix}.html"
|
||||
|
||||
|
||||
def _load_cookies() -> Dict[str, str]:
    """Return the cookies to send with every request.

    When the ``H1_COOKIES_JSON`` environment variable is set (and non-empty)
    it must contain a JSON object; its keys and values are converted to
    strings. Otherwise a copy of ``DEFAULT_COOKIES`` is returned.

    Raises:
        ValueError: if the variable holds JSON that is not an object
            (``json.loads`` errors for malformed JSON propagate as well).
    """
    raw = os.environ.get("H1_COOKIES_JSON")
    if not raw:
        return dict(DEFAULT_COOKIES)

    parsed = json.loads(raw)
    if isinstance(parsed, dict):
        return {str(name): str(value) for name, value in parsed.items()}
    raise ValueError("H1_COOKIES_JSON must be a JSON object")
|
||||
|
||||
|
||||
def _fetch_html(
    session: requests.Session,
    page: int,
    params: Dict[str, str],
    base_headers: Dict[str, str],
    timeout_seconds: int = 30,
) -> str:
    """Download one card-list page and return its decoded HTML.

    Args:
        session: session carrying the login cookies.
        page: 1-based page number.
        params: query-string parameters sent with the request.
        base_headers: headers copied for this request; a ``Referer`` is added.
        timeout_seconds: timeout passed to ``session.get``.

    Returns:
        The response body as text.

    Raises:
        requests.HTTPError: on a 4xx/5xx status (via ``raise_for_status``).
    """
    url = build_cards_url(page)
    headers = dict(base_headers)

    # The Referer is the previous page; page 1 refers to itself.
    referer_url = build_cards_url(max(page - 1, 1))
    headers["Referer"] = f"{referer_url}?{urlencode(params, doseq=True)}"

    resp = session.get(url, params=params, headers=headers, timeout=timeout_seconds)
    resp.raise_for_status()

    # For "text/*" responses without an explicit charset, requests falls back
    # to ISO-8859-1 instead of leaving ``encoding`` empty, which would garble
    # a UTF-8 page. Only trust the encoding when the server declared one.
    content_type = resp.headers.get("Content-Type", "")
    if not resp.encoding or "charset" not in content_type.lower():
        resp.encoding = "utf-8"
    return resp.text
|
||||
|
||||
|
||||
def _parse_table(table_html: str) -> Tuple[List[str], List[List[str]]]:
    """Split one ``<table>`` fragment into a header row and data rows.

    The first non-empty row containing a ``<th>`` becomes the header; every
    other non-empty row is data. When no header row exists, ``col_1`` ...
    ``col_N`` names are generated from the widest data row. Data rows are
    padded with ``""`` or truncated to the header width.

    Returns:
        ``(header, rows)``; ``rows`` is empty when the table has no data.
    """
    header: List[str] = []
    body: List[List[str]] = []

    for row_match in _TR_RE.finditer(table_html):
        row_html = row_match.group(0)
        cells = [_clean_html_text(fragment) for fragment in _CELL_RE.findall(row_html)]
        # Skips rows without cells as well as rows whose cells are all blank.
        if not any(cells):
            continue

        has_th = re.search(r"<th\b", row_html, re.IGNORECASE) is not None
        if header or not has_th:
            body.append(cells)
        else:
            header = cells

    if not body:
        return header, []

    if not header:
        widest = max(map(len, body))
        header = [f"col_{index}" for index in range(1, widest + 1)]

    width = len(header)
    # Pad short rows, then cut everything to exactly ``width`` cells.
    aligned = [(cells + [""] * (width - len(cells)))[:width] for cells in body]
    return header, aligned
|
||||
|
||||
|
||||
def parse_cards_page(html_text: str) -> Tuple[List[str], List[Dict[str, str]]]:
    """Extract the main data table of a page as a header plus row dicts.

    Every ``<table>`` in the page is parsed; tables with at most one header
    column or with no data rows are ignored, and of the rest the one with the
    most rows wins (the earliest on a tie).

    Returns:
        ``(header, records)`` where each record maps header name to cell
        text, or ``([], [])`` when no usable table was found.
    """
    candidates = []
    for table_html in _TABLE_RE.findall(html_text):
        header, rows = _parse_table(table_html)
        if len(header) > 1 and rows:
            candidates.append((header, rows))

    if not candidates:
        return [], []

    # max() keeps the first of several equally long tables.
    best_header, best_rows = max(candidates, key=lambda pair: len(pair[1]))
    return best_header, [dict(zip(best_header, row)) for row in best_rows]
|
||||
|
||||
|
||||
def _merge_headers(existing: List[str], incoming: Sequence[str]) -> List[str]:
    """Return ``existing`` followed by the columns of ``incoming`` it lacks.

    Order is preserved, new columns are appended once each, and ``existing``
    itself is not modified.
    """
    merged = list(existing)
    known = set(merged)
    # dict.fromkeys de-duplicates the additions while keeping their order.
    additions = dict.fromkeys(col for col in incoming if col not in known)
    merged.extend(additions)
    return merged
|
||||
|
||||
|
||||
def export_all_cards(
    output_csv_path: str,
    params: Optional[Dict[str, str]] = None,
    headers: Optional[Dict[str, str]] = None,
    max_pages: int = 200,
    sleep_seconds: float = 0.3,
) -> Tuple[int, int]:
    """Crawl the card list page by page and write all rows to a CSV file.

    Args:
        output_csv_path: destination file; missing parent folders are created.
        params: query parameters (defaults to ``DEFAULT_PARAMS``).
        headers: request headers (defaults to ``DEFAULT_HEADERS``).
        max_pages: upper bound on the number of pages requested.
        sleep_seconds: pause between page requests; ``<= 0`` disables it.

    Returns:
        ``(pages_fetched, rows_written)``.

    Raises:
        RuntimeError: if no table rows could be parsed at all.
    """
    cookies = _load_cookies()
    params = dict(DEFAULT_PARAMS if params is None else params)
    headers = dict(DEFAULT_HEADERS if headers is None else headers)

    all_records: List[Dict[str, str]] = []
    merged_header: List[str] = []
    seen_keys: set[Tuple[str, ...]] = set()
    pages_fetched = 0

    # The context manager closes the session's connection pool even when a
    # request raises.
    with requests.Session() as session:
        session.cookies.update(cookies)

        for page in range(1, max_pages + 1):
            html_text = _fetch_html(session=session, page=page, params=params, base_headers=headers)
            page_header, page_records = parse_cards_page(html_text)
            pages_fetched += 1

            if not page_records:
                break

            merged_header = _merge_headers(merged_header, page_header)

            new_on_page = 0
            for rec in page_records:
                key = tuple(rec.get(col, "") for col in page_header)
                if key in seen_keys:
                    continue
                seen_keys.add(key)
                all_records.append(rec)
                new_on_page += 1

            # A page made up solely of rows already seen means the server is
            # repeating data past the last real page; stop instead of
            # requesting the remaining pages up to max_pages.
            if new_on_page == 0:
                break

            if sleep_seconds > 0:
                time.sleep(sleep_seconds)

    if not all_records:
        raise RuntimeError("未解析到任何表格数据(可能是登录失效/页面结构变化/被重定向到登录页)")

    if not merged_header:
        merged_header = sorted({k for r in all_records for k in r.keys()})

    os.makedirs(os.path.dirname(os.path.abspath(output_csv_path)) or ".", exist_ok=True)
    # utf-8-sig writes a BOM at the start of the file.
    with open(output_csv_path, "w", encoding="utf-8-sig", newline="") as f:
        writer = csv.DictWriter(f, fieldnames=merged_header, extrasaction="ignore")
        writer.writeheader()
        for rec in all_records:
            writer.writerow({k: rec.get(k, "") for k in merged_header})

    return pages_fetched, len(all_records)
|
||||
|
||||
|
||||
def _self_test() -> None:
    """Parse a small built-in HTML sample and verify the extracted values.

    Checks are raised explicitly rather than written as ``assert`` statements
    so that they still run when Python is started with ``-O``.

    Raises:
        AssertionError: if the parser output differs from the expectation.
    """
    html_text = """
<html><body>
<table>
<tr><th>会员名</th><th>卡号</th><th>余额</th></tr>
<tr><td>张三</td><td>NO001</td><td>100</td></tr>
<tr><td>李四</td><td>NO002</td><td>200</td></tr>
</table>
</body></html>
"""

    def expect(label, actual, expected):
        # Explicit comparison: never optimized away, and reports both values.
        if actual != expected:
            raise AssertionError(f"{label}: expected {expected!r}, got {actual!r}")

    header, records = parse_cards_page(html_text)
    expect("header", header, ["会员名", "卡号", "余额"])
    expect("record count", len(records), 2)
    expect("records[0]['卡号']", records[0]["卡号"], "NO001")
    expect("records[1]['余额']", records[1]["余额"], "200")
|
||||
|
||||
|
||||
def main(argv: Optional[Sequence[str]] = None) -> int:
    """Command-line entry point: run the self-test or export the cards to CSV.

    Args:
        argv: argument list; ``None`` lets argparse read ``sys.argv``.

    Returns:
        Process exit code (always 0; failures propagate as exceptions).
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--output", default="H1会员卡.csv")
    # One option per query filter, defaulting to the module-level defaults.
    for filter_name in ("storeId", "search", "type", "expired"):
        parser.add_argument(f"--{filter_name}", default=DEFAULT_PARAMS[filter_name])
    parser.add_argument("--max-pages", type=int, default=200)
    parser.add_argument("--sleep", type=float, default=0.3)
    parser.add_argument("--self-test", action="store_true")
    options = parser.parse_args(argv)

    if options.self_test:
        _self_test()
        print("self-test ok")
        return 0

    query = {
        filter_name: str(getattr(options, filter_name))
        for filter_name in ("type", "expired", "storeId", "search")
    }
    pages_fetched, rows = export_all_cards(
        output_csv_path=options.output,
        params=query,
        max_pages=options.max_pages,
        sleep_seconds=options.sleep,
    )
    print(f"导出完成: pages={pages_fetched}, rows={rows}, output={os.path.abspath(options.output)}")
    return 0
|
||||
|
||||
|
||||
# Script entry point: main()'s return value becomes the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())
|
||||
@@ -0,0 +1,424 @@
|
||||
"""
|
||||
H1车店系统 - 会员卡信息导出
|
||||
从 https://scrm.h1cd.com/admin/members/cards.html 导出会员卡信息
|
||||
注意:脚本解析HTML表格,导出的原始数据格式不规范,需要清洗处理
|
||||
"""
|
||||
|
||||
import requests
|
||||
import pandas as pd
|
||||
from bs4 import BeautifulSoup
|
||||
import os
|
||||
import re
|
||||
import time
|
||||
import json
|
||||
from datetime import datetime
|
||||
|
||||
# ===================== [Configuration] =====================
# Cookies (update these to match a current login session).
# SECURITY NOTE(review): these look like real login/session values committed to
# the repository -- presumably they should be rotated and loaded from the
# environment or an untracked file instead; confirm with the account owner.
COOKIES = {
    'showSmsActivity': '1',
    'showEasyMoney': '1',
    'LOGIN_URL': 'https%3A%2F%2Fscrm.h1cd.com%2Flogin-h1cd.html',
    'adminpd': 'jVISiRrtcJplFhLoCuUIxK9XG5ekdfwzq%2B0y482ZKxE%3D',
    'adminun': '15224781773',
    'uid': '10291',
    'PHPSESSID': 'nbn58laakng0rv5iqln82a6qpu',
}

# Browser-like request headers sent with every page request.
HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
    'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
    'Connection': 'keep-alive',
    'Referer': 'https://scrm.h1cd.com/admin/members/cards.html',
    'Sec-Fetch-Dest': 'iframe',
    'Sec-Fetch-Mode': 'navigate',
    'Sec-Fetch-Site': 'same-origin',
    'Sec-Fetch-User': '?1',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/146.0.0.0 Safari/537.36 Edg/146.0.0.0',
    'sec-ch-ua': '"Chromium";v="146", "Not-A.Brand";v="24", "Microsoft Edge";v="146"',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': '"Windows"',
}

# Query parameters sent with every card-list request.
PARAMS = {
    'type': '',
    'expired': '',
    'storeId': '0',
    'search': '',
}

# Output directory for the exported file.
# NOTE(review): machine-specific absolute Windows path; created on demand by main().
OUTPUT_DIR = r"D:\Idea Project\F6+宜搭+其它(1)\张阳脚本\文件输出"

# ===========================================================
|
||||
|
||||
|
||||
def get_page_html(page_num, cookies, params):
    """Fetch the HTML of one card-list page.

    Args:
        page_num: 1-based page number; page 1 uses ``cards.html``, other
            pages use ``cards_<n>.html``.
        cookies: cookies sent with the request.
        params: query-string parameters.

    Returns:
        The page HTML decoded as UTF-8, or ``None`` when the request fails,
        the server answers with an error status, or the response looks like
        the login page. A message is printed in every ``None`` case.
    """
    if page_num == 1:
        url = "https://scrm.h1cd.com/admin/members/cards.html"
    else:
        url = f"https://scrm.h1cd.com/admin/members/cards_{page_num}.html"

    try:
        r = requests.get(url, headers=HEADERS, cookies=cookies, params=params, timeout=30)
        r.raise_for_status()
    except requests.RequestException as e:
        print(f" ❌ 第{page_num}页请求失败: {str(e)}")
        return None

    # The encoding must be set BEFORE r.text is read: the login check below
    # searches for Chinese text, which is only found in correctly decoded
    # content.
    r.encoding = 'utf-8'

    # Detect a redirect to the login page (expired cookie).
    if 'login' in r.url.lower() or '登录' in r.text[:2000]:
        print(f" ⚠️ 第{page_num}页检测到跳转登录,Cookie可能已失效。")
        return None

    return r.text
|
||||
|
||||
|
||||
def parse_cards_table(html):
    """Parse the member-card table of a page into a header and raw rows.

    The table with CSS class ``table`` is preferred, otherwise the first
    table in the document is used. Header names come from ``<thead>``, or
    from the ``<th>`` cells of the first row when there is no usable
    ``<thead>``. Rows that contain a ``<th>``, have fewer than three ``<td>``
    cells, or are entirely blank are skipped.

    Text fragments inside one cell are joined with ``|`` so that multi-line
    cells can be split again later.

    Returns:
        ``(header, data_rows)``; ``([], [])`` when the page has no table.
    """
    soup = BeautifulSoup(html, 'html.parser')
    table = soup.find('table', class_='table') or soup.find('table')
    if not table:
        return [], []

    # Header: <thead> first, then the first row's <th> cells.
    thead = table.find('thead')
    header = [th.get_text(strip=True) for th in thead.find_all('th')] if thead else []
    if not header:
        first_row = table.find('tr')
        if first_row:
            header = [th.get_text(strip=True) for th in first_row.find_all('th')]

    # Data rows: restricted to <tbody> when the table has one.
    tbody = table.find('tbody')
    candidate_rows = (tbody if tbody else table).find_all('tr')

    data_rows = []
    for tr in candidate_rows:
        if tr.find('th'):
            continue  # header row
        cells = tr.find_all('td')
        if len(cells) < 3:
            continue
        values = [
            re.sub(r'\s+', ' ', td.get_text(separator='|', strip=True)).strip()
            for td in cells
        ]
        if any(values):
            data_rows.append(values)

    return header, data_rows
|
||||
|
||||
|
||||
# Mainland-China mobile number: 11 digits starting with 13-19.
_CARD_MOBILE_RE = re.compile(r'1[3-9]\d{9}')
# First number in a string, keeping a leading minus sign and decimals.
_CARD_NUMBER_RE = re.compile(r'-?(?:\d+(?:\.\d*)?|\.\d+)')
# Button captions that leak into cell text.
_CARD_BUTTON_RE = re.compile(r'(查看详情|编辑|删除|充值记录|消费记录|详情)')


def clean_card_record(row_dict, header):
    """Clean one member-card record scraped from the HTML table.

    Args:
        row_dict: the record, either a dict keyed by header name or a
            sequence of cell values in header order.
        header: list of column names.

    Returns:
        A dict of cleaned values:

        * operation / checkbox columns are blanked;
        * the name column is split into ``客户名称`` and ``手机号``;
        * a dedicated phone column is reduced to the mobile number it contains;
        * balance / recharge / consumption columns are reduced to their first
          number (thousands separators removed, minus sign kept);
        * other columns get ``|`` separators, extra whitespace and button
          captions removed.

        A phone number that was already extracted is never replaced by an
        empty value from another column.
    """
    cleaned = {}

    # Keywords that identify each logical column by substring match.
    col_mappings = {
        'name_col': ['会员名', '姓名', '会员名称', '客户', '车主'],
        'phone_col': ['手机', '电话', '联系电话'],
        'card_no_col': ['卡号', '会员卡号', '卡编号'],
        'card_type_col': ['卡类型', '卡名称', '类型'],
        'balance_col': ['余额', '储值余额', '可用余额'],
        'total_recharge_col': ['充值', '累计充值', '总充值', '充值金额'],
        'total_consume_col': ['消费', '累计消费', '总消费', '消费金额'],
        'status_col': ['状态', '卡状态'],
        'create_time_col': ['开卡时间', '创建时间', '注册时间'],
        'expire_time_col': ['到期时间', '有效期', '过期时间'],
        'store_col': ['门店', '所属门店', '门店名称'],
        'level_col': ['等级', '会员等级', '会员级别'],
    }

    # For every logical column: index of the first header matched by the
    # earliest keyword that matches anything.
    col_index = {}
    for key, keywords in col_mappings.items():
        for kw in keywords:
            idx = next((i for i, h in enumerate(header) if kw in h), None)
            if idx is not None:
                col_index[key] = idx
                break

    name_idx = col_index.get('name_col')
    phone_idx = col_index.get('phone_col')
    numeric_idx = {
        col_index.get('balance_col'),
        col_index.get('total_recharge_col'),
        col_index.get('total_consume_col'),
    }

    def store_phone(phone):
        # Keep a phone found earlier rather than overwriting it with ''.
        if phone or '手机号' not in cleaned:
            cleaned['手机号'] = phone

    for i, h in enumerate(header):
        if isinstance(row_dict, dict):
            value = row_dict.get(h, '')
        else:
            value = row_dict[i] if i < len(row_dict) else ''

        # Operation column (button captions) and checkbox column carry no data.
        if '操作' in h or '选择' in h or '勾选' in h:
            cleaned[h] = ''
            continue

        # Name column, possibly with the phone number merged in.
        if i == name_idx:
            name, phone = '', ''
            if '|' in value:
                for part in (p.strip() for p in value.split('|')):
                    phone_match = _CARD_MOBILE_RE.search(part)
                    if phone_match:
                        phone = phone_match.group()
                    elif part and not re.match(r'^\d{11}$', part):
                        name = part if not name else name + part
                    elif re.match(r'^\d{11}$', part):
                        phone = part
            else:
                phone_match = _CARD_MOBILE_RE.search(value)
                if phone_match:
                    phone = phone_match.group()
                    name = value.replace(phone, '').strip()
                else:
                    name = value.strip()

            cleaned['客户名称'] = name
            store_phone(phone)
            continue

        # Dedicated phone column.
        if i == phone_idx:
            phone_match = _CARD_MOBILE_RE.search(value)
            store_phone(phone_match.group() if phone_match else value)
            continue

        # Numeric columns: keep the first number, including its sign.
        if i in numeric_idx:
            num_match = _CARD_NUMBER_RE.search(value.replace(',', ''))
            cleaned[h] = num_match.group() if num_match else value
            continue

        # Everything else: drop separators, squeeze whitespace, remove buttons.
        clean_val = re.sub(r'\s+', ' ', value.replace('|', ' ').strip())
        cleaned[h] = _CARD_BUTTON_RE.sub('', clean_val).strip()

    return cleaned
|
||||
|
||||
|
||||
def normalize_dataframe(df):
    """Normalize the scraped card table as a whole.

    Steps, in order:

    1. drop fully duplicated rows (a summary is printed when any are removed);
    2. for columns whose name contains ``名称`` and where more than 30% of the
       values contain both a mobile number and a 2-4 character Chinese name,
       derive ``客户名称`` / ``手机号`` columns (only if not already present);
    3. reduce amount columns (name contains 余额/充值/消费/金额) to their first
       number, keeping a leading minus sign;
    4. drop operation / checkbox columns;
    5. convert every column to ``str``, strip button captions and turn the
       literal strings ``nan`` / ``None`` into ``''``.

    Args:
        df: DataFrame of raw scraped values.

    Returns:
        A new, cleaned DataFrame; the frame passed in is not modified.
    """
    mobile_re = re.compile(r'1[3-9]\d{9}')
    name_re = re.compile(r'[\u4e00-\u9fa5]{2,4}')
    number_re = re.compile(r'-?(?:\d+(?:\.\d*)?|\.\d+)')
    button_re = re.compile(r'(查看详情|编辑|删除|充值记录|消费记录|详情|迁移)')

    def first_mobile(value):
        """Return the first mobile number in ``value`` or ''."""
        found = mobile_re.search(str(value))
        return found.group() if found else ''

    def first_number(text):
        """Return the first (possibly negative) number in ``text``, else ``text``."""
        found = number_re.search(text.replace(',', ''))
        return found.group() if found else text

    # 1. Remove fully duplicated rows.
    before_count = len(df)
    df = df.drop_duplicates()
    after_count = len(df)
    if before_count != after_count:
        print(f" 🔍 去重:{before_count} 条 → {after_count} 条(去除 {before_count - after_count} 条重复)")

    # 2. Split merged "name|phone" columns. Iterate over a snapshot because
    #    new columns are added inside the loop.
    for col in list(df.columns):
        as_text = df[col].astype(str)
        phone_ratio = as_text.apply(lambda x: bool(mobile_re.search(x))).mean()
        name_ratio = as_text.apply(lambda x: bool(name_re.search(x))).mean()

        if phone_ratio > 0.3 and name_ratio > 0.3 and '名称' in col:
            if '客户名称' not in df.columns:
                df['客户名称'] = df[col].apply(
                    lambda x: mobile_re.sub('', str(x)).replace('|', '').strip()
                )
            if '手机号' not in df.columns:
                df['手机号'] = df[col].apply(first_mobile)

    # 3. Amount columns: keep only the number (sign preserved).
    for col in list(df.columns):
        if any(kw in col for kw in ['余额', '充值', '消费', '金额']):
            df[col] = df[col].astype(str).apply(first_number)

    # 4. Operation / checkbox columns carry no data.
    droppable = [col for col in df.columns if '操作' in col or '选择' in col or '勾选' in col]
    if droppable:
        df = df.drop(columns=droppable)

    # 5. Strip leftover button captions and stringified missing values.
    for col in list(df.columns):
        df[col] = df[col].astype(str).apply(lambda x: button_re.sub('', str(x)).strip())
        df[col] = df[col].replace('nan', '')
        df[col] = df[col].replace('None', '')

    return df
|
||||
|
||||
|
||||
def get_max_page(html):
    """Work out the number of result pages from the first page's HTML.

    Tried in order: a "共X页" total in the page text, a "页 1/X" indicator,
    and finally the highest number found in ``cards_<n>.html`` pagination
    links. Returns 1 for empty input or when nothing matches.
    """
    if not html:
        return 1

    soup = BeautifulSoup(html, 'html.parser')
    page_text = soup.get_text()

    # Textual page counters, most specific first.
    for pattern in (r'共\s*(\d+)\s*页', r'页\s*1/(\d+)'):
        found = re.search(pattern, page_text)
        if found:
            return int(found.group(1))

    # Fall back to the pagination links; never report fewer than one page.
    link_matches = (
        re.search(r'cards_(\d+)\.html', anchor.get('href', ''))
        for anchor in soup.find_all('a', href=re.compile(r'cards_\d+\.html'))
    )
    return max([1, *(int(m.group(1)) for m in link_matches if m)])
|
||||
|
||||
|
||||
def main():
    """Crawl every card-list page, normalize the data and save it to Excel.

    The first page is fetched once and reused both to determine the page
    count and as page 1 of the crawl. Pages that cannot be fetched or parsed
    are reported and skipped. If writing the ``.xlsx`` file fails, the data
    is written as CSV next to it instead.
    """
    banner = "=" * 50
    print(banner)
    print("开始爬取 H1系统 会员卡信息...")
    print(f"当前 StoreID: {PARAMS['storeId']}")
    print(banner)

    # Fetch page 1 to learn the total number of pages.
    print("正在获取总页数...")
    first_html = get_page_html(1, COOKIES, PARAMS)
    if not first_html:
        print("❌ 无法获取第一页数据,请检查 Cookie 或网络。")
        return

    max_page = get_max_page(first_html)
    print(f"✅ 成功获取最大页数:{max_page}")

    # Crawl all pages.
    all_data = []
    merged_header = []
    for page in range(1, max_page + 1):
        print(f"正在爬取第 {page}/{max_page} 页...")

        html = first_html if page == 1 else get_page_html(page, COOKIES, PARAMS)
        if not html:
            print(f"❌ 第 {page} 页获取失败,跳过。")
            continue

        header, rows = parse_cards_table(html)
        if not (header or rows):
            print(f"⚠️ 第 {page} 页未解析到表格数据。")
            continue

        # Union of the headers seen so far, in first-seen order.
        # NOTE(review): rows stay positional, so they are only labelled
        # correctly while every page shares the same header -- confirm.
        for column in header:
            if column not in merged_header:
                merged_header.append(column)

        all_data.extend(rows)

        # Pause between requests (not after the last page).
        if page < max_page:
            time.sleep(0.3)

    if not all_data:
        print("\n❌ 未获取到任何数据,请检查 Cookie 或网络。")
        return

    print(f"\n✅ 爬取完成,共获取 {len(all_data)} 条原始记录")

    # Build the DataFrame, padding / truncating rows to the header width.
    if merged_header:
        width = len(merged_header)
        normalized_rows = [(row + [''] * (width - len(row)))[:width] for row in all_data]
        df = pd.DataFrame(normalized_rows, columns=merged_header)
    else:
        df = pd.DataFrame(all_data)

    print(f"📋 原始列名:{list(df.columns)}")
    print("📋 原始数据前3行:")
    print(df.head(3).to_string())

    # Normalize the data.
    print("\n开始数据规范化处理...")
    df = normalize_dataframe(df)

    # Save the result under a timestamped file name.
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    time_str = datetime.now().strftime("%Y%m%d_%H%M%S")
    filepath = os.path.join(OUTPUT_DIR, f"H1会员卡信息_{time_str}.xlsx")

    try:
        df.to_excel(filepath, index=False)
        print(banner)
        print("✅ 导出完成!")
        print(f"📊 最终有效条数:{len(df)}")
        print(f"📁 已保存到:{filepath}")
        print(banner)
    except Exception as e:
        print(f"❌ 保存Excel失败: {e}")
        # Fall back to CSV.
        csv_path = filepath.replace('.xlsx', '.csv')
        df.to_csv(csv_path, index=False, encoding='utf-8-sig')
        print(f"💡 已转为 CSV 保存至:{csv_path}")
|
||||
|
||||
# Run the export only when the file is executed as a script, not on import.
if __name__ == '__main__':
    main()
|
||||
+640
-303
@@ -10,6 +10,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"id": "70a8b0da",
|
||||
"metadata": {
|
||||
"ExecuteTime": {
|
||||
@@ -17,6 +18,64 @@
|
||||
"start_time": "2026-03-25T03:51:31.198595700Z"
|
||||
}
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"正在获取总页数...\n",
|
||||
"总页数:34 页\n",
|
||||
"正在爬取第 1/34 页...\n",
|
||||
"正在爬取第 2/34 页...\n",
|
||||
"正在爬取第 3/34 页...\n",
|
||||
"正在爬取第 4/34 页...\n",
|
||||
"正在爬取第 5/34 页...\n",
|
||||
"正在爬取第 6/34 页...\n",
|
||||
"正在爬取第 7/34 页...\n",
|
||||
"正在爬取第 8/34 页...\n",
|
||||
"正在爬取第 9/34 页...\n",
|
||||
"正在爬取第 10/34 页...\n",
|
||||
"正在爬取第 11/34 页...\n",
|
||||
"正在爬取第 12/34 页...\n",
|
||||
"正在爬取第 13/34 页...\n",
|
||||
"正在爬取第 14/34 页...\n",
|
||||
"正在爬取第 15/34 页...\n",
|
||||
"正在爬取第 16/34 页...\n",
|
||||
"正在爬取第 17/34 页...\n",
|
||||
"正在爬取第 18/34 页...\n",
|
||||
"正在爬取第 19/34 页...\n",
|
||||
"正在爬取第 20/34 页...\n",
|
||||
"正在爬取第 21/34 页...\n",
|
||||
"正在爬取第 22/34 页...\n",
|
||||
"正在爬取第 23/34 页...\n",
|
||||
"正在爬取第 24/34 页...\n",
|
||||
"正在爬取第 25/34 页...\n",
|
||||
"正在爬取第 26/34 页...\n",
|
||||
"正在爬取第 27/34 页...\n",
|
||||
"正在爬取第 28/34 页...\n",
|
||||
"正在爬取第 29/34 页...\n",
|
||||
"正在爬取第 30/34 页...\n",
|
||||
"正在爬取第 31/34 页...\n",
|
||||
"正在爬取第 32/34 页...\n",
|
||||
"正在爬取第 33/34 页...\n",
|
||||
"正在爬取第 34/34 页...\n",
|
||||
"\n",
|
||||
"========== 爬取完成 ==========\n",
|
||||
"总计数据:666 行\n",
|
||||
"\n",
|
||||
"✅ 文件已保存到桌面:\n",
|
||||
"📊 Excel文件:C:\\Users\\hp_z66\\Desktop\\车辆数据_已拆分_20260325_1151531.csv\n",
|
||||
"📄 文本文件:C:\\Users\\hp_z66\\Desktop\\车辆数据_已拆分_20260325_115153.txt\n",
|
||||
"\n",
|
||||
"前5行数据预览:\n",
|
||||
"1 ['1', '豫NA477R', '卢忠厚', '', '', '', '/', '', '118933km', '', '', '消费记录 编辑 迁移 删除']\n",
|
||||
"2 ['2', '豫NF3722', '刘建利', '', '', '', '/', '', '198609km', '', '', '消费记录 编辑 迁移 删除']\n",
|
||||
"3 ['3', '豫N13B58', '石', '15090629992', '', '', '/', '', '22462km', '', '', '消费记录 编辑 迁移 删除']\n",
|
||||
"4 ['4', '京PYB297', '科迪黄青春', '', '', '', '/', '', '119584km', '', '', '消费记录 编辑 迁移 删除']\n",
|
||||
"5 ['5', '豫NN982M', '大众', '', '', '', '/', '', '197504km', '', '', '消费记录 编辑 迁移 删除']\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"import requests\n",
|
||||
"from bs4 import BeautifulSoup\n",
|
||||
@@ -215,66 +274,7 @@
|
||||
" print(\"\\n前5行数据预览:\")\n",
|
||||
" for i, row in enumerate(all_data[:5]):\n",
|
||||
" print(i+1, row)"
|
||||
],
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"正在获取总页数...\n",
|
||||
"总页数:34 页\n",
|
||||
"正在爬取第 1/34 页...\n",
|
||||
"正在爬取第 2/34 页...\n",
|
||||
"正在爬取第 3/34 页...\n",
|
||||
"正在爬取第 4/34 页...\n",
|
||||
"正在爬取第 5/34 页...\n",
|
||||
"正在爬取第 6/34 页...\n",
|
||||
"正在爬取第 7/34 页...\n",
|
||||
"正在爬取第 8/34 页...\n",
|
||||
"正在爬取第 9/34 页...\n",
|
||||
"正在爬取第 10/34 页...\n",
|
||||
"正在爬取第 11/34 页...\n",
|
||||
"正在爬取第 12/34 页...\n",
|
||||
"正在爬取第 13/34 页...\n",
|
||||
"正在爬取第 14/34 页...\n",
|
||||
"正在爬取第 15/34 页...\n",
|
||||
"正在爬取第 16/34 页...\n",
|
||||
"正在爬取第 17/34 页...\n",
|
||||
"正在爬取第 18/34 页...\n",
|
||||
"正在爬取第 19/34 页...\n",
|
||||
"正在爬取第 20/34 页...\n",
|
||||
"正在爬取第 21/34 页...\n",
|
||||
"正在爬取第 22/34 页...\n",
|
||||
"正在爬取第 23/34 页...\n",
|
||||
"正在爬取第 24/34 页...\n",
|
||||
"正在爬取第 25/34 页...\n",
|
||||
"正在爬取第 26/34 页...\n",
|
||||
"正在爬取第 27/34 页...\n",
|
||||
"正在爬取第 28/34 页...\n",
|
||||
"正在爬取第 29/34 页...\n",
|
||||
"正在爬取第 30/34 页...\n",
|
||||
"正在爬取第 31/34 页...\n",
|
||||
"正在爬取第 32/34 页...\n",
|
||||
"正在爬取第 33/34 页...\n",
|
||||
"正在爬取第 34/34 页...\n",
|
||||
"\n",
|
||||
"========== 爬取完成 ==========\n",
|
||||
"总计数据:666 行\n",
|
||||
"\n",
|
||||
"✅ 文件已保存到桌面:\n",
|
||||
"📊 Excel文件:C:\\Users\\hp_z66\\Desktop\\车辆数据_已拆分_20260325_1151531.csv\n",
|
||||
"📄 文本文件:C:\\Users\\hp_z66\\Desktop\\车辆数据_已拆分_20260325_115153.txt\n",
|
||||
"\n",
|
||||
"前5行数据预览:\n",
|
||||
"1 ['1', '豫NA477R', '卢忠厚', '', '', '', '/', '', '118933km', '', '', '消费记录 编辑 迁移 删除']\n",
|
||||
"2 ['2', '豫NF3722', '刘建利', '', '', '', '/', '', '198609km', '', '', '消费记录 编辑 迁移 删除']\n",
|
||||
"3 ['3', '豫N13B58', '石', '15090629992', '', '', '/', '', '22462km', '', '', '消费记录 编辑 迁移 删除']\n",
|
||||
"4 ['4', '京PYB297', '科迪黄青春', '', '', '', '/', '', '119584km', '', '', '消费记录 编辑 迁移 删除']\n",
|
||||
"5 ['5', '豫NN982M', '大众', '', '', '', '/', '', '197504km', '', '', '消费记录 编辑 迁移 删除']\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"execution_count": 1
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
@@ -286,6 +286,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"id": "5392bfc0",
|
||||
"metadata": {
|
||||
"ExecuteTime": {
|
||||
@@ -293,6 +294,67 @@
|
||||
"start_time": "2026-03-25T03:53:18.688209100Z"
|
||||
}
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"==================================================\n",
|
||||
"开始爬取库存数据...\n",
|
||||
"当前 StoreID: 13435\n",
|
||||
"当前 HouseID: 9079\n",
|
||||
"==================================================\n",
|
||||
"✅ 成功获取最大页数:40\n",
|
||||
"正在爬取第 1/40 页...\n",
|
||||
"正在爬取第 2/40 页...\n",
|
||||
"正在爬取第 3/40 页...\n",
|
||||
"正在爬取第 4/40 页...\n",
|
||||
"正在爬取第 5/40 页...\n",
|
||||
"正在爬取第 6/40 页...\n",
|
||||
"正在爬取第 7/40 页...\n",
|
||||
"正在爬取第 8/40 页...\n",
|
||||
"正在爬取第 9/40 页...\n",
|
||||
"正在爬取第 10/40 页...\n",
|
||||
"正在爬取第 11/40 页...\n",
|
||||
"正在爬取第 12/40 页...\n",
|
||||
"正在爬取第 13/40 页...\n",
|
||||
"正在爬取第 14/40 页...\n",
|
||||
"正在爬取第 15/40 页...\n",
|
||||
"正在爬取第 16/40 页...\n",
|
||||
"正在爬取第 17/40 页...\n",
|
||||
"正在爬取第 18/40 页...\n",
|
||||
"正在爬取第 19/40 页...\n",
|
||||
"正在爬取第 20/40 页...\n",
|
||||
"正在爬取第 21/40 页...\n",
|
||||
"正在爬取第 22/40 页...\n",
|
||||
"正在爬取第 23/40 页...\n",
|
||||
"正在爬取第 24/40 页...\n",
|
||||
"正在爬取第 25/40 页...\n",
|
||||
"正在爬取第 26/40 页...\n",
|
||||
"正在爬取第 27/40 页...\n",
|
||||
"正在爬取第 28/40 页...\n",
|
||||
"正在爬取第 29/40 页...\n",
|
||||
"正在爬取第 30/40 页...\n",
|
||||
"正在爬取第 31/40 页...\n",
|
||||
"正在爬取第 32/40 页...\n",
|
||||
"正在爬取第 33/40 页...\n",
|
||||
"正在爬取第 34/40 页...\n",
|
||||
"正在爬取第 35/40 页...\n",
|
||||
"正在爬取第 36/40 页...\n",
|
||||
"正在爬取第 37/40 页...\n",
|
||||
"正在爬取第 38/40 页...\n",
|
||||
"正在爬取第 39/40 页...\n",
|
||||
"正在爬取第 40/40 页...\n",
|
||||
"\n",
|
||||
"🔍 去重完成 (基于列: 配件编码):原始 782 条 → 去重后 782 条\n",
|
||||
"==================================================\n",
|
||||
"✅ 爬取 + 去重 完成!\n",
|
||||
"📊 最终有效条数:782\n",
|
||||
"📁 已保存到桌面:库存数据_13435_去重版1.xlsx\n",
|
||||
"==================================================\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"import requests\n",
|
||||
"import pandas as pd\n",
|
||||
@@ -499,69 +561,7 @@
|
||||
"\n",
|
||||
"if __name__ == '__main__':\n",
|
||||
" main()"
|
||||
],
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"==================================================\n",
|
||||
"开始爬取库存数据...\n",
|
||||
"当前 StoreID: 13435\n",
|
||||
"当前 HouseID: 9079\n",
|
||||
"==================================================\n",
|
||||
"✅ 成功获取最大页数:40\n",
|
||||
"正在爬取第 1/40 页...\n",
|
||||
"正在爬取第 2/40 页...\n",
|
||||
"正在爬取第 3/40 页...\n",
|
||||
"正在爬取第 4/40 页...\n",
|
||||
"正在爬取第 5/40 页...\n",
|
||||
"正在爬取第 6/40 页...\n",
|
||||
"正在爬取第 7/40 页...\n",
|
||||
"正在爬取第 8/40 页...\n",
|
||||
"正在爬取第 9/40 页...\n",
|
||||
"正在爬取第 10/40 页...\n",
|
||||
"正在爬取第 11/40 页...\n",
|
||||
"正在爬取第 12/40 页...\n",
|
||||
"正在爬取第 13/40 页...\n",
|
||||
"正在爬取第 14/40 页...\n",
|
||||
"正在爬取第 15/40 页...\n",
|
||||
"正在爬取第 16/40 页...\n",
|
||||
"正在爬取第 17/40 页...\n",
|
||||
"正在爬取第 18/40 页...\n",
|
||||
"正在爬取第 19/40 页...\n",
|
||||
"正在爬取第 20/40 页...\n",
|
||||
"正在爬取第 21/40 页...\n",
|
||||
"正在爬取第 22/40 页...\n",
|
||||
"正在爬取第 23/40 页...\n",
|
||||
"正在爬取第 24/40 页...\n",
|
||||
"正在爬取第 25/40 页...\n",
|
||||
"正在爬取第 26/40 页...\n",
|
||||
"正在爬取第 27/40 页...\n",
|
||||
"正在爬取第 28/40 页...\n",
|
||||
"正在爬取第 29/40 页...\n",
|
||||
"正在爬取第 30/40 页...\n",
|
||||
"正在爬取第 31/40 页...\n",
|
||||
"正在爬取第 32/40 页...\n",
|
||||
"正在爬取第 33/40 页...\n",
|
||||
"正在爬取第 34/40 页...\n",
|
||||
"正在爬取第 35/40 页...\n",
|
||||
"正在爬取第 36/40 页...\n",
|
||||
"正在爬取第 37/40 页...\n",
|
||||
"正在爬取第 38/40 页...\n",
|
||||
"正在爬取第 39/40 页...\n",
|
||||
"正在爬取第 40/40 页...\n",
|
||||
"\n",
|
||||
"🔍 去重完成 (基于列: 配件编码):原始 782 条 → 去重后 782 条\n",
|
||||
"==================================================\n",
|
||||
"✅ 爬取 + 去重 完成!\n",
|
||||
"📊 最终有效条数:782\n",
|
||||
"📁 已保存到桌面:库存数据_13435_去重版1.xlsx\n",
|
||||
"==================================================\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"execution_count": 2
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
@@ -573,13 +573,191 @@
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"id": "cbd4eeb0a30b3e15",
|
||||
"metadata": {
|
||||
"ExecuteTime": {
|
||||
"end_time": "2026-03-25T03:58:48.443601700Z",
|
||||
"start_time": "2026-03-25T03:56:48.226330400Z"
|
||||
}
|
||||
},
|
||||
"cell_type": "code",
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"🔧 开始导出维修记录...\n",
|
||||
"📄 正在获取第 1 页以分析页数...\n",
|
||||
" 正在请求: https://scrm.h1cd.com/admin/billings/Lists.html\n",
|
||||
"📊 预估总页数: 53\n",
|
||||
"🔄 正在处理第 1/53 页...\n",
|
||||
" ✅ 本页提取 20 条记录\n",
|
||||
"🔄 正在处理第 2/53 页...\n",
|
||||
" 正在请求: https://scrm.h1cd.com/admin/billings/Lists_2.html\n",
|
||||
" ✅ 本页提取 20 条记录\n",
|
||||
"🔄 正在处理第 3/53 页...\n",
|
||||
" 正在请求: https://scrm.h1cd.com/admin/billings/Lists_3.html\n",
|
||||
" ✅ 本页提取 20 条记录\n",
|
||||
"🔄 正在处理第 4/53 页...\n",
|
||||
" 正在请求: https://scrm.h1cd.com/admin/billings/Lists_4.html\n",
|
||||
" ✅ 本页提取 20 条记录\n",
|
||||
"🔄 正在处理第 5/53 页...\n",
|
||||
" 正在请求: https://scrm.h1cd.com/admin/billings/Lists_5.html\n",
|
||||
" ✅ 本页提取 20 条记录\n",
|
||||
"🔄 正在处理第 6/53 页...\n",
|
||||
" 正在请求: https://scrm.h1cd.com/admin/billings/Lists_6.html\n",
|
||||
" ✅ 本页提取 20 条记录\n",
|
||||
"🔄 正在处理第 7/53 页...\n",
|
||||
" 正在请求: https://scrm.h1cd.com/admin/billings/Lists_7.html\n",
|
||||
" ✅ 本页提取 20 条记录\n",
|
||||
"🔄 正在处理第 8/53 页...\n",
|
||||
" 正在请求: https://scrm.h1cd.com/admin/billings/Lists_8.html\n",
|
||||
" ✅ 本页提取 20 条记录\n",
|
||||
"🔄 正在处理第 9/53 页...\n",
|
||||
" 正在请求: https://scrm.h1cd.com/admin/billings/Lists_9.html\n",
|
||||
" ✅ 本页提取 20 条记录\n",
|
||||
"🔄 正在处理第 10/53 页...\n",
|
||||
" 正在请求: https://scrm.h1cd.com/admin/billings/Lists_10.html\n",
|
||||
" ✅ 本页提取 20 条记录\n",
|
||||
"🔄 正在处理第 11/53 页...\n",
|
||||
" 正在请求: https://scrm.h1cd.com/admin/billings/Lists_11.html\n",
|
||||
" ✅ 本页提取 20 条记录\n",
|
||||
"🔄 正在处理第 12/53 页...\n",
|
||||
" 正在请求: https://scrm.h1cd.com/admin/billings/Lists_12.html\n",
|
||||
" ✅ 本页提取 20 条记录\n",
|
||||
"🔄 正在处理第 13/53 页...\n",
|
||||
" 正在请求: https://scrm.h1cd.com/admin/billings/Lists_13.html\n",
|
||||
" ✅ 本页提取 20 条记录\n",
|
||||
"🔄 正在处理第 14/53 页...\n",
|
||||
" 正在请求: https://scrm.h1cd.com/admin/billings/Lists_14.html\n",
|
||||
" ✅ 本页提取 20 条记录\n",
|
||||
"🔄 正在处理第 15/53 页...\n",
|
||||
" 正在请求: https://scrm.h1cd.com/admin/billings/Lists_15.html\n",
|
||||
" ✅ 本页提取 20 条记录\n",
|
||||
"🔄 正在处理第 16/53 页...\n",
|
||||
" 正在请求: https://scrm.h1cd.com/admin/billings/Lists_16.html\n",
|
||||
" ✅ 本页提取 20 条记录\n",
|
||||
"🔄 正在处理第 17/53 页...\n",
|
||||
" 正在请求: https://scrm.h1cd.com/admin/billings/Lists_17.html\n",
|
||||
" ✅ 本页提取 20 条记录\n",
|
||||
"🔄 正在处理第 18/53 页...\n",
|
||||
" 正在请求: https://scrm.h1cd.com/admin/billings/Lists_18.html\n",
|
||||
" ✅ 本页提取 20 条记录\n",
|
||||
"🔄 正在处理第 19/53 页...\n",
|
||||
" 正在请求: https://scrm.h1cd.com/admin/billings/Lists_19.html\n",
|
||||
" ✅ 本页提取 20 条记录\n",
|
||||
"🔄 正在处理第 20/53 页...\n",
|
||||
" 正在请求: https://scrm.h1cd.com/admin/billings/Lists_20.html\n",
|
||||
" ✅ 本页提取 20 条记录\n",
|
||||
"🔄 正在处理第 21/53 页...\n",
|
||||
" 正在请求: https://scrm.h1cd.com/admin/billings/Lists_21.html\n",
|
||||
" ✅ 本页提取 20 条记录\n",
|
||||
"🔄 正在处理第 22/53 页...\n",
|
||||
" 正在请求: https://scrm.h1cd.com/admin/billings/Lists_22.html\n",
|
||||
" ✅ 本页提取 20 条记录\n",
|
||||
"🔄 正在处理第 23/53 页...\n",
|
||||
" 正在请求: https://scrm.h1cd.com/admin/billings/Lists_23.html\n",
|
||||
" ✅ 本页提取 20 条记录\n",
|
||||
"🔄 正在处理第 24/53 页...\n",
|
||||
" 正在请求: https://scrm.h1cd.com/admin/billings/Lists_24.html\n",
|
||||
" ✅ 本页提取 20 条记录\n",
|
||||
"🔄 正在处理第 25/53 页...\n",
|
||||
" 正在请求: https://scrm.h1cd.com/admin/billings/Lists_25.html\n",
|
||||
" ✅ 本页提取 20 条记录\n",
|
||||
"🔄 正在处理第 26/53 页...\n",
|
||||
" 正在请求: https://scrm.h1cd.com/admin/billings/Lists_26.html\n",
|
||||
" ✅ 本页提取 20 条记录\n",
|
||||
"🔄 正在处理第 27/53 页...\n",
|
||||
" 正在请求: https://scrm.h1cd.com/admin/billings/Lists_27.html\n",
|
||||
" ✅ 本页提取 20 条记录\n",
|
||||
"🔄 正在处理第 28/53 页...\n",
|
||||
" 正在请求: https://scrm.h1cd.com/admin/billings/Lists_28.html\n",
|
||||
" ✅ 本页提取 20 条记录\n",
|
||||
"🔄 正在处理第 29/53 页...\n",
|
||||
" 正在请求: https://scrm.h1cd.com/admin/billings/Lists_29.html\n",
|
||||
" ✅ 本页提取 20 条记录\n",
|
||||
"🔄 正在处理第 30/53 页...\n",
|
||||
" 正在请求: https://scrm.h1cd.com/admin/billings/Lists_30.html\n",
|
||||
" ✅ 本页提取 20 条记录\n",
|
||||
"🔄 正在处理第 31/53 页...\n",
|
||||
" 正在请求: https://scrm.h1cd.com/admin/billings/Lists_31.html\n",
|
||||
" ✅ 本页提取 20 条记录\n",
|
||||
"🔄 正在处理第 32/53 页...\n",
|
||||
" 正在请求: https://scrm.h1cd.com/admin/billings/Lists_32.html\n",
|
||||
" ✅ 本页提取 20 条记录\n",
|
||||
"🔄 正在处理第 33/53 页...\n",
|
||||
" 正在请求: https://scrm.h1cd.com/admin/billings/Lists_33.html\n",
|
||||
" ✅ 本页提取 20 条记录\n",
|
||||
"🔄 正在处理第 34/53 页...\n",
|
||||
" 正在请求: https://scrm.h1cd.com/admin/billings/Lists_34.html\n",
|
||||
" ✅ 本页提取 20 条记录\n",
|
||||
"🔄 正在处理第 35/53 页...\n",
|
||||
" 正在请求: https://scrm.h1cd.com/admin/billings/Lists_35.html\n",
|
||||
" ✅ 本页提取 20 条记录\n",
|
||||
"🔄 正在处理第 36/53 页...\n",
|
||||
" 正在请求: https://scrm.h1cd.com/admin/billings/Lists_36.html\n",
|
||||
" ✅ 本页提取 20 条记录\n",
|
||||
"🔄 正在处理第 37/53 页...\n",
|
||||
" 正在请求: https://scrm.h1cd.com/admin/billings/Lists_37.html\n",
|
||||
" ✅ 本页提取 20 条记录\n",
|
||||
"🔄 正在处理第 38/53 页...\n",
|
||||
" 正在请求: https://scrm.h1cd.com/admin/billings/Lists_38.html\n",
|
||||
" ✅ 本页提取 20 条记录\n",
|
||||
"🔄 正在处理第 39/53 页...\n",
|
||||
" 正在请求: https://scrm.h1cd.com/admin/billings/Lists_39.html\n",
|
||||
" ✅ 本页提取 20 条记录\n",
|
||||
"🔄 正在处理第 40/53 页...\n",
|
||||
" 正在请求: https://scrm.h1cd.com/admin/billings/Lists_40.html\n",
|
||||
" ✅ 本页提取 20 条记录\n",
|
||||
"🔄 正在处理第 41/53 页...\n",
|
||||
" 正在请求: https://scrm.h1cd.com/admin/billings/Lists_41.html\n",
|
||||
" ✅ 本页提取 20 条记录\n",
|
||||
"🔄 正在处理第 42/53 页...\n",
|
||||
" 正在请求: https://scrm.h1cd.com/admin/billings/Lists_42.html\n",
|
||||
" ✅ 本页提取 20 条记录\n",
|
||||
"🔄 正在处理第 43/53 页...\n",
|
||||
" 正在请求: https://scrm.h1cd.com/admin/billings/Lists_43.html\n",
|
||||
" ✅ 本页提取 20 条记录\n",
|
||||
"🔄 正在处理第 44/53 页...\n",
|
||||
" 正在请求: https://scrm.h1cd.com/admin/billings/Lists_44.html\n",
|
||||
" ✅ 本页提取 20 条记录\n",
|
||||
"🔄 正在处理第 45/53 页...\n",
|
||||
" 正在请求: https://scrm.h1cd.com/admin/billings/Lists_45.html\n",
|
||||
" ✅ 本页提取 20 条记录\n",
|
||||
"🔄 正在处理第 46/53 页...\n",
|
||||
" 正在请求: https://scrm.h1cd.com/admin/billings/Lists_46.html\n",
|
||||
" ✅ 本页提取 20 条记录\n",
|
||||
"🔄 正在处理第 47/53 页...\n",
|
||||
" 正在请求: https://scrm.h1cd.com/admin/billings/Lists_47.html\n",
|
||||
" ✅ 本页提取 20 条记录\n",
|
||||
"🔄 正在处理第 48/53 页...\n",
|
||||
" 正在请求: https://scrm.h1cd.com/admin/billings/Lists_48.html\n",
|
||||
" ✅ 本页提取 20 条记录\n",
|
||||
"🔄 正在处理第 49/53 页...\n",
|
||||
" 正在请求: https://scrm.h1cd.com/admin/billings/Lists_49.html\n",
|
||||
" ✅ 本页提取 20 条记录\n",
|
||||
"🔄 正在处理第 50/53 页...\n",
|
||||
" 正在请求: https://scrm.h1cd.com/admin/billings/Lists_50.html\n",
|
||||
" ✅ 本页提取 20 条记录\n",
|
||||
"🔄 正在处理第 51/53 页...\n",
|
||||
" 正在请求: https://scrm.h1cd.com/admin/billings/Lists_51.html\n",
|
||||
" ✅ 本页提取 20 条记录\n",
|
||||
"🔄 正在处理第 52/53 页...\n",
|
||||
" 正在请求: https://scrm.h1cd.com/admin/billings/Lists_52.html\n",
|
||||
" ✅ 本页提取 20 条记录\n",
|
||||
"🔄 正在处理第 53/53 页...\n",
|
||||
" 正在请求: https://scrm.h1cd.com/admin/billings/Lists_53.html\n",
|
||||
" ✅ 本页提取 8 条记录\n",
|
||||
"\n",
|
||||
"==============================\n",
|
||||
"✅ 导出成功!\n",
|
||||
"📁 文件路径: D:\\Idea Project\\F6+宜搭+其它(1)\\张阳脚本\\文件输出\\维修记录_完美导出版.xlsx\n",
|
||||
"📈 总记录数: 1048\n",
|
||||
"==============================\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"import requests\n",
|
||||
"import pandas as pd\n",
|
||||
@@ -885,185 +1063,344 @@
|
||||
" print(\"💡 请运行以下命令安装: pip install \" + \" \".join(missing))\n",
|
||||
" else:\n",
|
||||
" main()"
|
||||
],
|
||||
"id": "cbd4eeb0a30b3e15",
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"🔧 开始导出维修记录...\n",
|
||||
"📄 正在获取第 1 页以分析页数...\n",
|
||||
" 正在请求: https://scrm.h1cd.com/admin/billings/Lists.html\n",
|
||||
"📊 预估总页数: 53\n",
|
||||
"🔄 正在处理第 1/53 页...\n",
|
||||
" ✅ 本页提取 20 条记录\n",
|
||||
"🔄 正在处理第 2/53 页...\n",
|
||||
" 正在请求: https://scrm.h1cd.com/admin/billings/Lists_2.html\n",
|
||||
" ✅ 本页提取 20 条记录\n",
|
||||
"🔄 正在处理第 3/53 页...\n",
|
||||
" 正在请求: https://scrm.h1cd.com/admin/billings/Lists_3.html\n",
|
||||
" ✅ 本页提取 20 条记录\n",
|
||||
"🔄 正在处理第 4/53 页...\n",
|
||||
" 正在请求: https://scrm.h1cd.com/admin/billings/Lists_4.html\n",
|
||||
" ✅ 本页提取 20 条记录\n",
|
||||
"🔄 正在处理第 5/53 页...\n",
|
||||
" 正在请求: https://scrm.h1cd.com/admin/billings/Lists_5.html\n",
|
||||
" ✅ 本页提取 20 条记录\n",
|
||||
"🔄 正在处理第 6/53 页...\n",
|
||||
" 正在请求: https://scrm.h1cd.com/admin/billings/Lists_6.html\n",
|
||||
" ✅ 本页提取 20 条记录\n",
|
||||
"🔄 正在处理第 7/53 页...\n",
|
||||
" 正在请求: https://scrm.h1cd.com/admin/billings/Lists_7.html\n",
|
||||
" ✅ 本页提取 20 条记录\n",
|
||||
"🔄 正在处理第 8/53 页...\n",
|
||||
" 正在请求: https://scrm.h1cd.com/admin/billings/Lists_8.html\n",
|
||||
" ✅ 本页提取 20 条记录\n",
|
||||
"🔄 正在处理第 9/53 页...\n",
|
||||
" 正在请求: https://scrm.h1cd.com/admin/billings/Lists_9.html\n",
|
||||
" ✅ 本页提取 20 条记录\n",
|
||||
"🔄 正在处理第 10/53 页...\n",
|
||||
" 正在请求: https://scrm.h1cd.com/admin/billings/Lists_10.html\n",
|
||||
" ✅ 本页提取 20 条记录\n",
|
||||
"🔄 正在处理第 11/53 页...\n",
|
||||
" 正在请求: https://scrm.h1cd.com/admin/billings/Lists_11.html\n",
|
||||
" ✅ 本页提取 20 条记录\n",
|
||||
"🔄 正在处理第 12/53 页...\n",
|
||||
" 正在请求: https://scrm.h1cd.com/admin/billings/Lists_12.html\n",
|
||||
" ✅ 本页提取 20 条记录\n",
|
||||
"🔄 正在处理第 13/53 页...\n",
|
||||
" 正在请求: https://scrm.h1cd.com/admin/billings/Lists_13.html\n",
|
||||
" ✅ 本页提取 20 条记录\n",
|
||||
"🔄 正在处理第 14/53 页...\n",
|
||||
" 正在请求: https://scrm.h1cd.com/admin/billings/Lists_14.html\n",
|
||||
" ✅ 本页提取 20 条记录\n",
|
||||
"🔄 正在处理第 15/53 页...\n",
|
||||
" 正在请求: https://scrm.h1cd.com/admin/billings/Lists_15.html\n",
|
||||
" ✅ 本页提取 20 条记录\n",
|
||||
"🔄 正在处理第 16/53 页...\n",
|
||||
" 正在请求: https://scrm.h1cd.com/admin/billings/Lists_16.html\n",
|
||||
" ✅ 本页提取 20 条记录\n",
|
||||
"🔄 正在处理第 17/53 页...\n",
|
||||
" 正在请求: https://scrm.h1cd.com/admin/billings/Lists_17.html\n",
|
||||
" ✅ 本页提取 20 条记录\n",
|
||||
"🔄 正在处理第 18/53 页...\n",
|
||||
" 正在请求: https://scrm.h1cd.com/admin/billings/Lists_18.html\n",
|
||||
" ✅ 本页提取 20 条记录\n",
|
||||
"🔄 正在处理第 19/53 页...\n",
|
||||
" 正在请求: https://scrm.h1cd.com/admin/billings/Lists_19.html\n",
|
||||
" ✅ 本页提取 20 条记录\n",
|
||||
"🔄 正在处理第 20/53 页...\n",
|
||||
" 正在请求: https://scrm.h1cd.com/admin/billings/Lists_20.html\n",
|
||||
" ✅ 本页提取 20 条记录\n",
|
||||
"🔄 正在处理第 21/53 页...\n",
|
||||
" 正在请求: https://scrm.h1cd.com/admin/billings/Lists_21.html\n",
|
||||
" ✅ 本页提取 20 条记录\n",
|
||||
"🔄 正在处理第 22/53 页...\n",
|
||||
" 正在请求: https://scrm.h1cd.com/admin/billings/Lists_22.html\n",
|
||||
" ✅ 本页提取 20 条记录\n",
|
||||
"🔄 正在处理第 23/53 页...\n",
|
||||
" 正在请求: https://scrm.h1cd.com/admin/billings/Lists_23.html\n",
|
||||
" ✅ 本页提取 20 条记录\n",
|
||||
"🔄 正在处理第 24/53 页...\n",
|
||||
" 正在请求: https://scrm.h1cd.com/admin/billings/Lists_24.html\n",
|
||||
" ✅ 本页提取 20 条记录\n",
|
||||
"🔄 正在处理第 25/53 页...\n",
|
||||
" 正在请求: https://scrm.h1cd.com/admin/billings/Lists_25.html\n",
|
||||
" ✅ 本页提取 20 条记录\n",
|
||||
"🔄 正在处理第 26/53 页...\n",
|
||||
" 正在请求: https://scrm.h1cd.com/admin/billings/Lists_26.html\n",
|
||||
" ✅ 本页提取 20 条记录\n",
|
||||
"🔄 正在处理第 27/53 页...\n",
|
||||
" 正在请求: https://scrm.h1cd.com/admin/billings/Lists_27.html\n",
|
||||
" ✅ 本页提取 20 条记录\n",
|
||||
"🔄 正在处理第 28/53 页...\n",
|
||||
" 正在请求: https://scrm.h1cd.com/admin/billings/Lists_28.html\n",
|
||||
" ✅ 本页提取 20 条记录\n",
|
||||
"🔄 正在处理第 29/53 页...\n",
|
||||
" 正在请求: https://scrm.h1cd.com/admin/billings/Lists_29.html\n",
|
||||
" ✅ 本页提取 20 条记录\n",
|
||||
"🔄 正在处理第 30/53 页...\n",
|
||||
" 正在请求: https://scrm.h1cd.com/admin/billings/Lists_30.html\n",
|
||||
" ✅ 本页提取 20 条记录\n",
|
||||
"🔄 正在处理第 31/53 页...\n",
|
||||
" 正在请求: https://scrm.h1cd.com/admin/billings/Lists_31.html\n",
|
||||
" ✅ 本页提取 20 条记录\n",
|
||||
"🔄 正在处理第 32/53 页...\n",
|
||||
" 正在请求: https://scrm.h1cd.com/admin/billings/Lists_32.html\n",
|
||||
" ✅ 本页提取 20 条记录\n",
|
||||
"🔄 正在处理第 33/53 页...\n",
|
||||
" 正在请求: https://scrm.h1cd.com/admin/billings/Lists_33.html\n",
|
||||
" ✅ 本页提取 20 条记录\n",
|
||||
"🔄 正在处理第 34/53 页...\n",
|
||||
" 正在请求: https://scrm.h1cd.com/admin/billings/Lists_34.html\n",
|
||||
" ✅ 本页提取 20 条记录\n",
|
||||
"🔄 正在处理第 35/53 页...\n",
|
||||
" 正在请求: https://scrm.h1cd.com/admin/billings/Lists_35.html\n",
|
||||
" ✅ 本页提取 20 条记录\n",
|
||||
"🔄 正在处理第 36/53 页...\n",
|
||||
" 正在请求: https://scrm.h1cd.com/admin/billings/Lists_36.html\n",
|
||||
" ✅ 本页提取 20 条记录\n",
|
||||
"🔄 正在处理第 37/53 页...\n",
|
||||
" 正在请求: https://scrm.h1cd.com/admin/billings/Lists_37.html\n",
|
||||
" ✅ 本页提取 20 条记录\n",
|
||||
"🔄 正在处理第 38/53 页...\n",
|
||||
" 正在请求: https://scrm.h1cd.com/admin/billings/Lists_38.html\n",
|
||||
" ✅ 本页提取 20 条记录\n",
|
||||
"🔄 正在处理第 39/53 页...\n",
|
||||
" 正在请求: https://scrm.h1cd.com/admin/billings/Lists_39.html\n",
|
||||
" ✅ 本页提取 20 条记录\n",
|
||||
"🔄 正在处理第 40/53 页...\n",
|
||||
" 正在请求: https://scrm.h1cd.com/admin/billings/Lists_40.html\n",
|
||||
" ✅ 本页提取 20 条记录\n",
|
||||
"🔄 正在处理第 41/53 页...\n",
|
||||
" 正在请求: https://scrm.h1cd.com/admin/billings/Lists_41.html\n",
|
||||
" ✅ 本页提取 20 条记录\n",
|
||||
"🔄 正在处理第 42/53 页...\n",
|
||||
" 正在请求: https://scrm.h1cd.com/admin/billings/Lists_42.html\n",
|
||||
" ✅ 本页提取 20 条记录\n",
|
||||
"🔄 正在处理第 43/53 页...\n",
|
||||
" 正在请求: https://scrm.h1cd.com/admin/billings/Lists_43.html\n",
|
||||
" ✅ 本页提取 20 条记录\n",
|
||||
"🔄 正在处理第 44/53 页...\n",
|
||||
" 正在请求: https://scrm.h1cd.com/admin/billings/Lists_44.html\n",
|
||||
" ✅ 本页提取 20 条记录\n",
|
||||
"🔄 正在处理第 45/53 页...\n",
|
||||
" 正在请求: https://scrm.h1cd.com/admin/billings/Lists_45.html\n",
|
||||
" ✅ 本页提取 20 条记录\n",
|
||||
"🔄 正在处理第 46/53 页...\n",
|
||||
" 正在请求: https://scrm.h1cd.com/admin/billings/Lists_46.html\n",
|
||||
" ✅ 本页提取 20 条记录\n",
|
||||
"🔄 正在处理第 47/53 页...\n",
|
||||
" 正在请求: https://scrm.h1cd.com/admin/billings/Lists_47.html\n",
|
||||
" ✅ 本页提取 20 条记录\n",
|
||||
"🔄 正在处理第 48/53 页...\n",
|
||||
" 正在请求: https://scrm.h1cd.com/admin/billings/Lists_48.html\n",
|
||||
" ✅ 本页提取 20 条记录\n",
|
||||
"🔄 正在处理第 49/53 页...\n",
|
||||
" 正在请求: https://scrm.h1cd.com/admin/billings/Lists_49.html\n",
|
||||
" ✅ 本页提取 20 条记录\n",
|
||||
"🔄 正在处理第 50/53 页...\n",
|
||||
" 正在请求: https://scrm.h1cd.com/admin/billings/Lists_50.html\n",
|
||||
" ✅ 本页提取 20 条记录\n",
|
||||
"🔄 正在处理第 51/53 页...\n",
|
||||
" 正在请求: https://scrm.h1cd.com/admin/billings/Lists_51.html\n",
|
||||
" ✅ 本页提取 20 条记录\n",
|
||||
"🔄 正在处理第 52/53 页...\n",
|
||||
" 正在请求: https://scrm.h1cd.com/admin/billings/Lists_52.html\n",
|
||||
" ✅ 本页提取 20 条记录\n",
|
||||
"🔄 正在处理第 53/53 页...\n",
|
||||
" 正在请求: https://scrm.h1cd.com/admin/billings/Lists_53.html\n",
|
||||
" ✅ 本页提取 8 条记录\n",
|
||||
"\n",
|
||||
"==============================\n",
|
||||
"✅ 导出成功!\n",
|
||||
"📁 文件路径: D:\\Idea Project\\F6+宜搭+其它(1)\\张阳脚本\\文件输出\\维修记录_完美导出版.xlsx\n",
|
||||
"📈 总记录数: 1048\n",
|
||||
"==============================\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"execution_count": 3
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "b3decf1d",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# 会员卡信息导出\n",
|
||||
"\n",
|
||||
"从 H1 系统导出会员卡信息(储值卡、套餐卡等),自动分页爬取并做数据规范化处理。\n",
|
||||
"\n",
|
||||
"> ⚠️ **注意**:H1系统导出的原始数据格式不规范(如姓名和手机号混在同一字段、操作列包含按钮文本等),脚本已内置清洗逻辑。\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "9ab86773",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import requests\n",
|
||||
"import pandas as pd\n",
|
||||
"from bs4 import BeautifulSoup\n",
|
||||
"import os\n",
|
||||
"import re\n",
|
||||
"import time\n",
|
||||
"from datetime import datetime\n",
|
||||
"\n",
|
||||
"# ===================== 【配置区】 =====================\n",
|
||||
"# Cookie(请根据实际情况更新,登录后从浏览器DevTools复制)\n",
|
||||
"COOKIES = {\n",
|
||||
" 'showSmsActivity': '1',\n",
|
||||
" 'showEasyMoney': '1',\n",
|
||||
" 'LOGIN_URL': 'https%3A%2F%2Fscrm.h1cd.com%2Flogin-h1cd.html',\n",
|
||||
" 'adminpd': 'jVISiRrtcJplFhLoCuUIxK9XG5ekdfwzq%2B0y482ZKxE%3D',\n",
|
||||
" 'adminun': '15224781773',\n",
|
||||
" 'uid': '10291',\n",
|
||||
" 'PHPSESSID': 'nbn58laakng0rv5iqln82a6qpu',\n",
|
||||
"}\n",
|
||||
"\n",
|
||||
"HEADERS = {\n",
|
||||
" 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',\n",
|
||||
" 'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',\n",
|
||||
" 'Connection': 'keep-alive',\n",
|
||||
" 'Referer': 'https://scrm.h1cd.com/admin/members/cards.html',\n",
|
||||
" 'Sec-Fetch-Dest': 'iframe',\n",
|
||||
" 'Sec-Fetch-Mode': 'navigate',\n",
|
||||
" 'Sec-Fetch-Site': 'same-origin',\n",
|
||||
" 'Sec-Fetch-User': '?1',\n",
|
||||
" 'Upgrade-Insecure-Requests': '1',\n",
|
||||
" 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/146.0.0.0 Safari/537.36 Edg/146.0.0.0',\n",
|
||||
" 'sec-ch-ua': '\"Chromium\";v=\"146\", \"Not-A.Brand\";v=\"24\", \"Microsoft Edge\";v=\"146\"',\n",
|
||||
" 'sec-ch-ua-mobile': '?0',\n",
|
||||
" 'sec-ch-ua-platform': '\"Windows\"',\n",
|
||||
"}\n",
|
||||
"\n",
|
||||
"# 查询参数\n",
|
||||
"PARAMS = {\n",
|
||||
" 'type': '',\n",
|
||||
" 'expired': '',\n",
|
||||
" 'storeId': '0',\n",
|
||||
" 'search': '',\n",
|
||||
"}\n",
|
||||
"\n",
|
||||
"# 输出目录\n",
|
||||
"OUTPUT_DIR = r\"D:\\Idea Project\\F6+宜搭+其它(1)\\张阳脚本\\文件输出\"\n",
|
||||
"# =====================================================\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"def get_page_html(page_num):\n",
|
||||
" \"\"\"获取指定页面的HTML内容\"\"\"\n",
|
||||
" try:\n",
|
||||
" if page_num == 1:\n",
|
||||
" url = \"https://scrm.h1cd.com/admin/members/cards.html\"\n",
|
||||
" else:\n",
|
||||
" url = f\"https://scrm.h1cd.com/admin/members/cards_{page_num}.html\"\n",
|
||||
"\n",
|
||||
" r = requests.get(url, headers=HEADERS, cookies=COOKIES, params=PARAMS, timeout=30)\n",
|
||||
"\n",
|
||||
" # 检查是否被重定向到登录页\n",
|
||||
" if 'login' in r.url.lower() or '登录' in r.text[:2000]:\n",
|
||||
" print(f\" ⚠️ 第{page_num}页检测到跳转登录,Cookie可能已失效。\")\n",
|
||||
" return None\n",
|
||||
"\n",
|
||||
" r.raise_for_status()\n",
|
||||
" r.encoding = 'utf-8'\n",
|
||||
" return r.text\n",
|
||||
" except Exception as e:\n",
|
||||
" print(f\" ❌ 第{page_num}页请求失败: {str(e)}\")\n",
|
||||
" return None\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"def parse_cards_table(html):\n",
|
||||
" \"\"\"\n",
|
||||
" 解析会员卡HTML表格,提取数据。\n",
|
||||
" \n",
|
||||
" H1系统会员卡页面特点:\n",
|
||||
" - 部分单元格包含多行信息(用<br>分隔),如姓名和手机号在同一格\n",
|
||||
" - 操作列包含按钮文本需要过滤\n",
|
||||
" \"\"\"\n",
|
||||
" soup = BeautifulSoup(html, 'html.parser')\n",
|
||||
" table = soup.find('table', class_='table')\n",
|
||||
" if not table:\n",
|
||||
" table = soup.find(\"table\")\n",
|
||||
" if not table:\n",
|
||||
" return [], []\n",
|
||||
"\n",
|
||||
" # 提取表头\n",
|
||||
" header = []\n",
|
||||
" thead = table.find(\"thead\")\n",
|
||||
" if thead:\n",
|
||||
" ths = thead.find_all('th')\n",
|
||||
" header = [th.get_text(strip=True) for th in ths]\n",
|
||||
"\n",
|
||||
" if not header:\n",
|
||||
" first_tr = table.find(\"tr\")\n",
|
||||
" if first_tr:\n",
|
||||
" ths = first_tr.find_all('th')\n",
|
||||
" if ths:\n",
|
||||
" header = [th.get_text(strip=True) for th in ths]\n",
|
||||
"\n",
|
||||
" # 提取数据行\n",
|
||||
" tbody = table.find(\"tbody\")\n",
|
||||
" rows = tbody.find_all(\"tr\") if tbody else table.find_all(\"tr\")\n",
|
||||
"\n",
|
||||
" data_rows = []\n",
|
||||
" for tr in rows:\n",
|
||||
" if tr.find(\"th\"):\n",
|
||||
" continue\n",
|
||||
" tds = tr.find_all('td')\n",
|
||||
" if not tds or len(tds) < 3:\n",
|
||||
" continue\n",
|
||||
"\n",
|
||||
" row_data = []\n",
|
||||
" for td in tds:\n",
|
||||
" text = td.get_text(separator='|', strip=True)\n",
|
||||
" text = re.sub(r'\\s+', ' ', text)\n",
|
||||
" row_data.append(text.strip())\n",
|
||||
"\n",
|
||||
" if any(row_data):\n",
|
||||
" data_rows.append(row_data)\n",
|
||||
"\n",
|
||||
" return header, data_rows\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"def normalize_dataframe(df):\n",
|
||||
" \"\"\"\n",
|
||||
" 对整个DataFrame进行规范化处理。\n",
|
||||
" 处理H1系统导出数据不规范的情况:\n",
|
||||
" 1. 去重\n",
|
||||
" 2. 拆分姓名+手机号合并字段\n",
|
||||
" 3. 清理数值列\n",
|
||||
" 4. 去除操作列和按钮文本残留\n",
|
||||
" \"\"\"\n",
|
||||
" # 去除完全重复的行\n",
|
||||
" before_count = len(df)\n",
|
||||
" df = df.drop_duplicates()\n",
|
||||
" after_count = len(df)\n",
|
||||
" if before_count != after_count:\n",
|
||||
" print(f\" 🔍 去重:{before_count} 条 → {after_count} 条(去除 {before_count - after_count} 条重复)\")\n",
|
||||
"\n",
|
||||
" # 拆分合并列(如\"会员名\"列中同时包含姓名和手机号)\n",
|
||||
" for col in df.columns:\n",
|
||||
" if any(kw in col for kw in [\"会员名\", \"姓名\", \"客户名称\", \"车主\"]):\n",
|
||||
" # 检测该列是否同时包含姓名和手机号\n",
|
||||
" sample = df[col].astype(str).head(20)\n",
|
||||
" has_phone = sample.apply(lambda x: bool(re.search(r'1[3-9]\\d{9}', x))).any()\n",
|
||||
" if has_phone and '手机号' not in df.columns:\n",
|
||||
" df[\"客户名称\"] = df[col].apply(\n",
|
||||
" lambda x: re.sub(r\"1[3-9]\\d{9}\", \"\", str(x)).replace(\"|\", \"\").strip()\n",
|
||||
" )\n",
|
||||
" df[\"手机号\"] = df[col].apply(\n",
|
||||
" lambda x: (re.search(r\"1[3-9]\\d{9}\", str(x)).group() if re.search(r\"1[3-9]\\d{9}\", str(x)) else \"\")\n",
|
||||
" )\n",
|
||||
"\n",
|
||||
" # 清理数值列\n",
|
||||
" for col in df.columns:\n",
|
||||
" if any(kw in col for kw in [\"余额\", \"充值\", \"消费\", \"金额\"]):\n",
|
||||
" df[col] = df[col].astype(str).apply(\n",
|
||||
" lambda x: (re.search(r\"[\\d.]+\", str(x).replace(\",\", \"\")).group() if re.search(r\"[\\d.]+\", str(x).replace(\",\", \"\")) else x)\n",
|
||||
" )\n",
|
||||
"\n",
|
||||
" # 清理操作列\n",
|
||||
" cols_to_drop = [col for col in df.columns if any(kw in col for kw in [\"操作\", \"选择\", \"勾选\"])]\n",
|
||||
" if cols_to_drop:\n",
|
||||
" df = df.drop(columns=cols_to_drop)\n",
|
||||
"\n",
|
||||
" # 清理所有列中的按钮文本残留\n",
|
||||
" btn_patterns = r\"(查看详情|编辑|删除|充值记录|消费记录|详情|迁移|查看)\"\n",
|
||||
" for col in df.columns:\n",
|
||||
" df[col] = df[col].astype(str).apply(\n",
|
||||
" lambda x: re.sub(btn_patterns, \"\", str(x)).strip()\n",
|
||||
" )\n",
|
||||
" df[col] = df[col].replace({'nan': '', 'None': ''})\n",
|
||||
"\n",
|
||||
" return df\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"def get_max_page(html):\n",
|
||||
" \"\"\"从页面中提取最大页数\"\"\"\n",
|
||||
" if not html:\n",
|
||||
" return 1\n",
|
||||
"\n",
|
||||
" soup = BeautifulSoup(html, 'html.parser')\n",
|
||||
" text = soup.get_text()\n",
|
||||
"\n",
|
||||
" match = re.search(r'共\\s*(\\d+)\\s*页', text)\n",
|
||||
" if match:\n",
|
||||
" return int(match.group(1))\n",
|
||||
"\n",
|
||||
" match = re.search(r'页\\s*1/(\\d+)', text)\n",
|
||||
" if match:\n",
|
||||
" return int(match.group(1))\n",
|
||||
"\n",
|
||||
" page_links = soup.find_all('a', href=re.compile(r'cards_\\d+\\.html'))\n",
|
||||
" if page_links:\n",
|
||||
" max_page = 1\n",
|
||||
" for a in page_links:\n",
|
||||
" num_match = re.search(r'cards_(\\d+)\\.html', a.get('href', ''))\n",
|
||||
" if num_match:\n",
|
||||
" max_page = max(max_page, int(num_match.group(1)))\n",
|
||||
" return max_page\n",
|
||||
"\n",
|
||||
" return 1\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"def main():\n",
|
||||
" print(\"=\" * 50)\n",
|
||||
" print(\"开始爬取 H1系统 会员卡信息...\")\n",
|
||||
" print(f\"当前 StoreID: {PARAMS['storeId']}\")\n",
|
||||
" print(\"=\" * 50)\n",
|
||||
"\n",
|
||||
" # 获取第一页,确定总页数\n",
|
||||
" print(\"正在获取总页数...\")\n",
|
||||
" first_html = get_page_html(1)\n",
|
||||
" if not first_html:\n",
|
||||
" print(\"❌ 无法获取第一页数据,请检查 Cookie 或网络。\")\n",
|
||||
" return\n",
|
||||
"\n",
|
||||
" max_page = get_max_page(first_html)\n",
|
||||
" print(f\"✅ 成功获取最大页数:{max_page}\")\n",
|
||||
"\n",
|
||||
" # 爬取所有页面\n",
|
||||
" all_data = []\n",
|
||||
" merged_header = []\n",
|
||||
"\n",
|
||||
" for page in range(1, max_page + 1):\n",
|
||||
" print(f\"正在爬取第 {page}/{max_page} 页...\")\n",
|
||||
"\n",
|
||||
" if page == 1:\n",
|
||||
" html = first_html\n",
|
||||
" else:\n",
|
||||
" html = get_page_html(page)\n",
|
||||
" if not html:\n",
|
||||
" print(f\"❌ 第 {page} 页获取失败,跳过。\")\n",
|
||||
" continue\n",
|
||||
"\n",
|
||||
" header, rows = parse_cards_table(html)\n",
|
||||
"\n",
|
||||
" if not header and not rows:\n",
|
||||
" print(f\"⚠️ 第 {page} 页未解析到表格数据。\")\n",
|
||||
" continue\n",
|
||||
"\n",
|
||||
" # 合并表头(不同页的表头可能略有差异)\n",
|
||||
" if header:\n",
|
||||
" for h in header:\n",
|
||||
" if h not in merged_header:\n",
|
||||
" merged_header.append(h)\n",
|
||||
"\n",
|
||||
" all_data.extend(rows)\n",
|
||||
"\n",
|
||||
" # 请求间隔\n",
|
||||
" if page < max_page:\n",
|
||||
" time.sleep(0.3)\n",
|
||||
"\n",
|
||||
" if not all_data:\n",
|
||||
" print(\"\\n❌ 未获取到任何数据,请检查 Cookie 或网络。\")\n",
|
||||
" return\n",
|
||||
"\n",
|
||||
" print(f\"\\n✅ 爬取完成,共获取 {len(all_data)} 条原始记录\")\n",
|
||||
"\n",
|
||||
" # 构建DataFrame\n",
|
||||
" if merged_header:\n",
|
||||
" normalized_rows = []\n",
|
||||
" width = len(merged_header)\n",
|
||||
" for row in all_data:\n",
|
||||
" if len(row) < width:\n",
|
||||
" row = row + [\"\"] * (width - len(row))\n",
|
||||
" elif len(row) > width:\n",
|
||||
" row = row[:width]\n",
|
||||
" normalized_rows.append(row)\n",
|
||||
" df = pd.DataFrame(normalized_rows, columns=merged_header)\n",
|
||||
" else:\n",
|
||||
" df = pd.DataFrame(all_data)\n",
|
||||
"\n",
|
||||
" print(f\"📋 原始列名:{list(df.columns)}\")\n",
|
||||
" print(f\"📋 原始数据前3行:\")\n",
|
||||
" print(df.head(3).to_string())\n",
|
||||
"\n",
|
||||
" # 数据规范化处理\n",
|
||||
" print(\"\\n开始数据规范化处理...\")\n",
|
||||
" df = normalize_dataframe(df)\n",
|
||||
"\n",
|
||||
" # 保存结果\n",
|
||||
" os.makedirs(OUTPUT_DIR, exist_ok=True)\n",
|
||||
" time_str = datetime.now().strftime(\"%Y%m%d_%H%M%S\")\n",
|
||||
" filename = f\"H1会员卡信息_{time_str}.xlsx\"\n",
|
||||
" filepath = os.path.join(OUTPUT_DIR, filename)\n",
|
||||
"\n",
|
||||
" try:\n",
|
||||
" df.to_excel(filepath, index=False)\n",
|
||||
" print(\"=\" * 50)\n",
|
||||
" print(\"✅ 导出完成!\")\n",
|
||||
" print(f\"📊 最终有效条数:{len(df)}\")\n",
|
||||
" print(f\"📁 已保存到:{filepath}\")\n",
|
||||
" print(\"=\" * 50)\n",
|
||||
" except Exception as e:\n",
|
||||
" print(f\"❌ 保存Excel失败: {e}\")\n",
|
||||
" csv_path = filepath.replace(\".xlsx\", \".csv\")\n",
|
||||
" df.to_csv(csv_path, index=False, encoding=\"utf-8-sig\")\n",
|
||||
" print(f\"💡 已转为 CSV 保存至:{csv_path}\")\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"if __name__ == '__main__':\n",
|
||||
" main()\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "4c658267",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# 运行导出\n",
|
||||
"main()\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
|
||||
+17
-1
@@ -263,7 +263,21 @@
|
||||
- 逐个查询会员卡明细
|
||||
- 使用lxml解析HTML
|
||||
|
||||
### 34. 途虎养车系统
|
||||
### 35. H1车店系统
|
||||
- **文件**: `H1车店数据导出.ipynb`, `H1会员卡.py`
|
||||
- **功能**:
|
||||
- 车辆信息导出
|
||||
- 库存信息导出
|
||||
- 历史维修记录导出
|
||||
- 开单管理数据导出
|
||||
- 会员卡信息导出(储值卡、套餐卡等)
|
||||
- **接口**: `https://scrm.h1cd.com`
|
||||
- **特点**:
|
||||
- 使用Cookie认证,需要定期更新
|
||||
- HTML表格解析,分页URL模式为 `cards_{page}.html`
|
||||
- **数据格式不规范**:导出的原始数据中姓名和手机号可能混在同一字段、操作列包含按钮文本、数值字段含非数字字符等,脚本内置了数据规范化处理(拆分合并列、清理按钮文本、数值标准化、去重等)
|
||||
|
||||
### 36. 途虎养车系统
|
||||
- **文件**: `途虎养车脚本导出.ipynb`
|
||||
- **功能**:
|
||||
- 客户信息导出
|
||||
@@ -460,6 +474,8 @@
|
||||
├── 大唛云管理平台.ipynb # 大唛云管理平台
|
||||
├── 大大汽修token登录(1).ipynb # 大大汽修Token登录
|
||||
├── 大大汽修点击导出(1).ipynb # 大大汽修点击导出
|
||||
├── H1车店数据导出.ipynb # H1车店系统 - 车辆/库存/维修记录/会员卡导出
|
||||
├── H1会员卡.py # H1车店系统 - 会员卡信息导出(独立脚本)
|
||||
├── 好店长.ipynb # 好店长系统
|
||||
├── 客户无忧.ipynb # 客户无忧系统
|
||||
├── 客管家数据导出(1).ipynb # 客管家数据导出
|
||||
|
||||
+240
-182
File diff suppressed because one or more lines are too long
@@ -0,0 +1,286 @@
|
||||
"""
|
||||
神汽链(sqzone.com)登录+数据导出 一体化脚本
|
||||
- 自动登录获取JSESSIONID(Playwright + ddddocr验证码识别)
|
||||
- 导出近10年历史维修记录
|
||||
"""
|
||||
import sys, io
|
||||
# Force UTF-8 console output so the Chinese/emoji progress messages do not
# raise UnicodeEncodeError on consoles with a legacy code page.
# reconfigure() changes the existing streams in place, which keeps their
# buffering mode (a freshly built TextIOWrapper is not line-buffered, so
# progress lines could show up late) and is skipped for replacement streams
# that do not offer it instead of failing on a missing ``.buffer``.
for _stream in (sys.stdout, sys.stderr):
    if hasattr(_stream, 'reconfigure'):
        _stream.reconfigure(encoding='utf-8', errors='replace')
del _stream
|
||||
|
||||
import requests
|
||||
import re
|
||||
import hashlib
|
||||
import time
|
||||
import json
|
||||
import base64
|
||||
import os
|
||||
from datetime import datetime, timedelta
|
||||
from tqdm import tqdm
|
||||
import pandas as pd
|
||||
import urllib3
|
||||
|
||||
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
|
||||
|
||||
# ============ Configuration ============
# Login credentials. They can be supplied through the SQZONE_ACCOUNT /
# SQZONE_PASSWORD environment variables; the literals are kept only as
# backward-compatible fallbacks so existing invocations keep working.
ACCOUNT = os.environ.get('SQZONE_ACCOUNT', '17690802976')
PASSWORD = os.environ.get('SQZONE_PASSWORD', '123321')
# Endpoint that is POSTed to when fetching turnover records.
URL = 'https://www.sqzone.com/launa/pc/dataCenter/queryShopTurnoverInfo'
# Destination workbook for the exported data.
OUTPUT_FILE = r'D:\Idea Project\F6+宜搭+其它(1)\张阳脚本\文件输出\神汽链10年历史数据.xlsx'
DATE_RANGE_START = (2016, 4)  # (year, month) where the ~10-year window starts
DATE_RANGE_END = (2026, 3)    # (year, month) where it ends: the previous month
# =======================================
|
||||
|
||||
def md5(text):
    """Return the hexadecimal MD5 digest of *text* encoded as UTF-8."""
    hasher = hashlib.md5()
    hasher.update(text.encode('utf-8'))
    return hasher.hexdigest()
|
||||
|
||||
def get_sqzone_cookies(account=ACCOUNT, password=PASSWORD, max_captcha_retries=5):
    """Log in to sqzone.com with a headless Chromium and return its cookies.

    The login page is opened, a captcha image is fetched and read with
    ddddocr, and the credentials are posted from inside the page. After a
    successful login the page is reloaded, the turnover page is visited, and
    the resulting cookies are checked with a one-record probe request.

    Args:
        account: Value sent as ``mobilephone`` in the login request.
        password: Plain-text password; its MD5 hex digest is what gets sent.
        max_captcha_retries: Maximum number of captcha + login attempts.

    Returns:
        A ``{name: value}`` dict of every cookie whose domain contains
        ``sqzone``, or ``None`` when the page exposes no ``uuid``, the server
        reports a password error, all attempts are used up, no ``JSESSIONID``
        cookie was set, or the probe request is rejected.
    """
    from playwright.sync_api import sync_playwright
    import ddddocr

    ocr = ddddocr.DdddOcr(show_ad=False)

    with sync_playwright() as p:
        print("[登录] 启动浏览器...")
        browser = p.chromium.launch(headless=True)
        # One try/finally closes the browser on every exit path, including
        # exceptions raised by goto/evaluate/reload (e.g. a navigation timeout).
        try:
            context = browser.new_context(
                user_agent='Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/146.0.0.0 Safari/537.36 Edg/146.0.0.0',
                viewport={'width': 1280, 'height': 720}
            )
            page = context.new_page()

            print("[登录] 访问神汽链登录页...")
            page.goto('https://www.sqzone.com/launa/pc/login', wait_until='domcontentloaded', timeout=30000)
            page.wait_for_timeout(3000)

            # The login page defines a global ``uuid``; it is sent back with
            # the login request below.
            uuid = page.evaluate('() => typeof uuid !== "undefined" ? uuid : ""')
            if not uuid:
                print("[登录] 未获取到uuid,登录页可能加载异常")
                return None
            print(f"[登录] uuid: {uuid}")

            for attempt in range(max_captcha_retries):
                print(f"[登录] 尝试 {attempt+1}/{max_captcha_retries}...")

                # Fetch the captcha from inside the page so the request shares
                # the browser session.
                captcha_result = page.evaluate('''async () => {
                    const resp = await fetch('/fauna/qrloginserver/getPasswordCaptcha?t=' + Date.now());
                    return await resp.json();
                }''')

                if not captcha_result.get('success'):
                    print("[登录] 获取验证码失败")
                    continue

                # ``data`` is decoded as base64 image bytes and read by OCR.
                img_data = base64.b64decode(captcha_result['data'])
                captcha_code = ocr.classification(img_data)
                print(f"[登录] 验证码: {captcha_code}")

                login_result = page.evaluate('''async (params) => {
                    const resp = await fetch('/fauna/qrloginserver/backOfficePwdLogin', {
                        method: 'POST',
                        headers: {'Content-Type': 'application/json;charset=UTF-8'},
                        body: JSON.stringify(params)
                    });
                    return await resp.json();
                }''', {
                    'mobilephone': account,
                    'password': md5(password),
                    'pwdCaptcha': captcha_code,
                    'referer': page.url,
                    'uuid': uuid,
                    'loginType': 'SAAS',
                })

                if login_result.get('success'):
                    print("[登录] SSO登录成功!")
                    break

                error_msg = login_result.get('errorMsg') or ''
                if '验证码错误' in error_msg:
                    # Wrong captcha: most likely an OCR misread, try again.
                    print("[登录] 验证码错误,重试...")
                elif '密码' in error_msg:
                    # A password problem will not be fixed by retrying.
                    print(f"[登录] 密码错误: {login_result.get('errorMsg')}")
                    return None
                else:
                    print(f"[登录] 错误: {login_result.get('errorMsg')}")
            else:
                # Loop ended without ``break``: no attempt succeeded.
                print("[登录] 验证码多次失败")
                return None

            # Reload so the authorize callback can complete.
            print("[登录] 刷新页面完成授权...")
            page.reload(wait_until='domcontentloaded', timeout=30000)
            page.wait_for_timeout(2000)

            # Visit the turnover page before collecting cookies.
            page.goto('https://www.sqzone.com/launa/web/dc/turnover', wait_until='domcontentloaded', timeout=30000)
            page.wait_for_timeout(2000)

            # Keep only cookies whose domain mentions sqzone.
            sqzone_cookies = {
                c['name']: c['value']
                for c in context.cookies()
                if 'sqzone' in c.get('domain', '')
            }

            if not sqzone_cookies.get('JSESSIONID'):
                print("[登录] 未获取到JSESSIONID")
                return None

            # Probe the data endpoint with a single-record query to confirm
            # that the session is accepted.
            test_result = page.evaluate('''async () => {
                const resp = await fetch('/launa/pc/dataCenter/queryShopTurnoverInfo', {
                    method: 'POST',
                    headers: {
                        'Content-Type': 'application/json;charset=UTF-8',
                        'X-Requested-With': 'XMLHttpRequest',
                        'appName': 'SQLINK',
                    },
                    body: JSON.stringify({
                        keyword: '', pageSize: 1, curPage: 1,
                        shopId: '', payId: [], payStatus: '',
                        startTime: '2026-03-01', endTime: '2026-03-31',
                    })
                });
                return await resp.json();
            }''')

            if not (test_result.get('success') or test_result.get('data')):
                print(f"[登录] Cookies无效: {str(test_result)[:100]}")
                return None

            print(f"[登录] Cookies有效! JSESSIONID={sqzone_cookies['JSESSIONID'][:20]}...")
            return sqzone_cookies
        finally:
            browser.close()
|
||||
|
||||
|
||||
def get_month_range(start_year, start_month, end_year, end_month):
    """Return the first and last day of every month in an inclusive range.

    Args:
        start_year: Year of the first month.
        start_month: Month number (1-12) of the first month.
        end_year: Year of the last month.
        end_month: Month number (1-12) of the last month.

    Returns:
        A chronologically ordered list of ``(first_day, last_day)`` tuples,
        both formatted as ``'YYYY-MM-DD'`` strings. The list is empty when the
        start month is later than the end month.
    """
    dates = []
    current = datetime(start_year, start_month, 1)
    end = datetime(end_year, end_month, 1)
    while current <= end:
        # Day 28 exists in every month; adding 4 days always lands in the
        # following month, and snapping to day 1 yields that month's start.
        next_month = (current.replace(day=28) + timedelta(days=4)).replace(day=1)
        last_day = next_month - timedelta(days=1)
        dates.append((current.strftime('%Y-%m-%d'), last_day.strftime('%Y-%m-%d')))
        current = next_month
    return dates
|
||||
|
||||
|
||||
def fetch_page_with_retry(cookies, headers, start_date, end_date, page_num, max_retries=3):
    """Fetch one page of records, retrying failed attempts with a backoff.

    POSTs a query for the ``start_date``..``end_date`` window (50 rows per
    page) to the module-level ``URL``.

    Args:
        cookies: Cookie mapping sent with the request.
        headers: Header mapping sent with the request.
        start_date: Value sent as ``startTime``.
        end_date: Value sent as ``endTime``.
        page_num: Value sent as ``curPage``.
        max_retries: Maximum number of attempts before giving up.

    Returns:
        The list stored under ``data.contents`` (``[]`` when that key is
        missing or null), the string ``'SESSION_EXPIRED'`` when the response
        carries code ``'-1302'``, or ``None`` when every attempt failed.
    """
    json_data = {
        'keyword': '', 'pageSize': 50, 'curPage': page_num,
        'shopId': '', 'payId': [], 'payStatus': '',
        'startTime': start_date, 'endTime': end_date,
    }
    for attempt in range(max_retries):
        try:
            # NOTE(review): verify=False disables TLS certificate validation;
            # kept as in the original, but confirm it is really required.
            response = requests.post(URL, cookies=cookies, headers=headers, json=json_data, timeout=15, verify=False)
            if response.status_code == 200:
                res_json = response.json()
                if res_json.get('data'):
                    # "or []" keeps a null "contents" from being mistaken for
                    # a failed request by the caller (which checks for None).
                    return res_json['data'].get('contents') or []
                # An empty payload may mean the session is no longer valid.
                if res_json.get('code') == '-1302':
                    print(f"  Session过期! 需要重新登录")
                    return 'SESSION_EXPIRED'
                print(f"  第{page_num}页业务异常: {res_json}")
            else:
                print(f"  第{page_num}页HTTP错误: {response.status_code}")
        except (requests.exceptions.RequestException, ValueError) as e:
            # ValueError covers a non-JSON body on requests versions whose
            # JSON decode error is not a RequestException subclass.
            print(f"  第{page_num}页请求失败 ({attempt+1}/{max_retries}): {e}")
        # Linear backoff (2s, 4s, ...) between attempts; none after the last.
        if attempt < max_retries - 1:
            time.sleep((attempt + 1) * 2)
    return None
|
||||
|
||||
|
||||
def export_data(cookies):
    """Export historical data month by month.

    Walks every month between ``DATE_RANGE_START`` and ``DATE_RANGE_END`` and
    pages through the results with ``fetch_page_with_retry``. Each returned
    row is flattened into one record per entry of its ``partsViews`` list
    (the row's other fields merged with the part's fields); rows without
    ``partsViews`` entries produce no record.

    Args:
        cookies: Cookie mapping passed through to ``fetch_page_with_retry``.

    Returns:
        A ``(status, records)`` tuple. ``status`` is ``'SESSION_EXPIRED'`` when
        a page fetch reported an expired session (``records`` then holds what
        had been collected so far), otherwise ``'OK'``.
    """
    headers = {
        'Accept': 'application/json, text/plain, */*',
        'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
        'Connection': 'keep-alive',
        'Content-Type': 'application/json;charset=UTF-8',
        'Origin': 'https://www.sqzone.com',
        'Referer': 'https://www.sqzone.com/launa/web/dc/turnover',
        'Sec-Fetch-Dest': 'empty',
        'Sec-Fetch-Mode': 'cors',
        'Sec-Fetch-Site': 'same-origin',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/146.0.0.0 Safari/537.36 Edg/146.0.0.0',
        'X-Requested-With': 'XMLHttpRequest',
        'appName': 'SQLINK',
    }

    # os.makedirs('') raises, so only create the directory when the output
    # path actually has a directory component.
    output_dir = os.path.dirname(OUTPUT_FILE)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    checkpoint_step = 5000
    next_checkpoint = checkpoint_step
    all_data = []
    month_ranges = get_month_range(*DATE_RANGE_START, *DATE_RANGE_END)
    print(f"\n[导出] 共 {len(month_ranges)} 个月份需要抓取")

    for start_date, end_date in tqdm(month_ranges, desc="正在抓取数据"):
        page = 1
        while True:
            data_list = fetch_page_with_retry(cookies, headers, start_date, end_date, page)

            if data_list == 'SESSION_EXPIRED':
                return 'SESSION_EXPIRED', all_data

            if data_list is None:
                # Every retry failed: give up on the rest of this month.
                print(f"\n  {start_date}-{end_date} 第{page}页多次失败,跳过")
                break

            if not data_list:
                # An empty page marks the end of this month's data.
                break

            for data in data_list:
                parts = data.get('partsViews', [])
                customer_info = {k: v for k, v in data.items() if k != 'partsViews'}
                if parts:
                    all_data.extend({**customer_info, **part} for part in parts)

            page += 1
            time.sleep(1)

            # Temporary save roughly every 5000 records. Records arrive in
            # page-sized batches, so test for crossing the threshold instead
            # of landing exactly on a multiple of 5000.
            if len(all_data) >= next_checkpoint:
                temp_file = OUTPUT_FILE.replace('.xlsx', '_temp.xlsx')
                pd.DataFrame(all_data).to_excel(temp_file, index=False)
                print(f"\n  临时保存: {len(all_data)} 条")
                next_checkpoint = (len(all_data) // checkpoint_step + 1) * checkpoint_step

    return 'OK', all_data
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # Log in to obtain cookies.
    cookies = get_sqzone_cookies()
    if not cookies:
        print("登录失败,退出")
        sys.exit(1)

    # Export the data, logging in again whenever the session expires.
    # Each export_data() call starts over from the first month.
    while True:
        status, all_data = export_data(cookies)
        if status == 'SESSION_EXPIRED':
            print("\nSession过期,重新登录...")
            cookies = get_sqzone_cookies()
            if not cookies:
                # Keep whatever the interrupted run collected.
                print("重新登录失败,保存已有数据")
                break
            continue
        break

    # Save the final data.
    if all_data:
        # os.makedirs('') raises, so skip it for a bare file name.
        output_dir = os.path.dirname(OUTPUT_FILE)
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)
        pd.DataFrame(all_data).to_excel(OUTPUT_FILE, index=False)
        print(f"\n全部完成! 共 {len(all_data)} 条数据,已保存至 {OUTPUT_FILE}")
    else:
        print("\n未抓取到数据")
|
||||
@@ -0,0 +1,89 @@
|
||||
import requests
|
||||
import pandas as pd
|
||||
|
||||
# Cookies sent with every request made by this script.
# NOTE(review): JSESSIONID is a hard-coded session id committed to the
# repository; it looks like a live credential -- consider loading it from the
# environment or a local config file instead.
cookies = {
    'JSESSIONID': 'FA68674FDDA302C51E2775091B995EEA',
    'td_cookie': '3009435466',
}

# Request headers sent with every request made by this script.
headers = {
    'Accept': 'application/json, text/javascript, */*; q=0.01',
    'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
    'Connection': 'keep-alive',
    'Referer': 'http://www.idsz.xin:7070/report_member_verifi_list?detailtype=1&key=totalCount&datafrom=2026-01-01&datato=2026-04-13&sshopId=&type=1',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/146.0.0.0 Safari/537.36 Edg/146.0.0.0',
    'X-Requested-With': 'XMLHttpRequest',
}
|
||||
|
||||
def get_data(page=0, page_size=50, timeout=30):
    """Request one page of the ``key=totalCount`` member report.

    Args:
        page: Zero-based page index sent as ``page``.
        page_size: Number of rows requested, sent as ``pageSize``.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The decoded JSON body of the response.

    Raises:
        requests.exceptions.RequestException: On connection errors, timeouts
            or a non-2xx status code.
        ValueError: If the response body is not valid JSON.
    """
    # "&timesCardId=" had been corrupted to "×CardId=" ("&times" decoded as an
    # HTML entity); the parameter name is restored here.
    url = (
        'http://www.idsz.xin:7070/posapi_invoke'
        '?apiname=kpi_memberVerifiAndSurplusQuery&detailtype=1'
        '&startTime=2026-01-01&endTime=2026-04-13&key=totalCount'
        f'&sshopId=&timesCardId=&option=&page={page}&pageSize={page_size}'
    )
    # Without a timeout a stalled connection would block the script forever.
    response = requests.get(url=url, headers=headers, cookies=cookies, timeout=timeout)
    response.raise_for_status()
    return response.json()
|
||||
|
||||
def _row_to_record(row):
    """Map one API row to the column names used in the Excel export."""
    return {
        '车牌号': row.get('carNo', ''),
        '卡名称': row.get('cardType', ''),
        '到期时间': row.get('endTime', ''),
        '发动机号': row.get('engineNumber', ''),
        '剩余明细': row.get('goodsName', ''),
        '剩余次数': row.get('qty', ''),
        '手机号': row.get('mobilePhone', ''),
        '客户姓名': row.get('name', ''),
        '备注': row.get('remark', ''),
        'Vin码': row.get('vin', ''),
    }


def _append_rows(payload, records):
    """Append the rows of one response to ``records``; return how many were added."""
    # "or []" tolerates a response whose "rows" key is missing or null.
    rows = payload.get('rows') or []
    records.extend(_row_to_record(row) for row in rows)
    return len(rows)


print("正在请求会员卡数据...")
try:
    # Fetch the first page up front: it also carries the total row count.
    page_size = 50
    first_page = get_data(page=0, page_size=page_size)
    total = first_page.get('total') or 0
    total_pages = (total + page_size - 1) // page_size  # ceiling division

    print(f"数据总条数: {total}")
    print(f"每页条数: {page_size}")
    print(f"总页数: {total_pages}")

    TCK = []

    # Rows of the first page.
    if _append_rows(first_page, TCK):
        print(f"已获取第1页数据,累计 {len(TCK)} 条")

    # Rows of the remaining pages.
    for page in range(1, total_pages):
        try:
            data = get_data(page=page, page_size=page_size)
            if _append_rows(data, TCK):
                print(f"已获取第{page+1}页数据,累计 {len(TCK)} 条")
        except Exception as e:
            # Deliberately best-effort: report the failed page and carry on
            # so one bad page does not abort the whole export.
            print(f"获取第{page+1}页数据失败: {e}")
            continue

    # Export the collected rows.
    df = pd.DataFrame(TCK)
    output_path = '会员卡.xlsx'
    df.to_excel(output_path, index=False)
    print(f"\n成功导出 {len(TCK)} 条会员卡数据到 {output_path}")

except requests.exceptions.RequestException as e:
    print(f"请求失败: {e}")
except ValueError as e:
    print(f"JSON解析失败: {e}")
|
||||
@@ -0,0 +1,89 @@
|
||||
import requests
|
||||
import pandas as pd
|
||||
|
||||
# Cookies sent with every request made by this script.
# NOTE(review): JSESSIONID is a hard-coded session id committed to the
# repository; it looks like a live credential -- consider loading it from the
# environment or a local config file instead.
cookies = {
    'JSESSIONID': 'FA68674FDDA302C51E2775091B995EEA',
    'td_cookie': '3008847516',
}

# Request headers sent with every request made by this script.
headers = {
    'Accept': 'application/json, text/javascript, */*; q=0.01',
    'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
    'Connection': 'keep-alive',
    'Referer': 'http://www.idsz.xin:7070/report_member_verifi_list?detailtype=1&key=totalBalance&datafrom=2026-01-01&datato=2026-04-13&sshopId=&type=1',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/146.0.0.0 Safari/537.36 Edg/146.0.0.0',
    'X-Requested-With': 'XMLHttpRequest',
}
|
||||
|
||||
def get_data(page=0, page_size=50, timeout=30):
    """Request one page of the ``key=totalBalance`` member report.

    Args:
        page: Zero-based page index sent as ``page``.
        page_size: Number of rows requested, sent as ``pageSize``.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The decoded JSON body of the response.

    Raises:
        requests.exceptions.RequestException: On connection errors, timeouts
            or a non-2xx status code.
        ValueError: If the response body is not valid JSON.
    """
    # "&timesCardId=" had been corrupted to "×CardId=" ("&times" decoded as an
    # HTML entity); the parameter name is restored here.
    url = (
        'http://www.idsz.xin:7070/posapi_invoke'
        '?apiname=kpi_memberVerifiAndSurplusQuery&detailtype=1'
        '&startTime=2026-01-01&endTime=2026-04-13&key=totalBalance'
        f'&sshopId=&timesCardId=&option=&page={page}&pageSize={page_size}'
    )
    # Without a timeout a stalled connection would block the script forever.
    response = requests.get(url=url, headers=headers, cookies=cookies, timeout=timeout)
    response.raise_for_status()
    return response.json()
|
||||
|
||||
def _row_to_record(row):
    """Map one API row to the column names used in the Excel export."""
    return {
        '车牌号': row.get('carNo', ''),
        '卡名称': row.get('cardType', ''),
        '到期时间': row.get('endTime', ''),
        '发动机号': row.get('engineNumber', ''),
        '剩余金额': row.get('leftAmount', ''),
        '剩余赠送金额': row.get('leftsendAmount', ''),
        '手机号': row.get('mobilePhone', ''),
        '客户姓名': row.get('name', ''),
        '备注': row.get('remark', ''),
        'Vin码': row.get('vin', ''),
    }


def _append_rows(payload, records):
    """Append the rows of one response to ``records``; return how many were added."""
    # "or []" tolerates a response whose "rows" key is missing or null.
    rows = payload.get('rows') or []
    records.extend(_row_to_record(row) for row in rows)
    return len(rows)


print("正在请求数据...")
try:
    # Fetch the first page up front: it also carries the total row count.
    page_size = 50
    first_page = get_data(page=0, page_size=page_size)
    total = first_page.get('total') or 0
    total_pages = (total + page_size - 1) // page_size  # ceiling division

    print(f"数据总条数: {total}")
    print(f"每页条数: {page_size}")
    print(f"总页数: {total_pages}")

    CZK = []

    # Rows of the first page.
    if _append_rows(first_page, CZK):
        print(f"已获取第1页数据,累计 {len(CZK)} 条")

    # Rows of the remaining pages.
    for page in range(1, total_pages):
        try:
            data = get_data(page=page, page_size=page_size)
            if _append_rows(data, CZK):
                print(f"已获取第{page+1}页数据,累计 {len(CZK)} 条")
        except Exception as e:
            # Deliberately best-effort: report the failed page and carry on
            # so one bad page does not abort the whole export.
            print(f"获取第{page+1}页数据失败: {e}")
            continue

    # Export the collected rows.
    df = pd.DataFrame(CZK)
    output_path = '储值卡.xlsx'
    df.to_excel(output_path, index=False)
    print(f"\n成功导出 {len(CZK)} 条数据到 {output_path}")

except requests.exceptions.RequestException as e:
    print(f"请求失败: {e}")
except ValueError as e:
    print(f"JSON解析失败: {e}")
|
||||
+1030
-28
File diff suppressed because it is too large
Load Diff
Reference in New Issue
Block a user