# File-viewer metadata: 425 lines, 14 KiB, Python
"""
|
||
H1车店系统 - 会员卡信息导出
|
||
从 https://scrm.h1cd.com/admin/members/cards.html 导出会员卡信息
|
||
注意:脚本解析HTML表格,导出的原始数据格式不规范,需要清洗处理
|
||
"""
|
||
|
||
import requests
|
||
import pandas as pd
|
||
from bs4 import BeautifulSoup
|
||
import os
|
||
import re
|
||
import time
|
||
import json
|
||
from datetime import datetime
|
||
|
||
# ===================== [Configuration] =====================
# Session cookies copied from a logged-in browser session. Refresh them when
# requests start being redirected to the login page.
# NOTE(security): these look like live credentials (session id, account name,
# password token) and should not be committed with real values. They can be
# overridden without editing this file through the H1_COOKIE environment
# variable, written like a browser Cookie header: "name=value; name2=value2".
COOKIES = {
    'showSmsActivity': '1',
    'showEasyMoney': '1',
    'LOGIN_URL': 'https%3A%2F%2Fscrm.h1cd.com%2Flogin-h1cd.html',
    'adminpd': 'jVISiRrtcJplFhLoCuUIxK9XG5ekdfwzq%2B0y482ZKxE%3D',
    'adminun': '15224781773',
    'uid': '10291',
    'PHPSESSID': 'nbn58laakng0rv5iqln82a6qpu',
}
# Entries from H1_COOKIE replace the defaults above; malformed pairs (no name
# or no '=') are ignored. With the variable unset this loop changes nothing.
for _pair in os.environ.get('H1_COOKIE', '').split(';'):
    _name, _sep, _value = _pair.strip().partition('=')
    if _sep and _name:
        COOKIES[_name] = _value

# Request headers sent with every page fetch.
HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
    'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
    'Connection': 'keep-alive',
    'Referer': 'https://scrm.h1cd.com/admin/members/cards.html',
    'Sec-Fetch-Dest': 'iframe',
    'Sec-Fetch-Mode': 'navigate',
    'Sec-Fetch-Site': 'same-origin',
    'Sec-Fetch-User': '?1',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/146.0.0.0 Safari/537.36 Edg/146.0.0.0',
    'sec-ch-ua': '"Chromium";v="146", "Not-A.Brand";v="24", "Microsoft Edge";v="146"',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': '"Windows"',
}

# Query-string parameters appended to every card-list request.
PARAMS = {
    'type': '',
    'expired': '',
    'storeId': '0',
    'search': '',
}

# Output directory for the exported file. The default is a machine-specific
# Windows path; set H1_OUTPUT_DIR to write somewhere else.
OUTPUT_DIR = os.environ.get(
    'H1_OUTPUT_DIR',
    r"D:\Idea Project\F6+宜搭+其它(1)\张阳脚本\文件输出",
)

# =====================================================
|
||
|
||
|
||
def get_page_html(page_num, cookies, params):
    """Fetch the HTML of one page of the member card list.

    Page 1 lives at ``cards.html``; every other page at
    ``cards_<page_num>.html``. The module-level ``HEADERS`` are sent with the
    request.

    Args:
        page_num: 1-based page number.
        cookies: cookie mapping passed to ``requests.get``.
        params: query-string mapping passed to ``requests.get``.

    Returns:
        The page text decoded as UTF-8, or ``None`` when the request fails,
        returns an HTTP error status, or appears to have landed on the login
        page. A message is printed in each failure case.
    """
    if page_num == 1:
        url = "https://scrm.h1cd.com/admin/members/cards.html"
    else:
        url = f"https://scrm.h1cd.com/admin/members/cards_{page_num}.html"

    # Deliberately broad: this is a best-effort fetch and callers rely on
    # getting None instead of an exception.
    try:
        r = requests.get(url, headers=HEADERS, cookies=cookies, params=params, timeout=30)

        # Set the encoding BEFORE touching r.text. Otherwise the text is
        # decoded with whatever requests guessed from the response headers,
        # and the Chinese login marker below may never match.
        r.encoding = 'utf-8'

        # Detect a redirect to (or rendering of) the login page.
        if 'login' in r.url.lower() or '登录' in r.text[:2000]:
            print(f" ⚠️ 第{page_num}页检测到跳转登录,Cookie可能已失效。")
            return None

        r.raise_for_status()
        return r.text
    except Exception as e:
        print(f" ❌ 第{page_num}页请求失败: {str(e)}")
        return None
|
||
|
||
|
||
def parse_cards_table(html):
    """Extract the header and the data rows of the member card table.

    The table with CSS class ``table`` is preferred; otherwise the first
    ``<table>`` in the document is used.

    Cell text is collected with ``|`` between the text fragments of a cell
    (so content split by ``<br>`` stays distinguishable), and runs of
    whitespace are collapsed to single spaces.

    Args:
        html: HTML source of one list page.

    Returns:
        ``(header, data_rows)``: ``header`` is a list of column titles
        (possibly empty) and ``data_rows`` a list of lists of cell strings.
        ``([], [])`` when the page contains no table.
    """
    document = BeautifulSoup(html, 'html.parser')
    table = document.find('table', class_='table') or document.find('table')
    if not table:
        return [], []

    # Header: <th> cells of <thead>, else <th> cells of the first row.
    thead = table.find('thead')
    header = [cell.get_text(strip=True) for cell in thead.find_all('th')] if thead else []
    if not header:
        first_row = table.find('tr')
        if first_row:
            header = [cell.get_text(strip=True) for cell in first_row.find_all('th')]

    # Data rows: rows of <tbody> when present, otherwise every row of the table.
    tbody = table.find('tbody')
    candidate_rows = (tbody or table).find_all('tr')

    data_rows = []
    for row in candidate_rows:
        # Rows containing <th> are header rows.
        if row.find('th'):
            continue
        cells = row.find_all('td')
        # Rows with fewer than three cells are skipped.
        if len(cells) < 3:
            continue

        values = [
            re.sub(r'\s+', ' ', cell.get_text(separator='|', strip=True)).strip()
            for cell in cells
        ]
        # Keep the row only if at least one cell has content.
        if any(values):
            data_rows.append(values)

    return header, data_rows
|
||
|
||
|
||
def clean_card_record(row_dict, header):
    """Clean one raw member card record.

    Args:
        row_dict: the record, either a mapping from header text to cell text
            or a positional sequence of cell texts aligned with ``header``.
            ``None`` cells are treated as empty strings.
        header: list of column header strings.

    Returns:
        A dict with one entry per header, with these exceptions:

        * the name column is emitted as ``'客户名称'`` and any mobile number
          found in it as ``'手机号'``;
        * a dedicated phone column is emitted as ``'手机号'``; a non-empty
          value there takes precedence over a number found in the name cell,
          and an empty value never erases one;
        * action ("操作") and checkbox ("选择"/"勾选") columns are kept with an
          empty value;
        * balance / recharge / consumption columns are reduced to the first
          number in the cell (sign and decimals kept, thousands separators
          removed), or left untouched when the cell has no number.

        All other cells have ``|`` separators, redundant whitespace and
        button captions removed.
    """
    phone_re = re.compile(r'1[3-9]\d{9}')
    eleven_digits_re = re.compile(r'^\d{11}$')
    # Optional sign, optional integer part, optional decimal point, digits.
    number_re = re.compile(r'-?\d*\.?\d+')
    button_re = re.compile(r'(查看详情|编辑|删除|充值记录|消费记录|详情)')

    # Keywords that may appear in the header text of each logical column.
    col_mappings = {
        'name_col': ['会员名', '姓名', '会员名称', '客户', '车主'],
        'phone_col': ['手机', '电话', '联系电话'],
        'card_no_col': ['卡号', '会员卡号', '卡编号'],
        'card_type_col': ['卡类型', '卡名称', '类型'],
        'balance_col': ['余额', '储值余额', '可用余额'],
        'total_recharge_col': ['充值', '累计充值', '总充值', '充值金额'],
        'total_consume_col': ['消费', '累计消费', '总消费', '消费金额'],
        'status_col': ['状态', '卡状态'],
        'create_time_col': ['开卡时间', '创建时间', '注册时间'],
        'expire_time_col': ['到期时间', '有效期', '过期时间'],
        'store_col': ['门店', '所属门店', '门店名称'],
        'level_col': ['等级', '会员等级', '会员级别'],
    }

    # Resolve each logical column to a header index: keywords are tried in
    # order, and the first header containing a keyword wins.
    col_index = {}
    for key, keywords in col_mappings.items():
        for kw in keywords:
            hit = next((i for i, h in enumerate(header) if kw in h), None)
            if hit is not None:
                col_index[key] = hit
                break

    name_idx = col_index.get('name_col')
    phone_idx = col_index.get('phone_col')
    numeric_idx = {
        col_index[key]
        for key in ('balance_col', 'total_recharge_col', 'total_consume_col')
        if key in col_index
    }

    cleaned = {}
    for i, h in enumerate(header):
        if isinstance(row_dict, dict):
            raw = row_dict.get(h, '')
        else:
            raw = row_dict[i] if i < len(row_dict) else ''
        # None would break the string operations below.
        value = '' if raw is None else str(raw)

        # Action column: only button captions, nothing worth keeping.
        if '操作' in h:
            cleaned[h] = ''
            continue

        # Checkbox column.
        if '选择' in h or '勾选' in h:
            cleaned[h] = ''
            continue

        # Name column, possibly merged with the phone number.
        if i == name_idx:
            name, phone = '', ''
            if '|' in value:
                for part in (p.strip() for p in value.split('|')):
                    phone_match = phone_re.search(part)
                    if phone_match:
                        phone = phone_match.group()
                    elif part and not eleven_digits_re.match(part):
                        name = part if not name else name + part
                    elif eleven_digits_re.match(part):
                        # 11 digits that are not a mobile number.
                        phone = part
            else:
                phone_match = phone_re.search(value)
                if phone_match:
                    phone = phone_match.group()
                    name = value.replace(phone, '').strip()
                else:
                    name = value.strip()

            cleaned['客户名称'] = name
            # Only fill the phone when none is known yet, so a value already
            # taken from a dedicated phone column is never clobbered.
            if not cleaned.get('手机号'):
                cleaned['手机号'] = phone
            continue

        # Dedicated phone column.
        if i == phone_idx:
            phone_match = phone_re.search(value)
            extracted = phone_match.group() if phone_match else value
            # A non-empty dedicated value wins; an empty one must not erase a
            # phone already extracted from the name cell.
            if extracted or not cleaned.get('手机号'):
                cleaned['手机号'] = extracted
            continue

        # Numeric columns: keep the first number, including sign and decimals.
        if i in numeric_idx:
            num_match = number_re.search(value.replace(',', ''))
            cleaned[h] = num_match.group() if num_match else value
            continue

        # Everything else: drop separators, extra whitespace and button text.
        clean_val = re.sub(r'\s+', ' ', value.replace('|', ' ').strip())
        cleaned[h] = button_re.sub('', clean_val).strip()

    return cleaned
|
||
|
||
|
||
def normalize_dataframe(df):
    """Normalize the scraped card table as a whole.

    Steps, in order:

    1. Drop fully duplicated rows (the count is printed when any are removed).
    2. For columns whose label contains "名称" and where more than 30% of the
       values contain both a mobile number and a 2-4 character Chinese name,
       split the content into ``'客户名称'`` and ``'手机号'``. An existing
       ``'手机号'`` column, or an existing ``'客户名称'`` column other than the
       merged one, is left untouched.
    3. In amount columns (label contains 余额/充值/消费/金额, but not 时间/日期)
       keep only the first number of each cell, including sign and decimals.
    4. Drop action/checkbox columns (label contains 操作/选择/勾选).
    5. Remove button captions from every cell and turn ``'nan'`` / ``'None'``
       into empty strings.

    Args:
        df: raw DataFrame; it is not modified.

    Returns:
        A new DataFrame whose cells are all strings.
    """
    phone_re = re.compile(r'1[3-9]\d{9}')
    name_re = re.compile(r'[\u4e00-\u9fa5]{2,4}')
    # Optional sign, optional integer part, optional decimal point, digits.
    number_re = re.compile(r'-?\d*\.?\d+')
    button_re = re.compile(r'(查看详情|编辑|删除|充值记录|消费记录|详情|迁移)')

    def first_phone(cell):
        match = phone_re.search(str(cell))
        return match.group() if match else ''

    def strip_phone(cell):
        return phone_re.sub('', str(cell)).replace('|', '').strip()

    def first_number(cell):
        text = str(cell)
        match = number_re.search(text.replace(',', ''))
        return match.group() if match else text

    # 1. Remove exact duplicates. The explicit copy makes the column
    #    assignments below safe from chained-assignment warnings.
    before_count = len(df)
    df = df.drop_duplicates().copy()
    after_count = len(df)
    if before_count != after_count:
        print(f" 🔍 去重:{before_count} 条 → {after_count} 条(去除 {before_count - after_count} 条重复)")

    # 2. Split merged "name|phone" columns. Labels go through str() because a
    #    header-less table has integer column labels.
    for col in list(df.columns):
        if '名称' not in str(col):
            continue
        merged = df[col]
        phone_ratio = merged.apply(lambda x: bool(phone_re.search(str(x)))).mean()
        name_ratio = merged.apply(lambda x: bool(name_re.search(str(x)))).mean()
        if not (phone_ratio > 0.3 and name_ratio > 0.3):
            continue

        # Compute both parts from the untouched cells first: the merged
        # column may itself be '客户名称' and get overwritten below.
        names = merged.apply(strip_phone)
        phones = merged.apply(first_phone)
        if col == '客户名称' or '客户名称' not in df.columns:
            df['客户名称'] = names
        if '手机号' not in df.columns:
            df['手机号'] = phones

    # 3. Clean amount columns. Date/time columns such as "最近消费时间" also
    #    contain an amount keyword and must be skipped, or "2024-01-05" would
    #    be cut down to "2024".
    for col in df.columns:
        label = str(col)
        if any(kw in label for kw in ('时间', '日期')):
            continue
        if any(kw in label for kw in ('余额', '充值', '消费', '金额')):
            df[col] = df[col].apply(first_number)

    # 4. Drop action / checkbox columns.
    droppable = [
        col for col in df.columns
        if any(kw in str(col) for kw in ('操作', '选择', '勾选'))
    ]
    if droppable:
        df = df.drop(columns=droppable)

    # 5. Strip leftover button captions and stringified missing values.
    for col in df.columns:
        stripped = df[col].apply(lambda x: button_re.sub('', str(x)).strip())
        df[col] = stripped.replace('nan', '').replace('None', '')

    return df
|
||
|
||
|
||
def get_max_page(html):
    """Work out the total number of pages from a list page.

    Three strategies are tried in order; the first one that matches wins:
    a "共X页" phrase in the page text, a "页 1/X" phrase in the page text,
    and finally the highest number among ``cards_<n>.html`` pagination links.

    Args:
        html: HTML source of a list page (may be empty or ``None``).

    Returns:
        The page count as an int; 1 when nothing can be determined.
    """
    if not html:
        return 1

    document = BeautifulSoup(html, 'html.parser')
    page_text = document.get_text()

    # Textual indicators: "共X页" first, then "页 1/X".
    for pattern in (r'共\s*(\d+)\s*页', r'页\s*1/(\d+)'):
        found = re.search(pattern, page_text)
        if found:
            return int(found.group(1))

    # Fall back to the largest page number among the pagination links.
    highest = 1
    for link in document.find_all('a', href=re.compile(r'cards_\d+\.html')):
        found = re.search(r'cards_(\d+)\.html', link.get('href', ''))
        if found:
            highest = max(highest, int(found.group(1)))
    return highest
|
||
|
||
|
||
def main():
    """Run the whole export: crawl every page, normalize, save to disk.

    Fetches the first page to learn the page count, parses every page's
    table, builds one DataFrame from all rows, passes it through
    ``normalize_dataframe`` and writes it to ``OUTPUT_DIR`` as an .xlsx file.
    If writing the Excel file fails, a CSV with the same base name is written
    instead. Progress and errors are reported with ``print``.
    """
    banner = "=" * 50
    print(banner)
    print("开始爬取 H1系统 会员卡信息...")
    print(f"当前 StoreID: {PARAMS['storeId']}")
    print(banner)

    # The first page is needed to find out how many pages exist.
    print("正在获取总页数...")
    first_html = get_page_html(1, COOKIES, PARAMS)
    if not first_html:
        print("❌ 无法获取第一页数据,请检查 Cookie 或网络。")
        return

    max_page = get_max_page(first_html)
    print(f"✅ 成功获取最大页数:{max_page}")

    # Crawl every page, reusing the HTML already fetched for page 1.
    all_data = []
    merged_header = []
    for page in range(1, max_page + 1):
        print(f"正在爬取第 {page}/{max_page} 页...")

        html = first_html if page == 1 else get_page_html(page, COOKIES, PARAMS)
        if not html:
            print(f"❌ 第 {page} 页获取失败,跳过。")
            continue

        header, rows = parse_cards_table(html)
        if not (header or rows):
            print(f"⚠️ 第 {page} 页未解析到表格数据。")
            continue

        # Headers may differ slightly between pages: keep the union, in
        # first-seen order.
        for title in header:
            if title not in merged_header:
                merged_header.append(title)

        all_data.extend(rows)

        # Short pause between requests to avoid hammering the server.
        if page < max_page:
            time.sleep(0.3)

    if not all_data:
        print("\n❌ 未获取到任何数据,请检查 Cookie 或网络。")
        return

    print(f"\n✅ 爬取完成,共获取 {len(all_data)} 条原始记录")

    # Build the DataFrame, padding or truncating rows to the header width.
    if merged_header:
        width = len(merged_header)
        normalized_rows = [
            (row + [''] * (width - len(row)))[:width]
            for row in all_data
        ]
        df = pd.DataFrame(normalized_rows, columns=merged_header)
    else:
        df = pd.DataFrame(all_data)

    print(f"📋 原始列名:{list(df.columns)}")
    print(f"📋 原始数据前3行:")
    print(df.head(3).to_string())

    # Normalize the data.
    print("\n开始数据规范化处理...")
    df = normalize_dataframe(df)

    # Save the result under a timestamped file name.
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    time_str = datetime.now().strftime("%Y%m%d_%H%M%S")
    filepath = os.path.join(OUTPUT_DIR, f"H1会员卡信息_{time_str}.xlsx")

    try:
        df.to_excel(filepath, index=False)
        print(banner)
        print(f"✅ 导出完成!")
        print(f"📊 最终有效条数:{len(df)}")
        print(f"📁 已保存到:{filepath}")
        print(banner)
    except Exception as e:
        print(f"❌ 保存Excel失败: {e}")
        # Fall back to CSV.
        csv_path = filepath.replace('.xlsx', '.csv')
        df.to_csv(csv_path, index=False, encoding='utf-8-sig')
        print(f"💡 已转为 CSV 保存至:{csv_path}")


if __name__ == '__main__':
    main()
|