Files
F6--/张阳脚本/竞品系统数据导出/H1会员卡导出_temp.py
T
2026-04-18 09:22:23 +08:00

425 lines
14 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""
H1车店系统 - 会员卡信息导出
从 https://scrm.h1cd.com/admin/members/cards.html 导出会员卡信息
注意:脚本解析HTML表格,导出的原始数据格式不规范,需要清洗处理
"""
import requests
import pandas as pd
from bs4 import BeautifulSoup
import os
import re
import time
import json
from datetime import datetime
# ===================== [Configuration] =====================
# Cookies sent with every request (update them to match the current login session).
# NOTE(review): 'adminpd', 'adminun' and 'PHPSESSID' look like live account/session
# credentials committed in source -- consider loading them from an untracked file or
# environment variables and rotating the session; confirm with the script owner.
COOKIES = {
    'showSmsActivity': '1',
    'showEasyMoney': '1',
    'LOGIN_URL': 'https%3A%2F%2Fscrm.h1cd.com%2Flogin-h1cd.html',
    'adminpd': 'jVISiRrtcJplFhLoCuUIxK9XG5ekdfwzq%2B0y482ZKxE%3D',
    'adminun': '15224781773',
    'uid': '10291',
    'PHPSESSID': 'nbn58laakng0rv5iqln82a6qpu',
}
# Request headers passed to requests.get() by get_page_html; the values mimic a
# desktop Edge/Chromium browser loading the cards page inside an iframe.
HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
    'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
    'Connection': 'keep-alive',
    'Referer': 'https://scrm.h1cd.com/admin/members/cards.html',
    'Sec-Fetch-Dest': 'iframe',
    'Sec-Fetch-Mode': 'navigate',
    'Sec-Fetch-Site': 'same-origin',
    'Sec-Fetch-User': '?1',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/146.0.0.0 Safari/537.36 Edg/146.0.0.0',
    'sec-ch-ua': '"Chromium";v="146", "Not-A.Brand";v="24", "Microsoft Edge";v="146"',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': '"Windows"',
}
# Query-string parameters appended to every list-page request.
# NOTE(review): presumably empty strings / storeId '0' mean "no filter" -- verify
# against the site's behaviour.
PARAMS = {
    'type': '',
    'expired': '',
    'storeId': '0',
    'search': '',
}
# Directory the exported spreadsheet is written to (created by main() if missing).
OUTPUT_DIR = r"D:\Idea Project\F6+宜搭+其它(1)\张阳脚本\文件输出"
# =====================================================
def get_page_html(page_num, cookies, params):
    """Fetch the HTML of one page of the member-card list.

    Page 1 lives at ``cards.html``; every later page at ``cards_<n>.html``.

    Args:
        page_num: 1-based page number to download.
        cookies: Cookie mapping sent with the request.
        params: Query-string parameters sent with the request.

    Returns:
        The page body decoded as UTF-8, or ``None`` when the request fails,
        the server answers with an HTTP error status, or the response looks
        like the login page (expired cookie).
    """
    if page_num == 1:
        url = "https://scrm.h1cd.com/admin/members/cards.html"
    else:
        url = f"https://scrm.h1cd.com/admin/members/cards_{page_num}.html"
    try:
        r = requests.get(url, headers=HEADERS, cookies=cookies, params=params, timeout=30)
        # Force the encoding BEFORE the first access to r.text: the login probe
        # below searches for Chinese text, which would never match if the body
        # were decoded with a charset guessed from the response headers.
        r.encoding = 'utf-8'
        # Detect a redirect to the login page (final URL or page content).
        if 'login' in r.url.lower() or '登录' in r.text[:2000]:
            print(f" ⚠️ 第{page_num}页检测到跳转登录,Cookie可能已失效。")
            return None
        r.raise_for_status()
        return r.text
    except requests.RequestException as e:
        # Covers connection errors, timeouts and the HTTPError raised above.
        print(f" ❌ 第{page_num}页请求失败: {str(e)}")
        return None
def parse_cards_table(html):
    """Extract the header and data rows of the member-card HTML table.

    The table with CSS class ``table`` is preferred; otherwise the first
    ``<table>`` in the document is used. Text fragments inside one cell
    (e.g. pieces separated by ``<br>``) are joined with ``'|'`` so that later
    cleaning steps can split them again.

    Args:
        html: HTML source of one list page.

    Returns:
        A ``(header, data_rows)`` tuple: ``header`` is a list of column
        titles (possibly empty) and ``data_rows`` a list of rows, each a list
        of cell strings. ``([], [])`` is returned when no table is found.
    """
    document = BeautifulSoup(html, 'html.parser')
    table = document.find('table', class_='table') or document.find('table')
    if not table:
        return [], []

    def _titles(container):
        # Text of every <th> below the given element.
        return [cell.get_text(strip=True) for cell in container.find_all('th')]

    # Header: <thead> first, then fall back to the <th> cells of the first row.
    header = []
    head_section = table.find('thead')
    if head_section:
        header = _titles(head_section)
    if not header:
        leading_row = table.find('tr')
        if leading_row:
            header = _titles(leading_row)

    # Data rows: restrict to <tbody> when the table has one.
    body_section = table.find('tbody')
    row_source = body_section if body_section else table

    data_rows = []
    for row_tag in row_source.find_all('tr'):
        if row_tag.find('th'):
            # Header row, not data.
            continue
        cells = row_tag.find_all('td')
        if len(cells) < 3:
            # Too few cells to be a real record.
            continue
        values = []
        for cell in cells:
            joined = cell.get_text(separator='|', strip=True)
            values.append(re.sub(r'\s+', ' ', joined).strip())
        if any(values):
            data_rows.append(values)
    return header, data_rows
def clean_card_record(row_dict, header):
    """Clean one raw member-card row into a dict of normalized strings.

    Handling per column (columns are recognised by keywords in ``header``):
      * name column: a cell holding name and mobile number together is split
        into the keys ``'客户名称'`` and ``'手机号'``;
      * dedicated phone column: reduced to the mobile number when one is found;
      * balance / recharge / consumption columns: reduced to the first numeric
        token, thousands separators removed and a leading minus sign kept;
      * action and checkbox columns: blanked;
      * everything else: ``'|'`` separators and button captions removed.

    A phone number that has already been extracted is never overwritten by an
    empty value, so the result does not depend on whether the name column or
    the phone column comes first in ``header``.

    Args:
        row_dict: The raw row, either a dict keyed by header text or a
            sequence positionally aligned with ``header`` (missing trailing
            cells are treated as empty).
        header: List of column titles.

    Returns:
        Dict mapping output column name to cleaned string value.
    """
    cleaned = {}
    phone_re = re.compile(r'1[3-9]\d{9}')
    # The minus sign is only taken when it is not glued to a preceding word
    # character, so "A-100" yields "100" while "-50.00" / "¥-50" keep the sign.
    number_re = re.compile(r'(?:(?<![\w.])-)?(?:\d+(?:\.\d+)?|\.\d+)')

    # Keywords that may appear in the H1 table header for each logical column.
    col_mappings = {
        'name_col': ['会员名', '姓名', '会员名称', '客户', '车主'],
        'phone_col': ['手机', '电话', '联系电话'],
        'card_no_col': ['卡号', '会员卡号', '卡编号'],
        'card_type_col': ['卡类型', '卡名称', '类型'],
        'balance_col': ['余额', '储值余额', '可用余额'],
        'total_recharge_col': ['充值', '累计充值', '总充值', '充值金额'],
        'total_consume_col': ['消费', '累计消费', '总消费', '消费金额'],
        'status_col': ['状态', '卡状态'],
        'create_time_col': ['开卡时间', '创建时间', '注册时间'],
        'expire_time_col': ['到期时间', '有效期', '过期时间'],
        'store_col': ['门店', '所属门店', '门店名称'],
        'level_col': ['等级', '会员等级', '会员级别'],
    }

    # Resolve each logical column to the first header containing one of its
    # keywords; keywords are tried in list order.
    col_index = {}
    for key, keywords in col_mappings.items():
        for kw in keywords:
            idx = next((i for i, h in enumerate(header) if kw in h), None)
            if idx is not None:
                col_index[key] = idx
                break

    name_idx = col_index.get('name_col')
    phone_idx = col_index.get('phone_col')
    numeric_idx = {
        col_index.get('balance_col'),
        col_index.get('total_recharge_col'),
        col_index.get('total_consume_col'),
    }

    for i, h in enumerate(header):
        if isinstance(row_dict, dict):
            value = row_dict.get(h, '')
        else:
            value = row_dict[i] if i < len(row_dict) else ''
        # Tolerate None / non-string cells instead of crashing on str methods.
        value = '' if value is None else str(value)

        # Action column (button captions) and checkbox column carry no data.
        if '操作' in h or '选择' in h or '勾选' in h:
            cleaned[h] = ''
            continue

        # Name column, possibly with the phone number merged into it.
        if i == name_idx:
            name, phone = '', ''
            if '|' in value:
                for part in (p.strip() for p in value.split('|')):
                    phone_match = phone_re.search(part)
                    if phone_match:
                        phone = phone_match.group()
                    elif re.match(r'^\d{11}$', part):
                        # 11 digits that do not look like a mobile number.
                        phone = part
                    elif part:
                        name += part
            else:
                phone_match = phone_re.search(value)
                if phone_match:
                    phone = phone_match.group()
                    name = value.replace(phone, '').strip()
                else:
                    name = value.strip()
            cleaned['客户名称'] = name
            # Do not wipe a phone already taken from a dedicated phone column.
            if phone or not cleaned.get('手机号'):
                cleaned['手机号'] = phone
            continue

        # Dedicated phone column.
        if i == phone_idx:
            phone_match = phone_re.search(value)
            phone = phone_match.group() if phone_match else value
            # Do not wipe a phone already parsed out of the name column.
            if phone or not cleaned.get('手机号'):
                cleaned['手机号'] = phone
            continue

        # Amount columns: keep only the numeric token.
        if i in numeric_idx:
            num_match = number_re.search(value.replace(',', ''))
            cleaned[h] = num_match.group() if num_match else value
            continue

        # Any other column: collapse separators/whitespace, drop button captions.
        clean_val = value.replace('|', ' ').strip()
        clean_val = re.sub(r'\s+', ' ', clean_val)
        clean_val = re.sub(r'(查看详情|编辑|删除|充值记录|消费记录|详情)', '', clean_val).strip()
        cleaned[h] = clean_val
    return cleaned
def normalize_dataframe(df):
    """Normalize the scraped member-card table as a whole.

    Steps, in order:
      1. Drop fully duplicated rows (a summary is printed when any are removed).
      2. For every column whose label contains '名称' and in which more than
         30% of the values contain a mobile number and more than 30% contain a
         run of 2-4 Chinese characters, derive the columns '客户名称' and
         '手机号' (each only if not already present).
      3. Reduce columns whose label contains 余额 / 充值 / 消费 / 金额 to the
         first numeric token, dropping thousands separators and keeping a
         leading minus sign.
      4. Drop action / checkbox columns (label contains 操作 / 选择 / 勾选).
      5. Convert every remaining cell to a string, strip button captions and
         turn the literal strings 'nan' / 'None' into ''.

    Column labels are compared via ``str(label)`` so frames built without a
    header (integer labels) are handled as well.

    Args:
        df: Raw DataFrame built from the scraped rows.

    Returns:
        A new, cleaned DataFrame; the input frame is not modified.
    """
    phone_re = re.compile(r'1[3-9]\d{9}')
    name_re = re.compile(r'[\u4e00-\u9fa5]{2,4}')
    # The minus sign is only taken when it is not glued to a preceding word
    # character, so "A-100" yields "100" while "-50.00" keeps its sign.
    number_re = re.compile(r'(?:(?<![\w.])-)?(?:\d+(?:\.\d+)?|\.\d+)')
    button_re = re.compile(r'(查看详情|编辑|删除|充值记录|消费记录|详情|迁移)')

    def _extract_phone(cell):
        # First mobile number in the cell, or '' when there is none.
        found = phone_re.search(str(cell))
        return found.group() if found else ''

    def _strip_phone(cell):
        # Cell text with mobile numbers and '|' separators removed.
        return phone_re.sub('', str(cell)).replace('|', '').strip()

    def _extract_number(text):
        # First numeric token of the cell, or the cell unchanged when none.
        found = number_re.search(text.replace(',', ''))
        return found.group() if found else text

    # 1. Remove exact duplicates; copy so later assignments never touch the
    #    caller's frame.
    before_count = len(df)
    df = df.drop_duplicates().copy()
    after_count = len(df)
    if before_count != after_count:
        print(f" 🔍 去重:{before_count} 条 → {after_count} 条(去除 {before_count - after_count} 条重复)")

    # 2. Split merged "name|phone" columns. Iterate over a snapshot because
    #    new columns may be added inside the loop.
    for col in list(df.columns):
        if '名称' not in str(col):
            continue
        as_text = df[col].astype(str)
        phone_ratio = as_text.apply(lambda x: bool(phone_re.search(x))).mean()
        name_ratio = as_text.apply(lambda x: bool(name_re.search(x))).mean()
        if phone_ratio > 0.3 and name_ratio > 0.3:
            if '客户名称' not in df.columns:
                df['客户名称'] = df[col].apply(_strip_phone)
            if '手机号' not in df.columns:
                df['手机号'] = df[col].apply(_extract_phone)

    # 3. Clean amount columns.
    for col in df.columns:
        if any(kw in str(col) for kw in ('余额', '充值', '消费', '金额')):
            df[col] = df[col].astype(str).apply(_extract_number)

    # 4. Drop action / checkbox columns.
    drop_cols = [
        col for col in df.columns
        if any(kw in str(col) for kw in ('操作', '选择', '勾选'))
    ]
    if drop_cols:
        df = df.drop(columns=drop_cols)

    # 5. Remove leftover button captions and stringified missing values.
    for col in df.columns:
        stripped = df[col].astype(str).apply(lambda x: button_re.sub('', x).strip())
        df[col] = stripped.replace('nan', '').replace('None', '')
    return df
def get_max_page(html):
    """Determine the total number of list pages from the first page's HTML.

    The following sources are tried in order and the first hit wins:
      1. an explicit total of the form "共 X 页" in the page text;
      2. any "<number> 页" in the page text (loose fallback);
      3. a "1/X" fragment in the page text;
      4. the largest page number among links to ``cards_<n>.html``.

    Args:
        html: HTML source of the first list page (may be empty or ``None``).

    Returns:
        The detected page count, never less than 1; 1 when ``html`` is empty
        or nothing could be detected.
    """
    if not html:
        return 1
    soup = BeautifulSoup(html, 'html.parser')
    text = soup.get_text()

    text_patterns = (
        # Anchored on "共" so that "第1页 / 共15页" yields 15 rather than 1.
        r'共\s*(\d+)\s*页',
        # Loose fallbacks: may pick up an unrelated number, so they only run
        # when the explicit total is absent.
        r'\s*(\d+)\s*页',
        r'\s*1/(\d+)',
    )
    for pattern in text_patterns:
        match = re.search(pattern, text)
        if match:
            # A reported total of 0 still means page 1 exists.
            return max(1, int(match.group(1)))

    # Last resort: highest page number referenced by a pagination link.
    max_page = 1
    for a in soup.find_all('a', href=re.compile(r'cards_\d+\.html')):
        num_match = re.search(r'cards_(\d+)\.html', a.get('href', ''))
        if num_match:
            max_page = max(max_page, int(num_match.group(1)))
    return max_page
def main():
    """Run the export: download all pages, build a table, clean it, save it.

    Page 1 is fetched first to find the page count. Every page is then parsed,
    with failed or empty pages skipped. Rows are padded or truncated to the
    merged header width, the frame is normalized, and the result is written
    to OUTPUT_DIR as a timestamped .xlsx file. If the Excel export fails, a
    .csv file is written instead.
    """
    banner = "=" * 50
    print(banner)
    print("开始爬取 H1系统 会员卡信息...")
    print(f"当前 StoreID: {PARAMS['storeId']}")
    print(banner)

    # The first page doubles as the source of the page count.
    print("正在获取总页数...")
    first_html = get_page_html(1, COOKIES, PARAMS)
    if not first_html:
        print("❌ 无法获取第一页数据,请检查 Cookie 或网络。")
        return
    max_page = get_max_page(first_html)
    print(f"✅ 成功获取最大页数:{max_page}")

    # Walk through every page, reusing the already downloaded first one.
    all_data = []
    merged_header = []
    for page in range(1, max_page + 1):
        print(f"正在爬取第 {page}/{max_page} 页...")
        html = first_html if page == 1 else get_page_html(page, COOKIES, PARAMS)
        if not html:
            print(f"❌ 第 {page} 页获取失败,跳过。")
            continue
        header, rows = parse_cards_table(html)
        if not (header or rows):
            print(f"⚠️ 第 {page} 页未解析到表格数据。")
            continue
        # Headers may differ slightly between pages: keep the union, in
        # first-seen order.
        for title in header:
            if title not in merged_header:
                merged_header.append(title)
        all_data.extend(rows)
        # Short pause between requests to avoid hammering the server.
        if page < max_page:
            time.sleep(0.3)

    if not all_data:
        print("\n❌ 未获取到任何数据,请检查 Cookie 或网络。")
        return
    print(f"\n✅ 爬取完成,共获取 {len(all_data)} 条原始记录")

    # Build the DataFrame, forcing every row to the merged header's width.
    if merged_header:
        width = len(merged_header)
        normalized_rows = [
            (row + [''] * (width - len(row)))[:width]
            for row in all_data
        ]
        df = pd.DataFrame(normalized_rows, columns=merged_header)
    else:
        df = pd.DataFrame(all_data)
    print(f"📋 原始列名:{list(df.columns)}")
    print("📋 原始数据前3行:")
    print(df.head(3).to_string())

    # Clean the table.
    print("\n开始数据规范化处理...")
    df = normalize_dataframe(df)

    # Write the result.
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    time_str = datetime.now().strftime("%Y%m%d_%H%M%S")
    filepath = os.path.join(OUTPUT_DIR, f"H1会员卡信息_{time_str}.xlsx")
    try:
        df.to_excel(filepath, index=False)
        print(banner)
        print("✅ 导出完成!")
        print(f"📊 最终有效条数:{len(df)}")
        print(f"📁 已保存到:{filepath}")
        print(banner)
    except Exception as e:
        print(f"❌ 保存Excel失败: {e}")
        # Fall back to CSV next to the intended Excel file.
        csv_path = filepath.replace('.xlsx', '.csv')
        df.to_csv(csv_path, index=False, encoding='utf-8-sig')
        print(f"💡 已转为 CSV 保存至:{csv_path}")


if __name__ == '__main__':
    main()