# F6--/张阳脚本/竞品系统数据导出/H1会员卡导出_temp.py

"""
H1车店系统 - 会员卡信息导出
从 https://scrm.h1cd.com/admin/members/cards.html 导出会员卡信息
注意：脚本解析HTML表格，导出的原始数据格式不规范，需要清洗处理
"""

import requests
import pandas as pd
from bs4 import BeautifulSoup
import os
import re
import time
import json
from datetime import datetime

# ===================== [Configuration] =====================
# Session cookies sent with every request (update them when the session changes).
# NOTE(review): these look like live login credentials (session id, account,
# password hash) committed in source — consider loading them from the
# environment or an untracked file, and rotating the ones shown here.
COOKIES = {
    'showSmsActivity': '1',
    'showEasyMoney': '1',
    'LOGIN_URL': 'https%3A%2F%2Fscrm.h1cd.com%2Flogin-h1cd.html',
    'adminpd': 'jVISiRrtcJplFhLoCuUIxK9XG5ekdfwzq%2B0y482ZKxE%3D',
    'adminun': '15224781773',
    'uid': '10291',
    'PHPSESSID': 'nbn58laakng0rv5iqln82a6qpu',
}

# HTTP request headers; the User-Agent / sec-ch-ua values identify the client
# as Microsoft Edge on Windows.
HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
    'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
    'Connection': 'keep-alive',
    'Referer': 'https://scrm.h1cd.com/admin/members/cards.html',
    'Sec-Fetch-Dest': 'iframe',
    'Sec-Fetch-Mode': 'navigate',
    'Sec-Fetch-Site': 'same-origin',
    'Sec-Fetch-User': '?1',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/146.0.0.0 Safari/537.36 Edg/146.0.0.0',
    'sec-ch-ua': '"Chromium";v="146", "Not-A.Brand";v="24", "Microsoft Edge";v="146"',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': '"Windows"',
}

# Query-string parameters for the card listing (empty strings leave a filter unset).
PARAMS = {
    'type': '',
    'expired': '',
    'storeId': '0',
    'search': '',
}

# Directory the exported spreadsheet is written to.
OUTPUT_DIR = r"D:\Idea Project\F6+宜搭+其它(1)\张阳脚本\文件输出"

# =====================================================


def get_page_html(page_num, cookies, params):
    """Fetch the raw HTML of one member-card listing page.

    Page 1 is served from ``cards.html``; every later page from
    ``cards_<page_num>.html``.

    Args:
        page_num: 1-based page number to fetch.
        cookies: Cookie mapping sent with the request.
        params: Query-string parameters sent with the request.

    Returns:
        The response body decoded as UTF-8, or ``None`` when the request
        fails, the server answers with an HTTP error status, or the response
        looks like the login page (which suggests the cookies have expired).
    """
    if page_num == 1:
        url = "https://scrm.h1cd.com/admin/members/cards.html"
    else:
        url = f"https://scrm.h1cd.com/admin/members/cards_{page_num}.html"

    try:
        r = requests.get(url, headers=HEADERS, cookies=cookies, params=params, timeout=30)

        # Force UTF-8 *before* the first access to r.text. Otherwise the login
        # check below would inspect text decoded with the header-derived
        # encoding (ISO-8859-1 when the server omits a charset), in which the
        # Chinese marker can never match.
        r.encoding = 'utf-8'
        html = r.text

        # Detect a redirect to (or an inline rendering of) the login page.
        if 'login' in r.url.lower() or '登录' in html[:2000]:
            print(f"   ⚠️ 第{page_num}页检测到跳转登录，Cookie可能已失效。")
            return None

        r.raise_for_status()
        return html
    except requests.RequestException as e:
        # Best effort: report the failure and let the caller skip this page.
        print(f"   ❌ 第{page_num}页请求失败: {str(e)}")
        return None


def parse_cards_table(html):
    """Extract the header and the data rows of the member-card HTML table.

    The table with CSS class ``table`` is preferred; failing that, the first
    ``<table>`` in the document is used.

    Cell text is collected with ``|`` between text fragments (so content that
    was split by ``<br>`` stays distinguishable) and runs of whitespace are
    collapsed to a single space.

    Args:
        html: HTML source of one listing page.

    Returns:
        A ``(header, data_rows)`` tuple. ``header`` is a list of column titles
        (empty when no ``<th>`` cells were found); ``data_rows`` is a list of
        rows, each a list of cell strings. Both are empty when the document
        contains no table.
    """
    soup = BeautifulSoup(html, 'html.parser')
    table = soup.find('table', class_='table') or soup.find('table')
    if not table:
        return [], []

    # Column titles: prefer <thead>, otherwise fall back to <th> cells in the
    # very first row of the table.
    head_section = table.find('thead')
    header = (
        [cell.get_text(strip=True) for cell in head_section.find_all('th')]
        if head_section
        else []
    )
    if not header:
        leading_row = table.find('tr')
        if leading_row:
            header = [cell.get_text(strip=True) for cell in leading_row.find_all('th')]

    # Data rows: restrict to <tbody> when the table has one.
    body_section = table.find('tbody')
    candidate_rows = (body_section or table).find_all('tr')

    data_rows = []
    for tr in candidate_rows:
        # Rows containing <th> are header rows, not data.
        if tr.find('th'):
            continue

        cells = tr.find_all('td')
        # Rows with fewer than three cells are not treated as card records.
        if len(cells) < 3:
            continue

        values = [
            re.sub(r'\s+', ' ', cell.get_text(separator='|', strip=True)).strip()
            for cell in cells
        ]
        # Drop rows in which every cell is empty.
        if any(values):
            data_rows.append(values)

    return header, data_rows


# Header keywords used to recognise the logical role of a column.
# Order matters: for each role, the first keyword that occurs in any header wins.
_CARD_COLUMN_KEYWORDS = {
    'name_col': ['会员名', '姓名', '会员名称', '客户', '车主'],
    'phone_col': ['手机', '电话', '联系电话'],
    'card_no_col': ['卡号', '会员卡号', '卡编号'],
    'card_type_col': ['卡类型', '卡名称', '类型'],
    'balance_col': ['余额', '储值余额', '可用余额'],
    'total_recharge_col': ['充值', '累计充值', '总充值', '充值金额'],
    'total_consume_col': ['消费', '累计消费', '总消费', '消费金额'],
    'status_col': ['状态', '卡状态'],
    'create_time_col': ['开卡时间', '创建时间', '注册时间'],
    'expire_time_col': ['到期时间', '有效期', '过期时间'],
    'store_col': ['门店', '所属门店', '门店名称'],
    'level_col': ['等级', '会员等级', '会员级别'],
}

# Roles whose cells are reduced to a bare number.
_CARD_AMOUNT_ROLES = ('balance_col', 'total_recharge_col', 'total_consume_col')

_CARD_MOBILE_RE = re.compile(r'1[3-9]\d{9}')
_CARD_ELEVEN_DIGITS_RE = re.compile(r'^\d{11}$')
# Optional leading minus so negative amounts keep their sign.
_CARD_AMOUNT_RE = re.compile(r'-?(?:\d+(?:\.\d*)?|\.\d+)')
_CARD_BUTTON_TEXT_RE = re.compile(r'(查看详情|编辑|删除|充值记录|消费记录|详情)')


def _locate_card_columns(header):
    """Map each column role to the index of the header that matches it."""
    col_index = {}
    for role, keywords in _CARD_COLUMN_KEYWORDS.items():
        for kw in keywords:
            hit = next((i for i, title in enumerate(header) if kw in title), None)
            if hit is not None:
                col_index[role] = hit
                break
    return col_index


def _split_name_and_phone(value):
    """Split a cell that may hold both a name and a phone into ``(name, phone)``."""
    if '|' not in value:
        found = _CARD_MOBILE_RE.search(value)
        if not found:
            return value.strip(), ''
        phone = found.group()
        return value.replace(phone, '').strip(), phone

    name_parts = []
    phone = ''
    for part in (piece.strip() for piece in value.split('|')):
        found = _CARD_MOBILE_RE.search(part)
        if found:
            phone = found.group()
            # Keep any name text that shares the fragment with the phone.
            leftover = part.replace(phone, '').strip()
            if leftover:
                name_parts.append(leftover)
        elif _CARD_ELEVEN_DIGITS_RE.match(part):
            # 11 digits that do not look like a mobile number are still
            # treated as the phone.
            phone = part
        elif part:
            name_parts.append(part)
    return ''.join(name_parts), phone


def clean_card_record(row_dict, header):
    """Clean one raw member-card record.

    Handles the irregularities of the scraped table:

    1. A cell holding both name and phone is split into the separate
       ``客户名称`` and ``手机号`` output keys.
    2. Amount cells (balance / recharge / consumption) are reduced to a bare
       number; thousands separators are removed and a leading minus is kept.
    3. Action and checkbox columns are blanked.
    4. Button captions and ``|`` separators are removed from all other cells.

    A phone number that was already extracted is never overwritten by an
    empty or non-matching value from another column.

    Args:
        row_dict: The record, either a dict keyed by header text or a
            positional sequence aligned with ``header``. Missing or ``None``
            cells are treated as empty strings.
        header: List of column titles.

    Returns:
        A dict of cleaned values. Keys are the header titles, except that the
        name and phone columns are emitted as ``客户名称`` and ``手机号``.
    """
    cleaned = {}
    col_index = _locate_card_columns(header)
    name_idx = col_index.get('name_col')
    phone_idx = col_index.get('phone_col')
    amount_idxs = {col_index[role] for role in _CARD_AMOUNT_ROLES if role in col_index}
    is_mapping = isinstance(row_dict, dict)

    for i, h in enumerate(header):
        if is_mapping:
            raw = row_dict.get(h, '')
        else:
            raw = row_dict[i] if i < len(row_dict) else ''
        # Tolerate None / non-string cells instead of failing on str methods.
        value = '' if raw is None else str(raw)

        # Action column: only contains button captions.
        if '操作' in h:
            cleaned[h] = ''
            continue

        # Checkbox column.
        if '选择' in h or '勾选' in h:
            cleaned[h] = ''
            continue

        # Name column, possibly merged with the phone number.
        if i == name_idx:
            name, phone = _split_name_and_phone(value)
            cleaned['客户名称'] = name
            # Do not wipe a phone taken from a dedicated phone column.
            if phone or not cleaned.get('手机号'):
                cleaned['手机号'] = phone
            continue

        # Dedicated phone column.
        if i == phone_idx:
            found = _CARD_MOBILE_RE.search(value)
            if found:
                cleaned['手机号'] = found.group()
            elif not cleaned.get('手机号'):
                # Keep the raw text only when no phone is known yet.
                cleaned['手机号'] = value
            continue

        # Amount columns: keep just the number.
        if i in amount_idxs:
            found = _CARD_AMOUNT_RE.search(value.replace(',', ''))
            cleaned[h] = found.group() if found else value
            continue

        # Everything else: normalise separators/whitespace, drop button captions.
        clean_val = re.sub(r'\s+', ' ', value.replace('|', ' ').strip())
        cleaned[h] = _CARD_BUTTON_TEXT_RE.sub('', clean_val).strip()

    return cleaned


_DF_MOBILE_RE = re.compile(r'1[3-9]\d{9}')
_DF_CJK_NAME_RE = re.compile(r'[\u4e00-\u9fa5]{2,4}')
# Optional leading minus so negative amounts keep their sign.
_DF_AMOUNT_RE = re.compile(r'-?(?:\d+(?:\.\d*)?|\.\d+)')
_DF_BUTTON_TEXT_RE = re.compile(r'(查看详情|编辑|删除|充值记录|消费记录|详情|迁移)')


def _df_match_ratio(series, pattern):
    """Return the fraction of cells in *series* whose text contains *pattern*."""
    if len(series) == 0:
        return 0.0
    return series.map(lambda cell: bool(pattern.search(str(cell)))).mean()


def _df_first_mobile(cell):
    """Return the first mobile number found in *cell*, or an empty string."""
    found = _DF_MOBILE_RE.search(str(cell))
    return found.group() if found else ''


def _df_extract_amount(cell):
    """Reduce *cell* to its first number; return the text unchanged if none."""
    text = str(cell)
    found = _DF_AMOUNT_RE.search(text.replace(',', ''))
    return found.group() if found else text


def _df_strip_button_text(cell):
    """Remove button captions from *cell*; map 'nan' / 'None' to ''."""
    text = _DF_BUTTON_TEXT_RE.sub('', str(cell)).strip()
    return '' if text in ('nan', 'None') else text


def normalize_dataframe(df):
    """Normalise the scraped member-card table.

    Steps, in order:

    1. Drop fully duplicated rows (a summary line is printed when any are
       removed).
    2. For a column whose label contains ``名称`` and in which more than 30%
       of cells contain a mobile number and more than 30% contain a 2-4
       character Chinese run, split it into ``客户名称`` and ``手机号``.
       Existing columns with those labels are left alone, except that a
       merged column that is itself labelled ``客户名称`` is cleaned in place.
    3. Reduce amount columns (label contains 余额/充值/消费/金额) to a bare
       number, keeping a leading minus. Columns whose label also contains
       时间 or 日期 are skipped so dates are not truncated.
    4. Drop action / checkbox columns (label contains 操作/选择/勾选).
    5. Convert every cell to text, strip button captions, and turn the
       textual placeholders ``nan`` / ``None`` into empty strings.

    Column labels may be of any type (e.g. integers for a header-less
    table); they are compared via ``str(label)``.

    Args:
        df: Raw DataFrame; it is not modified.

    Returns:
        A new, cleaned DataFrame whose cells are all strings.
    """
    before_count = len(df)
    # copy() so the column assignments below act on an independent frame.
    df = df.drop_duplicates().copy()
    after_count = len(df)
    if before_count != after_count:
        print(f"   🔍 去重：{before_count} 条 → {after_count} 条（去除 {before_count - after_count} 条重复）")

    # Split merged "name|phone" columns. Iterate over a snapshot because new
    # columns may be added inside the loop.
    for col in list(df.columns):
        if '名称' not in str(col):
            continue
        merged = df[col]
        if not (_df_match_ratio(merged, _DF_MOBILE_RE) > 0.3
                and _df_match_ratio(merged, _DF_CJK_NAME_RE) > 0.3):
            continue

        # Derive both pieces from the original cells before overwriting anything.
        phones = merged.map(_df_first_mobile)
        names = merged.map(
            lambda cell: _DF_MOBILE_RE.sub('', str(cell)).replace('|', '').strip()
        )
        if col == '客户名称' or '客户名称' not in df.columns:
            df['客户名称'] = names
        if '手机号' not in df.columns:
            df['手机号'] = phones

    # Amount columns.
    for col in list(df.columns):
        label = str(col)
        if any(kw in label for kw in ('时间', '日期')):
            # e.g. "最近消费时间" holds dates, not amounts.
            continue
        if any(kw in label for kw in ('余额', '充值', '消费', '金额')):
            df[col] = df[col].map(_df_extract_amount)

    # Action / checkbox columns carry no data.
    unwanted = [
        col for col in df.columns
        if any(kw in str(col) for kw in ('操作', '选择', '勾选'))
    ]
    if unwanted:
        df = df.drop(columns=unwanted)

    # Final pass over every remaining column.
    for col in list(df.columns):
        df[col] = df[col].map(_df_strip_button_text)

    return df


def get_max_page(html):
    """Determine how many listing pages exist from the first page's HTML.

    Three strategies are tried in order:

    1. a "共X页" marker in the page text,
    2. a "页 1/X" marker in the page text,
    3. the highest number found in ``cards_<n>.html`` pagination links.

    Args:
        html: HTML source of the first listing page (may be empty/``None``).

    Returns:
        The detected page count, or 1 when nothing could be detected.
    """
    if not html:
        return 1

    soup = BeautifulSoup(html, 'html.parser')
    page_text = soup.get_text()

    # Textual page-count markers, most specific first.
    for pattern in (r'共\s*(\d+)\s*页', r'页\s*1/(\d+)'):
        found = re.search(pattern, page_text)
        if found:
            return int(found.group(1))

    # Fall back to the largest page number referenced by a pagination link;
    # the leading 1 is the floor used when no link yields a number.
    anchors = soup.find_all('a', href=re.compile(r'cards_\d+\.html'))
    hits = (re.search(r'cards_(\d+)\.html', a.get('href', '')) for a in anchors)
    return max([1] + [int(hit.group(1)) for hit in hits if hit])


def _page_column_names(header, width):
    """Return unique column names covering at least *width* cells of one page.

    Header texts are kept as-is on first use. A repeated text, or a cell
    position beyond the end of the header, gets a position-based suffix so
    that no two columns of a page share a name.
    """
    names = []
    taken = set()
    for idx in range(max(len(header), width)):
        base = header[idx] if idx < len(header) else ''
        name = base
        if idx >= len(header) or name in taken:
            name = f"{base or '未命名列'}_{idx + 1}"
        while name in taken:
            name += '_'
        taken.add(name)
        names.append(name)
    return names


def main():
    """Scrape every member-card listing page and export the result.

    Workflow:

    1. Fetch page 1 and read the total page count from it.
    2. Fetch and parse each page. Every row is stored as a dict keyed by the
       column names of *its own* page, so rows stay aligned even when header
       texts repeat within a page or differ between pages, and cells beyond
       the header width are kept under generated names instead of dropped.
    3. Build a DataFrame, normalise it, and write it to a timestamped
       ``.xlsx`` file in ``OUTPUT_DIR`` (falling back to CSV if that fails).

    Pages that cannot be fetched or parsed are reported and skipped.
    """
    print("=" * 50)
    print("开始爬取 H1系统 会员卡信息...")
    print(f"当前 StoreID: {PARAMS['storeId']}")
    print("=" * 50)

    # Page 1 doubles as the source of the total page count.
    print("正在获取总页数...")
    first_html = get_page_html(1, COOKIES, PARAMS)
    if not first_html:
        print("❌ 无法获取第一页数据，请检查 Cookie 或网络。")
        return

    max_page = get_max_page(first_html)
    print(f"✅ 成功获取最大页数：{max_page}")

    records = []        # one dict per card row, keyed by column name
    merged_header = []  # union of column names, in first-seen order
    last_header = []    # most recent non-empty header, reused for header-less pages

    for page in range(1, max_page + 1):
        print(f"正在爬取第 {page}/{max_page} 页...")

        if page == 1:
            html = first_html
        else:
            html = get_page_html(page, COOKIES, PARAMS)
            if not html:
                print(f"❌ 第 {page} 页获取失败，跳过。")
                continue

        header, rows = parse_cards_table(html)

        if not header and not rows:
            print(f"⚠️ 第 {page} 页未解析到表格数据。")
            continue

        if header:
            last_header = header
        widest_row = max((len(row) for row in rows), default=0)
        names = _page_column_names(header or last_header, widest_row)

        # Headers may differ slightly between pages; keep the union.
        for name in names:
            if name not in merged_header:
                merged_header.append(name)

        records.extend(dict(zip(names, row)) for row in rows)

        # Short pause between requests to avoid hammering the server.
        if page < max_page:
            time.sleep(0.3)

    if not records:
        print("\n❌ 未获取到任何数据，请检查 Cookie 或网络。")
        return

    print(f"\n✅ 爬取完成，共获取 {len(records)} 条原始记录")

    # Cells a row does not have (shorter row, column from another page) become ''.
    df = pd.DataFrame(records, columns=merged_header).fillna('')

    print(f"📋 原始列名：{list(df.columns)}")
    print("📋 原始数据前3行：")
    print(df.head(3).to_string())

    print("\n开始数据规范化处理...")
    df = normalize_dataframe(df)

    os.makedirs(OUTPUT_DIR, exist_ok=True)
    time_str = datetime.now().strftime("%Y%m%d_%H%M%S")
    filename = f"H1会员卡信息_{time_str}.xlsx"
    filepath = os.path.join(OUTPUT_DIR, filename)

    try:
        df.to_excel(filepath, index=False)
        print("=" * 50)
        print("✅ 导出完成！")
        print(f"📊 最终有效条数：{len(df)}")
        print(f"📁 已保存到：{filepath}")
        print("=" * 50)
    except Exception as e:
        # Top-level boundary: whatever went wrong with Excel (missing engine,
        # locked file, ...), still try to leave the user with a CSV.
        print(f"❌ 保存Excel失败: {e}")
        # Swap only the extension, never other parts of the path.
        csv_path = os.path.splitext(filepath)[0] + '.csv'
        df.to_csv(csv_path, index=False, encoding='utf-8-sig')
        print(f"💡 已转为 CSV 保存至：{csv_path}")


if __name__ == '__main__':
    main()