Files
saas/test/续约回访90-180天调试.ipynb
2026-04-09 09:53:47 +08:00

1218 lines
59 KiB
Plaintext
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# 续约回访90-180天调试分析\n",
"\n",
"本notebook用于调试续约回访派发数据为空的问题,每一步都会保存CSV以便分析。\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 步骤1: 导入必要的库和初始化\n"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"当前工作目录: d:\\Idea Project\\SaaS_V1.7\\test\n",
"项目根目录: d:\\Idea Project\\SaaS_V1.7\n",
"Python路径: ['d:\\\\Idea Project\\\\SaaS_V1.7', 'd:\\\\Program Files\\\\anaconda3\\\\envs\\\\SaaS\\\\python313.zip', 'd:\\\\Program Files\\\\anaconda3\\\\envs\\\\SaaS\\\\DLLs']...\n",
"输出目录: d:\\Idea Project\\SaaS_V1.7\\back_ground_module\\output\\debug_revisit_renew\n"
]
}
],
"source": [
"import os\n",
"import sys\n",
"import time\n",
"import requests\n",
"\n",
"# 添加项目根目录到Python路径\n",
"# 方法1: 如果notebook在test目录下,向上找一级\n",
"current_dir = os.getcwd()\n",
"if os.path.basename(current_dir) == 'test':\n",
" project_root = os.path.dirname(current_dir)\n",
"else:\n",
" # 方法2: 向上查找直到找到api.py文件\n",
" project_root = current_dir\n",
" while project_root != os.path.dirname(project_root):\n",
" if os.path.exists(os.path.join(project_root, 'api.py')):\n",
" break\n",
" project_root = os.path.dirname(project_root)\n",
"\n",
"if project_root not in sys.path:\n",
" sys.path.insert(0, project_root)\n",
"print(f\"当前工作目录: {current_dir}\")\n",
"print(f\"项目根目录: {project_root}\")\n",
"print(f\"Python路径: {sys.path[:3]}...\") # 只显示前3个路径\n",
"\n",
"from api import API\n",
"from back_ground_module import CommonModule\n",
"import pandas as pd\n",
"import datetime\n",
"import re\n",
"from log_config import configure_task_logger, configure_error_task_logger\n",
"\n",
"api_instance = API()\n",
"common_module = CommonModule()\n",
"logger = configure_task_logger()\n",
"error_task_logger = configure_error_task_logger()\n",
"\n",
"# 设置输出目录(相对于当前notebook目录)\n",
"output_dir = \"output/debug_revisit_renew\"\n",
"os.makedirs(output_dir, exist_ok=True)\n",
"print(f\"输出目录: {os.path.abspath(output_dir)}\")\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 步骤2: 加载所有数据(单独执行,数据量较大)\n",
"\n",
"**注意:这一步数据量较大,请单独执行此单元**\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"print(\"开始加载所有数据...\")\n",
"print(f\"开始时间: {datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\")\n",
"\n",
"# 省市区人员关系表\n",
"payload = {\"api_key\": \"675b900991ad2491c69389ca\", \"entry_id\": \"676512ac3e54dc3159460c0a\"}\n",
"json_dict = api_instance.entry_data_list(payload)\n",
"if json_dict and \"data\" in json_dict:\n",
" json_list = json_dict.get(\"data\")\n",
" print(f\"省市区人员关系表: {len(json_list)} 条\")\n",
" pd.DataFrame(json_list).to_csv(f\"{output_dir}/01_省市区人员关系表.csv\", index=False, encoding='utf-8-sig')\n",
"else:\n",
" print(\"加载省市区人员关系表失败\")\n",
" json_list = []\n",
"\n",
"# 获取简道云员工id\n",
"payload = {\"api_key\": \"6694d3c4fcb69ca9a111a6c4\", \"entry_id\": \"6769204a1902c9341340a1bc\"}\n",
"staff_id = api_instance.entry_data_list(payload)\n",
"staff_id_list = staff_id.get(\"data\")\n",
"print(f\"简道云员工id: {len(staff_id_list)} 条\")\n",
"pd.DataFrame(staff_id_list).to_csv(f\"{output_dir}/02_简道云员工id.csv\", index=False, encoding='utf-8-sig')\n",
"\n",
"# 获取权限表信息\n",
"payload = {\"api_key\": \"675b900991ad2491c69389ca\", \"entry_id\": \"675b96c14e839f90fef1647c\"}\n",
"permissions_table = api_instance.entry_data_list(payload).get(\"data\")\n",
"print(f\"权限表: {len(permissions_table)} 条\")\n",
"pd.DataFrame(permissions_table).to_csv(f\"{output_dir}/03_权限表.csv\", index=False, encoding='utf-8-sig')\n",
"\n",
"# 获取NGV数据\n",
"payload = {\"api_key\": \"675b900991ad2491c69389ca\", \"entry_id\": \"675bb02bd2d53c2034c665e4\"}\n",
"NGV_data_list = api_instance.entry_data_list(payload).get(\"data\")\n",
"print(f\"NGV数据: {len(NGV_data_list)} 条\")\n",
"pd.DataFrame(NGV_data_list).to_csv(f\"{output_dir}/04_NGV数据.csv\", index=False, encoding='utf-8-sig')\n",
"\n",
"# 获取服务提醒-数据支持表单数据\n",
"payload = {\"api_key\": \"675b900991ad2491c69389ca\", \"entry_id\": \"676bb7bda3029720f1083e99\"}\n",
"service_remind = api_instance.entry_data_list(payload).get(\"data\")\n",
"print(f\"服务提醒: {len(service_remind)} 条\")\n",
"pd.DataFrame(service_remind).to_csv(f\"{output_dir}/05_服务提醒.csv\", index=False, encoding='utf-8-sig')\n",
"\n",
"# 获取智能检测-数据支持表单数据\n",
"payload = {\"api_key\": \"675b900991ad2491c69389ca\", \"entry_id\": \"676bb99649ab3ac975af6e39\"}\n",
"Smart_detection = api_instance.entry_data_list(payload).get(\"data\")\n",
"print(f\"智能检测: {len(Smart_detection)} 条\")\n",
"pd.DataFrame(Smart_detection).to_csv(f\"{output_dir}/06_智能检测.csv\", index=False, encoding='utf-8-sig')\n",
"\n",
"# 获取功能使用情况表\n",
"payload = {\"api_key\": \"675b900991ad2491c69389ca\", \"entry_id\": \"6763bbf657bd8fb76fcb41b2\"}\n",
"get_feature_usage = api_instance.entry_data_list(payload).get(\"data\", [])\n",
"print(f\"功能使用情况: {len(get_feature_usage)} 条\")\n",
"pd.DataFrame(get_feature_usage).to_csv(f\"{output_dir}/07_功能使用情况.csv\", index=False, encoding='utf-8-sig')\n",
"\n",
"# 获取保单识别表\n",
"payload = {\"api_key\": \"675b900991ad2491c69389ca\", \"entry_id\": \"6773a60d30ed87ff9f68d3c5\"}\n",
"policy_recognition = api_instance.entry_data_list(payload).get(\"data\")\n",
"print(f\"保单识别: {len(policy_recognition)} 条\")\n",
"widget_list = [item['_widget_1735632397600'] for item in policy_recognition]\n",
"pd.DataFrame(policy_recognition).to_csv(f\"{output_dir}/08_保单识别.csv\", index=False, encoding='utf-8-sig')\n",
"\n",
"# 获取私域小程序-数据支持表单数据\n",
"payload = {\"api_key\": \"675b900991ad2491c69389ca\", \"entry_id\": \"67e0f0fae622896749ba5087\"}\n",
"private_domain = api_instance.entry_data_list(payload).get(\"data\", [])\n",
"print(f\"私域小程序: {len(private_domain)} 条\")\n",
"pd.DataFrame(private_domain).to_csv(f\"{output_dir}/09_私域小程序.csv\", index=False, encoding='utf-8-sig')\n",
"\n",
"# 获取公域小程序-数据支持表单数据\n",
"payload = {\"api_key\": \"675b900991ad2491c69389ca\", \"entry_id\": \"67e0c702c8f603b997980999\"}\n",
"public_domain = api_instance.entry_data_list(payload).get(\"data\", [])\n",
"public_domain_list = [item['_widget_1742784257506'] for item in public_domain]\n",
"print(f\"公域小程序: {len(public_domain)} 条\")\n",
"pd.DataFrame(public_domain).to_csv(f\"{output_dir}/10_公域小程序.csv\", index=False, encoding='utf-8-sig')\n",
"\n",
"# 获取异业合作-数据支持表单数据\n",
"payload = {\"api_key\": \"675b900991ad2491c69389ca\", \"entry_id\": \"67e24fdd8dfcfa918e17c30b\"}\n",
"different_industries = api_instance.entry_data_list(payload).get(\"data\", [])\n",
"different_industries_list = [item['_widget_1742884829007'] for item in different_industries]\n",
"print(f\"异业合作: {len(different_industries)} 条\")\n",
"pd.DataFrame(different_industries).to_csv(f\"{output_dir}/11_异业合作.csv\", index=False, encoding='utf-8-sig')\n",
"\n",
"# 获取短信-数据支持表单数据\n",
"payload = {\"api_key\": \"675b900991ad2491c69389ca\", \"entry_id\": \"67e5107198ba1b20d5df3974\"}\n",
"groupnotification = api_instance.entry_data_list(payload).get(\"data\", [])\n",
"print(f\"短信: {len(groupnotification)} 条\")\n",
"pd.DataFrame(groupnotification).to_csv(f\"{output_dir}/12_短信.csv\", index=False, encoding='utf-8-sig')\n",
"\n",
"# 获取多公司过滤表\n",
"payload = {\"api_key\": \"675b900991ad2491c69389ca\", \"entry_id\": \"689bf5f8ba88a28cb0679ec9\"}\n",
"get_filter_company_list = api_instance.entry_data_list(payload).get(\"data\", [])\n",
"print(f\"多公司过滤表: {len(get_filter_company_list)} 条\")\n",
"pd.DataFrame(get_filter_company_list).to_csv(f\"{output_dir}/13_多公司过滤表.csv\", index=False, encoding='utf-8-sig')\n",
"\n",
"print(f\"\\n数据加载完成!\")\n",
"print(f\"结束时间: {datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\")\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 步骤3: 获取节假日列表和计算date_one\n"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"节假日列表: 52 个日期\n",
"当前日期: 2026-01-17\n",
"遍历日期: 2026-01-16\n",
"遍历次数: 1\n",
"date_one = 1\n"
]
}
],
"source": [
"def calculate_date_one(date_list, start_offset=0):\n",
" \"\"\"\n",
" 计算从当前日期(或指定偏移量的日期)开始,往前遍历遇到date_list中日期的次数。\n",
" \"\"\"\n",
" now_time = datetime.datetime.now() + datetime.timedelta(days=start_offset)\n",
" date_one = 1\n",
" print(\"当前日期:\", now_time.strftime(\"%Y-%m-%d\"))\n",
" \n",
" if now_time.strftime(\"%Y-%m-%d\") in date_list:\n",
" date_one = 0\n",
" print(\"开始次数:\", date_one)\n",
" else:\n",
" for i in range(1, 10):\n",
" new_date = now_time + datetime.timedelta(days=-i)\n",
" new_date_str = new_date.strftime(\"%Y-%m-%d\")\n",
" print(\"遍历日期:\", new_date_str)\n",
" if new_date_str in date_list:\n",
" date_one += 1\n",
" print(\"节假日期:\", new_date_str)\n",
" else:\n",
" break\n",
" \n",
" print(\"遍历次数:\", date_one)\n",
" return date_one\n",
"\n",
"# 获取节假日列表\n",
"date_list = common_module.get_holiday_list()\n",
"print(f\"节假日列表: {len(date_list)} 个日期\")\n",
"pd.Series(date_list).to_csv(f\"{output_dir}/14_节假日列表.csv\", index=False, encoding='utf-8-sig')\n",
"\n",
"# 计算date_one\n",
"date_one = calculate_date_one(date_list, start_offset=0)\n",
"print(f\"date_one = {date_one}\")\n",
"\n",
"# 保存date_one\n",
"pd.DataFrame([{\"date_one\": date_one}]).to_csv(f\"{output_dir}/15_date_one.csv\", index=False, encoding='utf-8-sig')\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 步骤4: 获取NGV明细数据\n"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"开始获取NGV明细数据...\n",
"开始时间: 2026-01-17 13:34:48\n",
"NGV明细数据: 45686 条\n",
"NGV明细数据列数: 141\n",
"已保存到: output/debug_revisit_renew/16_原始NGV数据.csv\n",
"结束时间: 2026-01-17 13:35:00\n"
]
}
],
"source": [
"print(\"开始获取NGV明细数据...\")\n",
"print(f\"开始时间: {datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\")\n",
"\n",
"data_NGV = common_module.get_ngv_details(days_back=1)\n",
"print(f\"NGV明细数据: {len(data_NGV)} 条\")\n",
"print(f\"NGV明细数据列数: {len(data_NGV.columns)}\")\n",
"\n",
"# 保存原始NGV数据\n",
"data_NGV.to_csv(f\"{output_dir}/16_原始NGV数据.csv\", index=False, encoding='utf-8-sig')\n",
"print(f\"已保存到: {output_dir}/16_原始NGV数据.csv\")\n",
"\n",
"print(f\"结束时间: {datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\")\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 步骤5: 构建省市区索引\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"省市区索引构建完成: 3667 条\n"
]
}
],
"source": [
"def build_index(json_list):\n",
" index = {}\n",
" for json_item in json_list:\n",
" try:\n",
" key = (json_item['_widget_1734677164861'], json_item['_widget_1734677164862'],\n",
" json_item['_widget_1734677164863']) # 省市区\n",
" if '_widget_1734677164871' not in json_item: # 日常回访客服\n",
" raise KeyError(\"缺少 '日常回访客服' 键\")\n",
" index[key] = json_item\n",
" except KeyError as e:\n",
" print(f\"警告:{e},跳过该条记录: {json_item}\")\n",
" continue\n",
" return index\n",
"\n",
"# 从CSV加载省市区人员关系表(步骤2的输出)\n",
"csv_path = f\"{output_dir}/01_省市区人员关系表.csv\"\n",
"if os.path.exists(csv_path):\n",
" print(f\"从CSV文件读取: {csv_path}\")\n",
" json_list_df = pd.read_csv(csv_path, encoding='utf-8-sig')\n",
" json_list = json_list_df.to_dict('records')\n",
" print(f\"读取到 {len(json_list)} 条记录\")\n",
"else:\n",
" print(f\"警告: 文件不存在 {csv_path},请先执行步骤2\")\n",
" json_list = []\n",
"\n",
"# 构建索引\n",
"index = build_index(json_list)\n",
"print(f\"省市区索引构建完成: {len(index)} 条\")\n",
"\n",
"# 保存索引信息\n",
"index_df = pd.DataFrame([{\"省\": k[0], \"市\": k[1], \"区\": k[2], \"数据\": str(v)} for k, v in index.items()])\n",
"index_df.to_csv(f\"{output_dir}/17_省市区索引.csv\", index=False, encoding='utf-8-sig')\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 步骤6: 获取多公司过滤列表\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"获取多公司过滤公司id\n",
"过滤公司条数: 19\n"
]
}
],
"source": [
"print(\"获取多公司过滤公司id\")\n",
"# 从CSV加载多公司过滤表(步骤2的输出)\n",
"csv_path = f\"{output_dir}/13_多公司过滤表.csv\"\n",
"if os.path.exists(csv_path):\n",
" print(f\"从CSV文件读取: {csv_path}\")\n",
" get_filter_company_list_df = pd.read_csv(csv_path, encoding='utf-8-sig')\n",
" get_filter_company_list = get_filter_company_list_df.to_dict('records')\n",
" print(f\"读取到 {len(get_filter_company_list)} 条记录\")\n",
"else:\n",
" print(f\"警告: 文件不存在 {csv_path},请先执行步骤2\")\n",
" get_filter_company_list = []\n",
"\n",
"all_filter_company_list = []\n",
"for company in get_filter_company_list:\n",
" company_list = company.get(\"_widget_1755052002491\")\n",
" if company_list:\n",
" # 处理可能是字符串的情况\n",
" if isinstance(company_list, str):\n",
" import ast\n",
" try:\n",
" company_list = ast.literal_eval(company_list)\n",
" except:\n",
" continue\n",
" for company_item in company_list:\n",
" if company_item.get(\"_widget_1755052002496\") == \"否\":\n",
" all_filter_company_list.append(company_item.get(\"_widget_1755052002495\"))\n",
"\n",
"print(f\"过滤公司条数: {len(all_filter_company_list)}\")\n",
"pd.Series(all_filter_company_list).to_csv(f\"{output_dir}/18_过滤公司列表.csv\", index=False, encoding='utf-8-sig')\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 步骤7: 数据处理和过滤(第一部分:基础处理)\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"过滤前数据量: 45686\n",
"过滤后数据量: 45667\n"
]
}
],
"source": [
"# 从CSV加载NGV数据(步骤4的输出)\n",
"csv_path = f\"{output_dir}/16_原始NGV数据.csv\"\n",
"if os.path.exists(csv_path):\n",
" print(f\"从CSV文件读取: {csv_path}\")\n",
" data_NGV = pd.read_csv(csv_path, encoding='utf-8-sig')\n",
" print(f\"读取到 {len(data_NGV)} 条记录\")\n",
"else:\n",
" print(f\"警告: 文件不存在 {csv_path},请先执行步骤4\")\n",
" data_NGV = pd.DataFrame()\n",
"\n",
"# 将A列和B列的日期字符串转换为日期格式\n",
"data_NGV_processed = data_NGV.copy()\n",
"data_NGV_processed['A'] = pd.to_datetime(data_NGV_processed['expiry_time'])\n",
"data_NGV_processed['B'] = pd.to_datetime(data_NGV_processed['renew_date'])\n",
"\n",
"def replace_values(series):\n",
" return series.apply(lambda x: '' if pd.isna(x) or x in ['NA', 'None', ''] else x)\n",
"\n",
"# 处理字符串数据\n",
"data_NGV_processed = data_NGV_processed.apply(replace_values)\n",
"\n",
"# 过滤多公司\n",
"print(f\"过滤前数据量: {len(data_NGV_processed)}\")\n",
"data_NGV_processed = data_NGV_processed[~data_NGV_processed['id_own_group'].isin(all_filter_company_list)]\n",
"print(f\"过滤后数据量: {len(data_NGV_processed)}\")\n",
"\n",
"# 保存过滤后的数据\n",
"data_NGV_processed.to_csv(f\"{output_dir}/19_过滤多公司后数据.csv\", index=False, encoding='utf-8-sig')\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 步骤8: 数据处理和过滤(第二部分:优先级排序和最佳值计算)\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"合并最佳值后数据量: 45667\n"
]
}
],
"source": [
"# 从CSV加载前一步的数据(步骤7的输出)\n",
"csv_path = f\"{output_dir}/19_过滤多公司后数据.csv\"\n",
"if os.path.exists(csv_path):\n",
" print(f\"从CSV文件读取: {csv_path}\")\n",
" data_NGV_processed = pd.read_csv(csv_path, encoding='utf-8-sig')\n",
" # 重新转换日期列\n",
" data_NGV_processed['A'] = pd.to_datetime(data_NGV_processed['expiry_time'])\n",
" data_NGV_processed['B'] = pd.to_datetime(data_NGV_processed['renew_date'])\n",
" print(f\"读取到 {len(data_NGV_processed)} 条记录\")\n",
"else:\n",
" print(f\"警告: 文件不存在 {csv_path},请先执行步骤7\")\n",
" data_NGV_processed = pd.DataFrame()\n",
"\n",
"# 定义优先级顺序\n",
"edition_order = ['皇冠版', '至尊版', '尊享版', '旗舰版', '标准版', '进阶版', '基础版', '入门版']\n",
"customer_type_order = [\"F\", \"E\", \"D\", \"C\", \"B\", \"A\"]\n",
"group_grade_order = ['全国KAFMVP', '区域KAMVP', '重要客户(SVIP', '普通客户(VIP']\n",
"\n",
"# 创建映射字典\n",
"edition_map = {edition: idx for idx, edition in enumerate(edition_order)}\n",
"customer_type_map = {ctype: idx for idx, ctype in enumerate(customer_type_order)}\n",
"group_grade_map = {grade: idx for idx, grade in enumerate(group_grade_order)}\n",
"\n",
"# 添加用于排序的新列\n",
"data_NGV_processed['edition_rank'] = data_NGV_processed['saas_edition_fmt'].map(edition_map).fillna(0).astype(int)\n",
"data_NGV_processed['customer_type_rank'] = data_NGV_processed['saas_customer_type'].map(customer_type_map).fillna(0).astype(int)\n",
"data_NGV_processed['group_grade_rank'] = data_NGV_processed['group_grade'].map(group_grade_map).fillna(0).astype(int)\n",
"\n",
"# 找到每组中 edition_rank 最小值对应的行\n",
"best_edition_idx = data_NGV_processed.groupby('id_own_group')['edition_rank'].idxmin()\n",
"best_edition_rows = data_NGV_processed.loc[best_edition_idx]\n",
"best_edition_rows['max_saas_edition'] = best_edition_rows['saas_edition_fmt']\n",
"\n",
"# 找到每组中 customer_type_rank 最小值对应的行\n",
"best_customer_type_idx = data_NGV_processed.groupby('id_own_group')['customer_type_rank'].idxmin()\n",
"best_customer_type_rows = data_NGV_processed.loc[best_customer_type_idx]\n",
"best_customer_type_rows['max_saas_customer_type'] = best_customer_type_rows['customer_type_rank'].apply(\n",
" lambda x: customer_type_order[x])\n",
"\n",
"# 找到每组中 group_grade_rank 最小值对应的行\n",
"best_group_grade_idx = data_NGV_processed.groupby('id_own_group')['group_grade_rank'].idxmin()\n",
"best_group_grade_rows = data_NGV_processed.loc[best_group_grade_idx]\n",
"best_group_grade_rows['max_group_grade'] = best_group_grade_rows['group_grade']\n",
"\n",
"# 合并最佳值回到原数据集\n",
"best_values = (\n",
" best_edition_rows[['id_own_group', 'max_saas_edition']]\n",
" .merge(best_customer_type_rows[['id_own_group', 'max_saas_customer_type']], on='id_own_group', how='outer')\n",
" .merge(best_group_grade_rows[['id_own_group', 'max_group_grade']], on='id_own_group', how='outer')\n",
")\n",
"\n",
"# 将最佳值合并回原数据集\n",
"data_NGV_processed = data_NGV_processed.merge(best_values, on='id_own_group', how='left')\n",
"\n",
"print(f\"合并最佳值后数据量: {len(data_NGV_processed)}\")\n",
"data_NGV_processed.to_csv(f\"{output_dir}/20_合并最佳值后数据.csv\", index=False, encoding='utf-8-sig')\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 步骤8.5: 字段数据类型检查(调试用)\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"============================================================\n",
"字段数据类型和值检查\n",
"============================================================\n",
"\n",
"当前 data_NGV_processed 数据量: 45667 条\n",
"\n",
"【is_main_org 字段检查】\n",
"数据类型: object\n",
"唯一值: ['0', '1']\n",
"值分布:\n",
"is_main_org\n",
"1 37637\n",
"0 8030\n",
"Name: count, dtype: int64\n",
"示例值(前5个): ['1', '0', '0', '0', '1']\n",
"\n",
"【org_status 字段检查】\n",
"数据类型: object\n",
"唯一值: ['留存', '过期']\n",
"值分布:\n",
"org_status\n",
"留存 27997\n",
"过期 17670\n",
"Name: count, dtype: int64\n",
"\n",
"【org_type 字段检查】\n",
"数据类型: object\n",
"唯一值: ['一般', '天猫']\n",
"值分布:\n",
"org_type\n",
"一般 42998\n",
"天猫 2669\n",
"Name: count, dtype: int64\n",
"\n",
"【area_manager 字段检查】\n",
"数据类型: object\n",
"唯一值数量: 16\n",
"值分布(前10:\n",
"area_manager\n",
"肖军 10795\n",
"景东强 8353\n",
"陈庆伟 8301\n",
"张凯 8232\n",
"关磊 6994\n",
"孙玉蕾 2007\n",
"殷昊 745\n",
"王涛 161\n",
"刘伟 52\n",
" 8\n",
"Name: count, dtype: int64\n",
"是否包含'殷昊': True\n",
"是否包含'孙玉蕾': True\n",
"\n",
"【条件匹配测试】\n",
"警告: is_main_org 是字符串类型,需要转换为数值或使用字符串比较\n",
"使用数值比较 (== 1): 15066 条\n",
"使用字符串比较 (== '1'): 15066 条\n",
"\n",
"============================================================\n"
]
}
],
"source": [
"# 从CSV加载前一步的数据(步骤8的输出)\n",
"csv_path = f\"{output_dir}/20_合并最佳值后数据.csv\"\n",
"if os.path.exists(csv_path):\n",
" print(f\"从CSV文件读取: {csv_path}\")\n",
" data_NGV_processed = pd.read_csv(csv_path, encoding='utf-8-sig')\n",
" print(f\"读取到 {len(data_NGV_processed)} 条记录\")\n",
"else:\n",
" print(f\"警告: 文件不存在 {csv_path},请先执行步骤8\")\n",
" data_NGV_processed = pd.DataFrame()\n",
"\n",
"# 检查关键字段的数据类型和实际值\n",
"print(\"=\" * 60)\n",
"print(\"字段数据类型和值检查\")\n",
"print(\"=\" * 60)\n",
"print(f\"\\n当前 data_NGV_processed 数据量: {len(data_NGV_processed)} 条\")\n",
"\n",
"# 检查 is_main_org\n",
"print(f\"\\n【is_main_org 字段检查】\")\n",
"print(f\"数据类型: {data_NGV_processed['is_main_org'].dtype}\")\n",
"print(f\"唯一值: {sorted(data_NGV_processed['is_main_org'].unique())}\")\n",
"print(f\"值分布:\\n{data_NGV_processed['is_main_org'].value_counts()}\")\n",
"print(f\"示例值(前5个): {data_NGV_processed['is_main_org'].head().tolist()}\")\n",
"\n",
"# 检查 org_status\n",
"print(f\"\\n【org_status 字段检查】\")\n",
"print(f\"数据类型: {data_NGV_processed['org_status'].dtype}\")\n",
"print(f\"唯一值: {sorted(data_NGV_processed['org_status'].unique())}\")\n",
"print(f\"值分布:\\n{data_NGV_processed['org_status'].value_counts()}\")\n",
"\n",
"# 检查 org_type\n",
"print(f\"\\n【org_type 字段检查】\")\n",
"print(f\"数据类型: {data_NGV_processed['org_type'].dtype}\")\n",
"print(f\"唯一值: {sorted(data_NGV_processed['org_type'].unique())}\")\n",
"print(f\"值分布:\\n{data_NGV_processed['org_type'].value_counts()}\")\n",
"\n",
"# 检查 area_manager\n",
"print(f\"\\n【area_manager 字段检查】\")\n",
"print(f\"数据类型: {data_NGV_processed['area_manager'].dtype}\")\n",
"print(f\"唯一值数量: {data_NGV_processed['area_manager'].nunique()}\")\n",
"print(f\"值分布(前10:\\n{data_NGV_processed['area_manager'].value_counts().head(10)}\")\n",
"print(f\"是否包含'殷昊': {'殷昊' in data_NGV_processed['area_manager'].values}\")\n",
"print(f\"是否包含'孙玉蕾': {'孙玉蕾' in data_NGV_processed['area_manager'].values}\")\n",
"\n",
"# 测试条件匹配\n",
"print(f\"\\n【条件匹配测试】\")\n",
"# 测试主店过期条件\n",
"if data_NGV_processed['is_main_org'].dtype == 'object':\n",
" print(\"警告: is_main_org 是字符串类型,需要转换为数值或使用字符串比较\")\n",
" # 尝试转换为数值\n",
" is_main_org_numeric = pd.to_numeric(data_NGV_processed['is_main_org'], errors='coerce')\n",
" condition1_test = (is_main_org_numeric == 1) & (data_NGV_processed['org_status'] == '过期')\n",
" condition1_test_str = (data_NGV_processed['is_main_org'] == '1') & (data_NGV_processed['org_status'] == '过期')\n",
" print(f\"使用数值比较 (== 1): {condition1_test.sum()} 条\")\n",
" print(f\"使用字符串比较 (== '1'): {condition1_test_str.sum()} 条\")\n",
"else:\n",
" condition1_test = (data_NGV_processed['is_main_org'] == 1) & (data_NGV_processed['org_status'] == '过期')\n",
" print(f\"主店过期条件匹配: {condition1_test.sum()} 条\")\n",
"\n",
"# 保存检查结果\n",
"check_result = {\n",
" 'is_main_org_dtype': str(data_NGV_processed['is_main_org'].dtype),\n",
" 'is_main_org_unique_values': list(data_NGV_processed['is_main_org'].unique()),\n",
" 'org_status_unique_values': list(data_NGV_processed['org_status'].unique()),\n",
" 'org_type_unique_values': list(data_NGV_processed['org_type'].unique()),\n",
" 'area_manager_unique_count': data_NGV_processed['area_manager'].nunique(),\n",
" 'has_殷昊': '殷昊' in data_NGV_processed['area_manager'].values,\n",
" 'has_孙玉蕾': '孙玉蕾' in data_NGV_processed['area_manager'].values,\n",
"}\n",
"pd.DataFrame([check_result]).to_csv(f\"{output_dir}/20.5_字段检查结果.csv\", index=False, encoding='utf-8-sig')\n",
"\n",
"print(\"\\n\" + \"=\" * 60)\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 步骤9: 数据处理和过滤(第三部分:主店过期处理)\n"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"主店过期数据量: 0\n",
"警告: 主店过期数据为空,请检查 is_main_org 和 org_status 字段\n",
"满足条件的分店数据量: 0\n",
"警告: 主店过期数据为空,无法筛选分店数据\n",
"满足条件的主店数据量: 20159\n",
"警告: 没有分店数据可合并\n",
"合并后总数据量: 20159\n"
]
}
],
"source": [
"# 主店过期,分店设置为主店\n",
"# 修复:处理 is_main_org 可能是字符串类型的情况\n",
"if data_NGV_processed['is_main_org'].dtype == 'object':\n",
" # 如果是字符串类型,转换为数值或使用字符串比较\n",
" is_main_org_numeric = pd.to_numeric(data_NGV_processed['is_main_org'], errors='coerce')\n",
" condition = (is_main_org_numeric == 1) & (data_NGV_processed['org_status'] == '过期')\n",
"else:\n",
" condition = (data_NGV_processed['is_main_org'] == 1) & (data_NGV_processed['org_status'] == '过期')\n",
"\n",
"ngvv2 = data_NGV_processed[condition]\n",
"print(f\"主店过期数据量: {len(ngvv2)}\")\n",
"if len(ngvv2) > 0:\n",
" ngvv2.to_csv(f\"{output_dir}/21_主店过期数据.csv\", index=False, encoding='utf-8-sig')\n",
"else:\n",
" print(\"警告: 主店过期数据为空,请检查 is_main_org 和 org_status 字段\")\n",
"\n",
"# 检查id_own_group是否存在于ngvv2中\n",
"data_NGV_V2 = data_NGV_processed.copy()\n",
"\n",
"# 修复:处理 is_main_org 可能是字符串类型的情况\n",
"if data_NGV_V2['is_main_org'].dtype == 'object':\n",
" is_main_org_numeric_v2 = pd.to_numeric(data_NGV_V2['is_main_org'], errors='coerce')\n",
" data_NGV_V2['条件'] = ((data_NGV_V2['org_type'] == \"一般\") & (data_NGV_V2['org_status'] == '留存') &\n",
" (data_NGV_V2['area_manager'] != '殷昊') & (data_NGV_V2['area_manager'] != '孙玉蕾') &\n",
" (is_main_org_numeric_v2 != 1))\n",
"else:\n",
" data_NGV_V2['条件'] = ((data_NGV_V2['org_type'] == \"一般\") & (data_NGV_V2['org_status'] == '留存') &\n",
" (data_NGV_V2['area_manager'] != '殷昊') & (data_NGV_V2['area_manager'] != '孙玉蕾') &\n",
" (data_NGV_V2['is_main_org'] != 1))\n",
"\n",
"data_NGV_V2 = data_NGV_V2.loc[data_NGV_V2[\"条件\"]]\n",
"print(f\"满足条件的分店数据量: {len(data_NGV_V2)}\")\n",
"\n",
"# 过滤存在的记录\n",
"if len(ngvv2) > 0:\n",
" data_NGV_V2['exists_in_ngvv2'] = data_NGV_V2['id_own_group'].isin(ngvv2['id_own_group'])\n",
" filtered_data = data_NGV_V2[data_NGV_V2['exists_in_ngvv2']]\n",
" print(f\"存在于主店过期列表的分店数据量: {len(filtered_data)}\")\n",
" \n",
" if len(filtered_data) > 0:\n",
" filtered_data.to_csv(f\"{output_dir}/22_分店数据.csv\", index=False, encoding='utf-8-sig')\n",
" \n",
" # 按版本排序并去重\n",
" fixed_order = ['皇冠版', '至尊版', '尊享版', '旗舰版', '标准版', '进阶版', '基础版', '入门版']\n",
" fixed_order_map = {edition: index for index, edition in enumerate(fixed_order)}\n",
" filtered_data['sort_key'] = filtered_data['saas_edition_fmt'].map(fixed_order_map)\n",
" filtered_data = filtered_data.sort_values(by='sort_key').drop('sort_key', axis=1)\n",
" result = filtered_data.drop_duplicates(subset='id_own_group', keep='first')\n",
" print(f\"去重后的分店数据量: {len(result)}\")\n",
" if len(result) > 0:\n",
" result.to_csv(f\"{output_dir}/23_去重后分店数据.csv\", index=False, encoding='utf-8-sig')\n",
" else:\n",
" print(\"警告: 没有分店数据存在于主店过期列表中\")\n",
" result = pd.DataFrame()\n",
"else:\n",
" print(\"警告: 主店过期数据为空,无法筛选分店数据\")\n",
" result = pd.DataFrame()\n",
"\n",
"# 合并主店和分店数据\n",
"# 修复:处理 is_main_org 可能是字符串类型的情况\n",
"if data_NGV_processed['is_main_org'].dtype == 'object':\n",
" is_main_org_numeric_main = pd.to_numeric(data_NGV_processed['is_main_org'], errors='coerce')\n",
" data_NGV_processed['条件'] = ((data_NGV_processed['org_type'] == \"一般\") & (data_NGV_processed['org_status'] == '留存') &\n",
" (data_NGV_processed['area_manager'] != '殷昊') &\n",
" (data_NGV_processed['area_manager'] != '孙玉蕾') &\n",
" (is_main_org_numeric_main == 1))\n",
"else:\n",
" data_NGV_processed['条件'] = ((data_NGV_processed['org_type'] == \"一般\") & (data_NGV_processed['org_status'] == '留存') &\n",
" (data_NGV_processed['area_manager'] != '殷昊') &\n",
" (data_NGV_processed['area_manager'] != '孙玉蕾') &\n",
" (data_NGV_processed['is_main_org'] == 1))\n",
"\n",
"data_NGV_processed = data_NGV_processed.loc[data_NGV_processed[\"条件\"]]\n",
"print(f\"满足条件的主店数据量: {len(data_NGV_processed)}\")\n",
"\n",
"# 合并数据\n",
"if len(result) > 0:\n",
" data_NGV_processed = pd.concat([data_NGV_processed, result], axis=0)\n",
"else:\n",
" print(\"警告: 没有分店数据可合并\")\n",
"print(f\"合并后总数据量: {len(data_NGV_processed)}\")\n",
"\n",
"if len(data_NGV_processed) > 0:\n",
" data_NGV_processed.to_csv(f\"{output_dir}/24_合并主店分店后数据.csv\", index=False, encoding='utf-8-sig')\n",
"else:\n",
" print(\"警告: 合并后数据为空,请检查前面的过滤条件\")\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 步骤10: 数据处理和过滤(第四部分:续约日期处理)\n"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"计算条件前数据量: 20159\n",
"条件>0的数据量: 9740\n",
"过滤后数据量: 9740\n",
"数据量: 9740\n",
"年数分布: {3: 7972, 2: 1388, 5: 259, 4: 98, 6: 14, 7: 4, 10: 4, 8: 1}\n"
]
}
],
"source": [
"data_details = data_NGV_processed.copy()\n",
"data_details = data_details.reset_index(drop=True)\n",
"\n",
"# 判断A列的日期是否大于B列的日期730天\n",
"data_details['条件'] = data_details.apply(\n",
" lambda row: (\n",
" (pd.to_datetime(row['A']) - pd.to_datetime(row['B'])).days\n",
" if pd.to_datetime(row['A']) - pd.to_datetime(row['B']) >= pd.Timedelta(days=730)\n",
" else 0\n",
" ),\n",
" axis=1\n",
")\n",
"print(f\"计算条件前数据量: {len(data_details)}\")\n",
"print(f\"条件>0的数据量: {len(data_details[data_details['条件'] > 0])}\")\n",
"data_details = data_details.loc[data_details[\"条件\"] > 0]\n",
"print(f\"过滤后数据量: {len(data_details)}\")\n",
"data_details.to_csv(f\"{output_dir}/25_续约日期过滤后数据.csv\", index=False, encoding='utf-8-sig')\n",
"\n",
"# 计算年数\n",
"def divide_by_365(x):\n",
" if isinstance(x, (int, float)):\n",
" return int(x / 365)\n",
" else:\n",
" return x\n",
"\n",
"data_details['年'] = data_details['条件'].apply(divide_by_365)\n",
"data_details = data_details.reset_index(drop=True)\n",
"print(f\"数据量: {len(data_details)}\")\n",
"print(f\"年数分布: {data_details['年'].value_counts().to_dict()}\")\n",
"data_details.to_csv(f\"{output_dir}/26_计算年数后数据.csv\", index=False, encoding='utf-8-sig')\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 步骤11: 数据处理和过滤(第五部分:生成历史续约日期)\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# 从CSV加载前一步的数据(步骤10的输出)\n",
"csv_path = f\"{output_dir}/26_计算年数后数据.csv\"\n",
"if os.path.exists(csv_path):\n",
" print(f\"从CSV文件读取: {csv_path}\")\n",
" data_details = pd.read_csv(csv_path, encoding='utf-8-sig')\n",
" print(f\"读取到 {len(data_details)} 条记录\")\n",
"else:\n",
" print(f\"警告: 文件不存在 {csv_path},请先执行步骤10\")\n",
" data_details = pd.DataFrame()\n",
"\n",
"# 创建新的DataFrame用于存储历史续约日期\n",
"new_df = pd.DataFrame()\n",
"# 使用 from datetime import datetime 避免冲突\n",
"from datetime import datetime as dt\n",
"\n",
"for index, row in data_details.iterrows():\n",
" if row[\"renew_date\"] != \"2024-02-29\":\n",
" # 修复:确保 '年' 是整数类型,处理可能的NaN或float值\n",
" try:\n",
" year_value = int(row['年']) if pd.notna(row['年']) else 0\n",
" except (ValueError, TypeError):\n",
" year_value = 0\n",
" \n",
" # 只有当年数大于1时才生成历史续约日期\n",
" if year_value > 1:\n",
" for i_new in range(1, year_value):\n",
" row_new = row.copy()\n",
" c = row_new[\"renew_date\"]\n",
" # 使用 dt.strptime (dt 是 datetime.datetime 的别名)\n",
" date_obj = dt.strptime(str(c), \"%Y-%m-%d\")\n",
" new_year = date_obj.year + i_new\n",
" new_date_obj = date_obj.replace(year=new_year)\n",
" new_c = new_date_obj.strftime(\"%Y-%m-%d\")\n",
" row_new[\"renew_date\"] = new_c\n",
" new_df = pd.concat([new_df, pd.DataFrame([row_new])], ignore_index=True)\n",
"\n",
"print(f\"生成的历史续约日期数据量: {len(new_df)}\")\n",
"if len(new_df) > 0:\n",
" new_df.to_csv(f\"{output_dir}/27_生成的历史续约日期数据.csv\", index=False, encoding='utf-8-sig')\n",
"\n",
"# 合并两个DataFrame\n",
"merged_df = pd.concat([data_NGV_processed, new_df], axis=0, ignore_index=True)\n",
"data_details = merged_df.copy()\n",
"print(f\"合并后总数据量: {len(data_details)}\")\n",
"data_details.to_csv(f\"{output_dir}/28_合并历史续约日期后数据.csv\", index=False, encoding='utf-8-sig')\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 步骤12: 数据处理和过滤(第六部分:最终过滤)\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"renew_date不为空的数据量: 38932\n",
"过滤前数据量: 38932\n",
"创建年份等于续约年份的数据量: 8246\n",
"最终过滤后数据量: 30686\n",
"\n",
"最终数据已保存到: output/debug_revisit_renew/30_最终过滤后数据.csv\n"
]
}
],
"source": [
"# 从CSV加载前一步的数据(步骤11的输出)\n",
"csv_path = f\"{output_dir}/28_合并历史续约日期后数据.csv\"\n",
"if os.path.exists(csv_path):\n",
" print(f\"从CSV文件读取: {csv_path}\")\n",
" data_details = pd.read_csv(csv_path, encoding='utf-8-sig')\n",
" print(f\"读取到 {len(data_details)} 条记录\")\n",
"else:\n",
" print(f\"警告: 文件不存在 {csv_path},请先执行步骤11\")\n",
" data_details = pd.DataFrame()\n",
"\n",
"# 过滤renew_date不为空的数据\n",
"data_details_not_null = data_details[data_details['renew_date'].notnull()]\n",
"data_details_not_null = data_details_not_null.reset_index(drop=True)\n",
"print(f\"renew_date不为空的数据量: {len(data_details_not_null)}\")\n",
"data_details_not_null.to_csv(f\"{output_dir}/29_renew_date不为空数据.csv\", index=False, encoding='utf-8-sig')\n",
"\n",
"data_details = data_details_not_null.copy()\n",
"\n",
"# 过滤掉创建年份等于续约年份的数据\n",
"data_details['saas_create_time'] = data_details['saas_create_time'].str[:4]\n",
"data_details['renew_date_new'] = data_details['renew_date'].str[:4]\n",
"print(f\"过滤前数据量: {len(data_details)}\")\n",
"print(f\"创建年份等于续约年份的数据量: {len(data_details[data_details['saas_create_time'] == data_details['renew_date_new']])}\")\n",
"data_details = data_details[data_details['saas_create_time'] != data_details['renew_date_new']]\n",
"data_details = data_details.reset_index(drop=True)\n",
"print(f\"最终过滤后数据量: {len(data_details)}\")\n",
"\n",
"data_details.to_csv(f\"{output_dir}/30_最终过滤后数据.csv\", index=False, encoding='utf-8-sig')\n",
"print(f\"\\n最终数据已保存到: {output_dir}/30_最终过滤后数据.csv\")\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 步骤13: 日期计算和循环处理(90/120/180天数据筛选)\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [
{
"ename": "AttributeError",
"evalue": "type object 'datetime.datetime' has no attribute 'datetime'",
"output_type": "error",
"traceback": [
"\u001b[31m---------------------------------------------------------------------------\u001b[39m",
"\u001b[31mAttributeError\u001b[39m Traceback (most recent call last)",
"\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[27]\u001b[39m\u001b[32m, line 5\u001b[39m\n\u001b[32m 2\u001b[39m date_120 = \u001b[32m113\u001b[39m\n\u001b[32m 3\u001b[39m date_180 = \u001b[32m173\u001b[39m\n\u001b[32m----> \u001b[39m\u001b[32m5\u001b[39m start_time = \u001b[43mdatetime\u001b[49m\u001b[43m.\u001b[49m\u001b[43mdatetime\u001b[49m.now()\n\u001b[32m 6\u001b[39m now_time = start_time.replace()\n\u001b[32m 8\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m now_time.strftime(\u001b[33m\"\u001b[39m\u001b[33m%\u001b[39m\u001b[33mY-\u001b[39m\u001b[33m%\u001b[39m\u001b[33mm-\u001b[39m\u001b[38;5;132;01m%d\u001b[39;00m\u001b[33m\"\u001b[39m) \u001b[38;5;129;01min\u001b[39;00m date_list:\n",
"\u001b[31mAttributeError\u001b[39m: type object 'datetime.datetime' has no attribute 'datetime'"
]
}
],
"source": [
"# 从CSV加载最终过滤后的数据(步骤12的输出)\n",
"csv_path = f\"{output_dir}/30_最终过滤后数据.csv\"\n",
"if os.path.exists(csv_path):\n",
" print(f\"从CSV文件读取: {csv_path}\")\n",
" data_details = pd.read_csv(csv_path, encoding='utf-8-sig')\n",
" print(f\"读取到 {len(data_details)} 条记录\")\n",
"else:\n",
" print(f\"警告: 文件不存在 {csv_path},请先执行步骤12\")\n",
" data_details = pd.DataFrame()\n",
"\n",
"# 从CSV加载节假日列表和date_one(步骤3的输出)\n",
"date_list_csv = f\"{output_dir}/14_节假日列表.csv\"\n",
"if os.path.exists(date_list_csv):\n",
" date_list = pd.read_csv(date_list_csv, encoding='utf-8-sig').iloc[:, 0].tolist()\n",
"else:\n",
" print(f\"警告: 文件不存在 {date_list_csv},请先执行步骤3\")\n",
" date_list = []\n",
"\n",
"date_one_csv = f\"{output_dir}/15_date_one.csv\"\n",
"if os.path.exists(date_one_csv):\n",
" date_one = pd.read_csv(date_one_csv, encoding='utf-8-sig').iloc[0, 0]\n",
"else:\n",
" print(f\"警告: 文件不存在 {date_one_csv},请先执行步骤3\")\n",
" date_one = 1\n",
"\n",
"date_90 = 83\n",
"date_120 = 113\n",
"date_180 = 173\n",
"\n",
"start_time = datetime.datetime.now()\n",
"now_time = start_time.replace()\n",
"\n",
"if now_time.strftime(\"%Y-%m-%d\") in date_list:\n",
" date_one = 0\n",
" print(\"开始次数:\", date_one)\n",
" print(\"当前日期:\", now_time)\n",
"\n",
"print(f\"遍历次数:{date_one}\")\n",
"\n",
"# 存储所有派发数据\n",
"all_distribution_data = []\n",
"\n",
"for i in range(0, date_one):\n",
" print(f\"\\n========== 这是第{i}次遍历 ==========\")\n",
" now_time = datetime.datetime.now() + datetime.timedelta(days=-(i + 1))\n",
" \n",
" today = now_time + datetime.timedelta(days=-date_90)\n",
" formatted_today_90 = today.strftime(\"%Y-%m-%d\")\n",
" today = now_time + datetime.timedelta(days=-date_120)\n",
" formatted_today_120 = today.strftime(\"%Y-%m-%d\")\n",
" today = now_time + datetime.timedelta(days=-date_180)\n",
" formatted_today_180 = today.strftime(\"%Y-%m-%d\")\n",
" \n",
" print(f\"90天为{formatted_today_90}120天为{formatted_today_120}180天为{formatted_today_180}\")\n",
" \n",
" # 获取90天数据\n",
" data_details_90 = data_details.copy()\n",
" data_details_90['条件'] = ((data_details_90['renew_date'] == formatted_today_90) & \n",
" (data_details_90['group_grade'] != \"普通客户(VIP\"))\n",
" data_details_90 = data_details_90.loc[data_details_90[\"条件\"]]\n",
" print(f\"90天数据量: {len(data_details_90)}\")\n",
" \n",
" # 获取120天数据\n",
" data_details_120 = data_details.copy()\n",
" data_details_120['条件'] = ((data_details_120['renew_date'] == formatted_today_120) &\n",
" ((data_details_120['saas_edition_fmt'] == '基础版') |\n",
" (data_details_120['saas_edition_fmt'] == '入门版')))\n",
" data_details_120 = data_details_120.loc[data_details_120[\"条件\"]]\n",
" print(f\"120天数据量: {len(data_details_120)}\")\n",
" \n",
" # 获取180天数据\n",
" data_details_180 = data_details.copy()\n",
" data_details_180['条件'] = (data_details_180['renew_date'] == formatted_today_180)\n",
" data_details_180 = data_details_180.loc[data_details_180[\"条件\"]]\n",
" print(f\"180天数据量: {len(data_details_180)}\")\n",
" \n",
" # 添加跟进阶段和主要目的\n",
" data_details_90[\"跟进阶段\"] = \"续约后90天回访\"\n",
" data_details_90[\"主要目的\"] = \"关怀使用情况,促进更多功能使用,提升系统使用深度。\"\n",
" data_details_120[\"跟进阶段\"] = \"续约后120天回访\"\n",
" data_details_120[\"主要目的\"] = \"暂无\"\n",
" data_details_180[\"跟进阶段\"] = \"续约后180天回访\"\n",
" data_details_180[\"主要目的\"] = \"关怀使用情况,促进增购商机转化,识别潜在风险,及时提报。\"\n",
" \n",
" # 合并三个DataFrame(去除续约120天回访)\n",
" data_result = pd.concat([data_details_90, data_details_180], ignore_index=True)\n",
" print(f\"合并后派发数据长度:{len(data_result)}\")\n",
" \n",
" if len(data_result) > 0:\n",
" # 保存每次循环的派发数据\n",
" data_result.to_csv(f\"{output_dir}/31_第{i}次遍历_派发数据.csv\", index=False, encoding='utf-8-sig')\n",
" all_distribution_data.append(data_result)\n",
" else:\n",
" print(f\"警告:第{i}次遍历没有派发数据!\")\n",
" # 保存空数据的原因分析\n",
" analysis = {\n",
" '遍历次数': i,\n",
" '日期': now_time.strftime('%Y-%m-%d'),\n",
" '90天日期': formatted_today_90,\n",
" '180天日期': formatted_today_180,\n",
" '90天匹配数量': len(data_details_90),\n",
" '180天匹配数量': len(data_details_180),\n",
" '总数据量': len(data_details),\n",
" 'renew_date唯一值数量': data_details['renew_date'].nunique() if len(data_details) > 0 else 0\n",
" }\n",
" pd.DataFrame([analysis]).to_csv(f\"{output_dir}/32_第{i}次遍历_空数据原因分析.csv\", index=False, encoding='utf-8-sig')\n",
"\n",
"# 合并所有派发数据\n",
"if all_distribution_data:\n",
" final_distribution_data = pd.concat(all_distribution_data, ignore_index=True)\n",
" print(f\"\\n总派发数据量: {len(final_distribution_data)}\")\n",
" final_distribution_data.to_csv(f\"{output_dir}/33_最终派发数据.csv\", index=False, encoding='utf-8-sig')\n",
" print(f\"最终派发数据已保存到: {output_dir}/33_最终派发数据.csv\")\n",
"else:\n",
" print(\"\\n警告:所有遍历都没有派发数据!\")\n",
" # 分析为什么没有派发数据\n",
" analysis = {\n",
" '总数据量': len(data_details),\n",
" 'renew_date唯一值': data_details['renew_date'].nunique() if len(data_details) > 0 else 0,\n",
" 'renew_date范围': f\"{data_details['renew_date'].min()} 到 {data_details['renew_date'].max()}\" if len(data_details) > 0 else '无数据',\n",
" 'date_one': date_one,\n",
" '当前日期': datetime.datetime.now().strftime('%Y-%m-%d')\n",
" }\n",
" pd.DataFrame([analysis]).to_csv(f\"{output_dir}/34_派发数据为空原因分析.csv\", index=False, encoding='utf-8-sig')\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 步骤14: 派发数据为空的原因分析\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# 分析派发数据为空的原因\n",
"print(\"========== 派发数据为空原因分析 ==========\")\n",
"\n",
"if len(data_details) == 0:\n",
" print(\"问题1: 最终过滤后数据为空\")\n",
" print(\"请检查步骤12的过滤条件\")\n",
"else:\n",
" print(f\"最终数据量: {len(data_details)}\")\n",
" \n",
" # 检查renew_date的分布\n",
" print(f\"\\nrenew_date唯一值数量: {data_details['renew_date'].nunique()}\")\n",
" print(f\"renew_date范围: {data_details['renew_date'].min()} 到 {data_details['renew_date'].max()}\")\n",
" \n",
" # 计算目标日期范围\n",
" now_time = datetime.datetime.now()\n",
" target_date_90 = (now_time - datetime.timedelta(days=83)).strftime(\"%Y-%m-%d\")\n",
" target_date_180 = (now_time - datetime.timedelta(days=173)).strftime(\"%Y-%m-%d\")\n",
" \n",
" print(f\"\\n目标90天日期: {target_date_90}\")\n",
" print(f\"目标180天日期: {target_date_180}\")\n",
" \n",
" # 检查是否有匹配的数据\n",
" match_90 = data_details[data_details['renew_date'] == target_date_90]\n",
" match_180 = data_details[data_details['renew_date'] == target_date_180]\n",
" \n",
" print(f\"\\n匹配90天日期的数据量: {len(match_90)}\")\n",
" print(f\"匹配180天日期的数据量: {len(match_180)}\")\n",
" \n",
" if len(match_90) > 0:\n",
" # 检查90天数据的group_grade过滤\n",
" match_90_filtered = match_90[match_90['group_grade'] != \"普通客户(VIP\"]\n",
" print(f\"90天数据过滤后(排除普通客户): {len(match_90_filtered)}\")\n",
" if len(match_90_filtered) == 0:\n",
" print(\"问题: 90天数据全部被group_grade过滤掉\")\n",
" print(f\"90天数据的group_grade分布: {match_90['group_grade'].value_counts().to_dict()}\")\n",
" \n",
" # 检查renew_date的日期分布\n",
" print(f\"\\nrenew_date日期分布(前20个):\")\n",
" print(data_details['renew_date'].value_counts().head(20))\n",
" \n",
" # 保存分析结果\n",
" analysis_result = {\n",
" '最终数据量': len(data_details),\n",
" 'renew_date唯一值数量': data_details['renew_date'].nunique(),\n",
" 'renew_date最小值': data_details['renew_date'].min(),\n",
" 'renew_date最大值': data_details['renew_date'].max(),\n",
" '目标90天日期': target_date_90,\n",
" '目标180天日期': target_date_180,\n",
" '匹配90天日期数量': len(match_90),\n",
" '匹配180天日期数量': len(match_180),\n",
" 'date_one': date_one,\n",
" '当前日期': datetime.datetime.now().strftime('%Y-%m-%d')\n",
" }\n",
" pd.DataFrame([analysis_result]).to_csv(f\"{output_dir}/35_派发数据为空原因分析.csv\", index=False, encoding='utf-8-sig')\n",
" \n",
" # 保存renew_date的详细分布\n",
" renew_date_dist = data_details['renew_date'].value_counts().reset_index()\n",
" renew_date_dist.columns = ['renew_date', 'count']\n",
" renew_date_dist.to_csv(f\"{output_dir}/36_renew_date分布.csv\", index=False, encoding='utf-8-sig')\n"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "SaaS",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.13.11"
}
},
"nbformat": 4,
"nbformat_minor": 2
}