1218 lines
59 KiB
Plaintext
1218 lines
59 KiB
Plaintext
{
|
||
"cells": [
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"# 续约回访90-180天调试分析\n",
|
||
"\n",
|
||
"本notebook用于调试续约回访派发数据为空的问题,每一步都会保存CSV以便分析。\n"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"## 步骤1: 导入必要的库和初始化\n"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 5,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"当前工作目录: d:\\Idea Project\\SaaS_V1.7\\test\n",
|
||
"项目根目录: d:\\Idea Project\\SaaS_V1.7\n",
|
||
"Python路径: ['d:\\\\Idea Project\\\\SaaS_V1.7', 'd:\\\\Program Files\\\\anaconda3\\\\envs\\\\SaaS\\\\python313.zip', 'd:\\\\Program Files\\\\anaconda3\\\\envs\\\\SaaS\\\\DLLs']...\n",
|
||
"输出目录: d:\\Idea Project\\SaaS_V1.7\\back_ground_module\\output\\debug_revisit_renew\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"import os\n",
|
||
"import sys\n",
|
||
"import time\n",
|
||
"import requests\n",
|
||
"\n",
|
||
"# 添加项目根目录到Python路径\n",
|
||
"# 方法1: 如果notebook在test目录下,向上找一级\n",
|
||
"current_dir = os.getcwd()\n",
|
||
"if os.path.basename(current_dir) == 'test':\n",
|
||
" project_root = os.path.dirname(current_dir)\n",
|
||
"else:\n",
|
||
" # 方法2: 向上查找直到找到api.py文件\n",
|
||
" project_root = current_dir\n",
|
||
" while project_root != os.path.dirname(project_root):\n",
|
||
" if os.path.exists(os.path.join(project_root, 'api.py')):\n",
|
||
" break\n",
|
||
" project_root = os.path.dirname(project_root)\n",
|
||
"\n",
|
||
"if project_root not in sys.path:\n",
|
||
" sys.path.insert(0, project_root)\n",
|
||
"print(f\"当前工作目录: {current_dir}\")\n",
|
||
"print(f\"项目根目录: {project_root}\")\n",
|
||
"print(f\"Python路径: {sys.path[:3]}...\") # 只显示前3个路径\n",
|
||
"\n",
|
||
"from api import API\n",
|
||
"from back_ground_module import CommonModule\n",
|
||
"import pandas as pd\n",
|
||
"import datetime\n",
|
||
"import re\n",
|
||
"from log_config import configure_task_logger, configure_error_task_logger\n",
|
||
"\n",
|
||
"api_instance = API()\n",
|
||
"common_module = CommonModule()\n",
|
||
"logger = configure_task_logger()\n",
|
||
"error_task_logger = configure_error_task_logger()\n",
|
||
"\n",
|
||
"# 设置输出目录(相对于当前notebook目录)\n",
|
||
"output_dir = \"output/debug_revisit_renew\"\n",
|
||
"os.makedirs(output_dir, exist_ok=True)\n",
|
||
"print(f\"输出目录: {os.path.abspath(output_dir)}\")\n"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"## 步骤2: 加载所有数据(单独执行,数据量较大)\n",
|
||
"\n",
|
||
"**注意:这一步数据量较大,请单独执行此单元**\n"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
"print(\"开始加载所有数据...\")\n",
"print(f\"开始时间: {datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\")\n",
"\n",
"# NOTE(review): the api_key values below are hard-coded credentials; consider moving\n",
"# them to environment variables before this notebook is shared or committed.\n",
"MAIN_API_KEY = \"675b900991ad2491c69389ca\"\n",
"STAFF_API_KEY = \"6694d3c4fcb69ca9a111a6c4\"\n",
"\n",
"\n",
"def _request_entries(api_key, entry_id):\n",
"    \"\"\"Call api_instance.entry_data_list for one form and return the raw response.\"\"\"\n",
"    return api_instance.entry_data_list({\"api_key\": api_key, \"entry_id\": entry_id})\n",
"\n",
"\n",
"def _save_entries(label, response, csv_name):\n",
"    \"\"\"Print the record count of a response and write its records to a CSV in output_dir.\n",
"\n",
"    Returns the list stored under the response's \"data\" key. When the response is\n",
"    empty or has no \"data\" value, a failure message is printed, no CSV is written\n",
"    and an empty list is returned, so one failed request does not abort the cell.\n",
"    \"\"\"\n",
"    records = response.get(\"data\") if response else None\n",
"    if records is None:\n",
"        print(f\"加载{label}失败\")\n",
"        return []\n",
"    print(f\"{label}: {len(records)} 条\")\n",
"    pd.DataFrame(records).to_csv(f\"{output_dir}/{csv_name}\", index=False, encoding='utf-8-sig')\n",
"    return records\n",
"\n",
"\n",
"def _load_entries(label, entry_id, csv_name, api_key=MAIN_API_KEY):\n",
"    \"\"\"Request one form and pass the response to _save_entries.\"\"\"\n",
"    return _save_entries(label, _request_entries(api_key, entry_id), csv_name)\n",
"\n",
"\n",
"# Province / city / district -> staff relation table (raw response kept in json_dict)\n",
"json_dict = _request_entries(MAIN_API_KEY, \"676512ac3e54dc3159460c0a\")\n",
"json_list = _save_entries(\"省市区人员关系表\", json_dict, \"01_省市区人员关系表.csv\")\n",
"\n",
"# Staff ids (different api_key; raw response kept in staff_id)\n",
"staff_id = _request_entries(STAFF_API_KEY, \"6769204a1902c9341340a1bc\")\n",
"staff_id_list = _save_entries(\"简道云员工id\", staff_id, \"02_简道云员工id.csv\")\n",
"\n",
"# Permission table\n",
"permissions_table = _load_entries(\"权限表\", \"675b96c14e839f90fef1647c\", \"03_权限表.csv\")\n",
"\n",
"# NGV data\n",
"NGV_data_list = _load_entries(\"NGV数据\", \"675bb02bd2d53c2034c665e4\", \"04_NGV数据.csv\")\n",
"\n",
"# Service reminder - data support form\n",
"service_remind = _load_entries(\"服务提醒\", \"676bb7bda3029720f1083e99\", \"05_服务提醒.csv\")\n",
"\n",
"# Smart detection - data support form\n",
"Smart_detection = _load_entries(\"智能检测\", \"676bb99649ab3ac975af6e39\", \"06_智能检测.csv\")\n",
"\n",
"# Feature usage table\n",
"get_feature_usage = _load_entries(\"功能使用情况\", \"6763bbf657bd8fb76fcb41b2\", \"07_功能使用情况.csv\")\n",
"\n",
"# Policy recognition table\n",
"policy_recognition = _load_entries(\"保单识别\", \"6773a60d30ed87ff9f68d3c5\", \"08_保单识别.csv\")\n",
"widget_list = [item['_widget_1735632397600'] for item in policy_recognition]\n",
"\n",
"# Private-domain mini program - data support form\n",
"private_domain = _load_entries(\"私域小程序\", \"67e0f0fae622896749ba5087\", \"09_私域小程序.csv\")\n",
"\n",
"# Public-domain mini program - data support form\n",
"public_domain = _load_entries(\"公域小程序\", \"67e0c702c8f603b997980999\", \"10_公域小程序.csv\")\n",
"public_domain_list = [item['_widget_1742784257506'] for item in public_domain]\n",
"\n",
"# Cross-industry cooperation - data support form\n",
"different_industries = _load_entries(\"异业合作\", \"67e24fdd8dfcfa918e17c30b\", \"11_异业合作.csv\")\n",
"different_industries_list = [item['_widget_1742884829007'] for item in different_industries]\n",
"\n",
"# SMS - data support form\n",
"groupnotification = _load_entries(\"短信\", \"67e5107198ba1b20d5df3974\", \"12_短信.csv\")\n",
"\n",
"# Multi-company filter table\n",
"get_filter_company_list = _load_entries(\"多公司过滤表\", \"689bf5f8ba88a28cb0679ec9\", \"13_多公司过滤表.csv\")\n",
"\n",
"print(f\"\\n数据加载完成!\")\n",
"print(f\"结束时间: {datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\")\n"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"## 步骤3: 获取节假日列表和计算date_one\n"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 7,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"节假日列表: 52 个日期\n",
|
||
"当前日期: 2026-01-17\n",
|
||
"遍历日期: 2026-01-16\n",
|
||
"遍历次数: 1\n",
|
||
"date_one = 1\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
"def calculate_date_one(date_list, start_offset=0, max_lookback=9):\n",
"    \"\"\"Count the run of listed dates that ends on the day before the reference day.\n",
"\n",
"    The reference day is the current time plus ``start_offset`` days. If the\n",
"    reference day itself is in ``date_list`` the result is 0. Otherwise the result\n",
"    starts at 1 and grows by one for every immediately preceding day found in\n",
"    ``date_list``, stopping at the first day that is not listed or once\n",
"    ``max_lookback`` days have been inspected.\n",
"\n",
"    Args:\n",
"        date_list: iterable of \"YYYY-MM-DD\" strings.\n",
"        start_offset: days added to the current time to obtain the reference day.\n",
"        max_lookback: largest number of preceding days to inspect; the default 9\n",
"            is the limit that used to be hard-coded.\n",
"\n",
"    Returns:\n",
"        int: 0 when the reference day is listed, otherwise 1 plus the length of the run.\n",
"    \"\"\"\n",
"    # A set gives constant-time lookups and tests membership by value for any iterable.\n",
"    listed_dates = set(date_list)\n",
"    now_time = datetime.datetime.now() + datetime.timedelta(days=start_offset)\n",
"    reference_day = now_time.strftime(\"%Y-%m-%d\")\n",
"    print(\"当前日期:\", reference_day)\n",
"\n",
"    if reference_day in listed_dates:\n",
"        date_one = 0\n",
"        print(\"开始次数:\", date_one)\n",
"    else:\n",
"        date_one = 1\n",
"        for days_back in range(1, max_lookback + 1):\n",
"            new_date_str = (now_time - datetime.timedelta(days=days_back)).strftime(\"%Y-%m-%d\")\n",
"            print(\"遍历日期:\", new_date_str)\n",
"            if new_date_str not in listed_dates:\n",
"                break\n",
"            date_one += 1\n",
"            print(\"节假日期:\", new_date_str)\n",
"\n",
"    print(\"遍历次数:\", date_one)\n",
"    return date_one\n",
|
||
"\n",
|
"# Fetch the holiday list and keep a copy on disk\n",
"date_list = common_module.get_holiday_list()\n",
"holiday_count = len(date_list)\n",
"print(f\"节假日列表: {holiday_count} 个日期\")\n",
"holiday_csv = f\"{output_dir}/14_节假日列表.csv\"\n",
"pd.Series(date_list).to_csv(holiday_csv, index=False, encoding='utf-8-sig')\n",
"\n",
"# Compute date_one relative to today\n",
"date_one = calculate_date_one(date_list, start_offset=0)\n",
"print(f\"date_one = {date_one}\")\n",
"\n",
"# Persist date_one as a one-row table\n",
"date_one_frame = pd.DataFrame([{\"date_one\": date_one}])\n",
"date_one_frame.to_csv(f\"{output_dir}/15_date_one.csv\", index=False, encoding='utf-8-sig')\n"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"## 步骤4: 获取NGV明细数据\n"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 16,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"开始获取NGV明细数据...\n",
|
||
"开始时间: 2026-01-17 13:34:48\n",
|
||
"NGV明细数据: 45686 条\n",
|
||
"NGV明细数据列数: 141\n",
|
||
"已保存到: output/debug_revisit_renew/16_原始NGV数据.csv\n",
|
||
"结束时间: 2026-01-17 13:35:00\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
"timestamp_format = '%Y-%m-%d %H:%M:%S'\n",
"print(\"开始获取NGV明细数据...\")\n",
"print(f\"开始时间: {datetime.datetime.now().strftime(timestamp_format)}\")\n",
"\n",
"# Pull the NGV detail rows through the shared helper\n",
"data_NGV = common_module.get_ngv_details(days_back=1)\n",
"print(f\"NGV明细数据: {len(data_NGV)} 条\")\n",
"print(f\"NGV明细数据列数: {len(data_NGV.columns)}\")\n",
"\n",
"# Keep the untouched rows on disk for the later steps\n",
"raw_ngv_csv = f\"{output_dir}/16_原始NGV数据.csv\"\n",
"data_NGV.to_csv(raw_ngv_csv, index=False, encoding='utf-8-sig')\n",
"print(f\"已保存到: {raw_ngv_csv}\")\n",
"\n",
"print(f\"结束时间: {datetime.datetime.now().strftime(timestamp_format)}\")\n"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"## 步骤5: 构建省市区索引\n"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"省市区索引构建完成: 3667 条\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
"def build_index(json_list):\n",
"    \"\"\"Index the relation records by their (province, city, district) values.\n",
"\n",
"    A record is reported and skipped when any of the three region keys is absent,\n",
"    or when the daily-revisit customer-service field is absent or NaN. NaN counts\n",
"    as absent because records rebuilt from a CSV via pandas carry every column,\n",
"    with NaN where the source had no value, so a key-presence test alone never fires.\n",
"\n",
"    Args:\n",
"        json_list: iterable of dict-like records.\n",
"\n",
"    Returns:\n",
"        dict mapping (province, city, district) to the record; a later record with\n",
"        the same key replaces an earlier one.\n",
"    \"\"\"\n",
"    service_field = '_widget_1734677164871'  # daily revisit customer service\n",
"    index = {}\n",
"    for json_item in json_list:\n",
"        try:\n",
"            key = (json_item['_widget_1734677164861'], json_item['_widget_1734677164862'],\n",
"                   json_item['_widget_1734677164863'])  # province, city, district\n",
"            service_value = json_item[service_field] if service_field in json_item else None\n",
"            # NaN is the only float that differs from itself\n",
"            is_nan = isinstance(service_value, float) and service_value != service_value\n",
"            if service_field not in json_item or is_nan:\n",
"                raise KeyError(\"缺少 '日常回访客服' 键\")\n",
"            # NOTE(review): region values read back from CSV may themselves be NaN, and\n",
"            # such keys can never be matched by a lookup - confirm whether to skip them.\n",
"            index[key] = json_item\n",
"        except KeyError as e:\n",
"            print(f\"警告:{e},跳过该条记录: {json_item}\")\n",
"            continue\n",
"    return index\n",
|
||
"\n",
|
"# Reload the region/staff relation table written by step 2\n",
"csv_path = f\"{output_dir}/01_省市区人员关系表.csv\"\n",
"if not os.path.exists(csv_path):\n",
"    print(f\"警告: 文件不存在 {csv_path},请先执行步骤2\")\n",
"    json_list = []\n",
"else:\n",
"    print(f\"从CSV文件读取: {csv_path}\")\n",
"    json_list_df = pd.read_csv(csv_path, encoding='utf-8-sig')\n",
"    json_list = json_list_df.to_dict('records')\n",
"    print(f\"读取到 {len(json_list)} 条记录\")\n",
"\n",
"# Build the lookup\n",
"index = build_index(json_list)\n",
"print(f\"省市区索引构建完成: {len(index)} 条\")\n",
"\n",
"# Flatten the lookup into one row per key and save it\n",
"index_rows = []\n",
"for region_key, record in index.items():\n",
"    index_rows.append({\"省\": region_key[0], \"市\": region_key[1], \"区\": region_key[2], \"数据\": str(record)})\n",
"index_df = pd.DataFrame(index_rows)\n",
"index_df.to_csv(f\"{output_dir}/17_省市区索引.csv\", index=False, encoding='utf-8-sig')\n"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"## 步骤6: 获取多公司过滤列表\n"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"获取多公司过滤公司id\n",
|
||
"过滤公司条数: 19\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
"import ast  # parses list cells that the CSV round trip turned into text\n",
"\n",
"print(\"获取多公司过滤公司id\")\n",
"# Reload the multi-company filter table written by step 2\n",
"csv_path = f\"{output_dir}/13_多公司过滤表.csv\"\n",
"if os.path.exists(csv_path):\n",
"    print(f\"从CSV文件读取: {csv_path}\")\n",
"    get_filter_company_list_df = pd.read_csv(csv_path, encoding='utf-8-sig')\n",
"    get_filter_company_list = get_filter_company_list_df.to_dict('records')\n",
"    print(f\"读取到 {len(get_filter_company_list)} 条记录\")\n",
"else:\n",
"    print(f\"警告: 文件不存在 {csv_path},请先执行步骤2\")\n",
"    get_filter_company_list = []\n",
"\n",
"all_filter_company_list = []\n",
"for company in get_filter_company_list:\n",
"    company_list = company.get(\"_widget_1755052002491\")\n",
"    if isinstance(company_list, str):\n",
"        # Read back from CSV the sub-table is text; turn it back into Python objects.\n",
"        try:\n",
"            company_list = ast.literal_eval(company_list)\n",
"        except (ValueError, SyntaxError):\n",
"            # Not a parsable literal: skip this record, as before.\n",
"            continue\n",
"    if not isinstance(company_list, (list, tuple)):\n",
"        # None, NaN (an empty CSV cell reads back as a float) or any other\n",
"        # non-sequence value has nothing to iterate over.\n",
"        continue\n",
"    for company_item in company_list:\n",
"        if not isinstance(company_item, dict):\n",
"            continue\n",
"        if company_item.get(\"_widget_1755052002496\") == \"否\":\n",
"            all_filter_company_list.append(company_item.get(\"_widget_1755052002495\"))\n",
"\n",
"print(f\"过滤公司条数: {len(all_filter_company_list)}\")\n",
"pd.Series(all_filter_company_list).to_csv(f\"{output_dir}/18_过滤公司列表.csv\", index=False, encoding='utf-8-sig')\n"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"## 步骤7: 数据处理和过滤(第一部分:基础处理)\n"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"过滤前数据量: 45686\n",
|
||
"过滤后数据量: 45667\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
"# Reload the raw NGV rows written by step 4\n",
"csv_path = f\"{output_dir}/16_原始NGV数据.csv\"\n",
"if not os.path.exists(csv_path):\n",
"    print(f\"警告: 文件不存在 {csv_path},请先执行步骤4\")\n",
"    data_NGV = pd.DataFrame()\n",
"else:\n",
"    print(f\"从CSV文件读取: {csv_path}\")\n",
"    data_NGV = pd.read_csv(csv_path, encoding='utf-8-sig')\n",
"    print(f\"读取到 {len(data_NGV)} 条记录\")\n",
"\n",
"# Work on a copy and parse the expiry / renewal date strings into columns A and B\n",
"data_NGV_processed = data_NGV.assign(\n",
"    A=lambda frame: pd.to_datetime(frame['expiry_time']),\n",
"    B=lambda frame: pd.to_datetime(frame['renew_date']),\n",
")\n",
|
||
"\n",
|
"def replace_values(series):\n",
"    \"\"\"Return the series with missing values and the markers 'NA', 'None', '' replaced by ''.\"\"\"\n",
"    def _blank_if_missing(value):\n",
"        if pd.isna(value) or value in ['NA', 'None', '']:\n",
"            return ''\n",
"        return value\n",
"\n",
"    return series.apply(_blank_if_missing)\n",
|
||
"\n",
|
"# Normalise the missing-value markers in every column\n",
"data_NGV_processed = data_NGV_processed.apply(replace_values)\n",
"\n",
"# Drop the rows whose group id is in the multi-company filter list\n",
"print(f\"过滤前数据量: {len(data_NGV_processed)}\")\n",
"in_filter_list = data_NGV_processed['id_own_group'].isin(all_filter_company_list)\n",
"data_NGV_processed = data_NGV_processed[~in_filter_list]\n",
"print(f\"过滤后数据量: {len(data_NGV_processed)}\")\n",
"\n",
"# Save the filtered rows for the next step\n",
"filtered_csv = f\"{output_dir}/19_过滤多公司后数据.csv\"\n",
"data_NGV_processed.to_csv(filtered_csv, index=False, encoding='utf-8-sig')\n"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"## 步骤8: 数据处理和过滤(第二部分:优先级排序和最佳值计算)\n"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"合并最佳值后数据量: 45667\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
"# Reload the output of step 7\n",
"csv_path = f\"{output_dir}/19_过滤多公司后数据.csv\"\n",
"if not os.path.exists(csv_path):\n",
"    print(f\"警告: 文件不存在 {csv_path},请先执行步骤7\")\n",
"    data_NGV_processed = pd.DataFrame()\n",
"else:\n",
"    print(f\"从CSV文件读取: {csv_path}\")\n",
"    data_NGV_processed = pd.read_csv(csv_path, encoding='utf-8-sig')\n",
"    # The CSV holds dates as text, so rebuild the datetime columns A and B\n",
"    for date_col, text_col in (('A', 'expiry_time'), ('B', 'renew_date')):\n",
"        data_NGV_processed[date_col] = pd.to_datetime(data_NGV_processed[text_col])\n",
"    print(f\"读取到 {len(data_NGV_processed)} 条记录\")\n",
"\n",
"# Priority orders; a lower position is selected first by idxmin below\n",
"edition_order = ['皇冠版', '至尊版', '尊享版', '旗舰版', '标准版', '进阶版', '基础版', '入门版']\n",
"customer_type_order = [\"F\", \"E\", \"D\", \"C\", \"B\", \"A\"]\n",
"group_grade_order = ['全国KA(FMVP)', '区域KA(MVP)', '重要客户(SVIP)', '普通客户(VIP)']\n",
"\n",
"# label -> position lookups\n",
"edition_map = dict(zip(edition_order, range(len(edition_order))))\n",
"customer_type_map = dict(zip(customer_type_order, range(len(customer_type_order))))\n",
"group_grade_map = dict(zip(group_grade_order, range(len(group_grade_order))))\n",
"\n",
"# Rank columns used to pick the best row of each group.\n",
"# NOTE(review): labels absent from the order lists get rank 0 through fillna(0), which\n",
"# ties them with the first label of each list - confirm this is intended.\n",
"rank_sources = (\n",
"    ('edition_rank', 'saas_edition_fmt', edition_map),\n",
"    ('customer_type_rank', 'saas_customer_type', customer_type_map),\n",
"    ('group_grade_rank', 'group_grade', group_grade_map),\n",
")\n",
"for rank_col, label_col, label_map in rank_sources:\n",
"    data_NGV_processed[rank_col] = data_NGV_processed[label_col].map(label_map).fillna(0).astype(int)\n",
"\n",
"rows_by_group = data_NGV_processed.groupby('id_own_group')\n",
"\n",
"# Row with the smallest edition_rank in each group\n",
"best_edition_idx = rows_by_group['edition_rank'].idxmin()\n",
"best_edition_rows = data_NGV_processed.loc[best_edition_idx]\n",
"best_edition_rows['max_saas_edition'] = best_edition_rows['saas_edition_fmt']\n",
"\n",
"# Row with the smallest customer_type_rank in each group; the label is rebuilt from the rank\n",
"best_customer_type_idx = rows_by_group['customer_type_rank'].idxmin()\n",
"best_customer_type_rows = data_NGV_processed.loc[best_customer_type_idx]\n",
"best_customer_type_rows['max_saas_customer_type'] = [\n",
"    customer_type_order[rank] for rank in best_customer_type_rows['customer_type_rank']\n",
"]\n",
"\n",
"# Row with the smallest group_grade_rank in each group\n",
"best_group_grade_idx = rows_by_group['group_grade_rank'].idxmin()\n",
"best_group_grade_rows = data_NGV_processed.loc[best_group_grade_idx]\n",
"best_group_grade_rows['max_group_grade'] = best_group_grade_rows['group_grade']\n",
"\n",
"# One row per group carrying the three best values\n",
"best_values = best_edition_rows[['id_own_group', 'max_saas_edition']]\n",
"best_values = best_values.merge(\n",
"    best_customer_type_rows[['id_own_group', 'max_saas_customer_type']], on='id_own_group', how='outer')\n",
"best_values = best_values.merge(\n",
"    best_group_grade_rows[['id_own_group', 'max_group_grade']], on='id_own_group', how='outer')\n",
"\n",
"# Attach the best values to every row of the data set\n",
"data_NGV_processed = data_NGV_processed.merge(best_values, on='id_own_group', how='left')\n",
"\n",
"print(f\"合并最佳值后数据量: {len(data_NGV_processed)}\")\n",
"data_NGV_processed.to_csv(f\"{output_dir}/20_合并最佳值后数据.csv\", index=False, encoding='utf-8-sig')\n"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"## 步骤8.5: 字段数据类型检查(调试用)\n"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"============================================================\n",
|
||
"字段数据类型和值检查\n",
|
||
"============================================================\n",
|
||
"\n",
|
||
"当前 data_NGV_processed 数据量: 45667 条\n",
|
||
"\n",
|
||
"【is_main_org 字段检查】\n",
|
||
"数据类型: object\n",
|
||
"唯一值: ['0', '1']\n",
|
||
"值分布:\n",
|
||
"is_main_org\n",
|
||
"1 37637\n",
|
||
"0 8030\n",
|
||
"Name: count, dtype: int64\n",
|
||
"示例值(前5个): ['1', '0', '0', '0', '1']\n",
|
||
"\n",
|
||
"【org_status 字段检查】\n",
|
||
"数据类型: object\n",
|
||
"唯一值: ['留存', '过期']\n",
|
||
"值分布:\n",
|
||
"org_status\n",
|
||
"留存 27997\n",
|
||
"过期 17670\n",
|
||
"Name: count, dtype: int64\n",
|
||
"\n",
|
||
"【org_type 字段检查】\n",
|
||
"数据类型: object\n",
|
||
"唯一值: ['一般', '天猫']\n",
|
||
"值分布:\n",
|
||
"org_type\n",
|
||
"一般 42998\n",
|
||
"天猫 2669\n",
|
||
"Name: count, dtype: int64\n",
|
||
"\n",
|
||
"【area_manager 字段检查】\n",
|
||
"数据类型: object\n",
|
||
"唯一值数量: 16\n",
|
||
"值分布(前10):\n",
|
||
"area_manager\n",
|
||
"肖军 10795\n",
|
||
"景东强 8353\n",
|
||
"陈庆伟 8301\n",
|
||
"张凯 8232\n",
|
||
"关磊 6994\n",
|
||
"孙玉蕾 2007\n",
|
||
"殷昊 745\n",
|
||
"王涛 161\n",
|
||
"刘伟 52\n",
|
||
" 8\n",
|
||
"Name: count, dtype: int64\n",
|
||
"是否包含'殷昊': True\n",
|
||
"是否包含'孙玉蕾': True\n",
|
||
"\n",
|
||
"【条件匹配测试】\n",
|
||
"警告: is_main_org 是字符串类型,需要转换为数值或使用字符串比较\n",
|
||
"使用数值比较 (== 1): 15066 条\n",
|
||
"使用字符串比较 (== '1'): 15066 条\n",
|
||
"\n",
|
||
"============================================================\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
"# Reload the output of step 8\n",
"csv_path = f\"{output_dir}/20_合并最佳值后数据.csv\"\n",
"if not os.path.exists(csv_path):\n",
"    print(f\"警告: 文件不存在 {csv_path},请先执行步骤8\")\n",
"    data_NGV_processed = pd.DataFrame()\n",
"else:\n",
"    print(f\"从CSV文件读取: {csv_path}\")\n",
"    data_NGV_processed = pd.read_csv(csv_path, encoding='utf-8-sig')\n",
"    print(f\"读取到 {len(data_NGV_processed)} 条记录\")\n",
"\n",
"\n",
"def _print_field_summary(field):\n",
"    \"\"\"Print dtype, sorted unique values and value counts of one column; return the column.\"\"\"\n",
"    column = data_NGV_processed[field]\n",
"    print(f\"\\n【{field} 字段检查】\")\n",
"    print(f\"数据类型: {column.dtype}\")\n",
"    print(f\"唯一值: {sorted(column.unique())}\")\n",
"    print(f\"值分布:\\n{column.value_counts()}\")\n",
"    return column\n",
"\n",
"\n",
"# Dtype and value report for the fields used by the later filters\n",
"banner = \"=\" * 60\n",
"print(banner)\n",
"print(\"字段数据类型和值检查\")\n",
"print(banner)\n",
"print(f\"\\n当前 data_NGV_processed 数据量: {len(data_NGV_processed)} 条\")\n",
"\n",
"# is_main_org (also shows the first five raw values)\n",
"is_main_org_col = _print_field_summary('is_main_org')\n",
"print(f\"示例值(前5个): {is_main_org_col.head().tolist()}\")\n",
"\n",
"# org_status and org_type\n",
"_print_field_summary('org_status')\n",
"_print_field_summary('org_type')\n",
"\n",
"# area_manager: too many values to list, so show the count and the top 10\n",
"area_manager_col = data_NGV_processed['area_manager']\n",
"print(f\"\\n【area_manager 字段检查】\")\n",
"print(f\"数据类型: {area_manager_col.dtype}\")\n",
"print(f\"唯一值数量: {area_manager_col.nunique()}\")\n",
"print(f\"值分布(前10):\\n{area_manager_col.value_counts().head(10)}\")\n",
"print(f\"是否包含'殷昊': {'殷昊' in area_manager_col.values}\")\n",
"print(f\"是否包含'孙玉蕾': {'孙玉蕾' in area_manager_col.values}\")\n",
"\n",
"# Try the 'expired main store' condition with numeric and with text comparison\n",
"print(f\"\\n【条件匹配测试】\")\n",
"is_expired = data_NGV_processed['org_status'] == '过期'\n",
"if data_NGV_processed['is_main_org'].dtype == 'object':\n",
"    print(\"警告: is_main_org 是字符串类型,需要转换为数值或使用字符串比较\")\n",
"    is_main_org_numeric = pd.to_numeric(data_NGV_processed['is_main_org'], errors='coerce')\n",
"    condition1_test = (is_main_org_numeric == 1) & is_expired\n",
"    condition1_test_str = (data_NGV_processed['is_main_org'] == '1') & is_expired\n",
"    print(f\"使用数值比较 (== 1): {condition1_test.sum()} 条\")\n",
"    print(f\"使用字符串比较 (== '1'): {condition1_test_str.sum()} 条\")\n",
"else:\n",
"    condition1_test = (data_NGV_processed['is_main_org'] == 1) & is_expired\n",
"    print(f\"主店过期条件匹配: {condition1_test.sum()} 条\")\n",
"\n",
"# Save the findings as a one-row table\n",
"check_result = {\n",
"    'is_main_org_dtype': str(data_NGV_processed['is_main_org'].dtype),\n",
"    'is_main_org_unique_values': list(data_NGV_processed['is_main_org'].unique()),\n",
"    'org_status_unique_values': list(data_NGV_processed['org_status'].unique()),\n",
"    'org_type_unique_values': list(data_NGV_processed['org_type'].unique()),\n",
"    'area_manager_unique_count': data_NGV_processed['area_manager'].nunique(),\n",
"    'has_殷昊': '殷昊' in data_NGV_processed['area_manager'].values,\n",
"    'has_孙玉蕾': '孙玉蕾' in data_NGV_processed['area_manager'].values,\n",
"}\n",
"pd.DataFrame([check_result]).to_csv(f\"{output_dir}/20.5_字段检查结果.csv\", index=False, encoding='utf-8-sig')\n",
"\n",
"print(\"\\n\" + banner)\n"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"## 步骤9: 数据处理和过滤(第三部分:主店过期处理)\n"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 23,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"主店过期数据量: 0\n",
|
||
"警告: 主店过期数据为空,请检查 is_main_org 和 org_status 字段\n",
|
||
"满足条件的分店数据量: 0\n",
|
||
"警告: 主店过期数据为空,无法筛选分店数据\n",
|
||
"满足条件的主店数据量: 20159\n",
|
||
"警告: 没有分店数据可合并\n",
|
||
"合并后总数据量: 20159\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
"# Main store expired -> one branch store per group stands in for it.\n",
"#\n",
"# The rows are read fresh from step 8's CSV instead of the in-kernel data_NGV_processed:\n",
"# this cell reassigns data_NGV_processed to its own filtered result, so running it a\n",
"# second time against the kernel variable would start from already-filtered rows and\n",
"# report no expired main stores and no branch stores.\n",
"csv_path = f\"{output_dir}/20_合并最佳值后数据.csv\"\n",
"if os.path.exists(csv_path):\n",
"    print(f\"从CSV文件读取: {csv_path}\")\n",
"    ngv_source = pd.read_csv(csv_path, encoding='utf-8-sig')\n",
"    print(f\"读取到 {len(ngv_source)} 条记录\")\n",
"else:\n",
"    print(f\"警告: 文件不存在 {csv_path},改用内存中的 data_NGV_processed(重复执行本单元会得到错误结果)\")\n",
"    ngv_source = data_NGV_processed.copy()\n",
"\n",
"# is_main_org may be numeric or text ('0'/'1'); to_numeric covers both cases\n",
"is_main = pd.to_numeric(ngv_source['is_main_org'], errors='coerce') == 1\n",
"is_expired = ngv_source['org_status'] == '过期'\n",
"# Shared part of the main-store and branch-store filters\n",
"eligible = ((ngv_source['org_type'] == \"一般\") & (ngv_source['org_status'] == '留存') &\n",
"            (ngv_source['area_manager'] != '殷昊') & (ngv_source['area_manager'] != '孙玉蕾'))\n",
"\n",
"# Expired main stores\n",
"ngvv2 = ngv_source[is_main & is_expired]\n",
"print(f\"主店过期数据量: {len(ngvv2)}\")\n",
"if len(ngvv2) > 0:\n",
"    ngvv2.to_csv(f\"{output_dir}/21_主店过期数据.csv\", index=False, encoding='utf-8-sig')\n",
"else:\n",
"    print(\"警告: 主店过期数据为空,请检查 is_main_org 和 org_status 字段\")\n",
"\n",
"# Eligible branch stores (everything that is not a main store)\n",
"data_NGV_V2 = ngv_source.assign(**{'条件': eligible & ~is_main})\n",
"data_NGV_V2 = data_NGV_V2.loc[data_NGV_V2[\"条件\"]].copy()\n",
"print(f\"满足条件的分店数据量: {len(data_NGV_V2)}\")\n",
"\n",
"# Keep only the branch stores whose group has an expired main store\n",
"if len(ngvv2) > 0:\n",
"    data_NGV_V2['exists_in_ngvv2'] = data_NGV_V2['id_own_group'].isin(ngvv2['id_own_group'])\n",
"    filtered_data = data_NGV_V2[data_NGV_V2['exists_in_ngvv2']]\n",
"    print(f\"存在于主店过期列表的分店数据量: {len(filtered_data)}\")\n",
"\n",
"    if len(filtered_data) > 0:\n",
"        filtered_data.to_csv(f\"{output_dir}/22_分店数据.csv\", index=False, encoding='utf-8-sig')\n",
"\n",
"        # Sort by edition priority, then keep the first branch store of each group\n",
"        fixed_order = ['皇冠版', '至尊版', '尊享版', '旗舰版', '标准版', '进阶版', '基础版', '入门版']\n",
"        fixed_order_map = {edition: index for index, edition in enumerate(fixed_order)}\n",
"        filtered_data = (\n",
"            filtered_data\n",
"            .assign(sort_key=filtered_data['saas_edition_fmt'].map(fixed_order_map))\n",
"            .sort_values(by='sort_key')\n",
"            .drop('sort_key', axis=1)\n",
"        )\n",
"        result = filtered_data.drop_duplicates(subset='id_own_group', keep='first')\n",
"        print(f\"去重后的分店数据量: {len(result)}\")\n",
"        if len(result) > 0:\n",
"            result.to_csv(f\"{output_dir}/23_去重后分店数据.csv\", index=False, encoding='utf-8-sig')\n",
"    else:\n",
"        print(\"警告: 没有分店数据存在于主店过期列表中\")\n",
"        result = pd.DataFrame()\n",
"else:\n",
"    print(\"警告: 主店过期数据为空,无法筛选分店数据\")\n",
"    result = pd.DataFrame()\n",
"\n",
"# Eligible main stores\n",
"main_rows = ngv_source.assign(**{'条件': eligible & is_main})\n",
"main_rows = main_rows.loc[main_rows[\"条件\"]]\n",
"print(f\"满足条件的主店数据量: {len(main_rows)}\")\n",
"\n",
"# Main stores plus the stand-in branch stores become the new data_NGV_processed\n",
"if len(result) > 0:\n",
"    data_NGV_processed = pd.concat([main_rows, result], axis=0)\n",
"else:\n",
"    print(\"警告: 没有分店数据可合并\")\n",
"    data_NGV_processed = main_rows\n",
"print(f\"合并后总数据量: {len(data_NGV_processed)}\")\n",
"\n",
"if len(data_NGV_processed) > 0:\n",
"    data_NGV_processed.to_csv(f\"{output_dir}/24_合并主店分店后数据.csv\", index=False, encoding='utf-8-sig')\n",
"else:\n",
"    print(\"警告: 合并后数据为空,请检查前面的过滤条件\")\n"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"## 步骤10: 数据处理和过滤(第四部分:续约日期处理)\n"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 24,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"计算条件前数据量: 20159\n",
|
||
"条件>0的数据量: 9740\n",
|
||
"过滤后数据量: 9740\n",
|
||
"数据量: 9740\n",
|
||
"年数分布: {3: 7972, 2: 1388, 5: 259, 4: 98, 6: 14, 7: 4, 10: 4, 8: 1}\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
"data_details = data_NGV_processed.copy().reset_index(drop=True)\n",
"\n",
"\n",
"def _renewal_span_days(row):\n",
"    \"\"\"Return the days between columns A and B when the gap is at least 730 days, else 0.\"\"\"\n",
"    gap = pd.to_datetime(row['A']) - pd.to_datetime(row['B'])\n",
"    if gap >= pd.Timedelta(days=730):\n",
"        return gap.days\n",
"    return 0\n",
"\n",
"\n",
"# Day gap per row; rows below the 730-day threshold get 0 and are dropped next\n",
"data_details['条件'] = data_details.apply(_renewal_span_days, axis=1)\n",
"print(f\"计算条件前数据量: {len(data_details)}\")\n",
"has_span = data_details['条件'] > 0\n",
"print(f\"条件>0的数据量: {len(data_details[has_span])}\")\n",
"data_details = data_details.loc[has_span]\n",
"print(f\"过滤后数据量: {len(data_details)}\")\n",
"span_csv = f\"{output_dir}/25_续约日期过滤后数据.csv\"\n",
"data_details.to_csv(span_csv, index=False, encoding='utf-8-sig')\n",
|
||
"\n",
|
||
"# 计算年数\n",
|
||
"def divide_by_365(x):\n",
|
||
" if isinstance(x, (int, float)):\n",
|
||
" return int(x / 365)\n",
|
||
" else:\n",
|
||
" return x\n",
|
||
"\n",
|
||
"data_details['年'] = data_details['条件'].apply(divide_by_365)\n",
|
||
"data_details = data_details.reset_index(drop=True)\n",
|
||
"print(f\"数据量: {len(data_details)}\")\n",
|
||
"print(f\"年数分布: {data_details['年'].value_counts().to_dict()}\")\n",
|
||
"data_details.to_csv(f\"{output_dir}/26_计算年数后数据.csv\", index=False, encoding='utf-8-sig')\n"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"## 步骤11: 数据处理和过滤(第五部分:生成历史续约日期)\n"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"# 从CSV加载前一步的数据(步骤10的输出)\n",
|
||
"csv_path = f\"{output_dir}/26_计算年数后数据.csv\"\n",
|
||
"if os.path.exists(csv_path):\n",
|
||
" print(f\"从CSV文件读取: {csv_path}\")\n",
|
||
" data_details = pd.read_csv(csv_path, encoding='utf-8-sig')\n",
|
||
" print(f\"读取到 {len(data_details)} 条记录\")\n",
|
||
"else:\n",
|
||
" print(f\"警告: 文件不存在 {csv_path},请先执行步骤10\")\n",
|
||
" data_details = pd.DataFrame()\n",
|
||
"\n",
|
||
"# 创建新的DataFrame用于存储历史续约日期\n",
|
||
"new_df = pd.DataFrame()\n",
|
||
"# 使用 from datetime import datetime 避免冲突\n",
|
||
"from datetime import datetime as dt\n",
|
||
"\n",
|
||
"for index, row in data_details.iterrows():\n",
|
||
" if row[\"renew_date\"] != \"2024-02-29\":\n",
|
||
" # 修复:确保 '年' 是整数类型,处理可能的NaN或float值\n",
|
||
" try:\n",
|
||
" year_value = int(row['年']) if pd.notna(row['年']) else 0\n",
|
||
" except (ValueError, TypeError):\n",
|
||
" year_value = 0\n",
|
||
" \n",
|
||
" # 只有当年数大于1时才生成历史续约日期\n",
|
||
" if year_value > 1:\n",
|
||
" for i_new in range(1, year_value):\n",
|
||
" row_new = row.copy()\n",
|
||
" c = row_new[\"renew_date\"]\n",
|
||
" # 使用 dt.strptime (dt 是 datetime.datetime 的别名)\n",
|
||
" date_obj = dt.strptime(str(c), \"%Y-%m-%d\")\n",
|
||
" new_year = date_obj.year + i_new\n",
|
||
" new_date_obj = date_obj.replace(year=new_year)\n",
|
||
" new_c = new_date_obj.strftime(\"%Y-%m-%d\")\n",
|
||
" row_new[\"renew_date\"] = new_c\n",
|
||
" new_df = pd.concat([new_df, pd.DataFrame([row_new])], ignore_index=True)\n",
|
||
"\n",
|
||
"print(f\"生成的历史续约日期数据量: {len(new_df)}\")\n",
|
||
"if len(new_df) > 0:\n",
|
||
" new_df.to_csv(f\"{output_dir}/27_生成的历史续约日期数据.csv\", index=False, encoding='utf-8-sig')\n",
|
||
"\n",
|
||
"# 合并两个DataFrame\n",
|
||
"merged_df = pd.concat([data_NGV_processed, new_df], axis=0, ignore_index=True)\n",
|
||
"data_details = merged_df.copy()\n",
|
||
"print(f\"合并后总数据量: {len(data_details)}\")\n",
|
||
"data_details.to_csv(f\"{output_dir}/28_合并历史续约日期后数据.csv\", index=False, encoding='utf-8-sig')\n"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"## 步骤12: 数据处理和过滤(第六部分:最终过滤)\n"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"renew_date不为空的数据量: 38932\n",
|
||
"过滤前数据量: 38932\n",
|
||
"创建年份等于续约年份的数据量: 8246\n",
|
||
"最终过滤后数据量: 30686\n",
|
||
"\n",
|
||
"最终数据已保存到: output/debug_revisit_renew/30_最终过滤后数据.csv\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"# 从CSV加载前一步的数据(步骤11的输出)\n",
|
||
"csv_path = f\"{output_dir}/28_合并历史续约日期后数据.csv\"\n",
|
||
"if os.path.exists(csv_path):\n",
|
||
" print(f\"从CSV文件读取: {csv_path}\")\n",
|
||
" data_details = pd.read_csv(csv_path, encoding='utf-8-sig')\n",
|
||
" print(f\"读取到 {len(data_details)} 条记录\")\n",
|
||
"else:\n",
|
||
" print(f\"警告: 文件不存在 {csv_path},请先执行步骤11\")\n",
|
||
" data_details = pd.DataFrame()\n",
|
||
"\n",
|
||
"# 过滤renew_date不为空的数据\n",
|
||
"data_details_not_null = data_details[data_details['renew_date'].notnull()]\n",
|
||
"data_details_not_null = data_details_not_null.reset_index(drop=True)\n",
|
||
"print(f\"renew_date不为空的数据量: {len(data_details_not_null)}\")\n",
|
||
"data_details_not_null.to_csv(f\"{output_dir}/29_renew_date不为空数据.csv\", index=False, encoding='utf-8-sig')\n",
|
||
"\n",
|
||
"data_details = data_details_not_null.copy()\n",
|
||
"\n",
|
||
"# 过滤掉创建年份等于续约年份的数据\n",
|
||
"data_details['saas_create_time'] = data_details['saas_create_time'].str[:4]\n",
|
||
"data_details['renew_date_new'] = data_details['renew_date'].str[:4]\n",
|
||
"print(f\"过滤前数据量: {len(data_details)}\")\n",
|
||
"print(f\"创建年份等于续约年份的数据量: {len(data_details[data_details['saas_create_time'] == data_details['renew_date_new']])}\")\n",
|
||
"data_details = data_details[data_details['saas_create_time'] != data_details['renew_date_new']]\n",
|
||
"data_details = data_details.reset_index(drop=True)\n",
|
||
"print(f\"最终过滤后数据量: {len(data_details)}\")\n",
|
||
"\n",
|
||
"data_details.to_csv(f\"{output_dir}/30_最终过滤后数据.csv\", index=False, encoding='utf-8-sig')\n",
|
||
"print(f\"\\n最终数据已保存到: {output_dir}/30_最终过滤后数据.csv\")\n"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"## 步骤13: 日期计算和循环处理(90/120/180天数据筛选)\n"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"ename": "AttributeError",
|
||
"evalue": "type object 'datetime.datetime' has no attribute 'datetime'",
|
||
"output_type": "error",
|
||
"traceback": [
|
||
"\u001b[31m---------------------------------------------------------------------------\u001b[39m",
|
||
"\u001b[31mAttributeError\u001b[39m Traceback (most recent call last)",
|
||
"\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[27]\u001b[39m\u001b[32m, line 5\u001b[39m\n\u001b[32m 2\u001b[39m date_120 = \u001b[32m113\u001b[39m\n\u001b[32m 3\u001b[39m date_180 = \u001b[32m173\u001b[39m\n\u001b[32m----> \u001b[39m\u001b[32m5\u001b[39m start_time = \u001b[43mdatetime\u001b[49m\u001b[43m.\u001b[49m\u001b[43mdatetime\u001b[49m.now()\n\u001b[32m 6\u001b[39m now_time = start_time.replace()\n\u001b[32m 8\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m now_time.strftime(\u001b[33m\"\u001b[39m\u001b[33m%\u001b[39m\u001b[33mY-\u001b[39m\u001b[33m%\u001b[39m\u001b[33mm-\u001b[39m\u001b[38;5;132;01m%d\u001b[39;00m\u001b[33m\"\u001b[39m) \u001b[38;5;129;01min\u001b[39;00m date_list:\n",
|
||
"\u001b[31mAttributeError\u001b[39m: type object 'datetime.datetime' has no attribute 'datetime'"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"# 从CSV加载最终过滤后的数据(步骤12的输出)\n",
|
||
"csv_path = f\"{output_dir}/30_最终过滤后数据.csv\"\n",
|
||
"if os.path.exists(csv_path):\n",
|
||
" print(f\"从CSV文件读取: {csv_path}\")\n",
|
||
" data_details = pd.read_csv(csv_path, encoding='utf-8-sig')\n",
|
||
" print(f\"读取到 {len(data_details)} 条记录\")\n",
|
||
"else:\n",
|
||
" print(f\"警告: 文件不存在 {csv_path},请先执行步骤12\")\n",
|
||
" data_details = pd.DataFrame()\n",
|
||
"\n",
|
||
"# 从CSV加载节假日列表和date_one(步骤3的输出)\n",
|
||
"date_list_csv = f\"{output_dir}/14_节假日列表.csv\"\n",
|
||
"if os.path.exists(date_list_csv):\n",
|
||
" date_list = pd.read_csv(date_list_csv, encoding='utf-8-sig').iloc[:, 0].tolist()\n",
|
||
"else:\n",
|
||
" print(f\"警告: 文件不存在 {date_list_csv},请先执行步骤3\")\n",
|
||
" date_list = []\n",
|
||
"\n",
|
||
"date_one_csv = f\"{output_dir}/15_date_one.csv\"\n",
|
||
"if os.path.exists(date_one_csv):\n",
|
||
" date_one = pd.read_csv(date_one_csv, encoding='utf-8-sig').iloc[0, 0]\n",
|
||
"else:\n",
|
||
" print(f\"警告: 文件不存在 {date_one_csv},请先执行步骤3\")\n",
|
||
" date_one = 1\n",
|
||
"\n",
|
||
"date_90 = 83\n",
|
||
"date_120 = 113\n",
|
||
"date_180 = 173\n",
|
||
"\n",
|
||
"start_time = datetime.datetime.now()\n",
|
||
"now_time = start_time.replace()\n",
|
||
"\n",
|
||
"if now_time.strftime(\"%Y-%m-%d\") in date_list:\n",
|
||
" date_one = 0\n",
|
||
" print(\"开始次数:\", date_one)\n",
|
||
" print(\"当前日期:\", now_time)\n",
|
||
"\n",
|
||
"print(f\"遍历次数:{date_one}\")\n",
|
||
"\n",
|
||
"# 存储所有派发数据\n",
|
||
"all_distribution_data = []\n",
|
||
"\n",
|
||
"for i in range(0, date_one):\n",
|
||
" print(f\"\\n========== 这是第{i}次遍历 ==========\")\n",
|
||
" now_time = datetime.datetime.now() + datetime.timedelta(days=-(i + 1))\n",
|
||
" \n",
|
||
" today = now_time + datetime.timedelta(days=-date_90)\n",
|
||
" formatted_today_90 = today.strftime(\"%Y-%m-%d\")\n",
|
||
" today = now_time + datetime.timedelta(days=-date_120)\n",
|
||
" formatted_today_120 = today.strftime(\"%Y-%m-%d\")\n",
|
||
" today = now_time + datetime.timedelta(days=-date_180)\n",
|
||
" formatted_today_180 = today.strftime(\"%Y-%m-%d\")\n",
|
||
" \n",
|
||
" print(f\"90天为{formatted_today_90},120天为{formatted_today_120},180天为{formatted_today_180}\")\n",
|
||
" \n",
|
||
" # 获取90天数据\n",
|
||
" data_details_90 = data_details.copy()\n",
|
||
" data_details_90['条件'] = ((data_details_90['renew_date'] == formatted_today_90) & \n",
|
||
" (data_details_90['group_grade'] != \"普通客户(VIP)\"))\n",
|
||
" data_details_90 = data_details_90.loc[data_details_90[\"条件\"]]\n",
|
||
" print(f\"90天数据量: {len(data_details_90)}\")\n",
|
||
" \n",
|
||
" # 获取120天数据\n",
|
||
" data_details_120 = data_details.copy()\n",
|
||
" data_details_120['条件'] = ((data_details_120['renew_date'] == formatted_today_120) &\n",
|
||
" ((data_details_120['saas_edition_fmt'] == '基础版') |\n",
|
||
" (data_details_120['saas_edition_fmt'] == '入门版')))\n",
|
||
" data_details_120 = data_details_120.loc[data_details_120[\"条件\"]]\n",
|
||
" print(f\"120天数据量: {len(data_details_120)}\")\n",
|
||
" \n",
|
||
" # 获取180天数据\n",
|
||
" data_details_180 = data_details.copy()\n",
|
||
" data_details_180['条件'] = (data_details_180['renew_date'] == formatted_today_180)\n",
|
||
" data_details_180 = data_details_180.loc[data_details_180[\"条件\"]]\n",
|
||
" print(f\"180天数据量: {len(data_details_180)}\")\n",
|
||
" \n",
|
||
" # 添加跟进阶段和主要目的\n",
|
||
" data_details_90[\"跟进阶段\"] = \"续约后90天回访\"\n",
|
||
" data_details_90[\"主要目的\"] = \"关怀使用情况,促进更多功能使用,提升系统使用深度。\"\n",
|
||
" data_details_120[\"跟进阶段\"] = \"续约后120天回访\"\n",
|
||
" data_details_120[\"主要目的\"] = \"暂无\"\n",
|
||
" data_details_180[\"跟进阶段\"] = \"续约后180天回访\"\n",
|
||
" data_details_180[\"主要目的\"] = \"关怀使用情况,促进增购商机转化,识别潜在风险,及时提报。\"\n",
|
||
" \n",
|
||
" # 合并三个DataFrame(去除续约120天回访)\n",
|
||
" data_result = pd.concat([data_details_90, data_details_180], ignore_index=True)\n",
|
||
" print(f\"合并后派发数据长度:{len(data_result)}\")\n",
|
||
" \n",
|
||
" if len(data_result) > 0:\n",
|
||
" # 保存每次循环的派发数据\n",
|
||
" data_result.to_csv(f\"{output_dir}/31_第{i}次遍历_派发数据.csv\", index=False, encoding='utf-8-sig')\n",
|
||
" all_distribution_data.append(data_result)\n",
|
||
" else:\n",
|
||
" print(f\"警告:第{i}次遍历没有派发数据!\")\n",
|
||
" # 保存空数据的原因分析\n",
|
||
" analysis = {\n",
|
||
" '遍历次数': i,\n",
|
||
" '日期': now_time.strftime('%Y-%m-%d'),\n",
|
||
" '90天日期': formatted_today_90,\n",
|
||
" '180天日期': formatted_today_180,\n",
|
||
" '90天匹配数量': len(data_details_90),\n",
|
||
" '180天匹配数量': len(data_details_180),\n",
|
||
" '总数据量': len(data_details),\n",
|
||
" 'renew_date唯一值数量': data_details['renew_date'].nunique() if len(data_details) > 0 else 0\n",
|
||
" }\n",
|
||
" pd.DataFrame([analysis]).to_csv(f\"{output_dir}/32_第{i}次遍历_空数据原因分析.csv\", index=False, encoding='utf-8-sig')\n",
|
||
"\n",
|
||
"# 合并所有派发数据\n",
|
||
"if all_distribution_data:\n",
|
||
" final_distribution_data = pd.concat(all_distribution_data, ignore_index=True)\n",
|
||
" print(f\"\\n总派发数据量: {len(final_distribution_data)}\")\n",
|
||
" final_distribution_data.to_csv(f\"{output_dir}/33_最终派发数据.csv\", index=False, encoding='utf-8-sig')\n",
|
||
" print(f\"最终派发数据已保存到: {output_dir}/33_最终派发数据.csv\")\n",
|
||
"else:\n",
|
||
" print(\"\\n警告:所有遍历都没有派发数据!\")\n",
|
||
" # 分析为什么没有派发数据\n",
|
||
" analysis = {\n",
|
||
" '总数据量': len(data_details),\n",
|
||
" 'renew_date唯一值': data_details['renew_date'].nunique() if len(data_details) > 0 else 0,\n",
|
||
" 'renew_date范围': f\"{data_details['renew_date'].min()} 到 {data_details['renew_date'].max()}\" if len(data_details) > 0 else '无数据',\n",
|
||
" 'date_one': date_one,\n",
|
||
" '当前日期': datetime.datetime.now().strftime('%Y-%m-%d')\n",
|
||
" }\n",
|
||
" pd.DataFrame([analysis]).to_csv(f\"{output_dir}/34_派发数据为空原因分析.csv\", index=False, encoding='utf-8-sig')\n"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"## 步骤14: 派发数据为空的原因分析\n"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"# 分析派发数据为空的原因\n",
|
||
"print(\"========== 派发数据为空原因分析 ==========\")\n",
|
||
"\n",
|
||
"if len(data_details) == 0:\n",
|
||
" print(\"问题1: 最终过滤后数据为空\")\n",
|
||
" print(\"请检查步骤12的过滤条件\")\n",
|
||
"else:\n",
|
||
" print(f\"最终数据量: {len(data_details)}\")\n",
|
||
" \n",
|
||
" # 检查renew_date的分布\n",
|
||
" print(f\"\\nrenew_date唯一值数量: {data_details['renew_date'].nunique()}\")\n",
|
||
" print(f\"renew_date范围: {data_details['renew_date'].min()} 到 {data_details['renew_date'].max()}\")\n",
|
||
" \n",
|
||
" # 计算目标日期范围\n",
|
||
" now_time = datetime.datetime.now()\n",
|
||
" target_date_90 = (now_time - datetime.timedelta(days=83)).strftime(\"%Y-%m-%d\")\n",
|
||
" target_date_180 = (now_time - datetime.timedelta(days=173)).strftime(\"%Y-%m-%d\")\n",
|
||
" \n",
|
||
" print(f\"\\n目标90天日期: {target_date_90}\")\n",
|
||
" print(f\"目标180天日期: {target_date_180}\")\n",
|
||
" \n",
|
||
" # 检查是否有匹配的数据\n",
|
||
" match_90 = data_details[data_details['renew_date'] == target_date_90]\n",
|
||
" match_180 = data_details[data_details['renew_date'] == target_date_180]\n",
|
||
" \n",
|
||
" print(f\"\\n匹配90天日期的数据量: {len(match_90)}\")\n",
|
||
" print(f\"匹配180天日期的数据量: {len(match_180)}\")\n",
|
||
" \n",
|
||
" if len(match_90) > 0:\n",
|
||
" # 检查90天数据的group_grade过滤\n",
|
||
" match_90_filtered = match_90[match_90['group_grade'] != \"普通客户(VIP)\"]\n",
|
||
" print(f\"90天数据过滤后(排除普通客户): {len(match_90_filtered)}\")\n",
|
||
" if len(match_90_filtered) == 0:\n",
|
||
" print(\"问题: 90天数据全部被group_grade过滤掉\")\n",
|
||
" print(f\"90天数据的group_grade分布: {match_90['group_grade'].value_counts().to_dict()}\")\n",
|
||
" \n",
|
||
" # 检查renew_date的日期分布\n",
|
||
" print(f\"\\nrenew_date日期分布(前20个):\")\n",
|
||
" print(data_details['renew_date'].value_counts().head(20))\n",
|
||
" \n",
|
||
" # 保存分析结果\n",
|
||
" analysis_result = {\n",
|
||
" '最终数据量': len(data_details),\n",
|
||
" 'renew_date唯一值数量': data_details['renew_date'].nunique(),\n",
|
||
" 'renew_date最小值': data_details['renew_date'].min(),\n",
|
||
" 'renew_date最大值': data_details['renew_date'].max(),\n",
|
||
" '目标90天日期': target_date_90,\n",
|
||
" '目标180天日期': target_date_180,\n",
|
||
" '匹配90天日期数量': len(match_90),\n",
|
||
" '匹配180天日期数量': len(match_180),\n",
|
||
" 'date_one': date_one,\n",
|
||
" '当前日期': datetime.datetime.now().strftime('%Y-%m-%d')\n",
|
||
" }\n",
|
||
" pd.DataFrame([analysis_result]).to_csv(f\"{output_dir}/35_派发数据为空原因分析.csv\", index=False, encoding='utf-8-sig')\n",
|
||
" \n",
|
||
" # 保存renew_date的详细分布\n",
|
||
" renew_date_dist = data_details['renew_date'].value_counts().reset_index()\n",
|
||
" renew_date_dist.columns = ['renew_date', 'count']\n",
|
||
" renew_date_dist.to_csv(f\"{output_dir}/36_renew_date分布.csv\", index=False, encoding='utf-8-sig')\n"
|
||
]
|
||
}
|
||
],
|
||
"metadata": {
|
||
"kernelspec": {
|
||
"display_name": "SaaS",
|
||
"language": "python",
|
||
"name": "python3"
|
||
},
|
||
"language_info": {
|
||
"codemirror_mode": {
|
||
"name": "ipython",
|
||
"version": 3
|
||
},
|
||
"file_extension": ".py",
|
||
"mimetype": "text/x-python",
|
||
"name": "python",
|
||
"nbconvert_exporter": "python",
|
||
"pygments_lexer": "ipython3",
|
||
"version": "3.13.11"
|
||
}
|
||
},
|
||
"nbformat": 4,
|
||
"nbformat_minor": 2
|
||
}
|