900 lines
36 KiB
Plaintext
900 lines
36 KiB
Plaintext
{
|
||
"cells": [
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"# 数据库验证脚本 - 数据处理部分\n",
|
||
"\n",
|
||
"本notebook用于调试和验证数据库验证脚本的数据处理逻辑(260-425行)\n",
|
||
"\n",
|
||
"## 使用说明\n",
|
||
"1. 先执行库导入单元格(第2个单元格),再执行“步骤1”下的数据加载单元格,数据加载部分比较耗时\n",
|
||
"2. 数据加载完成后,再执行后续的数据处理单元格\n",
|
||
"3. 每个单元格都可以单独执行和调试\n"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 1,
|
||
"metadata": {
|
||
"ExecuteTime": {
|
||
"end_time": "2026-01-16T06:53:03.604128900Z",
|
||
"start_time": "2026-01-16T06:53:01.840121200Z"
|
||
}
|
||
},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"库导入完成\n",
|
||
"项目根目录: D:\\Idea Project\\SaaS_V1.7\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"# 导入必要的库\n",
|
||
"import os\n",
|
||
"import sys\n",
|
||
"import pandas as pd\n",
|
||
"import datetime\n",
|
||
"from datetime import datetime, timedelta\n",
|
||
"import re\n",
|
||
"\n",
|
||
"# 添加项目根目录到路径(notebook文件在test目录下,需要添加父目录)\n",
|
||
"current_dir = os.getcwd()\n",
|
||
"# 如果当前目录是test,则添加父目录;否则添加当前目录\n",
|
||
"if os.path.basename(current_dir) == 'test':\n",
|
||
" project_root = os.path.dirname(current_dir)\n",
|
||
"else:\n",
|
||
" project_root = current_dir\n",
|
||
"sys.path.insert(0, project_root)\n",
|
||
"\n",
|
||
"from api import API\n",
|
||
"from back_ground_module import CommonModule\n",
|
||
"from log_config import configure_task_logger, configure_error_task_logger\n",
|
||
"\n",
|
||
"# 初始化API和CommonModule\n",
|
||
"api_instance = API()\n",
|
||
"common_module = CommonModule()\n",
|
||
"\n",
|
||
"# 获取日志记录器\n",
|
||
"logger = configure_task_logger()\n",
|
||
"error_task_logger = configure_error_task_logger()\n",
|
||
"\n",
|
||
"print(\"库导入完成\")\n",
|
||
"print(f\"项目根目录: {project_root}\")\n"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"## 步骤1: 数据加载(耗时操作,可单独执行)\n",
|
||
"\n",
|
||
"这部分会加载所有必要的数据,包括:\n",
|
||
"- 省市区人员关系表\n",
|
||
"- 员工ID列表\n",
|
||
"- 权限表\n",
|
||
"- NGV数据列表\n",
|
||
"- 服务提醒数据\n",
|
||
"- 智能检测数据\n",
|
||
"- 功能使用情况表\n",
|
||
"- 保单识别表\n",
|
||
"- 私域/公域小程序数据\n",
|
||
"- 异业合作数据\n",
|
||
"- 短信数据\n",
|
||
"- 多公司过滤表\n",
|
||
"- NGV明细数据(从数据库获取)\n",
|
||
"- 节假日列表\n"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"# ========== 数据加载部分 ==========\n",
|
||
"# 这部分比较耗时,可以单独执行\n",
|
||
"\n",
|
||
"print(\"开始加载数据...\")\n",
|
||
"\n",
|
||
"# 省市区人员关系表\n",
|
||
"payload = {\"api_key\": \"675b900991ad2491c69389ca\", \"entry_id\": \"676512ac3e54dc3159460c0a\"}\n",
|
||
"json_dict = api_instance.entry_data_list(payload)\n",
|
||
"if json_dict and \"data\" in json_dict:\n",
|
||
" json_list = json_dict.get(\"data\")\n",
|
||
"else:\n",
|
||
" print(\"加载省市区人员关系表失败\")\n",
|
||
" json_list = []\n",
|
||
"print(f\"省市区人员关系表: {len(json_list)} 条\")\n",
|
||
"\n",
|
||
"# 获取简道云员工id\n",
|
||
"payload = {\"api_key\": \"6694d3c4fcb69ca9a111a6c4\", \"entry_id\": \"6769204a1902c9341340a1bc\"}\n",
|
||
"staff_id = api_instance.entry_data_list(payload)\n",
|
||
"staff_id_list = staff_id.get(\"data\")\n",
|
||
"print(f\"员工ID列表: {len(staff_id_list)} 条\")\n",
|
||
"\n",
|
||
"# 获取权限表信息\n",
|
||
"payload = {\"api_key\": \"675b900991ad2491c69389ca\", \"entry_id\": \"675b96c14e839f90fef1647c\"}\n",
|
||
"permissions_table = api_instance.entry_data_list(payload).get(\"data\")\n",
|
||
"print(f\"权限表: {len(permissions_table)} 条\")\n",
|
||
"\n",
|
||
"# 获取NGV数据\n",
|
||
"payload = {\"api_key\": \"675b900991ad2491c69389ca\", \"entry_id\": \"675bb02bd2d53c2034c665e4\"}\n",
|
||
"NGV_data_list = api_instance.entry_data_list(payload).get(\"data\")\n",
|
||
"print(f\"NGV数据列表: {len(NGV_data_list)} 条\")\n",
|
||
"\n",
|
||
"# 获取服务提醒-数据支持表单数据\n",
|
||
"payload = {\"api_key\": \"675b900991ad2491c69389ca\", \"entry_id\": \"676bb7bda3029720f1083e99\"}\n",
|
||
"service_remind = api_instance.entry_data_list(payload).get(\"data\")\n",
|
||
"print(f\"服务提醒数据: {len(service_remind)} 条\")\n",
|
||
"\n",
|
||
"# 获取智能检测-数据支持表单数据\n",
|
||
"payload = {\"api_key\": \"675b900991ad2491c69389ca\", \"entry_id\": \"676bb99649ab3ac975af6e39\"}\n",
|
||
"Smart_detection = api_instance.entry_data_list(payload).get(\"data\")\n",
|
||
"print(f\"智能检测数据: {len(Smart_detection)} 条\")\n",
|
||
"\n",
|
||
"# 获取功能使用情况表\n",
|
||
"payload = {\"api_key\": \"675b900991ad2491c69389ca\", \"entry_id\": \"6763bbf657bd8fb76fcb41b2\"}\n",
|
||
"get_feature_usage = api_instance.entry_data_list(payload).get(\"data\", [])\n",
|
||
"print(f\"功能使用情况表: {len(get_feature_usage)} 条\")\n",
|
||
"\n",
|
||
"# 获取保单识别表\n",
|
||
"payload = {\"api_key\": \"675b900991ad2491c69389ca\", \"entry_id\": \"6773a60d30ed87ff9f68d3c5\"}\n",
|
||
"policy_recognition = api_instance.entry_data_list(payload).get(\"data\")\n",
|
||
"widget_list = [item['_widget_1735632397600'] for item in policy_recognition]\n",
|
||
"print(f\"保单识别表: {len(policy_recognition)} 条\")\n",
|
||
"\n",
|
||
"# 获取私域小程序-数据支持表单数据\n",
|
||
"payload = {\"api_key\": \"675b900991ad2491c69389ca\", \"entry_id\": \"67e0f0fae622896749ba5087\"}\n",
|
||
"private_domain = api_instance.entry_data_list(payload).get(\"data\", [])\n",
|
||
"print(f\"私域小程序数据: {len(private_domain)} 条\")\n",
|
||
"\n",
|
||
"# 获取公域小程序-数据支持表单数据\n",
|
||
"payload = {\"api_key\": \"675b900991ad2491c69389ca\", \"entry_id\": \"67e0c702c8f603b997980999\"}\n",
|
||
"public_domain = api_instance.entry_data_list(payload).get(\"data\", [])\n",
|
||
"public_domain_list = [item['_widget_1742784257506'] for item in public_domain]\n",
|
||
"print(f\"公域小程序数据: {len(public_domain)} 条\")\n",
|
||
"\n",
|
||
"# 获取异业合作-数据支持表单数据\n",
|
||
"payload = {\"api_key\": \"675b900991ad2491c69389ca\", \"entry_id\": \"67e24fdd8dfcfa918e17c30b\"}\n",
|
||
"different_industries = api_instance.entry_data_list(payload).get(\"data\", [])\n",
|
||
"different_industries_list = [item['_widget_1742884829007'] for item in different_industries]\n",
|
||
"print(f\"异业合作数据: {len(different_industries)} 条\")\n",
|
||
"\n",
|
||
"# 获取短信-数据支持表单数据\n",
|
||
"payload = {\"api_key\": \"675b900991ad2491c69389ca\", \"entry_id\": \"67e5107198ba1b20d5df3974\"}\n",
|
||
"groupnotification = api_instance.entry_data_list(payload).get(\"data\", [])\n",
|
||
"print(f\"短信数据: {len(groupnotification)} 条\")\n",
|
||
"\n",
|
||
"# 获取多公司过滤表\n",
|
||
"payload = {\"api_key\": \"675b900991ad2491c69389ca\", \"entry_id\": \"689bf5f8ba88a28cb0679ec9\"}\n",
|
||
"get_filter_company_list = api_instance.entry_data_list(payload).get(\"data\", [])\n",
|
||
"print(f\"多公司过滤表: {len(get_filter_company_list)} 条\")"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"\n",
|
||
"# 获取节假日列表\n",
|
||
"date_list = common_module.get_holiday_list()\n",
|
||
"print(f\"节假日列表: {len(date_list)} 个日期\")\n",
|
||
"\n",
|
||
"# 获取NGV明细数据(从数据库获取,比较耗时)\n",
|
||
"print(\"\\n开始从数据库获取NGV明细数据(这可能需要一些时间)...\")\n",
|
||
"data_NGV = common_module.get_ngv_details(days_back=1)\n",
|
||
"print(f\"NGV明细数据: {len(data_NGV)} 条\")\n",
|
||
"print(f\"NGV明细数据列数: {len(data_NGV.columns)}\")\n",
|
||
"\n",
|
||
"# 构建省市区索引\n",
|
||
"def build_index(json_list):\n",
|
||
" index = {}\n",
|
||
" for json_item in json_list:\n",
|
||
" try:\n",
|
||
" key = (json_item['_widget_1734677164861'], json_item['_widget_1734677164862'],\n",
|
||
" json_item['_widget_1734677164863']) # 省市区\n",
|
||
" if '_widget_1734677164871' not in json_item: # 日常回访客服\n",
|
||
" raise KeyError(\"缺少 '日常回访客服' 键\")\n",
|
||
" index[key] = json_item\n",
|
||
" except KeyError as e:\n",
|
||
" print(f\"警告:{e},跳过该条记录\")\n",
|
||
" continue\n",
|
||
" return index\n",
|
||
"\n",
|
||
"index = build_index(json_list)\n",
|
||
"print(f\"省市区索引构建完成: {len(index)} 条\")\n",
|
||
"\n",
|
||
"print(\"\\n========== 数据加载完成 ==========\")\n",
|
||
"print(f\"数据加载时间: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\")\n"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"## 步骤2: 数据处理逻辑(260-425行)\n",
|
||
"\n",
|
||
"这部分包含主要的数据处理逻辑,包括:\n",
|
||
"- 获取多公司过滤公司id\n",
|
||
"- 数据清洗和转换\n",
|
||
"- 优先级排序\n",
|
||
"- 数据过滤和合并\n",
|
||
"- 日期计算和扩展\n"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"# ========== 获取多公司过滤公司id ==========\n",
|
||
"logger.info(\"获取多公司过滤公司id\")\n",
|
||
"all_filter_company_list = [] # 获取多公司过滤公司id\n",
|
||
"for company in get_filter_company_list:\n",
|
||
" company_list = company.get(\"_widget_1755052002491\")\n",
|
||
" if company_list:\n",
|
||
" for company_item in company_list:\n",
|
||
" if company_item.get(\"_widget_1755052002496\") == \"否\":\n",
|
||
" all_filter_company_list.append(company_item.get(\"_widget_1755052002495\"))\n",
|
||
"logger.info(f\"过滤公司条数:{len(all_filter_company_list)}\")\n",
|
||
"print(f\"过滤公司条数: {len(all_filter_company_list)}\")\n"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"metadata": {
|
||
"ExecuteTime": {
|
||
"end_time": "2026-01-16T07:27:13.175060200Z",
|
||
"start_time": "2026-01-16T07:27:09.857076900Z"
|
||
}
|
||
},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"数据预处理完成,当前数据量: 45672 条\n",
|
||
"数据列数: 143\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"# ========== 数据预处理:日期转换和数据清洗 ==========\n",
|
||
"# 将A列和B列的日期字符串转换为日期格式\n",
|
||
"data_NGV = data_NGV.copy()\n",
|
||
"data_NGV['A'] = pd.to_datetime(data_NGV['expiry_time'])\n",
|
||
"data_NGV['B'] = pd.to_datetime(data_NGV['renew_date'])\n",
|
||
"\n",
|
||
"def replace_values(series):\n",
|
||
" # 使用条件判断来进行替换\n",
|
||
" return series.apply(lambda x: '' if pd.isna(x) or x in ['NA', 'None', ''] else x)\n",
|
||
"\n",
|
||
"# 处理字符串数据并显式指定数据类型\n",
|
||
"data_NGV = data_NGV.apply(replace_values)\n",
|
||
"\n",
|
||
"print(f\"数据预处理完成,当前数据量: {len(data_NGV)} 条\")\n",
|
||
"print(f\"数据列数: {len(data_NGV.columns)}\")\n"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"metadata": {
|
||
"ExecuteTime": {
|
||
"end_time": "2026-01-16T07:28:01.298494800Z",
|
||
"start_time": "2026-01-16T07:28:01.106113700Z"
|
||
}
|
||
},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"过滤多公司后数据量: 45653 条\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"# ========== 过滤多公司 ==========\n",
|
||
"# 针对公司主店过期,取公司最高等级版本派发\n",
|
||
"# 过滤多公司\n",
|
||
"data_NGV = data_NGV[~data_NGV['id_own_group'].isin(all_filter_company_list)]\n",
|
||
"print(f\"过滤多公司后数据量: {len(data_NGV)} 条\")\n"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"metadata": {
|
||
"ExecuteTime": {
|
||
"end_time": "2026-01-16T07:28:04.235079300Z",
|
||
"start_time": "2026-01-16T07:28:04.185820400Z"
|
||
}
|
||
},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"优先级映射完成\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"# ========== 定义优先级顺序和创建映射字典 ==========\n",
|
||
"# 定义优先级顺序\n",
|
||
"edition_order = ['皇冠版', '至尊版', '尊享版', '旗舰版', '标准版', '进阶版', '基础版', '入门版']\n",
|
||
"customer_type_order = [\"F\", \"E\", \"D\", \"C\", \"B\", \"A\"] # 索引越小优先级越高\n",
|
||
"group_grade_order = ['全国KA(FMVP)', '区域KA(MVP)', '重要客户(SVIP)', '普通客户(VIP)']\n",
|
||
"\n",
|
||
"# 创建映射字典,并为不在列表中的值设置默认值\n",
|
||
"edition_map = {edition: idx for idx, edition in enumerate(edition_order)}\n",
|
||
"customer_type_map = {ctype: idx for idx, ctype in enumerate(customer_type_order)}\n",
|
||
"group_grade_map = {grade: idx for idx, grade in enumerate(group_grade_order)}\n",
|
||
"\n",
|
||
"# 添加用于排序的新列,并处理不在映射字典中的值\n",
|
||
"data_NGV['edition_rank'] = data_NGV['saas_edition_fmt'].map(edition_map).fillna(0).astype(int) # 缺失值用最高优先级填充\n",
|
||
"data_NGV['customer_type_rank'] = data_NGV['saas_customer_type'].map(customer_type_map).fillna(0).astype(int)\n",
|
||
"data_NGV['group_grade_rank'] = data_NGV['group_grade'].map(group_grade_map).fillna(0).astype(int)\n",
|
||
"\n",
|
||
"print(\"优先级映射完成\")\n"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"metadata": {
|
||
"ExecuteTime": {
|
||
"end_time": "2026-01-16T07:28:08.790102700Z",
|
||
"start_time": "2026-01-16T07:28:08.399298Z"
|
||
}
|
||
},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"最佳值查找完成\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"# ========== 找到每组中的最佳值 ==========\n",
|
||
"# 找到每组中 edition_rank 最小值对应的行\n",
|
||
"best_edition_idx = data_NGV.groupby('id_own_group')['edition_rank'].idxmin()\n",
|
||
"best_edition_rows = data_NGV.loc[best_edition_idx]\n",
|
||
"best_edition_rows['max_saas_edition'] = best_edition_rows['saas_edition_fmt']\n",
|
||
"\n",
|
||
"# 找到每组中 customer_type_rank 最小值对应的行\n",
|
||
"best_customer_type_idx = data_NGV.groupby('id_own_group')['customer_type_rank'].idxmin()\n",
|
||
"best_customer_type_rows = data_NGV.loc[best_customer_type_idx]\n",
|
||
"best_customer_type_rows['max_saas_customer_type'] = best_customer_type_rows['customer_type_rank'].apply(\n",
|
||
" lambda x: customer_type_order[x])\n",
|
||
"\n",
|
||
"# 找到每组中 group_grade_rank 最小值对应的行\n",
|
||
"best_group_grade_idx = data_NGV.groupby('id_own_group')['group_grade_rank'].idxmin()\n",
|
||
"best_group_grade_rows = data_NGV.loc[best_group_grade_idx]\n",
|
||
"best_group_grade_rows['max_group_grade'] = best_group_grade_rows['group_grade']\n",
|
||
"\n",
|
||
"print(\"最佳值查找完成\")\n"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"metadata": {
|
||
"ExecuteTime": {
|
||
"end_time": "2026-01-16T07:28:11.374632Z",
|
||
"start_time": "2026-01-16T07:28:11.141730700Z"
|
||
}
|
||
},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"最佳值合并完成,当前数据量: 45653 条\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"# ========== 合并最佳值回到原数据集 ==========\n",
|
||
"# 合并最佳值回到原数据集\n",
|
||
"best_values = (\n",
|
||
" best_edition_rows[['id_own_group', 'max_saas_edition']]\n",
|
||
" .merge(best_customer_type_rows[['id_own_group', 'max_saas_customer_type']], on='id_own_group',\n",
|
||
" how='outer')\n",
|
||
" .merge(best_group_grade_rows[['id_own_group', 'max_group_grade']], on='id_own_group', how='outer')\n",
|
||
")\n",
|
||
"\n",
|
||
"# 将最佳值合并回原数据集\n",
|
||
"data_NGV = data_NGV.merge(best_values, on='id_own_group', how='left')\n",
|
||
"\n",
|
||
"print(f\"最佳值合并完成,当前数据量: {len(data_NGV)} 条\")\n"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 10,
|
||
"metadata": {
|
||
"ExecuteTime": {
|
||
"end_time": "2026-01-16T07:30:06.358604500Z",
|
||
"start_time": "2026-01-16T07:30:05.752861600Z"
|
||
}
|
||
},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"============================================================\n",
|
||
"调试信息:处理主店过期情况\n",
|
||
"============================================================\n",
|
||
"\n",
|
||
"当前 data_NGV 数据量: 45653 条\n",
|
||
"\n",
|
||
"【字段检查】\n",
|
||
"is_main_org 数据类型: object\n",
|
||
"is_main_org 唯一值: ['0', '1']\n",
|
||
"is_main_org 值分布:\n",
|
||
"is_main_org\n",
|
||
"1 37628\n",
|
||
"0 8025\n",
|
||
"Name: count, dtype: int64\n",
|
||
"\n",
|
||
"org_status 数据类型: object\n",
|
||
"org_status 唯一值: ['留存', '过期']\n",
|
||
"org_status 值分布:\n",
|
||
"org_status\n",
|
||
"留存 27985\n",
|
||
"过期 17668\n",
|
||
"Name: count, dtype: int64\n",
|
||
"\n",
|
||
"org_type 数据类型: object\n",
|
||
"org_type 唯一值: ['一般', '天猫']\n",
|
||
"org_type 值分布:\n",
|
||
"org_type\n",
|
||
"一般 42985\n",
|
||
"天猫 2668\n",
|
||
"Name: count, dtype: int64\n",
|
||
"\n",
|
||
"【步骤1: 筛选主店过期】\n",
|
||
"警告: is_main_org 是字符串类型,尝试转换为数值\n",
|
||
"条件筛选结果数量: 15065 条\n",
|
||
"主店过期数据量 (ngvv2): 15065 条\n",
|
||
"ngvv2 中的 id_own_group 数量: 15065 个\n",
|
||
"ngvv2 中的 id_own_group 示例: ['10545055917999655906', '10545055917999678943', '10545055917999702656', '10545055917999726421', '10545055917999791008', '10545055917999907421', '10545055917999958815', '10545055917999963314', '10545055917999973061', '10545055918000062921']\n",
|
||
"\n",
|
||
"【步骤2: 筛选分店留存】\n",
|
||
"data_NGV_V2 初始数据量: 45653 条\n",
|
||
"\n",
|
||
"area_manager 唯一值数量: 16\n",
|
||
"area_manager 值分布(前10):\n",
|
||
"area_manager\n",
|
||
"肖军 10824\n",
|
||
"景东强 8408\n",
|
||
"陈庆伟 8322\n",
|
||
"张凯 8269\n",
|
||
"关磊 7028\n",
|
||
"孙玉蕾 2006\n",
|
||
"殷昊 556\n",
|
||
"王涛 161\n",
|
||
"刘伟 52\n",
|
||
" 8\n",
|
||
"Name: count, dtype: int64\n",
|
||
"\n",
|
||
"各条件筛选结果:\n",
|
||
" org_type == '一般': 42985 条\n",
|
||
" org_status == '留存': 27985 条\n",
|
||
" area_manager != '殷昊': 45097 条\n",
|
||
" area_manager != '孙玉蕾': 43647 条\n",
|
||
" is_main_org != 1: 8025 条\n",
|
||
"\n",
|
||
"所有条件合并后数据量: 4882 条\n",
|
||
"data_NGV_V2_filtered 中的 id_own_group 数量: 1564 个\n",
|
||
"data_NGV_V2_filtered 中的 id_own_group 示例: ['10545055917999659357', '10545055917999659357', '10545055917999688607', '10545055917999688607', '10545055917999688607', '10545055917999719687', '10545055917999791008', '10545055917999791008', '10545055917999995278', '10545055918000106937']\n",
|
||
"\n",
|
||
"【步骤3: 检查 id_own_group 交集】\n",
|
||
"ngvv2 中的 id_own_group 数量: 15065\n",
|
||
"data_NGV_V2_filtered 中的 id_own_group 数量: 1564\n",
|
||
"交集数量: 316\n",
|
||
"交集中的 id_own_group 示例: ['11240984669917478021', '10546172455175803787', '10546172455166018835', '10546172455220322161', '10546443563657780587', '10546172455213602644', '10546443563816611858', '11240984669917430021', '11240984669917329620', '11240984669917352563']\n",
|
||
"\n",
|
||
"【步骤4: 最终过滤】\n",
|
||
"过滤后的数据量: 468 条\n",
|
||
"\n",
|
||
"============================================================\n"
|
||
]
|
||
},
|
||
{
|
||
"name": "stderr",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"C:\\Users\\hp_z66\\AppData\\Local\\Temp\\ipykernel_14280\\4275068286.py:100: SettingWithCopyWarning: \n",
|
||
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
|
||
"Try using .loc[row_indexer,col_indexer] = value instead\n",
|
||
"\n",
|
||
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
|
||
" data_NGV_V2_filtered['exists_in_ngvv2'] = data_NGV_V2_filtered['id_own_group'].isin(ngvv2['id_own_group'])\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"# ========== 处理主店过期的情况 ==========\n",
|
||
"# 调试信息:检查数据状态\n",
|
||
"print(\"=\" * 60)\n",
|
||
"print(\"调试信息:处理主店过期情况\")\n",
|
||
"print(\"=\" * 60)\n",
|
||
"print(f\"\\n当前 data_NGV 数据量: {len(data_NGV)} 条\")\n",
|
||
"\n",
|
||
"# 检查关键字段的数据类型和唯一值\n",
|
||
"print(f\"\\n【字段检查】\")\n",
|
||
"print(f\"is_main_org 数据类型: {data_NGV['is_main_org'].dtype}\")\n",
|
||
"print(f\"is_main_org 唯一值: {sorted(data_NGV['is_main_org'].unique())}\")\n",
|
||
"print(f\"is_main_org 值分布:\\n{data_NGV['is_main_org'].value_counts()}\")\n",
|
||
"\n",
|
||
"print(f\"\\norg_status 数据类型: {data_NGV['org_status'].dtype}\")\n",
|
||
"print(f\"org_status 唯一值: {sorted(data_NGV['org_status'].unique())}\")\n",
|
||
"print(f\"org_status 值分布:\\n{data_NGV['org_status'].value_counts()}\")\n",
|
||
"\n",
|
||
"print(f\"\\norg_type 数据类型: {data_NGV['org_type'].dtype}\")\n",
|
||
"print(f\"org_type 唯一值: {sorted(data_NGV['org_type'].unique())}\")\n",
|
||
"print(f\"org_type 值分布:\\n{data_NGV['org_type'].value_counts()}\")\n",
|
||
"\n",
|
||
"# 步骤1: 筛选主店过期的情况\n",
|
||
"print(f\"\\n【步骤1: 筛选主店过期】\")\n",
|
||
"# 确保 is_main_org 是数值类型\n",
|
||
"if data_NGV['is_main_org'].dtype == 'object':\n",
|
||
" print(\"警告: is_main_org 是字符串类型,尝试转换为数值\")\n",
|
||
" data_NGV['is_main_org'] = pd.to_numeric(data_NGV['is_main_org'], errors='coerce')\n",
|
||
"\n",
|
||
"condition = (data_NGV['is_main_org'] == 1) & (data_NGV['org_status'] == '过期')\n",
|
||
"print(f\"条件筛选结果数量: {condition.sum()} 条\")\n",
|
||
"\n",
|
||
"ngvv2 = data_NGV[condition]\n",
|
||
"print(f\"主店过期数据量 (ngvv2): {len(ngvv2)} 条\")\n",
|
||
"\n",
|
||
"if len(ngvv2) > 0:\n",
|
||
" print(f\"ngvv2 中的 id_own_group 数量: {ngvv2['id_own_group'].nunique()} 个\")\n",
|
||
" print(f\"ngvv2 中的 id_own_group 示例: {ngvv2['id_own_group'].head(10).tolist()}\")\n",
|
||
"else:\n",
|
||
" print(\"⚠️ 警告: ngvv2 为空,没有主店过期的情况!\")\n",
|
||
"\n",
|
||
"# 步骤2: 检查分店留存的情况\n",
|
||
"print(f\"\\n【步骤2: 筛选分店留存】\")\n",
|
||
"# 在合并最佳值之前保存原始数据副本(重要!)\n",
|
||
"data_NGV_V2 = data_NGV.copy()\n",
|
||
"print(f\"data_NGV_V2 初始数据量: {len(data_NGV_V2)} 条\")\n",
|
||
"\n",
|
||
"# 检查 area_manager 字段\n",
|
||
"print(f\"\\narea_manager 唯一值数量: {data_NGV_V2['area_manager'].nunique()}\")\n",
|
||
"print(f\"area_manager 值分布(前10):\\n{data_NGV_V2['area_manager'].value_counts().head(10)}\")\n",
|
||
"\n",
|
||
"# 确保 is_main_org 是数值类型\n",
|
||
"if data_NGV_V2['is_main_org'].dtype == 'object':\n",
|
||
" data_NGV_V2['is_main_org'] = pd.to_numeric(data_NGV_V2['is_main_org'], errors='coerce')\n",
|
||
"\n",
|
||
"# 逐步检查每个条件\n",
|
||
"cond1 = (data_NGV_V2['org_type'] == \"一般\")\n",
|
||
"cond2 = (data_NGV_V2['org_status'] == '留存')\n",
|
||
"cond3 = (data_NGV_V2['area_manager'] != '殷昊')\n",
|
||
"cond4 = (data_NGV_V2['area_manager'] != '孙玉蕾')\n",
|
||
"cond5 = (data_NGV_V2['is_main_org'] != 1)\n",
|
||
"\n",
|
||
"print(f\"\\n各条件筛选结果:\")\n",
|
||
"print(f\" org_type == '一般': {cond1.sum()} 条\")\n",
|
||
"print(f\" org_status == '留存': {cond2.sum()} 条\")\n",
|
||
"print(f\" area_manager != '殷昊': {cond3.sum()} 条\")\n",
|
||
"print(f\" area_manager != '孙玉蕾': {cond4.sum()} 条\")\n",
|
||
"print(f\" is_main_org != 1: {cond5.sum()} 条\")\n",
|
||
"\n",
|
||
"data_NGV_V2['条件'] = cond1 & cond2 & cond3 & cond4 & cond5\n",
|
||
"data_NGV_V2_filtered = data_NGV_V2.loc[data_NGV_V2[\"条件\"]]\n",
|
||
"print(f\"\\n所有条件合并后数据量: {len(data_NGV_V2_filtered)} 条\")\n",
|
||
"\n",
|
||
"if len(data_NGV_V2_filtered) > 0:\n",
|
||
" print(f\"data_NGV_V2_filtered 中的 id_own_group 数量: {data_NGV_V2_filtered['id_own_group'].nunique()} 个\")\n",
|
||
" print(f\"data_NGV_V2_filtered 中的 id_own_group 示例: {data_NGV_V2_filtered['id_own_group'].head(10).tolist()}\")\n",
|
||
"\n",
|
||
"# 步骤3: 检查交集\n",
|
||
"print(f\"\\n【步骤3: 检查 id_own_group 交集】\")\n",
|
||
"if len(ngvv2) > 0 and len(data_NGV_V2_filtered) > 0:\n",
|
||
" ngvv2_groups = set(ngvv2['id_own_group'].unique())\n",
|
||
" v2_groups = set(data_NGV_V2_filtered['id_own_group'].unique())\n",
|
||
" intersection = ngvv2_groups & v2_groups\n",
|
||
" \n",
|
||
" print(f\"ngvv2 中的 id_own_group 数量: {len(ngvv2_groups)}\")\n",
|
||
" print(f\"data_NGV_V2_filtered 中的 id_own_group 数量: {len(v2_groups)}\")\n",
|
||
" print(f\"交集数量: {len(intersection)}\")\n",
|
||
" \n",
|
||
" if len(intersection) > 0:\n",
|
||
" print(f\"交集中的 id_own_group 示例: {list(intersection)[:10]}\")\n",
|
||
" else:\n",
|
||
" print(\"⚠️ 警告: 没有交集!这可能是问题所在。\")\n",
|
||
" print(f\"ngvv2 中的前10个 id_own_group: {list(ngvv2_groups)[:10]}\")\n",
|
||
" print(f\"data_NGV_V2_filtered 中的前10个 id_own_group: {list(v2_groups)[:10]}\")\n",
|
||
"else:\n",
|
||
" print(\"⚠️ 警告: ngvv2 或 data_NGV_V2_filtered 为空,无法检查交集\")\n",
|
||
"\n",
|
||
"# 步骤4: 过滤存在的记录\n",
|
||
"print(f\"\\n【步骤4: 最终过滤】\")\n",
|
||
"if len(ngvv2) > 0:\n",
|
||
" data_NGV_V2_filtered['exists_in_ngvv2'] = data_NGV_V2_filtered['id_own_group'].isin(ngvv2['id_own_group'])\n",
|
||
" filtered_data = data_NGV_V2_filtered[data_NGV_V2_filtered['exists_in_ngvv2']]\n",
|
||
" print(f\"过滤后的数据量: {len(filtered_data)} 条\")\n",
|
||
" \n",
|
||
" if len(filtered_data) == 0:\n",
|
||
" print(\"\\n❌ 问题诊断:\")\n",
|
||
" print(\" 过滤后数据为空,可能的原因:\")\n",
|
||
" print(\" 1. ngvv2 为空(没有主店过期的情况)\")\n",
|
||
" print(\" 2. data_NGV_V2_filtered 为空(没有满足条件的分店留存数据)\")\n",
|
||
" print(\" 3. 两者的 id_own_group 没有交集\")\n",
|
||
" print(\"\\n建议:\")\n",
|
||
" print(\" - 检查数据源是否正确\")\n",
|
||
" print(\" - 检查字段值是否匹配(注意数据类型和格式)\")\n",
|
||
" print(\" - 检查是否有主店过期但分店留存的情况\")\n",
|
||
"else:\n",
|
||
" print(\"⚠️ 警告: ngvv2 为空,无法进行过滤\")\n",
|
||
" filtered_data = pd.DataFrame() # 创建空DataFrame\n",
|
||
"\n",
|
||
"print(\"\\n\" + \"=\" * 60)\n"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 11,
|
||
"metadata": {
|
||
"ExecuteTime": {
|
||
"end_time": "2026-01-16T07:30:16.487181500Z",
|
||
"start_time": "2026-01-16T07:30:16.440099400Z"
|
||
}
|
||
},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"排序去重后数据量: 316 条\n"
|
||
]
|
||
},
|
||
{
|
||
"name": "stderr",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"C:\\Users\\hp_z66\\AppData\\Local\\Temp\\ipykernel_14280\\2835892650.py:5: SettingWithCopyWarning: \n",
|
||
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
|
||
"Try using .loc[row_indexer,col_indexer] = value instead\n",
|
||
"\n",
|
||
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
|
||
" filtered_data['sort_key'] = filtered_data['saas_edition_fmt'].map(fixed_order_map)\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"# ========== 对过滤数据进行排序和去重 ==========\n",
|
||
"fixed_order = ['皇冠版', '至尊版', '尊享版', '旗舰版', '标准版', '进阶版', '基础版', '入门版']\n",
|
||
"\n",
|
||
"fixed_order_map = {edition: index for index, edition in enumerate(fixed_order)}\n",
|
||
"filtered_data['sort_key'] = filtered_data['saas_edition_fmt'].map(fixed_order_map)\n",
|
||
"filtered_data = filtered_data.sort_values(by='sort_key').drop('sort_key', axis=1)\n",
|
||
"\n",
|
||
"result = filtered_data.drop_duplicates(subset='id_own_group', keep='first')\n",
|
||
"\n",
|
||
"print(f\"排序去重后数据量: {len(result)} 条\")\n"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"# ========== 合并主店留存数据和分店数据 ==========\n",
|
||
"data_NGV['条件'] = (data_NGV['org_type'] == \"一般\") & (data_NGV['org_status'] == '留存') & (\n",
|
||
" data_NGV['area_manager'] != '殷昊') & (\n",
|
||
" data_NGV['area_manager'] != '孙玉蕾') & (\n",
|
||
" data_NGV['is_main_org'] == 1)\n",
|
||
"data_NGV = data_NGV.loc[data_NGV[\"条件\"]]\n",
|
||
"\n",
|
||
"data_NGV = pd.concat([data_NGV, result], axis=0)\n",
|
||
"data_details = data_NGV.copy()\n",
|
||
"\n",
|
||
"# 重置索引\n",
|
||
"data_details = data_details.reset_index(drop=True)\n",
|
||
"\n",
|
||
"print(f\"合并后数据量: {len(data_details)} 条\")\n"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 13,
|
||
"metadata": {
|
||
"ExecuteTime": {
|
||
"end_time": "2026-01-16T07:30:21.600199200Z",
|
||
"start_time": "2026-01-16T07:30:20.828225500Z"
|
||
}
|
||
},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"日期计算后数据量: 9845 条\n",
|
||
"需要扩展的数据行数: 9845 条\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"# ========== 判断日期差并计算年数 ==========\n",
|
||
"# 判断A列的日期是否大于B列的日期730天,如果是的话,将B列的值设置为天数差\n",
|
||
"data_details['条件'] = data_details.apply(\n",
|
||
" lambda row: (\n",
|
||
" (pd.to_datetime(row['A']) - pd.to_datetime(row['B'])).days\n",
|
||
" if pd.to_datetime(row['A']) - pd.to_datetime(row['B']) >= pd.Timedelta(days=730)\n",
|
||
" else 0\n",
|
||
" ),\n",
|
||
" axis=1\n",
|
||
")\n",
|
||
"data_details = data_details.loc[data_details[\"条件\"] > 0]\n",
|
||
"\n",
|
||
"# 定义一个函数,用于将数字除以365并取整数\n",
|
||
"def divide_by_365(x):\n",
|
||
" if isinstance(x, (int, float)):\n",
|
||
" return int(x / 365)\n",
|
||
" else:\n",
|
||
" return x\n",
|
||
"\n",
|
||
"# 使用apply函数将divide_by_365函数应用到DataFrame的列\n",
|
||
"data_details['年'] = data_details['条件'].apply(divide_by_365)\n",
|
||
"\n",
|
||
"# 重置索引\n",
|
||
"data_details = data_details.reset_index(drop=True)\n",
|
||
"\n",
|
||
"print(f\"日期计算后数据量: {len(data_details)} 条\")\n",
|
||
"print(f\"需要扩展的数据行数: {len(data_details[data_details['年'] > 1])} 条\")\n"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"# ========== 扩展数据:根据年数复制行并修改日期 ==========\n",
|
||
"# 创建一个新的空的DataFrame\n",
|
||
"new_df = pd.DataFrame()\n",
|
||
"\n",
|
||
"# 遍历原始DataFrame的每一行\n",
|
||
"for index, row in data_details.iterrows():\n",
|
||
" # 根据年数来决定复制的次数\n",
|
||
" if row[\"renew_date\"] != \"2024-02-29\":\n",
|
||
" for i_new in range(1, row['年']):\n",
|
||
" # 修改日期\n",
|
||
" row_new = row.copy()\n",
|
||
" c = row_new[\"renew_date\"]\n",
|
||
" date_obj = datetime.strptime(c, \"%Y-%m-%d\")\n",
|
||
" new_year = date_obj.year + i_new\n",
|
||
" new_date_obj = date_obj.replace(year=new_year)\n",
|
||
" new_c = new_date_obj.strftime(\"%Y-%m-%d\")\n",
|
||
" row_new[\"renew_date\"] = new_c\n",
|
||
" # 将当前行添加到新的DataFrame中\n",
|
||
" new_df = pd.concat([new_df, pd.DataFrame([row_new])], ignore_index=True)\n",
|
||
"\n",
|
||
"print(f\"扩展后的新数据量: {len(new_df)} 条\")\n"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"metadata": {
|
||
"ExecuteTime": {
|
||
"end_time": "2026-01-16T07:39:28.813848800Z",
|
||
"start_time": "2026-01-16T07:39:28.255902200Z"
|
||
}
|
||
},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"合并后总数据量: 39599 条\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"# ========== 合并原始数据和扩展数据 ==========\n",
|
||
"# 合并两个DataFrame\n",
|
||
"merged_df = pd.concat([data_NGV, new_df], axis=0, ignore_index=True)\n",
|
||
"data_details = merged_df.copy() # 替换名称\n",
|
||
"\n",
|
||
"data_details_not_null = data_details[data_details['renew_date'].notnull()]\n",
|
||
"# 重置索引\n",
|
||
"data_details_not_null = data_details_not_null.reset_index(drop=True)\n",
|
||
"data_details = data_details_not_null.copy() # 替换名称 v2\n",
|
||
"\n",
|
||
"print(f\"合并后总数据量: {len(data_details)} 条\")\n"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"# ========== 最终过滤:排除创建时间等于续约时间的记录 ==========\n",
|
||
"data_details['saas_create_time'] = data_details['saas_create_time'].str[:4] # 截取前4位(年份)\n",
|
||
"data_details['renew_date_new'] = data_details['renew_date'].str[:4] # 截取前4位(年份)\n",
|
||
"data_details = data_details[\n",
|
||
" data_details['saas_create_time'] != data_details['renew_date_new']] # 过滤掉等于renew_date的行\n",
|
||
"\n",
|
||
"data_details = data_details.reset_index(drop=True)\n",
|
||
"\n",
|
||
"logger.info(f\"过滤后的数据长度为: {len(data_details)}\")\n",
|
||
"print(f\"\\n========== 数据处理完成 ==========\")\n",
|
||
"print(f\"最终数据量: {len(data_details)} 条\")\n",
|
||
"print(f\"处理完成时间: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\")\n"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"## 数据验证和检查\n",
|
||
"\n",
|
||
"可以在这里添加数据验证代码,检查处理结果的正确性\n"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"metadata": {
|
||
"ExecuteTime": {
|
||
"start_time": "2026-01-16T09:00:49.656903800Z"
|
||
}
|
||
},
|
||
"outputs": [],
|
||
"source": [
|
||
"# ========== 数据验证 ==========\n",
|
||
"# 查看数据基本信息\n",
|
||
"print(\"数据基本信息:\")\n",
|
||
"print(f\"数据形状: {data_details.shape}\")\n",
|
||
"print(f\"\\n数据列名:\")\n",
|
||
"print(data_details.columns.tolist())\n",
|
||
"\n",
|
||
"# 查看前几行数据\n",
|
||
"print(\"\\n前5行数据:\")\n",
|
||
"print(data_details.head())\n",
|
||
"\n",
|
||
"# 检查关键字段的数据分布\n",
|
||
"if 'saas_edition_fmt' in data_details.columns:\n",
|
||
" print(\"\\n版本分布:\")\n",
|
||
" print(data_details['saas_edition_fmt'].value_counts())\n",
|
||
"\n",
|
||
"if 'org_status' in data_details.columns:\n",
|
||
" print(\"\\n组织状态分布:\")\n",
|
||
" print(data_details['org_status'].value_counts())\n",
|
||
"\n",
|
||
"# 可以保存到CSV文件进行进一步检查\n",
|
||
"# data_details.to_csv(\"处理后的数据.csv\", index=False, encoding='utf-8-sig')\n",
|
||
"# print(\"\\n数据已保存到: 处理后的数据.csv\")\n"
|
||
]
|
||
}
|
||
],
|
||
"metadata": {
|
||
"language_info": {
|
||
"name": "python"
|
||
}
|
||
},
|
||
"nbformat": 4,
|
||
"nbformat_minor": 2
|
||
}
|