Files
saas/test/数据库验证脚本_数据处理.ipynb
T
2026-04-09 09:53:47 +08:00

900 lines
36 KiB
Plaintext
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# 数据库验证脚本 - 数据处理部分\n",
"\n",
"本notebook用于调试和验证数据库验证脚本的数据处理逻辑(260-425行)\n",
"\n",
"## 使用说明\n",
"1. 先执行数据加载部分(第2个单元格),这部分比较耗时\n",
"2. 数据加载完成后,再执行后续的数据处理单元格\n",
"3. 每个单元格都可以单独执行和调试\n"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"ExecuteTime": {
"end_time": "2026-01-16T06:53:03.604128900Z",
"start_time": "2026-01-16T06:53:01.840121200Z"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"库导入完成\n",
"项目根目录: D:\\Idea Project\\SaaS_V1.7\n"
]
}
],
"source": [
"# 导入必要的库\n",
"import os\n",
"import sys\n",
"import pandas as pd\n",
"import datetime\n",
"from datetime import datetime, timedelta\n",
"import re\n",
"\n",
"# 添加项目根目录到路径(notebook文件在test目录下,需要添加父目录)\n",
"current_dir = os.getcwd()\n",
"# 如果当前目录是test,则添加父目录;否则添加当前目录\n",
"if os.path.basename(current_dir) == 'test':\n",
" project_root = os.path.dirname(current_dir)\n",
"else:\n",
" project_root = current_dir\n",
"sys.path.insert(0, project_root)\n",
"\n",
"from api import API\n",
"from back_ground_module import CommonModule\n",
"from log_config import configure_task_logger, configure_error_task_logger\n",
"\n",
"# 初始化API和CommonModule\n",
"api_instance = API()\n",
"common_module = CommonModule()\n",
"\n",
"# 获取日志记录器\n",
"logger = configure_task_logger()\n",
"error_task_logger = configure_error_task_logger()\n",
"\n",
"print(\"库导入完成\")\n",
"print(f\"项目根目录: {project_root}\")\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 步骤1: 数据加载(耗时操作,可单独执行)\n",
"\n",
"这部分会加载所有必要的数据,包括:\n",
"- 省市区人员关系表\n",
"- 员工ID列表\n",
"- 权限表\n",
"- NGV数据列表\n",
"- 服务提醒数据\n",
"- 智能检测数据\n",
"- 功能使用情况表\n",
"- 保单识别表\n",
"- 私域/公域小程序数据\n",
"- 异业合作数据\n",
"- 短信数据\n",
"- 多公司过滤表\n",
"- NGV明细数据(从数据库获取)\n",
"- 节假日列表\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# ========== 数据加载部分 ==========\n",
"# 这部分比较耗时,可以单独执行\n",
"\n",
"print(\"开始加载数据...\")\n",
"\n",
"# 省市区人员关系表\n",
"payload = {\"api_key\": \"675b900991ad2491c69389ca\", \"entry_id\": \"676512ac3e54dc3159460c0a\"}\n",
"json_dict = api_instance.entry_data_list(payload)\n",
"if json_dict and \"data\" in json_dict:\n",
" json_list = json_dict.get(\"data\")\n",
"else:\n",
" print(\"加载省市区人员关系表失败\")\n",
" json_list = []\n",
"print(f\"省市区人员关系表: {len(json_list)} 条\")\n",
"\n",
"# 获取简道云员工id\n",
"payload = {\"api_key\": \"6694d3c4fcb69ca9a111a6c4\", \"entry_id\": \"6769204a1902c9341340a1bc\"}\n",
"staff_id = api_instance.entry_data_list(payload)\n",
"staff_id_list = staff_id.get(\"data\")\n",
"print(f\"员工ID列表: {len(staff_id_list)} 条\")\n",
"\n",
"# 获取权限表信息\n",
"payload = {\"api_key\": \"675b900991ad2491c69389ca\", \"entry_id\": \"675b96c14e839f90fef1647c\"}\n",
"permissions_table = api_instance.entry_data_list(payload).get(\"data\")\n",
"print(f\"权限表: {len(permissions_table)} 条\")\n",
"\n",
"# 获取NGV数据\n",
"payload = {\"api_key\": \"675b900991ad2491c69389ca\", \"entry_id\": \"675bb02bd2d53c2034c665e4\"}\n",
"NGV_data_list = api_instance.entry_data_list(payload).get(\"data\")\n",
"print(f\"NGV数据列表: {len(NGV_data_list)} 条\")\n",
"\n",
"# 获取服务提醒-数据支持表单数据\n",
"payload = {\"api_key\": \"675b900991ad2491c69389ca\", \"entry_id\": \"676bb7bda3029720f1083e99\"}\n",
"service_remind = api_instance.entry_data_list(payload).get(\"data\")\n",
"print(f\"服务提醒数据: {len(service_remind)} 条\")\n",
"\n",
"# 获取智能检测-数据支持表单数据\n",
"payload = {\"api_key\": \"675b900991ad2491c69389ca\", \"entry_id\": \"676bb99649ab3ac975af6e39\"}\n",
"Smart_detection = api_instance.entry_data_list(payload).get(\"data\")\n",
"print(f\"智能检测数据: {len(Smart_detection)} 条\")\n",
"\n",
"# 获取功能使用情况表\n",
"payload = {\"api_key\": \"675b900991ad2491c69389ca\", \"entry_id\": \"6763bbf657bd8fb76fcb41b2\"}\n",
"get_feature_usage = api_instance.entry_data_list(payload).get(\"data\", [])\n",
"print(f\"功能使用情况表: {len(get_feature_usage)} 条\")\n",
"\n",
"# 获取保单识别表\n",
"payload = {\"api_key\": \"675b900991ad2491c69389ca\", \"entry_id\": \"6773a60d30ed87ff9f68d3c5\"}\n",
"policy_recognition = api_instance.entry_data_list(payload).get(\"data\")\n",
"widget_list = [item['_widget_1735632397600'] for item in policy_recognition]\n",
"print(f\"保单识别表: {len(policy_recognition)} 条\")\n",
"\n",
"# 获取私域小程序-数据支持表单数据\n",
"payload = {\"api_key\": \"675b900991ad2491c69389ca\", \"entry_id\": \"67e0f0fae622896749ba5087\"}\n",
"private_domain = api_instance.entry_data_list(payload).get(\"data\", [])\n",
"print(f\"私域小程序数据: {len(private_domain)} 条\")\n",
"\n",
"# 获取公域小程序-数据支持表单数据\n",
"payload = {\"api_key\": \"675b900991ad2491c69389ca\", \"entry_id\": \"67e0c702c8f603b997980999\"}\n",
"public_domain = api_instance.entry_data_list(payload).get(\"data\", [])\n",
"public_domain_list = [item['_widget_1742784257506'] for item in public_domain]\n",
"print(f\"公域小程序数据: {len(public_domain)} 条\")\n",
"\n",
"# 获取异业合作-数据支持表单数据\n",
"payload = {\"api_key\": \"675b900991ad2491c69389ca\", \"entry_id\": \"67e24fdd8dfcfa918e17c30b\"}\n",
"different_industries = api_instance.entry_data_list(payload).get(\"data\", [])\n",
"different_industries_list = [item['_widget_1742884829007'] for item in different_industries]\n",
"print(f\"异业合作数据: {len(different_industries)} 条\")\n",
"\n",
"# 获取短信-数据支持表单数据\n",
"payload = {\"api_key\": \"675b900991ad2491c69389ca\", \"entry_id\": \"67e5107198ba1b20d5df3974\"}\n",
"groupnotification = api_instance.entry_data_list(payload).get(\"data\", [])\n",
"print(f\"短信数据: {len(groupnotification)} 条\")\n",
"\n",
"# 获取多公司过滤表\n",
"payload = {\"api_key\": \"675b900991ad2491c69389ca\", \"entry_id\": \"689bf5f8ba88a28cb0679ec9\"}\n",
"get_filter_company_list = api_instance.entry_data_list(payload).get(\"data\", [])\n",
"print(f\"多公司过滤表: {len(get_filter_company_list)} 条\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"\n",
"# 获取节假日列表\n",
"date_list = common_module.get_holiday_list()\n",
"print(f\"节假日列表: {len(date_list)} 个日期\")\n",
"\n",
"# 获取NGV明细数据(从数据库获取,比较耗时)\n",
"print(\"\\n开始从数据库获取NGV明细数据(这可能需要一些时间)...\")\n",
"data_NGV = common_module.get_ngv_details(days_back=1)\n",
"print(f\"NGV明细数据: {len(data_NGV)} 条\")\n",
"print(f\"NGV明细数据列数: {len(data_NGV.columns)}\")\n",
"\n",
"# 构建省市区索引\n",
"def build_index(json_list):\n",
" index = {}\n",
" for json_item in json_list:\n",
" try:\n",
" key = (json_item['_widget_1734677164861'], json_item['_widget_1734677164862'],\n",
" json_item['_widget_1734677164863']) # 省市区\n",
" if '_widget_1734677164871' not in json_item: # 日常回访客服\n",
" raise KeyError(\"缺少 '日常回访客服' 键\")\n",
" index[key] = json_item\n",
" except KeyError as e:\n",
" print(f\"警告:{e},跳过该条记录\")\n",
" continue\n",
" return index\n",
"\n",
"index = build_index(json_list)\n",
"print(f\"省市区索引构建完成: {len(index)} 条\")\n",
"\n",
"print(\"\\n========== 数据加载完成 ==========\")\n",
"print(f\"数据加载时间: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\")\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 步骤2: 数据处理逻辑(260-425行)\n",
"\n",
"这部分包含主要的数据处理逻辑,包括:\n",
"- 获取多公司过滤公司id\n",
"- 数据清洗和转换\n",
"- 优先级排序\n",
"- 数据过滤和合并\n",
"- 日期计算和扩展\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# ========== 获取多公司过滤公司id ==========\n",
"logger.info(\"获取多公司过滤公司id\")\n",
"all_filter_company_list = [] # 获取多公司过滤公司id\n",
"for company in get_filter_company_list:\n",
" company_list = company.get(\"_widget_1755052002491\")\n",
" if company_list:\n",
" for company_item in company_list:\n",
" if company_item.get(\"_widget_1755052002496\") == \"否\":\n",
" all_filter_company_list.append(company_item.get(\"_widget_1755052002495\"))\n",
"logger.info(f\"过滤公司条数:{len(all_filter_company_list)}\")\n",
"print(f\"过滤公司条数: {len(all_filter_company_list)}\")\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"ExecuteTime": {
"end_time": "2026-01-16T07:27:13.175060200Z",
"start_time": "2026-01-16T07:27:09.857076900Z"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"数据预处理完成,当前数据量: 45672 条\n",
"数据列数: 143\n"
]
}
],
"source": [
"# ========== 数据预处理:日期转换和数据清洗 ==========\n",
"# 将A列和B列的日期字符串转换为日期格式\n",
"data_NGV = data_NGV.copy()\n",
"data_NGV['A'] = pd.to_datetime(data_NGV['expiry_time'])\n",
"data_NGV['B'] = pd.to_datetime(data_NGV['renew_date'])\n",
"\n",
"def replace_values(series):\n",
" # 使用条件判断来进行替换\n",
" return series.apply(lambda x: '' if pd.isna(x) or x in ['NA', 'None', ''] else x)\n",
"\n",
"# 处理字符串数据并显式指定数据类型\n",
"data_NGV = data_NGV.apply(replace_values)\n",
"\n",
"print(f\"数据预处理完成,当前数据量: {len(data_NGV)} 条\")\n",
"print(f\"数据列数: {len(data_NGV.columns)}\")\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"ExecuteTime": {
"end_time": "2026-01-16T07:28:01.298494800Z",
"start_time": "2026-01-16T07:28:01.106113700Z"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"过滤多公司后数据量: 45653 条\n"
]
}
],
"source": [
"# ========== 过滤多公司 ==========\n",
"# 针对公司主店过期,取公司最高等级版本派发\n",
"# 过滤多公司\n",
"data_NGV = data_NGV[~data_NGV['id_own_group'].isin(all_filter_company_list)]\n",
"print(f\"过滤多公司后数据量: {len(data_NGV)} 条\")\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"ExecuteTime": {
"end_time": "2026-01-16T07:28:04.235079300Z",
"start_time": "2026-01-16T07:28:04.185820400Z"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"优先级映射完成\n"
]
}
],
"source": [
"# ========== 定义优先级顺序和创建映射字典 ==========\n",
"# 定义优先级顺序\n",
"edition_order = ['皇冠版', '至尊版', '尊享版', '旗舰版', '标准版', '进阶版', '基础版', '入门版']\n",
"customer_type_order = [\"F\", \"E\", \"D\", \"C\", \"B\", \"A\"] # 索引越小优先级越高\n",
"group_grade_order = ['全国KAFMVP', '区域KAMVP', '重要客户(SVIP', '普通客户(VIP']\n",
"\n",
"# 创建映射字典,并为不在列表中的值设置默认值\n",
"edition_map = {edition: idx for idx, edition in enumerate(edition_order)}\n",
"customer_type_map = {ctype: idx for idx, ctype in enumerate(customer_type_order)}\n",
"group_grade_map = {grade: idx for idx, grade in enumerate(group_grade_order)}\n",
"\n",
"# 添加用于排序的新列,并处理不在映射字典中的值\n",
"data_NGV['edition_rank'] = data_NGV['saas_edition_fmt'].map(edition_map).fillna(0).astype(int) # 缺失值用最高优先级填充\n",
"data_NGV['customer_type_rank'] = data_NGV['saas_customer_type'].map(customer_type_map).fillna(0).astype(int)\n",
"data_NGV['group_grade_rank'] = data_NGV['group_grade'].map(group_grade_map).fillna(0).astype(int)\n",
"\n",
"print(\"优先级映射完成\")\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"ExecuteTime": {
"end_time": "2026-01-16T07:28:08.790102700Z",
"start_time": "2026-01-16T07:28:08.399298Z"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"最佳值查找完成\n"
]
}
],
"source": [
"# ========== 找到每组中的最佳值 ==========\n",
"# 找到每组中 edition_rank 最小值对应的行\n",
"best_edition_idx = data_NGV.groupby('id_own_group')['edition_rank'].idxmin()\n",
"best_edition_rows = data_NGV.loc[best_edition_idx]\n",
"best_edition_rows['max_saas_edition'] = best_edition_rows['saas_edition_fmt']\n",
"\n",
"# 找到每组中 customer_type_rank 最小值对应的行\n",
"best_customer_type_idx = data_NGV.groupby('id_own_group')['customer_type_rank'].idxmin()\n",
"best_customer_type_rows = data_NGV.loc[best_customer_type_idx]\n",
"best_customer_type_rows['max_saas_customer_type'] = best_customer_type_rows['customer_type_rank'].apply(\n",
" lambda x: customer_type_order[x])\n",
"\n",
"# 找到每组中 group_grade_rank 最小值对应的行\n",
"best_group_grade_idx = data_NGV.groupby('id_own_group')['group_grade_rank'].idxmin()\n",
"best_group_grade_rows = data_NGV.loc[best_group_grade_idx]\n",
"best_group_grade_rows['max_group_grade'] = best_group_grade_rows['group_grade']\n",
"\n",
"print(\"最佳值查找完成\")\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"ExecuteTime": {
"end_time": "2026-01-16T07:28:11.374632Z",
"start_time": "2026-01-16T07:28:11.141730700Z"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"最佳值合并完成,当前数据量: 45653 条\n"
]
}
],
"source": [
"# ========== 合并最佳值回到原数据集 ==========\n",
"# 合并最佳值回到原数据集\n",
"best_values = (\n",
" best_edition_rows[['id_own_group', 'max_saas_edition']]\n",
" .merge(best_customer_type_rows[['id_own_group', 'max_saas_customer_type']], on='id_own_group',\n",
" how='outer')\n",
" .merge(best_group_grade_rows[['id_own_group', 'max_group_grade']], on='id_own_group', how='outer')\n",
")\n",
"\n",
"# 将最佳值合并回原数据集\n",
"data_NGV = data_NGV.merge(best_values, on='id_own_group', how='left')\n",
"\n",
"print(f\"最佳值合并完成,当前数据量: {len(data_NGV)} 条\")\n"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {
"ExecuteTime": {
"end_time": "2026-01-16T07:30:06.358604500Z",
"start_time": "2026-01-16T07:30:05.752861600Z"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"============================================================\n",
"调试信息:处理主店过期情况\n",
"============================================================\n",
"\n",
"当前 data_NGV 数据量: 45653 条\n",
"\n",
"【字段检查】\n",
"is_main_org 数据类型: object\n",
"is_main_org 唯一值: ['0', '1']\n",
"is_main_org 值分布:\n",
"is_main_org\n",
"1 37628\n",
"0 8025\n",
"Name: count, dtype: int64\n",
"\n",
"org_status 数据类型: object\n",
"org_status 唯一值: ['留存', '过期']\n",
"org_status 值分布:\n",
"org_status\n",
"留存 27985\n",
"过期 17668\n",
"Name: count, dtype: int64\n",
"\n",
"org_type 数据类型: object\n",
"org_type 唯一值: ['一般', '天猫']\n",
"org_type 值分布:\n",
"org_type\n",
"一般 42985\n",
"天猫 2668\n",
"Name: count, dtype: int64\n",
"\n",
"【步骤1: 筛选主店过期】\n",
"警告: is_main_org 是字符串类型,尝试转换为数值\n",
"条件筛选结果数量: 15065 条\n",
"主店过期数据量 (ngvv2): 15065 条\n",
"ngvv2 中的 id_own_group 数量: 15065 个\n",
"ngvv2 中的 id_own_group 示例: ['10545055917999655906', '10545055917999678943', '10545055917999702656', '10545055917999726421', '10545055917999791008', '10545055917999907421', '10545055917999958815', '10545055917999963314', '10545055917999973061', '10545055918000062921']\n",
"\n",
"【步骤2: 筛选分店留存】\n",
"data_NGV_V2 初始数据量: 45653 条\n",
"\n",
"area_manager 唯一值数量: 16\n",
"area_manager 值分布(前10:\n",
"area_manager\n",
"肖军 10824\n",
"景东强 8408\n",
"陈庆伟 8322\n",
"张凯 8269\n",
"关磊 7028\n",
"孙玉蕾 2006\n",
"殷昊 556\n",
"王涛 161\n",
"刘伟 52\n",
" 8\n",
"Name: count, dtype: int64\n",
"\n",
"各条件筛选结果:\n",
" org_type == '一般': 42985 条\n",
" org_status == '留存': 27985 条\n",
" area_manager != '殷昊': 45097 条\n",
" area_manager != '孙玉蕾': 43647 条\n",
" is_main_org != 1: 8025 条\n",
"\n",
"所有条件合并后数据量: 4882 条\n",
"data_NGV_V2_filtered 中的 id_own_group 数量: 1564 个\n",
"data_NGV_V2_filtered 中的 id_own_group 示例: ['10545055917999659357', '10545055917999659357', '10545055917999688607', '10545055917999688607', '10545055917999688607', '10545055917999719687', '10545055917999791008', '10545055917999791008', '10545055917999995278', '10545055918000106937']\n",
"\n",
"【步骤3: 检查 id_own_group 交集】\n",
"ngvv2 中的 id_own_group 数量: 15065\n",
"data_NGV_V2_filtered 中的 id_own_group 数量: 1564\n",
"交集数量: 316\n",
"交集中的 id_own_group 示例: ['11240984669917478021', '10546172455175803787', '10546172455166018835', '10546172455220322161', '10546443563657780587', '10546172455213602644', '10546443563816611858', '11240984669917430021', '11240984669917329620', '11240984669917352563']\n",
"\n",
"【步骤4: 最终过滤】\n",
"过滤后的数据量: 468 条\n",
"\n",
"============================================================\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"C:\\Users\\hp_z66\\AppData\\Local\\Temp\\ipykernel_14280\\4275068286.py:100: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
" data_NGV_V2_filtered['exists_in_ngvv2'] = data_NGV_V2_filtered['id_own_group'].isin(ngvv2['id_own_group'])\n"
]
}
],
"source": [
"# ========== 处理主店过期的情况 ==========\n",
"# 调试信息:检查数据状态\n",
"print(\"=\" * 60)\n",
"print(\"调试信息:处理主店过期情况\")\n",
"print(\"=\" * 60)\n",
"print(f\"\\n当前 data_NGV 数据量: {len(data_NGV)} 条\")\n",
"\n",
"# 检查关键字段的数据类型和唯一值\n",
"print(f\"\\n【字段检查】\")\n",
"print(f\"is_main_org 数据类型: {data_NGV['is_main_org'].dtype}\")\n",
"print(f\"is_main_org 唯一值: {sorted(data_NGV['is_main_org'].unique())}\")\n",
"print(f\"is_main_org 值分布:\\n{data_NGV['is_main_org'].value_counts()}\")\n",
"\n",
"print(f\"\\norg_status 数据类型: {data_NGV['org_status'].dtype}\")\n",
"print(f\"org_status 唯一值: {sorted(data_NGV['org_status'].unique())}\")\n",
"print(f\"org_status 值分布:\\n{data_NGV['org_status'].value_counts()}\")\n",
"\n",
"print(f\"\\norg_type 数据类型: {data_NGV['org_type'].dtype}\")\n",
"print(f\"org_type 唯一值: {sorted(data_NGV['org_type'].unique())}\")\n",
"print(f\"org_type 值分布:\\n{data_NGV['org_type'].value_counts()}\")\n",
"\n",
"# 步骤1: 筛选主店过期的情况\n",
"print(f\"\\n【步骤1: 筛选主店过期】\")\n",
"# 确保 is_main_org 是数值类型\n",
"if data_NGV['is_main_org'].dtype == 'object':\n",
" print(\"警告: is_main_org 是字符串类型,尝试转换为数值\")\n",
" data_NGV['is_main_org'] = pd.to_numeric(data_NGV['is_main_org'], errors='coerce')\n",
"\n",
"condition = (data_NGV['is_main_org'] == 1) & (data_NGV['org_status'] == '过期')\n",
"print(f\"条件筛选结果数量: {condition.sum()} 条\")\n",
"\n",
"ngvv2 = data_NGV[condition]\n",
"print(f\"主店过期数据量 (ngvv2): {len(ngvv2)} 条\")\n",
"\n",
"if len(ngvv2) > 0:\n",
" print(f\"ngvv2 中的 id_own_group 数量: {ngvv2['id_own_group'].nunique()} 个\")\n",
" print(f\"ngvv2 中的 id_own_group 示例: {ngvv2['id_own_group'].head(10).tolist()}\")\n",
"else:\n",
" print(\"⚠️ 警告: ngvv2 为空,没有主店过期的情况!\")\n",
"\n",
"# 步骤2: 检查分店留存的情况\n",
"print(f\"\\n【步骤2: 筛选分店留存】\")\n",
"# 在合并最佳值之前保存原始数据副本(重要!)\n",
"data_NGV_V2 = data_NGV.copy()\n",
"print(f\"data_NGV_V2 初始数据量: {len(data_NGV_V2)} 条\")\n",
"\n",
"# 检查 area_manager 字段\n",
"print(f\"\\narea_manager 唯一值数量: {data_NGV_V2['area_manager'].nunique()}\")\n",
"print(f\"area_manager 值分布(前10:\\n{data_NGV_V2['area_manager'].value_counts().head(10)}\")\n",
"\n",
"# 确保 is_main_org 是数值类型\n",
"if data_NGV_V2['is_main_org'].dtype == 'object':\n",
" data_NGV_V2['is_main_org'] = pd.to_numeric(data_NGV_V2['is_main_org'], errors='coerce')\n",
"\n",
"# 逐步检查每个条件\n",
"cond1 = (data_NGV_V2['org_type'] == \"一般\")\n",
"cond2 = (data_NGV_V2['org_status'] == '留存')\n",
"cond3 = (data_NGV_V2['area_manager'] != '殷昊')\n",
"cond4 = (data_NGV_V2['area_manager'] != '孙玉蕾')\n",
"cond5 = (data_NGV_V2['is_main_org'] != 1)\n",
"\n",
"print(f\"\\n各条件筛选结果:\")\n",
"print(f\" org_type == '一般': {cond1.sum()} 条\")\n",
"print(f\" org_status == '留存': {cond2.sum()} 条\")\n",
"print(f\" area_manager != '殷昊': {cond3.sum()} 条\")\n",
"print(f\" area_manager != '孙玉蕾': {cond4.sum()} 条\")\n",
"print(f\" is_main_org != 1: {cond5.sum()} 条\")\n",
"\n",
"data_NGV_V2['条件'] = cond1 & cond2 & cond3 & cond4 & cond5\n",
"data_NGV_V2_filtered = data_NGV_V2.loc[data_NGV_V2[\"条件\"]]\n",
"print(f\"\\n所有条件合并后数据量: {len(data_NGV_V2_filtered)} 条\")\n",
"\n",
"if len(data_NGV_V2_filtered) > 0:\n",
" print(f\"data_NGV_V2_filtered 中的 id_own_group 数量: {data_NGV_V2_filtered['id_own_group'].nunique()} 个\")\n",
" print(f\"data_NGV_V2_filtered 中的 id_own_group 示例: {data_NGV_V2_filtered['id_own_group'].head(10).tolist()}\")\n",
"\n",
"# 步骤3: 检查交集\n",
"print(f\"\\n【步骤3: 检查 id_own_group 交集】\")\n",
"if len(ngvv2) > 0 and len(data_NGV_V2_filtered) > 0:\n",
" ngvv2_groups = set(ngvv2['id_own_group'].unique())\n",
" v2_groups = set(data_NGV_V2_filtered['id_own_group'].unique())\n",
" intersection = ngvv2_groups & v2_groups\n",
" \n",
" print(f\"ngvv2 中的 id_own_group 数量: {len(ngvv2_groups)}\")\n",
" print(f\"data_NGV_V2_filtered 中的 id_own_group 数量: {len(v2_groups)}\")\n",
" print(f\"交集数量: {len(intersection)}\")\n",
" \n",
" if len(intersection) > 0:\n",
" print(f\"交集中的 id_own_group 示例: {list(intersection)[:10]}\")\n",
" else:\n",
" print(\"⚠️ 警告: 没有交集!这可能是问题所在。\")\n",
" print(f\"ngvv2 中的前10个 id_own_group: {list(ngvv2_groups)[:10]}\")\n",
" print(f\"data_NGV_V2_filtered 中的前10个 id_own_group: {list(v2_groups)[:10]}\")\n",
"else:\n",
" print(\"⚠️ 警告: ngvv2 或 data_NGV_V2_filtered 为空,无法检查交集\")\n",
"\n",
"# 步骤4: 过滤存在的记录\n",
"print(f\"\\n【步骤4: 最终过滤】\")\n",
"if len(ngvv2) > 0:\n",
" data_NGV_V2_filtered['exists_in_ngvv2'] = data_NGV_V2_filtered['id_own_group'].isin(ngvv2['id_own_group'])\n",
" filtered_data = data_NGV_V2_filtered[data_NGV_V2_filtered['exists_in_ngvv2']]\n",
" print(f\"过滤后的数据量: {len(filtered_data)} 条\")\n",
" \n",
" if len(filtered_data) == 0:\n",
" print(\"\\n❌ 问题诊断:\")\n",
" print(\" 过滤后数据为空,可能的原因:\")\n",
" print(\" 1. ngvv2 为空(没有主店过期的情况)\")\n",
" print(\" 2. data_NGV_V2_filtered 为空(没有满足条件的分店留存数据)\")\n",
" print(\" 3. 两者的 id_own_group 没有交集\")\n",
" print(\"\\n建议:\")\n",
" print(\" - 检查数据源是否正确\")\n",
" print(\" - 检查字段值是否匹配(注意数据类型和格式)\")\n",
" print(\" - 检查是否有主店过期但分店留存的情况\")\n",
"else:\n",
" print(\"⚠️ 警告: ngvv2 为空,无法进行过滤\")\n",
" filtered_data = pd.DataFrame() # 创建空DataFrame\n",
"\n",
"print(\"\\n\" + \"=\" * 60)\n"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {
"ExecuteTime": {
"end_time": "2026-01-16T07:30:16.487181500Z",
"start_time": "2026-01-16T07:30:16.440099400Z"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"排序去重后数据量: 316 条\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"C:\\Users\\hp_z66\\AppData\\Local\\Temp\\ipykernel_14280\\2835892650.py:5: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
" filtered_data['sort_key'] = filtered_data['saas_edition_fmt'].map(fixed_order_map)\n"
]
}
],
"source": [
"# ========== 对过滤数据进行排序和去重 ==========\n",
"fixed_order = ['皇冠版', '至尊版', '尊享版', '旗舰版', '标准版', '进阶版', '基础版', '入门版']\n",
"\n",
"fixed_order_map = {edition: index for index, edition in enumerate(fixed_order)}\n",
"filtered_data['sort_key'] = filtered_data['saas_edition_fmt'].map(fixed_order_map)\n",
"filtered_data = filtered_data.sort_values(by='sort_key').drop('sort_key', axis=1)\n",
"\n",
"result = filtered_data.drop_duplicates(subset='id_own_group', keep='first')\n",
"\n",
"print(f\"排序去重后数据量: {len(result)} 条\")\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# ========== 合并主店留存数据和分店数据 ==========\n",
"data_NGV['条件'] = (data_NGV['org_type'] == \"一般\") & (data_NGV['org_status'] == '留存') & (\n",
" data_NGV['area_manager'] != '殷昊') & (\n",
" data_NGV['area_manager'] != '孙玉蕾') & (\n",
" data_NGV['is_main_org'] == 1)\n",
"data_NGV = data_NGV.loc[data_NGV[\"条件\"]]\n",
"\n",
"data_NGV = pd.concat([data_NGV, result], axis=0)\n",
"data_details = data_NGV.copy()\n",
"\n",
"# 重置索引\n",
"data_details = data_details.reset_index(drop=True)\n",
"\n",
"print(f\"合并后数据量: {len(data_details)} 条\")\n"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {
"ExecuteTime": {
"end_time": "2026-01-16T07:30:21.600199200Z",
"start_time": "2026-01-16T07:30:20.828225500Z"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"日期计算后数据量: 9845 条\n",
"需要扩展的数据行数: 9845 条\n"
]
}
],
"source": [
"# ========== 判断日期差并计算年数 ==========\n",
"# 判断A列的日期是否大于B列的日期730天,如果是的话,将B列的值设置为天数差\n",
"data_details['条件'] = data_details.apply(\n",
" lambda row: (\n",
" (pd.to_datetime(row['A']) - pd.to_datetime(row['B'])).days\n",
" if pd.to_datetime(row['A']) - pd.to_datetime(row['B']) >= pd.Timedelta(days=730)\n",
" else 0\n",
" ),\n",
" axis=1\n",
")\n",
"data_details = data_details.loc[data_details[\"条件\"] > 0]\n",
"\n",
"# 定义一个函数,用于将数字除以365并取整数\n",
"def divide_by_365(x):\n",
" if isinstance(x, (int, float)):\n",
" return int(x / 365)\n",
" else:\n",
" return x\n",
"\n",
"# 使用apply函数将divide_by_365函数应用到DataFrame的列\n",
"data_details['年'] = data_details['条件'].apply(divide_by_365)\n",
"\n",
"# 重置索引\n",
"data_details = data_details.reset_index(drop=True)\n",
"\n",
"print(f\"日期计算后数据量: {len(data_details)} 条\")\n",
"print(f\"需要扩展的数据行数: {len(data_details[data_details['年'] > 1])} 条\")\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# ========== 扩展数据:根据年数复制行并修改日期 ==========\n",
"# 创建一个新的空的DataFrame\n",
"new_df = pd.DataFrame()\n",
"\n",
"# 遍历原始DataFrame的每一行\n",
"for index, row in data_details.iterrows():\n",
" # 根据年数来决定复制的次数\n",
" if row[\"renew_date\"] != \"2024-02-29\":\n",
" for i_new in range(1, row['年']):\n",
" # 修改日期\n",
" row_new = row.copy()\n",
" c = row_new[\"renew_date\"]\n",
" date_obj = datetime.strptime(c, \"%Y-%m-%d\")\n",
" new_year = date_obj.year + i_new\n",
" new_date_obj = date_obj.replace(year=new_year)\n",
" new_c = new_date_obj.strftime(\"%Y-%m-%d\")\n",
" row_new[\"renew_date\"] = new_c\n",
" # 将当前行添加到新的DataFrame中\n",
" new_df = pd.concat([new_df, pd.DataFrame([row_new])], ignore_index=True)\n",
"\n",
"print(f\"扩展后的新数据量: {len(new_df)} 条\")\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"ExecuteTime": {
"end_time": "2026-01-16T07:39:28.813848800Z",
"start_time": "2026-01-16T07:39:28.255902200Z"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"合并后总数据量: 39599 条\n"
]
}
],
"source": [
"# ========== 合并原始数据和扩展数据 ==========\n",
"# 合并两个DataFrame\n",
"merged_df = pd.concat([data_NGV, new_df], axis=0, ignore_index=True)\n",
"data_details = merged_df.copy() # 替换名称\n",
"\n",
"data_details_not_null = data_details[data_details['renew_date'].notnull()]\n",
"# 重置索引\n",
"data_details_not_null = data_details_not_null.reset_index(drop=True)\n",
"data_details = data_details_not_null.copy() # 替换名称 v2\n",
"\n",
"print(f\"合并后总数据量: {len(data_details)} 条\")\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# ========== 最终过滤:排除创建时间等于续约时间的记录 ==========\n",
"data_details['saas_create_time'] = data_details['saas_create_time'].str[:4] # 截取前4位(年份)\n",
"data_details['renew_date_new'] = data_details['renew_date'].str[:4] # 截取前4位(年份)\n",
"data_details = data_details[\n",
" data_details['saas_create_time'] != data_details['renew_date_new']] # 过滤掉等于renew_date的行\n",
"\n",
"data_details = data_details.reset_index(drop=True)\n",
"\n",
"logger.info(f\"过滤后的数据长度为: {len(data_details)}\")\n",
"print(f\"\\n========== 数据处理完成 ==========\")\n",
"print(f\"最终数据量: {len(data_details)} 条\")\n",
"print(f\"处理完成时间: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\")\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 数据验证和检查\n",
"\n",
"可以在这里添加数据验证代码,检查处理结果的正确性\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"ExecuteTime": {
"start_time": "2026-01-16T09:00:49.656903800Z"
}
},
"outputs": [],
"source": [
"# ========== 数据验证 ==========\n",
"# 查看数据基本信息\n",
"print(\"数据基本信息:\")\n",
"print(f\"数据形状: {data_details.shape}\")\n",
"print(f\"\\n数据列名:\")\n",
"print(data_details.columns.tolist())\n",
"\n",
"# 查看前几行数据\n",
"print(\"\\n前5行数据:\")\n",
"print(data_details.head())\n",
"\n",
"# 检查关键字段的数据分布\n",
"if 'saas_edition_fmt' in data_details.columns:\n",
" print(\"\\n版本分布:\")\n",
" print(data_details['saas_edition_fmt'].value_counts())\n",
"\n",
"if 'org_status' in data_details.columns:\n",
" print(\"\\n组织状态分布:\")\n",
" print(data_details['org_status'].value_counts())\n",
"\n",
"# 可以保存到CSV文件进行进一步检查\n",
"# data_details.to_csv(\"处理后的数据.csv\", index=False, encoding='utf-8-sig')\n",
"# print(\"\\n数据已保存到: 处理后的数据.csv\")\n"
]
}
],
"metadata": {
"language_info": {
"name": "python"
}
},
"nbformat": 4,
"nbformat_minor": 2
}