{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# 数据库验证脚本 - 数据处理部分\n", "\n", "本notebook用于调试和验证数据库验证脚本的数据处理逻辑(260-425行)\n", "\n", "## 使用说明\n", "1. 先执行数据加载部分(第2个单元格),这部分比较耗时\n", "2. 数据加载完成后,再执行后续的数据处理单元格\n", "3. 每个单元格都可以单独执行和调试\n" ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "ExecuteTime": { "end_time": "2026-01-16T06:53:03.604128900Z", "start_time": "2026-01-16T06:53:01.840121200Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "库导入完成\n", "项目根目录: D:\\Idea Project\\SaaS_V1.7\n" ] } ], "source": [ "# 导入必要的库\n", "import os\n", "import sys\n", "import pandas as pd\n", "import datetime\n", "from datetime import datetime, timedelta\n", "import re\n", "\n", "# 添加项目根目录到路径(notebook文件在test目录下,需要添加父目录)\n", "current_dir = os.getcwd()\n", "# 如果当前目录是test,则添加父目录;否则添加当前目录\n", "if os.path.basename(current_dir) == 'test':\n", " project_root = os.path.dirname(current_dir)\n", "else:\n", " project_root = current_dir\n", "sys.path.insert(0, project_root)\n", "\n", "from api import API\n", "from back_ground_module import CommonModule\n", "from log_config import configure_task_logger, configure_error_task_logger\n", "\n", "# 初始化API和CommonModule\n", "api_instance = API()\n", "common_module = CommonModule()\n", "\n", "# 获取日志记录器\n", "logger = configure_task_logger()\n", "error_task_logger = configure_error_task_logger()\n", "\n", "print(\"库导入完成\")\n", "print(f\"项目根目录: {project_root}\")\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 步骤1: 数据加载(耗时操作,可单独执行)\n", "\n", "这部分会加载所有必要的数据,包括:\n", "- 省市区人员关系表\n", "- 员工ID列表\n", "- 权限表\n", "- NGV数据列表\n", "- 服务提醒数据\n", "- 智能检测数据\n", "- 功能使用情况表\n", "- 保单识别表\n", "- 私域/公域小程序数据\n", "- 异业合作数据\n", "- 短信数据\n", "- 多公司过滤表\n", "- NGV明细数据(从数据库获取)\n", "- 节假日列表\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# ========== 数据加载部分 ==========\n", "# 这部分比较耗时,可以单独执行\n", "\n", "print(\"开始加载数据...\")\n", "\n", "# 省市区人员关系表\n", "payload = {\"api_key\": \"675b900991ad2491c69389ca\", \"entry_id\": \"676512ac3e54dc3159460c0a\"}\n", "json_dict = api_instance.entry_data_list(payload)\n", "if json_dict and \"data\" in json_dict:\n", " json_list = json_dict.get(\"data\")\n", "else:\n", " print(\"加载省市区人员关系表失败\")\n", " json_list = []\n", "print(f\"省市区人员关系表: {len(json_list)} 条\")\n", "\n", "# 获取简道云员工id\n", "payload = {\"api_key\": \"6694d3c4fcb69ca9a111a6c4\", \"entry_id\": \"6769204a1902c9341340a1bc\"}\n", "staff_id = api_instance.entry_data_list(payload)\n", "staff_id_list = staff_id.get(\"data\")\n", "print(f\"员工ID列表: {len(staff_id_list)} 条\")\n", "\n", "# 获取权限表信息\n", "payload = {\"api_key\": \"675b900991ad2491c69389ca\", \"entry_id\": \"675b96c14e839f90fef1647c\"}\n", "permissions_table = api_instance.entry_data_list(payload).get(\"data\")\n", "print(f\"权限表: {len(permissions_table)} 条\")\n", "\n", "# 获取NGV数据\n", "payload = {\"api_key\": \"675b900991ad2491c69389ca\", \"entry_id\": \"675bb02bd2d53c2034c665e4\"}\n", "NGV_data_list = api_instance.entry_data_list(payload).get(\"data\")\n", "print(f\"NGV数据列表: {len(NGV_data_list)} 条\")\n", "\n", "# 获取服务提醒-数据支持表单数据\n", "payload = {\"api_key\": \"675b900991ad2491c69389ca\", \"entry_id\": \"676bb7bda3029720f1083e99\"}\n", "service_remind = api_instance.entry_data_list(payload).get(\"data\")\n", "print(f\"服务提醒数据: {len(service_remind)} 条\")\n", "\n", "# 获取智能检测-数据支持表单数据\n", "payload = {\"api_key\": \"675b900991ad2491c69389ca\", \"entry_id\": \"676bb99649ab3ac975af6e39\"}\n", "Smart_detection = api_instance.entry_data_list(payload).get(\"data\")\n", "print(f\"智能检测数据: {len(Smart_detection)} 条\")\n", "\n", "# 获取功能使用情况表\n", "payload = {\"api_key\": \"675b900991ad2491c69389ca\", \"entry_id\": \"6763bbf657bd8fb76fcb41b2\"}\n", "get_feature_usage = api_instance.entry_data_list(payload).get(\"data\", [])\n", "print(f\"功能使用情况表: {len(get_feature_usage)} 条\")\n", "\n", "# 获取保单识别表\n", "payload = {\"api_key\": \"675b900991ad2491c69389ca\", \"entry_id\": \"6773a60d30ed87ff9f68d3c5\"}\n", "policy_recognition = api_instance.entry_data_list(payload).get(\"data\")\n", "widget_list = [item['_widget_1735632397600'] for item in policy_recognition]\n", "print(f\"保单识别表: {len(policy_recognition)} 条\")\n", "\n", "# 获取私域小程序-数据支持表单数据\n", "payload = {\"api_key\": \"675b900991ad2491c69389ca\", \"entry_id\": \"67e0f0fae622896749ba5087\"}\n", "private_domain = api_instance.entry_data_list(payload).get(\"data\", [])\n", "print(f\"私域小程序数据: {len(private_domain)} 条\")\n", "\n", "# 获取公域小程序-数据支持表单数据\n", "payload = {\"api_key\": \"675b900991ad2491c69389ca\", \"entry_id\": \"67e0c702c8f603b997980999\"}\n", "public_domain = api_instance.entry_data_list(payload).get(\"data\", [])\n", "public_domain_list = [item['_widget_1742784257506'] for item in public_domain]\n", "print(f\"公域小程序数据: {len(public_domain)} 条\")\n", "\n", "# 获取异业合作-数据支持表单数据\n", "payload = {\"api_key\": \"675b900991ad2491c69389ca\", \"entry_id\": \"67e24fdd8dfcfa918e17c30b\"}\n", "different_industries = api_instance.entry_data_list(payload).get(\"data\", [])\n", "different_industries_list = [item['_widget_1742884829007'] for item in different_industries]\n", "print(f\"异业合作数据: {len(different_industries)} 条\")\n", "\n", "# 获取短信-数据支持表单数据\n", "payload = {\"api_key\": \"675b900991ad2491c69389ca\", \"entry_id\": \"67e5107198ba1b20d5df3974\"}\n", "groupnotification = api_instance.entry_data_list(payload).get(\"data\", [])\n", "print(f\"短信数据: {len(groupnotification)} 条\")\n", "\n", "# 获取多公司过滤表\n", "payload = {\"api_key\": \"675b900991ad2491c69389ca\", \"entry_id\": \"689bf5f8ba88a28cb0679ec9\"}\n", "get_filter_company_list = api_instance.entry_data_list(payload).get(\"data\", [])\n", "print(f\"多公司过滤表: {len(get_filter_company_list)} 条\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "\n", "# 获取节假日列表\n", "date_list = common_module.get_holiday_list()\n", "print(f\"节假日列表: {len(date_list)} 个日期\")\n", "\n", "# 获取NGV明细数据(从数据库获取,比较耗时)\n", "print(\"\\n开始从数据库获取NGV明细数据(这可能需要一些时间)...\")\n", "data_NGV = common_module.get_ngv_details(days_back=1)\n", "print(f\"NGV明细数据: {len(data_NGV)} 条\")\n", "print(f\"NGV明细数据列数: {len(data_NGV.columns)}\")\n", "\n", "# 构建省市区索引\n", "def build_index(json_list):\n", " index = {}\n", " for json_item in json_list:\n", " try:\n", " key = (json_item['_widget_1734677164861'], json_item['_widget_1734677164862'],\n", " json_item['_widget_1734677164863']) # 省市区\n", " if '_widget_1734677164871' not in json_item: # 日常回访客服\n", " raise KeyError(\"缺少 '日常回访客服' 键\")\n", " index[key] = json_item\n", " except KeyError as e:\n", " print(f\"警告:{e},跳过该条记录\")\n", " continue\n", " return index\n", "\n", "index = build_index(json_list)\n", "print(f\"省市区索引构建完成: {len(index)} 条\")\n", "\n", "print(\"\\n========== 数据加载完成 ==========\")\n", "print(f\"数据加载时间: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\")\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 步骤2: 数据处理逻辑(260-425行)\n", "\n", "这部分包含主要的数据处理逻辑,包括:\n", "- 获取多公司过滤公司id\n", "- 数据清洗和转换\n", "- 优先级排序\n", "- 数据过滤和合并\n", "- 日期计算和扩展\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# ========== 获取多公司过滤公司id ==========\n", "logger.info(\"获取多公司过滤公司id\")\n", "all_filter_company_list = [] # 获取多公司过滤公司id\n", "for company in get_filter_company_list:\n", " company_list = company.get(\"_widget_1755052002491\")\n", " if company_list:\n", " for company_item in company_list:\n", " if company_item.get(\"_widget_1755052002496\") == \"否\":\n", " all_filter_company_list.append(company_item.get(\"_widget_1755052002495\"))\n", "logger.info(f\"过滤公司条数:{len(all_filter_company_list)}\")\n", "print(f\"过滤公司条数: {len(all_filter_company_list)}\")\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2026-01-16T07:27:13.175060200Z", "start_time": "2026-01-16T07:27:09.857076900Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "数据预处理完成,当前数据量: 45672 条\n", "数据列数: 143\n" ] } ], "source": [ "# ========== 数据预处理:日期转换和数据清洗 ==========\n", "# 将A列和B列的日期字符串转换为日期格式\n", "data_NGV = data_NGV.copy()\n", "data_NGV['A'] = pd.to_datetime(data_NGV['expiry_time'])\n", "data_NGV['B'] = pd.to_datetime(data_NGV['renew_date'])\n", "\n", "def replace_values(series):\n", " # 使用条件判断来进行替换\n", " return series.apply(lambda x: '' if pd.isna(x) or x in ['NA', 'None', ''] else x)\n", "\n", "# 处理字符串数据并显式指定数据类型\n", "data_NGV = data_NGV.apply(replace_values)\n", "\n", "print(f\"数据预处理完成,当前数据量: {len(data_NGV)} 条\")\n", "print(f\"数据列数: {len(data_NGV.columns)}\")\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2026-01-16T07:28:01.298494800Z", "start_time": "2026-01-16T07:28:01.106113700Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "过滤多公司后数据量: 45653 条\n" ] } ], "source": [ "# ========== 过滤多公司 ==========\n", "# 针对公司主店过期,取公司最高等级版本派发\n", "# 过滤多公司\n", "data_NGV = data_NGV[~data_NGV['id_own_group'].isin(all_filter_company_list)]\n", "print(f\"过滤多公司后数据量: {len(data_NGV)} 条\")\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2026-01-16T07:28:04.235079300Z", "start_time": "2026-01-16T07:28:04.185820400Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "优先级映射完成\n" ] } ], "source": [ "# ========== 定义优先级顺序和创建映射字典 ==========\n", "# 定义优先级顺序\n", "edition_order = ['皇冠版', '至尊版', '尊享版', '旗舰版', '标准版', '进阶版', '基础版', '入门版']\n", "customer_type_order = [\"F\", \"E\", \"D\", \"C\", \"B\", \"A\"] # 索引越小优先级越高\n", "group_grade_order = ['全国KA(FMVP)', '区域KA(MVP)', '重要客户(SVIP)', '普通客户(VIP)']\n", "\n", "# 创建映射字典,并为不在列表中的值设置默认值\n", "edition_map = {edition: idx for idx, edition in enumerate(edition_order)}\n", "customer_type_map = {ctype: idx for idx, ctype in enumerate(customer_type_order)}\n", "group_grade_map = {grade: idx for idx, grade in enumerate(group_grade_order)}\n", "\n", "# 添加用于排序的新列,并处理不在映射字典中的值\n", "data_NGV['edition_rank'] = data_NGV['saas_edition_fmt'].map(edition_map).fillna(0).astype(int) # 缺失值用最高优先级填充\n", "data_NGV['customer_type_rank'] = data_NGV['saas_customer_type'].map(customer_type_map).fillna(0).astype(int)\n", "data_NGV['group_grade_rank'] = data_NGV['group_grade'].map(group_grade_map).fillna(0).astype(int)\n", "\n", "print(\"优先级映射完成\")\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2026-01-16T07:28:08.790102700Z", "start_time": "2026-01-16T07:28:08.399298Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "最佳值查找完成\n" ] } ], "source": [ "# ========== 找到每组中的最佳值 ==========\n", "# 找到每组中 edition_rank 最小值对应的行\n", "best_edition_idx = data_NGV.groupby('id_own_group')['edition_rank'].idxmin()\n", "best_edition_rows = data_NGV.loc[best_edition_idx]\n", "best_edition_rows['max_saas_edition'] = best_edition_rows['saas_edition_fmt']\n", "\n", "# 找到每组中 customer_type_rank 最小值对应的行\n", "best_customer_type_idx = data_NGV.groupby('id_own_group')['customer_type_rank'].idxmin()\n", "best_customer_type_rows = data_NGV.loc[best_customer_type_idx]\n", "best_customer_type_rows['max_saas_customer_type'] = best_customer_type_rows['customer_type_rank'].apply(\n", " lambda x: customer_type_order[x])\n", "\n", "# 找到每组中 group_grade_rank 最小值对应的行\n", "best_group_grade_idx = data_NGV.groupby('id_own_group')['group_grade_rank'].idxmin()\n", "best_group_grade_rows = data_NGV.loc[best_group_grade_idx]\n", "best_group_grade_rows['max_group_grade'] = best_group_grade_rows['group_grade']\n", "\n", "print(\"最佳值查找完成\")\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2026-01-16T07:28:11.374632Z", "start_time": "2026-01-16T07:28:11.141730700Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "最佳值合并完成,当前数据量: 45653 条\n" ] } ], "source": [ "# ========== 合并最佳值回到原数据集 ==========\n", "# 合并最佳值回到原数据集\n", "best_values = (\n", " best_edition_rows[['id_own_group', 'max_saas_edition']]\n", " .merge(best_customer_type_rows[['id_own_group', 'max_saas_customer_type']], on='id_own_group',\n", " how='outer')\n", " .merge(best_group_grade_rows[['id_own_group', 'max_group_grade']], on='id_own_group', how='outer')\n", ")\n", "\n", "# 将最佳值合并回原数据集\n", "data_NGV = data_NGV.merge(best_values, on='id_own_group', how='left')\n", "\n", "print(f\"最佳值合并完成,当前数据量: {len(data_NGV)} 条\")\n" ] }, { "cell_type": "code", "execution_count": 10, "metadata": { "ExecuteTime": { "end_time": "2026-01-16T07:30:06.358604500Z", "start_time": "2026-01-16T07:30:05.752861600Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "============================================================\n", "调试信息:处理主店过期情况\n", "============================================================\n", "\n", "当前 data_NGV 数据量: 45653 条\n", "\n", "【字段检查】\n", "is_main_org 数据类型: object\n", "is_main_org 唯一值: ['0', '1']\n", "is_main_org 值分布:\n", "is_main_org\n", "1 37628\n", "0 8025\n", "Name: count, dtype: int64\n", "\n", "org_status 数据类型: object\n", "org_status 唯一值: ['留存', '过期']\n", "org_status 值分布:\n", "org_status\n", "留存 27985\n", "过期 17668\n", "Name: count, dtype: int64\n", "\n", "org_type 数据类型: object\n", "org_type 唯一值: ['一般', '天猫']\n", "org_type 值分布:\n", "org_type\n", "一般 42985\n", "天猫 2668\n", "Name: count, dtype: int64\n", "\n", "【步骤1: 筛选主店过期】\n", "警告: is_main_org 是字符串类型,尝试转换为数值\n", "条件筛选结果数量: 15065 条\n", "主店过期数据量 (ngvv2): 15065 条\n", "ngvv2 中的 id_own_group 数量: 15065 个\n", "ngvv2 中的 id_own_group 示例: ['10545055917999655906', '10545055917999678943', '10545055917999702656', '10545055917999726421', '10545055917999791008', '10545055917999907421', '10545055917999958815', '10545055917999963314', '10545055917999973061', '10545055918000062921']\n", "\n", "【步骤2: 筛选分店留存】\n", "data_NGV_V2 初始数据量: 45653 条\n", "\n", "area_manager 唯一值数量: 16\n", "area_manager 值分布(前10):\n", "area_manager\n", "肖军 10824\n", "景东强 8408\n", "陈庆伟 8322\n", "张凯 8269\n", "关磊 7028\n", "孙玉蕾 2006\n", "殷昊 556\n", "王涛 161\n", "刘伟 52\n", " 8\n", "Name: count, dtype: int64\n", "\n", "各条件筛选结果:\n", " org_type == '一般': 42985 条\n", " org_status == '留存': 27985 条\n", " area_manager != '殷昊': 45097 条\n", " area_manager != '孙玉蕾': 43647 条\n", " is_main_org != 1: 8025 条\n", "\n", "所有条件合并后数据量: 4882 条\n", "data_NGV_V2_filtered 中的 id_own_group 数量: 1564 个\n", "data_NGV_V2_filtered 中的 id_own_group 示例: ['10545055917999659357', '10545055917999659357', '10545055917999688607', '10545055917999688607', '10545055917999688607', '10545055917999719687', '10545055917999791008', '10545055917999791008', '10545055917999995278', '10545055918000106937']\n", "\n", "【步骤3: 检查 id_own_group 交集】\n", "ngvv2 中的 id_own_group 数量: 15065\n", "data_NGV_V2_filtered 中的 id_own_group 数量: 1564\n", "交集数量: 316\n", "交集中的 id_own_group 示例: ['11240984669917478021', '10546172455175803787', '10546172455166018835', '10546172455220322161', '10546443563657780587', '10546172455213602644', '10546443563816611858', '11240984669917430021', '11240984669917329620', '11240984669917352563']\n", "\n", "【步骤4: 最终过滤】\n", "过滤后的数据量: 468 条\n", "\n", "============================================================\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "C:\\Users\\hp_z66\\AppData\\Local\\Temp\\ipykernel_14280\\4275068286.py:100: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame.\n", "Try using .loc[row_indexer,col_indexer] = value instead\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", " data_NGV_V2_filtered['exists_in_ngvv2'] = data_NGV_V2_filtered['id_own_group'].isin(ngvv2['id_own_group'])\n" ] } ], "source": [ "# ========== 处理主店过期的情况 ==========\n", "# 调试信息:检查数据状态\n", "print(\"=\" * 60)\n", "print(\"调试信息:处理主店过期情况\")\n", "print(\"=\" * 60)\n", "print(f\"\\n当前 data_NGV 数据量: {len(data_NGV)} 条\")\n", "\n", "# 检查关键字段的数据类型和唯一值\n", "print(f\"\\n【字段检查】\")\n", "print(f\"is_main_org 数据类型: {data_NGV['is_main_org'].dtype}\")\n", "print(f\"is_main_org 唯一值: {sorted(data_NGV['is_main_org'].unique())}\")\n", "print(f\"is_main_org 值分布:\\n{data_NGV['is_main_org'].value_counts()}\")\n", "\n", "print(f\"\\norg_status 数据类型: {data_NGV['org_status'].dtype}\")\n", "print(f\"org_status 唯一值: {sorted(data_NGV['org_status'].unique())}\")\n", "print(f\"org_status 值分布:\\n{data_NGV['org_status'].value_counts()}\")\n", "\n", "print(f\"\\norg_type 数据类型: {data_NGV['org_type'].dtype}\")\n", "print(f\"org_type 唯一值: {sorted(data_NGV['org_type'].unique())}\")\n", "print(f\"org_type 值分布:\\n{data_NGV['org_type'].value_counts()}\")\n", "\n", "# 步骤1: 筛选主店过期的情况\n", "print(f\"\\n【步骤1: 筛选主店过期】\")\n", "# 确保 is_main_org 是数值类型\n", "if data_NGV['is_main_org'].dtype == 'object':\n", " print(\"警告: is_main_org 是字符串类型,尝试转换为数值\")\n", " data_NGV['is_main_org'] = pd.to_numeric(data_NGV['is_main_org'], errors='coerce')\n", "\n", "condition = (data_NGV['is_main_org'] == 1) & (data_NGV['org_status'] == '过期')\n", "print(f\"条件筛选结果数量: {condition.sum()} 条\")\n", "\n", "ngvv2 = data_NGV[condition]\n", "print(f\"主店过期数据量 (ngvv2): {len(ngvv2)} 条\")\n", "\n", "if len(ngvv2) > 0:\n", " print(f\"ngvv2 中的 id_own_group 数量: {ngvv2['id_own_group'].nunique()} 个\")\n", " print(f\"ngvv2 中的 id_own_group 示例: {ngvv2['id_own_group'].head(10).tolist()}\")\n", "else:\n", " print(\"⚠️ 警告: ngvv2 为空,没有主店过期的情况!\")\n", "\n", "# 步骤2: 检查分店留存的情况\n", "print(f\"\\n【步骤2: 筛选分店留存】\")\n", "# 在合并最佳值之前保存原始数据副本(重要!)\n", "data_NGV_V2 = data_NGV.copy()\n", "print(f\"data_NGV_V2 初始数据量: {len(data_NGV_V2)} 条\")\n", "\n", "# 检查 area_manager 字段\n", "print(f\"\\narea_manager 唯一值数量: {data_NGV_V2['area_manager'].nunique()}\")\n", "print(f\"area_manager 值分布(前10):\\n{data_NGV_V2['area_manager'].value_counts().head(10)}\")\n", "\n", "# 确保 is_main_org 是数值类型\n", "if data_NGV_V2['is_main_org'].dtype == 'object':\n", " data_NGV_V2['is_main_org'] = pd.to_numeric(data_NGV_V2['is_main_org'], errors='coerce')\n", "\n", "# 逐步检查每个条件\n", "cond1 = (data_NGV_V2['org_type'] == \"一般\")\n", "cond2 = (data_NGV_V2['org_status'] == '留存')\n", "cond3 = (data_NGV_V2['area_manager'] != '殷昊')\n", "cond4 = (data_NGV_V2['area_manager'] != '孙玉蕾')\n", "cond5 = (data_NGV_V2['is_main_org'] != 1)\n", "\n", "print(f\"\\n各条件筛选结果:\")\n", "print(f\" org_type == '一般': {cond1.sum()} 条\")\n", "print(f\" org_status == '留存': {cond2.sum()} 条\")\n", "print(f\" area_manager != '殷昊': {cond3.sum()} 条\")\n", "print(f\" area_manager != '孙玉蕾': {cond4.sum()} 条\")\n", "print(f\" is_main_org != 1: {cond5.sum()} 条\")\n", "\n", "data_NGV_V2['条件'] = cond1 & cond2 & cond3 & cond4 & cond5\n", "data_NGV_V2_filtered = data_NGV_V2.loc[data_NGV_V2[\"条件\"]]\n", "print(f\"\\n所有条件合并后数据量: {len(data_NGV_V2_filtered)} 条\")\n", "\n", "if len(data_NGV_V2_filtered) > 0:\n", " print(f\"data_NGV_V2_filtered 中的 id_own_group 数量: {data_NGV_V2_filtered['id_own_group'].nunique()} 个\")\n", " print(f\"data_NGV_V2_filtered 中的 id_own_group 示例: {data_NGV_V2_filtered['id_own_group'].head(10).tolist()}\")\n", "\n", "# 步骤3: 检查交集\n", "print(f\"\\n【步骤3: 检查 id_own_group 交集】\")\n", "if len(ngvv2) > 0 and len(data_NGV_V2_filtered) > 0:\n", " ngvv2_groups = set(ngvv2['id_own_group'].unique())\n", " v2_groups = set(data_NGV_V2_filtered['id_own_group'].unique())\n", " intersection = ngvv2_groups & v2_groups\n", " \n", " print(f\"ngvv2 中的 id_own_group 数量: {len(ngvv2_groups)}\")\n", " print(f\"data_NGV_V2_filtered 中的 id_own_group 数量: {len(v2_groups)}\")\n", " print(f\"交集数量: {len(intersection)}\")\n", " \n", " if len(intersection) > 0:\n", " print(f\"交集中的 id_own_group 示例: {list(intersection)[:10]}\")\n", " else:\n", " print(\"⚠️ 警告: 没有交集!这可能是问题所在。\")\n", " print(f\"ngvv2 中的前10个 id_own_group: {list(ngvv2_groups)[:10]}\")\n", " print(f\"data_NGV_V2_filtered 中的前10个 id_own_group: {list(v2_groups)[:10]}\")\n", "else:\n", " print(\"⚠️ 警告: ngvv2 或 data_NGV_V2_filtered 为空,无法检查交集\")\n", "\n", "# 步骤4: 过滤存在的记录\n", "print(f\"\\n【步骤4: 最终过滤】\")\n", "if len(ngvv2) > 0:\n", " data_NGV_V2_filtered['exists_in_ngvv2'] = data_NGV_V2_filtered['id_own_group'].isin(ngvv2['id_own_group'])\n", " filtered_data = data_NGV_V2_filtered[data_NGV_V2_filtered['exists_in_ngvv2']]\n", " print(f\"过滤后的数据量: {len(filtered_data)} 条\")\n", " \n", " if len(filtered_data) == 0:\n", " print(\"\\n❌ 问题诊断:\")\n", " print(\" 过滤后数据为空,可能的原因:\")\n", " print(\" 1. ngvv2 为空(没有主店过期的情况)\")\n", " print(\" 2. data_NGV_V2_filtered 为空(没有满足条件的分店留存数据)\")\n", " print(\" 3. 两者的 id_own_group 没有交集\")\n", " print(\"\\n建议:\")\n", " print(\" - 检查数据源是否正确\")\n", " print(\" - 检查字段值是否匹配(注意数据类型和格式)\")\n", " print(\" - 检查是否有主店过期但分店留存的情况\")\n", "else:\n", " print(\"⚠️ 警告: ngvv2 为空,无法进行过滤\")\n", " filtered_data = pd.DataFrame() # 创建空DataFrame\n", "\n", "print(\"\\n\" + \"=\" * 60)\n" ] }, { "cell_type": "code", "execution_count": 11, "metadata": { "ExecuteTime": { "end_time": "2026-01-16T07:30:16.487181500Z", "start_time": "2026-01-16T07:30:16.440099400Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "排序去重后数据量: 316 条\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "C:\\Users\\hp_z66\\AppData\\Local\\Temp\\ipykernel_14280\\2835892650.py:5: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame.\n", "Try using .loc[row_indexer,col_indexer] = value instead\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", " filtered_data['sort_key'] = filtered_data['saas_edition_fmt'].map(fixed_order_map)\n" ] } ], "source": [ "# ========== 对过滤数据进行排序和去重 ==========\n", "fixed_order = ['皇冠版', '至尊版', '尊享版', '旗舰版', '标准版', '进阶版', '基础版', '入门版']\n", "\n", "fixed_order_map = {edition: index for index, edition in enumerate(fixed_order)}\n", "filtered_data['sort_key'] = filtered_data['saas_edition_fmt'].map(fixed_order_map)\n", "filtered_data = filtered_data.sort_values(by='sort_key').drop('sort_key', axis=1)\n", "\n", "result = filtered_data.drop_duplicates(subset='id_own_group', keep='first')\n", "\n", "print(f\"排序去重后数据量: {len(result)} 条\")\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# ========== 合并主店留存数据和分店数据 ==========\n", "data_NGV['条件'] = (data_NGV['org_type'] == \"一般\") & (data_NGV['org_status'] == '留存') & (\n", " data_NGV['area_manager'] != '殷昊') & (\n", " data_NGV['area_manager'] != '孙玉蕾') & (\n", " data_NGV['is_main_org'] == 1)\n", "data_NGV = data_NGV.loc[data_NGV[\"条件\"]]\n", "\n", "data_NGV = pd.concat([data_NGV, result], axis=0)\n", "data_details = data_NGV.copy()\n", "\n", "# 重置索引\n", "data_details = data_details.reset_index(drop=True)\n", "\n", "print(f\"合并后数据量: {len(data_details)} 条\")\n" ] }, { "cell_type": "code", "execution_count": 13, "metadata": { "ExecuteTime": { "end_time": "2026-01-16T07:30:21.600199200Z", "start_time": "2026-01-16T07:30:20.828225500Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "日期计算后数据量: 9845 条\n", "需要扩展的数据行数: 9845 条\n" ] } ], "source": [ "# ========== 判断日期差并计算年数 ==========\n", "# 判断A列的日期是否大于B列的日期730天,如果是的话,将B列的值设置为天数差\n", "data_details['条件'] = data_details.apply(\n", " lambda row: (\n", " (pd.to_datetime(row['A']) - pd.to_datetime(row['B'])).days\n", " if pd.to_datetime(row['A']) - pd.to_datetime(row['B']) >= pd.Timedelta(days=730)\n", " else 0\n", " ),\n", " axis=1\n", ")\n", "data_details = data_details.loc[data_details[\"条件\"] > 0]\n", "\n", "# 定义一个函数,用于将数字除以365并取整数\n", "def divide_by_365(x):\n", " if isinstance(x, (int, float)):\n", " return int(x / 365)\n", " else:\n", " return x\n", "\n", "# 使用apply函数将divide_by_365函数应用到DataFrame的列\n", "data_details['年'] = data_details['条件'].apply(divide_by_365)\n", "\n", "# 重置索引\n", "data_details = data_details.reset_index(drop=True)\n", "\n", "print(f\"日期计算后数据量: {len(data_details)} 条\")\n", "print(f\"需要扩展的数据行数: {len(data_details[data_details['年'] > 1])} 条\")\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# ========== 扩展数据:根据年数复制行并修改日期 ==========\n", "# 创建一个新的空的DataFrame\n", "new_df = pd.DataFrame()\n", "\n", "# 遍历原始DataFrame的每一行\n", "for index, row in data_details.iterrows():\n", " # 根据年数来决定复制的次数\n", " if row[\"renew_date\"] != \"2024-02-29\":\n", " for i_new in range(1, row['年']):\n", " # 修改日期\n", " row_new = row.copy()\n", " c = row_new[\"renew_date\"]\n", " date_obj = datetime.strptime(c, \"%Y-%m-%d\")\n", " new_year = date_obj.year + i_new\n", " new_date_obj = date_obj.replace(year=new_year)\n", " new_c = new_date_obj.strftime(\"%Y-%m-%d\")\n", " row_new[\"renew_date\"] = new_c\n", " # 将当前行添加到新的DataFrame中\n", " new_df = pd.concat([new_df, pd.DataFrame([row_new])], ignore_index=True)\n", "\n", "print(f\"扩展后的新数据量: {len(new_df)} 条\")\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2026-01-16T07:39:28.813848800Z", "start_time": "2026-01-16T07:39:28.255902200Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "合并后总数据量: 39599 条\n" ] } ], "source": [ "# ========== 合并原始数据和扩展数据 ==========\n", "# 合并两个DataFrame\n", "merged_df = pd.concat([data_NGV, new_df], axis=0, ignore_index=True)\n", "data_details = merged_df.copy() # 替换名称\n", "\n", "data_details_not_null = data_details[data_details['renew_date'].notnull()]\n", "# 重置索引\n", "data_details_not_null = data_details_not_null.reset_index(drop=True)\n", "data_details = data_details_not_null.copy() # 替换名称 v2\n", "\n", "print(f\"合并后总数据量: {len(data_details)} 条\")\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# ========== 最终过滤:排除创建时间等于续约时间的记录 ==========\n", "data_details['saas_create_time'] = data_details['saas_create_time'].str[:4] # 截取前4位(年份)\n", "data_details['renew_date_new'] = data_details['renew_date'].str[:4] # 截取前4位(年份)\n", "data_details = data_details[\n", " data_details['saas_create_time'] != data_details['renew_date_new']] # 过滤掉等于renew_date的行\n", "\n", "data_details = data_details.reset_index(drop=True)\n", "\n", "logger.info(f\"过滤后的数据长度为: {len(data_details)}\")\n", "print(f\"\\n========== 数据处理完成 ==========\")\n", "print(f\"最终数据量: {len(data_details)} 条\")\n", "print(f\"处理完成时间: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\")\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 数据验证和检查\n", "\n", "可以在这里添加数据验证代码,检查处理结果的正确性\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "start_time": "2026-01-16T09:00:49.656903800Z" } }, "outputs": [], "source": [ "# ========== 数据验证 ==========\n", "# 查看数据基本信息\n", "print(\"数据基本信息:\")\n", "print(f\"数据形状: {data_details.shape}\")\n", "print(f\"\\n数据列名:\")\n", "print(data_details.columns.tolist())\n", "\n", "# 查看前几行数据\n", "print(\"\\n前5行数据:\")\n", "print(data_details.head())\n", "\n", "# 检查关键字段的数据分布\n", "if 'saas_edition_fmt' in data_details.columns:\n", " print(\"\\n版本分布:\")\n", " print(data_details['saas_edition_fmt'].value_counts())\n", "\n", "if 'org_status' in data_details.columns:\n", " print(\"\\n组织状态分布:\")\n", " print(data_details['org_status'].value_counts())\n", "\n", "# 可以保存到CSV文件进行进一步检查\n", "# data_details.to_csv(\"处理后的数据.csv\", index=False, encoding='utf-8-sig')\n", "# print(\"\\n数据已保存到: 处理后的数据.csv\")\n" ] } ], "metadata": { "language_info": { "name": "python" } }, "nbformat": 4, "nbformat_minor": 2 }