377 lines
14 KiB
Plaintext
377 lines
14 KiB
Plaintext
{
|
||
"cells": [
|
||
{
|
||
"cell_type": "code",
|
||
"id": "initial_id",
|
||
"metadata": {
|
||
"collapsed": true,
|
||
"ExecuteTime": {
|
||
"end_time": "2026-01-29T03:44:29.227796600Z",
|
||
"start_time": "2026-01-29T03:44:28.870218900Z"
|
||
}
|
||
},
|
||
"source": [
|
||
"import pandas as pd\n",
|
||
"import re\n",
|
||
"import sys\n",
|
||
"\n",
|
"def remove_4byte_chars(text):\n",
"    \"\"\"\n",
"    Strip every character that needs 4 bytes in UTF-8 (emoji, some rare\n",
"    ideographs, ...), i.e. code points U+10000 through U+10FFFF.\n",
"\n",
"    Anything that is not a ``str`` (NaN, numbers, None, ...) is handed\n",
"    back unchanged.\n",
"    \"\"\"\n",
"    # Guard clause: only genuine strings are filtered.\n",
"    if not isinstance(text, str):\n",
"        return text\n",
"    # The character class matches everything outside the Basic Multilingual Plane.\n",
"    return re.sub(r'[\\U00010000-\\U0010FFFF]', '', text)\n",
|
||
"\n",
|
"def sanitize_dataframe(df):\n",
"    \"\"\"\n",
"    Remove 4-byte UTF-8 characters from every text column of ``df``.\n",
"\n",
"    A column counts as text when its dtype is ``object`` or a pandas\n",
"    ``StringDtype`` (the 'string' extension dtype; newer pandas versions\n",
"    may infer it by default). Checking only ``dtype == 'object'`` would\n",
"    silently skip ``StringDtype`` columns.\n",
"\n",
"    Cells that are not ``str`` (NaN, numbers, ...) pass through\n",
"    ``remove_4byte_chars`` untouched.\n",
"\n",
"    Note: ``df`` is modified in place and also returned; pass a copy if\n",
"    the original frame must be preserved.\n",
"    \"\"\"\n",
"    for col in df.columns:\n",
"        dtype = df[col].dtype\n",
"        is_string_ext = isinstance(dtype, pd.StringDtype)\n",
"        if dtype == 'object' or is_string_ext:\n",
"            cleaned = df[col].apply(remove_4byte_chars)\n",
"            # apply() may hand back an object column; restore the string dtype.\n",
"            df[col] = cleaned.astype(dtype) if is_string_ext else cleaned\n",
"    return df\n",
|
||
"\n",
|
"# Example: clean an Excel workbook\n",
"def process_excel_file(file_path, output_path='cleaned_output.xlsx'):\n",
"    \"\"\"\n",
"    Read an .xlsx workbook, strip 4-byte UTF-8 characters from its text\n",
"    columns and write the result to a new workbook.\n",
"\n",
"    Parameters\n",
"    ----------\n",
"    file_path : path of the source workbook (read with the openpyxl engine).\n",
"    output_path : where the cleaned workbook is written. Defaults to\n",
"        'cleaned_output.xlsx' (relative to the working directory), the\n",
"        name that used to be hard-coded.\n",
"\n",
"    Returns\n",
"    -------\n",
"    The cleaned DataFrame. The first rows before and after cleaning are\n",
"    printed as a quick visual check.\n",
"    \"\"\"\n",
"    df = pd.read_excel(file_path, engine='openpyxl')\n",
"    print(\"原始数据:\")\n",
"    print(df.head())\n",
"\n",
"    # sanitize_dataframe modifies its argument, so hand it a copy.\n",
"    df_cleaned = sanitize_dataframe(df.copy())\n",
"\n",
"    print(\"\\n清洗后数据(已移除 4 字节字符):\")\n",
"    print(df_cleaned.head())\n",
"\n",
"    # Persist the cleaned data and report the path actually written.\n",
"    df_cleaned.to_excel(output_path, index=False)\n",
"    print(f\"\\n已保存清洗后的文件:{output_path}\")\n",
"    return df_cleaned\n",
|
||
"\n",
|
"# Example: clean a CSV file\n",
"def process_csv_file(file_path, output_path='cleaned_output.csv', encoding='utf-8'):\n",
"    \"\"\"\n",
"    Read a CSV file, strip 4-byte UTF-8 characters from its text columns\n",
"    and write the result to a new CSV file.\n",
"\n",
"    Parameters\n",
"    ----------\n",
"    file_path : path of the source CSV file.\n",
"    output_path : where the cleaned CSV is written. Defaults to\n",
"        'cleaned_output.csv' (relative to the working directory), the\n",
"        name that used to be hard-coded.\n",
"    encoding : encoding used to read the source file. Defaults to\n",
"        'utf-8', the value that used to be hard-coded.\n",
"\n",
"    Returns\n",
"    -------\n",
"    The cleaned DataFrame. The first rows before and after cleaning are\n",
"    printed as a quick visual check.\n",
"    \"\"\"\n",
"    df = pd.read_csv(file_path, encoding=encoding)\n",
"    print(\"原始数据:\")\n",
"    print(df.head())\n",
"\n",
"    # sanitize_dataframe modifies its argument, so hand it a copy.\n",
"    df_cleaned = sanitize_dataframe(df.copy())\n",
"\n",
"    print(\"\\n清洗后数据:\")\n",
"    print(df_cleaned.head())\n",
"\n",
"    # The output is always written as UTF-8, whatever the input encoding was.\n",
"    df_cleaned.to_csv(output_path, index=False, encoding='utf-8')\n",
"    print(f\"\\n已保存清洗后的文件:{output_path}\")\n",
"    return df_cleaned\n",
|
||
"\n",
|
"# Workbook to clean. NOTE: machine-specific absolute path.\n",
"file_path = r\"C:\\Users\\hp_z66\\Desktop\\钉钉文件\\储值卡信息匹配后文件.xlsx\"\n",
"# Bare call as the last expression, so the notebook also displays the returned frame.\n",
"process_excel_file(file_path)"
|
||
],
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
" 卡名称 关联模板ID 持卡人姓名 持卡人手机号 绑定车牌号 卡号 可用门店 \\\n",
|
||
"0 储值卡 12147360560970768475 刘斌 18299152198 新A9KW77 18299152198 全部 \n",
|
||
"1 储值卡 12147360560970768475 张怀彬 13319992628 新F54991 8883939 全部 \n",
|
||
"2 储值卡 12147360560970768475 刘君君 15292780801 新F1861P 8883933 全部 \n",
|
||
"3 储值卡 12147360560970768475 热那提 18699990750 新FD2D08 18699990750 全部 \n",
|
||
"4 储值卡 12147360560970768475 安斌 13779519897 新F9010Q 13779519897 全部 \n",
|
||
".. ... ... ... ... ... ... ... \n",
|
||
"721 储值卡 12147360560970768475 陈轲 13095118455 新A125YK 9990223 全部 \n",
|
||
"722 储值卡 12147360560970768475 姜海南 18999586345 新F7V551 9990232 全部 \n",
|
||
"723 储值卡 12147360560970768475 买力侧 13679950078 新F16J72 13679950078 全部 \n",
|
||
"724 储值卡 12147360560970768475 王子煜 15109991005 新FF32122 8883965 全部 \n",
|
||
"725 储值卡 12147360560970768475 贵小军 18999588628 新F6616L 8889762 全部 \n",
|
||
"\n",
|
||
" 卡有效期 剩余面额 剩余实额 服务顾问 售卡门店 售卡日期 卡说明 使用条款 \n",
|
||
"0 2025-03-24 200.0 200.0 NaN 伊宁车友 2024-03-24 NaN NaN \n",
|
||
"1 2025-02-12 10.0 10.0 NaN 伊宁车友 2021-05-07 NaN NaN \n",
|
||
"2 2024-05-09 260.0 260.0 NaN 伊宁车友 2021-03-28 NaN NaN \n",
|
||
"3 2026-01-25 500.0 500.0 NaN 伊宁车友 2021-01-25 NaN NaN \n",
|
||
"4 2025-12-31 29.8 29.8 NaN 伊宁车友 2020-12-30 NaN NaN \n",
|
||
".. ... ... ... ... ... ... ... ... \n",
|
||
"721 2026-03-06 334.0 334.0 NaN 伊宁车友 2017-10-29 NaN NaN \n",
|
||
"722 2026-07-17 430.0 430.0 NaN 伊宁车友 2017-04-09 NaN NaN \n",
|
||
"723 2027-12-14 200.0 0.0 NaN 伊宁车友 2022-12-14 NaN NaN \n",
|
||
"724 2027-07-02 1000.0 0.0 NaN 伊宁车友 2022-07-02 NaN NaN \n",
|
||
"725 2026-11-24 230.0 0.0 NaN 伊宁车友 2021-05-22 NaN NaN \n",
|
||
"\n",
|
||
"[726 rows x 15 columns]"
|
||
],
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>卡名称</th>\n",
|
||
" <th>关联模板ID</th>\n",
|
||
" <th>持卡人姓名</th>\n",
|
||
" <th>持卡人手机号</th>\n",
|
||
" <th>绑定车牌号</th>\n",
|
||
" <th>卡号</th>\n",
|
||
" <th>可用门店</th>\n",
|
||
" <th>卡有效期</th>\n",
|
||
" <th>剩余面额</th>\n",
|
||
" <th>剩余实额</th>\n",
|
||
" <th>服务顾问</th>\n",
|
||
" <th>售卡门店</th>\n",
|
||
" <th>售卡日期</th>\n",
|
||
" <th>卡说明</th>\n",
|
||
" <th>使用条款</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>0</th>\n",
|
||
" <td>储值卡</td>\n",
|
||
" <td>12147360560970768475</td>\n",
|
||
" <td>刘斌</td>\n",
|
||
" <td>18299152198</td>\n",
|
||
" <td>新A9KW77</td>\n",
|
||
" <td>18299152198</td>\n",
|
||
" <td>全部</td>\n",
|
||
" <td>2025-03-24</td>\n",
|
||
" <td>200.0</td>\n",
|
||
" <td>200.0</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>伊宁车友</td>\n",
|
||
" <td>2024-03-24</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>1</th>\n",
|
||
" <td>储值卡</td>\n",
|
||
" <td>12147360560970768475</td>\n",
|
||
" <td>张怀彬</td>\n",
|
||
" <td>13319992628</td>\n",
|
||
" <td>新F54991</td>\n",
|
||
" <td>8883939</td>\n",
|
||
" <td>全部</td>\n",
|
||
" <td>2025-02-12</td>\n",
|
||
" <td>10.0</td>\n",
|
||
" <td>10.0</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>伊宁车友</td>\n",
|
||
" <td>2021-05-07</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2</th>\n",
|
||
" <td>储值卡</td>\n",
|
||
" <td>12147360560970768475</td>\n",
|
||
" <td>刘君君</td>\n",
|
||
" <td>15292780801</td>\n",
|
||
" <td>新F1861P</td>\n",
|
||
" <td>8883933</td>\n",
|
||
" <td>全部</td>\n",
|
||
" <td>2024-05-09</td>\n",
|
||
" <td>260.0</td>\n",
|
||
" <td>260.0</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>伊宁车友</td>\n",
|
||
" <td>2021-03-28</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>3</th>\n",
|
||
" <td>储值卡</td>\n",
|
||
" <td>12147360560970768475</td>\n",
|
||
" <td>热那提</td>\n",
|
||
" <td>18699990750</td>\n",
|
||
" <td>新FD2D08</td>\n",
|
||
" <td>18699990750</td>\n",
|
||
" <td>全部</td>\n",
|
||
" <td>2026-01-25</td>\n",
|
||
" <td>500.0</td>\n",
|
||
" <td>500.0</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>伊宁车友</td>\n",
|
||
" <td>2021-01-25</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>4</th>\n",
|
||
" <td>储值卡</td>\n",
|
||
" <td>12147360560970768475</td>\n",
|
||
" <td>安斌</td>\n",
|
||
" <td>13779519897</td>\n",
|
||
" <td>新F9010Q</td>\n",
|
||
" <td>13779519897</td>\n",
|
||
" <td>全部</td>\n",
|
||
" <td>2025-12-31</td>\n",
|
||
" <td>29.8</td>\n",
|
||
" <td>29.8</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>伊宁车友</td>\n",
|
||
" <td>2020-12-30</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>...</th>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>721</th>\n",
|
||
" <td>储值卡</td>\n",
|
||
" <td>12147360560970768475</td>\n",
|
||
" <td>陈轲</td>\n",
|
||
" <td>13095118455</td>\n",
|
||
" <td>新A125YK</td>\n",
|
||
" <td>9990223</td>\n",
|
||
" <td>全部</td>\n",
|
||
" <td>2026-03-06</td>\n",
|
||
" <td>334.0</td>\n",
|
||
" <td>334.0</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>伊宁车友</td>\n",
|
||
" <td>2017-10-29</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>722</th>\n",
|
||
" <td>储值卡</td>\n",
|
||
" <td>12147360560970768475</td>\n",
|
||
" <td>姜海南</td>\n",
|
||
" <td>18999586345</td>\n",
|
||
" <td>新F7V551</td>\n",
|
||
" <td>9990232</td>\n",
|
||
" <td>全部</td>\n",
|
||
" <td>2026-07-17</td>\n",
|
||
" <td>430.0</td>\n",
|
||
" <td>430.0</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>伊宁车友</td>\n",
|
||
" <td>2017-04-09</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>723</th>\n",
|
||
" <td>储值卡</td>\n",
|
||
" <td>12147360560970768475</td>\n",
|
||
" <td>买力侧</td>\n",
|
||
" <td>13679950078</td>\n",
|
||
" <td>新F16J72</td>\n",
|
||
" <td>13679950078</td>\n",
|
||
" <td>全部</td>\n",
|
||
" <td>2027-12-14</td>\n",
|
||
" <td>200.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>伊宁车友</td>\n",
|
||
" <td>2022-12-14</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>724</th>\n",
|
||
" <td>储值卡</td>\n",
|
||
" <td>12147360560970768475</td>\n",
|
||
" <td>王子煜</td>\n",
|
||
" <td>15109991005</td>\n",
|
||
" <td>新FF32122</td>\n",
|
||
" <td>8883965</td>\n",
|
||
" <td>全部</td>\n",
|
||
" <td>2027-07-02</td>\n",
|
||
" <td>1000.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>伊宁车友</td>\n",
|
||
" <td>2022-07-02</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>725</th>\n",
|
||
" <td>储值卡</td>\n",
|
||
" <td>12147360560970768475</td>\n",
|
||
" <td>贵小军</td>\n",
|
||
" <td>18999588628</td>\n",
|
||
" <td>新F6616L</td>\n",
|
||
" <td>8889762</td>\n",
|
||
" <td>全部</td>\n",
|
||
" <td>2026-11-24</td>\n",
|
||
" <td>230.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>伊宁车友</td>\n",
|
||
" <td>2021-05-22</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"<p>726 rows × 15 columns</p>\n",
|
||
"</div>"
|
||
]
|
||
},
|
||
"execution_count": 7,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"execution_count": 7
|
||
}
|
||
],
|
||
"metadata": {
|
||
"kernelspec": {
|
||
"display_name": "Python 3",
|
||
"language": "python",
|
||
"name": "python3"
|
||
},
|
||
"language_info": {
|
||
"codemirror_mode": {
|
||
"name": "ipython",
|
||
"version": 2
|
||
},
|
||
"file_extension": ".py",
|
||
"mimetype": "text/x-python",
|
||
"name": "python",
|
||
"nbconvert_exporter": "python",
|
||
"pygments_lexer": "ipython2",
|
||
"version": "2.7.6"
|
||
}
|
||
},
|
||
"nbformat": 4,
|
||
"nbformat_minor": 5
|
||
}
|