Files
F6--/张阳脚本/工具/emoji严格处理.ipynb
T
2026-01-30 11:28:35 +08:00

377 lines
14 KiB
Plaintext
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
{
"cells": [
{
"cell_type": "code",
"id": "initial_id",
"metadata": {
"collapsed": true,
"ExecuteTime": {
"end_time": "2026-01-29T03:44:29.227796600Z",
"start_time": "2026-01-29T03:44:28.870218900Z"
}
},
"source": [
"import pandas as pd\n",
"import re\n",
"import sys\n",
"\n",
"def remove_4byte_chars(text):\n",
"    \"\"\"Strip characters that need four bytes in UTF-8 from a string.\n",
"\n",
"    Every code point in U+10000..U+10FFFF (outside the Basic Multilingual\n",
"    Plane, e.g. emoji and some rare ideographs) is deleted.\n",
"    Values that are not ``str`` are returned unchanged.\n",
"    \"\"\"\n",
"    if not isinstance(text, str):\n",
"        return text\n",
"    # Code points above the BMP are the ones UTF-8 encodes in four bytes.\n",
"    return re.sub(r'[\\U00010000-\\U0010FFFF]', '', text)\n",
"\n",
"def sanitize_dataframe(df):\n",
"    \"\"\"Remove 4-byte UTF-8 characters from every text column of a DataFrame.\n",
"\n",
"    Cleaned columns are assigned back onto ``df`` itself, so pass a copy when\n",
"    the original frame must stay untouched.\n",
"\n",
"    Args:\n",
"        df: pandas DataFrame to clean.\n",
"\n",
"    Returns:\n",
"        The same DataFrame object that was passed in.\n",
"    \"\"\"\n",
"    for col in df.columns:\n",
"        dtype = df[col].dtype\n",
"        # Text can be stored as ``object`` or as a dedicated string dtype;\n",
"        # comparing against 'object' alone would silently skip the latter.\n",
"        if pd.api.types.is_object_dtype(dtype) or pd.api.types.is_string_dtype(dtype):\n",
"            df[col] = df[col].apply(remove_4byte_chars)\n",
"    return df\n",
"\n",
"# 示例:读取 Excel 文件\n",
"def process_excel_file(file_path, output_path='cleaned_output.xlsx'):\n",
"    \"\"\"Read an Excel workbook, strip 4-byte UTF-8 characters and save the result.\n",
"\n",
"    Args:\n",
"        file_path: Path of the workbook to read.\n",
"        output_path: Where the cleaned workbook is written. Defaults to\n",
"            'cleaned_output.xlsx' in the current working directory.\n",
"\n",
"    Returns:\n",
"        The cleaned DataFrame.\n",
"    \"\"\"\n",
"    # openpyxl is the engine for .xlsx workbooks; engine='xlrd' is the one\n",
"    # for legacy .xls files, not .xlsx.\n",
"    df = pd.read_excel(file_path, engine='openpyxl')\n",
"    print(\"原始数据:\")\n",
"    print(df.head())\n",
"\n",
"    # Clean a copy so the frame that was just printed stays as loaded.\n",
"    df_cleaned = sanitize_dataframe(df.copy())\n",
"\n",
"    print(\"\\n清洗后数据(已移除 4 字节字符):\")\n",
"    print(df_cleaned.head())\n",
"\n",
"    # Persist the cleaned data and hand it back to the caller.\n",
"    df_cleaned.to_excel(output_path, index=False)\n",
"    print(f\"\\n已保存清洗后的文件:{output_path}\")\n",
"    return df_cleaned\n",
"\n",
"# 示例:读取 CSV 文件\n",
"def process_csv_file(file_path, output_path='cleaned_output.csv'):\n",
"    \"\"\"Read a CSV file, strip 4-byte UTF-8 characters and save the result.\n",
"\n",
"    Args:\n",
"        file_path: Path of the CSV file to read; it is decoded as UTF-8.\n",
"        output_path: Where the cleaned CSV is written (UTF-8, no index\n",
"            column). Defaults to 'cleaned_output.csv' in the current\n",
"            working directory.\n",
"\n",
"    Returns:\n",
"        The cleaned DataFrame.\n",
"    \"\"\"\n",
"    df = pd.read_csv(file_path, encoding='utf-8')  # input is assumed to be UTF-8\n",
"    print(\"原始数据:\")\n",
"    print(df.head())\n",
"\n",
"    # Clean a copy so the frame that was just printed stays as loaded.\n",
"    df_cleaned = sanitize_dataframe(df.copy())\n",
"\n",
"    print(\"\\n清洗后数据:\")\n",
"    print(df_cleaned.head())\n",
"\n",
"    df_cleaned.to_csv(output_path, index=False, encoding='utf-8')\n",
"    print(f\"\\n已保存清洗后的文件:{output_path}\")\n",
"    return df_cleaned\n",
"\n",
"# NOTE(review): user-specific absolute path; change it before running elsewhere.\n",
"# NOTE(review): the saved output of this cell shows customer names and phone\n",
"# numbers - clear the outputs before sharing or committing the notebook.\n",
"file_path = r\"C:\\Users\\hp_z66\\Desktop\\钉钉文件\\储值卡信息匹配后文件.xlsx\"\n",
"process_excel_file(file_path)"
],
"outputs": [
{
"data": {
"text/plain": [
" 卡名称 关联模板ID 持卡人姓名 持卡人手机号 绑定车牌号 卡号 可用门店 \\\n",
"0 储值卡 12147360560970768475 刘斌 18299152198 新A9KW77 18299152198 全部 \n",
"1 储值卡 12147360560970768475 张怀彬 13319992628 新F54991 8883939 全部 \n",
"2 储值卡 12147360560970768475 刘君君 15292780801 新F1861P 8883933 全部 \n",
"3 储值卡 12147360560970768475 热那提 18699990750 新FD2D08 18699990750 全部 \n",
"4 储值卡 12147360560970768475 安斌 13779519897 新F9010Q 13779519897 全部 \n",
".. ... ... ... ... ... ... ... \n",
"721 储值卡 12147360560970768475 陈轲 13095118455 新A125YK 9990223 全部 \n",
"722 储值卡 12147360560970768475 姜海南 18999586345 新F7V551 9990232 全部 \n",
"723 储值卡 12147360560970768475 买力侧 13679950078 新F16J72 13679950078 全部 \n",
"724 储值卡 12147360560970768475 王子煜 15109991005 新FF32122 8883965 全部 \n",
"725 储值卡 12147360560970768475 贵小军 18999588628 新F6616L 8889762 全部 \n",
"\n",
" 卡有效期 剩余面额 剩余实额 服务顾问 售卡门店 售卡日期 卡说明 使用条款 \n",
"0 2025-03-24 200.0 200.0 NaN 伊宁车友 2024-03-24 NaN NaN \n",
"1 2025-02-12 10.0 10.0 NaN 伊宁车友 2021-05-07 NaN NaN \n",
"2 2024-05-09 260.0 260.0 NaN 伊宁车友 2021-03-28 NaN NaN \n",
"3 2026-01-25 500.0 500.0 NaN 伊宁车友 2021-01-25 NaN NaN \n",
"4 2025-12-31 29.8 29.8 NaN 伊宁车友 2020-12-30 NaN NaN \n",
".. ... ... ... ... ... ... ... ... \n",
"721 2026-03-06 334.0 334.0 NaN 伊宁车友 2017-10-29 NaN NaN \n",
"722 2026-07-17 430.0 430.0 NaN 伊宁车友 2017-04-09 NaN NaN \n",
"723 2027-12-14 200.0 0.0 NaN 伊宁车友 2022-12-14 NaN NaN \n",
"724 2027-07-02 1000.0 0.0 NaN 伊宁车友 2022-07-02 NaN NaN \n",
"725 2026-11-24 230.0 0.0 NaN 伊宁车友 2021-05-22 NaN NaN \n",
"\n",
"[726 rows x 15 columns]"
],
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>卡名称</th>\n",
" <th>关联模板ID</th>\n",
" <th>持卡人姓名</th>\n",
" <th>持卡人手机号</th>\n",
" <th>绑定车牌号</th>\n",
" <th>卡号</th>\n",
" <th>可用门店</th>\n",
" <th>卡有效期</th>\n",
" <th>剩余面额</th>\n",
" <th>剩余实额</th>\n",
" <th>服务顾问</th>\n",
" <th>售卡门店</th>\n",
" <th>售卡日期</th>\n",
" <th>卡说明</th>\n",
" <th>使用条款</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>储值卡</td>\n",
" <td>12147360560970768475</td>\n",
" <td>刘斌</td>\n",
" <td>18299152198</td>\n",
" <td>新A9KW77</td>\n",
" <td>18299152198</td>\n",
" <td>全部</td>\n",
" <td>2025-03-24</td>\n",
" <td>200.0</td>\n",
" <td>200.0</td>\n",
" <td>NaN</td>\n",
" <td>伊宁车友</td>\n",
" <td>2024-03-24</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>储值卡</td>\n",
" <td>12147360560970768475</td>\n",
" <td>张怀彬</td>\n",
" <td>13319992628</td>\n",
" <td>新F54991</td>\n",
" <td>8883939</td>\n",
" <td>全部</td>\n",
" <td>2025-02-12</td>\n",
" <td>10.0</td>\n",
" <td>10.0</td>\n",
" <td>NaN</td>\n",
" <td>伊宁车友</td>\n",
" <td>2021-05-07</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>储值卡</td>\n",
" <td>12147360560970768475</td>\n",
" <td>刘君君</td>\n",
" <td>15292780801</td>\n",
" <td>新F1861P</td>\n",
" <td>8883933</td>\n",
" <td>全部</td>\n",
" <td>2024-05-09</td>\n",
" <td>260.0</td>\n",
" <td>260.0</td>\n",
" <td>NaN</td>\n",
" <td>伊宁车友</td>\n",
" <td>2021-03-28</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>储值卡</td>\n",
" <td>12147360560970768475</td>\n",
" <td>热那提</td>\n",
" <td>18699990750</td>\n",
" <td>新FD2D08</td>\n",
" <td>18699990750</td>\n",
" <td>全部</td>\n",
" <td>2026-01-25</td>\n",
" <td>500.0</td>\n",
" <td>500.0</td>\n",
" <td>NaN</td>\n",
" <td>伊宁车友</td>\n",
" <td>2021-01-25</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>储值卡</td>\n",
" <td>12147360560970768475</td>\n",
" <td>安斌</td>\n",
" <td>13779519897</td>\n",
" <td>新F9010Q</td>\n",
" <td>13779519897</td>\n",
" <td>全部</td>\n",
" <td>2025-12-31</td>\n",
" <td>29.8</td>\n",
" <td>29.8</td>\n",
" <td>NaN</td>\n",
" <td>伊宁车友</td>\n",
" <td>2020-12-30</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>721</th>\n",
" <td>储值卡</td>\n",
" <td>12147360560970768475</td>\n",
" <td>陈轲</td>\n",
" <td>13095118455</td>\n",
" <td>新A125YK</td>\n",
" <td>9990223</td>\n",
" <td>全部</td>\n",
" <td>2026-03-06</td>\n",
" <td>334.0</td>\n",
" <td>334.0</td>\n",
" <td>NaN</td>\n",
" <td>伊宁车友</td>\n",
" <td>2017-10-29</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>722</th>\n",
" <td>储值卡</td>\n",
" <td>12147360560970768475</td>\n",
" <td>姜海南</td>\n",
" <td>18999586345</td>\n",
" <td>新F7V551</td>\n",
" <td>9990232</td>\n",
" <td>全部</td>\n",
" <td>2026-07-17</td>\n",
" <td>430.0</td>\n",
" <td>430.0</td>\n",
" <td>NaN</td>\n",
" <td>伊宁车友</td>\n",
" <td>2017-04-09</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>723</th>\n",
" <td>储值卡</td>\n",
" <td>12147360560970768475</td>\n",
" <td>买力侧</td>\n",
" <td>13679950078</td>\n",
" <td>新F16J72</td>\n",
" <td>13679950078</td>\n",
" <td>全部</td>\n",
" <td>2027-12-14</td>\n",
" <td>200.0</td>\n",
" <td>0.0</td>\n",
" <td>NaN</td>\n",
" <td>伊宁车友</td>\n",
" <td>2022-12-14</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>724</th>\n",
" <td>储值卡</td>\n",
" <td>12147360560970768475</td>\n",
" <td>王子煜</td>\n",
" <td>15109991005</td>\n",
" <td>新FF32122</td>\n",
" <td>8883965</td>\n",
" <td>全部</td>\n",
" <td>2027-07-02</td>\n",
" <td>1000.0</td>\n",
" <td>0.0</td>\n",
" <td>NaN</td>\n",
" <td>伊宁车友</td>\n",
" <td>2022-07-02</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>725</th>\n",
" <td>储值卡</td>\n",
" <td>12147360560970768475</td>\n",
" <td>贵小军</td>\n",
" <td>18999588628</td>\n",
" <td>新F6616L</td>\n",
" <td>8889762</td>\n",
" <td>全部</td>\n",
" <td>2026-11-24</td>\n",
" <td>230.0</td>\n",
" <td>0.0</td>\n",
" <td>NaN</td>\n",
" <td>伊宁车友</td>\n",
" <td>2021-05-22</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>726 rows × 15 columns</p>\n",
"</div>"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"execution_count": 7
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.6"
}
},
"nbformat": 4,
"nbformat_minor": 5
}