Files
F6--/张阳脚本/工具/单元格包含多行文本拆分.ipynb
2026-01-30 11:28:35 +08:00

264 lines
10 KiB
Plaintext
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
{
"cells": [
{
"metadata": {},
"cell_type": "markdown",
"source": [
"查看excel里面的换行符格式\n",
"检查后可以看到,excel里面的单元格内换行符是\\r\\n格式"
],
"id": "c9030123c5d35ea8"
},
{
"cell_type": "code",
"id": "initial_id",
"metadata": {
"collapsed": true,
"ExecuteTime": {
"end_time": "2025-02-13T01:38:58.145302Z",
"start_time": "2025-02-13T01:38:49.123687Z"
}
},
"source": [
"import pandas as pd\n",
"\n",
"# 读取Excel文件\n",
"file_path = r\"C:\\Users\\Administrator.DESKTOP-7IC2USJ\\Desktop\\明细统计.xls\" # 使用原始字符串 (r\"\") 来避免转义字符问题\n",
"df = pd.read_excel(file_path)\n",
"\n",
"# 选择你要检查的列(例如 '图号'\n",
"column_to_check = '图号'\n",
"\n",
"# 打印前几行的数据,查看是否有换行符\n",
"for index, row in df.head().iterrows():\n",
" print(f\"Row {index}:\")\n",
" print(row[column_to_check])\n",
" print(repr(row[column_to_check])) # 使用 repr() 查看实际的换行符\n",
" print(\"-\" * 40)"
],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Row 0:\n",
"TH04733\r\n",
"YLS-1L5W30\r\n",
"LSGL-4-75W90\r\n",
"NO-20002\r\n",
"1307012TAA\r\n",
"1002250TAR-1\r\n",
"EZL401\r\n",
"NA-14021U\r\n",
"CC-22\r\n",
"3701250TAC\r\n",
"TH04732\r\n",
"1006060TARD1\r\n",
"1002350TRA-F\n",
"'TH04733\\r\\nYLS-1L5W30\\r\\nLSGL-4-75W90\\r\\nNO-20002\\r\\n1307012TAA\\r\\n1002250TAR-1\\r\\nEZL401\\r\\nNA-14021U\\r\\nCC-22\\r\\n3701250TAC\\r\\nTH04732\\r\\n1006060TARD1\\r\\n1002350TRA-F'\n",
"----------------------------------------\n",
"Row 1:\n",
"nan\n",
"nan\n",
"----------------------------------------\n",
"Row 2:\n",
"0280150989\r\n",
"H15015-0200\r\n",
"06F 905 115G\r\n",
"7092-BKR6EGP\n",
"'0280150989\\r\\nH15015-0200\\r\\n06F 905 115G\\r\\n7092-BKR6EGP'\n",
"----------------------------------------\n",
"Row 3:\n",
"0M08-10129\r\n",
"S101014-0400\r\n",
"0M10-30064\r\n",
"H15002-1000\n",
"'0M08-10129\\r\\nS101014-0400\\r\\n0M10-30064\\r\\nH15002-1000'\n",
"----------------------------------------\n",
"Row 4:\n",
"JLS-SN5W301L\r\n",
"JL00004\r\n",
"AST00002\r\n",
"NA-18034P\r\n",
"AST00010\r\n",
"NC-17012L\r\n",
"56110-RAA-A01HC\r\n",
"08285-P99-01ZE3\r\n",
"N0-18003\n",
"'JLS-SN5W301L\\r\\nJL00004\\r\\nAST00002\\r\\nNA-18034P\\r\\nAST00010\\r\\nNC-17012L\\r\\n56110-RAA-A01HC\\r\\n08285-P99-01ZE3\\r\\nN0-18003'\n",
"----------------------------------------\n"
]
}
],
"execution_count": 1
},
{
"metadata": {},
"cell_type": "markdown",
"source": [
"拆分\\r\\n换行符\n",
"\n",
"根据同一组\\r\\n的数量,"
],
"id": "3023b57d0a934d9e"
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2025-02-13T01:44:18.374344Z",
"start_time": "2025-02-13T01:39:23.892604Z"
}
},
"cell_type": "code",
"source": [
"import pandas as pd\n",
"from tqdm import tqdm\n",
"from pathlib import Path\n",
"\n",
"def split_rows(df, key_column='工单号'):\n",
" # 获取需要处理的列\n",
" part_columns = ['配件名称', '配件数量', '配件施工人员', '配件成本', '配件单价', '配件金额']\n",
" service_columns = ['服务项目', '项目金额', '项目施工人员']\n",
"\n",
" # 初始化三个空列表来存储正常、异常和异常工单号对应的正常数据\n",
" normal_rows = []\n",
" abnormal_rows = []\n",
" abnormal_key_values = set() # 用于存储异常工单号\n",
" original_abnormal_rows = [] # 用于存储异常工单的原始行\n",
"\n",
" # 创建进度条,用于显示处理每一行的进度\n",
" with tqdm(total=len(df), desc='Processing Rows') as pbar:\n",
" for index, row in df.iterrows():\n",
" # 分割配件组和服务项目组数据\n",
" parts_data = {col: row[col].split('\\r\\n') if isinstance(row[col], str) and pd.notna(row[col]) else [row[col]] for col in part_columns}\n",
" services_data = {col: row[col].split('\\r\\n') if isinstance(row[col], str) and pd.notna(row[col]) else [row[col]] for col in service_columns}\n",
"\n",
" # 合并两个字典\n",
" all_data = {**parts_data, **services_data}\n",
"\n",
" # 确定最大长度\n",
" max_length = max(len(data) for data in all_data.values())\n",
"\n",
" # 记录当前行的工单号\n",
" key_value = row[key_column]\n",
"\n",
" # 将所有列的数据扩展到最大长度\n",
" for i in range(max_length):\n",
" new_row = row.copy()\n",
" for col, data in all_data.items():\n",
" if i < len(data):\n",
" new_row[col] = data[i]\n",
" else:\n",
" new_row[col] = None\n",
"\n",
" # 检查是否有异常情况\n",
" non_null_values = [new_row[col] for col in all_data.keys() if pd.notna(new_row[col])]\n",
" if len(non_null_values) == 1:\n",
" # 如果这一行只有单个非空值,则记录为异常数据\n",
" abnormal_row = {\n",
" key_column: key_value,\n",
" '异常列': list(all_data.keys())[0],\n",
" '异常值': non_null_values[0],\n",
" '原始行索引': index # 记录原始行的索引\n",
" }\n",
" abnormal_rows.append(abnormal_row)\n",
" abnormal_key_values.add(key_value) # 记录异常工单号\n",
" else:\n",
" normal_rows.append(new_row)\n",
"\n",
" # 更新进度条\n",
" pbar.update(1)\n",
"\n",
" # 一次性将所有正常和异常的行合并为新的DataFrame\n",
" normal_df = pd.DataFrame(normal_rows)\n",
" abnormal_df = pd.DataFrame(abnormal_rows)\n",
"\n",
" # 提取异常工单号对应的正常数据\n",
" abnormal_normal_df = normal_df[normal_df[key_column].isin(abnormal_key_values)]\n",
"\n",
" # 从正常数据中移除异常工单号的数据\n",
" normal_df = normal_df[~normal_df[key_column].isin(abnormal_key_values)]\n",
"\n",
" # 提取异常工单的原始行\n",
" original_abnormal_df = df[df[key_column].isin(abnormal_key_values)]\n",
"\n",
" return normal_df, abnormal_df, abnormal_normal_df, original_abnormal_df\n",
"\n",
"# 读取Excel文件\n",
"input_file_path = Path(r\"C:\\Users\\Administrator.DESKTOP-7IC2USJ\\Desktop\\明细统计.xls\",sheet='Sheet2')\n",
"df = pd.read_excel(input_file_path)\n",
"\n",
"# 处理数据\n",
"normal_df, abnormal_df, abnormal_normal_df, original_abnormal_df = split_rows(df, key_column='工单号')\n",
"\n",
"# 写入新的Excel文件,包含四个工作表\n",
"output_file_path = Path(r\"C:\\Users\\Administrator.DESKTOP-7IC2USJ\\Desktop\\拆分后_明细统计.xlsx\", engine='xlsxwriter') # 修改为 .xlsx 格式\n",
"with pd.ExcelWriter(output_file_path, engine='openpyxl') as writer:\n",
" normal_df.to_excel(writer, sheet_name='正常数据', index=False)\n",
" abnormal_df.to_excel(writer, sheet_name='异常数据', index=False)\n",
" abnormal_normal_df.to_excel(writer, sheet_name='异常工单的正常数据', index=False)\n",
" original_abnormal_df.to_excel(writer, sheet_name='异常工单的原始行', index=False)\n",
"\n",
"print(f\"处理完成,正常数据、异常数据、异常工单的正常数据以及异常工单的原始行已分别保存到 '{output_file_path}' 的 '正常数据'、'异常数据'、'异常工单的正常数据' 和 '异常工单的原始行' 工作表中。\")"
],
"id": "78e36241d6a7696",
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Processing Rows: 100%|██████████| 31335/31335 [01:08<00:00, 456.67it/s] \n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"处理完成,正常数据、异常数据、异常工单的正常数据以及异常工单的原始行已分别保存到 'C:\\Users\\Administrator.DESKTOP-7IC2USJ\\Desktop\\拆分后_明细统计.xlsx' 的 '正常数据'、'异常数据'、'异常工单的正常数据' 和 '异常工单的原始行' 工作表中。\n"
]
}
],
"execution_count": 2
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2024-12-11T01:29:12.873010Z",
"start_time": "2024-12-11T01:27:31.524526Z"
}
},
"cell_type": "code",
"source": [
"with pd.ExcelWriter(output_file_path, engine='openpyxl') as writer:\n",
" normal_df.to_excel(writer, sheet_name='正常数据', index=False)\n",
" abnormal_df.to_excel(writer, sheet_name='异常数据', index=False)\n",
" abnormal_normal_df.to_excel(writer, sheet_name='异常工单的正常数据', index=False)"
],
"id": "2884a4a0209bf061",
"outputs": [],
"execution_count": 3
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.6"
}
},
"nbformat": 4,
"nbformat_minor": 5
}