{ "cells": [ { "metadata": {}, "cell_type": "markdown", "source": [ "查看excel里面的换行符格式\n", "检查后可以看到,excel里面的单元格内换行符是\\r\\n格式" ], "id": "c9030123c5d35ea8" }, { "cell_type": "code", "id": "initial_id", "metadata": { "collapsed": true, "ExecuteTime": { "end_time": "2025-02-13T01:38:58.145302Z", "start_time": "2025-02-13T01:38:49.123687Z" } }, "source": [ "import pandas as pd\n", "\n", "# 读取Excel文件\n", "file_path = r\"C:\\Users\\Administrator.DESKTOP-7IC2USJ\\Desktop\\明细统计.xls\" # 使用原始字符串 (r\"\") 来避免转义字符问题\n", "df = pd.read_excel(file_path)\n", "\n", "# 选择你要检查的列(例如 '图号')\n", "column_to_check = '图号'\n", "\n", "# 打印前几行的数据,查看是否有换行符\n", "for index, row in df.head().iterrows():\n", " print(f\"Row {index}:\")\n", " print(row[column_to_check])\n", " print(repr(row[column_to_check])) # 使用 repr() 查看实际的换行符\n", " print(\"-\" * 40)" ], "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Row 0:\n", "TH04733\r\n", "YLS-1L5W30\r\n", "LSGL-4-75W90\r\n", "NO-20002\r\n", "1307012TAA\r\n", "1002250TAR-1\r\n", "EZL401\r\n", "NA-14021U\r\n", "CC-22\r\n", "3701250TAC\r\n", "TH04732\r\n", "1006060TARD1\r\n", "1002350TRA-F\n", "'TH04733\\r\\nYLS-1L5W30\\r\\nLSGL-4-75W90\\r\\nNO-20002\\r\\n1307012TAA\\r\\n1002250TAR-1\\r\\nEZL401\\r\\nNA-14021U\\r\\nCC-22\\r\\n3701250TAC\\r\\nTH04732\\r\\n1006060TARD1\\r\\n1002350TRA-F'\n", "----------------------------------------\n", "Row 1:\n", "nan\n", "nan\n", "----------------------------------------\n", "Row 2:\n", "0280150989\r\n", "H15015-0200\r\n", "06F 905 115G\r\n", "7092-BKR6EGP\n", "'0280150989\\r\\nH15015-0200\\r\\n06F 905 115G\\r\\n7092-BKR6EGP'\n", "----------------------------------------\n", "Row 3:\n", "0M08-10129\r\n", "S101014-0400\r\n", "0M10-30064\r\n", "H15002-1000\n", "'0M08-10129\\r\\nS101014-0400\\r\\n0M10-30064\\r\\nH15002-1000'\n", "----------------------------------------\n", "Row 4:\n", "JLS-SN5W301L\r\n", "JL00004\r\n", "AST00002\r\n", "NA-18034P\r\n", "AST00010\r\n", "NC-17012L\r\n", "56110-RAA-A01HC\r\n", "08285-P99-01ZE3\r\n", "N0-18003\n", "'JLS-SN5W301L\\r\\nJL00004\\r\\nAST00002\\r\\nNA-18034P\\r\\nAST00010\\r\\nNC-17012L\\r\\n56110-RAA-A01HC\\r\\n08285-P99-01ZE3\\r\\nN0-18003'\n", "----------------------------------------\n" ] } ], "execution_count": 1 }, { "metadata": {}, "cell_type": "markdown", "source": [ "拆分\\r\\n换行符\n", "\n", "根据同一组\\r\\n的数量," ], "id": "3023b57d0a934d9e" }, { "metadata": { "ExecuteTime": { "end_time": "2025-02-13T01:44:18.374344Z", "start_time": "2025-02-13T01:39:23.892604Z" } }, "cell_type": "code", "source": [ "import pandas as pd\n", "from tqdm import tqdm\n", "from pathlib import Path\n", "\n", "def split_rows(df, key_column='工单号'):\n", " # 获取需要处理的列\n", " part_columns = ['配件名称', '配件数量', '配件施工人员', '配件成本', '配件单价', '配件金额']\n", " service_columns = ['服务项目', '项目金额', '项目施工人员']\n", "\n", " # 初始化三个空列表来存储正常、异常和异常工单号对应的正常数据\n", " normal_rows = []\n", " abnormal_rows = []\n", " abnormal_key_values = set() # 用于存储异常工单号\n", " original_abnormal_rows = [] # 用于存储异常工单的原始行\n", "\n", " # 创建进度条,用于显示处理每一行的进度\n", " with tqdm(total=len(df), desc='Processing Rows') as pbar:\n", " for index, row in df.iterrows():\n", " # 分割配件组和服务项目组数据\n", " parts_data = {col: row[col].split('\\r\\n') if isinstance(row[col], str) and pd.notna(row[col]) else [row[col]] for col in part_columns}\n", " services_data = {col: row[col].split('\\r\\n') if isinstance(row[col], str) and pd.notna(row[col]) else [row[col]] for col in service_columns}\n", "\n", " # 合并两个字典\n", " all_data = {**parts_data, **services_data}\n", "\n", " # 确定最大长度\n", " max_length = max(len(data) for data in all_data.values())\n", "\n", " # 记录当前行的工单号\n", " key_value = row[key_column]\n", "\n", " # 将所有列的数据扩展到最大长度\n", " for i in range(max_length):\n", " new_row = row.copy()\n", " for col, data in all_data.items():\n", " if i < len(data):\n", " new_row[col] = data[i]\n", " else:\n", " new_row[col] = None\n", "\n", " # 检查是否有异常情况\n", " non_null_values = [new_row[col] for col in all_data.keys() if pd.notna(new_row[col])]\n", " if len(non_null_values) == 1:\n", " # 如果这一行只有单个非空值,则记录为异常数据\n", " abnormal_row = {\n", " key_column: key_value,\n", " '异常列': list(all_data.keys())[0],\n", " '异常值': non_null_values[0],\n", " '原始行索引': index # 记录原始行的索引\n", " }\n", " abnormal_rows.append(abnormal_row)\n", " abnormal_key_values.add(key_value) # 记录异常工单号\n", " else:\n", " normal_rows.append(new_row)\n", "\n", " # 更新进度条\n", " pbar.update(1)\n", "\n", " # 一次性将所有正常和异常的行合并为新的DataFrame\n", " normal_df = pd.DataFrame(normal_rows)\n", " abnormal_df = pd.DataFrame(abnormal_rows)\n", "\n", " # 提取异常工单号对应的正常数据\n", " abnormal_normal_df = normal_df[normal_df[key_column].isin(abnormal_key_values)]\n", "\n", " # 从正常数据中移除异常工单号的数据\n", " normal_df = normal_df[~normal_df[key_column].isin(abnormal_key_values)]\n", "\n", " # 提取异常工单的原始行\n", " original_abnormal_df = df[df[key_column].isin(abnormal_key_values)]\n", "\n", " return normal_df, abnormal_df, abnormal_normal_df, original_abnormal_df\n", "\n", "# 读取Excel文件\n", "input_file_path = Path(r\"C:\\Users\\Administrator.DESKTOP-7IC2USJ\\Desktop\\明细统计.xls\",sheet='Sheet2')\n", "df = pd.read_excel(input_file_path)\n", "\n", "# 处理数据\n", "normal_df, abnormal_df, abnormal_normal_df, original_abnormal_df = split_rows(df, key_column='工单号')\n", "\n", "# 写入新的Excel文件,包含四个工作表\n", "output_file_path = Path(r\"C:\\Users\\Administrator.DESKTOP-7IC2USJ\\Desktop\\拆分后_明细统计.xlsx\", engine='xlsxwriter') # 修改为 .xlsx 格式\n", "with pd.ExcelWriter(output_file_path, engine='openpyxl') as writer:\n", " normal_df.to_excel(writer, sheet_name='正常数据', index=False)\n", " abnormal_df.to_excel(writer, sheet_name='异常数据', index=False)\n", " abnormal_normal_df.to_excel(writer, sheet_name='异常工单的正常数据', index=False)\n", " original_abnormal_df.to_excel(writer, sheet_name='异常工单的原始行', index=False)\n", "\n", "print(f\"处理完成,正常数据、异常数据、异常工单的正常数据以及异常工单的原始行已分别保存到 '{output_file_path}' 的 '正常数据'、'异常数据'、'异常工单的正常数据' 和 '异常工单的原始行' 工作表中。\")" ], "id": "78e36241d6a7696", "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "Processing Rows: 100%|██████████| 31335/31335 [01:08<00:00, 456.67it/s] \n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "处理完成,正常数据、异常数据、异常工单的正常数据以及异常工单的原始行已分别保存到 'C:\\Users\\Administrator.DESKTOP-7IC2USJ\\Desktop\\拆分后_明细统计.xlsx' 的 '正常数据'、'异常数据'、'异常工单的正常数据' 和 '异常工单的原始行' 工作表中。\n" ] } ], "execution_count": 2 }, { "metadata": { "ExecuteTime": { "end_time": "2024-12-11T01:29:12.873010Z", "start_time": "2024-12-11T01:27:31.524526Z" } }, "cell_type": "code", "source": [ "with pd.ExcelWriter(output_file_path, engine='openpyxl') as writer:\n", " normal_df.to_excel(writer, sheet_name='正常数据', index=False)\n", " abnormal_df.to_excel(writer, sheet_name='异常数据', index=False)\n", " abnormal_normal_df.to_excel(writer, sheet_name='异常工单的正常数据', index=False)" ], "id": "2884a4a0209bf061", "outputs": [], "execution_count": 3 } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 2 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython2", "version": "2.7.6" } }, "nbformat": 4, "nbformat_minor": 5 }