# Listing metadata from the original page: Python, 38 lines, 1.3 KiB
# -*- coding: utf-8 -*-
"""Randomly sample up to 100 evaluation rows from an Excel sheet.

The source sheet is read without a header row. Its columns are named by
position (1st -> ``query``, 2nd -> ``reference_response``, 3rd ->
``session_id``), rearranged into the template order
``session_id, query, reference_response``, type-converted, and a
reproducible random sample is written to the current working directory.
"""
import os

import pandas as pd

# Location of the source workbook and the sheet to read from it.
INPUT_PATH = r"D:\Idea Project\F6+宜搭+其它(1)\张阳脚本\udesk\自动评测.xlsx"
SHEET_NAME = "Sheet1"

# File name of the generated workbook (written to the current directory).
OUTPUT_NAME = "随机抽取100条结果.xlsx"

# Upper bound on the number of sampled rows, and the seed that makes the
# sample reproducible between runs.
SAMPLE_SIZE = 100
RANDOM_STATE = 42

# Names assigned to the source columns by position.
SOURCE_COLS = ["query", "reference_response", "session_id"]
# Column order required by the output template.
TEMPLATE_COLS = ["session_id", "query", "reference_response"]


def _as_text(series):
    """Return *series* with present values converted to ``str``.

    Missing cells stay missing; a plain ``astype(str)`` would turn them
    into the literal text ``"nan"`` in the exported workbook.
    """
    return series.astype(str).where(series.notna(), pd.NA)


def normalize_columns(df):
    """Return a copy of *df* reshaped to the template layout.

    Args:
        df: Frame read with ``header=None``; columns are identified purely
            by position.

    Returns:
        A new DataFrame with exactly the columns in ``TEMPLATE_COLS``:
        ``session_id`` as nullable ``Int64`` (non-numeric values become
        missing), ``query`` and ``reference_response`` as text.
    """
    # Keep only the columns we have names for. Assigning three names to a
    # wider frame raises a length-mismatch ValueError.
    trimmed = df.iloc[:, :len(SOURCE_COLS)].copy()
    trimmed.columns = SOURCE_COLS[:trimmed.shape[1]]

    # reindex both orders the columns and adds any absent template column
    # filled with missing values.
    shaped = trimmed.reindex(columns=TEMPLATE_COLS)

    shaped["session_id"] = pd.to_numeric(
        shaped["session_id"], errors="coerce"
    ).astype("Int64")
    for col in ("query", "reference_response"):
        shaped[col] = _as_text(shaped[col])
    return shaped


def sample_rows(df, size=SAMPLE_SIZE, random_state=RANDOM_STATE):
    """Return a random sample of at most *size* rows from *df*.

    When *df* has fewer than *size* rows, all rows are returned (in
    shuffled order). The fixed *random_state* keeps the result repeatable.
    """
    count = min(size, len(df))
    return df.sample(n=count, random_state=random_state)


def main(input_path=INPUT_PATH, output_dir=None):
    """Read the workbook, sample it, and write the result.

    Args:
        input_path: Path of the source ``.xlsx`` file.
        output_dir: Directory for the result; defaults to the current
            working directory.

    Returns:
        The path of the written workbook.
    """
    raw = pd.read_excel(input_path, sheet_name=SHEET_NAME, header=None)
    sampled = sample_rows(normalize_columns(raw))

    output_path = os.path.join(output_dir or os.getcwd(), OUTPUT_NAME)
    sampled.to_excel(output_path, index=False)
    print(f"已随机抽取 {len(sampled)} 条数据,保存至: {output_path}")
    return output_path


if __name__ == "__main__":
    main()