// shortcuts/split.ts
import pl from "npm:nodejs-polars";
2024-09-28 12:11:00 +00:00
/** One train/test partition of a data frame, with features and targets separated. */
type DfSplit = {
  /** Value of the `size` argument the split was requested with. */
  size: number;
  /** Training rows without the target columns. */
  trainX: pl.DataFrame;
  /** Test rows without the target columns. */
  testX: pl.DataFrame;
  /** Training rows restricted to the target columns. */
  trainY: pl.DataFrame;
  /** Test rows restricted to the target columns. */
  testY: pl.DataFrame;
};
/**
 * Builds `k` train/test partitions of `df` by rotating a fixed-size test window.
 *
 * The test window holds `Math.round(df.shape.height * size)` rows, decreased to
 * the nearest multiple of `k` (which can be 0 for small frames). Every
 * partition uses the leading rows for training and the trailing rows for
 * testing; the frame is then rotated so the rows that were just tested move to
 * the front before the next partition is taken.
 *
 * @param df Frame to partition.
 * @param size Fraction of the rows to hold out for testing; also copied
 *   unchanged into every returned split.
 * @param k Number of partitions to produce; must be a positive integer.
 * @param yFeatures Target column names: selected into `trainY`/`testY` and
 *   dropped from `trainX`/`testX`.
 * @returns The `k` partitions in rotation order.
 * @throws {RangeError} If `k` is not a positive integer or `size` is not finite.
 * @throws {Error} If `k` test windows of the computed size do not fit in `df`.
 */
export function sliceK(
  df: pl.DataFrame,
  size: number,
  k: number,
  ...yFeatures: string[]
): DfSplit[] {
  // `n % 0` and `n % NaN` are NaN, which never equals 0, so without these
  // guards the rounding loop below would spin forever.
  if (!Number.isInteger(k) || k < 1) {
    throw new RangeError(`k must be a positive integer, got ${k}`);
  }
  if (!Number.isFinite(size)) {
    throw new RangeError(`size must be a finite number, got ${size}`);
  }

  const totalRows = df.shape.height;

  // Shrink the rounded window until it is a multiple of k.
  let testSize = Math.round(totalRows * size);
  while (testSize % k !== 0) {
    testSize -= 1;
  }

  if (totalRows / testSize < k) {
    // Report a whole number: a fractional "max k" is meaningless to the caller.
    throw new Error(
      `k value is too large, max k value is ${Math.floor(totalRows / testSize)}`,
    );
  }

  const trainSize = totalRows - testSize;
  const splits: DfSplit[] = [];
  let rotated = df;
  for (let fold = 0; fold < k; fold++) {
    const train = rotated.head(trainSize);
    const test = rotated.tail(testSize);
    const trainY = train.select(...yFeatures);
    const testY = test.select(...yFeatures);
    const trainX = train.drop(yFeatures);
    const testX = test.drop(yFeatures);
    splits.push({ trainX, trainY, testX, testY, size });

    // Rotate for the next fold; after the last fold the result would be unused.
    if (fold < k - 1) {
      rotated = pl.concat([test, train]);
    }
  }
  return splits;
}
/**
 * Produces a single train/test partition of `df`.
 *
 * Delegates to `sliceK` with `k = 1` and returns the only partition it yields.
 *
 * @param df Frame to split.
 * @param size Fraction of rows to hold out for testing (forwarded to `sliceK`).
 * @param shuffle When true (the default), the frame is passed through
 *   `df.sample(...)` before it is split.
 * @param yFeatures Target column names forwarded to `sliceK`.
 * @returns The first element of the array returned by `sliceK`.
 */
export function trainTestSplit(
  df: pl.DataFrame,
  size: number,
  shuffle = true,
  ...yFeatures: string[]
) {
  let frame = df;
  if (shuffle) {
    // NOTE(review): this asks for `height - 1` rows, so one row appears to be
    // left out of the split whenever shuffling is on - confirm this is intended.
    frame = df.sample(df.height - 1);
  }
  const [split] = sliceK(frame, size, 1, ...yFeatures);
  return split;
}
/**
 * Produces `k` train/test partitions of `df`, each holding out a `1 / k`
 * fraction of the rows, by delegating to `sliceK`.
 *
 * @param df Frame to split.
 * @param k Number of partitions; also determines the test fraction `1 / k`.
 * @param shuffle When true (the default), the frame is passed through
 *   `df.sample(...)` before it is split.
 * @param yFeatures Target column names forwarded to `sliceK`.
 * @returns The partitions returned by `sliceK`.
 */
export function kFold(
  df: pl.DataFrame,
  k: number,
  shuffle = true,
  ...yFeatures: string[]
): DfSplit[] {
  if (!shuffle) {
    return sliceK(df, 1 / k, k, ...yFeatures);
  }
  // NOTE(review): this asks for `height - 1` rows, so one row appears to be
  // left out of the folds whenever shuffling is on - confirm this is intended.
  const sampled = df.sample(df.height - 1);
  return sliceK(sampled, 1 / k, k, ...yFeatures);
}