2024-09-26 01:41:33 +00:00
|
|
|
import pl from "npm:nodejs-polars";
|
|
|
|
|
2024-09-28 12:11:00 +00:00
|
|
|
/**
 * One train/test partition of a data frame, as produced by `sliceK`.
 */
type DfSplit = {
  /** Training rows with the target (`yFeatures`) columns dropped. */
  trainX: pl.DataFrame;
  /** Training rows restricted to the target (`yFeatures`) columns. */
  trainY: pl.DataFrame;
  /** Test rows with the target (`yFeatures`) columns dropped. */
  testX: pl.DataFrame;
  /** Test rows restricted to the target (`yFeatures`) columns. */
  testY: pl.DataFrame;
  /** The `size` fraction that was requested when the partition was built. */
  size: number;
};
|
|
|
|
|
|
|
|
/**
 * Builds `k` train/test partitions of `df` by rotating a fixed-size test
 * window through the rows.
 *
 * The number of test rows is `Math.round(height * size)` rounded down to the
 * nearest multiple of `k`. For each fold the last `testSize` rows form the test
 * set and the remaining leading rows form the training set; the test rows are
 * then moved to the front so that the next fold tests a different window.
 *
 * @param df - Frame to partition; rows are used in their current order.
 * @param size - Fraction of the rows to place in each test set.
 * @param k - Number of partitions to produce; must be a positive integer.
 * @param yFeatures - Names of the target columns, split out into
 *   `trainY`/`testY` and removed from `trainX`/`testX`.
 * @returns One `DfSplit` per fold, in rotation order.
 * @throws Error if `k` is not a positive integer, if `size` is negative or not
 *   finite, if `size` asks for more rows than `df` has, or if `k` folds of the
 *   resulting test size do not fit in `df`.
 */
export function sliceK(
  df: pl.DataFrame,
  size: number,
  k: number,
  ...yFeatures: string[]
): DfSplit[] {
  // A zero or NaN divisor makes `x % k` NaN, which previously spun forever.
  if (!Number.isInteger(k) || k < 1) {
    throw new Error(`k must be a positive integer, received ${k}`);
  }
  if (!Number.isFinite(size) || size < 0) {
    throw new Error(
      `size must be a finite, non-negative fraction, received ${size}`,
    );
  }

  const height = df.shape.height;
  const requested = Math.round(height * size);
  if (requested > height) {
    throw new Error(
      `size ${size} requests ${requested} test rows but only ${height} are available`,
    );
  }

  // Round down to a multiple of k (same result as decrementing one at a time).
  const testSize = requested - (requested % k);
  if (requested > 0 && testSize === 0) {
    // Without this guard every fold would silently get an empty test set.
    throw new Error(
      `k value is too large, ${k} folds need at least ${k} test rows but size ${size} yields only ${requested}`,
    );
  }
  if (testSize * k > height) {
    throw new Error(
      `k value is too large, ${k} folds of ${testSize} test rows exceed the ${height} available rows`,
    );
  }

  const trainSize = height - testSize;
  const result: DfSplit[] = [];
  let data = df;
  for (let fold = 0; fold < k; fold++) {
    const train = data.head(trainSize);
    const test = data.tail(testSize);
    result.push({
      trainX: train.drop(yFeatures),
      trainY: train.select(...yFeatures),
      testX: test.drop(yFeatures),
      testY: test.select(...yFeatures),
      size,
    });
    // Rotate the test rows to the front; pointless after the final fold.
    if (fold < k - 1) {
      data = pl.concat([test, train]);
    }
  }
  return result;
}
|
|
|
|
|
|
|
|
/**
 * Splits `df` into a single train/test partition.
 *
 * @param df - Frame to split.
 * @param size - Fraction of the rows to place in the test set.
 * @param shuffle - When true (the default) the rows are randomly reordered
 *   before splitting; when false the existing row order is used.
 * @param yFeatures - Names of the target columns.
 * @returns The one partition produced by `sliceK` with `k = 1`.
 */
export function trainTestSplit(
  df: pl.DataFrame,
  size: number,
  shuffle = true,
  ...yFeatures: string[]
): DfSplit {
  let data = df;
  // Frames with fewer than two rows have nothing to reorder.
  if (shuffle && df.height > 1) {
    // Shuffle by sorting on a temporary column of random keys: unlike sampling
    // `height - 1` rows, this keeps every row of the input.
    let keyName = "__shuffle_key__";
    while (df.columns.includes(keyName)) {
      keyName += "_";
    }
    const keys = pl.Series(
      keyName,
      Array.from({ length: df.height }, () => Math.random()),
    );
    data = df.withColumn(keys).sort(keyName).drop(keyName);
  }

  const [split] = sliceK(data, size, 1, ...yFeatures);
  if (split === undefined) {
    throw new Error("sliceK returned no partition");
  }
  return split;
}
|
|
|
|
|
|
|
|
/**
 * Produces `k` train/test partitions of `df`, each testing on roughly
 * `1 / k` of the rows.
 *
 * @param df - Frame to partition.
 * @param k - Number of folds; must be a positive integer.
 * @param shuffle - When true (the default) the rows are randomly reordered
 *   before the folds are cut; when false the existing row order is used.
 * @param yFeatures - Names of the target columns.
 * @returns The partitions produced by `sliceK` with a test fraction of `1 / k`.
 * @throws Error if `k` is not a positive integer.
 */
export function kFold(
  df: pl.DataFrame,
  k: number,
  shuffle = true,
  ...yFeatures: string[]
): DfSplit[] {
  // `1 / k` is Infinity for k = 0 and meaningless for fractional or negative k.
  if (!Number.isInteger(k) || k < 1) {
    throw new Error(`k must be a positive integer, received ${k}`);
  }

  let data = df;
  // Frames with fewer than two rows have nothing to reorder.
  if (shuffle && df.height > 1) {
    // Shuffle by sorting on a temporary column of random keys: unlike sampling
    // `height - 1` rows, this keeps every row of the input.
    let keyName = "__shuffle_key__";
    while (df.columns.includes(keyName)) {
      keyName += "_";
    }
    const keys = pl.Series(
      keyName,
      Array.from({ length: df.height }, () => Math.random()),
    );
    data = df.withColumn(keys).sort(keyName).drop(keyName);
  }

  return sliceK(data, 1 / k, k, ...yFeatures);
}
|