2024-09-29 14:32:23 +00:00
|
|
|
// deno-lint-ignore-file no-explicit-any
|
2024-09-25 19:19:10 +00:00
|
|
|
import pl from "npm:nodejs-polars";
|
|
|
|
|
2024-09-27 11:20:19 +00:00
|
|
|
/**
 * One-hot encode every non-numeric column of a data frame.
 *
 * Columns are visited in their original order. A column whose series reports
 * `isNumeric()` is copied through unchanged; any other column is replaced by
 * the dummy columns returned from `toDummies()`.
 *
 * @param dataframe Frame to encode; it is only read, never modified here.
 * @returns A new frame assembled by horizontally stacking the per-column results.
 */
export function oneHotEncoding(dataframe: pl.DataFrame): pl.DataFrame {
  let encoded = pl.DataFrame();
  for (const columnName of dataframe.columns) {
    // Typed accessor instead of `(dataframe as any)[columnName]`, so the
    // series methods below are checked by the compiler.
    const column = dataframe.getColumn(columnName);
    const piece = column.isNumeric()
      ? dataframe.select(columnName)
      : column.toDummies();
    encoded = encoded.hstack(piece);
  }
  return encoded;
}
|
2024-09-27 11:20:19 +00:00
|
|
|
|
|
|
|
/**
 * Apply `polynomialFeatures` to every row of a data frame.
 *
 * Each row is handed to `polynomialFeatures` together with the given options,
 * and the expanded rows are turned back into a frame with `pl.readRecords`.
 *
 * @param dataframe Source frame; rows are assumed to hold numbers only.
 * @param degree Forwarded to `polynomialFeatures`.
 * @param interaction_only Forwarded to `polynomialFeatures`.
 * @param include_bias Forwarded to `polynomialFeatures`.
 * @returns Frame built from the expanded rows.
 */
export function polynomialTransform(
  dataframe: pl.DataFrame,
  degree = 2,
  interaction_only = false,
  include_bias = true,
): pl.DataFrame {
  const expandRow = (row: number[]): number[] =>
    polynomialFeatures(row, degree, interaction_only, include_bias);
  const expandedRows: number[][] = dataframe.map(expandRow);
  return pl.readRecords(expandedRows);
}
|
|
|
|
|
|
|
|
/**
 * Expand a single sample into its polynomial feature vector.
 *
 * Output layout: an optional leading bias `1`, then the degree-1 terms (a copy
 * of `X`), then every degree-2 product, then every degree-3 product, and so on
 * up to `degree`. Within one degree, terms are grouped by their leading
 * feature (`x0 * ...` first, then `x1 * ...`, ...), and each monomial appears
 * exactly once.
 *
 * @param X Feature values of one sample. Not modified.
 * @param degree Highest total degree to generate; values below 2 add no products.
 * @param interaction_only When true, only products of distinct features are
 *   generated (no powers such as `x0 * x0`).
 * @param include_bias When true, a constant `1` is placed at index 0.
 * @returns The expanded feature vector.
 */
export function polynomialFeatures(
  X: number[],
  degree = 2,
  interaction_only = false,
  include_bias = true,
): number[] {
  const width = X.length;
  const features = include_bias ? [1, ...X] : [...X];

  // Terms of the previous degree, plus for each feature i the offset in that
  // chunk where the terms led by feature i begin (starts[width] === chunk end).
  let prevChunk = [...X];
  let starts = Array.from({ length: width + 1 }, (_, i) => i);

  for (let d = 1; d < degree; d++) {
    const chunk: number[] = [];
    const nextStarts: number[] = [];
    for (let i = 0; i < width; i++) {
      nextStarts.push(chunk.length);
      // Multiply x_i only with previous-degree terms whose leading feature is
      // >= i (or > i for interactions); anything earlier would duplicate a
      // monomial already produced by a smaller i.
      const from = interaction_only ? starts[i + 1] : starts[i];
      for (let j = from; j < prevChunk.length; j++) {
        const term = X[i] * prevChunk[j];
        chunk.push(term);
        features.push(term);
      }
    }
    nextStarts.push(chunk.length);
    prevChunk = chunk;
    starts = nextStarts;
  }

  return features;
}
|
|
|
|
|
|
|
|
/**
 * Sort a frame by `feature` and add rows where neighbouring values of
 * `feature` are more than `interval` apart; added rows get their values from
 * a mean fill.
 *
 * Usage:
 * ```ts
 * const augmented = augmentMeanForward("price", df, 100);
 * ```
 *
 * @param feature Name of the column used for sorting and for measuring gaps.
 * @param df Input frame; a sorted copy is taken via `df.sort(feature)`.
 * @param interval Largest difference between neighbours that is left unfilled.
 * @returns A frame built by concatenation, starting from the first sorted row.
 */
export function augmentMeanForward(
  feature: string,
  df: pl.DataFrame,
  interval = 100,
) {
  const sorted = df.sort(feature);
  const featIdx = sorted.findIdxByName(feature);
  const rowCount = sorted.height;

  let result = sorted.head(1);
  for (let i = 0; i < rowCount; i++) {
    // The neighbour index wraps around, so the last row is compared with row 0.
    const k = (i + 1) % rowCount;
    const p1 = sorted.row(i).at(featIdx);
    const p2 = sorted.row(k).at(featIdx);
    const gap = p2 - p1;

    if (gap > interval) {
      const fillCount = Math.round(gap / interval) - 1;
      const nextRow = sorted.slice({ offset: k, length: 1 });
      // presumably a single all-null row used as the slot to be filled — TODO confirm
      const blankRow = sorted.head(1).shift(-1);
      for (let j = 0; j < fillCount; j++) {
        // Stack [last result row, next row, blank row], mean-fill the nulls,
        // and keep only the filled last row.
        const filled = pl
          .concat([result.tail(1), nextRow, blankRow])
          .fillNull("mean")
          .tail(1);
        result = pl.concat([result, filled]);
      }
    } else {
      // NOTE(review): `slice(1, i)` passes 1 and i positionally; if these are
      // (offset, length) this appends rows 1..i on every pass — confirm intent.
      result = pl.concat([result, sorted.slice(1, i)]);
    }
  }
  return result;
}
|