+ plots, + enc

This commit is contained in:
Anton Nesterov 2024-09-27 13:20:19 +02:00
parent 43e21e87a7
commit 5d38fb02f9
No known key found for this signature in database
GPG key ID: 59121E8AE2851FB5
4 changed files with 136 additions and 8 deletions

View file

@ -27,7 +27,7 @@
],
"source": [
"const document = null;\n",
"import * as p from \"https://l12.xyz/x/shortcuts/raw/plots.ts\"\n"
"import * as p from \"https://l12.xyz/x/shortcuts/raw/plots.ts\";"
]
}
],

View file

@ -1,6 +1,6 @@
import pl from "npm:nodejs-polars";
export function oneHotEncoding(dataframe) {
export function oneHotEncoding(dataframe: pl.DataFrame): pl.DataFrame {
let df = pl.DataFrame();
for (const columnName of dataframe.columns) {
const column = dataframe[columnName];
@ -12,3 +12,85 @@ export function oneHotEncoding(dataframe) {
}
return df;
}
export function polynomialTransform(
dataframe: pl.DataFrame,
degree = 2,
interaction_only = false,
include_bias = true,
): pl.DataFrame {
let polyRecords: number[][] = [];
dataframe.map((X: number[]) => {
polyRecords.push(
polynomialFeatures(X, degree, interaction_only, include_bias),
);
});
return pl.readRecords(polyRecords);
}
export function polynomialFeatures(
X: number[],
degree = 2,
interaction_only = false,
include_bias = true,
): number[] {
let features = [...X];
let prev_chunk = [...X];
const indices = Array.from({ length: X.length }, (_, i) => i);
for (let d = 1; d < degree; d++) {
const new_chunk: any[] = [];
for (let i = 0; i < (interaction_only ? X.length - d : X.length); i++) {
const v = X[i];
const next_index = new_chunk.length;
for (let j = i + (interaction_only ? 1 : 0); j < prev_chunk.length; j++) {
new_chunk.push(v * prev_chunk[j]);
}
indices[i] = next_index;
}
features = features.concat(new_chunk);
prev_chunk = new_chunk;
}
if (include_bias) {
features.unshift(1);
}
return features;
}
/**
* Adds missing rows at given interval, uses mean of previous and next value.
* Example for one feature: [1, 2, 4, 5] -> [1, 1.5, 2, 2.5, 3, 3.5, 4, 4.5, 5]
* @param feature
* @param df
* @param bin
*/
export function augmentMeanForward(
feature: string,
df: pl.DataFrame,
interval = 100,
) {
let sorted = df.sort(feature);
let result = sorted.head(0);
let n: null | number = null;
for (let i = 0; i < sorted.height - 1; i++) {
let p1 = n ?? sorted.row(i).at(-1);
let p2 = sorted.row(i + 1).at(-1);
if (p2 - p1 > interval) {
let avg = (p1 + p2) / 2;
result = pl.concat([
result,
pl.concat([result.tail(2), sorted.slice({ offset: i + 1, length: 2 })])
.shift(-1)
.fillNull("mean")
.tail(1),
]);
if (p2 - avg > interval) {
i--;
n = avg;
continue;
}
result = pl.concat([result, sorted.slice(1, i)]);
n = null;
}
}
return result;
}

11
expr.ts
View file

@ -12,9 +12,8 @@ export const fillzero = (
value = 0.0001,
) => (pl.all().replaceStrict(0, value, pl.all()));
export const ScaleExpr: pl.Expr = (pl.all().minus(pl.all().min())).div(
pl.all().max().minus(pl.all().min()),
);
export const StdNormExpr: pl.Expr = pl.all().minus(pl.all().mean()).div(
pl.all().std(),
);
export const minmaxScale = (col: pl.Expr) =>
(col.minus(col.min())).div(col.max().minus(col.min()));
export const standardScale = (col: pl.Expr) =>
(col.minus(col.mean())).div(col.std());

View file

@ -208,3 +208,50 @@ export function threeChart(data: any[], x = "column", opts = { width: 800 }) {
}),
};
}
export function distPlot(...data: number[][]) {
const colors = [
"red",
"blue",
"green",
"orange",
"purple",
"brown",
"pink",
"gray",
"black",
"cyan",
"magenta",
"yellow",
"lightblue",
"lightgreen",
"lightgray",
];
const plt = Plot.plot({
margin: 50,
width: 1200,
marks: [
Plot.hexgrid(),
data.map((a, i) =>
Plot.areaY(
a,
Plot.binX({ y: "count" }, {
x: (x: number) => x,
fill: colors[i % colors.length],
curve: "catmull-rom",
fillOpacity: .5,
}),
)
),
],
document,
});
plt.setAttribute("xmlns", "http://www.w3.org/2000/svg");
return {
[Symbol.for("Jupyter.display")]: () => ({
"text/html": `<img src='data:image/svg+xml;base64,${
btoa(unescape(encodeURIComponent(plt)))
}'>`,
}),
};
}