shortcuts/notebooks/regressions.ipynb

576 lines
202 KiB
Plaintext
Raw Normal View History

2024-09-30 23:58:32 +00:00
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
2024-10-04 04:09:41 +00:00
"# Regressions\n"
2024-09-30 23:58:32 +00:00
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"data": {
"text/markdown": [
"![name](
]
},
"execution_count": 1,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"// deno-lint-ignore-file\n",
"import pl from \"npm:nodejs-polars\";\n",
"import plot from \"../plot/mod.ts\";\n",
"\n",
"const data = await Deno.readTextFile(\"assets/X_Y_Sinusoid_Data.csv\");\n",
"const df = pl.readCSV(data, { sep: \",\" });\n",
"\n",
"// Reference curve y = sin(2*pi*x), sampled at x = 0.00, 0.01, ..., 0.99.\n",
"const real = pl.DataFrame({ x: Array.from({ length: 100 }, (_, i) => i / 100) }).select(\n",
"    pl.col('x'),\n",
"    // Math.PI replaces the 3.14 approximation so the reference line is an exact sine.\n",
"    pl.col('x').mul(2 * Math.PI).sin().alias('y')\n",
");\n",
"\n",
"/**\n",
" * Plot a data series against the reference sinusoid.\n",
" *\n",
" * Passes three layers to plot.DrawPlot: the `real` reference line, the (x, y)\n",
" * points as a scatter layer, and a \"trend\" layer over the same points.\n",
" *\n",
" * @param x - X values of the data series.\n",
" * @param y - Y values of the data series.\n",
" * @param title - Plot title; defaults to \"Sinusoid Data\".\n",
" * @returns Whatever plot.DrawPlot returns for the assembled layers.\n",
" */\n",
"const draw = (x, y, title = \"Sinusoid Data\") => {\n",
"    const canvas = { title: title, width: 6, height: 3, XLabel: \"X\", YLabel: \"Y\" };\n",
"    const sinusoidLayer = {\n",
"        type: \"line\",\n",
"        data: [real.x, real.y],\n",
"        legend: \"Sinusoid\",\n",
"        lineDashes: [3, 4],\n",
"        lineColor: \"#ff8888\",\n",
"        lineWidth: 1,\n",
"    };\n",
"    const dataLayer = {\n",
"        type: \"scatter\",\n",
"        data: [x, y],\n",
"        legend: \"Data\",\n",
"        lineDashes: [3, 4],\n",
"        lineWidth: 2,\n",
"        glyphColor: \"#4444ff\",\n",
"        glyphShape: \"circle\",\n",
"    };\n",
"    const trendLayer = {\n",
"        type: \"trend\",\n",
"        data: [x, y],\n",
"        legend: \"Trend\",\n",
"        lineDashes: [4, 2],\n",
"        lineColor: \"#aacccc\",\n",
"        lineWidth: 0.5,\n",
"    };\n",
"    return plot.DrawPlot(canvas, sinusoidLayer, dataLayer, trendLayer);\n",
"};\n",
"\n",
2024-10-01 00:00:01 +00:00
"/**\n",
" * Plot observed values and model predictions on top of the reference sinusoid.\n",
" *\n",
" * Passes three layers to plot.DrawPlot: the `real` reference line, the (x, y)\n",
" * observations, and the (x, predicted) series, the last two as \"linePoints\" layers.\n",
" *\n",
" * @param x - X values shared by the observed and predicted series.\n",
" * @param y - Observed Y values.\n",
" * @param predicted - Model output for the same X values.\n",
" * @returns Whatever plot.DrawPlot returns for the assembled layers.\n",
" */\n",
"const comparePredicted = (x, y, predicted) => {\n",
"    const canvas = { width: 7, height: 4, XLabel: \"X\", YLabel: \"Y\" };\n",
"    const sinusoidLayer = {\n",
"        type: \"line\",\n",
"        data: [real.x, real.y],\n",
"        legend: \"Sinusoid\",\n",
"        lineDashes: [3, 4],\n",
"        lineColor: \"#ff8888\",\n",
"        lineWidth: 1,\n",
"    };\n",
"    const observedLayer = {\n",
"        type: \"linePoints\",\n",
"        data: [x, y],\n",
"        legend: \"Test Data\",\n",
"        lineDashes: [3, 4],\n",
"        lineColor: \"#8888ff\",\n",
"        glyphColor: \"#4444ff\",\n",
"        glyphShape: \"circle\",\n",
"    };\n",
"    const predictedLayer = {\n",
"        type: \"linePoints\",\n",
"        data: [x, predicted],\n",
"        lineWidth: 0.5,\n",
"        legend: \"Predicted\",\n",
"        glyphColor: \"#f00\",\n",
"        glyphShape: \"pyramid\",\n",
"    };\n",
"    return plot.DrawPlot(canvas, sinusoidLayer, observedLayer, predictedLayer);\n",
"};\n",
"\n",
2024-09-30 23:58:32 +00:00
"\n",
"draw(df.x, df.y);"
]
},
2024-10-01 00:00:01 +00:00
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Polynomial Transformation\n",
"\n",
"First we try to predict values without polynomial transformation:"
]
},
2024-09-30 23:58:32 +00:00
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"data": {
"text/markdown": [
2024-10-01 00:00:01 +00:00
"![name](
2024-09-30 23:58:32 +00:00
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import regr from '../regr/mod.ts';\n",
"\n",
2024-10-01 00:00:01 +00:00
"const linregWoPoly = regr.Linear();\n",
2024-09-30 23:58:32 +00:00
"\n",
2024-10-01 00:00:01 +00:00
"linregWoPoly.fit(df.drop('y').rows(), df.select('y').rows());\n",
"const predWoPoly = linregWoPoly.predict(df.drop('y').rows());\n",
2024-09-30 23:58:32 +00:00
"\n",
2024-10-01 00:00:01 +00:00
"comparePredicted(df.x, df.y, predWoPoly);"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Now we augment the dataset with high-degree polynomial features:"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"data": {
"text/markdown": [
"![name](
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import { polynomialTransform } from '../encoding.ts';\n",
2024-09-30 23:58:32 +00:00
"\n",
2024-10-01 00:00:01 +00:00
"\n",
"\n",
"// Expand every column except 'y' with polynomialTransform.\n",
"// NOTE(review): 36 is presumably the polynomial degree; the meaning of the two\n",
"// `false` flags is defined in ../encoding.ts - confirm there.\n",
"const polyFeatures = polynomialTransform(df.drop('y'), 36, false, false);\n",
"\n",
"// Feature matrix and target column shared with the ElasticNet and Lasso cells.\n",
"const polyX = polyFeatures;\n",
"const polyY = df.select('y');\n",
"\n",
"// Fit a plain linear model on the expanded features.\n",
"const linregPoly = regr.Linear();\n",
"linregPoly.fit(polyX.rows(), polyY.rows());\n",
"\n",
"// Predict on the same rows the model was fitted on and plot the result.\n",
"const predY = linregPoly.predict(polyX.rows());\n",
"comparePredicted(df.x, df.y, predY);\n"
2024-09-30 23:58:32 +00:00
]
2024-10-01 02:37:42 +00:00
},
2024-10-01 13:41:13 +00:00
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"\u001b[33m0.9999999469410812\u001b[39m"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"linregPoly.score(df.select('y').rows(), pl.DataFrame({\"py\":predY}).rows());"
]
},
2024-10-01 02:37:42 +00:00
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# ElasticNet"
]
},
{
"cell_type": "code",
2024-10-01 13:41:13 +00:00
"execution_count": 5,
2024-10-01 02:37:42 +00:00
"metadata": {},
"outputs": [
{
"data": {
"text/markdown": [
"![name](
]
},
2024-10-01 13:41:13 +00:00
"execution_count": 5,
2024-10-01 02:37:42 +00:00
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"const elasticNetPoly = regr.ElasticNet(1000, 0.0001);\n",
"elasticNetPoly.fit(polyX.rows(), polyY.rows());\n",
"const predEnetY = elasticNetPoly.predict(polyX.rows());\n",
"\n",
"comparePredicted(df.x, df.y, predEnetY);"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Lasso"
]
},
{
"cell_type": "code",
2024-10-01 13:41:13 +00:00
"execution_count": 6,
2024-10-01 02:37:42 +00:00
"metadata": {},
"outputs": [
{
"data": {
"text/markdown": [
"![name](
]
},
2024-10-01 13:41:13 +00:00
"execution_count": 6,
2024-10-01 02:37:42 +00:00
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"const lassoPoly = regr.Lasso(1000, 0.0001);\n",
"lassoPoly.fit(polyX.rows(), polyY.rows());\n",
"const predLassoY = lassoPoly.predict(polyX.rows());\n",
"\n",
"comparePredicted(df.x, df.y, predLassoY);"
]
2024-10-01 13:41:13 +00:00
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"\u001b[33m0.8637104981289901\u001b[39m"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"lassoPoly.score(df.select('y').rows(), pl.DataFrame({\"py\":predLassoY}).rows());\n"
]
2024-10-04 04:09:41 +00:00
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Logistic Regression\n",
"\n",
"Logistic regression is applicable for classification problems when there are linear relationships in the data.\n",
"\n",
"As an example, we'll use a simple linear pattern for predictions:"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.dataresource+json": {
"bytes": null,
"data": [
{
"is_even": 1,
"x": 0,
"x2": -0.9906467224227306,
"x3": 0.00261179419023964
},
{
"is_even": 0,
"x": 1,
"x2": 0.00202764718654894,
"x3": 0.0013126936809724655
},
{
"is_even": 1,
"x": 2,
"x2": -0.9972687463767552,
"x3": -1.9905576758692132
},
{
"is_even": 0,
"x": 3,
"x2": 0.009024117668298463,
"x3": 1.0021275936361942
},
{
"is_even": 1,
"x": 4,
"x2": -0.9968599379688912,
"x3": -0.9947633704461447
},
{
"is_even": 0,
"x": 5,
"x2": 0.0019298798125454942,
"x3": -0.9913865823439642
},
{
"is_even": 1,
"x": 6,
"x2": -0.9967558047825875,
"x3": 0.00801993440722967
},
{
"is_even": 0,
"x": 7,
"x2": 0.007736464312311071,
"x3": 0.00034330022067074583
},
{
"is_even": 1,
"x": 8,
"x2": -0.9959077406033643,
"x3": -1.994011690184521
},
{
"is_even": 0,
"x": 9,
"x2": 0.002684616942261051,
"x3": 1.0072230674765243
}
],
"description": null,
"dialect": null,
"encoding": null,
"format": null,
"hash": null,
"homepage": null,
"licenses": null,
"mediatype": null,
"path": null,
"schema": {
"fields": [
{
"constraints": null,
"description": null,
"example": null,
"format": null,
"name": "x",
"rdfType": null,
"title": null,
"type": "number"
},
{
"constraints": null,
"description": null,
"example": null,
"format": null,
"name": "x2",
"rdfType": null,
"title": null,
"type": "number"
},
{
"constraints": null,
"description": null,
"example": null,
"format": null,
"name": "x3",
"rdfType": null,
"title": null,
"type": "number"
},
{
"constraints": null,
"description": null,
"example": null,
"format": null,
"name": "is_even",
"rdfType": null,
"title": null,
"type": "number"
}
],
"foreignKeys": null,
"missingValues": null,
"primaryKey": null
},
"sources": null,
"title": null
},
"text/html": [
"<table><thead><tr><th>x</th><th>x2</th><th>x3</th><th>is_even</th></tr></thead><tbody><tr><td>0</td><td>-0.9906467224227306</td><td>0.00261179419023964</td><td>1</td></tr><tr><td>1</td><td>0.00202764718654894</td><td>0.0013126936809724655</td><td>0</td></tr><tr><td>2</td><td>-0.9972687463767552</td><td>-1.9905576758692132</td><td>1</td></tr><tr><td>3</td><td>0.009024117668298463</td><td>1.0021275936361942</td><td>0</td></tr><tr><td>4</td><td>-0.9968599379688912</td><td>-0.9947633704461447</td><td>1</td></tr><tr><td>5</td><td>0.0019298798125454942</td><td>-0.9913865823439642</td><td>0</td></tr><tr><td>6</td><td>-0.9967558047825875</td><td>0.00801993440722967</td><td>1</td></tr><tr><td>7</td><td>0.007736464312311071</td><td>0.00034330022067074583</td><td>0</td></tr><tr><td>8</td><td>-0.9959077406033643</td><td>-1.994011690184521</td><td>1</td></tr><tr><td>9</td><td>0.002684616942261051</td><td>1.0072230674765243</td><td>0</td></tr></tbody></table>"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"// Synthetic classification set with 100 rows:\n",
"//   x  = row index 0..99\n",
"//   x2 = i % 2 - 1, plus uniform noise in [0, 0.01)\n",
"//   x3 = i % 2 - i % 3, plus uniform noise in [0, 0.01)\n",
"const clsDf = pl\n",
"    .DataFrame({\n",
"        x: Array.from({ length: 100 }, (_, idx) => idx),\n",
"        x2: Array.from({ length: 100 }, (_, idx) => (idx % 2) - 1 + Math.random() / 100),\n",
"        x3: Array.from({ length: 100 }, (_, idx) => (idx % 2) - (idx % 3) + Math.random() / 100),\n",
"    })\n",
"    .select(\n",
"        pl.all(),\n",
"        // Target column: (x mod 2 == 0) plus 0, which shows up as numeric 1/0 in the output.\n",
"        pl.col('x').modulo(2).eq(0).add(0).alias('is_even'),\n",
"    );\n",
"clsDf.head(10);"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Notice that I left `x` in the training data. It is a continuous sequence that this model can't generalize from; however, the model should still guess the correct class in most cases."
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {
"tags": [
"hide_code",
"parameters"
]
},
"outputs": [],
"source": [
"/**\n",
" * Scatter a series of points over the true parity/label pairs taken from clsDf.\n",
" *\n",
" * Both x series are reduced modulo 2 before plotting, and every y1 value is\n",
" * shifted by 0.02 * index (presumably to keep overlapping points visible).\n",
" *\n",
" * @param x1 - X values of the series to overlay.\n",
" * @param y1 - Y values of the series to overlay; the caller's array is not modified.\n",
" * @param t - Legend text for the overlaid series; defaults to \"Example\".\n",
" * @returns Whatever plot.DrawPlot returns for the two scatter layers.\n",
" */\n",
"const drawBin = (x1, y1, t = \"Example\") => {\n",
"    // Reference layer: parity of every x in clsDf against its true label.\n",
"    const parityAll = clsDf.x.toArray().map((value) => value % 2);\n",
"    const labelsAll = clsDf.is_even.toArray();\n",
"    // Overlay layer: offset y per index, fold x down to its parity.\n",
"    const ySpread = y1.map((value, idx) => value + idx * 0.02);\n",
"    const xParity = x1.map((value) => value % 2);\n",
"\n",
"    const canvas = { title: \"\", width: 2.5, height: 2, XLabel: \"X\", YLabel: \"Y\" };\n",
"    // NOTE(review): \"glypRadius\" may be a typo for \"glyphRadius\" (cf. glyphColor /\n",
"    // glyphShape) - confirm against ../plot/mod.ts before renaming.\n",
"    const truthLayer = {\n",
"        type: \"scatter\",\n",
"        data: [parityAll, labelsAll],\n",
"        lineDashes: [3, 4],\n",
"        glypRadius: 12,\n",
"        glyphColor: \"#00f\",\n",
"        glyphShape: \"ring\",\n",
"    };\n",
"    const overlayLayer = {\n",
"        type: \"scatter\",\n",
"        data: [xParity, ySpread],\n",
"        legend: t,\n",
"        lineDashes: [3, 4],\n",
"        glypRadius: 3,\n",
"        glyphColor: \"#f00\",\n",
"        glyphShape: \"ring\",\n",
"    };\n",
"    return plot.DrawPlot(canvas, truthLayer, overlayLayer);\n",
"};\n"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"import {trainTestSplit} from \"../split.ts\";\n",
"\n",
"\n",
"// Split clsDf into train and test parts around the 'is_even' column.\n",
"// NOTE(review): 0.05 is presumably the test fraction (the printed test set has 5 of\n",
"// 100 rows) and `true` presumably enables shuffling - confirm in ../split.ts.\n",
"const {\n",
"    testX,\n",
"    trainX,\n",
"    testY,\n",
"    trainY,\n",
"} = trainTestSplit(clsDf, 0.05, true, \"is_even\");\n",
"\n",
"// Convenience wrapper: plot the held-out labels with drawBin.\n",
"const drawTestBin = () => {\n",
"    return drawBin(testX.x.toArray(), testY.is_even.toArray(), \"Test Data\");\n",
"};\n"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"data": {
"text/markdown": [
"![name](
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"// Logistic model configured with 5000 epochs and a learning rate of 0.001.\n",
"const logreg = regr.Logistic({ epochs: 5000, learningRate: 0.001 });\n",
"logreg.fit(trainX.rows(), trainY.rows());\n",
"\n",
"const predLogReg = logreg.predict(testX.rows());\n",
"\n",
"// Keep the first entry of every prediction row.\n",
"const yPred1 = predLogReg.map((row) => row[0]);\n",
"\n",
"drawBin(testX.x.toArray(), yPred1, \"Predicted\");"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {
"tags": [
"hide_code",
"parameters"
]
},
"outputs": [
{
"data": {
"text/markdown": [
"![name](
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"drawTestBin()"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[ 0, 1, 1, 0, 1 ] should be [ 0, 1, 1, 0, 1 ]\n"
]
}
],
"source": [
"console.log(yPred1, \"should be\", testY.is_even.toArray());"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"\u001b[33m1.6635532311438688\u001b[39m"
]
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"logreg.loss();"
]
2024-09-30 23:58:32 +00:00
}
],
"metadata": {
"kernelspec": {
"display_name": "Deno",
"language": "typescript",
"name": "deno"
},
"language_info": {
"codemirror_mode": "typescript",
"file_extension": ".ts",
"mimetype": "text/x.typescript",
"name": "typescript",
"nbconvert_exporter": "script",
"pygments_lexer": "typescript",
"version": "5.6.2"
}
},
"nbformat": 4,
"nbformat_minor": 2
}