{ "cells": [ { "cell_type": "markdown", "id": "b9db28d4-ea49-443c-a5d6-e1c44bfe4942", "metadata": { "editable": true, "slideshow": { "slide_type": "" }, "tags": [] }, "source": [ "# Data Analytics with JS\n", "\n", "The dataset contains all the information about cars, a name of a manufacturer,\n", "all car's technical parameters and a sale price of a car.\n", "\n", "Libraries:\n", "\n", "- nodejs-polars\n", "- @observable/plot" ] }, { "cell_type": "markdown", "id": "d163c580-6aa0-4e8c-a780-5cc931003dc8", "metadata": { "editable": true, "slideshow": { "slide_type": "" }, "tags": [] }, "source": [ "## Exploring Data\n", "\n", "Use [pola-rs](https://pola-rs.github.io/nodejs-polars/modules.html) dataframes\n", "to read and manipulate data.\n", "\n", "- use `df.head(n)` to get n first rows\n", "- use `df.sample(n)` to get n random rows from the dataset\n", "- use `df.describe()` to get mean, std, min, max\n", "- use `df.select(...cols)` to get new dataframe for columns\n", "- use `df.groupBy(..cols).agg()` to group data and get aggragates" ] }, { "cell_type": "code", "execution_count": 1, "id": "1cb3589a-bffe-422a-8e32-a853161b93c4", "metadata": { "editable": true, "slideshow": { "slide_type": "" }, "tags": [] }, "outputs": [ { "data": { "application/vnd.dataresource+json": { "bytes": null, "data": [ { "CarName": "alfa-romero giulia", "aspiration": "std", "boreratio": 3.47, "car_ID": 1, "carbody": "convertible", "carheight": 48.8, "carlength": 168.8, "carwidth": 64.1, "citympg": 21, "compressionratio": 9, "curbweight": 2548, "cylindernumber": "four", "doornumber": "two", "drivewheel": "rwd", "enginelocation": "front", "enginesize": 130, "enginetype": "dohc", "fuelsystem": "mpfi", "fueltype": "gas", "highwaympg": 27, "horsepower": 111, "peakrpm": 5000, "price": 13495, "stroke": 2.68, "symboling": 3, "wheelbase": 88.6 }, { "CarName": "alfa-romero stelvio", "aspiration": "std", "boreratio": 3.47, "car_ID": 2, "carbody": "convertible", "carheight": 48.8, "carlength": 168.8, 
"carwidth": 64.1, "citympg": 21, "compressionratio": 9, "curbweight": 2548, "cylindernumber": "four", "doornumber": "two", "drivewheel": "rwd", "enginelocation": "front", "enginesize": 130, "enginetype": "dohc", "fuelsystem": "mpfi", "fueltype": "gas", "highwaympg": 27, "horsepower": 111, "peakrpm": 5000, "price": 16500, "stroke": 2.68, "symboling": 3, "wheelbase": 88.6 }, { "CarName": "alfa-romero Quadrifoglio", "aspiration": "std", "boreratio": 2.68, "car_ID": 3, "carbody": "hatchback", "carheight": 52.4, "carlength": 171.2, "carwidth": 65.5, "citympg": 19, "compressionratio": 9, "curbweight": 2823, "cylindernumber": "six", "doornumber": "two", "drivewheel": "rwd", "enginelocation": "front", "enginesize": 152, "enginetype": "ohcv", "fuelsystem": "mpfi", "fueltype": "gas", "highwaympg": 26, "horsepower": 154, "peakrpm": 5000, "price": 16500, "stroke": 3.47, "symboling": 1, "wheelbase": 94.5 }, { "CarName": "audi 100 ls", "aspiration": "std", "boreratio": 3.19, "car_ID": 4, "carbody": "sedan", "carheight": 54.3, "carlength": 176.6, "carwidth": 66.2, "citympg": 24, "compressionratio": 10, "curbweight": 2337, "cylindernumber": "four", "doornumber": "four", "drivewheel": "fwd", "enginelocation": "front", "enginesize": 109, "enginetype": "ohc", "fuelsystem": "mpfi", "fueltype": "gas", "highwaympg": 30, "horsepower": 102, "peakrpm": 5500, "price": 13950, "stroke": 3.4, "symboling": 2, "wheelbase": 99.8 }, { "CarName": "audi 100ls", "aspiration": "std", "boreratio": 3.19, "car_ID": 5, "carbody": "sedan", "carheight": 54.3, "carlength": 176.6, "carwidth": 66.4, "citympg": 18, "compressionratio": 8, "curbweight": 2824, "cylindernumber": "five", "doornumber": "four", "drivewheel": "4wd", "enginelocation": "front", "enginesize": 136, "enginetype": "ohc", "fuelsystem": "mpfi", "fueltype": "gas", "highwaympg": 22, "horsepower": 115, "peakrpm": 5500, "price": 17450, "stroke": 3.4, "symboling": 2, "wheelbase": 99.4 } ], "description": null, "dialect": null, "encoding": null, 
"format": null, "hash": null, "homepage": null, "licenses": null, "mediatype": null, "path": null, "schema": { "fields": [ { "constraints": null, "description": null, "example": null, "format": null, "name": "car_ID", "rdfType": null, "title": null, "type": "integer" }, { "constraints": null, "description": null, "example": null, "format": null, "name": "symboling", "rdfType": null, "title": null, "type": "integer" }, { "constraints": null, "description": null, "example": null, "format": null, "name": "CarName", "rdfType": null, "title": null, "type": "string" }, { "constraints": null, "description": null, "example": null, "format": null, "name": "fueltype", "rdfType": null, "title": null, "type": "string" }, { "constraints": null, "description": null, "example": null, "format": null, "name": "aspiration", "rdfType": null, "title": null, "type": "string" }, { "constraints": null, "description": null, "example": null, "format": null, "name": "doornumber", "rdfType": null, "title": null, "type": "string" }, { "constraints": null, "description": null, "example": null, "format": null, "name": "carbody", "rdfType": null, "title": null, "type": "string" }, { "constraints": null, "description": null, "example": null, "format": null, "name": "drivewheel", "rdfType": null, "title": null, "type": "string" }, { "constraints": null, "description": null, "example": null, "format": null, "name": "enginelocation", "rdfType": null, "title": null, "type": "string" }, { "constraints": null, "description": null, "example": null, "format": null, "name": "wheelbase", "rdfType": null, "title": null, "type": "number" }, { "constraints": null, "description": null, "example": null, "format": null, "name": "carlength", "rdfType": null, "title": null, "type": "number" }, { "constraints": null, "description": null, "example": null, "format": null, "name": "carwidth", "rdfType": null, "title": null, "type": "number" }, { "constraints": null, "description": null, "example": null, "format": 
null, "name": "carheight", "rdfType": null, "title": null, "type": "number" }, { "constraints": null, "description": null, "example": null, "format": null, "name": "curbweight", "rdfType": null, "title": null, "type": "integer" }, { "constraints": null, "description": null, "example": null, "format": null, "name": "enginetype", "rdfType": null, "title": null, "type": "string" }, { "constraints": null, "description": null, "example": null, "format": null, "name": "cylindernumber", "rdfType": null, "title": null, "type": "string" }, { "constraints": null, "description": null, "example": null, "format": null, "name": "enginesize", "rdfType": null, "title": null, "type": "integer" }, { "constraints": null, "description": null, "example": null, "format": null, "name": "fuelsystem", "rdfType": null, "title": null, "type": "string" }, { "constraints": null, "description": null, "example": null, "format": null, "name": "boreratio", "rdfType": null, "title": null, "type": "number" }, { "constraints": null, "description": null, "example": null, "format": null, "name": "stroke", "rdfType": null, "title": null, "type": "number" }, { "constraints": null, "description": null, "example": null, "format": null, "name": "compressionratio", "rdfType": null, "title": null, "type": "number" }, { "constraints": null, "description": null, "example": null, "format": null, "name": "horsepower", "rdfType": null, "title": null, "type": "integer" }, { "constraints": null, "description": null, "example": null, "format": null, "name": "peakrpm", "rdfType": null, "title": null, "type": "integer" }, { "constraints": null, "description": null, "example": null, "format": null, "name": "citympg", "rdfType": null, "title": null, "type": "integer" }, { "constraints": null, "description": null, "example": null, "format": null, "name": "highwaympg", "rdfType": null, "title": null, "type": "integer" }, { "constraints": null, "description": null, "example": null, "format": null, "name": "price", 
"rdfType": null, "title": null, "type": "number" } ], "foreignKeys": null, "missingValues": null, "primaryKey": null }, "sources": null, "title": null }, "text/html": [ "
13alfa-romero giuliagasstdtwoconvertiblerwdfront88.6168.864.148.82548dohcfour130mpfi3.472.6891115000212713495
23alfa-romero stelviogasstdtwoconvertiblerwdfront88.6168.864.148.82548dohcfour130mpfi3.472.6891115000212716500
31alfa-romero Quadrifogliogasstdtwohatchbackrwdfront94.5171.265.552.42823ohcvsix152mpfi2.683.4791545000192616500
42audi 100 lsgasstdfoursedanfwdfront99.8176.666.254.32337ohcfour109mpfi3.193.4101025500243013950
52audi 100lsgasstdfoursedan4wdfront99.4176.666.454.32824ohcfive136mpfi3.193.481155500182217450
" ] }, "execution_count": 1, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import { display } from \"https://deno.land/x/display@v0.1.1/mod.ts\";\n", "import { Plot } from \"https://l12.xyz/x/shortcuts/raw/plots.ts\";\n", "import pl from \"npm:nodejs-polars\";\n", "\n", "let data = await Deno.readTextFile(\"assets/CarPrice_Assignment.csv\");\n", "let df = pl.readCSV(data, { sep: \",\" });\n", "\n", "await display(df.head(5));" ] }, { "cell_type": "code", "execution_count": 2, "id": "a82e43fd-04a6-4bc8-a752-3b03abd6e983", "metadata": {}, "outputs": [ { "data": { "application/vnd.dataresource+json": { "bytes": null, "data": [ { "describe": "mean", "enginesize": 126.90731707317073, "horsepower": 104.1170731707317, "price": 13276.710570731706 }, { "describe": "std", "enginesize": 41.642693438179855, "horsepower": 39.54416680936116, "price": 7988.852331743148 }, { "describe": "min", "enginesize": 61, "horsepower": 48, "price": 5118 }, { "describe": "max", "enginesize": 326, "horsepower": 288, "price": 45400 }, { "describe": "median", "enginesize": 120, "horsepower": 95, "price": 10295 } ], "description": null, "dialect": null, "encoding": null, "format": null, "hash": null, "homepage": null, "licenses": null, "mediatype": null, "path": null, "schema": { "fields": [ { "constraints": null, "description": null, "example": null, "format": null, "name": "describe", "rdfType": null, "title": null, "type": "string" }, { "constraints": null, "description": null, "example": null, "format": null, "name": "enginesize", "rdfType": null, "title": null, "type": "number" }, { "constraints": null, "description": null, "example": null, "format": null, "name": "horsepower", "rdfType": null, "title": null, "type": "number" }, { "constraints": null, "description": null, "example": null, "format": null, "name": "price", "rdfType": null, "title": null, "type": "number" } ], "foreignKeys": null, "missingValues": null, "primaryKey": null }, "sources": null, "title": null }, 
"text/html": [ "
" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "await display(\n", " df.select(\n", " \"enginesize\",\n", " \"horsepower\",\n", " \"price\",\n", " ).describe(),\n", ");" ] }, { "cell_type": "markdown", "id": "5c931435-0e19-4d32-8ca8-ae81a9cb43c8", "metadata": {}, "source": [ "## Data Cleaning" ] }, { "cell_type": "code", "execution_count": 3, "id": "e70e5f22-bdde-4140-9015-a6281b8478bf", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "\u001b[33mfalse\u001b[39m" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "// check for duplicates\n", "const hasDups = df.select(\"car_ID\").isDuplicated().toArray().includes(true);\n", "// if there are duplicates, use df.filter()\n", "hasDups;" ] }, { "cell_type": "markdown", "id": "7fb89a88-f3ad-457d-9211-2b75eef6096f", "metadata": {}, "source": [ "Resolve categorical values. For this dataset it is a brand name:" ] }, { "cell_type": "code", "execution_count": 4, "id": "ffb4509e-1930-4e2e-954c-51e919084514", "metadata": { "scrolled": true }, "outputs": [ { "data": { "application/vnd.dataresource+json": { "bytes": null, "data": [ { "brand": "toyota" }, { "brand": "mercury" }, { "brand": "mitsubishi" }, { "brand": "honda" }, { "brand": "mazda" }, { "brand": "peugeot" }, { "brand": "porcshce" }, { "brand": "renault" }, { "brand": "isuzu" }, { "brand": "dodge" }, { "brand": "bmw" }, { "brand": "vw" }, { "brand": "maxda" }, { "brand": "volkswagen" }, { "brand": "alfa-romero" }, { "brand": "chevrolet" }, { "brand": "toyouta" }, { "brand": "jaguar" }, { "brand": "saab" }, { "brand": "porsche" }, { "brand": "audi" }, { "brand": "vokswagen" }, { "brand": "subaru" }, { "brand": "nissan" }, { "brand": "plymouth" }, { "brand": "volvo" }, { "brand": "buick" } ], "description": null, "dialect": null, "encoding": null, "format": null, "hash": null, "homepage": null, "licenses": null, "mediatype": null, "path": null, "schema": { "fields": [ { 
"constraints": null, "description": null, "example": null, "format": null, "name": "brand", "rdfType": null, "title": null, "type": "string" } ], "foreignKeys": null, "missingValues": null, "primaryKey": null }, "sources": null, "title": null }, "text/html": [ "
" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "// get brand names from `CarName`\n", "let brandNameTable = df.select(\"CarName\").map((row) => {\n", " const [carName] = row;\n", " const brand = carName.split(\" \")[0].toLowerCase();\n", " return brand;\n", "});\n", "\n", "// create a dataframe from brand names\n", "let brandDf = pl.DataFrame({\n", " \"brand\": brandNameTable,\n", "});\n", "await display(brandDf.unique());" ] }, { "cell_type": "code", "execution_count": 5, "id": "7f91cbc3-364c-49c9-97e3-370f78b708f8", "metadata": { "scrolled": true }, "outputs": [ { "data": { "application/vnd.dataresource+json": { "bytes": null, "data": [ { "brand": "nissan" }, { "brand": "toyota" }, { "brand": "isuzu" }, { "brand": "plymouth" }, { "brand": "subaru" }, { "brand": "audi" }, { "brand": "renault" }, { "brand": "honda" }, { "brand": "dodge" }, { "brand": "bmw" }, { "brand": "volvo" }, { "brand": "peugeot" }, { "brand": "volkswagen" }, { "brand": "chevrolet" }, { "brand": "mitsubishi" }, { "brand": "buick" }, { "brand": "alfa-romero" }, { "brand": "porsche" }, { "brand": "mazda" }, { "brand": "saab" }, { "brand": "jaguar" }, { "brand": "mercury" } ], "description": null, "dialect": null, "encoding": null, "format": null, "hash": null, "homepage": null, "licenses": null, "mediatype": null, "path": null, "schema": { "fields": [ { "constraints": null, "description": null, "example": null, "format": null, "name": "brand", "rdfType": null, "title": null, "type": "string" } ], "foreignKeys": null, "missingValues": null, "primaryKey": null }, "sources": null, "title": null }, "text/html": [ "
" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "// transform to remove duplicates\n", "brandNameTable = brandNameTable.map((name) => {\n", " name = name\n", " .replace(\"maxda\", \"mazda\")\n", " .replace(\"porcshce\", \"porsche\")\n", " .replace(\"toyouta\", \"toyota\")\n", " .replace(/(vw|vokswagen)/ig, \"volkswagen\");\n", " return name;\n", "});\n", "\n", "brandDf = pl.DataFrame({\n", " brand: brandNameTable,\n", "});\n", "await display(brandDf.unique());" ] }, { "cell_type": "code", "execution_count": 6, "id": "eb19de54-11bb-40af-8e27-e5e3a6bd11f0", "metadata": {}, "outputs": [ { "data": { "application/vnd.dataresource+json": { "bytes": null, "data": [ { "CarName": "alfa-romero giulia", "aspiration": "std", "boreratio": 3.47, "brand": "alfa-romero", "car_ID": 1, "carbody": "convertible", "carheight": 48.8, "carlength": 168.8, "carwidth": 64.1, "citympg": 21, "compressionratio": 9, "curbweight": 2548, "cylindernumber": "four", "doornumber": "two", "drivewheel": "rwd", "enginelocation": "front", "enginesize": 130, "enginetype": "dohc", "fuelsystem": "mpfi", "fueltype": "gas", "highwaympg": 27, "horsepower": 111, "peakrpm": 5000, "price": 13495, "stroke": 2.68, "symboling": 3, "wheelbase": 88.6 }, { "CarName": "alfa-romero stelvio", "aspiration": "std", "boreratio": 3.47, "brand": "alfa-romero", "car_ID": 2, "carbody": "convertible", "carheight": 48.8, "carlength": 168.8, "carwidth": 64.1, "citympg": 21, "compressionratio": 9, "curbweight": 2548, "cylindernumber": "four", "doornumber": "two", "drivewheel": "rwd", "enginelocation": "front", "enginesize": 130, "enginetype": "dohc", "fuelsystem": "mpfi", "fueltype": "gas", "highwaympg": 27, "horsepower": 111, "peakrpm": 5000, "price": 16500, "stroke": 2.68, "symboling": 3, "wheelbase": 88.6 }, { "CarName": "alfa-romero Quadrifoglio", "aspiration": "std", "boreratio": 2.68, "brand": "alfa-romero", "car_ID": 3, "carbody": "hatchback", "carheight": 52.4, "carlength": 
171.2, "carwidth": 65.5, "citympg": 19, "compressionratio": 9, "curbweight": 2823, "cylindernumber": "six", "doornumber": "two", "drivewheel": "rwd", "enginelocation": "front", "enginesize": 152, "enginetype": "ohcv", "fuelsystem": "mpfi", "fueltype": "gas", "highwaympg": 26, "horsepower": 154, "peakrpm": 5000, "price": 16500, "stroke": 3.47, "symboling": 1, "wheelbase": 94.5 }, { "CarName": "audi 100 ls", "aspiration": "std", "boreratio": 3.19, "brand": "audi", "car_ID": 4, "carbody": "sedan", "carheight": 54.3, "carlength": 176.6, "carwidth": 66.2, "citympg": 24, "compressionratio": 10, "curbweight": 2337, "cylindernumber": "four", "doornumber": "four", "drivewheel": "fwd", "enginelocation": "front", "enginesize": 109, "enginetype": "ohc", "fuelsystem": "mpfi", "fueltype": "gas", "highwaympg": 30, "horsepower": 102, "peakrpm": 5500, "price": 13950, "stroke": 3.4, "symboling": 2, "wheelbase": 99.8 }, { "CarName": "audi 100ls", "aspiration": "std", "boreratio": 3.19, "brand": "audi", "car_ID": 5, "carbody": "sedan", "carheight": 54.3, "carlength": 176.6, "carwidth": 66.4, "citympg": 18, "compressionratio": 8, "curbweight": 2824, "cylindernumber": "five", "doornumber": "four", "drivewheel": "4wd", "enginelocation": "front", "enginesize": 136, "enginetype": "ohc", "fuelsystem": "mpfi", "fueltype": "gas", "highwaympg": 22, "horsepower": 115, "peakrpm": 5500, "price": 17450, "stroke": 3.4, "symboling": 2, "wheelbase": 99.4 } ], "description": null, "dialect": null, "encoding": null, "format": null, "hash": null, "homepage": null, "licenses": null, "mediatype": null, "path": null, "schema": { "fields": [ { "constraints": null, "description": null, "example": null, "format": null, "name": "brand", "rdfType": null, "title": null, "type": "string" }, { "constraints": null, "description": null, "example": null, "format": null, "name": "car_ID", "rdfType": null, "title": null, "type": "integer" }, { "constraints": null, "description": null, "example": null, "format": null, 
"name": "symboling", "rdfType": null, "title": null, "type": "integer" }, { "constraints": null, "description": null, "example": null, "format": null, "name": "CarName", "rdfType": null, "title": null, "type": "string" }, { "constraints": null, "description": null, "example": null, "format": null, "name": "fueltype", "rdfType": null, "title": null, "type": "string" }, { "constraints": null, "description": null, "example": null, "format": null, "name": "aspiration", "rdfType": null, "title": null, "type": "string" }, { "constraints": null, "description": null, "example": null, "format": null, "name": "doornumber", "rdfType": null, "title": null, "type": "string" }, { "constraints": null, "description": null, "example": null, "format": null, "name": "carbody", "rdfType": null, "title": null, "type": "string" }, { "constraints": null, "description": null, "example": null, "format": null, "name": "drivewheel", "rdfType": null, "title": null, "type": "string" }, { "constraints": null, "description": null, "example": null, "format": null, "name": "enginelocation", "rdfType": null, "title": null, "type": "string" }, { "constraints": null, "description": null, "example": null, "format": null, "name": "wheelbase", "rdfType": null, "title": null, "type": "number" }, { "constraints": null, "description": null, "example": null, "format": null, "name": "carlength", "rdfType": null, "title": null, "type": "number" }, { "constraints": null, "description": null, "example": null, "format": null, "name": "carwidth", "rdfType": null, "title": null, "type": "number" }, { "constraints": null, "description": null, "example": null, "format": null, "name": "carheight", "rdfType": null, "title": null, "type": "number" }, { "constraints": null, "description": null, "example": null, "format": null, "name": "curbweight", "rdfType": null, "title": null, "type": "integer" }, { "constraints": null, "description": null, "example": null, "format": null, "name": "enginetype", "rdfType": null, 
"title": null, "type": "string" }, { "constraints": null, "description": null, "example": null, "format": null, "name": "cylindernumber", "rdfType": null, "title": null, "type": "string" }, { "constraints": null, "description": null, "example": null, "format": null, "name": "enginesize", "rdfType": null, "title": null, "type": "integer" }, { "constraints": null, "description": null, "example": null, "format": null, "name": "fuelsystem", "rdfType": null, "title": null, "type": "string" }, { "constraints": null, "description": null, "example": null, "format": null, "name": "boreratio", "rdfType": null, "title": null, "type": "number" }, { "constraints": null, "description": null, "example": null, "format": null, "name": "stroke", "rdfType": null, "title": null, "type": "number" }, { "constraints": null, "description": null, "example": null, "format": null, "name": "compressionratio", "rdfType": null, "title": null, "type": "number" }, { "constraints": null, "description": null, "example": null, "format": null, "name": "horsepower", "rdfType": null, "title": null, "type": "integer" }, { "constraints": null, "description": null, "example": null, "format": null, "name": "peakrpm", "rdfType": null, "title": null, "type": "integer" }, { "constraints": null, "description": null, "example": null, "format": null, "name": "citympg", "rdfType": null, "title": null, "type": "integer" }, { "constraints": null, "description": null, "example": null, "format": null, "name": "highwaympg", "rdfType": null, "title": null, "type": "integer" }, { "constraints": null, "description": null, "example": null, "format": null, "name": "price", "rdfType": null, "title": null, "type": "number" } ], "foreignKeys": null, "missingValues": null, "primaryKey": null }, "sources": null, "title": null }, "text/html": [ "
alfa-romero13alfa-romero giuliagasstdtwoconvertiblerwdfront88.6168.864.148.82548dohcfour130mpfi3.472.6891115000212713495
alfa-romero23alfa-romero stelviogasstdtwoconvertiblerwdfront88.6168.864.148.82548dohcfour130mpfi3.472.6891115000212716500
alfa-romero31alfa-romero Quadrifogliogasstdtwohatchbackrwdfront94.5171.265.552.42823ohcvsix152mpfi2.683.4791545000192616500
audi42audi 100 lsgasstdfoursedanfwdfront99.8176.666.254.32337ohcfour109mpfi3.193.4101025500243013950
audi52audi 100lsgasstdfoursedan4wdfront99.4176.666.454.32824ohcfive136mpfi3.193.481155500182217450
" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "// add new column `brand` to our dataframe\n", "df = brandDf.hstack(df);\n", "await display(df.head(5));" ] }, { "cell_type": "markdown", "id": "81abdb0d-41ce-445c-9a75-f9494eb1f9d3", "metadata": {}, "source": [ "Drop unnecessary values, and write to the new file:" ] }, { "cell_type": "code", "execution_count": 7, "id": "6da96c06-b0b6-46b7-be2d-824af909aec4", "metadata": { "editable": true, "slideshow": { "slide_type": "" }, "tags": [] }, "outputs": [ { "data": { "application/vnd.dataresource+json": { "bytes": null, "data": [ { "aspiration": "std", "boreratio": 3.47, "brand": "alfa-romero", "carbody": "convertible", "carheight": 48.8, "carlength": 168.8, "carwidth": 64.1, "citympg": 21, "compressionratio": 9, "curbweight": 2548, "cylindernumber": "four", "doornumber": "two", "drivewheel": "rwd", "enginelocation": "front", "enginesize": 130, "enginetype": "dohc", "fuelsystem": "mpfi", "fueltype": "gas", "highwaympg": 27, "horsepower": 111, "peakrpm": 5000, "price": 13495, "stroke": 2.68, "wheelbase": 88.6 }, { "aspiration": "std", "boreratio": 3.47, "brand": "alfa-romero", "carbody": "convertible", "carheight": 48.8, "carlength": 168.8, "carwidth": 64.1, "citympg": 21, "compressionratio": 9, "curbweight": 2548, "cylindernumber": "four", "doornumber": "two", "drivewheel": "rwd", "enginelocation": "front", "enginesize": 130, "enginetype": "dohc", "fuelsystem": "mpfi", "fueltype": "gas", "highwaympg": 27, "horsepower": 111, "peakrpm": 5000, "price": 16500, "stroke": 2.68, "wheelbase": 88.6 }, { "aspiration": "std", "boreratio": 2.68, "brand": "alfa-romero", "carbody": "hatchback", "carheight": 52.4, "carlength": 171.2, "carwidth": 65.5, "citympg": 19, "compressionratio": 9, "curbweight": 2823, "cylindernumber": "six", "doornumber": "two", "drivewheel": "rwd", "enginelocation": "front", "enginesize": 152, "enginetype": "ohcv", "fuelsystem": "mpfi", "fueltype": "gas", 
"highwaympg": 26, "horsepower": 154, "peakrpm": 5000, "price": 16500, "stroke": 3.47, "wheelbase": 94.5 } ], "description": null, "dialect": null, "encoding": null, "format": null, "hash": null, "homepage": null, "licenses": null, "mediatype": null, "path": null, "schema": { "fields": [ { "constraints": null, "description": null, "example": null, "format": null, "name": "brand", "rdfType": null, "title": null, "type": "string" }, { "constraints": null, "description": null, "example": null, "format": null, "name": "fueltype", "rdfType": null, "title": null, "type": "string" }, { "constraints": null, "description": null, "example": null, "format": null, "name": "aspiration", "rdfType": null, "title": null, "type": "string" }, { "constraints": null, "description": null, "example": null, "format": null, "name": "doornumber", "rdfType": null, "title": null, "type": "string" }, { "constraints": null, "description": null, "example": null, "format": null, "name": "carbody", "rdfType": null, "title": null, "type": "string" }, { "constraints": null, "description": null, "example": null, "format": null, "name": "drivewheel", "rdfType": null, "title": null, "type": "string" }, { "constraints": null, "description": null, "example": null, "format": null, "name": "enginelocation", "rdfType": null, "title": null, "type": "string" }, { "constraints": null, "description": null, "example": null, "format": null, "name": "wheelbase", "rdfType": null, "title": null, "type": "number" }, { "constraints": null, "description": null, "example": null, "format": null, "name": "carlength", "rdfType": null, "title": null, "type": "number" }, { "constraints": null, "description": null, "example": null, "format": null, "name": "carwidth", "rdfType": null, "title": null, "type": "number" }, { "constraints": null, "description": null, "example": null, "format": null, "name": "carheight", "rdfType": null, "title": null, "type": "number" }, { "constraints": null, "description": null, "example": null, 
"format": null, "name": "curbweight", "rdfType": null, "title": null, "type": "integer" }, { "constraints": null, "description": null, "example": null, "format": null, "name": "enginetype", "rdfType": null, "title": null, "type": "string" }, { "constraints": null, "description": null, "example": null, "format": null, "name": "cylindernumber", "rdfType": null, "title": null, "type": "string" }, { "constraints": null, "description": null, "example": null, "format": null, "name": "enginesize", "rdfType": null, "title": null, "type": "integer" }, { "constraints": null, "description": null, "example": null, "format": null, "name": "fuelsystem", "rdfType": null, "title": null, "type": "string" }, { "constraints": null, "description": null, "example": null, "format": null, "name": "boreratio", "rdfType": null, "title": null, "type": "number" }, { "constraints": null, "description": null, "example": null, "format": null, "name": "stroke", "rdfType": null, "title": null, "type": "number" }, { "constraints": null, "description": null, "example": null, "format": null, "name": "compressionratio", "rdfType": null, "title": null, "type": "number" }, { "constraints": null, "description": null, "example": null, "format": null, "name": "horsepower", "rdfType": null, "title": null, "type": "integer" }, { "constraints": null, "description": null, "example": null, "format": null, "name": "peakrpm", "rdfType": null, "title": null, "type": "integer" }, { "constraints": null, "description": null, "example": null, "format": null, "name": "citympg", "rdfType": null, "title": null, "type": "integer" }, { "constraints": null, "description": null, "example": null, "format": null, "name": "highwaympg", "rdfType": null, "title": null, "type": "integer" }, { "constraints": null, "description": null, "example": null, "format": null, "name": "price", "rdfType": null, "title": null, "type": "number" } ], "foreignKeys": null, "missingValues": null, "primaryKey": null }, "sources": null, "title": 
null }, "text/html": [ "
" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df = df.drop(\"car_ID\", \"symboling\", \"CarName\");\n", "df.head(3);" ] }, { "cell_type": "markdown", "id": "0f827b40-99b5-445d-b339-2bb55b9d6686", "metadata": {}, "source": [ "- Use df.groupBy(...cols) in order to get aggregates form the dataset:\n", "\n", "Count cars by brand:" ] }, { "cell_type": "code", "execution_count": 8, "id": "7512602c-6ce4-4167-9f74-78bc5e8c0332", "metadata": { "scrolled": true }, "outputs": [ { "data": { "application/vnd.dataresource+json": { "bytes": null, "data": [ { "brand": "mercury", "brand_count": 1 }, { "brand": "renault", "brand_count": 2 }, { "brand": "jaguar", "brand_count": 3 }, { "brand": "alfa-romero", "brand_count": 3 }, { "brand": "chevrolet", "brand_count": 3 }, { "brand": "isuzu", "brand_count": 4 }, { "brand": "porsche", "brand_count": 5 }, { "brand": "saab", "brand_count": 6 }, { "brand": "audi", "brand_count": 7 }, { "brand": "plymouth", "brand_count": 7 }, { "brand": "buick", "brand_count": 8 }, { "brand": "bmw", "brand_count": 8 }, { "brand": "dodge", "brand_count": 9 }, { "brand": "peugeot", "brand_count": 11 }, { "brand": "volvo", "brand_count": 11 }, { "brand": "volkswagen", "brand_count": 12 }, { "brand": "subaru", "brand_count": 12 }, { "brand": "honda", "brand_count": 13 }, { "brand": "mitsubishi", "brand_count": 13 }, { "brand": "mazda", "brand_count": 17 }, { "brand": "nissan", "brand_count": 18 }, { "brand": "toyota", "brand_count": 32 } ], "description": null, "dialect": null, "encoding": null, "format": null, "hash": null, "homepage": null, "licenses": null, "mediatype": null, "path": null, "schema": { "fields": [ { "constraints": null, "description": null, "example": null, "format": null, "name": "brand", "rdfType": null, "title": null, "type": "string" }, { "constraints": null, "description": null, "example": null, "format": null, "name": "brand_count", "rdfType": null, "title": null, "type": "integer" } ], 
"foreignKeys": null, "missingValues": null, "primaryKey": null }, "sources": null, "title": null }, "text/html": [ "
" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "let brandCount = df.groupBy(\"brand\").len().sort(\"brand_count\");\n", "brandCount;" ] }, { "cell_type": "code", "execution_count": 9, "id": "44b5baf0-cbd2-48d7-a28b-e871682bbaf0", "metadata": {}, "outputs": [ { "data": { "application/vnd.dataresource+json": { "bytes": null, "data": [ { "brand": null, "describe": "mean", "price": 15079.877371916704 }, { "brand": null, "describe": "std", "price": 8738.804703803093 }, { "brand": "alfa-romero", "describe": "min", "price": 6007 }, { "brand": "volvo", "describe": "max", "price": 34600 }, { "brand": null, "describe": "median", "price": 10534.274509803921 } ], "description": null, "dialect": null, "encoding": null, "format": null, "hash": null, "homepage": null, "licenses": null, "mediatype": null, "path": null, "schema": { "fields": [ { "constraints": null, "description": null, "example": null, "format": null, "name": "describe", "rdfType": null, "title": null, "type": "string" }, { "constraints": null, "description": null, "example": null, "format": null, "name": "brand", "rdfType": null, "title": null, "type": "string" }, { "constraints": null, "description": null, "example": null, "format": null, "name": "price", "rdfType": null, "title": null, "type": "number" } ], "foreignKeys": null, "missingValues": null, "primaryKey": null }, "sources": null, "title": null }, "text/html": [ "
" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "let avgPricePerBrand = df.groupBy(\"brand\").agg({ \"price\": [\"mean\"] });\n", "avgPricePerBrand.describe();" ] }, { "cell_type": "code", "execution_count": 10, "id": "98a18f05-665f-4143-b5d5-6ecda68ca2df", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{\n", " renault: \u001b[33m9595\u001b[39m,\n", " jaguar: \u001b[33m34600\u001b[39m,\n", " saab: \u001b[33m15223.333333333334\u001b[39m,\n", " nissan: \u001b[33m10415.666666666666\u001b[39m,\n", " toyota: \u001b[33m9885.8125\u001b[39m,\n", " mitsubishi: \u001b[33m9239.76923076923\u001b[39m,\n", " porsche: \u001b[33m31400.5\u001b[39m,\n", " plymouth: \u001b[33m7963.428571428572\u001b[39m,\n", " dodge: \u001b[33m7875.444444444444\u001b[39m,\n", " mazda: \u001b[33m10652.882352941177\u001b[39m,\n", " honda: \u001b[33m8184.692307692308\u001b[39m,\n", " peugeot: \u001b[33m15489.09090909091\u001b[39m,\n", " audi: \u001b[33m17859.166714285715\u001b[39m,\n", " volvo: \u001b[33m18063.18181818182\u001b[39m,\n", " buick: \u001b[33m33647\u001b[39m,\n", " chevrolet: \u001b[33m6007\u001b[39m,\n", " isuzu: \u001b[33m8916.5\u001b[39m,\n", " bmw: \u001b[33m26118.75\u001b[39m,\n", " \u001b[32m\"alfa-romero\"\u001b[39m: \u001b[33m15498.333333333334\u001b[39m,\n", " volkswagen: \u001b[33m10077.5\u001b[39m,\n", " mercury: \u001b[33m16503\u001b[39m,\n", " subaru: \u001b[33m8541.25\u001b[39m\n", "}" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "// map brand name to price\n", "avgPricePerBrand = avgPricePerBrand\n", " .toRecords()\n", " .reduce((acc, rec) => ({ ...acc, [rec.brand]: rec.price }), {});" ] }, { "cell_type": "code", "execution_count": 11, "id": "9e7dd2b4-4356-41ad-bd1e-20b7938f55f0", "metadata": { "editable": true, "slideshow": { "slide_type": "" }, "tags": [] }, "outputs": [ { "data": { "application/vnd.dataresource+json": { "bytes": null, "data": [ { "brand_category": 
"Mid_Range" }, { "brand_category": "Budget" }, { "brand_category": "Mid_Range" }, { "brand_category": "Mid_Range" }, { "brand_category": "Budget" } ], "description": null, "dialect": null, "encoding": null, "format": null, "hash": null, "homepage": null, "licenses": null, "mediatype": null, "path": null, "schema": { "fields": [ { "constraints": null, "description": null, "example": null, "format": null, "name": "brand_category", "rdfType": null, "title": null, "type": "string" } ], "foreignKeys": null, "missingValues": null, "primaryKey": null }, "sources": null, "title": null }, "text/html": [ "
" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "// create brand categories by budget\n", "let brandCategory = df.brand.toArray().map((brand) => {\n", " const avgPrice = avgPricePerBrand[brand];\n", " return avgPrice < 10000\n", " ? \"Budget\"\n", " : avgPrice > 20000\n", " ? \"Luxury\"\n", " : \"Mid_Range\";\n", "});\n", "let catDf = pl.DataFrame({\n", " \"brand_category\": brandCategory,\n", "});\n", "\n", "catDf.sample(5);" ] }, { "cell_type": "markdown", "id": "14930aef-1fc0-4baf-92bf-8a093c3d2e86", "metadata": {}, "source": [ "Write the cleaned dataset to a new file:" ] }, { "cell_type": "code", "execution_count": 12, "id": "350c57c7-ec9e-400d-a8e7-19301c720c56", "metadata": { "editable": true, "slideshow": { "slide_type": "" }, "tags": [] }, "outputs": [], "source": [ "df = catDf.hstack(df);\n", "df.writeCSV(\"assets/cleaned_car_prices.csv\");" ] }, { "cell_type": "markdown", "id": "0ea22d38-cb20-47a0-8f92-422bc5925abf", "metadata": { "editable": true, "slideshow": { "slide_type": "" }, "tags": [] }, "source": [ "## Exploratory Data Analysis\n", "\n", "For plotting we use [@observable/plot](https://observablehq.com/plot) and\n", "configured shortcuts for jupyter notebooks imported from\n", "[l12.xyz/x/shortcuts](https://l12.xyz/x/shortcuts)." 
] }, { "cell_type": "code", "execution_count": 13, "id": "2fed060e-1785-4dc1-ba44-4bcd5cee92f6", "metadata": { "editable": true, "slideshow": { "slide_type": "" }, "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[\n", " { brand: \"mercury\", brand_count: 1 },\n", " { brand: \"renault\", brand_count: 2 },\n", " { brand: \"jaguar\", brand_count: 3 },\n", " { brand: \"alfa-romero\", brand_count: 3 },\n", " { brand: \"chevrolet\", brand_count: 3 },\n", " { brand: \"isuzu\", brand_count: 4 },\n", " { brand: \"porsche\", brand_count: 5 },\n", " { brand: \"saab\", brand_count: 6 },\n", " { brand: \"audi\", brand_count: 7 },\n", " { brand: \"plymouth\", brand_count: 7 },\n", " { brand: \"buick\", brand_count: 8 },\n", " { brand: \"bmw\", brand_count: 8 },\n", " { brand: \"dodge\", brand_count: 9 },\n", " { brand: \"peugeot\", brand_count: 11 },\n", " { brand: \"volvo\", brand_count: 11 },\n", " { brand: \"volkswagen\", brand_count: 12 },\n", " { brand: \"subaru\", brand_count: 12 },\n", " { brand: \"honda\", brand_count: 13 },\n", " { brand: \"mitsubishi\", brand_count: 13 },\n", " { brand: \"mazda\", brand_count: 17 },\n", " { brand: \"nissan\", brand_count: 18 },\n", " { brand: \"toyota\", brand_count: 32 }\n", "]\n" ] }, { "data": { "image/svg+xml": [ "toyotanissanmazdahondamitsubishisubaruvolkswagenpeugeotvolvododgebmwbuickaudiplymouthsaabporscheisuzualfa-romerochevroletjaguarrenaultmercurybrand051015202530brand_count →" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "\n", "import { document } from \"https://l12.xyz/x/shortcuts/raw/plots.ts\";\n", "\n", "const brandCountRecords = brandCount.toRecords();\n", "console.log(brandCountRecords);\n", "const brandCountPlot = Plot.plot({\n", " marginLeft: 80,\n", " style: {\n", " backgroundColor: \"#fff\",\n", " },\n", " x: { padding: 0.4 },\n", " marks: [\n", " Plot.barX(brandCountRecords, {\n", " x: \"brand_count\",\n", " y: \"brand\",\n", " 
sort: { y: \"x\", order: \"descending\" },\n", " }),\n", " ],\n", " document,\n", "});\n", "await display(brandCountPlot);" ] }, { "cell_type": "code", "execution_count": 14, "id": "e21f5bdf-a5a8-43b2-855b-0eb079656da7", "metadata": { "editable": true, "slideshow": { "slide_type": "" }, "tags": [] }, "outputs": [ { "data": { "text/plain": [ "[\n", " \u001b[32m\"wheelbase\"\u001b[39m, \u001b[32m\"carlength\"\u001b[39m,\n", " \u001b[32m\"carwidth\"\u001b[39m, \u001b[32m\"carheight\"\u001b[39m,\n", " \u001b[32m\"curbweight\"\u001b[39m, \u001b[32m\"enginesize\"\u001b[39m,\n", " \u001b[32m\"boreratio\"\u001b[39m, \u001b[32m\"stroke\"\u001b[39m,\n", " \u001b[32m\"compressionratio\"\u001b[39m, \u001b[32m\"horsepower\"\u001b[39m,\n", " \u001b[32m\"peakrpm\"\u001b[39m, \u001b[32m\"citympg\"\u001b[39m,\n", " \u001b[32m\"highwaympg\"\u001b[39m\n", "]" ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "let numericColumns = df.columns.filter((col) =>\n", " df[col].isNumeric() && col !== \"price\"\n", ");\n", "numericColumns;" ] }, { "cell_type": "markdown", "id": "1c8d3b90-b4b1-4da6-9e7c-f8fd7cffb77d", "metadata": {}, "source": [ "Sometimes we can get some intuitive insight seeing the data plotted from\n", "different dimensions. It is an optional step, but it might help to get some\n", "assumptions about the relationships in the dataset. Below is an example for\n", "drawing plots side-by-side." ] }, { "cell_type": "code", "execution_count": 15, "id": "9a82f623-b39a-473a-91b9-0a0347c64c63", "metadata": { "editable": true, "slideshow": { "slide_type": "" }, "tags": [] }, "outputs": [ { "data": { "text/html": [ "\n", "
\n", " \n", "
\n", " " ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import { sideBySidePlot } from \"https://l12.xyz/x/shortcuts/raw/plots.ts\";\n", "\n", "let records = df.toRecords();\n", "\n", "const plt = sideBySidePlot({\n", " x: numericColumns,\n", " y: [\"price\"],\n", " marks: [\n", " (x, y) => Plot.dot(records, { x, y }),\n", " (x, y) => Plot.linearRegressionY(records, { x, y, stroke: \"red\" }),\n", " ],\n", " cols: 3,\n", "});\n", "\n", "await display(\n", " plt,\n", ");" ] }, { "cell_type": "markdown", "id": "53d04850-df05-474c-8568-a5f7c97146f3", "metadata": { "editable": true, "slideshow": { "slide_type": "" }, "tags": [] }, "source": [ "Let's view the list of top features that have a high correlation coefficient. The\n", "pearsonCorr() function calculates the Pearson's r correlation coefficients with\n", "respect to the 'price'." ] }, { "cell_type": "code", "execution_count": 16, "id": "924838be-1b4d-4edf-abe5-8bf02e02cf11", "metadata": { "editable": true, "slideshow": { "slide_type": "" }, "tags": [] }, "outputs": [ { "data": { "application/vnd.dataresource+json": { "bytes": null, "data": [ { "Variable": "enginesize", "idx (price)": 0.8741448025245117 }, { "Variable": "curbweight", "idx (price)": 0.8353048793372955 }, { "Variable": "horsepower", "idx (price)": 0.8081388225362217 }, { "Variable": "carwidth", "idx (price)": 0.7593252997414263 }, { "Variable": "carlength", "idx (price)": 0.6829200156779843 }, { "Variable": "wheelbase", "idx (price)": 0.5778155982921477 }, { "Variable": "boreratio", "idx (price)": 0.5531732367984261 }, { "Variable": "carheight", "idx (price)": 0.11933622657047727 }, { "Variable": "stroke", "idx (price)": 0.07944308388192935 }, { "Variable": "compressionratio", "idx (price)": 0.06798350579944248 }, { "Variable": "peakrpm", "idx (price)": -0.0852671502778569 }, { "Variable": "citympg", "idx (price)": -0.6857513360270401 }, { "Variable": "highwaympg", "idx (price)": -0.6975990916465564 } ], 
"description": null, "dialect": null, "encoding": null, "format": null, "hash": null, "homepage": null, "licenses": null, "mediatype": null, "path": null, "schema": { "fields": [ { "constraints": null, "description": null, "example": null, "format": null, "name": "Variable", "rdfType": null, "title": null, "type": "string" }, { "constraints": null, "description": null, "example": null, "format": null, "name": "idx (price)", "rdfType": null, "title": null, "type": "number" } ], "foreignKeys": null, "missingValues": null, "primaryKey": null }, "sources": null, "title": null }, "text/html": [ "
Variableidx (price)
" ] }, "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ "// we select pearson's idx from dataframe for all numeric cols,\n", "// then we transpose result so that columns become rows,\n", "// then we sort by the idx column\n", "df.select(\n", " ...numericColumns.map((col) => pl.pearsonCorr(col, \"price\")),\n", ")\n", " .transpose({\n", " columnNames: [\"idx (price)\"],\n", " headerName: \"Variable\",\n", " includeHeader: true,\n", " })\n", " .sort(\"idx (price)\", true);" ] }, { "cell_type": "markdown", "id": "63bc6b92-5ab9-43e0-a957-57d895bc60f9", "metadata": { "editable": true, "slideshow": { "slide_type": "" }, "tags": [] }, "source": [ "### Linearity Assumption\n", "\n", "Linear regression needs the relationship between independent variable and the\n", "dependent variable to be linear. We can test this assumption with some scatter\n", "plots and regression lines.\n", "\n", "** Here we use the same side-by-side plot shortcut, but for selected variables\n", "with high correlation coefficient." ] }, { "cell_type": "code", "execution_count": 17, "id": "2d5a635a-c819-4989-9818-a1927363993d", "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", "
\n", " \n", "
\n", " " ] }, "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [ "const plt = sideBySidePlot({\n", " x: [\"enginesize\", \"curbweight\", \"horsepower\", \"carwidth\"],\n", " y: [\"price\"],\n", " marks: [\n", " (x, y) => Plot.dot(records, { x, y }),\n", " (x, y) => Plot.linearRegressionY(records, { x, y, stroke: \"red\" }),\n", " ],\n", " cols: 2,\n", "});\n", "\n", "await display(\n", " plt,\n", ");" ] }, { "cell_type": "markdown", "id": "0a351f17-309b-444b-b4a8-26500b077e47", "metadata": { "editable": true, "slideshow": { "slide_type": "" }, "tags": [] }, "source": [ "### Homoscedasticity\n", "\n", "The assumption of homoscedasticity (constant variance) is crucial to linear\n", "regression models. Homoscedasticity describes a situation in which the error\n", "term or variance or the \"noise\" or random disturbance in the relationship\n", "between the independent variables and the dependent variable is the same across\n", "all values of the independent variable. In other words, there is a constant\n", "variance present in the response variable as the predictor variable increases.\n", "If the \"noise\" is not the same across the values of an independent variable, we\n", "call it heteroscedasticity, the opposite of homoscedasticity.\n", "\n", "#### Residuals\n", "\n", "Next we apply the residual expression to 'price' and 'enginesize' variables in order\n", "to check this assumption.\n", "[The residuals function](https://l12.xyz/x/shortcuts/src/branch/main/expr.ts)\n", "uses mean squared." 
] }, { "cell_type": "code", "execution_count": 18, "id": "fc9a31c3-6ac0-4058-b5bb-341eb1c40dd1", "metadata": { "editable": true, "slideshow": { "slide_type": "" }, "tags": [] }, "outputs": [ { "data": { "image/svg+xml": [ "−10,000−8,000−6,000−4,000−2,00002,0004,0006,0008,00010,00012,00014,000↑ price100150200250300enginesize →" ] }, "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import { residuals } from \"https://l12.xyz/x/shortcuts/raw/expr.ts\";\n", "\n", "let residualDf = df.select(\n", " \"enginesize\",\n", " residuals(pl.col(\"enginesize\"), pl.col(\"price\")),\n", ");\n", "\n", "let residPlot = Plot.plot({\n", " x: \"enginesize\",\n", " y: \"price\",\n", " marks: [\n", " Plot.dot(residualDf.toRecords(), { x: \"enginesize\", y: \"price\" }),\n", " Plot.ruleY([0], { stroke: \"#ccc\" }),\n", " ],\n", " document,\n", "});\n", "\n", "await display(residPlot);" ] }, { "cell_type": "markdown", "id": "29a0ec4e-084c-4af6-9af6-a14ae5cee4e7", "metadata": { "editable": true, "slideshow": { "slide_type": "" }, "tags": [] }, "source": [ "From the above plot, we can tell the error variance across the true line is\n", "dispersed somewhat not uniformly, but in a funnel like shape. So, the assumption\n", "of the _homoscedasticity_ is more likely not met." ] }, { "cell_type": "markdown", "id": "33d787f3-2a7b-421c-9d9b-a86719a92f9a", "metadata": { "editable": true, "slideshow": { "slide_type": "" }, "tags": [] }, "source": [ "## Normality\n", "\n", "The linear regression analysis requires the dependent variable, 'price', to be\n", "normally distributed. A histogram, box plot, or a Q-Q-Plot can check if the\n", "target variable is normally distributed. 
The goodness of fit test, e.g., the\n", "Kolmogorov-Smirnov test can check for normality in the dependent variable.\n", "[This documentation](https://towardsdatascience.com/normality-tests-in-python-31e04aa4f411)\n", "contains more information on the normality assumption.\n", "\n", "Let's display all three charts to show how our target variable, 'price' behaves." ] }, { "cell_type": "code", "execution_count": 19, "id": "b94d0d8d-ae95-41f5-b17f-0ab9b3aa27ec", "metadata": { "editable": true, "slideshow": { "slide_type": "" }, "tags": [] }, "outputs": [ { "data": { "text/html": [ "\n", "
\n", "
\n", " \n", " \n", "
\n", "
\n", " \n", "
\n", "
\n", " " ] }, "execution_count": 19, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import { threeChart } from \"https://l12.xyz/x/shortcuts/raw/plots.ts\";\n", "\n", "await display(threeChart(records, \"price\"));" ] }, { "cell_type": "markdown", "id": "81fe890e-56c4-4076-b57c-38fe438c0ead", "metadata": { "editable": true, "slideshow": { "slide_type": "" }, "tags": [] }, "source": [ "These three charts above can tell us a lot about our target variable:\n", "\n", "- Our target variable, 'price' is not normally distributed\n", "- Our target variable is right-skewed\n", "- There are some outliers in the variable\n", "\n", "The right-skewed plot means that most prices in the dataset are on the lower end\n", "(below 15,000). The 'max' value is very far from the '75%' quantile statistic.\n", "All these plots show that the assumption for accurate linear regression modeling\n", "is not met.\n", "\n", "Next, we will perform the log transformation to correct our target variable and\n", "to make it more normally distributed." ] }, { "cell_type": "code", "execution_count": 20, "id": "8f8c6044-d3a1-4f8e-89ba-365fea3fbd8b", "metadata": { "editable": true, "slideshow": { "slide_type": "" }, "tags": [] }, "outputs": [ { "data": { "text/html": [ "\n", "
\n", "
\n", " \n", " \n", "
\n", "
\n", " \n", "
\n", "
\n", " " ] }, "execution_count": 20, "metadata": {}, "output_type": "execute_result" } ], "source": [ "let log2df = df.select(pl.col(\"price\").log());\n", "\n", "await display(threeChart(log2df.toRecords(), \"price\"));" ] }, { "cell_type": "markdown", "id": "09a9d9c0-7fb6-4353-91f8-4899520b36d7", "metadata": { "editable": true, "slideshow": { "slide_type": "" }, "tags": [] }, "source": [ "## Data Encoding" ] }, { "cell_type": "code", "execution_count": 21, "id": "79f05578-b839-4278-afb1-0123c1d32d17", "metadata": { "editable": true, "slideshow": { "slide_type": "" }, "tags": [] }, "outputs": [ { "data": { "application/vnd.dataresource+json": { "bytes": null, "data": [ { "aspiration": "std", "boreratio": 3.47, "brand": "alfa-romero", "brand_category": "Mid_Range", "carbody": "convertible", "carheight": 48.8, "carlength": 168.8, "carwidth": 64.1, "citympg": 21, "compressionratio": 9, "curbweight": 2548, "cylindernumber": "four", "doornumber": "two", "drivewheel": "rwd", "enginelocation": "front", "enginesize": 130, "enginetype": "dohc", "fuelsystem": "mpfi", "fueltype": "gas", "highwaympg": 27, "horsepower": 111, "peakrpm": 5000, "price": 13495, "stroke": 2.68, "wheelbase": 88.6 }, { "aspiration": "std", "boreratio": 3.47, "brand": "alfa-romero", "brand_category": "Mid_Range", "carbody": "convertible", "carheight": 48.8, "carlength": 168.8, "carwidth": 64.1, "citympg": 21, "compressionratio": 9, "curbweight": 2548, "cylindernumber": "four", "doornumber": "two", "drivewheel": "rwd", "enginelocation": "front", "enginesize": 130, "enginetype": "dohc", "fuelsystem": "mpfi", "fueltype": "gas", "highwaympg": 27, "horsepower": 111, "peakrpm": 5000, "price": 16500, "stroke": 2.68, "wheelbase": 88.6 }, { "aspiration": "std", "boreratio": 2.68, "brand": "alfa-romero", "brand_category": "Mid_Range", "carbody": "hatchback", "carheight": 52.4, "carlength": 171.2, "carwidth": 65.5, "citympg": 19, "compressionratio": 9, "curbweight": 2823, "cylindernumber": "six", "doornumber": 
"two", "drivewheel": "rwd", "enginelocation": "front", "enginesize": 152, "enginetype": "ohcv", "fuelsystem": "mpfi", "fueltype": "gas", "highwaympg": 26, "horsepower": 154, "peakrpm": 5000, "price": 16500, "stroke": 3.47, "wheelbase": 94.5 }, { "aspiration": "std", "boreratio": 3.19, "brand": "audi", "brand_category": "Mid_Range", "carbody": "sedan", "carheight": 54.3, "carlength": 176.6, "carwidth": 66.2, "citympg": 24, "compressionratio": 10, "curbweight": 2337, "cylindernumber": "four", "doornumber": "four", "drivewheel": "fwd", "enginelocation": "front", "enginesize": 109, "enginetype": "ohc", "fuelsystem": "mpfi", "fueltype": "gas", "highwaympg": 30, "horsepower": 102, "peakrpm": 5500, "price": 13950, "stroke": 3.4, "wheelbase": 99.8 }, { "aspiration": "std", "boreratio": 3.19, "brand": "audi", "brand_category": "Mid_Range", "carbody": "sedan", "carheight": 54.3, "carlength": 176.6, "carwidth": 66.4, "citympg": 18, "compressionratio": 8, "curbweight": 2824, "cylindernumber": "five", "doornumber": "four", "drivewheel": "4wd", "enginelocation": "front", "enginesize": 136, "enginetype": "ohc", "fuelsystem": "mpfi", "fueltype": "gas", "highwaympg": 22, "horsepower": 115, "peakrpm": 5500, "price": 17450, "stroke": 3.4, "wheelbase": 99.4 } ], "description": null, "dialect": null, "encoding": null, "format": null, "hash": null, "homepage": null, "licenses": null, "mediatype": null, "path": null, "schema": { "fields": [ { "constraints": null, "description": null, "example": null, "format": null, "name": "brand_category", "rdfType": null, "title": null, "type": "string" }, { "constraints": null, "description": null, "example": null, "format": null, "name": "brand", "rdfType": null, "title": null, "type": "string" }, { "constraints": null, "description": null, "example": null, "format": null, "name": "fueltype", "rdfType": null, "title": null, "type": "string" }, { "constraints": null, "description": null, "example": null, "format": null, "name": "aspiration", 
"rdfType": null, "title": null, "type": "string" }, { "constraints": null, "description": null, "example": null, "format": null, "name": "doornumber", "rdfType": null, "title": null, "type": "string" }, { "constraints": null, "description": null, "example": null, "format": null, "name": "carbody", "rdfType": null, "title": null, "type": "string" }, { "constraints": null, "description": null, "example": null, "format": null, "name": "drivewheel", "rdfType": null, "title": null, "type": "string" }, { "constraints": null, "description": null, "example": null, "format": null, "name": "enginelocation", "rdfType": null, "title": null, "type": "string" }, { "constraints": null, "description": null, "example": null, "format": null, "name": "wheelbase", "rdfType": null, "title": null, "type": "number" }, { "constraints": null, "description": null, "example": null, "format": null, "name": "carlength", "rdfType": null, "title": null, "type": "number" }, { "constraints": null, "description": null, "example": null, "format": null, "name": "carwidth", "rdfType": null, "title": null, "type": "number" }, { "constraints": null, "description": null, "example": null, "format": null, "name": "carheight", "rdfType": null, "title": null, "type": "number" }, { "constraints": null, "description": null, "example": null, "format": null, "name": "curbweight", "rdfType": null, "title": null, "type": "integer" }, { "constraints": null, "description": null, "example": null, "format": null, "name": "enginetype", "rdfType": null, "title": null, "type": "string" }, { "constraints": null, "description": null, "example": null, "format": null, "name": "cylindernumber", "rdfType": null, "title": null, "type": "string" }, { "constraints": null, "description": null, "example": null, "format": null, "name": "enginesize", "rdfType": null, "title": null, "type": "integer" }, { "constraints": null, "description": null, "example": null, "format": null, "name": "fuelsystem", "rdfType": null, "title": null, 
"type": "string" }, { "constraints": null, "description": null, "example": null, "format": null, "name": "boreratio", "rdfType": null, "title": null, "type": "number" }, { "constraints": null, "description": null, "example": null, "format": null, "name": "stroke", "rdfType": null, "title": null, "type": "number" }, { "constraints": null, "description": null, "example": null, "format": null, "name": "compressionratio", "rdfType": null, "title": null, "type": "number" }, { "constraints": null, "description": null, "example": null, "format": null, "name": "horsepower", "rdfType": null, "title": null, "type": "integer" }, { "constraints": null, "description": null, "example": null, "format": null, "name": "peakrpm", "rdfType": null, "title": null, "type": "integer" }, { "constraints": null, "description": null, "example": null, "format": null, "name": "citympg", "rdfType": null, "title": null, "type": "integer" }, { "constraints": null, "description": null, "example": null, "format": null, "name": "highwaympg", "rdfType": null, "title": null, "type": "integer" }, { "constraints": null, "description": null, "example": null, "format": null, "name": "price", "rdfType": null, "title": null, "type": "number" } ], "foreignKeys": null, "missingValues": null, "primaryKey": null }, "sources": null, "title": null }, "text/html": [ "
" ] }, "execution_count": 21, "metadata": {}, "output_type": "execute_result" } ], "source": [ "let carData = pl.readCSV(\n", " await Deno.readTextFile(\"assets/cleaned_car_prices.csv\"),\n", " { sep: \",\" },\n", ");\n", "\n", "carData.head(5);" ] }, { "cell_type": "markdown", "id": "af6eb67c-574a-430a-a04c-24d0ef9f35fa", "metadata": {}, "source": [ "We'll drop some unnecessary columns:" ] }, { "cell_type": "code", "execution_count": 22, "id": "e2df0c3f-dbc0-4820-b807-373ce3787645", "metadata": { "editable": true, "slideshow": { "slide_type": "" }, "tags": [] }, "outputs": [ { "data": { "application/vnd.dataresource+json": { "bytes": null, "data": [ { "aspiration": "std", "boreratio": 3.47, "brand_category": "Mid_Range", "carheight": 48.8, "carlength": 168.8, "carwidth": 64.1, "citympg": 21, "curbweight": 2548, "cylindernumber": "four", "doornumber": "two", "drivewheel": "rwd", "enginesize": 130, "enginetype": "dohc", "fuelsystem": "mpfi", "fueltype": "gas", "highwaympg": 27, "horsepower": 111, "price": 13495, "wheelbase": 88.6 }, { "aspiration": "std", "boreratio": 3.47, "brand_category": "Mid_Range", "carheight": 48.8, "carlength": 168.8, "carwidth": 64.1, "citympg": 21, "curbweight": 2548, "cylindernumber": "four", "doornumber": "two", "drivewheel": "rwd", "enginesize": 130, "enginetype": "dohc", "fuelsystem": "mpfi", "fueltype": "gas", "highwaympg": 27, "horsepower": 111, "price": 16500, "wheelbase": 88.6 }, { "aspiration": "std", "boreratio": 2.68, "brand_category": "Mid_Range", "carheight": 52.4, "carlength": 171.2, "carwidth": 65.5, "citympg": 19, "curbweight": 2823, "cylindernumber": "six", "doornumber": "two", "drivewheel": "rwd", "enginesize": 152, "enginetype": "ohcv", "fuelsystem": "mpfi", "fueltype": "gas", "highwaympg": 26, "horsepower": 154, "price": 16500, "wheelbase": 94.5 }, { "aspiration": "std", "boreratio": 3.19, "brand_category": "Mid_Range", "carheight": 54.3, "carlength": 176.6, "carwidth": 66.2, "citympg": 24, "curbweight": 2337, 
"cylindernumber": "four", "doornumber": "four", "drivewheel": "fwd", "enginesize": 109, "enginetype": "ohc", "fuelsystem": "mpfi", "fueltype": "gas", "highwaympg": 30, "horsepower": 102, "price": 13950, "wheelbase": 99.8 }, { "aspiration": "std", "boreratio": 3.19, "brand_category": "Mid_Range", "carheight": 54.3, "carlength": 176.6, "carwidth": 66.4, "citympg": 18, "curbweight": 2824, "cylindernumber": "five", "doornumber": "four", "drivewheel": "4wd", "enginesize": 136, "enginetype": "ohc", "fuelsystem": "mpfi", "fueltype": "gas", "highwaympg": 22, "horsepower": 115, "price": 17450, "wheelbase": 99.4 } ], "description": null, "dialect": null, "encoding": null, "format": null, "hash": null, "homepage": null, "licenses": null, "mediatype": null, "path": null, "schema": { "fields": [ { "constraints": null, "description": null, "example": null, "format": null, "name": "brand_category", "rdfType": null, "title": null, "type": "string" }, { "constraints": null, "description": null, "example": null, "format": null, "name": "fueltype", "rdfType": null, "title": null, "type": "string" }, { "constraints": null, "description": null, "example": null, "format": null, "name": "aspiration", "rdfType": null, "title": null, "type": "string" }, { "constraints": null, "description": null, "example": null, "format": null, "name": "doornumber", "rdfType": null, "title": null, "type": "string" }, { "constraints": null, "description": null, "example": null, "format": null, "name": "drivewheel", "rdfType": null, "title": null, "type": "string" }, { "constraints": null, "description": null, "example": null, "format": null, "name": "wheelbase", "rdfType": null, "title": null, "type": "number" }, { "constraints": null, "description": null, "example": null, "format": null, "name": "carlength", "rdfType": null, "title": null, "type": "number" }, { "constraints": null, "description": null, "example": null, "format": null, "name": "carwidth", "rdfType": null, "title": null, "type": "number" }, 
{ "constraints": null, "description": null, "example": null, "format": null, "name": "carheight", "rdfType": null, "title": null, "type": "number" }, { "constraints": null, "description": null, "example": null, "format": null, "name": "curbweight", "rdfType": null, "title": null, "type": "integer" }, { "constraints": null, "description": null, "example": null, "format": null, "name": "enginetype", "rdfType": null, "title": null, "type": "string" }, { "constraints": null, "description": null, "example": null, "format": null, "name": "cylindernumber", "rdfType": null, "title": null, "type": "string" }, { "constraints": null, "description": null, "example": null, "format": null, "name": "enginesize", "rdfType": null, "title": null, "type": "integer" }, { "constraints": null, "description": null, "example": null, "format": null, "name": "fuelsystem", "rdfType": null, "title": null, "type": "string" }, { "constraints": null, "description": null, "example": null, "format": null, "name": "boreratio", "rdfType": null, "title": null, "type": "number" }, { "constraints": null, "description": null, "example": null, "format": null, "name": "horsepower", "rdfType": null, "title": null, "type": "integer" }, { "constraints": null, "description": null, "example": null, "format": null, "name": "citympg", "rdfType": null, "title": null, "type": "integer" }, { "constraints": null, "description": null, "example": null, "format": null, "name": "highwaympg", "rdfType": null, "title": null, "type": "integer" }, { "constraints": null, "description": null, "example": null, "format": null, "name": "price", "rdfType": null, "title": null, "type": "number" } ], "foreignKeys": null, "missingValues": null, "primaryKey": null }, "sources": null, "title": null }, "text/html": [ "
" ] }, "execution_count": 22, "metadata": {}, "output_type": "execute_result" } ], "source": [ "let carDataGeneralized = carData.drop(\n", " \"brand\",\n", " \"carbody\",\n", " \"enginelocation\",\n", " \"stroke\",\n", " \"compressionratio\",\n", " \"peakrpm\",\n", ");\n", "carDataGeneralized.head(5);" ] }, { "cell_type": "markdown", "id": "826ab213-65a2-43d0-b9ce-fecdebd79eb8", "metadata": { "editable": true, "slideshow": { "slide_type": "" }, "tags": [] }, "source": [ "Next we use one-hot (binary) encoding. We assume that all non-numeric columns are\n", "categorical." ] }, { "cell_type": "code", "execution_count": 23, "id": "9bc95dd8-0240-45c6-a0f9-21b1873ae04b", "metadata": { "editable": true, "slideshow": { "slide_type": "" }, "tags": [] }, "outputs": [ { "data": { "application/vnd.dataresource+json": { "bytes": null, "data": [ { "aspiration_std": 1, "aspiration_turbo": 0, "boreratio": 3.47, "brand_category_Budget": 0, "brand_category_Luxury": 0, "brand_category_Mid_Range": 1, "carheight": 48.8, "carlength": 168.8, "carwidth": 64.1, "citympg": 21, "curbweight": 2548, "cylindernumber_eight": 0, "cylindernumber_five": 0, "cylindernumber_four": 1, "cylindernumber_six": 0, "cylindernumber_three": 0, "cylindernumber_twelve": 0, "cylindernumber_two": 0, "doornumber_four": 0, "doornumber_two": 1, "drivewheel_4wd": 0, "drivewheel_fwd": 0, "drivewheel_rwd": 1, "enginesize": 130, "enginetype_dohc": 1, "enginetype_dohcv": 0, "enginetype_l": 0, "enginetype_ohc": 0, "enginetype_ohcf": 0, "enginetype_ohcv": 0, "enginetype_rotor": 0, "fuelsystem_1bbl": 0, "fuelsystem_2bbl": 0, "fuelsystem_4bbl": 0, "fuelsystem_idi": 0, "fuelsystem_mfi": 0, "fuelsystem_mpfi": 1, "fuelsystem_spdi": 0, "fuelsystem_spfi": 0, "fueltype_diesel": 0, "fueltype_gas": 1, "highwaympg": 27, "horsepower": 111, "price": 13495, "wheelbase": 88.6 }, { "aspiration_std": 1, "aspiration_turbo": 0, "boreratio": 3.47, "brand_category_Budget": 0, "brand_category_Luxury": 0, "brand_category_Mid_Range": 1, 
"carheight": 48.8, "carlength": 168.8, "carwidth": 64.1, "citympg": 21, "curbweight": 2548, "cylindernumber_eight": 0, "cylindernumber_five": 0, "cylindernumber_four": 1, "cylindernumber_six": 0, "cylindernumber_three": 0, "cylindernumber_twelve": 0, "cylindernumber_two": 0, "doornumber_four": 0, "doornumber_two": 1, "drivewheel_4wd": 0, "drivewheel_fwd": 0, "drivewheel_rwd": 1, "enginesize": 130, "enginetype_dohc": 1, "enginetype_dohcv": 0, "enginetype_l": 0, "enginetype_ohc": 0, "enginetype_ohcf": 0, "enginetype_ohcv": 0, "enginetype_rotor": 0, "fuelsystem_1bbl": 0, "fuelsystem_2bbl": 0, "fuelsystem_4bbl": 0, "fuelsystem_idi": 0, "fuelsystem_mfi": 0, "fuelsystem_mpfi": 1, "fuelsystem_spdi": 0, "fuelsystem_spfi": 0, "fueltype_diesel": 0, "fueltype_gas": 1, "highwaympg": 27, "horsepower": 111, "price": 16500, "wheelbase": 88.6 }, { "aspiration_std": 1, "aspiration_turbo": 0, "boreratio": 2.68, "brand_category_Budget": 0, "brand_category_Luxury": 0, "brand_category_Mid_Range": 1, "carheight": 52.4, "carlength": 171.2, "carwidth": 65.5, "citympg": 19, "curbweight": 2823, "cylindernumber_eight": 0, "cylindernumber_five": 0, "cylindernumber_four": 0, "cylindernumber_six": 1, "cylindernumber_three": 0, "cylindernumber_twelve": 0, "cylindernumber_two": 0, "doornumber_four": 0, "doornumber_two": 1, "drivewheel_4wd": 0, "drivewheel_fwd": 0, "drivewheel_rwd": 1, "enginesize": 152, "enginetype_dohc": 0, "enginetype_dohcv": 0, "enginetype_l": 0, "enginetype_ohc": 0, "enginetype_ohcf": 0, "enginetype_ohcv": 1, "enginetype_rotor": 0, "fuelsystem_1bbl": 0, "fuelsystem_2bbl": 0, "fuelsystem_4bbl": 0, "fuelsystem_idi": 0, "fuelsystem_mfi": 0, "fuelsystem_mpfi": 1, "fuelsystem_spdi": 0, "fuelsystem_spfi": 0, "fueltype_diesel": 0, "fueltype_gas": 1, "highwaympg": 26, "horsepower": 154, "price": 16500, "wheelbase": 94.5 }, { "aspiration_std": 1, "aspiration_turbo": 0, "boreratio": 3.19, "brand_category_Budget": 0, "brand_category_Luxury": 0, "brand_category_Mid_Range": 1, 
"carheight": 54.3, "carlength": 176.6, "carwidth": 66.2, "citympg": 24, "curbweight": 2337, "cylindernumber_eight": 0, "cylindernumber_five": 0, "cylindernumber_four": 1, "cylindernumber_six": 0, "cylindernumber_three": 0, "cylindernumber_twelve": 0, "cylindernumber_two": 0, "doornumber_four": 1, "doornumber_two": 0, "drivewheel_4wd": 0, "drivewheel_fwd": 1, "drivewheel_rwd": 0, "enginesize": 109, "enginetype_dohc": 0, "enginetype_dohcv": 0, "enginetype_l": 0, "enginetype_ohc": 1, "enginetype_ohcf": 0, "enginetype_ohcv": 0, "enginetype_rotor": 0, "fuelsystem_1bbl": 0, "fuelsystem_2bbl": 0, "fuelsystem_4bbl": 0, "fuelsystem_idi": 0, "fuelsystem_mfi": 0, "fuelsystem_mpfi": 1, "fuelsystem_spdi": 0, "fuelsystem_spfi": 0, "fueltype_diesel": 0, "fueltype_gas": 1, "highwaympg": 30, "horsepower": 102, "price": 13950, "wheelbase": 99.8 }, { "aspiration_std": 1, "aspiration_turbo": 0, "boreratio": 3.19, "brand_category_Budget": 0, "brand_category_Luxury": 0, "brand_category_Mid_Range": 1, "carheight": 54.3, "carlength": 176.6, "carwidth": 66.4, "citympg": 18, "curbweight": 2824, "cylindernumber_eight": 0, "cylindernumber_five": 1, "cylindernumber_four": 0, "cylindernumber_six": 0, "cylindernumber_three": 0, "cylindernumber_twelve": 0, "cylindernumber_two": 0, "doornumber_four": 1, "doornumber_two": 0, "drivewheel_4wd": 1, "drivewheel_fwd": 0, "drivewheel_rwd": 0, "enginesize": 136, "enginetype_dohc": 0, "enginetype_dohcv": 0, "enginetype_l": 0, "enginetype_ohc": 1, "enginetype_ohcf": 0, "enginetype_ohcv": 0, "enginetype_rotor": 0, "fuelsystem_1bbl": 0, "fuelsystem_2bbl": 0, "fuelsystem_4bbl": 0, "fuelsystem_idi": 0, "fuelsystem_mfi": 0, "fuelsystem_mpfi": 1, "fuelsystem_spdi": 0, "fuelsystem_spfi": 0, "fueltype_diesel": 0, "fueltype_gas": 1, "highwaympg": 22, "horsepower": 115, "price": 17450, "wheelbase": 99.4 } ], "description": null, "dialect": null, "encoding": null, "format": null, "hash": null, "homepage": null, "licenses": null, "mediatype": null, "path": null, 
"schema": { "fields": [ { "constraints": null, "description": null, "example": null, "format": null, "name": "brand_category_Budget", "rdfType": null, "title": null, "type": "integer" }, { "constraints": null, "description": null, "example": null, "format": null, "name": "brand_category_Luxury", "rdfType": null, "title": null, "type": "integer" }, { "constraints": null, "description": null, "example": null, "format": null, "name": "brand_category_Mid_Range", "rdfType": null, "title": null, "type": "integer" }, { "constraints": null, "description": null, "example": null, "format": null, "name": "fueltype_diesel", "rdfType": null, "title": null, "type": "integer" }, { "constraints": null, "description": null, "example": null, "format": null, "name": "fueltype_gas", "rdfType": null, "title": null, "type": "integer" }, { "constraints": null, "description": null, "example": null, "format": null, "name": "aspiration_std", "rdfType": null, "title": null, "type": "integer" }, { "constraints": null, "description": null, "example": null, "format": null, "name": "aspiration_turbo", "rdfType": null, "title": null, "type": "integer" }, { "constraints": null, "description": null, "example": null, "format": null, "name": "doornumber_four", "rdfType": null, "title": null, "type": "integer" }, { "constraints": null, "description": null, "example": null, "format": null, "name": "doornumber_two", "rdfType": null, "title": null, "type": "integer" }, { "constraints": null, "description": null, "example": null, "format": null, "name": "drivewheel_4wd", "rdfType": null, "title": null, "type": "integer" }, { "constraints": null, "description": null, "example": null, "format": null, "name": "drivewheel_fwd", "rdfType": null, "title": null, "type": "integer" }, { "constraints": null, "description": null, "example": null, "format": null, "name": "drivewheel_rwd", "rdfType": null, "title": null, "type": "integer" }, { "constraints": null, "description": null, "example": null, "format": null, 
"name": "wheelbase", "rdfType": null, "title": null, "type": "number" }, { "constraints": null, "description": null, "example": null, "format": null, "name": "carlength", "rdfType": null, "title": null, "type": "number" }, { "constraints": null, "description": null, "example": null, "format": null, "name": "carwidth", "rdfType": null, "title": null, "type": "number" }, { "constraints": null, "description": null, "example": null, "format": null, "name": "carheight", "rdfType": null, "title": null, "type": "number" }, { "constraints": null, "description": null, "example": null, "format": null, "name": "curbweight", "rdfType": null, "title": null, "type": "integer" }, { "constraints": null, "description": null, "example": null, "format": null, "name": "enginetype_dohc", "rdfType": null, "title": null, "type": "integer" }, { "constraints": null, "description": null, "example": null, "format": null, "name": "enginetype_dohcv", "rdfType": null, "title": null, "type": "integer" }, { "constraints": null, "description": null, "example": null, "format": null, "name": "enginetype_l", "rdfType": null, "title": null, "type": "integer" }, { "constraints": null, "description": null, "example": null, "format": null, "name": "enginetype_ohc", "rdfType": null, "title": null, "type": "integer" }, { "constraints": null, "description": null, "example": null, "format": null, "name": "enginetype_ohcf", "rdfType": null, "title": null, "type": "integer" }, { "constraints": null, "description": null, "example": null, "format": null, "name": "enginetype_ohcv", "rdfType": null, "title": null, "type": "integer" }, { "constraints": null, "description": null, "example": null, "format": null, "name": "enginetype_rotor", "rdfType": null, "title": null, "type": "integer" }, { "constraints": null, "description": null, "example": null, "format": null, "name": "cylindernumber_eight", "rdfType": null, "title": null, "type": "integer" }, { "constraints": null, "description": null, "example": null, 
"format": null, "name": "cylindernumber_five", "rdfType": null, "title": null, "type": "integer" }, { "constraints": null, "description": null, "example": null, "format": null, "name": "cylindernumber_four", "rdfType": null, "title": null, "type": "integer" }, { "constraints": null, "description": null, "example": null, "format": null, "name": "cylindernumber_six", "rdfType": null, "title": null, "type": "integer" }, { "constraints": null, "description": null, "example": null, "format": null, "name": "cylindernumber_three", "rdfType": null, "title": null, "type": "integer" }, { "constraints": null, "description": null, "example": null, "format": null, "name": "cylindernumber_twelve", "rdfType": null, "title": null, "type": "integer" }, { "constraints": null, "description": null, "example": null, "format": null, "name": "cylindernumber_two", "rdfType": null, "title": null, "type": "integer" }, { "constraints": null, "description": null, "example": null, "format": null, "name": "enginesize", "rdfType": null, "title": null, "type": "integer" }, { "constraints": null, "description": null, "example": null, "format": null, "name": "fuelsystem_1bbl", "rdfType": null, "title": null, "type": "integer" }, { "constraints": null, "description": null, "example": null, "format": null, "name": "fuelsystem_2bbl", "rdfType": null, "title": null, "type": "integer" }, { "constraints": null, "description": null, "example": null, "format": null, "name": "fuelsystem_4bbl", "rdfType": null, "title": null, "type": "integer" }, { "constraints": null, "description": null, "example": null, "format": null, "name": "fuelsystem_idi", "rdfType": null, "title": null, "type": "integer" }, { "constraints": null, "description": null, "example": null, "format": null, "name": "fuelsystem_mfi", "rdfType": null, "title": null, "type": "integer" }, { "constraints": null, "description": null, "example": null, "format": null, "name": "fuelsystem_mpfi", "rdfType": null, "title": null, "type": "integer" }, { 
"constraints": null, "description": null, "example": null, "format": null, "name": "fuelsystem_spdi", "rdfType": null, "title": null, "type": "integer" }, { "constraints": null, "description": null, "example": null, "format": null, "name": "fuelsystem_spfi", "rdfType": null, "title": null, "type": "integer" }, { "constraints": null, "description": null, "example": null, "format": null, "name": "boreratio", "rdfType": null, "title": null, "type": "number" }, { "constraints": null, "description": null, "example": null, "format": null, "name": "horsepower", "rdfType": null, "title": null, "type": "integer" }, { "constraints": null, "description": null, "example": null, "format": null, "name": "citympg", "rdfType": null, "title": null, "type": "integer" }, { "constraints": null, "description": null, "example": null, "format": null, "name": "highwaympg", "rdfType": null, "title": null, "type": "integer" }, { "constraints": null, "description": null, "example": null, "format": null, "name": "price", "rdfType": null, "title": null, "type": "number" } ], "foreignKeys": null, "missingValues": null, "primaryKey": null }, "sources": null, "title": null }, "text/html": [ "
" ] }, "execution_count": 23, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import { oneHotEncoding } from \"https://l12.xyz/x/shortcuts/raw/encoding.ts\";\n", "\n", "let encodedCarData = oneHotEncoding(carDataGeneralized);\n", "encodedCarData.head(5);" ] }, { "cell_type": "code", "execution_count": 24, "id": "aff549ce-6736-43c7-8d40-b09b9ca7fa59", "metadata": { "editable": true, "slideshow": { "slide_type": "" }, "tags": [] }, "outputs": [], "source": [ "encodedCarData.writeCSV(\"assets/encoded_car_data.csv\");" ] }, { "cell_type": "code", "execution_count": null, "id": "ec799133-584f-450d-bd8b-6b443fbf5fb5", "metadata": { "editable": true, "slideshow": { "slide_type": "" }, "tags": [] }, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Deno", "language": "typescript", "name": "deno" }, "language_info": { "codemirror_mode": "typescript", "file_extension": ".ts", "mimetype": "text/x.typescript", "name": "typescript", "nbconvert_exporter": "script", "pygments_lexer": "typescript", "version": "5.6.2" } }, "nbformat": 4, "nbformat_minor": 5 }