shortcuts/notebooks/data_analytics_js.ipynb

4464 lines
723 KiB
Plaintext
Raw Permalink Normal View History

2024-09-25 19:52:53 +00:00
{
"cells": [
{
"cell_type": "markdown",
"id": "b9db28d4-ea49-443c-a5d6-e1c44bfe4942",
"metadata": {
"editable": true,
"slideshow": {
"slide_type": ""
},
"tags": []
},
"source": [
"# Data Analytics with JS\n",
"\n",
2024-09-26 01:41:33 +00:00
"The dataset contains all the information about cars, a name of a manufacturer,\n",
"all car's technical parameters and a sale price of a car.\n",
2024-09-25 19:52:53 +00:00
"\n",
"Libraries:\n",
2024-09-26 01:41:33 +00:00
"\n",
"- nodejs-polars\n",
"- @observable/plot"
2024-09-25 19:52:53 +00:00
]
},
{
"cell_type": "markdown",
"id": "d163c580-6aa0-4e8c-a780-5cc931003dc8",
"metadata": {
"editable": true,
"slideshow": {
"slide_type": ""
},
"tags": []
},
"source": [
"## Exploring Data\n",
"\n",
2024-09-26 01:41:33 +00:00
"Use [pola-rs](https://pola-rs.github.io/nodejs-polars/modules.html) dataframes\n",
"to read and manipulate data.\n",
2024-09-25 19:52:53 +00:00
"\n",
"- use `df.head(n)` to get n first rows\n",
"- use `df.sample(n)` to get n random rows from the dataset\n",
"- use `df.describe()` to get mean, std, min, max\n",
"- use `df.select(...cols)` to get new dataframe for columns\n",
"- use `df.groupBy(..cols).agg()` to group data and get aggragates"
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "1cb3589a-bffe-422a-8e32-a853161b93c4",
"metadata": {
"editable": true,
"slideshow": {
"slide_type": ""
},
"tags": []
},
"outputs": [
{
"data": {
"application/vnd.dataresource+json": {
"bytes": null,
"data": [
{
"CarName": "alfa-romero giulia",
"aspiration": "std",
"boreratio": 3.47,
"car_ID": 1,
"carbody": "convertible",
"carheight": 48.8,
"carlength": 168.8,
"carwidth": 64.1,
"citympg": 21,
"compressionratio": 9,
"curbweight": 2548,
"cylindernumber": "four",
"doornumber": "two",
"drivewheel": "rwd",
"enginelocation": "front",
"enginesize": 130,
"enginetype": "dohc",
"fuelsystem": "mpfi",
"fueltype": "gas",
"highwaympg": 27,
"horsepower": 111,
"peakrpm": 5000,
"price": 13495,
"stroke": 2.68,
"symboling": 3,
"wheelbase": 88.6
},
{
"CarName": "alfa-romero stelvio",
"aspiration": "std",
"boreratio": 3.47,
"car_ID": 2,
"carbody": "convertible",
"carheight": 48.8,
"carlength": 168.8,
"carwidth": 64.1,
"citympg": 21,
"compressionratio": 9,
"curbweight": 2548,
"cylindernumber": "four",
"doornumber": "two",
"drivewheel": "rwd",
"enginelocation": "front",
"enginesize": 130,
"enginetype": "dohc",
"fuelsystem": "mpfi",
"fueltype": "gas",
"highwaympg": 27,
"horsepower": 111,
"peakrpm": 5000,
"price": 16500,
"stroke": 2.68,
"symboling": 3,
"wheelbase": 88.6
},
{
"CarName": "alfa-romero Quadrifoglio",
"aspiration": "std",
"boreratio": 2.68,
"car_ID": 3,
"carbody": "hatchback",
"carheight": 52.4,
"carlength": 171.2,
"carwidth": 65.5,
"citympg": 19,
"compressionratio": 9,
"curbweight": 2823,
"cylindernumber": "six",
"doornumber": "two",
"drivewheel": "rwd",
"enginelocation": "front",
"enginesize": 152,
"enginetype": "ohcv",
"fuelsystem": "mpfi",
"fueltype": "gas",
"highwaympg": 26,
"horsepower": 154,
"peakrpm": 5000,
"price": 16500,
"stroke": 3.47,
"symboling": 1,
"wheelbase": 94.5
},
{
"CarName": "audi 100 ls",
"aspiration": "std",
"boreratio": 3.19,
"car_ID": 4,
"carbody": "sedan",
"carheight": 54.3,
"carlength": 176.6,
"carwidth": 66.2,
"citympg": 24,
"compressionratio": 10,
"curbweight": 2337,
"cylindernumber": "four",
"doornumber": "four",
"drivewheel": "fwd",
"enginelocation": "front",
"enginesize": 109,
"enginetype": "ohc",
"fuelsystem": "mpfi",
"fueltype": "gas",
"highwaympg": 30,
"horsepower": 102,
"peakrpm": 5500,
"price": 13950,
"stroke": 3.4,
"symboling": 2,
"wheelbase": 99.8
},
{
"CarName": "audi 100ls",
"aspiration": "std",
"boreratio": 3.19,
"car_ID": 5,
"carbody": "sedan",
"carheight": 54.3,
"carlength": 176.6,
"carwidth": 66.4,
"citympg": 18,
"compressionratio": 8,
"curbweight": 2824,
"cylindernumber": "five",
"doornumber": "four",
"drivewheel": "4wd",
"enginelocation": "front",
"enginesize": 136,
"enginetype": "ohc",
"fuelsystem": "mpfi",
"fueltype": "gas",
"highwaympg": 22,
"horsepower": 115,
"peakrpm": 5500,
"price": 17450,
"stroke": 3.4,
"symboling": 2,
"wheelbase": 99.4
}
],
"description": null,
"dialect": null,
"encoding": null,
"format": null,
"hash": null,
"homepage": null,
"licenses": null,
"mediatype": null,
"path": null,
"schema": {
"fields": [
{
"constraints": null,
"description": null,
"example": null,
"format": null,
"name": "car_ID",
"rdfType": null,
"title": null,
"type": "integer"
},
{
"constraints": null,
"description": null,
"example": null,
"format": null,
"name": "symboling",
"rdfType": null,
"title": null,
"type": "integer"
},
{
"constraints": null,
"description": null,
"example": null,
"format": null,
"name": "CarName",
"rdfType": null,
"title": null,
"type": "string"
},
{
"constraints": null,
"description": null,
"example": null,
"format": null,
"name": "fueltype",
"rdfType": null,
"title": null,
"type": "string"
},
{
"constraints": null,
"description": null,
"example": null,
"format": null,
"name": "aspiration",
"rdfType": null,
"title": null,
"type": "string"
},
{
"constraints": null,
"description": null,
"example": null,
"format": null,
"name": "doornumber",
"rdfType": null,
"title": null,
"type": "string"
},
{
"constraints": null,
"description": null,
"example": null,
"format": null,
"name": "carbody",
"rdfType": null,
"title": null,
"type": "string"
},
{
"constraints": null,
"description": null,
"example": null,
"format": null,
"name": "drivewheel",
"rdfType": null,
"title": null,
"type": "string"
},
{
"constraints": null,
"description": null,
"example": null,
"format": null,
"name": "enginelocation",
"rdfType": null,
"title": null,
"type": "string"
},
{
"constraints": null,
"description": null,
"example": null,
"format": null,
"name": "wheelbase",
"rdfType": null,
"title": null,
"type": "number"
},
{
"constraints": null,
"description": null,
"example": null,
"format": null,
"name": "carlength",
"rdfType": null,
"title": null,
"type": "number"
},
{
"constraints": null,
"description": null,
"example": null,
"format": null,
"name": "carwidth",
"rdfType": null,
"title": null,
"type": "number"
},
{
"constraints": null,
"description": null,
"example": null,
"format": null,
"name": "carheight",
"rdfType": null,
"title": null,
"type": "number"
},
{
"constraints": null,
"description": null,
"example": null,
"format": null,
"name": "curbweight",
"rdfType": null,
"title": null,
"type": "integer"
},
{
"constraints": null,
"description": null,
"example": null,
"format": null,
"name": "enginetype",
"rdfType": null,
"title": null,
"type": "string"
},
{
"constraints": null,
"description": null,
"example": null,
"format": null,
"name": "cylindernumber",
"rdfType": null,
"title": null,
"type": "string"
},
{
"constraints": null,
"description": null,
"example": null,
"format": null,
"name": "enginesize",
"rdfType": null,
"title": null,
"type": "integer"
},
{
"constraints": null,
"description": null,
"example": null,
"format": null,
"name": "fuelsystem",
"rdfType": null,
"title": null,
"type": "string"
},
{
"constraints": null,
"description": null,
"example": null,
"format": null,
"name": "boreratio",
"rdfType": null,
"title": null,
"type": "number"
},
{
"constraints": null,
"description": null,
"example": null,
"format": null,
"name": "stroke",
"rdfType": null,
"title": null,
"type": "number"
},
{
"constraints": null,
"description": null,
"example": null,
"format": null,
"name": "compressionratio",
"rdfType": null,
"title": null,
"type": "number"
},
{
"constraints": null,
"description": null,
"example": null,
"format": null,
"name": "horsepower",
"rdfType": null,
"title": null,
"type": "integer"
},
{
"constraints": null,
"description": null,
"example": null,
"format": null,
"name": "peakrpm",
"rdfType": null,
"title": null,
"type": "integer"
},
{
"constraints": null,
"description": null,
"example": null,
"format": null,
"name": "citympg",
"rdfType": null,
"title": null,
"type": "integer"
},
{
"constraints": null,
"description": null,
"example": null,
"format": null,
"name": "highwaympg",
"rdfType": null,
"title": null,
"type": "integer"
},
{
"constraints": null,
"description": null,
"example": null,
"format": null,
"name": "price",
"rdfType": null,
"title": null,
"type": "number"
}
],
"foreignKeys": null,
"missingValues": null,
"primaryKey": null
},
"sources": null,
"title": null
},
"text/html": [
"<table><thead><tr><th>car_ID</th><th>symboling</th><th>CarName</th><th>fueltype</th><th>aspiration</th><th>doornumber</th><th>carbody</th><th>drivewheel</th><th>enginelocation</th><th>wheelbase</th><th>carlength</th><th>carwidth</th><th>carheight</th><th>curbweight</th><th>enginetype</th><th>cylindernumber</th><th>enginesize</th><th>fuelsystem</th><th>boreratio</th><th>stroke</th><th>compressionratio</th><th>horsepower</th><th>peakrpm</th><th>citympg</th><th>highwaympg</th><th>price</th></tr></thead><tbody><tr><td>1</td><td>3</td><td>alfa-romero giulia</td><td>gas</td><td>std</td><td>two</td><td>convertible</td><td>rwd</td><td>front</td><td>88.6</td><td>168.8</td><td>64.1</td><td>48.8</td><td>2548</td><td>dohc</td><td>four</td><td>130</td><td>mpfi</td><td>3.47</td><td>2.68</td><td>9</td><td>111</td><td>5000</td><td>21</td><td>27</td><td>13495</td></tr><tr><td>2</td><td>3</td><td>alfa-romero stelvio</td><td>gas</td><td>std</td><td>two</td><td>convertible</td><td>rwd</td><td>front</td><td>88.6</td><td>168.8</td><td>64.1</td><td>48.8</td><td>2548</td><td>dohc</td><td>four</td><td>130</td><td>mpfi</td><td>3.47</td><td>2.68</td><td>9</td><td>111</td><td>5000</td><td>21</td><td>27</td><td>16500</td></tr><tr><td>3</td><td>1</td><td>alfa-romero Quadrifoglio</td><td>gas</td><td>std</td><td>two</td><td>hatchback</td><td>rwd</td><td>front</td><td>94.5</td><td>171.2</td><td>65.5</td><td>52.4</td><td>2823</td><td>ohcv</td><td>six</td><td>152</td><td>mpfi</td><td>2.68</td><td>3.47</td><td>9</td><td>154</td><td>5000</td><td>19</td><td>26</td><td>16500</td></tr><tr><td>4</td><td>2</td><td>audi 100 ls</td><td>gas</td><td>std</td><td>four</td><td>sedan</td><td>fwd</td><td>front</td><td>99.8</td><td>176.6</td><td>66.2</td><td>54.3</td><td>2337</td><td>ohc</td><td>four</td><td>109</td><td>mpfi</td><td>3.19</td><td>3.4</td><td>10</td><td>102</td><td>5500</td><td>24</td><td>30</td><td>13950</td></tr><tr><td>5</td><td>2</td><td>audi 
100ls</td><td>gas</td><td>std</td><td>four</td><td>sedan</td><td>4wd</td><td>front</td><td>99.4</td><td>176.6</td><td>66.4</td><td>54.3</td><td>2824</td><td>ohc</td><td>five</td><td>136</td><td>mpfi</td><td>3.19</td><td>3.4</td><td>8</td><td>115</td><td>5500</td><td>18</td><td>22</td><td>17450</td></tr></tbody></table>"
]
},
"execution_count": 1,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
2024-09-26 01:41:33 +00:00
"import { display } from \"https://deno.land/x/display@v0.1.1/mod.ts\";\n",
2024-10-04 05:24:07 +00:00
"import { Plot } from \"https://l12.xyz/x/shortcuts/raw/plots.ts\";\n",
2024-09-26 01:41:33 +00:00
"import pl from \"npm:nodejs-polars\";\n",
2024-09-25 19:52:53 +00:00
"\n",
2024-09-26 01:41:33 +00:00
"let data = await Deno.readTextFile(\"assets/CarPrice_Assignment.csv\");\n",
"let df = pl.readCSV(data, { sep: \",\" });\n",
2024-09-25 19:52:53 +00:00
"\n",
2024-09-26 01:41:33 +00:00
"await display(df.head(5));"
2024-09-25 19:52:53 +00:00
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "a82e43fd-04a6-4bc8-a752-3b03abd6e983",
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.dataresource+json": {
"bytes": null,
"data": [
{
"describe": "mean",
"enginesize": 126.90731707317073,
"horsepower": 104.1170731707317,
"price": 13276.710570731706
},
{
"describe": "std",
"enginesize": 41.642693438179855,
"horsepower": 39.54416680936116,
"price": 7988.852331743148
},
{
"describe": "min",
"enginesize": 61,
"horsepower": 48,
"price": 5118
},
{
"describe": "max",
"enginesize": 326,
"horsepower": 288,
"price": 45400
},
{
"describe": "median",
"enginesize": 120,
"horsepower": 95,
"price": 10295
}
],
"description": null,
"dialect": null,
"encoding": null,
"format": null,
"hash": null,
"homepage": null,
"licenses": null,
"mediatype": null,
"path": null,
"schema": {
"fields": [
{
"constraints": null,
"description": null,
"example": null,
"format": null,
"name": "describe",
"rdfType": null,
"title": null,
"type": "string"
},
{
"constraints": null,
"description": null,
"example": null,
"format": null,
"name": "enginesize",
"rdfType": null,
"title": null,
"type": "number"
},
{
"constraints": null,
"description": null,
"example": null,
"format": null,
"name": "horsepower",
"rdfType": null,
"title": null,
"type": "number"
},
{
"constraints": null,
"description": null,
"example": null,
"format": null,
"name": "price",
"rdfType": null,
"title": null,
"type": "number"
}
],
"foreignKeys": null,
"missingValues": null,
"primaryKey": null
},
"sources": null,
"title": null
},
"text/html": [
"<table><thead><tr><th>describe</th><th>enginesize</th><th>horsepower</th><th>price</th></tr></thead><tbody><tr><td>mean</td><td>126.90731707317073</td><td>104.1170731707317</td><td>13276.710570731706</td></tr><tr><td>std</td><td>41.642693438179855</td><td>39.54416680936116</td><td>7988.852331743148</td></tr><tr><td>min</td><td>61</td><td>48</td><td>5118</td></tr><tr><td>max</td><td>326</td><td>288</td><td>45400</td></tr><tr><td>median</td><td>120</td><td>95</td><td>10295</td></tr></tbody></table>"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"await display(\n",
2024-09-26 01:41:33 +00:00
" df.select(\n",
" \"enginesize\",\n",
" \"horsepower\",\n",
" \"price\",\n",
" ).describe(),\n",
");"
2024-09-25 19:52:53 +00:00
]
},
{
"cell_type": "markdown",
"id": "5c931435-0e19-4d32-8ca8-ae81a9cb43c8",
"metadata": {},
"source": [
"## Data Cleaning"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "e70e5f22-bdde-4140-9015-a6281b8478bf",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"\u001b[33mfalse\u001b[39m"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"// check for duplicates\n",
2024-09-26 01:41:33 +00:00
"const hasDups = df.select(\"car_ID\").isDuplicated().toArray().includes(true);\n",
2024-09-25 19:52:53 +00:00
"// if there are duplicates, use df.filter()\n",
2024-09-26 01:41:33 +00:00
"hasDups;"
2024-09-25 19:52:53 +00:00
]
},
{
"cell_type": "markdown",
"id": "7fb89a88-f3ad-457d-9211-2b75eef6096f",
"metadata": {},
"source": [
"Resolve categorical values. For this dataset it is a brand name:"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "ffb4509e-1930-4e2e-954c-51e919084514",
"metadata": {
"scrolled": true
},
"outputs": [
{
"data": {
"application/vnd.dataresource+json": {
"bytes": null,
"data": [
{
2024-10-04 05:24:07 +00:00
"brand": "toyota"
2024-09-25 19:52:53 +00:00
},
{
2024-10-04 05:24:07 +00:00
"brand": "mercury"
2024-09-25 19:52:53 +00:00
},
{
2024-10-04 05:24:07 +00:00
"brand": "mitsubishi"
2024-09-25 19:52:53 +00:00
},
{
2024-10-04 05:24:07 +00:00
"brand": "honda"
2024-09-25 19:52:53 +00:00
},
{
2024-10-04 05:24:07 +00:00
"brand": "mazda"
2024-09-25 19:52:53 +00:00
},
{
2024-10-04 05:24:07 +00:00
"brand": "peugeot"
2024-09-25 19:52:53 +00:00
},
{
2024-10-04 05:24:07 +00:00
"brand": "porcshce"
2024-09-25 19:52:53 +00:00
},
{
2024-10-04 05:24:07 +00:00
"brand": "renault"
2024-09-25 19:52:53 +00:00
},
{
2024-10-04 05:24:07 +00:00
"brand": "isuzu"
2024-09-25 19:52:53 +00:00
},
{
2024-10-04 05:24:07 +00:00
"brand": "dodge"
2024-09-25 19:52:53 +00:00
},
{
2024-10-04 05:24:07 +00:00
"brand": "bmw"
2024-09-25 19:52:53 +00:00
},
{
2024-10-04 05:24:07 +00:00
"brand": "vw"
2024-09-25 19:52:53 +00:00
},
{
2024-10-04 05:24:07 +00:00
"brand": "maxda"
2024-09-25 19:52:53 +00:00
},
{
2024-10-04 05:24:07 +00:00
"brand": "volkswagen"
2024-09-25 19:52:53 +00:00
},
{
2024-10-04 05:24:07 +00:00
"brand": "alfa-romero"
2024-09-25 19:52:53 +00:00
},
{
2024-10-04 05:24:07 +00:00
"brand": "chevrolet"
2024-09-25 19:52:53 +00:00
},
{
2024-10-04 05:24:07 +00:00
"brand": "toyouta"
2024-09-25 19:52:53 +00:00
},
{
2024-10-04 05:24:07 +00:00
"brand": "jaguar"
2024-09-25 19:52:53 +00:00
},
{
2024-10-04 05:24:07 +00:00
"brand": "saab"
2024-09-25 19:52:53 +00:00
},
{
2024-10-04 05:24:07 +00:00
"brand": "porsche"
2024-09-25 19:52:53 +00:00
},
{
2024-10-04 05:24:07 +00:00
"brand": "audi"
2024-09-25 19:52:53 +00:00
},
{
2024-10-04 05:24:07 +00:00
"brand": "vokswagen"
2024-09-25 19:52:53 +00:00
},
{
2024-10-04 05:24:07 +00:00
"brand": "subaru"
2024-09-25 19:52:53 +00:00
},
{
2024-10-04 05:24:07 +00:00
"brand": "nissan"
2024-09-25 19:52:53 +00:00
},
{
2024-10-04 05:24:07 +00:00
"brand": "plymouth"
2024-09-25 19:52:53 +00:00
},
{
2024-09-25 20:34:25 +00:00
"brand": "volvo"
2024-09-25 19:52:53 +00:00
},
{
2024-10-04 05:24:07 +00:00
"brand": "buick"
2024-09-25 19:52:53 +00:00
}
],
"description": null,
"dialect": null,
"encoding": null,
"format": null,
"hash": null,
"homepage": null,
"licenses": null,
"mediatype": null,
"path": null,
"schema": {
"fields": [
{
"constraints": null,
"description": null,
"example": null,
"format": null,
"name": "brand",
"rdfType": null,
"title": null,
"type": "string"
}
],
"foreignKeys": null,
"missingValues": null,
"primaryKey": null
},
"sources": null,
"title": null
},
"text/html": [
2024-10-04 05:24:07 +00:00
"<table><thead><tr><th>brand</th></tr></thead><tbody><tr><td>toyota</td></tr><tr><td>mercury</td></tr><tr><td>mitsubishi</td></tr><tr><td>honda</td></tr><tr><td>mazda</td></tr><tr><td>peugeot</td></tr><tr><td>porcshce</td></tr><tr><td>renault</td></tr><tr><td>isuzu</td></tr><tr><td>dodge</td></tr><tr><td>bmw</td></tr><tr><td>vw</td></tr><tr><td>maxda</td></tr><tr><td>volkswagen</td></tr><tr><td>alfa-romero</td></tr><tr><td>chevrolet</td></tr><tr><td>toyouta</td></tr><tr><td>jaguar</td></tr><tr><td>saab</td></tr><tr><td>porsche</td></tr><tr><td>audi</td></tr><tr><td>vokswagen</td></tr><tr><td>subaru</td></tr><tr><td>nissan</td></tr><tr><td>plymouth</td></tr><tr><td>volvo</td></tr><tr><td>buick</td></tr></tbody></table>"
2024-09-25 19:52:53 +00:00
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"// get brand names from `CarName`\n",
2024-09-26 01:41:33 +00:00
"let brandNameTable = df.select(\"CarName\").map((row) => {\n",
" const [carName] = row;\n",
" const brand = carName.split(\" \")[0].toLowerCase();\n",
" return brand;\n",
"});\n",
2024-09-25 19:52:53 +00:00
"\n",
"// create a dataframe from brand names\n",
"let brandDf = pl.DataFrame({\n",
2024-09-26 01:41:33 +00:00
" \"brand\": brandNameTable,\n",
"});\n",
"await display(brandDf.unique());"
2024-09-25 19:52:53 +00:00
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "7f91cbc3-364c-49c9-97e3-370f78b708f8",
"metadata": {
"scrolled": true
},
"outputs": [
{
"data": {
"application/vnd.dataresource+json": {
"bytes": null,
"data": [
{
2024-10-04 05:24:07 +00:00
"brand": "nissan"
2024-09-25 19:52:53 +00:00
},
{
2024-10-04 05:24:07 +00:00
"brand": "toyota"
2024-09-25 19:52:53 +00:00
},
{
2024-10-04 05:24:07 +00:00
"brand": "isuzu"
2024-09-25 19:52:53 +00:00
},
{
2024-10-04 05:24:07 +00:00
"brand": "plymouth"
2024-09-25 19:52:53 +00:00
},
{
2024-10-04 05:24:07 +00:00
"brand": "subaru"
2024-09-25 19:52:53 +00:00
},
{
2024-10-04 05:24:07 +00:00
"brand": "audi"
2024-09-25 19:52:53 +00:00
},
{
2024-10-04 05:24:07 +00:00
"brand": "renault"
2024-09-25 19:52:53 +00:00
},
{
2024-10-04 05:24:07 +00:00
"brand": "honda"
2024-09-25 19:52:53 +00:00
},
{
2024-10-04 05:24:07 +00:00
"brand": "dodge"
2024-09-25 19:52:53 +00:00
},
{
2024-10-04 05:24:07 +00:00
"brand": "bmw"
2024-09-25 19:52:53 +00:00
},
{
2024-10-04 05:24:07 +00:00
"brand": "volvo"
2024-09-25 19:52:53 +00:00
},
{
2024-10-04 05:24:07 +00:00
"brand": "peugeot"
2024-09-25 19:52:53 +00:00
},
{
2024-10-04 05:24:07 +00:00
"brand": "volkswagen"
2024-09-25 19:52:53 +00:00
},
{
2024-10-04 05:24:07 +00:00
"brand": "chevrolet"
2024-09-25 19:52:53 +00:00
},
{
2024-10-04 05:24:07 +00:00
"brand": "mitsubishi"
2024-09-25 19:52:53 +00:00
},
{
2024-10-04 05:24:07 +00:00
"brand": "buick"
2024-09-25 19:52:53 +00:00
},
{
2024-10-04 05:24:07 +00:00
"brand": "alfa-romero"
2024-09-25 19:52:53 +00:00
},
{
2024-10-04 05:24:07 +00:00
"brand": "porsche"
2024-09-25 19:52:53 +00:00
},
{
2024-10-04 05:24:07 +00:00
"brand": "mazda"
2024-09-25 19:52:53 +00:00
},
{
2024-10-04 05:24:07 +00:00
"brand": "saab"
2024-09-25 19:52:53 +00:00
},
{
2024-10-04 05:24:07 +00:00
"brand": "jaguar"
2024-09-25 20:34:25 +00:00
},
{
2024-10-04 05:24:07 +00:00
"brand": "mercury"
2024-09-25 19:52:53 +00:00
}
],
"description": null,
"dialect": null,
"encoding": null,
"format": null,
"hash": null,
"homepage": null,
"licenses": null,
"mediatype": null,
"path": null,
"schema": {
"fields": [
{
"constraints": null,
"description": null,
"example": null,
"format": null,
"name": "brand",
"rdfType": null,
"title": null,
"type": "string"
}
],
"foreignKeys": null,
"missingValues": null,
"primaryKey": null
},
"sources": null,
"title": null
},
"text/html": [
2024-10-04 05:24:07 +00:00
"<table><thead><tr><th>brand</th></tr></thead><tbody><tr><td>nissan</td></tr><tr><td>toyota</td></tr><tr><td>isuzu</td></tr><tr><td>plymouth</td></tr><tr><td>subaru</td></tr><tr><td>audi</td></tr><tr><td>renault</td></tr><tr><td>honda</td></tr><tr><td>dodge</td></tr><tr><td>bmw</td></tr><tr><td>volvo</td></tr><tr><td>peugeot</td></tr><tr><td>volkswagen</td></tr><tr><td>chevrolet</td></tr><tr><td>mitsubishi</td></tr><tr><td>buick</td></tr><tr><td>alfa-romero</td></tr><tr><td>porsche</td></tr><tr><td>mazda</td></tr><tr><td>saab</td></tr><tr><td>jaguar</td></tr><tr><td>mercury</td></tr></tbody></table>"
2024-09-25 19:52:53 +00:00
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"// transform to remove duplicates\n",
"brandNameTable = brandNameTable.map((name) => {\n",
2024-09-26 01:41:33 +00:00
" name = name\n",
" .replace(\"maxda\", \"mazda\")\n",
" .replace(\"porcshce\", \"porsche\")\n",
" .replace(\"toyouta\", \"toyota\")\n",
" .replace(/(vw|vokswagen)/ig, \"volkswagen\");\n",
" return name;\n",
"});\n",
2024-09-25 19:52:53 +00:00
"\n",
"brandDf = pl.DataFrame({\n",
2024-09-26 01:41:33 +00:00
" brand: brandNameTable,\n",
"});\n",
"await display(brandDf.unique());"
2024-09-25 19:52:53 +00:00
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "eb19de54-11bb-40af-8e27-e5e3a6bd11f0",
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.dataresource+json": {
"bytes": null,
"data": [
{
"CarName": "alfa-romero giulia",
"aspiration": "std",
"boreratio": 3.47,
"brand": "alfa-romero",
"car_ID": 1,
"carbody": "convertible",
"carheight": 48.8,
"carlength": 168.8,
"carwidth": 64.1,
"citympg": 21,
"compressionratio": 9,
"curbweight": 2548,
"cylindernumber": "four",
"doornumber": "two",
"drivewheel": "rwd",
"enginelocation": "front",
"enginesize": 130,
"enginetype": "dohc",
"fuelsystem": "mpfi",
"fueltype": "gas",
"highwaympg": 27,
"horsepower": 111,
"peakrpm": 5000,
"price": 13495,
"stroke": 2.68,
"symboling": 3,
"wheelbase": 88.6
},
{
"CarName": "alfa-romero stelvio",
"aspiration": "std",
"boreratio": 3.47,
"brand": "alfa-romero",
"car_ID": 2,
"carbody": "convertible",
"carheight": 48.8,
"carlength": 168.8,
"carwidth": 64.1,
"citympg": 21,
"compressionratio": 9,
"curbweight": 2548,
"cylindernumber": "four",
"doornumber": "two",
"drivewheel": "rwd",
"enginelocation": "front",
"enginesize": 130,
"enginetype": "dohc",
"fuelsystem": "mpfi",
"fueltype": "gas",
"highwaympg": 27,
"horsepower": 111,
"peakrpm": 5000,
"price": 16500,
"stroke": 2.68,
"symboling": 3,
"wheelbase": 88.6
},
{
"CarName": "alfa-romero Quadrifoglio",
"aspiration": "std",
"boreratio": 2.68,
"brand": "alfa-romero",
"car_ID": 3,
"carbody": "hatchback",
"carheight": 52.4,
"carlength": 171.2,
"carwidth": 65.5,
"citympg": 19,
"compressionratio": 9,
"curbweight": 2823,
"cylindernumber": "six",
"doornumber": "two",
"drivewheel": "rwd",
"enginelocation": "front",
"enginesize": 152,
"enginetype": "ohcv",
"fuelsystem": "mpfi",
"fueltype": "gas",
"highwaympg": 26,
"horsepower": 154,
"peakrpm": 5000,
"price": 16500,
"stroke": 3.47,
"symboling": 1,
"wheelbase": 94.5
},
{
"CarName": "audi 100 ls",
"aspiration": "std",
"boreratio": 3.19,
"brand": "audi",
"car_ID": 4,
"carbody": "sedan",
"carheight": 54.3,
"carlength": 176.6,
"carwidth": 66.2,
"citympg": 24,
"compressionratio": 10,
"curbweight": 2337,
"cylindernumber": "four",
"doornumber": "four",
"drivewheel": "fwd",
"enginelocation": "front",
"enginesize": 109,
"enginetype": "ohc",
"fuelsystem": "mpfi",
"fueltype": "gas",
"highwaympg": 30,
"horsepower": 102,
"peakrpm": 5500,
"price": 13950,
"stroke": 3.4,
"symboling": 2,
"wheelbase": 99.8
},
{
"CarName": "audi 100ls",
"aspiration": "std",
"boreratio": 3.19,
"brand": "audi",
"car_ID": 5,
"carbody": "sedan",
"carheight": 54.3,
"carlength": 176.6,
"carwidth": 66.4,
"citympg": 18,
"compressionratio": 8,
"curbweight": 2824,
"cylindernumber": "five",
"doornumber": "four",
"drivewheel": "4wd",
"enginelocation": "front",
"enginesize": 136,
"enginetype": "ohc",
"fuelsystem": "mpfi",
"fueltype": "gas",
"highwaympg": 22,
"horsepower": 115,
"peakrpm": 5500,
"price": 17450,
"stroke": 3.4,
"symboling": 2,
"wheelbase": 99.4
}
],
"description": null,
"dialect": null,
"encoding": null,
"format": null,
"hash": null,
"homepage": null,
"licenses": null,
"mediatype": null,
"path": null,
"schema": {
"fields": [
{
"constraints": null,
"description": null,
"example": null,
"format": null,
"name": "brand",
"rdfType": null,
"title": null,
"type": "string"
},
{
"constraints": null,
"description": null,
"example": null,
"format": null,
"name": "car_ID",
"rdfType": null,
"title": null,
"type": "integer"
},
{
"constraints": null,
"description": null,
"example": null,
"format": null,
"name": "symboling",
"rdfType": null,
"title": null,
"type": "integer"
},
{
"constraints": null,
"description": null,
"example": null,
"format": null,
"name": "CarName",
"rdfType": null,
"title": null,
"type": "string"
},
{
"constraints": null,
"description": null,
"example": null,
"format": null,
"name": "fueltype",
"rdfType": null,
"title": null,
"type": "string"
},
{
"constraints": null,
"description": null,
"example": null,
"format": null,
"name": "aspiration",
"rdfType": null,
"title": null,
"type": "string"
},
{
"constraints": null,
"description": null,
"example": null,
"format": null,
"name": "doornumber",
"rdfType": null,
"title": null,
"type": "string"
},
{
"constraints": null,
"description": null,
"example": null,
"format": null,
"name": "carbody",
"rdfType": null,
"title": null,
"type": "string"
},
{
"constraints": null,
"description": null,
"example": null,
"format": null,
"name": "drivewheel",
"rdfType": null,
"title": null,
"type": "string"
},
{
"constraints": null,
"description": null,
"example": null,
"format": null,
"name": "enginelocation",
"rdfType": null,
"title": null,
"type": "string"
},
{
"constraints": null,
"description": null,
"example": null,
"format": null,
"name": "wheelbase",
"rdfType": null,
"title": null,
"type": "number"
},
{
"constraints": null,
"description": null,
"example": null,
"format": null,
"name": "carlength",
"rdfType": null,
"title": null,
"type": "number"
},
{
"constraints": null,
"description": null,
"example": null,
"format": null,
"name": "carwidth",
"rdfType": null,
"title": null,
"type": "number"
},
{
"constraints": null,
"description": null,
"example": null,
"format": null,
"name": "carheight",
"rdfType": null,
"title": null,
"type": "number"
},
{
"constraints": null,
"description": null,
"example": null,
"format": null,
"name": "curbweight",
"rdfType": null,
"title": null,
"type": "integer"
},
{
"constraints": null,
"description": null,
"example": null,
"format": null,
"name": "enginetype",
"rdfType": null,
"title": null,
"type": "string"
},
{
"constraints": null,
"description": null,
"example": null,
"format": null,
"name": "cylindernumber",
"rdfType": null,
"title": null,
"type": "string"
},
{
"constraints": null,
"description": null,
"example": null,
"format": null,
"name": "enginesize",
"rdfType": null,
"title": null,
"type": "integer"
},
{
"constraints": null,
"description": null,
"example": null,
"format": null,
"name": "fuelsystem",
"rdfType": null,
"title": null,
"type": "string"
},
{
"constraints": null,
"description": null,
"example": null,
"format": null,
"name": "boreratio",
"rdfType": null,
"title": null,
"type": "number"
},
{
"constraints": null,
"description": null,
"example": null,
"format": null,
"name": "stroke",
"rdfType": null,
"title": null,
"type": "number"
},
{
"constraints": null,
"description": null,
"example": null,
"format": null,
"name": "compressionratio",
"rdfType": null,
"title": null,
"type": "number"
},
{
"constraints": null,
"description": null,
"example": null,
"format": null,
"name": "horsepower",
"rdfType": null,
"title": null,
"type": "integer"
},
{
"constraints": null,
"description": null,
"example": null,
"format": null,
"name": "peakrpm",
"rdfType": null,
"title": null,
"type": "integer"
},
{
"constraints": null,
"description": null,
"example": null,
"format": null,
"name": "citympg",
"rdfType": null,
"title": null,
"type": "integer"
},
{
"constraints": null,
"description": null,
"example": null,
"format": null,
"name": "highwaympg",
"rdfType": null,
"title": null,
"type": "integer"
},
{
"constraints": null,
"description": null,
"example": null,
"format": null,
"name": "price",
"rdfType": null,
"title": null,
"type": "number"
}
],
"foreignKeys": null,
"missingValues": null,
"primaryKey": null
},
"sources": null,
"title": null
},
"text/html": [
"<table><thead><tr><th>brand</th><th>car_ID</th><th>symboling</th><th>CarName</th><th>fueltype</th><th>aspiration</th><th>doornumber</th><th>carbody</th><th>drivewheel</th><th>enginelocation</th><th>wheelbase</th><th>carlength</th><th>carwidth</th><th>carheight</th><th>curbweight</th><th>enginetype</th><th>cylindernumber</th><th>enginesize</th><th>fuelsystem</th><th>boreratio</th><th>stroke</th><th>compressionratio</th><th>horsepower</th><th>peakrpm</th><th>citympg</th><th>highwaympg</th><th>price</th></tr></thead><tbody><tr><td>alfa-romero</td><td>1</td><td>3</td><td>alfa-romero giulia</td><td>gas</td><td>std</td><td>two</td><td>convertible</td><td>rwd</td><td>front</td><td>88.6</td><td>168.8</td><td>64.1</td><td>48.8</td><td>2548</td><td>dohc</td><td>four</td><td>130</td><td>mpfi</td><td>3.47</td><td>2.68</td><td>9</td><td>111</td><td>5000</td><td>21</td><td>27</td><td>13495</td></tr><tr><td>alfa-romero</td><td>2</td><td>3</td><td>alfa-romero stelvio</td><td>gas</td><td>std</td><td>two</td><td>convertible</td><td>rwd</td><td>front</td><td>88.6</td><td>168.8</td><td>64.1</td><td>48.8</td><td>2548</td><td>dohc</td><td>four</td><td>130</td><td>mpfi</td><td>3.47</td><td>2.68</td><td>9</td><td>111</td><td>5000</td><td>21</td><td>27</td><td>16500</td></tr><tr><td>alfa-romero</td><td>3</td><td>1</td><td>alfa-romero Quadrifoglio</td><td>gas</td><td>std</td><td>two</td><td>hatchback</td><td>rwd</td><td>front</td><td>94.5</td><td>171.2</td><td>65.5</td><td>52.4</td><td>2823</td><td>ohcv</td><td>six</td><td>152</td><td>mpfi</td><td>2.68</td><td>3.47</td><td>9</td><td>154</td><td>5000</td><td>19</td><td>26</td><td>16500</td></tr><tr><td>audi</td><td>4</td><td>2</td><td>audi 100 
ls</td><td>gas</td><td>std</td><td>four</td><td>sedan</td><td>fwd</td><td>front</td><td>99.8</td><td>176.6</td><td>66.2</td><td>54.3</td><td>2337</td><td>ohc</td><td>four</td><td>109</td><td>mpfi</td><td>3.19</td><td>3.4</td><td>10</td><td>102</td><td>5500</td><td>24</td><td>30</td><td>13950</td></tr><tr><td>audi</td><td>5</td><td>2</td><td>audi 100ls</td><td>gas</td><td>std</td><td>four</td><td>sedan</td><td>4wd</td><td>front</td><td>99.4</td><td>176.6</td><td>66.4</td><td>54.3</td><td>2824</td><td>ohc</td><td>five</td><td>136</td><td>mpfi</td><td>3.19</td><td>3.4</td><td>8</td><td>115</td><td>5500</td><td>18</td><td>22</td><td>17450</td></tr></tbody></table>"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"// add new column `brand` to our dataframe\n",
2024-09-26 01:41:33 +00:00
"df = brandDf.hstack(df);\n",
"await display(df.head(5));"
2024-09-25 19:52:53 +00:00
]
},
{
"cell_type": "markdown",
"id": "81abdb0d-41ce-445c-9a75-f9494eb1f9d3",
"metadata": {},
"source": [
"Drop unnecessary columns:"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "6da96c06-b0b6-46b7-be2d-824af909aec4",
"metadata": {
"editable": true,
"slideshow": {
"slide_type": ""
},
"tags": []
},
"outputs": [
{
"data": {
"application/vnd.dataresource+json": {
"bytes": null,
"data": [
{
"aspiration": "std",
"boreratio": 3.47,
"brand": "alfa-romero",
"carbody": "convertible",
"carheight": 48.8,
"carlength": 168.8,
"carwidth": 64.1,
"citympg": 21,
"compressionratio": 9,
"curbweight": 2548,
"cylindernumber": "four",
"doornumber": "two",
"drivewheel": "rwd",
"enginelocation": "front",
"enginesize": 130,
"enginetype": "dohc",
"fuelsystem": "mpfi",
"fueltype": "gas",
"highwaympg": 27,
"horsepower": 111,
"peakrpm": 5000,
"price": 13495,
"stroke": 2.68,
"wheelbase": 88.6
},
{
"aspiration": "std",
"boreratio": 3.47,
"brand": "alfa-romero",
"carbody": "convertible",
"carheight": 48.8,
"carlength": 168.8,
"carwidth": 64.1,
"citympg": 21,
"compressionratio": 9,
"curbweight": 2548,
"cylindernumber": "four",
"doornumber": "two",
"drivewheel": "rwd",
"enginelocation": "front",
"enginesize": 130,
"enginetype": "dohc",
"fuelsystem": "mpfi",
"fueltype": "gas",
"highwaympg": 27,
"horsepower": 111,
"peakrpm": 5000,
"price": 16500,
"stroke": 2.68,
"wheelbase": 88.6
},
{
"aspiration": "std",
"boreratio": 2.68,
"brand": "alfa-romero",
"carbody": "hatchback",
"carheight": 52.4,
"carlength": 171.2,
"carwidth": 65.5,
"citympg": 19,
"compressionratio": 9,
"curbweight": 2823,
"cylindernumber": "six",
"doornumber": "two",
"drivewheel": "rwd",
"enginelocation": "front",
"enginesize": 152,
"enginetype": "ohcv",
"fuelsystem": "mpfi",
"fueltype": "gas",
"highwaympg": 26,
"horsepower": 154,
"peakrpm": 5000,
"price": 16500,
"stroke": 3.47,
"wheelbase": 94.5
}
],
"description": null,
"dialect": null,
"encoding": null,
"format": null,
"hash": null,
"homepage": null,
"licenses": null,
"mediatype": null,
"path": null,
"schema": {
"fields": [
{
"constraints": null,
"description": null,
"example": null,
"format": null,
"name": "brand",
"rdfType": null,
"title": null,
"type": "string"
},
{
"constraints": null,
"description": null,
"example": null,
"format": null,
"name": "fueltype",
"rdfType": null,
"title": null,
"type": "string"
},
{
"constraints": null,
"description": null,
"example": null,
"format": null,
"name": "aspiration",
"rdfType": null,
"title": null,
"type": "string"
},
{
"constraints": null,
"description": null,
"example": null,
"format": null,
"name": "doornumber",
"rdfType": null,
"title": null,
"type": "string"
},
{
"constraints": null,
"description": null,
"example": null,
"format": null,
"name": "carbody",
"rdfType": null,
"title": null,
"type": "string"
},
{
"constraints": null,
"description": null,
"example": null,
"format": null,
"name": "drivewheel",
"rdfType": null,
"title": null,
"type": "string"
},
{
"constraints": null,
"description": null,
"example": null,
"format": null,
"name": "enginelocation",
"rdfType": null,
"title": null,
"type": "string"
},
{
"constraints": null,
"description": null,
"example": null,
"format": null,
"name": "wheelbase",
"rdfType": null,
"title": null,
"type": "number"
},
{
"constraints": null,
"description": null,
"example": null,
"format": null,
"name": "carlength",
"rdfType": null,
"title": null,
"type": "number"
},
{
"constraints": null,
"description": null,
"example": null,
"format": null,
"name": "carwidth",
"rdfType": null,
"title": null,
"type": "number"
},
{
"constraints": null,
"description": null,
"example": null,
"format": null,
"name": "carheight",
"rdfType": null,
"title": null,
"type": "number"
},
{
"constraints": null,
"description": null,
"example": null,
"format": null,
"name": "curbweight",
"rdfType": null,
"title": null,
"type": "integer"
},
{
"constraints": null,
"description": null,
"example": null,
"format": null,
"name": "enginetype",
"rdfType": null,
"title": null,
"type": "string"
},
{
"constraints": null,
"description": null,
"example": null,
"format": null,
"name": "cylindernumber",
"rdfType": null,
"title": null,
"type": "string"
},
{
"constraints": null,
"description": null,
"example": null,
"format": null,
"name": "enginesize",
"rdfType": null,
"title": null,
"type": "integer"
},
{
"constraints": null,
"description": null,
"example": null,
"format": null,
"name": "fuelsystem",
"rdfType": null,
"title": null,
"type": "string"
},
{
"constraints": null,
"description": null,
"example": null,
"format": null,
"name": "boreratio",
"rdfType": null,
"title": null,
"type": "number"
},
{
"constraints": null,
"description": null,
"example": null,
"format": null,
"name": "stroke",
"rdfType": null,
"title": null,
"type": "number"
},
{
"constraints": null,
"description": null,
"example": null,
"format": null,
"name": "compressionratio",
"rdfType": null,
"title": null,
"type": "number"
},
{
"constraints": null,
"description": null,
"example": null,
"format": null,
"name": "horsepower",
"rdfType": null,
"title": null,
"type": "integer"
},
{
"constraints": null,
"description": null,
"example": null,
"format": null,
"name": "peakrpm",
"rdfType": null,
"title": null,
"type": "integer"
},
{
"constraints": null,
"description": null,
"example": null,
"format": null,
"name": "citympg",
"rdfType": null,
"title": null,
"type": "integer"
},
{
"constraints": null,
"description": null,
"example": null,
"format": null,
"name": "highwaympg",
"rdfType": null,
"title": null,
"type": "integer"
},
{
"constraints": null,
"description": null,
"example": null,
"format": null,
"name": "price",
"rdfType": null,
"title": null,
"type": "number"
}
],
"foreignKeys": null,
"missingValues": null,
"primaryKey": null
},
"sources": null,
"title": null
},
"text/html": [
"<table><thead><tr><th>brand</th><th>fueltype</th><th>aspiration</th><th>doornumber</th><th>carbody</th><th>drivewheel</th><th>enginelocation</th><th>wheelbase</th><th>carlength</th><th>carwidth</th><th>carheight</th><th>curbweight</th><th>enginetype</th><th>cylindernumber</th><th>enginesize</th><th>fuelsystem</th><th>boreratio</th><th>stroke</th><th>compressionratio</th><th>horsepower</th><th>peakrpm</th><th>citympg</th><th>highwaympg</th><th>price</th></tr></thead><tbody><tr><td>alfa-romero</td><td>gas</td><td>std</td><td>two</td><td>convertible</td><td>rwd</td><td>front</td><td>88.6</td><td>168.8</td><td>64.1</td><td>48.8</td><td>2548</td><td>dohc</td><td>four</td><td>130</td><td>mpfi</td><td>3.47</td><td>2.68</td><td>9</td><td>111</td><td>5000</td><td>21</td><td>27</td><td>13495</td></tr><tr><td>alfa-romero</td><td>gas</td><td>std</td><td>two</td><td>convertible</td><td>rwd</td><td>front</td><td>88.6</td><td>168.8</td><td>64.1</td><td>48.8</td><td>2548</td><td>dohc</td><td>four</td><td>130</td><td>mpfi</td><td>3.47</td><td>2.68</td><td>9</td><td>111</td><td>5000</td><td>21</td><td>27</td><td>16500</td></tr><tr><td>alfa-romero</td><td>gas</td><td>std</td><td>two</td><td>hatchback</td><td>rwd</td><td>front</td><td>94.5</td><td>171.2</td><td>65.5</td><td>52.4</td><td>2823</td><td>ohcv</td><td>six</td><td>152</td><td>mpfi</td><td>2.68</td><td>3.47</td><td>9</td><td>154</td><td>5000</td><td>19</td><td>26</td><td>16500</td></tr></tbody></table>"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
2024-09-26 01:41:33 +00:00
"df = df.drop(\"car_ID\", \"symboling\", \"CarName\");\n",
"df.head(3);"
2024-09-25 19:52:53 +00:00
]
},
{
"cell_type": "markdown",
"id": "0f827b40-99b5-445d-b339-2bb55b9d6686",
"metadata": {},
"source": [
"- Use df.groupBy(...cols) in order to get aggregates from the dataset:\n",
"\n",
"Count cars by brand:"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "7512602c-6ce4-4167-9f74-78bc5e8c0332",
"metadata": {
"scrolled": true
},
"outputs": [
{
"data": {
"application/vnd.dataresource+json": {
"bytes": null,
"data": [
{
"brand": "mercury",
"brand_count": 1
},
{
"brand": "renault",
"brand_count": 2
},
{
2024-09-25 20:34:25 +00:00
"brand": "jaguar",
2024-09-25 19:52:53 +00:00
"brand_count": 3
},
{
2024-10-04 05:24:07 +00:00
"brand": "alfa-romero",
2024-09-25 19:52:53 +00:00
"brand_count": 3
},
{
2024-10-04 05:24:07 +00:00
"brand": "chevrolet",
2024-09-25 19:52:53 +00:00
"brand_count": 3
},
{
"brand": "isuzu",
"brand_count": 4
},
{
"brand": "porsche",
"brand_count": 5
},
{
"brand": "saab",
"brand_count": 6
},
{
2024-10-04 05:24:07 +00:00
"brand": "audi",
2024-09-25 19:52:53 +00:00
"brand_count": 7
},
{
2024-10-04 05:24:07 +00:00
"brand": "plymouth",
2024-09-25 19:52:53 +00:00
"brand_count": 7
},
{
2024-09-25 20:34:25 +00:00
"brand": "buick",
2024-09-25 19:52:53 +00:00
"brand_count": 8
},
{
2024-09-25 20:34:25 +00:00
"brand": "bmw",
2024-09-25 19:52:53 +00:00
"brand_count": 8
},
{
"brand": "dodge",
"brand_count": 9
},
{
2024-09-25 20:34:25 +00:00
"brand": "peugeot",
2024-09-25 19:52:53 +00:00
"brand_count": 11
},
{
2024-09-25 20:34:25 +00:00
"brand": "volvo",
2024-09-25 19:52:53 +00:00
"brand_count": 11
},
{
2024-10-04 05:24:07 +00:00
"brand": "volkswagen",
2024-09-25 19:52:53 +00:00
"brand_count": 12
},
{
2024-10-04 05:24:07 +00:00
"brand": "subaru",
2024-09-25 19:52:53 +00:00
"brand_count": 12
},
{
2024-10-04 05:24:07 +00:00
"brand": "honda",
2024-09-25 19:52:53 +00:00
"brand_count": 13
},
{
2024-10-04 05:24:07 +00:00
"brand": "mitsubishi",
2024-09-25 19:52:53 +00:00
"brand_count": 13
},
{
"brand": "mazda",
"brand_count": 17
},
{
"brand": "nissan",
"brand_count": 18
},
{
"brand": "toyota",
"brand_count": 32
}
],
"description": null,
"dialect": null,
"encoding": null,
"format": null,
"hash": null,
"homepage": null,
"licenses": null,
"mediatype": null,
"path": null,
"schema": {
"fields": [
{
"constraints": null,
"description": null,
"example": null,
"format": null,
"name": "brand",
"rdfType": null,
"title": null,
"type": "string"
},
{
"constraints": null,
"description": null,
"example": null,
"format": null,
"name": "brand_count",
"rdfType": null,
"title": null,
"type": "integer"
}
],
"foreignKeys": null,
"missingValues": null,
"primaryKey": null
},
"sources": null,
"title": null
},
"text/html": [
2024-10-04 05:24:07 +00:00
"<table><thead><tr><th>brand</th><th>brand_count</th></tr></thead><tbody><tr><td>mercury</td><td>1</td></tr><tr><td>renault</td><td>2</td></tr><tr><td>jaguar</td><td>3</td></tr><tr><td>alfa-romero</td><td>3</td></tr><tr><td>chevrolet</td><td>3</td></tr><tr><td>isuzu</td><td>4</td></tr><tr><td>porsche</td><td>5</td></tr><tr><td>saab</td><td>6</td></tr><tr><td>audi</td><td>7</td></tr><tr><td>plymouth</td><td>7</td></tr><tr><td>buick</td><td>8</td></tr><tr><td>bmw</td><td>8</td></tr><tr><td>dodge</td><td>9</td></tr><tr><td>peugeot</td><td>11</td></tr><tr><td>volvo</td><td>11</td></tr><tr><td>volkswagen</td><td>12</td></tr><tr><td>subaru</td><td>12</td></tr><tr><td>honda</td><td>13</td></tr><tr><td>mitsubishi</td><td>13</td></tr><tr><td>mazda</td><td>17</td></tr><tr><td>nissan</td><td>18</td></tr><tr><td>toyota</td><td>32</td></tr></tbody></table>"
2024-09-25 19:52:53 +00:00
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
2024-09-26 01:41:33 +00:00
"let brandCount = df.groupBy(\"brand\").len().sort(\"brand_count\");\n",
"brandCount;"
2024-09-25 19:52:53 +00:00
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "44b5baf0-cbd2-48d7-a28b-e871682bbaf0",
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.dataresource+json": {
"bytes": null,
"data": [
{
"brand": null,
"describe": "mean",
2024-09-25 20:34:25 +00:00
"price": 15079.877371916704
2024-09-25 19:52:53 +00:00
},
{
"brand": null,
"describe": "std",
2024-09-25 20:34:25 +00:00
"price": 8738.804703803093
2024-09-25 19:52:53 +00:00
},
{
"brand": "alfa-romero",
"describe": "min",
"price": 6007
},
{
"brand": "volvo",
"describe": "max",
"price": 34600
},
{
"brand": null,
"describe": "median",
"price": 10534.274509803921
}
],
"description": null,
"dialect": null,
"encoding": null,
"format": null,
"hash": null,
"homepage": null,
"licenses": null,
"mediatype": null,
"path": null,
"schema": {
"fields": [
{
"constraints": null,
"description": null,
"example": null,
"format": null,
"name": "describe",
"rdfType": null,
"title": null,
"type": "string"
},
{
"constraints": null,
"description": null,
"example": null,
"format": null,
"name": "brand",
"rdfType": null,
"title": null,
"type": "string"
},
{
"constraints": null,
"description": null,
"example": null,
"format": null,
"name": "price",
"rdfType": null,
"title": null,
"type": "number"
}
],
"foreignKeys": null,
"missingValues": null,
"primaryKey": null
},
"sources": null,
"title": null
},
"text/html": [
2024-09-25 20:34:25 +00:00
"<table><thead><tr><th>describe</th><th>brand</th><th>price</th></tr></thead><tbody><tr><td>mean</td><td>null</td><td>15079.877371916704</td></tr><tr><td>std</td><td>null</td><td>8738.804703803093</td></tr><tr><td>min</td><td>alfa-romero</td><td>6007</td></tr><tr><td>max</td><td>volvo</td><td>34600</td></tr><tr><td>median</td><td>null</td><td>10534.274509803921</td></tr></tbody></table>"
2024-09-25 19:52:53 +00:00
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
2024-09-26 01:41:33 +00:00
"let avgPricePerBrand = df.groupBy(\"brand\").agg({ \"price\": [\"mean\"] });\n",
"avgPricePerBrand.describe();"
2024-09-25 19:52:53 +00:00
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "98a18f05-665f-4143-b5d5-6ecda68ca2df",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{\n",
" renault: \u001b[33m9595\u001b[39m,\n",
2024-09-25 20:34:25 +00:00
" jaguar: \u001b[33m34600\u001b[39m,\n",
2024-09-25 19:52:53 +00:00
" saab: \u001b[33m15223.333333333334\u001b[39m,\n",
2024-10-04 05:24:07 +00:00
" nissan: \u001b[33m10415.666666666666\u001b[39m,\n",
" toyota: \u001b[33m9885.8125\u001b[39m,\n",
" mitsubishi: \u001b[33m9239.76923076923\u001b[39m,\n",
" porsche: \u001b[33m31400.5\u001b[39m,\n",
" plymouth: \u001b[33m7963.428571428572\u001b[39m,\n",
" dodge: \u001b[33m7875.444444444444\u001b[39m,\n",
" mazda: \u001b[33m10652.882352941177\u001b[39m,\n",
2024-09-25 20:34:25 +00:00
" honda: \u001b[33m8184.692307692308\u001b[39m,\n",
2024-10-04 05:24:07 +00:00
" peugeot: \u001b[33m15489.09090909091\u001b[39m,\n",
2024-09-25 20:34:25 +00:00
" audi: \u001b[33m17859.166714285715\u001b[39m,\n",
2024-10-04 05:24:07 +00:00
" volvo: \u001b[33m18063.18181818182\u001b[39m,\n",
" buick: \u001b[33m33647\u001b[39m,\n",
2024-09-25 20:34:25 +00:00
" chevrolet: \u001b[33m6007\u001b[39m,\n",
" isuzu: \u001b[33m8916.5\u001b[39m,\n",
2024-10-04 05:24:07 +00:00
" bmw: \u001b[33m26118.75\u001b[39m,\n",
2024-09-25 20:34:25 +00:00
" \u001b[32m\"alfa-romero\"\u001b[39m: \u001b[33m15498.333333333334\u001b[39m,\n",
2024-10-04 05:24:07 +00:00
" volkswagen: \u001b[33m10077.5\u001b[39m,\n",
" mercury: \u001b[33m16503\u001b[39m,\n",
" subaru: \u001b[33m8541.25\u001b[39m\n",
2024-09-25 19:52:53 +00:00
"}"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"// map brand name to price\n",
"avgPricePerBrand = avgPricePerBrand\n",
2024-09-26 01:41:33 +00:00
" .toRecords()\n",
" .reduce((acc, rec) => ({ ...acc, [rec.brand]: rec.price }), {});"
2024-09-25 19:52:53 +00:00
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "9e7dd2b4-4356-41ad-bd1e-20b7938f55f0",
"metadata": {
"editable": true,
"slideshow": {
"slide_type": ""
},
"tags": []
},
"outputs": [
{
"data": {
"application/vnd.dataresource+json": {
"bytes": null,
"data": [
{
2024-09-25 20:34:25 +00:00
"brand_category": "Mid_Range"
2024-09-25 19:52:53 +00:00
},
{
2024-10-04 05:24:07 +00:00
"brand_category": "Budget"
2024-09-25 19:52:53 +00:00
},
{
2024-10-04 05:24:07 +00:00
"brand_category": "Mid_Range"
2024-09-25 19:52:53 +00:00
},
{
2024-10-04 05:24:07 +00:00
"brand_category": "Mid_Range"
2024-09-25 19:52:53 +00:00
},
{
2024-10-04 05:24:07 +00:00
"brand_category": "Budget"
2024-09-25 19:52:53 +00:00
}
],
"description": null,
"dialect": null,
"encoding": null,
"format": null,
"hash": null,
"homepage": null,
"licenses": null,
"mediatype": null,
"path": null,
"schema": {
"fields": [
{
"constraints": null,
"description": null,
"example": null,
"format": null,
"name": "brand_category",
"rdfType": null,
"title": null,
"type": "string"
}
],
"foreignKeys": null,
"missingValues": null,
"primaryKey": null
},
"sources": null,
"title": null
},
"text/html": [
2024-10-04 05:24:07 +00:00
"<table><thead><tr><th>brand_category</th></tr></thead><tbody><tr><td>Mid_Range</td></tr><tr><td>Budget</td></tr><tr><td>Mid_Range</td></tr><tr><td>Mid_Range</td></tr><tr><td>Budget</td></tr></tbody></table>"
2024-09-25 19:52:53 +00:00
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"// create brand categories by budget\n",
"let brandCategory = df.brand.toArray().map((brand) => {\n",
2024-09-26 01:41:33 +00:00
" const avgPrice = avgPricePerBrand[brand];\n",
" return avgPrice < 10000\n",
" ? \"Budget\"\n",
" : avgPrice > 20000\n",
" ? \"Luxury\"\n",
" : \"Mid_Range\";\n",
"});\n",
2024-09-25 19:52:53 +00:00
"let catDf = pl.DataFrame({\n",
2024-09-26 01:41:33 +00:00
" \"brand_category\": brandCategory,\n",
"});\n",
2024-09-25 19:52:53 +00:00
"\n",
2024-09-26 01:41:33 +00:00
"catDf.sample(5);"
2024-09-25 19:52:53 +00:00
]
},
{
"cell_type": "markdown",
"id": "14930aef-1fc0-4baf-92bf-8a093c3d2e86",
"metadata": {},
"source": [
"Write the cleaned dataset to a new file:"
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "350c57c7-ec9e-400d-a8e7-19301c720c56",
"metadata": {
"editable": true,
"slideshow": {
"slide_type": ""
},
"tags": []
},
"outputs": [],
"source": [
2024-09-26 01:41:33 +00:00
"df = catDf.hstack(df);\n",
"df.writeCSV(\"assets/cleaned_car_prices.csv\");"
2024-09-25 19:52:53 +00:00
]
},
{
"cell_type": "markdown",
"id": "0ea22d38-cb20-47a0-8f92-422bc5925abf",
"metadata": {
"editable": true,
"slideshow": {
"slide_type": ""
},
"tags": []
},
"source": [
"## Exploratory Data Analysis\n",
"\n",
2024-09-26 01:41:33 +00:00
"For plotting we use [@observablehq/plot](https://observablehq.com/plot) and\n",
"configured shortcuts for Jupyter notebooks imported from\n",
"[l12.xyz/x/shortcuts](https://l12.xyz/x/shortcuts)."
2024-09-25 19:52:53 +00:00
]
},
{
"cell_type": "code",
"execution_count": 13,
"id": "2fed060e-1785-4dc1-ba44-4bcd5cee92f6",
"metadata": {
"editable": true,
"slideshow": {
"slide_type": ""
},
"tags": []
},
"outputs": [
2024-09-25 20:34:25 +00:00
{
"name": "stdout",
"output_type": "stream",
"text": [
"[\n",
" { brand: \"mercury\", brand_count: 1 },\n",
" { brand: \"renault\", brand_count: 2 },\n",
" { brand: \"jaguar\", brand_count: 3 },\n",
" { brand: \"alfa-romero\", brand_count: 3 },\n",
2024-10-04 05:24:07 +00:00
" { brand: \"chevrolet\", brand_count: 3 },\n",
2024-09-25 20:34:25 +00:00
" { brand: \"isuzu\", brand_count: 4 },\n",
" { brand: \"porsche\", brand_count: 5 },\n",
" { brand: \"saab\", brand_count: 6 },\n",
" { brand: \"audi\", brand_count: 7 },\n",
2024-10-04 05:24:07 +00:00
" { brand: \"plymouth\", brand_count: 7 },\n",
2024-09-25 20:34:25 +00:00
" { brand: \"buick\", brand_count: 8 },\n",
" { brand: \"bmw\", brand_count: 8 },\n",
" { brand: \"dodge\", brand_count: 9 },\n",
" { brand: \"peugeot\", brand_count: 11 },\n",
" { brand: \"volvo\", brand_count: 11 },\n",
" { brand: \"volkswagen\", brand_count: 12 },\n",
2024-10-04 05:24:07 +00:00
" { brand: \"subaru\", brand_count: 12 },\n",
2024-09-25 20:34:25 +00:00
" { brand: \"honda\", brand_count: 13 },\n",
2024-10-04 05:24:07 +00:00
" { brand: \"mitsubishi\", brand_count: 13 },\n",
2024-09-25 20:34:25 +00:00
" { brand: \"mazda\", brand_count: 17 },\n",
" { brand: \"nissan\", brand_count: 18 },\n",
" { brand: \"toyota\", brand_count: 32 }\n",
"]\n"
]
},
2024-09-25 19:52:53 +00:00
{
"data": {
"image/svg+xml": [
"<svg style=\"background-color:#fff\" viewBox=\"0 0 640 500\" height=\"500\" width=\"640\" text-anchor=\"middle\" font-size=\"10\" font-family=\"system-ui, sans-serif\" fill=\"currentColor\" class=\"plot-d6a7b5\"><style>:where(.plot-d6a7b5) {\n",
" --plot-background: white;\n",
" display: block;\n",
" height: auto;\n",
" height: intrinsic;\n",
" max-width: 100%;\n",
"}\n",
":where(.plot-d6a7b5 text),\n",
":where(.plot-d6a7b5 tspan) {\n",
" white-space: pre;\n",
2024-10-04 05:24:07 +00:00
"}</style><g transform=\"translate(0,9)\" stroke=\"currentColor\" fill=\"none\" aria-hidden=\"true\" aria-label=\"y-axis tick\"><path d=\"M0,0L-6,0\" transform=\"translate(80,26)\" /><path d=\"M0,0L-6,0\" transform=\"translate(80,46)\" /><path d=\"M0,0L-6,0\" transform=\"translate(80,66)\" /><path d=\"M0,0L-6,0\" transform=\"translate(80,86)\" /><path d=\"M0,0L-6,0\" transform=\"translate(80,106)\" /><path d=\"M0,0L-6,0\" transform=\"translate(80,126)\" /><path d=\"M0,0L-6,0\" transform=\"translate(80,146)\" /><path d=\"M0,0L-6,0\" transform=\"translate(80,166)\" /><path d=\"M0,0L-6,0\" transform=\"translate(80,186)\" /><path d=\"M0,0L-6,0\" transform=\"translate(80,206)\" /><path d=\"M0,0L-6,0\" transform=\"translate(80,226)\" /><path d=\"M0,0L-6,0\" transform=\"translate(80,246)\" /><path d=\"M0,0L-6,0\" transform=\"translate(80,266)\" /><path d=\"M0,0L-6,0\" transform=\"translate(80,286)\" /><path d=\"M0,0L-6,0\" transform=\"translate(80,306)\" /><path d=\"M0,0L-6,0\" transform=\"translate(80,326)\" /><path d=\"M0,0L-6,0\" transform=\"translate(80,346)\" /><path d=\"M0,0L-6,0\" transform=\"translate(80,366)\" /><path d=\"M0,0L-6,0\" transform=\"translate(80,386)\" /><path d=\"M0,0L-6,0\" transform=\"translate(80,406)\" /><path d=\"M0,0L-6,0\" transform=\"translate(80,426)\" /><path d=\"M0,0L-6,0\" transform=\"translate(80,446)\" /></g><g transform=\"translate(-9,9)\" text-anchor=\"end\" aria-label=\"y-axis tick label\"><text transform=\"translate(80,26)\" y=\"0.32em\">toyota</text><text transform=\"translate(80,46)\" y=\"0.32em\">nissan</text><text transform=\"translate(80,66)\" y=\"0.32em\">mazda</text><text transform=\"translate(80,86)\" y=\"0.32em\">honda</text><text transform=\"translate(80,106)\" y=\"0.32em\">mitsubishi</text><text transform=\"translate(80,126)\" y=\"0.32em\">subaru</text><text transform=\"translate(80,146)\" y=\"0.32em\">volkswagen</text><text transform=\"translate(80,166)\" y=\"0.32em\">peugeot</text><text transform=\"translate(80,186)\" 
y=\"0.32em\">volvo</text><text transform=\"translate(80,206)\" y=\"0.32em\">dodge</text><text transform=\"translate(80,226)\" y=\"0.32em\">bmw</text><text transform=\"translate(80,246)\" y=\"0.32em\">buick</text><text transform=\"translate(80,266)\" y=\"0.32em\">audi</text><text transform=\"translate(80,286)\" y=\"0.32em\">plymouth</text><text transform=\"translate(80,306)\" y=\"0.32em\">saab</text><text transform=\"translate(80,326)\" y=\"0.32em\">porsche</text><text transform=\"translate(80,346)\" y=\"0.32em\">isuzu</text><text transform=\"translate(80,366)\" y=\"0.32em\">alfa-romero</text><text transform=\"translate(80,386)\" y=\"0.32em\">chevrolet</text><text transform=\"translate(80,406)\" y=\"0.32em\">jaguar</text><text transform=\"translate(80,426)\" y=\"0.32em\">renault</text><text transform=\"translate(80,446)\" y=\"0.32em\">mercury</text></g><g transform=\"translate(-77,0)\" aria-label=\"y-axis label\"><text transform=\"translate(80,245) rotate(-90)\" y=\"0.71em\">brand</text></g><g stroke=\"currentColor\" fill=\"none\" aria-hidden=\"true\" aria-label=\"x-axis tick\"><path d=\"M0,0L0,6\" transform=\"translate(80,470)\" /><path d=\"M0,0L0,6\" transform=\"translate(164.375,470)\" /><path d=\"M0,0L0,6\" transform=\"translate(248.75,470)\" /><path d=\"M0,0L0,6\" transform=\"translate(333.125,470)\" /><path d=\"M0,0L0,6\" transform=\"translate(417.5,470)\" /><path d=\"M0,0L0,6\" transform=\"translate(501.875,470)\" /><path d=\"M0,0L0,6\" transform=\"translate(586.25,470)\" /></g><g transform=\"translate(0,9)\" font-variant=\"tabular-nums\" aria-label=\"x-axis tick label\"><text transform=\"translate(80,470)\" y=\"0.71em\">0</text><text transform=\"translate(164.375,470)\" y=\"0.71em\">5</text><text transform=\"translate(248.75,470)\" y=\"0.71em\">10</text><text transform=\"translate(333.125,470)\" y=\"0.71em\">15</text><text transform=\"translate(417.5,470)\" y=\"0.71em\">20</text><text transform=\"translate(501.875,470)\" y=\"0.71em\">25</text><text 
transform=\"translate(586.25,470)\" y=\"0.71em\">30</text></g><g transform=\"translate(17,27)\" tex
2024-09-25 19:52:53 +00:00
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
2024-10-04 05:24:07 +00:00
"\n",
"import { document } from \"https://l12.xyz/x/shortcuts/raw/plots.ts\";\n",
2024-09-25 19:52:53 +00:00
"\n",
2024-09-26 01:41:33 +00:00
"const brandCountRecords = brandCount.toRecords();\n",
"console.log(brandCountRecords);\n",
2024-09-25 19:52:53 +00:00
"const brandCountPlot = Plot.plot({\n",
2024-09-26 01:41:33 +00:00
" marginLeft: 80,\n",
" style: {\n",
" backgroundColor: \"#fff\",\n",
" },\n",
" x: { padding: 0.4 },\n",
" marks: [\n",
" Plot.barX(brandCountRecords, {\n",
" x: \"brand_count\",\n",
" y: \"brand\",\n",
" sort: { y: \"x\", order: \"descending\" },\n",
" }),\n",
" ],\n",
" document,\n",
"});\n",
"await display(brandCountPlot);"
2024-09-25 19:52:53 +00:00
]
},
{
"cell_type": "code",
"execution_count": 14,
"id": "e21f5bdf-a5a8-43b2-855b-0eb079656da7",
"metadata": {
"editable": true,
"slideshow": {
"slide_type": ""
},
"tags": []
},
"outputs": [
{
"data": {
"text/plain": [
"[\n",
" \u001b[32m\"wheelbase\"\u001b[39m, \u001b[32m\"carlength\"\u001b[39m,\n",
" \u001b[32m\"carwidth\"\u001b[39m, \u001b[32m\"carheight\"\u001b[39m,\n",
" \u001b[32m\"curbweight\"\u001b[39m, \u001b[32m\"enginesize\"\u001b[39m,\n",
" \u001b[32m\"boreratio\"\u001b[39m, \u001b[32m\"stroke\"\u001b[39m,\n",
" \u001b[32m\"compressionratio\"\u001b[39m, \u001b[32m\"horsepower\"\u001b[39m,\n",
" \u001b[32m\"peakrpm\"\u001b[39m, \u001b[32m\"citympg\"\u001b[39m,\n",
" \u001b[32m\"highwaympg\"\u001b[39m\n",
"]"
]
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
2024-09-26 01:41:33 +00:00
"let numericColumns = df.columns.filter((col) =>\n",
" df[col].isNumeric() && col !== \"price\"\n",
");\n",
"numericColumns;"
2024-09-25 19:52:53 +00:00
]
},
{
"cell_type": "markdown",
"id": "1c8d3b90-b4b1-4da6-9e7c-f8fd7cffb77d",
"metadata": {},
"source": [
2024-09-26 01:41:33 +00:00
"Sometimes we can get some intuitive insight seeing the data plotted from\n",
"different dimensions. It is an optional step, but it might help to get some\n",
"assumptions about the relationships in the dataset. Below is an example for\n",
"drawing plots side-by-side."
2024-09-25 19:52:53 +00:00
]
},
{
"cell_type": "code",
"execution_count": 15,
"id": "9a82f623-b39a-473a-91b9-0a0347c64c63",
"metadata": {
"editable": true,
"slideshow": {
"slide_type": ""
},
"tags": []
},
"outputs": [
{
"data": {
"text/html": [
"\n",
" <section style=\"display:grid;grid-template-columns: repeat(3, 1fr);\">\n",
2024-09-25 20:34:25 +00:00
" <img title=\"wheelbase / price\" src='
2024-09-25 19:52:53 +00:00
" </section>\n",
" "
]
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
2024-09-26 01:41:33 +00:00
"import { sideBySidePlot } from \"https://l12.xyz/x/shortcuts/raw/plots.ts\";\n",
2024-09-25 19:52:53 +00:00
"\n",
"let records = df.toRecords();\n",
"\n",
"const plt = sideBySidePlot({\n",
2024-09-26 01:41:33 +00:00
" x: numericColumns,\n",
" y: [\"price\"],\n",
" marks: [\n",
" (x, y) => Plot.dot(records, { x, y }),\n",
" (x, y) => Plot.linearRegressionY(records, { x, y, stroke: \"red\" }),\n",
" ],\n",
" cols: 3,\n",
"});\n",
2024-09-25 19:52:53 +00:00
"\n",
"await display(\n",
2024-09-26 01:41:33 +00:00
" plt,\n",
2024-09-25 19:52:53 +00:00
");"
]
},
{
"cell_type": "markdown",
"id": "53d04850-df05-474c-8568-a5f7c97146f3",
"metadata": {
"editable": true,
"slideshow": {
"slide_type": ""
},
"tags": []
},
"source": [
2024-09-26 01:41:33 +00:00
"Let's view the list of top features that have a high correlation coefficient. The\n",
"pearsonCorr() function calculates Pearson's r correlation coefficients with\n",
"respect to the 'price'."
2024-09-25 19:52:53 +00:00
]
},
{
"cell_type": "code",
"execution_count": 16,
"id": "924838be-1b4d-4edf-abe5-8bf02e02cf11",
"metadata": {
"editable": true,
"slideshow": {
"slide_type": ""
},
"tags": []
},
"outputs": [
{
"data": {
"application/vnd.dataresource+json": {
"bytes": null,
"data": [
{
"Variable": "enginesize",
"idx (price)": 0.8741448025245117
},
{
"Variable": "curbweight",
"idx (price)": 0.8353048793372955
},
{
"Variable": "horsepower",
"idx (price)": 0.8081388225362217
},
{
"Variable": "carwidth",
"idx (price)": 0.7593252997414263
},
{
"Variable": "carlength",
"idx (price)": 0.6829200156779843
},
{
"Variable": "wheelbase",
"idx (price)": 0.5778155982921477
},
{
"Variable": "boreratio",
"idx (price)": 0.5531732367984261
},
{
"Variable": "carheight",
"idx (price)": 0.11933622657047727
},
{
"Variable": "stroke",
"idx (price)": 0.07944308388192935
},
{
"Variable": "compressionratio",
"idx (price)": 0.06798350579944248
},
{
"Variable": "peakrpm",
"idx (price)": -0.0852671502778569
},
{
"Variable": "citympg",
"idx (price)": -0.6857513360270401
},
{
"Variable": "highwaympg",
"idx (price)": -0.6975990916465564
}
],
"description": null,
"dialect": null,
"encoding": null,
"format": null,
"hash": null,
"homepage": null,
"licenses": null,
"mediatype": null,
"path": null,
"schema": {
"fields": [
{
"constraints": null,
"description": null,
"example": null,
"format": null,
"name": "Variable",
"rdfType": null,
"title": null,
"type": "string"
},
{
"constraints": null,
"description": null,
"example": null,
"format": null,
"name": "idx (price)",
"rdfType": null,
"title": null,
"type": "number"
}
],
"foreignKeys": null,
"missingValues": null,
"primaryKey": null
},
"sources": null,
"title": null
},
"text/html": [
"<table><thead><tr><th>Variable</th><th>idx (price)</th></tr></thead><tbody><tr><td>enginesize</td><td>0.8741448025245117</td></tr><tr><td>curbweight</td><td>0.8353048793372955</td></tr><tr><td>horsepower</td><td>0.8081388225362217</td></tr><tr><td>carwidth</td><td>0.7593252997414263</td></tr><tr><td>carlength</td><td>0.6829200156779843</td></tr><tr><td>wheelbase</td><td>0.5778155982921477</td></tr><tr><td>boreratio</td><td>0.5531732367984261</td></tr><tr><td>carheight</td><td>0.11933622657047727</td></tr><tr><td>stroke</td><td>0.07944308388192935</td></tr><tr><td>compressionratio</td><td>0.06798350579944248</td></tr><tr><td>peakrpm</td><td>-0.0852671502778569</td></tr><tr><td>citympg</td><td>-0.6857513360270401</td></tr><tr><td>highwaympg</td><td>-0.6975990916465564</td></tr></tbody></table>"
]
},
"execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
2024-09-26 01:41:33 +00:00
"// we select Pearson's idx from dataframe for all numeric cols,\n",
"// then we transpose result so that columns become rows,\n",
2024-09-25 19:52:53 +00:00
"// then we sort by the idx column\n",
"df.select(\n",
2024-09-26 01:41:33 +00:00
" ...numericColumns.map((col) => pl.pearsonCorr(col, \"price\")),\n",
2024-09-25 19:52:53 +00:00
")\n",
2024-09-26 01:41:33 +00:00
" .transpose({\n",
" columnNames: [\"idx (price)\"],\n",
" headerName: \"Variable\",\n",
" includeHeader: true,\n",
" })\n",
" .sort(\"idx (price)\", true);"
2024-09-25 19:52:53 +00:00
]
},
{
"cell_type": "markdown",
"id": "63bc6b92-5ab9-43e0-a957-57d895bc60f9",
"metadata": {
"editable": true,
"slideshow": {
"slide_type": ""
},
"tags": []
},
"source": [
"### Linearity Assumption\n",
"\n",
2024-09-26 01:41:33 +00:00
"Linear regression needs the relationship between independent variable and the\n",
"dependent variable to be linear. We can test this assumption with some scatter\n",
"plots and regression lines.\n",
2024-09-25 19:52:53 +00:00
"\n",
2024-09-26 01:41:33 +00:00
"** Here we use the same side-by-side plot shortcut, but for selected variables\n",
"with a high correlation coefficient."
2024-09-25 19:52:53 +00:00
]
},
{
"cell_type": "code",
"execution_count": 17,
"id": "2d5a635a-c819-4989-9818-a1927363993d",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
" <section style=\"display:grid;grid-template-columns: repeat(2, 1fr);\">\n",
2024-09-25 20:34:25 +00:00
" <img title=\"enginesize / price\" src='
2024-09-25 19:52:53 +00:00
" </section>\n",
" "
]
},
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"const plt = sideBySidePlot({\n",
2024-09-26 01:41:33 +00:00
" x: [\"enginesize\", \"curbweight\", \"horsepower\", \"carwidth\"],\n",
" y: [\"price\"],\n",
" marks: [\n",
" (x, y) => Plot.dot(records, { x, y }),\n",
" (x, y) => Plot.linearRegressionY(records, { x, y, stroke: \"red\" }),\n",
" ],\n",
" cols: 2,\n",
"});\n",
2024-09-25 19:52:53 +00:00
"\n",
"await display(\n",
2024-09-26 01:41:33 +00:00
" plt,\n",
2024-09-25 19:52:53 +00:00
");"
]
},
{
"cell_type": "markdown",
"id": "0a351f17-309b-444b-b4a8-26500b077e47",
"metadata": {
"editable": true,
"slideshow": {
"slide_type": ""
},
"tags": []
},
"source": [
"### Homoscedasticity\n",
"\n",
2024-09-26 01:41:33 +00:00
"The assumption of homoscedasticity (constant variance) is crucial to linear\n",
"regression models. Homoscedasticity describes a situation in which the error\n",
"term or variance or the \"noise\" or random disturbance in the relationship\n",
"between the independent variables and the dependent variable is the same across\n",
"all values of the independent variable. In other words, there is a constant\n",
"variance present in the response variable as the predictor variable increases.\n",
"If the \"noise\" is not the same across the values of an independent variable, we\n",
"call it heteroscedasticity, opposite of homoscedasticity.\n",
2024-09-25 19:52:53 +00:00
"\n",
"#### Residuals\n",
"\n",
2024-09-26 01:41:33 +00:00
"Next we apply the residual expression to the 'price' and 'enginesize' variables in order\n",
"to check this assumption.\n",
"[The residuals function](https://l12.xyz/x/shortcuts/src/branch/main/expr.ts)\n",
"uses mean squared."
2024-09-25 19:52:53 +00:00
]
},
{
"cell_type": "code",
"execution_count": 18,
"id": "fc9a31c3-6ac0-4058-b5bb-341eb1c40dd1",
"metadata": {
"editable": true,
"slideshow": {
"slide_type": ""
},
"tags": []
},
"outputs": [
{
"data": {
"image/svg+xml": [
"<svg viewBox=\"0 0 640 400\" height=\"400\" width=\"640\" text-anchor=\"middle\" font-size=\"10\" font-family=\"system-ui, sans-serif\" fill=\"currentColor\" class=\"plot-d6a7b5\"><style>:where(.plot-d6a7b5) {\n",
" --plot-background: white;\n",
" display: block;\n",
" height: auto;\n",
" height: intrinsic;\n",
" max-width: 100%;\n",
"}\n",
":where(.plot-d6a7b5 text),\n",
":where(.plot-d6a7b5 tspan) {\n",
" white-space: pre;\n",
"}</style><g stroke=\"currentColor\" fill=\"none\" aria-hidden=\"true\" aria-label=\"y-axis tick\"><path d=\"M0,0L-6,0\" transform=\"translate(40,360.676690955552)\" /><path d=\"M0,0L-6,0\" transform=\"translate(40,332.6045058220467)\" /><path d=\"M0,0L-6,0\" transform=\"translate(40,304.5323206885414)\" /><path d=\"M0,0L-6,0\" transform=\"translate(40,276.4601355550362)\" /><path d=\"M0,0L-6,0\" transform=\"translate(40,248.3879504215309)\" /><path d=\"M0,0L-6,0\" transform=\"translate(40,220.31576528802563)\" /><path d=\"M0,0L-6,0\" transform=\"translate(40,192.24358015452034)\" /><path d=\"M0,0L-6,0\" transform=\"translate(40,164.17139502101506)\" /><path d=\"M0,0L-6,0\" transform=\"translate(40,136.0992098875098)\" /><path d=\"M0,0L-6,0\" transform=\"translate(40,108.02702475400451)\" /><path d=\"M0,0L-6,0\" transform=\"translate(40,79.95483962049926)\" /><path d=\"M0,0L-6,0\" transform=\"translate(40,51.882654486993985)\" /><path d=\"M0,0L-6,0\" transform=\"translate(40,23.810469353488706)\" /></g><g transform=\"translate(-9,0)\" font-variant=\"tabular-nums\" text-anchor=\"end\" aria-label=\"y-axis tick label\"><text transform=\"translate(40,360.676690955552)\" y=\"0.32em\">10,000</text><text transform=\"translate(40,332.6045058220467)\" y=\"0.32em\">8,000</text><text transform=\"translate(40,304.5323206885414)\" y=\"0.32em\">6,000</text><text transform=\"translate(40,276.4601355550362)\" y=\"0.32em\">4,000</text><text transform=\"translate(40,248.3879504215309)\" y=\"0.32em\">2,000</text><text transform=\"translate(40,220.31576528802563)\" y=\"0.32em\">0</text><text transform=\"translate(40,192.24358015452034)\" y=\"0.32em\">2,000</text><text transform=\"translate(40,164.17139502101506)\" y=\"0.32em\">4,000</text><text transform=\"translate(40,136.0992098875098)\" y=\"0.32em\">6,000</text><text transform=\"translate(40,108.02702475400451)\" y=\"0.32em\">8,000</text><text transform=\"translate(40,79.95483962049926)\" y=\"0.32em\">10,000</text><text 
transform=\"translate(40,51.882654486993985)\" y=\"0.32em\">12,000</text><text transform=\"translate(40,23.810469353488706)\" y=\"0.32em\">14,000</text></g><g transform=\"translate(-37,-17)\" text-anchor=\"start\" aria-label=\"y-axis label\"><text transform=\"translate(40,20)\" y=\"0.71em\">↑ price</text></g><g stroke=\"currentColor\" fill=\"none\" aria-hidden=\"true\" aria-label=\"x-axis tick\"><path d=\"M0,0L0,6\" transform=\"translate(125.35849056603774,370)\" /><path d=\"M0,0L0,6\" transform=\"translate(234.79245283018867,370)\" /><path d=\"M0,0L0,6\" transform=\"translate(344.22641509433964,370)\" /><path d=\"M0,0L0,6\" transform=\"translate(453.6603773584905,370)\" /><path d=\"M0,0L0,6\" transform=\"translate(563.0943396226415,370)\" /></g><g transform=\"translate(0,9)\" font-variant=\"tabular-nums\" aria-label=\"x-axis tick label\"><text transform=\"translate(125.35849056603774,370)\" y=\"0.71em\">100</text><text transform=\"translate(234.79245283018867,370)\" y=\"0.71em\">150</text><text transform=\"translate(344.22641509433964,370)\" y=\"0.71em\">200</text><text transform=\"translate(453.6603773584905,370)\" y=\"0.71em\">250</text><text transform=\"translate(563.0943396226415,370)\" y=\"0.71em\">300</text></g><g transform=\"translate(17,27)\" text-anchor=\"end\" aria-label=\"x-axis label\"><text transform=\"translate(620,370)\">enginesize →</text></g><g stroke-width=\"1.5\" stroke=\"currentColor\" fill=\"none\" aria-label=\"dot\"><circle r=\"3\" cy=\"224.53148603945945\" cx=\"191.0188679245283\" /><circle r=\"3\" cy=\"182.35302787636778\" cx=\"191.0188679245283\" /><circle r=\"3\" cy=\"234.13729878380033\" cx=\"239.16981132075472\" /><circle r=\"3\" cy=\"168.7146235099468\" cx=\"145.0566037735849\" /><circle r=\"3\" cy=\"183.14172291270714\" cx=\"204.1509433962264\" /><circle r=\"3\" cy=\"214.0211265595629\" cx=\"204.1509433962264\" /><circle r=\"3\" cy=\"179.49233884535147\" cx=\"204.1509433962264\" /><circle r=\"3\" cy=\"162.50866683958074\" 
cx=\"204.1509433962264\" /><circle r=\"3\" cy=\"81.19067569235949\" cx=\"193.20754716981133\"
]
},
"execution_count": 18,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
2024-09-26 01:41:33 +00:00
"import { residuals } from \"https://l12.xyz/x/shortcuts/raw/expr.ts\";\n",
2024-09-25 19:52:53 +00:00
"\n",
"let residualDf = df.select(\n",
2024-09-26 01:41:33 +00:00
" \"enginesize\",\n",
" residuals(pl.col(\"enginesize\"), pl.col(\"price\")),\n",
");\n",
2024-09-25 19:52:53 +00:00
"\n",
"let residPlot = Plot.plot({\n",
2024-09-26 01:41:33 +00:00
" x: \"enginesize\",\n",
" y: \"price\",\n",
" marks: [\n",
" Plot.dot(residualDf.toRecords(), { x: \"enginesize\", y: \"price\" }),\n",
" Plot.ruleY([0], { stroke: \"#ccc\" }),\n",
" ],\n",
" document,\n",
"});\n",
2024-09-25 19:52:53 +00:00
"\n",
2024-09-26 01:41:33 +00:00
"await display(residPlot);"
2024-09-25 19:52:53 +00:00
]
},
{
"cell_type": "markdown",
"id": "29a0ec4e-084c-4af6-9af6-a14ae5cee4e7",
"metadata": {
"editable": true,
"slideshow": {
"slide_type": ""
},
"tags": []
},
"source": [
2024-09-26 01:41:33 +00:00
"From the above plot, we can tell the error variance across the true line is\n",
"not dispersed uniformly, but in a funnel-like shape. So, the assumption\n",
"of _homoscedasticity_ is likely not met."
2024-09-25 19:52:53 +00:00
]
},
{
"cell_type": "markdown",
"id": "33d787f3-2a7b-421c-9d9b-a86719a92f9a",
"metadata": {
"editable": true,
"slideshow": {
"slide_type": ""
},
"tags": []
},
"source": [
"## Normality\n",
"\n",
2024-09-26 01:41:33 +00:00
"The linear regression analysis requires the dependent variable, 'price', to be\n",
"normally distributed. A histogram, box plot, or a Q-Q-Plot can check if the\n",
"target variable is normally distributed. A goodness-of-fit test, e.g., the\n",
"Kolmogorov-Smirnov test, can check for normality in the dependent variable.\n",
"[This documentation](https://towardsdatascience.com/normality-tests-in-python-31e04aa4f411)\n",
"contains more information on the normality assumption.\n",
2024-09-25 19:52:53 +00:00
"\n",
"Let's display all three charts to show how our target variable, 'price' behaves."
]
},
{
"cell_type": "code",
"execution_count": 19,
"id": "b94d0d8d-ae95-41f5-b17f-0ab9b3aa27ec",
"metadata": {
"editable": true,
"slideshow": {
"slide_type": ""
},
"tags": []
},
"outputs": [
{
"data": {
"text/html": [
"\n",
2024-09-25 20:34:25 +00:00
" <section style=\"display:flex;flex-direction: column; gap: 1em;\">\n",
" <div style=\"gap:0.5em; display:flex; flex-direction: row; border: 1px solid black;\">\n",
" <img src=\"
" <img src=\"
" </div>\n",
" <div style=\"padding:1em; border: 1px solid black;\">\n",
" <img src=\"
" </div>\n",
" </section>\n",
" "
2024-09-25 19:52:53 +00:00
]
},
"execution_count": 19,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
2024-09-26 01:41:33 +00:00
"import { threeChart } from \"https://l12.xyz/x/shortcuts/raw/plots.ts\";\n",
2024-09-25 19:52:53 +00:00
"\n",
2024-09-26 01:41:33 +00:00
"await display(threeChart(records, \"price\"));"
2024-09-25 19:52:53 +00:00
]
},
{
"cell_type": "markdown",
"id": "81fe890e-56c4-4076-b57c-38fe438c0ead",
"metadata": {
"editable": true,
"slideshow": {
"slide_type": ""
},
"tags": []
},
"source": [
"These three charts above can tell us a lot about our target variable:\n",
"\n",
"- Our target variable, 'price' is not normally distributed\n",
"- Our target variable is right-skewed\n",
"- There are some outliers in the variable\n",
"\n",
2024-09-26 01:41:33 +00:00
"The right-skewed plot means that most prices in the dataset are on the lower end\n",
"(below 15,000). The 'max' value is very far from the '75%' quantile statistic.\n",
"All these plots show that the assumption for accurate linear regression modeling\n",
"is not met.\n",
2024-09-25 19:52:53 +00:00
"\n",
2024-09-26 01:41:33 +00:00
"Next, we will perform the log transformation to correct our target variable and\n",
"to make it more normally distributed."
2024-09-25 19:52:53 +00:00
]
},
{
"cell_type": "code",
"execution_count": 20,
"id": "8f8c6044-d3a1-4f8e-89ba-365fea3fbd8b",
"metadata": {
"editable": true,
"slideshow": {
"slide_type": ""
},
"tags": []
},
"outputs": [
{
"data": {
"text/html": [
"\n",
2024-09-25 20:34:25 +00:00
" <section style=\"display:flex;flex-direction: column; gap: 1em;\">\n",
" <div style=\"gap:0.5em; display:flex; flex-direction: row; border: 1px solid black;\">\n",
" <img src=\"
" <img src=\"
" </div>\n",
" <div style=\"padding:1em; border: 1px solid black;\">\n",
" <img src=\"
" </div>\n",
" </section>\n",
" "
2024-09-25 19:52:53 +00:00
]
},
2024-10-04 05:24:07 +00:00
"execution_count": 20,
2024-09-25 19:52:53 +00:00
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
2024-09-26 01:41:33 +00:00
"let log2df = df.select(pl.col(\"price\").log());\n",
2024-09-25 19:52:53 +00:00
"\n",
2024-09-26 01:41:33 +00:00
"await display(threeChart(log2df.toRecords(), \"price\"));"
2024-09-25 19:52:53 +00:00
]
},
{
"cell_type": "markdown",
"id": "09a9d9c0-7fb6-4353-91f8-4899520b36d7",
"metadata": {
"editable": true,
"slideshow": {
"slide_type": ""
},
"tags": []
},
"source": [
"## Data Encoding"
]
},
{
"cell_type": "code",
2024-10-04 05:24:07 +00:00
"execution_count": 21,
2024-09-25 19:52:53 +00:00
"id": "79f05578-b839-4278-afb1-0123c1d32d17",
"metadata": {
"editable": true,
"slideshow": {
"slide_type": ""
},
"tags": []
},
"outputs": [
{
"data": {
"application/vnd.dataresource+json": {
"bytes": null,
"data": [
{
"aspiration": "std",
"boreratio": 3.47,
"brand": "alfa-romero",
"brand_category": "Mid_Range",
"carbody": "convertible",
"carheight": 48.8,
"carlength": 168.8,
"carwidth": 64.1,
"citympg": 21,
"compressionratio": 9,
"curbweight": 2548,
"cylindernumber": "four",
"doornumber": "two",
"drivewheel": "rwd",
"enginelocation": "front",
"enginesize": 130,
"enginetype": "dohc",
"fuelsystem": "mpfi",
"fueltype": "gas",
"highwaympg": 27,
"horsepower": 111,
"peakrpm": 5000,
"price": 13495,
"stroke": 2.68,
"wheelbase": 88.6
},
{
"aspiration": "std",
"boreratio": 3.47,
"brand": "alfa-romero",
"brand_category": "Mid_Range",
"carbody": "convertible",
"carheight": 48.8,
"carlength": 168.8,
"carwidth": 64.1,
"citympg": 21,
"compressionratio": 9,
"curbweight": 2548,
"cylindernumber": "four",
"doornumber": "two",
"drivewheel": "rwd",
"enginelocation": "front",
"enginesize": 130,
"enginetype": "dohc",
"fuelsystem": "mpfi",
"fueltype": "gas",
"highwaympg": 27,
"horsepower": 111,
"peakrpm": 5000,
"price": 16500,
"stroke": 2.68,
"wheelbase": 88.6
},
{
"aspiration": "std",
"boreratio": 2.68,
"brand": "alfa-romero",
"brand_category": "Mid_Range",
"carbody": "hatchback",
"carheight": 52.4,
"carlength": 171.2,
"carwidth": 65.5,
"citympg": 19,
"compressionratio": 9,
"curbweight": 2823,
"cylindernumber": "six",
"doornumber": "two",
"drivewheel": "rwd",
"enginelocation": "front",
"enginesize": 152,
"enginetype": "ohcv",
"fuelsystem": "mpfi",
"fueltype": "gas",
"highwaympg": 26,
"horsepower": 154,
"peakrpm": 5000,
"price": 16500,
"stroke": 3.47,
"wheelbase": 94.5
},
{
"aspiration": "std",
"boreratio": 3.19,
"brand": "audi",
"brand_category": "Mid_Range",
"carbody": "sedan",
"carheight": 54.3,
"carlength": 176.6,
"carwidth": 66.2,
"citympg": 24,
"compressionratio": 10,
"curbweight": 2337,
"cylindernumber": "four",
"doornumber": "four",
"drivewheel": "fwd",
"enginelocation": "front",
"enginesize": 109,
"enginetype": "ohc",
"fuelsystem": "mpfi",
"fueltype": "gas",
"highwaympg": 30,
"horsepower": 102,
"peakrpm": 5500,
"price": 13950,
"stroke": 3.4,
"wheelbase": 99.8
},
{
"aspiration": "std",
"boreratio": 3.19,
"brand": "audi",
"brand_category": "Mid_Range",
"carbody": "sedan",
"carheight": 54.3,
"carlength": 176.6,
"carwidth": 66.4,
"citympg": 18,
"compressionratio": 8,
"curbweight": 2824,
"cylindernumber": "five",
"doornumber": "four",
"drivewheel": "4wd",
"enginelocation": "front",
"enginesize": 136,
"enginetype": "ohc",
"fuelsystem": "mpfi",
"fueltype": "gas",
"highwaympg": 22,
"horsepower": 115,
"peakrpm": 5500,
"price": 17450,
"stroke": 3.4,
"wheelbase": 99.4
}
],
"description": null,
"dialect": null,
"encoding": null,
"format": null,
"hash": null,
"homepage": null,
"licenses": null,
"mediatype": null,
"path": null,
"schema": {
"fields": [
{
"constraints": null,
"description": null,
"example": null,
"format": null,
"name": "brand_category",
"rdfType": null,
"title": null,
"type": "string"
},
{
"constraints": null,
"description": null,
"example": null,
"format": null,
"name": "brand",
"rdfType": null,
"title": null,
"type": "string"
},
{
"constraints": null,
"description": null,
"example": null,
"format": null,
"name": "fueltype",
"rdfType": null,
"title": null,
"type": "string"
},
{
"constraints": null,
"description": null,
"example": null,
"format": null,
"name": "aspiration",
"rdfType": null,
"title": null,
"type": "string"
},
{
"constraints": null,
"description": null,
"example": null,
"format": null,
"name": "doornumber",
"rdfType": null,
"title": null,
"type": "string"
},
{
"constraints": null,
"description": null,
"example": null,
"format": null,
"name": "carbody",
"rdfType": null,
"title": null,
"type": "string"
},
{
"constraints": null,
"description": null,
"example": null,
"format": null,
"name": "drivewheel",
"rdfType": null,
"title": null,
"type": "string"
},
{
"constraints": null,
"description": null,
"example": null,
"format": null,
"name": "enginelocation",
"rdfType": null,
"title": null,
"type": "string"
},
{
"constraints": null,
"description": null,
"example": null,
"format": null,
"name": "wheelbase",
"rdfType": null,
"title": null,
"type": "number"
},
{
"constraints": null,
"description": null,
"example": null,
"format": null,
"name": "carlength",
"rdfType": null,
"title": null,
"type": "number"
},
{
"constraints": null,
"description": null,
"example": null,
"format": null,
"name": "carwidth",
"rdfType": null,
"title": null,
"type": "number"
},
{
"constraints": null,
"description": null,
"example": null,
"format": null,
"name": "carheight",
"rdfType": null,
"title": null,
"type": "number"
},
{
"constraints": null,
"description": null,
"example": null,
"format": null,
"name": "curbweight",
"rdfType": null,
"title": null,
"type": "integer"
},
{
"constraints": null,
"description": null,
"example": null,
"format": null,
"name": "enginetype",
"rdfType": null,
"title": null,
"type": "string"
},
{
"constraints": null,
"description": null,
"example": null,
"format": null,
"name": "cylindernumber",
"rdfType": null,
"title": null,
"type": "string"
},
{
"constraints": null,
"description": null,
"example": null,
"format": null,
"name": "enginesize",
"rdfType": null,
"title": null,
"type": "integer"
},
{
"constraints": null,
"description": null,
"example": null,
"format": null,
"name": "fuelsystem",
"rdfType": null,
"title": null,
"type": "string"
},
{
"constraints": null,
"description": null,
"example": null,
"format": null,
"name": "boreratio",
"rdfType": null,
"title": null,
"type": "number"
},
{
"constraints": null,
"description": null,
"example": null,
"format": null,
"name": "stroke",
"rdfType": null,
"title": null,
"type": "number"
},
{
"constraints": null,
"description": null,
"example": null,
"format": null,
"name": "compressionratio",
"rdfType": null,
"title": null,
"type": "number"
},
{
"constraints": null,
"description": null,
"example": null,
"format": null,
"name": "horsepower",
"rdfType": null,
"title": null,
"type": "integer"
},
{
"constraints": null,
"description": null,
"example": null,
"format": null,
"name": "peakrpm",
"rdfType": null,
"title": null,
"type": "integer"
},
{
"constraints": null,
"description": null,
"example": null,
"format": null,
"name": "citympg",
"rdfType": null,
"title": null,
"type": "integer"
},
{
"constraints": null,
"description": null,
"example": null,
"format": null,
"name": "highwaympg",
"rdfType": null,
"title": null,
"type": "integer"
},
{
"constraints": null,
"description": null,
"example": null,
"format": null,
"name": "price",
"rdfType": null,
"title": null,
"type": "number"
}
],
"foreignKeys": null,
"missingValues": null,
"primaryKey": null
},
"sources": null,
"title": null
},
"text/html": [
"<table><thead><tr><th>brand_category</th><th>brand</th><th>fueltype</th><th>aspiration</th><th>doornumber</th><th>carbody</th><th>drivewheel</th><th>enginelocation</th><th>wheelbase</th><th>carlength</th><th>carwidth</th><th>carheight</th><th>curbweight</th><th>enginetype</th><th>cylindernumber</th><th>enginesize</th><th>fuelsystem</th><th>boreratio</th><th>stroke</th><th>compressionratio</th><th>horsepower</th><th>peakrpm</th><th>citympg</th><th>highwaympg</th><th>price</th></tr></thead><tbody><tr><td>Mid_Range</td><td>alfa-romero</td><td>gas</td><td>std</td><td>two</td><td>convertible</td><td>rwd</td><td>front</td><td>88.6</td><td>168.8</td><td>64.1</td><td>48.8</td><td>2548</td><td>dohc</td><td>four</td><td>130</td><td>mpfi</td><td>3.47</td><td>2.68</td><td>9</td><td>111</td><td>5000</td><td>21</td><td>27</td><td>13495</td></tr><tr><td>Mid_Range</td><td>alfa-romero</td><td>gas</td><td>std</td><td>two</td><td>convertible</td><td>rwd</td><td>front</td><td>88.6</td><td>168.8</td><td>64.1</td><td>48.8</td><td>2548</td><td>dohc</td><td>four</td><td>130</td><td>mpfi</td><td>3.47</td><td>2.68</td><td>9</td><td>111</td><td>5000</td><td>21</td><td>27</td><td>16500</td></tr><tr><td>Mid_Range</td><td>alfa-romero</td><td>gas</td><td>std</td><td>two</td><td>hatchback</td><td>rwd</td><td>front</td><td>94.5</td><td>171.2</td><td>65.5</td><td>52.4</td><td>2823</td><td>ohcv</td><td>six</td><td>152</td><td>mpfi</td><td>2.68</td><td>3.47</td><td>9</td><td>154</td><td>5000</td><td>19</td><td>26</td><td>16500</td></tr><tr><td>Mid_Range</td><td>audi</td><td>gas</td><td>std</td><td>four</td><td>sedan</td><td>fwd</td><td>front</td><td>99.8</td><td>176.6</td><td>66.2</td><td>54.3</td><td>2337</td><td>ohc</td><td>four</td><td>109</td><td>mpfi</td><td>3.19</td><td>3.4</td><td>10</td><td>102</td><td>5500</td><td>24</td><td>30</td><td>13950</td></tr><tr><td>Mid_Range</td><td>audi</td><td>gas</td><td>std</td><td>four</td><td>sedan</td><td>4wd</td><td>front</td><td>99.4</td><td>176.6</td><td>
66.4</td><td>54.3</td><td>2824</td><td>ohc</td><td>five</td><td>136</td><td>mpfi</td><td>3.19</td><td>3.4</td><td>8</td><td>115</td><td>5500</td><td>18</td><td>22</td><td>17450</td></tr></tbody></table>"
]
},
2024-10-04 05:24:07 +00:00
"execution_count": 21,
2024-09-25 19:52:53 +00:00
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"let carData = pl.readCSV(\n",
2024-09-26 01:41:33 +00:00
" await Deno.readTextFile(\"assets/cleaned_car_prices.csv\"),\n",
" { sep: \",\" },\n",
");\n",
2024-09-25 19:52:53 +00:00
"\n",
2024-09-26 01:41:33 +00:00
"carData.head(5);"
2024-09-25 19:52:53 +00:00
]
},
{
"cell_type": "markdown",
"id": "af6eb67c-574a-430a-a04c-24d0ef9f35fa",
"metadata": {},
"source": [
"We'll drop some unnecessary columns:"
]
},
{
"cell_type": "code",
2024-10-04 05:24:07 +00:00
"execution_count": 22,
2024-09-25 19:52:53 +00:00
"id": "e2df0c3f-dbc0-4820-b807-373ce3787645",
"metadata": {
"editable": true,
"slideshow": {
"slide_type": ""
},
"tags": []
},
"outputs": [
{
"data": {
"application/vnd.dataresource+json": {
"bytes": null,
"data": [
{
"aspiration": "std",
"boreratio": 3.47,
"brand_category": "Mid_Range",
"carheight": 48.8,
"carlength": 168.8,
"carwidth": 64.1,
"citympg": 21,
"curbweight": 2548,
"cylindernumber": "four",
"doornumber": "two",
"drivewheel": "rwd",
"enginesize": 130,
"enginetype": "dohc",
"fuelsystem": "mpfi",
"fueltype": "gas",
"highwaympg": 27,
"horsepower": 111,
"price": 13495,
"wheelbase": 88.6
},
{
"aspiration": "std",
"boreratio": 3.47,
"brand_category": "Mid_Range",
"carheight": 48.8,
"carlength": 168.8,
"carwidth": 64.1,
"citympg": 21,
"curbweight": 2548,
"cylindernumber": "four",
"doornumber": "two",
"drivewheel": "rwd",
"enginesize": 130,
"enginetype": "dohc",
"fuelsystem": "mpfi",
"fueltype": "gas",
"highwaympg": 27,
"horsepower": 111,
"price": 16500,
"wheelbase": 88.6
},
{
"aspiration": "std",
"boreratio": 2.68,
"brand_category": "Mid_Range",
"carheight": 52.4,
"carlength": 171.2,
"carwidth": 65.5,
"citympg": 19,
"curbweight": 2823,
"cylindernumber": "six",
"doornumber": "two",
"drivewheel": "rwd",
"enginesize": 152,
"enginetype": "ohcv",
"fuelsystem": "mpfi",
"fueltype": "gas",
"highwaympg": 26,
"horsepower": 154,
"price": 16500,
"wheelbase": 94.5
},
{
"aspiration": "std",
"boreratio": 3.19,
"brand_category": "Mid_Range",
"carheight": 54.3,
"carlength": 176.6,
"carwidth": 66.2,
"citympg": 24,
"curbweight": 2337,
"cylindernumber": "four",
"doornumber": "four",
"drivewheel": "fwd",
"enginesize": 109,
"enginetype": "ohc",
"fuelsystem": "mpfi",
"fueltype": "gas",
"highwaympg": 30,
"horsepower": 102,
"price": 13950,
"wheelbase": 99.8
},
{
"aspiration": "std",
"boreratio": 3.19,
"brand_category": "Mid_Range",
"carheight": 54.3,
"carlength": 176.6,
"carwidth": 66.4,
"citympg": 18,
"curbweight": 2824,
"cylindernumber": "five",
"doornumber": "four",
"drivewheel": "4wd",
"enginesize": 136,
"enginetype": "ohc",
"fuelsystem": "mpfi",
"fueltype": "gas",
"highwaympg": 22,
"horsepower": 115,
"price": 17450,
"wheelbase": 99.4
}
],
"description": null,
"dialect": null,
"encoding": null,
"format": null,
"hash": null,
"homepage": null,
"licenses": null,
"mediatype": null,
"path": null,
"schema": {
"fields": [
{
"constraints": null,
"description": null,
"example": null,
"format": null,
"name": "brand_category",
"rdfType": null,
"title": null,
"type": "string"
},
{
"constraints": null,
"description": null,
"example": null,
"format": null,
"name": "fueltype",
"rdfType": null,
"title": null,
"type": "string"
},
{
"constraints": null,
"description": null,
"example": null,
"format": null,
"name": "aspiration",
"rdfType": null,
"title": null,
"type": "string"
},
{
"constraints": null,
"description": null,
"example": null,
"format": null,
"name": "doornumber",
"rdfType": null,
"title": null,
"type": "string"
},
{
"constraints": null,
"description": null,
"example": null,
"format": null,
"name": "drivewheel",
"rdfType": null,
"title": null,
"type": "string"
},
{
"constraints": null,
"description": null,
"example": null,
"format": null,
"name": "wheelbase",
"rdfType": null,
"title": null,
"type": "number"
},
{
"constraints": null,
"description": null,
"example": null,
"format": null,
"name": "carlength",
"rdfType": null,
"title": null,
"type": "number"
},
{
"constraints": null,
"description": null,
"example": null,
"format": null,
"name": "carwidth",
"rdfType": null,
"title": null,
"type": "number"
},
{
"constraints": null,
"description": null,
"example": null,
"format": null,
"name": "carheight",
"rdfType": null,
"title": null,
"type": "number"
},
{
"constraints": null,
"description": null,
"example": null,
"format": null,
"name": "curbweight",
"rdfType": null,
"title": null,
"type": "integer"
},
{
"constraints": null,
"description": null,
"example": null,
"format": null,
"name": "enginetype",
"rdfType": null,
"title": null,
"type": "string"
},
{
"constraints": null,
"description": null,
"example": null,
"format": null,
"name": "cylindernumber",
"rdfType": null,
"title": null,
"type": "string"
},
{
"constraints": null,
"description": null,
"example": null,
"format": null,
"name": "enginesize",
"rdfType": null,
"title": null,
"type": "integer"
},
{
"constraints": null,
"description": null,
"example": null,
"format": null,
"name": "fuelsystem",
"rdfType": null,
"title": null,
"type": "string"
},
{
"constraints": null,
"description": null,
"example": null,
"format": null,
"name": "boreratio",
"rdfType": null,
"title": null,
"type": "number"
},
{
"constraints": null,
"description": null,
"example": null,
"format": null,
"name": "horsepower",
"rdfType": null,
"title": null,
"type": "integer"
},
{
"constraints": null,
"description": null,
"example": null,
"format": null,
"name": "citympg",
"rdfType": null,
"title": null,
"type": "integer"
},
{
"constraints": null,
"description": null,
"example": null,
"format": null,
"name": "highwaympg",
"rdfType": null,
"title": null,
"type": "integer"
},
{
"constraints": null,
"description": null,
"example": null,
"format": null,
"name": "price",
"rdfType": null,
"title": null,
"type": "number"
}
],
"foreignKeys": null,
"missingValues": null,
"primaryKey": null
},
"sources": null,
"title": null
},
"text/html": [
"<table><thead><tr><th>brand_category</th><th>fueltype</th><th>aspiration</th><th>doornumber</th><th>drivewheel</th><th>wheelbase</th><th>carlength</th><th>carwidth</th><th>carheight</th><th>curbweight</th><th>enginetype</th><th>cylindernumber</th><th>enginesize</th><th>fuelsystem</th><th>boreratio</th><th>horsepower</th><th>citympg</th><th>highwaympg</th><th>price</th></tr></thead><tbody><tr><td>Mid_Range</td><td>gas</td><td>std</td><td>two</td><td>rwd</td><td>88.6</td><td>168.8</td><td>64.1</td><td>48.8</td><td>2548</td><td>dohc</td><td>four</td><td>130</td><td>mpfi</td><td>3.47</td><td>111</td><td>21</td><td>27</td><td>13495</td></tr><tr><td>Mid_Range</td><td>gas</td><td>std</td><td>two</td><td>rwd</td><td>88.6</td><td>168.8</td><td>64.1</td><td>48.8</td><td>2548</td><td>dohc</td><td>four</td><td>130</td><td>mpfi</td><td>3.47</td><td>111</td><td>21</td><td>27</td><td>16500</td></tr><tr><td>Mid_Range</td><td>gas</td><td>std</td><td>two</td><td>rwd</td><td>94.5</td><td>171.2</td><td>65.5</td><td>52.4</td><td>2823</td><td>ohcv</td><td>six</td><td>152</td><td>mpfi</td><td>2.68</td><td>154</td><td>19</td><td>26</td><td>16500</td></tr><tr><td>Mid_Range</td><td>gas</td><td>std</td><td>four</td><td>fwd</td><td>99.8</td><td>176.6</td><td>66.2</td><td>54.3</td><td>2337</td><td>ohc</td><td>four</td><td>109</td><td>mpfi</td><td>3.19</td><td>102</td><td>24</td><td>30</td><td>13950</td></tr><tr><td>Mid_Range</td><td>gas</td><td>std</td><td>four</td><td>4wd</td><td>99.4</td><td>176.6</td><td>66.4</td><td>54.3</td><td>2824</td><td>ohc</td><td>five</td><td>136</td><td>mpfi</td><td>3.19</td><td>115</td><td>18</td><td>22</td><td>17450</td></tr></tbody></table>"
]
},
2024-10-04 05:24:07 +00:00
"execution_count": 22,
2024-09-25 19:52:53 +00:00
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
2024-09-26 01:41:33 +00:00
"let carDataGeneralized = carData.drop(\n",
" \"brand\",\n",
" \"carbody\",\n",
" \"enginelocation\",\n",
" \"stroke\",\n",
" \"compressionratio\",\n",
" \"peakrpm\",\n",
");\n",
"carDataGeneralized.head(5);"
2024-09-25 19:52:53 +00:00
]
},
{
"cell_type": "markdown",
"id": "826ab213-65a2-43d0-b9ce-fecdebd79eb8",
"metadata": {
"editable": true,
"slideshow": {
"slide_type": ""
},
"tags": []
},
"source": [
2024-09-26 01:41:33 +00:00
"Next we use one-hot (binary) encoding. We assume that all non-numeric columns are\n",
"categorical."
2024-09-25 19:52:53 +00:00
]
},
{
"cell_type": "code",
2024-10-04 05:24:07 +00:00
"execution_count": 23,
2024-09-25 19:52:53 +00:00
"id": "9bc95dd8-0240-45c6-a0f9-21b1873ae04b",
"metadata": {
"editable": true,
"slideshow": {
"slide_type": ""
},
"tags": []
},
"outputs": [
{
"data": {
"application/vnd.dataresource+json": {
"bytes": null,
"data": [
{
"aspiration_std": 1,
"aspiration_turbo": 0,
"boreratio": 3.47,
"brand_category_Budget": 0,
"brand_category_Luxury": 0,
"brand_category_Mid_Range": 1,
"carheight": 48.8,
"carlength": 168.8,
"carwidth": 64.1,
"citympg": 21,
"curbweight": 2548,
"cylindernumber_eight": 0,
"cylindernumber_five": 0,
"cylindernumber_four": 1,
"cylindernumber_six": 0,
"cylindernumber_three": 0,
"cylindernumber_twelve": 0,
"cylindernumber_two": 0,
"doornumber_four": 0,
"doornumber_two": 1,
"drivewheel_4wd": 0,
"drivewheel_fwd": 0,
"drivewheel_rwd": 1,
"enginesize": 130,
"enginetype_dohc": 1,
"enginetype_dohcv": 0,
"enginetype_l": 0,
"enginetype_ohc": 0,
"enginetype_ohcf": 0,
"enginetype_ohcv": 0,
"enginetype_rotor": 0,
"fuelsystem_1bbl": 0,
"fuelsystem_2bbl": 0,
"fuelsystem_4bbl": 0,
"fuelsystem_idi": 0,
"fuelsystem_mfi": 0,
"fuelsystem_mpfi": 1,
"fuelsystem_spdi": 0,
"fuelsystem_spfi": 0,
"fueltype_diesel": 0,
"fueltype_gas": 1,
"highwaympg": 27,
"horsepower": 111,
"price": 13495,
"wheelbase": 88.6
},
{
"aspiration_std": 1,
"aspiration_turbo": 0,
"boreratio": 3.47,
"brand_category_Budget": 0,
"brand_category_Luxury": 0,
"brand_category_Mid_Range": 1,
"carheight": 48.8,
"carlength": 168.8,
"carwidth": 64.1,
"citympg": 21,
"curbweight": 2548,
"cylindernumber_eight": 0,
"cylindernumber_five": 0,
"cylindernumber_four": 1,
"cylindernumber_six": 0,
"cylindernumber_three": 0,
"cylindernumber_twelve": 0,
"cylindernumber_two": 0,
"doornumber_four": 0,
"doornumber_two": 1,
"drivewheel_4wd": 0,
"drivewheel_fwd": 0,
"drivewheel_rwd": 1,
"enginesize": 130,
"enginetype_dohc": 1,
"enginetype_dohcv": 0,
"enginetype_l": 0,
"enginetype_ohc": 0,
"enginetype_ohcf": 0,
"enginetype_ohcv": 0,
"enginetype_rotor": 0,
"fuelsystem_1bbl": 0,
"fuelsystem_2bbl": 0,
"fuelsystem_4bbl": 0,
"fuelsystem_idi": 0,
"fuelsystem_mfi": 0,
"fuelsystem_mpfi": 1,
"fuelsystem_spdi": 0,
"fuelsystem_spfi": 0,
"fueltype_diesel": 0,
"fueltype_gas": 1,
"highwaympg": 27,
"horsepower": 111,
"price": 16500,
"wheelbase": 88.6
},
{
"aspiration_std": 1,
"aspiration_turbo": 0,
"boreratio": 2.68,
"brand_category_Budget": 0,
"brand_category_Luxury": 0,
"brand_category_Mid_Range": 1,
"carheight": 52.4,
"carlength": 171.2,
"carwidth": 65.5,
"citympg": 19,
"curbweight": 2823,
"cylindernumber_eight": 0,
"cylindernumber_five": 0,
"cylindernumber_four": 0,
"cylindernumber_six": 1,
"cylindernumber_three": 0,
"cylindernumber_twelve": 0,
"cylindernumber_two": 0,
"doornumber_four": 0,
"doornumber_two": 1,
"drivewheel_4wd": 0,
"drivewheel_fwd": 0,
"drivewheel_rwd": 1,
"enginesize": 152,
"enginetype_dohc": 0,
"enginetype_dohcv": 0,
"enginetype_l": 0,
"enginetype_ohc": 0,
"enginetype_ohcf": 0,
"enginetype_ohcv": 1,
"enginetype_rotor": 0,
"fuelsystem_1bbl": 0,
"fuelsystem_2bbl": 0,
"fuelsystem_4bbl": 0,
"fuelsystem_idi": 0,
"fuelsystem_mfi": 0,
"fuelsystem_mpfi": 1,
"fuelsystem_spdi": 0,
"fuelsystem_spfi": 0,
"fueltype_diesel": 0,
"fueltype_gas": 1,
"highwaympg": 26,
"horsepower": 154,
"price": 16500,
"wheelbase": 94.5
},
{
"aspiration_std": 1,
"aspiration_turbo": 0,
"boreratio": 3.19,
"brand_category_Budget": 0,
"brand_category_Luxury": 0,
"brand_category_Mid_Range": 1,
"carheight": 54.3,
"carlength": 176.6,
"carwidth": 66.2,
"citympg": 24,
"curbweight": 2337,
"cylindernumber_eight": 0,
"cylindernumber_five": 0,
"cylindernumber_four": 1,
"cylindernumber_six": 0,
"cylindernumber_three": 0,
"cylindernumber_twelve": 0,
"cylindernumber_two": 0,
"doornumber_four": 1,
"doornumber_two": 0,
"drivewheel_4wd": 0,
"drivewheel_fwd": 1,
"drivewheel_rwd": 0,
"enginesize": 109,
"enginetype_dohc": 0,
"enginetype_dohcv": 0,
"enginetype_l": 0,
"enginetype_ohc": 1,
"enginetype_ohcf": 0,
"enginetype_ohcv": 0,
"enginetype_rotor": 0,
"fuelsystem_1bbl": 0,
"fuelsystem_2bbl": 0,
"fuelsystem_4bbl": 0,
"fuelsystem_idi": 0,
"fuelsystem_mfi": 0,
"fuelsystem_mpfi": 1,
"fuelsystem_spdi": 0,
"fuelsystem_spfi": 0,
"fueltype_diesel": 0,
"fueltype_gas": 1,
"highwaympg": 30,
"horsepower": 102,
"price": 13950,
"wheelbase": 99.8
},
{
"aspiration_std": 1,
"aspiration_turbo": 0,
"boreratio": 3.19,
"brand_category_Budget": 0,
"brand_category_Luxury": 0,
"brand_category_Mid_Range": 1,
"carheight": 54.3,
"carlength": 176.6,
"carwidth": 66.4,
"citympg": 18,
"curbweight": 2824,
"cylindernumber_eight": 0,
"cylindernumber_five": 1,
"cylindernumber_four": 0,
"cylindernumber_six": 0,
"cylindernumber_three": 0,
"cylindernumber_twelve": 0,
"cylindernumber_two": 0,
"doornumber_four": 1,
"doornumber_two": 0,
"drivewheel_4wd": 1,
"drivewheel_fwd": 0,
"drivewheel_rwd": 0,
"enginesize": 136,
"enginetype_dohc": 0,
"enginetype_dohcv": 0,
"enginetype_l": 0,
"enginetype_ohc": 1,
"enginetype_ohcf": 0,
"enginetype_ohcv": 0,
"enginetype_rotor": 0,
"fuelsystem_1bbl": 0,
"fuelsystem_2bbl": 0,
"fuelsystem_4bbl": 0,
"fuelsystem_idi": 0,
"fuelsystem_mfi": 0,
"fuelsystem_mpfi": 1,
"fuelsystem_spdi": 0,
"fuelsystem_spfi": 0,
"fueltype_diesel": 0,
"fueltype_gas": 1,
"highwaympg": 22,
"horsepower": 115,
"price": 17450,
"wheelbase": 99.4
}
],
"description": null,
"dialect": null,
"encoding": null,
"format": null,
"hash": null,
"homepage": null,
"licenses": null,
"mediatype": null,
"path": null,
"schema": {
"fields": [
{
"constraints": null,
"description": null,
"example": null,
"format": null,
"name": "brand_category_Budget",
"rdfType": null,
"title": null,
"type": "integer"
},
{
"constraints": null,
"description": null,
"example": null,
"format": null,
"name": "brand_category_Luxury",
"rdfType": null,
"title": null,
"type": "integer"
},
{
"constraints": null,
"description": null,
"example": null,
"format": null,
"name": "brand_category_Mid_Range",
"rdfType": null,
"title": null,
"type": "integer"
},
{
"constraints": null,
"description": null,
"example": null,
"format": null,
"name": "fueltype_diesel",
"rdfType": null,
"title": null,
"type": "integer"
},
{
"constraints": null,
"description": null,
"example": null,
"format": null,
"name": "fueltype_gas",
"rdfType": null,
"title": null,
"type": "integer"
},
{
"constraints": null,
"description": null,
"example": null,
"format": null,
"name": "aspiration_std",
"rdfType": null,
"title": null,
"type": "integer"
},
{
"constraints": null,
"description": null,
"example": null,
"format": null,
"name": "aspiration_turbo",
"rdfType": null,
"title": null,
"type": "integer"
},
{
"constraints": null,
"description": null,
"example": null,
"format": null,
"name": "doornumber_four",
"rdfType": null,
"title": null,
"type": "integer"
},
{
"constraints": null,
"description": null,
"example": null,
"format": null,
"name": "doornumber_two",
"rdfType": null,
"title": null,
"type": "integer"
},
{
"constraints": null,
"description": null,
"example": null,
"format": null,
"name": "drivewheel_4wd",
"rdfType": null,
"title": null,
"type": "integer"
},
{
"constraints": null,
"description": null,
"example": null,
"format": null,
"name": "drivewheel_fwd",
"rdfType": null,
"title": null,
"type": "integer"
},
{
"constraints": null,
"description": null,
"example": null,
"format": null,
"name": "drivewheel_rwd",
"rdfType": null,
"title": null,
"type": "integer"
},
{
"constraints": null,
"description": null,
"example": null,
"format": null,
"name": "wheelbase",
"rdfType": null,
"title": null,
"type": "number"
},
{
"constraints": null,
"description": null,
"example": null,
"format": null,
"name": "carlength",
"rdfType": null,
"title": null,
"type": "number"
},
{
"constraints": null,
"description": null,
"example": null,
"format": null,
"name": "carwidth",
"rdfType": null,
"title": null,
"type": "number"
},
{
"constraints": null,
"description": null,
"example": null,
"format": null,
"name": "carheight",
"rdfType": null,
"title": null,
"type": "number"
},
{
"constraints": null,
"description": null,
"example": null,
"format": null,
"name": "curbweight",
"rdfType": null,
"title": null,
"type": "integer"
},
{
"constraints": null,
"description": null,
"example": null,
"format": null,
"name": "enginetype_dohc",
"rdfType": null,
"title": null,
"type": "integer"
},
{
"constraints": null,
"description": null,
"example": null,
"format": null,
"name": "enginetype_dohcv",
"rdfType": null,
"title": null,
"type": "integer"
},
{
"constraints": null,
"description": null,
"example": null,
"format": null,
"name": "enginetype_l",
"rdfType": null,
"title": null,
"type": "integer"
},
{
"constraints": null,
"description": null,
"example": null,
"format": null,
"name": "enginetype_ohc",
"rdfType": null,
"title": null,
"type": "integer"
},
{
"constraints": null,
"description": null,
"example": null,
"format": null,
"name": "enginetype_ohcf",
"rdfType": null,
"title": null,
"type": "integer"
},
{
"constraints": null,
"description": null,
"example": null,
"format": null,
"name": "enginetype_ohcv",
"rdfType": null,
"title": null,
"type": "integer"
},
{
"constraints": null,
"description": null,
"example": null,
"format": null,
"name": "enginetype_rotor",
"rdfType": null,
"title": null,
"type": "integer"
},
{
"constraints": null,
"description": null,
"example": null,
"format": null,
"name": "cylindernumber_eight",
"rdfType": null,
"title": null,
"type": "integer"
},
{
"constraints": null,
"description": null,
"example": null,
"format": null,
"name": "cylindernumber_five",
"rdfType": null,
"title": null,
"type": "integer"
},
{
"constraints": null,
"description": null,
"example": null,
"format": null,
"name": "cylindernumber_four",
"rdfType": null,
"title": null,
"type": "integer"
},
{
"constraints": null,
"description": null,
"example": null,
"format": null,
"name": "cylindernumber_six",
"rdfType": null,
"title": null,
"type": "integer"
},
{
"constraints": null,
"description": null,
"example": null,
"format": null,
"name": "cylindernumber_three",
"rdfType": null,
"title": null,
"type": "integer"
},
{
"constraints": null,
"description": null,
"example": null,
"format": null,
"name": "cylindernumber_twelve",
"rdfType": null,
"title": null,
"type": "integer"
},
{
"constraints": null,
"description": null,
"example": null,
"format": null,
"name": "cylindernumber_two",
"rdfType": null,
"title": null,
"type": "integer"
},
{
"constraints": null,
"description": null,
"example": null,
"format": null,
"name": "enginesize",
"rdfType": null,
"title": null,
"type": "integer"
},
{
"constraints": null,
"description": null,
"example": null,
"format": null,
"name": "fuelsystem_1bbl",
"rdfType": null,
"title": null,
"type": "integer"
},
{
"constraints": null,
"description": null,
"example": null,
"format": null,
"name": "fuelsystem_2bbl",
"rdfType": null,
"title": null,
"type": "integer"
},
{
"constraints": null,
"description": null,
"example": null,
"format": null,
"name": "fuelsystem_4bbl",
"rdfType": null,
"title": null,
"type": "integer"
},
{
"constraints": null,
"description": null,
"example": null,
"format": null,
"name": "fuelsystem_idi",
"rdfType": null,
"title": null,
"type": "integer"
},
{
"constraints": null,
"description": null,
"example": null,
"format": null,
"name": "fuelsystem_mfi",
"rdfType": null,
"title": null,
"type": "integer"
},
{
"constraints": null,
"description": null,
"example": null,
"format": null,
"name": "fuelsystem_mpfi",
"rdfType": null,
"title": null,
"type": "integer"
},
{
"constraints": null,
"description": null,
"example": null,
"format": null,
"name": "fuelsystem_spdi",
"rdfType": null,
"title": null,
"type": "integer"
},
{
"constraints": null,
"description": null,
"example": null,
"format": null,
"name": "fuelsystem_spfi",
"rdfType": null,
"title": null,
"type": "integer"
},
{
"constraints": null,
"description": null,
"example": null,
"format": null,
"name": "boreratio",
"rdfType": null,
"title": null,
"type": "number"
},
{
"constraints": null,
"description": null,
"example": null,
"format": null,
"name": "horsepower",
"rdfType": null,
"title": null,
"type": "integer"
},
{
"constraints": null,
"description": null,
"example": null,
"format": null,
"name": "citympg",
"rdfType": null,
"title": null,
"type": "integer"
},
{
"constraints": null,
"description": null,
"example": null,
"format": null,
"name": "highwaympg",
"rdfType": null,
"title": null,
"type": "integer"
},
{
"constraints": null,
"description": null,
"example": null,
"format": null,
"name": "price",
"rdfType": null,
"title": null,
"type": "number"
}
],
"foreignKeys": null,
"missingValues": null,
"primaryKey": null
},
"sources": null,
"title": null
},
"text/html": [
"<table><thead><tr><th>brand_category_Budget</th><th>brand_category_Luxury</th><th>brand_category_Mid_Range</th><th>fueltype_diesel</th><th>fueltype_gas</th><th>aspiration_std</th><th>aspiration_turbo</th><th>doornumber_four</th><th>doornumber_two</th><th>drivewheel_4wd</th><th>drivewheel_fwd</th><th>drivewheel_rwd</th><th>wheelbase</th><th>carlength</th><th>carwidth</th><th>carheight</th><th>curbweight</th><th>enginetype_dohc</th><th>enginetype_dohcv</th><th>enginetype_l</th><th>enginetype_ohc</th><th>enginetype_ohcf</th><th>enginetype_ohcv</th><th>enginetype_rotor</th><th>cylindernumber_eight</th><th>cylindernumber_five</th><th>cylindernumber_four</th><th>cylindernumber_six</th><th>cylindernumber_three</th><th>cylindernumber_twelve</th><th>cylindernumber_two</th><th>enginesize</th><th>fuelsystem_1bbl</th><th>fuelsystem_2bbl</th><th>fuelsystem_4bbl</th><th>fuelsystem_idi</th><th>fuelsystem_mfi</th><th>fuelsystem_mpfi</th><th>fuelsystem_spdi</th><th>fuelsystem_spfi</th><th>boreratio</th><th>horsepower</th><th>citympg</th><th>highwaympg</th><th>price</th></tr></thead><tbody><tr><td>0</td><td>0</td><td>1</td><td>0</td><td>1</td><td>1</td><td>0</td><td>0</td><td>1</td><td>0</td><td>0</td><td>1</td><td>88.6</td><td>168.8</td><td>64.1</td><td>48.8</td><td>2548</td><td>1</td><td>0</td><td>0</td><td>0</td><td>0</td><td>0</td><td>0</td><td>0</td><td>0</td><td>1</td><td>0</td><td>0</td><td>0</td><td>0</td><td>130</td><td>0</td><td>0</td><td>0</td><td>0</td><td>0</td><td>1</td><td>0</td><td>0</td><td>3.47</td><td>111</td><td>21</td><td>27</td><td>13495</td></tr><tr><td>0</td><td>0</td><td>1</td><td>0</td><td>1</td><td>1</td><td>0</td><td>0</td><td>1</td><td>0</td><td>0</td><td>1</td><td>88.6</td><td>168.8</td><td>64.1</td><td>48.8</td><td>2548</td><td>1</td><td>0</td><td>0</td><td>0</td><td>0</td><td>0</td><td>0</td><td>0</td><td>0</td><td>1</td><td>0</td><td>0</td><td>0</td><td>0</td><td>130</td><td>0</td><td>0</td><td>0</td><td>0</td><td>0</td><td>1</td><td>0</td><td>0</td>
<td>3.47</td><td>111</td><td>21</td><td>27</td><td>16500</td></tr><tr><td>0</td><td>0</td><td>1</td><td>0</td><td>1</td><td>1</td><td>0</td><td>0</td><td>1</td><td>0</td><td>0</td><td>1</td><td>94.5</td><td>171.2</td><td>65.5</td><td>52.4</td><td>2823</td><td>0</td><td>0</td><td>0</td><td>0</td><td>0</td><td>1</td><td>0</td><td>0</td><td>0</td><td>0</td><td>1</td><td>0</td><td>0</td><td>0</td><td>152</td><td>0</td><td>0</td><td>0</td><td>0</td><td>0</td><td>1</td><td>0</td><td>0</td><td>2.68</td><td>154</td><td>19</td><td>26</td><td>16500</td></tr><tr><td>0</td><td>0</td><td>1</td><td>0</td><td>1</td><td>1</td><td>0</td><td>1</td><td>0</td><td>0</td><td>1</td><td>0</td><td>99.8</td><td>176.6</td><td>66.2</td><td>54.3</td><td>2337</td><td>0</td><td>0</td><td>0</td><td>1</td><td>0</td><td>0</td><td>0</td><td>0</td><td>0</td><td>1</td><td>0</td><td>0</td><td>0</td><td>0</td><td>109</td><td>0</td><td>0</td><td>0</td><td>0</td><td>0</td><td>1</td><td>0</td><td>0</td><td>3.19</td><td>102</td><td>24</td><td>30</td><td>13950</td></tr><tr><td>0</td><td>0</td><td>1</td><td>0</td><td>1</td><td>1</td><td>0</td><td>1</td><td>0</td><td>1</td><td>0</td><td>0</td><td>99.4</td><td>176.6</td><td>66.4</td><td>54.3</td><td>2824</td><td>0</td><td>0</td><td>0</td><td>1</td><td>0</td><td>0</td><td>0</td><td>0</td><td>1</td><td>0</td><td>0</td><td>0</td><td>0</td><td>0</td><td>136</td><td>0</td><td>0</td><td>0</td><td>0</td><td>0</td><td>1</td><td>0</td><td>0</td><td>3.19</td><td>115</td><td>18</td><td>22</td><td>17450</td></tr></tbody></table>"
]
},
"execution_count": 23,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import { oneHotEncoding } from \"https://l12.xyz/x/shortcuts/raw/encoding.ts\";\n",
"\n",
"let encodedCarData = oneHotEncoding(carDataGeneralized);\n",
"encodedCarData.head(5);"
]
},
{
"cell_type": "code",
"execution_count": 24,
"id": "aff549ce-6736-43c7-8d40-b09b9ca7fa59",
"metadata": {
"editable": true,
"slideshow": {
"slide_type": ""
},
"tags": []
},
"outputs": [],
"source": [
"encodedCarData.writeCSV(\"assets/encoded_car_data.csv\");"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "ec799133-584f-450d-bd8b-6b443fbf5fb5",
"metadata": {
"editable": true,
"slideshow": {
"slide_type": ""
},
"tags": []
},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Deno",
"language": "typescript",
"name": "deno"
},
"language_info": {
"codemirror_mode": "typescript",
"file_extension": ".ts",
"mimetype": "text/x.typescript",
"name": "typescript",
"nbconvert_exporter": "script",
"pygments_lexer": "typescript",
"version": "5.6.2"
}
},
"nbformat": 4,
"nbformat_minor": 5
}