From 76636b184325acbc44e9894d55ab787db6726633 Mon Sep 17 00:00:00 2001 From: Gerardo Marx Date: Thu, 19 Sep 2024 18:13:06 -0600 Subject: [PATCH] basic information to start example --- Readme.md | 63 +++++++++++ data.csv | 301 +++++++++++++++++++++++++++++++++++++++++++++++++++++ main.ipynb | 226 ++++++++++++++++++++++++++++++++++++++++ 3 files changed, 590 insertions(+) create mode 100644 Readme.md create mode 100644 data.csv create mode 100644 main.ipynb diff --git a/Readme.md b/Readme.md new file mode 100644 index 0000000..569ad15 --- /dev/null +++ b/Readme.md @@ -0,0 +1,63 @@ +# Linear regression + +The linear regression is a training procedure based on a linear model. The model makes a prediction by simply computing a weighted sum of the input features, plus a constant term called the bias term (also called the intercept term): + +$$ \hat{y}=\theta_0 + \theta_1 x_1 + \theta_2 x_2 + \cdots + \theta_n x_n$$ + +This can be written more easily using vector notation for $m$ values. Therefore, the model will become: + +$$ + \begin{bmatrix} + \hat{y}^0 \\ + \hat{y}^1\\ + \hat{y}^2\\ + \vdots \\ + \hat{y}^m + \end{bmatrix} + = + \begin{bmatrix} + 1 & x_1^0 & x_2^0 & \cdots &x_n^0\\ + 1 & x_1^1 & x_2^1 & \cdots & x_n^1\\ + \vdots & \vdots &\vdots & \cdots & \vdots\\ + 1 & x_1^m & x_2^m & \cdots & x_n^m + \end{bmatrix} + + \begin{bmatrix} + \theta_0 \\ + \theta_1 \\ + \theta_2 \\ + \vdots \\ + \theta_n + \end{bmatrix} +$$ + +Resulting: + +$$\hat{y}= h_\theta(X) = X \theta $$ + +**Now that we have our model, how do we train it?** + +Please consider that training the model means adjusting the parameters to reduce the error, i.e., minimizing the cost function. The most common performance measure of a regression model is the Mean Square Error (MSE). 
Therefore, to train a Linear Regression model, you need to find the value of $\theta$ that minimizes the MSE: + +$$ MSE(X,h_\theta) = \frac{1}{m} \sum_{i=1}^{m} \left(\hat{y}^{(i)}-y^{(i)} \right)^2$$ + + +$$ MSE(X,h_\theta) = \frac{1}{m} \sum_{i=1}^{m} \left( x^{(i)}\theta-y^{(i)} \right)^2$$ + +$$ MSE(X,h_\theta) = \frac{1}{m} \left( X\theta-y \right)^T \left( X\theta-y \right)$$ + +# The normal equation + +To find the value of $\theta$ that minimizes the cost function, there is a closed-form solution that gives the result directly. This is called the **Normal Equation**, and it can be found by differentiating the *MSE* with respect to $\theta$ and setting the result equal to zero: + + +$$\hat{\theta} = (X^T X)^{-1} X^{T} y $$ + +$$ Temp = \theta_0 + \theta_1 * t $$ + + +```python +import pandas as pd +df = pd.read_csv('data.csv') +df +``` diff --git a/data.csv b/data.csv new file mode 100644 index 0000000..62b10ba --- /dev/null +++ b/data.csv @@ -0,0 +1,301 @@ +0 +24.218 +23.154 +24.347 +24.411 +24.411 +24.347 +24.314 +24.347 +24.347 +23.896 +24.476 +24.637 +24.669 +24.669 +25.056 +25.088 +24.991 +25.088 +25.217 +25.281 +25.313 +25.668 +25.668 +25.636 +26.022 +25.926 +19.126 +26.248 +26.248 +26.055 +25.152 +26.699 +26.989 +26.957 +27.021 +27.118 +27.247 +27.344 +27.666 +27.183 +27.795 +27.892 +28.021 +28.311 +28.214 +28.504 +28.536 +28.762 +28.826 +28.858 +29.245 +29.181 +29.374 +29.6 +29.567 +29.793 +29.761 +29.89 +30.147 +30.147 +30.438 +30.599 +30.728 +30.856 +30.76 +31.018 +31.114 +31.34 +31.533 +31.501 +31.727 +31.469 +32.017 +32.081 +32.113 +32.5 +32.403 +32.403 +32.693 +32.726 +32.887 +33.016 +33.048 +33.08 +33.37 +33.37 +33.499 +33.725 +33.789 +33.821 +34.047 +34.079 +34.144 +34.305 +34.434 +34.434 +34.659 +34.756 +34.659 +34.691 +34.917 +34.981 +34.981 +35.271 +35.4 +35.336 +35.239 +35.594 +35.626 +35.819 +26.796 +35.948 +27.408 +36.174 +35.304 +36.271 +36.528 +36.561 +36.689 +36.657 +36.979 +36.979 +37.044 +37.205 +37.173 +37.237 +37.205 +37.302 +37.656 +37.56 
+37.592 +37.882 +37.882 +37.817 +38.043 +37.173 +38.269 +38.365 +38.397 +38.591 +33.016 +26.022 +38.913 +38.945 +38.913 +38.945 +38.945 +39.235 +39.203 +39.268 +39.3 +39.493 +39.042 +39.59 +39.622 +39.654 +39.815 +39.88 +39.912 +39.912 +40.009 +40.009 +40.234 +40.234 +40.234 +40.363 +40.524 +40.524 +40.557 +40.557 +40.653 +40.814 +40.557 +40.911 +40.879 +41.072 +41.169 +41.104 +41.072 +41.104 +41.137 +41.523 +41.33 +41.523 +41.523 +41.62 +41.813 +41.781 +41.846 +41.813 +41.942 +42.136 +42.136 +42.136 +42.136 +42.104 +42.168 +42.361 +42.458 +42.232 +42.49 +42.361 +42.394 +42.426 +42.394 +42.716 +42.748 +42.813 +42.651 +42.813 +42.748 +42.941 +43.103 +43.135 +43.103 +43.038 +43.135 +43.264 +43.425 +43.328 +43.328 +43.457 +43.457 +43.521 +43.683 +43.779 +43.683 +43.683 +43.715 +43.973 +43.94 +44.102 +44.005 +44.005 +44.005 +44.23 +44.359 +44.424 +44.392 +44.327 +44.327 +44.424 +44.521 +43.779 +44.682 +44.714 +44.649 +44.649 +44.746 +44.778 +44.907 +44.972 +42.2 +44.939 +45.036 +44.907 +44.327 +43.876 +45.004 +45.197 +45.294 +45.358 +45.326 +45.229 +45.358 +45.101 +45.423 +45.391 +45.713 +45.681 +45.616 +45.713 +45.616 +45.713 +45.713 +45.713 +45.745 +45.648 +45.971 +45.938 +45.938 +45.938 +46.067 +45.971 +46.035 +46.132 +46.196 +45.938 +46.164 +46.261 +46.261 +46.229 +46.261 +46.229 +46.229 +46.357 +46.551 +46.519 +46.551 +46.583 diff --git a/main.ipynb b/main.ipynb new file mode 100644 index 0000000..1f2c51f --- /dev/null +++ b/main.ipynb @@ -0,0 +1,226 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Linear regression\n", + "\n", + "The linear regression is a training procedure based on a linear model. 
The model makes a prediction by simply computing a weighted sum of the input features, plus a constant term called the bias term (also called the intercept term):\n", + "\n", + "$$ \\hat{y}=\\theta_0 + \\theta_1 x_1 + \\theta_2 x_2 + \\cdots + \\theta_n x_n$$\n", + "\n", + "This can be written more easily using vector notation for $m$ values. Therefore, the model will become:\n", + "\n", + "$$ \n", + " \\begin{bmatrix}\n", + " \\hat{y}^0 \\\\ \n", + " \\hat{y}^1\\\\\n", + " \\hat{y}^2\\\\\n", + " \\vdots \\\\\n", + " \\hat{y}^m\n", + " \\end{bmatrix}\n", + " =\n", + " \\begin{bmatrix}\n", + " 1 & x_1^0 & x_2^0 & \\cdots &x_n^0\\\\\n", + " 1 & x_1^1 & x_2^1 & \\cdots & x_n^1\\\\\n", + " \\vdots & \\vdots &\\vdots & \\cdots & \\vdots\\\\\n", + " 1 & x_1^m & x_2^m & \\cdots & x_n^m\n", + " \\end{bmatrix}\n", + "\n", + " \\begin{bmatrix}\n", + " \\theta_0 \\\\\n", + " \\theta_1 \\\\\n", + " \\theta_2 \\\\\n", + " \\vdots \\\\\n", + " \\theta_n\n", + " \\end{bmatrix}\n", + "$$\n", + "\n", + "Resulting:\n", + "\n", + "$$\\hat{y}= h_\\theta(X) = X \\theta $$\n", + "\n", + "**Now that we have our model, how do we train it?**\n", + "\n", + "Please consider that training the model means adjusting the parameters to reduce the error, i.e., minimizing the cost function. The most common performance measure of a regression model is the Mean Square Error (MSE). 
Therefore, to train a Linear Regression model, you need to find the value of $\\theta$ that minimizes the MSE:\n", + "\n", + "$$ MSE(X,h_\\theta) = \\frac{1}{m} \\sum_{i=1}^{m} \\left(\\hat{y}^{(i)}-y^{(i)} \\right)^2$$\n", + "\n", + "\n", + "$$ MSE(X,h_\\theta) = \\frac{1}{m} \\sum_{i=1}^{m} \\left( x^{(i)}\\theta-y^{(i)} \\right)^2$$\n", + "\n", + "$$ MSE(X,h_\\theta) = \\frac{1}{m} \\left( X\\theta-y \\right)^T \\left( X\\theta-y \\right)$$\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "# The normal equation\n", + "\n", + "To find the value of $\\theta$ that minimizes the cost function, there is a closed-form solution that gives the result directly. This is called the **Normal Equation**, and it can be found by differentiating the *MSE* with respect to $\\theta$ and setting the result equal to zero:\n", + "\n", + "\n", + "$$\\hat{\\theta} = (X^T X)^{-1} X^{T} y $$\n", + "\n", + "$$ Temp = \\theta_0 + \\theta_1 * t $$\n" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
0
024.218
123.154
224.347
324.411
424.411
......
29546.357
29646.551
29746.519
29846.551
29946.583
\n", + "

300 rows × 1 columns

\n", + "
" + ], + "text/plain": [ + " 0\n", + "0 24.218\n", + "1 23.154\n", + "2 24.347\n", + "3 24.411\n", + "4 24.411\n", + ".. ...\n", + "295 46.357\n", + "296 46.551\n", + "297 46.519\n", + "298 46.551\n", + "299 46.583\n", + "\n", + "[300 rows x 1 columns]" + ] + }, + "execution_count": 1, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import pandas as pd\n", + "df = pd.read_csv('data.csv')\n", + "df" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "ename": "NameError", + "evalue": "name 'df' is not defined", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[1], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[43mdf\u001b[49m)\n", + "\u001b[0;31mNameError\u001b[0m: name 'df' is not defined" + ] + } + ], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": ".venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.5" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +}