{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "**Chapter 4 – Training Linear Models**" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Setup" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "First, let's make sure this notebook works well in both python 2 and 3, import a few common modules, ensure MatplotLib plots figures inline and prepare a function to save the figures:" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# To support both python 2 and python 3\n", "from __future__ import division, print_function, unicode_literals\n", "\n", "# Common imports\n", "import numpy as np\n", "import os\n", "\n", "# to make this notebook's output stable across runs\n", "np.random.seed(42)\n", "\n", "# To plot pretty figures\n", "%matplotlib inline\n", "import matplotlib\n", "import matplotlib.pyplot as plt\n", "plt.rcParams['axes.labelsize'] = 14\n", "plt.rcParams['xtick.labelsize'] = 12\n", "plt.rcParams['ytick.labelsize'] = 12\n", "\n", "def save_fig(fig_id, tight_layout=True):\n", " return" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Linear regression using the Normal Equation" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "import numpy as np\n", "\n", "X = 2 * np.random.rand(100, 1)\n", "y = 4 + 3 * X + np.random.randn(100, 1)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "plt.plot(X, y, \"b.\")\n", "plt.xlabel(\"$x_1$\", fontsize=18)\n", "plt.ylabel(\"$y$\", rotation=0, fontsize=18)\n", "plt.axis([0, 2, 0, 15])\n", "plt.show()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "X_b = np.c_[np.ones((100, 1)), X] # add x0 = 1 to each instance\n", "theta_best = np.linalg.inv(X_b.T.dot(X_b)).dot(X_b.T).dot(y)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "theta_best" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "X_new = np.array([[0], [2]])\n", "X_new_b = np.c_[np.ones((2, 1)), X_new] # add x0 = 1 to each instance\n", "y_predict = X_new_b.dot(theta_best)\n", "y_predict" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "plt.plot(X_new, y_predict, \"r-\")\n", "plt.plot(X, y, \"b.\")\n", "plt.axis([0, 2, 0, 15])\n", "plt.show()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "The figure in the book actually corresponds to the following code, with a legend and axis labels:" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "plt.plot(X_new, y_predict, \"r-\", linewidth=2, label=\"Predictions\")\n", "plt.plot(X, y, \"b.\")\n", "plt.xlabel(\"$x_1$\", fontsize=18)\n", "plt.ylabel(\"$y$\", rotation=0, fontsize=18)\n", "plt.legend(loc=\"upper left\", fontsize=14)\n", "plt.axis([0, 2, 0, 15])\n", "plt.show()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from sklearn.linear_model import LinearRegression\n", "lin_reg = LinearRegression()\n", "lin_reg.fit(X, y)\n", "lin_reg.intercept_, lin_reg.coef_" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "lin_reg.predict(X_new)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Linear regression using batch gradient descent" ] }, { 
"cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "eta = 0.1\n", "n_iterations = 1000\n", "m = 100\n", "theta = np.random.randn(2,1)\n", "\n", "for iteration in range(n_iterations):\n", " gradients = 2/m * X_b.T.dot(X_b.dot(theta) - y)\n", " theta = theta - eta * gradients" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "theta" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "X_new_b.dot(theta)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "theta_path_bgd = []\n", "\n", "def plot_gradient_descent(theta, eta, theta_path=None):\n", " m = len(X_b)\n", " plt.plot(X, y, \"b.\")\n", " n_iterations = 1000\n", " for iteration in range(n_iterations):\n", " if iteration < 10:\n", " y_predict = X_new_b.dot(theta)\n", " style = \"b-\" if iteration > 0 else \"r--\"\n", " plt.plot(X_new, y_predict, style)\n", " gradients = 2/m * X_b.T.dot(X_b.dot(theta) - y)\n", " theta = theta - eta * gradients\n", " if theta_path is not None:\n", " theta_path.append(theta)\n", " plt.xlabel(\"$x_1$\", fontsize=18)\n", " plt.axis([0, 2, 0, 15])\n", " plt.title(r\"$\\eta = {}$\".format(eta), fontsize=16)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "np.random.seed(42)\n", "theta = np.random.randn(2,1) # random initialization\n", "\n", "plt.figure(figsize=(10,4))\n", "plt.subplot(131); plot_gradient_descent(theta, eta=0.02)\n", "plt.ylabel(\"$y$\", rotation=0, fontsize=18)\n", "plt.subplot(132); plot_gradient_descent(theta, eta=0.1, theta_path=theta_path_bgd)\n", "plt.subplot(133); plot_gradient_descent(theta, eta=0.5)\n", "\n", "plt.show()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Stochastic Gradient Descent" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "theta_path_sgd = []\n", "m = len(X_b)\n", "np.random.seed(42)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "n_epochs = 50\n", "t0, t1 = 5, 50 # learning schedule hyperparameters\n", "\n", "def learning_schedule(t):\n", " return t0 / (t + t1)\n", "\n", "theta = np.random.randn(2,1) # random initialization\n", "\n", "for epoch in range(n_epochs):\n", " for i in range(m):\n", " if epoch == 0 and i < 20: # not shown in the book\n", " y_predict = X_new_b.dot(theta) # not shown\n", " style = \"b-\" if i > 0 else \"r--\" # not shown\n", " plt.plot(X_new, y_predict, style) # not shown\n", " random_index = np.random.randint(m)\n", " xi = X_b[random_index:random_index+1]\n", " yi = y[random_index:random_index+1]\n", " gradients = 2 * xi.T.dot(xi.dot(theta) - yi)\n", " eta = learning_schedule(epoch * m + i)\n", " theta = theta - eta * gradients\n", " theta_path_sgd.append(theta) # not shown\n", "\n", "plt.plot(X, y, \"b.\") # not shown\n", "plt.xlabel(\"$x_1$\", fontsize=18) # not shown\n", "plt.ylabel(\"$y$\", rotation=0, fontsize=18) # not shown\n", "plt.axis([0, 2, 0, 15]) # not shown\n", "save_fig(\"sgd_plot\") # not shown\n", "plt.show() # not shown" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "theta" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from sklearn.linear_model import SGDRegressor\n", "sgd_reg = SGDRegressor(max_iter=50, penalty=None, eta0=0.1, 
random_state=42)\n", "sgd_reg.fit(X, y.ravel())" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "sgd_reg.intercept_, sgd_reg.coef_" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Mini-batch gradient descent" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "theta_path_mgd = []\n", "\n", "n_iterations = 50\n", "minibatch_size = 20\n", "\n", "np.random.seed(42)\n", "theta = np.random.randn(2,1) # random initialization\n", "\n", "t0, t1 = 10, 1000\n", "def learning_schedule(t):\n", " return t0 / (t + t1)\n", "\n", "t = 0\n", "for epoch in range(n_iterations):\n", " shuffled_indices = np.random.permutation(m)\n", " X_b_shuffled = X_b[shuffled_indices]\n", " y_shuffled = y[shuffled_indices]\n", " for i in range(0, m, minibatch_size):\n", " t += 1\n", " xi = X_b_shuffled[i:i+minibatch_size]\n", " yi = y_shuffled[i:i+minibatch_size]\n", " gradients = 2 * xi.T.dot(xi.dot(theta) - yi)\n", " eta = learning_schedule(t)\n", " theta = theta - eta * gradients\n", " theta_path_mgd.append(theta)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "theta" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "theta_path_bgd = np.array(theta_path_bgd)\n", "theta_path_sgd = np.array(theta_path_sgd)\n", "theta_path_mgd = np.array(theta_path_mgd)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "plt.figure(figsize=(7,4))\n", "plt.plot(theta_path_sgd[:, 0], theta_path_sgd[:, 1], \"r-s\", linewidth=1, label=\"Stochastic\")\n", "plt.plot(theta_path_mgd[:, 0], theta_path_mgd[:, 1], \"g-+\", linewidth=2, label=\"Mini-batch\")\n", "plt.plot(theta_path_bgd[:, 0], theta_path_bgd[:, 1], \"b-o\", linewidth=3, label=\"Batch\")\n", "plt.legend(loc=\"upper left\", fontsize=16)\n", "plt.xlabel(r\"$\\theta_0$\", fontsize=20)\n", "plt.ylabel(r\"$\\theta_1$ \", fontsize=20, rotation=0)\n", "plt.axis([2.5, 4.5, 2.3, 3.9])\n", "save_fig(\"gradient_descent_paths_plot\")\n", "plt.show()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Polynomial regression" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "import numpy as np\n", "import numpy.random as rnd\n", "\n", "np.random.seed(42)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "m = 100\n", "X = 6 * np.random.rand(m, 1) - 3\n", "y = 0.5 * X**2 + X + 2 + np.random.randn(m, 1)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "plt.plot(X, y, \"b.\")\n", "plt.xlabel(\"$x_1$\", fontsize=18)\n", "plt.ylabel(\"$y$\", rotation=0, fontsize=18)\n", "plt.axis([-3, 3, 0, 10])\n", "save_fig(\"quadratic_data_plot\")\n", "plt.show()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from sklearn.preprocessing import PolynomialFeatures\n", "poly_features = PolynomialFeatures(degree=2, include_bias=False)\n", "X_poly = poly_features.fit_transform(X)\n", "X[0]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "X_poly[0]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "lin_reg = LinearRegression()\n", "lin_reg.fit(X_poly, y)\n", "lin_reg.intercept_, lin_reg.coef_" ] }, { 
"cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "X_new=np.linspace(-3, 3, 100).reshape(100, 1)\n", "X_new_poly = poly_features.transform(X_new)\n", "y_new = lin_reg.predict(X_new_poly)\n", "plt.plot(X, y, \"b.\")\n", "plt.plot(X_new, y_new, \"r-\", linewidth=2, label=\"Predictions\")\n", "plt.xlabel(\"$x_1$\", fontsize=18)\n", "plt.ylabel(\"$y$\", rotation=0, fontsize=18)\n", "plt.legend(loc=\"upper left\", fontsize=14)\n", "plt.axis([-3, 3, 0, 10])\n", "save_fig(\"quadratic_predictions_plot\")\n", "plt.show()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from sklearn.preprocessing import StandardScaler\n", "from sklearn.pipeline import Pipeline\n", "\n", "for style, width, degree in ((\"g-\", 1, 300), (\"b--\", 2, 2), (\"r-+\", 2, 1)):\n", " polybig_features = PolynomialFeatures(degree=degree, include_bias=False)\n", " std_scaler = StandardScaler()\n", " lin_reg = LinearRegression()\n", " polynomial_regression = Pipeline([\n", " (\"poly_features\", polybig_features),\n", " (\"std_scaler\", std_scaler),\n", " (\"lin_reg\", lin_reg),\n", " ])\n", " polynomial_regression.fit(X, y)\n", " y_newbig = polynomial_regression.predict(X_new)\n", " plt.plot(X_new, y_newbig, style, label=str(degree), linewidth=width)\n", "\n", "plt.plot(X, y, \"b.\", linewidth=3)\n", "plt.legend(loc=\"upper left\")\n", "plt.xlabel(\"$x_1$\", fontsize=18)\n", "plt.ylabel(\"$y$\", rotation=0, fontsize=18)\n", "plt.axis([-3, 3, 0, 10])\n", "plt.show()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "from sklearn.metrics import mean_squared_error\n", "from sklearn.model_selection import train_test_split\n", "\n", "def plot_learning_curves(model, X, y):\n", " X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=10)\n", " train_errors, val_errors = [], []\n", " for m in range(1, len(X_train)):\n", " model.fit(X_train[:m], y_train[:m])\n", " y_train_predict = model.predict(X_train[:m])\n", " y_val_predict = model.predict(X_val)\n", " train_errors.append(mean_squared_error(y_train_predict, y_train[:m]))\n", " val_errors.append(mean_squared_error(y_val_predict, y_val))\n", "\n", " plt.plot(np.sqrt(train_errors), \"r-+\", linewidth=2, label=\"train\")\n", " plt.plot(np.sqrt(val_errors), \"b-\", linewidth=3, label=\"val\")\n", " plt.legend(loc=\"upper right\", fontsize=14) # not shown in the book\n", " plt.xlabel(\"Training set size\", fontsize=14) # not shown\n", " plt.ylabel(\"RMSE\", fontsize=14) # not shown" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "lin_reg = LinearRegression()\n", "plot_learning_curves(lin_reg, X, y)\n", "plt.axis([0, 80, 0, 3]) # not shown in the book\n", "plt.show() # not shown" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from sklearn.pipeline import Pipeline\n", "\n", "polynomial_regression = Pipeline([\n", " (\"poly_features\", PolynomialFeatures(degree=10, include_bias=False)),\n", " (\"lin_reg\", LinearRegression()),\n", " ])\n", "\n", "plot_learning_curves(polynomial_regression, X, y)\n", "plt.axis([0, 80, 0, 3]) # not shown\n", "plt.show() # not shown" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Regularized models" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from sklearn.linear_model import Ridge\n", "\n", "np.random.seed(42)\n", "m = 
20\n", "X = 3 * np.random.rand(m, 1)\n", "y = 1 + 0.5 * X + np.random.randn(m, 1) / 1.5\n", "X_new = np.linspace(0, 3, 100).reshape(100, 1)\n", "\n", "def plot_model(model_class, polynomial, alphas, **model_kargs):\n", " for alpha, style in zip(alphas, (\"b-\", \"g--\", \"r:\")):\n", " model = model_class(alpha, **model_kargs) if alpha > 0 else LinearRegression()\n", " if polynomial:\n", " model = Pipeline([\n", " (\"poly_features\", PolynomialFeatures(degree=10, include_bias=False)),\n", " (\"std_scaler\", StandardScaler()),\n", " (\"regul_reg\", model),\n", " ])\n", " model.fit(X, y)\n", " y_new_regul = model.predict(X_new)\n", " lw = 2 if alpha > 0 else 1\n", " plt.plot(X_new, y_new_regul, style, linewidth=lw, label=r\"$\\alpha = {}$\".format(alpha))\n", " plt.plot(X, y, \"b.\", linewidth=3)\n", " plt.legend(loc=\"upper left\", fontsize=15)\n", " plt.xlabel(\"$x_1$\", fontsize=18)\n", " plt.axis([0, 3, 0, 4])\n", "\n", "plt.figure(figsize=(8,4))\n", "plt.subplot(121)\n", "plot_model(Ridge, polynomial=False, alphas=(0, 10, 100), random_state=42)\n", "plt.ylabel(\"$y$\", rotation=0, fontsize=18)\n", "plt.subplot(122)\n", "plot_model(Ridge, polynomial=True, alphas=(0, 10**-5, 1), random_state=42)\n", "\n", "plt.show()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from sklearn.linear_model import Ridge\n", "ridge_reg = Ridge(alpha=1, solver=\"cholesky\", random_state=42)\n", "ridge_reg.fit(X, y)\n", "ridge_reg.predict([[1.5]])" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "sgd_reg = SGDRegressor(max_iter=5, penalty=\"l2\", random_state=42)\n", "sgd_reg.fit(X, y.ravel())\n", "sgd_reg.predict([[1.5]])" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "ridge_reg = Ridge(alpha=1, solver=\"sag\", random_state=42)\n", "ridge_reg.fit(X, y)\n", "ridge_reg.predict([[1.5]])" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from sklearn.linear_model import Lasso\n", "\n", "plt.figure(figsize=(8,4))\n", "plt.subplot(121)\n", "plot_model(Lasso, polynomial=False, alphas=(0, 0.1, 1), random_state=42)\n", "plt.ylabel(\"$y$\", rotation=0, fontsize=18)\n", "plt.subplot(122)\n", "plot_model(Lasso, polynomial=True, alphas=(0, 10**-7, 1), tol=1, random_state=42)\n", "\n", "plt.show()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from sklearn.linear_model import Lasso\n", "lasso_reg = Lasso(alpha=0.1)\n", "lasso_reg.fit(X, y)\n", "lasso_reg.predict([[1.5]])" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from sklearn.linear_model import ElasticNet\n", "elastic_net = ElasticNet(alpha=0.1, l1_ratio=0.5, random_state=42)\n", "elastic_net.fit(X, y)\n", "elastic_net.predict([[1.5]])" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "scrolled": true }, "outputs": [], "source": [ "np.random.seed(42)\n", "m = 100\n", "X = 6 * np.random.rand(m, 1) - 3\n", "y = 2 + X + 0.5 * X**2 + np.random.randn(m, 1)\n", "\n", "X_train, X_val, y_train, y_val = train_test_split(X[:50], y[:50].ravel(), test_size=0.5, random_state=10)\n", "\n", "poly_scaler = Pipeline([\n", " (\"poly_features\", PolynomialFeatures(degree=90, include_bias=False)),\n", " (\"std_scaler\", StandardScaler()),\n", " ])\n", "\n", "X_train_poly_scaled = poly_scaler.fit_transform(X_train)\n", "X_val_poly_scaled = poly_scaler.transform(X_val)\n", "\n", 
"sgd_reg = SGDRegressor(max_iter=1,\n", " penalty=None,\n", " eta0=0.0005,\n", " warm_start=True,\n", " learning_rate=\"constant\",\n", " random_state=42)\n", "\n", "n_epochs = 500\n", "train_errors, val_errors = [], []\n", "for epoch in range(n_epochs):\n", " sgd_reg.fit(X_train_poly_scaled, y_train)\n", " y_train_predict = sgd_reg.predict(X_train_poly_scaled)\n", " y_val_predict = sgd_reg.predict(X_val_poly_scaled)\n", " train_errors.append(mean_squared_error(y_train_predict, y_train))\n", " val_errors.append(mean_squared_error(y_val_predict, y_val))\n", "\n", "best_epoch = np.argmin(val_errors)\n", "best_val_rmse = np.sqrt(val_errors[best_epoch])\n", "\n", "plt.annotate('Best model',\n", " xy=(best_epoch, best_val_rmse),\n", " xytext=(best_epoch, best_val_rmse + 1),\n", " ha=\"center\",\n", " arrowprops=dict(facecolor='black', shrink=0.05),\n", " fontsize=16,\n", " )\n", "\n", "best_val_rmse -= 0.03 # just to make the graph look better\n", "plt.plot([0, n_epochs], [best_val_rmse, best_val_rmse], \"k:\", linewidth=2)\n", "plt.plot(np.sqrt(val_errors), \"b-\", linewidth=3, label=\"Validation set\")\n", "plt.plot(np.sqrt(train_errors), \"r--\", linewidth=2, label=\"Training set\")\n", "plt.legend(loc=\"upper right\", fontsize=14)\n", "plt.xlabel(\"Epoch\", fontsize=14)\n", "plt.ylabel(\"RMSE\", fontsize=14)\n", "plt.show()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "from sklearn.base import clone\n", "sgd_reg = SGDRegressor(max_iter=1, warm_start=True, penalty=None,\n", " learning_rate=\"constant\", eta0=0.0005, random_state=42)\n", "\n", "minimum_val_error = float(\"inf\")\n", "best_epoch = None\n", "best_model = None\n", "for epoch in range(1000):\n", " sgd_reg.fit(X_train_poly_scaled, y_train) # continues where it left off\n", " y_val_predict = sgd_reg.predict(X_val_poly_scaled)\n", " val_error = mean_squared_error(y_val_predict, y_val)\n", " if val_error < minimum_val_error:\n", " minimum_val_error = val_error\n", " best_epoch = epoch\n", " best_model = clone(sgd_reg)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "best_epoch, best_model" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "%matplotlib inline\n", "import matplotlib.pyplot as plt\n", "import numpy as np" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "t1a, t1b, t2a, t2b = -1, 3, -1.5, 1.5\n", "\n", "# ignoring bias term\n", "t1s = np.linspace(t1a, t1b, 500)\n", "t2s = np.linspace(t2a, t2b, 500)\n", "t1, t2 = np.meshgrid(t1s, t2s)\n", "T = np.c_[t1.ravel(), t2.ravel()]\n", "Xr = np.array([[-1, 1], [-0.3, -1], [1, 0.1]])\n", "yr = 2 * Xr[:, :1] + 0.5 * Xr[:, 1:]\n", "\n", "J = (1/len(Xr) * np.sum((T.dot(Xr.T) - yr.T)**2, axis=1)).reshape(t1.shape)\n", "\n", "N1 = np.linalg.norm(T, ord=1, axis=1).reshape(t1.shape)\n", "N2 = np.linalg.norm(T, ord=2, axis=1).reshape(t1.shape)\n", "\n", "t_min_idx = np.unravel_index(np.argmin(J), J.shape)\n", "t1_min, t2_min = t1[t_min_idx], t2[t_min_idx]\n", "\n", "t_init = np.array([[0.25], [-1]])" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def bgd_path(theta, X, y, l1, l2, core = 1, eta = 0.1, n_iterations = 50):\n", " path = [theta]\n", " for iteration in range(n_iterations):\n", " gradients = core * 2/len(X) * X.T.dot(X.dot(theta) - y) + l1 * np.sign(theta) + 2 * l2 * theta\n", "\n", " 
theta = theta - eta * gradients\n", " path.append(theta)\n", " return np.array(path)\n", "\n", "plt.figure(figsize=(12, 8))\n", "for i, N, l1, l2, title in ((0, N1, 0.5, 0, \"Lasso\"), (1, N2, 0, 0.1, \"Ridge\")):\n", " JR = J + l1 * N1 + l2 * N2**2\n", " \n", " tr_min_idx = np.unravel_index(np.argmin(JR), JR.shape)\n", " t1r_min, t2r_min = t1[tr_min_idx], t2[tr_min_idx]\n", "\n", " levelsJ=(np.exp(np.linspace(0, 1, 20)) - 1) * (np.max(J) - np.min(J)) + np.min(J)\n", " levelsJR=(np.exp(np.linspace(0, 1, 20)) - 1) * (np.max(JR) - np.min(JR)) + np.min(JR)\n", " levelsN=np.linspace(0, np.max(N), 10)\n", " \n", " path_J = bgd_path(t_init, Xr, yr, l1=0, l2=0)\n", " path_JR = bgd_path(t_init, Xr, yr, l1, l2)\n", " path_N = bgd_path(t_init, Xr, yr, np.sign(l1)/3, np.sign(l2), core=0)\n", "\n", " plt.subplot(221 + i * 2)\n", " plt.grid(True)\n", " plt.axhline(y=0, color='k')\n", " plt.axvline(x=0, color='k')\n", " plt.contourf(t1, t2, J, levels=levelsJ, alpha=0.9)\n", " plt.contour(t1, t2, N, levels=levelsN)\n", " plt.plot(path_J[:, 0], path_J[:, 1], \"w-o\")\n", " plt.plot(path_N[:, 0], path_N[:, 1], \"y-^\")\n", " plt.plot(t1_min, t2_min, \"rs\")\n", " plt.title(r\"$\\ell_{}$ penalty\".format(i + 1), fontsize=16)\n", " plt.axis([t1a, t1b, t2a, t2b])\n", "\n", " plt.subplot(222 + i * 2)\n", " plt.grid(True)\n", " plt.axhline(y=0, color='k')\n", " plt.axvline(x=0, color='k')\n", " plt.contourf(t1, t2, JR, levels=levelsJR, alpha=0.9)\n", " plt.plot(path_JR[:, 0], path_JR[:, 1], \"w-o\")\n", " plt.plot(t1r_min, t2r_min, \"rs\")\n", " plt.title(title, fontsize=16)\n", " plt.axis([t1a, t1b, t2a, t2b])\n", "\n", "for subplot in (221, 223):\n", " plt.subplot(subplot)\n", " plt.ylabel(r\"$\\theta_2$\", fontsize=20, rotation=0)\n", "\n", "for subplot in (223, 224):\n", " plt.subplot(subplot)\n", " plt.xlabel(r\"$\\theta_1$\", fontsize=20)\n", "\n", "plt.show()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Logistic regression" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "t = np.linspace(-10, 10, 100)\n", "sig = 1 / (1 + np.exp(-t))\n", "plt.figure(figsize=(9, 3))\n", "plt.plot([-10, 10], [0, 0], \"k-\")\n", "plt.plot([-10, 10], [0.5, 0.5], \"k:\")\n", "plt.plot([-10, 10], [1, 1], \"k:\")\n", "plt.plot([0, 0], [-1.1, 1.1], \"k-\")\n", "plt.plot(t, sig, \"b-\", linewidth=2, label=r\"$\\sigma(t) = \\frac{1}{1 + e^{-t}}$\")\n", "plt.xlabel(\"t\")\n", "plt.legend(loc=\"upper left\", fontsize=20)\n", "plt.axis([-10, 10, -0.1, 1.1])\n", "plt.show()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from sklearn import datasets\n", "iris = datasets.load_iris()\n", "list(iris.keys())" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "print(iris.DESCR)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "X = iris[\"data\"][:, 3:] # petal width\n", "y = (iris[\"target\"] == 2).astype(np.int) # 1 if Iris-Virginica, else 0" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from sklearn.linear_model import LogisticRegression\n", "log_reg = LogisticRegression(random_state=42)\n", "log_reg.fit(X, y)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "X_new = np.linspace(0, 3, 1000).reshape(-1, 1)\n", "y_proba = log_reg.predict_proba(X_new)\n", "\n", "plt.plot(X_new, y_proba[:, 1], \"g-\", linewidth=2, 
label=\"Iris-Virginica\")\n", "plt.plot(X_new, y_proba[:, 0], \"b--\", linewidth=2, label=\"Not Iris-Virginica\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "The figure in the book actually is actually a bit fancier:" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "X_new = np.linspace(0, 3, 1000).reshape(-1, 1)\n", "y_proba = log_reg.predict_proba(X_new)\n", "decision_boundary = X_new[y_proba[:, 1] >= 0.5][0]\n", "\n", "plt.figure(figsize=(8, 3))\n", "plt.plot(X[y==0], y[y==0], \"bs\")\n", "plt.plot(X[y==1], y[y==1], \"g^\")\n", "plt.plot([decision_boundary, decision_boundary], [-1, 2], \"k:\", linewidth=2)\n", "plt.plot(X_new, y_proba[:, 1], \"g-\", linewidth=2, label=\"Iris-Virginica\")\n", "plt.plot(X_new, y_proba[:, 0], \"b--\", linewidth=2, label=\"Not Iris-Virginica\")\n", "plt.text(decision_boundary+0.02, 0.15, \"Decision boundary\", fontsize=14, color=\"k\", ha=\"center\")\n", "plt.arrow(decision_boundary, 0.08, -0.3, 0, head_width=0.05, head_length=0.1, fc='b', ec='b')\n", "plt.arrow(decision_boundary, 0.92, 0.3, 0, head_width=0.05, head_length=0.1, fc='g', ec='g')\n", "plt.xlabel(\"Petal width (cm)\", fontsize=14)\n", "plt.ylabel(\"Probability\", fontsize=14)\n", "plt.legend(loc=\"center left\", fontsize=14)\n", "plt.axis([0, 3, -0.02, 1.02])\n", "plt.show()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "decision_boundary" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "log_reg.predict([[1.7], [1.5]])" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from sklearn.linear_model import LogisticRegression\n", "\n", "X = iris[\"data\"][:, (2, 3)] # petal length, petal width\n", "y = (iris[\"target\"] == 2).astype(np.int)\n", "\n", "log_reg = LogisticRegression(C=10**10, random_state=42)\n", "log_reg.fit(X, y)\n", "\n", "x0, x1 = np.meshgrid(\n", " np.linspace(2.9, 7, 500).reshape(-1, 1),\n", " np.linspace(0.8, 2.7, 200).reshape(-1, 1),\n", " )\n", "X_new = np.c_[x0.ravel(), x1.ravel()]\n", "\n", "y_proba = log_reg.predict_proba(X_new)\n", "\n", "plt.figure(figsize=(10, 4))\n", "plt.plot(X[y==0, 0], X[y==0, 1], \"bs\")\n", "plt.plot(X[y==1, 0], X[y==1, 1], \"g^\")\n", "\n", "zz = y_proba[:, 1].reshape(x0.shape)\n", "contour = plt.contour(x0, x1, zz, cmap=plt.cm.brg)\n", "\n", "\n", "left_right = np.array([2.9, 7])\n", "boundary = -(log_reg.coef_[0][0] * left_right + log_reg.intercept_[0]) / log_reg.coef_[0][1]\n", "\n", "plt.clabel(contour, inline=1, fontsize=12)\n", "plt.plot(left_right, boundary, \"k--\", linewidth=3)\n", "plt.text(3.5, 1.5, \"Not Iris-Virginica\", fontsize=14, color=\"b\", ha=\"center\")\n", "plt.text(6.5, 2.3, \"Iris-Virginica\", fontsize=14, color=\"g\", ha=\"center\")\n", "plt.xlabel(\"Petal length\", fontsize=14)\n", "plt.ylabel(\"Petal width\", fontsize=14)\n", "plt.axis([2.9, 7, 0.8, 2.7])\n", "plt.show()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "X = iris[\"data\"][:, (2, 3)] # petal length, petal width\n", "y = iris[\"target\"]\n", "\n", "softmax_reg = LogisticRegression(multi_class=\"multinomial\",solver=\"lbfgs\", C=10, random_state=42)\n", "softmax_reg.fit(X, y)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "x0, x1 = np.meshgrid(\n", " np.linspace(0, 8, 500).reshape(-1, 1),\n", " np.linspace(0, 3.5, 200).reshape(-1, 1),\n", " )\n", "X_new = 
np.c_[x0.ravel(), x1.ravel()]\n", "\n", "\n", "y_proba = softmax_reg.predict_proba(X_new)\n", "y_predict = softmax_reg.predict(X_new)\n", "\n", "zz1 = y_proba[:, 1].reshape(x0.shape)\n", "zz = y_predict.reshape(x0.shape)\n", "\n", "plt.figure(figsize=(10, 4))\n", "plt.plot(X[y==2, 0], X[y==2, 1], \"g^\", label=\"Iris-Virginica\")\n", "plt.plot(X[y==1, 0], X[y==1, 1], \"bs\", label=\"Iris-Versicolor\")\n", "plt.plot(X[y==0, 0], X[y==0, 1], \"yo\", label=\"Iris-Setosa\")\n", "\n", "from matplotlib.colors import ListedColormap\n", "custom_cmap = ListedColormap(['#fafab0','#9898ff','#a0faa0'])\n", "\n", "plt.contourf(x0, x1, zz, cmap=custom_cmap, linewidth=5)\n", "contour = plt.contour(x0, x1, zz1, cmap=plt.cm.brg)\n", "plt.clabel(contour, inline=1, fontsize=12)\n", "plt.xlabel(\"Petal length\", fontsize=14)\n", "plt.ylabel(\"Petal width\", fontsize=14)\n", "plt.legend(loc=\"center left\", fontsize=14)\n", "plt.axis([0, 7, 0, 3.5])\n", "plt.show()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "softmax_reg.predict([[5, 2]])" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "softmax_reg.predict_proba([[5, 2]])" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.3" }, "nav_menu": {}, "toc": { "navigate_menu": true, "number_sections": true, "sideBar": true, "threshold": 6, "toc_cell": false, "toc_section_display": "block", "toc_window_display": false } }, "nbformat": 4, "nbformat_minor": 1 }