{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "**Chapter 4 – Training Linear Models**" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Setup" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "First, let's make sure this notebook works well in both python 2 and 3, import a few common modules, ensure MatplotLib plots figures inline and prepare a function to save the figures:" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# To support both python 2 and python 3\n", "from __future__ import division, print_function, unicode_literals\n", "\n", "# Common imports\n", "import numpy as np\n", "import os\n", "\n", "# to make this notebook's output stable across runs\n", "np.random.seed(42)\n", "\n", "# To plot pretty figures\n", "%matplotlib inline\n", "import matplotlib\n", "import matplotlib.pyplot as plt\n", "plt.rcParams['axes.labelsize'] = 14\n", "plt.rcParams['xtick.labelsize'] = 12\n", "plt.rcParams['ytick.labelsize'] = 12\n", "\n", "def save_fig(fig_id, tight_layout=True):\n", " return" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Linear regression using the Normal Equation" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "import numpy as np\n", "\n", "X = 2 * np.random.rand(100, 1)\n", "y = 4 + 3 * X + np.random.randn(100, 1)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "plt.plot(X, y, \"b.\")\n", "plt.xlabel(\"$x_1$\", fontsize=18)\n", "plt.ylabel(\"$y$\", rotation=0, fontsize=18)\n", "plt.axis([0, 2, 0, 15])\n", "plt.show()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "X_b = np.c_[np.ones((100, 1)), X] # add x0 = 1 to each instance\n", "theta_best = np.linalg.inv(X_b.T.dot(X_b)).dot(X_b.T).dot(y)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "theta_best" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "X_new = np.array([[0], [2]])\n", "X_new_b = np.c_[np.ones((2, 1)), X_new] # add x0 = 1 to each instance\n", "y_predict = X_new_b.dot(theta_best)\n", "y_predict" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "plt.plot(X_new, y_predict, \"r-\")\n", "plt.plot(X, y, \"b.\")\n", "plt.axis([0, 2, 0, 15])\n", "plt.show()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "The figure in the book actually corresponds to the following code, with a legend and axis labels:" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "plt.plot(X_new, y_predict, \"r-\", linewidth=2, label=\"Predictions\")\n", "plt.plot(X, y, \"b.\")\n", "plt.xlabel(\"$x_1$\", fontsize=18)\n", "plt.ylabel(\"$y$\", rotation=0, fontsize=18)\n", "plt.legend(loc=\"upper left\", fontsize=14)\n", "plt.axis([0, 2, 0, 15])\n", "plt.show()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from sklearn.linear_model import LinearRegression\n", "lin_reg = LinearRegression()\n", "lin_reg.fit(X, y)\n", "lin_reg.intercept_, lin_reg.coef_" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "lin_reg.predict(X_new)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Linear regression using batch gradient descent" ] }, { 
"cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "eta = 0.1\n", "n_iterations = 1000\n", "m = 100\n", "theta = np.random.randn(2,1)\n", "\n", "for iteration in range(n_iterations):\n", " gradients = 2/m * X_b.T.dot(X_b.dot(theta) - y)\n", " theta = theta - eta * gradients" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "theta" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "X_new_b.dot(theta)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "theta_path_bgd = []\n", "\n", "def plot_gradient_descent(theta, eta, theta_path=None):\n", " m = len(X_b)\n", " plt.plot(X, y, \"b.\")\n", " n_iterations = 1000\n", " for iteration in range(n_iterations):\n", " if iteration < 10:\n", " y_predict = X_new_b.dot(theta)\n", " style = \"b-\" if iteration > 0 else \"r--\"\n", " plt.plot(X_new, y_predict, style)\n", " gradients = 2/m * X_b.T.dot(X_b.dot(theta) - y)\n", " theta = theta - eta * gradients\n", " if theta_path is not None:\n", " theta_path.append(theta)\n", " plt.xlabel(\"$x_1$\", fontsize=18)\n", " plt.axis([0, 2, 0, 15])\n", " plt.title(r\"$\\eta = {}$\".format(eta), fontsize=16)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "np.random.seed(42)\n", "theta = np.random.randn(2,1) # random initialization\n", "\n", "plt.figure(figsize=(10,4))\n", "plt.subplot(131); plot_gradient_descent(theta, eta=0.02)\n", "plt.ylabel(\"$y$\", rotation=0, fontsize=18)\n", "plt.subplot(132); plot_gradient_descent(theta, eta=0.1, theta_path=theta_path_bgd)\n", "plt.subplot(133); plot_gradient_descent(theta, eta=0.5)\n", "\n", "plt.show()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Stochastic Gradient Descent" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "theta_path_sgd = []\n", "m = len(X_b)\n", "np.random.seed(42)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "n_epochs = 50\n", "t0, t1 = 5, 50 # learning schedule hyperparameters\n", "\n", "def learning_schedule(t):\n", " return t0 / (t + t1)\n", "\n", "theta = np.random.randn(2,1) # random initialization\n", "\n", "for epoch in range(n_epochs):\n", " for i in range(m):\n", " if epoch == 0 and i < 20: # not shown in the book\n", " y_predict = X_new_b.dot(theta) # not shown\n", " style = \"b-\" if i > 0 else \"r--\" # not shown\n", " plt.plot(X_new, y_predict, style) # not shown\n", " random_index = np.random.randint(m)\n", " xi = X_b[random_index:random_index+1]\n", " yi = y[random_index:random_index+1]\n", " gradients = 2 * xi.T.dot(xi.dot(theta) - yi)\n", " eta = learning_schedule(epoch * m + i)\n", " theta = theta - eta * gradients\n", " theta_path_sgd.append(theta) # not shown\n", "\n", "plt.plot(X, y, \"b.\") # not shown\n", "plt.xlabel(\"$x_1$\", fontsize=18) # not shown\n", "plt.ylabel(\"$y$\", rotation=0, fontsize=18) # not shown\n", "plt.axis([0, 2, 0, 15]) # not shown\n", "save_fig(\"sgd_plot\") # not shown\n", "plt.show() # not shown" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "theta" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from sklearn.linear_model import SGDRegressor\n", "sgd_reg = SGDRegressor(max_iter=50, penalty=None, eta0=0.1, 
random_state=42)\n", "sgd_reg.fit(X, y.ravel())" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "sgd_reg.intercept_, sgd_reg.coef_" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Mini-batch gradient descent" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "theta_path_mgd = []\n", "\n", "n_iterations = 50\n", "minibatch_size = 20\n", "\n", "np.random.seed(42)\n", "theta = np.random.randn(2,1) # random initialization\n", "\n", "t0, t1 = 10, 1000\n", "def learning_schedule(t):\n", " return t0 / (t + t1)\n", "\n", "t = 0\n", "for epoch in range(n_iterations):\n", " shuffled_indices = np.random.permutation(m)\n", " X_b_shuffled = X_b[shuffled_indices]\n", " y_shuffled = y[shuffled_indices]\n", " for i in range(0, m, minibatch_size):\n", " t += 1\n", " xi = X_b_shuffled[i:i+minibatch_size]\n", " yi = y_shuffled[i:i+minibatch_size]\n", " gradients = 2 * xi.T.dot(xi.dot(theta) - yi)\n", " eta = learning_schedule(t)\n", " theta = theta - eta * gradients\n", " theta_path_mgd.append(theta)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "theta" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "theta_path_bgd = np.array(theta_path_bgd)\n", "theta_path_sgd = np.array(theta_path_sgd)\n", "theta_path_mgd = np.array(theta_path_mgd)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "plt.figure(figsize=(7,4))\n", "plt.plot(theta_path_sgd[:, 0], theta_path_sgd[:, 1], \"r-s\", linewidth=1, label=\"Stochastic\")\n", "plt.plot(theta_path_mgd[:, 0], theta_path_mgd[:, 1], \"g-+\", linewidth=2, label=\"Mini-batch\")\n", "plt.plot(theta_path_bgd[:, 0], theta_path_bgd[:, 1], \"b-o\", linewidth=3, label=\"Batch\")\n", "plt.legend(loc=\"upper left\", fontsize=16)\n", "plt.xlabel(r\"$\\theta_0$\", fontsize=20)\n", "plt.ylabel(r\"$\\theta_1$ \", fontsize=20, rotation=0)\n", "plt.axis([2.5, 4.5, 2.3, 3.9])\n", "save_fig(\"gradient_descent_paths_plot\")\n", "plt.show()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Polynomial regression" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "import numpy as np\n", "import numpy.random as rnd\n", "\n", "np.random.seed(42)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "m = 100\n", "X = 6 * np.random.rand(m, 1) - 3\n", "y = 0.5 * X**2 + X + 2 + np.random.randn(m, 1)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "plt.plot(X, y, \"b.\")\n", "plt.xlabel(\"$x_1$\", fontsize=18)\n", "plt.ylabel(\"$y$\", rotation=0, fontsize=18)\n", "plt.axis([-3, 3, 0, 10])\n", "save_fig(\"quadratic_data_plot\")\n", "plt.show()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from sklearn.preprocessing import PolynomialFeatures\n", "poly_features = PolynomialFeatures(degree=2, include_bias=False)\n", "X_poly = poly_features.fit_transform(X)\n", "X[0]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "X_poly[0]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "lin_reg = LinearRegression()\n", "lin_reg.fit(X_poly, y)\n", "lin_reg.intercept_, lin_reg.coef_" ] }, { 
"cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "X_new=np.linspace(-3, 3, 100).reshape(100, 1)\n", "X_new_poly = poly_features.transform(X_new)\n", "y_new = lin_reg.predict(X_new_poly)\n", "plt.plot(X, y, \"b.\")\n", "plt.plot(X_new, y_new, \"r-\", linewidth=2, label=\"Predictions\")\n", "plt.xlabel(\"$x_1$\", fontsize=18)\n", "plt.ylabel(\"$y$\", rotation=0, fontsize=18)\n", "plt.legend(loc=\"upper left\", fontsize=14)\n", "plt.axis([-3, 3, 0, 10])\n", "save_fig(\"quadratic_predictions_plot\")\n", "plt.show()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from sklearn.preprocessing import StandardScaler\n", "from sklearn.pipeline import Pipeline\n", "\n", "for style, width, degree in ((\"g-\", 1, 300), (\"b--\", 2, 2), (\"r-+\", 2, 1)):\n", " polybig_features = PolynomialFeatures(degree=degree, include_bias=False)\n", " std_scaler = StandardScaler()\n", " lin_reg = LinearRegression()\n", " polynomial_regression = Pipeline([\n", " (\"poly_features\", polybig_features),\n", " (\"std_scaler\", std_scaler),\n", " (\"lin_reg\", lin_reg),\n", " ])\n", " polynomial_regression.fit(X, y)\n", " y_newbig = polynomial_regression.predict(X_new)\n", " plt.plot(X_new, y_newbig, style, label=str(degree), linewidth=width)\n", "\n", "plt.plot(X, y, \"b.\", linewidth=3)\n", "plt.legend(loc=\"upper left\")\n", "plt.xlabel(\"$x_1$\", fontsize=18)\n", "plt.ylabel(\"$y$\", rotation=0, fontsize=18)\n", "plt.axis([-3, 3, 0, 10])\n", "plt.show()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "from sklearn.metrics import mean_squared_error\n", "from sklearn.model_selection import train_test_split\n", "\n", "def plot_learning_curves(model, X, y):\n", " X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=10)\n", " train_errors, val_errors = [], []\n", " for m in range(1, len(X_train)):\n", " model.fit(X_train[:m], y_train[:m])\n", " y_train_predict = model.predict(X_train[:m])\n", " y_val_predict = model.predict(X_val)\n", " train_errors.append(mean_squared_error(y_train_predict, y_train[:m]))\n", " val_errors.append(mean_squared_error(y_val_predict, y_val))\n", "\n", " plt.plot(np.sqrt(train_errors), \"r-+\", linewidth=2, label=\"train\")\n", " plt.plot(np.sqrt(val_errors), \"b-\", linewidth=3, label=\"val\")\n", " plt.legend(loc=\"upper right\", fontsize=14) # not shown in the book\n", " plt.xlabel(\"Training set size\", fontsize=14) # not shown\n", " plt.ylabel(\"RMSE\", fontsize=14) # not shown" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "lin_reg = LinearRegression()\n", "plot_learning_curves(lin_reg, X, y)\n", "plt.axis([0, 80, 0, 3]) # not shown in the book\n", "plt.show() # not shown" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from sklearn.pipeline import Pipeline\n", "\n", "polynomial_regression = Pipeline([\n", " (\"poly_features\", PolynomialFeatures(degree=10, include_bias=False)),\n", " (\"lin_reg\", LinearRegression()),\n", " ])\n", "\n", "plot_learning_curves(polynomial_regression, X, y)\n", "plt.axis([0, 80, 0, 3]) # not shown\n", "plt.show() # not shown" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Regularized models" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from sklearn.linear_model import Ridge\n", "\n", "np.random.seed(42)\n", "m = 
20\n", "X = 3 * np.random.rand(m, 1)\n", "y = 1 + 0.5 * X + np.random.randn(m, 1) / 1.5\n", "X_new = np.linspace(0, 3, 100).reshape(100, 1)\n", "\n", "def plot_model(model_class, polynomial, alphas, **model_kargs):\n", " for alpha, style in zip(alphas, (\"b-\", \"g--\", \"r:\")):\n", " model = model_class(alpha, **model_kargs) if alpha > 0 else LinearRegression()\n", " if polynomial:\n", " model = Pipeline([\n", " (\"poly_features\", PolynomialFeatures(degree=10, include_bias=False)),\n", " (\"std_scaler\", StandardScaler()),\n", " (\"regul_reg\", model),\n", " ])\n", " model.fit(X, y)\n", " y_new_regul = model.predict(X_new)\n", " lw = 2 if alpha > 0 else 1\n", " plt.plot(X_new, y_new_regul, style, linewidth=lw, label=r\"$\\alpha = {}$\".format(alpha))\n", " plt.plot(X, y, \"b.\", linewidth=3)\n", " plt.legend(loc=\"upper left\", fontsize=15)\n", " plt.xlabel(\"$x_1$\", fontsize=18)\n", " plt.axis([0, 3, 0, 4])\n", "\n", "plt.figure(figsize=(8,4))\n", "plt.subplot(121)\n", "plot_model(Ridge, polynomial=False, alphas=(0, 10, 100), random_state=42)\n", "plt.ylabel(\"$y$\", rotation=0, fontsize=18)\n", "plt.subplot(122)\n", "plot_model(Ridge, polynomial=True, alphas=(0, 10**-5, 1), random_state=42)\n", "\n", "plt.show()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from sklearn.linear_model import Ridge\n", "ridge_reg = Ridge(alpha=1, solver=\"cholesky\", random_state=42)\n", "ridge_reg.fit(X, y)\n", "ridge_reg.predict([[1.5]])" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "sgd_reg = SGDRegressor(max_iter=5, penalty=\"l2\", random_state=42)\n", "sgd_reg.fit(X, y.ravel())\n", "sgd_reg.predict([[1.5]])" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "ridge_reg = Ridge(alpha=1, solver=\"sag\", random_state=42)\n", "ridge_reg.fit(X, y)\n", "ridge_reg.predict([[1.5]])" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from sklearn.linear_model import Lasso\n", "\n", "plt.figure(figsize=(8,4))\n", "plt.subplot(121)\n", "plot_model(Lasso, polynomial=False, alphas=(0, 0.1, 1), random_state=42)\n", "plt.ylabel(\"$y$\", rotation=0, fontsize=18)\n", "plt.subplot(122)\n", "plot_model(Lasso, polynomial=True, alphas=(0, 10**-7, 1), tol=1, random_state=42)\n", "\n", "plt.show()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from sklearn.linear_model import Lasso\n", "lasso_reg = Lasso(alpha=0.1)\n", "lasso_reg.fit(X, y)\n", "lasso_reg.predict([[1.5]])" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from sklearn.linear_model import ElasticNet\n", "elastic_net = ElasticNet(alpha=0.1, l1_ratio=0.5, random_state=42)\n", "elastic_net.fit(X, y)\n", "elastic_net.predict([[1.5]])" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "scrolled": true }, "outputs": [], "source": [ "np.random.seed(42)\n", "m = 100\n", "X = 6 * np.random.rand(m, 1) - 3\n", "y = 2 + X + 0.5 * X**2 + np.random.randn(m, 1)\n", "\n", "X_train, X_val, y_train, y_val = train_test_split(X[:50], y[:50].ravel(), test_size=0.5, random_state=10)\n", "\n", "poly_scaler = Pipeline([\n", " (\"poly_features\", PolynomialFeatures(degree=90, include_bias=False)),\n", " (\"std_scaler\", StandardScaler()),\n", " ])\n", "\n", "X_train_poly_scaled = poly_scaler.fit_transform(X_train)\n", "X_val_poly_scaled = poly_scaler.transform(X_val)\n", "\n", 
"sgd_reg = SGDRegressor(max_iter=1,\n", " penalty=None,\n", " eta0=0.0005,\n", " warm_start=True,\n", " learning_rate=\"constant\",\n", " random_state=42)\n", "\n", "n_epochs = 500\n", "train_errors, val_errors = [], []\n", "for epoch in range(n_epochs):\n", " sgd_reg.fit(X_train_poly_scaled, y_train)\n", " y_train_predict = sgd_reg.predict(X_train_poly_scaled)\n", " y_val_predict = sgd_reg.predict(X_val_poly_scaled)\n", " train_errors.append(mean_squared_error(y_train_predict, y_train))\n", " val_errors.append(mean_squared_error(y_val_predict, y_val))\n", "\n", "best_epoch = np.argmin(val_errors)\n", "best_val_rmse = np.sqrt(val_errors[best_epoch])\n", "\n", "plt.annotate('Best model',\n", " xy=(best_epoch, best_val_rmse),\n", " xytext=(best_epoch, best_val_rmse + 1),\n", " ha=\"center\",\n", " arrowprops=dict(facecolor='black', shrink=0.05),\n", " fontsize=16,\n", " )\n", "\n", "best_val_rmse -= 0.03 # just to make the graph look better\n", "plt.plot([0, n_epochs], [best_val_rmse, best_val_rmse], \"k:\", linewidth=2)\n", "plt.plot(np.sqrt(val_errors), \"b-\", linewidth=3, label=\"Validation set\")\n", "plt.plot(np.sqrt(train_errors), \"r--\", linewidth=2, label=\"Training set\")\n", "plt.legend(loc=\"upper right\", fontsize=14)\n", "plt.xlabel(\"Epoch\", fontsize=14)\n", "plt.ylabel(\"RMSE\", fontsize=14)\n", "plt.show()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "from sklearn.base import clone\n", "sgd_reg = SGDRegressor(max_iter=1, warm_start=True, penalty=None,\n", " learning_rate=\"constant\", eta0=0.0005, random_state=42)\n", "\n", "minimum_val_error = float(\"inf\")\n", "best_epoch = None\n", "best_model = None\n", "for epoch in range(1000):\n", " sgd_reg.fit(X_train_poly_scaled, y_train) # continues where it left off\n", " y_val_predict = sgd_reg.predict(X_val_poly_scaled)\n", " val_error = mean_squared_error(y_val_predict, y_val)\n", " if val_error < minimum_val_error:\n", " minimum_val_error = val_error\n", " best_epoch = epoch\n", " best_model = clone(sgd_reg)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "best_epoch, best_model" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "%matplotlib inline\n", "import matplotlib.pyplot as plt\n", "import numpy as np" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "t1a, t1b, t2a, t2b = -1, 3, -1.5, 1.5\n", "\n", "# ignoring bias term\n", "t1s = np.linspace(t1a, t1b, 500)\n", "t2s = np.linspace(t2a, t2b, 500)\n", "t1, t2 = np.meshgrid(t1s, t2s)\n", "T = np.c_[t1.ravel(), t2.ravel()]\n", "Xr = np.array([[-1, 1], [-0.3, -1], [1, 0.1]])\n", "yr = 2 * Xr[:, :1] + 0.5 * Xr[:, 1:]\n", "\n", "J = (1/len(Xr) * np.sum((T.dot(Xr.T) - yr.T)**2, axis=1)).reshape(t1.shape)\n", "\n", "N1 = np.linalg.norm(T, ord=1, axis=1).reshape(t1.shape)\n", "N2 = np.linalg.norm(T, ord=2, axis=1).reshape(t1.shape)\n", "\n", "t_min_idx = np.unravel_index(np.argmin(J), J.shape)\n", "t1_min, t2_min = t1[t_min_idx], t2[t_min_idx]\n", "\n", "t_init = np.array([[0.25], [-1]])" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def bgd_path(theta, X, y, l1, l2, core = 1, eta = 0.1, n_iterations = 50):\n", " path = [theta]\n", " for iteration in range(n_iterations):\n", " gradients = core * 2/len(X) * X.T.dot(X.dot(theta) - y) + l1 * np.sign(theta) + 2 * l2 * theta\n", "\n", " 
theta = theta - eta * gradients\n", " path.append(theta)\n", " return np.array(path)\n", "\n", "plt.figure(figsize=(12, 8))\n", "for i, N, l1, l2, title in ((0, N1, 0.5, 0, \"Lasso\"), (1, N2, 0, 0.1, \"Ridge\")):\n", " JR = J + l1 * N1 + l2 * N2**2\n", " \n", " tr_min_idx = np.unravel_index(np.argmin(JR), JR.shape)\n", " t1r_min, t2r_min = t1[tr_min_idx], t2[tr_min_idx]\n", "\n", " levelsJ=(np.exp(np.linspace(0, 1, 20)) - 1) * (np.max(J) - np.min(J)) + np.min(J)\n", " levelsJR=(np.exp(np.linspace(0, 1, 20)) - 1) * (np.max(JR) - np.min(JR)) + np.min(JR)\n", " levelsN=np.linspace(0, np.max(N), 10)\n", " \n", " path_J = bgd_path(t_init, Xr, yr, l1=0, l2=0)\n", " path_JR = bgd_path(t_init, Xr, yr, l1, l2)\n", " path_N = bgd_path(t_init, Xr, yr, np.sign(l1)/3, np.sign(l2), core=0)\n", "\n", " plt.subplot(221 + i * 2)\n", " plt.grid(True)\n", " plt.axhline(y=0, color='k')\n", " plt.axvline(x=0, color='k')\n", " plt.contourf(t1, t2, J, levels=levelsJ, alpha=0.9)\n", " plt.contour(t1, t2, N, levels=levelsN)\n", " plt.plot(path_J[:, 0], path_J[:, 1], \"w-o\")\n", " plt.plot(path_N[:, 0], path_N[:, 1], \"y-^\")\n", " plt.plot(t1_min, t2_min, \"rs\")\n", " plt.title(r\"$\\ell_{}$ penalty\".format(i + 1), fontsize=16)\n", " plt.axis([t1a, t1b, t2a, t2b])\n", "\n", " plt.subplot(222 + i * 2)\n", " plt.grid(True)\n", " plt.axhline(y=0, color='k')\n", " plt.axvline(x=0, color='k')\n", " plt.contourf(t1, t2, JR, levels=levelsJR, alpha=0.9)\n", " plt.plot(path_JR[:, 0], path_JR[:, 1], \"w-o\")\n", " plt.plot(t1r_min, t2r_min, \"rs\")\n", " plt.title(title, fontsize=16)\n", " plt.axis([t1a, t1b, t2a, t2b])\n", "\n", "for subplot in (221, 223):\n", " plt.subplot(subplot)\n", " plt.ylabel(r\"$\\theta_2$\", fontsize=20, rotation=0)\n", "\n", "for subplot in (223, 224):\n", " plt.subplot(subplot)\n", " plt.xlabel(r\"$\\theta_1$\", fontsize=20)\n", "\n", "plt.show()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Logistic regression" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "t = np.linspace(-10, 10, 100)\n", "sig = 1 / (1 + np.exp(-t))\n", "plt.figure(figsize=(9, 3))\n", "plt.plot([-10, 10], [0, 0], \"k-\")\n", "plt.plot([-10, 10], [0.5, 0.5], \"k:\")\n", "plt.plot([-10, 10], [1, 1], \"k:\")\n", "plt.plot([0, 0], [-1.1, 1.1], \"k-\")\n", "plt.plot(t, sig, \"b-\", linewidth=2, label=r\"$\\sigma(t) = \\frac{1}{1 + e^{-t}}$\")\n", "plt.xlabel(\"t\")\n", "plt.legend(loc=\"upper left\", fontsize=20)\n", "plt.axis([-10, 10, -0.1, 1.1])\n", "plt.show()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from sklearn import datasets\n", "iris = datasets.load_iris()\n", "list(iris.keys())" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "print(iris.DESCR)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "X = iris[\"data\"][:, 3:] # petal width\n", "y = (iris[\"target\"] == 2).astype(np.int) # 1 if Iris-Virginica, else 0" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from sklearn.linear_model import LogisticRegression\n", "log_reg = LogisticRegression(random_state=42)\n", "log_reg.fit(X, y)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "X_new = np.linspace(0, 3, 1000).reshape(-1, 1)\n", "y_proba = log_reg.predict_proba(X_new)\n", "\n", "plt.plot(X_new, y_proba[:, 1], \"g-\", linewidth=2, 
label=\"Iris-Virginica\")\n", "plt.plot(X_new, y_proba[:, 0], \"b--\", linewidth=2, label=\"Not Iris-Virginica\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "The figure in the book actually is actually a bit fancier:" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "X_new = np.linspace(0, 3, 1000).reshape(-1, 1)\n", "y_proba = log_reg.predict_proba(X_new)\n", "decision_boundary = X_new[y_proba[:, 1] >= 0.5][0]\n", "\n", "plt.figure(figsize=(8, 3))\n", "plt.plot(X[y==0], y[y==0], \"bs\")\n", "plt.plot(X[y==1], y[y==1], \"g^\")\n", "plt.plot([decision_boundary, decision_boundary], [-1, 2], \"k:\", linewidth=2)\n", "plt.plot(X_new, y_proba[:, 1], \"g-\", linewidth=2, label=\"Iris-Virginica\")\n", "plt.plot(X_new, y_proba[:, 0], \"b--\", linewidth=2, label=\"Not Iris-Virginica\")\n", "plt.text(decision_boundary+0.02, 0.15, \"Decision boundary\", fontsize=14, color=\"k\", ha=\"center\")\n", "plt.arrow(decision_boundary, 0.08, -0.3, 0, head_width=0.05, head_length=0.1, fc='b', ec='b')\n", "plt.arrow(decision_boundary, 0.92, 0.3, 0, head_width=0.05, head_length=0.1, fc='g', ec='g')\n", "plt.xlabel(\"Petal width (cm)\", fontsize=14)\n", "plt.ylabel(\"Probability\", fontsize=14)\n", "plt.legend(loc=\"center left\", fontsize=14)\n", "plt.axis([0, 3, -0.02, 1.02])\n", "plt.show()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "decision_boundary" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "log_reg.predict([[1.7], [1.5]])" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from sklearn.linear_model import LogisticRegression\n", "\n", "X = iris[\"data\"][:, (2, 3)] # petal length, petal width\n", "y = (iris[\"target\"] == 2).astype(np.int)\n", "\n", "log_reg = LogisticRegression(C=10**10, random_state=42)\n", "log_reg.fit(X, y)\n", "\n", "x0, x1 = np.meshgrid(\n", " np.linspace(2.9, 7, 500).reshape(-1, 1),\n", " np.linspace(0.8, 2.7, 200).reshape(-1, 1),\n", " )\n", "X_new = np.c_[x0.ravel(), x1.ravel()]\n", "\n", "y_proba = log_reg.predict_proba(X_new)\n", "\n", "plt.figure(figsize=(10, 4))\n", "plt.plot(X[y==0, 0], X[y==0, 1], \"bs\")\n", "plt.plot(X[y==1, 0], X[y==1, 1], \"g^\")\n", "\n", "zz = y_proba[:, 1].reshape(x0.shape)\n", "contour = plt.contour(x0, x1, zz, cmap=plt.cm.brg)\n", "\n", "\n", "left_right = np.array([2.9, 7])\n", "boundary = -(log_reg.coef_[0][0] * left_right + log_reg.intercept_[0]) / log_reg.coef_[0][1]\n", "\n", "plt.clabel(contour, inline=1, fontsize=12)\n", "plt.plot(left_right, boundary, \"k--\", linewidth=3)\n", "plt.text(3.5, 1.5, \"Not Iris-Virginica\", fontsize=14, color=\"b\", ha=\"center\")\n", "plt.text(6.5, 2.3, \"Iris-Virginica\", fontsize=14, color=\"g\", ha=\"center\")\n", "plt.xlabel(\"Petal length\", fontsize=14)\n", "plt.ylabel(\"Petal width\", fontsize=14)\n", "plt.axis([2.9, 7, 0.8, 2.7])\n", "plt.show()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "X = iris[\"data\"][:, (2, 3)] # petal length, petal width\n", "y = iris[\"target\"]\n", "\n", "softmax_reg = LogisticRegression(multi_class=\"multinomial\",solver=\"lbfgs\", C=10, random_state=42)\n", "softmax_reg.fit(X, y)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "x0, x1 = np.meshgrid(\n", " np.linspace(0, 8, 500).reshape(-1, 1),\n", " np.linspace(0, 3.5, 200).reshape(-1, 1),\n", " )\n", "X_new = 
np.c_[x0.ravel(), x1.ravel()]\n", "\n", "\n", "y_proba = softmax_reg.predict_proba(X_new)\n", "y_predict = softmax_reg.predict(X_new)\n", "\n", "zz1 = y_proba[:, 1].reshape(x0.shape)\n", "zz = y_predict.reshape(x0.shape)\n", "\n", "plt.figure(figsize=(10, 4))\n", "plt.plot(X[y==2, 0], X[y==2, 1], \"g^\", label=\"Iris-Virginica\")\n", "plt.plot(X[y==1, 0], X[y==1, 1], \"bs\", label=\"Iris-Versicolor\")\n", "plt.plot(X[y==0, 0], X[y==0, 1], \"yo\", label=\"Iris-Setosa\")\n", "\n", "from matplotlib.colors import ListedColormap\n", "custom_cmap = ListedColormap(['#fafab0','#9898ff','#a0faa0'])\n", "\n", "plt.contourf(x0, x1, zz, cmap=custom_cmap, linewidth=5)\n", "contour = plt.contour(x0, x1, zz1, cmap=plt.cm.brg)\n", "plt.clabel(contour, inline=1, fontsize=12)\n", "plt.xlabel(\"Petal length\", fontsize=14)\n", "plt.ylabel(\"Petal width\", fontsize=14)\n", "plt.legend(loc=\"center left\", fontsize=14)\n", "plt.axis([0, 7, 0, 3.5])\n", "plt.show()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "softmax_reg.predict([[5, 2]])" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "softmax_reg.predict_proba([[5, 2]])" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.3" }, "nav_menu": {}, "toc": { "navigate_menu": true, "number_sections": true, "sideBar": true, "threshold": 6, "toc_cell": false, "toc_section_display": "block", "toc_window_display": false } }, "nbformat": 4, "nbformat_minor": 1 }