{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "import sklearn\n",
    "import matplotlib\n",
    "import matplotlib.pyplot as plt\n",
    "import scipy\n",
    "import seaborn as sns\n",
    "import os\n",
    "import re\n",
    "\n",
    "import sklearn_pandas\n",
    "import qgrid\n",
    "\n",
    "import math\n",
    "\n",
    "qgrid.set_grid_option('maxVisibleRows', 30)\n",
    "qgrid.set_grid_option('enableColumnReorder', True)\n",
    "\n",
    "%matplotlib inline"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Data location: defaults to the Kaggle CLI download folder under the user's home\n",
    "# directory and can be redirected with the TITANIC_DATA_DIR environment variable.\n",
    "DATA_DIR = os.environ.get(\n",
    "    'TITANIC_DATA_DIR',\n",
    "    os.path.join(os.path.expanduser('~'), '.kaggle', 'competitions', 'titanic'),\n",
    ")\n",
    "\n",
    "train = pd.read_csv(os.path.join(DATA_DIR, 'train.csv'))\n",
    "test = pd.read_csv(os.path.join(DATA_DIR, 'test.csv'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Stack the train and test rows into one frame. ignore_index is not set, so each\n",
    "# input keeps its own row labels in the result.\n",
    "alldata = pd.concat([train, test])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Spearman rank correlation between the numeric columns. Selecting them explicitly\n",
    "# keeps this working on pandas versions that no longer drop text columns silently.\n",
    "train.select_dtypes(include=[np.number]).corr(method='spearman')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Mean of Survived per (Embarked, Pclass), pivoted so Pclass becomes the columns.\n",
    "# Only Survived is averaged, so text columns never enter the aggregation.\n",
    "train.groupby(['Embarked', 'Pclass']).Survived.mean().unstack()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Print column dtypes and non-null counts to spot columns with missing values.\n",
    "train.info()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Number of PassengerId entries per (Sex, Pclass), pivoted so Pclass becomes the columns.\n",
    "train.groupby(['Sex', 'Pclass']).PassengerId.count().unstack()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.pipeline import Pipeline\n",
    "from sklearn.pipeline import FeatureUnion\n",
    "from sklearn.preprocessing import Imputer\n",
    "from sklearn.preprocessing import FunctionTransformer\n",
    "from sklearn.preprocessing import MultiLabelBinarizer\n",
    "from sklearn.base import BaseEstimator, TransformerMixin, ClassifierMixin\n",
    "from category_encoders import OneHotEncoder"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "class CategoricalEncoder(BaseEstimator, TransformerMixin):\n",
    "    \"\"\"One-hot encode categorical input, optionally keeping only selected categories.\n",
    "\n",
    "    Wraps ``OneHotEncoder`` configured with ``return_df=False`` and\n",
    "    ``handle_unknown='ignore'``.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    categories : list or None\n",
    "        Category values whose indicator columns ``transform`` keeps. ``None`` keeps\n",
    "        every column produced by the wrapped encoder.\n",
    "    \"\"\"\n",
    "\n",
    "    def __init__(self, categories=None):\n",
    "        self.categories = categories\n",
    "\n",
    "    def _category_mapping(self):\n",
    "        \"\"\"Return the (category, index) pairs the fitted encoder recorded for its first column.\"\"\"\n",
    "        return self.onehotencoder_.ordinal_encoder.mapping[0]['mapping']\n",
    "\n",
    "    def get_feature_names(self):\n",
    "        \"\"\"Return the category labels in the same order as the columns of ``transform``.\n",
    "\n",
    "        Before fitting, a configured ``categories`` list is returned unchanged.\n",
    "        \"\"\"\n",
    "        if self.categories is None:\n",
    "            return [c for c, i in self._category_mapping()]\n",
    "\n",
    "        if not hasattr(self, 'onehotencoder_'):\n",
    "            # Not fitted yet: the requested categories are all that can be reported.\n",
    "            return self.categories\n",
    "\n",
    "        # transform() picks columns in mapping order, so the labels must follow that\n",
    "        # order too (and skip requested categories the encoder never recorded).\n",
    "        wanted = set(self.categories)\n",
    "        return [c for c, i in self._category_mapping() if c in wanted]\n",
    "\n",
    "    def fit(self, X, y=None):\n",
    "        \"\"\"Fit the wrapped one-hot encoder on ``X`` and return ``self``.\"\"\"\n",
    "        # Only DataFrame-like input exposes .columns; anything else is fitted with the\n",
    "        # encoder's default column selection.\n",
    "        columns = getattr(X, 'columns', None)\n",
    "        if columns is None:\n",
    "            self.onehotencoder_ = OneHotEncoder(return_df=False, handle_unknown='ignore')\n",
    "        else:\n",
    "            self.onehotencoder_ = OneHotEncoder(cols=list(columns), return_df=False, handle_unknown='ignore')\n",
    "        self.onehotencoder_.fit(X)\n",
    "\n",
    "        # Returning self lets fit(...).transform(...) and fit_transform work.\n",
    "        return self\n",
    "\n",
    "    def transform(self, X):\n",
    "        \"\"\"Return the encoded array, restricted to ``categories`` when they are configured.\"\"\"\n",
    "        res = self.onehotencoder_.transform(X)\n",
    "\n",
    "        if self.categories is not None:\n",
    "            wanted = set(self.categories)\n",
    "            # NOTE(review): the second element of each mapping pair is used directly as a\n",
    "            # column index into the encoder output - confirm against the installed\n",
    "            # category_encoders version.\n",
    "            idxs = [i for c, i in self._category_mapping() if c in wanted]\n",
    "            return res[:, idxs]\n",
    "\n",
    "        return res"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "class TicketStats(BaseEstimator, TransformerMixin):\n",
    "    \"\"\"Per-ticket aggregates: summed target, number of females and passenger count.\n",
    "\n",
    "    Expects a DataFrame with ``Ticket``, ``Sex`` and ``PassengerId`` columns.\n",
    "    \"\"\"\n",
    "\n",
    "    def get_feature_names(self):\n",
    "        \"\"\"Names of the three columns returned by ``transform``.\"\"\"\n",
    "        return ['Survived','Sex','Count']\n",
    "\n",
    "    def fit(self, X, y):\n",
    "        \"\"\"Build the per-ticket lookup table from ``X`` and the target ``y``.\"\"\"\n",
    "        # 1.0 where Sex is 'female', 0.0 everywhere else.\n",
    "        is_female = np.zeros(len(X))\n",
    "        is_female[X.Sex=='female'] = 1\n",
    "\n",
    "        labelled = X.assign(Sex=is_female, Survived=y)\n",
    "        by_ticket = labelled.groupby('Ticket')\n",
    "\n",
    "        stats = by_ticket.sum()[['Survived','Sex']]\n",
    "        stats = stats.assign(Count = by_ticket.count().PassengerId)\n",
    "        stats.columns = self.get_feature_names()\n",
    "\n",
    "        self.ticketstats_ = stats\n",
    "        return self\n",
    "\n",
    "    def transform(self, X):\n",
    "        \"\"\"Look up the fitted statistics for every row; missing values become 0.\"\"\"\n",
    "        # lsuffix renames X's own overlapping columns (such as Sex) so the names from\n",
    "        # the lookup table survive the join unchanged.\n",
    "        joined = X.join(self.ticketstats_, on='Ticket', lsuffix='aa').fillna(0)\n",
    "        return joined[self.get_feature_names()]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "class CleanTitles(BaseEstimator, TransformerMixin):\n",
    "    \"\"\"Replace the ``Name`` column by a normalized title taken from the name.\n",
    "\n",
    "    The title is the text between the first ', ' and the following '.', looked up in\n",
    "    ``titlemap``. A title missing from ``titlemap`` raises ``KeyError``.\n",
    "    \"\"\"\n",
    "\n",
    "    # Raw title (including the trailing dot) -> normalized title group.\n",
    "    titlemap = {\n",
    "            'Capt.': 'Military',\n",
    "            'Col.': 'Military',\n",
    "            'Don.': 'Mr.',\n",
    "            'Dona.': 'Mrs.',\n",
    "            'Dr.': 'Dr.',\n",
    "            'Jonkheer.': 'Mr.',\n",
    "            'Lady.': 'Nobility',\n",
    "            'Major.': 'Military',\n",
    "            'Master.': 'Master.',\n",
    "            'Miss.': 'Miss.',\n",
    "            'Mlle.': 'Miss.',\n",
    "            'Mme.': 'Mrs.',\n",
    "            'Mr.': 'Mr.',\n",
    "            'Mrs.': 'Mrs.',\n",
    "            'Ms.': 'Miss.',\n",
    "            'Rev.': 'Clergy',\n",
    "            'Sir.': 'Nobility',\n",
    "            'the Countess.': 'Nobility'\n",
    "    }\n",
    "\n",
    "    def fit(self, X, y=None):\n",
    "        \"\"\"Nothing to learn; returns ``self``.\"\"\"\n",
    "        return self\n",
    "\n",
    "    def transform(self, X):\n",
    "        \"\"\"Return ``X`` as an array with ``Name`` replaced by its normalized title.\"\"\"\n",
    "        def get_title(t):\n",
    "            return self.titlemap[t.split(', ')[1].split('.')[0]+'.']\n",
    "\n",
    "        # A plain list comprehension also copes with an empty frame.\n",
    "        titles = [get_title(name) for name in X.Name]\n",
    "\n",
    "        # .values is available on every pandas version, unlike as_matrix().\n",
    "        return X.assign(Name=titles).values"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "class CategoricalImputer(BaseEstimator, TransformerMixin):\n",
    "    \"\"\"Impute values separately for each category, with an overall fallback.\n",
    "\n",
    "    Works on a 2-D array whose first column holds the category and whose remaining\n",
    "    columns hold the values to impute.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    categories : iterable\n",
    "        Category values that get their own clone of ``imputer``.\n",
    "    imputer : estimator\n",
    "        Template imputer; it is cloned, and the clones must expose ``statistics_``.\n",
    "    drop_categories : bool\n",
    "        When True, ``transform`` returns only the value columns.\n",
    "    \"\"\"\n",
    "\n",
    "    def __init__(self, categories, imputer, drop_categories=True):\n",
    "        self.categories = categories\n",
    "        self.imputer = imputer\n",
    "        self.drop_categories = drop_categories\n",
    "\n",
    "    def fit(self, X, y=None):\n",
    "        \"\"\"Fit one imputer per category present in ``X`` plus a default one on all rows.\"\"\"\n",
    "        per_category = self.imputers_ = {}\n",
    "\n",
    "        for category in self.categories:\n",
    "            rows = X[:,0]==category\n",
    "            if not np.any(rows):\n",
    "                continue\n",
    "\n",
    "            candidate = sklearn.base.clone(self.imputer)\n",
    "            candidate.fit(X[rows, 1:])\n",
    "\n",
    "            # Keep the imputer only if it learned at least one usable statistic.\n",
    "            if np.any(~np.isnan(candidate.statistics_)):\n",
    "                per_category[category] = candidate\n",
    "\n",
    "        fallback = sklearn.base.clone(self.imputer)\n",
    "        fallback.fit(X[:, 1:])\n",
    "        self.defaultimputer_ = fallback\n",
    "\n",
    "        return self\n",
    "\n",
    "    def transform(self, X):\n",
    "        \"\"\"Fill values per category first, then apply the default imputer to all rows.\"\"\"\n",
    "        filled = X.copy()\n",
    "\n",
    "        for category, imputer in self.imputers_.items():\n",
    "            rows = filled[:,0]==category\n",
    "            if np.any(rows):\n",
    "                filled[rows, 1:] = imputer.transform(filled[rows, 1:])\n",
    "\n",
    "        filled[:, 1:] = self.defaultimputer_.transform(filled[:, 1:])\n",
    "\n",
    "        return filled[:, 1:] if self.drop_categories else filled"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "class CabinFeatures(BaseEstimator, TransformerMixin):\n",
    "    \"\"\"Derive deck indicator columns and a mean cabin number from the ``Cabin`` column.\n",
    "\n",
    "    ``transform`` returns one indicator column per deck letter followed by the mean\n",
    "    cabin number; missing cabin numbers are filled by a ``CategoricalImputer`` keyed\n",
    "    on the deck.\n",
    "    \"\"\"\n",
    "\n",
    "    # Deck letters recognized in cabin strings.\n",
    "    decks = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'T']\n",
    "\n",
    "    def get_feature_names(self):\n",
    "        \"\"\"Labels for the output columns: the deck letters followed by 'cabin'.\"\"\"\n",
    "        return self.decks + ['cabin']\n",
    "\n",
    "    def _extract_decks(self, X):\n",
    "        \"\"\"Return, per row, the set of characters left after removing digits and spaces.\"\"\"\n",
    "        # Raw string: '\\d' is a regex class, not a Python string escape.\n",
    "        return [set(re.sub(r'\\d+| ', '', c)) for c in X.Cabin.fillna('')]\n",
    "\n",
    "    def _extract_numbers(self, X):\n",
    "        \"\"\"Return, per row, the mean of the cabin numbers (NaN when there are none).\"\"\"\n",
    "        def mean_cabin(c):\n",
    "            numbers = [int(x) for x in re.sub('[A-Z] ?', '', c).split(' ') if x != '']\n",
    "            return np.mean(numbers) if numbers else np.nan\n",
    "\n",
    "        return [mean_cabin(c) for c in X.Cabin.fillna('')]\n",
    "\n",
    "    def _preprocess(self, X):\n",
    "        \"\"\"Build a frame with deck indicators, ``cabav`` (mean number) and ``deck`` (label).\"\"\"\n",
    "        classes = list(self.deckbinarizer_.classes_)\n",
    "\n",
    "        deckonehot = self.deckbinarizer_.transform(self._extract_decks(X))\n",
    "        cabav = self._extract_numbers(X)\n",
    "\n",
    "        df = pd.DataFrame(deckonehot, columns=classes).assign(cabav=cabav)\n",
    "\n",
    "        # When F is present, every other deck indicator of that row is cleared.\n",
    "        df.loc[df.F == 1, [d for d in classes if d != 'F']] = 0\n",
    "\n",
    "        # Single deck label per row for the imputer; 'NaN' marks rows without a deck.\n",
    "        deck = self.deckbinarizer_.inverse_transform(df[classes].values)\n",
    "        df = df.assign(deck=[d[0] if len(d) else 'NaN' for d in deck])\n",
    "\n",
    "        return df\n",
    "\n",
    "    def fit(self, X, y=None):\n",
    "        \"\"\"Fit the deck binarizer and the per-deck cabin-number imputer; returns ``self``.\"\"\"\n",
    "        self.deckbinarizer_ = MultiLabelBinarizer(classes = self.decks)\n",
    "        self.deckbinarizer_.fit(self._extract_decks(X))\n",
    "\n",
    "        df = self._preprocess(X)\n",
    "\n",
    "        self.cabinimputer_ = CategoricalImputer(self.decks, Imputer())\n",
    "        # .values is available on every pandas version, unlike as_matrix().\n",
    "        self.cabinimputer_.fit(df[['deck', 'cabav']].values)\n",
    "\n",
    "        return self\n",
    "\n",
    "    def transform(self, X):\n",
    "        \"\"\"Return an array of deck indicators followed by the imputed mean cabin number.\"\"\"\n",
    "        df = self._preprocess(X)\n",
    "\n",
    "        cabav = self.cabinimputer_.transform(df[['deck', 'cabav']].values)\n",
    "\n",
    "        # The imputer hands back an (n, 1) array; flatten it before using it as a column.\n",
    "        df = df.assign(cabav=np.ravel(cabav))\n",
    "\n",
    "        return df[list(self.deckbinarizer_.classes_) + ['cabav']].values"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Numeric columns, each paired with the Imputer class.\n",
    "num_features = sklearn_pandas.gen_features(\n",
    "    [['SibSp'], ['Parch'], ['Fare']],\n",
    "    [Imputer],\n",
    ")\n",
    "\n",
    "# Categorical columns, one-hot encoded (optionally restricted to the listed categories).\n",
    "cat_features = [\n",
    "    (['Pclass'], CategoricalEncoder()),\n",
    "    (['Embarked'], CategoricalEncoder(categories=['C', 'Q', 'S'])),\n",
    "    (['Sex'], CategoricalEncoder(categories=['female'])),\n",
    "]\n",
    "\n",
    "# Per-ticket aggregates.\n",
    "ticket_features = [\n",
    "    (['Ticket', 'Sex', 'PassengerId'], TicketStats(), {'alias': 'TicketStats'}),\n",
    "]\n",
    "\n",
    "# Deck indicators and mean cabin number.\n",
    "cabin_features = [\n",
    "    (['Cabin'], CabinFeatures(), {'alias': 'Deck'}),\n",
    "]\n",
    "\n",
    "# Age imputed per cleaned title, plus the cleaned title itself one-hot encoded.\n",
    "title_features = [\n",
    "    (\n",
    "        ['Name', 'Age'],\n",
    "        [CleanTitles(), CategoricalImputer(set(CleanTitles.titlemap.values()), Imputer())],\n",
    "        {'alias': 'Age'},\n",
    "    ),\n",
    "    (\n",
    "        ['Name'],\n",
    "        [CleanTitles(), CategoricalEncoder()],\n",
    "        {'alias': 'Title'},\n",
    "    ),\n",
    "]\n",
    "\n",
    "# Two mappers over the same feature definitions: with and without the ticket statistics.\n",
    "data_pipe = sklearn_pandas.DataFrameMapper(\n",
    "    num_features + cat_features + cabin_features + title_features + ticket_features,\n",
    "    df_out=True,\n",
    "    input_df=True,\n",
    ")\n",
    "data_pipe_no_ticket = sklearn_pandas.DataFrameMapper(\n",
    "    num_features + cat_features + cabin_features + title_features,\n",
    "    df_out=True,\n",
    "    input_df=True,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Fit the feature mapper on the training frame and build a float matrix from the result.\n",
    "Xdf = data_pipe.fit_transform(train, train.Survived)\n",
    "# .values is available on every pandas version, unlike as_matrix().\n",
    "X = np.array(Xdf.values, dtype='float')\n",
    "X.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Target vector as a Series and as a plain array.\n",
    "ydf = train.Survived\n",
    "# .values is available on every pandas version, unlike as_matrix().\n",
    "y = ydf.values\n",
    "y.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Preview the first rows of the engineered feature frame.\n",
    "Xdf.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Show the kernel's working directory.\n",
    "os.getcwd()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Persist the engineered features and the target (relative paths, so the files are\n",
    "# written into the current working directory).\n",
    "Xdf.to_pickle('Xtrain.pickle')\n",
    "ydf.to_pickle('ytrain.pickle')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# ¡¡¡SPOILER!!!"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " "
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " "
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# ¡¡¡SPOILER!!!"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " "
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " "
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " "
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " "
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " "
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# ¡¡¡SPOILER!!!"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " "
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " "
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " "
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " "
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " "
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " "
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " "
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# ¡¡¡SPOILER!!!"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " "
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " "
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " "
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " "
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " "
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " "
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " "
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " "
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " "
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " "
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " "
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " "
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " "
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " "
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " \n",
    " "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "class TicketStatsNormalizer(BaseEstimator, TransformerMixin):\n",
    "    \"\"\"Convert per-ticket sums into per-ticket rates and optionally fill the gaps.\n",
    "\n",
    "    ``TicketStats_Sex`` and ``TicketStats_Survived`` are divided by\n",
    "    ``TicketStats_Count``; rows whose count is 0 get NaN in both columns.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    imputer_strategy : None, Imputer, other estimator, or number\n",
    "        None leaves the NaNs in place. An ``Imputer`` is fitted on and applied to the\n",
    "        two rate columns. Any other estimator is fitted on the whole normalized frame\n",
    "        and its ``transform`` output is returned as is. A real number is written into\n",
    "        the two rate columns of rows whose count is 0.\n",
    "    \"\"\"\n",
    "\n",
    "    # The two columns that are turned into rates.\n",
    "    _stat_columns = ['TicketStats_Sex', 'TicketStats_Survived']\n",
    "\n",
    "    def __init__(self, imputer_strategy=None):\n",
    "        self.imputer_strategy = imputer_strategy\n",
    "\n",
    "    def _normalize(self, X):\n",
    "        \"\"\"Return a copy of ``X`` with the two statistics divided by the ticket count.\"\"\"\n",
    "        rates = X.copy()\n",
    "\n",
    "        rates.loc[rates.TicketStats_Count==0, self._stat_columns] = np.nan\n",
    "\n",
    "        counted = rates.TicketStats_Count!=0\n",
    "        for column in self._stat_columns:\n",
    "            rates.loc[counted, column] /= rates.loc[counted, 'TicketStats_Count']\n",
    "\n",
    "        return rates\n",
    "\n",
    "    def fit(self, X, y=None):\n",
    "        \"\"\"Fit ``imputer_strategy`` on the normalized data when it is an estimator.\"\"\"\n",
    "        strategy = self.imputer_strategy\n",
    "\n",
    "        if not isinstance(strategy, BaseEstimator):\n",
    "            return self\n",
    "\n",
    "        rates = self._normalize(X)\n",
    "\n",
    "        if isinstance(strategy, Imputer):\n",
    "            strategy.fit(rates.loc[:, self._stat_columns], y)\n",
    "        else:\n",
    "            strategy.fit(rates, y)\n",
    "\n",
    "        return self\n",
    "\n",
    "    def transform(self, X):\n",
    "        \"\"\"Normalize ``X`` and apply the configured fill strategy.\"\"\"\n",
    "        rates = self._normalize(X)\n",
    "        strategy = self.imputer_strategy\n",
    "\n",
    "        if strategy is None:\n",
    "            return rates\n",
    "\n",
    "        # Imputer is itself a BaseEstimator, so it has to be tested first.\n",
    "        if isinstance(strategy, Imputer):\n",
    "            rates.loc[:, self._stat_columns] = strategy.transform(rates.loc[:, self._stat_columns])\n",
    "        elif isinstance(strategy, BaseEstimator):\n",
    "            return strategy.transform(rates)\n",
    "        elif np.isreal(strategy):\n",
    "            rates.loc[rates.TicketStats_Count==0, self._stat_columns] = strategy\n",
    "\n",
    "        return rates\n",
    "        \n",
    "\n",
    "class TicketStatsClassifier(BaseEstimator, ClassifierMixin):\n",
    "    \"\"\"Classifier wrapper that is trained and queried only on rows with ticket statistics.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    classifier : estimator\n",
    "        Wrapped classifier; must provide ``fit``, ``predict`` and ``predict_proba``.\n",
    "    ticketstatsnormalizer : transformer or None\n",
    "        Turns the ``TicketStats_*`` sums into rates. ``None`` uses a\n",
    "        ``TicketStatsNormalizer`` with its default settings.\n",
    "\n",
    "    Rows whose normalized ``TicketStats_Survived`` is NaN are left out of training;\n",
    "    for them ``predict`` returns NaN and ``predict_proba`` returns 0.5 for each class.\n",
    "    \"\"\"\n",
    "\n",
    "    def __init__(self, classifier, ticketstatsnormalizer=None):\n",
    "        self.classifier = classifier\n",
    "        self.ticketstatsnormalizer = ticketstatsnormalizer\n",
    "\n",
    "    def _normalizer(self):\n",
    "        \"\"\"Return the configured normalizer, or a default one when none was given.\"\"\"\n",
    "        if self.ticketstatsnormalizer is None:\n",
    "            return TicketStatsNormalizer()\n",
    "        return self.ticketstatsnormalizer\n",
    "\n",
    "    @staticmethod\n",
    "    def _usable_rows(Xadjusted):\n",
    "        \"\"\"Boolean array marking rows whose normalized survival rate is not NaN.\"\"\"\n",
    "        return ~np.isnan(np.array(Xadjusted.TicketStats_Survived, dtype=float))\n",
    "\n",
    "    def fit(self, X, y):\n",
    "        \"\"\"Fit the normalizer and the wrapped classifier; returns ``self``.\"\"\"\n",
    "        # Take each row's own contribution (1 passenger, its Sex flag, its target)\n",
    "        # out of the per-ticket totals before normalizing.\n",
    "        Xadjusted = X.assign(\n",
    "            TicketStats_Count=X.TicketStats_Count - 1,\n",
    "            TicketStats_Sex=X.TicketStats_Sex - X.Sex,\n",
    "            TicketStats_Survived=X.TicketStats_Survived - y,\n",
    "        )\n",
    "        Xadjusted = self._normalizer().fit_transform(Xadjusted)\n",
    "\n",
    "        mask = self._usable_rows(Xadjusted)\n",
    "\n",
    "        # Select the targets up front so an AttributeError raised inside\n",
    "        # classifier.fit is not mistaken for a y without .loc.\n",
    "        y_known = y.loc[mask] if hasattr(y, 'loc') else np.asarray(y)[mask]\n",
    "\n",
    "        self.classifier.fit(Xadjusted.loc[mask], y_known)\n",
    "\n",
    "        self.classes_ = [0, 1]\n",
    "\n",
    "        return self\n",
    "\n",
    "    def predict(self, X):\n",
    "        \"\"\"Predict a label per row; NaN where no ticket statistics are available.\"\"\"\n",
    "        Xadjusted = self._normalizer().transform(X)\n",
    "\n",
    "        mask = self._usable_rows(Xadjusted)\n",
    "\n",
    "        res = np.full(len(mask), np.nan)\n",
    "\n",
    "        if mask.any():\n",
    "            # Xadjusted is already normalized; normalizing it again would divide the\n",
    "            # rates by the ticket count a second time and no longer match what the\n",
    "            # classifier saw in fit().\n",
    "            res[mask] = self.classifier.predict(Xadjusted.loc[mask])\n",
    "\n",
    "        return res\n",
    "\n",
    "    def predict_proba(self, X):\n",
    "        \"\"\"Predict class probabilities per row; 0.5 / 0.5 where no ticket statistics exist.\"\"\"\n",
    "        Xadjusted = self._normalizer().transform(X)\n",
    "\n",
    "        mask = self._usable_rows(Xadjusted)\n",
    "\n",
    "        res = np.full((len(mask), 2), 0.5)\n",
    "\n",
    "        if mask.any():\n",
    "            # Same single normalization as in fit() and predict().\n",
    "            res[mask] = self.classifier.predict_proba(Xadjusted.loc[mask])\n",
    "\n",
    "        return res"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}