{ "cells": [ { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "import numpy as np\n", "import sklearn\n", "import matplotlib\n", "import matplotlib.pyplot as plt\n", "import scipy\n", "import seaborn as sns\n", "import os\n", "import re\n", "\n", "import sklearn_pandas\n", "import qgrid\n", "\n", "import math\n", "\n", "qgrid.set_grid_option('maxVisibleRows', 30)\n", "qgrid.set_grid_option('enableColumnReorder', True)\n", "\n", "%matplotlib inline" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "train=pd.read_csv('/Users/pawel/.kaggle/competitions/titanic/train.csv')\n", "test=pd.read_csv('/Users/pawel/.kaggle/competitions/titanic/test.csv')" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "alldata = pd.concat([train, test])" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "train.corr(method='spearman')" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "train.groupby(['Embarked','Pclass']).mean().Survived.unstack()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "train.info()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "train.groupby(['Sex', 'Pclass']).count().PassengerId.unstack()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from sklearn.pipeline import Pipeline\n", "from sklearn.pipeline import FeatureUnion\n", "from sklearn.preprocessing import Imputer\n", "from sklearn.preprocessing import FunctionTransformer\n", "from sklearn.preprocessing import MultiLabelBinarizer\n", "from sklearn.base import BaseEstimator, TransformerMixin, ClassifierMixin\n", "from category_encoders import OneHotEncoder" ] }, { "cell_type": "code", "execution_count": null, 
# %%
class CategoricalEncoder(BaseEstimator, TransformerMixin):
    """One-hot encode a categorical column, optionally keeping selected categories.

    Wraps ``category_encoders.OneHotEncoder`` (``handle_unknown='ignore'``,
    array output).

    Parameters
    ----------
    categories : sequence or None
        When given, ``transform`` returns only the one-hot columns of these
        categories, in the order listed here, so that the columns line up
        with ``get_feature_names()``. When ``None`` the full encoder output
        is returned.
    """

    def __init__(self, categories=None):
        self.categories = categories

    def _category_index(self):
        """Return the fitted encoder's (category, index) pairs for the first column."""
        # NOTE(review): relies on the category_encoders layout in which
        # ordinal_encoder.mapping[0]['mapping'] is a sequence of
        # (category, index) pairs - confirm against the installed version.
        return list(self.onehotencoder_.ordinal_encoder.mapping[0]['mapping'])

    def get_feature_names(self):
        """Names of the output columns: all fitted categories, or ``categories``."""
        if self.categories is None:
            return [category for category, _ in self._category_index()]
        return self.categories

    def fit(self, X, y=None):
        """Fit the underlying one-hot encoder on ``X`` and return ``self``."""
        try:
            columns = list(X.columns)
        except AttributeError:
            # Plain arrays carry no column labels; let the encoder choose.
            self.onehotencoder_ = OneHotEncoder(return_df=False, handle_unknown='ignore')
        else:
            self.onehotencoder_ = OneHotEncoder(cols=columns, return_df=False, handle_unknown='ignore')
        self.onehotencoder_.fit(X)

        # sklearn estimators must return self so fit(...).transform(...) chains.
        return self

    def transform(self, X):
        """One-hot encode ``X``; restrict to ``categories`` when they were given."""
        res = self.onehotencoder_.transform(X)

        if self.categories is None:
            return res

        # Pick the columns in the order of self.categories (not in the
        # encoder's own order) so they match get_feature_names().
        index_of = dict(self._category_index())
        idxs = [index_of[category] for category in self.categories if category in index_of]
        return res[:, idxs]

# %%
class TicketStats(BaseEstimator, TransformerMixin):
    """Per-ticket aggregates learned from the training passengers.

    For every ticket seen in ``fit`` it stores the number of survivors,
    the number of female passengers and the number of passengers. Rows whose
    ticket was not seen in ``fit`` get 0 for all three.

    Expects ``X`` to be a DataFrame with ``Ticket``, ``Sex`` and
    ``PassengerId`` columns.
    """

    def get_feature_names(self):
        return ['Survived', 'Sex', 'Count']

    def fit(self, X, y):
        """Aggregate survivors, females and passenger count per ticket."""
        is_female = (X.Sex == 'female').astype(float)
        labelled = X.assign(Sex=is_female, Survived=y)

        by_ticket = labelled.groupby('Ticket')
        # Aggregate only the two columns that are needed.
        sums = by_ticket[['Survived', 'Sex']].sum()
        self.ticketstats_ = sums.assign(Count=by_ticket.PassengerId.count())
        self.ticketstats_.columns = self.get_feature_names()

        return self

    def transform(self, X):
        """Look up the fitted statistics for each row's ticket (0 when unseen)."""
        # Only the join key is taken from X, so its own 'Sex' column cannot
        # collide with the aggregated one.
        joined = X[['Ticket']].join(self.ticketstats_, on='Ticket')
        return joined[self.get_feature_names()].fillna(0)

# %%
class CleanTitles(BaseEstimator, TransformerMixin):
    """Replace the ``Name`` column with a normalised honorific.

    The title is the text between ``', '`` and the first ``'.'`` of the name
    and is mapped through ``titlemap``. A title that is missing from
    ``titlemap`` raises ``KeyError``.
    """

    # Raw title (with trailing dot) -> normalised title group.
    titlemap = {
        'Capt.': 'Military',
        'Col.': 'Military',
        'Don.': 'Mr.',
        'Dona.': 'Mrs.',
        'Dr.': 'Dr.',
        'Jonkheer.': 'Mr.',
        'Lady.': 'Nobility',
        'Major.': 'Military',
        'Master.': 'Master.',
        'Miss.': 'Miss.',
        'Mlle.': 'Miss.',
        'Mme.': 'Mrs.',
        'Mr.': 'Mr.',
        'Mrs.': 'Mrs.',
        'Ms.': 'Miss.',
        'Rev.': 'Clergy',
        'Sir.': 'Nobility',
        'the Countess.': 'Nobility'
    }

    def fit(self, X, y=None):
        """Stateless; nothing is learned."""
        return self

    def transform(self, X):
        """Return ``X`` as an array with ``Name`` replaced by its title group."""
        def get_title(name):
            # '<surname>, <title>. <given names>' -> '<title>.'
            return self.titlemap[name.split(', ')[1].split('.')[0] + '.']

        # Series.map also copes with an empty frame, unlike np.vectorize.
        return X.assign(Name=X.Name.map(get_title)).values

# %%
class CategoricalImputer(BaseEstimator, TransformerMixin):
    """Impute missing values separately for each category.

    Operates on 2-D arrays whose first column holds the category label and
    whose remaining columns hold the values to impute.

    Parameters
    ----------
    categories : iterable
        Category labels that get their own imputer.
    imputer : estimator
        Prototype imputer; a fresh clone is fitted per category, plus one
        clone on all rows that is used as a fallback.
        NOTE(review): must expose ``statistics_`` after ``fit``, as
        sklearn's ``Imputer`` does.
    drop_categories : bool, default True
        When true the category column is removed from the output.
    """

    def __init__(self, categories, imputer, drop_categories=True):
        self.categories = categories
        self.imputer = imputer
        self.drop_categories = drop_categories

    def fit(self, X, y=None):
        """Fit one imputer per category present in ``X`` and a global fallback."""
        self.imputers_ = {}
        labels = X[:, 0]

        for category in self.categories:
            rows = np.asarray(labels == category)
            if rows.any():
                imp = sklearn.base.clone(self.imputer)
                imp.fit(X[rows, 1:])
                # Keep the imputer only if it learned at least one statistic;
                # otherwise the fallback imputer handles this category.
                if (~np.isnan(imp.statistics_)).any():
                    self.imputers_[category] = imp

        self.defaultimputer_ = sklearn.base.clone(self.imputer)
        self.defaultimputer_.fit(X[:, 1:])

        return self

    def transform(self, X):
        """Impute per category first, then fill what is left with the fallback."""
        X = X.copy()
        labels = X[:, 0]

        for category, imp in self.imputers_.items():
            rows = np.asarray(labels == category)
            if rows.any():
                X[rows, 1:] = imp.transform(X[rows, 1:])

        X[:, 1:] = self.defaultimputer_.transform(X[:, 1:])

        if self.drop_categories:
            return X[:, 1:]
        return X

# %%
class CabinFeatures(BaseEstimator, TransformerMixin):
    """Derive deck indicators and a (deck-wise imputed) cabin number from ``Cabin``.

    Output columns are one 0/1 indicator per entry of ``decks`` followed by
    the mean cabin number of the row. Missing cabin numbers are imputed by
    ``CategoricalImputer`` using the row's deck as the category.
    """

    decks = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'T']

    def get_feature_names(self):
        return self.decks + ['cabin']

    def _extract_decks(self, X):
        """Per row, the set of characters left after removing digits and spaces."""
        return [set(re.sub(r'\d+| ', '', cabin)) for cabin in X.Cabin.fillna('')]

    def _extract_numbers(self, X):
        """Per row, the mean of the cabin numbers, or NaN when there is none."""
        def mean_cabin(cabin):
            # Drop upper-case letters (and one following space), keep numbers.
            tokens = re.sub('[A-Z] ?', '', cabin).split(' ')
            numbers = np.array([int(token) for token in tokens if token != ''])
            if len(numbers):
                return numbers.mean()
            return np.nan

        return [mean_cabin(cabin) for cabin in X.Cabin.fillna('')]

    def _preprocess(self, X):
        """Build a frame with deck indicators, ``cabav`` and a single ``deck`` label."""
        classes = list(self.deckbinarizer_.classes_)
        deckonehot = self.deckbinarizer_.transform(self._extract_decks(X))

        df = pd.DataFrame(deckonehot, columns=classes).assign(cabav=self._extract_numbers(X))

        # When F appears together with other letters, keep only F.
        other_decks = [deck for deck in classes if deck != 'F']
        df.loc[df.F == 1, other_decks] = 0

        decks_per_row = self.deckbinarizer_.inverse_transform(df[classes].values)
        # One label per row; the string 'NaN' marks rows without any deck.
        df = df.assign(deck=[found[0] if len(found) else 'NaN' for found in decks_per_row])

        return df

    def fit(self, X, y=None):
        """Fit the deck binarizer and the per-deck cabin-number imputer."""
        self.deckbinarizer_ = MultiLabelBinarizer(classes=self.decks)
        self.deckbinarizer_.fit(self._extract_decks(X))

        df = self._preprocess(X)

        self.cabinimputer_ = CategoricalImputer(self.decks, Imputer())
        self.cabinimputer_.fit(df[['deck', 'cabav']].values)

        return self

    def transform(self, X):
        """Return the deck indicators plus the imputed mean cabin number as an array."""
        df = self._preprocess(X)

        cabav = self.cabinimputer_.transform(df[['deck', 'cabav']].values)

        # The imputer returns an (n, 1) array; assign it as a flat column.
        df = df.assign(cabav=cabav[:, 0])

        return df[list(self.deckbinarizer_.classes_) + ['cabav']].values

# %%
num_features = sklearn_pandas.gen_features([['SibSp'], ['Parch'], ['Fare']], [Imputer])

cat_features = [
    (['Pclass'], CategoricalEncoder()),
    (['Embarked'], CategoricalEncoder(categories=['C', 'Q', 'S'])),
    (['Sex'], CategoricalEncoder(categories=['female']))
    ]

ticket_features = [(['Ticket', 'Sex', 'PassengerId'], TicketStats(), {'alias': 'TicketStats'})]

cabin_features = [(['Cabin'], CabinFeatures(), {'alias': 'Deck'})]

title_features = [
    (['Name', 'Age'], [CleanTitles(), CategoricalImputer(set(CleanTitles.titlemap.values()), Imputer())], {'alias': 'Age'}),
    (['Name'], [CleanTitles(), CategoricalEncoder()], {'alias': 'Title'})
    ]

common_features = num_features + cat_features + cabin_features + title_features

data_pipe = sklearn_pandas.DataFrameMapper(common_features + ticket_features, df_out=True, input_df=True)

# Give the second mapper its own copies of the transformers. Reusing the same
# instances in both mappers would let fitting one mapper overwrite the fitted
# state used by the other. safe=False makes clone() deep-copy the
# non-estimator parts (column lists, option dicts) of each feature definition.
data_pipe_no_ticket = sklearn_pandas.DataFrameMapper(
    sklearn.base.clone(common_features, safe=False), df_out=True, input_df=True)

# %%
Xdf = data_pipe.fit_transform(train, train.Survived)
X = np.array(Xdf.values, dtype='float')
X.shape

# %%
ydf = train.Survived
y = ydf.values
y.shape

# %%
Xdf.head()

# %%
# The pickles below are written relative to this directory.
os.getcwd()

# %%
Xdf.to_pickle('Xtrain.pickle')
ydf.to_pickle('ytrain.pickle')
] }, { "cell_type": "markdown", "metadata": {}, "source": [ " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " " ] }, { "cell_type": "markdown", "metadata": {}, "source": [ " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " " ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# ¡¡¡SPOILER!!!" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " " ] }, { "cell_type": "markdown", "metadata": {}, "source": [ " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " " ] }, { "cell_type": "markdown", "metadata": {}, "source": [ " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " " ] }, { "cell_type": "markdown", "metadata": {}, "source": [ " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " " ] }, { "cell_type": "markdown", "metadata": {}, "source": [ " \n", " \n", " \n", " \n", " \n", " \n", " \n", 
" \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " " ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# ¡¡¡SPOILER!!!" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " " ] }, { "cell_type": "markdown", "metadata": {}, "source": [ " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " " ] }, { "cell_type": "markdown", "metadata": {}, "source": [ " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " " ] }, { "cell_type": "markdown", "metadata": {}, "source": [ " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " " ] }, { "cell_type": "markdown", "metadata": {}, "source": [ " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " " ] }, { "cell_type": "markdown", "metadata": {}, "source": [ " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " 
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " " ] }, { "cell_type": "markdown", "metadata": {}, "source": [ " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " " ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# ¡¡¡SPOILER!!!" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " " ] }, { "cell_type": "markdown", "metadata": {}, "source": [ " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " " ] }, { "cell_type": "markdown", "metadata": {}, "source": [ " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " " ] }, { "cell_type": "markdown", "metadata": {}, "source": [ " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " " ] }, { "cell_type": "markdown", "metadata": {}, "source": [ " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " " ] }, { "cell_type": 
"markdown", "metadata": {}, "source": [ " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " " ] }, { "cell_type": "markdown", "metadata": {}, "source": [ " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " " ] }, { "cell_type": "markdown", "metadata": {}, "source": [ " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " " ] }, { "cell_type": "markdown", "metadata": {}, "source": [ " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " " ] }, { "cell_type": "markdown", "metadata": {}, "source": [ " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " " ] }, { "cell_type": "markdown", "metadata": {}, "source": [ " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " " ] }, { "cell_type": "markdown", "metadata": {}, "source": [ " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", 
# %%
class TicketStatsNormalizer(BaseEstimator, TransformerMixin):
    """Turn per-ticket sums into per-ticket rates and fill rows without data.

    ``TicketStats_Sex`` and ``TicketStats_Survived`` are divided by
    ``TicketStats_Count``. Rows whose count is 0 have no rate and are set to
    NaN, then filled according to ``imputer_strategy``.

    Parameters
    ----------
    imputer_strategy : None, Imputer, estimator or real number
        * ``None`` - leave the NaNs in place.
        * ``Imputer`` - fitted on, and applied to, the two rate columns.
        * any other ``BaseEstimator`` - fitted on the whole normalized frame;
          ``transform`` returns that estimator's output directly.
        * a real number - used as the fill value for rows whose count is 0.
    """

    # Columns that hold per-ticket sums and are converted into rates.
    _RATE_COLUMNS = ['TicketStats_Sex', 'TicketStats_Survived']

    def __init__(self, imputer_strategy=None):
        self.imputer_strategy = imputer_strategy

    def _normalize(self, X):
        """Return a copy of ``X`` with sums divided by count (NaN where count is 0)."""
        Xadjusted = X.copy()
        has_data = Xadjusted.TicketStats_Count != 0

        Xadjusted.loc[~has_data, self._RATE_COLUMNS] = np.nan
        for column in self._RATE_COLUMNS:
            Xadjusted.loc[has_data, column] /= Xadjusted.loc[has_data, 'TicketStats_Count']

        return Xadjusted

    def fit(self, X, y=None):
        """Fit ``imputer_strategy`` on the normalized data when it is an estimator."""
        if isinstance(self.imputer_strategy, BaseEstimator):
            Xadjusted = self._normalize(X)

            if isinstance(self.imputer_strategy, Imputer):
                self.imputer_strategy.fit(Xadjusted.loc[:, self._RATE_COLUMNS], y)
            else:
                self.imputer_strategy.fit(Xadjusted, y)

        return self

    def transform(self, X):
        """Normalize ``X`` and fill missing rates according to ``imputer_strategy``."""
        Xadjusted = self._normalize(X)
        strategy = self.imputer_strategy

        if strategy is None:
            return Xadjusted

        if isinstance(strategy, Imputer):
            Xadjusted.loc[:, self._RATE_COLUMNS] = strategy.transform(Xadjusted.loc[:, self._RATE_COLUMNS])
        elif isinstance(strategy, BaseEstimator):
            return strategy.transform(Xadjusted)
        elif np.isreal(strategy):
            Xadjusted.loc[Xadjusted.TicketStats_Count == 0, self._RATE_COLUMNS] = strategy

        return Xadjusted


class TicketStatsClassifier(BaseEstimator, ClassifierMixin):
    """Binary classifier that consumes leave-one-out ticket statistics.

    During ``fit`` each training row's own contribution is subtracted from
    its ticket statistics (count - 1, sex sum - own ``Sex``, survivor sum -
    own label) before normalization, so a passenger's label does not leak
    into their own features. Rows that still have no usable survival rate
    after normalization (NaN) are not used for training, and at prediction
    time they receive NaN from ``predict`` and ``[0.5, 0.5]`` from
    ``predict_proba``.

    Parameters
    ----------
    classifier : estimator
        Wrapped classifier; fitted in place.
    ticketstatsnormalizer : TicketStatsNormalizer or None
        Normalizer applied before the classifier; fitted in place. ``None``
        uses a default ``TicketStatsNormalizer()`` (no imputation).
    """

    def __init__(self, classifier, ticketstatsnormalizer=None):
        self.classifier = classifier
        self.ticketstatsnormalizer = ticketstatsnormalizer

    def _normalizer(self):
        """Return the configured normalizer, or a lazily created default one."""
        if self.ticketstatsnormalizer is not None:
            return self.ticketstatsnormalizer
        # The default normalizer has no imputer and therefore no fitted
        # state, so creating it on first use is safe before or after fit().
        if not hasattr(self, 'normalizer_'):
            self.normalizer_ = TicketStatsNormalizer()
        return self.normalizer_

    def _normalized_with_mask(self, X):
        """Normalize ``X`` once and flag the rows that have a survival rate."""
        Xadjusted = self._normalizer().transform(X)
        mask = ~np.isnan(np.array(Xadjusted.TicketStats_Survived, dtype=float))
        return Xadjusted, mask

    def fit(self, X, y):
        """Fit the normalizer and the classifier on leave-one-out statistics.

        NOTE(review): ``X`` must provide ``Sex`` and the ``TicketStats_*``
        columns; if ``y`` is a pandas Series it is aligned to ``X`` by index
        in the subtraction below - confirm callers pass matching indexes.
        """
        Xadjusted = X.assign(
            TicketStats_Count=X.TicketStats_Count - 1,
            TicketStats_Sex=X.TicketStats_Sex - X.Sex,
            TicketStats_Survived=X.TicketStats_Survived - y)
        Xadjusted = self._normalizer().fit_transform(Xadjusted)

        mask = ~np.isnan(np.array(Xadjusted.TicketStats_Survived, dtype=float))

        # Choose the indexing style up front instead of catching
        # AttributeError around classifier.fit, which would also hide
        # AttributeErrors raised inside the classifier itself.
        if hasattr(y, 'loc'):
            y_used = y.loc[mask]
        else:
            y_used = np.asarray(y)[mask]

        self.classifier.fit(Xadjusted.loc[mask], y_used)

        self.classes_ = [0, 1]

        return self

    def predict(self, X):
        """Predict labels; rows without a usable survival rate get NaN."""
        Xadjusted, mask = self._normalized_with_mask(X)

        res = np.full(len(mask), np.nan)

        # X is normalized exactly once, matching what the classifier saw in
        # fit(); normalizing the selected rows again would divide the rates
        # by the ticket count a second time.
        if mask.any():
            res[mask] = self.classifier.predict(Xadjusted.loc[mask])

        return res

    def predict_proba(self, X):
        """Predict class probabilities; rows without a usable rate get 0.5/0.5."""
        Xadjusted, mask = self._normalized_with_mask(X)

        res = np.full((len(mask), 2), 0.5)

        if mask.any():
            res[mask] = self.classifier.predict_proba(Xadjusted.loc[mask])

        return res

# %%