In [None]:
import pandas as pd
import numpy as np
import sklearn
import matplotlib
import matplotlib.pyplot as plt
import scipy
import seaborn as sns
import os
import re

import sklearn_pandas
import qgrid

import math

# Notebook-wide qgrid options: 'maxVisibleRows' is set to 30 and
# 'enableColumnReorder' is switched on for every grid created afterwards.
qgrid.set_grid_option('maxVisibleRows', 30)
qgrid.set_grid_option('enableColumnReorder', True)

%matplotlib inline

In [None]:
# Directory holding the Kaggle Titanic CSV files. It can be overridden with the
# TITANIC_DATA_DIR environment variable; the default resolves to the Kaggle
# download folder under the current user's home directory, so the notebook no
# longer depends on one specific user's absolute path.
DATA_DIR = os.environ.get(
    'TITANIC_DATA_DIR',
    os.path.expanduser('~/.kaggle/competitions/titanic'),
)

train = pd.read_csv(os.path.join(DATA_DIR, 'train.csv'))
test = pd.read_csv(os.path.join(DATA_DIR, 'test.csv'))

In [None]:
# Stack the train and test rows into one frame. No ignore_index is passed, so
# each row keeps the index label it had in its source frame.
alldata = pd.concat([train, test])

In [None]:
# Spearman rank correlation between the numeric columns of the training set.
# Numeric columns are selected explicitly: older pandas dropped non-numeric
# columns silently, newer releases raise on them instead.
train.select_dtypes(include=[np.number]).corr(method='spearman')

In [None]:
# Mean of Survived per (Embarked, Pclass), pivoted so Pclass becomes columns.
# Survived is selected before aggregating, so string columns such as Name or
# Ticket are never averaged (newer pandas raises on them).
train.groupby(['Embarked', 'Pclass']).Survived.mean().unstack()

In [None]:
# Print pandas' summary of the training frame (DataFrame.info()).
train.info()

In [None]:
# Number of non-null PassengerId values per (Sex, Pclass), with Pclass pivoted
# into columns.
passengers_by_group = train.groupby(['Sex', 'Pclass'])['PassengerId']
passengers_by_group.count().unstack()

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.pipeline import FeatureUnion
from sklearn.preprocessing import Imputer
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.base import BaseEstimator, TransformerMixin, ClassifierMixin
from category_encoders import OneHotEncoder

In [None]:
class CategoricalEncoder(BaseEstimator, TransformerMixin):
    """One-hot encode a column, optionally keeping only selected categories.

    Wraps ``category_encoders.OneHotEncoder`` (configured with
    ``return_df=False`` and ``handle_unknown='ignore'``).

    Parameters
    ----------
    categories : iterable or None
        When given, ``transform`` returns only the encoder output columns that
        belong to these categories, in the order of ``categories`` so that the
        columns line up with ``get_feature_names``. When ``None`` the full
        encoder output is returned.

    Notes
    -----
    Category lookup relies on ``onehotencoder_.ordinal_encoder.mapping[0]
    ['mapping']`` being an iterable of ``(category, index)`` pairs whose
    ``index`` is a valid column position in the encoder output.
    NOTE(review): this depends on the installed category_encoders version;
    confirm against that version's ordinal-encoder mapping.
    """

    def __init__(self, categories=None):
        self.categories = categories

    def _category_mapping(self):
        """Return the fitted ``(category, index)`` pairs of the first encoded column."""
        return self.onehotencoder_.ordinal_encoder.mapping[0]['mapping']

    def get_feature_names(self):
        """Return the names of the output columns.

        These are the fitted categories when ``categories`` is ``None``,
        otherwise ``categories`` itself.
        """
        if self.categories is None:
            return [category for category, _ in self._category_mapping()]
        return self.categories

    def fit(self, X, y=None):
        """Fit the underlying one-hot encoder on ``X`` and return ``self``."""
        encoder_kwargs = dict(return_df=False, handle_unknown='ignore')
        # Only DataFrame-like inputs expose column names; for anything else
        # the encoder is left to choose the columns itself, exactly as when
        # ``cols`` is not passed.
        if hasattr(X, 'columns'):
            encoder_kwargs['cols'] = list(X.columns)

        self.onehotencoder_ = OneHotEncoder(**encoder_kwargs)
        self.onehotencoder_.fit(X)

        # Returning self is required for TransformerMixin.fit_transform, which
        # chains ``fit(...).transform(...)``.
        return self

    def transform(self, X):
        """Encode ``X``; restrict to ``categories`` when they were given."""
        res = self.onehotencoder_.transform(X)

        if self.categories is None:
            return res

        index_of = dict(self._category_mapping())
        # Walk the requested categories (not the fitted mapping) so column k
        # of the result corresponds to entry k of get_feature_names().
        # Categories never seen during fit have no column and are skipped, as
        # before.
        idxs = [index_of[c] for c in self.categories if c in index_of]
        return res[:, idxs]

In [None]:
class TicketStats(BaseEstimator, TransformerMixin):
    """Per-ticket aggregates learned from the fitting data.

    ``fit`` expects a DataFrame with ``Ticket``, ``Sex`` and ``PassengerId``
    columns plus the target ``y``. For every ticket it stores:

    * ``Survived`` - sum of ``y`` over the ticket's rows,
    * ``Sex``      - number of rows whose ``Sex`` equals ``'female'``,
    * ``Count``    - number of non-null ``PassengerId`` values.

    ``transform`` looks these values up by ``Ticket``; tickets that were not
    present during ``fit`` get 0 in all three columns.
    """

    def get_feature_names(self):
        """Return the output column names, in output order."""
        return ['Survived', 'Sex', 'Count']

    def fit(self, X, y):
        """Compute the per-ticket statistics from ``X`` and ``y``; return ``self``."""
        is_female = np.where(X.Sex == 'female', 1.0, 0.0)
        per_passenger = X.assign(Sex=is_female, Survived=y)

        # Group once and aggregate only the columns that are needed; summing
        # the whole frame would also try to add up unrelated columns.
        by_ticket = per_passenger.groupby('Ticket')
        stats = by_ticket[['Survived', 'Sex']].sum()
        stats = stats.assign(Count=by_ticket['PassengerId'].count())
        stats.columns = self.get_feature_names()

        self.ticketstats_ = stats
        return self

    def transform(self, X):
        """Return the fitted statistics for each row's ticket (0 where unknown)."""
        # Joining only the key column avoids name clashes between X's own
        # columns (e.g. Sex) and the statistics columns.
        looked_up = X[['Ticket']].join(self.ticketstats_, on='Ticket')
        return looked_up[self.get_feature_names()].fillna(0)

In [None]:
class CleanTitles(BaseEstimator, TransformerMixin):
    """Replace the ``Name`` column by a normalized title.

    The raw title is the text between the first ``', '`` and the following
    ``'.'`` in the name (the dot is kept), and is translated through
    ``titlemap``. A raw title missing from ``titlemap`` raises ``KeyError``.
    """

    # Raw title (with trailing dot) -> normalized title group.
    titlemap = {
        'Capt.': 'Military',
        'Col.': 'Military',
        'Don.': 'Mr.',
        'Dona.': 'Mrs.',
        'Dr.': 'Dr.',
        'Jonkheer.': 'Mr.',
        'Lady.': 'Nobility',
        'Major.': 'Military',
        'Master.': 'Master.',
        'Miss.': 'Miss.',
        'Mlle.': 'Miss.',
        'Mme.': 'Mrs.',
        'Mr.': 'Mr.',
        'Mrs.': 'Mrs.',
        'Ms.': 'Miss.',
        'Rev.': 'Clergy',
        'Sir.': 'Nobility',
        'the Countess.': 'Nobility'
    }

    def _title_of(self, name):
        """Return the normalized title for one name string."""
        raw_title = name.split(', ')[1].split('.')[0] + '.'
        return self.titlemap[raw_title]

    def fit(self, X, y=None):
        """Stateless transformer: nothing to learn."""
        return self

    def transform(self, X):
        """Return ``X`` as an array with ``Name`` replaced by the normalized title.

        ``X`` must be a DataFrame with a ``Name`` column; any other columns
        are passed through unchanged.
        """
        # A plain comprehension also works for an empty frame, where
        # np.vectorize without otypes raises.
        titles = [self._title_of(name) for name in X.Name]

        # ``.values`` replaces DataFrame.as_matrix(), which newer pandas no
        # longer provides.
        return X.assign(Name=titles).values

In [None]:
class CategoricalImputer(BaseEstimator, TransformerMixin):
    """Impute values separately for each category found in column 0.

    The input is a 2-D array whose first column holds the category label and
    whose remaining columns are the values to impute. A clone of ``imputer``
    is fitted per category listed in ``categories``; a further clone fitted on
    all rows fills whatever the per-category imputers leave untouched.

    Parameters
    ----------
    categories : iterable
        Category labels that get their own imputer.
    imputer : estimator
        Template imputer; it is cloned, never fitted directly. The fitted
        clones must expose a ``statistics_`` array.
    drop_categories : bool, default True
        If true, ``transform`` returns only the value columns.
    """

    def __init__(self, categories, imputer, drop_categories=True):
        self.categories = categories
        self.imputer = imputer
        self.drop_categories = drop_categories

    def fit(self, X, y=None):
        """Fit one imputer per populated category plus a global fallback."""
        self.imputers_ = {}

        for category in self.categories:
            rows = X[X[:, 0] == category]
            if len(rows) == 0:
                continue

            candidate = sklearn.base.clone(self.imputer)
            candidate.fit(rows[:, 1:])
            # Keep the imputer only if at least one learned statistic is not NaN.
            if not np.isnan(candidate.statistics_).all():
                self.imputers_[category] = candidate

        fallback = sklearn.base.clone(self.imputer)
        fallback.fit(X[:, 1:])
        self.defaultimputer_ = fallback

        return self

    def transform(self, X):
        """Impute the value columns of a copy of ``X``."""
        filled = X.copy()

        for category, category_imputer in self.imputers_.items():
            in_category = filled[:, 0] == category
            if np.any(in_category):
                filled[in_category, 1:] = category_imputer.transform(filled[in_category, 1:])

        # Second pass over every row with the imputer fitted on all rows.
        filled[:, 1:] = self.defaultimputer_.transform(filled[:, 1:])

        return filled[:, 1:] if self.drop_categories else filled

In [None]:
class CabinFeatures(BaseEstimator, TransformerMixin):
    """Turn the ``Cabin`` string into deck indicators plus a cabin number.

    Output columns are one 0/1 indicator per entry of ``decks`` followed by
    the mean of the cabin numbers found in the string. Missing cabin numbers
    are imputed with a ``CategoricalImputer`` keyed on the deck.
    """

    decks = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'T']

    def get_feature_names(self):
        """Return the output column names (deck letters followed by 'cabin')."""
        return self.decks + ['cabin']

    def _extract_decks(self, X):
        """Return, per row, the set of characters left after removing digits and spaces."""
        return [set(re.sub(r'\d+| ', '', c)) for c in X.Cabin.fillna('')]

    def _extract_numbers(self, X):
        """Return, per row, the mean of the cabin numbers, or NaN if there are none."""
        def mean_cabin(c):
            tokens = re.sub(r'[A-Z] ?', '', c).split(' ')
            arr = np.array([int(x) for x in tokens if x != ''])
            return arr.mean() if len(arr) else np.nan

        return [mean_cabin(c) for c in X.Cabin.fillna('')]

    def _preprocess(self, X):
        """Build a frame with deck indicators, ``cabav`` and a single ``deck`` label."""
        classes = list(self.deckbinarizer_.classes_)
        deckonehot = self.deckbinarizer_.transform(self._extract_decks(X))
        cabav = self._extract_numbers(X)

        df = pd.DataFrame(deckonehot, columns=classes).assign(cabav=cabav)

        # Rows flagged with deck F keep only the F indicator.
        df.loc[df.F == 1, [d for d in classes if d != 'F']] = 0

        # ``.values`` replaces DataFrame.as_matrix(), which newer pandas no
        # longer provides.
        deck = self.deckbinarizer_.inverse_transform(df[classes].values)
        # Rows without any deck get the placeholder label 'NaN'.
        df = df.assign(deck=[d[0] if len(d) else 'NaN' for d in deck])

        return df

    def fit(self, X, y=None):
        """Fit the deck binarizer and the per-deck cabin-number imputer."""
        self.deckbinarizer_ = MultiLabelBinarizer(classes=self.decks)
        self.deckbinarizer_.fit(self._extract_decks(X))

        df = self._preprocess(X)

        self.cabinimputer_ = CategoricalImputer(self.decks, Imputer())
        self.cabinimputer_.fit(df[['deck', 'cabav']].values)

        return self

    def transform(self, X):
        """Return the deck indicators and the imputed cabin number as an array."""
        df = self._preprocess(X)

        cabav = self.cabinimputer_.transform(df[['deck', 'cabav']].values)

        # The imputer returns a single-column 2-D array; flatten it so it is
        # assigned as an ordinary column.
        df = df.assign(cabav=np.ravel(cabav))

        return df[list(self.deckbinarizer_.classes_) + ['cabav']].values

In [None]:
# Numeric columns: each one goes through its own Imputer.
num_features = sklearn_pandas.gen_features(
    [['SibSp'], ['Parch'], ['Fare']],
    [Imputer],
)

# Categorical columns, one-hot encoded; Embarked and Sex keep only the listed
# categories.
cat_features = [
    (['Pclass'], CategoricalEncoder()),
    (['Embarked'], CategoricalEncoder(categories=['C', 'Q', 'S'])),
    (['Sex'], CategoricalEncoder(categories=['female'])),
]

# Per-ticket statistics; output columns are prefixed via the alias.
ticket_features = [
    (['Ticket', 'Sex', 'PassengerId'], TicketStats(), {'alias': 'TicketStats'}),
]

# Deck indicators and cabin number derived from the Cabin string.
cabin_features = [
    (['Cabin'], CabinFeatures(), {'alias': 'Deck'}),
]

# Title-based features: Age imputed per normalized title, and the title itself
# one-hot encoded.
title_features = [
    (
        ['Name', 'Age'],
        [CleanTitles(), CategoricalImputer(set(CleanTitles.titlemap.values()), Imputer())],
        {'alias': 'Age'},
    ),
    (
        ['Name'],
        [CleanTitles(), CategoricalEncoder()],
        {'alias': 'Title'},
    ),
]

# NOTE(review): both mappers are built from the same feature lists and thus
# receive the same transformer instances - confirm whether fitting one mapper
# is meant to affect the other.
data_pipe = sklearn_pandas.DataFrameMapper(
    num_features + cat_features + cabin_features + title_features + ticket_features,
    df_out=True,
    input_df=True,
)
data_pipe_no_ticket = sklearn_pandas.DataFrameMapper(
    num_features + cat_features + cabin_features + title_features,
    df_out=True,
    input_df=True,
)

In [None]:
# Fit the full feature pipeline on the training set and materialize the result
# both as a DataFrame (Xdf) and as a float array (X).
Xdf = data_pipe.fit_transform(train, train.Survived)
# ``.values`` replaces DataFrame.as_matrix(), which newer pandas no longer provides.
X = np.array(Xdf.values, dtype='float')
X.shape

In [None]:
# Target vector, as a Series (ydf) and as a plain array (y).
ydf = train.Survived
# ``.values`` replaces Series.as_matrix(), which newer pandas no longer provides.
y = ydf.values
y.shape

In [None]:
# Preview the first rows of the transformed feature frame.
Xdf.head()

In [None]:
# Show the current working directory; the pickle files written below use
# relative paths and therefore end up here.
os.getcwd()

In [None]:
# Persist the transformed features and the target to the working directory.
Xdf.to_pickle('Xtrain.pickle')
ydf.to_pickle('ytrain.pickle')

# ¡¡¡SPOILER!!!

# ¡¡¡SPOILER!!!

# ¡¡¡SPOILER!!!

# ¡¡¡SPOILER!!!

In [None]:
class TicketStatsNormalizer(BaseEstimator, TransformerMixin):
    """Convert per-ticket sums into per-ticket ratios.

    ``TicketStats_Sex`` and ``TicketStats_Survived`` are divided by
    ``TicketStats_Count``; rows whose count is 0 get NaN in both columns.
    What happens to those NaNs afterwards depends on ``imputer_strategy``:

    * ``None``            - they are left as NaN;
    * an ``Imputer``      - it is fitted on / applied to the two ratio columns;
    * another estimator   - it is fitted on the whole normalized frame and its
      ``transform`` output is returned as-is;
    * a real number       - rows with count 0 are filled with that value.
    """

    def __init__(self, imputer_strategy=None):
        self.imputer_strategy = imputer_strategy

    def _normalize(self, X):
        """Return a copy of ``X`` with the two sum columns turned into ratios."""
        frame = X.copy()

        count_is_zero = frame.TicketStats_Count == 0
        count_not_zero = frame.TicketStats_Count != 0

        frame.loc[count_is_zero, ['TicketStats_Sex', 'TicketStats_Survived']] = np.nan
        for column in ('TicketStats_Sex', 'TicketStats_Survived'):
            frame.loc[count_not_zero, column] /= frame.loc[count_not_zero, 'TicketStats_Count']

        return frame

    def fit(self, X, y=None):
        """Fit ``imputer_strategy`` when it is an estimator; otherwise do nothing."""
        strategy = self.imputer_strategy
        if not isinstance(strategy, BaseEstimator):
            return self

        normalized = self._normalize(X)
        if isinstance(strategy, Imputer):
            strategy.fit(normalized.loc[:, ['TicketStats_Sex', 'TicketStats_Survived']], y)
        else:
            strategy.fit(normalized, y)

        return self

    def transform(self, X):
        """Normalize ``X`` and apply the configured imputation strategy."""
        normalized = self._normalize(X)
        strategy = self.imputer_strategy

        if strategy is None:
            return normalized

        if isinstance(strategy, Imputer):
            normalized.loc[:, ['TicketStats_Sex', 'TicketStats_Survived']] = strategy.transform(
                normalized.loc[:, ['TicketStats_Sex', 'TicketStats_Survived']]
            )
        elif isinstance(strategy, BaseEstimator):
            # A generic estimator takes over completely; its output is returned directly.
            return strategy.transform(normalized)
        elif np.isreal(strategy):
            normalized.loc[normalized.TicketStats_Count==0, ['TicketStats_Sex', 'TicketStats_Survived']] = strategy

        return normalized
 

class TicketStatsClassifier(BaseEstimator, ClassifierMixin):
    """Classifier wrapper that works on normalized ticket statistics.

    Parameters
    ----------
    classifier : estimator
        Wrapped classifier; it is fitted and queried on the normalized frame.
    ticketstatsnormalizer : transformer or None
        Transformer applied to the ticket statistics. When ``None`` a default
        ``TicketStatsNormalizer()`` is created during ``fit``.

    During ``fit`` each row's own contribution is removed from the ticket
    statistics (count - 1, sex sum - ``X.Sex``, survived sum - ``y``) before
    normalizing. Rows whose normalized ``TicketStats_Survived`` is NaN are
    excluded from training; at prediction time such rows yield NaN
    (``predict``) or ``[0.5, 0.5]`` (``predict_proba``).
    """

    def __init__(self, classifier, ticketstatsnormalizer=None):
        self.classifier = classifier
        self.ticketstatsnormalizer = ticketstatsnormalizer

    def _normalizer(self):
        """Return the normalizer used in ``fit``, falling back to the constructor argument."""
        fitted = getattr(self, 'ticketstatsnormalizer_', None)
        return fitted if fitted is not None else self.ticketstatsnormalizer

    def _normalize_for_prediction(self, X):
        """Return the normalized frame and the mask of rows that can be predicted."""
        Xadjusted = self._normalizer().transform(X)
        mask = ~np.isnan(np.array(Xadjusted.TicketStats_Survived, dtype=float))
        return Xadjusted, mask

    def fit(self, X, y):
        """Fit the normalizer and the wrapped classifier; return ``self``."""
        normalizer = self.ticketstatsnormalizer
        if normalizer is None:
            # The constructor default would otherwise fail on the first call.
            normalizer = TicketStatsNormalizer()

        # Leave-one-out adjustment: remove each row's own values from the
        # per-ticket sums it is part of.
        Xadjusted = X.assign(
            TicketStats_Count=X.TicketStats_Count - 1,
            TicketStats_Sex=X.TicketStats_Sex - X.Sex,
            TicketStats_Survived=X.TicketStats_Survived - y,
        )
        Xadjusted = normalizer.fit_transform(Xadjusted)
        self.ticketstatsnormalizer_ = normalizer

        mask = ~np.isnan(np.array(Xadjusted.TicketStats_Survived, dtype=float))

        # Select the labels before calling fit, so an AttributeError raised
        # inside the classifier is not mistaken for "y has no .loc" and does
        # not trigger a second fit.
        y_known = y.loc[mask] if hasattr(y, 'loc') else np.asarray(y)[mask]
        self.classifier.fit(Xadjusted.loc[mask], y_known)

        self.classes_ = [0, 1]

        return self

    def predict(self, X):
        """Predict labels; rows without usable ticket statistics get NaN."""
        Xadjusted, mask = self._normalize_for_prediction(X)

        res = np.full(len(mask), np.nan)
        if mask.any():
            # The frame is already normalized. Normalizing a second time would
            # divide the ratios by the count again and no longer match the
            # features the classifier was trained on.
            res[mask] = self.classifier.predict(Xadjusted.loc[mask])

        return res

    def predict_proba(self, X):
        """Predict class probabilities; unusable rows get 0.5 for both classes."""
        Xadjusted, mask = self._normalize_for_prediction(X)

        res = np.full((len(mask), 2), 0.5)
        if mask.any():
            # See predict(): the normalized frame is passed through unchanged.
            res[mask] = self.classifier.predict_proba(Xadjusted.loc[mask])

        return res