#Load important packages
import numpy as np
import pandas as pd
pd.set_option('display.max_columns', None)
from sklearn.preprocessing import StandardScaler
from sklearn.impute import KNNImputer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import QuantileTransformer
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA, FactorAnalysis
import re
import matplotlib.pyplot as plt
Overview
My secret sauce for a top score in the Titanic competition: integrating several tools and techniques that I have picked up recently.
The secret sauce is:
Extensive feature engineering, including target encoding, frequency encoding, regular expressions, and more.
Factor analysis and quantile transformation of the numerical variables.
Feature selection with BorutaShap.
Handling the imbalanced dataset using SMOTE.
Utilizing a Keras classifier and tuning it with Keras Tuner. Yep, DNNs rock on tabular data.
#Load the data
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
#Correct the type of the following columns
for col in ['Sex', 'Cabin', 'Ticket', 'Embarked']:
    train[col] = train[col].astype('category')
    test[col] = test[col].astype('category')
#Extract the target
target = train['Survived']
#Combine the train and test sets for easier EDA and feature engineering
combined = pd.concat([train, test])
#Check the shapes of the dataframes
print('train shape:',train.shape, '\ntest shape:', test.shape, '\ncombined shape:', combined.shape,
'\n**************************')
combined.head()
train shape: (891, 12)
test shape: (418, 11)
combined shape: (1309, 12)
**************************
PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 0.0 | 3 | Braund, Mr. Owen Harris | male | 22.0 | 1 | 0 | A/5 21171 | 7.2500 | NaN | S |
1 | 2 | 1.0 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | female | 38.0 | 1 | 0 | PC 17599 | 71.2833 | C85 | C |
2 | 3 | 1.0 | 3 | Heikkinen, Miss. Laina | female | 26.0 | 0 | 0 | STON/O2. 3101282 | 7.9250 | NaN | S |
3 | 4 | 1.0 | 1 | Futrelle, Mrs. Jacques Heath (Lily May Peel) | female | 35.0 | 1 | 0 | 113803 | 53.1000 | C123 | S |
4 | 5 | 0.0 | 3 | Allen, Mr. William Henry | male | 35.0 | 0 | 0 | 373450 | 8.0500 | NaN | S |
%%capture
#Install category_encoders for target encoding
!pip install category_encoders
#Target encode the following features
from category_encoders import TargetEncoder
encoder = TargetEncoder()

combined['Sex_TE'] = encoder.fit_transform(combined.iloc[:891].Sex, target)
combined['Cabin_TE'] = encoder.fit_transform(combined.iloc[:891].Cabin, target)
combined['Ticket_TE'] = encoder.fit_transform(combined.iloc[:891].Ticket, target)
combined['Embarked_TE'] = encoder.fit_transform(combined.iloc[:891].Embarked, target)
#Transform the Sex column to numerical
combined['Sex'] = combined['Sex'].map({'male': 0, 'female': 1})
combined['Sex'] = combined['Sex'].astype(int)
#Extract Titles
combined['Title'] = combined['Name'].str.extract(r'([A-Za-z]+)\.', expand=True)
#Replace rare titles
mapping = {'Mlle': 'Miss', 'Ms': 'Miss', 'Mme': 'Mrs', 'Major': 'Other',
           'Col': 'Other', 'Dr': 'Other', 'Rev': 'Other', 'Capt': 'Other',
           'Jonkheer': 'Royal', 'Sir': 'Royal', 'Lady': 'Royal',
           'Don': 'Royal', 'Countess': 'Royal', 'Dona': 'Royal'}
combined.replace({'Title': mapping}, inplace=True)
#Target encoding the Title
combined['Title_for_te'] = combined['Title'].astype('category')
combined['Title_TE'] = encoder.fit_transform(combined.iloc[:891].Title_for_te, target)
combined.drop(columns=['Title_for_te'], inplace=True)
#Create new feature - is married
combined['Is_Married'] = 0
combined.loc[combined['Title'] == 'Mrs', 'Is_Married'] = 1
#Create a new feature of name length - longer names are usually given to higher-class people
combined["Name_Length"] = combined.Name.str.replace("[^a-zA-Z]", "", regex=True).str.len()
#Label encode the Title column
title_dict = {'Mr': 1, 'Miss': 2, 'Mrs': 3, 'Other': 4, 'Royal': 5, 'Master': 6}
combined['Title'] = combined['Title'].map(title_dict).astype('int')
#Create a feature of Family size
combined['Family_Size'] = combined['Parch'] + combined['SibSp'] + 1
#Create a feature of Family size category
combined['Fsize_Cat'] = combined['Family_Size'].map(lambda val: 'Alone' if val <= 1 else ('Small' if val < 5 else 'Big'))
Fsize_dict = {'Alone': 3, 'Small': 2, 'Big': 1}
combined['Fsize_Cat'] = combined['Fsize_Cat'].map(Fsize_dict).astype('int')
#Extract the Surname
combined['Surname'] = combined['Name'].str.extract(r'([A-Za-z]+.[A-Za-z]+)\,', expand=True)
#Create a family survival rate feature
MEAN_SURVIVAL_RATE = round(np.mean(train['Survived']), 4)

combined['Family_Friends_Surv_Rate'] = MEAN_SURVIVAL_RATE
combined['Surv_Rate_Invalid'] = 1

#First pass: passengers sharing a surname and fare are treated as one family
for _, grp_df in combined[['Survived', 'Surname', 'Fare', 'Ticket', 'PassengerId']].groupby(['Surname', 'Fare']):
    if (len(grp_df) > 1):
        if (grp_df['Survived'].isnull().sum() != len(grp_df)):
            for ind, row in grp_df.iterrows():
                combined.loc[combined['PassengerId'] == row['PassengerId'],
                             'Family_Friends_Surv_Rate'] = round(grp_df['Survived'].mean(), 4)
                combined.loc[combined['PassengerId'] == row['PassengerId'],
                             'Surv_Rate_Invalid'] = 0
#Second pass: passengers sharing a ticket are treated as one travel group
for _, grp_df in combined[['Survived', 'Surname', 'Fare', 'Ticket', 'PassengerId', 'Family_Friends_Surv_Rate']].groupby('Ticket'):
    if (len(grp_df) > 1):
        for ind, row in grp_df.iterrows():
            if (row['Family_Friends_Surv_Rate'] == 0.) | (row['Family_Friends_Surv_Rate'] == MEAN_SURVIVAL_RATE):
                if (grp_df['Survived'].isnull().sum() != len(grp_df)):
                    combined.loc[combined['PassengerId'] == row['PassengerId'],
                                 'Family_Friends_Surv_Rate'] = round(grp_df['Survived'].mean(), 4)
                    combined.loc[combined['PassengerId'] == row['PassengerId'],
                                 'Surv_Rate_Invalid'] = 0
#Clean the Cabin column
combined['Cabin'] = combined['Cabin'].astype('category')
combined['Cabin'] = combined['Cabin'].cat.add_categories('U')
combined['Cabin_Clean'] = combined['Cabin'].fillna('U')
combined['Cabin_Clean'] = combined['Cabin_Clean'].str.strip(' ').str[0]
# Label Encoding of the Cabin
cabin_dict = {'A': 9, 'B': 8, 'C': 7, 'D': 6, 'E': 5, 'F': 4, 'G': 3, 'T': 2, 'U': 1}
combined['Cabin_Clean'] = combined['Cabin_Clean'].map(cabin_dict).astype('int')
#Target encoding of the Cleaned cabin column
combined['Cabin_for_te'] = combined['Cabin_Clean'].astype('category')
combined['Cabin_TE'] = encoder.fit_transform(combined.iloc[:891].Cabin_for_te, target)
combined.drop(columns=['Cabin_for_te'], inplace=True)
#Clean the ticket column
def clean_ticket(each_ticket):
    #Keep only the alphabetic prefix of the ticket; purely numeric tickets become "NUM"
    prefix = re.sub(r'[^a-zA-Z]', '', each_ticket)
    if (prefix):
        return prefix
    else:
        return "NUM"
combined["Tkt_Clean"] = combined.Ticket.apply(clean_ticket)
#Create ticket frequency column
combined['Ticket_Frequency'] = combined.groupby('Ticket')['Ticket'].transform('count')
#Create ticket groups
Ticket_Count = dict(combined['Ticket'].value_counts())
combined['TicketGroup'] = combined['Ticket'].map(Ticket_Count)

def Ticket_Label(s):
    if (s >= 2) & (s <= 4):
        return 2
    elif ((s > 4) & (s <= 8)) | (s == 1):
        return 1
    elif (s > 8):
        return 0

combined['TicketGroup'] = combined['TicketGroup'].apply(Ticket_Label)
#Create fare bins
def fare_cat(fare):
    if fare <= 7.0:
        return 1
    elif fare <= 39:
        return 2
    else:
        return 3
combined.loc[:, 'Fare_Cat'] = combined['Fare'].apply(fare_cat).astype('int')
#Create some more columns with mult and div (to help the model)
combined.loc[:, 'Fare_Family_Size'] = combined['Fare']/combined['Family_Size']
combined.loc[:, 'Fare_Cat_Pclass'] = combined['Fare_Cat']*combined['Pclass']
combined.loc[:, 'Fare_Cat_Title'] = combined['Fare_Cat']*combined['Title']
combined.loc[:, 'Fsize_Cat_Title'] = combined['Fsize_Cat']*combined['Title']
combined.loc[:, 'Fsize_Cat_Fare_Cat'] = combined['Fare_Cat']/combined['Fsize_Cat'].astype('int')
combined.loc[:, 'Pclass_Title'] = combined['Pclass']*combined['Title']
combined.loc[:, 'Fsize_Cat_Pclass'] = combined['Fsize_Cat']*combined['Pclass']
combined['surv_rate_div_title'] = combined['Family_Friends_Surv_Rate'] / combined['Title']
#Create bins of the Cabin number where available
combined['Cabin_num'] = combined['Cabin'].map(lambda x: re.sub(r"\D", "", x))
combined['Cabin_num'] = pd.to_numeric(combined['Cabin_num'])
combined['Cabin_num_bins'] = pd.qcut(combined['Cabin_num'], 10, labels=['1', '2', '3', '4', '5', '6', '7', '8', '9', '10'])
combined['Cabin_num_bins'] = combined['Cabin_num_bins'].cat.add_categories(0)
combined['Cabin_num_bins'] = combined['Cabin_num_bins'].fillna(0)
combined['Cabin_num_bins'] = combined['Cabin_num_bins'].astype(int)
#Create bins of the Ticket number where available
combined['Ticket_num'] = combined['Ticket'].map(lambda x: re.sub(r"\D", "", x))
combined['Ticket_num'] = pd.to_numeric(combined['Ticket_num'])
combined['Ticket_num_bins'] = pd.qcut(combined['Ticket_num'], 10, labels=['1', '2', '3', '4', '5', '6', '7', '8', '9', '10'])
combined['Ticket_num_bins'] = combined['Ticket_num_bins'].cat.add_categories(0)
combined['Ticket_num_bins'] = combined['Ticket_num_bins'].fillna(0)
combined['Ticket_num_bins'] = combined['Ticket_num_bins'].astype(int)
#Create interaction columns based on the Cabin number bins
combined['Cabin_num_bins_mult_Fsize_Cat'] = combined['Cabin_num_bins'] * combined['Fsize_Cat']
combined['Cabin_num_bins_mult_Title'] = combined['Cabin_num_bins'] * combined['Title']
combined['Cabin_num_bins_mult_Pclass'] = combined['Cabin_num_bins'] * combined['Pclass']
combined['Cabin_num_bins_mult_Family_Friends_Surv_Rate'] = combined['Cabin_num_bins'] * combined['Family_Friends_Surv_Rate']
combined['Cabin_num_bins_mult_Fare'] = combined['Cabin_num_bins'] * combined['Fare']
combined['Cabin_num_bins_mult_Ticket_num_bins'] = combined['Cabin_num_bins'] * combined['Ticket_num_bins']
#Create interaction columns based on the Ticket number bins
combined['Ticket_num_bins_mult_Fsize_Cat'] = combined['Ticket_num_bins'] * combined['Fsize_Cat']
combined['Ticket_num_bins_mult_Title'] = combined['Ticket_num_bins'] * combined['Title']
combined['Ticket_num_bins_mult_Pclass'] = combined['Ticket_num_bins'] * combined['Pclass']
combined['Ticket_num_bins_mult_Family_Friends_Surv_Rate'] = combined['Ticket_num_bins'] * combined['Family_Friends_Surv_Rate']
combined['Ticket_num_bins_mult_Fare'] = combined['Ticket_num_bins'] * combined['Fare']
#Frequency encodings
feature_counts = combined.groupby('Pclass').size()
combined['Pclass_frqeuency_encoding'] = combined['Pclass'].apply(lambda x: feature_counts[x])

title_counts = combined.groupby('Title').size()
combined['Title_frqeuency_encoding'] = combined['Title'].apply(lambda x: title_counts[x])

Fsize_Cat_counts = combined.groupby('Fsize_Cat').size()
combined['Fsize_Cat_frqeuency_encoding'] = combined['Fsize_Cat'].apply(lambda x: Fsize_Cat_counts[x])

Family_Friends_Surv_Rate_counts = combined.groupby('Family_Friends_Surv_Rate').size()
combined['Family_Friends_Surv_Rate_frqeuency_encoding'] = combined['Family_Friends_Surv_Rate'].apply(lambda x: Family_Friends_Surv_Rate_counts[x])

Cabin_Clean_counts = combined.groupby('Cabin_Clean').size()
combined['Cabin_Clean_frqeuency_encoding'] = combined['Cabin_Clean'].apply(lambda x: Cabin_Clean_counts[x])

TicketGroup_counts = combined.groupby('TicketGroup').size()
combined['TicketGroup_frqeuency_encoding'] = combined['TicketGroup'].apply(lambda x: TicketGroup_counts[x])

Fare_Cat_counts = combined.groupby('Fare_Cat').size()
combined['Fare_Cat_frqeuency_encoding'] = combined['Fare_Cat'].apply(lambda x: Fare_Cat_counts[x])

Cabin_num_bins_counts = combined.groupby('Cabin_num_bins').size()
combined['Cabin_num_bins_frqeuency_encoding'] = combined['Cabin_num_bins'].apply(lambda x: Cabin_num_bins_counts[x])

Ticket_num_bins_counts = combined.groupby('Ticket_num_bins').size()
combined['Ticket_num_bins_frqeuency_encoding'] = combined['Ticket_num_bins'].apply(lambda x: Ticket_num_bins_counts[x])
Pclass_and_Title_counts = combined.groupby(['Pclass', 'Title']).size()
combined['Pclass_and_Title_frqeuency_encoding'] = combined[['Pclass', 'Title']].apply(lambda x: Pclass_and_Title_counts[x[0]][x[1]], axis=1)

Pclass_and_Fsize_Cat_counts = combined.groupby(['Pclass', 'Fsize_Cat']).size()
combined['Pclass_and_Fsize_Cat_frqeuency_encoding'] = combined[['Pclass', 'Fsize_Cat']].apply(lambda x: Pclass_and_Fsize_Cat_counts[x[0]][x[1]], axis=1)

Pclass_and_Family_Friends_Surv_Rate_counts = combined.groupby(['Pclass', 'Family_Friends_Surv_Rate']).size()
combined['Pclass_and_Family_Friends_Surv_Rate_frqeuency_encoding'] = combined[['Pclass', 'Family_Friends_Surv_Rate']].apply(lambda x: Pclass_and_Family_Friends_Surv_Rate_counts[x[0]][x[1]], axis=1)

Pclass_and_Cabin_Clean_counts = combined.groupby(['Pclass', 'Cabin_Clean']).size()
combined['Pclass_and_Cabin_Clean_frqeuency_encoding'] = combined[['Pclass', 'Cabin_Clean']].apply(lambda x: Pclass_and_Cabin_Clean_counts[x[0]][x[1]], axis=1)

Pclass_and_TicketGroup_counts = combined.groupby(['Pclass', 'TicketGroup']).size()
combined['Pclass_and_TicketGroup_frqeuency_encoding'] = combined[['Pclass', 'TicketGroup']].apply(lambda x: Pclass_and_TicketGroup_counts[x[0]][x[1]], axis=1)

Pclass_and_Cabin_num_bins_counts = combined.groupby(['Pclass', 'Cabin_num_bins']).size()
combined['Pclass_and_Cabin_num_bins_frqeuency_encoding'] = combined[['Pclass', 'Cabin_num_bins']].apply(lambda x: Pclass_and_Cabin_num_bins_counts[x[0]][x[1]], axis=1)

Pclass_and_Ticket_num_bins_counts = combined.groupby(['Pclass', 'Ticket_num_bins']).size()
combined['Pclass_and_Ticket_num_bins_frqeuency_encoding'] = combined[['Pclass', 'Ticket_num_bins']].apply(lambda x: Pclass_and_Ticket_num_bins_counts[x[0]][x[1]], axis=1)

Title_and_Family_Friends_Surv_Rate_counts = combined.groupby(['Title', 'Family_Friends_Surv_Rate']).size()
combined['Title_and_Family_Friends_Surv_Rate_frqeuency_encoding'] = combined[['Title', 'Family_Friends_Surv_Rate']].apply(lambda x: Title_and_Family_Friends_Surv_Rate_counts[x[0]][x[1]], axis=1)

Pclass_and_Sex_counts = combined.groupby(['Pclass', 'Sex']).size()
combined['Pclass_and_Sex_frqeuency_encoding'] = combined[['Pclass', 'Sex']].apply(lambda x: Pclass_and_Sex_counts[x[0]][x[1]], axis=1)
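These blocks all follow the same pattern; a more compact alternative (a sketch of my own, not how the notebook does it) is a small helper built on groupby/transform. The helper name add_freq_encoding is hypothetical, and the 'frqeuency' spelling is kept to match the existing column names:

def add_freq_encoding(df, cols):
    #Accept a single column name or a list of columns for pairwise counts
    cols = [cols] if isinstance(cols, str) else list(cols)
    new_col = '_and_'.join(cols) + '_frqeuency_encoding'
    #transform('size') broadcasts each group's size back to its rows
    df[new_col] = df.groupby(cols)[cols[0]].transform('size')
    return df

#e.g. add_freq_encoding(combined, 'Pclass') or add_freq_encoding(combined, ['Pclass', 'Sex'])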
#Some helpful definitions
X = combined.iloc[:891]
test_df = combined.iloc[891:]
full_df = combined.copy()
# Check for families that have survivors and create a dictionary with each family's mean survival rate
family_survivers = full_df[['Surname', 'Survived']].groupby('Surname').mean().round(2).reset_index()
family_survivers_dict = dict(zip(family_survivers.Surname, family_survivers.Survived))
# Reduce the dictionary to the families that appear in both the train and test data
common_survivers = {}
for lastname, survived in family_survivers_dict.items():
    if lastname in list(test_df['Surname'].unique()):
        common_survivers[lastname] = survived
# Create Family_survivers feature
combined['Family_survivers'] = combined.Surname.map(common_survivers)
# For families not present in both train and test, impute the overall mean value
combined.Family_survivers = combined.Family_survivers.fillna(combined.Family_survivers.mean())
# Create Lucky_family feature
combined['Lucky_family'] = pd.cut(x=combined.Family_survivers, labels=[2, 3, 1, 4],
                                  bins=[-1, 0.22, 0.35, 0.49, combined.Family_survivers.max()]).astype('float')
combined.head(2)
PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked | Sex_TE | Cabin_TE | Ticket_TE | Embarked_TE | Title | Title_TE | Is_Married | Name_Length | Family_Size | Fsize_Cat | Surname | Family_Friends_Surv_Rate | Surv_Rate_Invalid | Cabin_Clean | Tkt_Clean | Ticket_Frequency | TicketGroup | Fare_Cat | Fare_Family_Size | Fare_Cat_Pclass | Fare_Cat_Title | Fsize_Cat_Title | Fsize_Cat_Fare_Cat | Pclass_Title | Fsize_Cat_Pclass | surv_rate_div_title | Cabin_num | Cabin_num_bins | Ticket_num | Ticket_num_bins | Cabin_num_bins_mult_Fsize_Cat | Cabin_num_bins_mult_Title | Cabin_num_bins_mult_Pclass | Cabin_num_bins_mult_Family_Friends_Surv_Rate | Cabin_num_bins_mult_Fare | Cabin_num_bins_mult_Ticket_num_bins | Ticket_num_bins_mult_Fsize_Cat | Ticket_num_bins_mult_Title | Ticket_num_bins_mult_Pclass | Ticket_num_bins_mult_Family_Friends_Surv_Rate | Ticket_num_bins_mult_Fare | Pclass_frqeuency_encoding | Title_frqeuency_encoding | Fsize_Cat_frqeuency_encoding | Family_Friends_Surv_Rate_frqeuency_encoding | Cabin_Clean_frqeuency_encoding | TicketGroup_frqeuency_encoding | Fare_Cat_frqeuency_encoding | Cabin_num_bins_frqeuency_encoding | Ticket_num_bins_frqeuency_encoding | Pclass_and_Title_frqeuency_encoding | Pclass_and_Fsize_Cat_frqeuency_encoding | Pclass_and_Family_Friends_Surv_Rate_frqeuency_encoding | Pclass_and_Cabin_Clean_frqeuency_encoding | Pclass_and_TicketGroup_frqeuency_encoding | Pclass_and_Cabin_num_bins_frqeuency_encoding | Pclass_and_Ticket_num_bins_frqeuency_encoding | Title_and_Family_Friends_Surv_Rate_frqeuency_encoding | Pclass_and_Sex_frqeuency_encoding | Family_survivers | Lucky_family | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 0.0 | 3 | Braund, Mr. Owen Harris | 0 | 22.0 | 1 | 0 | A/5 21171 | 7.2500 | NaN | S | 0.188908 | 0.299854 | 0.333898 | 0.336957 | 1 | 0.156673 | 0 | 18 | 2 | 2 | Braund | 0.3838 | 1 | 1 | A | 1 | 1 | 2 | 3.62500 | 6 | 2 | 2 | 1.0 | 3 | 6 | 0.383800 | NaN | 0 | 521171.0 | 10 | 0 | 0 | 0 | 0.0 | 0.0000 | 0 | 20 | 10 | 30 | 3.838 | 72.5000 | 709 | 757 | 437 | 684 | 1014 | 823 | 995 | 1020 | 131 | 448 | 168 | 425 | 693 | 535 | 693 | 131 | 507 | 493 | 0.449863 | 1.0 |
1 | 2 | 1.0 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | 1 | 38.0 | 1 | 0 | PC 17599 | 71.2833 | C85 | C | 0.742038 | 0.589066 | 0.464006 | 0.553571 | 3 | 0.793641 | 1 | 41 | 2 | 2 | Cumings | 1.0000 | 0 | 7 | PC | 2 | 2 | 3 | 35.64165 | 3 | 9 | 6 | 1.5 | 3 | 2 | 0.333333 | 85.0 | 8 | 17599.0 | 4 | 16 | 24 | 8 | 8.0 | 570.2664 | 32 | 8 | 12 | 4 | 4.000 | 285.1332 | 323 | 198 | 437 | 225 | 94 | 475 | 277 | 29 | 130 | 78 | 152 | 107 | 94 | 183 | 26 | 81 | 78 | 144 | 1.000000 | 4.0 |
#Impute the Age column
imp_features = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare',
                'Title', 'Is_Married',
'Name_Length', 'Family_Size', 'Fsize_Cat', 'Family_Friends_Surv_Rate',
'Surv_Rate_Invalid', 'Cabin_Clean', 'Ticket_Frequency',
'TicketGroup', 'Fare_Cat', 'Fare_Family_Size', 'Fare_Cat_Pclass',
'Fare_Cat_Title', 'Fsize_Cat_Title', 'Fsize_Cat_Fare_Cat', 'Pclass_Title', 'Fsize_Cat_Pclass',
'surv_rate_div_title', 'Cabin_num', 'Cabin_num_bins', 'Ticket_num',
'Ticket_num_bins', 'Cabin_num_bins_mult_Fsize_Cat', 'Cabin_num_bins_mult_Title',
'Cabin_num_bins_mult_Pclass', 'Cabin_num_bins_mult_Family_Friends_Surv_Rate',
'Cabin_num_bins_mult_Fare', 'Cabin_num_bins_mult_Ticket_num_bins',
'Ticket_num_bins_mult_Fsize_Cat', 'Ticket_num_bins_mult_Title',
'Ticket_num_bins_mult_Pclass', 'Ticket_num_bins_mult_Family_Friends_Surv_Rate',
'Ticket_num_bins_mult_Fare', 'Pclass_frqeuency_encoding', 'Title_frqeuency_encoding',
'Fsize_Cat_frqeuency_encoding', 'Family_Friends_Surv_Rate_frqeuency_encoding',
'Cabin_Clean_frqeuency_encoding', 'TicketGroup_frqeuency_encoding', 'Fare_Cat_frqeuency_encoding',
'Cabin_num_bins_frqeuency_encoding', 'Ticket_num_bins_frqeuency_encoding',
'Pclass_and_Title_frqeuency_encoding', 'Pclass_and_Fsize_Cat_frqeuency_encoding',
'Pclass_and_Family_Friends_Surv_Rate_frqeuency_encoding', 'Pclass_and_Cabin_Clean_frqeuency_encoding',
'Pclass_and_TicketGroup_frqeuency_encoding', 'Pclass_and_Cabin_num_bins_frqeuency_encoding',
'Pclass_and_Ticket_num_bins_frqeuency_encoding',
'Title_and_Family_Friends_Surv_Rate_frqeuency_encoding',
'Pclass_and_Sex_frqeuency_encoding', 'Cabin_TE', 'Title_TE',
'Family_survivers', 'Lucky_family', 'Sex_TE', 'Embarked_TE']
imputer = KNNImputer(n_neighbors=10, missing_values=np.nan)
imputer.fit(combined[imp_features])
combined.loc[:, imp_features] = pd.DataFrame(imputer.transform(combined[imp_features]), index=combined.index, columns=imp_features)
#Create two more columns based on Age
combined['Child'] = combined['Age'].map(lambda val: 1 if val < 18 else 0)
combined['Senior'] = combined['Age'].map(lambda val: 1 if val > 70 else 0)
#Create age bins
combined['Age_bins'] = pd.qcut(combined['Age'], 10, labels=['1', '2', '3', '4', '5', '6', '7', '8', '9', '10'])
combined['Age_bins'] = combined['Age_bins'].astype(int)
#Target encode the bins
combined['Age_bins_for_te'] = combined['Age_bins'].astype('category')
combined['Age_bins_TE'] = encoder.fit_transform(combined.iloc[:891].Age_bins_for_te, target)
combined.drop(columns=['Age_bins_for_te'], inplace=True)
#Frequency count
Age_bins_counts = combined.groupby('Age_bins').size()
combined['Age_bins_frqeuency_encoding'] = combined['Age_bins'].apply(lambda x: Age_bins_counts[x])
combined.head(3)
PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked | Sex_TE | Cabin_TE | Ticket_TE | Embarked_TE | Title | Title_TE | Is_Married | Name_Length | Family_Size | Fsize_Cat | Surname | Family_Friends_Surv_Rate | Surv_Rate_Invalid | Cabin_Clean | Tkt_Clean | Ticket_Frequency | TicketGroup | Fare_Cat | Fare_Family_Size | Fare_Cat_Pclass | Fare_Cat_Title | Fsize_Cat_Title | Fsize_Cat_Fare_Cat | Pclass_Title | Fsize_Cat_Pclass | surv_rate_div_title | Cabin_num | Cabin_num_bins | Ticket_num | Ticket_num_bins | Cabin_num_bins_mult_Fsize_Cat | Cabin_num_bins_mult_Title | Cabin_num_bins_mult_Pclass | Cabin_num_bins_mult_Family_Friends_Surv_Rate | Cabin_num_bins_mult_Fare | Cabin_num_bins_mult_Ticket_num_bins | Ticket_num_bins_mult_Fsize_Cat | Ticket_num_bins_mult_Title | Ticket_num_bins_mult_Pclass | Ticket_num_bins_mult_Family_Friends_Surv_Rate | Ticket_num_bins_mult_Fare | Pclass_frqeuency_encoding | Title_frqeuency_encoding | Fsize_Cat_frqeuency_encoding | Family_Friends_Surv_Rate_frqeuency_encoding | Cabin_Clean_frqeuency_encoding | TicketGroup_frqeuency_encoding | Fare_Cat_frqeuency_encoding | Cabin_num_bins_frqeuency_encoding | Ticket_num_bins_frqeuency_encoding | Pclass_and_Title_frqeuency_encoding | Pclass_and_Fsize_Cat_frqeuency_encoding | Pclass_and_Family_Friends_Surv_Rate_frqeuency_encoding | Pclass_and_Cabin_Clean_frqeuency_encoding | Pclass_and_TicketGroup_frqeuency_encoding | Pclass_and_Cabin_num_bins_frqeuency_encoding | Pclass_and_Ticket_num_bins_frqeuency_encoding | Title_and_Family_Friends_Surv_Rate_frqeuency_encoding | Pclass_and_Sex_frqeuency_encoding | Family_survivers | Lucky_family | Child | Senior | Age_bins | Age_bins_TE | Age_bins_frqeuency_encoding | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 0.0 | 3.0 | Braund, Mr. Owen Harris | 0.0 | 22.0 | 1.0 | 0.0 | A/5 21171 | 7.2500 | NaN | S | 0.188908 | 0.299854 | 0.333898 | 0.336957 | 1.0 | 0.156673 | 0.0 | 18.0 | 2.0 | 2.0 | Braund | 0.3838 | 1.0 | 1.0 | A | 1.0 | 1.0 | 2.0 | 3.62500 | 6.0 | 2.0 | 2.0 | 1.000000 | 3.0 | 6.0 | 0.383800 | 57.4 | 0.0 | 521171.0 | 10.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0000 | 0.0 | 20.0 | 10.0 | 30.0 | 3.838 | 72.5000 | 709.0 | 757.0 | 437.0 | 684.0 | 1014.0 | 823.0 | 995.0 | 1020.0 | 131.0 | 448.0 | 168.0 | 425.0 | 693.0 | 535.0 | 693.0 | 131.0 | 507.0 | 493.0 | 0.449863 | 1.0 | 0 | 0 | 3 | 0.321128 | 134 |
1 | 2 | 1.0 | 1.0 | Cumings, Mrs. John Bradley (Florence Briggs Th... | 1.0 | 38.0 | 1.0 | 0.0 | PC 17599 | 71.2833 | C85 | C | 0.742038 | 0.589066 | 0.464006 | 0.553571 | 3.0 | 0.793641 | 1.0 | 41.0 | 2.0 | 2.0 | Cumings | 1.0000 | 0.0 | 7.0 | PC | 2.0 | 2.0 | 3.0 | 35.64165 | 3.0 | 9.0 | 6.0 | 1.500000 | 3.0 | 2.0 | 0.333333 | 85.0 | 8.0 | 17599.0 | 4.0 | 16.0 | 24.0 | 8.0 | 8.0 | 570.2664 | 32.0 | 8.0 | 12.0 | 4.0 | 4.000 | 285.1332 | 323.0 | 198.0 | 437.0 | 225.0 | 94.0 | 475.0 | 277.0 | 29.0 | 130.0 | 78.0 | 152.0 | 107.0 | 94.0 | 183.0 | 26.0 | 81.0 | 78.0 | 144.0 | 1.000000 | 4.0 | 0 | 0 | 8 | 0.435628 | 144 |
2 | 3 | 1.0 | 3.0 | Heikkinen, Miss. Laina | 1.0 | 26.0 | 0.0 | 0.0 | STON/O2. 3101282 | 7.9250 | NaN | S | 0.742038 | 0.299854 | 0.464006 | 0.336957 | 2.0 | 0.702703 | 0.0 | 18.0 | 1.0 | 3.0 | Heikkinen | 0.3838 | 1.0 | 1.0 | STONO | 1.0 | 1.0 | 2.0 | 7.92500 | 6.0 | 4.0 | 6.0 | 0.666667 | 6.0 | 9.0 | 0.191900 | 56.9 | 0.0 | 23101282.0 | 10.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0000 | 0.0 | 30.0 | 20.0 | 30.0 | 3.838 | 79.2500 | 709.0 | 264.0 | 790.0 | 684.0 | 1014.0 | 823.0 | 995.0 | 1020.0 | 131.0 | 151.0 | 472.0 | 425.0 | 693.0 | 535.0 | 693.0 | 131.0 | 112.0 | 216.0 | 0.449863 | 1.0 | 0 | 0 | 4 | 0.313158 | 147 |
#Drop columns
combined.drop(columns=['PassengerId', 'Survived', 'Name', 'Ticket', 'Cabin', 'Surname',
                       'Cabin_num', 'Ticket_num', 'Family_survivers'], inplace=True)
#Impute Ticket_num_bins_mult_Fare with its mode
fill_first = combined.Ticket_num_bins_mult_Fare.mode()
fill_first
0 63.1664
Name: Ticket_num_bins_mult_Fare, dtype: float64
combined['Ticket_num_bins_mult_Fare'].fillna(63.1664, inplace=True)
#Impute Cabin_num_bins_mult_Fare with its mode as well
fill_second = combined.Cabin_num_bins_mult_Fare.mode()
fill_second
0 0.0
Name: Cabin_num_bins_mult_Fare, dtype: float64
combined['Cabin_num_bins_mult_Fare'].fillna(0, inplace=True)
combined.head(2)
Pclass | Sex | Age | SibSp | Parch | Fare | Embarked | Sex_TE | Cabin_TE | Ticket_TE | Embarked_TE | Title | Title_TE | Is_Married | Name_Length | Family_Size | Fsize_Cat | Family_Friends_Surv_Rate | Surv_Rate_Invalid | Cabin_Clean | Tkt_Clean | Ticket_Frequency | TicketGroup | Fare_Cat | Fare_Family_Size | Fare_Cat_Pclass | Fare_Cat_Title | Fsize_Cat_Title | Fsize_Cat_Fare_Cat | Pclass_Title | Fsize_Cat_Pclass | surv_rate_div_title | Cabin_num_bins | Ticket_num_bins | Cabin_num_bins_mult_Fsize_Cat | Cabin_num_bins_mult_Title | Cabin_num_bins_mult_Pclass | Cabin_num_bins_mult_Family_Friends_Surv_Rate | Cabin_num_bins_mult_Fare | Cabin_num_bins_mult_Ticket_num_bins | Ticket_num_bins_mult_Fsize_Cat | Ticket_num_bins_mult_Title | Ticket_num_bins_mult_Pclass | Ticket_num_bins_mult_Family_Friends_Surv_Rate | Ticket_num_bins_mult_Fare | Pclass_frqeuency_encoding | Title_frqeuency_encoding | Fsize_Cat_frqeuency_encoding | Family_Friends_Surv_Rate_frqeuency_encoding | Cabin_Clean_frqeuency_encoding | TicketGroup_frqeuency_encoding | Fare_Cat_frqeuency_encoding | Cabin_num_bins_frqeuency_encoding | Ticket_num_bins_frqeuency_encoding | Pclass_and_Title_frqeuency_encoding | Pclass_and_Fsize_Cat_frqeuency_encoding | Pclass_and_Family_Friends_Surv_Rate_frqeuency_encoding | Pclass_and_Cabin_Clean_frqeuency_encoding | Pclass_and_TicketGroup_frqeuency_encoding | Pclass_and_Cabin_num_bins_frqeuency_encoding | Pclass_and_Ticket_num_bins_frqeuency_encoding | Title_and_Family_Friends_Surv_Rate_frqeuency_encoding | Pclass_and_Sex_frqeuency_encoding | Lucky_family | Child | Senior | Age_bins | Age_bins_TE | Age_bins_frqeuency_encoding | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 3.0 | 0.0 | 22.0 | 1.0 | 0.0 | 7.2500 | S | 0.188908 | 0.299854 | 0.333898 | 0.336957 | 1.0 | 0.156673 | 0.0 | 18.0 | 2.0 | 2.0 | 0.3838 | 1.0 | 1.0 | A | 1.0 | 1.0 | 2.0 | 3.62500 | 6.0 | 2.0 | 2.0 | 1.0 | 3.0 | 6.0 | 0.383800 | 0.0 | 10.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0000 | 0.0 | 20.0 | 10.0 | 30.0 | 3.838 | 72.5000 | 709.0 | 757.0 | 437.0 | 684.0 | 1014.0 | 823.0 | 995.0 | 1020.0 | 131.0 | 448.0 | 168.0 | 425.0 | 693.0 | 535.0 | 693.0 | 131.0 | 507.0 | 493.0 | 1.0 | 0 | 0 | 3 | 0.321128 | 134 |
1 | 1.0 | 1.0 | 38.0 | 1.0 | 0.0 | 71.2833 | C | 0.742038 | 0.589066 | 0.464006 | 0.553571 | 3.0 | 0.793641 | 1.0 | 41.0 | 2.0 | 2.0 | 1.0000 | 0.0 | 7.0 | PC | 2.0 | 2.0 | 3.0 | 35.64165 | 3.0 | 9.0 | 6.0 | 1.5 | 3.0 | 2.0 | 0.333333 | 8.0 | 4.0 | 16.0 | 24.0 | 8.0 | 8.0 | 570.2664 | 32.0 | 8.0 | 12.0 | 4.0 | 4.000 | 285.1332 | 323.0 | 198.0 | 437.0 | 225.0 | 94.0 | 475.0 | 277.0 | 29.0 | 130.0 | 78.0 | 152.0 | 107.0 | 94.0 | 183.0 | 26.0 | 81.0 | 78.0 | 144.0 | 4.0 | 0 | 0 | 8 | 0.435628 | 144 |
# Pointing out categorical features
categoricals = ['Pclass', 'Title', 'Embarked',
                'Fsize_Cat', 'Cabin_Clean',
                'Tkt_Clean', 'TicketGroup', 'Fare_Cat',
                'Cabin_num_bins', 'Ticket_num_bins', 'Age_bins', 'Lucky_family']
#Impute the Embarked column
combined['Embarked'].fillna('S', inplace=True)
%%capture
# Dealing with categorical data using get_dummies
dummies = pd.get_dummies(combined, columns=categoricals)
combined[dummies.columns] = dummies
# Dealing with categorical data using OrdinalEncoder
ordinal_encoder = OrdinalEncoder()
ordinals = ordinal_encoder.fit_transform(combined[categoricals])
combined[categoricals] = ordinals
del(ordinals)
# Define numerical columns
scaler_cols = ['Age', 'Fare', 'Name_Length', 'Cabin_Clean',
               'Tkt_Clean', 'Fare_Family_Size', 'Fare_Cat_Pclass',
'Fare_Cat_Title', 'Fsize_Cat_Title', 'Fsize_Cat_Fare_Cat',
'Pclass_Title', 'Fsize_Cat_Pclass', 'Cabin_num_bins',
'Ticket_num_bins', 'Cabin_num_bins_mult_Fsize_Cat',
'Cabin_num_bins_mult_Title', 'Cabin_num_bins_mult_Pclass',
'Cabin_num_bins_mult_Family_Friends_Surv_Rate', 'Cabin_num_bins_mult_Fare',
'Cabin_num_bins_mult_Ticket_num_bins', 'Ticket_num_bins_mult_Fsize_Cat',
'Ticket_num_bins_mult_Title', 'Ticket_num_bins_mult_Pclass',
'Ticket_num_bins_mult_Family_Friends_Surv_Rate', 'Ticket_num_bins_mult_Fare',
'Pclass_frqeuency_encoding', 'Title_frqeuency_encoding', 'Fsize_Cat_frqeuency_encoding',
'Family_Friends_Surv_Rate_frqeuency_encoding', 'Cabin_Clean_frqeuency_encoding',
'TicketGroup_frqeuency_encoding', 'Fare_Cat_frqeuency_encoding', 'Cabin_num_bins_frqeuency_encoding',
'Ticket_num_bins_frqeuency_encoding', 'Pclass_and_Title_frqeuency_encoding',
'Pclass_and_Fsize_Cat_frqeuency_encoding', 'Pclass_and_Family_Friends_Surv_Rate_frqeuency_encoding',
'Pclass_and_Cabin_Clean_frqeuency_encoding', 'Pclass_and_TicketGroup_frqeuency_encoding',
'Pclass_and_Cabin_num_bins_frqeuency_encoding', 'Pclass_and_Ticket_num_bins_frqeuency_encoding',
'Title_and_Family_Friends_Surv_Rate_frqeuency_encoding', 'Pclass_and_Sex_frqeuency_encoding',
'Age_bins', 'Age_bins_frqeuency_encoding', 'Lucky_family']
#FactorAnalysis
fa = FactorAnalysis(rotation='varimax', random_state=0)
fa.fit(combined[scaler_cols])

fa_feats = [f'fa_{i}' for i in range(len(scaler_cols))][:2]

combined[fa_feats] = fa.transform(combined[scaler_cols])[:, :2]
# Fixing the numeric variables by quantile transformation
qt = QuantileTransformer(output_distribution='uniform')
combined[scaler_cols] = qt.fit_transform(combined[scaler_cols])
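To make concrete what the uniform quantile transform does, here is a tiny stand-alone illustration of my own (not from the notebook): each value is mapped to its empirical quantile, so a heavy right tail gets flattened into [0, 1].

demo = np.array([[1.0], [2.0], [4.0], [100.0]])  #heavy right tail
qt_demo = QuantileTransformer(output_distribution='uniform', n_quantiles=4)
print(qt_demo.fit_transform(demo).ravel())  #approximately [0. 0.333 0.667 1.]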
#Remove useless columns (with std=0)
colsToRemove = []
colss = combined.select_dtypes(include=np.number).columns.tolist()

for col in colss:
    if combined.iloc[:891][col].std() == 0:
        colsToRemove.append(col)

combined.drop(colsToRemove, axis=1, inplace=True)
print("Removed `{}` Constant Columns\n".format(len(colsToRemove)))
print(colsToRemove)
Removed `3` Constant Columns
['Tkt_Clean_AQ', 'Tkt_Clean_LP', 'Tkt_Clean_STONOQ']
combined.head()
Pclass | Sex | Age | SibSp | Parch | Fare | Embarked | Sex_TE | Cabin_TE | Ticket_TE | Embarked_TE | Title | Title_TE | Is_Married | Name_Length | Family_Size | Fsize_Cat | Family_Friends_Surv_Rate | Surv_Rate_Invalid | Cabin_Clean | Tkt_Clean | Ticket_Frequency | TicketGroup | Fare_Cat | Fare_Family_Size | Fare_Cat_Pclass | Fare_Cat_Title | Fsize_Cat_Title | Fsize_Cat_Fare_Cat | Pclass_Title | Fsize_Cat_Pclass | surv_rate_div_title | Cabin_num_bins | Ticket_num_bins | Cabin_num_bins_mult_Fsize_Cat | Cabin_num_bins_mult_Title | Cabin_num_bins_mult_Pclass | Cabin_num_bins_mult_Family_Friends_Surv_Rate | Cabin_num_bins_mult_Fare | Cabin_num_bins_mult_Ticket_num_bins | Ticket_num_bins_mult_Fsize_Cat | Ticket_num_bins_mult_Title | Ticket_num_bins_mult_Pclass | Ticket_num_bins_mult_Family_Friends_Surv_Rate | Ticket_num_bins_mult_Fare | Pclass_frqeuency_encoding | Title_frqeuency_encoding | Fsize_Cat_frqeuency_encoding | Family_Friends_Surv_Rate_frqeuency_encoding | Cabin_Clean_frqeuency_encoding | TicketGroup_frqeuency_encoding | Fare_Cat_frqeuency_encoding | Cabin_num_bins_frqeuency_encoding | Ticket_num_bins_frqeuency_encoding | Pclass_and_Title_frqeuency_encoding | Pclass_and_Fsize_Cat_frqeuency_encoding | Pclass_and_Family_Friends_Surv_Rate_frqeuency_encoding | Pclass_and_Cabin_Clean_frqeuency_encoding | Pclass_and_TicketGroup_frqeuency_encoding | Pclass_and_Cabin_num_bins_frqeuency_encoding | Pclass_and_Ticket_num_bins_frqeuency_encoding | Title_and_Family_Friends_Surv_Rate_frqeuency_encoding | Pclass_and_Sex_frqeuency_encoding | Lucky_family | Child | Senior | Age_bins | Age_bins_TE | Age_bins_frqeuency_encoding | Pclass_1.0 | Pclass_2.0 | Pclass_3.0 | Title_1.0 | Title_2.0 | Title_3.0 | Title_4.0 | Title_5.0 | Title_6.0 | Embarked_C | Embarked_Q | Embarked_S | Fsize_Cat_1.0 | Fsize_Cat_2.0 | Fsize_Cat_3.0 | Cabin_Clean_1.0 | Cabin_Clean_2.0 | Cabin_Clean_3.0 | Cabin_Clean_4.0 | Cabin_Clean_5.0 | Cabin_Clean_6.0 | Cabin_Clean_7.0 | Cabin_Clean_8.0 | Cabin_Clean_9.0 | Tkt_Clean_A | Tkt_Clean_AS | Tkt_Clean_C | Tkt_Clean_CA | Tkt_Clean_CASOTON | Tkt_Clean_FC | Tkt_Clean_FCC | Tkt_Clean_Fa | Tkt_Clean_LINE | Tkt_Clean_NUM | Tkt_Clean_PC | Tkt_Clean_PP | Tkt_Clean_PPP | Tkt_Clean_SC | Tkt_Clean_SCA | Tkt_Clean_SCAH | Tkt_Clean_SCAHBasle | Tkt_Clean_SCOW | Tkt_Clean_SCPARIS | Tkt_Clean_SCParis | Tkt_Clean_SOC | Tkt_Clean_SOP | Tkt_Clean_SOPP | Tkt_Clean_SOTONO | Tkt_Clean_SOTONOQ | Tkt_Clean_SP | Tkt_Clean_STONO | Tkt_Clean_SWPP | Tkt_Clean_WC | Tkt_Clean_WEP | TicketGroup_0.0 | TicketGroup_1.0 | TicketGroup_2.0 | Fare_Cat_1.0 | Fare_Cat_2.0 | Fare_Cat_3.0 | Cabin_num_bins_0.0 | Cabin_num_bins_1.0 | Cabin_num_bins_2.0 | Cabin_num_bins_3.0 | Cabin_num_bins_4.0 | Cabin_num_bins_5.0 | Cabin_num_bins_6.0 | Cabin_num_bins_7.0 | Cabin_num_bins_8.0 | Cabin_num_bins_9.0 | Cabin_num_bins_10.0 | Ticket_num_bins_0.0 | Ticket_num_bins_1.0 | Ticket_num_bins_2.0 | Ticket_num_bins_3.0 | Ticket_num_bins_4.0 | Ticket_num_bins_5.0 | Ticket_num_bins_6.0 | Ticket_num_bins_7.0 | Ticket_num_bins_8.0 | Ticket_num_bins_9.0 | Ticket_num_bins_10.0 | Age_bins_1 | Age_bins_2 | Age_bins_3 | Age_bins_4 | Age_bins_5 | Age_bins_6 | Age_bins_7 | Age_bins_8 | Age_bins_9 | Age_bins_10 | Lucky_family_1.0 | Lucky_family_2.0 | Lucky_family_3.0 | Lucky_family_4.0 | fa_0 | fa_1 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2.0 | 0.0 | 0.256757 | 1.0 | 0.0 | 0.082082 | 2.0 | 0.188908 | 0.299854 | 0.333898 | 0.336957 | 0.0 | 0.156673 | 0.0 | 0.370871 | 2.0 | 1.0 | 0.3838 | 1.0 | 0.000000 | 0.000000 | 1.0 | 1.0 | 1.0 | 0.030613 | 0.718719 | 0.258258 | 0.084084 | 0.680180 | 0.484484 | 0.515015 | 0.383800 | 0.000000 | 1.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.705205 | 0.703203 | 1.000000 | 0.843844 | 0.468969 | 1.000000 | 1.000000 | 0.229229 | 1.000000 | 1.000000 | 1.00000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 0.575075 | 1.000000 | 1.00000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 0.0 | 0 | 0 | 0.254755 | 0.321128 | 0.506507 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | -0.483094 | 0.786972 |
1 | 0.0 | 1.0 | 0.767768 | 1.0 | 0.0 | 0.883051 | 0.0 | 0.742038 | 0.589066 | 0.464006 | 0.553571 | 2.0 | 0.793641 | 1.0 | 0.973974 | 2.0 | 1.0 | 1.0000 | 0.0 | 0.897898 | 0.871371 | 2.0 | 2.0 | 2.0 | 0.873445 | 0.173674 | 0.917417 | 0.797297 | 0.877377 | 0.484484 | 0.067067 | 0.333333 | 0.944945 | 0.351852 | 0.928929 | 0.976977 | 0.933934 | 0.980981 | 0.941942 | 0.944444 | 0.286787 | 0.771772 | 0.227728 | 0.893393 | 0.880881 | 0.334835 | 0.144144 | 0.229229 | 0.199199 | 0.189189 | 0.18969 | 0.134134 | 0.128128 | 0.251251 | 0.276276 | 0.209710 | 0.299299 | 0.24024 | 0.521522 | 0.123123 | 0.445445 | 0.390390 | 0.135636 | 1.0 | 0 | 0 | 0.754755 | 0.435628 | 0.718218 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 1.738878 | -0.908420 |
2 | 2.0 | 1.0 | 0.406907 | 0.0 | 0.0 | 0.266767 | 2.0 | 0.742038 | 0.299854 | 0.464006 | 0.336957 | 1.0 | 0.702703 | 0.0 | 0.370871 | 1.0 | 2.0 | 0.3838 | 1.0 | 0.000000 | 0.975475 | 1.0 | 1.0 | 1.0 | 0.429429 | 0.718719 | 0.656156 | 0.797297 | 0.286286 | 0.819319 | 1.000000 | 0.191900 | 0.000000 | 1.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 | 0.917417 | 1.000000 | 0.843844 | 0.542042 | 1.000000 | 0.320320 | 1.000000 | 1.000000 | 1.000000 | 1.00000 | 1.000000 | 1.000000 | 1.000000 | 0.478478 | 1.000000 | 1.000000 | 1.00000 | 1.000000 | 1.000000 | 1.000000 | 0.462963 | 0.541041 | 0.0 | 0 | 0 | 0.362362 | 0.313158 | 0.829329 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | -0.506328 | -1.251513 |
3 | 0.0 | 1.0 | 0.710210 | 1.0 | 0.0 | 0.834334 | 2.0 | 0.742038 | 0.589066 | 0.400316 | 0.336957 | 2.0 | 0.793641 | 1.0 | 0.910911 | 2.0 | 1.0 | 0.5000 | 0.0 | 0.897898 | 0.469970 | 2.0 | 2.0 | 2.0 | 0.791291 | 0.173674 | 0.917417 | 0.797297 | 0.877377 | 0.484484 | 0.067067 | 0.166667 | 0.967467 | 0.551552 | 0.948448 | 0.984985 | 0.955956 | 0.942442 | 0.925926 | 0.988989 | 0.468969 | 0.886887 | 0.377377 | 0.649149 | 0.896396 | 0.334835 | 0.144144 | 0.229229 | 0.080080 | 0.189189 | 0.18969 | 0.134134 | 0.173173 | 0.251251 | 0.276276 | 0.209710 | 0.152152 | 0.24024 | 0.521522 | 0.174675 | 0.286286 | 0.132132 | 0.135636 | 0.0 | 0 | 0 | 0.754755 | 0.435628 | 0.718218 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 1.781670 | -0.850478 |
4 | 2.0 | 0.0 | 0.710210 | 0.0 | 0.0 | 0.299299 | 2.0 | 0.188908 | 0.299854 | 0.333898 | 0.336957 | 0.0 | 0.156673 | 0.0 | 0.424925 | 1.0 | 2.0 | 0.3838 | 1.0 | 0.000000 | 0.469970 | 1.0 | 1.0 | 1.0 | 0.467968 | 0.718719 | 0.258258 | 0.382883 | 0.286286 | 0.484484 | 1.000000 | 0.383800 | 0.000000 | 1.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 | 0.703203 | 1.000000 | 0.843844 | 0.565065 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.00000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.00000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 0.0 | 0 | 0 | 0.754755 | 0.435628 | 0.718218 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | -0.477648 | 0.690401 |
X = combined.iloc[:891]
y = target
test_df = combined.iloc[891:]
%%capture
!pip install BorutaShap
#Feature selection with Boruta Shap
from BorutaShap import BorutaShap
# Creates a BorutaShap selector for classification
selector = BorutaShap(importance_measure='shap', classification=True)

# Fits the selector
selector.fit(X=X, y=y, n_trials=100, sample=False, verbose=True)
26 attributes confirmed important: ['fa_1', 'Title_and_Family_Friends_Surv_Rate_frqeuency_encoding', 'Family_Friends_Surv_Rate', 'Title_1.0', 'surv_rate_div_title', 'Ticket_num_bins_mult_Family_Friends_Surv_Rate', 'Pclass_and_Sex_frqeuency_encoding', 'Pclass_and_Cabin_Clean_frqeuency_encoding', 'Fare_Cat_Title', 'Lucky_family_2.0', 'Title_TE', 'Pclass_and_Title_frqeuency_encoding', 'Cabin_num_bins_mult_Family_Friends_Surv_Rate', 'Fare_Family_Size', 'fa_0', 'Name_Length', 'Title_frqeuency_encoding', 'Fsize_Cat_Title', 'Pclass_and_Family_Friends_Surv_Rate_frqeuency_encoding', 'Title', 'Ticket_TE', 'Lucky_family', 'Sex', 'Family_Friends_Surv_Rate_frqeuency_encoding', 'Sex_TE', 'Lucky_family_4.0']
138 attributes confirmed unimportant: ['Age_bins', 'Tkt_Clean_PC', 'Ticket_num_bins_mult_Title', 'Tkt_Clean_FC', 'Ticket_num_bins_9.0', 'Pclass_Title', 'Embarked_TE', 'Cabin_num_bins_mult_Title', 'Fsize_Cat_frqeuency_encoding', 'Cabin_Clean_7.0', 'Cabin_num_bins_3.0', 'Pclass_3.0', 'Senior', 'Age_bins_5', 'Cabin_num_bins_1.0', 'Tkt_Clean_SOP', 'Ticket_num_bins_2.0', 'TicketGroup_2.0', 'Title_6.0', 'Age_bins_9', 'Title_3.0', 'Cabin_num_bins_2.0', 'Tkt_Clean_SOTONOQ', 'Tkt_Clean_A', 'Fsize_Cat', 'Cabin_num_bins_6.0', 'Parch', 'Surv_Rate_Invalid', 'Cabin_num_bins_frqeuency_encoding', 'TicketGroup_1.0', 'Title_4.0', 'Tkt_Clean_SC', 'Fsize_Cat_Fare_Cat', 'Tkt_Clean_SCAHBasle', 'Fsize_Cat_1.0', 'Cabin_num_bins_9.0', 'Fare_Cat', 'Age_bins_7', 'Tkt_Clean_SOPP', 'Ticket_num_bins_7.0', 'Cabin_Clean_6.0', 'Ticket_num_bins_10.0', 'Pclass_and_Fsize_Cat_frqeuency_encoding', 'Pclass', 'Ticket_num_bins_8.0', 'Tkt_Clean_SCPARIS', 'Cabin_num_bins_4.0', 'Cabin_num_bins_mult_Ticket_num_bins', 'Ticket_Frequency', 'Tkt_Clean_WEP', 'Embarked_Q', 'SibSp', 'Age_bins_TE', 'Age_bins_3', 'Cabin_num_bins_mult_Fare', 'Age', 'TicketGroup', 'Pclass_and_TicketGroup_frqeuency_encoding', 'Tkt_Clean_AS', 'Tkt_Clean_PPP', 'Tkt_Clean_SCA', 'Tkt_Clean_NUM', 'Embarked', 'Tkt_Clean_CA', 'Pclass_2.0', 'Tkt_Clean_Fa', 'Fare_Cat_1.0', 'Is_Married', 'Cabin_Clean_frqeuency_encoding', 'Cabin_num_bins_8.0', 'Pclass_1.0', 'Title_5.0', 'Fsize_Cat_3.0', 'Tkt_Clean_FCC', 'Cabin_Clean_4.0', 'Cabin_Clean_9.0', 'Cabin_Clean_5.0', 'Tkt_Clean_SCAH', 'Age_bins_2', 'Ticket_num_bins_4.0', 'Tkt_Clean_LINE', 'Age_bins_10', 'Cabin_num_bins_mult_Fsize_Cat', 'Ticket_num_bins_3.0', 'Age_bins_4', 'Lucky_family_1.0', 'Cabin_TE', 'Ticket_num_bins_6.0', 'Tkt_Clean_WC', 'Fare_Cat_frqeuency_encoding', 'Tkt_Clean_SP', 'Tkt_Clean_CASOTON', 'Tkt_Clean_C', 'Cabin_num_bins', 'Title_2.0', 'Cabin_Clean_2.0', 'Tkt_Clean_SOC', 'Age_bins_frqeuency_encoding', 'Cabin_Clean', 'Tkt_Clean_SWPP', 'Fsize_Cat_2.0', 'Pclass_frqeuency_encoding', 'Ticket_num_bins_mult_Fsize_Cat', 'Fsize_Cat_Pclass', 'Cabin_Clean_1.0', 'Cabin_Clean_8.0', 'Ticket_num_bins_1.0', 'Cabin_num_bins_5.0', 'Embarked_C', 'Ticket_num_bins_0.0', 'Cabin_num_bins_7.0', 'Age_bins_1', 'Family_Size', 'Embarked_S', 'Fare_Cat_3.0', 'Age_bins_8', 'Ticket_num_bins_mult_Fare', 'Tkt_Clean_SCOW', 'Fare', 'Tkt_Clean_PP', 'Cabin_num_bins_0.0', 'Ticket_num_bins', 'Fare_Cat_Pclass', 'TicketGroup_frqeuency_encoding', 'Tkt_Clean_SCParis', 'Cabin_num_bins_mult_Pclass', 'Age_bins_6', 'Cabin_num_bins_10.0', 'Ticket_num_bins_frqeuency_encoding', 'Ticket_num_bins_5.0', 'Tkt_Clean_SOTONO', 'Tkt_Clean_STONO', 'TicketGroup_0.0', 'Child', 'Cabin_Clean_3.0', 'Tkt_Clean', 'Lucky_family_3.0', 'Fare_Cat_2.0']
3 tentative attributes remains: ['Pclass_and_Ticket_num_bins_frqeuency_encoding', 'Ticket_num_bins_mult_Pclass', 'Pclass_and_Cabin_num_bins_frqeuency_encoding']
# Display features to be removed
features_to_remove = selector.features_to_remove
print(features_to_remove)
['Pclass' 'Age' 'SibSp' 'Parch' 'Fare' 'Embarked' 'Cabin_TE' 'Embarked_TE'
'Is_Married' 'Family_Size' 'Fsize_Cat' 'Surv_Rate_Invalid' 'Cabin_Clean'
'Tkt_Clean' 'Ticket_Frequency' 'TicketGroup' 'Fare_Cat' 'Fare_Cat_Pclass'
'Fsize_Cat_Fare_Cat' 'Pclass_Title' 'Fsize_Cat_Pclass' 'Cabin_num_bins'
'Ticket_num_bins' 'Cabin_num_bins_mult_Fsize_Cat'
'Cabin_num_bins_mult_Title' 'Cabin_num_bins_mult_Pclass'
'Cabin_num_bins_mult_Fare' 'Cabin_num_bins_mult_Ticket_num_bins'
'Ticket_num_bins_mult_Fsize_Cat' 'Ticket_num_bins_mult_Title'
'Ticket_num_bins_mult_Fare' 'Pclass_frqeuency_encoding'
'Fsize_Cat_frqeuency_encoding' 'Cabin_Clean_frqeuency_encoding'
'TicketGroup_frqeuency_encoding' 'Fare_Cat_frqeuency_encoding'
'Cabin_num_bins_frqeuency_encoding' 'Ticket_num_bins_frqeuency_encoding'
'Pclass_and_Fsize_Cat_frqeuency_encoding'
'Pclass_and_TicketGroup_frqeuency_encoding' 'Child' 'Senior' 'Age_bins'
'Age_bins_TE' 'Age_bins_frqeuency_encoding' 'Pclass_1.0' 'Pclass_2.0'
'Pclass_3.0' 'Title_2.0' 'Title_3.0' 'Title_4.0' 'Title_5.0' 'Title_6.0'
'Embarked_C' 'Embarked_Q' 'Embarked_S' 'Fsize_Cat_1.0' 'Fsize_Cat_2.0'
'Fsize_Cat_3.0' 'Cabin_Clean_1.0' 'Cabin_Clean_2.0' 'Cabin_Clean_3.0'
'Cabin_Clean_4.0' 'Cabin_Clean_5.0' 'Cabin_Clean_6.0' 'Cabin_Clean_7.0'
'Cabin_Clean_8.0' 'Cabin_Clean_9.0' 'Tkt_Clean_A' 'Tkt_Clean_AS'
'Tkt_Clean_C' 'Tkt_Clean_CA' 'Tkt_Clean_CASOTON' 'Tkt_Clean_FC'
'Tkt_Clean_FCC' 'Tkt_Clean_Fa' 'Tkt_Clean_LINE' 'Tkt_Clean_NUM'
'Tkt_Clean_PC' 'Tkt_Clean_PP' 'Tkt_Clean_PPP' 'Tkt_Clean_SC'
'Tkt_Clean_SCA' 'Tkt_Clean_SCAH' 'Tkt_Clean_SCAHBasle' 'Tkt_Clean_SCOW'
'Tkt_Clean_SCPARIS' 'Tkt_Clean_SCParis' 'Tkt_Clean_SOC' 'Tkt_Clean_SOP'
'Tkt_Clean_SOPP' 'Tkt_Clean_SOTONO' 'Tkt_Clean_SOTONOQ' 'Tkt_Clean_SP'
'Tkt_Clean_STONO' 'Tkt_Clean_SWPP' 'Tkt_Clean_WC' 'Tkt_Clean_WEP'
'TicketGroup_0.0' 'TicketGroup_1.0' 'TicketGroup_2.0' 'Fare_Cat_1.0'
'Fare_Cat_2.0' 'Fare_Cat_3.0' 'Cabin_num_bins_0.0' 'Cabin_num_bins_1.0'
'Cabin_num_bins_2.0' 'Cabin_num_bins_3.0' 'Cabin_num_bins_4.0'
'Cabin_num_bins_5.0' 'Cabin_num_bins_6.0' 'Cabin_num_bins_7.0'
'Cabin_num_bins_8.0' 'Cabin_num_bins_9.0' 'Cabin_num_bins_10.0'
'Ticket_num_bins_0.0' 'Ticket_num_bins_1.0' 'Ticket_num_bins_2.0'
'Ticket_num_bins_3.0' 'Ticket_num_bins_4.0' 'Ticket_num_bins_5.0'
'Ticket_num_bins_6.0' 'Ticket_num_bins_7.0' 'Ticket_num_bins_8.0'
'Ticket_num_bins_9.0' 'Ticket_num_bins_10.0' 'Age_bins_1' 'Age_bins_2'
'Age_bins_3' 'Age_bins_4' 'Age_bins_5' 'Age_bins_6' 'Age_bins_7'
'Age_bins_8' 'Age_bins_9' 'Age_bins_10' 'Lucky_family_1.0'
'Lucky_family_3.0' 'Lucky_family_4.0']
# Removes them
combined = combined.drop(columns=features_to_remove)
# Preparing data again as a tabular matrix
X = combined.iloc[:891]
y = target
test_df = combined.iloc[891:]
#Check if the target is imbalanced
y.value_counts().to_frame().T
0 | 1 | |
---|---|---|
Survived | 549 | 342 |
%%capture
#It is, so we need to use imblearn
!pip install imblearn
#Fit imblearn's SMOTE
from imblearn.over_sampling import SMOTE
X_resampled, y_resampled = SMOTE().fit_resample(X, y)
#That's much better
y_resampled.value_counts().to_frame().T
0 | 1 | |
---|---|---|
Survived | 549 | 549 |
#We'll use the balanced dataset
X = X_resampled
y = y_resampled
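One caveat, as an aside (not something this notebook does): oversampling before cross-validation lets synthetic points leak into validation folds. An imblearn Pipeline re-fits SMOTE on each training fold only; the LogisticRegression here is just a hypothetical stand-in classifier.

from imblearn.pipeline import Pipeline as ImbPipeline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

pipe = ImbPipeline([('smote', SMOTE(random_state=0)),
                    ('clf', LogisticRegression(max_iter=1000))])
#SMOTE is applied only when each training fold is fit, never to the validation fold;
#this would be run on the original, un-resampled X and y
scores = cross_val_score(pipe, X, y, cv=5)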
#Let's see the correlations with the target
correlation = X.corrwith(y).sort_values(ascending=False)

# Correlation graph
correlation[1:].plot(kind='bar', figsize=(10, 5), title='Survivability dependency')
plt.show()
X.shape, y.shape
((1098, 28), (1098,))
Deploy Keras Classifier
Note: the NN was built and tuned using Keras Tuner - see the code below.
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.metrics import *
from tensorflow.keras.layers import Input, Dense, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.regularizers import l2
from tensorflow.keras.models import Sequential
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
metrics = ['accuracy',
           Precision(), Recall()]
early_stopping = EarlyStopping(
    monitor='val_accuracy',
    min_delta=0.0002,  # minimum amount of change to count as an improvement
    patience=5,  # how many epochs to wait before stopping
    restore_best_weights=True,
)
def create_model():
    model = Sequential()

    model.add(Input(shape=X.shape[1], name='Input_'))
    model.add(Dense(4096, activation='relu', kernel_initializer='glorot_normal', kernel_regularizer=l2(0.1)))
    model.add(Dense(16, activation='relu', kernel_initializer='glorot_normal', kernel_regularizer=l2(0.1)))
    model.add(Dense(8, activation='relu', kernel_initializer='glorot_normal', kernel_regularizer=l2(0.1)))
    model.add(Dense(1, activation='sigmoid', kernel_initializer='glorot_normal'))
    model.summary()
    optimize = Adam(lr=0.0001)
    model.compile(optimizer=optimize,
                  loss='binary_crossentropy',
                  metrics=metrics)
    return model
%%capture
estimator = KerasClassifier(build_fn=create_model,
                            epochs=600,
                            batch_size=32,
                            verbose=1,
                            validation_split=0.2,
                            callbacks=[early_stopping])

kfold = StratifiedKFold(n_splits=3)
results = cross_val_score(estimator, X, y, cv=kfold)
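Since this cell is wrapped in %%capture, the cross-validation scores are computed but never shown; a quick way to surface them (my addition, not in the original) would be:

print('CV accuracy: %.4f (+/- %.4f)' % (results.mean(), results.std()))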
%%capture
#Fit it on the whole dataset
train_history = estimator.fit(X, y, epochs=600, batch_size=32, validation_split=0.2)
print('Max Accuracy=', max(train_history.history['accuracy']))
Max Accuracy= 0.8542141318321228
%matplotlib inline
logs = pd.DataFrame(train_history.history)

plt.figure(figsize=(14, 4))
plt.subplot(1, 2, 1)
plt.plot(logs['accuracy'], lw=2, label='accuracy')
plt.xlabel("Epoch")
plt.ylabel("Accuracy")
plt.legend()
plt.show()
#submission
y_preds = estimator.predict(test_df)

output = pd.DataFrame({'PassengerId': test.PassengerId,
                       'Survived': y_preds[:, 0]})
output.to_csv('submission.csv', index=False)
14/14 [==============================] - 0s 1ms/step
Keras Tuner
import tensorflow as tf
from tensorflow import keras
%%capture
!pip install keras_tuner
import keras_tuner as kt
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)
def model_builder(hp):
    model = keras.Sequential()
    model.add(Input(shape=X.shape[1], name='Input_'))

    hp_units1 = hp.Choice('units1', values=[16, 32, 64, 128, 256, 512, 1024, 2048, 4096])
    hp_units2 = hp.Choice('units2', values=[16, 32, 64, 128, 256, 512, 1024, 2048, 4096])
    hp_regulizer1 = hp.Choice('regulizer1', values=[0.0, 0.001, 0.01, 0.1])
    hp_regulizer2 = hp.Choice('regulizer2', values=[0.0, 0.001, 0.01, 0.1])
    hp_dropout1 = hp.Choice('dropout1', values=[0.0, 0.1, 0.3, 0.5])
    hp_dropout2 = hp.Choice('dropout2', values=[0.0, 0.1, 0.3, 0.5])

    model.add(keras.layers.Dense(units=hp_units1, activation='relu', kernel_initializer='glorot_normal', kernel_regularizer=l2(hp_regulizer1)))
    model.add(Dropout(hp_dropout1))
    model.add(keras.layers.Dense(units=hp_units2, activation='relu', kernel_initializer='glorot_normal', kernel_regularizer=l2(hp_regulizer2)))
    model.add(Dropout(hp_dropout2))

    model.add(Dense(8, activation='relu', kernel_initializer='normal', kernel_regularizer=l2(0.1)))
    model.add(Dense(1, activation='sigmoid'))

    hp_learning_rate = hp.Choice('learning_rate', values=[1e-2, 1e-3, 1e-4, 1e-5, 1e-6, 1e-7])

    model.compile(optimizer=keras.optimizers.Adam(learning_rate=hp_learning_rate),
                  loss=keras.losses.BinaryCrossentropy(),
                  metrics=['accuracy'])

    return model
tuner = kt.Hyperband(model_builder,
                     objective='val_accuracy',
                     max_epochs=50,
                     factor=3,
                     directory='my_dir',
                     project_name='titanic_tune',
                     overwrite=True)
stop_early = tf.keras.callbacks.EarlyStopping(monitor='val_accuracy', patience=10)
tuner.search(X_train, y_train,
             batch_size=32,
             epochs=50,
             validation_split=0.2,
             callbacks=[stop_early])
# Get the optimal hyperparameters
best_hps = tuner.get_best_hyperparameters(num_trials=1)[0]
print(f"""
The hyperparameter search is complete.
Unit1
------
Neurons: {best_hps.get('units1')}
l2: {best_hps.get('regulizer1')}
Dropout: {best_hps.get('dropout1')}
Unit2
------
Neurons: {best_hps.get('units2')}
l2: {best_hps.get('regulizer2')}
Dropout: {best_hps.get('dropout2')}
Learning rate: {best_hps.get('learning_rate')}.
""")
Trial 90 Complete [00h 00m 02s]
val_accuracy: 0.5227272510528564
Best val_accuracy So Far: 0.9943181872367859
Total elapsed time: 00h 06m 06s
INFO:tensorflow:Oracle triggered exit
The hyperparameter search is complete.
Unit1
------
Neurons: 256
l2: 0.0
Dropout: 0.0
Unit2
------
Neurons: 64
l2: 0.001
Dropout: 0.0
Learning rate: 0.01.
%%capture
# Build the model with the optimal hyperparameters and train it on the full train data for 100 epochs
model = tuner.hypermodel.build(best_hps)
history = model.fit(X, y, epochs=100, validation_split=0.2)

val_acc_per_epoch = history.history['val_accuracy']
best_epoch = val_acc_per_epoch.index(max(val_acc_per_epoch)) + 1
print('Best epoch: %d' % (best_epoch,))
Best epoch: 17
%%capture
hypermodel = tuner.hypermodel.build(best_hps)

# Retrain the model
hypermodel.fit(X, y, epochs=best_epoch, validation_split=0.2)
eval_result = hypermodel.evaluate(X_test, y_test)
print("[test loss, test accuracy]:", eval_result)
7/7 [==============================] - 0s 2ms/step - loss: 0.1277 - accuracy: 0.9636
[test loss, test accuracy]: [0.12773045897483826, 0.9636363387107849]
hypermodel_preds = hypermodel.predict(test_df)
14/14 [==============================] - 0s 1ms/step
hypermodel_preds_clean = [1 if i > 0.5 else 0 for i in hypermodel_preds]
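The notebook stops here; a plausible final step (an assumption on my part, the original does not show it) would be writing the tuned model's thresholded predictions to a second submission file. The names output_tuned and submission_tuned.csv are hypothetical:

#Hypothetical final step - output_tuned and submission_tuned.csv are assumed names
output_tuned = pd.DataFrame({'PassengerId': test.PassengerId,
                             'Survived': hypermodel_preds_clean})
output_tuned.to_csv('submission_tuned.csv', index=False)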