The classic data analysis tutorial! This is the Kaggle Titanic dataset, which contains information on the passengers of the Titanic, including survival, age, cabin, fare, and so on. Our job is to determine how best to use the data to predict whether or not a passenger survived.

So let's begin!

import pandas as pd
import matplotlib.pyplot as plt
from scipy.stats import norm
import numpy as np
import seaborn as sns
sns.set(style="ticks", color_codes=True)
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.grid_search import GridSearchCV
from sklearn import cross_validation
from sklearn import preprocessing




%matplotlib inline

As always, we need to import the necessary modules - pandas for the dataframe, pyplot and seaborn to plot various data, and the rest either to build models or to glean information from those models. Next we'll load the data and take a look at it.

# Load the Kaggle training split from the current working directory.
train_data = pd.read_csv(filepath_or_buffer='./train.csv')

# Preview the first five passengers.
train_data.head(n=5)

The variables are defined as follows on the Kaggle website:

Data Dictionary

Variable Definition Key

survival Survival 0 = No, 1 = Yes

pclass Ticket class 1 = 1st, 2 = 2nd, 3 = 3rd

sex Sex
Age Age in years
sibsp # of siblings / spouses aboard the Titanic
parch # of parents / children aboard the Titanic
ticket Ticket number
fare Passenger fare
cabin Cabin number
embarked Port of Embarkation C = Cherbourg, Q = Queenstown, S = Southampton

Variable Notes

pclass: A proxy for socio-economic status (SES) 1st = Upper 2nd = Middle 3rd = Lower

age: Age is fractional if less than 1. If the age is estimated, it is in the form of xx.5

# Summary statistics for the dataframe's columns.
train_data.describe()

# Overall survival rate: the mean of the Survived column.
train_data['Survived'].mean()

0.3838383838383838

The dataframe description shows some interesting information: 1.) we'll need to fill in the missing ages in some way; 2.) there is a large disparity in the fares (which we'll plot later); and 3.) the survival rate is particularly low. I placed the survival rate in a second cell by itself for reference later on.

The cell below also shows that we have a lot of missing data in the Cabin column, which we'll consequently remove at a later point.

# Count the missing entries of every column in one pass, then report each
# column name followed by its count.
missing_per_column = train_data.isnull().sum()
for col in train_data:
    print(col)
    print(missing_per_column[col])

PassengerId
0
Survived
0
Pclass
0
Name
0
Sex
0
Age
177
SibSp
0
Parch
0
Ticket
0
Fare
0
Cabin
687
Embarked
2

import numpy as np
import seaborn as sns
# Apply the seaborn theme in ONE call: each sns.set() call resets every
# argument it is not given back to its default, so a separate
# sns.set(font_scale=...) afterwards would silently undo style='ticks'.
sns.set(style='ticks', font_scale=1.25)
# Set after the theme call so the theme cannot overwrite it.
plt.rcParams["patch.force_edgecolor"] = True

# Known ages of every female passenger (survivors and non-survivors alike).
x = train_data[train_data['Sex'] == 'female'].Age.dropna()

# Thin box plot on top, histogram underneath, sharing one age axis.
f, (ax_box, ax_hist) = plt.subplots(2, figsize=(12, 10), sharex=True, gridspec_kw={"height_ratios": (.15, .85)})

# Pass the series by keyword so it is always treated as the x variable.
sns.boxplot(x=x, ax=ax_box)
sns.distplot(x, ax=ax_hist, bins=20)

ax_box.set(yticks=[])
sns.despine(ax=ax_hist)
sns.despine(ax=ax_box, left=True)
# Ages cannot be negative; the shared x-axis applies this to both panels.
# (The `sns.plt` alias used previously no longer exists in seaborn.)
ax_hist.set_xlim(left=0)
# The data is not filtered by survival, so the title must not claim it is.
ax_box.set(title='Age Distribution of Females on the Titanic')
plt.show()

# Apply the seaborn theme in ONE call: each sns.set() call resets every
# argument it is not given back to its default, so a separate
# sns.set(font_scale=...) afterwards would silently undo style='ticks'.
sns.set(style='ticks', font_scale=1.25)

# Known ages of every male passenger.
x = train_data[train_data['Sex'] == 'male'].Age.dropna()

# Thin box plot on top, histogram underneath, sharing one age axis.
f, (ax_box, ax_hist) = plt.subplots(2, figsize=(12, 10), sharex=True, gridspec_kw={"height_ratios": (.15, .85)})

# Pass the series by keyword so it is always treated as the x variable.
sns.boxplot(x=x, ax=ax_box)
sns.distplot(x, ax=ax_hist, bins=20)

ax_box.set(yticks=[])
sns.despine(ax=ax_hist)
sns.despine(ax=ax_box, left=True)
# Ages cannot be negative; the shared x-axis applies this to both panels.
# (The `sns.plt` alias used previously no longer exists in seaborn.)
ax_hist.set_xlim(left=0)
ax_box.set(title='Age Distribution of Males on the Titanic')
plt.show()

# Style and font scale must go in the same sns.set() call: a later
# sns.set(font_scale=...) on its own resets the style to seaborn's default
# and discards style='white'.
sns.set(style='white', font_scale=1.25)
# Set after the theme call so the theme cannot overwrite it.
plt.rcParams["patch.force_edgecolor"] = True
# Scatter of fare against age with marginal histograms.
# NOTE(review): newer seaborn releases name the `size` argument `height`;
# rename it if this cell warns or fails on the installed version.
sns.jointplot(x=train_data['Age'], y=train_data['Fare'], size=8)
plt.show()

# Boolean mask of the passengers whose age is missing; used later to pick
# the rows that need an imputed age.
na_age = train_data['Age'].isnull()

plt.style.use('ggplot')

# Fares split by outcome: data1 = did not survive, data2 = survived.
data1, data2 = (
    train_data.loc[train_data['Survived'] == outcome, 'Fare']
    for outcome in (0, 1)
)
data = [data1, data2]
labels = ['Survive = No', 'Survive = Yes']

ax = sns.boxplot(data=data, width=.5)
ax.set_xticklabels(labels)
# Cap the y-axis at 300 so the boxes stay readable.
ax.set_ylim(0, 300)
ax.set_title('Differences in Fare based on Survival')

plt.show()

<a list of 2 Text xticklabel objects>

plt.style.use('ggplot')

# Known ages per ticket class, in class order 1, 2, 3.
age_data1, age_data2, age_data3 = (
    train_data.loc[train_data['Pclass'] == ticket_class, 'Age'].dropna()
    for ticket_class in (1, 2, 3)
)
box_age = [age_data1, age_data2, age_data3]

ax = sns.boxplot(data=box_age)
# The boxes are positioned 0..2; relabel them with the class numbers.
ax.set_xticklabels([1, 2, 3])

ax.set_title('Age Distribution by Passenger Class')
ax.set_xlabel('Passenger Class')
ax.set_ylabel('Age')
plt.show()

def normal_age(pclass, data=None):
    """Draw one random, strictly positive age for a passenger of class ``pclass``.

    The age is sampled from a normal distribution whose mean and standard
    deviation are those of the known (non-null) ages of the passengers in
    that class.  Non-positive draws are rejected and redrawn.

    Args:
        pclass: Value of the ``Pclass`` column to condition on.
        data: DataFrame with ``Pclass`` and ``Age`` columns.  Defaults to the
            module-level ``train_data``.

    Returns:
        A positive age.  When the class has a single known age (or all known
        ages are identical) that age is returned unchanged, since there is
        no spread to sample from.

    Raises:
        ValueError: If the class has no known ages, or if its ages have no
            spread and a non-positive mean (a positive draw is impossible).
    """
    if data is None:
        data = train_data

    ages = data.loc[data['Pclass'] == pclass, 'Age'].dropna()
    if ages.empty:
        raise ValueError('no known ages for Pclass %r' % (pclass,))

    # Computed once: these do not change between rejected draws.
    mean_age = ages.mean()
    std_age = ages.std()

    # A single observation gives a NaN std; sampling with it would yield NaN
    # forever and the rejection loop below would never terminate.
    if not np.isfinite(std_age) or std_age <= 0:
        if mean_age <= 0:
            raise ValueError('cannot draw a positive age for Pclass %r' % (pclass,))
        return mean_age

    while True:
        age = np.random.normal(loc=mean_age, scale=std_age)
        if age > 0:
            return age
    
# Fill every missing age with a random draw from the age distribution of the
# passenger's own ticket class.
# Pclass holds the integers 1/2/3, so it is passed straight through.  The
# earlier comparison against the strings '1' and '2' never matched, which
# sent every passenger down the 3rd-class branch.
for index, row in train_data[na_age].iterrows():
    train_data.loc[index, 'Age'] = normal_age(int(row['Pclass']))

# Ages per ticket class after the missing values have been filled in.
age_data1_updated = train_data[train_data['Pclass'] == 1].Age.dropna()
age_data2_updated = train_data[train_data['Pclass'] == 2].Age.dropna()
age_data3_updated = train_data[train_data['Pclass'] == 3].Age.dropna()

box_age_updated = [age_data1_updated, age_data2_updated, age_data3_updated]

# Capture the axes of THIS plot.  Without the assignment, `ax` still refers
# to the axes of the earlier figure and the tick labels land on the wrong plot.
ax = sns.boxplot(data=box_age_updated)
ax.set_xticklabels([1, 2, 3])

plt.title('Age Distribution by Passenger Class (Updated NaN values)')
plt.xlabel('Passenger Class')
plt.ylabel('Age')
plt.show()

# Head-counts by sex (rows) and survival flag (columns).
sextab = train_data.pivot_table(index='Sex', columns='Survived', aggfunc='size')
sextab

# Rebuild the counts, then divide each row by its total so the stacked bars
# show the share of each outcome within a sex.
sextab = train_data.pivot_table(index='Sex', columns='Survived', aggfunc='size')
sex_sum = sextab.sum(axis=1)
sextab = sextab.divide(sex_sum, axis='index')
sextab.plot(kind='barh', stacked=True)

<matplotlib.axes._subplots.AxesSubplot at 0x13a0c588>

# Head-counts per ticket class, broken down by survival flag and sex.
class_tab = train_data.pivot_table(index='Pclass', columns=['Survived', 'Sex'], aggfunc='size')
class_tab

# Head-counts per ticket class and port of embarkation.
embarktab = train_data.pivot_table(index='Pclass', columns='Embarked', aggfunc='size')
embarktab

# Show the passengers whose port of embarkation is missing.
train_data.loc[train_data['Embarked'].isnull()]

# Drop the passengers with no recorded port of embarkation.  Selecting them
# by condition rather than by hard-coded index labels keeps the cell correct
# if the data changes and makes it safe to run more than once.
train_data.dropna(subset=['Embarked'], inplace=True)

# Family size aboard: parents/children plus siblings/spouses.
train_data['Family'] = train_data['Parch'].add(train_data['SibSp'])

# Indicator columns for the categorical features; the first level of each is
# dropped so the remaining indicators are not redundant.
sex = pd.get_dummies(data=train_data['Sex'], drop_first=True)
embarked = pd.get_dummies(data=train_data['Embarked'], drop_first=True)

# Cabin is mostly empty, so it is removed.  errors='ignore' makes the cell
# safe to re-run: without it a second execution raises because the column
# is already gone.
train_data = train_data.drop(['Cabin'], axis=1, errors='ignore')

---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-106-65cb62c7d16b> in <module>()
----> 1 train_data = train_data.drop(['Cabin'],axis=1,)

C:\Users\Zaphikel\Anaconda3\lib\site-packages\pandas\core\generic.py in drop(self, labels, axis, level, inplace, errors)
   2159                 new_axis = axis.drop(labels, level=level, errors=errors)
   2160             else:
-> 2161                 new_axis = axis.drop(labels, errors=errors)
   2162             dropped = self.reindex(**{axis_name: new_axis})
   2163             try:

C:\Users\Zaphikel\Anaconda3\lib\site-packages\pandas\core\indexes\base.py in drop(self, labels, errors)
   3622             if errors != 'ignore':
   3623                 raise ValueError('labels %s not contained in axis' %
-> 3624                                  labels[mask])
   3625             indexer = indexer[~mask]
   3626         return self.delete(indexer)

ValueError: labels ['Cabin'] not contained in axis

# Build the modelling frame from scratch: the numeric columns kept from
# train_data plus the sex and embarkation indicator columns.  Starting from
# train_data (rather than concatenating onto an existing train_data_model)
# lets the notebook run top to bottom and keeps the cell from appending
# duplicate indicator columns when it is executed again.
# NOTE(review): the base column list is reconstructed from the displayed
# output of this cell - confirm it matches the intended feature set.
model_base_cols = ['PassengerId', 'Survived', 'Pclass', 'Age', 'Fare', 'Family']
train_data_model = pd.concat([train_data[model_base_cols], sex, embarked], axis=1)
train_data_model.head()

# Response vector: the survival flag.
target = train_data_model['Survived']

# Columns fed to the models.
train_col = [
    'Pclass', 'Age', 'Fare', 'Family',
    'male', 'Q', 'S',
]

# Standardize the feature columns; fitting and transforming are done as two
# explicit steps on the same frame.
scaler = StandardScaler()
scaler.fit(train_data_model[train_col])
features = scaler.transform(train_data_model[train_col])

# Logistic regression with default settings, fitted on the full training set.
model = LogisticRegression()
model.fit(X=features, y=target)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

# 5-fold cross-validated score of the logistic regression, one per fold.
# NOTE(review): `features` was standardized on all rows before this split,
# so the held-out folds influenced the scaling - scores may be slightly
# optimistic.
score = cross_validation.cross_val_score(estimator=model, X=features, y=target, cv=5)
print(score)

[ 0.76966292  0.78651685  0.78651685  0.78651685  0.81355932]

# Random forest of 300 trees limited to depth 5, fitted on the full training
# set and then scored with the same 5-fold cross-validation.
model_rf = RandomForestClassifier(max_depth=5, n_estimators=300)
model_rf.fit(X=features, y=target)
score_rf = cross_validation.cross_val_score(estimator=model_rf, X=features, y=target, cv=5)
print(score_rf)

[ 0.79213483  0.82022472  0.86516854  0.78651685  0.83615819]

# Class predictions and class probabilities of the logistic regression on
# the training features.
predict_array = model.predict(features)
prob_array = model.predict_proba(features)

# One row per passenger, one column per class probability.  A frame built
# from an array already carries a fresh 0..n-1 index, so no reset or reindex
# is needed (the earlier calls discarded their results and had no effect).
prob_df = pd.DataFrame(prob_array, columns=['No_Survive', 'Survive'])

# Survival probabilities in ascending order, and the row positions that
# produce that order (used to line the actual outcomes up with them).
prob_values = prob_df.sort_values('Survive').Survive
prob_index = np.argsort(prob_df['Survive'])
prob_index

0      200
1      323
2      158
3      844
4       13
5      359
6      849
7      175
8      682
9      325
10     115
11      59
12      93
13     479
14     151
15     858
16     279
17     152
18     782
19     103
20     181
21     630
22     405
23     221
24     159
25     265
26     481
27     596
28     385
29     196
      ... 
859    840
860    456
861    296
862    741
863    374
864    217
865    729
866    308
867    503
868    333
869    707
870    324
871    310
872    668
873    379
874    536
875    392
876    709
877    688
878    328
879    290
880     31
881    368
882    640
883    309
884    715
885    306
886    305
887    699
888    257
Name: Survive, Length: 889, dtype: int64

# 1-based rank of each passenger in the probability ordering (plot x-axis).
train_index = range(1, len(prob_values) + 1)

# Number of passengers the model predicts did NOT survive: all predictions
# minus the predicted survivors.
predict_array.size - predict_array.sum()

574

# Actual survival flags, reordered by ascending predicted survival probability.
train_data_model['Survived'].iloc[prob_index]

201    0
324    0
159    0
846    0
13     0
360    0
851    0
176    0
683    0
326    0
116    0
59     0
94     0
480    0
152    0
860    0
280    0
153    0
783    0
104    0
182    0
631    0
406    0
222    0
160    0
266    0
482    0
597    0
386    0
197    0
      ..
842    1
457    1
297    0
742    1
375    1
218    1
730    1
309    1
504    1
334    1
708    1
325    1
311    1
669    1
380    1
537    1
393    1
710    1
689    1
329    1
291    1
31     1
369    1
641    1
310    1
716    1
307    1
306    1
700    1
258    1
Name: Survived, Length: 889, dtype: int64

# Number of passengers the model predicts survived.
np.sum(predict_array)

315

with plt.style.context('ggplot'):
    plt.style.reload_library()
    plt.style.use('seaborn-muted')
    # Predicted survival probability of each passenger, in ascending order.
    plt.scatter(x=train_index, y=prob_values, label='Probability of Survival')
    # The red crosses are the OBSERVED outcomes (the Survived column) in the
    # same order, so they are labelled as actual, not predicted, survival.
    plt.scatter(x=train_index, y=train_data_model.iloc[prob_index].Survived, color='red', marker='x', label='Actual Survival')
    # axhline's xmin/xmax are fractions of the axes width (0-1), not data
    # coordinates; the defaults already span the full width.
    plt.axhline(y=train_data.Survived.mean(), color='black', alpha=.6, linestyle='dashdot', label='Mean Survival')
    # Put the cutoff between the last passenger predicted to die and the
    # first predicted to survive, derived from the predictions themselves
    # instead of a hard-coded position.
    n_predicted_dead = len(predict_array) - predict_array.sum()
    plt.axvline(x=n_predicted_dead + 0.5, alpha=.6, linestyle='dashed', label='Cutoff for Survival')
    # Zoom in on the region around the cutoff.
    plt.xlim(400, 700)
    plt.ylim(-.1, 1.1)
    plt.xlabel('Passengers')
    plt.ylabel('Probability of Survival')
    plt.legend(loc='center left', bbox_to_anchor=(1.0, 0.5))
    plt.title('Logistic Regression Plot of Titanic Survival: \n Ordered by Probability of Survival')

	PassengerId	Survived	Pclass	Name	Sex	Age	SibSp	Ticket	Fare	Cabin	Embarked
0	1	0	3	Braund, Mr. Owen Harris	male	22.0	1	A/5 21171	7.2500	NaN	S
1	2	1	1	Cumings, Mrs. John Bradley (Florence Briggs Th...	female	38.0	1	PC 17599	71.2833	C85	C
2	3	1	3	Heikkinen, Miss. Laina	female	26.0	0	STON/O2. 3101282	7.9250	NaN	S
3	4	1	1	Futrelle, Mrs. Jacques Heath (Lily May Peel)	female	35.0	1	113803	53.1000	C123	S
4	5	0	3	Allen, Mr. William Henry	male	35.0	0	373450	8.0500	NaN	S

	PassengerId	Survived	Pclass	Age	SibSp	Parch	Fare
count	891.000000	891.000000	891.000000	714.000000	891.000000	891.000000	891.000000
mean	446.000000	0.383838	2.308642	29.699118	0.523008	0.381594	32.204208
std	257.353842	0.486592	0.836071	14.526497	1.102743	0.806057	49.693429
min	1.000000	0.000000	1.000000	0.420000	0.000000	0.000000	0.000000
25%	223.500000	0.000000	2.000000	20.125000	0.000000	0.000000	7.910400
50%	446.000000	0.000000	3.000000	28.000000	0.000000	0.000000	14.454200
75%	668.500000	1.000000	3.000000	38.000000	1.000000	0.000000	31.000000
max	891.000000	1.000000	3.000000	80.000000	8.000000	6.000000	512.329200

	PassengerId	Survived	Pclass	Age	Fare	Family	male	S
0	1	0	3	22.0	7.2500	1	1	1
1	2	1	1	38.0	71.2833	1	0	0
2	3	1	3	26.0	7.9250	0	0	1
3	4	1	1	35.0	53.1000	1	0	1
4	5	0	3	35.0	8.0500	0	1	1

	PassengerId	Survived	Pclass	Name	Sex	Age	SibSp	Parch	Ticket	Fare	Cabin	Embarked
61	62	1	1	Icard, Miss. Amelie	female	38.0	0	0	113572	80.0	B28	NaN
829	830	1	1	Stone, Mrs. George Nelson (Martha Evelyn)	female	62.0	0	0	113572	80.0	B28	NaN

Embarked	C	Q	S
Pclass
1	85	2	127
2	17	3	164
3	66	72	353