import pandas as pd
import matplotlib.pyplot as plt
from scipy.stats import norm
import numpy as np
import seaborn as sns
sns.set(style="ticks", color_codes=True)
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.grid_search import GridSearchCV
from sklearn import cross_validation
from sklearn import preprocessing
%matplotlib inline
# Read the training CSV from the working directory and preview its first rows.
train_data = pd.read_csv(filepath_or_buffer='./train.csv')
train_data.head(n=5)
Variable Definition Key
survival Survival 0 = No, 1 = Yes
pclass Ticket class 1 = 1st, 2 = 2nd, 3 = 3rd
sex Sex
Age Age in years
sibsp # of siblings / spouses aboard the Titanic
parch # of parents / children aboard the Titanic
ticket Ticket number
fare Passenger fare
cabin Cabin number
embarked Port of Embarkation C = Cherbourg, Q = Queenstown, S = Southampton
Variable Notes
pclass: A proxy for socio-economic status (SES): 1st = Upper, 2nd = Middle, 3rd = Lower.
age: Age is fractional if less than 1. If the age is estimated, it is in the form of xx.5.
# Summary statistics for the frame, then the mean of the Survived column.
train_data.describe()
train_data['Survived'].mean()

# For every column, print its name followed by its number of missing values.
for column_name in train_data.columns:
    missing_count = train_data[column_name].isnull().sum()
    print(column_name)
    print(missing_count)
import numpy as np
import seaborn as sns
# Age distribution of female passengers: a thin box plot stacked on top of a
# histogram, both sharing the same x-axis.
#
# Style and font scale are set in ONE call: a separate sns.set(font_scale=...)
# falls back to the default style ('darkgrid') and would discard 'ticks'.
sns.set(style='ticks', font_scale=1.25)
# Applied after sns.set so the seaborn theme cannot overwrite it.
plt.rcParams["patch.force_edgecolor"] = True

x = train_data[train_data['Sex'] == 'female'].Age.dropna()
f, (ax_box, ax_hist) = plt.subplots(
    2,
    figsize=(12, 10),
    sharex=True,
    gridspec_kw={"height_ratios": (.15, .85)},
)
# Pass the series by keyword so it is always treated as the x variable.
sns.boxplot(x=x, ax=ax_box)
sns.distplot(x, ax=ax_hist, bins=20)
ax_box.set(yticks=[])
sns.despine(ax=ax_hist)
sns.despine(ax=ax_box, left=True)
# `sns.plt` is no longer exposed by seaborn; set the limit on the axes
# directly (the x-axis is shared, so this applies to both panels).
ax_hist.set_xlim(left=0)
# The data is every female passenger, not only survivors, so the title must
# not claim otherwise.
ax_box.set(title='Age Distribution of Females on the Titanic')
plt.show()
# Age distribution of male passengers: a thin box plot stacked on top of a
# histogram, both sharing the same x-axis.
#
# Style and font scale are set in ONE call: a separate sns.set(font_scale=...)
# falls back to the default style ('darkgrid') and would discard 'ticks'.
sns.set(style='ticks', font_scale=1.25)

x = train_data[train_data['Sex'] == 'male'].Age.dropna()
f, (ax_box, ax_hist) = plt.subplots(
    2,
    figsize=(12, 10),
    sharex=True,
    gridspec_kw={"height_ratios": (.15, .85)},
)
# Pass the series by keyword so it is always treated as the x variable.
sns.boxplot(x=x, ax=ax_box)
sns.distplot(x, ax=ax_hist, bins=20)
ax_box.set(yticks=[])
sns.despine(ax=ax_hist)
sns.despine(ax=ax_box, left=True)
# `sns.plt` is no longer exposed by seaborn; set the limit on the axes
# directly (the x-axis is shared, so this applies to both panels).
ax_hist.set_xlim(left=0)
ax_box.set(title='Age Distribution of Males on the Titanic')
plt.show()
# Joint distribution of Age against Fare.
#
# Style and font scale are set in ONE call: a separate sns.set(font_scale=...)
# falls back to the default style ('darkgrid') and would discard 'white'.
sns.set(style='white', font_scale=1.25)
# Applied after sns.set so the seaborn theme cannot overwrite it.
plt.rcParams["patch.force_edgecolor"] = True
# NOTE(review): seaborn 0.9 renamed `size` to `height`; switch the keyword
# when the pinned seaborn version is upgraded.
sns.jointplot(x=train_data['Age'], y=train_data['Fare'], size=8)
plt.show()
# Boolean mask marking the rows whose Age is missing.
na_age = train_data['Age'].isnull()

# Side-by-side box plots of Fare, split by the Survived flag.
plt.style.use('ggplot')
data1 = train_data.loc[train_data['Survived'] == 0, 'Fare']
data2 = train_data.loc[train_data['Survived'] == 1, 'Fare']
data = [data1, data2]
labels = ['Survive = No', 'Survive = Yes']
ax = sns.boxplot(data=data, width=.5)
ax.set_xticklabels(labels)
# Cap the y-axis at 300 so the bulk of the distribution stays readable.
ax.set_ylim(0, 300)
ax.set_title('Differences in Fare based on Survival')
plt.show()
# Box plots of the known (non-missing) ages, one box per passenger class.
plt.style.use('ggplot')
box_age = [
    train_data.loc[train_data['Pclass'] == pclass_value, 'Age'].dropna()
    for pclass_value in (1, 2, 3)
]
# Keep the per-class series available under their individual names as well.
age_data1, age_data2, age_data3 = box_age
ax = sns.boxplot(data=box_age)
ax.set_xticklabels([1, 2, 3])
ax.set_title('Age Distribution by Passenger Class')
ax.set_xlabel('Passenger Class')
ax.set_ylabel('Age')
plt.show()
def normal_age(pclass):
    """Draw a random, strictly positive age for a passenger of class *pclass*.

    The age is sampled from a normal distribution whose mean and standard
    deviation are those of the non-missing ``Age`` values of the rows in the
    module-level ``train_data`` with ``Pclass == pclass``. Non-positive draws
    are rejected and re-sampled.

    Args:
        pclass: Passenger-class value to match against ``train_data['Pclass']``.

    Returns:
        A float greater than zero.

    Raises:
        ValueError: If the class has too few known ages to estimate a mean
            and standard deviation (sampling would otherwise never end).
    """
    class_ages = train_data[train_data['Pclass'] == pclass].Age.dropna()
    # The distribution parameters do not change between draws, so compute
    # them once instead of on every iteration of the rejection loop.
    mean_age = class_ages.mean(axis=0)
    std_age = class_ages.std(axis=0)
    if not (np.isfinite(mean_age) and np.isfinite(std_age)):
        # A NaN parameter makes every draw NaN, and NaN > 0 is always False.
        raise ValueError(
            'not enough known ages for Pclass {!r} to sample from'.format(pclass)
        )
    while True:
        age = np.random.normal(loc=mean_age, scale=std_age)
        if age > 0:
            return age
# Fill every missing Age with a random draw from the age distribution of the
# passenger's own class.
#
# Pclass is numeric (it is compared against 1/2/3 elsewhere in this file), so
# it is handed to normal_age unchanged. Comparing it with the strings '1' and
# '2' never matched, which imputed every passenger from class 3.
missing_pclass = train_data.loc[na_age, 'Pclass']
for index, pclass in zip(missing_pclass.index, missing_pclass):
    train_data.loc[index, 'Age'] = normal_age(pclass)
# Re-draw the age-by-class box plots now that missing ages have been filled.
age_data1_updated = train_data[train_data['Pclass'] == 1].Age.dropna()
age_data2_updated = train_data[train_data['Pclass'] == 2].Age.dropna()
age_data3_updated = train_data[train_data['Pclass'] == 3].Age.dropna()
box_age_updated = [age_data1_updated, age_data2_updated, age_data3_updated]
# Capture the axes of THIS plot; relabelling through a previously assigned
# `ax` would target the axes of an earlier, already-shown figure.
ax = sns.boxplot(data=box_age_updated)
ax.set_xticklabels([1, 2, 3])
plt.title('Age Distribution by Passenger Class (Updated NaN values)')
plt.xlabel('Passenger Class')
plt.ylabel('Age')
plt.show()
# Passenger counts cross-tabulated by Sex (rows) and Survived (columns).
sextab = train_data.pivot_table(index='Sex', columns='Survived', aggfunc='size')
sextab

# Turn each row into proportions of that sex's total, then draw the shares
# as a stacked horizontal bar chart.
sex_sum = sextab.sum(axis=1)
sextab = sextab.div(sex_sum, axis=0)
sextab.plot(kind='barh', stacked=True)
# Passenger counts per class, broken down by survival outcome and sex.
class_tab = train_data.pivot_table(
    index='Pclass',
    columns=['Survived', 'Sex'],
    aggfunc='size',
)
class_tab

# Passenger counts per class, broken down by port of embarkation.
embarktab = train_data.pivot_table(
    index='Pclass',
    columns='Embarked',
    aggfunc='size',
)
embarktab

# Rows that have no recorded port of embarkation.
train_data.loc[train_data['Embarked'].isnull()]
# Build the modelling frame: drop unusable rows/columns, add a family-size
# feature and one-hot encode the categorical columns.

# Drop the rows with no recorded port of embarkation. They are selected by
# mask rather than by hard-coded row labels so the step does not depend on
# the row numbering of one particular CSV.
# NOTE(review): presumably these are exactly the rows the former
# drop([61, 829]) targeted - confirm against the data.
train_data.drop(train_data.index[train_data['Embarked'].isnull()], inplace=True)

# Family size = parents/children aboard + siblings/spouses aboard.
train_data['Family'] = train_data['Parch'] + train_data['SibSp']

# drop_first=True leaves one indicator fewer than the number of categories.
sex = pd.get_dummies(train_data['Sex'], drop_first=True)
embarked = pd.get_dummies(train_data['Embarked'], drop_first=True)

train_data = train_data.drop(['Cabin'], axis=1)

# Start from train_data itself: `train_data_model` does not exist before this
# statement, so concatenating it with itself raised a NameError.
train_data_model = pd.concat([train_data, sex, embarked], axis=1)
train_data_model.head()
# Predictor columns and response used by both classifiers.
train_col = ['Pclass', 'Age', 'Fare', 'Family', 'male', 'Q', 'S']
target = train_data_model['Survived']

# Standardise the predictors (zero mean, unit variance per column).
scaler = StandardScaler()
features = scaler.fit_transform(train_data_model[train_col])

# Logistic regression: fit on all rows, then report 5-fold CV scores.
# NOTE(review): sklearn.cross_validation was removed in scikit-learn 0.20;
# sklearn.model_selection.cross_val_score is its replacement.
model = LogisticRegression()
model.fit(features, target)
score = cross_validation.cross_val_score(
    estimator=model, X=features, y=target, cv=5
)
print(score)

# Random forest: same procedure with 300 trees of depth at most 5.
model_rf = RandomForestClassifier(max_depth=5, n_estimators=300)
model_rf.fit(features, target)
score_rf = cross_validation.cross_val_score(
    estimator=model_rf, X=features, y=target, cv=5
)
print(score_rf)
# Class predictions and class probabilities of the logistic model on the
# training features.
predict_array = model.predict(features)
prob_array = model.predict_proba(features)

# Name the columns at construction time. The former reset_index()/reindex()
# calls discarded their results and therefore had no effect on prob_df.
prob_df = pd.DataFrame(prob_array, columns=['No_Survive', 'Survive'])

# Survival probabilities in ascending order, and the row positions that
# produce that ordering.
prob_values = prob_df.sort_values('Survive').Survive
prob_index = np.argsort(prob_df['Survive'])
prob_index

# 1-based rank of each passenger along the sorted-probability axis.
train_index = range(1, len(prob_values) + 1)

# Number of passengers predicted as class 0.
len(predict_array) - predict_array.sum()
# Recorded outcomes re-ordered by ascending predicted probability.
train_data_model.iloc[prob_index].Survived
# Number of passengers predicted as class 1.
predict_array.sum()
# Passengers ranked by predicted survival probability, with their recorded
# outcome overlaid, the overall survival rate and the predicted-class cutoff.
with plt.style.context('ggplot'):
    plt.style.reload_library()
    plt.style.use('seaborn-muted')
    plt.scatter(x=train_index, y=prob_values, label='Probability of Survival')
    # These markers are the recorded Survived values (0/1), not model
    # predictions, so they are labelled as the actual outcome.
    plt.scatter(
        x=train_index,
        y=train_data_model.iloc[prob_index].Survived,
        color='red',
        marker='x',
        label='Actual Survival',
    )
    # xmin/xmax of axhline are axes fractions in [0, 1]; the defaults already
    # span the full width, so no data-coordinate value such as 900 is passed.
    plt.axhline(
        y=train_data.Survived.mean(),
        color='black',
        alpha=.6,
        linestyle='dashdot',
        label='Mean Survival',
    )
    # Rank of the last passenger predicted as class 0. Computed from the
    # predictions because the imputed ages are random, so a fixed rank is
    # only right for one particular run.
    # NOTE(review): assumes the former constant 576 was this count - confirm.
    survival_cutoff = len(predict_array) - predict_array.sum()
    plt.axvline(
        x=survival_cutoff,
        alpha=.6,
        linestyle='dashed',
        label='Cutoff for Survival',
    )
    plt.xlim(400, 700)
    plt.ylim(-.1, 1.1)
    plt.xlabel('Passengers')
    plt.ylabel('Probability of Survival')
    plt.legend(loc='center left', bbox_to_anchor=(1.0, 0.5))
    plt.title('Logistic Regression Plot of Titanic Survival: \n Ordered by Probability of Survival')