Hi there,
I’m trying to get my first neural network to solve the Titanic challenge on Kaggle and despite the multiple efforts on trying different features, one-hot encoding, regularizer values, learning rates, epochs, and network sizes, I can’t get it to be much better than ~46% of classification error on the cross-validation dataset.
I’m running out of ideas and would love to get some insights and what I could try.
This is my script to prepare the features:
def cleanup_dataset(dataset):
# Cabin
dataset = dataset.drop(['Cabin'], axis=1)
# Embarked
dataset = dataset.fillna({'Embarked': 'S'})
# Age group
dataset['Age'] = dataset['Age'].fillna(-0.5)
bins = [-1, 0, 5, 12, 18, 24, 35, 60, np.inf]
labels = ['Unknown', 'Baby', 'Child', 'Teenager', 'Student', 'Young Adult', 'Adult', 'Senior']
dataset['AgeGroup'] = pd.cut(dataset['Age'], bins, labels=labels)
# Titles
dataset['Title'] = dataset.Name.str.extract(' ([A-Za-z]+)\.', expand=False)
dataset['Title'] = dataset['Title'].replace(['Lady', 'Capt', 'Col', 'Don', 'Dr', 'Major', 'Rev', 'Jonkheer', 'Dona'], 'Rare')
dataset['Title'] = dataset['Title'].replace(['Countess', 'Lady', 'Sir'], 'Royal')
dataset['Title'] = dataset['Title'].replace('Mlle', 'Miss')
dataset['Title'] = dataset['Title'].replace('Ms', 'Miss')
dataset['Title'] = dataset['Title'].replace('Mme', 'Mrs')
dataset['Title'] = dataset['Title'].fillna('Unknown')
# Age
# Most common age per title
age_title_mapping = {'Mr': 'Young Adult', 'Miss': 'Young Adult', 'Mrs': 'Adult', 'Master': 'Adult', 'Royal': 'Adult', 'Rare': 'Adult'}
for x in range(1, len(dataset['AgeGroup'])):
if dataset['AgeGroup'][x] == 'Unknown':
dataset['AgeGroup'][x] = age_title_mapping[dataset['Title'][x]]
dataset = dataset.drop(['Age'], axis=1)
# Fare
for x in range(1, len(dataset['Fare'])):
if pd.isnull(dataset['Fare'][x]):
pclass = dataset['Pclass'][x]
dataset['Fare'][x] = round(
dataset[dataset['Pclass'] == pclass]['Fare'].median(), 4)
dataset['FareBand'] = pd.qcut(dataset['Fare'], 3, labels=['Low', 'Median', 'High'])
dataset = dataset.drop(['Fare'], axis=1)
# Family Size
dataset['FamilySize'] = dataset['SibSp'] + dataset['Parch'] + 1
# Cleanup
dataset = dataset.drop(['Ticket', 'Name'], axis=1)
dataset = pd.get_dummies(
dataset,
columns = ['Sex','Title','AgeGroup','Embarked','FareBand'],
prefix = ['Sex','Title','AgeGroup','Embarked','FareBand']
)
return dataset
Splitting the datasets:
train = pd.read_csv('/kaggle/input/titanic/train.csv')
x_train, x_cv, y_train, y_cv = train_test_split(
cleanup_dataset(train).drop(['Survived', 'PassengerId'], axis=1),
cleanup_dataset(train)['Survived'],
test_size=0.20,
random_state=1
)
Trying different architectures:
def build_models(regularizer):
tf.random.set_seed(10)
model_1 = Sequential(
[
Dense(32, activation = 'relu', kernel_regularizer=L2(regularizer)),
Dense(16, activation = 'relu', kernel_regularizer=L2(regularizer)),
Dense(1, activation = 'linear', kernel_regularizer=L2(regularizer))
],
name='model_1'
)
model_2 = Sequential(
[
Dense(32, activation = 'relu', kernel_regularizer=L2(regularizer)),
Dense(16, activation = 'relu', kernel_regularizer=L2(regularizer)),
Dense(16, activation = 'relu', kernel_regularizer=L2(regularizer)),
Dense(8, activation = 'relu', kernel_regularizer=L2(regularizer)),
Dense(1, activation = 'linear', kernel_regularizer=L2(regularizer))
],
name='model_2'
)
model_3 = Sequential(
[
Dense(64, activation = 'relu', kernel_regularizer=L2(regularizer)),
Dense(32, activation = 'relu', kernel_regularizer=L2(regularizer)),
Dense(16, activation = 'relu', kernel_regularizer=L2(regularizer)),
Dense(16, activation = 'relu', kernel_regularizer=L2(regularizer)),
Dense(8, activation = 'relu', kernel_regularizer=L2(regularizer)),
Dense(1, activation = 'linear', kernel_regularizer=L2(regularizer))
],
name='model_3'
)
model_list = [
model_1,
model_2,
model_3
]
return model_list
And finally the actual fitting:
nn_train_error = []
nn_cv_error = []
models_history = []
models_bc = build_models(regularizer=1e-2)
for model in models_bc:
model.compile(
loss=BinaryCrossentropy(from_logits=True),
optimizer=Adam(learning_rate=1e-3),
)
print(f"Training {model.name}...")
models_history.append(
model.fit(
x_train.values,
y_train.values,
epochs=400,
verbose=0
)
)
model.summary()
print("Done!\n")
threshold = 0.5
# Record the fraction of misclassified examples for the training set
yhat = model.predict(x_train)
yhat = tf.math.sigmoid(yhat)
yhat = np.where(yhat >= threshold, 1, 0)
train_error = np.mean(yhat != y_train.values)
nn_train_error.append(train_error)
print('Train error: {}'.format(train_error))
# Record the fraction of misclassified examples for the cross validation set
yhat = model.predict(x_cv)
yhat = tf.math.sigmoid(yhat)
yhat = np.where(yhat >= threshold, 1, 0)
cv_error = np.mean(yhat != y_cv.values)
nn_cv_error.append(cv_error)
print('CV error: {}'.format(train_error))
for model_num in range(len(nn_train_error)):
print(
f"Model {model_num+1}: Training Set Classification Error: {nn_train_error[model_num]:.5f}, " +
f"CV Set Classification Error: {nn_cv_error[model_num]:.5f}"
)
Any insights would be really appreciated, thanks!