This notebook was originally published on Kaggle.


In this notebook, I will show how to build a supervised emphasized Denoising AutoEncoder (DAE) with Keras. With pseudo labels, we can train the classifier and the DAE together instead of training them separately, as was done in previous TPS competitions.

If you're interested in how different components of DAE (denoising, stacked layers, emphasis, etc.) contribute to its performance, please check out Vincent et al. (2010) "Stacked Denoising Autoencoders: Learning Useful Representations in a Deep Network with a Local Denoising Criterion", JMLR.

This notebook is built on top of my previous notebook, AutoEncoder + Pseudo Label + AutoLGB. The first part (sections 1, 2, 3, and 5) is the same as in the previous one.

The contents of the notebook are as follows:

  1. Package Installation: Installing the latest version of Kaggler using pip.
  2. Feature Engineering: Code by @udbhavpangotra.
  3. Feature Transformation: Using kaggler.preprocessing.LabelEncoder to impute missing values and group rare categories automatically.
  4. Stacked Emphasized Denoising AutoEncoder (DAE): Adding a random noise mask and emphasis to the AutoEncoder, turning it into an "Emphasized Denoising AutoEncoder".
  5. LightGBM Model Training: 5-fold CV + pseudo labels from @hiro5299834's data + kaggler.model.AutoLGB's feature selection and hyperparameter optimization.
  6. Supervised DAE: Training the classifier and the DAE simultaneously.

Part 1: DAE + AutoLGB

Load Libraries and Install Kaggler

# This environment is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session
import lightgbm as lgb
from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
from pathlib import Path
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import backend as K
from tensorflow.keras.losses import mean_squared_error
from tensorflow.keras.metrics import AUC
from tensorflow.python.keras.utils import control_flow_util
import seaborn as sns
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score, confusion_matrix
import warnings
!pip install kaggler
import kaggler
from kaggler.model import AutoLGB
from kaggler.preprocessing import LabelEncoder

print(f'Kaggler: {kaggler.__version__}')
print(f'TensorFlow: {tf.__version__}')
warnings.simplefilter('ignore')
plt.style.use('fivethirtyeight')
pd.set_option('max_columns', 100)

Feature Engineering (ref: code by @udbhavpangotra)

data_dir = Path('/kaggle/input/tabular-playground-series-apr-2021/')
trn_file = data_dir / 'train.csv'
tst_file = data_dir / 'test.csv'
sample_file = data_dir / 'sample_submission.csv'
pseudo_label_file = '/kaggle/input/tps-apr-2021-label/voting_submission_from_5_best.csv'

target_col = 'Survived'
id_col = 'PassengerId'

feature_name = 'dae'
algo_name = 'lgb'
model_name = f'{algo_name}_{feature_name}'

feature_file = f'{feature_name}.csv'
predict_val_file = f'{model_name}.val.txt'
predict_tst_file = f'{model_name}.tst.txt'
submission_file = f'{model_name}.sub.csv'
trn = pd.read_csv(trn_file, index_col=id_col)
tst = pd.read_csv(tst_file, index_col=id_col)
sub = pd.read_csv(sample_file, index_col=id_col)
pseudo_label = pd.read_csv(pseudo_label_file, index_col=id_col)
print(trn.shape, tst.shape, sub.shape, pseudo_label.shape)
tst[target_col] = pseudo_label[target_col]
n_trn = trn.shape[0]
df = pd.concat([trn, tst], axis=0)
df.head()
df['Embarked'] = df['Embarked'].fillna('No')
df['Cabin'] = df['Cabin'].fillna('_')
df['CabinType'] = df['Cabin'].apply(lambda x:x[0])
df.Ticket = df.Ticket.map(lambda x:str(x).split()[0] if len(str(x).split()) > 1 else 'X')

df['Age'].fillna(round(df['Age'].median()), inplace=True,)
df['Age'] = df['Age'].apply(round).astype(int)

df['Fare'].fillna(round(df['Fare'].median()), inplace=True,)

df['FirstName'] = df['Name'].str.split(', ').str[0]
df['SecondName'] = df['Name'].str.split(', ').str[1]

# Count how many passengers share the same first / second name.
df['n'] = 1

gb = df.groupby('FirstName')
df_names = gb['n'].sum()
df['SameFirstName'] = df['FirstName'].apply(lambda x: df_names[x])

gb = df.groupby('SecondName')
df_names = gb['n'].sum()
df['SameSecondName'] = df['SecondName'].apply(lambda x: df_names[x])

df['Sex'] = (df['Sex'] == 'male').astype(int)

df['FamilySize'] = df.SibSp + df.Parch + 1

feature_cols = ['Pclass', 'Age','Embarked','Parch','SibSp','Fare','CabinType','Ticket','SameFirstName', 'SameSecondName', 'Sex',
                'FamilySize', 'FirstName', 'SecondName']
cat_cols = ['Pclass','Embarked','CabinType','Ticket', 'FirstName', 'SecondName']
num_cols = [x for x in feature_cols if x not in cat_cols]
print(len(feature_cols), len(cat_cols), len(num_cols))

Feature Transformation Using Kaggler

for col in ['SameFirstName', 'SameSecondName', 'Fare', 'FamilySize', 'Parch', 'SibSp']:
    df[col] = np.log2(1 + df[col])
    
scaler = StandardScaler()
df[num_cols] = scaler.fit_transform(df[num_cols])

lbe = LabelEncoder(min_obs=50)
df[cat_cols] = lbe.fit_transform(df[cat_cols]).astype(int)
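
As a quick, hypothetical illustration of what LabelEncoder(min_obs=50) does (toy data, not part of the pipeline): categories observed fewer than min_obs times, as well as missing values, are expected to be grouped rather than receiving their own labels; the exact integer codes assigned are an implementation detail of Kaggler.

# Toy example (hypothetical data): with min_obs=3, 'a' and 'b' keep their own codes,
# while the rare 'c' values and the missing value are expected to be grouped.
toy = pd.DataFrame({'cat': ['a'] * 5 + ['b'] * 4 + ['c', None, 'c']})
print(LabelEncoder(min_obs=3).fit_transform(toy).astype(int))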

Emphasized Denoising AutoEncoder (DAE) Using Keras
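
The "emphasis" follows Vincent et al. (2010): reconstruction errors on the corrupted (masked) input dimensions are weighted more heavily than errors on the untouched dimensions. With masking probability p (masking_prob) and emphasis weight alpha (emphasis_ratio), the reconstruction loss implemented below is roughly

$$L_{rec} = \frac{\alpha}{p}\,\mathrm{MSE}\big(x_{masked}, \hat{x}_{masked}\big) + \frac{1}{1-p}\,\mathrm{MSE}\big(x_{unmasked}, \hat{x}_{unmasked}\big),$$

where the 1/p and 1/(1-p) factors rescale each term from an average over all dimensions to an average over just the masked and unmasked dimensions, respectively.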

encoding_dim = 128
masking_prob = .2
emphasis_ratio = 2.
seed = 42

def get_dae(encoding_dim, dropout=.2):
    num_dim = len(num_cols)
    num_input = keras.layers.Input((num_dim,), name='num_input')
    cat_inputs = []
    cat_embs = []
    emb_dims = 0
    for col in cat_cols:
        cat_input = keras.layers.Input((1,), name=f'{col}_input')
        emb_dim = max(8, int(np.log2(1 + df[col].nunique()) * 4))
        cat_emb = keras.layers.Embedding(input_dim=df[col].max() + 1, output_dim=emb_dim)(cat_input)
        cat_emb = keras.layers.Dropout(dropout)(cat_emb)
        cat_emb = keras.layers.Reshape((emb_dim,))(cat_emb)

        cat_inputs.append(cat_input)
        cat_embs.append(cat_emb)
        emb_dims += emb_dim

    merged_inputs = keras.layers.Concatenate()([num_input] + cat_embs)
    batch_size, merged_inputs_dim = merged_inputs.get_shape()
    training = K.learning_phase()

    # Randomly zero out each merged input dimension with probability masking_prob.
    def mask_inputs():
        mask = tf.random.stateless_binomial(shape=(batch_size, merged_inputs_dim),
                                            seed=seed,
                                            counts=tf.ones((merged_inputs_dim,)),
                                            probs=[masking_prob] * merged_inputs_dim)

        return tf.where(mask == 1, tf.zeros_like(merged_inputs), merged_inputs)

    # Apply the noise mask only during training; pass clean inputs through at inference.
    masked_inputs = control_flow_util.smart_cond(training,
                                                 mask_inputs,
                                                 lambda: merged_inputs)

    encoded = keras.layers.Dense(encoding_dim, activation='relu')(masked_inputs)
    encoded = keras.layers.Dropout(dropout)(encoded)
    encoded = keras.layers.Dense(encoding_dim, activation='relu')(encoded)
    encoded = keras.layers.Dropout(dropout)(encoded)    
    encoded = keras.layers.Dense(encoding_dim, activation='relu')(encoded)
    
    decoded = keras.layers.Dense(encoding_dim, activation='relu')(encoded)
    decoded = keras.layers.Dropout(dropout)(decoded)
    decoded = keras.layers.Dense(encoding_dim, activation='relu')(decoded)
    decoded = keras.layers.Dropout(dropout)(decoded)    
    decoded = keras.layers.Dense(num_dim + emb_dims, activation='linear')(decoded)

    encoder = keras.Model([num_input] + cat_inputs, encoded)
    ae = keras.Model([num_input] + cat_inputs, decoded, name='ae')
    
    reconstruction_loss = K.mean(
        # error on the corrupted (masked) dimensions, up-weighted by emphasis_ratio
        mean_squared_error(merged_inputs,
                           tf.where(merged_inputs != masked_inputs,
                                    decoded, merged_inputs)) / masking_prob * emphasis_ratio
        # error on the untouched (unmasked) dimensions
        + mean_squared_error(merged_inputs,
                             tf.where(merged_inputs == masked_inputs,
                                      decoded, merged_inputs)) / (1. - masking_prob)
    )
    ae.add_loss(reconstruction_loss)
    ae.compile(optimizer='adam')
    return ae, encoder
ae, encoder = get_dae(encoding_dim)
ae.summary()
inputs = [df[num_cols].values] + [df[x].values for x in cat_cols]
ae.fit(inputs, inputs,
      epochs=30,
      batch_size=16384,
      shuffle=True,
      validation_split=.2)
encoding = encoder.predict(inputs)
print(encoding.shape)
np.savetxt(feature_file, encoding, fmt='%.6f', delimiter=',')
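
The encoded features are written to dae.csv so they can be reused without retraining the autoencoder; a minimal sketch of reloading them (assuming the comma-delimited format saved above):

encoding = np.loadtxt(feature_file, delimiter=',')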

Model Training + Feature Selection + HPO Using Kaggler's AutoLGB

n_fold = 5
X = pd.concat((df[feature_cols], 
               pd.DataFrame(encoding, columns=[f'enc_{x}' for x in range(encoding_dim)])), axis=1)
y = df[target_col]
X_tst = X.iloc[n_trn:]

cv = StratifiedKFold(n_splits=n_fold, shuffle=True, random_state=seed)
p = np.zeros_like(y, dtype=float)
p_tst = np.zeros((tst.shape[0],))
for i, (i_trn, i_val) in enumerate(cv.split(X, y)):
    if i == 0:
        clf = AutoLGB(objective='binary', metric='auc', random_state=seed)
        clf.tune(X.iloc[i_trn], y[i_trn])
        features = clf.features
        params = clf.params
        n_best = clf.n_best
        print(f'{n_best}')
        print(f'{params}')
        print(f'{features}')
    
    trn_data = lgb.Dataset(X.iloc[i_trn], y[i_trn])
    val_data = lgb.Dataset(X.iloc[i_val], y[i_val])
    clf = lgb.train(params, trn_data, n_best, val_data, verbose_eval=100)
    p[i_val] = clf.predict(X.iloc[i_val])
    p_tst += clf.predict(X_tst) / n_fold
    print(f'CV #{i + 1} AUC: {roc_auc_score(y[i_val], p[i_val]):.6f}')

np.savetxt(predict_val_file, p, fmt='%.6f')
np.savetxt(predict_tst_file, p_tst, fmt='%.6f')
print(f'  CV AUC: {roc_auc_score(y, p):.6f}')
print(f'Test AUC: {roc_auc_score(pseudo_label[target_col], p_tst)}')

Submission File for DAE + AutoLGB

# Choose a threshold so that roughly 34.911% of the test predictions are labeled positive
# (the assumed positive rate), then compare against the pseudo labels.
n_pos = int(0.34911 * tst.shape[0])
th = sorted(p_tst, reverse=True)[n_pos]
print(th)
confusion_matrix(pseudo_label[target_col], (p_tst > th).astype(int))
sub[target_col] = (p_tst > th).astype(int)
sub.to_csv(submission_file)

Part 2: Supervised DAE

feature_name = 'dae'
algo_name = 'sdae'
model_name = f'{algo_name}_{feature_name}'

feature_file = f'{feature_name}.csv'
predict_val_file = f'{model_name}.val.txt'
predict_tst_file = f'{model_name}.tst.txt'
submission_file = f'{model_name}.sub.csv'

Supervised DAE with Keras

We add a classifier head to the DAE network. In addition to the reconstruction_loss of the DAE, the model needs a loss and a metric for the classifier head.
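
Below is a minimal toy sketch of this loss setup (hypothetical shapes and layer sizes, reusing the imports above), separate from get_sdae(): the unsupervised reconstruction loss is attached with add_loss(), while the classification loss and metric are passed to compile() keyed by the name of the classifier output, so only that head needs labels during training.

# Toy sketch (hypothetical dimensions; not the competition model):
# one unlabeled reconstruction output handled via add_loss(),
# one labeled classification output handled via compile(loss={'clf': ...}).
toy_in = keras.Input((4,), name='x')
toy_hid = keras.layers.Dense(8, activation='relu')(toy_in)
toy_recon = keras.layers.Dense(4, name='recon')(toy_hid)
toy_prob = keras.layers.Dense(1, activation='sigmoid', name='clf')(toy_hid)
toy_model = keras.Model(toy_in, [toy_recon, toy_prob])
toy_model.add_loss(K.mean(mean_squared_error(toy_in, toy_recon)))  # unsupervised part
toy_model.compile(optimizer='adam',
                  loss={'clf': 'binary_crossentropy'},             # supervised part
                  metrics={'clf': [AUC()]})
# When fitting, only the 'clf' output needs targets.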

def get_sdae(encoding_dim, dropout=.2):
    num_dim = len(num_cols)
    num_input = keras.layers.Input((num_dim,), name='num_input')
    cat_inputs = []
    cat_embs = []
    emb_dims = 0
    for col in cat_cols:
        cat_input = keras.layers.Input((1,), name=f'{col}_input')
        emb_dim = max(8, int(np.log2(1 + df[col].nunique()) * 4))
        cat_emb = keras.layers.Embedding(input_dim=df[col].max() + 1, output_dim=emb_dim)(cat_input)
        cat_emb = keras.layers.Dropout(dropout)(cat_emb)
        cat_emb = keras.layers.Reshape((emb_dim,))(cat_emb)

        cat_inputs.append(cat_input)
        cat_embs.append(cat_emb)
        emb_dims += emb_dim
    
    inputs = [num_input] + cat_inputs
    merged_inputs = keras.layers.Concatenate()([num_input] + cat_embs)
    
    # masking
    batch_size, merged_inputs_dim = merged_inputs.get_shape()
    training = K.learning_phase()
    def mask_inputs():
        mask = tf.random.stateless_binomial(shape=(batch_size, merged_inputs_dim),
                                            seed=seed,
                                            counts=tf.ones((merged_inputs_dim,)),
                                            probs=[masking_prob] * merged_inputs_dim)

        return tf.where(mask == 1, tf.zeros_like(merged_inputs), merged_inputs)

    masked_inputs = control_flow_util.smart_cond(training,
                                                 mask_inputs,
                                                 lambda: merged_inputs)    

    # encoder
    encoded_1 = keras.layers.Dense(encoding_dim, activation='relu')(masked_inputs)
    encoded_1 = keras.layers.Dropout(dropout)(encoded_1)
    encoded_2 = keras.layers.Dense(encoding_dim, activation='relu')(encoded_1)
    encoded_2 = keras.layers.Dropout(dropout)(encoded_2)    
    encoded_3 = keras.layers.Dense(encoding_dim, activation='relu')(encoded_2)
    
    # The encoder output stacks all three hidden layers into one representation.
    encoded_concat = keras.layers.Concatenate()([encoded_1, encoded_2, encoded_3])
    encoder = keras.Model(inputs, encoded_concat)
    
    decoded = keras.layers.Dense(encoding_dim, activation='relu')(encoded_3)
    decoded = keras.layers.Dropout(dropout)(decoded)
    decoded = keras.layers.Dense(encoding_dim, activation='relu')(decoded)
    decoded = keras.layers.Dropout(dropout)(decoded)    
    decoded = keras.layers.Dense(num_dim + emb_dims, activation='linear')(decoded)

    ae = keras.Model([num_input] + cat_inputs, decoded)
    
    # classifier
    clf_encoded_input = keras.Input((encoding_dim * 3,))
    x = keras.layers.Dense(encoding_dim, activation='relu')(clf_encoded_input)
    x = keras.layers.Dropout(dropout)(x)
    clf_output = keras.layers.Dense(1, activation='sigmoid')(x)
    clf = keras.Model(inputs=clf_encoded_input, outputs=clf_output, name='clf')
    
    outputs = [ae(inputs), clf(encoder(inputs))]
    model = keras.Model(inputs, outputs, name='sdae')
    
    reconstruction_loss = K.mean(
        # error on the corrupted (masked) dimensions, up-weighted by emphasis_ratio
        mean_squared_error(merged_inputs,
                           tf.where(merged_inputs != masked_inputs,
                                    decoded, merged_inputs)) / masking_prob * emphasis_ratio
        # error on the untouched (unmasked) dimensions
        + mean_squared_error(merged_inputs,
                             tf.where(merged_inputs == masked_inputs,
                                      decoded, merged_inputs)) / (1. - masking_prob)
    )
    model.add_loss(reconstruction_loss)
    model.compile(optimizer='adam', loss={'clf': 'binary_crossentropy'}, metrics={'clf': [AUC()]})
    return model, encoder
sdae, encoder = get_sdae(encoding_dim)
sdae.summary()

Model Training: Supervised DAE with 5-CV

n_fold = 5
X = df[feature_cols]
y = df[target_col]
X_tst = X.iloc[n_trn:]
inputs_tst = [X_tst[num_cols].values] + [X_tst[x].values for x in cat_cols]

cv = StratifiedKFold(n_splits=n_fold, shuffle=True, random_state=seed)
p = np.zeros_like(y, dtype=float)
p_tst = np.zeros((tst.shape[0],))
for i, (i_trn, i_val) in enumerate(cv.split(X, y)):
    X_trn = X.iloc[i_trn]
    X_val = X.iloc[i_val]

    inputs_trn = [X[num_cols].values[i_trn]] + [X[x].values[i_trn] for x in cat_cols]
    inputs_val = [X[num_cols].values[i_val]] + [X[x].values[i_val] for x in cat_cols]
    sdae, _ = get_sdae(encoding_dim)
    sdae.fit(inputs_trn, y[i_trn],
              epochs=20,
              batch_size=16384,
              shuffle=True,
              validation_data=(inputs_val, y[i_val]))
    p[i_val] = sdae.predict(inputs_val)[1].flatten()    # output [1] is the classifier head
    p_tst += sdae.predict(inputs_tst)[1].flatten() / n_fold
    print(f'CV #{i + 1} AUC: {roc_auc_score(y[i_val], p[i_val]):.6f}')

np.savetxt(predict_val_file, p, fmt='%.6f')
np.savetxt(predict_tst_file, p_tst, fmt='%.6f')
print(f'  CV AUC: {roc_auc_score(y, p):.6f}')
print(f'Test AUC: {roc_auc_score(pseudo_label[target_col], p_tst)}')
n_pos = int(0.34911 * tst.shape[0])
th = sorted(p_tst, reverse=True)[n_pos]
print(th)
confusion_matrix(pseudo_label[target_col], (p_tst > th).astype(int))
sub[target_col] = (p_tst > th).astype(int)
sub.to_csv(submission_file)

Part 3: Simple Ensemble

submission_file = 'simple_ensemble_dae.csv'
model_names = ['lgb_dae', 'sdae_dae']
predict_val_files = [f'{x}.val.txt' for x in model_names]
predict_tst_files = [f'{x}.tst.txt' for x in model_names]
dict_val_predict = {}
dict_tst_predict = {}
for name, val_file, tst_file in zip(model_names, predict_val_files, predict_tst_files):
    dict_val_predict[name] = np.loadtxt(val_file)
    dict_tst_predict[name] = np.loadtxt(tst_file)
    
p = pd.DataFrame(dict_val_predict).mean(axis=1).values
p_tst = pd.DataFrame(dict_tst_predict).mean(axis=1).values
print(f'  CV AUC: {roc_auc_score(y, p):.6f}')
print(f'Test AUC: {roc_auc_score(pseudo_label[target_col], p_tst)}')
n_pos = int(0.34911 * tst.shape[0])
th = sorted(p_tst, reverse=True)[n_pos]
print(th)
confusion_matrix(pseudo_label[target_col], (p_tst > th).astype(int))
sub[target_col] = (p_tst > th).astype(int)
sub.to_csv(submission_file)

If you find this notebook helpful, please upvote it and give a star to Kaggler. If you have questions and/or feature requests for Kaggler, please post them as an issue in the Kaggler GitHub repository.

Happy Kaggling!