This notebook shows how to train a neural network model with pre-trained BERT in TensorFlow/Keras. It is based on @xhlulu's "Disaster NLP: Keras BERT using TFHub" notebook and the "Text Extraction with BERT" example from the Keras documentation.

This is a code competition without internet access, so we load the transformers tokenizer and pre-trained BERT model through Kaggle Datasets instead.
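
For reference, here is a minimal sketch of how an attached Kaggle Dataset can stand in for the Hugging Face hub in an offline kernel. The dataset path below is hypothetical; substitute the name of the dataset you attach.

# Hypothetical path to an attached Kaggle Dataset containing the files
# written by tokenizer.save_pretrained() and model.save_pretrained().
from transformers import BertTokenizerFast, TFBertModel

local_bert = '../input/my-bert-base-uncased'  # adjust to your dataset
offline_tokenizer = BertTokenizerFast.from_pretrained(local_bert)
offline_bert = TFBertModel.from_pretrained(local_bert)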

Hope it helps.

Changelog

| Version | CV Score | Public Score | Changes | Comment |
|---------|----------|--------------|---------|---------|
| v9 | to be updated | to be updated | use transformers' tokenizer | |
| v8 | 0.653635 | 0.606 | add 5-fold CV + early stopping back | |
| v7 | N/A | 0.617 | fix the bug in the learning rate scheduler | overfitting to train? (n=20) |
| v6 | N/A | 0.566 | add the warm-up learning rate scheduler | with a bug; don't use it |
| v5 | N/A | 0.531 | roll back to v3 | |
| v4 | N/A | 0.573 | add early stopping | seemed to stop too early with patience=1 (n=5) |
| v3 | N/A | 0.530 | initial baseline | |

Load Libraries and Data

%reload_ext autoreload
%autoreload 2
%matplotlib inline
from copy import copy
import joblib
from matplotlib import pyplot as plt
import numpy as np
import os
import pandas as pd
from pathlib import Path
import seaborn as sns
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold
import sys
from warnings import simplefilter

import tensorflow as tf
from tensorflow.keras import Model, Input
from tensorflow.keras.callbacks import EarlyStopping, LearningRateScheduler
from tensorflow.keras.initializers import Constant
from tensorflow.keras.layers import Dense, Embedding, Bidirectional, LSTM, Dropout
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
from tensorflow.keras.metrics import RootMeanSquaredError
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.optimizers import Adam
from transformers import TFBertModel, BertConfig, BertTokenizerFast

simplefilter('ignore')
plt.style.use('fivethirtyeight')
gpu = tf.config.list_physical_devices('GPU')
print("Num GPUs Available: ", len(gpu))
if len(gpu) > 0:
    tf.config.experimental.set_memory_growth(gpu[0], True)
Num GPUs Available:  1
model_name = 'bert_v9'

data_dir = Path('../input/commonlitreadabilityprize')
train_file = data_dir / 'train.csv'
test_file = data_dir / 'test.csv'
sample_file = data_dir / 'sample_submission.csv'

build_dir = Path('../build/')
output_dir = build_dir / model_name
trn_encoded_file = output_dir / 'trn.enc.joblib'
tokenizer_file = output_dir / 'tokenizer.joblib'
val_predict_file = output_dir / f'{model_name}.val.txt'
submission_file = 'submission.csv'

module_url = "../input/bert-en-uncased-l24-h1024-a16"

id_col = 'id'
target_col = 'target'
text_col = 'excerpt'

max_len = 205
n_fold = 5
n_est = 2
n_stop = 2
batch_size = 8
seed = 42
output_dir.mkdir(parents=True, exist_ok=True)
trn = pd.read_csv(train_file, index_col=id_col)
tst = pd.read_csv(test_file, index_col=id_col)
y = trn[target_col].values
print(trn.shape, y.shape, tst.shape)
trn.head()
(2834, 5) (2834,) (7, 3)
url_legal license excerpt target standard_error
id
c12129c31 NaN NaN When the young people returned to the ballroom... -0.340259 0.464009
85aa80a4c NaN NaN All through dinner time, Mrs. Fayre was somewh... -0.315372 0.480805
b69ac6792 NaN NaN As Roger had predicted, the snow departed as q... -0.580118 0.476676
dd1000b26 NaN NaN And outside before the palace a great garden w... -1.054013 0.450007
37c1b32fb NaN NaN Once upon a time there were Three Bears who li... 0.247197 0.510845
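
The max_len = 205 used below presumably tracks the longest excerpts. A rough check with whitespace word counts (only a proxy, since WordPiece splits rare words further) can be run on the training data:

# Rough proxy for sequence length: whitespace word counts per excerpt.
lengths = trn[text_col].str.split().str.len()
print(lengths.describe())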

Tokenization Using transformers

pretrained_dir = output_dir / "bert_base_uncased/"
pretrained_dir.mkdir(exist_ok=True)

def load_tokenizer():
    """Load the BERT tokenizer and config, caching them locally on first use."""
    if not os.path.exists(pretrained_dir / 'vocab.txt'):
        tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")
        tokenizer.save_pretrained(pretrained_dir)
    else:
        print('loading the saved pretrained tokenizer')
        tokenizer = BertTokenizerFast.from_pretrained(str(pretrained_dir))

    model_config = BertConfig.from_pretrained(str(pretrained_dir))
    model_config.output_hidden_states = True
    return tokenizer, model_config

def load_bert(config):
    """Load the pre-trained BERT model, caching it locally on first use."""
    if not os.path.exists(pretrained_dir / 'tf_model.h5'):
        bert_model = TFBertModel.from_pretrained("bert-base-uncased", config=config)
        bert_model.save_pretrained(pretrained_dir)
    else:
        print('loading the saved pretrained model')
        bert_model = TFBertModel.from_pretrained(str(pretrained_dir), config=config)
    return bert_model

def bert_encode(texts, tokenizer, max_len=max_len):
    """Tokenize texts into fixed-length input IDs, token type IDs, and attention masks."""
    input_ids = []
    token_type_ids = []
    attention_mask = []

    for text in texts:
        token = tokenizer(text, max_length=max_len, truncation=True, padding='max_length',
                          add_special_tokens=True)
        input_ids.append(token['input_ids'])
        token_type_ids.append(token['token_type_ids'])
        attention_mask.append(token['attention_mask'])

    return np.array(input_ids), np.array(token_type_ids), np.array(attention_mask)
tokenizer, bert_config = load_tokenizer()

X = bert_encode(trn[text_col].values, tokenizer, max_len=max_len)
X_tst = bert_encode(tst[text_col].values, tokenizer, max_len=max_len)
y = trn[target_col].values
print(X[0].shape, X_tst[0].shape, y.shape)
loading the saved pretrained tokenizer
(2834, 205) (7, 205) (2834,)
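
As an optional sanity check, the input IDs can be decoded back to WordPiece tokens to confirm the [CLS]/[SEP] markers and padding are in place:

# X = (input_ids, token_type_ids, attention_mask); inspect the first excerpt.
tokens = tokenizer.convert_ids_to_tokens(X[0][0])
print(tokens[:8], '...')
print('non-padding positions:', int(X[2][0].sum()), 'of', max_len)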

Save Tokenizer and Encoded Training Data

joblib.dump(X, trn_encoded_file)
joblib.dump(tokenizer, tokenizer_file)
['../build/bert_v9/tokenizer.joblib']
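
These artifacts can be reloaded in a later session to skip re-encoding. A minimal round-trip check, assuming the same directory layout:

# Optional: verify the saved encodings load back intact.
X_loaded = joblib.load(trn_encoded_file)
assert all((a == b).all() for a, b in zip(X, X_loaded))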

Model Training with Cross-Validation

A simple model: dropout and a single dense output layer on top of the [CLS] embedding from the pre-trained BERT model.

def build_model(bert_model, max_len=max_len):
    input_ids = Input(shape=(max_len,), dtype=tf.int32, name="input_ids")
    token_type_ids = Input(shape=(max_len,), dtype=tf.int32, name="token_type_ids")
    attention_mask = Input(shape=(max_len,), dtype=tf.int32, name="attention_mask")

    # Use the hidden state of the [CLS] token as the pooled sequence representation.
    sequence_output = bert_model(input_ids, token_type_ids=token_type_ids, attention_mask=attention_mask)[0]
    clf_output = sequence_output[:, 0, :]
    clf_output = Dropout(.1)(clf_output)
    out = Dense(1, activation='linear')(clf_output)

    model = Model(inputs=[input_ids, token_type_ids, attention_mask], outputs=out)
    model.compile(Adam(learning_rate=1e-5), loss='mean_squared_error', metrics=[RootMeanSquaredError()])

    return model

Training the model with early stopping and a learning-rate scheduler

def scheduler(epoch, lr, warmup=5, decay_start=10):
    # Divide the incoming rate by a shrinking factor during the warm-up epochs,
    # hold it steady until decay_start, then decay it exponentially.
    if epoch <= warmup:
        return lr / (warmup - epoch + 1)
    elif warmup < epoch <= decay_start:
        return lr
    else:
        return lr * tf.math.exp(-.1)

ls = LearningRateScheduler(scheduler)
es = EarlyStopping(patience=n_stop, restore_best_weights=True)
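
To see what this schedule actually produces before handing it to Keras, here is a quick stand-alone simulation (not part of training). Keras passes the scheduler the rate returned for the previous epoch, so the function is applied iteratively:

# Simulate the schedule across 15 epochs from the 1e-5 base rate.
lr = 1e-5
for epoch in range(15):
    lr = float(scheduler(epoch, lr))
    print(f'epoch {epoch:2d}: lr = {lr:.2e}')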

cv = KFold(n_splits=n_fold, shuffle=True, random_state=seed)

p = np.zeros_like(y, dtype=float)
p_tst = np.zeros((X_tst[0].shape[0], ), dtype=float)
for i, (i_trn, i_val) in enumerate(cv.split(X[0]), 1):
    print(f'training CV #{i}:')
    tf.random.set_seed(seed + i)
    bert_model = load_bert(bert_config)
    clf = build_model(bert_model, max_len=max_len)
    if i == 1:
        print(clf.summary())

    history = clf.fit([x[i_trn] for x in X], y[i_trn],
                      validation_data=([x[i_val] for x in X], y[i_val]),
                      epochs=n_est,
                      batch_size=batch_size,
                      callbacks=[ls, es])
    clf.save_weights(f'{model_name}_cv{i}.h5')
    p[i_val] = clf.predict([x[i_val] for x in X]).flatten()
    p_tst += clf.predict(X_tst).flatten() / n_fold
training CV #1:
loading the saved pretrained model
All model checkpoint layers were used when initializing TFBertModel.

All the layers of TFBertModel were initialized from the model checkpoint at ../build/bert_v9/bert_base_uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.
WARNING:tensorflow:The parameters `output_attentions`, `output_hidden_states` and `use_cache` cannot be updated when calling a model.They have to be set to True/False in the config object (i.e.: `config=XConfig.from_pretrained('name', output_attentions=True)`).
WARNING:tensorflow:The parameter `return_dict` cannot be set in graph mode and will always be set to `True`.
WARNING:tensorflow:Gradients do not exist for variables ['tf_bert_model_1/bert/pooler/dense/kernel:0', 'tf_bert_model_1/bert/pooler/dense/bias:0'] when minimizing the loss.
Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
==================================================================================================
input_ids (InputLayer)          [(None, 205)]        0                                            
__________________________________________________________________________________________________
attention_mask (InputLayer)     [(None, 205)]        0                                            
__________________________________________________________________________________________________
token_type_ids (InputLayer)     [(None, 205)]        0                                            
__________________________________________________________________________________________________
tf_bert_model_1 (TFBertModel)   TFBaseModelOutputWit 109482240   input_ids[0][0]                  
                                                                 attention_mask[0][0]             
                                                                 token_type_ids[0][0]             
__________________________________________________________________________________________________
tf.__operators__.getitem_1 (Sli (None, 768)          0           tf_bert_model_1[0][13]           
__________________________________________________________________________________________________
dropout_75 (Dropout)            (None, 768)          0           tf.__operators__.getitem_1[0][0] 
__________________________________________________________________________________________________
dense_1 (Dense)                 (None, 1)            769         dropout_75[0][0]                 
==================================================================================================
Total params: 109,483,009
Trainable params: 109,483,009
Non-trainable params: 0
__________________________________________________________________________________________________
None
Epoch 1/2
284/284 [==============================] - 118s 366ms/step - loss: 1.0662 - val_loss: 0.4776
Epoch 2/2
284/284 [==============================] - 104s 367ms/step - loss: 0.5244 - val_loss: 0.4544
training CV #2:
loading the saved pretrained model
Epoch 1/2
284/284 [==============================] - 117s 365ms/step - loss: 1.0667 - val_loss: 0.5301
Epoch 2/2
284/284 [==============================] - 101s 356ms/step - loss: 0.5712 - val_loss: 0.4714
training CV #3:
loading the saved pretrained model
Epoch 1/2
284/284 [==============================] - 118s 365ms/step - loss: 0.9922 - val_loss: 0.5096
Epoch 2/2
284/284 [==============================] - 102s 358ms/step - loss: 0.5822 - val_loss: 0.5252
training CV #4:
loading the saved pretrained model
Epoch 1/2
284/284 [==============================] - 120s 375ms/step - loss: 1.0337 - val_loss: 0.5380
Epoch 2/2
284/284 [==============================] - 102s 358ms/step - loss: 0.5264 - val_loss: 0.4960
training CV #5:
loading the saved pretrained model
Epoch 1/2
284/284 [==============================] - 120s 371ms/step - loss: 1.0063 - val_loss: 0.5089
Epoch 2/2
284/284 [==============================] - 103s 363ms/step - loss: 0.4906 - val_loss: 0.5032
print(f'CV RMSE: {mean_squared_error(y, p, squared=False):.6f}')
np.savetxt(val_predict_file, p, fmt='%.6f')
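
Beyond the single RMSE number, an optional diagnostic is to plot the out-of-fold predictions against the targets; points hugging the diagonal indicate well-calibrated predictions.

# Scatter out-of-fold predictions vs. actual targets.
plt.figure(figsize=(6, 6))
sns.scatterplot(x=y, y=p, alpha=.3)
plt.plot([y.min(), y.max()], [y.min(), y.max()], 'k--', lw=1)
plt.xlabel('actual target')
plt.ylabel('out-of-fold prediction')
plt.show()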

Submission

sub = pd.read_csv(sample_file, index_col=id_col)
sub[target_col] = p_tst
sub.to_csv(submission_file)
sub.head()