TF/Keras BERT Baseline (Training/Inference)
A tutorial on how to train an NLP model with Hugging Face's pretrained BERT in TF/Keras
- Changelogs
- Load Libraries and Data
- Tokenization Using transformers
- Model Training with Cross-Validation
- Submission
This notebook shows how to train a neural network model with a pre-trained BERT model in TensorFlow/Keras. It is based on @xhlulu's Disaster NLP: Keras BERT using TFHub notebook and the Text Extraction with BERT example at Keras.
This competition is a code competition without internet access, so we load the transformers tokenizer and the pre-trained BERT model from Kaggle Datasets instead (see the sketch below).
Hope it helps.
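For reference, this is roughly how a tokenizer and model can be loaded from an attached Kaggle dataset when internet is disabled (the directory name below is a placeholder, not the exact dataset attached to this notebook):
# Offline loading sketch: point from_pretrained() at a local dataset directory
# that contains the saved tokenizer/model files instead of a model hub name.
from transformers import BertTokenizerFast, TFBertModel
local_bert_dir = '../input/bert-base-uncased'  # placeholder Kaggle dataset path
tokenizer = BertTokenizerFast.from_pretrained(local_bert_dir)
bert_model = TFBertModel.from_pretrained(local_bert_dir)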
| Version | CV Score | Public Score | Changes | Comment |
|---|---|---|---|---|
| v9 | to be updated | to be updated | use transformers' tokenizer | |
| v8 | 0.653635 | 0.606 | add 5-fold CV + early-stopping back | |
| v7 | N/A | 0.617 | fix the bug in the learning rate scheduler | overfitting to train? (n=20) |
| v6 | N/A | 0.566 | add the warm-up learning rate scheduler | With a bug. Don't use it |
| v5 | N/A | 0.531 | roll back to v3 | |
| v4 | N/A | 0.573 | add early-stopping | seemed to stop too early with patience=1 (n=5) |
| v3 | N/A | 0.530 | initial baseline | |
%reload_ext autoreload
%autoreload 2
%matplotlib inline
from copy import copy
import joblib
from matplotlib import pyplot as plt
import numpy as np
import os
import pandas as pd
from pathlib import Path
import seaborn as sns
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold
import sys
from warnings import simplefilter
import tensorflow as tf
from tensorflow.keras import Model, Input
from tensorflow.keras.callbacks import EarlyStopping, LearningRateScheduler
from tensorflow.keras.initializers import Constant
from tensorflow.keras.layers import Dense, Embedding, Bidirectional, LSTM, Dropout
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
from tensorflow.keras.metrics import RootMeanSquaredError
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.optimizers import Adam
from transformers import TFBertModel, BertConfig, BertTokenizerFast
simplefilter('ignore')
plt.style.use('fivethirtyeight')
gpu = tf.config.list_physical_devices('GPU')
print("Num GPUs Available: ", len(gpu))
if len(gpu) > 0:
    tf.config.experimental.set_memory_growth(gpu[0], True)
model_name = 'bert_v9'
data_dir = Path('../input/commonlitreadabilityprize')
train_file = data_dir / 'train.csv'
test_file = data_dir / 'test.csv'
sample_file = data_dir / 'sample_submission.csv'
build_dir = Path('../build/')
output_dir = build_dir / model_name
trn_encoded_file = output_dir / 'trn.enc.joblib'
tokenizer_file = output_dir / 'tokenizer.joblib'
val_predict_file = output_dir / f'{model_name}.val.txt'
submission_file = 'submission.csv'
module_url = "../input/bert-en-uncased-l24-h1024-a16"  # not used in this version
id_col = 'id'
target_col = 'target'
text_col = 'excerpt'
max_len = 205
n_fold = 5
n_est = 2
n_stop = 2
batch_size = 8
seed = 42
output_dir.mkdir(parents=True, exist_ok=True)
trn = pd.read_csv(train_file, index_col=id_col)
tst = pd.read_csv(test_file, index_col=id_col)
y = trn[target_col].values
print(trn.shape, y.shape, tst.shape)
trn.head()
pretrained_dir = output_dir / "bert_base_uncased/"
pretrained_dir.mkdir(exist_ok=True)
def load_tokenizer():
    """Load the BERT tokenizer and config, caching both locally for offline reuse."""
    if not os.path.exists(pretrained_dir / 'vocab.txt'):
        tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")
        tokenizer.save_pretrained(pretrained_dir)
        # Cache the model config alongside the tokenizer so it can be reloaded offline.
        model_config = BertConfig.from_pretrained("bert-base-uncased")
        model_config.save_pretrained(pretrained_dir)
    else:
        print('loading the saved pretrained tokenizer')
        tokenizer = BertTokenizerFast.from_pretrained(str(pretrained_dir))
        model_config = BertConfig.from_pretrained(str(pretrained_dir))
    model_config.output_hidden_states = True
    return tokenizer, model_config
def load_bert(config):
    """Load the pre-trained BERT model, caching it locally for offline reuse."""
    if not os.path.exists(pretrained_dir / 'tf_model.h5'):
        bert_model = TFBertModel.from_pretrained("bert-base-uncased", config=config)
        bert_model.save_pretrained(pretrained_dir)
    else:
        print('loading the saved pretrained model')
        bert_model = TFBertModel.from_pretrained(str(pretrained_dir), config=config)
    return bert_model
def bert_encode(texts, tokenizer, max_len=max_len):
    """Tokenize texts into fixed-length input_ids, token_type_ids, and attention_mask arrays."""
    input_ids = []
    token_type_ids = []
    attention_mask = []
    for text in texts:
        token = tokenizer(text, max_length=max_len, truncation=True,
                          padding='max_length', add_special_tokens=True)
        input_ids.append(token['input_ids'])
        token_type_ids.append(token['token_type_ids'])
        attention_mask.append(token['attention_mask'])
    return np.array(input_ids), np.array(token_type_ids), np.array(attention_mask)
tokenizer, bert_config = load_tokenizer()
X = bert_encode(trn[text_col].values, tokenizer, max_len=max_len)
X_tst = bert_encode(tst[text_col].values, tokenizer, max_len=max_len)
y = trn[target_col].values
print(X[0].shape, X_tst[0].shape, y.shape)
joblib.dump(X, trn_encoded_file)
joblib.dump(tokenizer, tokenizer_file)
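As a quick sanity check of the encoding (illustrative, reusing tokenizer and X from above), the first excerpt can be decoded back to text to confirm the special tokens and the max_len padding:
# The decoded sequence should start with [CLS], end with [SEP], and be
# padded with [PAD] tokens up to max_len.
print(tokenizer.decode(X[0][0].tolist())[:120])
print('shapes (input_ids, token_type_ids, attention_mask):', X[0].shape, X[1].shape, X[2].shape)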
A simple model: only a dropout and a dense output layer are added on top of the pre-trained BERT model.
def build_model(bert_model, max_len=max_len):
    input_ids = Input(shape=(max_len,), dtype=tf.int32, name="input_ids")
    token_type_ids = Input(shape=(max_len,), dtype=tf.int32, name="token_type_ids")
    attention_mask = Input(shape=(max_len,), dtype=tf.int32, name="attention_mask")
    # The first output of TFBertModel is the last hidden state: (batch, max_len, hidden_size).
    sequence_output = bert_model(input_ids, token_type_ids=token_type_ids,
                                 attention_mask=attention_mask)[0]
    # Use the [CLS] token representation as the pooled sequence embedding.
    clf_output = sequence_output[:, 0, :]
    clf_output = Dropout(.1)(clf_output)
    out = Dense(1, activation='linear')(clf_output)
    model = Model(inputs=[input_ids, token_type_ids, attention_mask], outputs=out)
    model.compile(Adam(learning_rate=1e-5), loss='mean_squared_error',
                  metrics=[RootMeanSquaredError()])
    return model
Train the model with 5-fold cross-validation, early stopping, and a warm-up learning-rate scheduler.
def scheduler(epoch, lr, warmup=5, decay_start=10):
    """Warm up the learning rate, hold it, then decay it exponentially."""
    if epoch <= warmup:
        return lr / (warmup - epoch + 1)
    elif warmup < epoch <= decay_start:
        return lr
    else:
        return lr * tf.math.exp(-.1)
ls = LearningRateScheduler(scheduler)
es = EarlyStopping(patience=n_stop, restore_best_weights=True)
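To see what the schedule does over a full run, it can be simulated outside training (a small illustration; LearningRateScheduler calls scheduler(epoch, lr) with the model's current learning rate each epoch, so the returned value is fed back in):
# Simulate the warm-up / hold / decay schedule starting from the initial LR of 1e-5.
lr = 1e-5
for epoch in range(15):
    lr = float(scheduler(epoch, lr))
    print(f'epoch {epoch:2d}: lr = {lr:.2e}')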
cv = KFold(n_splits=n_fold, shuffle=True, random_state=seed)
p = np.zeros_like(y, dtype=float)
p_tst = np.zeros((X_tst[0].shape[0], ), dtype=float)
for i, (i_trn, i_val) in enumerate(cv.split(X[0]), 1):
    print(f'training CV #{i}:')
    tf.random.set_seed(seed + i)

    # Build a fresh BERT model for each fold.
    bert_model = load_bert(bert_config)
    clf = build_model(bert_model, max_len=max_len)
    if i == 1:
        print(clf.summary())
    history = clf.fit([x[i_trn] for x in X], y[i_trn],
                      validation_data=([x[i_val] for x in X], y[i_val]),
                      epochs=n_est,
                      batch_size=batch_size,
                      callbacks=[ls, es])
    clf.save_weights(f'{model_name}_cv{i}.h5')

    # Out-of-fold predictions for CV scoring; test predictions averaged across folds.
    p[i_val] = clf.predict([x[i_val] for x in X]).flatten()
    p_tst += clf.predict(list(X_tst)).flatten() / n_fold
print(f'CV RMSE: {mean_squared_error(y, p, squared=False):.6f}')
np.savetxt(val_predict_file, p, fmt='%.6f')
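For a finer-grained view, the per-fold RMSE can be computed from the out-of-fold predictions (a small sketch; the folds are reproduced exactly because cv uses a fixed random_state):
for i, (_, i_val) in enumerate(cv.split(X[0]), 1):
    print(f'CV #{i} RMSE: {mean_squared_error(y[i_val], p[i_val], squared=False):.6f}')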
sub = pd.read_csv(sample_file, index_col=id_col)
sub[target_col] = p_tst
sub.to_csv(submission_file)
sub.head()