TF/Keras BERT Baseline (Training/Inference)
A tutorial on how to train an NLP model with Hugging Face's pretrained BERT in TF/Keras
- Changelogs
- Load Libraries and Data
- Tokenization Using transformers
- Model Training with Cross-Validation
- Submission
This notebook shows how to train a neural network model with a pre-trained BERT model in TensorFlow/Keras. It is based on @xhlulu's Disaster NLP: Keras BERT using TFHub notebook and the Text Extraction with BERT example at Keras.
This competition is a code competition without internet access, so we load the transformers tokenizer and the pre-trained BERT model from Kaggle Datasets instead (see the sketch below).
Hope it helps.
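For reference, this is roughly how a tokenizer and model can be loaded from an attached Kaggle dataset when internet is disabled (the directory name below is a placeholder, not the exact dataset attached to this notebook):
# Offline loading sketch: point from_pretrained() at a local dataset directory
# that contains the saved tokenizer/model files instead of a model hub name.
from transformers import BertTokenizerFast, TFBertModel
local_bert_dir = '../input/bert-base-uncased'  # placeholder Kaggle dataset path
tokenizer = BertTokenizerFast.from_pretrained(local_bert_dir)
bert_model = TFBertModel.from_pretrained(local_bert_dir)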
| Version | CV Score | Public Score | Changes | Comment |
|---|---|---|---|---|
| v9 | to be updated | to be updated | use transformers' tokenizer | |
| v8 | 0.653635 | 0.606 | add 5-fold CV + early-stopping back | |
| v7 | N/A | 0.617 | fix the bug in the learning rate scheduler | overfitting to train? (n=20) |
| v6 | N/A | 0.566 | add the warm-up learning rate scheduler | With a bug. Don't use it |
| v5 | N/A | 0.531 | roll back to v3 | |
| v4 | N/A | 0.573 | add early-stopping | seemed to stop too early with patience=1 (n=5) |
| v3 | N/A | 0.530 | initial baseline | |
%reload_ext autoreload
%autoreload 2
%matplotlib inline
from copy import copy
import joblib
from matplotlib import pyplot as plt
import numpy as np
import os
import pandas as pd
from pathlib import Path
import seaborn as sns
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold
import sys
from warnings import simplefilter
import tensorflow as tf
from tensorflow.keras import Model, Input
from tensorflow.keras.callbacks import EarlyStopping, LearningRateScheduler
from tensorflow.keras.initializers import Constant
from tensorflow.keras.layers import Dense, Embedding, Bidirectional, LSTM, Dropout
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
from tensorflow.keras.metrics import RootMeanSquaredError
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.optimizers import Adam
from transformers import TFBertModel, BertConfig, BertTokenizerFast
simplefilter('ignore')
plt.style.use('fivethirtyeight')
gpu = tf.config.list_physical_devices('GPU')
print("Num GPUs Available: ", len(gpu))
if len(gpu) > 0:
    tf.config.experimental.set_memory_growth(gpu[0], True)
model_name = 'bert_v9'
data_dir = Path('../input/commonlitreadabilityprize')
train_file = data_dir / 'train.csv'
test_file = data_dir / 'test.csv'
sample_file = data_dir / 'sample_submission.csv'
build_dir = Path('../build/')
output_dir = build_dir / model_name
trn_encoded_file = output_dir / 'trn.enc.joblib'
tokenizer_file = output_dir / 'tokenizer.joblib'
val_predict_file = output_dir / f'{model_name}.val.txt'
submission_file = 'submission.csv'
module_url = "../input/bert-en-uncased-l24-h1024-a16"  # not used in this version
id_col = 'id'
target_col = 'target'
text_col = 'excerpt'
max_len = 205
n_fold = 5
n_est = 2
n_stop = 2
batch_size = 8
seed = 42
output_dir.mkdir(parents=True, exist_ok=True)
trn = pd.read_csv(train_file, index_col=id_col)
tst = pd.read_csv(test_file, index_col=id_col)
y = trn[target_col].values
print(trn.shape, y.shape, tst.shape)
trn.head()
pretrained_dir = output_dir / "bert_base_uncased/"
pretrained_dir.mkdir(exist_ok=True)
def load_tokenizer():
    """Load the BERT tokenizer and config, caching both locally for offline reuse."""
    if not os.path.exists(pretrained_dir / 'vocab.txt'):
        tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")
        tokenizer.save_pretrained(pretrained_dir)
        # Cache the model config alongside the tokenizer so it can be reloaded offline.
        model_config = BertConfig.from_pretrained("bert-base-uncased")
        model_config.save_pretrained(pretrained_dir)
    else:
        print('loading the saved pretrained tokenizer')
        tokenizer = BertTokenizerFast.from_pretrained(str(pretrained_dir))
        model_config = BertConfig.from_pretrained(str(pretrained_dir))
    model_config.output_hidden_states = True
    return tokenizer, model_config
def load_bert(config):
    """Load the pre-trained BERT model, caching it locally for offline reuse."""
    if not os.path.exists(pretrained_dir / 'tf_model.h5'):
        bert_model = TFBertModel.from_pretrained("bert-base-uncased", config=config)
        bert_model.save_pretrained(pretrained_dir)
    else:
        print('loading the saved pretrained model')
        bert_model = TFBertModel.from_pretrained(str(pretrained_dir), config=config)
    return bert_model
def bert_encode(texts, tokenizer, max_len=max_len):
    """Tokenize texts into fixed-length input_ids, token_type_ids, and attention_mask arrays."""
    input_ids = []
    token_type_ids = []
    attention_mask = []
    for text in texts:
        token = tokenizer(text, max_length=max_len, truncation=True,
                          padding='max_length', add_special_tokens=True)
        input_ids.append(token['input_ids'])
        token_type_ids.append(token['token_type_ids'])
        attention_mask.append(token['attention_mask'])
    return np.array(input_ids), np.array(token_type_ids), np.array(attention_mask)
tokenizer, bert_config = load_tokenizer()
X = bert_encode(trn[text_col].values, tokenizer, max_len=max_len)
X_tst = bert_encode(tst[text_col].values, tokenizer, max_len=max_len)
y = trn[target_col].values
print(X[0].shape, X_tst[0].shape, y.shape)
joblib.dump(X, trn_encoded_file)
joblib.dump(tokenizer, tokenizer_file)
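As a quick sanity check of the encoding (illustrative, reusing tokenizer and X from above), the first excerpt can be decoded back to text to confirm the special tokens and the max_len padding:
# The decoded sequence should start with [CLS], end with [SEP], and be
# padded with [PAD] tokens up to max_len.
print(tokenizer.decode(X[0][0].tolist())[:120])
print('shapes (input_ids, token_type_ids, attention_mask):', X[0].shape, X[1].shape, X[2].shape)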
A simple model: only a dropout and a dense output layer are added on top of the pre-trained BERT model.
def build_model(bert_model, max_len=max_len):
    input_ids = Input(shape=(max_len,), dtype=tf.int32, name="input_ids")
    token_type_ids = Input(shape=(max_len,), dtype=tf.int32, name="token_type_ids")
    attention_mask = Input(shape=(max_len,), dtype=tf.int32, name="attention_mask")
    # The first output of TFBertModel is the last hidden state: (batch, max_len, hidden_size).
    sequence_output = bert_model(input_ids, token_type_ids=token_type_ids,
                                 attention_mask=attention_mask)[0]
    # Use the [CLS] token representation as the pooled sequence embedding.
    clf_output = sequence_output[:, 0, :]
    clf_output = Dropout(.1)(clf_output)
    out = Dense(1, activation='linear')(clf_output)
    model = Model(inputs=[input_ids, token_type_ids, attention_mask], outputs=out)
    model.compile(Adam(learning_rate=1e-5), loss='mean_squared_error',
                  metrics=[RootMeanSquaredError()])
    return model
Train the model with 5-fold cross-validation, early stopping, and a warm-up learning-rate scheduler.
def scheduler(epoch, lr, warmup=5, decay_start=10):
    """Warm up the learning rate, hold it, then decay it exponentially."""
    if epoch <= warmup:
        return lr / (warmup - epoch + 1)
    elif warmup < epoch <= decay_start:
        return lr
    else:
        return lr * tf.math.exp(-.1)
ls = LearningRateScheduler(scheduler)
es = EarlyStopping(patience=n_stop, restore_best_weights=True)
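To see what the schedule does over a full run, it can be simulated outside training (a small illustration; LearningRateScheduler calls scheduler(epoch, lr) with the model's current learning rate each epoch, so the returned value is fed back in):
# Simulate the warm-up / hold / decay schedule starting from the initial LR of 1e-5.
lr = 1e-5
for epoch in range(15):
    lr = float(scheduler(epoch, lr))
    print(f'epoch {epoch:2d}: lr = {lr:.2e}')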
cv = KFold(n_splits=n_fold, shuffle=True, random_state=seed)
p = np.zeros_like(y, dtype=float)
p_tst = np.zeros((X_tst[0].shape[0], ), dtype=float)
for i, (i_trn, i_val) in enumerate(cv.split(X[0]), 1):
    print(f'training CV #{i}:')
    tf.random.set_seed(seed + i)

    # Build a fresh BERT model for each fold.
    bert_model = load_bert(bert_config)
    clf = build_model(bert_model, max_len=max_len)
    if i == 1:
        print(clf.summary())
    history = clf.fit([x[i_trn] for x in X], y[i_trn],
                      validation_data=([x[i_val] for x in X], y[i_val]),
                      epochs=n_est,
                      batch_size=batch_size,
                      callbacks=[ls, es])
    clf.save_weights(f'{model_name}_cv{i}.h5')

    # Out-of-fold predictions for CV scoring; test predictions averaged across folds.
    p[i_val] = clf.predict([x[i_val] for x in X]).flatten()
    p_tst += clf.predict(list(X_tst)).flatten() / n_fold
print(f'CV RMSE: {mean_squared_error(y, p, squared=False):.6f}')
np.savetxt(val_predict_file, p, fmt='%.6f')
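For a finer-grained view, the per-fold RMSE can be computed from the out-of-fold predictions (a small sketch; the folds are reproduced exactly because cv uses a fixed random_state):
for i, (_, i_val) in enumerate(cv.split(X[0]), 1):
    print(f'CV #{i} RMSE: {mean_squared_error(y[i_val], p[i_val], squared=False):.6f}')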
sub = pd.read_csv(sample_file, index_col=id_col)
sub[target_col] = p_tst
sub.to_csv(submission_file)
sub.head()