PyTorch Lightning RoBERTa Baseline (Training/Inference)
A tutorial on how to train an NLP model with Hugging Face's pretrained RoBERTa in PyTorch Lightning
- Changelogs
- Load Libraries and Data
- Tokenization Using RoBERTa
- Model Training with Cross-Validation
- Submission
This notebook shows how to train a neural network model with pre-trained RoBERTa in PyTorch Lightning.
This is a code competition without internet access, so instead of downloading the pretrained model we load it
from @abhishek's roberta-base Kaggle Dataset.
This notebook shares the same structure as the TF/Keras BERT Baseline (Training/Inference) notebook and is built on top of two other notebooks:
- BERT & PyTorch [CommonLit Readability] Simple by @shivanandmn
- RoBERTa meets TPUs by @yassinealouini
Hope it helps.
%reload_ext autoreload
%autoreload 2
import joblib
import numpy as np
import os
import pandas as pd
from pathlib import Path
import random
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
from tqdm import tqdm
from warnings import simplefilter
simplefilter('ignore')
from pytorch_lightning import Trainer, seed_everything
from pytorch_lightning.callbacks.early_stopping import EarlyStopping
from pytorch_lightning.core.lightning import LightningModule
import torch
from torch import nn
from torch.utils.data import DataLoader, Dataset
from transformers import (PreTrainedModel, RobertaModel, RobertaTokenizerFast, RobertaConfig,
get_constant_schedule_with_warmup, AdamW)
model_name = 'roberta_v1'
data_dir = Path('../input/commonlitreadabilityprize')
train_file = data_dir / 'train.csv'
test_file = data_dir / 'test.csv'
sample_file = data_dir / 'sample_submission.csv'
pretrained_path = '../input/roberta-base/'
build_dir = Path('../build')
output_dir = build_dir / 'model' / model_name
trn_encoded_file = output_dir / 'trn.enc.joblib'
tokenizer_file = output_dir / 'tokenizer.joblib'
val_predict_file = output_dir / f'{model_name}.val.txt'
submission_file = output_dir / 'submission.csv'
id_col = 'id'
target_col = 'target'
text_col = 'excerpt'
max_len = 200
n_fold = 5
n_est = 20
n_stop = 2
batch_size = 8
seed = 42
output_dir.mkdir(parents=True, exist_ok=True)
seed_everything(seed)
if torch.cuda.is_available():
    device = torch.device("cuda")
    print("GPU is available")
else:
    device = torch.device("cpu")
    print("GPU not available, CPU used")
trn = pd.read_csv(train_file, index_col=id_col)
tst = pd.read_csv(test_file, index_col=id_col)
y = trn[target_col].values
print(trn.shape, y.shape)
trn.head()
tokenizer = RobertaTokenizerFast.from_pretrained(pretrained_path, do_lower_case=True)
model_config = RobertaConfig.from_pretrained(pretrained_path)
model_config.output_hidden_states = True
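As a quick sanity check on the tokenizer (a minimal sketch that only inspects one excerpt), the encoded output should contain `input_ids` and `attention_mask` tensors of shape `(1, max_len)`:
# Tokenize the first training excerpt and inspect the encoded tensors.
sample = tokenizer(trn[text_col].iloc[0], max_length=max_len, truncation=True,
                   padding='max_length', return_tensors='pt', add_special_tokens=True)
print(sample['input_ids'].shape, sample['attention_mask'].shape)  # both torch.Size([1, 200])
print(sample['input_ids'][0, :10])  # first few token ids, starting with <s> (id 0)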
class Data(Dataset):
    def __init__(self, df):
        super().__init__()
        self.df = df
        self.labeled = target_col in df

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        text = self.df[text_col].iloc[idx]
        token = tokenizer(text, max_length=max_len, truncation=True, padding='max_length',
                          return_tensors='pt', add_special_tokens=True)
        # return_tensors='pt' already gives tensors, so drop the batch dimension
        # instead of wrapping them in torch.tensor() again
        ids = token['input_ids'].squeeze(0)
        mask = token['attention_mask'].squeeze(0)
        if self.labeled:
            target = torch.tensor(self.df[target_col].iloc[idx], dtype=torch.float)
        return (ids, mask, target) if self.labeled else (ids, mask)
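A quick check of the dataset wiring (a minimal sketch): a labeled row yields `(ids, mask, target)`, while an unlabeled test row yields `(ids, mask)`.
# Labeled training example
ids, mask, target = Data(trn)[0]
print(ids.shape, mask.shape, target)  # torch.Size([200]) torch.Size([200]) and a scalar tensor
# Unlabeled test example
ids, mask = Data(tst)[0]
print(ids.shape, mask.shape)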
Simple model with only an output dense layer added to the pre-trained RoBERTa model.
class ReadabilityModel(LightningModule):
    def __init__(self, conf):
        super().__init__()
        self.config = conf
        self.model = RobertaModel.from_pretrained(pretrained_path, config=self.config)
        self.dropout = nn.Dropout(0.1)
        self.num_targets = 1
        self.clf = nn.Linear(768, self.num_targets)
        torch.nn.init.normal_(self.clf.weight, std=0.02)

    def forward(self, inputs):
        ids, mask = inputs
        out = self.model(ids, attention_mask=mask)
        out = out['hidden_states']
        x = out[-1]                  # last hidden states: (batch, seq_len, 768)
        x = self.dropout(x)
        x = torch.mean(x, 1, True)   # mean-pool over tokens: (batch, 1, 768)
        preds = self.clf(x)          # (batch, 1, 1)
        preds = preds.squeeze(-1).squeeze(-1)
        return preds

    def training_step(self, batch, batch_idx):
        ids, mask, y = batch
        p = self([ids, mask])
        loss = self.loss_fn(p, y)
        self.log('train_loss', loss)
        return loss

    def validation_step(self, batch, batch_idx):
        ids, mask, y = batch
        p = self([ids, mask])
        loss = self.loss_fn(p, y)
        self.log('val_loss', loss)

    def configure_optimizers(self):
        # optimize this module's own parameters rather than the global `model` variable
        optimizer = AdamW(self.parameters(), lr=1e-5, weight_decay=0.01)
        lr_scheduler = get_constant_schedule_with_warmup(optimizer, 100)
        return [optimizer], [lr_scheduler]

    def loss_fn(self, p, y):
        return torch.sqrt(nn.MSELoss()(p, y))
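Before training, a dummy forward pass (a minimal sketch; it loads the pretrained weights once just for a shape check) confirms that a batch of token ids and masks maps to one prediction per example:
# Dummy forward pass with a batch of two padded sequences.
_model = ReadabilityModel(model_config)
_ids = torch.zeros((2, max_len), dtype=torch.long)
_mask = torch.ones((2, max_len), dtype=torch.long)
with torch.no_grad():
    print(_model([_ids, _mask]).shape)  # torch.Size([2])
del _model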
Training the model with early stopping and a learning-rate scheduler.
cv = KFold(n_splits=n_fold, shuffle=True, random_state=seed)
p = np.zeros_like(y, dtype=float)
p_tst = np.zeros((tst.shape[0],), dtype=float)
for i_cv, (i_trn, i_val) in enumerate(cv.split(trn), 1):
    model = ReadabilityModel(model_config)
    trn_loader = DataLoader(Data(trn.iloc[i_trn]), shuffle=True, batch_size=batch_size)
    val_loader = DataLoader(Data(trn.iloc[i_val]), shuffle=False, batch_size=batch_size * 8)

    trainer = Trainer(gpus=[0], max_epochs=n_est,
                      callbacks=[EarlyStopping(monitor='val_loss', mode='min', patience=n_stop)],
                      checkpoint_callback=False)
    trainer.fit(model, trn_loader, val_loader)

    # rebuild the validation loader without the target column so Data returns (ids, mask) only
    val_loader = DataLoader(Data(trn.iloc[i_val].drop(target_col, axis=1)), shuffle=False,
                            batch_size=batch_size * 8)
    tst_loader = DataLoader(Data(tst), shuffle=False, batch_size=batch_size * 8)
    p[i_val] = np.concatenate(trainer.predict(model, val_loader))
    p_tst += np.concatenate(trainer.predict(model, tst_loader)) / n_fold

    trainer.save_checkpoint(f'{model_name}_cv{i_cv}.ckpt')
    del trainer, model
print(f'CV RMSE: {mean_squared_error(y, p, squared=False):.6f}')
np.savetxt(val_predict_file, p, fmt='%.6f')
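Each fold's weights are saved to `{model_name}_cv{fold}.ckpt`, so they can be reloaded later for inference. Here is a minimal sketch; `load_from_checkpoint` needs `conf` passed again because the module does not call `save_hyperparameters`.
# Reload the first fold's checkpoint and re-predict on the test set.
model = ReadabilityModel.load_from_checkpoint(f'{model_name}_cv1.ckpt', conf=model_config)
trainer = Trainer(gpus=[0])
tst_loader = DataLoader(Data(tst), shuffle=False, batch_size=batch_size * 8)
p_tst_cv1 = np.concatenate(trainer.predict(model, tst_loader))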
sub = pd.read_csv(sample_file, index_col=id_col)
sub[target_col] = p_tst
sub.to_csv(submission_file)
sub.head()
If you find it helpful, please upvote the notebook. Also check out my other notebooks below:
- TF/Keras BERT Baseline (Training/Inference): shares the TF/Keras BERT baseline with 5-fold CV
- All Zero Submission: shows the public LB score for an all-zero submission
- DAE with 2 Lines of Code with Kaggler: shows how to generate Denoising AutoEncoder features using Kaggler
Happy Kaggling~!