This notebook shows how to train a neural network model with a pre-trained RoBERTa backbone in PyTorch Lightning.

This is a code competition without internet access, so we load the pretrained model from @abhishek's roberta-base Kaggle Dataset instead of downloading it from the Hugging Face hub.
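
Loading from a local directory works just like loading from the hub: point from_pretrained at the attached dataset's path. A minimal sketch (not part of the original notebook; the path matches the configuration cell below):

from transformers import RobertaModel, RobertaTokenizerFast

local_path = '../input/roberta-base/'                     # attached Kaggle Dataset directory
tokenizer = RobertaTokenizerFast.from_pretrained(local_path)
backbone = RobertaModel.from_pretrained(local_path)       # no internet access needed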

This notebook follows the same structure as the TF/Keras BERT Baseline (Training/Inference) notebook and is built on top of two other notebooks:

Hope it helps.

Changelogs

| Version | CV Score | Public Score | Changes | Comment |
|---------|----------|--------------|---------|---------|
| v1 | to be updated | | initial baseline | |

Load Libraries and Data

%reload_ext autoreload
%autoreload 2
import joblib
import numpy as np
import os
import pandas as pd 
from pathlib import Path
import random
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
from tqdm import tqdm
from warnings import simplefilter
simplefilter('ignore')
from pytorch_lightning import Trainer, seed_everything
from pytorch_lightning.callbacks.early_stopping import EarlyStopping
from pytorch_lightning.core.lightning import LightningModule
import torch
from torch import nn
from torch.utils.data import DataLoader, Dataset
from transformers import (PreTrainedModel, RobertaModel, RobertaTokenizerFast, RobertaConfig,
                          get_constant_schedule_with_warmup, AdamW)
model_name = 'roberta_v1'

data_dir = Path('../input/commonlitreadabilityprize')
train_file = data_dir / 'train.csv'
test_file = data_dir / 'test.csv'
sample_file = data_dir / 'sample_submission.csv'

pretrained_path = '../input/roberta-base/'

build_dir = Path('../build')
output_dir = build_dir / 'model' / model_name

trn_encoded_file = output_dir / 'trn.enc.joblib'
tokenizer_file = output_dir / 'tokenizer.joblib'
val_predict_file = output_dir / f'{model_name}.val.txt'
submission_file = output_dir / 'submission.csv'

id_col = 'id'
target_col = 'target'
text_col = 'excerpt'

max_len = 200
n_fold = 5
n_est = 20
n_stop = 2
batch_size = 8
seed = 42
output_dir.mkdir(parents=True, exist_ok=True)
seed_everything(seed)
Global seed set to 42
42
if torch.cuda.is_available():
    device = torch.device("cuda")
    print("GPU is available")
else:
    device = torch.device("cpu")
    print("GPU not available, CPU used")
GPU is available
trn = pd.read_csv(train_file, index_col=id_col)
tst = pd.read_csv(test_file, index_col=id_col)
y = trn[target_col].values
print(trn.shape, y.shape)
trn.head()
(2834, 5) (2834,)
| id | url_legal | license | excerpt | target | standard_error |
|----|-----------|---------|---------|--------|----------------|
| c12129c31 | NaN | NaN | When the young people returned to the ballroom... | -0.340259 | 0.464009 |
| 85aa80a4c | NaN | NaN | All through dinner time, Mrs. Fayre was somewh... | -0.315372 | 0.480805 |
| b69ac6792 | NaN | NaN | As Roger had predicted, the snow departed as q... | -0.580118 | 0.476676 |
| dd1000b26 | NaN | NaN | And outside before the palace a great garden w... | -1.054013 | 0.450007 |
| 37c1b32fb | NaN | NaN | Once upon a time there were Three Bears who li... | 0.247197 | 0.510845 |

Tokenization Using RoBERTa

tokenizer = RobertaTokenizerFast.from_pretrained(pretrained_path, do_lower_case=True)
model_config = RobertaConfig.from_pretrained(pretrained_path)
model_config.output_hidden_states = True
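
Setting output_hidden_states=True makes the backbone return a 'hidden_states' tuple (the embedding output plus one tensor per transformer layer) alongside last_hidden_state; the model below pools the last element of that tuple. A quick check of that output (a sketch, not part of the original notebook):

# Sketch: inspect what output_hidden_states=True adds to the backbone output.
backbone = RobertaModel.from_pretrained(pretrained_path, config=model_config)
enc = tokenizer("A short example excerpt.", return_tensors='pt')
with torch.no_grad():
    out = backbone(enc['input_ids'], attention_mask=enc['attention_mask'])
print(len(out['hidden_states']))        # 13 for roberta-base: embeddings + 12 layers
print(out['hidden_states'][-1].shape)   # torch.Size([1, seq_len, 768])
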
class Data(Dataset):
    def __init__(self, df):
        super().__init__()
        self.df = df
        self.labeled = target_col in df  # the test set has no target column

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        text = self.df[text_col].iloc[idx]
        token = tokenizer(text, max_length=max_len, truncation=True, padding='max_length', 
                          return_tensors='pt', add_special_tokens=True)
        # the tokenizer already returns tensors of shape (1, max_len); drop the batch dimension
        ids = token['input_ids'].squeeze(0)
        mask = token['attention_mask'].squeeze(0)
        if self.labeled:
            target = torch.tensor(self.df[target_col].iloc[idx], dtype=torch.float)
        
        return (ids, mask, target) if self.labeled else (ids, mask)
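
A quick sanity check (a sketch, not part of the original notebook): a single item from the labeled training Dataset should yield token ids and an attention mask of length max_len, plus a scalar target.

ds = Data(trn)
ids, mask, target = ds[0]
print(ids.shape, mask.shape, target.shape)   # expected: torch.Size([200]) torch.Size([200]) torch.Size([])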

Model Training with Cross-Validation

Simple model with only an output dense layer added to the pre-trained RoBERTa model.

class ReadabilityModel(LightningModule):
    
    def __init__(self, conf):
        super().__init__()
        self.config = conf
        self.model = RobertaModel.from_pretrained(pretrained_path, config=self.config)
        self.dropout = nn.Dropout(0.1)
        self.num_targets = 1
        self.clf = nn.Linear(768, self.num_targets)
        torch.nn.init.normal_(self.clf.weight, std=0.02)
    
    def forward(self, inputs):
        ids, mask = inputs
        out = self.model(ids, attention_mask=mask)
        # take the last layer's hidden states and mean-pool them over the token dimension
        x = out['hidden_states'][-1]
        x = self.dropout(x)
        x = torch.mean(x, 1, True)
        preds = self.clf(x)
        preds = preds.squeeze(-1).squeeze(-1)

        return preds
    
    def training_step(self, batch, batch_idx):
        ids, mask, y = batch
        p = self([ids, mask])
        loss = self.loss_fn(p, y)
        self.log('train_loss', loss)
        return loss
    
    def validation_step(self, batch, batch_idx):
        ids, mask, y = batch
        p = self([ids, mask])
        loss = self.loss_fn(p, y)
        self.log('val_loss', loss)
        
    def configure_optimizers(self):
        # use the module's own parameters, not the global `model` variable
        optimizer = AdamW(self.parameters(), lr=1e-5, weight_decay=0.01)
        lr_scheduler = get_constant_schedule_with_warmup(optimizer, num_warmup_steps=100)
        return [optimizer], [lr_scheduler]
    
    def loss_fn(self, p, y):
        # RMSE, the competition metric
        return torch.sqrt(nn.MSELoss()(p, y))
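
Before launching cross-validation, a quick smoke test (a sketch, not part of the original notebook) confirms that one batch flows through the model and that prediction and target shapes line up:

_model = ReadabilityModel(model_config)
_loader = DataLoader(Data(trn), batch_size=4)
ids, mask, y_batch = next(iter(_loader))
with torch.no_grad():
    preds = _model([ids, mask])
print(preds.shape, y_batch.shape)   # both torch.Size([4])
del _model, _loader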

Training the model with early stopping and a learning-rate scheduler:

cv = KFold(n_splits=n_fold, shuffle=True, random_state=seed)

p = np.zeros_like(y, dtype=float)
p_tst = np.zeros((tst.shape[0],), dtype=float)
for i_cv, (i_trn, i_val) in enumerate(cv.split(trn), 1):
    model = ReadabilityModel(model_config)
    trn_loader = DataLoader(Data(trn.iloc[i_trn]), shuffle=True, batch_size=batch_size)
    val_loader = DataLoader(Data(trn.iloc[i_val]), shuffle=False, batch_size=batch_size * 8)

    trainer = Trainer(gpus=[0], max_epochs=n_est, 
                      callbacks=[EarlyStopping(monitor='val_loss', mode='min', patience=n_stop)], 
                      checkpoint_callback=False)
    trainer.fit(model, trn_loader, val_loader)

    val_loader = DataLoader(Data(trn.iloc[i_val].drop(target_col, axis=1)), shuffle=False, 
                            batch_size=batch_size * 8)
    tst_loader = DataLoader(Data(tst), shuffle=False, batch_size=batch_size * 8)
    p[i_val] = np.concatenate(trainer.predict(model, val_loader))
    p_tst += np.concatenate(trainer.predict(model, tst_loader)) / n_fold
    
    trainer.save_checkpoint(f'{model_name}_cv{i_cv}.ckpt')
    del trainer, model
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name    | Type         | Params
-----------------------------------------
0 | model   | RobertaModel | 124 M 
1 | dropout | Dropout      | 0     
2 | clf     | Linear       | 769   
-----------------------------------------
124 M     Trainable params
0         Non-trainable params
124 M     Total params
498.586   Total estimated model params size (MB)
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
(The same GPU messages and RobertaModel parameter summary are printed for each of the remaining four folds.)
print(f'CV RMSE: {mean_squared_error(y, p, squared=False):.6f}')
np.savetxt(val_predict_file, p, fmt='%.6f')
CV RMSE: 0.678173
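
The out-of-fold predictions are written to disk so they can be reused later, e.g. for ensembling; reloading the file reproduces the CV RMSE up to rounding (a sketch, not part of the original notebook):

p_loaded = np.loadtxt(val_predict_file)
print(f'CV RMSE (reloaded): {mean_squared_error(y, p_loaded, squared=False):.6f}')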

Submission

sub = pd.read_csv(sample_file, index_col=id_col)
sub[target_col] = p_tst
sub.to_csv(submission_file)
sub.head()
| id | target |
|----|--------|
| c0f722661 | -0.087452 |
| f0953f0a5 | -0.122406 |
| 0df072751 | -0.165737 |
| 04caf4e0c | -2.293868 |
| 0e63f8bea | -1.454202 |

If you find it helpful, please upvote the notebook. Also check out my other notebooks below:

Happy Kaggling~!