Hi there,
I am attempting to recreate R2BERT (see the paper here: https://www.aclweb.org/anthology/2020.findings-emnlp.141.pdf), which combines regression and ranking in its loss function when training a model to predict an essay score. I have successfully built and trained the model in native PyTorch. When I use the Trainer module, training works fine, but if I call the evaluate or predict methods on the trainer I am met by an Arrow error, caused by the EvalPrediction.predictions array being a different length than EvalPrediction.label_ids. After some snooping around I noticed that the difference in length always turns out to equal the number of evaluation batches; so, for example, evaluating over 5 batches, the predictions come up 5 short. Any ideas on what might be causing this?
Here's my code. I use the ASAP dataset, but with only 32 essays for each of the test, validation and training sets, just as a scrap dataset to try and get the model working (I've tried with the full dataset: the behaviour is the same).
Imports:
from transformers import (TrainingArguments,
                          Trainer,
                          AutoConfig,
                          AutoTokenizer,
                          AutoModel,
                          AdamW,
                          EvalPrediction)
from datasets import load_metric, load_dataset
import torch
from torch import nn
import torch.nn.functional as F
from sklearn.metrics import cohen_kappa_score
import re
Acquiring the dataset:
model_name = 'bert-base-uncased'
path = 'datasets/AES/asap'
dataset_title = 'asap'
# loading dataset
dataset = load_dataset('csv', data_files={'train': [f'{path}/PreProcessed/CsvFiles/{dataset_title}_dev_train.csv'],
                                          'val': [f'{path}/PreProcessed/CsvFiles/{dataset_title}_dev_val.csv'],
                                          'test': [f'{path}/PreProcessed/CsvFiles/{dataset_title}_dev_test.csv']})
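As a quick check, the loaded splits can be inspected (a minimal sketch of what I'd expect to see):
# Sketch: confirm the three splits loaded from the CSVs
print(dataset)  # DatasetDict with 'train', 'val' and 'test' splits
print({split: dataset[split].num_rows for split in dataset})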
Tokenizing the dataset:
# tokenizing dataset
tokenizer = AutoTokenizer.from_pretrained(model_name)
def encode_batch(batch):
    """Encodes a batch of input data using the model tokenizer."""
    return tokenizer(batch["essay"], max_length=512, truncation=True, padding="max_length")
# Encode the input data
dataset = dataset.map(encode_batch, batched=True)
# labels = normalised scores, domain1_score = original score, essay_set = prompt number the essay belongs to
# will be used to adjust predicted scores to original scoring scale for each essay.
dataset.set_format(type="torch", columns=["input_ids", "attention_mask","essay_set","labels","domain1_score"])
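As a sanity check on the encoding (a minimal sketch, using the column names set just above):
# Sketch: inspect one encoded training example to confirm shapes and columns
sample = dataset["train"][0]
print(sample["input_ids"].shape)       # expected: torch.Size([512])
print(sample["attention_mask"].shape)  # expected: torch.Size([512])
print(sample["labels"], sample["domain1_score"], sample["essay_set"])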
Building the model:
class R2BERT(nn.Module):
    def __init__(
        self,
        pretrained_model_name,
        norm_params=None,
    ):
        super().__init__()
        # get bert model
        config = AutoConfig.from_pretrained(pretrained_model_name)
        self.model = AutoModel.from_pretrained(pretrained_model_name,
                                               config=config)
        # add final layer to make the score prediction
        self.predictor = nn.Linear(config.hidden_size, 1)
        # To be used for calculating the kappa metric, by accessing the minimum score and
        # score range for each essay set to map predictions back to the original scoring
        # range (not got round to it yet).
        self.norm_params = norm_params

    # Method for freezing BERT layers (using a regex to find all layers numbered below
    # a specified n_training_layer and setting requires_grad = False on them).
    # Done mainly to avoid a CUDA runtime error when training.
    def set_trainable_params(self, n_training_layer=None):
        # iterate over self.named_parameters(), not the global `model`
        for param_name, param_value in self.named_parameters():
            if n_training_layer:
                layer_num = re.findall(r'\d+', param_name)
                if len(layer_num) > 0:
                    layer_num = int(layer_num[0])
                else:
                    layer_num = 0
                if param_name.startswith('model') and layer_num < n_training_layer:
                    param_value.requires_grad = False
            else:
                if param_name.startswith('model'):
                    param_value.requires_grad = False

    # The forward pass takes its inputs as **kwargs, making them a dictionary.
    # The values of the keys 'input_ids' and 'attention_mask' are used to get the BERT
    # output, then the linear layer is applied to get the score, which is returned as
    # the model output. The output is reshaped from [batch_size, 1] to just [batch_size]
    # to prevent a broadcasting error.
    def forward(self, **inputs):
        bert_output = self.model(
            input_ids=inputs['input_ids'],
            attention_mask=inputs['attention_mask'])
        # representation of the [CLS] token
        text_representation = bert_output[0][:, 0, :]
        batch_size = inputs['input_ids'].size()[0]
        return self.predictor(text_representation).view(batch_size)
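A quick smoke test of the model on a dummy batch (a standalone sketch reusing the tokenizer loaded above, not part of the training script):
# Sketch: run a dummy batch through R2BERT to confirm the output shape
test_model = R2BERT(model_name)
test_batch = tokenizer(["A short essay.", "Another short essay."],
                       max_length=512, truncation=True,
                       padding="max_length", return_tensors="pt")
scores = test_model(input_ids=test_batch["input_ids"],
                    attention_mask=test_batch["attention_mask"])
print(scores.shape)  # expected: torch.Size([2]), i.e. one score per essay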
# A Trainer subclass, created to define a custom loss function
class R2Trainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False):
        # labels are the scores of the essays
        labels = inputs["labels"]
        # The output of the model is passed through a sigmoid activation function
        # to ensure it is between 0 and 1. This is because the essay scores have been
        # normalised with min-max scaling to adjust for the different scoring ranges
        # of the different prompts.
        outputs = torch.sigmoid(model(**inputs))
        # mean squared error used as the regression loss
        loss_m = F.mse_loss(outputs, labels)
        # Softmax is applied to both the predicted and the normalised gold scores
        # (essentially giving, for each essay in the set, the probability that it
        # would be ranked the highest scoring). This enables the use of the ListNet
        # algorithm, which is used for the ranking loss.
        sm_pred_scores = F.softmax(outputs, dim=0)
        sm_gold_scores = F.softmax(labels, dim=0)
        # The ListNet loss is the cross-entropy applied here; it essentially measures
        # how different the two softmax distributions are.
        loss_r = torch.sum(-sm_gold_scores * torch.log(sm_pred_scores))
        # The two losses are then added together
        loss = loss_m + loss_r
        return (loss, outputs) if return_outputs else loss
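To illustrate the combined loss on toy numbers (a standalone sketch, not part of the training script):
# Sketch: the regression + ListNet loss on a toy batch of 3 scores
preds = torch.tensor([0.7, 0.2, 0.9])  # model outputs after sigmoid
gold = torch.tensor([0.6, 0.0, 1.0])   # normalised gold scores
loss_m = F.mse_loss(preds, gold)
loss_r = torch.sum(-F.softmax(gold, dim=0) * torch.log(F.softmax(preds, dim=0)))
print(loss_m.item(), loss_r.item(), (loss_m + loss_r).item())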
def compute_accuracy(p):
    #####################################################
    # Here is where the error lies: p.predictions returns only 30
    # predictions for the training arguments and parameters set below
    logits, labels = p.predictions, p.label_ids
    print(p)
    return metric.compute(predictions=logits, references=labels)
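As an aside, the same correlations can be computed directly with scipy (a sketch of an alternative; scipy is an assumption here, it is not imported above):
# Sketch: computing the correlations with scipy.stats instead of load_metric
from scipy.stats import pearsonr, spearmanr

def compute_accuracy_scipy(p):
    logits, labels = p.predictions, p.label_ids
    return {"pearsonr": pearsonr(logits, labels)[0],
            "spearmanr": spearmanr(logits, labels)[0]}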
model = R2BERT(model_name)
model.set_trainable_params(6)
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model.to(device)
metric = load_metric("pearsonr", "spearmanr")
training_args = TrainingArguments(
    learning_rate=4e-5,
    num_train_epochs=2,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    logging_steps=200,
    output_dir="./training_output",
    overwrite_output_dir=True,
    evaluation_strategy='steps',
    remove_unused_columns=False,
)
trainer = R2Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["val"],
    compute_metrics=compute_accuracy,
)
Training:
trainer.train()
Output:
TrainOutput(global_step=4, training_loss=2.8204991817474365, metrics={'train_runtime': 2.4896, 'train_samples_per_second': 25.707, 'train_steps_per_second': 1.607, 'total_flos': 0.0, 'train_loss': 2.8204991817474365, 'epoch': 2.0})
Predicting:
trainer.predict(dataset['test'])
Output and Error:
EvalPrediction(predictions=array([0.6802865 , 0.69348145, 0.7554306 , 0.70484996, 0.7307703 ,
0.74552727, 0.6842238 , 0.76353663, 0.69672614, 0.7247801 ,
0.77793705, 0.7025176 , 0.6014939 , 0.6216687 , 0.702473 ,
0.6444423 , 0.73216194, 0.75792855, 0.7077718 , 0.62824374,
0.72637045, 0.7813148 , 0.71593434, 0.7130688 , 0.7126326 ,
0.7286271 , 0.6804262 , 0.7279507 , 0.69572073, 0.72733516],
dtype=float32), label_ids=array([0.75 , 0. , 0.75 , 0.6 , 0.6 ,
0.2 , 0.4 , 1. , 0.6 , 0.6666667 ,
0.6363636 , 0.75 , 0.9 , 0.75 , 0.25 ,
0.56 , 0.75 , 0.6666667 , 0.27272728, 0.5 ,
1. , 0. , 0.44 , 1. , 0.6 ,
0.4 , 0.36 , 0.5 , 0.36363637, 0.8181818 ,
1. , 0.59090906], dtype=float32))
---------------------------------------------------------------------------
ArrowInvalid Traceback (most recent call last)
/usr/local/lib/python3.7/dist-packages/datasets/metric.py in add_batch(self, predictions, references)
434 try:
--> 435 self.writer.write_batch(batch)
436 except pa.ArrowInvalid:
10 frames
ArrowInvalid: Column 1 named references expected length 30 but got length 32
During handling of the above exception, another exception occurred:
ValueError Traceback (most recent call last)
/usr/local/lib/python3.7/dist-packages/datasets/metric.py in add_batch(self, predictions, references)
436 except pa.ArrowInvalid:
437 raise ValueError(
--> 438 f"Predictions and/or references don't match the expected format.\n"
439 f"Expected format: {self.features},\n"
440 f"Input predictions: {predictions},\n"
ValueError: Predictions and/or references don't match the expected format.
Expected format: {'predictions': Value(dtype='int32', id=None), 'references': Value(dtype='int32', id=None)},
Input predictions: [0.6802865 0.69348145 0.7554306 0.70484996 0.7307703 0.74552727
0.6842238 0.76353663 0.69672614 0.7247801 0.77793705 0.7025176
0.6014939 0.6216687 0.702473 0.6444423 0.73216194 0.75792855
0.7077718 0.62824374 0.72637045 0.7813148 0.71593434 0.7130688
0.7126326 0.7286271 0.6804262 0.7279507 0.69572073 0.72733516],
Input references: [0.75 0. 0.75 0.6 0.6 0.2
0.4 1. 0.6 0.6666667 0.6363636 0.75
0.9 0.75 0.25 0.56 0.75 0.6666667
0.27272728 0.5 1. 0. 0.44 1.
0.6 0.4 0.36 0.5 0.36363637 0.8181818
1. 0.59090906]
Kind Regards,
Cameron
Environment info:
- `transformers` version: 4.7.0
- Platform: Linux-5.4.104+-x86_64-with-Ubuntu-18.04-bionic
- Python version: 3.7.10
- PyTorch version (GPU?): 1.8.1+cu101 (True)
- Tensorflow version (GPU?): 2.5.0 (True)
- Using GPU in script?: <fill in>
- Using distributed or parallel set-up in script?: <fill in>