I’m trying to fine-tune BERT (DistilBERT, specifically) on a regression task. My target values are approximately uniformly distributed over [0, 100], and I train with an MSE loss.
Both the training loss and the validation loss appear to go down over the course of training (per the W&B loss curves).
But when I look at the output predictions, they’re all nearly the same value:
13.34873,13.34946,13.34548,13.34980,13.34415,13.35009,13.35031,13.35068,13.35060,13.34515,13.34916,13.34391,13.32421,13.33146,13.29470,13.34953,13.35133,13.34735,13.34369,13.34804,13.35447,13.34434,13.35356,13.34438,13.35195,13.35314,13.34806,13.33857,13.34869,13.34059,13.35074,13.34365,13.35027,13.34974,13.35198,13.34209,13.34324,13.35140,13.35044,13.34025,13.34005,13.35257,13.30577,13.34795,13.33279,13.34773,13.33482,13.35300,13.34842,13.33357,13.34200,13.35000
This suggests to me that I’m doing something wrong. My code is below; could someone please help me?
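(For completeness: train_dataset and eval_dataset are ordinary PyTorch datasets of tokenized text with float labels. The sketch below is representative rather than my exact code — the texts, targets, and tokenizer options shown are placeholders.)

import torch
from transformers import DistilBertTokenizerFast

class RegressionDataset(torch.utils.data.Dataset):
    """Pairs tokenized encodings with float regression targets."""
    def __init__(self, encodings, targets):
        self.encodings = encodings
        self.targets = targets

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        # Targets must be floats, since the loss is MSE against a single logit
        item['labels'] = torch.tensor(self.targets[idx], dtype=torch.float)
        return item

    def __len__(self):
        return len(self.targets)

tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')
texts = ["example document one", "example document two"]  # placeholders
targets = [13.5, 87.2]                                    # targets lie in [0, 100]
encodings = tokenizer(texts, truncation=True, padding=True)
train_dataset = RegressionDataset(encodings, targets)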
import os
import torch
from transformers import (DistilBertForSequenceClassification,
                          Trainer, TrainingArguments)

training_args = TrainingArguments(
    output_dir=results_dir,          # output directory
    num_train_epochs=10,             # total number of training epochs
    per_device_train_batch_size=16,  # batch size per device during training
    per_device_eval_batch_size=64,   # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for the learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir=results_dir,         # directory for storing logs
    logging_steps=10,
    report_to='wandb',
    do_eval=True,
    evaluation_strategy="steps",
    eval_steps=10,
)
class RegressionTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False):
        labels = inputs.get("labels")
        outputs = model(**inputs)
        logits = outputs.get('logits')
        # Mean squared error between the single logit and the float target
        loss = torch.mean(torch.square(logits.squeeze() - labels.squeeze()))
        return (loss, outputs) if return_outputs else loss
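As a sanity check, the loss itself behaves as expected on dummy tensors (illustrative snippet only):

logits = torch.tensor([[10.0], [20.0]])  # model outputs, shape (batch, 1)
labels = torch.tensor([12.0, 18.0])      # float targets
loss = torch.mean(torch.square(logits.squeeze() - labels.squeeze()))
print(loss)  # tensor(4.) — the mean of (10-12)^2 and (20-18)^2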
pytorch_model_save_path = os.path.join(results_dir, 'pytorch_model.bin')
if os.path.isfile(pytorch_model_save_path):  # If the model was already fine-tuned
    # Yes, pass the whole results dir; see https://github.com/huggingface/transformers/issues/1620
    model = DistilBertForSequenceClassification.from_pretrained(
        results_dir,
        num_labels=1)
else:  # If the model needs to be fine-tuned
    # Set the output dimension to 1 to perform regression
    model = DistilBertForSequenceClassification.from_pretrained(
        "distilbert-base-uncased",
        num_labels=1)
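(A quick inspection confirms the head really is a single-output regression head — the classifier layer of DistilBertForSequenceClassification is a Linear layer with num_labels outputs:)

print(model.classifier.out_features)  # 1 — a single regression output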
trainer = RegressionTrainer(
    model=model,          # the instantiated 🤗 Transformers model to be trained
    args=training_args,   # training arguments, defined above
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    # compute_metrics=compute_eval_metrics,
)
if force_train or not os.path.isfile(pytorch_model_save_path):
    trainer.train()
    trainer.save_model(output_dir=results_dir)

all_prediction_output = trainer.predict(all_dataset)
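The predictions listed at the top come from this output; checking their spread confirms the collapse (a minimal sketch — predictions is the array returned by trainer.predict):

import numpy as np

preds = np.asarray(all_prediction_output.predictions).squeeze()  # shape (num_examples,)
print(preds.min(), preds.max(), preds.std())
# roughly 13.29 to 13.35, with a standard deviation on the order of 0.01,
# even though the targets span 0 to 100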