Unable to update the weights / learn anything

I’m a bit at a loss here. None of my Hugging Face transformer models “learn” anything. It’s probably a simple problem, but I’m failing to identify the issue.

import argparse
import os
import pandas as pd
import torch
import numpy as np
from transformers import AutoTokenizer, TrainingArguments, EarlyStoppingCallback, AutoModelForSequenceClassification, Trainer, get_linear_schedule_with_warmup
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset
from sklearn.utils.class_weight import compute_class_weight

from torch import nn
import torch.nn.functional as F
import warnings
from sklearn.exceptions import UndefinedMetricWarning
warnings.filterwarnings("ignore", category=UndefinedMetricWarning)

import evaluate
import joblib

from typing import Dict

# Checkpoint name passed to both the tokenizer and the model loader below.
model_name = 'roberta-base'

class MyDataset(Dataset):
    """Map-style dataset over pre-tokenized inputs and their labels.

    Args:
        encodings: Indexable collection of input-id tensors, one per example.
        attention_mask: Indexable collection of attention-mask tensors,
            aligned with ``encodings``.
        labels: Indexable collection of labels, aligned with ``encodings``.
            Items may be tensors or anything ``torch.tensor`` accepts.
        dtype: Torch dtype every returned label is converted to.
    """

    def __init__(self, encodings, attention_mask, labels, dtype=torch.float):
        self.encodings = encodings
        self.attention_mask = attention_mask
        self.labels = labels
        self.dtype = dtype

    def __getitem__(self, idx):
        """Return one example as a dict with input_ids, attention_mask and labels."""
        label = self.labels[idx]
        if torch.is_tensor(label):
            # Copy an existing tensor explicitly: torch.tensor(tensor) works
            # but emits a UserWarning asking for clone().detach() instead.
            label = label.clone().detach().to(self.dtype)
        else:
            label = torch.tensor(label, dtype=self.dtype)
        return {
            'input_ids': self.encodings[idx].clone().detach(),
            'attention_mask': self.attention_mask[idx].clone().detach(),
            'labels': label,
        }

    def __len__(self):
        """Return the number of examples (length of ``encodings``)."""
        return len(self.encodings)

# for trainer
def compute_metrics(eval_preds):
    """Compute accuracy and macro-averaged F1 for one evaluation pass.

    Args:
        eval_preds: A ``(logits, labels)`` pair. Both are reduced with
            ``argmax`` over the last axis, so the labels are expected in the
            same two-column layout as the logits.

    Returns:
        A dict with the keys ``'accuracy'`` and ``'f1'``.
    """
    accuracy_metric = evaluate.load("accuracy")
    f1_metric = evaluate.load("f1")
    logits, labels = eval_preds
    # Collapse the two-column scores/labels into class indices (0 or 1).
    predictions = np.argmax(logits, axis=-1)
    references = np.argmax(labels, axis=-1)
    accuracy = accuracy_metric.compute(predictions=predictions, references=references)["accuracy"]
    # The averaging mode is an argument of compute(); given to load() it did
    # not take effect, so the reported F1 was the default binary score.
    f1 = f1_metric.compute(predictions=predictions, references=references, average='macro')["f1"]

    return {'accuracy': accuracy, 'f1': f1}

# Read the train/validation frames.
# They are already local because SageMaker took care of that.
train_data = phrase_id_train
val_data = phrase_id_val

tokenizer = AutoTokenizer.from_pretrained(model_name)


def _encode_split(frame):
    """Tokenize ``frame.X`` and build two-column labels from ``frame.y``.

    Returns:
        ``(encoded, input_ids, attention_mask, labels)`` where ``encoded`` is
        the raw tokenizer output and ``labels`` has one row ``[1 - y, y]``
        per example.
    """
    encoded = tokenizer(frame.X.tolist(), truncation=True, padding=True, max_length=200, return_tensors='pt')
    # Column 0 holds 1 - y and column 1 holds y, so y == 1 becomes [0, 1] and
    # y == 0 becomes [1, 0]; argmax over the row recovers y.
    labels = torch.from_numpy(np.array([[1 - v, v] for v in frame.y.values]))
    return encoded, encoded['input_ids'], encoded['attention_mask'], labels


train_inputs, train_input_ids, train_attention_mask, train_labels = _encode_split(train_data)
val_inputs, val_input_ids, val_attention_mask, val_labels = _encode_split(val_data)

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# The call was missing its closing parenthesis, which made the script unparsable.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=2,  # 2 labels -> 0,1 or 1,0
    output_attentions=False,
    output_hidden_states=False,
)

model = model.to(device)

# Split the parameters into a decayed group and a non-decayed group.
param_optimizer = list(model.named_parameters())
# 'gamma'/'beta' are legacy LayerNorm parameter names; 'LayerNorm.weight' is
# added so LayerNorm scales are also excluded under the current naming
# ('bias' already matches 'LayerNorm.bias').
no_decay = ['bias', 'gamma', 'beta', 'LayerNorm.weight']
# torch.optim.AdamW reads the per-group key 'weight_decay'. A key named
# 'weight_decay_rate' is not consulted, so the optimizer's default decay
# would apply to both groups.
optimizer_grouped_parameters = [
    {
        'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay': 0.01,
    },
    {
        'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay': 0.0,
    },
]
# Build the AdamW optimizer over the two parameter groups.
# NOTE(review): this call is cut off in the visible source — the remaining
# arguments (presumably the learning rate) and the closing parenthesis are
# missing. Restore them from the original script before running.
optimizer = torch.optim.AdamW(optimizer_grouped_parameters, 

n_epochs = 1
batch_size = 16
# Count training *examples*. len(train_inputs) is the length of the
# tokenizer's dict-like output (its number of fields), not the number of
# rows; it made total_steps truncate to 0, and a linear schedule with zero
# training steps yields a learning rate of 0 for every step.
num_train_examples = len(train_input_ids)
# Ceiling division: the last, possibly partial, batch is an optimizer step too.
# Assumes a single device and no gradient accumulation — TODO confirm.
steps_per_epoch = (num_train_examples + batch_size - 1) // batch_size
total_steps = steps_per_epoch * n_epochs

scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=int(total_steps * 0.06),  # warm up over the first 6% of steps
    num_training_steps=total_steps,
)

# Define the training arguments.
# NOTE(review): this call is cut off in the visible source — the closing
# parenthesis and any further arguments are missing. The log further down
# shows an evaluation every 300 steps, so an evaluation schedule was
# presumably configured here; confirm against the original script.
training_args = TrainingArguments(
    output_dir='./trained_model',  # output directory
    num_train_epochs=n_epochs,  # total number of training epochs
    per_device_train_batch_size=batch_size,  # batch size per device during training
    per_device_eval_batch_size=64,  # batch size for evaluation
    #warmup_steps=0.0,  # number of warmup steps for learning rate scheduler
    #weight_decay=0.0,  # strength of weight decay
    logging_dir='./logs',  # directory for storing logs
    # NOTE(review): reloading the best checkpoint presumably relies on
    # evaluation/save settings that are not visible here — confirm.
    load_best_model_at_end = True,
    #learning_rate=2e-5,  # learning rate
    #lr_scheduler_type='linear', ####### NEW

# Wrap each tokenized split in a MyDataset; labels are served as float tensors.
train_dataset = MyDataset(
    encodings=train_input_ids,
    attention_mask=train_attention_mask,
    labels=train_labels,
    dtype=torch.float,
)
eval_dataset = MyDataset(
    encodings=val_input_ids,
    attention_mask=val_attention_mask,
    labels=val_labels,
    dtype=torch.float,
)

# Build the Trainer from the model, the training arguments and the train split.
# NOTE(review): this call is cut off in the visible source — the closing
# parenthesis and the remaining arguments are missing. Presumably
# eval_dataset, compute_metrics and the optimizer/scheduler pair were passed
# here; confirm against the original script.
trainer = Trainer(
    model=model,  # the pre-trained model
    args=training_args,  # training arguments, defined above
    train_dataset=train_dataset,  # training dataset


Here’s the output:

Step Training Loss Validation Loss Accuracy F1
300 No log 0.657723 0.932656 0.000000
600 0.654400 0.657723 0.932656 0.000000
900 0.654400 0.657723 0.932656 0.000000
1200 0.653500 0.657723 0.932656 0.000000
1500 0.654900 0.657723 0.932656 0.000000


I really recommend the post “A Recipe for Training Neural Networks” for debugging your training run. The best tip for me is: take one training example and check whether the model is able to overfit it (i.e. achieve 100% accuracy). If it can’t, there’s a bug in your model.

See also this guide, which we wrote for debugging your training pipeline with the Trainer class: “Debugging the training pipeline” (Hugging Face NLP Course).


Thanks, this actually helped. It turned out I was able to train the model on a tiny example, and then I scaled it up to the whole dataset. I’m still not sure what exactly happened; I didn’t change much code, just copy-pasted things around. Maybe it had something to do with variables in my interpreter, but everything started to work in the end.

