Unable to update the weights / learn anything

lbassmaster · December 21, 2023, 7:10pm

I’m a bit at a loss here. None of my huggingface transformers “learn” anything. It’s probably a simple problem, but I’m failing to identify the issue here.

import argparse
import os
import pandas as pd
import torch
import numpy as np
from transformers import AutoTokenizer, TrainingArguments, EarlyStoppingCallback, AutoModelForSequenceClassification, Trainer, get_linear_schedule_with_warmup
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset
from sklearn.utils.class_weight import compute_class_weight

    
from torch import nn
import torch.nn.functional as F
import warnings
from sklearn.exceptions import UndefinedMetricWarning
warnings.filterwarnings("ignore", category=UndefinedMetricWarning)

import evaluate
import joblib

from typing import Dict

model_name = 'roberta-base'

class MyDataset(Dataset):
    def __init__(self, encodings, attention_mask, labels, dtype=torch.float):  # dtype changed to float
        self.encodings = encodings
        self.attention_mask = attention_mask
        self.labels = labels
        self.dtype = dtype
    def __getitem__(self, idx):
        item = {}
        item['input_ids'] = self.encodings[idx].clone().detach()
        item['attention_mask'] = self.attention_mask[idx].clone().detach()
        item['labels'] = torch.tensor(self.labels[idx], dtype=self.dtype)
        return item
    def __len__(self):
        return len(self.encodings)

# for trainer
def compute_metrics(eval_preds):
    accuracy = evaluate.load("accuracy")
    f1_metric = evaluate.load("f1", average='macro')
    logits, labels = eval_preds
    # turn 2d into 1d of 0s and 1s
    predictions = np.argmax(logits, axis=-1) # 0 or 1
    labels_new = np.argmax(labels, -1)
    
    accuracy = accuracy.compute(predictions=predictions, references=labels_new)["accuracy"]
    f1 = f1_metric.compute(predictions=predictions, references=labels_new)["f1"]

    #print(f"Val_acc={accuracy}") 
    #print(f"Val_f1={f1}")

    return {'accuracy' : accuracy, 'f1' : f1}

# read train data
# it's now local because Sagemaker took care of that
train_data = phrase_id_train
val_data = phrase_id_val

tokenizer = AutoTokenizer.from_pretrained(model_name)

train_inputs = tokenizer(train_data.X.tolist(), truncation=True, padding=True, max_length=200, return_tensors='pt')
train_input_ids = train_inputs['input_ids']
train_attention_mask = train_inputs['attention_mask']
train_labels = torch.from_numpy(np.array([np.array([1-v, v]) for v in train_data.y.values])) # note [v, 1-v] -> each label is 2d: either 0,1 or 1,0

val_inputs = tokenizer(val_data.X.tolist(), truncation=True, padding=True, max_length=200, return_tensors='pt')
val_input_ids = val_inputs['input_ids']
val_attention_mask = val_inputs['attention_mask']
val_labels = torch.from_numpy(np.array([np.array([1-v, v]) for v in val_data.y.values]))

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model = AutoModelForSequenceClassification.from_pretrained(model_name, 
    num_labels = 2, # 2 labels -> 0,1 or 1,0
    output_attentions = False,
    output_hidden_states = False,
)

model = model.to(device)

param_optimizer = list(model.named_parameters())
no_decay = ['bias', 'gamma', 'beta']
optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
    'weight_decay_rate': 0.01},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
    'weight_decay_rate': 0.0}
]

optimizer = torch.optim.AdamW(optimizer_grouped_parameters, 
                            lr=5e-5,
                            eps=1e-6)


n_epochs = 1
batch_size = 16
total_steps = int(len(train_inputs) / batch_size * n_epochs)

scheduler = get_linear_schedule_with_warmup(optimizer, 
                                            num_warmup_steps = int(total_steps*0.06),
                                            num_training_steps = total_steps)


# Define the training arguments
training_args = TrainingArguments(
    output_dir='./trained_model',  # output directory
    num_train_epochs=n_epochs,  # total number of training epochs
    per_device_train_batch_size=batch_size,  # batch size per device during training
    per_device_eval_batch_size=64,  # batch size for evaluation
    #warmup_steps=0.0,  # number of warmup steps for learning rate scheduler
    #weight_decay=0.0,  # strength of weight decay
    logging_dir='./logs',  # directory for storing logs
    load_best_model_at_end = True,
    evaluation_strategy='steps',
    save_strategy='steps',
    logging_strategy="steps",
    eval_steps=300,
    save_steps=300,
    save_total_limit=5,
    #metric_for_best_model='f1',
    #learning_rate=2e-5,  # learning rate
    #lr_scheduler_type='linear', ####### NEW
)

train_dataset = MyDataset(train_input_ids, train_attention_mask, train_labels, dtype=torch.float)
eval_dataset = MyDataset(val_input_ids, val_attention_mask, val_labels, dtype=torch.float)

trainer = Trainer(
    model=model,  # the pre-trained model
    args=training_args,  # training arguments, defined above
    train_dataset=train_dataset,  # training dataset
    eval_dataset=eval_dataset,
    #callbacks=[EarlyStoppingCallback(5)],
    compute_metrics=compute_metrics,
    optimizers=(optimizer,scheduler)
)

trainer.train()

Here’s the output:

Step	Training Loss	Validation Loss	Accuracy
300	No log	0.657723	0.932656
600	0.654400	0.657723	0.932656
900	0.654400	0.657723	0.932656
1200	0.653500	0.657723	0.932656
1500	0.654900	0.657723	0.932656

nielsr · December 22, 2023, 9:20am

Hi,

I really recommend this post: A Recipe for Training Neural Networks to debug your training run. The best tip for me is: take 1 training example, and see whether the model is able to overfit it (i.e. achieve 100% accuracy). If not, then there’s a bug in your model.

See also this guide which we wrote for debugging your training pipeline with the Trainer class: Debugging the training pipeline - Hugging Face NLP Course.

lbassmaster · December 22, 2023, 6:21pm

Thanks, this actually helped. Turned out I was able to train the model on a tiny example, and then scaled it up to the whole dataset. I’m still not sure what happened exactly, I didn’t change a lot of code, just copy-pasted things around… may be it was something to do with variables in my interpreter, but everything started to work at the end.

Topic		Replies	Views
Why my model doesn't learn anything? 🤗Transformers	0	838	July 29, 2021
Fine-Tuning results suggest some underlying implementation error? 🤗Transformers	1	690	October 5, 2021
Error when finetuning pretrained huggingface conv-ai chatbot model 🤗Transformers	2	818	April 19, 2021
Unable to apply transfer learning to certain models Intermediate	0	368	March 23, 2021
Can not make transfer learning Beginners	0	533	January 19, 2021

Unable to update the weights / learn anything

Related topics