I’m a bit at a loss here. None of my Hugging Face Transformers models seems to learn anything. It’s probably a simple problem, but I’m failing to identify the issue. Here’s my training script:
import argparse
import os
import pandas as pd
import torch
import numpy as np
from transformers import AutoTokenizer, TrainingArguments, EarlyStoppingCallback, AutoModelForSequenceClassification, Trainer, get_linear_schedule_with_warmup
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset
from sklearn.utils.class_weight import compute_class_weight
from torch import nn
import torch.nn.functional as F
import warnings
from sklearn.exceptions import UndefinedMetricWarning
warnings.filterwarnings("ignore", category=UndefinedMetricWarning)
import evaluate
import joblib
from typing import Dict
model_name = 'roberta-base'
class MyDataset(Dataset):
    def __init__(self, encodings, attention_mask, labels, dtype=torch.float):  # dtype changed to float
        self.encodings = encodings
        self.attention_mask = attention_mask
        self.labels = labels
        self.dtype = dtype

    def __getitem__(self, idx):
        item = {}
        item['input_ids'] = self.encodings[idx].clone().detach()
        item['attention_mask'] = self.attention_mask[idx].clone().detach()
        item['labels'] = torch.tensor(self.labels[idx], dtype=self.dtype)
        return item

    def __len__(self):
        return len(self.encodings)
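# Note (added for clarity, not part of the original script): each item from this
# dataset is a dict with 'input_ids', 'attention_mask', and, with the label
# encoding used further below, a 2-element 'labels' tensor cast to torch.float.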
# for trainer
def compute_metrics(eval_preds):
    accuracy = evaluate.load("accuracy")
    f1_metric = evaluate.load("f1", average='macro')
    logits, labels = eval_preds
    # turn 2d into 1d of 0s and 1s
    predictions = np.argmax(logits, axis=-1)  # 0 or 1
    labels_new = np.argmax(labels, -1)
    accuracy = accuracy.compute(predictions=predictions, references=labels_new)["accuracy"]
    f1 = f1_metric.compute(predictions=predictions, references=labels_new)["f1"]
    #print(f"Val_acc={accuracy}")
    #print(f"Val_f1={f1}")
    return {'accuracy': accuracy, 'f1': f1}
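# Side note (illustration only): since the labels built below are one-hot pairs,
# `labels` arrives here with shape (n_eval, 2), just like `logits`, so both
# argmax calls collapse them to 1-d arrays of class ids, e.g.
#   np.argmax([[1, 0], [0, 1]], axis=-1) -> array([0, 1])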
# read the train/val data (already local; SageMaker took care of copying it in)
train_data = phrase_id_train
val_data = phrase_id_val
tokenizer = AutoTokenizer.from_pretrained(model_name)
train_inputs = tokenizer(train_data.X.tolist(), truncation=True, padding=True, max_length=200, return_tensors='pt')
train_input_ids = train_inputs['input_ids']
train_attention_mask = train_inputs['attention_mask']
train_labels = torch.from_numpy(np.array([np.array([1-v, v]) for v in train_data.y.values]))  # note [1-v, v] -> each label is 2d: either [1, 0] or [0, 1]
val_inputs = tokenizer(val_data.X.tolist(), truncation=True, padding=True, max_length=200, return_tensors='pt')
val_input_ids = val_inputs['input_ids']
val_attention_mask = val_inputs['attention_mask']
val_labels = torch.from_numpy(np.array([np.array([1-v, v]) for v in val_data.y.values]))
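# For clarity: with this encoding every label is a one-hot pair, e.g.
#   y = 0 -> [1, 0]
#   y = 1 -> [0, 1]
# so train_labels / val_labels have shape (num_examples, 2) and get cast to
# torch.float in MyDataset.__getitem__.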
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=2,  # 2 labels -> 0,1 or 1,0
    output_attentions=False,
    output_hidden_states=False,
)
model = model.to(device)
param_optimizer = list(model.named_parameters())
no_decay = ['bias', 'gamma', 'beta']
optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
     'weight_decay_rate': 0.01},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
     'weight_decay_rate': 0.0}
]
optimizer = torch.optim.AdamW(optimizer_grouped_parameters,
                              lr=5e-5,
                              eps=1e-6)
n_epochs = 1
batch_size = 16
total_steps = int(len(train_inputs) / batch_size * n_epochs)
scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps=int(total_steps * 0.06),
                                            num_training_steps=total_steps)
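# Intended scale, with hypothetical numbers: 24,000 training examples at
# batch_size=16 and 1 epoch would give 24000 / 16 * 1 = 1500 training steps
# and 0.06 * 1500 = 90 warmup steps.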
# Define the training arguments
training_args = TrainingArguments(
    output_dir='./trained_model',             # output directory
    num_train_epochs=n_epochs,                # total number of training epochs
    per_device_train_batch_size=batch_size,   # batch size per device during training
    per_device_eval_batch_size=64,            # batch size for evaluation
    #warmup_steps=0.0,                        # number of warmup steps for learning rate scheduler
    #weight_decay=0.0,                        # strength of weight decay
    logging_dir='./logs',                     # directory for storing logs
    load_best_model_at_end=True,
    evaluation_strategy='steps',
    save_strategy='steps',
    logging_strategy="steps",
    eval_steps=300,
    save_steps=300,
    save_total_limit=5,
    #metric_for_best_model='f1',
    #learning_rate=2e-5,                      # learning rate
    #lr_scheduler_type='linear',              ####### NEW
)
train_dataset = MyDataset(train_input_ids, train_attention_mask, train_labels, dtype=torch.float)
eval_dataset = MyDataset(val_input_ids, val_attention_mask, val_labels, dtype=torch.float)
trainer = Trainer(
    model=model,                    # the pre-trained model
    args=training_args,             # training arguments, defined above
    train_dataset=train_dataset,    # training dataset
    eval_dataset=eval_dataset,
    #callbacks=[EarlyStoppingCallback(5)],
    compute_metrics=compute_metrics,
    optimizers=(optimizer, scheduler),
)
trainer.train()
Here’s the output:
| Step | Training Loss | Validation Loss | Accuracy | F1       |
|------|---------------|-----------------|----------|----------|
| 300  | No log        | 0.657723        | 0.932656 | 0.000000 |
| 600  | 0.654400      | 0.657723        | 0.932656 | 0.000000 |
| 900  | 0.654400      | 0.657723        | 0.932656 | 0.000000 |
| 1200 | 0.653500      | 0.657723        | 0.932656 | 0.000000 |
| 1500 | 0.654900      | 0.657723        | 0.932656 | 0.000000 |
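In case it's relevant, this is the kind of sanity check I can run on a single example after building the datasets (a minimal sketch, separate from the script above; sample_item and batch are just illustrative names):

sample_item = train_dataset[0]
# add a batch dimension and move everything to the same device as the model
batch = {k: v.unsqueeze(0).to(device) for k, v in sample_item.items()}

model.eval()
with torch.no_grad():
    outputs = model(**batch)

print("labels:", sample_item['labels'])            # e.g. tensor([1., 0.])
print("problem_type:", model.config.problem_type)  # should be filled in once the model has seen labels
print("loss:", outputs.loss.item())
print("logits:", outputs.logits)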