Slow GPU training with MPS on an Intel Mac

Hi All:
I am fine-tuning a BERT model with the HuggingFace Trainer API on macOS Ventura (Intel), with Python 3.10 and Torch 2.0.0.
In a simple scenario it takes 14 min on the CPU, with no problems.
I then switched to the GPU with MPS. Initially the GPU was not used, but after redefining TrainingArguments in the following way, it worked:

```python
class TrainingArgumentsWithMPSSupport(TrainingArguments):
    @property
    def device(self) -> torch.device:
        return torch.device(device)

training_args = TrainingArgumentsWithMPSSupport(...)
```

But the problem is that the improvement over the CPU is small (barely from 14 min to 10 min). The activity monitor shows that GPU usage peaks at only 15%.

Any idea why the improvement is so small?

Thanks for any help
Alberto

This is the full code:

from transformers import BertForSequenceClassification, BertTokenizerFast, Trainer, TrainingArguments
import nlp
import torch
from torch.utils.data import Dataset, DataLoader

# Target accelerator: first Metal Performance Shaders (MPS) device.
device = torch.device("mps:0")

# CSV file the examples are read from.
_DATASET = '../IMDB.csv'

# NOTE(review): `nlp` is the former name of the Hugging Face `datasets`
# package -- confirm which one is installed.
# Load only the first 1% of the 'train' split, then hold out 30% of those
# rows as the test partition.
dataset = nlp.load_dataset(
    'csv',
    data_files=[_DATASET],
    split='train[:1%]',
).train_test_split(test_size=0.3)

train_set, test_set = dataset['train'], dataset['test']

class CustomDataset(Dataset):
    """Map-style dataset that tokenizes one text/label row per lookup.

    Every item is a dict holding ``input_ids``, ``attention_mask`` and
    ``label`` as ``torch.long`` tensors.  The tensors are created on the CPU:
    copying each individual sample to an accelerator inside ``__getitem__``
    costs one small host-to-device transfer per sample, so device placement
    is left to the training loop, which can move a whole batch at once
    (the Hugging Face ``Trainer`` moves its inputs to ``args.device``).
    """

    def __init__(self, dataset, mytokenizer):
        """Store the row source and the tokenizer.

        Args:
            dataset: Indexable collection.  ``dataset[i]`` must yield a
                mapping with ``'text'`` and ``'label'`` entries, and
                ``dataset["text"]`` must return the text column.
            mytokenizer: Callable invoked as
                ``mytokenizer(text, max_length=512, padding='max_length',
                truncation=True)`` that returns a mapping containing
                ``'input_ids'`` and ``'attention_mask'``.
        """
        self.tokenizer = mytokenizer
        self.dataset = dataset
        # Not read anywhere in this class; kept so that existing code that
        # accesses ``.texts`` keeps working.
        self.texts = dataset["text"]

    def __len__(self):
        """Return the number of rows in the wrapped dataset."""
        return len(self.dataset)

    def __getitem__(self, index):
        """Tokenize row ``index`` and return it as a dict of CPU tensors.

        Args:
            index: Position of the row in the wrapped dataset.

        Returns:
            dict with keys ``'input_ids'``, ``'attention_mask'`` and
            ``'label'``, each a ``torch.long`` tensor.
        """
        # Fetch the row once instead of once per column.
        row = self.dataset[index]

        # Every sample is padded/truncated to exactly 512 tokens.
        encoded = self.tokenizer(
            row['text'],
            max_length=512,
            padding='max_length',
            truncation=True,
        )

        return {
            'input_ids': torch.tensor(encoded['input_ids'], dtype=torch.long),
            'attention_mask': torch.tensor(encoded['attention_mask'], dtype=torch.long),
            'label': torch.tensor(row['label'], dtype=torch.long),
        }


# Pretrained 'bert-base-uncased' weights with a sequence-classification head,
# plus the matching fast tokenizer.
model = BertForSequenceClassification.from_pretrained('bert-base-uncased')
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

# Wrap both partitions so that rows are tokenized on access.
training_set, testing_set = (
    CustomDataset(split, tokenizer) for split in (train_set, test_set)
)

# Hyperparameters for the training run.
# NOTE(review): with a small training subset the run may contain fewer than
# 500 optimizer steps, in which case warmup would never finish -- confirm.
batch_size, epochs = 8, 2
warmup_steps, weight_decay = 500, 0.01

class TrainingArgumentsWithMPSSupport(TrainingArguments):
    """``TrainingArguments`` whose ``device`` is forced to the module-level ``device``.

    NOTE(review): newer transformers releases can select the MPS backend on
    their own -- confirm whether this override is still needed for the
    installed version.
    """

    @property
    def device(self) -> torch.device:
        """Return the module-level ``device`` wrapped as a ``torch.device``."""
        return torch.device(device)

# Training configuration; `device` resolves through the MPS-aware subclass.
training_args = TrainingArgumentsWithMPSSupport(
    # Where checkpoints and logs are written.
    output_dir='./results',
    logging_dir='./logs',
    # Schedule and batch sizes.
    num_train_epochs=epochs,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    # Optimizer settings.
    warmup_steps=warmup_steps,
    weight_decay=weight_decay,
    # Evaluate on a step schedule during training.
    evaluation_strategy='steps',
)

# Build the trainer with the model placed on the selected device.
trainer = Trainer(
    model=model.to(device),
    args=training_args,
    train_dataset=training_set,
    eval_dataset=testing_set,
)

# Fine-tune all weights, then score the held-out partition.
trainer.train()
trainer.evaluate()