Using multiple GPUs with Trainer through DeepSpeed: parameters found on cpu

I'm trying to train a Longformer as a classifier, and I'm currently using a test dataset to get this working. I'm using dual 3060s, so I need to use DeepSpeed to shard the model. (For what it's worth, I'm running all of this from a Jupyter notebook.) My code is:

from transformers import (
    TrainingArguments,
    Trainer,
    LongformerTokenizerFast,
    LongformerForSequenceClassification,
    LongformerConfig,
    TextClassificationPipeline,
    logging,
)
from transformers.modeling_outputs import SequenceClassifierOutput
import torch
import tensorflow as tf
import datasets
from datasets import load_dataset, load_from_disk, Dataset
import numpy as np
import evaluate
import json
from torch import nn
from torch.nn import CrossEntropyLoss, MSELoss

# DeepSpeed config is loaded from disk and passed to TrainingArguments as a dict
with open("ds_config.json") as dsconf:
    dsconfig = json.load(dsconf)

training_args = TrainingArguments(
    output_dir="test_trainer",
    evaluation_strategy="epoch",
    per_device_eval_batch_size=2,
    per_device_train_batch_size=2,
    deepspeed=dsconfig,
)

models = [
    "lexlms/legal-longformer-large",
    "severinsimmler/xlm-roberta-longformer-base-16384",
    "allenai/longformer-base-4096",
]
chosen_model = models[1]

# build the model from the config only (weights are randomly initialized)
config = LongformerConfig.from_pretrained(chosen_model, device="cuda")
config.num_labels = 1
model = LongformerForSequenceClassification._from_config(config=config)
tokenizer = LongformerTokenizerFast.from_pretrained(chosen_model)

def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True)

ds = load_dataset("yelp_review_full")
tokenized_datasets = ds.map(tokenize_function, batched=True)
small_train_dataset = tokenized_datasets["train"].shuffle(seed=42).select(range(10000))
small_eval_dataset = tokenized_datasets["test"].shuffle(seed=42).select(range(10000))

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=small_train_dataset,
    eval_dataset=small_eval_dataset,
)
trainer.train()
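
(Side note: numpy and evaluate are only imported because I plan to pass a compute_metrics to the Trainer later, roughly along the lines of the sketch below. It isn't wired up yet, and since I set num_labels = 1 the head is treated as a regression head, so accuracy here is just a placeholder.)

metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    # standard accuracy computation; only meaningful if I switch to num_labels > 1
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)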

And my ds_config.json:
{
  "train_batch_size": 2,
  "gradient_accumulation_steps": 1,
  "scheduler": {
    "type": "WarmupLR",
    "params": {
      "warmup_min_lr": 0,
      "warmup_max_lr": "auto",
      "warmup_num_steps": "auto"
    }
  },
  "optimizer": {
    "type": "Adam",
    "params": {
      "lr": "auto",
      "betas": "auto",
      "eps": 1e-8,
      "weight_decay": "auto"
    }
  },
  "zero_optimization": {
    "stage": [3],
    "allgather_partitions": [true],
    "allgather_bucket_size": 5e8,
    "overlap_comm": false,
    "reduce_scatter": [true],
    "reduce_bucket_size": 5e8,
    "contiguous_gradients": [true],
    "stage3_max_live_parameters": 1e9,
    "stage3_max_reuse_distance": 1e9,
    "stage3_prefetch_bucket_size": 5e8,
    "stage3_param_persistence_threshold": 1e6,
    "sub_group_size": 1e12,
    "stage3_gather_16bit_weights_on_model_save": [false]
  }
}
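
One thing I wasn't sure about is whether ZeRO-3 is actually considered enabled at the point where I build the model with _from_config. I believe this can be checked with transformers' DeepSpeed helper (just a sketch, and I may be using the wrong check):

from transformers.deepspeed import is_deepspeed_zero3_enabled

# per the HF DeepSpeed docs, this should only be True if the TrainingArguments
# carrying the DeepSpeed config are created before the model is instantiated
print(is_deepspeed_zero3_enabled())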

This gives me:
/mnt/ml/transformers/transformers-venv/lib/python3.11/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See [link removed because of limit]
  from .autonotebook import tqdm as notebook_tqdm
2023-08-08 20:55:35.768933: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations. To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-08-08 20:55:36.334875: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT
[2023-08-08 20:55:37,246] [INFO] [real_accelerator.py:133:get_accelerator] Setting ds_accelerator to cuda (auto detect)
The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. The tokenizer class you load from this checkpoint is 'XLMRobertaTokenizer'. The class this function is called from is 'LongformerTokenizerFast'.
0%| | 0/7500 [00:00<?, ?it/s]
---------------------------------------------------------------------------
RuntimeError                              Traceback (most recent call last)
Cell In[9], line 1
----> 1 trainer.train()

File /mnt/ml/transformers/transformers-venv/lib/python3.11/site-packages/transformers/trainer.py:1539, in Trainer.train(self, resume_from_checkpoint, trial, ignore_keys_for_eval, **kwargs)
   1534     self.model_wrapped = self.model
   1536 inner_training_loop = find_executable_batch_size(
   1537     self._inner_training_loop, self._train_batch_size, args.auto_find_batch_size
   1538 )
-> 1539 return inner_training_loop(
   1540     args=args,
   1541     resume_from_checkpoint=resume_from_checkpoint,
   1542     trial=trial,
   1543     ignore_keys_for_eval=ignore_keys_for_eval,
   1544 )

File /mnt/ml/transformers/transformers-venv/lib/python3.11/site-packages/transformers/trainer.py:1809, in Trainer._inner_training_loop(self, batch_size, args, resume_from_checkpoint, trial, ignore_keys_for_eval)
   1806 self.control = self.callback_handler.on_step_begin(args, self.state, self.control)
   1808 with self.accelerator.accumulate(model):
-> 1809     tr_loss_step = self.training_step(model, inputs)
   1811 if (
   1812     args.logging_nan_inf_filter
   1813     and not is_torch_tpu_available()
   1814     and (torch.isnan(tr_loss_step) or torch.isinf(tr_loss_step))
   1815 ):
   1816     # if loss is nan or inf simply add the average of previous logged losses
   1817     tr_loss += tr_loss / (1 + self.state.global_step - self._globalstep_last_logged)

File /mnt/ml/transformers/transformers-venv/lib/python3.11/site-packages/transformers/trainer.py:2654, in Trainer.training_step(self, model, inputs)
   2651     return loss_mb.reduce_mean().detach().to(self.args.device)
   2653 with self.compute_loss_context_manager():
-> 2654     loss = self.compute_loss(model, inputs)
   2656 if self.args.n_gpu > 1:
   2657     loss = loss.mean()  # mean() to average on multi-gpu parallel training

File /mnt/ml/transformers/transformers-venv/lib/python3.11/site-packages/transformers/trainer.py:2679, in Trainer.compute_loss(self, model, inputs, return_outputs)
   2677 else:
   2678     labels = None
-> 2679 outputs = model(**inputs)
   2680 # Save past state if it exists
   2681 # TODO: this needs to be fixed and made cleaner later.
   2682 if self.args.past_index >= 0:

File /mnt/ml/transformers/transformers-venv/lib/python3.11/site-packages/torch/nn/modules/module.py:1501, in Module._call_impl(self, *args, **kwargs)
   1496 # If we don't have any hooks, we want to skip the rest of the logic in
   1497 # this function, and just call forward.
   1498 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
   1499         or _global_backward_pre_hooks or _global_backward_hooks
   1500         or _global_forward_hooks or _global_forward_pre_hooks):
-> 1501     return forward_call(*args, **kwargs)
   1502 # Do not call functions when jit is used
   1503 full_backward_hooks, non_full_backward_hooks = [], []

File /mnt/ml/transformers/transformers-venv/lib/python3.11/site-packages/torch/nn/parallel/data_parallel.py:157, in DataParallel.forward(self, *inputs, **kwargs)
   155 for t in chain(self.module.parameters(), self.module.buffers()):
   156     if t.device != self.src_device_obj:
-> 157         raise RuntimeError("module must have its parameters and buffers "
   158                            "on device {} (device_ids[0]) but found one of "
   159                            "them on device: {}".format(self.src_device_obj, t.device))
   161 inputs, kwargs = self.scatter(inputs, kwargs, self.device_ids)
   162 # for forward function without any inputs, empty list and dict will be created
   163 # so the module can be executed on one device which is the first one in device_ids

RuntimeError: module must have its parameters and buffers on device cuda:0 (device_ids[0]) but found one of them on device: cpu
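
In case it helps narrow down which parameters the error refers to, this is the check I was going to add right after building the model (just a sketch, mirroring the parameters-and-buffers loop that DataParallel runs above):

# list any parameters or buffers that are not on a CUDA device
for name, param in model.named_parameters():
    if param.device.type != "cuda":
        print("parameter on", param.device, "->", name)
for name, buf in model.named_buffers():
    if buf.device.type != "cuda":
        print("buffer on", buf.device, "->", name)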

If it matters, I modified trainer.py to use torch.amp instead of apex.amp, because amp was not part of my apex installation.
