Problem with multiple GPUs

Please consider the following code, which I will briefly describe below:

from datasets import load_dataset_builder, load_dataset
import os
import torch
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, DataCollatorForSeq2Seq, Seq2SeqTrainer, Seq2SeqTrainingArguments


os.environ["CUDA_VISIBLE_DEVICES"] = "0,1"
# print name of available devices
for i in range(torch.cuda.device_count()):
    print(f"Device {i}: {torch.cuda.get_device_name(i)}")

class QGenMetrics:
    def __init__(self, tokenizer, ignore_index=-100):
        self.tokenizer = tokenizer
        self.ignore_index = ignore_index
        
    def clean_labels(self, labels):
        labels[labels == self.ignore_index] = self.tokenizer.pad_token_id
        return labels

    def compute_metrics_validation(self, eval_preds):
        predictions, labels = eval_preds
        
        try:
            labels = self.clean_labels(labels)
            predictions = self.tokenizer.batch_decode(predictions, skip_special_tokens=True)
            labels = self.tokenizer.batch_decode(labels, skip_special_tokens=True)
        except Exception as e:
            print(e)
            print("PREDS")
            print(predictions)
            print("LABELS")
            print(labels)
            raise  # re-raise after printing the tensors for inspection

        # dummy metric: enough to exercise the evaluation path
        res = {"metric": 1.0}
        return res

    def compute_metrics_test(self, test_preds):
        res = {"metric":1.0}
        return res        
#<

def actual_encoding(examples, tokenizer, max_source_len=None, max_target_len=None, ignore_label=-100):
    # no padding and no truncation: a collator will do the job
    # prompts_enc = tokenizer(examples["prompt"], padding="max_length", truncation=True, max_length=200)
    # targets_enc = tokenizer(text_target=examples["question"], padding="max_length", truncation=True, max_length=200)
    if max_source_len is None:
        prompts_enc = tokenizer(examples["question"], padding=False, truncation=False)
    else:
        # tokenize up to max_source_len
        prompts_enc = tokenizer(examples["question"], padding=False, truncation=True, max_length=max_source_len)

    if max_target_len is None:
        targets_enc = tokenizer(examples["text"], padding=False, truncation=False)
    else:
        targets_enc = tokenizer(examples["text"], padding=False, truncation=True, max_length=max_target_len)
    
    examples["input_ids"] = prompts_enc["input_ids"]
    examples["attention_mask"] = prompts_enc["attention_mask"]

    # unused: manual replacement of pad ids in the labels with ignore_label;
    # the collator takes care of this via label_pad_token_id
    # labels = []
    # for ex_labels in targets_enc["input_ids"]:
    #     proc_labels = [label if label != 0 else ignore_label for label in ex_labels]
    #     labels.append(proc_labels)
    examples["labels"] = targets_enc["input_ids"]
    return examples        
#< actual_encoding

# download Bilkies/QuestionGeneration from HF hub
# https://huggingface.co/datasets/Bilkies/QuestionGeneration

ds_name = 'Bilkies/QuestionGeneration'
ds_builder = load_dataset_builder(ds_name)
print(ds_builder.info)
dataset = load_dataset(ds_name)

train_ds = dataset['train']
print(len(train_ds))
# subsample train_ds
train_ds = train_ds.select(range(1000))

print(len(train_ds))
test_ds = dataset['validation'].select(range(500))
# split train_ds 80/20 into training and validation
train_ds = train_ds.train_test_split(test_size=0.2)
valid_ds = train_ds['test']
train_ds = train_ds['train']

tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-small")

train_ds = train_ds.map(actual_encoding, fn_kwargs={"tokenizer": tokenizer, "max_source_len":10, "max_target_len":5}, batched=True, num_proc=2)
train_df = train_ds.to_pandas()
max_source_len = train_df["input_ids"].apply(len).max()
max_target_len = train_df["labels"].apply(len).max()
print(f"max_source_len: {max_source_len}, max_target_len: {max_target_len}")

valid_ds = valid_ds.map(actual_encoding, fn_kwargs={"tokenizer": tokenizer, "max_source_len": max_source_len, "max_target_len": max_target_len}, batched=True, num_proc=2)
test_ds = test_ds.map(actual_encoding, fn_kwargs={"tokenizer": tokenizer, "max_source_len": max_source_len, "max_target_len": max_target_len}, batched=True, num_proc=2)

model = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-small")
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model, padding=True, label_pad_token_id=-100)
evaluator = QGenMetrics(tokenizer)
trainer = Seq2SeqTrainer(
    model=model,
    train_dataset=train_ds,
    eval_dataset=test_ds,
    data_collator=data_collator,
    compute_metrics=evaluator.compute_metrics_validation,
    args=Seq2SeqTrainingArguments(
        output_dir="./_remove",
        gradient_accumulation_steps=1,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        num_train_epochs=1,
        seed=1,
        data_seed=2,
        predict_with_generate=True,
        eval_strategy="epoch",
        report_to="none",
    ) #< training args
) #< trainer

trainer.train()

The first part of the script loads a publicly available dataset of questions and related answers. It then uses the function actual_encoding to encode the datasets. The encoding works as follows:

  • tokenize the training set with truncation lengths of 10 and 5 tokens for the source texts and the labels, respectively;
  • compute the maximum source and target lengths (max_source_len, max_target_len) over the encoded training set;
  • tokenize the validation and test sets, truncating to those maximum lengths.

This simulates my use case, where I encode the training data and fix the maximum lengths of the input and target texts, so that the validation and test data are encoded with the same limits as the training set.
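
A quick way to verify this invariant on the encoded datasets from the script above (a sketch):

# sketch: confirm that no evaluation sequence exceeds the training maxima
assert max(len(ids) for ids in valid_ds["input_ids"]) <= max_source_len
assert max(len(ids) for ids in valid_ds["labels"]) <= max_target_len
assert max(len(ids) for ids in test_ds["input_ids"]) <= max_source_len
assert max(len(ids) for ids in test_ds["labels"]) <= max_target_len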

The batches are prepared by a DataCollatorForSeq2Seq:

data_collator = DataCollatorForSeq2Seq(tokenizer, model=model, padding=True, label_pad_token_id=-100)

which, from my understanding, pads every sequence in a batch to the length of the longest sequence in that batch, using the pad token for the inputs and -100 for the labels.
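
To double-check that understanding, here is a minimal, self-contained sketch of the collator's behavior (the two toy feature dicts and their token ids are made up for illustration):

from transformers import AutoTokenizer, DataCollatorForSeq2Seq

tok = AutoTokenizer.from_pretrained("google/flan-t5-small")
collator = DataCollatorForSeq2Seq(tok, padding=True, label_pad_token_id=-100)
# two toy examples of different lengths
features = [
    {"input_ids": [100, 200, 300, 1], "attention_mask": [1, 1, 1, 1], "labels": [7, 8, 1]},
    {"input_ids": [100, 1], "attention_mask": [1, 1], "labels": [7, 1]},
]
batch = collator(features)
print(batch["input_ids"])  # second row right-padded with tok.pad_token_id (0 for T5)
print(batch["labels"])     # second row right-padded with -100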
The script then trains the model “google/flan-t5-small” for 1 epoch and performs a validation pass at the end of the epoch.

If I use a single GPU, the training succeeds; the only oddity is a strange warning message:

Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.

which I cannot understand, since everything is pre-tokenized and the trainer should never need to invoke a tokenizer.
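
For completeness: if the tokenizer is ever passed to the trainer explicitly, recent transformers versions expect it under processing_class instead of the deprecated tokenizer argument. A sketch (assuming a version that accepts processing_class, and reusing the objects from the script above; I don't know whether this silences the internal warning):

trainer = Seq2SeqTrainer(
    model=model,
    processing_class=tokenizer,  # replacement for the deprecated tokenizer= argument
    train_dataset=train_ds,
    eval_dataset=test_ds,
    data_collator=data_collator,
    compute_metrics=evaluator.compute_metrics_validation,
    args=training_args,  # the same Seq2SeqTrainingArguments as above (hypothetical name)
)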

However, the main problem appears when I use multiple GPUs. As soon as I use more than one GPU, the training fails during the evaluation step with a decoding problem. The exception is raised inside compute_metrics_validation, in the call to tokenizer.batch_decode:

out of range integral type conversion attempted
PREDS
[[   0    3    2 ...    0    0 -100]
 [   0    3    2 ...    0    0 -100]
 [   0    3    2 ...    0    0 -100]
 ...
 [   0    3    2 ...    0    0 -100]
 [   0    3    2 ...    0    0 -100]
 [   0    3    2 ...    0    0 -100]]
LABELS
[[   3    2 3247 ...    0    0    0]
 [   3    2 3247 ...    0    0    0]
 [   3    2 3247 ...    0    0    0]
 ...
 [   3    2 3247 ...    0    0    0]
 [   3    2 3247 ...    0    0    0]
 [   3    2 3247 ...    0    0    0]]

As you can see, tokenizer.batch_decode fails because it finds the value -100 in the predictions. -100 is the value used to pad the labels, and it is a token id that no model could ever generate.
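
A defensive workaround, borrowed from the pattern used in several Hugging Face example scripts, is to clean the predictions the same way as the labels before decoding. A sketch (sanitize is a helper name I made up; this hides the symptom rather than explaining where the -100 comes from):

import numpy as np

def sanitize(token_ids, pad_token_id, ignore_index=-100):
    # replace every ignore_index cell with the pad token id;
    # batch_decode cannot handle negative token ids
    return np.where(token_ids == ignore_index, pad_token_id, token_ids)

# inside compute_metrics_validation, before batch_decode:
# predictions = sanitize(predictions, self.tokenizer.pad_token_id)
# labels = sanitize(labels, self.tokenizer.pad_token_id)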

What am I doing wrong? I also get another warning when I use multiple GPUs:

/home/user/workspace/project/.venv/lib/python3.10/site-packages/torch/nn/parallel/_functions.py:71: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.
  warnings.warn(

You can find a Google Colab with the code above at:

https://colab.research.google.com/drive/1ps418n97fPOMbOemWe3p-ewRZ7RGZq3l?usp=sharing

but, of course, there it runs on a single GPU only, and it works.

The problem may lie in the batches sent to different GPUs having different column dimensions, but then it should show up for the training data rather than the validation data, since the latter has been truncated to fixed maximum lengths by the tokenizer and is padded per batch by the collator.
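
To test this hypothesis, a small diagnostic could be dropped into compute_metrics_validation before decoding (a sketch; inspect_preds is a hypothetical helper):

import numpy as np

def inspect_preds(predictions, ignore_index=-100):
    # report the shape of the gathered prediction matrix and the columns where
    # ignore_index appears; if the offending cells cluster in the right-most
    # columns, the cross-GPU gather is padding mismatched sequence widths
    rows, cols = np.nonzero(predictions == ignore_index)
    print(f"preds shape: {predictions.shape}; {rows.size} cells == {ignore_index}; "
          f"columns: {sorted(set(cols.tolist()))[:10]}")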

Is the problem related to the incomprehensible warning about Trainer.tokenizer?
Is the low-level code reusing memory buffers without zeroing them first?
