Trainer.evaluate() with multiple GPUs results in: RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cuda:3 and cuda:0!

I’m trying to evaluate the model on 4 GPUs using trainer.evaluate().
But when I run the script, it gives me this error:
RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cuda:3 and cuda:0!
I thought it might be an issue with the data loader, so I tried dataset.with_format("torch"), but it didn’t work.
I also found some posts online suggesting model.to("cuda"), but that didn’t work either.

# Load the QA model (sharded across all visible GPUs) and its tokenizer.
model = AutoModelForQuestionAnswering.from_pretrained(model_path, device_map="auto")
tokenizer = AutoTokenizer.from_pretrained(model_path)

# Some checkpoints ship without a pad token; fall back to EOS, then UNK.
tokenizer.pad_token = tokenizer.eos_token or tokenizer.unk_token

def preprocess(examples):
    """Tokenize question/context pairs and attach start/end token labels.

    Runs under ``datasets.Dataset.map(batched=True)``, so it receives and must
    return host-side Python lists.  Do NOT move the encoding to a GPU here:
    with ``device_map="auto"`` the model is sharded across devices, and the
    Trainer/accelerate moves each batch to the correct device by itself.  An
    eager ``.to("cuda")`` pins everything to cuda:0 and triggers
    "Expected all tensors to be on the same device ... cuda:3 and cuda:0!"
    during evaluation — this was the bug in the original version.

    Args:
        examples: batch dict with "question", "context", and SQuAD-style
            "answers" ({"answer_start": [...], "text": [...]}) columns.

    Returns:
        The tokenized batch with "start_positions"/"end_positions" added and
        "offset_mapping" removed.
    """
    questions = [q.strip() for q in examples["question"]]
    tokenized_examples = tokenizer(
        questions,
        examples["context"],
        truncation="only_second",  # truncate the context, never the question
        padding="max_length",
        return_offsets_mapping=True,
        max_length=2048,  # do not change!
    )
    # NOTE: the original `tokenized_examples = tokenized_examples.to("cuda")`
    # was removed here — it caused the multi-GPU device-mismatch error.

    # Offsets are only needed to derive the labels; drop them from the output.
    offset_mapping = tokenized_examples.pop("offset_mapping")
    answers = examples["answers"]
    start_positions = []
    end_positions = []

    for i, offset in enumerate(offset_mapping):
        answer = answers[i]
        start_char = answer["answer_start"][0]
        end_char = answer["answer_start"][0] + len(answer["text"][0])
        sequence_ids = tokenized_examples.sequence_ids(i)

        # Find the token span of the context (sequence id == 1).
        idx = 0
        while sequence_ids[idx] != 1:
            idx += 1
        context_start = idx
        # Guard against running off the end when the context fills the
        # sequence exactly (no trailing padding/special tokens).
        while idx < len(sequence_ids) and sequence_ids[idx] == 1:
            idx += 1
        context_end = idx - 1

        # If the answer is not fully inside the context, label it (0, 0).
        if offset[context_start][0] > end_char or offset[context_end][1] < start_char:
            start_positions.append(0)
            end_positions.append(0)
        else:
            # Otherwise record the start and end token positions.
            idx = context_start
            while idx <= context_end and offset[idx][0] <= start_char:
                idx += 1
            start_positions.append(idx - 1)

            idx = context_end
            while idx >= context_start and offset[idx][1] >= end_char:
                idx -= 1
            end_positions.append(idx + 1)

    tokenized_examples["start_positions"] = start_positions
    tokenized_examples["end_positions"] = end_positions

    return tokenized_examples

# Small validation slice of KLUE-MRC for a quick evaluation run.
dataset = load_dataset("klue", "mrc", split="validation[:10]")
eval_dataset = dataset.map(
    preprocess,
    batched=True,
    remove_columns=dataset.column_names,
)

# Evaluation metrics: Exact Match and F1 score.
em_metric = evaluate.load("exact_match")
f1_metric = evaluate.load("f1")


# Custom evaluation function
def compute_metrics(eval_pred):
    """Placeholder metrics hook: echo the raw EvalPrediction for inspection.

    Reports no metrics yet — the loaded EM/F1 metrics are not wired in here.
    """
    print(eval_pred)
    return {}

# Evaluation-only configuration: no checkpoints, no logging backends.
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    per_device_eval_batch_size=1,
    save_strategy="no",
    report_to="none",
    # NOTE(review): presumably disabled to avoid pinned-memory issues with
    # the GPU-sharded model — confirm this is still needed after the fix.
    dataloader_pin_memory=False,
)

# Trainer setup for evaluation only (no train_dataset supplied).
trainer = Trainer(
    model=model,
    args=training_args,
    eval_dataset=eval_dataset,
    compute_metrics=compute_metrics,
    processing_class=tokenizer,
)

# Run evaluation over eval_dataset and print the aggregated results dict.
eval_results = trainer.evaluate()
print(eval_results)
1 Like

Since I can’t tell from the traceback exactly which line is causing the error, the snippet below shows how to fix the line that is actually responsible. This is just an example.

If you use .to("cuda"), you can’t control which GPU the tensor is sent to. Instead, move it explicitly to the same device as the variable it will be computed with — usually the model’s device. If the other operand in the computation is not the model, use that tensor’s device instead.

# Move the batch to the device the model (or its first shard) lives on,
# rather than the ambiguous default GPU selected by a bare "cuda":
#tokenized_examples = tokenized_examples.to("cuda")
tokenized_examples = tokenized_examples.to(model.device)

Actually, I found the solution. This code solved the issue.

from accelerate import PartialState

# Give each process its own full model replica on its own GPU (data-parallel
# style), instead of sharding one model across all GPUs with device_map="auto".
# PartialState().process_index is this process's rank, used as the device id.
device_string = PartialState().process_index
model = AutoModelForQuestionAnswering.from_pretrained(model_path, device_map={'': device_string})
1 Like