I get a “RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cuda:3 and cuda:0! (when checking argument for argument target in method wrapper_CUDA_nll_loss_forward)” when running the following code.
import datasets
from transformers import AutoTokenizer
import torch
import transformers
from transformers import AutoModelForQuestionAnswering
MAX_LENGTH=200
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"
# from: https://huggingface.co/docs/transformers/tasks/question_answering
def preprocess_function(examples):
    questions = [q.strip() for q in examples["question"]]
    inputs = tokenizer(
        questions,
        examples["context"],
        max_length=MAX_LENGTH,
        truncation="only_second",  # only truncate the context
        return_offsets_mapping=True,
        padding="max_length",
    )

    offset_mapping = inputs.pop("offset_mapping")
    answers = examples["answers"]
    start_positions = []
    end_positions = []

    for i, offset in enumerate(offset_mapping):
        answer = answers[i]
        start_char = answer["answer_start"][0]
        end_char = answer["answer_start"][0] + len(answer["text"][0])
        sequence_ids = inputs.sequence_ids(i)

        # Find the start and end of the context
        idx = 0
        while sequence_ids[idx] != 1:
            idx += 1
        context_start = idx
        # TODO - this loop can throw an IndexError if max_length isn't long
        # enough; I think I can add a check to just stop once the end of the
        # sequence is reached
        while sequence_ids[idx] == 1:
            idx += 1
            if idx >= len(sequence_ids):  # TODO - I added this, but does it make the next if statement unnecessary?
                break
        context_end = idx - 1

        # If the answer is not fully inside the context, label it (0, 0)
        if offset[context_start][0] > end_char or offset[context_end][1] < start_char:
            start_positions.append(0)
            end_positions.append(0)
        else:
            # Otherwise it's the start and end token positions
            idx = context_start
            while idx <= context_end and offset[idx][0] <= start_char:
                idx += 1
            start_positions.append(idx - 1)

            idx = context_end
            while idx >= context_start and offset[idx][1] >= end_char:
                idx -= 1
            end_positions.append(idx + 1)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    return inputs
def load_squad_data():
    from datasets import load_dataset

    squad = load_dataset("squad", split="train[:5000]")
    squad = squad.train_test_split(test_size=0.2)
    tokenized_squad = squad.map(preprocess_function, batched=True, remove_columns=squad["train"].column_names)
    return tokenized_squad
##############################################################
# create the model and tokenizer
model_name = "facebook/opt-6.7b"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForQuestionAnswering.from_pretrained(model_name, device_map='auto')
# load and process the data
data = load_squad_data()
# set up the trainer and train
training_args = transformers.TrainingArguments(
    per_device_train_batch_size=4,
    num_train_epochs=25,
    learning_rate=2e-4,
    output_dir='training_output',
)
data_collator = transformers.DefaultDataCollator()
trainer = transformers.Trainer(
    model=model,
    train_dataset=data['train'],
    args=training_args,
    data_collator=data_collator,
)
trainer.train()
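Since the model is loaded with device_map='auto', I believe accelerate shards the layers across my GPUs, so the qa_outputs head (and thus start_logits) may sit on a different device than the labels. Here is a minimal sketch I used to inspect the placement, assuming hf_device_map is populated when from_pretrained is called with a device_map (which I believe it is):

# Diagnostic sketch: show which device accelerate assigned to each module.
# hf_device_map should be set on the model when it is loaded with
# device_map='auto'.
print(model.hf_device_map)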
The full error is below:
python3 code/min_reproducable.py
Loading checkpoint shards: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:04<00:00, 2.12s/it]
Some weights of OPTForQuestionAnswering were not initialized from the model checkpoint at facebook/opt-6.7b and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Map: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4000/4000 [00:01<00:00, 2348.15 examples/s]
Map: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1000/1000 [00:00<00:00, 2429.45 examples/s]
2024-05-02 16:49:00.310155: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-05-02 16:49:01.124980: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT
  0%| | 0/25000 [00:00<?, ?it/s]
Traceback (most recent call last):
  File "/home/shenry/llm_for_bioasq/code/min_reproducable.py", line 110, in <module>
    trainer.train()
  File "/home/shenry/llm_for_bioasq/virtual_env_ne/lib64/python3.9/site-packages/transformers/trainer.py", line 1859, in train
    return inner_training_loop(
  File "/home/shenry/llm_for_bioasq/virtual_env_ne/lib64/python3.9/site-packages/transformers/trainer.py", line 2203, in _inner_training_loop
    tr_loss_step = self.training_step(model, inputs)
  File "/home/shenry/llm_for_bioasq/virtual_env_ne/lib64/python3.9/site-packages/transformers/trainer.py", line 3138, in training_step
    loss = self.compute_loss(model, inputs)
  File "/home/shenry/llm_for_bioasq/virtual_env_ne/lib64/python3.9/site-packages/transformers/trainer.py", line 3161, in compute_loss
    outputs = model(**inputs)
  File "/home/shenry/llm_for_bioasq/virtual_env_ne/lib64/python3.9/site-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/home/shenry/llm_for_bioasq/virtual_env_ne/lib64/python3.9/site-packages/torch/nn/modules/module.py", line 1520, in _call_impl
    return forward_call(*args, **kwargs)
  File "/home/shenry/llm_for_bioasq/virtual_env_ne/lib64/python3.9/site-packages/accelerate/hooks.py", line 166, in new_forward
    output = module._old_forward(*args, **kwargs)
  File "/home/shenry/llm_for_bioasq/virtual_env_ne/lib64/python3.9/site-packages/transformers/models/opt/modeling_opt.py", line 1436, in forward
    start_loss = loss_fct(start_logits, start_positions)
  File "/home/shenry/llm_for_bioasq/virtual_env_ne/lib64/python3.9/site-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/home/shenry/llm_for_bioasq/virtual_env_ne/lib64/python3.9/site-packages/torch/nn/modules/module.py", line 1520, in _call_impl
    return forward_call(*args, **kwargs)
  File "/home/shenry/llm_for_bioasq/virtual_env_ne/lib64/python3.9/site-packages/torch/nn/modules/loss.py", line 1179, in forward
    return F.cross_entropy(input, target, weight=self.weight,
  File "/home/shenry/llm_for_bioasq/virtual_env_ne/lib64/python3.9/site-packages/torch/nn/functional.py", line 3059, in cross_entropy
    return torch._C._nn.cross_entropy_loss(input, target, weight, _Reduction.get_enum(reduction), ignore_index, label_smoothing)
RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cuda:3 and cuda:0! (when checking argument for argument target in method wrapper_CUDA_nll_loss_forward)
0%| | 0/25000 [00:02<?, ?it/s]
I have some guesses as to what is going on. My main one is that device_map='auto' shards the model across several GPUs, so start_logits comes out on one device (cuda:3) while the start_positions labels stay on another (cuda:0), but I am stuck on how to fix it.
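One untested workaround sketch I am considering, assuming the 6.7B model plus optimizer state fits on a single card (which may well not hold): exposing only one GPU before anything initializes CUDA, so device_map='auto' has nothing to shard across.

import os
# Untested sketch: restrict the process to one GPU so accelerate cannot
# shard the model. Must run before torch initializes CUDA; full
# fine-tuning of a 6.7B model may not fit on a single card.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

Any help would be appreciated. Thank you!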