Wish I could provide a solution, but just dropping in to say I’m having a nearly identical issue. I’m working on a system with 16x V100s. Loading the model works fine, but I hit the “Expected all tensors to be on the same device” error as soon as I try to run inference. Code below, for reference. I’ve hit the same error with other models as well, so I’m not sure whether the problem is in accelerate or in bitsandbytes.
from transformers import T5ForConditionalGeneration, AutoTokenizer
import torch

tokenizer = AutoTokenizer.from_pretrained('google/flan-ul2',
                                          cache_dir = './models')
model = T5ForConditionalGeneration.from_pretrained('google/flan-ul2',
                                                   cache_dir = './models',
                                                   device_map = 'auto',
                                                   load_in_8bit = True)

input_string = 'Answer the following question by reasoning step by step. I start with 10 bananas. A monkey eats three of them, and then gives me an avocado. How many bananas do I have left?'
inputs = tokenizer(input_string, return_tensors = 'pt').to('cuda:0')
outputs = model.generate(inputs['input_ids'], max_length = 200)
print(tokenizer.decode(outputs[0]))
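
For what it’s worth, this is how I’ve been checking where accelerate placed each top-level module (assuming hf_device_map is populated when device_map = 'auto' is used). Judging from the traceback below, I’d guess the lm_head ends up on a different GPU (cuda:3) than the inputs (cuda:0):

# Inspect the device placement that accelerate computed.
# hf_device_map should be set when the model is loaded with device_map = 'auto'.
for name, device in model.hf_device_map.items():
    print(name, '->', device)
print('lm_head weight on:', model.lm_head.weight.device)
print('inputs on:', inputs['input_ids'].device)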
Error traceback:
---------------------------------------------------------------------------
RuntimeError Traceback (most recent call last)
Cell In[4], line 3
1 input_string = 'Answer the following question by reasoning step by step. I start with 10 bananas. A monkey eats three of them, and then gives me an avocado. How many bananas do I have left?'
2 inputs = tokenizer(input_string, return_tensors = 'pt').to('cuda:0')
----> 3 outputs = model.generate(inputs['input_ids'], max_length = 200)
4 print(tokenizer.decode(outputs[0]))
File ~/.conda/envs/llm_lab/lib/python3.10/site-packages/torch/autograd/grad_mode.py:27, in _DecoratorContextManager.__call__.<locals>.decorate_context(*args, **kwargs)
24 @functools.wraps(func)
25 def decorate_context(*args, **kwargs):
26 with self.clone():
---> 27 return func(*args, **kwargs)
File ~/.conda/envs/llm_lab/lib/python3.10/site-packages/transformers/generation/utils.py:1391, in GenerationMixin.generate(self, inputs, generation_config, logits_processor, stopping_criteria, prefix_allowed_tokens_fn, synced_gpus, **kwargs)
1385 raise ValueError(
1386 f"num_return_sequences has to be 1, but is {generation_config.num_return_sequences} when doing"
1387 " greedy search."
1388 )
1390 # 11. run greedy search
-> 1391 return self.greedy_search(
1392 input_ids,
1393 logits_processor=logits_processor,
1394 stopping_criteria=stopping_criteria,
1395 pad_token_id=generation_config.pad_token_id,
1396 eos_token_id=generation_config.eos_token_id,
1397 output_scores=generation_config.output_scores,
1398 return_dict_in_generate=generation_config.return_dict_in_generate,
1399 synced_gpus=synced_gpus,
1400 **model_kwargs,
1401 )
1403 elif is_contrastive_search_gen_mode:
1404 if generation_config.num_return_sequences > 1:
File ~/.conda/envs/llm_lab/lib/python3.10/site-packages/transformers/generation/utils.py:2179, in GenerationMixin.greedy_search(self, input_ids, logits_processor, stopping_criteria, max_length, pad_token_id, eos_token_id, output_attentions, output_hidden_states, output_scores, return_dict_in_generate, synced_gpus, **model_kwargs)
2176 model_inputs = self.prepare_inputs_for_generation(input_ids, **model_kwargs)
2178 # forward pass to get next token
-> 2179 outputs = self(
2180 **model_inputs,
2181 return_dict=True,
2182 output_attentions=output_attentions,
2183 output_hidden_states=output_hidden_states,
2184 )
2186 if synced_gpus and this_peer_finished:
2187 continue # don't waste resources running the code we don't need
File ~/.conda/envs/llm_lab/lib/python3.10/site-packages/torch/nn/modules/module.py:1194, in Module._call_impl(self, *input, **kwargs)
1190 # If we don't have any hooks, we want to skip the rest of the logic in
1191 # this function, and just call forward.
1192 if not (self._backward_hooks or self._forward_hooks or self._forward_pre_hooks or _global_backward_hooks
1193 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1194 return forward_call(*input, **kwargs)
1195 # Do not call functions when jit is used
1196 full_backward_hooks, non_full_backward_hooks = [], []
File ~/.conda/envs/llm_lab/lib/python3.10/site-packages/accelerate/hooks.py:158, in add_hook_to_module.<locals>.new_forward(*args, **kwargs)
156 output = old_forward(*args, **kwargs)
157 else:
--> 158 output = old_forward(*args, **kwargs)
159 return module._hf_hook.post_forward(module, output)
File ~/.conda/envs/llm_lab/lib/python3.10/site-packages/transformers/models/t5/modeling_t5.py:1691, in T5ForConditionalGeneration.forward(self, input_ids, attention_mask, decoder_input_ids, decoder_attention_mask, head_mask, decoder_head_mask, cross_attn_head_mask, encoder_outputs, past_key_values, inputs_embeds, decoder_inputs_embeds, labels, use_cache, output_attentions, output_hidden_states, return_dict)
1686 if self.config.tie_word_embeddings:
1687 # Rescale output before projecting on vocab
1688 # See https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/transformer/transformer.py#L586
1689 sequence_output = sequence_output * (self.model_dim**-0.5)
-> 1691 lm_logits = self.lm_head(sequence_output)
1693 loss = None
1694 if labels is not None:
File ~/.conda/envs/llm_lab/lib/python3.10/site-packages/torch/nn/modules/module.py:1194, in Module._call_impl(self, *input, **kwargs)
1190 # If we don't have any hooks, we want to skip the rest of the logic in
1191 # this function, and just call forward.
1192 if not (self._backward_hooks or self._forward_hooks or self._forward_pre_hooks or _global_backward_hooks
1193 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1194 return forward_call(*input, **kwargs)
1195 # Do not call functions when jit is used
1196 full_backward_hooks, non_full_backward_hooks = [], []
File ~/.conda/envs/llm_lab/lib/python3.10/site-packages/accelerate/hooks.py:158, in add_hook_to_module.<locals>.new_forward(*args, **kwargs)
156 output = old_forward(*args, **kwargs)
157 else:
--> 158 output = old_forward(*args, **kwargs)
159 return module._hf_hook.post_forward(module, output)
File ~/.conda/envs/llm_lab/lib/python3.10/site-packages/torch/nn/modules/linear.py:114, in Linear.forward(self, input)
113 def forward(self, input: Tensor) -> Tensor:
--> 114 return F.linear(input, self.weight, self.bias)
RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cuda:0 and cuda:3! (when checking argument for argument mat2 in method wrapper_mm)
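One thing I plan to try (untested, so take it with a grain of salt) is to skip device_map = 'auto' and pass an explicit map that pins the shared embeddings and lm_head to the same GPU as the inputs. The split below is only illustrative; the encoder/decoder assignments would need tuning to actually fit flan-ul2 across the 16 V100s:

# Untested idea: keep the tied embeddings and lm_head on cuda:0 so the final
# projection sees tensors on the same device as the inputs. The encoder and
# decoder placements here are placeholders, not a working memory layout.
custom_device_map = {
    'shared': 0,
    'lm_head': 0,
    'encoder': 0,
    'decoder': 1,
}
model = T5ForConditionalGeneration.from_pretrained('google/flan-ul2',
                                                   cache_dir = './models',
                                                   device_map = custom_device_map,
                                                   load_in_8bit = True)
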
@sgugger it seems like you and some other team members were working on this issue in this transformers PR. Any advice on how we should proceed here?
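
In the meantime, happy to post exact versions if that helps; this is the snippet I’d use to grab them (assuming each package exposes __version__; older bitsandbytes releases may not):

import importlib

# Print the version of each library that could be involved in the device
# placement, guarding against packages that don't expose __version__.
for pkg in ('torch', 'transformers', 'accelerate', 'bitsandbytes'):
    module = importlib.import_module(pkg)
    print(pkg, getattr(module, '__version__', 'unknown'))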