Unable to run inference in 8-bit mode: 'NoneType' object has no attribute 'device'

Hi, I am trying to load a model in 8-bit on Colab. The model itself loads onto the GPU, but whenever I try to run inference it throws:
AttributeError: 'NoneType' object has no attribute 'device'

Here is the code:

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('sarvamai/OpenHathi-7B-Hi-v0.1-Base')
model = AutoModelForCausalLM.from_pretrained(
    'sarvamai/OpenHathi-7B-Hi-v0.1-Base',
    torch_dtype=torch.float16,
    device_map="cuda",
    load_in_8bit=True,
)

query = "What is India, reply in Hindi?"

inputs = tokenizer("WHat is India?" , return_tensors = "pt").to("cuda")
with torch.inference_mode():
    outputs = model.generate(**inputs, max_new_tokens = 100)
completion = tokenizer.batch_decode(outputs, skip_special_tokens = True)[0][len(query):]

print(completion) 

My dependencies:
transformers 4.36.0.dev0
accelerate 0.25.0
bitsandbytes 0.41.3.post2
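
For reference, a quick way to confirm the weights really are on the GPU (this check is not in my snippet above; `hf_device_map` and `get_memory_footprint()` are standard transformers/accelerate attributes):

# Sanity check (not part of the original snippet): where did the 8-bit weights land?
print(next(model.parameters()).device)           # expect cuda:0
print(getattr(model, "hf_device_map", None))     # per-module placement when accelerate dispatches the model
print(model.get_memory_footprint() / 1e9, "GB")  # roughly 7 GB for a 7B model in 8-bit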

Can you please post the full traceback?

Also, using device_map="cuda" is not recommended. Just do model.to("cuda") instead.

I have copied the entire output:

AttributeError Traceback (most recent call last)
in <cell line: 4>()
3 inputs = tokenizer("WHat is India?" , return_tensors = "pt").to("cuda")
4 with torch.inference_mode():
----> 5 outputs = model.generate(**inputs, max_new_tokens = 100)
6 completion = tokenizer.batch_decode(outputs, skip_special_tokens = True)[0][len(query):]
7

26 frames
/usr/local/lib/python3.10/dist-packages/torch/utils/_contextlib.py in decorate_context(*args, **kwargs)
113 def decorate_context(*args, **kwargs):
114 with ctx_factory():
→ 115 return func(*args, **kwargs)
116
117 return decorate_context

/usr/local/lib/python3.10/dist-packages/transformers/generation/utils.py in generate(self, inputs, generation_config, logits_processor, stopping_criteria, prefix_allowed_tokens_fn, synced_gpus, assistant_model, streamer, negative_prompt_ids, negative_prompt_attention_mask, **kwargs)
1775
1776 # 13. run sample
→ 1777 return self.sample(
1778 input_ids,
1779 logits_processor=logits_processor,

/usr/local/lib/python3.10/dist-packages/transformers/generation/utils.py in sample(self, input_ids, logits_processor, stopping_criteria, logits_warper, max_length, pad_token_id, eos_token_id, output_attentions, output_hidden_states, output_scores, return_dict_in_generate, synced_gpus, streamer, **model_kwargs)
2872
2873 # forward pass to get next token
→ 2874 outputs = self(
2875 **model_inputs,
2876 return_dict=True,

/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py in _wrapped_call_impl(self, *args, **kwargs)
1516 return self._compiled_call_impl(*args, **kwargs) # type: ignore[misc]
1517 else:
→ 1518 return self._call_impl(*args, **kwargs)
1519
1520 def _call_impl(self, *args, **kwargs):

/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py in _call_impl(self, *args, **kwargs)
1525 or _global_backward_pre_hooks or _global_backward_hooks
1526 or _global_forward_hooks or _global_forward_pre_hooks):
→ 1527 return forward_call(*args, **kwargs)
1528
1529 try:

/usr/local/lib/python3.10/dist-packages/accelerate/hooks.py in new_forward(module, *args, **kwargs)
163 output = module._old_forward(*args, **kwargs)
164 else:
→ 165 output = module._old_forward(*args, **kwargs)
166 return module._hf_hook.post_forward(module, output)
167

/usr/local/lib/python3.10/dist-packages/transformers/models/llama/modeling_llama.py in forward(self, input_ids, attention_mask, position_ids, past_key_values, inputs_embeds, labels, use_cache, output_attentions, output_hidden_states, return_dict)
1179
1180 # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
→ 1181 outputs = self.model(
1182 input_ids=input_ids,
1183 attention_mask=attention_mask,

/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py in _wrapped_call_impl(self, *args, **kwargs)
1516 return self._compiled_call_impl(*args, **kwargs) # type: ignore[misc]
1517 else:
→ 1518 return self._call_impl(*args, **kwargs)
1519
1520 def _call_impl(self, *args, **kwargs):

/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py in _call_impl(self, *args, **kwargs)
1525 or _global_backward_pre_hooks or _global_backward_hooks
1526 or _global_forward_hooks or _global_forward_pre_hooks):
→ 1527 return forward_call(*args, **kwargs)
1528
1529 try:

/usr/local/lib/python3.10/dist-packages/accelerate/hooks.py in new_forward(module, *args, **kwargs)
163 output = module._old_forward(*args, **kwargs)
164 else:
→ 165 output = module._old_forward(*args, **kwargs)
166 return module._hf_hook.post_forward(module, output)
167

/usr/local/lib/python3.10/dist-packages/transformers/models/llama/modeling_llama.py in forward(self, input_ids, attention_mask, position_ids, past_key_values, inputs_embeds, use_cache, output_attentions, output_hidden_states, return_dict)
1066 )
1067 else:
→ 1068 layer_outputs = decoder_layer(
1069 hidden_states,
1070 attention_mask=attention_mask,

/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py in _wrapped_call_impl(self, *args, **kwargs)
1516 return self._compiled_call_impl(*args, **kwargs) # type: ignore[misc]
1517 else:
→ 1518 return self._call_impl(*args, **kwargs)
1519
1520 def _call_impl(self, *args, **kwargs):

/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py in _call_impl(self, *args, **kwargs)
1525 or _global_backward_pre_hooks or _global_backward_hooks
1526 or _global_forward_hooks or _global_forward_pre_hooks):
→ 1527 return forward_call(*args, **kwargs)
1528
1529 try:

/usr/local/lib/python3.10/dist-packages/accelerate/hooks.py in new_forward(module, *args, **kwargs)
163 output = module._old_forward(*args, **kwargs)
164 else:
→ 165 output = module._old_forward(*args, **kwargs)
166 return module._hf_hook.post_forward(module, output)
167

/usr/local/lib/python3.10/dist-packages/transformers/models/llama/modeling_llama.py in forward(self, hidden_states, attention_mask, position_ids, past_key_value, output_attentions, use_cache, **kwargs)
794
795 # Self Attention
→ 796 hidden_states, self_attn_weights, present_key_value = self.self_attn(
797 hidden_states=hidden_states,
798 attention_mask=attention_mask,

/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py in _wrapped_call_impl(self, *args, **kwargs)
1516 return self._compiled_call_impl(*args, **kwargs) # type: ignore[misc]
1517 else:
→ 1518 return self._call_impl(*args, **kwargs)
1519
1520 def _call_impl(self, *args, **kwargs):

/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py in _call_impl(self, *args, **kwargs)
1525 or _global_backward_pre_hooks or _global_backward_hooks
1526 or _global_forward_hooks or _global_forward_pre_hooks):
→ 1527 return forward_call(*args, **kwargs)
1528
1529 try:

/usr/local/lib/python3.10/dist-packages/accelerate/hooks.py in new_forward(module, *args, **kwargs)
163 output = module._old_forward(*args, **kwargs)
164 else:
→ 165 output = module._old_forward(*args, **kwargs)
166 return module._hf_hook.post_forward(module, output)
167

/usr/local/lib/python3.10/dist-packages/transformers/models/llama/modeling_llama.py in forward(self, hidden_states, attention_mask, position_ids, past_key_value, output_attentions, use_cache, **kwargs)
384
385 else:
→ 386 query_states = self.q_proj(hidden_states)
387 key_states = self.k_proj(hidden_states)
388 value_states = self.v_proj(hidden_states)

/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py in _wrapped_call_impl(self, *args, **kwargs)
1516 return self._compiled_call_impl(*args, **kwargs) # type: ignore[misc]
1517 else:
→ 1518 return self._call_impl(*args, **kwargs)
1519
1520 def _call_impl(self, *args, **kwargs):

/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py in _call_impl(self, *args, **kwargs)
1525 or _global_backward_pre_hooks or _global_backward_hooks
1526 or _global_forward_hooks or _global_forward_pre_hooks):
→ 1527 return forward_call(*args, **kwargs)
1528
1529 try:

/usr/local/lib/python3.10/dist-packages/accelerate/hooks.py in new_forward(module, *args, **kwargs)
163 output = module._old_forward(*args, **kwargs)
164 else:
→ 165 output = module._old_forward(*args, **kwargs)
166 return module._hf_hook.post_forward(module, output)
167

/usr/local/lib/python3.10/dist-packages/bitsandbytes/nn/modules.py in forward(self, x)
448 self.bias.data = self.bias.data.to(x.dtype)
449
→ 450 out = bnb.matmul(x, self.weight, bias=self.bias, state=self.state)
451
452 if not self.state.has_fp16_weights:

/usr/local/lib/python3.10/dist-packages/bitsandbytes/autograd/_functions.py in matmul(A, B, out, state, threshold, bias)
560 if threshold > 0.0:
561 state.threshold = threshold
→ 562 return MatMul8bitLt.apply(A, B, out, bias, state)
563
564

/usr/local/lib/python3.10/dist-packages/torch/autograd/function.py in apply(cls, *args, **kwargs)
537 # See NOTE: [functorch vjp and autograd interaction]
538 args = _functorch.utils.unwrap_dead_wrappers(args)
→ 539 return super().apply(*args, **kwargs) # type: ignore[misc]
540
541 if cls.setup_context == _SingleLevelFunction.setup_context:

/usr/local/lib/python3.10/dist-packages/bitsandbytes/autograd/_functions.py in forward(ctx, A, B, out, bias, state)
342 else:
343 if not state.has_fp16_weights and state.CxB is None and using_igemmlt:
→ 344 state.CxB, state.SB = F.transform(state.CB, to_order=formatB)
345 subA = None
346

/usr/local/lib/python3.10/dist-packages/bitsandbytes/functional.py in transform(A, to_order, from_order, out, transpose, state, ld)
2194
2195 def transform(A, to_order, from_order='row', out=None, transpose=False, state=None, ld=None):
→ 2196 prev_device = pre_call(A.device)
2197 if state is None: state = (A.shape, from_order)
2198 else: from_order = state[1]

AttributeError: 'NoneType' object has no attribute 'device'

@muellerzr I have removed device_map and tried:

tokenizer = AutoTokenizer.from_pretrained('sarvamai/OpenHathi-7B-Hi-v0.1-Base')
model = AutoModelForCausalLM.from_pretrained(
    'sarvamai/OpenHathi-7B-Hi-v0.1-Base',
    torch_dtype=torch.float16,
    load_in_8bit=True,
).to("cuda")

which produced this error:

WARNING:accelerate.big_modeling:You shouldn't move a model when it is dispatched on multiple devices.


ValueError Traceback (most recent call last)
in <cell line: 2>()
4 torch_dtype=torch.float16,
5 load_in_8bit = True,
----> 6 ).to("cuda")

1 frames
/usr/local/lib/python3.10/dist-packages/transformers/modeling_utils.py in to(self, *args, **kwargs)
2436 # Checks if the model has been loaded in 8-bit
2437 if getattr(self, "quantization_method", None) == QuantizationMethod.BITS_AND_BYTES:
→ 2438 raise ValueError(
2439 ".to is not supported for 4-bit or 8-bit bitsandbytes models. Please use the model as it is, since the"
2440 " model has already been set to the correct devices and casted to the correct dtype."

ValueError: .to is not supported for 4-bit or 8-bit bitsandbytes models. Please use the model as it is, since the model has already been set to the correct devices and casted to the correct dtype.

I solved the issue by removing .to("cuda") after from_pretrained:

tokenizer = AutoTokenizer.from_pretrained('sarvamai/OpenHathi-7B-Hi-v0.1-Base')
model = AutoModelForCausalLM.from_pretrained(
    'sarvamai/OpenHathi-7B-Hi-v0.1-Base',
    load_in_8bit=True,
)

This works fine. There seems to have been a dtype-related issue (float16) when offloading onto the GPU. Inference is a bit slower in 8-bit, though.
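
For anyone landing here later, here is a minimal end-to-end sketch of what ended up working for me, written with BitsAndBytesConfig (the currently recommended way to request 8-bit loading) instead of the bare load_in_8bit argument. Treat it as a sketch, not the only valid setup:

# Minimal 8-bit inference sketch (assumes a single-GPU Colab runtime).
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

model_id = "sarvamai/OpenHathi-7B-Hi-v0.1-Base"

tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=BitsAndBytesConfig(load_in_8bit=True),
    device_map="auto",  # let accelerate place the quantized weights; no .to("cuda") afterwards
)

query = "What is India, reply in Hindi?"
inputs = tokenizer(query, return_tensors="pt").to(model.device)

with torch.inference_mode():
    outputs = model.generate(**inputs, max_new_tokens=100)

# Strip the prompt from the decoded text so only the completion remains.
completion = tokenizer.batch_decode(outputs, skip_special_tokens=True)[0][len(query):]
print(completion)

Passing device_map="auto" together with the quantization config lets accelerate handle placement, which avoids the manual .to("cuda") that 8-bit models reject.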