I am using bitsandbytes 4-bit quantization to load Mistral-7B on an NVIDIA T4 GPU. I load the model with the quantized configuration, but I keep getting a RuntimeError related to device placement, even though I have made sure that both the model and the inputs are on CUDA.

Transformers version: 4.41.2
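Since the T4 is an older card, I also double-checked what the GPU itself reports before running anything (just a quick sanity check, nothing specific to the model below):

import torch

print(torch.cuda.get_device_name(0))        # should report the Tesla T4
print(torch.cuda.get_device_capability(0))  # T4 is compute capability 7.5
print(torch.cuda.is_bf16_supported())       # whether this build/device reports bf16 support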
import torch
import bitsandbytes
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from peft import LoraConfig, get_peft_model

tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-v0.1")

# 4-bit NF4 quantization with bf16 compute
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_storage=torch.bfloat16,
)

model = AutoModelForCausalLM.from_pretrained(
    "mistralai/Mistral-7B-v0.1",
    quantization_config=bnb_config,
)

# LoRA adapters on the attention projections
config = LoraConfig(
    r=32,
    lora_alpha=64,
    lora_dropout=0.01,
    target_modules=["q_proj", "k_proj", "v_proj"],
    bias="none",
    task_type="CAUSAL_LM",
)
peft_model = get_peft_model(model, config)

inputs = tokenizer("Do you have time", return_tensors="pt").to("cuda")
print("Inputs:", inputs)
{'input_ids': tensor([[ 1, 2378, 368, 506, 727]], device='cuda:0'),
'attention_mask': tensor([[1, 1, 1, 1, 1]], device='cuda:0')}
with torch.no_grad():
    outputs = peft_model(**inputs)

print("Outputs:\n", outputs.logits)
Error:
---------------------------------------------------------------------------
RuntimeError Traceback (most recent call last)
Cell In[44], line 2
1 with torch.no_grad():
----> 2 outputs = peft_model(**inputs)
4 print("Outputs:\n", outputs.logits)
5 print("Outputs dimensions:", outputs.logits.shape) # shape: (batch_size, num_tokens, num_classes)
File ~/anaconda3/envs/python3/lib/python3.10/site-packages/torch/nn/modules/module.py:1532, in Module._wrapped_call_impl(self, *args, **kwargs)
1530 return self._compiled_call_impl(*args, **kwargs) # type: ignore[misc]
1531 else:
-> 1532 return self._call_impl(*args, **kwargs)
File ~/anaconda3/envs/python3/lib/python3.10/site-packages/torch/nn/modules/module.py:1541, in Module._call_impl(self, *args, **kwargs)
1536 # If we don't have any hooks, we want to skip the rest of the logic in
1537 # this function, and just call forward.
1538 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
1539 or _global_backward_pre_hooks or _global_backward_hooks
1540 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1541 return forward_call(*args, **kwargs)
1543 try:
1544 result = None
File ~/anaconda3/envs/python3/lib/python3.10/site-packages/peft/peft_model.py:1430, in PeftModelForCausalLM.forward(self, input_ids, attention_mask, inputs_embeds, labels, output_attentions, output_hidden_states, return_dict, task_ids, **kwargs)
1428 with self._enable_peft_forward_hooks(**kwargs):
1429 kwargs = {k: v for k, v in kwargs.items() if k not in self.special_peft_forward_args}
-> 1430 return self.base_model(
1431 input_ids=input_ids,
1432 attention_mask=attention_mask,
1433 inputs_embeds=inputs_embeds,
1434 labels=labels,
1435 output_attentions=output_attentions,
1436 output_hidden_states=output_hidden_states,
1437 return_dict=return_dict,
1438 **kwargs,
1439 )
1441 batch_size = _get_batch_size(input_ids, inputs_embeds)
1442 if attention_mask is not None:
1443 # concat prompt attention mask
File ~/anaconda3/envs/python3/lib/python3.10/site-packages/torch/nn/modules/module.py:1532, in Module._wrapped_call_impl(self, *args, **kwargs)
1530 return self._compiled_call_impl(*args, **kwargs) # type: ignore[misc]
1531 else:
-> 1532 return self._call_impl(*args, **kwargs)
File ~/anaconda3/envs/python3/lib/python3.10/site-packages/torch/nn/modules/module.py:1541, in Module._call_impl(self, *args, **kwargs)
1536 # If we don't have any hooks, we want to skip the rest of the logic in
1537 # this function, and just call forward.
1538 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
1539 or _global_backward_pre_hooks or _global_backward_hooks
1540 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1541 return forward_call(*args, **kwargs)
1543 try:
1544 result = None
File ~/anaconda3/envs/python3/lib/python3.10/site-packages/peft/tuners/tuners_utils.py:179, in BaseTuner.forward(self, *args, **kwargs)
178 def forward(self, *args: Any, **kwargs: Any):
--> 179 return self.model.forward(*args, **kwargs)
File ~/anaconda3/envs/python3/lib/python3.10/site-packages/accelerate/hooks.py:166, in add_hook_to_module.<locals>.new_forward(module, *args, **kwargs)
164 output = module._old_forward(*args, **kwargs)
165 else:
--> 166 output = module._old_forward(*args, **kwargs)
167 return module._hf_hook.post_forward(module, output)
File ~/anaconda3/envs/python3/lib/python3.10/site-packages/transformers/models/mistral/modeling_mistral.py:1152, in MistralForCausalLM.forward(self, input_ids, attention_mask, position_ids, past_key_values, inputs_embeds, labels, use_cache, output_attentions, output_hidden_states, return_dict)
1139 outputs = self.model(
1140 input_ids=input_ids,
1141 attention_mask=attention_mask,
(...)
1148 return_dict=return_dict,
1149 )
1151 hidden_states = outputs[0]
-> 1152 logits = self.lm_head(hidden_states)
1153 logits = logits.float()
1155 loss = None
File ~/anaconda3/envs/python3/lib/python3.10/site-packages/torch/nn/modules/module.py:1532, in Module._wrapped_call_impl(self, *args, **kwargs)
1530 return self._compiled_call_impl(*args, **kwargs) # type: ignore[misc]
1531 else:
-> 1532 return self._call_impl(*args, **kwargs)
File ~/anaconda3/envs/python3/lib/python3.10/site-packages/torch/nn/modules/module.py:1541, in Module._call_impl(self, *args, **kwargs)
1536 # If we don't have any hooks, we want to skip the rest of the logic in
1537 # this function, and just call forward.
1538 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
1539 or _global_backward_pre_hooks or _global_backward_hooks
1540 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1541 return forward_call(*args, **kwargs)
1543 try:
1544 result = None
File ~/anaconda3/envs/python3/lib/python3.10/site-packages/torch/nn/modules/linear.py:116, in Linear.forward(self, input)
115 def forward(self, input: Tensor) -> Tensor:
--> 116 return F.linear(input, self.weight, self.bias)
RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cuda:0 and cpu! (when checking argument for argument mat2 in method wrapper_CUDA_mm)
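For reference, this is roughly how I have been checking device placement before the forward pass (a minimal sketch; peft_model and inputs are the variables defined above, and hf_device_map may simply be absent on this model):

# Devices of all (quantized + LoRA) parameters
print({p.device for p in peft_model.parameters()})

# Per-module placement recorded by accelerate, if any
print(getattr(peft_model, "hf_device_map", None))

# Devices of the tokenized inputs
print({k: v.device for k, v in inputs.items()})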