Bitsandbytes and CUBLAS_STATUS_NOT_INITIALIZED error

I am trying to run the microsoft/deberta-v3-xsmall model with bitsandbytes and end up with `CUDA error: CUBLAS_STATUS_ALLOC_FAILED when calling cublasCreate(handle)`.
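
For context, the model is loaded roughly like the sketch below (simplified; the exact quantization, LoRA, and label settings are in the notebook linked further down):

```python
import torch
from transformers import AutoModelForSequenceClassification, BitsAndBytesConfig
from peft import LoraConfig, get_peft_model

# 4-bit quantization via bitsandbytes -- the part that seems to trigger the error
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
)

model = AutoModelForSequenceClassification.from_pretrained(
    "microsoft/deberta-v3-xsmall",
    num_labels=2,  # placeholder; the real label count is set in the notebook
    quantization_config=bnb_config,
)

# LoRA adapter on top of the quantized model
lora_config = LoraConfig(task_type="SEQ_CLS", r=8, lora_alpha=16)
model = get_peft_model(model, lora_config)
```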

Everything runs fine without bitsandbytes, or with just LoRA, but bitsandbytes causes this error. I searched the internet and one suggested cause is mismatched matrix dimensions (e.g. an incorrect number of labels). Is it possible to manually step through the model and see the output dimensions of each layer? Something like the sketch below is what I have in mind.
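
This is a rough, untested sketch of what I mean by "stepping through" the model; the hook-based approach is just my guess at how to do it:

```python
import torch

def register_shape_hooks(model):
    """Attach forward hooks that print the output shape of every leaf module."""
    handles = []
    for name, module in model.named_modules():
        if len(list(module.children())) == 0:  # leaf modules only
            def hook(mod, inputs, output, name=name):
                if isinstance(output, torch.Tensor):
                    print(f"{name}: {tuple(output.shape)}")
            handles.append(module.register_forward_hook(hook))
    return handles

# Usage idea: run one batch through the model on a single GPU, then remove the hooks.
# handles = register_shape_hooks(model)
# with torch.no_grad():
#     model(**{k: v.to("cuda:0") for k, v in batch.items()})
# for h in handles:
#     h.remove()
```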

How else can I troubleshoot?

Here is a link to the notebook:
https://www.kaggle.com/code/bridgeport/bitsandbytes-and-cublas-status-not-initialized

Here is the full error message:

/tmp/ipykernel_36/2123443610.py:2: FutureWarning: `tokenizer` is deprecated and will be removed in version 5.0.0 for `Trainer.__init__`. Use `processing_class` instead.
  trainer = Trainer(
No label_names provided for model class `PeftModelForSequenceClassification`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within Trainer. Note that empty label_names list will be used instead.
/usr/local/lib/python3.11/dist-packages/torch/_dynamo/eval_frame.py:745: UserWarning: torch.utils.checkpoint: the use_reentrant parameter should be passed explicitly. In version 2.5 we will raise an exception if use_reentrant is not passed. use_reentrant=False is recommended, but if you need to preserve the current default behavior, you can pass use_reentrant=True. Refer to docs for more details on the differences between the two variants.
  return fn(*args, **kwargs)
RuntimeError                              Traceback (most recent call last)
/tmp/ipykernel_36/2123443610.py in <cell line: 0>()
      9 )
     10 
---> 11 trainer.train()

/usr/local/lib/python3.11/dist-packages/transformers/trainer.py in train(self, resume_from_checkpoint, trial, ignore_keys_for_eval, **kwargs)
   2238                 hf_hub_utils.enable_progress_bars()
   2239         else:
-> 2240             return inner_training_loop(
   2241                 args=args,
   2242                 resume_from_checkpoint=resume_from_checkpoint,

/usr/local/lib/python3.11/dist-packages/transformers/trainer.py in _inner_training_loop(self, batch_size, args, resume_from_checkpoint, trial, ignore_keys_for_eval)
   2553                     )
   2554                     with context():
-> 2555                         tr_loss_step = self.training_step(model, inputs, num_items_in_batch)
   2556 
   2557                     if (

/usr/local/lib/python3.11/dist-packages/transformers/trainer.py in training_step(self, model, inputs, num_items_in_batch)
   3743 
   3744         with self.compute_loss_context_manager():
-> 3745             loss = self.compute_loss(model, inputs, num_items_in_batch=num_items_in_batch)
   3746 
   3747         del inputs

/usr/local/lib/python3.11/dist-packages/transformers/trainer.py in compute_loss(self, model, inputs, return_outputs, num_items_in_batch)
   3808                 loss_kwargs["num_items_in_batch"] = num_items_in_batch
   3809             inputs = {**inputs, **loss_kwargs}
-> 3810         outputs = model(**inputs)
   3811         # Save past state if it exists
   3812         # TODO: this needs to be fixed and made cleaner later.

/usr/local/lib/python3.11/dist-packages/torch/nn/modules/module.py in _wrapped_call_impl(self, *args, **kwargs)
   1737             return self._compiled_call_impl(*args, **kwargs)  # type: ignore[misc]
   1738         else:
-> 1739             return self._call_impl(*args, **kwargs)
   1740 
   1741     # torchrec tests the code consistency with the following code

/usr/local/lib/python3.11/dist-packages/torch/nn/modules/module.py in _call_impl(self, *args, **kwargs)
   1748                 or _global_backward_pre_hooks or _global_backward_hooks
   1749                 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1750             return forward_call(*args, **kwargs)
   1751 
   1752         result = None

/usr/local/lib/python3.11/dist-packages/torch/nn/parallel/data_parallel.py in forward(self, *inputs, **kwargs)
    191                 return self.module(*inputs[0], **module_kwargs[0])
    192             replicas = self.replicate(self.module, self.device_ids[: len(inputs)])
--> 193             outputs = self.parallel_apply(replicas, inputs, module_kwargs)
    194             return self.gather(outputs, self.output_device)
    195 

/usr/local/lib/python3.11/dist-packages/torch/nn/parallel/data_parallel.py in parallel_apply(self, replicas, inputs, kwargs)
    210         self, replicas: Sequence[T], inputs: Sequence[Any], kwargs: Any
    211     ) -> List[Any]:
--> 212         return parallel_apply(
    213             replicas, inputs, kwargs, self.device_ids[: len(replicas)]
    214         )

/usr/local/lib/python3.11/dist-packages/torch/nn/parallel/parallel_apply.py in parallel_apply(modules, inputs, kwargs_tup, devices)
    124         output = results[i]
    125         if isinstance(output, ExceptionWrapper):
--> 126             output.reraise()
    127         outputs.append(output)
    128     return outputs

/usr/local/lib/python3.11/dist-packages/torch/_utils.py in reraise(self)
    731             # instantiate since we don't know how to
    732             raise RuntimeError(msg) from None
--> 733         raise exception
    734 
    735 

RuntimeError: Caught RuntimeError in replica 0 on device 0.
Original Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/torch/nn/parallel/parallel_apply.py", line 96, in _worker
    output = module(*input, **kwargs)
  File "/usr/local/lib/python3.11/dist-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/usr/local/lib/python3.11/dist-packages/torch/nn/modules/module.py", line 1750, in _call_impl
    return forward_call(*args, **kwargs)
  File "/usr/local/lib/python3.11/dist-packages/peft/peft_model.py", line 1559, in forward
    return self.base_model(
  File "/usr/local/lib/python3.11/dist-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/usr/local/lib/python3.11/dist-packages/torch/nn/modules/module.py", line 1750, in _call_impl
    return forward_call(*args, **kwargs)
  File "/usr/local/lib/python3.11/dist-packages/peft/tuners/tuners_utils.py", line 193, in forward
    return self.model.forward(*args, **kwargs)
  File "/usr/local/lib/python3.11/dist-packages/transformers/models/deberta_v2/modeling_deberta_v2.py", line 1089, in forward
    outputs = self.deberta(
  File "/usr/local/lib/python3.11/dist-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/usr/local/lib/python3.11/dist-packages/torch/nn/modules/module.py", line 1750, in _call_impl
    return forward_call(*args, **kwargs)
  File "/usr/local/lib/python3.11/dist-packages/transformers/models/deberta_v2/modeling_deberta_v2.py", line 796, in forward
    encoder_outputs = self.encoder(
  File "/usr/local/lib/python3.11/dist-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/usr/local/lib/python3.11/dist-packages/torch/nn/modules/module.py", line 1750, in _call_impl
    return forward_call(*args, **kwargs)
  File "/usr/local/lib/python3.11/dist-packages/transformers/models/deberta_v2/modeling_deberta_v2.py", line 659, in forward
    output_states, attn_weights = self._gradient_checkpointing_func(
  File "/usr/local/lib/python3.11/dist-packages/torch/_compile.py", line 32, in inner
    return disable_fn(*args, **kwargs)
  File "/usr/local/lib/python3.11/dist-packages/torch/_dynamo/eval_frame.py", line 745, in _fn
    return fn(*args, **kwargs)
  File "/usr/local/lib/python3.11/dist-packages/torch/utils/checkpoint.py", line 489, in checkpoint
    return CheckpointFunction.apply(function, preserve, *args)
  File "/usr/local/lib/python3.11/dist-packages/torch/autograd/function.py", line 575, in apply
    return super().apply(*args, **kwargs)  # type: ignore[misc]
  File "/usr/local/lib/python3.11/dist-packages/torch/utils/checkpoint.py", line 264, in forward
    outputs = run_function(*args)
  File "/usr/local/lib/python3.11/dist-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/usr/local/lib/python3.11/dist-packages/torch/nn/modules/module.py", line 1750, in _call_impl
    return forward_call(*args, **kwargs)
  File "/usr/local/lib/python3.11/dist-packages/transformers/models/deberta_v2/modeling_deberta_v2.py", line 437, in forward
    attention_output, att_matrix = self.attention(
  File "/usr/local/lib/python3.11/dist-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/usr/local/lib/python3.11/dist-packages/torch/nn/modules/module.py", line 1750, in _call_impl
    return forward_call(*args, **kwargs)
  File "/usr/local/lib/python3.11/dist-packages/transformers/models/deberta_v2/modeling_deberta_v2.py", line 370, in forward
    self_output, att_matrix = self.self(
  File "/usr/local/lib/python3.11/dist-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/usr/local/lib/python3.11/dist-packages/torch/nn/modules/module.py", line 1750, in _call_impl
    return forward_call(*args, **kwargs)
  File "/usr/local/lib/python3.11/dist-packages/transformers/models/deberta_v2/modeling_deberta_v2.py", line 235, in forward
    query_layer = self.transpose_for_scores(self.query_proj(query_states), self.num_attention_heads)
  File "/usr/local/lib/python3.11/dist-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/usr/local/lib/python3.11/dist-packages/torch/nn/modules/module.py", line 1750, in _call_impl
    return forward_call(*args, **kwargs)
  File "/usr/local/lib/python3.11/dist-packages/bitsandbytes/nn/modules.py", line 565, in forward
    return bnb.matmul_4bit(x, weight, bias=bias, quant_state=self.weight.quant_state).to(inp_dtype)
  File "/usr/local/lib/python3.11/dist-packages/bitsandbytes/autograd/_functions.py", line 466, in matmul_4bit
    return MatMul4Bit.apply(A, B, out, bias, quant_state)
  File "/usr/local/lib/python3.11/dist-packages/torch/autograd/function.py", line 575, in apply
    return super().apply(*args, **kwargs)  # type: ignore[misc]
  File "/usr/local/lib/python3.11/dist-packages/bitsandbytes/autograd/_functions.py", line 380, in forward
    output = torch.nn.functional.linear(A, F.dequantize_4bit(B, quant_state).to(A.dtype).t(), bias)
RuntimeError: CUDA error: CUBLAS_STATUS_ALLOC_FAILED when calling `cublasCreate(handle)`