Bitsandbytes and CUBLAS_STATUS_NOT_INITIALIZED error

I tried:

import torch
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained(
    model_id,
    revision="refs/pr/4",
    num_labels=2,
    torch_dtype=torch.float16,
    quantization_config=bnb_cfg,
    device_map="cuda:0",
)

and got back:

RuntimeError: CUDA error: CUBLAS_STATUS_ALLOC_FAILED when calling `cublasCreate(handle)`

Full trace:

/usr/local/lib/python3.11/dist-packages/torch/_dynamo/eval_frame.py:632:
 UserWarning: torch.utils.checkpoint: the use_reentrant parameter should be passed explicitly. In version 2.5 we will raise an exception if use_reentrant is not passed. use_reentrant=False is recommended,
 but if you need to preserve the current default behavior, you can pass use_reentrant=True. Refer to docs for more details on the differences between the two variants.
  return fn(*args, **kwargs)
../aten/src/ATen/native/cuda/Loss.cu:250: nll_loss_forward_reduce_cuda_kernel_2d: block: [0,0,0], thread: [0,0,0] Assertion `t >= 0 && t < n_classes` failed.
../aten/src/ATen/native/cuda/Loss.cu:250: nll_loss_forward_reduce_cuda_kernel_2d: block: [0,0,0], thread: [1,0,0] Assertion `t >= 0 && t < n_classes` failed.

---------------------------------------------------------------------------
RuntimeError                              Traceback (most recent call last)
/tmp/ipykernel_119/4032920361.py in <cell line: 0>()
----> 1 trainer.train()

/usr/local/lib/python3.11/dist-packages/transformers/trainer.py in train(self, resume_from_checkpoint, trial, ignore_keys_for_eval, **kwargs)
   2169                 hf_hub_utils.enable_progress_bars()
   2170         else:
-> 2171             return inner_training_loop(
   2172                 args=args,
   2173                 resume_from_checkpoint=resume_from_checkpoint,

/usr/local/lib/python3.11/dist-packages/transformers/trainer.py in _inner_training_loop(self, batch_size, args, resume_from_checkpoint, trial, ignore_keys_for_eval)
   2529                     )
   2530                     with context():
-> 2531                         tr_loss_step = self.training_step(model, inputs, num_items_in_batch)
   2532 
   2533                     if (

/usr/local/lib/python3.11/dist-packages/transformers/trainer.py in training_step(***failed resolving arguments***)
   3710                 loss = loss / self.args.gradient_accumulation_steps
   3711 
-> 3712             self.accelerator.backward(loss, **kwargs)
   3713 
   3714             return loss.detach()

/usr/local/lib/python3.11/dist-packages/accelerate/accelerator.py in backward(self, loss, **kwargs)
   2732             self.lomo_backward(loss, learning_rate)
   2733         else:
-> 2734             loss.backward(**kwargs)
   2735 
   2736     def set_trigger(self):

/usr/local/lib/python3.11/dist-packages/torch/_tensor.py in backward(self, gradient, retain_graph, create_graph, inputs)
    579                 inputs=inputs,
    580             )
--> 581         torch.autograd.backward(
    582             self, gradient, retain_graph, create_graph, inputs=inputs
    583         )

/usr/local/lib/python3.11/dist-packages/torch/autograd/__init__.py in backward(tensors, grad_tensors, retain_graph, create_graph, grad_variables, inputs)
    345     # some Python versions print out the first line of a multi-line function
    346     # calls in the traceback and some print out the last line
--> 347     _engine_run_backward(
    348         tensors,
    349         grad_tensors_,

/usr/local/lib/python3.11/dist-packages/torch/autograd/graph.py in _engine_run_backward(t_outputs, *args, **kwargs)
    823         unregister_hooks = _register_logging_hooks_on_whole_graph(t_outputs)
    824     try:
--> 825         return Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass
    826             t_outputs, *args, **kwargs
    827         )  # Calls into the C++ engine to run the backward pass

RuntimeError: CUDA error: CUBLAS_STATUS_ALLOC_FAILED when calling `cublasCreate(handle)`

This looks like a label problem rather than a bitsandbytes one: the device-side assertion `t >= 0 && t < n_classes` in nll_loss means some labels fall outside the range the classification head expects (with num_labels=2, every label must be 0 or 1). The CUBLAS_STATUS_ALLOC_FAILED at cublasCreate is just the downstream symptom of that failed assert.
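
A quick way to confirm is to inspect the label column before calling trainer.train(). A minimal sketch, assuming the training split is a datasets.Dataset named train_dataset with a "labels" column (both names are placeholders, adjust to your setup):

from collections import Counter

num_labels = 2  # must match the num_labels passed to from_pretrained
counts = Counter(train_dataset["labels"])  # train_dataset and "labels" are assumed names
print(counts)

# Any label outside 0..num_labels-1 trips the nll_loss assertion on the GPU
bad = sorted(set(counts) - set(range(num_labels)))
if bad:
    print(f"Out-of-range labels found: {bad}; remap them to 0..{num_labels - 1}")

If the labels turn out to be, say, 1 and 2 (or -1 for ignored rows), remapping them to 0 and 1, or passing a matching num_labels, should make the assert and the CUBLAS error disappear.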