I tried:
model = AutoModelForSequenceClassification.from_pretrained(
    model_id,
    revision="refs/pr/4",
    num_labels=2,
    torch_dtype=torch.float16,
    quantization_config=bnb_cfg,
    device_map="cuda:0",
)
and got back:
RuntimeError: CUDA error: CUBLAS_STATUS_ALLOC_FAILED when calling `cublasCreate(handle)`
Full trace:
/usr/local/lib/python3.11/dist-packages/torch/_dynamo/eval_frame.py:632: UserWarning: torch.utils.checkpoint: the use_reentrant parameter should be passed explicitly. In version 2.5 we will raise an exception if use_reentrant is not passed. use_reentrant=False is recommended, but if you need to preserve the current default behavior, you can pass use_reentrant=True. Refer to docs for more details on the differences between the two variants.
  return fn(*args, **kwargs)
../aten/src/ATen/native/cuda/Loss.cu:250: nll_loss_forward_reduce_cuda_kernel_2d: block: [0,0,0], thread: [0,0,0] Assertion `t >= 0 && t < n_classes` failed.
../aten/src/ATen/native/cuda/Loss.cu:250: nll_loss_forward_reduce_cuda_kernel_2d: block: [0,0,0], thread: [1,0,0] Assertion `t >= 0 && t < n_classes` failed.
---------------------------------------------------------------------------
RuntimeError Traceback (most recent call last)
/tmp/ipykernel_119/4032920361.py in <cell line: 0>()
----> 1 trainer.train()
/usr/local/lib/python3.11/dist-packages/transformers/trainer.py in train(self, resume_from_checkpoint, trial, ignore_keys_for_eval, **kwargs)
2169 hf_hub_utils.enable_progress_bars()
2170 else:
-> 2171 return inner_training_loop(
2172 args=args,
2173 resume_from_checkpoint=resume_from_checkpoint,
/usr/local/lib/python3.11/dist-packages/transformers/trainer.py in _inner_training_loop(self, batch_size, args, resume_from_checkpoint, trial, ignore_keys_for_eval)
2529 )
2530 with context():
-> 2531 tr_loss_step = self.training_step(model, inputs, num_items_in_batch)
2532
2533 if (
/usr/local/lib/python3.11/dist-packages/transformers/trainer.py in training_step(***failed resolving arguments***)
3710 loss = loss / self.args.gradient_accumulation_steps
3711
-> 3712 self.accelerator.backward(loss, **kwargs)
3713
3714 return loss.detach()
/usr/local/lib/python3.11/dist-packages/accelerate/accelerator.py in backward(self, loss, **kwargs)
2732 self.lomo_backward(loss, learning_rate)
2733 else:
-> 2734 loss.backward(**kwargs)
2735
2736 def set_trigger(self):
/usr/local/lib/python3.11/dist-packages/torch/_tensor.py in backward(self, gradient, retain_graph, create_graph, inputs)
579 inputs=inputs,
580 )
--> 581 torch.autograd.backward(
582 self, gradient, retain_graph, create_graph, inputs=inputs
583 )
/usr/local/lib/python3.11/dist-packages/torch/autograd/__init__.py in backward(tensors, grad_tensors, retain_graph, create_graph, grad_variables, inputs)
345 # some Python versions print out the first line of a multi-line function
346 # calls in the traceback and some print out the last line
--> 347 _engine_run_backward(
348 tensors,
349 grad_tensors_,
/usr/local/lib/python3.11/dist-packages/torch/autograd/graph.py in _engine_run_backward(t_outputs, *args, **kwargs)
823 unregister_hooks = _register_logging_hooks_on_whole_graph(t_outputs)
824 try:
--> 825 return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
826 t_outputs, *args, **kwargs
827 ) # Calls into the C++ engine to run the backward pass
RuntimeError: CUDA error: CUBLAS_STATUS_ALLOC_FAILED when calling `cublasCreate(handle)`
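The assertion `t >= 0 && t < n_classes` in the trace suggests that at least one label falls outside the range expected for num_labels=2, and the cuBLAS error may just be the secondary failure after that assert fires. Here is a minimal sanity check I'm planning to run before training (a rough sketch only; `train_dataset` and the "labels" column name are assumptions about my setup, not something shown above):

# Hypothetical label-range check: the nll_loss assert above usually means
# a label is outside [0, num_labels). Adjust the dataset/column names.
num_labels = 2
labels = train_dataset["labels"]  # assumed column name
bad = [l for l in labels if l < 0 or l >= num_labels]
print(f"{len(bad)} labels outside [0, {num_labels}); min={min(labels)}, max={max(labels)}")

If that turns up out-of-range labels, is remapping them to 0..num_labels-1 the right fix here, or is something else going on with the quantized model?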