QLoRA - 8-bit quantization using bitsandbytes gives an error for the OWL-ViT model

Hi Team,

When I run the above QLoRA code for the OWL-ViT model (google/owlvit-base-patch32) with the 4-bit BitsAndBytesConfig below, fine-tuning runs without any error.

import torch
from transformers import BitsAndBytesConfig

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)
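For reference, here is roughly how the config feeds into the rest of the setup (a simplified sketch; the LoRA target modules and hyperparameters below are illustrative, not my exact values):

from transformers import OwlViTForObjectDetection
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training

# load the quantized base model (bnb_config defined above)
model = OwlViTForObjectDetection.from_pretrained(
    "google/owlvit-base-patch32",
    quantization_config=bnb_config,
    device_map="auto",
)

# make the quantized model trainable (casts norms / output layers to fp32, etc.)
model = prepare_model_for_kbit_training(model)

# illustrative LoRA setup; target_modules are an assumption for OWL-ViT's attention layers
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    target_modules=["q_proj", "v_proj"],
)
model = get_peft_model(model, lora_config)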

Once I change the config to the following:

bnb_config = BitsAndBytesConfig(
    load_in_8bit=True,
)
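Everything else stays the same; only the config object passed to from_pretrained changes:

# same load call as before, only the quantization config differs
model = OwlViTForObjectDetection.from_pretrained(
    "google/owlvit-base-patch32",
    quantization_config=bnb_config,  # now the 8-bit config
    device_map="auto",
)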

I receive the error trace below.

#########################################################################################
RuntimeError Traceback (most recent call last)
Cell In[25], line 1
----> 1 trainer.train()

File ~/miniconda3/envs/testenv/lib/python3.10/site-packages/transformers/trainer.py:1537, in Trainer.train(self, resume_from_checkpoint, trial, ignore_keys_for_eval, **kwargs)
1535 hf_hub_utils.enable_progress_bars()
1536 else:
---> 1537 return inner_training_loop(
1538 args=args,
1539 resume_from_checkpoint=resume_from_checkpoint,
1540 trial=trial,
1541 ignore_keys_for_eval=ignore_keys_for_eval,
1542 )

File ~/miniconda3/envs/testenv/lib/python3.10/site-packages/transformers/trainer.py:1854, in Trainer._inner_training_loop(self, batch_size, args, resume_from_checkpoint, trial, ignore_keys_for_eval)
1851 self.control = self.callback_handler.on_step_begin(args, self.state, self.control)
1853 with self.accelerator.accumulate(model):
---> 1854 tr_loss_step = self.training_step(model, inputs)
1856 if (
1857 args.logging_nan_inf_filter
1858 and not is_torch_tpu_available()
1859 and (torch.isnan(tr_loss_step) or torch.isinf(tr_loss_step))
1860 ):
1861 # if loss is nan or inf simply add the average of previous logged losses
1862 tr_loss += tr_loss / (1 + self.state.global_step - self._globalstep_last_logged)

File ~/miniconda3/envs/testenv/lib/python3.10/site-packages/transformers/trainer.py:2744, in Trainer.training_step(self, model, inputs)
2742 scaled_loss.backward()
2743 else:
---> 2744 self.accelerator.backward(loss)
2746 return loss.detach() / self.args.gradient_accumulation_steps

File ~/miniconda3/envs/testenv/lib/python3.10/site-packages/accelerate/accelerator.py:1907, in Accelerator.backward(self, loss, **kwargs)
1905 return
1906 elif self.scaler is not None:
---> 1907 self.scaler.scale(loss).backward(**kwargs)
1908 else:
1909 loss.backward(**kwargs)

File ~/miniconda3/envs/testenv/lib/python3.10/site-packages/torch/_tensor.py:492, in Tensor.backward(self, gradient, retain_graph, create_graph, inputs)
482 if has_torch_function_unary(self):
483 return handle_torch_function(
484 Tensor.backward,
485 (self,),
(…)
490 inputs=inputs,
491 )
---> 492 torch.autograd.backward(
493 self, gradient, retain_graph, create_graph, inputs=inputs
494 )

File ~/miniconda3/envs/testenv/lib/python3.10/site-packages/torch/autograd/__init__.py:251, in backward(tensors, grad_tensors, retain_graph, create_graph, grad_variables, inputs)
246 retain_graph = create_graph
248 # The reason we repeat the same comment below is that
249 # some Python versions print out the first line of a multi-line function
250 # calls in the traceback and some print out the last line
---> 251 Variable.execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
252 tensors,
253 grad_tensors,
254 retain_graph,
255 create_graph,
256 inputs,
257 allow_unreachable=True,
258 accumulate_grad=True,
259 )

File ~/miniconda3/envs/testenv/lib/python3.10/site-packages/torch/autograd/function.py:288, in BackwardCFunction.apply(self, *args)
282 raise RuntimeError(
283 "Implementing both ‘backward’ and ‘vjp’ for a custom "
284 "Function is not allowed. You should only implement one "
285 “of them.”
286 )
287 user_fn = vjp_fn if vjp_fn is not Function.vjp else backward_fn
---> 288 return user_fn(self, *args)

File ~/miniconda3/envs/testenv/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:491, in MatMul8bitLt.backward(ctx, grad_output)
485 print("state.CxB", state.CxB)
486 print("State ", state)
488 CB = (
489 undo_layout(state.CxB, state.tile_indices)
490 .to(ctx.dtype_A)
---> 491 .mul(state.SCB.unsqueeze(1).mul(1.0 / 127.0))
492 )
493 grad_A = torch.matmul(grad_output, CB).view(ctx.grad_shape).to(ctx.dtype_A)
494 else:

RuntimeError: The size of tensor a (32) must match the size of tensor b (4) at non-singleton dimension 0

#######################################################################################
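If it helps narrow things down, the final error itself is just a broadcasting mismatch inside MatMul8bitLt.backward: the two operands of that .mul() call, the reconstructed weight matrix and the per-row scales (state.SCB), disagree in their first dimension (32 vs 4). A minimal, purely illustrative PyTorch snippet with hypothetical shapes (unrelated to bitsandbytes internals) raises the same message:

import torch

cb = torch.randn(32, 768)           # stands in for the reconstructed CB matrix (32 rows)
scb = torch.randn(4).unsqueeze(1)   # stands in for state.SCB.unsqueeze(1), but with only 4 rows
cb.mul(scb)                         # RuntimeError: The size of tensor a (32) must match the size of tensor b (4)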
I need your help with this.

Thanks.

I am experiencing the same issue: simply switching from 8-bit to 4-bit resolves it, but I still want to use 8-bit. Any solution?