Attempting to unscale FP16 gradients

I've fine-tuned a Llama 7B model using PEFT with LoRA adapters. After uploading the model to the Hugging Face Hub, I get an error when I download it again and try to continue training. I load the model using AutoPeftModelForCausalLM with the argument is_trainable=True.
How I load the model:

import torch
from transformers import BitsAndBytesConfig
from peft import AutoPeftModelForCausalLM

quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
)

model = AutoPeftModelForCausalLM.from_pretrained(
    "sample", quantization_config=quantization_config, device_map=device,
    low_cpu_mem_usage=True, offload_state_dict=True, is_trainable=True, token=token,
)

And this is my trainer:

from trl import SFTTrainer
from transformers import TrainingArguments

trainer = SFTTrainer(
    model = model,
    train_dataset = dataset,
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    tokenizer = tokenizer,
    data_collator = collator,
    args = TrainingArguments(
        per_device_train_batch_size = 4,
        gradient_accumulation_steps = 1,
        warmup_steps = 10,
        max_steps = 1_000,
        gradient_checkpointing=True,
        fp16 = True,
        report_to = "none",
        logging_steps = 5,
        output_dir = "outputs",
        optim = "adamw_8bit",
  ),
)

Error:

ValueError                                Traceback (most recent call last)
Cell In[13], line 1
----> 1 trainer.train()

File /opt/conda/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:361, in SFTTrainer.train(self, *args, **kwargs)
    358 if self.neftune_noise_alpha is not None and not self._trainer_supports_neftune:
    359     self.model = self._trl_activate_neftune(self.model)
--> 361 output = super().train(*args, **kwargs)
    363 # After training we make sure to retrieve back the original forward pass method
    364 # for the embedding layer by removing the forward post hook.
    365 if self.neftune_noise_alpha is not None and not self._trainer_supports_neftune:

File /opt/conda/lib/python3.10/site-packages/transformers/trainer.py:1885, in Trainer.train(self, resume_from_checkpoint, trial, ignore_keys_for_eval, **kwargs)
   1883         hf_hub_utils.enable_progress_bars()
   1884 else:
-> 1885     return inner_training_loop(
   1886         args=args,
   1887         resume_from_checkpoint=resume_from_checkpoint,
   1888         trial=trial,
   1889         ignore_keys_for_eval=ignore_keys_for_eval,
   1890     )

File /opt/conda/lib/python3.10/site-packages/transformers/trainer.py:2262, in Trainer._inner_training_loop(self, batch_size, args, resume_from_checkpoint, trial, ignore_keys_for_eval)
   2257     _grad_norm = nn.utils.clip_grad_norm_(
   2258         amp.master_params(self.optimizer),
   2259         args.max_grad_norm,
   2260     )
   2261 else:
-> 2262     _grad_norm = self.accelerator.clip_grad_norm_(
   2263         model.parameters(),
   2264         args.max_grad_norm,
   2265     )
   2267 if (
   2268     is_accelerate_available()
   2269     and self.accelerator.distributed_type == DistributedType.DEEPSPEED
   2270 ):
   2271     grad_norm = model.get_global_grad_norm()

File /opt/conda/lib/python3.10/site-packages/accelerate/accelerator.py:2269, in Accelerator.clip_grad_norm_(self, parameters, max_norm, norm_type)
   2267             # Set is_xla_gradients_synced to True to avoid all-reduce twice in the AcceleratedOptimizer step.
   2268             acc_opt.gradient_state.is_xla_gradients_synced = True
-> 2269 self.unscale_gradients()
   2270 return torch.nn.utils.clip_grad_norm_(parameters, max_norm, norm_type=norm_type)

File /opt/conda/lib/python3.10/site-packages/accelerate/accelerator.py:2219, in Accelerator.unscale_gradients(self, optimizer)
   2217 while isinstance(opt, AcceleratedOptimizer):
   2218     opt = opt.optimizer
-> 2219 self.scaler.unscale_(opt)

File /opt/conda/lib/python3.10/site-packages/torch/cuda/amp/grad_scaler.py:307, in GradScaler.unscale_(self, optimizer)
    304 inv_scale = self._scale.double().reciprocal().float()
    305 found_inf = torch.full((), 0.0, dtype=torch.float32, device=self._scale.device)
--> 307 optimizer_state["found_inf_per_device"] = self._unscale_grads_(
    308     optimizer, inv_scale, found_inf, False
    309 )
    310 optimizer_state["stage"] = OptState.UNSCALED

File /opt/conda/lib/python3.10/site-packages/torch/cuda/amp/grad_scaler.py:229, in GradScaler._unscale_grads_(self, optimizer, inv_scale, found_inf, allow_fp16)
    227     continue
    228 if (not allow_fp16) and param.grad.dtype == torch.float16:
--> 229     raise ValueError("Attempting to unscale FP16 gradients.")
    230 if param.grad.is_sparse:
    231     # is_coalesced() == False means the sparse grad has values with duplicate indices.
    232     # coalesce() deduplicates indices and adds all values that have the same index.
    233     # For scaled fp16 values, there's a good chance coalescing will cause overflow,
    234     # so we should check the coalesced _values().
    235     if param.grad.dtype is torch.float16:

Hi,

See this thread: "i got a Trainer error: Attempting to unscale FP16 gradients" · Issue #23165 · huggingface/transformers · GitHub.

Loading the model with torch_dtype=torch.float32 in AutoPeftModelForCausalLM.from_pretrained is what solved the problem for me.
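For reference, here is a minimal sketch of how that fix would look applied to the loading code from the question. It assumes the same "sample" repo id and the same device and token variables you already defined; the key change is torch_dtype=torch.float32, which keeps the trainable LoRA weights in fp32 so the GradScaler no longer encounters fp16 gradients when it tries to unscale them:

import torch
from transformers import BitsAndBytesConfig
from peft import AutoPeftModelForCausalLM

quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
)

# torch_dtype only affects the non-quantized weights (including the LoRA
# adapters); the 4-bit base weights stay quantized as before.
model = AutoPeftModelForCausalLM.from_pretrained(
    "sample",
    quantization_config=quantization_config,
    torch_dtype=torch.float32,   # keep trainable weights in fp32
    device_map=device,
    is_trainable=True,
    token=token,
)

# Alternative workaround: upcast only the trainable parameters after loading.
for param in model.parameters():
    if param.requires_grad:
        param.data = param.data.float()

With the adapter weights in fp32 you can keep fp16=True in TrainingArguments: autocast still runs the forward pass in half precision, while the master copies of the trainable parameters (and their gradients) stay in fp32, which is what the scaler expects.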
