I've fine-tuned a Llama 7B model with PEFT LoRA adapters and uploaded it to the Hugging Face Hub. When I download the model again to continue training, I get an error. I load the model using AutoPeftModelForCausalLM with the argument is_trainable=True.
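For context, this is roughly how the adapter was created and pushed to the Hub (a simplified sketch, not the exact script; the base model id and LoRA settings shown here are placeholders):

from peft import LoraConfig, TaskType, get_peft_model
from transformers import AutoModelForCausalLM

# Placeholder base model and LoRA settings; the real script uses my own values.
base_model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-hf")
lora_config = LoraConfig(task_type=TaskType.CAUSAL_LM, r=16, lora_alpha=32, lora_dropout=0.05)
peft_model = get_peft_model(base_model, lora_config)

# ... fine-tuning happens here ...

# Only the LoRA adapter weights are pushed; "sample" is the repo I load below.
peft_model.push_to_hub("sample", token=token)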
How I load the model:
import torch
from peft import AutoPeftModelForCausalLM
from transformers import BitsAndBytesConfig
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
)
model = AutoPeftModelForCausalLM.from_pretrained(
    "sample", quantization_config=quantization_config, device_map=device,
    low_cpu_mem_usage=True, offload_state_dict=True, is_trainable=True, token=token,
)
And this is my trainer:
from transformers import TrainingArguments
from trl import SFTTrainer

trainer = SFTTrainer(
    model=model,
    train_dataset=dataset,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    tokenizer=tokenizer,
    data_collator=collator,
    args=TrainingArguments(
        per_device_train_batch_size=4,
        gradient_accumulation_steps=1,
        warmup_steps=10,
        max_steps=1_000,
        gradient_checkpointing=True,
        fp16=True,
        report_to="none",
        logging_steps=5,
        output_dir="outputs",
        optim="adamw_8bit",
    ),
)
Error:
ValueError Traceback (most recent call last)
Cell In[13], line 1
----> 1 trainer.train()
File /opt/conda/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:361, in SFTTrainer.train(self, *args, **kwargs)
358 if self.neftune_noise_alpha is not None and not self._trainer_supports_neftune:
359 self.model = self._trl_activate_neftune(self.model)
--> 361 output = super().train(*args, **kwargs)
363 # After training we make sure to retrieve back the original forward pass method
364 # for the embedding layer by removing the forward post hook.
365 if self.neftune_noise_alpha is not None and not self._trainer_supports_neftune:
File /opt/conda/lib/python3.10/site-packages/transformers/trainer.py:1885, in Trainer.train(self, resume_from_checkpoint, trial, ignore_keys_for_eval, **kwargs)
1883 hf_hub_utils.enable_progress_bars()
1884 else:
-> 1885 return inner_training_loop(
1886 args=args,
1887 resume_from_checkpoint=resume_from_checkpoint,
1888 trial=trial,
1889 ignore_keys_for_eval=ignore_keys_for_eval,
1890 )
File /opt/conda/lib/python3.10/site-packages/transformers/trainer.py:2262, in Trainer._inner_training_loop(self, batch_size, args, resume_from_checkpoint, trial, ignore_keys_for_eval)
2257 _grad_norm = nn.utils.clip_grad_norm_(
2258 amp.master_params(self.optimizer),
2259 args.max_grad_norm,
2260 )
2261 else:
-> 2262 _grad_norm = self.accelerator.clip_grad_norm_(
2263 model.parameters(),
2264 args.max_grad_norm,
2265 )
2267 if (
2268 is_accelerate_available()
2269 and self.accelerator.distributed_type == DistributedType.DEEPSPEED
2270 ):
2271 grad_norm = model.get_global_grad_norm()
File /opt/conda/lib/python3.10/site-packages/accelerate/accelerator.py:2269, in Accelerator.clip_grad_norm_(self, parameters, max_norm, norm_type)
2267 # Set is_xla_gradients_synced to True to avoid all-reduce twice in the AcceleratedOptimizer step.
2268 acc_opt.gradient_state.is_xla_gradients_synced = True
-> 2269 self.unscale_gradients()
2270 return torch.nn.utils.clip_grad_norm_(parameters, max_norm, norm_type=norm_type)
File /opt/conda/lib/python3.10/site-packages/accelerate/accelerator.py:2219, in Accelerator.unscale_gradients(self, optimizer)
2217 while isinstance(opt, AcceleratedOptimizer):
2218 opt = opt.optimizer
-> 2219 self.scaler.unscale_(opt)
File /opt/conda/lib/python3.10/site-packages/torch/cuda/amp/grad_scaler.py:307, in GradScaler.unscale_(self, optimizer)
304 inv_scale = self._scale.double().reciprocal().float()
305 found_inf = torch.full((), 0.0, dtype=torch.float32, device=self._scale.device)
--> 307 optimizer_state["found_inf_per_device"] = self._unscale_grads_(
308 optimizer, inv_scale, found_inf, False
309 )
310 optimizer_state["stage"] = OptState.UNSCALED
File /opt/conda/lib/python3.10/site-packages/torch/cuda/amp/grad_scaler.py:229, in GradScaler._unscale_grads_(self, optimizer, inv_scale, found_inf, allow_fp16)
227 continue
228 if (not allow_fp16) and param.grad.dtype == torch.float16:
--> 229 raise ValueError("Attempting to unscale FP16 gradients.")
230 if param.grad.is_sparse:
231 # is_coalesced() == False means the sparse grad has values with duplicate indices.
232 # coalesce() deduplicates indices and adds all values that have the same index.
233 # For scaled fp16 values, there's a good chance coalescing will cause overflow,
234 # so we should check the coalesced _values().
235 if param.grad.dtype is torch.float16:
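From the last frame it looks like the GradScaler refuses to unscale gradients that live on float16 parameters. To see which trainable parameters actually end up in float16 after loading, I would check them like this (just a diagnostic sketch, not part of the training script above):

# Print the dtype of every parameter that receives gradients (i.e. the LoRA weights).
# If these show up as torch.float16, that would match the "Attempting to unscale
# FP16 gradients" error raised by the scaler.
for name, param in model.named_parameters():
    if param.requires_grad:
        print(name, param.dtype)

Is this error caused by the LoRA weights being loaded in float16 while fp16=True is set in TrainingArguments, or am I loading the adapter from the Hub incorrectly?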