### System Info
- `transformers` version: 4.45.1
- Platform: Linux-5.10.225-…213.878.amzn2.x86_64-x86_64-with-glibc2.35
- Python version: 3.10.14
- Huggingface_hub version: 0.24.5
- Safetensors version: 0.4.4
- Accelerate version: 0.34.2
- Accelerate config: not found
- PyTorch version (GPU?): 2.1.0 (True)
- Tensorflow version (GPU?): 2.15.0 (True)
- Flax version (CPU?/GPU?/TPU?): not installed (NA)
- Jax version: not installed
- JaxLib version: not installed
- Using distributed or parallel set-up in script?: SFTTrainer, FSDP
### Who can help?
@ArthurZucker @SunMarc @muellerzr
### Information
- [ ] The official example scripts
- [X] My own modified scripts
### Tasks
- [ ] An officially supported task in the `examples` folder (such as GLUE/SQuAD, ...)
- [X] My own task or dataset (give details below)
### Reproduction
Script executed on Amazon SageMaker, instance type g5.12xlarge (4 GPUs):
```python
from accelerate import Accelerator
from huggingface_hub import login
from peft import AutoPeftModelForCausalLM, LoraConfig, get_peft_model, prepare_model_for_kbit_training
from sagemaker.remote_function import remote
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, set_seed
from trl import SFTConfig, SFTTrainer
import transformers
def train_fn(
    model_name,
    train_ds,
    test_ds=None,
    lora_r=8,
    lora_alpha=16,
    lora_dropout=0.1,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    gradient_accumulation_steps=1,
    learning_rate=2e-4,
    num_train_epochs=1,
    fsdp="",
    fsdp_config=None,
    max_seq_length=2048,
    gradient_checkpointing=False,
    merge_weights=False,
    seed=42,
    token=None
):
    set_seed(seed)

    accelerator = Accelerator()

    if token is not None:
        login(token=token)

    tokenizer = AutoTokenizer.from_pretrained(model_name)

    # Set tokenizer pad token
    tokenizer.pad_token = tokenizer.eos_token

    def tokenize(text):
        result = tokenizer(
            text['text'],
            max_length=max_seq_length,
            padding="max_length",
            truncation=True
        )
        result["labels"] = result["input_ids"].copy()
        return result

    with accelerator.main_process_first():
        lm_train_dataset = train_ds.map(tokenize, remove_columns=["text"])
        print(f"Total number of train samples: {len(lm_train_dataset)}")

        if test_ds is not None:
            lm_test_dataset = test_ds.map(tokenize, remove_columns=["text"])
            print(f"Total number of test samples: {len(lm_test_dataset)}")
        else:
            lm_test_dataset = None

    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.bfloat16,
        bnb_4bit_quant_storage=torch.bfloat16
    )

    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        trust_remote_code=True,
        quantization_config=bnb_config,
        attn_implementation="flash_attention_2",
        torch_dtype=torch.bfloat16,
        use_cache=False if gradient_checkpointing else True,
        cache_dir="/tmp/.cache"
    )

    # model = prepare_model_for_kbit_training(model, use_gradient_checkpointing=gradient_checkpointing)

    if gradient_checkpointing:
        model.gradient_checkpointing_enable()

    # Get LoRA target modules
    modules = find_all_linear_names(model)
    print(f"Found {len(modules)} modules to quantize: {modules}")

    config = LoraConfig(
        r=lora_r,
        lora_alpha=lora_alpha,
        target_modules=modules,
        lora_dropout=lora_dropout,
        bias="none",
        task_type="CAUSAL_LM"
    )

    model = get_peft_model(model, config)
    print_trainable_parameters(model)

    trainer = SFTTrainer(
        model=model,
        train_dataset=lm_train_dataset,
        eval_dataset=lm_test_dataset if test_ds is not None else None,
        args=SFTConfig(
            per_device_train_batch_size=per_device_train_batch_size,
            per_device_eval_batch_size=per_device_eval_batch_size,
            gradient_accumulation_steps=gradient_accumulation_steps,
            gradient_checkpointing=gradient_checkpointing,
            gradient_checkpointing_kwargs={
                "use_reentrant": True if gradient_checkpointing else False
            },
            logging_strategy="steps",
            logging_steps=1,
            log_on_each_node=False,
            num_train_epochs=num_train_epochs,
            learning_rate=learning_rate,
            auto_find_batch_size=False,
            batch_eval_metrics=False,
            bf16=True,
            bf16_full_eval=False,
            fp16=False,
            fp16_full_eval=False,
            ddp_find_unused_parameters=False,
            fsdp=fsdp,
            fsdp_config=fsdp_config,
            save_strategy="no",
            output_dir="outputs"
        ),
        peft_config=config,
        packing=True,
        tokenizer=tokenizer,
        dataset_text_field="text",
        dataset_kwargs={
            "add_special_tokens": False,
            "append_concat_token": False,
        }
    )

    trainer.train()

    if trainer.is_fsdp_enabled:
        trainer.accelerator.state.fsdp_plugin.set_state_dict_type("FULL_STATE_DICT")

    if merge_weights:
        if accelerator.is_main_process:
            output_dir = "/tmp/model"

            # Merge adapter weights with the base model and save
            # Save int4 model
            trainer.model.save_pretrained(output_dir, safe_serialization=False)

            # Clear memory
            del model
            del trainer
            torch.cuda.empty_cache()

            # Load PEFT model in fp16
            model = AutoPeftModelForCausalLM.from_pretrained(
                output_dir,
                low_cpu_mem_usage=True,
                torch_dtype=torch.float16,
                cache_dir="/tmp/.cache"
            )

            # Merge LoRA and base model and save
            model = model.merge_and_unload()
            model.save_pretrained(
                "/opt/ml/model", safe_serialization=True, max_shard_size="2GB"
            )
    else:
        trainer.model.save_pretrained("/opt/ml/model", safe_serialization=True)

    if accelerator.is_main_process:
        tokenizer.save_pretrained("/opt/ml/model")


model_id = "meta-llama/Meta-Llama-3.1-8B-Instruct"

train_fn(
    model_id,
    train_ds=train_dataset,
    test_ds=test_dataset,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=4,
    gradient_checkpointing=True,
    num_train_epochs=3,
    fsdp="full_shard auto_wrap offload",
    fsdp_config={
        "backward_prefetch": "backward_pre",
        "forward_prefetch": False,
        "use_orig_params": False,
        "cpu_ram_efficient_loading": True
    },
    merge_weights=True,
    token="<HF_TOKEN>"
)
```
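The script also calls two helpers that are not shown above, `find_all_linear_names` and `print_trainable_parameters`. They are the usual QLoRA utilities; a minimal sketch of the versions I use (bodies adapted from common examples, so treat the exact implementation as an assumption) is:
```python
import bitsandbytes as bnb


def find_all_linear_names(model):
    # Collect the names of all 4-bit linear layers to use as LoRA target modules
    lora_module_names = set()
    for name, module in model.named_modules():
        if isinstance(module, bnb.nn.Linear4bit):
            names = name.split(".")
            lora_module_names.add(names[0] if len(names) == 1 else names[-1])
    # lm_head is excluded (kept in 16-bit)
    if "lm_head" in lora_module_names:
        lora_module_names.remove("lm_head")
    return list(lora_module_names)


def print_trainable_parameters(model):
    # Report trainable vs. total parameter counts
    trainable_params = 0
    all_params = 0
    for _, param in model.named_parameters():
        all_params += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(
        f"trainable params: {trainable_params} || all params: {all_params} "
        f"|| trainable%: {100 * trainable_params / all_params:.2f}"
    )
```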
train_dataset:
```
Dataset({
features: ['text'],
num_rows: 140
})
```
train_dataset[0]["text"] (mock):
```
<|begin_of_text|><|start_header_id|>user<|end_header_id|>What dimensions of the Llama 3.2 model are available?<|eot_id|><|start_header_id|>assistant<|end_header_id|>Llama 3.2 offers several model dimensions to cater to different use cases and computational requirements: Text-Only Models 1\ 1B parameter model: This is the smallest Llama 3.2 model, designed for lightweight text processing tasks and on-device applications 2\ 3B parameter model: A slightly larger text-only model that still maintains efficiency for edge devices and mobile applications. Multimodal Vision Models: 1\ 11B parameter model: This is the smaller of the two vision-capable models, suitable for efficient deployment and development on consumer-grade GPUs 2\ 90B parameter model: The largest Llama 3.2 model, designed for large-scale applications and advanced image reasoning tasks<|end_of_text|><|eot_id|>
```
Error returned:
```
Cell In[26], line 151, in train_fn()
110 print_trainable_parameters(model)
112 trainer = SFTTrainer(
113 model=model,
114 train_dataset=lm_train_dataset,
(...)
148 }
149 )
--> 151 trainer.train()
153 if trainer.is_fsdp_enabled:
154 trainer.accelerator.state.fsdp_plugin.set_state_dict_type("FULL_STATE_DICT")
File /opt/conda/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:434, in train()
431 if self.neftune_noise_alpha is not None and not self._trainer_supports_neftune:
432 self.model = self._trl_activate_neftune(self.model)
--> 434 output = super().train(*args, **kwargs)
436 # After training we make sure to retrieve back the original forward pass method
437 # for the embedding layer by removing the forward post hook.
438 if self.neftune_noise_alpha is not None and not self._trainer_supports_neftune:
File /opt/conda/lib/python3.10/site-packages/transformers/trainer.py:2052, in train()
2050 hf_hub_utils.enable_progress_bars()
2051 else:
-> 2052 return inner_training_loop(
2053 args=args,
2054 resume_from_checkpoint=resume_from_checkpoint,
2055 trial=trial,
2056 ignore_keys_for_eval=ignore_keys_for_eval,
2057 )
File /opt/conda/lib/python3.10/site-packages/transformers/trainer.py:2388, in _inner_training_loop()
2385 self.control = self.callback_handler.on_step_begin(args, self.state, self.control)
2387 with self.accelerator.accumulate(model):
-> 2388 tr_loss_step = self.training_step(model, inputs)
2390 if (
2391 args.logging_nan_inf_filter
2392 and not is_torch_xla_available()
2393 and (torch.isnan(tr_loss_step) or torch.isinf(tr_loss_step))
2394 ):
2395 # if loss is nan or inf simply add the average of previous logged losses
2396 tr_loss += tr_loss / (1 + self.state.global_step - self._globalstep_last_logged)
File /opt/conda/lib/python3.10/site-packages/transformers/trainer.py:3518, in training_step()
3516 scaled_loss.backward()
3517 else:
-> 3518 self.accelerator.backward(loss, **kwargs)
3520 return loss.detach() / self.args.gradient_accumulation_steps
File /opt/conda/lib/python3.10/site-packages/accelerate/accelerator.py:2196, in backward()
2194 self.lomo_backward(loss, learning_rate)
2195 else:
-> 2196 loss.backward(**kwargs)
File /opt/conda/lib/python3.10/site-packages/torch/_tensor.py:492, in backward()
File /opt/conda/lib/python3.10/site-packages/torch/autograd/__init__.py:251, in backward()
RuntimeError: element 0 of tensors does not require grad and does not have a grad_fn
```
requirements.txt:
```
transformers==4.45.1
peft==0.13.0
accelerate==0.34.2
bitsandbytes==0.44.0
evaluate==0.4.1
safetensors>=0.4.3
sagemaker==2.232.1
trl==0.11.1
tokenizers>=0.19.1
py7zr
```
### Expected behavior
The script was adapted from [run_fsdp_qlora.py](https://github.com/philschmid/deep-learning-pytorch-huggingface/blob/main/training/scripts/run_fsdp_qlora.py), which seems to work; I switched to `SFTConfig` as recommended in the documentation.
The expected behavior is for the training script to run to completion successfully.
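Possibly related: the `prepare_model_for_kbit_training` call is commented out in the script above. A minimal sketch of how I understand it would be wired in for QLoRA + gradient checkpointing (assuming that is still the recommended path; I have not confirmed this resolves the error) is:
```python
# Sketch (assumption): prepare the 4-bit model before attaching the LoRA adapters,
# instead of calling model.gradient_checkpointing_enable() manually.
model = prepare_model_for_kbit_training(
    model,
    use_gradient_checkpointing=gradient_checkpointing,
    gradient_checkpointing_kwargs={"use_reentrant": False},
)
model = get_peft_model(model, config)
```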