I am getting an error when launching the TRL trainer with fsdp. Attaching complete configuration below:
from transformers import TrainingArguments
output_dir = model_output_dir
per_device_train_batch_size = 6
gradient_accumulation_steps = 8
optim = "paged_adamw_32bit"
save_steps = 50
save_total_limit=3
logging_steps = 10
learning_rate = 2e-4
max_grad_norm = 0.3
warmup_ratio = 0.03
lr_scheduler_type = "cosine_with_restarts"
max_steps = 8000
group_by_length = True
do_fsdp_training = "full_shard"
training_arguments = TrainingArguments(
output_dir=output_dir,
per_device_train_batch_size=per_device_train_batch_size,
gradient_accumulation_steps=gradient_accumulation_steps,
optim=optim,
save_steps=save_steps,
save_total_limit=save_total_limit,
logging_steps=logging_steps,
learning_rate=learning_rate,
fp16=True,
max_grad_norm=max_grad_norm,
max_steps=max_steps,
warmup_ratio=warmup_ratio,
# group_by_length=group_by_length,
lr_scheduler_type=lr_scheduler_type,
report_to = "tensorboard",
fsdp= do_fsdp_training
)
math_qa_data_collator = DataCollatorWithPadding(
tokenizer,
return_tensors = "pt"
)
from trl import SFTTrainer
max_seq_length = 1024
trainer = SFTTrainer(
model=model,
tokenizer=tokenizer,
args=training_arguments,
peft_config=peft_config,
train_dataset=math_qa_dataset["train"],
formatting_func = dataset_formatting_func,
max_seq_length=max_seq_length,
data_collator=math_qa_data_collator,
packing=True,
)
Stacktrace Snippet
File /usr/local/lib/python3.8/dist-packages/transformers/trainer.py:455, in Trainer.__init__(self, model, args, data_collator, train_dataset, eval_dataset, tokenizer, model_init, compute_metrics, callbacks, optimizers, preprocess_logits_for_metrics)
451 raise ValueError(
452 "Using --fsdp xxx together with --deepspeed is not possible, deactivate one of those flags."
453 )
454 if not args.fsdp_config["xla"] and args.parallel_mode != ParallelMode.DISTRIBUTED:
--> 455 raise ValueError("Using fsdp only works in distributed training.")
457 # dep_version_check("torch>=1.12.0")
458 # Would have to update setup.py with torch>=1.12.0
459 # which isn't ideally given that it will force people not using FSDP to also use torch>=1.12.0
460 # below is the current alternative.
461 if version.parse(version.parse(torch.__version__).base_version) < version.parse("1.12.0"):
ValueError: Using fsdp only works in distributed training.