ValueError: Using fsdp only works in distributed training

I am getting an error when launching the TRL SFTTrainer with FSDP enabled. The complete configuration is attached below:

from transformers import TrainingArguments, DataCollatorWithPadding

output_dir = model_output_dir
per_device_train_batch_size = 6
gradient_accumulation_steps = 8
optim = "paged_adamw_32bit"
save_steps = 50
save_total_limit=3
logging_steps = 10
learning_rate = 2e-4
max_grad_norm = 0.3
warmup_ratio = 0.03
lr_scheduler_type = "cosine_with_restarts"
max_steps = 8000
group_by_length = True
do_fsdp_training = "full_shard"

training_arguments = TrainingArguments(
    output_dir=output_dir,
    per_device_train_batch_size=per_device_train_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    optim=optim,
    save_steps=save_steps,
    save_total_limit=save_total_limit,
    logging_steps=logging_steps,
    learning_rate=learning_rate,
    fp16=True,
    max_grad_norm=max_grad_norm,
    max_steps=max_steps,
    warmup_ratio=warmup_ratio,
    # group_by_length=group_by_length,
    lr_scheduler_type=lr_scheduler_type,
    report_to="tensorboard",
    fsdp=do_fsdp_training,
)

math_qa_data_collator = DataCollatorWithPadding(
    tokenizer,
    return_tensors="pt",
)

from trl import SFTTrainer

max_seq_length = 1024

trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    args=training_arguments,
    peft_config=peft_config,
    train_dataset=math_qa_dataset["train"],
    formatting_func=dataset_formatting_func,
    max_seq_length=max_seq_length,
    data_collator=math_qa_data_collator,
    packing=True,
)
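
For reference, a quick diagnostic I could drop in right before the SFTTrainer call above (a sketch, assuming the training_arguments object defined earlier; ParallelMode is the enum referenced in the stack trace below):

from transformers.training_args import ParallelMode

# Sketch: the check in Trainer.__init__ (see the stack trace below) requires
# this to be ParallelMode.DISTRIBUTED whenever fsdp is set.
print("parallel_mode:", training_arguments.parallel_mode)
print("is distributed:", training_arguments.parallel_mode == ParallelMode.DISTRIBUTED)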

Stacktrace Snippet

File /usr/local/lib/python3.8/dist-packages/transformers/trainer.py:455, in Trainer.__init__(self, model, args, data_collator, train_dataset, eval_dataset, tokenizer, model_init, compute_metrics, callbacks, optimizers, preprocess_logits_for_metrics)
    451     raise ValueError(
    452         "Using --fsdp xxx together with --deepspeed is not possible, deactivate one of those flags."
    453     )
    454 if not args.fsdp_config["xla"] and args.parallel_mode != ParallelMode.DISTRIBUTED:
--> 455     raise ValueError("Using fsdp only works in distributed training.")
    457 # dep_version_check("torch>=1.12.0")
    458 # Would have to update setup.py with torch>=1.12.0
    459 # which isn't ideally given that it will force people not using FSDP to also use torch>=1.12.0
    460 # below is the current alternative.
    461 if version.parse(version.parse(torch.__version__).base_version) < version.parse("1.12.0"):

ValueError: Using fsdp only works in distributed training.
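
In case it is relevant: as far as I understand, parallel_mode only becomes ParallelMode.DISTRIBUTED when the script is started through a distributed launcher (e.g. torchrun or accelerate launch), which exports the usual rendezvous environment variables. A small sketch of what I could print at the top of the script to check whether they are present (the variable names are the standard torch.distributed ones, not anything from my code):

import os

# Sketch: these are set by a distributed launcher such as torchrun or
# accelerate launch; in a plain `python script.py` run they are typically absent.
for var in ("RANK", "LOCAL_RANK", "WORLD_SIZE", "MASTER_ADDR", "MASTER_PORT"):
    print(var, "=", os.environ.get(var))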