CUDA out of memory when using the trainer model_init

I am running out of memory when using the trainer model_init argument as follows:

def model_init():
    model = AutoModelForSequenceClassification.from_pretrained(
        BASE_MODEL_ID,
        num_labels=3,
        output_attentions=False,     # do not return attention weights
        output_hidden_states=False,  # do not return hidden states
        return_dict=True,
        torch_dtype=torch.float16,
    )
    model.config.pad_token_id = 2
    model.config.use_cache = False
    model.gradient_checkpointing_enable()
    # Freeze the base model, then unfreeze only the classification head (score)
    for param in model.parameters():
        param.requires_grad = False
    for param in model.score.parameters():
        param.requires_grad = True
    for name, param in model.named_parameters():
        print(f"{name} - {param.requires_grad}")
    return model
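
A quick check like the following (not part of the training script) suggests model_init() itself keeps everything on the CPU, since from_pretrained is called without a device_map or any .to() call:

import torch

# Diagnostic only: the freshly built model should live on the CPU,
# and no GPU memory should be held by tensors at this point.
m = model_init()
print(next(m.parameters()).device)                   # expected: cpu
print(torch.cuda.memory_allocated() / 2**30, "GiB")  # GPU memory currently held by tensors
del m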

run_name = BASE_MODEL_ID.split("/")[-1]  + PROJECT_NAME
output_dir = "./" + run_name
trainer_args = TrainingArguments(
    output_dir=output_dir,
    # warmup_steps=1,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=8,
    gradient_checkpointing=True,
    max_steps=200,
    warmup_ratio=0.1,
    learning_rate=2e-5,
    bf16=True,
    optim="paged_adamw_32bit",
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    logging_steps=4,
    logging_dir="./logs",
    save_strategy="steps",
    save_steps=8,
    evaluation_strategy="steps",
    eval_steps=8,
    seed=SEED,
    do_eval=True,
)


trainer = Trainer(
    model_init=model_init,
    train_dataset=train_ds,
    eval_dataset=test_ds,
    args=trainer_args,
    compute_metrics=compute_metrics,
    data_collator=DataCollatorWithPadding(tokenizer=tokenizer), 
    callbacks=[EarlyStoppingCallback(early_stopping_patience=4)]
)
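
This is roughly how I am watching GPU memory around the Trainer call and trainer.train() (torch.cuda.memory_allocated() reports the bytes currently held by tensors on the current device):

# Rough diagnostic of where the memory goes.
print(f"after Trainer(): {torch.cuda.memory_allocated() / 2**30:.2f} GiB")
trainer.train()  # this is where the CUDA out-of-memory error is raised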

It seems like the Trainer call already places the model on the GPU; when trainer.train() is then called, the model is placed on the GPU again and the GPU runs out of memory. Is there a way to prevent this while still being able to initialize the model with the same random weights?
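
For reference, the workaround I am considering is to call model_init() only once myself and pass the model directly, seeding the RNG first so the randomly initialized score head comes out the same on every run (a sketch, assuming transformers.set_seed is enough for that). I would prefer a solution that keeps model_init, though:

from transformers import set_seed

set_seed(SEED)        # reproduce the same random init for the score head
model = model_init()  # build the model exactly once

trainer = Trainer(
    model=model,      # pass the instance instead of the model_init callable
    train_dataset=train_ds,
    eval_dataset=test_ds,
    args=trainer_args,
    compute_metrics=compute_metrics,
    data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
    callbacks=[EarlyStoppingCallback(early_stopping_patience=4)],
)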