How to save a Hugging Face fine-tuned model when using PyTorch distributed training

I am fine-tuning a masked language model based on XLM-RoBERTa-large on Google Cloud (Vertex AI), on an a2-highgpu-4g machine with 4 A100 GPUs.
When I copy the saved model from the container to a GCS bucket using gsutil via subprocess, it fails with the error shown at the bottom.

Versions

  torch==1.11.0+cu113
  torchvision==0.12.0+cu113  
  torchaudio==0.11.0+cu113 
  transformers==4.17.0

I am using a pre-trained Hugging Face model.

The training code is a train.py file that I copy into a Docker image, and I launch it as a Vertex AI (GCP) custom job using a ContainerSpec with the following machine spec and launch command (a rough sketch of the job definition follows the launch command):

machineSpec = MachineSpec(machine_type="a2-highgpu-4g",accelerator_count=4,accelerator_type="NVIDIA_TESLA_A100")

python -m torch.distributed.launch --nproc_per_node 4 train.py --bf16 
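
Roughly, the job is defined like this (a minimal sketch using the google.cloud.aiplatform SDK; the project, region, image URI and display name are placeholders, not my real values):

from google.cloud import aiplatform

aiplatform.init(project="my-project", location="us-central1")

# One worker pool with a single a2-highgpu-4g replica; the container runs the
# torch.distributed.launch command shown above.
worker_pool_specs = [
    {
        "machine_spec": {
            "machine_type": "a2-highgpu-4g",
            "accelerator_type": "NVIDIA_TESLA_A100",
            "accelerator_count": 4,
        },
        "replica_count": 1,
        "container_spec": {
            "image_uri": "gcr.io/my-project/mlm-trainer:latest",  # placeholder
            "command": ["python", "-m", "torch.distributed.launch",
                        "--nproc_per_node", "4", "train.py", "--bf16"],
        },
    }
]

job = aiplatform.CustomJob(display_name="xlmr-mlm-finetune",
                           worker_pool_specs=worker_pool_specs)
job.run()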

For the tokenizer and model I am using:

tokenizer = tr.XLMRobertaTokenizer.from_pretrained("xlm-roberta-large",local_files_only=True)
model = tr.XLMRobertaForMaskedLM.from_pretrained("xlm-roberta-large", return_dict=True,local_files_only=True)
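
Since local_files_only=True is set, the weights must already be present in the image. As far as I understand, they only need to be in the default Hugging Face cache, which can be populated at image build time roughly like this (a sketch of the idea, not my exact Dockerfile step):

from transformers import XLMRobertaTokenizer, XLMRobertaForMaskedLM

# Run once while building the Docker image so that later calls with
# local_files_only=True find everything in the default HF cache.
XLMRobertaTokenizer.from_pretrained("xlm-roberta-large")
XLMRobertaForMaskedLM.from_pretrained("xlm-roberta-large")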

Training Code

training_args = tr.TrainingArguments(
     output_dir='****'
    ,logging_dir='****'        # directory for storing logs
    ,save_strategy="epoch"
    ,run_name="****"
    ,learning_rate=2e-5
    ,logging_steps=1000
    ,overwrite_output_dir=True
    ,num_train_epochs=10
    ,per_device_train_batch_size=4
    ,prediction_loss_only=True
    ,gradient_accumulation_steps=2
#     ,gradient_checkpointing=True
    ,bf16=True                 # 57100
    ,optim="adafactor"
)


trainer = tr.Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_data
)

Train.py

import torch
import numpy as np
import pandas as pd
from transformers import BertTokenizer, BertForSequenceClassification
import transformers as tr
from sentence_transformers import SentenceTransformer
from transformers import XLMRobertaTokenizer, XLMRobertaForMaskedLM
from transformers import AdamW
from transformers import AutoTokenizer
from transformers import BertTokenizerFast as BertTokenizer, BertModel, AdamW, get_linear_schedule_with_warmup,BertForMaskedLM
from transformers import DataCollatorForLanguageModeling
from scipy.special import softmax
import scipy
import random
import pickle
import os
import time

import subprocess as sp


# torch.cuda.empty_cache()




start=time.time()


device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print("Using", device)
torch.backends.cudnn.deterministic = True  

tr.trainer_utils.set_seed(0)

print("here")

tokenizer = tr.XLMRobertaTokenizer.from_pretrained("xlm-roberta-large",local_files_only=True)
model = tr.XLMRobertaForMaskedLM.from_pretrained("xlm-roberta-large", return_dict=True,local_files_only=True)
model.gradient_checkpointing_enable() #included as new line
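# (gradient checkpointing recomputes activations in the backward pass instead of
#  storing them, trading extra compute for lower GPU memory use)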
print("included gradient checkpoint")

model.to(device)
print("Model loaded successfully")

df=pd.read_csv("data.csv") 
train_df=df.text.tolist()
print(len(train_df))

train_df=list(set(train_df))
train_df = [x for x in train_df if str(x) != 'nan']

print("Length of training data is \n ",len(train_df))
print("DATA LOADED successfully")


train_encodings = tokenizer(train_df, truncation=True, padding=True, max_length=512, return_tensors="pt")
print("encoding done")

data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=True, mlm_probability=0.15)
print("data collector done")

class SEDataset(torch.utils.data.Dataset):
    def __init__(self, encodings):
        self.encodings = encodings
        
    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        return item

    def __len__(self):
        return len(self.encodings["attention_mask"])

train_data = SEDataset(train_encodings)

print("train data created")

training_args = tr.TrainingArguments(
     output_dir='results_mlm_exp1'
    ,logging_dir='logs_mlm_exp1'        # directory for storing logs
    ,save_strategy="epoch"
    ,learning_rate=2e-5
    ,logging_steps=500
    ,overwrite_output_dir=True
    ,num_train_epochs=20
    ,per_device_train_batch_size=4
    ,prediction_loss_only=True
    ,gradient_accumulation_steps=2
    ,bf16=True                          # Ampere GPU
)


trainer = tr.Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_data
)

trainer.train()
print("model training finished")
trainer.save_model("model_mlm_exp1")

print("training finished")

end=time.time()

print("total time taken in hours is", (end-start)/3600)

Error

trainer.save_model("model_mlm_exp1")
sp.call('gsutil cp -r /pythonPackage/trainer/model_mlm_exp1 gs://******/model_mlm_exp1', shell=True, stdout=sp.PIPE)

ERROR	ResumableUploadAbortException: 409 The object has already been created in an earlier attempt and was overwritten, possibly due to a race condition.
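
If I understand the launcher correctly, python -m torch.distributed.launch --nproc_per_node 4 starts four train.py processes (one per GPU), so everything outside the Trainer, including the gsutil copy above, runs four times in parallel, which would explain the 409 race. Would guarding the copy on the local rank along these lines be the right fix, or does the Trainer/save_model already take care of this? (A sketch only; it assumes the launcher exports the LOCAL_RANK environment variable.)

import os
import subprocess as sp

# Each process started by torch.distributed.launch gets its own local rank;
# only rank 0 (or a plain single-process run, rank -1) would do the upload.
local_rank = int(os.environ.get("LOCAL_RANK", -1))
if local_rank in (-1, 0):
    sp.call('gsutil cp -r /pythonPackage/trainer/model_mlm_exp1 gs://******/model_mlm_exp1',
            shell=True, stdout=sp.PIPE)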