Finetuning GPT-2 using Multiple GPUs and Trainer

@aclifton314, I hope you are well. Sorry to bother you again. I simply ran the code with the command below, then checked nvidia-smi, and all the GPUs I specified (1, 2 and 3) are active. Is that enough, or do I need to change anything inside the code? This is my code; can I trust the final model? Also, how did you call your function?

CUDA_VISIBLE_DEVICES="1,2,3" python casesummary_resolution_GPT_Neo_GPU_V5-125M-Trainer_v22.py
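From what I understand, when more than one GPU is visible and the script is launched normally (no torchrun or other distributed launcher), Trainer wraps the model in torch.nn.DataParallel by itself, so in principle nothing extra is needed in the code. To double-check, this is a small sanity check I would add near the top of the script (just a sketch, assuming the CUDA_VISIBLE_DEVICES setting above):

import torch

# With CUDA_VISIBLE_DEVICES="1,2,3" the process should see 3 devices,
# re-indexed internally as cuda:0, cuda:1 and cuda:2.
print("Visible GPUs:", torch.cuda.device_count())  # expected: 3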

import torch
from torch.utils.data import Dataset, random_split
from transformers import AutoTokenizer, TrainingArguments, Trainer, AutoModelForCausalLM, IntervalStrategy
from sklearn.model_selection import train_test_split

torch.manual_seed(42)

pretrained_model = '/home//GPT-NEO-125M/'

tokenizer = AutoTokenizer.from_pretrained(pretrained_model, bos_token='<|startoftext|>',
                                          eos_token='<|endoftext|>', pad_token='<|pad|>')
model = AutoModelForCausalLM.from_pretrained(pretrained_model).cuda()

print(torch.cuda.current_device())


model.resize_token_embeddings(len(tokenizer))

descriptions = DataWhole_1  # DataWhole_1: my list of training texts (loaded earlier, not shown here)

# max_length = max([len(tokenizer.encode(description)) for description in descriptions])

max_length=350
print("Max length: {}".format(max_length))


class NetflixDataset(Dataset):
    def __init__(self, txt_list, tokenizer, max_length):
        self.input_ids = []
        self.attn_masks = []
        for txt in txt_list:
            # Wrap each sample in BOS/EOS tokens and pad/truncate to max_length.
            encodings_dict = tokenizer('<|startoftext|>' + txt + '<|endoftext|>', truncation=True,
                                       max_length=max_length, padding="max_length")
            self.input_ids.append(torch.tensor(encodings_dict['input_ids']))
            self.attn_masks.append(torch.tensor(encodings_dict['attention_mask']))

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        # Labels are built later in the data collator by copying input_ids,
        # so each item is just (input_ids, attention_mask).
        return self.input_ids[idx], self.attn_masks[idx]


dataset = NetflixDataset(descriptions, tokenizer, max_length=max_length)
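# Optional sanity check I would add here, assuming the dataset built as intended:
# every item should be a pair of 1-D tensors of length max_length.
sample_ids, sample_mask = dataset[0]
print(sample_ids.shape, sample_mask.shape)  # expected: torch.Size([350]) torch.Size([350])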


# Note: with shuffle=False the random_state has no effect, so the last 10% of the data becomes the validation set.
train_dataset, val_dataset = train_test_split(dataset, test_size=0.1, random_state=42, shuffle=False)

training_args = TrainingArguments(
    output_dir=Results_Path,
    logging_dir=Results_Path,
    learning_rate=5e-5,
    num_train_epochs=16,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    evaluation_strategy="steps",
    eval_steps=10000,
    logging_strategy="steps",
    logging_steps=10000,
    save_strategy="steps",
    save_steps=10000,
    save_total_limit=2,
    load_best_model_at_end=True,
    warmup_steps=100,
    weight_decay=0.01,
    seed=42,
    report_to="tensorboard",
)


trainer = Trainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    # For causal LM finetuning the labels are just a copy of input_ids;
    # the model shifts them internally when computing the loss.
    data_collator=lambda data: {'input_ids': torch.stack([f[0] for f in data]),
                                'attention_mask': torch.stack([f[1] for f in data]),
                                'labels': torch.stack([f[0] for f in data])},
)
trainer.train()