Finetuning GPT2 using Multiple GPUs and Trainer

@valhalla, happy to. Here is the snippet:

from transformers import GPT2Tokenizer, TrainingArguments, Trainer
import pandas as pd
import torch
from torch.utils.data import Dataset

class MyDataset(Dataset):
    def __init__(self, csv_file: str):
        self.df = pd.read_csv(csv_file, encoding='ISO-8859-1')

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        if torch.is_tensor(idx):
            idx = idx.tolist()
        # Return the raw text (second column); tokenization happens in the data collator.
        text = self.df.iloc[idx, 1]
        return text



def my_data_collator(dataset_samples_list):
    # Loading the tokenizer once at module level would avoid re-instantiating it
    # for every batch, but the collator works either way.
    tokenizer = GPT2Tokenizer.from_pretrained('gpt2', padding_side='right')
    # GPT-2 has no pad token, so reuse the EOS token for padding.
    tokenizer.pad_token = tokenizer.eos_token

    encoded_results = tokenizer(dataset_samples_list, padding=True, truncation=True,
                                return_tensors='pt', return_attention_mask=True)

    # return_tensors='pt' already yields batched tensors, so no extra stacking is needed.
    batch = {}
    batch['input_ids'] = encoded_results['input_ids']
    batch['past'] = None
    batch['attention_mask'] = encoded_results['attention_mask']
    batch['position_ids'] = None
    batch['head_mask'] = None
    batch['inputs_embeds'] = None
    batch['labels'] = None
    batch['use_cache'] = True
    return batch


dataset_train = MyDataset('/path/to/train_dataset.csv')

training_args = TrainingArguments(
    output_dir='/path/to/out',
    do_train=True,
    per_device_train_batch_size=64,  # per-GPU batch size; Trainer trains on all visible GPUs, so the effective batch is 64 * n_gpu
    logging_dir='/path/to/dir',
    max_steps=300000
)

model = GPT2FinetunedWithNgrams.from_pretrained('gpt2')

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=my_data_collator,
    train_dataset=dataset_train
)
trainer.train()
trainer.save_model('/path/to/model_save_dir')
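
As a quick sanity check, the collator can be called directly on a couple of strings to see what kind of batch it builds (the example texts below are arbitrary placeholders):

# Not part of the training run: just inspect what the collator produces.
sample_texts = ['first example sentence', 'a slightly longer second example sentence']
sample_batch = my_data_collator(sample_texts)
print(sample_batch['input_ids'].shape)       # torch.Size([2, longest_sequence_in_batch])
print(sample_batch['attention_mask'].shape)  # same shape; zeros mark the padded positions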

I’m working on getting the snippet together for model = GPT2FinetunedWithNgrams.from_pretrained('gpt2') so others can see how the loss, etc., is being calculated. The first snippet above is the controller script for the training.
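
Until that’s ready, here is a minimal, hypothetical sketch of the general pattern for plugging a custom loss into GPT-2 via a subclass. It assumes GPT2LMHeadModel as the base (my actual class may differ) and uses plain next-token cross-entropy purely as a stand-in for the real loss; the class name is a placeholder, not my GPT2FinetunedWithNgrams implementation.

import torch
from transformers import GPT2LMHeadModel

class GPT2WithCustomLossSketch(GPT2LMHeadModel):
    """Illustrative only: shows where a custom loss would be computed."""

    def forward(self, input_ids=None, attention_mask=None, **kwargs):
        # Run the base GPT-2 stack and project to vocabulary logits.
        transformer_outputs = self.transformer(input_ids, attention_mask=attention_mask)
        hidden_states = transformer_outputs[0]
        lm_logits = self.lm_head(hidden_states)

        # Placeholder loss: ordinary next-token cross-entropy (shift by one).
        # A real custom loss would replace this block (and would likely mask
        # the padded positions, which this stand-in does not).
        shift_logits = lm_logits[..., :-1, :].contiguous()
        shift_labels = input_ids[..., 1:].contiguous()
        loss = torch.nn.functional.cross_entropy(
            shift_logits.view(-1, shift_logits.size(-1)),
            shift_labels.view(-1),
        )

        # Trainer reads the loss from the first element of the returned tuple.
        return (loss, lm_logits)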