@valhalla, happy to. Here is the snippet:
import pandas as pd  # needed for pd.read_csv in MyDataset
import torch
from torch.utils.data import Dataset
from transformers import GPT2Tokenizer, TrainingArguments, Trainer
class MyDataset(Dataset):
    """Returns raw text strings from a CSV; tokenization is deferred to the data collator."""

    def __init__(self, csv_file: str):
        self.df = pd.read_csv(csv_file, encoding='ISO-8859-1')

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        if torch.is_tensor(idx):
            idx = idx.tolist()
        text = self.df.iloc[idx, 1]  # the text lives in the second column
        return text
def my_data_collator(dataset_samples_list):
    tokenizer = GPT2Tokenizer.from_pretrained('gpt2', padding_side='right')
    tokenizer.pad_token = tokenizer.eos_token  # GPT-2 has no pad token, so pad with EOS

    encoded_results = tokenizer(dataset_samples_list, padding=True, truncation=True,
                                return_tensors='pt', return_attention_mask=True)

    # Every key here is passed as a keyword argument to the model's forward() by the Trainer.
    batch = {}
    batch['input_ids'] = encoded_results['input_ids']
    batch['past'] = None
    batch['attention_mask'] = encoded_results['attention_mask']
    batch['position_ids'] = None
    batch['head_mask'] = None
    batch['inputs_embeds'] = None
    batch['labels'] = None
    batch['use_cache'] = True
    return batch
dataset_train = MyDataset('/path/to/train_dataset.csv')

training_args = TrainingArguments(
    output_dir='/path/to/out',
    do_train=True,
    per_device_train_batch_size=64,
    logging_dir='/path/to/dir',
    max_steps=300000,
)
# GPT2FinetunedWithNgrams is my custom GPT-2 subclass (snippet to follow below).
model = GPT2FinetunedWithNgrams.from_pretrained('gpt2')

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=my_data_collator,
    train_dataset=dataset_train,
)

trainer.train()
trainer.save_model('/path/to/model_save_dir')
I’m still putting together the snippet for model = GPT2FinetunedWithNgrams.from_pretrained('gpt2'), so others can see how the loss etc. is calculated. The above is the controller script for the training.
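In the meantime, here is a minimal sketch of the general shape that class takes. This is not the actual implementation: GPT2LMHeadModel as the base class is an assumption, the loss shown is an ordinary next-token cross-entropy stand-in for the real n-gram loss, and it assumes a transformers version whose GPT2Model.forward() accepts the past and use_cache keyword arguments that the collator above supplies. Since the collator sends labels=None, the overridden forward() is where the loss has to come from.

import torch
from transformers import GPT2LMHeadModel

class GPT2FinetunedWithNgrams(GPT2LMHeadModel):
    """Sketch only: override forward() so it returns a custom loss as the first output."""

    def forward(self, input_ids=None, past=None, attention_mask=None, position_ids=None,
                head_mask=None, inputs_embeds=None, labels=None, use_cache=True):
        transformer_outputs = self.transformer(
            input_ids,
            past=past,
            attention_mask=attention_mask,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
        )
        hidden_states = transformer_outputs[0]
        lm_logits = self.lm_head(hidden_states)

        # Stand-in loss for this sketch: shifted next-token cross-entropy over all positions
        # (padding is not masked out here). The real class replaces this with its n-gram loss.
        shift_logits = lm_logits[..., :-1, :].contiguous()
        shift_labels = input_ids[..., 1:].contiguous()
        loss = torch.nn.functional.cross_entropy(
            shift_logits.view(-1, shift_logits.size(-1)),
            shift_labels.view(-1),
        )

        # Trainer reads the loss from the first element of the returned tuple.
        return (loss, lm_logits) + transformer_outputs[1:]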