Finetuning GPT2 with a user-defined loss

@valhalla Thanks for your reply. Now I am getting the same error as the OP in this post. Here is my Dataset object:

from torch.utils.data import Dataset
import pandas as pd
from transformers import GPT2Tokenizer

class SDAbstractsDataset(Dataset):
    def __init__(self, csv_file):
        self.sd_abstracts_df = pd.read_csv(csv_file, encoding='ISO-8859-1')
        self.tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

    def __len__(self):
        return len(self.sd_abstracts_df)

    def __getitem__(self, idx):
        abstract_text = self.sd_abstracts_df.iloc[idx, 1]
        # return_tensors='pt' gives a tensor of shape (1, sequence_length)
        input_ids = self.tokenizer.encode(abstract_text, return_tensors='pt')
        # mirror the keyword arguments that the model's forward() expects
        return {'input_ids': input_ids, 'past': None,
                'attention_mask': None, 'token_type_ids': None,
                'position_ids': None, 'head_mask': None,
                'inputs_embeds': None, 'labels': None,
                'use_cache': True}
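For what it's worth, checking a single item outside the Trainer shows it comes back as a plain dict, and the input_ids tensor keeps the extra batch dimension from return_tensors='pt':

sd_dataset = SDAbstractsDataset('/path/to/sd_samples_64.csv')
sample = sd_dataset[0]
print(type(sample))               # <class 'dict'>
print(sample['input_ids'].shape)  # torch.Size([1, seq_len])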

My model's forward method accepts the same arguments as the keys in that dict:

def forward(
            self,
            input_ids=None,
            past=None,
            attention_mask=None,
            token_type_ids=None,
            position_ids=None,
            head_mask=None,
            inputs_embeds=None,
            labels=None,
            use_cache=True,
    ):

And here is my training setup:

from text_gen_w_transformers.finetune_gpt2 import GPT2FinetunedWithNgrams
from text_gen_w_transformers.custom_dataset import SDAbstractsDataset
from transformers import GPT2Tokenizer, GPT2LMHeadModel, TrainingArguments, Trainer

sd_dataset = SDAbstractsDataset('/path/to/sd_samples_64.csv')

training_args = TrainingArguments(
    output_dir='/path/to/finetuned_gpt2',
    do_train=True,
    per_device_train_batch_size=4,
    learning_rate=1e-3,
    num_train_epochs=1
)

model = GPT2FinetunedWithNgrams.from_pretrained('gpt2')

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=sd_dataset
)

trainer.train()

When that training command runs, I get the following error:

Epoch:   0%|          | 0/1 [00:00<?, ?it/s]
Iteration:   0%|          | 0/16 [00:00<?, ?it/s]Traceback (most recent call last):
  File "/path/to/finetune_test.py", line 35, in <module>
    trainer.train()
  File "/path/to/anaconda3/lib/python3.7/site-packages/transformers/trainer.py", line 464, in train
    for step, inputs in enumerate(epoch_iterator):
  File "/path/to/anaconda3/lib/python3.7/site-packages/tqdm/std.py", line 1107, in __iter__
    for obj in iterable:
  File "/path/to/anaconda3/lib/python3.7/site-packages/torch/utils/data/dataloader.py", line 345, in __next__
    data = self._next_data()
  File "/path/to/anaconda3/lib/python3.7/site-packages/torch/utils/data/dataloader.py", line 385, in _next_data
    data = self._dataset_fetcher.fetch(index)  # may raise StopIteration
  File "/path/to/anaconda3/lib/python3.7/site-packages/torch/utils/data/_utils/fetch.py", line 47, in fetch
    return self.collate_fn(data)
  File "/path/to/anaconda3/lib/python3.7/site-packages/transformers/data/data_collator.py", line 72, in collate_batch
    for k, v in vars(first).items():
TypeError: vars() argument must have __dict__ attribute

Epoch:   0%|          | 0/1 [00:00<?, ?it/s]
Iteration:   0%|          | 0/16 [00:00<?, ?it/s]
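If I am reading the traceback right, collate_batch calls vars() on each item coming out of the dataset, which works for dataclass-style InputFeatures objects but not for a plain dict, since a dict instance has no __dict__. The failure can be reproduced in isolation (my own quick check, outside the Trainer):

sample = {'input_ids': None, 'labels': None}
vars(sample)  # TypeError: vars() argument must have __dict__ attribute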

In contrast to this post, my Dataset object returns a dictionary. But if I understand that post correctly, since the Trainer I instantiated uses (by default) default_data_collator, I should be returning a List[InputExamples] from my SDAbstractsDataset? That seems to contradict the next part of the post, which discusses having the DataLoader return a dict with the same key-value pairs that forward expects.
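In case it helps show what I am considering, here is a rough sketch of a custom collator passed to the Trainer through its data_collator argument. This is only a guess on my part: I am assuming the Trainer in this version accepts a plain callable (older versions may want an object with a collate_batch method), and padding with the EOS id is my own choice since GPT-2 has no pad token.

import torch
from transformers import GPT2Tokenizer

tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

def sd_data_collator(samples):
    # each sample['input_ids'] has shape (1, seq_len); drop the extra batch dim
    sequences = [s['input_ids'].squeeze(0) for s in samples]
    # pad to the longest sequence in the batch with the EOS id (my assumption)
    batch = torch.nn.utils.rnn.pad_sequence(
        sequences, batch_first=True, padding_value=tokenizer.eos_token_id)
    # return only the keys that forward() actually needs
    return {'input_ids': batch}

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=sd_dataset,
    data_collator=sd_data_collator,
)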