Finetuning GPT2 with user defined loss

Thanks to everyone for helping me get things set up correctly. I really appreciate the feedback and assistance!

@sgugger: Ah, OK, I see what you’re saying. I’ve modified the Dataset class to:

from torch.utils.data import Dataset
import pandas as pd
import torch

class SDAbstractsDataset(Dataset):
    """Map-style dataset that serves raw abstract text from a CSV file.

    The whole CSV is loaded into a pandas DataFrame up front; each item is
    the value found in the second column (position 1) of the requested row.
    No tokenization happens here.
    """

    def __init__(self, csv_file):
        """Load ``csv_file`` into memory, decoding it as ISO-8859-1."""
        frame = pd.read_csv(csv_file, encoding='ISO-8859-1')
        self.sd_abstracts_df = frame

    def __len__(self):
        """Return the number of rows in the loaded DataFrame."""
        row_count, _ = self.sd_abstracts_df.shape
        return row_count

    def __getitem__(self, idx):
        """Return the second-column value for row position(s) ``idx``.

        ``idx`` may be a plain integer (or anything ``DataFrame.iloc``
        accepts) or a torch tensor, which is converted with ``tolist()``
        before the lookup.
        """
        # iloc does not take tensors, so unwrap them into Python values.
        row = idx.tolist() if torch.is_tensor(idx) else idx
        return self.sd_abstracts_df.iloc[row, 1]

and the DataCollator function as:

def sd_data_collator(dataset_samples_list):
    """Tokenize a list of raw text samples into one padded batch dict.

    Args:
        dataset_samples_list: list of strings, one per sample in the batch
            (what ``SDAbstractsDataset.__getitem__`` returns).

    Returns:
        dict whose keys are passed to the model as keyword arguments.
        ``input_ids`` and ``attention_mask`` are the tensors produced by the
        tokenizer (padded to the longest sample in the batch); ``use_cache``
        is ``True``; every other key is ``None``.
    """
    # Build the tokenizer once and keep it on the function object. The
    # collator runs once per batch, and reloading the pretrained tokenizer
    # files every time is pure overhead.
    tokenizer = getattr(sd_data_collator, '_tokenizer', None)
    if tokenizer is None:
        tokenizer = GPT2Tokenizer.from_pretrained('gpt2', padding_side='right')
        # Padding needs a pad token; reuse EOS for it.
        tokenizer.pad_token = tokenizer.eos_token
        sd_data_collator._tokenizer = tokenizer

    encoded_results = tokenizer(
        dataset_samples_list,
        padding=True,
        truncation=True,
        return_tensors='pt',
        return_attention_mask=True,
    )

    # With return_tensors='pt' the tokenizer output is already one tensor per
    # key, so no re-stacking is needed.
    #
    # The placeholders are scalar None / True rather than lists. Iterating
    # over the tokenizer output walks its *keys*, not the samples, so the
    # previous list comprehensions produced lists whose length had nothing to
    # do with the batch size.
    # NOTE(review): 'labels' is left as None, as before — presumably the
    # custom forward() computes its own loss; confirm.
    return {
        'input_ids': encoded_results['input_ids'],
        'past': None,
        'attention_mask': encoded_results['attention_mask'],
        'position_ids': None,
        'head_mask': None,
        'inputs_embeds': None,
        'labels': None,
        'use_cache': True,
    }

and that gets passed to the Trainer object. However, I get the following error:

Epoch:   0%|          | 0/1 [00:00<?, ?it/s]
Iteration:   0%|          | 0/16 [00:00<?, ?it/s]
Traceback (most recent call last):
  File "/path/to/finetune_test.py", line 55, in <module>
    trainer.train()
  File "/path/to/anaconda3/lib/python3.7/site-packages/transformers/trainer.py", line 499, in train
    tr_loss += self._training_step(model, inputs, optimizer)
  File "/path/to/anaconda3/lib/python3.7/site-packages/transformers/trainer.py", line 622, in _training_step
    outputs = model(**inputs)
  File "/path/to/anaconda3/lib/python3.7/site-packages/torch/nn/modules/module.py", line 550, in __call__
    result = self.forward(*input, **kwargs)
  File "/path/to/ric-2020/text_gen_w_transformers/finetune_gpt2.py", line 46, in forward
    orig_input_str = self.tokenizer.decode(input_ids, skip_special_tokens=True)
  File "/path/to/anaconda3/lib/python3.7/site-packages/transformers/tokenization_utils.py", line 688, in decode
    filtered_tokens = self.convert_ids_to_tokens(token_ids, skip_special_tokens=skip_special_tokens)
  File "/path/to/anaconda3/lib/python3.7/site-packages/transformers/tokenization_utils.py", line 666, in convert_ids_to_tokens
    index = int(index)
ValueError: only one element tensors can be converted to Python scalars

@swayso: I get the same error when I add the max_length argument to the previous tokenizer setup, with no data collator:

class SDAbstractsDataset(Dataset):
    """Map-style dataset that tokenizes one abstract per item.

    Rows come from a CSV file loaded into a pandas DataFrame. Each item is a
    dict of model keyword arguments: ``input_ids`` and ``attention_mask``
    hold the tokenizer output, ``use_cache`` is ``True`` and every other key
    is ``None``.
    """

    def __init__(self, csv_file):
        """Load ``csv_file`` (ISO-8859-1) and create the GPT-2 tokenizer."""
        frame = pd.read_csv(csv_file, encoding='ISO-8859-1')
        self.sd_abstracts_df = frame

        gpt2_tokenizer = GPT2Tokenizer.from_pretrained('gpt2', padding_side='right')
        # Padding needs a pad token; reuse EOS for it.
        gpt2_tokenizer.pad_token = gpt2_tokenizer.eos_token
        self.tokenizer = gpt2_tokenizer

    def __len__(self):
        """Return the number of rows in the loaded DataFrame."""
        row_count, _ = self.sd_abstracts_df.shape
        return row_count

    def __getitem__(self, idx):
        """Tokenize the second-column text of row ``idx`` into a kwargs dict.

        ``idx`` may be a plain integer or a torch tensor (converted with
        ``tolist()``). The text is padded/truncated to exactly 512 tokens.
        """
        # iloc does not take tensors, so unwrap them into Python values.
        row = idx.tolist() if torch.is_tensor(idx) else idx
        text = self.sd_abstracts_df.iloc[row, 1]

        tokenizer_options = {
            'padding': 'max_length',
            'max_length': 512,
            'truncation': True,
            'return_tensors': 'pt',
            'return_attention_mask': True,
        }
        encoding = self.tokenizer(text, **tokenizer_options)

        # Start with every model argument set to None, then fill in the ones
        # that carry real values. Key order matches the original literal.
        # NOTE(review): with return_tensors='pt' a single text presumably
        # comes back with a leading batch axis of 1, so default collation
        # would stack to (batch, 1, 512) — confirm this is intended.
        sample = dict.fromkeys((
            'input_ids',
            'past',
            'attention_mask',
            'token_type_ids',
            'position_ids',
            'head_mask',
            'inputs_embeds',
            'labels',
        ))
        sample['input_ids'] = encoding['input_ids']
        sample['attention_mask'] = encoding['attention_mask']
        sample['use_cache'] = True
        return sample
Epoch:   0%|          | 0/1 [00:00<?, ?it/s]
Iteration:   0%|          | 0/16 [00:00<?, ?it/s]
Traceback (most recent call last):
  File "/path/to/finetune_test.py", line 55, in <module>
    trainer.train()
  File "/path/to/anaconda3/lib/python3.7/site-packages/transformers/trainer.py", line 499, in train
    tr_loss += self._training_step(model, inputs, optimizer)
  File "/path/to/anaconda3/lib/python3.7/site-packages/transformers/trainer.py", line 622, in _training_step
    outputs = model(**inputs)
  File "/path/to/anaconda3/lib/python3.7/site-packages/torch/nn/modules/module.py", line 550, in __call__
    result = self.forward(*input, **kwargs)
  File "/path/to/ric-2020/text_gen_w_transformers/finetune_gpt2.py", line 46, in forward
    orig_input_str = self.tokenizer.decode(input_ids, skip_special_tokens=True)
  File "/path/to/anaconda3/lib/python3.7/site-packages/transformers/tokenization_utils.py", line 688, in decode
    filtered_tokens = self.convert_ids_to_tokens(token_ids, skip_special_tokens=skip_special_tokens)
  File "/path/to/anaconda3/lib/python3.7/site-packages/transformers/tokenization_utils.py", line 666, in convert_ids_to_tokens
    index = int(index)
ValueError: only one element tensors can be converted to Python scalars