Thanks to everyone for helping out and for helping me get things set up right. I really appreciate the feedback and assistance!
@sgugger: Aahh, ok, I see what you’re saying. I’ve modified the Dataset
object to:
from torch.utils.data import Dataset
import pandas as pd
import torch
class SDAbstractsDataset(Dataset):
    """Map-style dataset that serves raw abstract strings from a CSV file.

    Items are returned untokenized; turning a list of them into model
    inputs is left to whatever collate function the caller supplies.
    """

    def __init__(self, csv_file):
        """Load the whole CSV into memory.

        Args:
            csv_file: Location of the CSV, passed straight to ``pd.read_csv``.
        """
        # The file is decoded as ISO-8859-1 rather than the pandas default.
        self.sd_abstracts_df = pd.read_csv(csv_file, encoding='ISO-8859-1')

    def __len__(self):
        """Return the number of rows in the loaded table."""
        return len(self.sd_abstracts_df)

    def __getitem__(self, idx):
        """Return the value in the second column of row ``idx``.

        Args:
            idx: Row position; a tensor index is converted to plain Python
                first so it can be used with ``iloc``.
        """
        if torch.is_tensor(idx):
            idx = idx.tolist()
        # Column position 1 holds the abstract text in this CSV layout.
        return self.sd_abstracts_df.iloc[idx, 1]
and the DataCollator
function as:
# Lazily created tokenizer shared by every call to sd_data_collator.
_SD_TOKENIZER = None


def _get_sd_tokenizer():
    """Return the shared GPT-2 tokenizer, creating it on first use."""
    global _SD_TOKENIZER
    if _SD_TOKENIZER is None:
        tokenizer = GPT2Tokenizer.from_pretrained('gpt2', padding_side='right')
        # Reuse EOS as the pad token so padding=True has a token to pad with.
        tokenizer.pad_token = tokenizer.eos_token
        _SD_TOKENIZER = tokenizer
    return _SD_TOKENIZER


def sd_data_collator(dataset_samples_list):
    """Tokenize a list of raw text samples into one padded batch.

    Args:
        dataset_samples_list: List of strings, one per dataset sample.

    Returns:
        A dict of keyword arguments for the model's forward call.
        ``input_ids`` and ``attention_mask`` are the batched tensors
        produced by the tokenizer (padded to the longest sample in the
        batch). The remaining optional arguments are passed as a single
        ``None`` (or ``True`` for ``use_cache``) rather than as per-sample
        lists.
    """
    # Loading the tokenizer is expensive; do it once, not once per batch.
    tokenizer = _get_sd_tokenizer()
    encoded_results = tokenizer(
        dataset_samples_list,
        padding=True,
        truncation=True,
        return_tensors='pt',
        return_attention_mask=True,
    )
    return {
        # With return_tensors='pt' these are already batched tensors, so no
        # re-stacking of their rows is needed.
        'input_ids': encoded_results['input_ids'],
        'past': None,
        'attention_mask': encoded_results['attention_mask'],
        'position_ids': None,
        'head_mask': None,
        'inputs_embeds': None,
        # NOTE(review): labels stay None as in the original; a stock causal
        # LM head presumably needs labels to return a loss - confirm against
        # the model's forward.
        'labels': None,
        'use_cache': True,
    }
and that gets passed to the Trainer
object. However, I get the following error:
Epoch: 0%| | 0/1 [00:00<?, ?it/s]
Iteration: 0%| | 0/16 [00:00<?, ?it/s]
Traceback (most recent call last):
File "/path/to/finetune_test.py", line 55, in <module>
trainer.train()
File "/path/to/anaconda3/lib/python3.7/site-packages/transformers/trainer.py", line 499, in train
tr_loss += self._training_step(model, inputs, optimizer)
File "/path/to/anaconda3/lib/python3.7/site-packages/transformers/trainer.py", line 622, in _training_step
outputs = model(**inputs)
File "/path/to/anaconda3/lib/python3.7/site-packages/torch/nn/modules/module.py", line 550, in __call__
result = self.forward(*input, **kwargs)
File "/path/to/ric-2020/text_gen_w_transformers/finetune_gpt2.py", line 46, in forward
orig_input_str = self.tokenizer.decode(input_ids, skip_special_tokens=True)
File "/path/to/anaconda3/lib/python3.7/site-packages/transformers/tokenization_utils.py", line 688, in decode
filtered_tokens = self.convert_ids_to_tokens(token_ids, skip_special_tokens=skip_special_tokens)
File "/path/to/anaconda3/lib/python3.7/site-packages/transformers/tokenization_utils.py", line 666, in convert_ids_to_tokens
index = int(index)
ValueError: only one element tensors can be converted to Python scalars
@swayso: I get the same error when I add the max_length
arguments to the previous tokenizer setup, but with no DataCollator:
class SDAbstractsDataset(Dataset):
    """Map-style dataset that tokenizes one abstract per item.

    Every item is padded or truncated to 512 tokens, so samples can be
    stacked into a batch without a custom collate function.
    """

    def __init__(self, csv_file):
        """Load the CSV and set up the GPT-2 tokenizer.

        Args:
            csv_file: Location of the CSV, passed straight to ``pd.read_csv``.
        """
        self.sd_abstracts_df = pd.read_csv(csv_file, encoding='ISO-8859-1')
        self.tokenizer = GPT2Tokenizer.from_pretrained('gpt2', padding_side='right')
        # Reuse EOS as the pad token so padding='max_length' can pad.
        self.tokenizer.pad_token = self.tokenizer.eos_token

    def __len__(self):
        """Return the number of rows in the loaded table."""
        return len(self.sd_abstracts_df)

    def __getitem__(self, idx):
        """Return the model keyword arguments for the abstract at row ``idx``.

        Args:
            idx: Row position; a tensor index is converted to plain Python
                first so it can be used with ``iloc``.

        Returns:
            A dict whose ``input_ids`` and ``attention_mask`` are 1-D
            tensors for this single sample; the other entries are fixed
            placeholders (``None``, or ``True`` for ``use_cache``).
        """
        if torch.is_tensor(idx):
            idx = idx.tolist()
        abstract_text = self.sd_abstracts_df.iloc[idx, 1]
        encoded_result = self.tokenizer(
            abstract_text,
            padding='max_length',
            max_length=512,
            truncation=True,
            return_tensors='pt',
            return_attention_mask=True,
        )
        return {
            # return_tensors='pt' adds a leading batch axis of size 1 even
            # for a single string; drop it so that stacking samples gives
            # (batch, seq_len) instead of (batch, 1, seq_len).
            'input_ids': encoded_result['input_ids'].squeeze(0),
            'past': None,
            'attention_mask': encoded_result['attention_mask'].squeeze(0),
            'token_type_ids': None,
            'position_ids': None,
            'head_mask': None,
            'inputs_embeds': None,
            'labels': None,
            'use_cache': True,
        }
Epoch: 0%| | 0/1 [00:00<?, ?it/s]
Iteration: 0%| | 0/16 [00:00<?, ?it/s]
Traceback (most recent call last):
File "/path/to/finetune_test.py", line 55, in <module>
trainer.train()
File "/path/to/anaconda3/lib/python3.7/site-packages/transformers/trainer.py", line 499, in train
tr_loss += self._training_step(model, inputs, optimizer)
File "/path/to/anaconda3/lib/python3.7/site-packages/transformers/trainer.py", line 622, in _training_step
outputs = model(**inputs)
File "/path/to/anaconda3/lib/python3.7/site-packages/torch/nn/modules/module.py", line 550, in __call__
result = self.forward(*input, **kwargs)
File "/path/to/ric-2020/text_gen_w_transformers/finetune_gpt2.py", line 46, in forward
orig_input_str = self.tokenizer.decode(input_ids, skip_special_tokens=True)
File "/path/to/anaconda3/lib/python3.7/site-packages/transformers/tokenization_utils.py", line 688, in decode
filtered_tokens = self.convert_ids_to_tokens(token_ids, skip_special_tokens=skip_special_tokens)
File "/path/to/anaconda3/lib/python3.7/site-packages/transformers/tokenization_utils.py", line 666, in convert_ids_to_tokens
index = int(index)
ValueError: only one element tensors can be converted to Python scalars