I’m trying to do some language training on a GPT model (distilgpt2). I’ve gone through the Transformers course here at Hugging Face, and I’ve dabbled with an article I found on the internet about training a model on German recipes, but I’m having trouble adapting the code and integrating what I’ve learned for my own purposes.
The dataset is a CSV file with the columns id and text, and I’m attempting to inherit from Dataset so that I can use only the text column.
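For reference, the file looks something like this (the values here are just illustrative, not my real data):

id,text
1,"some example text"
2,"some more example text"

When I run the script, the trainer throws this error: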
File "my-distgpt2/t.py", line 57, in <module>
trainer.train()
AttributeError: 'MyDataset' object has no attribute '_data'. Did you mean: 'data'?
What am I doing wrong? Here’s my code:
from datasets import load_dataset, Dataset
from transformers import set_seed
from transformers import AutoTokenizer, AutoModelWithLMHead
from transformers import DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments
set_seed(42)
tokenizer = AutoTokenizer.from_pretrained('distilgpt2')
model = AutoModelWithLMHead.from_pretrained('distilgpt2')
# Dataset features: {'id', 'text'}
raw_dataset = load_dataset('csv', data_files='dataset.csv')
dataset = raw_dataset['train'].train_test_split(test_size=0.2)
class MyDataset(Dataset):
    # Wrap one split so that only the text column is tokenized and returned
    def __init__(self, dataset):
        self.dataset = dataset
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, id):
        text = self.dataset[id]['text']
        inputs = self.tokenizer(text, truncation=True)
        return {'input_ids': inputs['input_ids'], 'attention_mask': inputs['attention_mask']}
train_dataset = MyDataset(dataset['train'])
eval_dataset = MyDataset(dataset['test'])
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,
)
training_args = TrainingArguments(
    output_dir="./model",
    overwrite_output_dir=True,
    num_train_epochs=100,
    per_device_train_batch_size=32,  # batch size for training
    per_device_eval_batch_size=64,   # batch size for evaluation
    eval_steps=400,                  # number of update steps between two evaluations
    save_steps=800,                  # the model is saved every save_steps steps
    warmup_steps=500,                # number of warmup steps for the learning rate scheduler
    prediction_loss_only=True,
)
trainer = Trainer(
    args=training_args,
    tokenizer=tokenizer,
    data_collator=data_collator,
    model=model,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
)
trainer.train()
trainer.save_model()
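For comparison, this is roughly the map-based tokenization approach from the course that I was trying to adapt away from (a sketch from memory, not working code; tokenize_function is just a placeholder name I made up):

def tokenize_function(examples):
    # tokenize a batch of rows; truncation keeps sequences within the model's limit
    return tokenizer(examples['text'], truncation=True)

tokenized_dataset = dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=['id', 'text'],  # drop the raw columns, keeping only the tokenizer output
)

Should I be doing something like that instead of subclassing Dataset, or is the subclass approach fixable?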