Hi, everyone.
I would like to fine-tune a T5 model on my custom data.
But when I run the code below, I get a ValueError:
ValueError: The Batch received was empty, your model won’t be able to train on it. Double-check that your training dataset contains keys expected by the model: inputs, kwargs, label_ids, label.
I have checked the return value of SummarizationDataset, and there is nothing wrong with it: it returns the correct tensors and dictionary keys. However, when I pass train_dataset to the Trainer, it raises the ValueError above.
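Concretely, a single item looks like this (a minimal, self-contained sketch using the SummarizationDataset class from the code below and a dummy two-row DataFrame; 't5-small' stands in for my actual model_name):

import pandas as pd
from transformers import T5TokenizerFast

# dummy data with the same 'src'/'tgt' columns as my real CSV
dummy = pd.DataFrame({'src': ['a long document', 'another document'],
                      'tgt': ['a summary', 'another summary']})
tok = T5TokenizerFast.from_pretrained('t5-small')  # placeholder checkpoint
ds = SummarizationDataset(data=dummy, tokenizer=tok, max_length=32)
item = ds[0]
print(item.keys())              # dict_keys(['input_ids', 'attention_mask', 'labels'])
print(item['input_ids'].shape)  # torch.Size([32])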
What is wrong with my code?
Thanks for reading!
import pandas as pd
import torch
import torch.nn as nn
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset
from transformers import (T5ForConditionalGeneration, T5TokenizerFast,
                          Trainer, TrainingArguments)

# model_name and csv_filepath are defined earlier in my script
tokenizer = T5TokenizerFast.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)
class SummarizationDataset(Dataset):
    def __init__(self, data, tokenizer, max_length=512):
        self.tokenizer = tokenizer
        self.data = data
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        src = self.data.iloc[idx]['src']
        tgt = self.data.iloc[idx]['tgt']
        srcs = self.tokenizer(src,
                              max_length=self.max_length,
                              padding='max_length',
                              truncation=True,
                              return_tensors='pt')
        targets = self.tokenizer(tgt,
                                 max_length=self.max_length,
                                 padding='max_length',
                                 truncation=True,
                                 return_tensors='pt')
        # squeeze() drops the batch dimension added by return_tensors='pt',
        # so each item is a 1-D tensor of length max_length
        input_ids = srcs.input_ids.squeeze()
        attention_mask = srcs.attention_mask.squeeze()
        labels = targets.input_ids.squeeze()
        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'labels': labels}
data = pd.read_csv(csv_filepath)
train_df, valid_df = train_test_split(data, test_size=0.2)

train_dataset = SummarizationDataset(data=train_df, tokenizer=tokenizer)
valid_dataset = SummarizationDataset(data=valid_df, tokenizer=tokenizer)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)
model = nn.DataParallel(model)  # wrap for multi-GPU training
training_args = TrainingArguments(
    output_dir="./output",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=10,
    logging_dir='./logs',
    logging_steps=10,
    save_steps=500,
    eval_strategy='steps',
    eval_steps=500,
    save_total_limit=2,
    load_best_model_at_end=True,
    report_to=["none"],
    logging_strategy='steps',
)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
)
# sanity check: the dataset returns a dict with the expected keys
print(type(train_dataset[0]))   # <class 'dict'>
print(train_dataset[0].keys())  # dict_keys(['input_ids', 'attention_mask', 'labels'])
trainer.train()
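One thing I noticed while debugging, not sure if it is relevant: the forward signature that I believe the Trainer inspects when deciding which dataset columns to keep looks different after the nn.DataParallel wrap. A quick sketch with Python's built-in inspect module:

import inspect
# after wrapping with nn.DataParallel, the signature is no longer
# T5's (input_ids, attention_mask, labels, ...):
print(inspect.signature(model.forward))  # -> (*inputs, **kwargs)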