Hi @sgugger. Thank you for your reply.
The model I am using is T5. This is how I initialize the model and the dataset:
from transformers import T5ForConditionalGeneration, T5Tokenizer
# Load the model and its tokenizer from the same pretrained checkpoint.
_T5_CHECKPOINT = 't5-base'
t5 = T5ForConditionalGeneration.from_pretrained(_T5_CHECKPOINT)
t5_tokenizer = T5Tokenizer.from_pretrained(_T5_CHECKPOINT)
class CustomDataset(Dataset):
    """Article/summary pairs that are tokenized on access.

    Each item is a dict holding ``input_ids`` and ``attention_mask`` for the
    article and ``labels`` for the summary. All three are 1-D tensors that
    are padded or truncated to a fixed length.
    """

    def __init__(self, data, tokenizer, input_len, summ_len, eval=False):
        """Store the tokenizer, the length limits and the two text columns.

        Args:
            data: Object exposing ``article`` and ``summary`` attributes that
                support ``len()`` and integer indexing.
            tokenizer: Tokenizer providing ``batch_encode_plus`` and
                ``pad_token_id``.
            input_len: Fixed token length of the encoded article.
            summ_len: Fixed token length of the encoded summary.
            eval: When true, padding positions in ``labels`` keep the pad
                token id instead of being replaced by -100. (The name
                shadows the builtin but is kept because callers pass it by
                keyword.)
        """
        self.tokenizer = tokenizer
        self.data = data
        self.input_len = input_len
        self.summ_len = summ_len
        self.eval = eval
        self.article = self.data.article
        self.summary = self.data.summary

    def __len__(self):
        """Return the number of summaries, i.e. the number of examples."""
        return len(self.summary)

    def _encode(self, text, max_length):
        """Collapse whitespace in *text* and encode it to *max_length* tokens."""
        text = ' '.join(str(text).split())
        return self.tokenizer.batch_encode_plus(
            [text],
            max_length=max_length,
            truncation=True,
            padding='max_length',
            return_tensors='pt',
        )

    def __getitem__(self, idx):
        """Return the encoded example at position *idx* as a dict of tensors."""
        source = self._encode(self.article[idx], self.input_len)
        target = self._encode(self.summary[idx], self.summ_len)

        # squeeze(0) removes only the leading batch-of-one dimension. A bare
        # squeeze() would also drop the sequence dimension when the
        # configured length is 1, producing 0-d tensors.
        labels = target['input_ids'].squeeze(0)
        if not self.eval:
            # -100 is the default ignore_index of PyTorch's cross-entropy
            # loss, so padded positions do not contribute to the loss.
            labels[labels == self.tokenizer.pad_token_id] = -100

        return {
            'input_ids': source['input_ids'].squeeze(0),
            'attention_mask': source['attention_mask'].squeeze(0),
            'labels': labels,
        }
# Wrap the raw train/validation data in tokenizing datasets. Note that both
# names are rebound: after these lines they refer to the CustomDataset
# wrappers, not to the raw data.
train_dataset = CustomDataset(
    data=train_dataset,
    tokenizer=t5_tokenizer,
    input_len=MAX_LEN,
    summ_len=SUMMARY_LEN,
)
# eval=True keeps the pad token id in the labels instead of -100.
val_dataset = CustomDataset(
    data=val_dataset,
    tokenizer=t5_tokenizer,
    input_len=MAX_LEN,
    summ_len=SUMMARY_LEN,
    eval=True,
)
This is how I set up the TrainingArguments and the Trainer:
training_args = TrainingArguments(
    # Output locations and run identity.
    output_dir='/content/drive/My Drive/t5_newssummary_train',
    logging_dir='/content/drive/My Drive/t5_newssummary_train/logs',
    overwrite_output_dir=True,
    run_name='doing_eval',
    seed=SEED,
    # Training schedule.
    do_train=True,
    num_train_epochs=TRAIN_EPOCHS,
    learning_rate=LEARNING_RATE,
    per_device_train_batch_size=TRAIN_BATCH_SIZE,
    # Evaluate and log every 10 steps.
    evaluation_strategy='steps',
    eval_steps=10,
    logging_steps=10,
    per_device_eval_batch_size=VALID_BATCH_SIZE,
    # prediction_loss_only=True,  # left disabled, as in the original setup
    # Data loading and console output.
    dataloader_num_workers=0,
    disable_tqdm=True,
)
# Assemble the Trainer from the model, the arguments, both datasets, the
# metric callback and the externally built optimizers.
trainer = Trainer(
    model=t5,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=my_compute_metrics,
    optimizers=optimizers,
)
And this is my `compute_metrics` function:
from transformers import EvalPrediction
def my_compute_metrics(p: EvalPrediction):
    """Print what the evaluation loop passed in and return a dummy metric.

    This is a debugging stand-in, not a real metric. It prints the length of
    ``p.predictions``, hands the predictions to ``print_predictions``, then
    prints the shape of every entry in ``p.label_ids``.

    Args:
        p: Evaluation result exposing ``predictions`` and ``label_ids``.

    Returns:
        The constant dict ``{'marco': 1}``.
    """
    preds = p.predictions
    print("predictions")
    # NOTE(review): predictions may be a tuple of arrays rather than a single
    # array, so len() here is not necessarily a number of examples.
    print(len(preds))
    print_predictions(preds)

    labels = p.label_ids
    print("references")
    for label_row in labels:
        print(label_row.shape)

    return {'marco': 1}
The `print_predictions` function only prints the tuple object.
The output I get when evaluating is:
predictions
3
(23, 150, 32128)
new tuple 12
new tuple 4
(23, 12, 150, 64)
(23, 12, 150, 64)
(23, 12, 512, 64)
(23, 12, 512, 64)
new tuple 4
(23, 12, 150, 64)
(23, 12, 150, 64)
(23, 12, 512, 64)
(23, 12, 512, 64)
new tuple 4
(23, 12, 150, 64)
(23, 12, 150, 64)
(23, 12, 512, 64)
(23, 12, 512, 64)
new tuple 4
(23, 12, 150, 64)
(23, 12, 150, 64)
(23, 12, 512, 64)
(23, 12, 512, 64)
new tuple 4
(23, 12, 150, 64)
(23, 12, 150, 64)
(23, 12, 512, 64)
(23, 12, 512, 64)
new tuple 4
(23, 12, 150, 64)
(23, 12, 150, 64)
(23, 12, 512, 64)
(23, 12, 512, 64)
new tuple 4
(23, 12, 150, 64)
(23, 12, 150, 64)
(23, 12, 512, 64)
(23, 12, 512, 64)
new tuple 4
(23, 12, 150, 64)
(23, 12, 150, 64)
(23, 12, 512, 64)
(23, 12, 512, 64)
new tuple 4
(23, 12, 150, 64)
(23, 12, 150, 64)
(23, 12, 512, 64)
(23, 12, 512, 64)
new tuple 4
(23, 12, 150, 64)
(23, 12, 150, 64)
(23, 12, 512, 64)
(23, 12, 512, 64)
new tuple 4
(23, 12, 150, 64)
(23, 12, 150, 64)
(23, 12, 512, 64)
(23, 12, 512, 64)
new tuple 4
(23, 12, 150, 64)
(23, 12, 150, 64)
(23, 12, 512, 64)
(23, 12, 512, 64)
(23, 512, 768)
references
(150,)
(150,)
(150,)
(150,)
(150,)
(150,)
(150,)
(150,)
(150,)
(150,)
(150,)
(150,)
(150,)
(150,)
(150,)
(150,)
(150,)
(150,)
(150,)
(150,)
(150,)
(150,)
(150,)
{'eval_loss': 7.113981246948242, 'eval_marco': 1, 'epoch': 0.18518518518518517}
I made my `compute_metrics` function print its inputs because I was getting weird predictions and labels. As I said in my initial post, what I am expecting from the model is 2 predictions (because of the batch size of 2) and 2 labels. What I get instead is 23 labels and some strange tuples for the predictions.
Thank you very much for your answer again, hope you can help me sort this out.