Sorry for the URGENT tag but I have a deadline. The title is self-explanatory. The predictions from trainer.predict() are extremely bad whereas model.generate gives qualitative results.
I want to use trainer.predict() because it is paralilized on the gpu. My testing data set is huge, having 250k samples. I wonder if I am doing something wrong or the library contains an issue. Below you can find a minimum example of my code.
# Load the model and tokenizer that were trained and saved.
tokenizer = T5Tokenizer.from_pretrained(args.load_model)
print("Loaded tokenizer from directory {}".format(args.load_model))
model = T5ForConditionalGeneration.from_pretrained(args.load_model)
# create train and validation data set, not relevant for testing
train_dataset = create_dataset(train_inputs, train_labels, tokenizer, pad_truncate=True, max_length=128)
val_dataset = create_dataset(val_inputs, val_labels, tokenizer, pad_truncate=True)
# training arguments
training_args = Seq2SeqTrainingArguments(
output_dir=model_directory,
num_train_epochs=args.epochs,
per_device_train_batch_size=args.batch_size,
per_device_eval_batch_size=args.batch_size,
warmup_steps=500,
weight_decay=args.weight_decay,
logging_dir=model_directory,
logging_steps=100,
do_eval=True,
evaluation_strategy='epoch',
learning_rate=args.learning_rate,
load_best_model_at_end=True,
metric_for_best_model='eval_loss',
greater_is_better=False,
save_total_limit=args.epochs,
eval_accumulation_steps=args.eval_acc_steps, # set this lower, if testing or validation crashes
disable_tqdm=True if args.load_model != '' else False,
)
trainer = Seq2SeqTrainer(
model=model,
args=training_args,
train_dataset=train_dataset,
eval_dataset=val_dataset,
optimizers=[torch.optim.Adam(params=model.parameters(), lr=args.learning_rate), None],
tokenizer=tokenizer,
)
class BugFixDataset(torch.utils.data.Dataset):
def __init__(self, encodings, targets):
self.encodings = encodings
self.target_encodings = targets
def __getitem__(self, index):
item = {key: torch.tensor(val[index]) for key, val in self.encodings.items()}
item['labels'] = torch.tensor(self.target_encodings['input_ids'][index], dtype=torch.long)
return item
def __len__(self):
return len(self.encodings['input_ids'])
# create the dataset
test_warning_dataset = create_dataset(test_warning, test_warning_labels, tokenizer, pad_truncate=True, max_length=target_max_length)
# the take models output --> logits --> argmax to obtain prediction ids.
output_ids = np.argmax(trainer.predict(test_dataset=test_warning_dataset, num_beams=5, max_length=target_max_length).predictions[0], axis=2)
trainer_outputs = tokenizer.batch_decode(output_ids, skip_special_tokens=True)
model_generate_outputs = []
for i, code in enumerate(test_warning):
input_ids = tokenizer.encode(code, truncation=True, padding=True).to(model.device)
beam_outputs = model.generate(input_ids, max_length=target_max_length, num_beams=5, early_stopping=False, num_return_sequences=1)
for pred in beam_outputs:
x = tokenizer.decode(pred, skip_special_tokens)
model_generate_outputs.append(x)
model_generate_outputs and trainer_outputs are different. What is the issue?
What am I doing wrong?