'input_ids' error when using the Transformers Trainer class with an encoder/decoder model

Hi everyone,

This is my first post here, as I'm very new to the Hugging Face libraries. Looking forward to joining the community!

I’m trying to follow the tutorial here but modifying it to use a sequence-to-sequence paradigm rather than sequence classification. My code is below:

# imports
import numpy as np
import evaluate
from datasets import load_dataset
from transformers import (
    BertGenerationDecoder,
    BertGenerationEncoder,
    BertTokenizer,
    EncoderDecoderModel,
    Trainer,
    TrainingArguments,
)

# load data from files
data_files = {"train": "data/training_data.json", "test": "data/test_data.json"}
data = load_dataset("json", data_files=data_files)

# define tokenizer and map data to tokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
def tokenize_function(examples):
    return tokenizer(examples["output"], examples["input"], padding="max_length")
tokenized_data = data.map(tokenize_function, batched=True)

# define evaluation function
metric = evaluate.load("accuracy")
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

# define training args and model
training_args = TrainingArguments(output_dir="training_args", evaluation_strategy="epoch", use_mps_device=True)
encoder = BertGenerationEncoder.from_pretrained("bert-base-uncased")
decoder = BertGenerationDecoder.from_pretrained("bert-base-uncased")
model = EncoderDecoderModel(encoder=encoder, decoder=decoder)

# define trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_data["train"],
    eval_dataset=tokenized_data["test"],
    compute_metrics=compute_metrics,
)

# call train loop
trainer.train()

At the trainer.train() step, I get the error ValueError: You have to specify either input_ids or inputs_embeds. I see here that this issue has already been reported, and it likely has to do with the encoder/decoder framework requiring input_ids for both the encoder and the decoder.
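My current guess is that the problem is in my tokenization step: I never produce any labels, so the decoder has nothing to build its input_ids from. Below is a minimal, untested sketch of what I think the fix might look like, tokenizing the source and target columns separately so that the model can shift the labels into decoder_input_ids ("input" and "output" are just the column names in my own dataset):

# hypothetical, untested sketch of an alternative tokenize_function
def tokenize_function(examples):
    # encoder side: source text -> input_ids / attention_mask
    model_inputs = tokenizer(examples["input"], padding="max_length", truncation=True)
    # decoder side: target text -> labels; the EncoderDecoderModel shifts the
    # labels right internally to build decoder_input_ids
    targets = tokenizer(examples["output"], padding="max_length", truncation=True)
    model_inputs["labels"] = targets["input_ids"]
    return model_inputs

tokenized_data = data.map(tokenize_function, batched=True, remove_columns=["input", "output"])

Is something along these lines the right direction, or am I misreading the cause of the error?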

What I'm not sure about is whether this issue means I cannot use the convenience wrapper class in trainer.py (Trainer) to fine-tune the model for a sequence-to-sequence task, or whether there is still a way to use it. If it is not possible, what is the currently recommended way to fine-tune such a model?
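If the generic Trainer is not the right tool here, is Seq2SeqTrainer the intended route? For context, here is a rough sketch of what I imagine that would look like (just my guess, not something I have verified):

# hypothetical sketch using the seq2seq-specific Trainer and collator
from transformers import DataCollatorForSeq2Seq, Seq2SeqTrainer, Seq2SeqTrainingArguments

# the EncoderDecoderModel needs these token ids to build decoder inputs from labels
model.config.decoder_start_token_id = tokenizer.cls_token_id
model.config.pad_token_id = tokenizer.pad_token_id

training_args = Seq2SeqTrainingArguments(
    output_dir="training_args",
    evaluation_strategy="epoch",
    predict_with_generate=True,
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_data["train"],
    eval_dataset=tokenized_data["test"],
    data_collator=DataCollatorForSeq2Seq(tokenizer, model=model),
    tokenizer=tokenizer,
)
trainer.train()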

Below is the full error stack trace:

ValueError                                Traceback (most recent call last)
Cell In[7], line 1
----> 1 trainer.train()

File ~/opt/miniconda3/lib/python3.10/site-packages/transformers/trainer.py:1645, in Trainer.train(self, resume_from_checkpoint, trial, ignore_keys_for_eval, **kwargs)
   1640     self.model_wrapped = self.model
   1642 inner_training_loop = find_executable_batch_size(
   1643     self._inner_training_loop, self._train_batch_size, args.auto_find_batch_size
   1644 )
-> 1645 return inner_training_loop(
   1646     args=args,
   1647     resume_from_checkpoint=resume_from_checkpoint,
   1648     trial=trial,
   1649     ignore_keys_for_eval=ignore_keys_for_eval,
   1650 )

File ~/opt/miniconda3/lib/python3.10/site-packages/transformers/trainer.py:1938, in Trainer._inner_training_loop(self, batch_size, args, resume_from_checkpoint, trial, ignore_keys_for_eval)
   1935     self.control = self.callback_handler.on_step_begin(args, self.state, self.control)
   1937 with self.accelerator.accumulate(model):
-> 1938     tr_loss_step = self.training_step(model, inputs)
   1940 if (
   1941     args.logging_nan_inf_filter
   1942     and not is_torch_tpu_available()
   1943     and (torch.isnan(tr_loss_step) or torch.isinf(tr_loss_step))
   1944 ):
   1945     # if loss is nan or inf simply add the average of previous logged losses
   1946     tr_loss += tr_loss / (1 + self.state.global_step - self._globalstep_last_logged)

File ~/opt/miniconda3/lib/python3.10/site-packages/transformers/trainer.py:2759, in Trainer.training_step(self, model, inputs)
   2756     return loss_mb.reduce_mean().detach().to(self.args.device)
   2758 with self.compute_loss_context_manager():
-> 2759     loss = self.compute_loss(model, inputs)
   2761 if self.args.n_gpu > 1:
   2762     loss = loss.mean()  # mean() to average on multi-gpu parallel training

File ~/opt/miniconda3/lib/python3.10/site-packages/transformers/trainer.py:2784, in Trainer.compute_loss(self, model, inputs, return_outputs)
   2782 else:
   2783     labels = None
-> 2784 outputs = model(**inputs)
   2785 # Save past state if it exists
   2786 # TODO: this needs to be fixed and made cleaner later.
   2787 if self.args.past_index >= 0:

File ~/opt/miniconda3/lib/python3.10/site-packages/torch/nn/modules/module.py:1501, in Module._call_impl(self, *args, **kwargs)
   1496 # If we don't have any hooks, we want to skip the rest of the logic in
   1497 # this function, and just call forward.
   1498 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
   1499         or _global_backward_pre_hooks or _global_backward_hooks
   1500         or _global_forward_hooks or _global_forward_pre_hooks):
-> 1501     return forward_call(*args, **kwargs)
   1502 # Do not call functions when jit is used
   1503 full_backward_hooks, non_full_backward_hooks = [], []

File ~/opt/miniconda3/lib/python3.10/site-packages/transformers/models/encoder_decoder/modeling_encoder_decoder.py:625, in EncoderDecoderModel.forward(self, input_ids, attention_mask, decoder_input_ids, decoder_attention_mask, encoder_outputs, past_key_values, inputs_embeds, decoder_inputs_embeds, labels, use_cache, output_attentions, output_hidden_states, return_dict, **kwargs)
    620     decoder_input_ids = shift_tokens_right(
    621         labels, self.config.pad_token_id, self.config.decoder_start_token_id
    622     )
    624 # Decode
--> 625 decoder_outputs = self.decoder(
    626     input_ids=decoder_input_ids,
    627     attention_mask=decoder_attention_mask,
    628     encoder_hidden_states=encoder_hidden_states,
    629     encoder_attention_mask=attention_mask,
    630     inputs_embeds=decoder_inputs_embeds,
    631     output_attentions=output_attentions,
    632     output_hidden_states=output_hidden_states,
    633     use_cache=use_cache,
    634     past_key_values=past_key_values,
    635     return_dict=return_dict,
    636     **kwargs_decoder,
    637 )
    639 # Compute loss independent from decoder (as some shift the logits inside them)
    640 loss = None

File ~/opt/miniconda3/lib/python3.10/site-packages/torch/nn/modules/module.py:1501, in Module._call_impl(self, *args, **kwargs)
   1496 # If we don't have any hooks, we want to skip the rest of the logic in
   1497 # this function, and just call forward.
   1498 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
   1499         or _global_backward_pre_hooks or _global_backward_hooks
   1500         or _global_forward_hooks or _global_forward_pre_hooks):
-> 1501     return forward_call(*args, **kwargs)
   1502 # Do not call functions when jit is used
   1503 full_backward_hooks, non_full_backward_hooks = [], []

File ~/opt/miniconda3/lib/python3.10/site-packages/transformers/models/bert_generation/modeling_bert_generation.py:949, in BertGenerationDecoder.forward(self, input_ids, attention_mask, position_ids, head_mask, inputs_embeds, encoder_hidden_states, encoder_attention_mask, labels, past_key_values, use_cache, output_attentions, output_hidden_states, return_dict)
    946 if labels is not None:
    947     use_cache = False
--> 949 outputs = self.bert(
    950     input_ids,
    951     attention_mask=attention_mask,
    952     position_ids=position_ids,
    953     head_mask=head_mask,
    954     inputs_embeds=inputs_embeds,
    955     encoder_hidden_states=encoder_hidden_states,
    956     encoder_attention_mask=encoder_attention_mask,
    957     past_key_values=past_key_values,
    958     use_cache=use_cache,
    959     output_attentions=output_attentions,
    960     output_hidden_states=output_hidden_states,
    961     return_dict=return_dict,
    962 )
    964 sequence_output = outputs[0]
    965 prediction_scores = self.lm_head(sequence_output)

File ~/opt/miniconda3/lib/python3.10/site-packages/torch/nn/modules/module.py:1501, in Module._call_impl(self, *args, **kwargs)
   1496 # If we don't have any hooks, we want to skip the rest of the logic in
   1497 # this function, and just call forward.
   1498 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
   1499         or _global_backward_pre_hooks or _global_backward_hooks
   1500         or _global_forward_hooks or _global_forward_pre_hooks):
-> 1501     return forward_call(*args, **kwargs)
   1502 # Do not call functions when jit is used
   1503 full_backward_hooks, non_full_backward_hooks = [], []

File ~/opt/miniconda3/lib/python3.10/site-packages/transformers/models/bert_generation/modeling_bert_generation.py:774, in BertGenerationEncoder.forward(self, input_ids, attention_mask, position_ids, head_mask, inputs_embeds, encoder_hidden_states, encoder_attention_mask, past_key_values, use_cache, output_attentions, output_hidden_states, return_dict)
    772     input_shape = inputs_embeds.size()[:-1]
    773 else:
--> 774     raise ValueError("You have to specify either input_ids or inputs_embeds")
    776 batch_size, seq_length = input_shape
    777 device = input_ids.device if input_ids is not None else inputs_embeds.device

ValueError: You have to specify either input_ids or inputs_embeds