Hi everyone,
My first post here, as I’m very new to the HuggingFace library. Looking forward to joining the community!
I’m trying to follow the tutorial here but modifying it to use a sequence-to-sequence paradigm rather than sequence classification. My code is below:
# imports
import numpy as np
import evaluate
from datasets import load_dataset
from transformers import (
    BertTokenizer,
    BertGenerationEncoder,
    BertGenerationDecoder,
    EncoderDecoderModel,
    Trainer,
    TrainingArguments,
)

# load data from files
data_files = {"train": "data/training_data.json", "test": "data/test_data.json"}
data = load_dataset("json", data_files=data_files)

# define tokenizer and map data to tokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

def tokenize_function(examples):
    return tokenizer(examples["output"], examples["input"], padding="max_length")

tokenized_data = data.map(tokenize_function, batched=True)

# define evaluation function
metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

# define training args and model
training_args = TrainingArguments(output_dir="training_args", evaluation_strategy="epoch", use_mps_device=True)
encoder = BertGenerationEncoder.from_pretrained("bert-base-uncased")
decoder = BertGenerationDecoder.from_pretrained("bert-base-uncased")
model = EncoderDecoderModel(encoder=encoder, decoder=decoder)

# define trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_data["train"],
    eval_dataset=tokenized_data["test"],
    compute_metrics=compute_metrics,
)

# call train loop
trainer.train()
At the trainer.train() step, I get the error ValueError: You have to specify either input_ids or inputs_embeds. I see here that this issue has already been reported and likely has to do with the encoder/decoder framework requiring the input_ids variable for both the encoder and decoder components.
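If I understand the EncoderDecoderModel docs correctly, the forward pass wants token ids for the encoder input plus the target token ids under labels, which it then shifts internally into decoder_input_ids. Here is a minimal sketch of that outside the Trainer; the add_cross_attention/is_decoder flags and the decoder_start_token_id/pad_token_id assignments are my assumptions based on the docs, not something from the tutorial:

from transformers import (
    BertTokenizer,
    BertGenerationEncoder,
    BertGenerationDecoder,
    EncoderDecoderModel,
)

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
encoder = BertGenerationEncoder.from_pretrained("bert-base-uncased")
# assumption: the BertGeneration docs load the decoder with these two flags set
decoder = BertGenerationDecoder.from_pretrained(
    "bert-base-uncased", add_cross_attention=True, is_decoder=True
)
model = EncoderDecoderModel(encoder=encoder, decoder=decoder)

# assumption: without these, the internal shift_tokens_right() call cannot
# build decoder_input_ids from the labels
model.config.decoder_start_token_id = tokenizer.cls_token_id
model.config.pad_token_id = tokenizer.pad_token_id

source = tokenizer("an example source sentence", return_tensors="pt")
target = tokenizer("an example target sentence", return_tensors="pt")

outputs = model(
    input_ids=source.input_ids,            # consumed by the encoder
    attention_mask=source.attention_mask,
    labels=target.input_ids,               # shifted into decoder_input_ids
)
print(outputs.loss)

If that reading is right, my tokenize_function above never produces a labels column, which would explain why the decoder ends up with neither input_ids nor inputs_embeds.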
What I’m not sure about is whether this means I cannot use the Trainer convenience class (trainer.py) to fine-tune the model for a sequence-to-sequence task, or whether there is still a way to use it. If it is not possible, what is the currently recommended way to fine-tune such a model?
Below is the full error stack trace:
ValueError Traceback (most recent call last)
Cell In[7], line 1
----> 1 trainer.train()
File ~/opt/miniconda3/lib/python3.10/site-packages/transformers/trainer.py:1645, in Trainer.train(self, resume_from_checkpoint, trial, ignore_keys_for_eval, **kwargs)
1640 self.model_wrapped = self.model
1642 inner_training_loop = find_executable_batch_size(
1643 self._inner_training_loop, self._train_batch_size, args.auto_find_batch_size
1644 )
-> 1645 return inner_training_loop(
1646 args=args,
1647 resume_from_checkpoint=resume_from_checkpoint,
1648 trial=trial,
1649 ignore_keys_for_eval=ignore_keys_for_eval,
1650 )
File ~/opt/miniconda3/lib/python3.10/site-packages/transformers/trainer.py:1938, in Trainer._inner_training_loop(self, batch_size, args, resume_from_checkpoint, trial, ignore_keys_for_eval)
1935 self.control = self.callback_handler.on_step_begin(args, self.state, self.control)
1937 with self.accelerator.accumulate(model):
-> 1938 tr_loss_step = self.training_step(model, inputs)
1940 if (
1941 args.logging_nan_inf_filter
1942 and not is_torch_tpu_available()
1943 and (torch.isnan(tr_loss_step) or torch.isinf(tr_loss_step))
1944 ):
1945 # if loss is nan or inf simply add the average of previous logged losses
1946 tr_loss += tr_loss / (1 + self.state.global_step - self._globalstep_last_logged)
File ~/opt/miniconda3/lib/python3.10/site-packages/transformers/trainer.py:2759, in Trainer.training_step(self, model, inputs)
2756 return loss_mb.reduce_mean().detach().to(self.args.device)
2758 with self.compute_loss_context_manager():
-> 2759 loss = self.compute_loss(model, inputs)
2761 if self.args.n_gpu > 1:
2762 loss = loss.mean() # mean() to average on multi-gpu parallel training
File ~/opt/miniconda3/lib/python3.10/site-packages/transformers/trainer.py:2784, in Trainer.compute_loss(self, model, inputs, return_outputs)
2782 else:
2783 labels = None
-> 2784 outputs = model(**inputs)
2785 # Save past state if it exists
2786 # TODO: this needs to be fixed and made cleaner later.
2787 if self.args.past_index >= 0:
File ~/opt/miniconda3/lib/python3.10/site-packages/torch/nn/modules/module.py:1501, in Module._call_impl(self, *args, **kwargs)
1496 # If we don't have any hooks, we want to skip the rest of the logic in
1497 # this function, and just call forward.
1498 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
1499 or _global_backward_pre_hooks or _global_backward_hooks
1500 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1501 return forward_call(*args, **kwargs)
1502 # Do not call functions when jit is used
1503 full_backward_hooks, non_full_backward_hooks = [], []
File ~/opt/miniconda3/lib/python3.10/site-packages/transformers/models/encoder_decoder/modeling_encoder_decoder.py:625, in EncoderDecoderModel.forward(self, input_ids, attention_mask, decoder_input_ids, decoder_attention_mask, encoder_outputs, past_key_values, inputs_embeds, decoder_inputs_embeds, labels, use_cache, output_attentions, output_hidden_states, return_dict, **kwargs)
620 decoder_input_ids = shift_tokens_right(
621 labels, self.config.pad_token_id, self.config.decoder_start_token_id
622 )
624 # Decode
--> 625 decoder_outputs = self.decoder(
626 input_ids=decoder_input_ids,
627 attention_mask=decoder_attention_mask,
628 encoder_hidden_states=encoder_hidden_states,
629 encoder_attention_mask=attention_mask,
630 inputs_embeds=decoder_inputs_embeds,
631 output_attentions=output_attentions,
632 output_hidden_states=output_hidden_states,
633 use_cache=use_cache,
634 past_key_values=past_key_values,
635 return_dict=return_dict,
636 **kwargs_decoder,
637 )
639 # Compute loss independent from decoder (as some shift the logits inside them)
640 loss = None
File ~/opt/miniconda3/lib/python3.10/site-packages/torch/nn/modules/module.py:1501, in Module._call_impl(self, *args, **kwargs)
1496 # If we don't have any hooks, we want to skip the rest of the logic in
1497 # this function, and just call forward.
1498 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
1499 or _global_backward_pre_hooks or _global_backward_hooks
1500 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1501 return forward_call(*args, **kwargs)
1502 # Do not call functions when jit is used
1503 full_backward_hooks, non_full_backward_hooks = [], []
File ~/opt/miniconda3/lib/python3.10/site-packages/transformers/models/bert_generation/modeling_bert_generation.py:949, in BertGenerationDecoder.forward(self, input_ids, attention_mask, position_ids, head_mask, inputs_embeds, encoder_hidden_states, encoder_attention_mask, labels, past_key_values, use_cache, output_attentions, output_hidden_states, return_dict)
946 if labels is not None:
947 use_cache = False
--> 949 outputs = self.bert(
950 input_ids,
951 attention_mask=attention_mask,
952 position_ids=position_ids,
953 head_mask=head_mask,
954 inputs_embeds=inputs_embeds,
955 encoder_hidden_states=encoder_hidden_states,
956 encoder_attention_mask=encoder_attention_mask,
957 past_key_values=past_key_values,
958 use_cache=use_cache,
959 output_attentions=output_attentions,
960 output_hidden_states=output_hidden_states,
961 return_dict=return_dict,
962 )
964 sequence_output = outputs[0]
965 prediction_scores = self.lm_head(sequence_output)
File ~/opt/miniconda3/lib/python3.10/site-packages/torch/nn/modules/module.py:1501, in Module._call_impl(self, *args, **kwargs)
1496 # If we don't have any hooks, we want to skip the rest of the logic in
1497 # this function, and just call forward.
1498 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
1499 or _global_backward_pre_hooks or _global_backward_hooks
1500 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1501 return forward_call(*args, **kwargs)
1502 # Do not call functions when jit is used
1503 full_backward_hooks, non_full_backward_hooks = [], []
File ~/opt/miniconda3/lib/python3.10/site-packages/transformers/models/bert_generation/modeling_bert_generation.py:774, in BertGenerationEncoder.forward(self, input_ids, attention_mask, position_ids, head_mask, inputs_embeds, encoder_hidden_states, encoder_attention_mask, past_key_values, use_cache, output_attentions, output_hidden_states, return_dict)
772 input_shape = inputs_embeds.size()[:-1]
773 else:
--> 774 raise ValueError("You have to specify either input_ids or inputs_embeds")
776 batch_size, seq_length = input_shape
777 device = input_ids.device if input_ids is not None else inputs_embeds.device
ValueError: You have to specify either input_ids or inputs_embeds
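For completeness, here is the direction I was considering for the preprocessing step, in case it clarifies what I'm after. It reuses the tokenizer and data objects from the snippet above; the separate labels key and the column handling are my own guesses, and I haven't verified that this resolves the error:

# unverified sketch: tokenize the source text as the encoder input and the
# target text as labels (reuses tokenizer and data defined earlier)
def tokenize_function(examples):
    model_inputs = tokenizer(examples["input"], padding="max_length", truncation=True)
    targets = tokenizer(examples["output"], padding="max_length", truncation=True)
    # padded label positions should probably become -100 so the loss ignores
    # them; DataCollatorForSeq2Seq can do that if the labels are left unpadded
    model_inputs["labels"] = targets["input_ids"]
    return model_inputs

tokenized_data = data.map(tokenize_function, batched=True)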