ValueError: You should supply an encoding or a list of encodings to this method that includes input_ids, but you provided ['pixel_values']

I want to fine-tune a VisionEncoderDecoderModel.from_pretrained(model_name).
I use the CustomOCRDataset from the LearnOpenCV tutorial.
But default_data_collator fails to stack the inputs because the samples have different shapes, so I decided to use DataCollatorForSeq2Seq and add a Resize to the augmentations.

I get this error:

ValueError: You should supply an encoding or a list of encodings to this method that includes input_ids, but you provided ['pixel_values']

So I changed __getitem__ to return input_ids, but the error is still the same. Here is my __getitem__:

def __getitem__(self, idx):
    file_name = self.df['file_name'][idx]
    text = self.df['text'][idx]

    assert text.strip() != "", f"ERROR Empty text in {idx}"

    # Read the image, apply augmentations, and get the transformed pixels.
    image = Image.open(self.root_dir + file_name).convert('RGB')
    image = train_transforms(image)
    pixel_values = self.processor(image, return_tensors='pt').pixel_values

    # Pass the text through the tokenizer to get the tokenized labels.
    labels = self.processor.tokenizer(
        text,
        padding='max_length',
        max_length=self.max_target_length,
        return_tensors='pt'
    ).input_ids.squeeze(0)

    # Replace padding token ids with -100 so they are ignored by the loss.
    labels = torch.where(labels == self.processor.tokenizer.pad_token_id,
                         torch.tensor(-100), labels)

    encoding = {"pixel_values": pixel_values.squeeze(0),
                "input_ids": labels}
    return encoding
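
From the traceback (the full trace is at the end of this post), my understanding is that DataCollatorForSeq2Seq first splits off the "labels" key and then runs the remaining features through tokenizer.pad(), which insists on finding input_ids among the keys. A simplified picture with toy data:

# Simplified from DataCollatorForSeq2Seq.__call__ (see the traceback below):
# the label column is split off, and everything else goes to tokenizer.pad().
label_name = "labels"
features = [{"pixel_values": [0.0, 0.0], "input_ids": [101, 102]}]  # toy sample
non_labels = [{k: v for k, v in f.items() if k != label_name} for f in features]
print(non_labels)  # input_ids survives the split here, yet in my run pad() only sees pixel_values

So I don't understand where input_ids gets lost. The rest of my setup: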
@dataclass(frozen=True)
class TrainingConfig:
    BATCH_SIZE:    int = 16
    EPOCHS:        int = 5
    LEARNING_RATE: float = 0.00005

@dataclass(frozen=True)
class DatasetConfig:
    DATA_ROOT:     str = image_dir

@dataclass(frozen=True)
class ModelConfig:
    MODEL_NAME: str = 'microsoft/trocr-base-handwritten'
# Augmentations.
train_transforms = transforms.Compose([
    transforms.Resize((1024, 880))
])
processor = TrOCRProcessor.from_pretrained(ModelConfig.MODEL_NAME)
train_dataset = CustomOCRDataset(
    root_dir=os.path.join(DatasetConfig.DATA_ROOT, train_destination),
    df=train_df,
    processor=processor
)
valid_dataset = CustomOCRDataset(
    root_dir=os.path.join(DatasetConfig.DATA_ROOT, test_destination),
    df=test_df,
    processor=processor
)
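
For what it's worth, a single-sample check along these lines (assuming model is the already-loaded VisionEncoderDecoderModel, whose forward() takes pixel_values and labels rather than input_ids) is how I'd verify the tensors themselves:

# Single-sample sanity check (assumes `model` is the loaded
# VisionEncoderDecoderModel): forward() accepts pixel_values and labels.
sample = train_dataset[0]
out = model(
    pixel_values=sample["pixel_values"].unsqueeze(0),  # add batch dimension
    labels=sample["input_ids"].unsqueeze(0),
)
print(out.loss)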
training_args = Seq2SeqTrainingArguments(
    predict_with_generate=True,
    evaluation_strategy='epoch',
    per_device_train_batch_size=TrainingConfig.BATCH_SIZE,
    per_device_eval_batch_size=TrainingConfig.BATCH_SIZE,
    fp16=True,
    output_dir='seq2seq_model_printed/',
    logging_strategy='epoch',
    save_strategy='epoch',
    save_total_limit=5,
    report_to='tensorboard',
    num_train_epochs=TrainingConfig.EPOCHS
)
data_collator = DataCollatorForSeq2Seq(tokenizer=processor.tokenizer, model=model, padding=True)
# Initialize trainer.
trainer = Seq2SeqTrainer(
    model=model,
    tokenizer=processor.feature_extractor,
    args=training_args,
    compute_metrics=compute_cer,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
    data_collator=data_collator
)
trainer.train()
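
(compute_cer is the usual CER metric function from the tutorial; roughly this, using the evaluate library's cer metric:)

import evaluate
cer_metric = evaluate.load("cer")

def compute_cer(pred):
    # Decode predictions and labels, restoring pad tokens where -100 was used.
    labels_ids = pred.label_ids
    pred_ids = pred.predictions
    pred_str = processor.batch_decode(pred_ids, skip_special_tokens=True)
    labels_ids[labels_ids == -100] = processor.tokenizer.pad_token_id
    label_str = processor.batch_decode(labels_ids, skip_special_tokens=True)
    cer = cer_metric.compute(predictions=pred_str, references=label_str)
    return {"cer": cer}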

The full error:

File \transformers\data\data_collator.py:599, in DataCollatorForSeq2Seq.__call__(self, features, return_tensors)
    596 non_labels_features = [{k: v for k, v in feature.items() if k != label_name} for feature in features]
    598 # run through tokenizer without labels to ensure no side effects
--> 599 batch = pad_without_fast_tokenizer_warning(
    600     self.tokenizer,
    601     non_labels_features,
    602     padding=self.padding,
    603     max_length=self.max_length,
    604     pad_to_multiple_of=self.pad_to_multiple_of,
    605     return_tensors=return_tensors,
    606 )
    608 # we have to pad the labels manually as we cannot rely on `tokenizer.pad` and we need them to be of the same length to return tensors
    609 no_padding = self.padding is False or self.padding == PaddingStrategy.DO_NOT_PAD

File \transformers\data\data_collator.py:66, in pad_without_fast_tokenizer_warning(tokenizer, *pad_args, **pad_kwargs)
     63 tokenizer.deprecation_warnings["Asking-to-pad-a-fast-tokenizer"] = True
     65 try:
---> 66     padded = tokenizer.pad(*pad_args, **pad_kwargs)
     67 finally:
     68     # Restore the state of the warning.
     69     tokenizer.deprecation_warnings["Asking-to-pad-a-fast-tokenizer"] = warning_state

File \transformers\tokenization_utils_base.py:3305, in PreTrainedTokenizerBase.pad(self, encoded_inputs, padding, max_length, pad_to_multiple_of, padding_side, return_attention_mask, return_tensors, verbose)
   3303 # The model's main input name, usually `input_ids`, has been passed for padding
   3304 if self.model_input_names[0] not in encoded_inputs:
-> 3305     raise ValueError(
   3306         "You should supply an encoding or a list of encodings to this method "
   3307         f"that includes {self.model_input_names[0]}, but you provided {list(encoded_inputs.keys())}"
   3308     )
   3310 required_input = encoded_inputs[self.model_input_names[0]]
   3312 if required_input is None or (isinstance(required_input, Sized) and len(required_input) == 0):

ValueError: You should supply an encoding or a list of encodings to this method that includes input_ids, but you provided ['pixel_values']

The solutions I've already tried:
→ transforms.Resize((1024, 880)), as shown above;
→ a custom data collator, sketched below. With it I got TypeError: ViTModel.forward() got an unexpected keyword argument 'num_items_in_batch', which is why I switched to DataCollatorForSeq2Seq.
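
The custom collator was along these lines (a rough sketch, not my exact code):

import torch

def custom_collator(features):
    # Stack the images (all the same size after Resize) and pad the
    # label sequences to the longest in the batch with -100 so the
    # padding is ignored by the loss.
    pixel_values = torch.stack([f["pixel_values"] for f in features])
    labels = [f["input_ids"] for f in features]
    max_len = max(len(l) for l in labels)
    padded = torch.full((len(labels), max_len), -100, dtype=torch.long)
    for i, l in enumerate(labels):
        padded[i, :len(l)] = l
    return {"pixel_values": pixel_values, "labels": padded}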

But I keep getting the same error over and over: You should supply an encoding or a list of encodings to this method that includes input_ids, but you provided ['pixel_values'].
If I use data_collator=default_data_collator in the trainer, I get:

RuntimeError: stack expects each tensor to be equal size, but got [1067] at entry 0 and [693] at entry 1
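
The way I read that failure, torch.stack() is being asked to batch 1-D label tensors of different lengths. A toy reproduction with the sizes from the error message:

import torch

# torch.stack() cannot batch 1-D tensors of unequal lengths, which is
# exactly what default_data_collator tries to do with my samples.
a = torch.zeros(1067, dtype=torch.long)
b = torch.zeros(693, dtype=torch.long)
torch.stack([a, b])  # RuntimeError: stack expects each tensor to be equal size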
Thanks in advance!
