I want to fine-tune a VisionEncoderDecoderModel loaded with VisionEncoderDecoderModel.from_pretrained(model_name).
I use a CustomOCRDataset from the LearnOpenCV tutorial.
But default_data_collator fails to stack the inputs because the samples have different shapes, so I decided to use DataCollatorForSeq2Seq and add a Resize augmentation.
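Here is a minimal sketch of where default_data_collator breaks, as I understand it (the two lengths are taken from the RuntimeError quoted at the end of this post):

import torch

# default_data_collator stacks each key across the batch with torch.stack,
# which requires every tensor in the batch to have the same shape.
labels_0 = torch.zeros(1067, dtype=torch.long)
labels_1 = torch.zeros(693, dtype=torch.long)
torch.stack([labels_0, labels_1])  # RuntimeError: stack expects each tensor to be equal size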
With DataCollatorForSeq2Seq I get this error:
ValueError: You should supply an encoding or a list of encodings to this method that includes input_ids, but you provided ['pixel_values']
So I changed __getitem__ to return input_ids, but the error stays the same:
def __getitem__(self, idx):
    file_name = self.df['file_name'][idx]
    text = self.df['text'][idx]
    assert text.strip() != "", f"ERROR: empty text at index {idx}"
    # Read the image, apply augmentations, and get the transformed pixel values.
    image = Image.open(self.root_dir + file_name).convert('RGB')
    image = train_transforms(image)
    pixel_values = self.processor(image, return_tensors='pt').pixel_values
    # Pass the text through the tokenizer to get the tokenized labels.
    labels = self.processor.tokenizer(
        text,
        padding='max_length',
        max_length=self.max_target_length,
        return_tensors='pt'
    ).input_ids.squeeze(0)
    # Replace padding tokens with -100 so the loss ignores them.
    labels = torch.where(labels == self.processor.tokenizer.pad_token_id, torch.tensor(-100), labels)
    encoding = {"pixel_values": pixel_values.squeeze(0),
                "input_ids": labels}
    return encoding
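To sanity-check what each sample contains, I print one item (the shapes in the comment are what I would expect for trocr-base, not verified output):

sample = train_dataset[0]
print({k: tuple(v.shape) for k, v in sample.items()})
# e.g. {'pixel_values': (3, 384, 384), 'input_ids': (max_target_length,)}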
@dataclass(frozen=True)
class TrainingConfig:
    BATCH_SIZE: int = 16
    EPOCHS: int = 5
    LEARNING_RATE: float = 0.00005

@dataclass(frozen=True)
class DatasetConfig:
    DATA_ROOT: str = image_dir

@dataclass(frozen=True)
class ModelConfig:
    MODEL_NAME: str = 'microsoft/trocr-base-handwritten'

# Augmentations.
train_transforms = transforms.Compose([
    transforms.Resize((1024, 880))
])
processor = TrOCRProcessor.from_pretrained(ModelConfig.MODEL_NAME)
model = VisionEncoderDecoderModel.from_pretrained(ModelConfig.MODEL_NAME)
train_dataset = CustomOCRDataset(
    root_dir=os.path.join(DatasetConfig.DATA_ROOT, train_destination),
    df=train_df,
    processor=processor
)
valid_dataset = CustomOCRDataset(
    root_dir=os.path.join(DatasetConfig.DATA_ROOT, test_destination),
    df=test_df,
    processor=processor
)
training_args = Seq2SeqTrainingArguments(
    predict_with_generate=True,
    evaluation_strategy='epoch',
    per_device_train_batch_size=TrainingConfig.BATCH_SIZE,
    per_device_eval_batch_size=TrainingConfig.BATCH_SIZE,
    fp16=True,
    output_dir='seq2seq_model_printed/',
    logging_strategy='epoch',
    save_strategy='epoch',
    save_total_limit=5,
    report_to='tensorboard',
    num_train_epochs=TrainingConfig.EPOCHS
)
data_collator = DataCollatorForSeq2Seq(tokenizer=processor.tokenizer, model=model, padding=True)
# Initialize trainer.
trainer = Seq2SeqTrainer(
    model=model,
    tokenizer=processor.feature_extractor,
    args=training_args,
    compute_metrics=compute_cer,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
    data_collator=data_collator
)
trainer.train()
The full traceback:
File \transformers\data\data_collator.py:599, in DataCollatorForSeq2Seq.__call__(self, features, return_tensors)
596 non_labels_features = [{k: v for k, v in feature.items() if k != label_name} for feature in features]
598 # run through tokenizer without labels to ensure no side effects
--> 599 batch = pad_without_fast_tokenizer_warning(
600 self.tokenizer,
601 non_labels_features,
602 padding=self.padding,
603 max_length=self.max_length,
604 pad_to_multiple_of=self.pad_to_multiple_of,
605 return_tensors=return_tensors,
606 )
608 # we have to pad the labels manually as we cannot rely on `tokenizer.pad` and we need them to be of the same length to return tensors
609 no_padding = self.padding is False or self.padding == PaddingStrategy.DO_NOT_PAD
File \transformers\data\data_collator.py:66, in pad_without_fast_tokenizer_warning(tokenizer, *pad_args, **pad_kwargs)
63 tokenizer.deprecation_warnings["Asking-to-pad-a-fast-tokenizer"] = True
65 try:
---> 66 padded = tokenizer.pad(*pad_args, **pad_kwargs)
67 finally:
68 # Restore the state of the warning.
69 tokenizer.deprecation_warnings["Asking-to-pad-a-fast-tokenizer"] = warning_state
File \transformers\tokenization_utils_base.py:3305, in PreTrainedTokenizerBase.pad(self, encoded_inputs, padding, max_length, pad_to_multiple_of, padding_side, return_attention_mask, return_tensors, verbose)
3303 # The model's main input name, usually `input_ids`, has been passed for padding
3304 if self.model_input_names[0] not in encoded_inputs:
-> 3305 raise ValueError(
3306 "You should supply an encoding or a list of encodings to this method "
3307 f"that includes {self.model_input_names[0]}, but you provided {list(encoded_inputs.keys())}"
3308 )
3310 required_input = encoded_inputs[self.model_input_names[0]]
3312 if required_input is None or (isinstance(required_input, Sized) and len(required_input) == 0):
ValueError: You should supply an encoding or a list of encodings to this method that includes input_ids, but you provided ['pixel_values']
The solutions I have already tried:
→ adding transforms.Resize((1024, 880)) to the augmentations,
→ using a custom data collator (sketched below).
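The custom collator was roughly along these lines (a reconstructed sketch; the function name is mine, and it assumes each sample carries a 'labels' key):

import torch

def custom_ocr_collator(features):
    # Stack the fixed-size pixel_values and pad the variable-length
    # labels with -100 so the cross-entropy loss ignores the padding.
    pixel_values = torch.stack([f["pixel_values"] for f in features])
    labels = [f["labels"] for f in features]
    max_len = max(len(l) for l in labels)
    padded = torch.full((len(features), max_len), -100, dtype=torch.long)
    for i, l in enumerate(labels):
        padded[i, :len(l)] = l
    return {"pixel_values": pixel_values, "labels": padded}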
With the custom collator I got a TypeError: ViTModel.forward() got an unexpected keyword argument 'num_items_in_batch'. That's why I decided to use DataCollatorForSeq2Seq.
But with DataCollatorForSeq2Seq I keep getting the same error over and over: You should supply an encoding or a list of encodings to this method that includes input_ids, but you provided ['pixel_values'].
If I use
data_collator=default_data_collator
in the trainer, I get:
RuntimeError: stack expects each tensor to be equal size, but got [1067] at entry 0 and [693] at entry 1
Thanks in advance!