'ViTImageProcessor' object has no attribute 'pad'

Hi

I'm a student, new to this topic in general, and this is my first time using Hugging Face.
I want to fine-tune ViT on my own dataset. To do so, I'm following this tutorial, but since I don't load my dataset from the datasets library, some things are different.

This is my custom dataset class:

import os
import torch
import numpy as np
from transformers import ViTImageProcessor
from torchvision import transforms
from PIL import Image

model_name_or_path = 'google/vit-base-patch16-224-in21k'
feature_extractor = ViTImageProcessor.from_pretrained(model_name_or_path)

class ViTCustomDataSet(torch.utils.data.Dataset):
    def __init__(self, image_dir, device):
        super().__init__()
        self.image_dir = image_dir
        self.device = device
        # Cache the directory listing once so __len__ and __getitem__ agree.
        self.image_names = sorted(os.listdir(image_dir))

        self.transform = transforms.Compose([
            transforms.ToPILImage(),
            self.extract_features
        ])

    def extract_features(self, image):
        # The processor returns a BatchFeature; keep only the pixel tensor
        # and drop the leading batch dimension so examples stack cleanly.
        return feature_extractor(image, return_tensors='pt')['pixel_values'].squeeze(0)

    def __len__(self):
        return len(self.image_names)

    def __getitem__(self, idx):
        image_name = self.image_names[idx]
        image_path = os.path.join(self.image_dir, image_name)
        image = np.array(Image.open(image_path).convert('RGB'))
        image = self.transform(image)
        label = self.get_label_from_filename(image_name)
        return image.to(self.device), label

    def get_label_from_filename(self, image_name):
        return 1 if image_name.split('_')[-2] == 'M' else 0

    def get_train_and_test_data(self):
        train_size = int(0.8 * len(self))
        test_size = len(self) - train_size

        # random_split already returns Dataset objects the Trainer can use;
        # wrapping them in DataLoaders just to read back .dataset is a no-op.
        train_dataset, test_dataset = torch.utils.data.random_split(
            self, [train_size, test_size])

        return train_dataset, test_dataset
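
A quick way to sanity-check a single item before handing the dataset to the Trainer (a minimal sketch; the expected shape assumes the processor's default 224x224 output):

vitData = ViTCustomDataSet(
    'D:/data/NIST302/images/auxiliary/flat/M/500/plain/png/equal',
    device=torch.device('cpu'))

pixel_values, label = vitData[0]
print(pixel_values.shape)  # expected: torch.Size([3, 224, 224])
print(label)               # 0 or 1, parsed from the filename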

And that’s my main:

from transformers import Trainer, TrainingArguments, ViTForImageClassification

training_args = TrainingArguments(
    output_dir="./vit",
    # per_device_train_batch_size=16,
    evaluation_strategy="steps",
    num_train_epochs=4,
    # fp16=True,
    save_steps=100,
    eval_steps=100,
    logging_steps=10,
    learning_rate=2e-4,
    save_total_limit=2,
    remove_unused_columns=False,
    push_to_hub=False,
    report_to='tensorboard',
    load_best_model_at_end=True,
)

device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

vitData = ViTCustomDataSet(
    'D:/data/NIST302/images/auxiliary/flat/M/500/plain/png/equal', device=device)

train_dataset, test_dataset = vitData.get_train_and_test_data()

num_classes = 2  # binary labels ('M' or not), per get_label_from_filename

model = ViTForImageClassification.from_pretrained(
    model_name_or_path,
    num_labels=num_classes)

feature_extractor = ViTImageProcessor.from_pretrained(
    model_name_or_path, padding=False)

trainer = Trainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    data_collator=None,
    tokenizer=feature_extractor,
)

train_results = trainer.train()
trainer.save_model()
trainer.log_metrics("train", train_results.metrics)
trainer.save_metrics("train", train_results.metrics)
trainer.save_state()

metrics = trainer.evaluate(test_dataset)
trainer.log_metrics("eval", metrics)
trainer.save_metrics("eval", metrics)

(Sorry for the mess; I'm adapting some old code of mine and want to make sure things work before I refactor.)

Anyway, I’m getting the following error:

AttributeError: 'ViTImageProcessor' object has no attribute 'pad'

I don't see how to solve it.

Also, since I'm new, any other tips are welcome!
Thank you


I have the same problem.

Just remove the padding parameter from the call below:

feature_extractor = ViTImageProcessor.from_pretrained(
        model_name_or_path, padding=False)

It should be just:

feature_extractor = ViTImageProcessor.from_pretrained(
        model_name_or_path)

Did you resolve this issue? I’m having the same problem.

Hi,

This can be bypassed by passing the default data collator to the Trainer:

from transformers import default_data_collator

trainer = Trainer(
    ...,
    tokenizer=image_processor,
    data_collator=default_data_collator,
)
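
Note that default_data_collator expects each example to be a dict; for a dataset that returns (pixel_values, label) tuples like the one above, a small custom collator is one option (a minimal sketch under that assumption):

import torch

def collate_fn(batch):
    # Stack per-image tensors and gather labels into the dict format
    # that ViTForImageClassification expects.
    pixel_values = torch.stack([example[0] for example in batch])
    labels = torch.tensor([example[1] for example in batch])
    return {'pixel_values': pixel_values, 'labels': labels}

and pass data_collator=collate_fn to the Trainer instead.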

However, we’re going to fix this issue by adding an image_processor argument to the Trainer so that users no longer need to do this. You can follow the progress here: [Trainer] Allow passing image processor by NielsRogge · Pull Request #29896 · huggingface/transformers · GitHub
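
Once that PR lands, the usage would presumably be along these lines (the argument name is assumed from the PR title; check the merged API before relying on it):

trainer = Trainer(
    ...,
    image_processor=image_processor,
)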