InvalidArgumentError with vit-base-patch16-224 model?

Hi. I am using a pretrained model based on Google's vit-base-patch16-224-in21k for binary classification of images (human vs. non-human).
I am using the Keras/TensorFlow 2.6.0 API.

Here are the relevant parts of my code.

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from transformers import TFViTModel

# Download the base model
base_model = TFViTModel.from_pretrained('google/vit-base-patch16-224-in21k')

# Flipping and rotating images
data_augmentation = keras.Sequential(
    [layers.RandomFlip("horizontal"), layers.RandomRotation(0.1),]
)
# Freeze base model
base_model.trainable = False
# Create new model
inputs = keras.Input(shape=(3, 224, 224))   # channels-first, as the transformers ViT expects
x = data_augmentation(inputs)   # apply data augmentation

x = base_model(x, training=False)[0]   # [0] = last_hidden_state, shape (batch, 197, 768)
outputs = tf.keras.layers.Dense(1, activation='sigmoid')(x)


# model
model_vit = tf.keras.Model(inputs, outputs)
model_vit.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

model_vit.summary()

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
=================================================================
input_6 (InputLayer)         [(None, 3, 224, 224)]     0         
_________________________________________________________________
sequential_1 (Sequential)    (None, 3, 224, 224)       0         
_________________________________________________________________
tf_vi_t_model (TFViTModel)   TFBaseModelOutputWithPool 86389248  
_________________________________________________________________
dense_2 (Dense)              (None, 197, 1)            769       
=================================================================
Total params: 86,390,017
Trainable params: 769
Non-trainable params: 86,389,248

There are a lot of non-trainable parameters, by the way, since the base model is frozen.

When I run the training I get this error:

# Train the Vit model
vit_trained_model = model_vit.fit( X_train_images, y_train_labels, validation_data=(X_val_images, y_val_labels), batch_size = 8, verbose=2, epochs=50)

scores = model_vit.evaluate(test_images, test_labels_binary, verbose=0)
print("ViT Model Accuracy on Test Set: %.2f%%" % (scores[1]*100))


---------------------------------------------------------------------------
InternalError                             Traceback (most recent call last)
~\AppData\Local\Temp\ipykernel_28616\3601201585.py in <cell line: 2>()
      1 # Train the Vit model
----> 2 vit_trained_model = model_vit.fit( X_train_images, y_train_labels, validation_data=(X_val_images, y_val_labels), batch_size = 8, verbose=2, epochs=50)
      3 
      4 scores = model_vit.evaluate(test_images, test_labels_binary, verbose=0)
      5 print("Xception Accuracy on Test Set: %.2f%%" % (scores[1]*100))
....

C:\ProgramData\Anaconda3\lib\site-packages\tensorflow\python\framework\constant_op.py in convert_to_eager_tensor(value, ctx, dtype)
    104       dtype = dtypes.as_dtype(dtype).as_datatype_enum
    105   ctx.ensure_initialized()
--> 106   return ops.EagerTensor(value, ctx.device_name, dtype)
    107 
    108 

InternalError: Failed copying input tensor from /job:localhost/replica:0/task:0/device:CPU:0 to /job:localhost/replica:0/task:0/device:GPU:0 in order to run _EagerConst: Dst tensor is not initialized.

This error most likely means that my machine is running out of GPU memory, presumably because the whole training set gets copied to the device at once. Fine; another approach is to use a generator and define the batch size manually:

from tensorflow.keras.utils import Sequence
import numpy as np

class DataGenerator(Sequence):
    """Serves the arrays to model.fit() one batch at a time."""
    def __init__(self, x_set, y_set, batch_size):
        self.x, self.y = x_set, y_set
        self.batch_size = batch_size

    def __len__(self):
        # number of batches per epoch
        return int(np.ceil(len(self.x) / float(self.batch_size)))

    def __getitem__(self, idx):
        # slice out one batch of images and labels
        batch_x = self.x[idx * self.batch_size:(idx + 1) * self.batch_size]
        batch_y = self.y[idx * self.batch_size:(idx + 1) * self.batch_size]
        return batch_x, batch_y

train_gen = DataGenerator(X_train_images, y_train_labels, 16)
test_gen = DataGenerator(X_val_images, y_val_labels, 16)

history = model_vit.fit(train_gen,
                    epochs=6,
                    validation_data=test_gen)

This gives an error relating to the input shapes:

Epoch 1/6
---------------------------------------------------------------------------
InvalidArgumentError                      Traceback (most recent call last)
~\AppData\Local\Temp\ipykernel_13140\3904230856.py in <cell line: 21>()
     19 
     20 
---> 21 history = model_vit.fit(train_gen,
     22                     epochs=6,
     23                     validation_data=test_gen)

C:\ProgramData\Anaconda3\lib\site-packages\keras\engine\training.py in fit(self, x, y, batch_size, epochs, verbose, callbacks, validation_split, validation_data, shuffle, class_weight, sample_weight, initial_epoch, steps_per_epoch, validation_steps, validation_batch_size, validation_freq, max_queue_size, workers, use_multiprocessing)
   1182                 _r=1):
   1183               callbacks.on_train_batch_begin(step)
-> 1184               tmp_logs = self.train_function(iterator)
   1185               if data_handler.should_sync:
   1186                 context.async_wait()

C:\ProgramData\Anaconda3\lib\site-packages\tensorflow\python\eager\def_function.py in __call__(self, *args, **kwds)
    883 
    884       with OptionalXlaContext(self._jit_compile):
--> 885         result = self._call(*args, **kwds)
    886 
    887       new_tracing_count = self.experimental_get_tracing_count()

C:\ProgramData\Anaconda3\lib\site-packages\tensorflow\python\eager\def_function.py in _call(self, *args, **kwds)
    948         # Lifting succeeded, so variables are initialized and we can run the
    949         # stateless function.
--> 950         return self._stateless_fn(*args, **kwds)
    951     else:
    952       _, _, _, filtered_flat_args = \

C:\ProgramData\Anaconda3\lib\site-packages\tensorflow\python\eager\function.py in __call__(self, *args, **kwargs)
   3037       (graph_function,
   3038        filtered_flat_args) = self._maybe_define_function(args, kwargs)
-> 3039     return graph_function._call_flat(
   3040         filtered_flat_args, captured_inputs=graph_function.captured_inputs)  # pylint: disable=protected-access
   3041 

C:\ProgramData\Anaconda3\lib\site-packages\tensorflow\python\eager\function.py in _call_flat(self, args, captured_inputs, cancellation_manager)
   1961         and executing_eagerly):
   1962       # No tape is watching; skip to running the function.
-> 1963       return self._build_call_outputs(self._inference_function.call(
   1964           ctx, args, cancellation_manager=cancellation_manager))
   1965     forward_backward = self._select_forward_and_backward_functions(

C:\ProgramData\Anaconda3\lib\site-packages\tensorflow\python\eager\function.py in call(self, ctx, args, cancellation_manager)
    589       with _InterpolateFunctionError(self):
    590         if cancellation_manager is None:
--> 591           outputs = execute.execute(
    592               str(self.signature.name),
    593               num_outputs=self._num_outputs,

C:\ProgramData\Anaconda3\lib\site-packages\tensorflow\python\eager\execute.py in quick_execute(op_name, num_outputs, inputs, attrs, ctx, name)
     57   try:
     58     ctx.ensure_initialized()
---> 59     tensors = pywrap_tfe.TFE_Py_Execute(ctx._handle, device_name, op_name,
     60                                         inputs, attrs, num_outputs)
     61   except core._NotOkStatusException as e:

InvalidArgumentError:  input depth must be evenly divisible by filter depth: 224 vs 3
	 [[node model/tf_vi_t_model/vit/embeddings/patch_embeddings/projection/Conv2D (defined at ProgramData\Anaconda3\lib\site-packages\transformers\models\vit\modeling_tf_vit.py:199) ]] [Op:__inference_train_function_30507]

Errors may have originated from an input operation.
Input Source operations connected to node model/tf_vi_t_model/vit/embeddings/patch_embeddings/projection/Conv2D:
 model/tf_vi_t_model/vit/embeddings/patch_embeddings/transpose (defined at ProgramData\Anaconda3\lib\site-packages\transformers\models\vit\modeling_tf_vit.py:197)

Function call stack:
train_function

Can anyone explain what "input depth must be evenly divisible by filter depth: 224 vs 3" means here, and how to fix it?
The shapes of my training and validation data are as follows:

Train: X_train_images=(3932, 224, 224, 3), y_train_labels=(3932, 1)
Validation: X_val_images=(800, 224, 224, 3), y_val_labels=(800, 1)
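
In case it helps, here is how the model's expected input shape compares with my arrays (a quick check, using the names from my code above):

print(model_vit.input_shape)    # (None, 3, 224, 224) -> channels first
print(X_train_images.shape)     # (3932, 224, 224, 3) -> channels last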

This is my first time experimenting with transfer learning on a ViT model! Thank you very much. Any other advice on my model architecture is welcome too.

P.S. I used this article as a guide to install Hugging Face Transformers through Anaconda and run a model with my own dataset of images: https://www.philschmid.de/image-classification-huggingface-transformers-keras

Hi! I encountered a similar problem. Did you manage to find a solution?

Hey @mbluetail @ellivalla,
did you find the solution?

cc @joaogante @amyeroberts @sayakpaul

Could you amend your code following this one?

@sayakpaul Hey, yes, I can do that.
I noticed 2 differences:

  1. You used TFData2VecVisionModel instead of TFAutoModelForImageClassification.
  2. You added these lines to the code (see the sketch after this list for how they fit together):

# Transpose because the `transformers` model has a leading channel dimension.
dataset = dataset.map(lambda x, y: (tf.transpose(x, [0, 3, 1, 2]), y), AUTO)
return dataset.prefetch(AUTO)
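
If I understand correctly, those lines sit at the end of a dataset-preparation function in your example. A minimal sketch of how they might fit together (the function name prepare_dataset is my assumption, not from your code):

import tensorflow as tf

AUTO = tf.data.AUTOTUNE

def prepare_dataset(dataset, batch_size):
    # `dataset` yields (image, label) pairs in NHWC layout.
    dataset = dataset.batch(batch_size)
    # Transpose because the `transformers` model has a leading channel dimension.
    dataset = dataset.map(lambda x, y: (tf.transpose(x, [0, 3, 1, 2]), y), AUTO)
    return dataset.prefetch(AUTO)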

I accept the first change.
The second change I had some difficulty applying to my own dataset.
I have a dataset that I loaded with this code:

data_dir = "/home/data/train"
datagen_kwargs = dict(dtype='float32', validation_split=.20)
dataflow_kwargs = dict(target_size=(IMAGE_SIZE, IMAGE_SIZE), batch_size=BATCH_SIZE,
                       interpolation="bicubic")

valid_datagen = tf.keras.preprocessing.image.ImageDataGenerator(**datagen_kwargs)
valid_generator = valid_datagen.flow_from_directory(
    data_dir, subset="validation", shuffle=True, **dataflow_kwargs)

do_data_augmentation = True
if do_data_augmentation:
    percent = 0.1
    train_datagen = tf.keras.preprocessing.image.ImageDataGenerator(
        rotation_range=30,
        horizontal_flip=True,
        width_shift_range=percent, height_shift_range=percent,
        shear_range=percent, zoom_range=percent,
        **datagen_kwargs)
else:
    train_datagen = valid_datagen

train_generator = train_datagen.flow_from_directory(
    data_dir, subset="training", shuffle=True, **dataflow_kwargs)

When I pass train_generator and valid_generator to model.fit(), I get this error:

---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
/tmp/ipykernel_2432028/1086025199.py in <module>
----> 1 history = model.fit(train_generator, validation_data=valid_generator, epochs=num_train_epochs)

~/.local/lib/python3.8/site-packages/keras/utils/traceback_utils.py in error_handler(*args, **kwargs)
     65     except Exception as e:  # pylint: disable=broad-except
     66       filtered_tb = _process_traceback_frames(e.__traceback__)
---> 67       raise e.with_traceback(filtered_tb) from None
     68     finally:
     69       del filtered_tb

/usr/local/lib/python3.8/dist-packages/keras_preprocessing/image/iterator.py in __getitem__(self, idx)
     63         index_array = self.index_array[self.batch_size * idx:
     64                                        self.batch_size * (idx + 1)]
---> 65         return self._get_batches_of_transformed_samples(index_array)
     66 
     67     def __len__(self):

/usr/local/lib/python3.8/dist-packages/keras_preprocessing/image/iterator.py in _get_batches_of_transformed_samples(self, index_array)
    220             A batch of transformed samples.
    221         """
--> 222         batch_x = np.zeros((len(index_array),) + self.image_shape, dtype=self.dtype)
    223         # build batch of image data
    224         # self.filepaths is dynamic, is better to call it once outside the loop

TypeError: 'list' object cannot be interpreted as an integer

What can I do?

I would suggest turning your ImageDataGenerator into a tf.data pipeline.
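
As an aside: the failing line in your traceback builds the batch array from self.image_shape, which is derived from target_size. If IMAGE_SIZE is accidentally defined as a list rather than an int, target_size=(IMAGE_SIZE, IMAGE_SIZE) becomes a pair of lists, which produces exactly this TypeError. A hypothetical reproduction (IMAGE_SIZE = [224, 224] is my assumption, since your definition isn't shown):

import numpy as np

IMAGE_SIZE = [224, 224]                        # hypothetical: a list instead of an int
image_shape = (IMAGE_SIZE, IMAGE_SIZE) + (3,)  # what flow_from_directory stores internally
np.zeros((8,) + image_shape)                   # TypeError: 'list' object cannot be interpreted as an integer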

If you have numpy arrays then you can do something like the following:

def pp(x, y):
    # NHWC -> NCHW: the `transformers` model has a leading channel dimension
    x = tf.transpose(x, [0, 3, 1, 2])
    return {"pixel_values": x, "labels": y}


tf_dataset = tf.data.Dataset.from_tensor_slices((x_train, y_train))
tf_dataset = tf_dataset.batch(BATCH_SIZE)
tf_dataset = tf_dataset.map(pp)

The above assumes that x_train is a NumPy array of images and y_train is a NumPy array of the corresponding labels.
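
If the data only exists as an ImageDataGenerator, as in your snippet, one option is to wrap the generator with tf.data.Dataset.from_generator and do the transpose in a map. A sketch, reusing train_generator and IMAGE_SIZE from your code above (it assumes IMAGE_SIZE is an int and the labels are one-hot, the flow_from_directory default):

import tensorflow as tf

AUTO = tf.data.AUTOTUNE

# Wrap the Keras generator in a tf.data pipeline; it already yields batches,
# so the leading dimension is left as None.
train_ds = tf.data.Dataset.from_generator(
    lambda: train_generator,
    output_signature=(
        tf.TensorSpec(shape=(None, IMAGE_SIZE, IMAGE_SIZE, 3), dtype=tf.float32),
        tf.TensorSpec(shape=(None, None), dtype=tf.float32),
    ),
)

# NHWC -> NCHW, since the `transformers` ViT expects channels first.
train_ds = train_ds.map(
    lambda x, y: (tf.transpose(x, [0, 3, 1, 2]), y),
    num_parallel_calls=AUTO,
).prefetch(AUTO)

Since the Keras generator loops forever, pass steps_per_epoch=len(train_generator) to model.fit() when training on this dataset.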

@sayakpaul thank you.
Can you take a look at my code? In my opinion, the fact that my data is private makes this hard to reproduce. model.fit raises an error that is not clear at all.