Finetuning CLIP model raises IndexError: index out of range in self

I’m trying to finetune the CLIP model on my private dataset. I’ve written a custom Dataset class for it, and the data loading works well, but when I try to train the model with Trainer, it raises `IndexError: index out of range in self`.

## Custom Dataset

```
import albumentations as A
import pandas as pd
import torch
from albumentations.pytorch import ToTensorV2
from pathlib import Path
from PIL import Image
from torch.utils.data import Dataset


class ClassifierDataset(Dataset):
    def __init__(self, processor, folds: int, mode="train"):
        super(ClassifierDataset, self).__init__()

        self.mode = mode
        self.fold = folds
        self.processor = processor
        self.max_target_length = 32

        self.train_transform = A.Compose(
            [
                A.CLAHE(),
                A.RandomRotate90(),
                A.Transpose(),
                A.ShiftScaleRotate(
                    shift_limit=0.0625, scale_limit=0.50, rotate_limit=45, p=0.75
                ),
                A.Blur(blur_limit=3),
                A.OpticalDistortion(),
                A.GridDistortion(),
                A.HueSaturationValue(),
                A.VerticalFlip(),
                A.HorizontalFlip(),
                A.Normalize(),
                ToTensorV2(),
            ]
        )

        self.val_transform = A.Compose(
            [
                A.Normalize(),
                ToTensorV2(),
            ]
        )

        # fold-based train/val split
        self.df = pd.read_csv(str(Path.cwd() / "data_new.csv"))
        self.train_df = self.df[self.df.folds != self.fold].reset_index(drop=True)
        self.val_df = self.df[self.df.folds == self.fold].reset_index(drop=True)
        self.dff = self.train_df if self.mode == "train" else self.val_df

    def __len__(self):
        return len(self.dff)

    def __getitem__(self, index):
        img_name = self.dff.loc[index, "paths"]
        text = self.dff.loc[index, "corrected_prompts"]

        img = Image.open(img_name).convert("RGB")
        pixel_values = self.processor.feature_extractor(img, return_tensors="pt").pixel_values
        labels = self.processor.tokenizer(
            text,
            padding="max_length",
            max_length=77,
            truncation=True,
        ).input_ids
        # replace padding token ids with -100
        labels = [label if label != self.processor.tokenizer.pad_token_id else -100 for label in labels]
        return {"input_ids": torch.tensor(labels), "pixel_values": pixel_values.squeeze()}
```
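
The Trainer section below also uses `processor`, `model`, `train_data`, and `val_data`. I haven’t pasted that part of my notebook, but it is roughly the following (the checkpoint name and fold index are placeholders for my actual values):

```
from transformers import CLIPModel, CLIPProcessor

# Placeholders for my actual checkpoint / fold, shown only so the snippets are self-contained.
checkpoint = "openai/clip-vit-base-patch32"
processor = CLIPProcessor.from_pretrained(checkpoint)
model = CLIPModel.from_pretrained(checkpoint)

train_data = ClassifierDataset(processor, folds=0, mode="train")
val_data = ClassifierDataset(processor, folds=0, mode="val")
```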

## Trainer 

```
from transformers import DefaultDataCollator, Trainer, TrainingArguments

default_data_collator = DefaultDataCollator()

training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    evaluation_strategy="epoch",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=val_data,
    data_collator=default_data_collator,
)

trainer.train()
```
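
Pulling a sample straight from the dataset works without errors, which is why I think the data loading itself is fine. A rough check along these lines (shapes shown as examples only):

```
# Illustrative sanity check for the dataset; this part runs fine for me.
sample = train_data[0]
print(sample["input_ids"].shape)     # e.g. torch.Size([77])
print(sample["pixel_values"].shape)  # e.g. torch.Size([3, 224, 224])
```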


## The Error Stack Trace

```
in <cell line: 22>:22
    19       data_collator=default_data_collator
    20   )
    21
❱   22   trainer.train()
    23

/opt/conda/envs/blip/lib/python3.8/site-packages/transformers/trainer.py:1543 in train
  1540         inner_training_loop = find_executable_batch_size(
  1541             self._inner_training_loop, self._train_batch_size, args.auto_find_batch_size
  1542         )
❱ 1543         return inner_training_loop(
  1544             args=args,
  1545             resume_from_checkpoint=resume_from_checkpoint,
  1546             trial=trial,

/opt/conda/envs/blip/lib/python3.8/site-packages/transformers/trainer.py:1791 in _inner_training_loop
  1788                     with model.no_sync():
  1789                         tr_loss_step = self.training_step(model, inputs)
  1790                 else:
❱ 1791                     tr_loss_step = self.training_step(model, inputs)
  1792
  1793                 if (
  1794                     args.logging_nan_inf_filter

/opt/conda/envs/blip/lib/python3.8/site-packages/transformers/trainer.py:2539 in training_step
  2536             return loss_mb.reduce_mean().detach().to(self.args.device)
  2537
  2538         with self.compute_loss_context_manager():
❱ 2539             loss = self.compute_loss(model, inputs)
  2540
  2541         if self.args.n_gpu > 1:
  2542             loss = loss.mean()  # mean() to average on multi-gpu parallel training

/opt/conda/envs/blip/lib/python3.8/site-packages/transformers/trainer.py:2571 in compute_loss
  2568             labels = inputs.pop("labels")
  2569         else:
  2570             labels = None
❱ 2571         outputs = model(**inputs)
  2572         # Save past state if it exists
  2573         # TODO: this needs to be fixed and made cleaner later.
  2574         if self.args.past_index >= 0:

/opt/conda/envs/blip/lib/python3.8/site-packages/torch/nn/modules/module.py:1110 in _call_impl
  1107         # this function, and just call forward.
  1108         if not (self._backward_hooks or self._forward_hooks or self._forward_pre_hooks or _global_backward_hooks
  1109                 or _global_forward_hooks or _global_forward_pre_hooks):
❱ 1110             return forward_call(*input, **kwargs)
  1111         # Do not call functions when jit is used
  1112         full_backward_hooks, non_full_backward_hooks = [], []
  1113         if self._backward_hooks or _global_backward_hooks:

/opt/conda/envs/blip/lib/python3.8/site-packages/transformers/models/clip/modeling_clip.py:1125 in forward
  1122             return_dict=return_dict,
  1123         )
  1124
❱ 1125         text_outputs = self.text_model(
  1126             input_ids=input_ids,
  1127             attention_mask=attention_mask,
  1128             position_ids=position_ids,

/opt/conda/envs/blip/lib/python3.8/site-packages/torch/nn/modules/module.py:1110 in _call_impl
❱ 1110             return forward_call(*input, **kwargs)

/opt/conda/envs/blip/lib/python3.8/site-packages/transformers/models/clip/modeling_clip.py:712 in forward
   709         input_shape = input_ids.size()
   710         input_ids = input_ids.view(-1, input_shape[-1])
   711
❱  712         hidden_states = self.embeddings(input_ids=input_ids, position_ids=position_ids)
   713
   714         bsz, seq_len = input_shape
   715         # CLIP's text model uses causal mask, prepare it here.

/opt/conda/envs/blip/lib/python3.8/site-packages/torch/nn/modules/module.py:1110 in _call_impl
❱ 1110             return forward_call(*input, **kwargs)

/opt/conda/envs/blip/lib/python3.8/site-packages/transformers/models/clip/modeling_clip.py:227 in forward
   224             position_ids = self.position_ids[:, :seq_length]
   225
   226         if inputs_embeds is None:
❱  227             inputs_embeds = self.token_embedding(input_ids)
   228
   229         position_embeddings = self.position_embedding(position_ids)
   230         embeddings = inputs_embeds + position_embeddings

/opt/conda/envs/blip/lib/python3.8/site-packages/torch/nn/modules/module.py:1110 in _call_impl
❱ 1110             return forward_call(*input, **kwargs)

/opt/conda/envs/blip/lib/python3.8/site-packages/torch/nn/modules/sparse.py:158 in forward
   155                 self.weight[self.padding_idx].fill_(0)
   156
   157     def forward(self, input: Tensor) -> Tensor:
❱  158         return F.embedding(
   159             input, self.weight, self.padding_idx, self.max_norm,
   160             self.norm_type, self.scale_grad_by_freq, self.sparse)
   161

/opt/conda/envs/blip/lib/python3.8/site-packages/torch/nn/functional.py:2183 in embedding
  2180         #     torch.embedding_renorm_
  2181         # remove once script supports set_grad_enabled
  2182         _no_grad_embedding_renorm_(weight, input, max_norm, norm_type)
❱ 2183     return torch.embedding(weight, input, padding_idx, scale_grad_by_freq, sparse)
  2184
  2185
  2186 def embedding_bag(

IndexError: index out of range in self
```