Sentence pair classification with BertForSequenceClassification causes IndexError: index out of range in self

Hi,
I'm trying to build a sentence pair classifier with a BERT-based model (pre-trained for Hebrew):

import torch
from transformers import BertForSequenceClassification, BertTokenizerFast

cls_model = BertForSequenceClassification.from_pretrained(
    'onlplab/alephbert-base',
    num_labels=2,
    output_attentions=False,
    output_hidden_states=False,
    return_dict=False
)

I use this tokenizer:

alephbert_tokenizer = BertTokenizerFast.from_pretrained('onlplab/alephbert-base')

As a small running example, I run it on two lists of sentences with a corresponding label for every sentence pair, but I get this error:

IndexError: index out of range in self

I checked that the tokenizer's vocab_size is the same as the model's, and that the input pairs fit within max_length = 512 (these checks are sketched below).
Unfortunately, I am stuck. Any help would be highly appreciated.
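
For reference, the checks look roughly like this (just a sketch, using the model and tokenizer defined above):

print(len(alephbert_tokenizer))                  # tokenizer vocabulary size
print(cls_model.config.vocab_size)               # model vocabulary size (rows in the word embedding)
print(cls_model.config.max_position_embeddings)  # should be >= max_length (512)
print(alephbert_tokenizer.model_max_length)      # tokenizer's declared maximum length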

The code I'm running:

sentence_a = ['שני דגים שחו בים', 'שניהם צללו לים', 'כמה נשארו?', 'שלושה דגים שחו בנהר', 'אחד צלל לים']
sentence_b = ['אחת שתיים שלוש', 'בקול גדול גדול', 'רק ישבו ופטפטו אחת שתיים שלוש', 'מאחורי ההר ישבו שלושה גמדים', 'לא אכלו ולא שתו אחת שתיים שלוש']
labels = torch.tensor([1, 0, 1, 1, 0])

# tokenize_pairs
tokens = alephbert_tokenizer(sentence_a, sentence_b,
                             padding='max_length',
                             max_length=512,
                             truncation=True,
                             return_tensors='pt')

input_ids = tokens['input_ids']
token_type_ids = tokens['token_type_ids']
attention_mask = tokens['attention_mask']

cls_model.train()
cls_model.zero_grad()

# Forward pass.
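# With labels passed and return_dict=False, the model returns a tuple whose first element is the loss and whose second is the logits.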
outputs, logits = cls_model(input_ids=input_ids, #.squeeze(), 
                            attention_mask=attention_mask, #.squeeze(), 
                            token_type_ids=token_type_ids, #.squeeze(), 
                            labels=labels, #.squeeze(),
                            return_dict=False
                           )
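
In case it helps, these are the kinds of prints I can add just before the forward pass (again only a sketch; the attributes come from the model's config):

# Every id fed to an embedding layer must be smaller than that embedding's size.
print(input_ids.shape, token_type_ids.shape, attention_mask.shape)
print(input_ids.max().item(), cls_model.config.vocab_size)            # word embeddings
print(token_type_ids.max().item(), cls_model.config.type_vocab_size)  # token type embeddings
print(input_ids.shape[1], cls_model.config.max_position_embeddings)   # position embeddings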

Here is the full traceback:

IndexError                                Traceback (most recent call last)
Input In [137], in <cell line: 3>()
      1 cls_model.zero_grad()
      2 # Forward pass.
----> 3 outputs, logits = cls_model(input_ids=input_ids.squeeze(), 
      4                             attention_mask=attention_mask.squeeze(), 
      5                             token_type_ids=token_type_ids.squeeze(), 
      6                             labels=labels.squeeze(),
      7                             return_dict=False
      8                            )

File C:\Users\BUDBUDIO\Anaconda3\lib\site-packages\torch\nn\modules\module.py:889, in Module._call_impl(self, *input, **kwargs)
    887     result = self._slow_forward(*input, **kwargs)
    888 else:
--> 889     result = self.forward(*input, **kwargs)
    890 for hook in itertools.chain(
    891         _global_forward_hooks.values(),
    892         self._forward_hooks.values()):
    893     hook_result = hook(self, input, result)

File C:\Users\BUDBUDIO\Anaconda3\lib\site-packages\transformers\models\bert\modeling_bert.py:1554, in BertForSequenceClassification.forward(self, input_ids, attention_mask, token_type_ids, position_ids, head_mask, inputs_embeds, labels, output_attentions, output_hidden_states, return_dict)
   1546 r"""
   1547 labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
   1548     Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
   1549     config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
   1550     `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
   1551 """
   1552 return_dict = return_dict if return_dict is not None else self.config.use_return_dict
-> 1554 outputs = self.bert(
   1555     input_ids,
   1556     attention_mask=attention_mask,
   1557     token_type_ids=token_type_ids,
   1558     position_ids=position_ids,
   1559     head_mask=head_mask,
   1560     inputs_embeds=inputs_embeds,
   1561     output_attentions=output_attentions,
   1562     output_hidden_states=output_hidden_states,
   1563     return_dict=return_dict,
   1564 )
   1566 pooled_output = outputs[1]
   1568 pooled_output = self.dropout(pooled_output)

File C:\Users\BUDBUDIO\Anaconda3\lib\site-packages\torch\nn\modules\module.py:889, in Module._call_impl(self, *input, **kwargs)
    887     result = self._slow_forward(*input, **kwargs)
    888 else:
--> 889     result = self.forward(*input, **kwargs)
    890 for hook in itertools.chain(
    891         _global_forward_hooks.values(),
    892         self._forward_hooks.values()):
    893     hook_result = hook(self, input, result)

File C:\Users\BUDBUDIO\Anaconda3\lib\site-packages\transformers\models\bert\modeling_bert.py:1010, in BertModel.forward(self, input_ids, attention_mask, token_type_ids, position_ids, head_mask, inputs_embeds, encoder_hidden_states, encoder_attention_mask, past_key_values, use_cache, output_attentions, output_hidden_states, return_dict)
   1003 # Prepare head mask if needed
   1004 # 1.0 in head_mask indicate we keep the head
   1005 # attention_probs has shape bsz x n_heads x N x N
   1006 # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads]
   1007 # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length]
   1008 head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers)
-> 1010 embedding_output = self.embeddings(
   1011     input_ids=input_ids,
   1012     position_ids=position_ids,
   1013     token_type_ids=token_type_ids,
   1014     inputs_embeds=inputs_embeds,
   1015     past_key_values_length=past_key_values_length,
   1016 )
   1017 encoder_outputs = self.encoder(
   1018     embedding_output,
   1019     attention_mask=extended_attention_mask,
   (...)
   1027     return_dict=return_dict,
   1028 )
   1029 sequence_output = encoder_outputs[0]

File C:\Users\BUDBUDIO\Anaconda3\lib\site-packages\torch\nn\modules\module.py:889, in Module._call_impl(self, *input, **kwargs)
    887     result = self._slow_forward(*input, **kwargs)
    888 else:
--> 889     result = self.forward(*input, **kwargs)
    890 for hook in itertools.chain(
    891         _global_forward_hooks.values(),
    892         self._forward_hooks.values()):
    893     hook_result = hook(self, input, result)

File C:\Users\BUDBUDIO\Anaconda3\lib\site-packages\transformers\models\bert\modeling_bert.py:236, in BertEmbeddings.forward(self, input_ids, token_type_ids, position_ids, inputs_embeds, past_key_values_length)
    234 if inputs_embeds is None:
    235     inputs_embeds = self.word_embeddings(input_ids)
--> 236 token_type_embeddings = self.token_type_embeddings(token_type_ids)
    238 embeddings = inputs_embeds + token_type_embeddings
    239 if self.position_embedding_type == "absolute":

File C:\Users\BUDBUDIO\Anaconda3\lib\site-packages\torch\nn\modules\module.py:889, in Module._call_impl(self, *input, **kwargs)
    887     result = self._slow_forward(*input, **kwargs)
    888 else:
--> 889     result = self.forward(*input, **kwargs)
    890 for hook in itertools.chain(
    891         _global_forward_hooks.values(),
    892         self._forward_hooks.values()):
    893     hook_result = hook(self, input, result)

File C:\Users\BUDBUDIO\Anaconda3\lib\site-packages\torch\nn\modules\sparse.py:156, in Embedding.forward(self, input)
    155 def forward(self, input: Tensor) -> Tensor:
--> 156     return F.embedding(
    157         input, self.weight, self.padding_idx, self.max_norm,
    158         self.norm_type, self.scale_grad_by_freq, self.sparse)

File C:\Users\BUDBUDIO\Anaconda3\lib\site-packages\torch\nn\functional.py:1916, in embedding(input, weight, padding_idx, max_norm, norm_type, scale_grad_by_freq, sparse)
   1910     # Note [embedding_renorm set_grad_enabled]
   1911     # XXX: equivalent to
   1912     # with torch.no_grad():
   1913     #   torch.embedding_renorm_
   1914     # remove once script supports set_grad_enabled
   1915     _no_grad_embedding_renorm_(weight, input, max_norm, norm_type)
-> 1916 return torch.embedding(weight, input, padding_idx, scale_grad_by_freq, sparse)

IndexError: index out of range in self