I'm trying to get BERT encodings for sentence pairs.
I manage to get the tokens for the batch of sentences, but when I feed them to BERT I get:
IndexError: index out of range in self
For a single pair of sentences, the BERT model does accept the tokens, but for lists of sentences I get this error. I don't understand where the problem is.
(The model I am using inherits from BERT.)
Here is my code:
from transformers import AutoTokenizer, AutoModel
import torch

alephbert_tokenizer = AutoTokenizer.from_pretrained('onlplab/alephbert-base')
alephbert = AutoModel.from_pretrained('onlplab/alephbert-base')
sentence_a = ['ืฉื ื ืืืื ืฉืื ืืื','ืฉื ืืื ืฆืืื ืืื','ืืื ื ืฉืืจื?','ืฉืืืฉื ืืืื ืฉืื ืื ืืจ', 'ืืื ืฆืื ืืื']
sentence_b = ['ืืืช ืฉืชืืื ืฉืืืฉ','ืืงืื ืืืื ืืืื','ืจืง ืืฉืื ืืคืืคืื ืืืช ืฉืชืืื ืฉืืืฉ','ืืืืืจื ืืืจ ืืฉืื ืฉืืืฉื ืืืืื', 'ืื ืืืื ืืื ืฉืชื ืืืช ืฉืชืืื ืฉืืืฉ']
tokens = alephbert_tokenizer(sentence_a, sentence_b,
                             padding='max_length',
                             truncation=True,
                             max_length=512,
                             return_tensors='pt')
batch_input_ids = tokens['input_ids']
batch_token_type_ids = tokens['token_type_ids']
batch_attention_mask = tokens['attention_mask']
alephbert(batch_input_ids, batch_attention_mask, batch_token_type_ids)
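For reference, the positional order in BertModel.forward (visible in the traceback below) is input_ids, attention_mask, token_type_ids, so the call above should be equivalent to:
outputs = alephbert(input_ids=batch_input_ids,
                    attention_mask=batch_attention_mask,
                    token_type_ids=batch_token_type_ids)
# or, since the tokenizer returns a BatchEncoding, simply:
outputs = alephbert(**tokens)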
Traceback:
IndexError Traceback (most recent call last)
Input In [76], in <cell line: 1>()
----> 1 alephbert(batch['input_ids'], batch['attention_mask'], batch['token_type_ids'])
File C:\Users\BUDBUDIO\Anaconda3\lib\site-packages\torch\nn\modules\module.py:889, in Module._call_impl(self, *input, **kwargs)
887 result = self._slow_forward(*input, **kwargs)
888 else:
--> 889 result = self.forward(*input, **kwargs)
890 for hook in itertools.chain(
891 _global_forward_hooks.values(),
892 self._forward_hooks.values()):
893 hook_result = hook(self, input, result)
File C:\Users\BUDBUDIO\Anaconda3\lib\site-packages\transformers\models\bert\modeling_bert.py:1010, in BertModel.forward(self, input_ids, attention_mask, token_type_ids, position_ids, head_mask, inputs_embeds, encoder_hidden_states, encoder_attention_mask, past_key_values, use_cache, output_attentions, output_hidden_states, return_dict)
1003 # Prepare head mask if needed
1004 # 1.0 in head_mask indicate we keep the head
1005 # attention_probs has shape bsz x n_heads x N x N
1006 # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads]
1007 # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length]
1008 head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers)
-> 1010 embedding_output = self.embeddings(
1011 input_ids=input_ids,
1012 position_ids=position_ids,
1013 token_type_ids=token_type_ids,
1014 inputs_embeds=inputs_embeds,
1015 past_key_values_length=past_key_values_length,
1016 )
1017 encoder_outputs = self.encoder(
1018 embedding_output,
1019 attention_mask=extended_attention_mask,
(...)
1027 return_dict=return_dict,
1028 )
1029 sequence_output = encoder_outputs[0]
File C:\Users\BUDBUDIO\Anaconda3\lib\site-packages\torch\nn\modules\module.py:889, in Module._call_impl(self, *input, **kwargs)
887 result = self._slow_forward(*input, **kwargs)
888 else:
--> 889 result = self.forward(*input, **kwargs)
890 for hook in itertools.chain(
891 _global_forward_hooks.values(),
892 self._forward_hooks.values()):
893 hook_result = hook(self, input, result)
File C:\Users\BUDBUDIO\Anaconda3\lib\site-packages\transformers\models\bert\modeling_bert.py:236, in BertEmbeddings.forward(self, input_ids, token_type_ids, position_ids, inputs_embeds, past_key_values_length)
234 if inputs_embeds is None:
235 inputs_embeds = self.word_embeddings(input_ids)
--> 236 token_type_embeddings = self.token_type_embeddings(token_type_ids)
238 embeddings = inputs_embeds + token_type_embeddings
239 if self.position_embedding_type == "absolute":
File C:\Users\BUDBUDIO\Anaconda3\lib\site-packages\torch\nn\modules\module.py:889, in Module._call_impl(self, *input, **kwargs)
887 result = self._slow_forward(*input, **kwargs)
888 else:
--> 889 result = self.forward(*input, **kwargs)
890 for hook in itertools.chain(
891 _global_forward_hooks.values(),
892 self._forward_hooks.values()):
893 hook_result = hook(self, input, result)
File C:\Users\BUDBUDIO\Anaconda3\lib\site-packages\torch\nn\modules\sparse.py:156, in Embedding.forward(self, input)
155 def forward(self, input: Tensor) -> Tensor:
--> 156 return F.embedding(
157 input, self.weight, self.padding_idx, self.max_norm,
158 self.norm_type, self.scale_grad_by_freq, self.sparse)
File C:\Users\BUDBUDIO\Anaconda3\lib\site-packages\torch\nn\functional.py:1916, in embedding(input, weight, padding_idx, max_norm, norm_type, scale_grad_by_freq, sparse)
1910 # Note [embedding_renorm set_grad_enabled]
1911 # XXX: equivalent to
1912 # with torch.no_grad():
1913 # torch.embedding_renorm_
1914 # remove once script supports set_grad_enabled
1915 _no_grad_embedding_renorm_(weight, input, max_norm, norm_type)
-> 1916 return torch.embedding(weight, input, padding_idx, scale_grad_by_freq, sparse)
IndexError: index out of range in self
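As far as I understand, this IndexError from torch.embedding means that some index handed to an nn.Embedding is outside its num_embeddings range. A sanity check along these lines might narrow down which input is out of range (a minimal sketch; it assumes the usual BertConfig attributes vocab_size, type_vocab_size and max_position_embeddings):
print(batch_input_ids.max().item(), alephbert.config.vocab_size)            # word ids vs. word embedding size
print(batch_token_type_ids.max().item(), alephbert.config.type_vocab_size)  # segment ids vs. token type embedding size
print(batch_input_ids.shape[1], alephbert.config.max_position_embeddings)   # sequence length vs. position embedding size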
For reference, I also paste here the working example for a single pair:
sentence_a1 = 'ืื ื ืืืื ืฉืืงืืื, ืืขืืืืช ืืืื ื'
sentence_b1 = 'ืืืจืืืง ืืกืืืจืืืช ืืชืืช ืืื ื'
single_s_tok = alephbert_tokenizer(sentence_a1, sentence_b1, is_split_into_words=True)
input_ids = torch.tensor(single_s_tok['input_ids']).unsqueeze(0)
attention_mask = torch.tensor(single_s_tok['attention_mask']).unsqueeze(0)
token_type_ids = torch.tensor(single_s_tok['token_type_ids']).unsqueeze(0)
alephbert(input_ids, attention_mask, token_type_ids)
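Presumably the same single-pair call can also be written by letting the tokenizer return tensors directly instead of building them by hand (a sketch; I'm not sure the is_split_into_words flag is needed for plain strings):
single_s_tok = alephbert_tokenizer(sentence_a1, sentence_b1, return_tensors='pt')
alephbert(**single_s_tok)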
Any help is greatly appreciated.