BioGPT causal language model: unexpected attention mask error when using past_key_values

I am trying to run BioGPT as a causal language model and generate text token by token, reusing past_key_values between steps. However, I got a strange error.

Here are my steps:

First, I installed transformers and sacremoses:

!pip install transformers sacremoses -q
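The imports and model loading are not shown in the snippet below; they amount to the usual BioGPT setup, roughly like this (I'm assuming the stock microsoft/biogpt checkpoint here):

import torch
import torch.nn.functional as F
from transformers import BioGptForCausalLM, BioGptTokenizer

# Standard BioGPT setup; checkpoint name assumed to be the stock microsoft/biogpt.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
tokenizer = BioGptTokenizer.from_pretrained("microsoft/biogpt")
model = BioGptForCausalLM.from_pretrained("microsoft/biogpt").to(device)
model.eval()  # inference only, no gradients needed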

Then I executed the following code:

input_sequence = "Hello, I'm a language model,"

# Encode the prompt as a (1, seq_len) batch of token ids.
inputs = torch.as_tensor(tokenizer.encode(input_sequence)).unsqueeze(0).to(device)
past_key_values = None

count = 0
complete_token = []
with torch.no_grad():
    while count < 10:
        count += 1
        print("Iteration no.: " + str(count))
        if count > 1:
            # From the second step on, feed only the newly sampled token;
            # the cached past_key_values should cover all earlier positions.
            inputs = input_token

        model_out = model(input_ids=inputs.to(device), past_key_values=past_key_values)
        logits = model_out.logits[:, -1, :]
        past_key_values = model_out.past_key_values

        # Manual top-k sampling: keep the 5 largest logits, renormalize them
        # with softmax (these are probabilities, not log-probs), draw one sample,
        # and map it back to a vocabulary id.
        topk_values, topk_indices = torch.topk(logits, 5)
        probs = F.softmax(topk_values, dim=-1)
        sampled_in_topk = torch.multinomial(probs, num_samples=1, replacement=True)
        input_token = torch.gather(topk_indices, 1, sampled_in_topk)
        complete_token.append(input_token)
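Once the loop runs through, my plan is to decode the collected ids, roughly like this (included only for completeness; the error happens before I ever get here):

# After the loop: stitch the sampled (1, 1) tokens together and decode.
generated_ids = torch.cat(complete_token, dim=1)
print(tokenizer.decode(generated_ids[0], skip_special_tokens=True))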

And here is the error I got:

Iteration no.: 1
Iteration no.: 2
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
/tmp/ipykernel_18990/2689790310.py in <cell line: 8>()
     13             inputs = input_token
     14 
---> 15         model_out = model(input_ids=inputs.to(device), past_key_values=past_key_values)
     16         logits = model_out.logits[:, -1, :]
     17         past_key_values = model_out.past_key_values

~/anaconda3/envs/pytorch_p39/lib/python3.9/site-packages/torch/nn/modules/module.py in _call_impl(self, *input, **kwargs)
   1192         if not (self._backward_hooks or self._forward_hooks or self._forward_pre_hooks or _global_backward_hooks
   1193                 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1194             return forward_call(*input, **kwargs)
   1195         # Do not call functions when jit is used
   1196         full_backward_hooks, non_full_backward_hooks = [], []

~/anaconda3/envs/pytorch_p39/lib/python3.9/site-packages/transformers/models/biogpt/modeling_biogpt.py in forward(self, input_ids, attention_mask, head_mask, inputs_embeds, past_key_values, labels, use_cache, output_attentions, output_hidden_states, return_dict)
    677         return_dict = return_dict if return_dict is not None else self.config.use_return_dict
    678 
--> 679         outputs = self.biogpt(
    680             input_ids,
    681             attention_mask=attention_mask,

~/anaconda3/envs/pytorch_p39/lib/python3.9/site-packages/torch/nn/modules/module.py in _call_impl(self, *input, **kwargs)
   1192         if not (self._backward_hooks or self._forward_hooks or self._forward_pre_hooks or _global_backward_hooks
   1193                 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1194             return forward_call(*input, **kwargs)
   1195         # Do not call functions when jit is used
   1196         full_backward_hooks, non_full_backward_hooks = [], []

~/anaconda3/envs/pytorch_p39/lib/python3.9/site-packages/transformers/models/biogpt/modeling_biogpt.py in forward(self, input_ids, attention_mask, head_mask, inputs_embeds, past_key_values, use_cache, output_attentions, output_hidden_states, return_dict)
    589                 )
    590             else:
--> 591                 layer_outputs = decoder_layer(
    592                     hidden_states,
    593                     attention_mask=attention_mask,

~/anaconda3/envs/pytorch_p39/lib/python3.9/site-packages/torch/nn/modules/module.py in _call_impl(self, *input, **kwargs)
   1192         if not (self._backward_hooks or self._forward_hooks or self._forward_pre_hooks or _global_backward_hooks
   1193                 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1194             return forward_call(*input, **kwargs)
   1195         # Do not call functions when jit is used
   1196         full_backward_hooks, non_full_backward_hooks = [], []

~/anaconda3/envs/pytorch_p39/lib/python3.9/site-packages/transformers/models/biogpt/modeling_biogpt.py in forward(self, hidden_states, attention_mask, layer_head_mask, past_key_value, output_attentions, use_cache)
    313         self_attn_past_key_value = past_key_value[:2] if past_key_value is not None else None
    314         # add present self-attn cache to positions 1,2 of present_key_value tuple
--> 315         hidden_states, self_attn_weights, present_key_value = self.self_attn(
    316             hidden_states=hidden_states,
    317             past_key_value=self_attn_past_key_value,

~/anaconda3/envs/pytorch_p39/lib/python3.9/site-packages/torch/nn/modules/module.py in _call_impl(self, *input, **kwargs)
   1192         if not (self._backward_hooks or self._forward_hooks or self._forward_pre_hooks or _global_backward_hooks
   1193                 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1194             return forward_call(*input, **kwargs)
   1195         # Do not call functions when jit is used
   1196         full_backward_hooks, non_full_backward_hooks = [], []

~/anaconda3/envs/pytorch_p39/lib/python3.9/site-packages/transformers/models/biogpt/modeling_biogpt.py in forward(self, hidden_states, key_value_states, past_key_value, attention_mask, layer_head_mask, output_attentions)
    211         if attention_mask is not None:
    212             if attention_mask.size() != (bsz, 1, tgt_len, src_len):
--> 213                 raise ValueError(
    214                     f"Attention mask should be of size {(bsz, 1, tgt_len, src_len)}, but is {attention_mask.size()}"
    215                 )

ValueError: Attention mask should be of size (1, 1, 0, 12), but is torch.Size([1, 1, 1, 1])

So apparently, everything goes fine on the first iteration (with the full prompt), but the error comes up on the second model call, where I pass only the single sampled token together with the cached past_key_values. What puzzles me most is the expected mask size (1, 1, 0, 12): the source length of 12 matches my prompt tokens plus the one new token, but a target length of 0 looks as if the model ends up with zero query positions for my new token.
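For reference, this is what I believe goes into the failing call on iteration 2 (easy to verify with a quick print right before the model call):

print(inputs.shape)                 # the single sampled token: should be torch.Size([1, 1])
print(past_key_values[0][0].shape)  # cached keys from iteration 1; dim 2 is the cached sequence length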

Do you know how to fix this? 🙂