Phi model giving extra ids than vocab size of tokenizer so Phi-2 tokenizer.batch_decode() giving error: expected string got NoneType

Issue 1: tokenizer.vocab_size has a size of 50257 when printed but the Phi-2 model gives me an output shape (5, 1256, 51200) during evaluation and the error below while decoding

I’m working on callback like:


def decode_predictions(tokenizer, predictions):
    print(type(predictions), predictions.predictions.shape, predictions.label_ids.shape) # (5, 1256, 51200)
    labels = tokenizer.batch_decode(predictions.label_ids) 
    prediction_text = tokenizer.batch_decode(predictions.predictions.argmax(axis=-1)) # HERE COMES THE ERROR
    return {"labels": labels, "predictions": prediction_text}

 def on_evaluate(self, args, state, control,  **kwargs):
        super().on_evaluate(args, state, control, **kwargs)

        predictions = self.trainer.predict(self.sample_dataset)# generate predictions
        predictions = decode_predictions(self.tokenizer, predictions) # decode predictions and labels
        predictions_df = pd.DataFrame(predictions) # add predictions to a wandb.Table
        predictions_df["epoch"] = state.epoch
        records_table = self._wandb.Table(dataframe=predictions_df)
        self._wandb.log({"sample_predictions": records_table}) # log the table to wandb

Issue-2: When I create a random example, the tokenizer works till 50300

and after that, I’m getting:

---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
Cell In[19], line 1
----> 1 tokenizer.batch_decode(predictions.argmax(axis=-1), skip_special_tokens = True)

File ~/anaconda3/envs/pytorch_p310/lib/python3.10/site-packages/transformers/tokenization_utils_base.py:3742, in PreTrainedTokenizerBase.batch_decode(self, sequences, skip_special_tokens, clean_up_tokenization_spaces, **kwargs)
   3718 def batch_decode(
   3719     self,
   3720     sequences: Union[List[int], List[List[int]], "np.ndarray", "torch.Tensor", "tf.Tensor"],
   (...)
   3723     **kwargs,
   3724 ) -> List[str]:
   3725     """
   3726     Convert a list of lists of token ids into a list of strings by calling decode.
   3727 
   (...)
   3740         `List[str]`: The list of decoded sentences.
   3741     """
-> 3742     return [
   3743         self.decode(
   3744             seq,
   3745             skip_special_tokens=skip_special_tokens,
   3746             clean_up_tokenization_spaces=clean_up_tokenization_spaces,
   3747             **kwargs,
   3748         )
   3749         for seq in sequences
   3750     ]

File ~/anaconda3/envs/pytorch_p310/lib/python3.10/site-packages/transformers/tokenization_utils_base.py:3743, in <listcomp>(.0)
   3718 def batch_decode(
   3719     self,
   3720     sequences: Union[List[int], List[List[int]], "np.ndarray", "torch.Tensor", "tf.Tensor"],
   (...)
   3723     **kwargs,
   3724 ) -> List[str]:
   3725     """
   3726     Convert a list of lists of token ids into a list of strings by calling decode.
   3727 
   (...)
   3740         `List[str]`: The list of decoded sentences.
   3741     """
   3742     return [
-> 3743         self.decode(
   3744             seq,
   3745             skip_special_tokens=skip_special_tokens,
   3746             clean_up_tokenization_spaces=clean_up_tokenization_spaces,
   3747             **kwargs,
   3748         )
   3749         for seq in sequences
   3750     ]

File ~/anaconda3/envs/pytorch_p310/lib/python3.10/site-packages/transformers/models/codegen/tokenization_codegen.py:358, in CodeGenTokenizer.decode(self, token_ids, skip_special_tokens, clean_up_tokenization_spaces, truncate_before_pattern, **kwargs)
    331 """
    332 Converts a sequence of ids in a string, using the tokenizer and vocabulary with options to remove special
    333 tokens and clean up tokenization spaces.
   (...)
    353     `str`: The decoded sentence.
    354 """
    356 token_ids = to_py_obj(token_ids)
--> 358 decoded_text = super()._decode(
    359     token_ids=token_ids,
    360     skip_special_tokens=skip_special_tokens,
    361     clean_up_tokenization_spaces=clean_up_tokenization_spaces,
    362     **kwargs,
    363 )
    365 if truncate_before_pattern is not None and len(truncate_before_pattern) > 0:
    366     decoded_text = self.truncate(decoded_text, truncate_before_pattern)

File ~/anaconda3/envs/pytorch_p310/lib/python3.10/site-packages/transformers/tokenization_utils.py:1024, in PreTrainedTokenizer._decode(self, token_ids, skip_special_tokens, clean_up_tokenization_spaces, spaces_between_special_tokens, **kwargs)
   1022         current_sub_text.append(token)
   1023 if current_sub_text:
-> 1024     sub_texts.append(self.convert_tokens_to_string(current_sub_text))
   1026 if spaces_between_special_tokens:
   1027     text = " ".join(sub_texts)

File ~/anaconda3/envs/pytorch_p310/lib/python3.10/site-packages/transformers/models/codegen/tokenization_codegen.py:284, in CodeGenTokenizer.convert_tokens_to_string(self, tokens)
    282 def convert_tokens_to_string(self, tokens):
    283     """Converts a sequence of tokens (string) in a single string."""
--> 284     text = "".join(tokens)
    285     text = bytearray([self.byte_decoder[c] for c in text]).decode("utf-8", errors=self.errors)
    286     return text

TypeError: sequence item 31: expected str instance, NoneType found

This is the code I’m using:

import numpy as np
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("microsoft/phi-2", use_fast = False)

print(tokenizer.vocab_size)

tokenizer.add_tokens(["<|im_start|>", "<PAD>"])
tokenizer.pad_token = "<PAD>"
tokenizer.add_special_tokens(dict(eos_token="<|im_end|>"))

print(tokenizer.vocab_size)

predictions = np.random.uniform(size = (5, 1256, 50300)) # [No of samples, sequence_length, Vocab]
preds = predictions.argmax(axis=-1)

tokenizer.batch_decode(preds) # Works till 50300