Issue 1: tokenizer.vocab_size prints 50257, but during evaluation the Phi-2 model produces logits of shape (5, 1256, 51200), and decoding those predictions raises the error below.
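For reference, the mismatch is visible without running evaluation at all; this is a quick check against the model config:

from transformers import AutoConfig, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("microsoft/phi-2", use_fast=False)
config = AutoConfig.from_pretrained("microsoft/phi-2")

print(tokenizer.vocab_size)  # 50257: base BPE vocabulary
print(config.vocab_size)     # 51200: matches the last dimension of the logits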
I’m working on a callback like this:
import pandas as pd

def decode_predictions(tokenizer, predictions):
    # predictions.predictions has shape (5, 1256, 51200): [samples, seq_len, vocab]
    print(type(predictions), predictions.predictions.shape, predictions.label_ids.shape)
    labels = tokenizer.batch_decode(predictions.label_ids)
    prediction_text = tokenizer.batch_decode(predictions.predictions.argmax(axis=-1))  # HERE COMES THE ERROR
    return {"labels": labels, "predictions": prediction_text}

def on_evaluate(self, args, state, control, **kwargs):
    super().on_evaluate(args, state, control, **kwargs)
    predictions = self.trainer.predict(self.sample_dataset)  # generate predictions
    predictions = decode_predictions(self.tokenizer, predictions)  # decode predictions and labels
    predictions_df = pd.DataFrame(predictions)  # collect predictions in a DataFrame
    predictions_df["epoch"] = state.epoch
    records_table = self._wandb.Table(dataframe=predictions_df)  # add the predictions to a wandb.Table
    self._wandb.log({"sample_predictions": records_table})  # log the table to wandb
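For context, on_evaluate lives in a custom subclass of the built-in WandbCallback; roughly like this (the class name and constructor are just how I wired it up):

from transformers.integrations import WandbCallback

class PredictionLoggingCallback(WandbCallback):
    """Logs decoded sample predictions to W&B after each evaluation."""

    def __init__(self, trainer, tokenizer, sample_dataset):
        super().__init__()
        self.trainer = trainer                # used to run predict() on the sample set
        self.tokenizer = tokenizer            # used by decode_predictions
        self.sample_dataset = sample_dataset  # small eval subset to log

    # on_evaluate as shown above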
Issue 2: When I create a random example, decoding works with a vocabulary size up to 50300, and past that I get:
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
Cell In[19], line 1
----> 1 tokenizer.batch_decode(predictions.argmax(axis=-1), skip_special_tokens = True)
File ~/anaconda3/envs/pytorch_p310/lib/python3.10/site-packages/transformers/tokenization_utils_base.py:3742, in PreTrainedTokenizerBase.batch_decode(self, sequences, skip_special_tokens, clean_up_tokenization_spaces, **kwargs)
3718 def batch_decode(
3719 self,
3720 sequences: Union[List[int], List[List[int]], "np.ndarray", "torch.Tensor", "tf.Tensor"],
(...)
3723 **kwargs,
3724 ) -> List[str]:
3725 """
3726 Convert a list of lists of token ids into a list of strings by calling decode.
3727
(...)
3740 `List[str]`: The list of decoded sentences.
3741 """
-> 3742 return [
3743 self.decode(
3744 seq,
3745 skip_special_tokens=skip_special_tokens,
3746 clean_up_tokenization_spaces=clean_up_tokenization_spaces,
3747 **kwargs,
3748 )
3749 for seq in sequences
3750 ]
File ~/anaconda3/envs/pytorch_p310/lib/python3.10/site-packages/transformers/tokenization_utils_base.py:3743, in <listcomp>(.0)
3718 def batch_decode(
3719 self,
3720 sequences: Union[List[int], List[List[int]], "np.ndarray", "torch.Tensor", "tf.Tensor"],
(...)
3723 **kwargs,
3724 ) -> List[str]:
3725 """
3726 Convert a list of lists of token ids into a list of strings by calling decode.
3727
(...)
3740 `List[str]`: The list of decoded sentences.
3741 """
3742 return [
-> 3743 self.decode(
3744 seq,
3745 skip_special_tokens=skip_special_tokens,
3746 clean_up_tokenization_spaces=clean_up_tokenization_spaces,
3747 **kwargs,
3748 )
3749 for seq in sequences
3750 ]
File ~/anaconda3/envs/pytorch_p310/lib/python3.10/site-packages/transformers/models/codegen/tokenization_codegen.py:358, in CodeGenTokenizer.decode(self, token_ids, skip_special_tokens, clean_up_tokenization_spaces, truncate_before_pattern, **kwargs)
331 """
332 Converts a sequence of ids in a string, using the tokenizer and vocabulary with options to remove special
333 tokens and clean up tokenization spaces.
(...)
353 `str`: The decoded sentence.
354 """
356 token_ids = to_py_obj(token_ids)
--> 358 decoded_text = super()._decode(
359 token_ids=token_ids,
360 skip_special_tokens=skip_special_tokens,
361 clean_up_tokenization_spaces=clean_up_tokenization_spaces,
362 **kwargs,
363 )
365 if truncate_before_pattern is not None and len(truncate_before_pattern) > 0:
366 decoded_text = self.truncate(decoded_text, truncate_before_pattern)
File ~/anaconda3/envs/pytorch_p310/lib/python3.10/site-packages/transformers/tokenization_utils.py:1024, in PreTrainedTokenizer._decode(self, token_ids, skip_special_tokens, clean_up_tokenization_spaces, spaces_between_special_tokens, **kwargs)
1022 current_sub_text.append(token)
1023 if current_sub_text:
-> 1024 sub_texts.append(self.convert_tokens_to_string(current_sub_text))
1026 if spaces_between_special_tokens:
1027 text = " ".join(sub_texts)
File ~/anaconda3/envs/pytorch_p310/lib/python3.10/site-packages/transformers/models/codegen/tokenization_codegen.py:284, in CodeGenTokenizer.convert_tokens_to_string(self, tokens)
282 def convert_tokens_to_string(self, tokens):
283 """Converts a sequence of tokens (string) in a single string."""
--> 284 text = "".join(tokens)
285 text = bytearray([self.byte_decoder[c] for c in text]).decode("utf-8", errors=self.errors)
286 return text
TypeError: sequence item 31: expected str instance, NoneType found
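To narrow it down, I looked for the first id that no longer maps to a token string; everything past that point comes back as None, which is what breaks "".join(tokens) inside convert_tokens_to_string. A quick diagnostic sketch:

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("microsoft/phi-2", use_fast=False)

# Scan upward from the base vocab until convert_ids_to_tokens returns None.
for i in range(tokenizer.vocab_size, 51200):
    if tokenizer.convert_ids_to_tokens(i) is None:
        print("first unmapped id:", i)
        break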
This is the code I’m using:
import numpy as np
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("microsoft/phi-2", use_fast=False)
print(tokenizer.vocab_size)  # 50257

tokenizer.add_tokens(["<|im_start|>", "<PAD>"])
tokenizer.pad_token = "<PAD>"
tokenizer.add_special_tokens(dict(eos_token="<|im_end|>"))
print(tokenizer.vocab_size)  # still 50257

predictions = np.random.uniform(size=(5, 1256, 50300))  # [num samples, sequence length, vocab size]
preds = predictions.argmax(axis=-1)
tokenizer.batch_decode(preds)  # works up to a vocab size of 50300
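One thing I noticed while debugging: tokenizer.vocab_size only counts the base BPE vocabulary, so it still prints 50257 after add_tokens; len(tokenizer) is what actually grows:

print(tokenizer.vocab_size)  # still 50257: added tokens are not counted here
print(len(tokenizer))        # base vocab + added tokens, i.e. the ids the tokenizer can actually decode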