I trained a BPE tokenizer with the following code:
import tokenizers
import transformers
from tokenizers import models, pre_tokenizers, trainers, decoders, processors
import datasets
if __name__ == "__main__":
    data = datasets.load_dataset("wmt14", "de-en")

    unk_token = "<unk>"
    pad_token = "<pad>"
    mask_token = "<mask>"

    tok = tokenizers.Tokenizer(models.BPE(unk_token=unk_token))
    tok.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=False)
    tok.post_processor = processors.ByteLevel()
    tok.decoder = decoders.ByteLevel()

    def batch_iterator():
        # Yield the training split in chunks of 1,000 examples,
        # joining the German and English side of each translation pair.
        for i in range(0, len(data["train"]), 1_000):
            yield [
                " ".join([x["de"], x["en"]])
                for x in data["train"][i : i + 1_000]["translation"]
            ]

    trainer = trainers.BpeTrainer(vocab_size=37_000, special_tokens=[unk_token, pad_token])
    tok.train_from_iterator(batch_iterator(), trainer=trainer)
    tok.enable_padding(pad_id=tok.token_to_id(pad_token), pad_token=pad_token)

    tok = transformers.PreTrainedTokenizerFast(tokenizer_object=tok)
    tok.save_pretrained("bpe_tok")
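For reference, I load the tokenizer back roughly like this (the exact loading call isn't shown above; I'm assuming PreTrainedTokenizerFast.from_pretrained on the directory written by save_pretrained):

import transformers

# Assumption: reload the tokenizer from the "bpe_tok" directory saved above.
tok = transformers.PreTrainedTokenizerFast.from_pretrained("bpe_tok")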
After training and loading the tokenizer, it correctly assigns the <unk> token to unknown characters such as 🤗, for example:
In [27]: tok.decode(tok("🤗")["input_ids"])
Out[27]: '<unk>��<unk>'
But when I try to access the unk_token property, I get the following message:
In [28]: tok.unk_token
Using unk_token, but it is not set yet.
What am I missing?