from transformers import AutoTokenizer
from tokenizers import Tokenizer
from tokenizers.models import WordPiece
from tokenizers.trainers import WordPieceTrainer
from tokenizers.pre_tokenizers import Whitespace

texts = ["Hello, y'all!", "How are you 😁 ?"]  # small stand-in for my real training corpus
tokenizer = Tokenizer(WordPiece(unk_token="[UNK]"))
trainer = WordPieceTrainer(
    vocab_size=5000,
    special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"],
)
tokenizer.pre_tokenizer = Whitespace()
tokenizer.train_from_iterator(texts, trainer)
output = tokenizer.encode("Hello, y'all! How are you 😁 ?")
print(type(output))
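# For what it's worth, on the trained tokenizer I can read the pieces
# straight off the Encoding object (Encoding.tokens / Encoding.ids,
# as I understand the tokenizers docs):
print(output.tokens)  # list of token strings
print(output.ids)     # matching vocabulary ids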
tokenizer1 = AutoTokenizer.from_pretrained("bert-base-cased")
output = tokenizer1.encode("Hello, y'all! How are you 😁 ?")
print(type(output))
Out:
<class 'tokenizers.Encoding'>
<class 'list'>
Isn't encode supposed to return the same object type in both cases?
Also, how do I get the token strings (not just the ids) out of the pretrained AutoTokenizer?
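So far the closest I've found is tokenize() / convert_ids_to_tokens(); they seem to work, but I'm not sure this is the intended way:

tokens = tokenizer1.tokenize("Hello, y'all! How are you 😁 ?")
print(tokens)  # token strings, without the [CLS]/[SEP] specials

ids = tokenizer1.encode("Hello, y'all! How are you 😁 ?")
print(tokenizer1.convert_ids_to_tokens(ids))  # round-trip through the ids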
Thank you!