I'm trying to recreate the BERTweet tokenizer for my own use with the tokenizers library. Here is my code so far (the training data comes from the Kaggle "Natural Language Processing with Disaster Tweets" competition):
import pandas as pd
import tokenizers
from tokenizers import normalizers, pre_tokenizers, processors, trainers, Tokenizer, models
from tokenizers.trainers import WordPieceTrainer
from tokenizers.normalizers import NFD, NFC, StripAccents, BertNormalizer, Replace, Lowercase
from tokenizers.pre_tokenizers import Whitespace, Punctuation, Split
from transformers import AutoTokenizer
from tokenizers.processors import TemplateProcessing
tokenizer_df = pd.concat([full_df, test_df], axis=0) # concatenating the train and test sets so the tokenizer sees all the text
tokenizer = Tokenizer(
models.WordPiece(unk_token='[UNK]')
)
# Normalization
tokenizer.normalizer = normalizers.Sequence([
Replace(tokenizers.Regex(r"http\S+|www\.\S+"), "HTTPURL"),
Replace(tokenizers.Regex(r"@\w+"), "@USER"), # From the BERTweet Paper
NFD(),
StripAccents(),
Replace(r"\s+", " "), # Collapsing whitespace
])
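# Quick sanity check on the normalizer alone (the example string below is just something I made up):
print(tokenizer.normalizer.normalize_str("Fire update at https://t.co/abc from @someone"))
# expected: "Fire update at HTTPURL from @USER"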
# Pre-tokenization (provides the "legal cut points" that the sub-word encoder may merge inside, but never across)
tokenizer.pre_tokenizer = pre_tokenizers.Sequence([
Split(tokenizers.Regex(r'HTTPURL'), behavior="isolated"), # keep HTTPURL as a standalone piece (should not be split)
Split(tokenizers.Regex(r'@USER'), behavior='isolated'), # same for @USER
Split(tokenizers.Regex(r"#\w+"), behavior="isolated"), # isolate by hashtags
Punctuation("isolated"), # isolate by punctuation marks
Whitespace(), # Split based on spaces
])
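# To see where the cut points actually land before WordPiece gets involved, I also ran this
# pre-tokenizer check (again on a made-up string; the output is a list of (piece, offsets) pairs):
print(tokenizer.pre_tokenizer.pre_tokenize_str("@USER says HTTPURL #CAfire is spreading!"))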
special_tokens = ["[CLS]", "[PAD]", "[SEP]", "[MASK]", "[UNK]", 'HTTPURL', '@USER']
trainer = WordPieceTrainer(
vocab_size=8000,
special_tokens=special_tokens,
)
# Model Training
tokenizer.train_from_iterator(tokenizer_df['text'], trainer=trainer) # 10K training points
cls_token_id = tokenizer.token_to_id('[CLS]')
sep_token_id = tokenizer.token_to_id('[SEP]')
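# Double-checking that the extra special tokens made it into the trained vocab
# (token_to_id returns None for anything that isn't in the vocabulary):
print(tokenizer.token_to_id('@USER'), tokenizer.token_to_id('HTTPURL'))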
# Post-processing
tokenizer.post_processor = TemplateProcessing(
single="[CLS] $A [SEP]",
pair="[CLS] $A [SEP] $B:1 [SEP]:1",
special_tokens=[("[CLS]", cls_token_id), ("[SEP]", sep_token_id)],
)
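# Quick check that the template is applied by the bare tokenizer (made-up sentence again);
# the token list should start with [CLS] and end with [SEP]:
print(tokenizer.encode("just testing").tokens)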
After training, I tested it on a bit of text:
from transformers import PreTrainedTokenizerFast
wrapped_tokenizer = PreTrainedTokenizerFast(
tokenizer_object=tokenizer, # the trained WordPiece tokenizer from above, untouched
unk_token="[UNK]",
cls_token="[CLS]",
sep_token="[SEP]",
pad_token="[PAD]",
mask_token="[MASK]",
additional_special_tokens=['HTTPURL', '@USER'],
)
text_input = "@RockyFire Update => California Hwy. 20 closed in both directions due to Lake County fire - #CAfire"
encoded_input = wrapped_tokenizer(text_input)
print(encoded_input)
print(wrapped_tokenizer.convert_ids_to_tokens(encoded_input['input_ids']))
The following is the output from the last print statement:
['[CLS]', '@', 'USER', 'Update', '=', '>', 'California', 'Hwy', '.', '20', 'closed', 'in', 'both', 'direct', '##ions', 'due', 'to', 'Lake', 'County', 'fire', '-', '#', 'CA', '##fire', '[SEP]']
Is it possible to make sure '@USER' stays intact? Why is it getting split?