I tried to create a tokenizer that only keeps a small fraction of GPT-2's vocabulary:
import random
from copy import deepcopy

from transformers import GPT2Tokenizer


def get_tokenizer_with_subset_of_vocab(tokenizer: GPT2Tokenizer, percentage_to_keep: float) -> GPT2Tokenizer:
    """
    Create a tokenizer that keeps only a fraction of the original vocabulary.
    ref: https://chat.openai.com/c/5539083a-55b9-4a31-a0c6-bce5eeb45e1b
    """
    tok = deepcopy(tokenizer)
    assert id(tok) != id(tokenizer), "The tokenizer is not a deep copy!"
    special_tokens = tok.all_special_tokens
    # Make sure there is always an unk token set, no matter what.
    # "the" is hopefully common enough that it doesn't damage the semantics of the sentence
    # too much; using EOS or some other special token might break the sentence semantics.
    tok.unk_token = "the"
    # Calculate the number of tokens to keep
    total_tokens = len(tok)
    tokens_to_keep_count = int(total_tokens * percentage_to_keep)
    # Get all non-special tokens
    vocab = tok.get_vocab()
    all_tokens = list(vocab.keys())
    non_special_tokens = [token for token in all_tokens if token not in special_tokens]
    assert "the" in non_special_tokens, "The token 'the' is not in the non-special tokens!"
    # Randomly sample from the non-special tokens
    random_sampled_tokens = random.sample(non_special_tokens, tokens_to_keep_count - len(special_tokens))
    # Combine the special tokens with the randomly sampled tokens, always keeping "the"
    final_tokens_to_keep = set(special_tokens + random_sampled_tokens + ["the"])
    assert tok.unk_token == "the", "The token 'the' is not the unknown token!"
    # Update the tokenizer's vocab with only the kept tokens
    new_vocab = {token: idx for token, idx in vocab.items() if token in final_tokens_to_keep}
    tok.vocab = new_vocab
    tok.ids_to_tokens = {idx: token for token, idx in new_vocab.items()}  # invert the new vocab, not the original one
    return tok
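For reference, this is the kind of quick sanity check I have in mind for the result (just a sketch: it only uses standard GPT2Tokenizer methods like get_vocab, tokenize and convert_tokens_to_ids, and the 0.5 fraction plus the variable names are purely illustrative):

    # hypothetical sanity check on the subset tokenizer; the prints are only for inspection
    tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
    small = get_tokenizer_with_subset_of_vocab(tokenizer, 0.5)  # keep ~50% of the vocab
    print(len(small.vocab))                         # size of the dict the function assigned to tok.vocab
    print(len(small.get_vocab()))                   # size of the vocab the tokenizer itself reports
    print(small.unk_token, small.unk_token_id)      # the fallback token and the id it maps to
    toks = small.tokenize('the cat is nice')        # BPE tokens for the test sentence
    print(toks, small.convert_tokens_to_ids(toks))  # the ids those tokens map to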
but the unit test doesn't behave as expected, even though the code looks right to me:
def _test0_does_hacky_fraction_tokenizer_work():
    # - tokenizer whose only non-special token is "the": everything should decode to "the"
    text_seq: str = "the cat is nice"
    tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
    new_tokenizer = get_tokenizer_with_subset_of_vocab(tokenizer, 1 / tokenizer.vocab_size)
    # encode to token ids, then decode back to text
    tokens = new_tokenizer.encode(text_seq)
    llm_seq_txt: str = new_tokenizer.decode(tokens)
    assert llm_seq_txt == "the the the the", f'Error: {llm_seq_txt=}'

    # - tokenizer with only "the" and "cat": the -> the, cat -> cat, anything else -> the
    text_seq: str = "the cat is nice"
    tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
    new_tokenizer = get_tokenizer_with_subset_of_vocab(tokenizer, 1 / tokenizer.vocab_size)
    # encode to token ids, then decode back to text
    tokens = new_tokenizer.encode(text_seq)
    llm_seq_txt = new_tokenizer.decode(tokens)
    assert llm_seq_txt == "the cat the the", f'Error: {llm_seq_txt=}'
Why does decoding still give back the original sentence?
Error
Exception has occurred: AssertionError (note: full exception trace is shown but execution is paused at: _run_module_as_main)
Error: llm_seq_txt='the cat is nice'
  File "/lfs/ampere9/0/brando9/beyond-scale-language-data-diversity/src/diversity/embeddings/div_act_based.py", line 199, in _test0_does_hacky_fraction_tokenizer_work
    assert llm_seq_txt == "the the the the", f'Error: {llm_seq_txt=}'
  File "/lfs/ampere9/0/brando9/beyond-scale-language-data-diversity/src/diversity/embeddings/div_act_based.py", line 620, in <module>
    _test0_does_hacky_fraction_tokenizer_work()
  File "/lfs/ampere9/0/brando9/miniconda/envs/beyond_scale/lib/python3.10/runpy.py", line 86, in _run_code
    exec(code, run_globals)
  File "/lfs/ampere9/0/brando9/miniconda/envs/beyond_scale/lib/python3.10/runpy.py", line 196, in _run_module_as_main (Current frame)
    return _run_code(code, main_globals, None,
AssertionError: Error: llm_seq_txt='the cat is nice'
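In case it helps narrow this down, here is the next check I plan to add (a hypothetical snippet with the same setup as the failing test; it only uses encode and convert_ids_to_tokens plus the vocab dict the function overwrites, and doesn't assume anything about the output):

    # hypothetical debugging snippet: inspect the ids/tokens behind the decoded string
    tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
    new_tokenizer = get_tokenizer_with_subset_of_vocab(tokenizer, 1 / tokenizer.vocab_size)
    ids = new_tokenizer.encode("the cat is nice")
    toks = new_tokenizer.convert_ids_to_tokens(ids)
    print(ids)                                       # raw ids the subset tokenizer produces
    print(toks)                                      # the BPE tokens those ids correspond to
    print([t in new_tokenizer.vocab for t in toks])  # which of them survive in the overwritten vocab dict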