I want all special tokens to always be available. How do I do this?
My first attempt at adding them to my tokenizer:
def does_t5_have_sep_token():
    """Print the special tokens of the ``t5-small`` tokenizer and register missing ones.

    Loads the fast ``t5-small`` tokenizer, prints its ``sep_token``,
    ``eos_token`` and ``all_special_tokens``, registers ``<bos>``, ``<cls>``
    and ``<s>`` under the named roles ``bos_token``, ``cls_token`` and
    ``sep_token`` (only for roles that are still unset), and then prints the
    same attributes again so the effect is visible.

    The tokens are registered under their named roles instead of under
    ``additional_special_tokens``: only the named keys populate attributes
    such as ``tokenizer.sep_token``. Existing special tokens are not passed
    again because they are already registered.

    Returns:
        None. All results are printed.
    """
    tokenizer: PreTrainedTokenizerFast = AutoTokenizer.from_pretrained('t5-small')
    # Narrow the type for checkers; AutoTokenizer can return slow tokenizers too.
    assert isinstance(tokenizer, PreTrainedTokenizerFast)
    print(tokenizer)
    print(f'{len(tokenizer)=}')

    # State before registering anything.
    print(f'{tokenizer.sep_token=}')
    print(f'{tokenizer.eos_token=}')
    print(f'{tokenizer.all_special_tokens=}')

    # NOTE(review): the token strings are the ones the original snippet added;
    # which string belongs to which role is an assumption -- confirm.
    wanted_roles = {'bos_token': '<bos>', 'cls_token': '<cls>', 'sep_token': '<s>'}
    # Only fill roles that are unset so tokens the checkpoint already defines
    # are never overwritten.
    missing_roles = {
        role: token
        for role, token in wanted_roles.items()
        if getattr(tokenizer, role) is None
    }
    num_added_toks = tokenizer.add_special_tokens(missing_roles)
    # If a model is used with this tokenizer, its embedding matrix has to be
    # resized to len(tokenizer) whenever this count is non-zero.
    print(f'{num_added_toks=}')

    # State after registering the missing roles.
    print(f'{tokenizer.sep_token=}')
    print(f'{tokenizer.eos_token=}')
    print(f'{tokenizer.all_special_tokens=}')
# Script entry point: run the tokenizer check only when executed directly.
if __name__ == '__main__':
    does_t5_have_sep_token()
    # '\a' is the ASCII bell character, printed right after "Done".
    print('Done\a')
But this feels hacky.
refs: