I'm trying to train new tokenizers from existing ones, and for some base models the training just hangs during the "Compute merges" step.
transformers.__version__: 4.38.2
tokenizers.__version__: 0.15.2
```python
from datasets import load_dataset
from transformers import AutoTokenizer


def batch_iterator(dataset, batch_size=1_000):
    # stream the "text" column in batches so the whole corpus never has to sit in memory
    for batch in dataset.iter(batch_size=batch_size):
        yield batch["text"]


if __name__ == "__main__":
    dsd = load_dataset("hyperdemocracy/usc-llm-text")

    ## these hang
    # base_model = "mistralai/Mistral-7B-v0.1"
    # base_model = "google/gemma-7b"
    # base_model = "meta-llama/Llama-2-7b-hf"

    ## these work
    # base_model = "openai-community/gpt2"
    # base_model = "google-bert/bert-base-uncased"
    # base_model = "google-bert/bert-base-cased"
    base_model = "microsoft/phi-2"

    orig_tokenizer = AutoTokenizer.from_pretrained(base_model)
    vocab_size = 2**15

    # retrain the same tokenizer algorithm/settings on the new corpus
    new_tokenizer = orig_tokenizer.train_new_from_iterator(batch_iterator(dsd["train"]), vocab_size)

    tag = base_model.split("/")[-1]
    out_path = f"tokenizer-{tag}-v{vocab_size}"
    new_tokenizer.save_pretrained(out_path)
    new_tokenizer.push_to_hub(f"hyperdemocracy/{out_path}")
```
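
In case it helps narrow things down, here is a minimal sketch of training a byte-level BPE tokenizer directly with the `tokenizers` library on the same iterator (it reuses `batch_iterator` and `dsd` from the script above). The `ByteLevel` pre-tokenizer and `BpeTrainer` settings are my own guesses, not a replication of any of the base models' actual tokenizer configs, so it only checks whether the merge computation itself gets through this corpus:

```python
# Standalone BPE training on the same data, bypassing train_new_from_iterator.
# Settings here are guesses, not the configs of the base models listed above.
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.pre_tokenizers import ByteLevel
from tokenizers.trainers import BpeTrainer

tok = Tokenizer(BPE())
tok.pre_tokenizer = ByteLevel()
trainer = BpeTrainer(vocab_size=2**15, show_progress=True)
tok.train_from_iterator(batch_iterator(dsd["train"]), trainer=trainer)
tok.save("tokenizer-bpe-scratch.json")
```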