Tokenizer train_new_from_iterator hanging for several models

I'm trying to train new tokenizers from existing ones with train_new_from_iterator, and for several base models the call hangs at the "Compute merges" step and never finishes (which models hang and which work is noted in the script below).

transformers.__version__: 4.38.2
tokenizers.__version__: 0.15.2

from datasets import load_dataset
from transformers import AutoTokenizer

def batch_iterator(dataset, batch_size=1_000):
    # Yield lists of raw documents so the whole corpus never has to sit in memory.
    for batch in dataset.iter(batch_size=batch_size):
        yield batch["text"]

if __name__ == "__main__":

    dsd = load_dataset("hyperdemocracy/usc-llm-text")

## these hang
#    base_model = "mistralai/Mistral-7B-v0.1"
#    base_model = "google/gemma-7b"
#    base_model = "meta-llama/Llama-2-7b-hf"

## these work
#    base_model = "openai-community/gpt2"
#    base_model = "google-bert/bert-base-uncased"
#    base_model = "google-bert/bert-base-cased"
    base_model = "microsoft/phi-2"

    orig_tokenizer = AutoTokenizer.from_pretrained(base_model)
    vocab_size = 2**15
    new_tokenizer = orig_tokenizer.train_new_from_iterator(batch_iterator(dsd["train"]), vocab_size)

    tag = base_model.split("/")[-1]
    out_path = f"tokenizer-{tag}-v{vocab_size}"
    new_tokenizer.save_pretrained(out_path)
    new_tokenizer.push_to_hub(f"hyperdemocracy/{out_path}")
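
To narrow down whether the hang is tied to the configuration train_new_from_iterator copies from the base tokenizer, or to BPE training on this corpus in general, one thing worth trying is a standalone run against the tokenizers library on the same iterator. The sketch below is not part of the report above and makes assumptions: it uses a plain byte-level BPE setup and illustrative special tokens rather than the normalizer/pre-tokenizer/special-token configuration the affected models actually ship with.

from datasets import load_dataset
from tokenizers import Tokenizer, models, pre_tokenizers, trainers

def batch_iterator(dataset, batch_size=1_000):
    for batch in dataset.iter(batch_size=batch_size):
        yield batch["text"]

if __name__ == "__main__":

    dsd = load_dataset("hyperdemocracy/usc-llm-text")

    # Plain byte-level BPE, independent of any pretrained tokenizer's
    # normalizer / pre-tokenizer configuration.
    tokenizer = Tokenizer(models.BPE())
    tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=False)

    trainer = trainers.BpeTrainer(
        vocab_size=2**15,
        special_tokens=["<unk>", "<s>", "</s>"],  # illustrative placeholders
    )

    # If this completes while train_new_from_iterator does not, the hang is
    # more likely related to the base tokenizer's copied configuration than
    # to the corpus or to BPE merge computation as such.
    tokenizer.train_from_iterator(batch_iterator(dsd["train"]), trainer=trainer)

While the original call is stuck, a sampling profiler such as py-spy (py-spy dump --pid <PID>) can also show whether the process is still inside the training call or blocked earlier, e.g. in the dataset iterator.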