How to train a LlamaTokenizer?

I have been trying to train a LlamaTokenizer but I keep running into infinite training times and out of memory problems. For some reason, my script consumes a lot of RAM.

Can someone help me? I am trying to train a LlamaTokenizer in Portuguese so my language model (to be trained) is compatible with the entire Llama ecosystem.

Here is my script:

import yaml
import argparse
from tqdm import tqdm

import torch
import datasets
from datasets import load_dataset 

from transformers import (
    AutoTokenizer,
    TrainingArguments,
)

from specifications import ModelArguments, DataTrainingArguments, ExtraArguments

def main(spec_file):
    """Train a new tokenizer from an existing checkpoint's tokenizer.

    Reads a YAML spec file with the top-level keys ``model_args``,
    ``data_args``, ``training_args`` and ``extra_args``, loads the dataset
    named in it, keeps only its ``"text"`` column, retrains the tokenizer
    loaded from ``model_args.tokenizer_name`` on that text, saves the result
    to ``training_args.output_dir`` and, when both a hub token and a hub
    model id are configured, pushes it to the Hub.

    Args:
        spec_file: Path to the YAML specification file.

    Raises:
        ValueError: If no dataset name is configured or the dataset is
            flagged as already tokenized.
    """

    # Load the arguments from the spec file
    with open(spec_file, "r") as stream:
        kwargs = yaml.safe_load(stream)

    # Get the arguments for the model, data, training, and extra
    model_args = ModelArguments(**kwargs['model_args'])
    data_args = DataTrainingArguments(**kwargs['data_args'])
    training_args = TrainingArguments(**kwargs['training_args'])
    extra_args = ExtraArguments(**kwargs['extra_args'])

    # Load the dataset from the huggingface Hub and prepare it for training
    if data_args.dataset_name is not None and not data_args.dataset_is_tokenized:
        dataset = load_dataset(data_args.dataset_name,
            split=data_args.dataset_split,
            use_auth_token=training_args.hub_token if training_args.hub_token else None,
            cache_dir=model_args.cache_dir,
            streaming=data_args.streaming,
        )
    else:
        raise ValueError("No dataset name provided or dataset is already tokenized")

    # Remove non text columns
    dataset = dataset.remove_columns([col for col in dataset.column_names if col != "text"])

    # create a python generator to dynamically load the data
    # NOTE(review): this relies on len(dataset) and slice indexing, so it
    # presumably requires a non-streaming dataset -- confirm before enabling
    # `data_args.streaming`.
    def batch_iterator(batch_size=10000):
        for i in tqdm(range(0, len(dataset), batch_size)):
            yield dataset[i : i + batch_size]["text"]

    # Set the configuration kwargs for the tokenizer
    tokenizer_kwargs = {
        "cache_dir": model_args.cache_dir,
        "revision": model_args.model_revision,
        "use_auth_token": training_args.hub_token,
        "trust_remote_code": model_args.trust_remote_code,
        "bos_token": model_args.bos_token,
        "unk_token": model_args.unk_token,
        "eos_token": model_args.eos_token,
        "pad_token": model_args.eos_token,
    }

    # Create a tokenizer from the model checkpoint you want to train
    tokenizer = AutoTokenizer.from_pretrained(
        model_args.tokenizer_name,
        **tokenizer_kwargs,
    )

    new_tokenizer = tokenizer.train_new_from_iterator(
        text_iterator=batch_iterator(),
        vocab_size=model_args.vocab_size,
    )

    # Replace the new_tokenizer `max_model_input_sizes` for the `data_args.block_size`
    new_tokenizer.max_model_input_sizes.clear()
    new_tokenizer.max_model_input_sizes[extra_args.logger_name] = data_args.block_size
    new_tokenizer.model_max_length = tokenizer.model_max_length
    # `hub_model_id` is optional (the upload below is skipped when it is
    # None), so only derive the tokenizer name from it when it is set;
    # concatenating None with a string would raise a TypeError.
    if training_args.hub_model_id is not None:
        new_tokenizer.name_or_path = training_args.hub_model_id + "-tokenizer"

    # Save the new tokenizer
    new_tokenizer.save_pretrained(training_args.output_dir)

    # If hub_token is passed, upload the tokenizer to the hub
    if training_args.hub_token is not None and training_args.hub_model_id is not None:

        new_tokenizer.push_to_hub(
            repo_id=training_args.hub_model_id + '-tokenizer',
            use_auth_token=training_args.hub_token,
            commit_message=f"Trained tokenizer from scratch on {data_args.dataset_name}",
        )

if __name__ == "__main__":
    # Command-line entry point: parse the spec file path and run training.
    parser = argparse.ArgumentParser(description="Train a new Llama tokenizer")
    # The spec file is mandatory: without it `main` would call `open(None)`
    # and fail with an opaque TypeError, so let argparse report the missing
    # flag with a usage message instead.
    parser.add_argument("--spec-file", required=True, help="Path to the spec YAML file")
    args = parser.parse_args()
    main(args.spec_file)

My dataset has fewer than 3M lines/rows. The only time I was able to make this script work was when I reduced the dataset to 100 lines, but that is too little. :frowning:

Note: This script works fine when using a GPT2 tokenizer as the initial tokenizer. Is the sentencepiece approach that much slower/more memory hungry?

Does anyone know what is going on?!

In case anyone also wants to train one of these, this is how I managed:

import json
import yaml
import argparse
from tqdm import tqdm

from datasets import load_dataset
from tokenizers import SentencePieceBPETokenizer
from transformers import LlamaTokenizerFast, TrainingArguments, AutoTokenizer

from specifications import ModelArguments, DataTrainingArguments, ExtraArguments

def main(spec_file):
    """Train a SentencePiece BPE tokenizer and wrap it as a LlamaTokenizerFast.

    Reads a YAML spec file with the top-level keys ``model_args``,
    ``data_args``, ``training_args`` and ``extra_args``, loads the dataset
    named in it, keeps only its ``"text"`` column, trains a
    ``SentencePieceBPETokenizer`` on a shuffled sample of at most 2,000,000
    rows, and saves both the raw tokenizer JSON and the resulting
    ``LlamaTokenizerFast`` under names derived from
    ``extra_args.logger_name``.

    Args:
        spec_file: Path to the YAML specification file.

    Raises:
        ValueError: If no dataset name is configured or the dataset is
            flagged as already tokenized.
    """

    # Load the arguments from the spec file
    with open(spec_file, "r") as stream:
        kwargs = yaml.safe_load(stream)

    # Get the arguments for the model, data, training, and extra
    model_args = ModelArguments(**kwargs['model_args'])
    data_args = DataTrainingArguments(**kwargs['data_args'])
    training_args = TrainingArguments(**kwargs['training_args'])
    extra_args = ExtraArguments(**kwargs['extra_args'])

    # Load the dataset from the huggingface Hub and prepare it for training
    if data_args.dataset_name is not None and not data_args.dataset_is_tokenized:
        dataset = load_dataset(data_args.dataset_name,
            split=data_args.dataset_split,
            use_auth_token=training_args.hub_token if training_args.hub_token else None,
            cache_dir=model_args.cache_dir,
            streaming=data_args.streaming,
        )
    else:
        raise ValueError("No dataset name provided or dataset is already tokenized")

    # Remove non text columns
    dataset = dataset.remove_columns([col for col in dataset.column_names if col != "text"])

    # Select up to 2_000_000 random samples from the dataset. The count is
    # capped at the dataset size because selecting indices past the end
    # fails on smaller datasets.
    sample_count = min(2_000_000, len(dataset))
    dataset = dataset.shuffle(seed=training_args.seed).select(range(sample_count))

    # Create a SentencePieceBPETokenizer
    tokenizer = SentencePieceBPETokenizer()

    # Train the SentencePieceBPETokenizer on the dataset
    tokenizer.train_from_iterator(
        iterator=dataset['text'],
        vocab_size=32_000,
        show_progress=True,
        special_tokens=["<unk>", "<s>", "</s>",  "<pad>"],
    )

    # Save the tokenizer
    tokenizer_file = extra_args.logger_name + "-sentencepiece-tokenizer.json"
    tokenizer.save(tokenizer_file, pretty=True)

    # Load the new tokenizer as a LlamaTokenizerFast. The special-token ids
    # mirror the order of `special_tokens` passed to the trainer above.
    llama_kwargs = {
        "tokenizer_file": tokenizer_file,
        "unk_token": "<unk>",
        "unk_token_id": 0,
        "bos_token": "<s>",
        "bos_token_id": 1,
        "eos_token": "</s>",
        "eos_token_id": 2,
        "pad_token": "<pad>",
        "pad_token_id": 3,
        "padding_side": "right",
        "max_model_input_sizes": {extra_args.logger_name: data_args.block_size},
    }
    # Only derive a name from `hub_model_id` when one is configured;
    # concatenating None with a string would raise a TypeError.
    if training_args.hub_model_id is not None:
        llama_kwargs["name_or_path"] = training_args.hub_model_id + "-tokenizer"
    new_llama_tokenizer = LlamaTokenizerFast(**llama_kwargs)

    # Save the new tokenizer
    new_llama_tokenizer.save_pretrained(extra_args.logger_name + "-tokenizer")

if __name__ == "__main__":
    # Command-line entry point: parse the spec file path and run training.
    parser = argparse.ArgumentParser(description="Train a new Llama tokenizer")
    # The spec file is mandatory: without it `main` would call `open(None)`
    # and fail with an opaque TypeError, so let argparse report the missing
    # flag with a usage message instead.
    parser.add_argument("--spec-file", required=True, help="Path to the spec YAML file")
    args = parser.parse_args()
    main(args.spec_file)

It takes some time, but at least it gives you a tokenizer.

Hi,

Thanks for sharing this! Great starting point for me. Would it be possible to share a sample spec file?

Thanks!

You can use it without one. Just pass the arguments you want directly. Here is a code snippet you can use:

import json
import argparse
from tqdm import tqdm

from datasets import load_dataset
from tokenizers import SentencePieceBPETokenizer
from transformers import LlamaTokenizerFast, TrainingArguments, AutoTokenizer

def main(args):
    """Train a SentencePiece BPE tokenizer and save it as a LlamaTokenizerFast.

    Loads the requested dataset split, keeps only its ``"text"`` column,
    trains a ``SentencePieceBPETokenizer`` on a shuffled sample, copies the
    normalizer, pre-tokenizer, post-processor, decoder and the
    ``fuse_unk`` / ``byte_fallback`` model flags from a reference tokenizer's
    ``tokenizer.json`` into the new tokenizer file, and saves the result to
    ``new-llama-tokenizer``.

    Args:
        args: Namespace-like object with the attributes ``dataset_name``,
            ``dataset_split``, ``hub_token``, ``reference_tokenizer``,
            ``num_samples`` and ``vocab_size``. When ``num_samples`` is
            None, or larger than the dataset, every row is used.

    Raises:
        ValueError: If ``dataset_name`` is None, or if either
            ``reference_tokenizer`` or ``hub_token`` is None.
    """

    if args.dataset_name is None:
        raise ValueError("No dataset name provided or dataset is already tokenized")

    # Fail fast: the reference tokenizer is only needed after training, but
    # checking here avoids training a tokenizer just to abort afterwards.
    if args.reference_tokenizer is None or args.hub_token is None:
        raise ValueError("No tokenizer name provided or no hub token provided. Try using `--reference_tokenizer 'meta-llama/Llama-2-7b-hf'")

    # Load the dataset from the huggingface Hub and prepare it for training
    dataset = load_dataset(args.dataset_name,
        split=args.dataset_split,
        token=args.hub_token if args.hub_token else None,
    )

    # Remove non text columns
    dataset = dataset.remove_columns([col for col in dataset.column_names if col != "text"])

    # Select `num_samples` from the dataset, capped at the dataset size so
    # the selection never asks for indices past the end.
    if args.num_samples is None:
        num_samples = len(dataset)
    else:
        num_samples = min(args.num_samples, len(dataset))
    dataset = dataset.shuffle(seed=42).select(range(num_samples))

    # Create a SentencePieceBPETokenizer
    tokenizer = SentencePieceBPETokenizer()

    # Train the SentencePieceBPETokenizer on the dataset
    tokenizer.train_from_iterator(
        iterator=dataset['text'],
        vocab_size=args.vocab_size,
        show_progress=True,
        special_tokens=["<unk>", "<s>", "</s>",  "<pad>"],
    )

    # Save the tokenizer
    tokenizer.save("new-sentencepiece-tokenizer.json", pretty=True)

    # Load reference tokenizer and write its files next to ours
    reference_tokenizer = AutoTokenizer.from_pretrained(args.reference_tokenizer, token=args.hub_token if args.hub_token else None)
    reference_tokenizer.save_pretrained("reference-tokenizer")

    # Read the json files for the new tokenizer and the reference tokenizer.
    # The encoding is explicit because the vocabulary can contain non-ASCII
    # tokens and the platform default encoding is not always UTF-8.
    with open("new-sentencepiece-tokenizer.json", encoding="utf-8") as f:
        new_llama_tokenizer_json = json.load(f)

    with open("reference-tokenizer/tokenizer.json", encoding="utf-8") as f:
        reference_tokenizer_json = json.load(f)

    # Add the reference tokenizer's config to the new tokenizer's config
    for section in ("normalizer", "pre_tokenizer", "post_processor", "decoder"):
        new_llama_tokenizer_json[section] = reference_tokenizer_json[section]
    for flag in ("fuse_unk", "byte_fallback"):
        new_llama_tokenizer_json["model"][flag] = reference_tokenizer_json["model"][flag]

    # Dump the new tokenizer's config
    with open("new-sentencepiece-tokenizer.json", "w", encoding="utf-8") as f:
        json.dump(new_llama_tokenizer_json, f, indent=2, ensure_ascii=False)

    # Load the new tokenizer as a LlamaTokenizerFast. The special-token ids
    # mirror the order of `special_tokens` passed to the trainer above.
    new_llama_tokenizer = LlamaTokenizerFast(
        tokenizer_file="new-sentencepiece-tokenizer.json",
        unk_token="<unk>",
        unk_token_id=0,
        bos_token="<s>",
        bos_token_id=1,
        eos_token="</s>",
        eos_token_id=2,
        pad_token="<pad>",
        pad_token_id=3,
        padding_side="right",
    )

    # Save the new tokenizer
    new_llama_tokenizer.save_pretrained("new-llama-tokenizer")

if __name__ == "__main__":
    # Command-line entry point: every option is optional at the parser
    # level and defaults to None; `main` decides what is actually required.
    parser = argparse.ArgumentParser(description="Train a new Llama tokenizer")

    # (flag, value type, help text) for each supported option.
    option_table = (
        ("--dataset_name", str, "The name of the dataset to be tokenized"),
        ("--dataset_split", str, "The split of the dataset to be tokenized"),
        ("--hub_token", str, "The token to access the dataset on the hub"),
        ("--reference_tokenizer", str, "The name of the reference tokenizer to use"),
        ("--num_samples", int, "Number of samples to use from the dataset"),
        ("--vocab_size", int, "Vocabulary size to use for the tokenizer"),
    )
    for flag, value_type, help_text in option_table:
        parser.add_argument(flag, type=value_type, default=None, help=help_text)

    args = parser.parse_args()
    main(args)

# How to run (`--num_samples` must not exceed the number of rows in the chosen split):
# python train_sentencepiece.py --dataset_name "NeelNanda/pile-10k" --dataset_split "train" --hub_token "hf_..." --reference_tokenizer "meta-llama/Llama-2-7b-hf" --num_samples 10000 --vocab_size 32000

Hope it helps!