I have been trying to train a LlamaTokenizer on Portuguese text, so that the language model I plan to train afterwards is compatible with the rest of the Llama ecosystem, but I keep running into seemingly endless training times and out-of-memory problems: for some reason the script consumes a huge amount of RAM. Can someone help me?
Here is my script:
import yaml
import argparse

from tqdm import tqdm

import torch
import datasets

from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    TrainingArguments,
)

from specifications import ModelArguments, DataTrainingArguments, ExtraArguments


def main(spec_file):
    # Load the arguments from the spec file
    with open(spec_file, "r") as stream:
        kwargs = yaml.safe_load(stream)

    # Get the arguments for the model, data, training, and extra
    model_args = ModelArguments(**kwargs['model_args'])
    data_args = DataTrainingArguments(**kwargs['data_args'])
    training_args = TrainingArguments(**kwargs['training_args'])
    extra_args = ExtraArguments(**kwargs['extra_args'])

    # Load the dataset from the Hugging Face Hub and prepare it for training
    if data_args.dataset_name is not None and not data_args.dataset_is_tokenized:
        dataset = load_dataset(
            data_args.dataset_name,
            split=data_args.dataset_split,
            use_auth_token=training_args.hub_token if training_args.hub_token else None,
            cache_dir=model_args.cache_dir,
            streaming=data_args.streaming,
        )
    else:
        raise ValueError("No dataset name provided or dataset is already tokenized")

    # Remove non-text columns
    dataset = dataset.remove_columns([col for col in dataset.column_names if col != "text"])

    # Create a Python generator to dynamically load the data
    def batch_iterator(batch_size=10000):
        for i in tqdm(range(0, len(dataset), batch_size)):
            yield dataset[i : i + batch_size]["text"]

    # Set the configuration kwargs for the tokenizer
    tokenizer_kwargs = {
        "cache_dir": model_args.cache_dir,
        "revision": model_args.model_revision,
        "use_auth_token": training_args.hub_token,
        "trust_remote_code": model_args.trust_remote_code,
        "bos_token": model_args.bos_token,
        "unk_token": model_args.unk_token,
        "eos_token": model_args.eos_token,
        "pad_token": model_args.eos_token,
    }

    # Create a tokenizer from the model checkpoint you want to train
    tokenizer = AutoTokenizer.from_pretrained(
        model_args.tokenizer_name,
        **tokenizer_kwargs,
    )

    # Train the new tokenizer on the dataset
    new_tokenizer = tokenizer.train_new_from_iterator(
        text_iterator=batch_iterator(),
        vocab_size=model_args.vocab_size,
    )

    # Replace the new_tokenizer `max_model_input_sizes` with `data_args.block_size`
    new_tokenizer.max_model_input_sizes.clear()
    new_tokenizer.max_model_input_sizes[extra_args.logger_name] = data_args.block_size
    new_tokenizer.model_max_length = tokenizer.model_max_length
    new_tokenizer.name_or_path = training_args.hub_model_id + "-tokenizer"

    # Save the new tokenizer
    new_tokenizer.save_pretrained(training_args.output_dir)

    # If hub_token is passed, upload the tokenizer to the Hub
    if training_args.hub_token is not None and training_args.hub_model_id is not None:
        new_tokenizer.push_to_hub(
            repo_id=training_args.hub_model_id + '-tokenizer',
            use_auth_token=training_args.hub_token,
            commit_message=f"Trained tokenizer from scratch on {data_args.dataset_name}",
        )


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Train a new Llama tokenizer")
    parser.add_argument("--spec-file", help="Path to the spec YAML file")
    args = parser.parse_args()

    main(args.spec_file)
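For reference, the spec file parses into a nested dict along these lines. The keys are the ones the script above actually reads from my specifications dataclasses; the values here are only illustrative placeholders, not my real configuration:

# Roughly what yaml.safe_load(stream) returns for my spec file
# (keys come from the script above; all values are illustrative placeholders)
kwargs = {
    "model_args": {
        "tokenizer_name": "meta-llama/Llama-2-7b-hf",  # placeholder checkpoint
        "vocab_size": 32000,
        "bos_token": "<s>",
        "unk_token": "<unk>",
        "eos_token": "</s>",
        "model_revision": "main",
        "trust_remote_code": False,
        "cache_dir": None,
    },
    "data_args": {
        "dataset_name": "my-portuguese-corpus",  # placeholder dataset id
        "dataset_split": "train",
        "dataset_is_tokenized": False,
        "streaming": False,
        "block_size": 2048,
    },
    "training_args": {
        "output_dir": "output/tokenizer",
        "hub_model_id": None,
        "hub_token": None,
    },
    "extra_args": {
        "logger_name": "llama-pt",
    },
}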
My dataset has fewer than 3M lines/rows. The only time I was able to make this script work was when I reduced the dataset to 100 lines, but that is far too little.
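In case it helps anyone reproduce this, the sketch below exercises the same train_new_from_iterator call path without any of the datasets code; the checkpoint name and the sample sentences are placeholders rather than my real setup:

# Minimal, self-contained sketch: train a new tokenizer from an in-memory list of strings.
# The checkpoint name and the toy corpus are placeholders, not my real data.
from transformers import AutoTokenizer

sample_texts = ["uma frase de exemplo em português."] * 100_000  # toy corpus

def batches(texts, batch_size=10_000):
    # Same batching pattern as batch_iterator() in the script above
    for i in range(0, len(texts), batch_size):
        yield texts[i : i + batch_size]

tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf")  # placeholder checkpoint
new_tokenizer = tokenizer.train_new_from_iterator(batches(sample_texts), vocab_size=32_000)
print(len(new_tokenizer))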
Note: this script works fine when the initial tokenizer is a GPT-2 tokenizer. Is the SentencePiece approach really that much slower and more memory-hungry?
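For comparison, the run that does finish only changes the checkpoint the initial tokenizer is loaded from, roughly like this (I am using the public gpt2 checkpoint name as an example; in my script it comes from model_args.tokenizer_name in the spec file):

# The only difference in the working GPT-2 run is the starting checkpoint.
tokenizer = AutoTokenizer.from_pretrained("gpt2")
new_tokenizer = tokenizer.train_new_from_iterator(
    text_iterator=batch_iterator(),
    vocab_size=model_args.vocab_size,
)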
Does anyone know what is going on?!