Larger instance types do not reduce training time?

I am running an MLM fine-tuning job, stitched together from different examples. Training on an ml.g4dn.xlarge takes about an hour, but g4dn.2xlarge, g4dn.4xlarge and p3.2xlarge all take roughly the same time. Why is there no speedup in training? It makes me think I am not actually utilizing the GPU.

My script is below.





from transformers import (
    AutoModel,
    Trainer,
    TrainingArguments,
    DataCollatorForLanguageModeling,
    AutoTokenizer,
    AutoFeatureExtractor,
    AutoModelForMaskedLM,
    default_data_collator
)


from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from datasets import load_dataset
import random
import logging
import sys
import argparse
import os
import torch

if __name__ == "__main__":

    parser = argparse.ArgumentParser()

    # hyperparameters sent by the client are passed as command-line arguments to the script.
    parser.add_argument("--epochs", type=int, default=3)
    parser.add_argument("--train_batch_size", type=int, default=32)
    parser.add_argument("--eval_batch_size", type=int, default=64)
    parser.add_argument("--warmup_steps", type=int, default=500)
    parser.add_argument("--model_id", type=str)
    parser.add_argument("--learning_rate", type=str, default=5e-5)
    parser.add_argument("--train_file", type=str, default="train.DbEmbeddings")
    parser.add_argument("--test_file", type=str, default="test.DbEmbeddings")
    parser.add_argument("--fp16", type=bool, default=True)

    # Data, model, and output directories
    parser.add_argument("--output_data_dir", type=str, default=os.environ["SM_OUTPUT_DATA_DIR"])
    parser.add_argument("--model_dir", type=str, default=os.environ["SM_MODEL_DIR"])
    parser.add_argument("--n_gpus", type=str, default=os.environ["SM_NUM_GPUS"])
    parser.add_argument("--training_dir", type=str, default=os.environ["SM_CHANNEL_TRAIN"])
    parser.add_argument("--test_dir", type=str, default=os.environ["SM_CHANNEL_TEST"])
    parser.add_argument("--weight_decay", type=str, default=0.01)

    args, _ = parser.parse_known_args()

    # Set up logging
    logger = logging.getLogger(__name__)

    logging.basicConfig(
        level=logging.getLevelName("INFO"),
        handlers=[logging.StreamHandler(sys.stdout)],
        format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
    )
    
    print('\nWalk!!:')
    for path, subdirs, files in os.walk('/opt/ml'):
        for name in files:
            print(os.path.join(path, name))
          

    
    # load datasets
    raw_train_dataset = load_dataset("json", data_files=os.path.join(args.training_dir, args.train_file))["train"]
    raw_test_dataset = load_dataset("json", data_files=os.path.join(args.test_dir, args.test_file))["train"]
    
    print('\nraw_train_dataset.features', raw_train_dataset.features)

    # load tokenizer
    tokenizer = AutoTokenizer.from_pretrained(args.model_id)

    print('\nDownloading model args.model_id', args.model_id)
    # download model from model hub
    model = AutoModelForMaskedLM.from_pretrained(args.model_id, output_hidden_states=True)



    def tokenize_function(examples):
        result = tokenizer(examples["source"])
        print('check if fast')
        if tokenizer.is_fast:
            print('is fast')
            result["word_ids"] = [result.word_ids(i) for i in range(len(result["input_ids"]))]
        return result


    # Use batched=True to activate fast multithreading!
    train_dataset = raw_train_dataset.map(
        tokenize_function, batched=True, remove_columns=["source"]
    )
    test_dataset = raw_test_dataset.map(
        tokenize_function, batched=True, remove_columns=["source"]
    )


    chunk_size = 128

    # Slicing produces a list of lists for each feature
    tokenized_samples = train_dataset[:7]

    for idx, sample in enumerate(tokenized_samples["input_ids"]):
        print(f"'>>> Description {idx} length: {len(sample)}' {sample}")

    concatenated_examples = {
        k: sum(tokenized_samples[k], []) for k in tokenized_samples.keys()
    }
    total_length = len(concatenated_examples["input_ids"])
    print(f"'>>> Concatenated descriptions length: {total_length}'")

    chunks = {
        k: [t[i : i + chunk_size] for i in range(0, total_length, chunk_size)]
        for k, t in concatenated_examples.items()
    }

    for chunk in chunks["input_ids"]:
        print(f"'>>> Chunk length: {len(chunk)}'")

    def group_texts(examples):
        # Concatenate all texts
        concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
        # Compute length of concatenated texts
        total_length = len(concatenated_examples[list(examples.keys())[0]])
        # We drop the last chunk if it's smaller than chunk_size
        total_length = (total_length // chunk_size) * chunk_size
        # Split by chunks of max_len
        result = {
            k: [t[i : i + chunk_size] for i in range(0, total_length, chunk_size)]
            for k, t in concatenated_examples.items()
        }
        # Create a new labels column
        result["labels"] = result["input_ids"].copy()
        return result

    lm_train_dataset = train_dataset.map(group_texts, batched=True)

    lm_test_dataset = test_dataset.map(group_texts, batched=True)

    print(tokenizer.decode(lm_train_dataset[1]["input_ids"]))

    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.15)

    samples = [lm_train_dataset[i] for i in range(2)]
    for sample in samples:
        _ = sample.pop("word_ids")

    for chunk in data_collator(samples)["input_ids"]:
        print(f"\n'>>> {tokenizer.decode(chunk)}'")

    
    print('\nraw_train_dataset.features', raw_train_dataset.features)
    
    # print size
    logger.info(f" loaded train_dataset length is: {len(train_dataset)}")
    logger.info(f" loaded test_dataset length is: {len(test_dataset)}")

    #model =  AutoFeatureExtractor.from_pretrained(args.model_id)
    
    print('\nmodel', model)
  
    # note: batch size is hard-coded here, so the --train_batch_size / --eval_batch_size
    # arguments parsed above are never used
    batch_size = 64
    # Show the training loss with every epoch
    logging_steps = len(lm_train_dataset) // batch_size
    model_name = 'db-mlm-finetune'
    print('args.learning_rate', args.learning_rate)
    print('type', type(args.learning_rate))
    training_args = TrainingArguments(
        output_dir=args.model_dir,
        #overwrite_output_dir=True,
        evaluation_strategy="epoch",
        learning_rate=float(args.learning_rate),
        weight_decay=float(args.weight_decay),
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        #push_to_hub=True,
        fp16=True,
        logging_steps=logging_steps,
        num_train_epochs=args.epochs
    )


    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=lm_train_dataset,
        eval_dataset=lm_test_dataset,
        data_collator=data_collator,
        tokenizer=tokenizer
    )

    # train model
    trainer.train()
    
    # Saves the model to s3
    trainer.save_model(args.model_dir)
    
    for path, subdirs, files in os.walk('/opt/ml'):
        for name in files:
            print(os.path.join(path, name))
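
Would something like this be a reasonable way to sanity-check GPU usage from inside the script after training starts? A rough sketch; it assumes a single CUDA device and that nvidia-smi is available in the container:

import subprocess
import torch

# confirm CUDA is visible to PyTorch and see which device the model ended up on
print("cuda available:", torch.cuda.is_available())
print("device count:", torch.cuda.device_count())
print("model device:", next(model.parameters()).device)

# rough memory snapshot (values in bytes)
print("allocated:", torch.cuda.memory_allocated(0))
print("reserved:", torch.cuda.memory_reserved(0))

# dump nvidia-smi once for a point-in-time utilization percentage
print(subprocess.run(["nvidia-smi"], capture_output=True, text=True).stdout)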

Hey @MaximusDecimusMeridi,

It makes sense that your training is not faster on g4dn.2xlarge or g4dn.4xlarge, since those instances also have only 1 GPU.

But with p3.2xlarge there should be some difference. What do you see for GPUUtilization and GPUMemoryUtilization in your training job overview?
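
If you prefer pulling those numbers programmatically instead of from the console, a minimal boto3 sketch could look like this (the job name, region, and the algo-1 host suffix are assumptions for a single-instance job):

import boto3
from datetime import datetime, timedelta

# SageMaker publishes instance metrics for training jobs to this CloudWatch namespace
cloudwatch = boto3.client("cloudwatch", region_name="us-east-1")  # adjust the region
job_name = "my-training-job"  # hypothetical training job name

for metric in ["GPUUtilization", "GPUMemoryUtilization", "CPUUtilization"]:
    stats = cloudwatch.get_metric_statistics(
        Namespace="/aws/sagemaker/TrainingJobs",
        MetricName=metric,
        Dimensions=[{"Name": "Host", "Value": f"{job_name}/algo-1"}],
        StartTime=datetime.utcnow() - timedelta(hours=2),
        EndTime=datetime.utcnow(),
        Period=300,
        Statistics=["Average"],
    )
    datapoints = sorted(stats["Datapoints"], key=lambda p: p["Timestamp"])
    print(metric, [round(p["Average"], 1) for p in datapoints])

If GPUUtilization stays low while CPUUtilization is pegged, the bottleneck is on the CPU side.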


Indeed, switching from g4dn.xlarge to g4dn.4xlarge will speed up training only if you have a CPU bottleneck (check in CloudWatch what the % utilization is for CPU and GPU).

It’s kind of odd that p3.2xlarge doesn’t speed things up; I recommend you check CloudWatch to verify that your GPU is busy enough.

I also see a couple of for loops in your code: are you sure it’s worth running those on a GPU instance? How long do they take compared to the expected training time? If you have long CPU-bound steps, consider running them outside the GPU job.
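
For example, you could run the tokenization and chunking once on a CPU instance (or locally), save the processed dataset, and have the GPU job just load it. A rough sketch, assuming s3fs is installed for the s3:// paths (the bucket name is illustrative) and reusing the tokenize_function / group_texts from your script:

# prepare_data.py -- run on a cheap CPU instance, not inside the GPU training job
from datasets import load_dataset

raw = load_dataset("json", data_files="train.DbEmbeddings")["train"]
tokenized = raw.map(tokenize_function, batched=True, remove_columns=["source"])
lm_dataset = tokenized.map(group_texts, batched=True)
lm_dataset.save_to_disk("s3://my-bucket/processed/train")

# train.py -- the GPU job only loads the already-processed chunks and calls trainer.train()
from datasets import load_from_disk

lm_train_dataset = load_from_disk("s3://my-bucket/processed/train")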