Larger instance types do not reduce training time?

I am running an MLM fine-tuning job, stitched together from different examples. Training with an ml.g4dn.xlarge takes about an hour. But g4dn.2xlarge and 4xlarge, as well as p3.2xlarge, take almost exactly the same time. Why is there no speedup in training? It makes me think I am not utilizing the GPU correctly.

My script is below

from transformers import (

from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from datasets import load_dataset
import random
import logging
import sys
import argparse
import os
import torch

if __name__ == "__main__":

    parser = argparse.ArgumentParser()

    # hyperparameters sent by the client are passed as command-line arguments to the script.
    parser.add_argument("--epochs", type=int, default=3)
    parser.add_argument("--train_batch_size", type=int, default=32)
    parser.add_argument("--eval_batch_size", type=int, default=64)
    parser.add_argument("--warmup_steps", type=int, default=500)
    parser.add_argument("--model_id", type=str)
    parser.add_argument("--learning_rate", type=str, default=5e-5)
    parser.add_argument("--train_file", type=str, default="train.DbEmbeddings")
    parser.add_argument("--test_file", type=str, default="test.DbEmbeddings")
    parser.add_argument("--fp16", type=bool, default=True)

    # Data, model, and output directories
    parser.add_argument("--output_data_dir", type=str, default=os.environ["SM_OUTPUT_DATA_DIR"])
    parser.add_argument("--model_dir", type=str, default=os.environ["SM_MODEL_DIR"])
    parser.add_argument("--n_gpus", type=str, default=os.environ["SM_NUM_GPUS"])
    parser.add_argument("--training_dir", type=str, default=os.environ["SM_CHANNEL_TRAIN"])
    parser.add_argument("--test_dir", type=str, default=os.environ["SM_CHANNEL_TEST"])
    parser.add_argument("--weight_decay", type=str, default=0.01)

    args, _ = parser.parse_known_args()

    # Set up logging
    logger = logging.getLogger(__name__)

        format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
    for path, subdirs, files in os.walk('/opt/ml'): 
        for name in files: print(os.path.join(path, name))

    # load datasets
    raw_train_dataset = load_dataset("json", data_files=os.path.join(args.training_dir, args.train_file))["train"]
    raw_test_dataset = load_dataset("json", data_files=os.path.join(args.test_dir, args.test_file))["train"]
    print('\nraw_train_dataset.features', raw_train_dataset.features)

    # load tokenizer
    tokenizer = AutoTokenizer.from_pretrained(args.model_id)

    print('\nDownloading model args.model_id', args.model_id)
    # download model from model hub
    model = AutoModelForMaskedLM.from_pretrained(args.model_id, output_hidden_states=True)

    def tokenize_function(examples):
        result = tokenizer(examples["source"])
        print('check if fast')
        if tokenizer.is_fast:
            print('is fast')
            result["word_ids"] = [result.word_ids(i) for i in range(len(result["input_ids"]))]
        return result

    # Use batched=True to activate fast multithreading!
    train_dataset =
        tokenize_function, batched=True, remove_columns=["source"]
    test_dataset =
        tokenize_function, batched=True, remove_columns=["source"]

    chunk_size = 128

    # Slicing produces a list of lists for each feature
    tokenized_samples = train_dataset[:7]

    for idx, sample in enumerate(tokenized_samples["input_ids"]):
        print(f"'>>> Description {idx} length: {len(sample)}' {sample}")

    concatenated_examples = {
        k: sum(tokenized_samples[k], []) for k in tokenized_samples.keys()
    total_length = len(concatenated_examples["input_ids"])
    print(f"'>>> Concatenated descriptions length: {total_length}'")

    chunks = {
        k: [t[i : i + chunk_size] for i in range(0, total_length, chunk_size)]
        for k, t in concatenated_examples.items()

    for chunk in chunks["input_ids"]:
        print(f"'>>> Chunk length: {len(chunk)}'")

    def group_texts(examples):
        # Concatenate all texts
        concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
        # Compute length of concatenated texts
        total_length = len(concatenated_examples[list(examples.keys())[0]])
        # We drop the last chunk if it's smaller than chunk_size
        total_length = (total_length // chunk_size) * chunk_size
        # Split by chunks of max_len
        result = {
            k: [t[i : i + chunk_size] for i in range(0, total_length, chunk_size)]
            for k, t in concatenated_examples.items()
        # Create a new labels column
        result["labels"] = result["input_ids"].copy()
        return result

    lm_train_dataset =, batched=True)

    lm_test_dataset =, batched=True)


    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.15)

    samples = [lm_train_dataset[i] for i in range(2)]
    for sample in samples:
        _ = sample.pop("word_ids")

    for chunk in data_collator(samples)["input_ids"]:
        print(f"\n'>>> {tokenizer.decode(chunk)}'")

    print('\nraw_train_dataset.features', raw_train_dataset.features)
    # print size" loaded train_dataset length is: {len(train_dataset)}")" loaded test_dataset length is: {len(test_dataset)}")

    #model =  AutoFeatureExtractor.from_pretrained(args.model_id)
    print('\nmodel', model)
    batch_size = 64
    # Show the training loss with every epoch
    logging_steps = len(lm_train_dataset) // batch_size
    model_name = 'db-mlm-finetune'
    print('args.learning_rate', args.learning_rate)
    print('type', type(args.learning_rate))
    training_args = TrainingArguments(
        num_train_epochs = int(args.epochs)

    trainer = Trainer(

    # train model
    # Saves the model to s3
    for path, subdirs, files in os.walk('/opt/ml'): 
        for name in files: print(os.path.join(path, name))

Hey @MaximusDecimusMeridi,

It makes sense that your training is not faster when using g4dn.2xlarge or g4dn.4xlarge, since they also have only 1 GPU.

But when using p3.2xlarge there should be some difference. What are you seeing when you take a look at the GPUUtilization and GPUMemoryUtilization in your training job overview?

1 Like

Indeed, switching from g4dn.xlarge to g4dn.4xlarge will speed up training only if you have a CPU bottleneck (check in CloudWatch what the % utilization is for CPU and GPU).

It’s kind of odd that p3.2xlarge doesn’t speed things up; I recommend you check CloudWatch to verify that your GPU is busy enough.

I also see a couple of for loops in your code: are you sure it’s worth running those on a GPU instance? How long do they take compared to the expected training time? If you have long CPU-only steps, consider running them outside the GPU job.