I am running an MLM fine-tuning job, stitched together from different examples. Training on an ml.g4dn.xlarge takes about an hour, but g4dn.2xlarge, g4dn.4xlarge, and p3.2xlarge all take almost exactly the same time. Why is there no speedup in training? It makes me think I am not actually utilizing the GPU.
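To rule out the obvious, something like this near the top of the training script should confirm the container even sees a GPU (a minimal sketch, purely diagnostic; the Trainer picks its device on its own):

    import torch

    # Confirm the training container exposes a CUDA device before training starts.
    print("CUDA available:", torch.cuda.is_available())
    print("Device count:", torch.cuda.device_count())
    if torch.cuda.is_available():
        print("Device name:", torch.cuda.get_device_name(0))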
My script is below:
from transformers import (
    AutoModel,
    Trainer,
    TrainingArguments,
    DataCollatorForLanguageModeling,
    AutoTokenizer,
    AutoFeatureExtractor,
    AutoModelForMaskedLM,
    default_data_collator,
)
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from datasets import load_dataset
import random
import logging
import sys
import argparse
import os
import torch
if __name__ == "__main__":

    parser = argparse.ArgumentParser()

    # Hyperparameters sent by the client are passed as command-line arguments to the script.
    parser.add_argument("--epochs", type=int, default=3)
    parser.add_argument("--train_batch_size", type=int, default=32)
    parser.add_argument("--eval_batch_size", type=int, default=64)
    parser.add_argument("--warmup_steps", type=int, default=500)  # currently unused below
    parser.add_argument("--model_id", type=str)
    # SageMaker passes hyperparameters as strings; let argparse convert the numeric ones.
    parser.add_argument("--learning_rate", type=float, default=5e-5)
    parser.add_argument("--weight_decay", type=float, default=0.01)
    parser.add_argument("--train_file", type=str, default="train.DbEmbeddings")
    parser.add_argument("--test_file", type=str, default="test.DbEmbeddings")
    # NOTE: argparse's type=bool treats any non-empty string as True; harmless here
    # only because fp16 is hard-coded in TrainingArguments below.
    parser.add_argument("--fp16", type=bool, default=True)

    # Data, model, and output directories
    parser.add_argument("--output_data_dir", type=str, default=os.environ["SM_OUTPUT_DATA_DIR"])
    parser.add_argument("--model_dir", type=str, default=os.environ["SM_MODEL_DIR"])
    parser.add_argument("--n_gpus", type=str, default=os.environ["SM_NUM_GPUS"])
    parser.add_argument("--training_dir", type=str, default=os.environ["SM_CHANNEL_TRAIN"])
    parser.add_argument("--test_dir", type=str, default=os.environ["SM_CHANNEL_TEST"])

    args, _ = parser.parse_known_args()

    # Set up logging
    logger = logging.getLogger(__name__)
    logging.basicConfig(
        level=logging.getLevelName("INFO"),
        handlers=[logging.StreamHandler(sys.stdout)],
        format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
    )
    print('\nWalk!!:')
    for path, subdirs, files in os.walk('/opt/ml'):
        for name in files:
            print(os.path.join(path, name))
    # Load datasets
    raw_train_dataset = load_dataset("json", data_files=os.path.join(args.training_dir, args.train_file))["train"]
    raw_test_dataset = load_dataset("json", data_files=os.path.join(args.test_dir, args.test_file))["train"]
    print('\nraw_train_dataset.features', raw_train_dataset.features)

    # Load tokenizer
    tokenizer = AutoTokenizer.from_pretrained(args.model_id)

    print('\nDownloading model args.model_id', args.model_id)
    # Download model from the model hub
    model = AutoModelForMaskedLM.from_pretrained(args.model_id, output_hidden_states=True)
    def tokenize_function(examples):
        result = tokenizer(examples["source"])
        # word_ids are only available on fast tokenizers; kept for whole-word masking later
        if tokenizer.is_fast:
            result["word_ids"] = [result.word_ids(i) for i in range(len(result["input_ids"]))]
        return result
    # Use batched=True to activate fast multithreading!
    train_dataset = raw_train_dataset.map(
        tokenize_function, batched=True, remove_columns=["source"]
    )
    test_dataset = raw_test_dataset.map(
        tokenize_function, batched=True, remove_columns=["source"]
    )
    chunk_size = 128

    # Quick look at a few samples; slicing produces a list of lists for each feature
    tokenized_samples = train_dataset[:7]
    for idx, sample in enumerate(tokenized_samples["input_ids"]):
        print(f"'>>> Description {idx} length: {len(sample)}' {sample}")

    concatenated_examples = {
        k: sum(tokenized_samples[k], []) for k in tokenized_samples.keys()
    }
    total_length = len(concatenated_examples["input_ids"])
    print(f"'>>> Concatenated descriptions length: {total_length}'")

    chunks = {
        k: [t[i : i + chunk_size] for i in range(0, total_length, chunk_size)]
        for k, t in concatenated_examples.items()
    }
    for chunk in chunks["input_ids"]:
        print(f"'>>> Chunk length: {len(chunk)}'")
    def group_texts(examples):
        # Concatenate all texts
        concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
        # Compute length of the concatenated texts
        total_length = len(concatenated_examples[list(examples.keys())[0]])
        # Drop the last chunk if it's smaller than chunk_size
        total_length = (total_length // chunk_size) * chunk_size
        # Split into chunks of chunk_size
        result = {
            k: [t[i : i + chunk_size] for i in range(0, total_length, chunk_size)]
            for k, t in concatenated_examples.items()
        }
        # Create a new labels column (for MLM, labels start as a copy of input_ids)
        result["labels"] = result["input_ids"].copy()
        return result

    lm_train_dataset = train_dataset.map(group_texts, batched=True)
    lm_test_dataset = test_dataset.map(group_texts, batched=True)
    print(tokenizer.decode(lm_train_dataset[1]["input_ids"]))

    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.15)

    # Preview what the collator does to a couple of samples; word_ids is not a
    # tensor feature, so pop it before collating
    samples = [lm_train_dataset[i] for i in range(2)]
    for sample in samples:
        _ = sample.pop("word_ids")
    for chunk in data_collator(samples)["input_ids"]:
        print(f"\n'>>> {tokenizer.decode(chunk)}'")

    print('\nraw_train_dataset.features', raw_train_dataset.features)

    # Print sizes
    logger.info(f" loaded train_dataset length is: {len(train_dataset)}")
    logger.info(f" loaded test_dataset length is: {len(test_dataset)}")

    #model = AutoFeatureExtractor.from_pretrained(args.model_id)
    print('\nmodel', model)
    # NOTE: hard-coded; this overrides the --train_batch_size / --eval_batch_size
    # arguments parsed above, so every instance type trains with the same per-device batch size
    batch_size = 64

    # Show the training loss with every epoch
    logging_steps = len(lm_train_dataset) // batch_size
    model_name = 'db-mlm-finetune'

    training_args = TrainingArguments(
        output_dir=args.model_dir,
        #overwrite_output_dir=True,
        evaluation_strategy="epoch",
        learning_rate=args.learning_rate,
        weight_decay=args.weight_decay,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        #push_to_hub=True,
        fp16=True,  # hard-coded; args.fp16 is ignored
        logging_steps=logging_steps,
        num_train_epochs=args.epochs,
    )
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=lm_train_dataset,
        eval_dataset=lm_test_dataset,
        data_collator=data_collator,
        tokenizer=tokenizer,
    )

    # Train the model
    trainer.train()

    # Save the model; SageMaker uploads the contents of args.model_dir to S3
    trainer.save_model(args.model_dir)

    for path, subdirs, files in os.walk('/opt/ml'):
        for name in files:
            print(os.path.join(path, name))
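To see whether the GPU is actually kept busy during training, I'm thinking of dumping nvidia-smi into the job logs from a Trainer callback. A minimal sketch (assumes nvidia-smi is on the PATH inside the training container; the every-500-steps gate is an arbitrary choice):

    from transformers import TrainerCallback
    import subprocess

    class GpuUtilCallback(TrainerCallback):
        # Print nvidia-smi at logging events so GPU utilization shows up in the job logs.
        def on_log(self, args, state, control, logs=None, **kwargs):
            if state.global_step and state.global_step % 500 == 0:
                print(subprocess.run(["nvidia-smi"], capture_output=True, text=True).stdout)

    # usage, before trainer.train():
    #     trainer.add_callback(GpuUtilCallback())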