I am working with an RTX 3070, which has only 8 GB of GPU RAM. When I train with trainer.train(), it runs fine with a maximum batch size of 7 (6 if running in a Jupyter notebook). However, when I attempt a hyperparameter search with Ray, I get CUDA out of memory every single time.
I am wondering why this could be the case.
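In case it's useful, this is roughly how I can check how much of the 8 GB is already held on the device (plain torch calls, nothing transformers-specific):

import torch

# rough check of current GPU memory usage on device 0
print(f"allocated: {torch.cuda.memory_allocated() / 1e9:.2f} GB")
print(f"reserved:  {torch.cuda.memory_reserved() / 1e9:.2f} GB")
print(f"total:     {torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB")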
Here is my code; sorry it's a little long. It's based on the following Jupyter notebooks:
- notebooks/text_classification.ipynb at master · huggingface/notebooks · GitHub
- Transformers-Tutorials/Fine_tune_TrOCR_on_IAM_Handwriting_Database_using_Seq2SeqTrainer.ipynb at master · NielsRogge/Transformers-Tutorials · GitHub
from typing import Dict

import editdistance
from ray import tune
from transformers import (
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
    VisionEncoderDecoderModel,
    default_data_collator,
)

# (args, tokenizer, feature_extractor, train_dataset, and eval_dataset
# are defined earlier in the script)

def model_init():
    if args.pretrained_checkpoint:
        # resume from an existing fine-tuned encoder-decoder checkpoint
        model = VisionEncoderDecoderModel.from_pretrained(
            args.pretrained_checkpoint
        )
    else:
        # stitch a fresh model together from separate encoder/decoder checkpoints
        model = VisionEncoderDecoderModel.from_encoder_decoder_pretrained(
            args.encoder_checkpoint, args.decoder_checkpoint
        )
    # set special tokens used for creating the decoder_input_ids from the labels
    model.config.decoder_start_token_id = tokenizer.cls_token_id
    model.config.pad_token_id = tokenizer.pad_token_id
    # make sure vocab size is set correctly
    model.config.vocab_size = model.config.decoder.vocab_size
    # set beam search parameters
    model.config.eos_token_id = tokenizer.sep_token_id
    model.config.max_length = 64
    model.config.early_stopping = True
    model.config.no_repeat_ngram_size = 3
    model.config.length_penalty = 2.0
    model.config.num_beams = 4
    return model
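(As a rough sanity check on model size, I can build the model once and count parameters; this is just a diagnostic, not part of the training script:)

model = model_init()
print(f"{sum(p.numel() for p in model.parameters()) / 1e6:.1f}M parameters")
del model  # release the reference again before training starts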
def compute_metrics(pred, verbose=args.verbose_inference) -> Dict[str, float]:
    labels_ids = pred.label_ids
    pred_ids = pred.predictions
    pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    # -100 is the ignore index for the loss; swap it back to the pad token before decoding
    labels_ids[labels_ids == -100] = tokenizer.pad_token_id
    label_str = tokenizer.batch_decode(labels_ids, skip_special_tokens=True)
    if verbose:
        print(pred_str)
    # TODO: switch to the "cer" metric from datasets.load_metric
    # cer = cer_metric.compute(predictions=pred_str, references=label_str)
    cer = sum(editdistance.eval(a, b) for a, b in zip(pred_str, label_str)) / sum(
        len(b) for b in label_str
    )
    return {"cer": cer}
training_args = Seq2SeqTrainingArguments(
    num_train_epochs=100,
    predict_with_generate=True,
    evaluation_strategy="epoch",
    per_device_train_batch_size=3,  # 7 max for .py, 6 max for .ipynb
    per_device_eval_batch_size=3,
    fp16=True,  # set to False if running without a GPU
    output_dir=args.logging_dir,
    save_strategy="epoch",
    save_total_limit=10,
    logging_steps=1000,
    learning_rate=1e-4,
    load_best_model_at_end=True,
    report_to="wandb",
)
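Since this is a memory question: evaluation also generates with 4 beams (see model_init above), so a lower-memory variant of these arguments would lean on accumulation. This is a sketch of what I mean, not what I'm currently running:

low_mem_args = Seq2SeqTrainingArguments(
    output_dir=args.logging_dir,
    predict_with_generate=True,
    evaluation_strategy="epoch",
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=8,  # keeps the effective train batch size at 8
    eval_accumulation_steps=1,      # offload predictions to CPU every step
    fp16=True,
)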
# instantiate trainer; passing model_init (rather than a model instance) is what
# lets hyperparameter_search build a fresh model for every trial
trainer = Seq2SeqTrainer(
    model_init=model_init,
    tokenizer=feature_extractor,  # passed as `tokenizer` so it is saved with checkpoints
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    data_collator=default_data_collator,
)
def hp_space(trial) -> Dict[str, float]:
    # search space for the Ray Tune backend; train batch size is pinned to 1
    return {
        "learning_rate": tune.loguniform(1e-6, 1e-4),
        "num_train_epochs": tune.choice(list(range(1, 6))),
        "seed": tune.uniform(1, 40),
        "per_device_train_batch_size": 1,
    }
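A variant I could try: every key returned from hp_space is set on the trial's TrainingArguments, so the eval batch size can be pinned the same way (hp_space_pinned is just a hypothetical name):

def hp_space_pinned(trial) -> Dict[str, float]:
    return {
        "learning_rate": tune.loguniform(1e-6, 1e-4),
        "num_train_epochs": tune.choice(list(range(1, 6))),
        "seed": tune.uniform(1, 40),
        "per_device_train_batch_size": 1,
        "per_device_eval_batch_size": 1,  # also pin the generation-time batch size
    }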
if args.hyperparameter_search:
    trainer.hyperparameter_search(
        hp_space=hp_space,
        backend="ray",
        n_trials=10,
        # search_alg=HyperOptSearch(metric="objective", mode="max"),
        # scheduler=ASHAScheduler(metric="loss", mode="min"),
        # fail_fast=True,
        max_failures=-1,  # forwarded to ray.tune.run: keep retrying failed trials
        name="testing_run_hellobro",
    )
else:
    trainer.train()
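For completeness: as far as I understand, extra kwargs to hyperparameter_search are forwarded to ray.tune.run when the backend is "ray", so reserving the GPU per trial and inspecting the winner would look roughly like this (untested sketch):

best_run = trainer.hyperparameter_search(
    hp_space=hp_space,
    backend="ray",
    n_trials=10,
    resources_per_trial={"cpu": 1, "gpu": 1},  # forwarded to ray.tune.run
)
print(best_run.run_id, best_run.objective, best_run.hyperparameters)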