CUDA out of memory only when performing hyperparameter search

I am working with an RTX 3070, which only has 8 GB of GPU RAM. When I train directly with trainer.train(), it runs fine with a maximum batch size of 7 (6 if running in a Jupyter notebook). However, when I attempt to run a hyperparameter search with Ray, I get CUDA out of memory every single time.

I am wondering why this could be the case.
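
One thing I am unsure about: since I only pass model_init to the Trainer (see code below), my understanding is that the Trainer still builds one model in the main process, and each Ray trial then loads its own copy in a worker process, so two copies may end up competing for the same 8 GB. A rough way to check what the main process is already holding before the search starts (just a sketch):

    import torch

    # Sketch: how much GPU memory this (driver) process has allocated before
    # any Ray trial starts loading its own model.
    print(f"driver GPU memory: {torch.cuda.memory_allocated() / 1024**3:.2f} GiB")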

Here is my code. Sorry if it’s a little long. It’s based on the following Jupyter notebooks:

    # Note: this is an excerpt from a larger script; among others, it relies on
    # these imports:
    #   from typing import Dict
    #   import editdistance
    #   from ray import tune
    #   from transformers import (VisionEncoderDecoderModel, Seq2SeqTrainer,
    #                             Seq2SeqTrainingArguments, default_data_collator)
    def model_init():
        if args.pretrained_checkpoint:
            model = VisionEncoderDecoderModel.from_pretrained(
                args.pretrained_checkpoint
            )
        else:
            model = VisionEncoderDecoderModel.from_encoder_decoder_pretrained(
                args.encoder_checkpoint, args.decoder_checkpoint
            )

        # set special tokens used for creating the decoder_input_ids from the labels
        model.config.decoder_start_token_id = tokenizer.cls_token_id
        model.config.pad_token_id = tokenizer.pad_token_id
        # make sure vocab size is set correctly
        model.config.vocab_size = model.config.decoder.vocab_size

        # set beam search parameters
        model.config.eos_token_id = tokenizer.sep_token_id
        model.config.max_length = 64
        model.config.early_stopping = True
        model.config.no_repeat_ngram_size = 3
        model.config.length_penalty = 2.0
        model.config.num_beams = 4
        return model

    def compute_metrics(pred, verbose=args.verbose_inference) -> Dict[str, float]:
        labels_ids = pred.label_ids
        pred_ids = pred.predictions

        pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
        labels_ids[labels_ids == -100] = tokenizer.pad_token_id
        label_str = tokenizer.batch_decode(labels_ids, skip_special_tokens=True)
        if verbose:
            print(pred_str)

        # TODO: switch to the CER metric from datasets.load_metric, e.g.:
        # cer = cer_metric.compute(predictions=pred_str, references=label_str)
        cer = sum([editdistance.eval(a, b) for a, b in zip(pred_str, label_str)]) / sum(
            [len(b) for b in label_str]
        )

        return {"cer": cer}

    training_args = Seq2SeqTrainingArguments(
        num_train_epochs=100,
        predict_with_generate=True,
        evaluation_strategy="epoch",
        per_device_train_batch_size=3,  # 7 max for py, 6 max for ipynb
        per_device_eval_batch_size=3,
        fp16=True,  # set to false if turning off gpu
        output_dir=args.logging_dir,
        save_strategy="epoch",
        save_total_limit=10,
        logging_steps=1000,
        learning_rate=1e-4,
        load_best_model_at_end=True,
        report_to="wandb",
    )

    # instantiate trainer
    trainer = Seq2SeqTrainer(
        model_init=model_init,
        tokenizer=feature_extractor,
        args=training_args,
        compute_metrics=compute_metrics,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        data_collator=default_data_collator,
    )

    def hp_space(trial) -> Dict[str, float]:
        # backend for ray
        return {
            "learning_rate": tune.loguniform(1e-6, 1e-4),
            "num_train_epochs": tune.choice(list(range(1, 6))),
            "seed": tune.uniform(1, 40),
            "per_device_train_batch_size": 1,
        }

    if args.hyperparameter_search:
        trainer.hyperparameter_search(
            hp_space=hp_space,
            backend="ray",
            n_trials=10,
            # search_alg=HyperOptSearch(metric="objective", mode="max"),
            # scheduler=ASHAScheduler(metric="loss", mode="min"),
            # fail_fast=True,
            max_failures=-1,
            name="testing_run_hellobro",
        )
    else:
        trainer.train()
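
In case it is relevant: my understanding from the docs is that extra keyword arguments to hyperparameter_search (like max_failures and name above) are forwarded to ray.tune.run, so I assume per-trial resources could be pinned roughly like this to keep trials from running concurrently on the single card (untested sketch, the CPU count is a guess):

    trainer.hyperparameter_search(
        hp_space=hp_space,
        backend="ray",
        n_trials=10,
        # Assumed to be forwarded to ray.tune.run: reserve the whole GPU for
        # each trial so only one trial runs at a time on this machine.
        resources_per_trial={"cpu": 2, "gpu": 1},
    )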

I encountered the same problem, so +1 (posting only for more attention).