Training Longformer works in a Jupyter notebook but not as a .py script

Hi, I am currently training a Longformer on a single 80 GB GPU with the following code:


import os
os.environ["CUDA_VISIBLE_DEVICES"] = "3"  # must be set before torch initializes CUDA; e.g. "0,1" for multiple GPUs

import pandas as pd
from datasets import Dataset, DatasetDict
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForSequenceClassification, DataCollatorWithPadding
from torch.utils.data import DataLoader

LEARNING_RATE = 0.005
MAX_LENGTH = 4096
BATCH_SIZE = 8
EPOCHS = 20

CHECKPOINT = 'allenai/longformer-base-4096'

tokenizer = AutoTokenizer.from_pretrained(CHECKPOINT)
model = AutoModelForSequenceClassification.from_pretrained(CHECKPOINT, num_labels=1, problem_type='regression')

ds = DatasetDict()

# X_train / y_train etc. come from an earlier train_test_split (not shown here)
ds['train'] = Dataset.from_pandas(pd.DataFrame({'text': X_train, 'label': y_train}))
ds['validation'] = Dataset.from_pandas(pd.DataFrame({'text': X_val, 'label': y_val}))
ds['test'] = Dataset.from_pandas(pd.DataFrame({'text': X_test, 'label': y_test}))

def preprocess_function(examples):
    label = examples["label"]
    examples = tokenizer(examples["text"], truncation=True, padding="max_length", max_length=MAX_LENGTH)

    # cast the label to a float for regression
    examples["label"] = float(label)
    return examples

for split in ds:
    ds[split] = ds[split].map(preprocess_function, remove_columns=["text"])

from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    output_dir="../models/camembert-fine-tuned-regression-2",
    learning_rate=LEARNING_RATE,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    num_train_epochs=EPOCHS,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    metric_for_best_model="mse",
    greater_is_better=False,  # lower MSE is better; otherwise this defaults to True for non-loss metrics
    load_best_model_at_end=True,
    weight_decay=0.01,
)

import torch

class RegressionTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False):
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs[0][:, 0]  # take the single regression logit per example, shape (batch,)
        loss = torch.nn.functional.mse_loss(logits, labels)
        return (loss, outputs) if return_outputs else loss

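# compute_metrics_for_regression is defined elsewhere in my script; a minimal
# sketch of what it does (assuming plain MSE, to match metric_for_best_model="mse" above):
import numpy as np

def compute_metrics_for_regression(eval_pred):
    predictions, labels = eval_pred
    predictions = predictions[:, 0]  # logits have shape (n, 1); flatten to (n,)
    return {"mse": float(np.mean((predictions - labels) ** 2))}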

trainer = RegressionTrainer(
    model=model,
    args=training_args,
    train_dataset=ds["train"],
    eval_dataset=ds["validation"],
    compute_metrics=compute_metrics_for_regression,
)

trainer.train()

The code above runs with no problem in a Jupyter notebook, but when I launch the same code as a .py script, it tells me that CUDA is out of memory even with a batch size of 1, whereas a batch size of 8 in the notebook trains without issues.
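In case it helps, this is the kind of quick sanity check (plain torch calls, nothing specific to my code) that could go at the top of the script to confirm which GPU the process actually sees and how much memory is free on it:

import os, torch

print("CUDA_VISIBLE_DEVICES =", os.environ.get("CUDA_VISIBLE_DEVICES"))
print("visible devices:", torch.cuda.device_count())
print("device 0:", torch.cuda.get_device_name(0))
free, total = torch.cuda.mem_get_info(0)  # bytes free / total on visible device 0
print(f"free/total memory: {free / 2**30:.1f} / {total / 2**30:.1f} GiB")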

What could cause the issue?