Hi, I am currently training a Longformer on a single 80 GB GPU with the following code:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "3"  # or "0,1" for multiple GPUs

import pandas as pd
from datasets import Dataset, DatasetDict
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForSequenceClassification, DataCollatorWithPadding
from torch.utils.data import DataLoader
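# (Note: the CUDA_VISIBLE_DEVICES mask only takes effect if it is set before anything
# initializes CUDA; if some other import touched torch.cuda before this point, the
# script could end up on a different card than the notebook kernel.)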
LEARNING_RATE = 0.005
MAX_LENGTH = 4096
BATCH_SIZE = 8
EPOCHS = 20
CHECKPOINT = 'allenai/longformer-base-4096'
tokenizer = AutoTokenizer.from_pretrained(CHECKPOINT)
model = AutoModelForSequenceClassification.from_pretrained(CHECKPOINT, num_labels=1, problem_type='regression')
ds = DatasetDict()
ds['train'] = Dataset.from_pandas(pd.DataFrame({'text': X_train, 'label': y_train}))
ds['validation'] = Dataset.from_pandas(pd.DataFrame({'text': X_val, 'label': y_val}))
ds['test'] = Dataset.from_pandas(pd.DataFrame({'text': X_test, 'label': y_test}))
def preprocess_function(examples):
    label = examples["label"]
    examples = tokenizer(examples["text"], truncation=True, padding="max_length", max_length=MAX_LENGTH)
    # cast the label to a real number (float) so it is treated as a regression target
    examples["label"] = float(label)
    return examples
for split in ds:
    ds[split] = ds[split].map(preprocess_function, remove_columns=["text"])
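# (Side note to myself: padding="max_length" pads every example to the full 4096 tokens,
# so every batch is as large in memory as it can possibly be. DataCollatorWithPadding is
# imported above but never used; dynamic per-batch padding would be lighter, e.g.:
#     data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
# and then data_collator=data_collator in the Trainer below.)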
from transformers import TrainingArguments, Trainer
training_args = TrainingArguments(
    output_dir="../models/camembert-fine-tuned-regression-2",
    learning_rate=LEARNING_RATE,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    num_train_epochs=EPOCHS,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    metric_for_best_model="mse",
    greater_is_better=False,  # without this, "mse" is treated as higher-is-better when picking the best checkpoint
    load_best_model_at_end=True,
    weight_decay=0.01,
)
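# (Not used here, but fp16=True, gradient_accumulation_steps, and gradient_checkpointing=True
# in TrainingArguments are the usual memory levers if this turns out to be a genuine
# memory limit rather than a device-selection problem.)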
import torch
class RegressionTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False):
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        # logits have shape (batch_size, 1); take the single regression value per example
        logits = outputs[0][:, 0]
        loss = torch.nn.functional.mse_loss(logits, labels)
        return (loss, outputs) if return_outputs else loss
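# (As far as I understand, with num_labels=1 and problem_type='regression' the model
# would already compute an MSE loss internally when labels are passed, so this subclass
# mainly makes the loss explicit and flattens the (batch_size, 1) logits.)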
trainer = RegressionTrainer(
    model=model,
    args=training_args,
    train_dataset=ds["train"],
    eval_dataset=ds["validation"],
    compute_metrics=compute_metrics_for_regression,
)
trainer.train()
The code above runs with no problem in a Jupyter notebook, but when I launch the same code as a .py script it tells me that CUDA is out of memory even with a batch size of 1, whereas a batch size of 8 in the notebook trains without issue.
What could cause the issue?
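In case it is relevant, here is a small check I could add at the top of the script (my own guess being that the script may be landing on a different, already-busy GPU than the notebook kernel; nothing here is specific to Longformer):

import os
import torch

print("CUDA_VISIBLE_DEVICES =", os.environ.get("CUDA_VISIBLE_DEVICES"))
print("visible device count:", torch.cuda.device_count())
if torch.cuda.is_available():
    print("device 0:", torch.cuda.get_device_name(0))
    # free / total memory on the visible device, in bytes
    free, total = torch.cuda.mem_get_info(0)
    print(f"free/total memory: {free / 1e9:.1f} / {total / 1e9:.1f} GB")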