I am facing an out-of-memory error when trying to fine-tune Gemma-2b for sequence classification. My code is below:
import torch as t
from datasets import Dataset
from peft import LoraConfig, get_peft_model
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    BitsAndBytesConfig,
    DataCollatorWithPadding,
    Trainer,
    TrainingArguments,
)

MODEL_ID = "google/gemma-2b"  # the Gemma-2b checkpoint

# training_df and testing_df are pandas dataframes loaded earlier (see the attached image)
training_dataset = Dataset.from_pandas(training_df, split="train")
testing_dataset = Dataset.from_pandas(testing_df)

# 4-bit NF4 quantization with double quantization, computing in bfloat16
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=t.bfloat16,
    bnb_4bit_use_double_quant=True,
)

# Set up the tokenizer
tokenizer = AutoTokenizer.from_pretrained(
    MODEL_ID,
    padding_side="right",
    add_eos_token=True,
)
tokenizer.pad_token = tokenizer.eos_token

# Set up the quantized model with a 7-way classification head on GPU 0
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_ID,
    num_labels=7,
    quantization_config=bnb_config,
    device_map={"": 0},
)

# Tokenize every example to a fixed length of 512
def tokenize(batch):
    return tokenizer(batch["text"], truncation=True, max_length=512, padding="max_length")

data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="pt")
training_dataset = training_dataset.map(tokenize, batched=True)

# LoRA adapters for the sequence-classification task
peft_config = LoraConfig(
    lora_alpha=16,
    lora_dropout=0.1,
    r=4,
    bias="none",
    task_type="SEQ_CLS",
    target_modules=["q_proj", "o_proj", "k_proj", "v_proj", "gate_proj", "up_proj"],
)
model = get_peft_model(model, peft_config)

training_arguments = TrainingArguments(
    output_dir="./model",
    learning_rate=2e-5,
    per_device_train_batch_size=128,
    per_device_eval_batch_size=128,
    weight_decay=0.01,
    logging_steps=100,
    save_steps=100,
)

trainer = Trainer(
    model=model,
    train_dataset=training_dataset,
    tokenizer=tokenizer,
    args=training_arguments,
)
trainer.train()
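Note that data_collator is created above but never passed to the Trainer, and since tokenize already pads everything to max_length=512 it would be a no-op anyway. If dynamic per-batch padding were intended instead, the wiring would look roughly like this (a sketch, not what the failing run used):

def tokenize(batch):
    # No fixed-length padding here; DataCollatorWithPadding pads each
    # batch to its longest sequence instead of always to 512
    return tokenizer(batch["text"], truncation=True, max_length=512)

trainer = Trainer(
    model=model,
    train_dataset=training_dataset,
    tokenizer=tokenizer,
    args=training_arguments,
    data_collator=data_collator,
)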
This is the error I get:
File ~/.pyenv/versions/3.11.8/lib/python3.11/site-packages/torch/nn/modules/linear.py:116, in Linear.forward(self, input)
115 def forward(self, input: Tensor) -> Tensor:
--> 116 return F.linear(input, self.weight, self.bias)
OutOfMemoryError: CUDA out of memory. Tried to allocate 4.00 GiB. GPU 0 has a total capacity of 39.39 GiB of which 3.60 GiB is free. Process 517440 has 35.78 GiB memory in use. Of the allocated memory 33.44 GiB is allocated by PyTorch, and 1.84 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
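The message suggests setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True. For reference, a minimal sketch of applying that setting from inside the script (it has to take effect before the first CUDA allocation in the process; exporting it in the shell works just as well):

import os

# Must run before torch touches the GPU for the allocator to pick it up
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"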
A snapshot of the training and testing dataframes is attached as an image. There are three features: text, label_text, and label. I am using 7 labels, indexed 1-7.
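For anyone who cannot view the image, a minimal sketch of the schema the two dataframes follow (the rows are invented placeholders, not my real data):

import pandas as pd

# Placeholder rows illustrating the schema only; the real text and
# label names come from my dataset
training_df = pd.DataFrame({
    "text": ["first example document", "second example document"],
    "label_text": ["label_one", "label_two"],  # hypothetical label names
    "label": [1, 2],  # integer labels, indexed 1-7
})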