from datasets import load_dataset
dataset = load_dataset("Alok2304/Indian_Law_Final_Dataset",split="train[:30%]")
dataset
from transformers import BitsAndBytesConfig
import torch
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)
from transformers import AutoModelForCausalLM, AutoTokenizer
model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-3.2-3B",
    quantization_config=bnb_config,
    device_map="auto",  # 4-bit bitsandbytes models can't be moved with .to(device); device_map handles placement
    trust_remote_code=True,
)
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-3B")
from peft import prepare_model_for_kbit_training
model = prepare_model_for_kbit_training(model)
from peft import LoraConfig, get_peft_model
lora_config = LoraConfig(
    r=4,
    lora_alpha=8,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
    lora_dropout=0.05,
    bias="none",
)
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()
trainable params: 6,078,464 || all params: 3,218,828,288 || trainable%: 0.1888
import re
def preprocess_dataset(examples):
    messages = []
    for sentence in examples['text']:
        # Strip the Llama-2 style "<s>[INST] ... [/INST] ... </s>" wrapping and
        # rebuild each row as a user/assistant pair for the chat template.
        sentence = re.sub(r"<s>\[INST\]", "", sentence)
        sentence = sentence.split("[/INST]")
        question = sentence[0]
        answer = re.sub(r"</s>", "", sentence[1])
        messages.append([
            {"role": "user", "content": question},
            {"role": "assistant", "content": answer},
        ])
    examples['messages'] = messages
    return examples
dataset = dataset.map(preprocess_dataset,batched=True,remove_columns=['text'])
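(The split step is missing from my paste: the trainer below indexes dataset['train'] and dataset['test'], so the dataset was split first with something along these lines; the exact test_size/seed are placeholders, not the point here.)

# assumed split step, shown only so the later dataset['train'] / dataset['test'] references make sense
dataset = dataset.train_test_split(test_size=0.1, seed=42)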
from trl import setup_chat_format
model,tokenizer = setup_chat_format(model,tokenizer)
from trl import SFTConfig, SFTTrainer
args = SFTConfig(
    output_dir="lora_model/",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    learning_rate=2e-05,
    max_steps=300,
    logging_strategy="steps",
    logging_steps=25,
    save_strategy="steps",
    save_steps=25,
    eval_strategy="steps",
    eval_steps=25,
    data_seed=42,
    max_seq_length=2048,
    gradient_checkpointing=False,
    report_to="none",
)
trainer = SFTTrainer(
    model=model,
    args=args,
    processing_class=tokenizer,
    train_dataset=dataset['train'],
    eval_dataset=dataset['test'],
)
trainer.train()
/usr/local/lib/python3.11/dist-packages/torch/_dynamo/eval_frame.py:632: UserWarning: torch.utils.checkpoint: the use_reentrant parameter should be passed explicitly. In version 2.5 we will raise an exception if use_reentrant is not passed. use_reentrant=False is recommended, but if you need to preserve the current default behavior, you can pass use_reentrant=True. Refer to docs for more details on the differences between the two variants.
return fn(*args, **kwargs)
[ 26/300 03:33 < 40:41, 0.11 it/s, Epoch 0.01/1]
Step Training Loss Validation Loss
[ 43/283 02:11 < 12:32, 0.32 it/s]
---------------------------------------------------------------------------
OutOfMemoryError Traceback (most recent call last)
<ipython-input-24-3435b262f1ae> in <cell line: 0>()
----> 1 trainer.train()
13 frames
/usr/local/lib/python3.11/dist-packages/transformers/loss/loss_utils.py in ForCausalLMLoss(logits, labels, vocab_size, num_items_in_batch, ignore_index, **kwargs)
37 labels = labels.to(logits.device)
38 # Shift so that tokens < n predict n
---> 39 shift_logits = logits[..., :-1, :].contiguous()
40 shift_labels = labels[..., 1:].contiguous()
41
OutOfMemoryError: CUDA out of memory. Tried to allocate 6.69 GiB. GPU 0 has a total capacity of 14.74 GiB of which 4.29 GiB is free. Process 2635 has 10.45 GiB memory in use. Of the allocated memory 9.77 GiB is allocated by PyTorch, and 559.22 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
Why am I getting an out-of-memory error even though I quantized the model and also used LoRA? I thought I should be able to train, since the trainable params are only about 6 million. Please help me figure out what is going wrong.
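In case it helps with diagnosis, this is roughly how I could snapshot GPU memory around the model load and the training call (just a sketch, not output from the failing run):

import torch

def print_gpu_mem(tag):
    # Report PyTorch's allocated vs. reserved GPU memory in GiB
    allocated = torch.cuda.memory_allocated() / 1024**3
    reserved = torch.cuda.memory_reserved() / 1024**3
    print(f"[{tag}] allocated={allocated:.2f} GiB, reserved={reserved:.2f} GiB")

# e.g. call once after loading the quantized model and once right before trainer.train()
print_gpu_mem("after model load")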