I tried to use SFTTrainer with 1 A100 80G for full fine-tuning of the Llama 2 7B model, but I got OOM even at batch size 1.
Is this expected?
I currently have 3 A100 GPUs available. Is there any way to do full fine-tuning?
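For context, OOM at batch size 1 is expected here: full fine-tuning with mixed-precision AdamW keeps fp16 weights, fp16 gradients, an fp32 master copy of the weights, and two fp32 optimizer moments in memory, which for 7B parameters already exceeds a single 80 GB card before any activations. A rough back-of-envelope sketch of that arithmetic:

# rough memory estimate for full fine-tuning a 7B model with mixed-precision AdamW
# (activations are ignored, so the real footprint is even larger)
params = 7e9
bytes_per_param = (
    2    # fp16 model weights
    + 2  # fp16 gradients
    + 4  # fp32 master copy of the weights
    + 4  # AdamW first moment (fp32)
    + 4  # AdamW second moment (fp32)
)
print(f"~{params * bytes_per_param / 1e9:.0f} GB before activations")  # ~112 GB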
I was able to train a 7B model in under 10 GB.
The trick is to batch up the training data (by default all data is loaded without batching) and load the model in 8-bit… 3 A100s, dang…
I would start with batching it up… then I would try quantizing it.
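Note that the code below actually loads the model in 4-bit NF4 rather than 8-bit; if you just want plain 8-bit loading as mentioned above, a minimal sketch (the model name here is only a placeholder) would be:

from transformers import AutoModelForCausalLM, BitsAndBytesConfig

# plain 8-bit loading sketch; the full example below uses 4-bit NF4 instead
bnb_config = BitsAndBytesConfig(load_in_8bit=True)
model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-2-7b-hf",  # placeholder; use whichever base model you need
    quantization_config=bnb_config,
    device_map="auto",
)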
Here is some code that worked for me:
import os
import torch
import pandas as pd
from datasets import load_dataset
from accelerate import Accelerator
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
)
from peft import LoraConfig, prepare_model_for_kbit_training, get_peft_model
from trl import SFTTrainer

torch.cuda.empty_cache()

base_model = "lmsys/vicuna-7b-v1.5-16k"  # or "../../Llama-2-7b-chat-hf/"
training_dataset = "testing_dataset.csv"
new_model = "vicuna-7b-v1.5-16k-summarization"
prompts = [
    "Here is the latest context of the call.",
    "Here are my relevant facts/triplets needed to construct a response.",
    "Use the call context and the relevant facts/triplets to create an ideal agent verbal dialogue to respond to a customer service agent.",
]
df = pd.read_csv("../hc_training.csv")
# collapse runs of spaces (the original chained .str.replace calls lost their exact space counts in formatting)
df.gpt_response_extract = df.gpt_response_extract.str.replace(r" +", " ", regex=True)
# build instruction-formatted training examples and write them out as a CSV
training_data = (
    "<s>[INST] " + prompts[0] + " \n\n" + df.call_input + "\n\n" + prompts[1] + "\n\n"
    + df.gpt_response_extract + "\n\n" + prompts[2] + " [/INST] " + df.edited_gpt_response_generate + " </s>"
)
pd.DataFrame(training_data, columns=["text"]).to_csv(training_dataset, index=False)
dataset = load_dataset("csv", data_files=training_dataset, split="train")
compute_dtype = torch.float16

# 4-bit NF4 quantization config (QLoRA-style)
quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=False,
)

model = AutoModelForCausalLM.from_pretrained(
    base_model,
    quantization_config=quant_config,  # or load_in_8bit=True
    device_map={"": Accelerator().local_process_index},
)
model = prepare_model_for_kbit_training(model)

# read the Hugging Face token from the environment instead of hard-coding it
tokenizer = AutoTokenizer.from_pretrained(base_model, token=os.environ.get("HF_TOKEN"))
# add LoRA adapters to the quantized model
lora_config = LoraConfig(
    r=64,
    lora_alpha=16,
    lora_dropout=0.1,
    bias="none",
    task_type="CAUSAL_LM",
)
model = get_peft_model(model, lora_config)
model.config.use_cache = False
model.config.pretraining_tp = 1
tokenizer.padding_side = "right"
training_params = TrainingArguments(
    output_dir="./results",
    num_train_epochs=4,
    per_device_train_batch_size=7,  # reduce/increase this to fit your GPU memory
    gradient_accumulation_steps=1,
    optim="paged_adamw_32bit",
    save_steps=25,
    logging_steps=50,
    learning_rate=2.5e-4,
    weight_decay=0.001,
    fp16=False,
    bf16=False,
    max_grad_norm=0.3,
    max_steps=-1,
    warmup_ratio=0.03,
    group_by_length=True,
)
tokenizer.pad_token = tokenizer.eos_token
trainer = SFTTrainer(
    model=model,
    train_dataset=dataset,
    peft_config=lora_config,
    dataset_text_field="text",
    max_seq_length=None,
    tokenizer=tokenizer,
    args=training_params,
    packing=False,
)
trainer.train()
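The script defines new_model but never saves anything under it; assuming you want to keep the trained adapter, something along these lines after training should work:

# save the trained LoRA adapter and tokenizer under the name defined earlier
# (a sketch; adjust the output path as needed)
trainer.model.save_pretrained(new_model)
tokenizer.save_pretrained(new_model)

Since you have 3 A100s and the device_map line keys off the local process index, you should also be able to run the same script data-parallel across all three GPUs with something like accelerate launch --num_processes 3 train.py (the script name is just a placeholder).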