How many GPU resources do I need for full fine-tuning of a 7B model?

I tried to use SFTTrainer with a single A100 80GB for full fine-tuning of the Llama 2 7B model, but I got OOM even with batch size 1.
Is this expected?
I currently have 3 A100 GPUs available; is there any way to do full fine-tuning?
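
For a rough sense of scale (a back-of-the-envelope sketch, assuming plain mixed-precision AdamW with no memory-saving tricks): full fine-tuning keeps fp16 weights, fp16 gradients, and fp32 optimizer state, roughly 16 bytes per parameter before activations.

# Back-of-the-envelope memory estimate for full fine-tuning with AdamW
# (illustrative only; activations and framework overhead come on top)
params = 7e9
bytes_per_param = 2 + 2 + 4 + 4 + 4  # fp16 weights, fp16 grads, fp32 master copy, Adam m and v
print(f"~{params * bytes_per_param / 1e9:.0f} GB")  # ~112 GB, already past a single 80 GB A100

That is why batch size 1 on a single 80 GB card still runs out of memory.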

I was able to train a 7B model in under 10 GB.

The trick is to batch up the training data with a small per-device batch size and to load the model quantized (8-bit, or 4-bit as in the code below)… 3 A100s, dang…

I would start with the batch size… then I would try quantizing the model.

Here is some code that worked for me:

import torch
import pandas as pd
from datasets import load_dataset
from accelerate import Accelerator
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
)
from peft import LoraConfig, prepare_model_for_kbit_training, get_peft_model
from trl import SFTTrainer

# Free any GPU memory left over from a previous run (handy when re-running in a notebook)
torch.cuda.empty_cache()



base_model = "lmsys/vicuna-7b-v1.5-16k"  # or a local checkpoint, e.g. "../../Llama-2-7b-chat-hf/"
training_dataset = "testing_dataset.csv"
new_model = "vicuna-7b-v1.5-16k-summarization"


prompts = ['Here is the latest context of the call.',
 'Here are the relevant facts/triplets that need to be used to construct a response.',
 'Use the call context and the relevant facts/triplets to create an ideal agent verbal dialogue to respond to a customer service agent.']

# Clean up the GPT responses by collapsing long runs of spaces
df = pd.read_csv("../hc_training.csv")
df.gpt_response_extract = (
    df.gpt_response_extract
    .str.replace("     ", "  ")
    .str.replace("    ", "  ")
    .str.replace("   ", "  ")
)

# Build one Llama-style instruction string per row and write it out as a single-column CSV
training_data = "<s>[INST] " + prompts[0] + " \n\n" + df.call_input + "\n\n" + prompts[1] + "\n\n" + df.gpt_response_extract + "\n\n" + prompts[2] + " [/INST] " + df.edited_gpt_response_generate + " </s>"
pd.DataFrame(training_data, columns=["text"]).to_csv(training_dataset, index=False)
dataset = load_dataset('csv', data_files=training_dataset, split="train")



# 4-bit NF4 quantization (QLoRA-style); this is what keeps the 7B model under ~10 GB
compute_dtype = torch.float16

quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=False,
)

model = AutoModelForCausalLM.from_pretrained(
    base_model,
    quantization_config=quant_config,  # or pass load_in_8bit=True instead
    device_map={"": Accelerator().local_process_index},
)
model = prepare_model_for_kbit_training(model)
tokenizer = AutoTokenizer.from_pretrained(base_model, token="hf_...")  # your Hugging Face access token

# add LoRA to model
lora_config = LoraConfig(
    r=64,
    lora_alpha=16,
    lora_dropout=0.1,
    bias="none",
    task_type="CAUSAL_LM",
)

model = get_peft_model(model, lora_config)
model.config.use_cache = False
model.config.pretraining_tp = 1
tokenizer.padding_side = "right"


training_params = TrainingArguments(
    output_dir="./results",
    num_train_epochs=4,
    per_device_train_batch_size=7,  # <-- reduce/increase this to match your GPU resources
    gradient_accumulation_steps=1,
    optim="paged_adamw_32bit",
    save_steps=25,
    logging_steps=50,
    learning_rate=2.5e-4,
    weight_decay=0.001,
    fp16=False,
    bf16=False,
    max_grad_norm=0.3,
    max_steps=-1,
    warmup_ratio=0.03,
    group_by_length=True,
)



tokenizer.pad_token = tokenizer.eos_token
trainer = SFTTrainer(
    model=model,
    train_dataset=dataset,
    peft_config=lora_config,
    dataset_text_field="text",
    max_seq_length=None,
    tokenizer=tokenizer,
    args=training_params,
    packing=False
)

trainer.train()
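
After training, you would typically save just the LoRA adapter and, if you want a standalone model for inference, merge it back into the base weights. A minimal sketch reusing the base_model / new_model names above (merging assumes enough memory for the full fp16 model):

# Save the LoRA adapter (only a few hundred MB, not the full 7B weights)
trainer.model.save_pretrained(new_model)
tokenizer.save_pretrained(new_model)

# Later, for inference: reload the base model in fp16 and merge the adapter in
from peft import PeftModel

base = AutoModelForCausalLM.from_pretrained(
    base_model, torch_dtype=torch.float16, device_map="auto"
)
merged = PeftModel.from_pretrained(base, new_model).merge_and_unload()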