Hi
I want to fine-tune Mistral-7B using this code:
from transformers import BitsAndBytesConfig
import torch
# specify how to quantize the model
quantization_config = BitsAndBytesConfig(
load_in_4bit=True,
bnb_4bit_quant_type="nf4",
bnb_4bit_compute_dtype=torch.bfloat16,
)
device_map = {"": torch.cuda.current_device()} if torch.cuda.is_available() else None  # load the whole model onto the current CUDA device
model_kwargs = dict(
attn_implementation="flash_attention_2", # use Flash Attention 2 if your GPU supports it (it drastically speeds up model computations)
torch_dtype="auto",
use_cache=False, # set to False as we're going to use gradient checkpointing
device_map=device_map,
quantization_config=quantization_config,
)
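# (Not shown above: how model_id, the tokenizer, and the datasets are defined.
# The values below are assumptions/placeholders — the Mistral-7B base checkpoint
# and a dataset of mine that already has a "text" column.)
from transformers import AutoTokenizer
from datasets import load_dataset

model_id = "mistralai/Mistral-7B-v0.1"  # assumed base checkpoint
tokenizer = AutoTokenizer.from_pretrained(model_id)
tokenizer.pad_token = tokenizer.eos_token  # Mistral has no pad token by default

raw_datasets = load_dataset("your_dataset_here")  # placeholder — my dataset has a "text" column
train_dataset = raw_datasets["train"]
eval_dataset = raw_datasets["test"]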
from trl import SFTTrainer
from peft import LoraConfig
from transformers import TrainingArguments
max_seq_length = 3072
# path where the Trainer will save its checkpoints and logs
output_dir = 'data/zephyr-7b-sft-lora'
# based on config
training_args = TrainingArguments(
fp16=True, # specify bf16=True instead when training on GPUs that support bf16
do_eval=True,
evaluation_strategy="epoch",
gradient_accumulation_steps=64,
gradient_checkpointing=True,
gradient_checkpointing_kwargs={"use_reentrant": False},
learning_rate=2.0e-05,
log_level="info",
logging_steps=5,
logging_strategy="steps",
lr_scheduler_type="cosine",
max_steps=-1,
num_train_epochs=1,
output_dir=output_dir,
overwrite_output_dir=True,
per_device_eval_batch_size=1, # originally set to 8
per_device_train_batch_size=1, # originally set to 8
# push_to_hub=True,
# hub_model_id="zephyr-7b-sft-lora",
# hub_strategy="every_save",
# report_to="tensorboard",
save_strategy="no",
save_total_limit=None,
seed=42,
)
# based on config
peft_config = LoraConfig(
r=64,
lora_alpha=16,
lora_dropout=0.1,
bias="none",
task_type="CAUSAL_LM",
target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
)
trainer = SFTTrainer(
model=model_id,
model_init_kwargs=model_kwargs,
args=training_args,
train_dataset=train_dataset,
eval_dataset=eval_dataset,
dataset_text_field="text",
tokenizer=tokenizer,
packing=True,
peft_config=peft_config,
max_seq_length=max_seq_length,
)
train_result = trainer.train()
My problem is that on a single-GPU instance it works well, but on multi-GPU (4 GPUs) I get a CUDA out-of-memory error. Watching the GPU logs, I saw that one GPU's memory suddenly filled up at the beginning of training while the other three did not!
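For reference, this is roughly how I check per-GPU usage from inside the script (besides watching nvidia-smi); the helper below is just something I added for debugging, not part of the training code:

import torch

def log_gpu_memory(tag=""):
    # print used/total memory for every visible GPU
    for i in range(torch.cuda.device_count()):
        free, total = torch.cuda.mem_get_info(i)
        used_gb = (total - free) / 1024**3
        total_gb = total / 1024**3
        print(f"[{tag}] cuda:{i} {used_gb:.1f} / {total_gb:.1f} GiB used")

log_gpu_memory("before trainer.train()")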
I was wondering if anyone has encountered this issue and whether there is a way to fix it.
Thanks