Training using multiple GPUs

@sgugger hi, I’m trying to fine-tune the “meta-llama/Llama-2-7b” model in a Kaggle notebook with two T4 GPUs (T4 x 2), but I’m noticing that only one GPU is being used. The nvidia-smi output and my training code are below.

Sun Feb 25 14:06:28 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.129.03             Driver Version: 535.129.03   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|=========================================+======================+======================|
|   0  Tesla T4                       Off | 00000000:00:04.0 Off |                    0 |
| N/A   76C    P8              11W /  70W |      0MiB / 15360MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
|   1  Tesla T4                       Off | 00000000:00:05.0 Off |                    0 |
| N/A   77C    P8              12W /  70W |      0MiB / 15360MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                                         
+---------------------------------------------------------------------------------------+
| Processes:                                                                            |
|  GPU   GI   CI        PID   Type   Process name                            GPU Memory |
|        ID   ID                                                             Usage      |
|=======================================================================================|
|  No running processes found                                                           |
+---------------------------------------------------------------------------------------+
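
A quick sanity check (separate from the training script below) that both T4s are visible to PyTorch:

import torch

# Both Kaggle T4s appear in nvidia-smi; this should report 2 devices.
print(torch.cuda.device_count())
for i in range(torch.cuda.device_count()):
    print(i, torch.cuda.get_device_name(i))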

import pandas as pd
import torch
from datasets import Dataset, load_dataset
from random import randrange
from peft import LoraConfig, get_peft_model, AutoPeftModelForCausalLM, prepare_model_for_kbit_training
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, TrainingArguments
from trl import SFTTrainer
import warnings
warnings.filterwarnings("ignore")
df = pd.read_csv("train.csv")
train = Dataset.from_pandas(df)
model_id = "meta-llama/Llama-2-7b"

tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=BitsAndBytesConfig(load_in_4bit=True),
    torch_dtype=torch.float16,
    device_map="auto",
)
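# Note: device_map="auto" shards the model's layers across the visible devices
# (model parallelism handled by accelerate); it does not replicate the model for
# data-parallel training, and a 4-bit 7B model can fit on a single T4, which may
# be why only GPU 0 shows any usage.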
model.resize_token_embeddings(len(tokenizer))
model = prepare_model_for_kbit_training(model)  # model is loaded in 4-bit, so use the k-bit helper
peft_config = LoraConfig(
                          lora_alpha=16,
                          lora_dropout=0.1,
                          r=64,
                          bias="none",
                          task_type="CAUSAL_LM"
                        )
model = get_peft_model(model, peft_config)

args = TrainingArguments(
    output_dir='custom_domain_test',
    num_train_epochs=5,
    per_device_train_batch_size=8, 
    optim = "adamw_torch",
    logging_steps = 100,
    save_total_limit = 2,
    save_strategy = "no",
    load_best_model_at_end=False,
    learning_rate=2e-4,
    fp16=True,
    seed=42,
    warmup_ratio = 0.1,
    lr_scheduler_type = "linear",
    report_to="none",
    dataloader_num_workers = 4
)
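# Note: per_device_train_batch_size is per GPU, so running data-parallel on both
# T4s would give an effective batch size of 16; dataloader_num_workers=4 controls
# how many CPU worker processes prepare batches for each GPU.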

# Create the trainer
trainer = SFTTrainer(
    model=model,
    train_dataset=train,
    # eval_dataset=test,
    dataset_text_field='text',
    peft_config=peft_config,
    max_seq_length=1042,
    tokenizer=tokenizer,
    args=args,
    packing=True,
)

# train
trainer.train()

Can you please tell me how to utilise both GPUs and increase GPU and CPU utilisation using the Hugging Face Trainer?
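
For context, this is the kind of multi-GPU setup I think might be needed: launching the script with torchrun or accelerate so that each process drives one GPU, and pinning each model replica to its process's device instead of device_map="auto". I'm not sure this is correct, hence the question; the file name train_llama.py is just a placeholder for the script above.

# Hypothetical launch commands (train_llama.py is a placeholder):
#   torchrun --nproc_per_node=2 train_llama.py
#   accelerate launch --num_processes=2 train_llama.py

# Inside the script, each process would load its own 4-bit copy of the model
# onto its own GPU instead of letting device_map="auto" shard a single copy:
from accelerate import PartialState

device_map = {"": PartialState().process_index}  # GPU 0 for rank 0, GPU 1 for rank 1
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=BitsAndBytesConfig(load_in_4bit=True),
    torch_dtype=torch.float16,
    device_map=device_map,
)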