Unable to resume multi-GPU training from a checkpoint with SFTTrainer

Training was interrupted, and when I restarted it there was no output in the terminal. GPU stats constantly show 3 MB and 420 MB of memory usage on the two A100s. Is training stuck?

import torch
torch.backends.cuda.matmul.allow_tf32 = True
torch.backends.cudnn.allow_tf32 = True

max_seq_length = 2048 
dtype = None
load_in_4bit = True 
from utils.util import *
from transformers import TrainingArguments
from trl import SFTTrainer
from transformers import AutoTokenizer
from datasets import load_dataset
from accelerate import PartialState
device_string = PartialState().process_index
args = TrainingArguments(
    per_device_train_batch_size = 8,
    per_device_eval_batch_size = 8,
    gradient_accumulation_steps = 4,
    warmup_steps = 5,
    num_train_epochs = 1,
    save_steps = 100,
    learning_rate = 2e-4,
    fp16 = False,
    bf16 = True,
    logging_steps = 2,
    optim = "adamw_8bit",
    weight_decay = 0.01,
    lr_scheduler_type = "linear",
    seed = 3407,
    output_dir = "outputs",
    resume_from_checkpoint = 'outputs/checkpoint-7900',
    do_eval = True,
    eval_strategy = "steps",
    eval_steps = 1000,
    save_total_limit = 5,
    gradient_checkpointing = True,
    gradient_checkpointing_kwargs = {'use_reentrant': False},
    tf32 = True,
    logging_dir = 'logs',
    log_level = 'info',
)
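# Note: resume_from_checkpoint inside TrainingArguments is not read by Trainer itself;
# what actually controls resuming is the resume_from_checkpoint argument passed to
# trainer.train() further down.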
tokenizer = AutoTokenizer.from_pretrained("tokenizer2")
tokenizer.padding_side = 'right'
 
model = Load_Model.from_pretrained(
    "training1/model1",
    max_seq_length = 2048,
    device_map = {'': device_string},
    dtype = None,
    load_in_4bit = True,
)
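# device_map={'': device_string} above places the whole model on this process's GPU
# (one replica per process), which is the usual setup for DDP-style multi-GPU training
# with a quantized model.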

model.resize_token_embeddings(len(tokenizer))
# model.save_pretrained('base_model2')
from peft import LoraConfig

peft_config = LoraConfig(
    r=64,
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",
                      "embed_tokens"],
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
    use_rslora = True, 
    task_type="CAUSAL_LM",
    modules_to_save=["embed_tokens"]
)
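# embed_tokens is listed in modules_to_save so that the resized embedding matrix
# (see resize_token_embeddings above) is trained and saved in full rather than
# only receiving a LoRA adapter.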
# from peft import get_peft_model,prepare_model_for_kbit_training
# model = prepare_model_for_kbit_training(model)
# model = get_peft_model(model, peft_config)
print('trainable parameters: ',sum(p.numel() for p in model.parameters() if p.requires_grad))
dataset = load_dataset('csv', data_files='llm_prompts2.csv', split='train')

dataset=dataset.train_test_split(test_size=0.01, seed = 47, shuffle=True)
print("dataset loaded. Starting training.")
# Train

trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = dataset['train'],
    eval_dataset = dataset['test'],
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    dataset_num_proc = 16,
    packing = False,  # packing=True can make training up to ~5x faster on short sequences
    args = args,
    peft_config = peft_config,
)
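# resume_from_checkpoint=True tells the Trainer to resume from the most recent
# checkpoint found in output_dir ("outputs"); pass an explicit path instead to
# resume from a specific checkpoint such as outputs/checkpoint-7900.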
trainer_stats = trainer.train(
    resume_from_checkpoint=True
)

Issue solved. It just took some time, and then training started normally.
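For anyone who hits the same thing: the long silent pause on resume is most likely the Trainer fast-forwarding the dataloader, i.e. iterating through (and discarding) the batches that were already consumed before the interruption so that the data order matches the original run. During that phase there is no training output and almost no GPU activity, so it can look like a hang. A minimal sketch of the relevant flag, assuming the same TrainingArguments as in the script above (ignore_data_skip is a standard transformers option; everything else is unchanged):

from transformers import TrainingArguments

args = TrainingArguments(
    output_dir = "outputs",
    ignore_data_skip = True,  # start immediately on resume instead of replaying already-seen batches
    # ... all other arguments exactly as in the original script ...
)

The trade-off is that with ignore_data_skip=True the resumed run will not see the exact same data order as the uninterrupted run would have.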

How long did it take for training to restart? I’ve run into a similar problem where multi-GPU training hangs indefinitely, though with Trainer rather than SFTTrainer.