RuntimeError: tensors must be contiguous when fine-tuning GPT-J-6B with PEFT LoRA

Hello HuggingFace Team,

While fine-tuning GPT-J-6B on a dataset of roughly 40,000 samples, I ran into a RuntimeError stating that tensors must be contiguous. The error occurs when I use DeepSpeed to train the model across two V100 GPUs (32 GB each).
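
For reference, my understanding is that the error means a tensor with a non-contiguous memory layout (for example a transposed or sliced view) is being passed somewhere that requires contiguous storage. A minimal, generic illustration of what "contiguous" means, not specific to my setup:

import torch

x = torch.randn(4, 8)
y = x.t()                   # a transposed view shares storage with x, so it is not contiguous
print(y.is_contiguous())    # False
y = y.contiguous()          # .contiguous() materializes a contiguous copy
print(y.is_contiguous())    # True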

Could you please review my code and provide any suggestions or solutions?

Here is my full training script (trainer.py):

import torch
import numpy as np
import pandas as pd
from functools import partial
from data_loader import load_dataset_from_file
from accelerate import Accelerator
from peft import get_peft_model, LoraConfig, TaskType, prepare_model_for_int8_training
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    DataCollatorForLanguageModeling,
    TrainingArguments,
    set_seed,
    Trainer,
)

RESPONSE_KEY = " ### Response:"
DEFAULT_INPUT_MODEL = "EleutherAI/gpt-j-6B"
seed = 42
model_checkpoint = "cp-gpt_j_peft"

class DataCollatorForCompletionOnlyLM(DataCollatorForLanguageModeling):
    def torch_call(self, examples):
        batch = super().torch_call(examples)
        response_token_ids = self.tokenizer.encode(RESPONSE_KEY)
        labels = batch["labels"].clone()
        for i in range(len(examples)):
            response_token_ids_start_idx = None
            for idx in np.where(batch["labels"][i] == response_token_ids[0])[0]:
                if np.array_equal(
                    response_token_ids,
                    batch["labels"][i, idx : idx + len(response_token_ids)],
                ):
                    response_token_ids_start_idx = idx
                    break
            if response_token_ids_start_idx is None:
                raise RuntimeError("Could not find response key token IDs")
            response_token_ids_end_idx = response_token_ids_start_idx + len(
                response_token_ids
            )
            labels[i, :response_token_ids_end_idx] = -100
        batch["labels"] = labels
        return batch


def preprocess_batch(batch, tokenizer: AutoTokenizer, max_length: int):
    return tokenizer(batch["text"], max_length=max_length, truncation=True)


def load_training_dataset(path="dataset/final_df.csv"):
    dataset = load_dataset_from_file(path)
    # Drop records whose text starts with the response key (i.e. records with no prompt).
    dataset = dataset.filter(
        lambda rec: not rec["text"].strip().startswith(RESPONSE_KEY.strip())
    )

    def _append_end(rec):
        rec["text"] += "\n\n### End"
        return rec

    dataset = dataset.map(_append_end)
    return dataset


def preprocess_dataset(tokenizer: AutoTokenizer, max_length: int, seed=seed):
    dataset = load_training_dataset()
    _preprocessing_function = partial(
        preprocess_batch, max_length=max_length, tokenizer=tokenizer
    )
    dataset = dataset.map(
        _preprocessing_function,
        batched=True,
        remove_columns=["instruction", "input", "output", "text"],
    )
    dataset = dataset.filter(lambda rec: len(rec["input_ids"]) < max_length)
    dataset = dataset.shuffle(seed=seed)
    return dataset

def load_tokenizer(pretrained_model_name_or_path: str = DEFAULT_INPUT_MODEL):
    tokenizer = AutoTokenizer.from_pretrained(
        pretrained_model_name_or_path, use_fast=True
    )
    tokenizer.pad_token = tokenizer.eos_token
    return tokenizer


def print_trainable_parameters(model):
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
    )


def load_model(
    pretrained_model_name_or_path: str = DEFAULT_INPUT_MODEL,
    gradient_checkpointing: bool = False,
):
    model = AutoModelForCausalLM.from_pretrained(
        pretrained_model_name_or_path,
        trust_remote_code=True,
        load_in_8bit=False,
        device_map="auto",
        use_cache=False if gradient_checkpointing else True,
    )
    model = prepare_model_for_int8_training(model)
    peft_config = LoraConfig(
        task_type=TaskType.CAUSAL_LM,
        inference_mode=False,
        r=8,
        lora_alpha=32,
        lora_dropout=0.1,
    )
    model = get_peft_model(model, peft_config)
    print_trainable_parameters(model)
    return model


def get_tokenizer_model(
    pretrained_model_name_or_path: str = DEFAULT_INPUT_MODEL,
    gradient_checkpointing: bool = False,
):
    tokenizer = load_tokenizer(pretrained_model_name_or_path)
    model = load_model(
        pretrained_model_name_or_path, gradient_checkpointing=gradient_checkpointing
    )
    return tokenizer, model


def train(
    local_output_dir,
    epochs,
    per_device_train_batch_size,
    per_device_eval_batch_size,
    lr,
    seed,
    test_size=200,
):
    set_seed(seed)
    tokenizer, model = get_tokenizer_model()

    for length_setting in ['n_positions', 'max_position_embeddings', 'seq_length']:
        max_length = getattr(model.config, length_setting, None)
        if max_length:
            break
    if not max_length:
        max_length = 1024

    processed_dataset = preprocess_dataset(tokenizer=tokenizer, max_length=max_length, seed=seed)
    split_dataset = processed_dataset.train_test_split(test_size=test_size, seed=seed)
    data_collator = DataCollatorForCompletionOnlyLM(
        tokenizer=tokenizer, mlm=False, return_tensors="pt", pad_to_multiple_of=8
    )
    training_args = TrainingArguments(
        f"{model_checkpoint}-med-doctor",
        per_device_train_batch_size=per_device_train_batch_size,
        per_device_eval_batch_size=per_device_eval_batch_size,
        optim="adamw_torch",
        learning_rate=lr,
        weight_decay=0.01,
        gradient_checkpointing=True,
        gradient_accumulation_steps=2,
        fp16=True,
        warmup_ratio=0.01,
        num_train_epochs=epochs,
        deepspeed="config/deepspeed_config.json",
        evaluation_strategy="epoch",
        eval_steps=10,
        save_strategy="epoch",
        save_steps=200,
        save_total_limit=5,
        disable_tqdm=False,
        remove_unused_columns=True,
        lr_scheduler_type="linear",
        load_best_model_at_end=True,
    )

    trainer = Trainer(
        model=model,
        tokenizer=tokenizer,
        args=training_args,
        train_dataset=split_dataset["train"],
        eval_dataset=split_dataset["test"],
        data_collator=data_collator,
    )

    trainer.train()
    trainer.save_model(output_dir=local_output_dir)

def main(**kwargs):
    train(**kwargs)


if __name__ == "__main__":
    try:
        med_tune = {
            "local_output_dir": "final-gpt-j-peft/",
            "epochs": 10,
            "per_device_train_batch_size": 8,
            "per_device_eval_batch_size": 8,
            "lr": 0.001,
            "seed": seed,
            "test_size": 200,
        }
        main(**med_tune)
    except Exception:
        raise
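
As a sanity check, here is a small diagnostic sketch (not part of the script above; it assumes `model` is the PEFT-wrapped model returned by get_tokenizer_model()) that I could drop in before trainer.train() to list any parameters with a non-contiguous layout:

# Hypothetical sanity check, not part of the training script above.
# Assumes `model` is the PEFT-wrapped model returned by get_tokenizer_model().
non_contiguous = [
    name for name, param in model.named_parameters() if not param.is_contiguous()
]
print(f"non-contiguous parameters: {len(non_contiguous)}")
for name in non_contiguous:
    print(name)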

Here is my DeepSpeed config for the V100 GPUs:

{
  "fp16": {
    "enabled": true
  },
  "bf16": {
    "enabled": false
  },
  "optimizer": {
    "type": "AdamW",
    "params": {
      "lr": "auto",
      "betas": "auto",
      "eps": "auto",
      "weight_decay": "auto"
    }
  },
  "scheduler": {
    "type": "WarmupLR",
    "params": {
      "warmup_min_lr": "auto",
      "warmup_max_lr": "auto",
      "warmup_num_steps": "auto"
    }
  },
  "zero_optimization": {
    "stage": 3,
    "overlap_comm": true,
    "contiguous_gradients": true,
    "sub_group_size": 1e9,
    "reduce_bucket_size": "auto",
    "stage3_prefetch_bucket_size": "auto",
    "stage3_param_persistence_threshold": "auto",
    "stage3_max_live_parameters": 1e9,
    "stage3_max_reuse_distance": 1e9,
    "stage3_gather_16bit_weights_on_model_save": true,
    "offload_optimizer": {
      "device": "cpu",
      "pin_memory": true
    }
  },
  "gradient_accumulation_steps": "auto",
  "gradient_clipping": "auto",
  "steps_per_print": 2000,
  "train_batch_size": "auto",
  "train_micro_batch_size_per_gpu": "auto",
  "wall_clock_breakdown": false
}

I am running the above code with the following command:

deepspeed --num_gpus 2 trainer.py

I have also attached an image of the multi-GPU run.
