Training process crashes without an error message

Hello everyone,

I have been using the same code to fine-tune CodeLlama (codellama/CodeLlama-7b-hf). The code was working fine, but the training process suddenly started crashing without any error message. I re-ran it several times, and each time it crashed at a different step. Updating the transformers library did not help. I also monitored GPU memory usage: it sat at roughly 5/24 GB on one GPU and 7/24 GB on the other. In addition, the estimated training time dropped from about 2 h 30 min (when training was still working) to less than one hour.
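
In case it helps with diagnosis, this is a rough sketch of what I plan to add on the next run: routing faulthandler output to a file so a hard crash still leaves a native traceback, plus a small callback that logs per-GPU memory at every logging step (the file name and the callback name are placeholders I made up):

import faulthandler
import torch
from transformers import TrainerCallback

# write native tracebacks to a file so a silent crash still leaves a trace
crash_log = open("crash_trace.log", "w")  # placeholder file name
faulthandler.enable(file=crash_log, all_threads=True)

class GpuMemoryLogger(TrainerCallback):
    # print allocated GPU memory at every logging step
    def on_log(self, args, state, control, logs=None, **kwargs):
        for i in range(torch.cuda.device_count()):
            allocated_gib = torch.cuda.memory_allocated(i) / 1024 ** 3
            print(f"step {state.global_step}: GPU {i} allocated {allocated_gib:.1f} GiB")

# after building the trainer:
# trainer.add_callback(GpuMemoryLogger())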

Here is the code I am using:

from datetime import datetime
import os
import sys
import faulthandler
import transformers
from datasets import load_dataset

import datasets
import torch
import logging  # standard-library logging, used for the file handler at the end of the script
from peft import (
    LoraConfig,
    get_peft_model,
    get_peft_model_state_dict,

    prepare_model_for_kbit_training,
    set_peft_model_state_dict,
)

# transformers classes used for this run
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    TrainingArguments,
    Trainer,
    DataCollatorForSeq2Seq,
    BitsAndBytesConfig,
)

# ===== Notes  =========================
#
# =====================================
faulthandler.enable()

transformers.logging.set_verbosity_debug()
logger = transformers.logging.get_logger("transformers")
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"


dataset = load_dataset("json", data_files=r"dataset.json", split="train")
# split once so the train and eval sets come from the same split and do not overlap
split_dataset = dataset.train_test_split(test_size=0.1)
train_dataset = split_dataset["train"]
eval_dataset = split_dataset["test"]


# print(eval_dataset[3])
# device_map = {
#    "transformer.word_embeddings": 0,
#    "transformer.word_embeddings_layernorm": 0,
#    "lm_head": "gpu",
#    "transformer.h": 0,
#    "transformer.ln_f": 0,
# }
quantization_config = BitsAndBytesConfig(
    load_in_8bit=True,  # quantize the base model to 8-bit
    llm_int8_enable_fp32_cpu_offload=True,  # allow fp32 CPU offload for modules that do not fit on GPU
)
# Load the model
base_model = "codellama/CodeLlama-7b-hf"
output_dir = "XX"


model = AutoModelForCausalLM.from_pretrained(
    base_model,
    torch_dtype=torch.float16,
    quantization_config=quantization_config,
    device_map="auto",
)

tokenizer = AutoTokenizer.from_pretrained(base_model)




# Tokenization
# Set up tokenization settings such as left padding, which makes training use less memory:
tokenizer.add_eos_token = True
tokenizer.pad_token_id = 0
tokenizer.padding_side = "left"

# Set up the tokenize function so that labels and input_ids are the same. This is
# essentially what self-supervised fine-tuning is:


def tokenize(prompt):
    result = tokenizer(
        prompt,
        truncation=True,
        max_length=512,
        padding=False,
        return_tensors=None,
    )

    # "self-supervised learning" means the labels are also the inputs:
    result["labels"] = result["input_ids"].copy()

    return result


def generate_and_tokenize_prompt(data_point):
    full_prompt = f"""YXXX

    ### Input:
    {data_point["question"]}

    ### Context:
    ###{data_point["context"]}

    ### Response:
    {data_point["answer"]}
    """
    return tokenize(full_prompt)


# Reformat to prompt and tokenize each sample:
tokenized_train_dataset = train_dataset.map(generate_and_tokenize_prompt)
tokenized_val_dataset = eval_dataset.map(generate_and_tokenize_prompt)


# Set up LoRA
model.train()  # put the model back into training mode
model = prepare_model_for_kbit_training(model)

config = LoraConfig(
    r=16,  # LoRA attention dimension (rank)
    lora_alpha=16,  # alpha parameter for LoRA scaling
    # target_modules: if not specified, modules are chosen according to the model architecture;
    # if the architecture is not known, an error is raised and the target modules must be
    # specified manually.
    target_modules=[
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
    ],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
)
model = get_peft_model(model, config)
wandb_project = "X"
if len(wandb_project) > 0:
    os.environ["WANDB_PROJECT"] = wandb_project

if torch.cuda.device_count() > 1:
    # keeps Trainer from trying its own DataParallelism when more than 1 gpu is available
    model.is_parallelizable = True
    model.model_parallel = True


batch_size = 64
per_device_train_batch_size = 4
gradient_accumulation_steps = batch_size // per_device_train_batch_size


training_args = TrainingArguments(
        per_device_train_batch_size=per_device_train_batch_size,
        gradient_accumulation_steps=gradient_accumulation_steps,
        warmup_steps=30,
        max_steps=300,
        push_to_hub=True,
        learning_rate=3e-4,
        fp16=True,
        logging_steps=10,
        optim="adamw_torch",
        eval_strategy="steps", # if val_set_size > 0 else "no",
        save_strategy="steps",
        eval_steps=20,
        save_steps=20,
        output_dir=output_dir,
        # save_total_limit=3,
        #load_best_model_at_end=True, 
        # ddp_find_unused_parameters=False if ddp else None,
        group_by_length=True, # group sequences of roughly the same length together to speed up training
        report_to="wandb", # if use_wandb else "none",
        run_name=f"codellama-{datetime.now().strftime('%Y-%m-%d-%H-%M')}", # if use_wandb else None,
    )

trainer = Trainer(
    model=model,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_val_dataset,
    args=training_args,
    data_collator=DataCollatorForSeq2Seq(
        tokenizer, pad_to_multiple_of=8, return_tensors="pt", padding=True
    ),
)
model.config.use_cache = False

old_state_dict = model.state_dict
# model.state_dict = (lambda self, *_, **__: get_peft_model_state_dict(self, old_state_dict())).__get__(
 #   model, type(model)
#)
if torch.__version__ >= "2" and sys.platform != "win32":
    print("compiling the model")
    model = torch.compile(model)

file_formatter = logging.Formatter(
    fmt="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
    datefmt="%m/%d/%Y %H:%M:%S",
)
file_handler = logging.FileHandler(
    os.path.join(training_args.output_dir, f"log.{os.getpid()}.{training_args.local_rank}.txt")
)
file_handler.setFormatter(file_formatter)
logging.root.addHandler(file_handler)
trainer.train()

trainer.push_to_hub(model_name="X")



Here are the logs of the failed run:

The following columns in the evaluation set don't have a corresponding argument in `PeftModelForCausalLM.forward` and have been ignored: answer, context, question. If answer, context, question are not expected by `PeftModelForCausalLM.forward`,  you can safely ignore this message.

***** Running Evaluation *****
  Num examples = 49
  Batch size = 8
Saving model checkpoint to network-code-llama\checkpoint-20
loading configuration file config.json from cache at C:\Users\ASUS\.cache\huggingface\hub\models--codellama--CodeLlama-7b-hf\snapshots\6c284d1468fe6c413cf56183e69b194dcfa27fe6\config.json
Model config LlamaConfig {
  "_name_or_path": "codellama/CodeLlama-7b-hf",
  "architectures": [
    "LlamaForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 1,
  "eos_token_id": 2,
  "hidden_act": "silu",
  "hidden_size": 4096,
  "initializer_range": 0.02,
  "intermediate_size": 11008,
  "max_position_embeddings": 16384,
  "mlp_bias": false,
  "model_type": "llama",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 32,
  "pretraining_tp": 1,
  "rms_norm_eps": 1e-05,
  "rope_scaling": null,
  "rope_theta": 1000000,
  "tie_word_embeddings": false,
  "torch_dtype": "bfloat16",
  "transformers_version": "4.42.3",
  "use_cache": true,
  "vocab_size": 32016
}

C:\Users\ASUS\anaconda3\envs\Alaa_env_2\lib\site-packages\torch\utils\checkpoint.py:464: UserWarning: torch.utils.checkpoint: the use_reentrant parameter should be passed explicitly. In version 2.4 we will raise an exception if use_reentrant is not passed. use_reentrant=False is recommended, but if you need to preserve the current default behavior, you can pass use_reentrant=True. Refer to docs for more details on the differences between the two variants.
  warnings.warn(
The same block of messages (the ignored-columns notice, the evaluation over 49 examples, the checkpoint save, the full LlamaConfig dump, and the use_reentrant warning) repeats, unchanged except for the checkpoint number, for checkpoint-40, checkpoint-60, checkpoint-80, checkpoint-100, and checkpoint-120. After the checkpoint-120 block the log simply stops; there is no error message or traceback at the end.


This is confusing because the model should be using these columns (they are the fields I combine into the prompt). I have searched for similar problems, but nothing I found worked in my case. I would appreciate any help.
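
For completeness, here is a quick check I can run (just a sketch using the objects defined in the script above) to confirm that the question, context, and answer text really does end up in the tokenized inputs, and to see how long the tokenized examples are, in case the shorter time estimate simply comes from shorter inputs:

sample = tokenized_train_dataset[0]
# decode the tokenized prompt back to text; question, context, and answer should all appear
print(tokenizer.decode(sample["input_ids"]))
# token count per example (tokenize() truncates at max_length=512)
print(len(sample["input_ids"]), "tokens")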