Problem saving QLoRA fine-tuned model

I’m having an issue saving the safetensors for my fine-tuned model and pushing them to the Hub. It’s related to, but distinct from, this post.

Whenever I call the save_pretrained() method I run into the same problem as that previous poster. I can push my model to the Hub, but the safetensors containing my fine-tuned weights are not uploaded, so when I download the model later for inference I get the error “Error while deserializing header: InvalidHeaderDeserialization”.
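
For reference, this is roughly how I try to load the model back for inference (the repo id below is a placeholder, not my actual repo); the error is raised inside from_pretrained while the safetensors file is being read:

import torch
from peft import AutoPeftModelForCausalLM

# "my-username/kollama2-7b-qlora" is a placeholder repo id
model = AutoPeftModelForCausalLM.from_pretrained(
    "my-username/kollama2-7b-qlora",
    torch_dtype=torch.bfloat16,
    device_map="auto",
)
# fails with: Error while deserializing header: InvalidHeaderDeserialization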

I understand this may be due to an incompatibility between PEFT and PyTorch. Can anyone suggest a workaround?

I should add that the safetensors containing the model weights do exist locally after fine-tuning; they just won’t save correctly or upload to the Hugging Face Hub.
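
The upload step itself (not included in the script below) is roughly the following, run after training, with a placeholder repo id:

from huggingface_hub import login

login(token="hf_...")  # token redacted
# model and tokenizer come from the training script below
model.push_to_hub("my-username/kollama2-7b-qlora")
tokenizer.push_to_hub("my-username/kollama2-7b-qlora")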

My code is reproduced below:

import os
import argparse

import torch
import accelerate
import bitsandbytes as bnb
from datasets import load_from_disk
from huggingface_hub import login, HfFolder
from transformers import (
    LlamaForCausalLM,
    LlamaTokenizer,
    LlamaConfig,
    set_seed,
    default_data_collator,
    BitsAndBytesConfig,
    DataCollatorWithPadding,
    Trainer,
    TrainingArguments,
)

model_id = "psymon/KoLlama2-7b" # sharded weights
tokenizer = LlamaTokenizer.from_pretrained(model_id)
tokenizer.pad_token = tokenizer.eos_token

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)

model = LlamaForCausalLM.from_pretrained(
    model_id,
    use_cache=False,
    device_map="auto",
    quantization_config=bnb_config,
)

def find_all_linear_names(model):
    lora_module_names = set()
    for name, module in model.named_modules():
        if isinstance(module, bnb.nn.Linear4bit):
            names = name.split(".")
            lora_module_names.add(names[0] if len(names) == 1 else names[-1])

    if "lm_head" in lora_module_names:  # needed for 16-bit
        lora_module_names.remove("lm_head")
    return list(lora_module_names)

def create_peft_model(model, gradient_checkpointing=True, bf16=True):
    from peft import (
        get_peft_model,
        LoraConfig,
        TaskType,
        prepare_model_for_kbit_training,
    )
    from peft.tuners.lora import LoraLayer

    # prepare int-4 model for training
    model = prepare_model_for_kbit_training(
        model, use_gradient_checkpointing=gradient_checkpointing
    )
    if gradient_checkpointing:
        model.gradient_checkpointing_enable()

    # get lora target modules
    modules = find_all_linear_names(model)
    print(f"Found {len(modules)} modules to quantize: {modules}")

    peft_config = LoraConfig(
        r=64,
        lora_alpha=16,
        target_modules=modules,
        lora_dropout=0.1,
        bias="none",
        task_type=TaskType.CAUSAL_LM,
    )

    model = get_peft_model(model, peft_config)

    # pre-process the model by upcasting the layer norms to float32
    for name, module in model.named_modules():
        if isinstance(module, LoraLayer):
            if bf16:
                module = module.to(torch.bfloat16)
        if "norm" in name:
            module = module.to(torch.float32)
        if "lm_head" in name or "embed_tokens" in name:
            if hasattr(module, "weight"):
                if bf16 and module.weight.dtype == torch.float32:
                    module = module.to(torch.bfloat16)

    model.print_trainable_parameters()
    return model

# create the PEFT model
model = create_peft_model(model, gradient_checkpointing=True, bf16=True)

output_dir = XXXXX
training_args = TrainingArguments(
    output_dir=output_dir,
    per_device_train_batch_size=8,
    bf16=True,  # Use BF16 if available
    learning_rate=5e-5,
    num_train_epochs=3,
    gradient_checkpointing=True,
    # logging strategies
    logging_dir=f"{output_dir}/logs",
    logging_strategy="steps",
    logging_steps=10,
    save_strategy="no",
)

# Create a data collator
data_collator = DataCollatorWithPadding(tokenizer)

# Initialize the Trainer
# (train_dataset / test_dataset are the tokenized train/test splits prepared earlier, not shown here)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    data_collator=data_collator,
)

# Start training
trainer.train()

# merge the LoRA weights into the base model
# (merge_and_unload returns the merged model, so capture the return value)
model = model.merge_and_unload()

model.save_pretrained('model_name')
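
One workaround I’ve been considering, instead of merging the quantized model directly as at the end of the script above, is to save only the LoRA adapter, reload the base model in bf16, merge the adapter into that, and then save. A rough sketch (directory names are placeholders):

from peft import PeftModel

# save just the adapter weights ("adapter_only" is a placeholder directory)
trainer.model.save_pretrained("adapter_only")

# reload the base model in bf16 rather than 4-bit
base_model = LlamaForCausalLM.from_pretrained(
    model_id,  # "psymon/KoLlama2-7b"
    torch_dtype=torch.bfloat16,
    device_map="auto",
)

# attach the adapter, merge it into the base weights, and save the merged model
merged = PeftModel.from_pretrained(base_model, "adapter_only")
merged = merged.merge_and_unload()
merged.save_pretrained("merged_model", safe_serialization=True)

Does that approach make sense, or is there a cleaner way to get the fine-tuned safetensors onto the Hub?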