I’m having an issue with saving the safetensors for my fine-tuned model and pushing them to the Hub. It’s related to, but distinct from, this post.
Whenever I try to use the save_pretrained() method I get the same error as the previous poster: I can push my model to the Hub, but the safetensors file containing my fine-tuned weights is not uploaded, so when I later download the model for inference I get the error message “Error while deserializing header: InvalidHeaderDeserialization”.
I understand that this may be due to an incompatibility between PEFT and PyTorch, so can anyone suggest a workaround?
I should add that the safetensors containing the model weights are there after fine-tuning; it’s just that they won’t save or be uploaded to Hugging Face.
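For reference, here is a minimal way to check whether the local safetensors file can be deserialized at all (the path below is just a placeholder for my output directory):

from safetensors.torch import load_file

# placeholder path; point this at the safetensors file written after fine-tuning
state_dict = load_file("output_dir/adapter_model.safetensors")
print(f"loaded {len(state_dict)} tensors")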
My code is reproduced below:
import os
import argparse

import torch
import bitsandbytes as bnb
import accelerate
from datasets import load_from_disk
from huggingface_hub import login, HfFolder
from transformers import (
    LlamaForCausalLM,
    LlamaTokenizer,
    LlamaConfig,
    set_seed,
    default_data_collator,
    BitsAndBytesConfig,
    DataCollatorWithPadding,
    Trainer,
    TrainingArguments,
)
model_id = "psymon/KoLlama2-7b" # sharded weights
tokenizer = LlamaTokenizer.from_pretrained(model_id)
tokenizer.pad_token = tokenizer.eos_token
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)

model = LlamaForCausalLM.from_pretrained(
    model_id,
    use_cache=False,
    device_map="auto",
    quantization_config=bnb_config,
)
def find_all_linear_names(model):
    lora_module_names = set()
    for name, module in model.named_modules():
        if isinstance(module, bnb.nn.Linear4bit):
            names = name.split(".")
            lora_module_names.add(names[0] if len(names) == 1 else names[-1])
    if "lm_head" in lora_module_names:  # needed for 16-bit
        lora_module_names.remove("lm_head")
    return list(lora_module_names)
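# (for a LLaMA-style model this typically picks up the attention and MLP
# projections, e.g. q_proj/k_proj/v_proj/o_proj/gate_proj/up_proj/down_proj)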
def create_peft_model(model, gradient_checkpointing=True, bf16=True):
    from peft import (
        get_peft_model,
        LoraConfig,
        TaskType,
        prepare_model_for_kbit_training,
    )
    from peft.tuners.lora import LoraLayer

    # prepare int-4 model for training
    model = prepare_model_for_kbit_training(
        model, use_gradient_checkpointing=gradient_checkpointing
    )
    if gradient_checkpointing:
        model.gradient_checkpointing_enable()

    # get lora target modules
    modules = find_all_linear_names(model)
    print(f"Found {len(modules)} modules to quantize: {modules}")

    peft_config = LoraConfig(
        r=64,
        lora_alpha=16,
        target_modules=modules,
        lora_dropout=0.1,
        bias="none",
        task_type=TaskType.CAUSAL_LM,
    )
    model = get_peft_model(model, peft_config)

    # pre-process the model by upcasting the layer norms to float32 for training stability
    for name, module in model.named_modules():
        if isinstance(module, LoraLayer):
            if bf16:
                module = module.to(torch.bfloat16)
        if "norm" in name:
            module = module.to(torch.float32)
        if "lm_head" in name or "embed_tokens" in name:
            if hasattr(module, "weight"):
                if bf16 and module.weight.dtype == torch.float32:
                    module = module.to(torch.bfloat16)

    model.print_trainable_parameters()
    return model
# create the peft model
model = create_peft_model(model, gradient_checkpointing=True, bf16=True)
output_dir = XXXXX
training_args = TrainingArguments(
    output_dir=output_dir,
    per_device_train_batch_size=8,
    bf16=True,  # Use BF16 if available
    learning_rate=5e-5,
    num_train_epochs=3,
    gradient_checkpointing=True,
    # logging strategies
    logging_dir=f"{output_dir}/logs",
    logging_strategy="steps",
    logging_steps=10,
    save_strategy="no",
)
# Create a data collator
data_collator = DataCollatorWithPadding(tokenizer)
# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,  # train/test datasets are prepared earlier (not shown here)
    eval_dataset=test_dataset,
    data_collator=data_collator,
)
# Start training
trainer.train()
# merge the LoRA adapter into the base model, then save
model = model.merge_and_unload()
model.save_pretrained('model_name')
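For completeness, this is roughly how I load the model back for inference after pushing it to the Hub, which is where the InvalidHeaderDeserialization error appears (the repo id below is just a placeholder for my own repo):

from transformers import LlamaForCausalLM, LlamaTokenizer

repo_id = "my-username/KoLlama2-7b-finetuned"  # placeholder for my Hub repo
tokenizer = LlamaTokenizer.from_pretrained(repo_id)
# this is the call that fails with "Error while deserializing header: InvalidHeaderDeserialization"
model = LlamaForCausalLM.from_pretrained(repo_id, device_map="auto")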