Addition of lm_head and embed_tokens layers to the LoRA adapter

Hello,

I’m fine-tuning a Llama 3.1 8B model with PEFT and for some reason the saved adapter ends up with extra layers (lm_head and embed_tokens).
I’m trying to understand why this happens, since I want to use vLLM multi-LoRA inference, which does not support adapters that contain lm_head and embed_tokens.
Is there a way to prevent it?

During fine-tuning I get the following warning:

UserWarning: Setting `save_embedding_layers` to `True` as the embedding layer has been resized during finetuning.

Attached is a snippet of the trainer, thank you for your help!

import torch
import gc
from transformers import TrainingArguments, EarlyStoppingCallback
from peft import LoraConfig, AutoPeftModelForCausalLM
from trl import DataCollatorForCompletionOnlyLM, SFTTrainer
from utils.tokenizers import load_tokenizer
from utils.models import load_model
from data_loaders.loaders import load_datasets
from utils.seed import set_seed
from utils.config import validate_config
from common.config import TrainingConfig
from utils.collators import debug_completion_only_collator
from utils.training import find_best_checkpoint, get_checkpoint_metrics, validate_checkpoints_count, clear_past_checkpoints
from utils.mlflow import setup_mlflow, save_model_to_mlflow, mlflow_end_run, save_tag
from utils.system import print_device_name
from utils.swarm_one_utils import wait_for_task_and_sync_metrics_to_mlflow
#from utils.swarm_sft_trainer import SwarmSFTTrainer
from common.logger import get_logger

logger = get_logger(__name__)

def sft_training(cfg: TrainingConfig):
    print_device_name()
    validate_config(cfg)
    setup_mlflow(cfg)

    if cfg.seed:
        set_seed(cfg.seed)

    # Load base model
    model = load_model(cfg)
    # for param in model.lm_head.parameters():
    #     param.requires_grad = False
    # logger.info("Training Parameters:\n")
    # for name, param in model.named_parameters():
    #     if param.requires_grad:
    #         logger.info(name+"\n")
    # model.resize_token_embeddings(mean_resizing=False)
    tokenizer = load_tokenizer(cfg, model)
    train_dataset, eval_dataset = load_datasets(cfg, tokenizer)

    if cfg.auto_eval_save_steps_config is not None:
        logger.info("auto_eval_save_steps_config is activated...")
        steps = round(len(train_dataset) / (cfg.batch_size * cfg.auto_eval_save_steps_config.eval_and_save_times_per_epoch))
        steps = round(steps / cfg.gradient_accumulation_steps)
        cfg.save_steps = steps
        cfg.eval_steps = steps
        logger.info("set eval and save steps to be ..." + str(steps))

    validate_checkpoints_count(train_dataset, cfg)
    clear_past_checkpoints(cfg)

    logger.info("calculating max_seq_len...")

    def tokenize_text(examples):
        return {"num_tokens": [len(tokens) for tokens in tokenizer(examples["text"])["input_ids"]]}

    # Apply tokenization to the dataset and compute the maximum token size
    dataset_with_tokens = train_dataset.map(tokenize_text, batched=True, num_proc=4)
    calculated_max_seq_len = max(dataset_with_tokens["num_tokens"])
    logger.info("found max_seq_len is " + str(calculated_max_seq_len))

    # Set training parameters
    training_params = TrainingArguments(
        # Output directory where the model predictions and checkpoints will be stored
        output_dir=cfg.output_dir,
        num_train_epochs=cfg.num_epochs,
        per_device_train_batch_size=cfg.batch_size,
        per_device_eval_batch_size=1,
        # Number of update steps to accumulate the gradients for
        gradient_accumulation_steps=cfg.gradient_accumulation_steps,
        # Save checkpoint every X updates steps
        save_steps=cfg.save_steps,
        optim=cfg.optimizer,
        # Log every X updates steps
        logging_steps=cfg.logging_steps,
        eval_steps=cfg.eval_steps,
        # Initial learning rate
        learning_rate=cfg.learning_rate,
        # Weight decay to apply to all layers except bias/LayerNorm weights
        weight_decay=cfg.weight_decay,
        fp16=cfg.fp16,            # Enable fp16/bf16 training (set bf16 to True with an A100)
        bf16=cfg.bf16,
        # Maximum gradient norm (gradient clipping)
        max_grad_norm=cfg.max_grad_norm,
        # Number of training steps (overrides num_train_epochs)
        max_steps=cfg.max_steps,
        # Ratio of steps for a linear warmup (from 0 to learning rate)
        warmup_ratio=cfg.warmup_ratio,
        # Group sequences into batches with same length
        # Saves memory and speeds up training considerably
        group_by_length=cfg.group_by_length,
        save_strategy="steps",
        evaluation_strategy="steps",
        lr_scheduler_type=cfg.lr_scheduler_type,
        load_best_model_at_end=True,
        save_total_limit=1,
        gradient_checkpointing=True,
        metric_for_best_model="eval_loss",
        report_to=["tensorboard"] + (["mlflow"] if cfg.mlflow is not None else [])
    )

    sft_trainer_args = {
        "model": model,
        "train_dataset": train_dataset,
        "eval_dataset": eval_dataset,
        "dataset_text_field": "text",
        # Maximum sequence length to use
        "max_seq_length": calculated_max_seq_len,
        "tokenizer": tokenizer,
        "args": training_params,
        # Pack multiple short examples in the same input sequence to increase efficiency
        "packing": cfg.packing,
    }

    # Add peft_config only when LoRA fine-tuning is enabled
    if cfg.lora_config:
        # Load LoRA configuration
        peft_config = LoraConfig(
            lora_alpha=cfg.lora_config.lora_alpha,
            lora_dropout=cfg.lora_config.lora_dropout,
            r=cfg.lora_config.lora_r,
            bias="none",
            task_type="CAUSAL_LM",
            target_modules=[
                "q_proj",
                "k_proj",
                "v_proj",
                "o_proj",
                "gate_proj",
                "up_proj",
                "down_proj"
                ],
            # modules_to_save = ["embed_tokens"]
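            # (enabling modules_to_save would also store a full copy of embed_tokens in the adapter file)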
        )

        sft_trainer_args["peft_config"] = peft_config

    if cfg.train_on_completions_only:
        collator = DataCollatorForCompletionOnlyLM(
            instruction_template=cfg.train_on_completions_only_config.instruction_template,
            response_template=cfg.train_on_completions_only_config.response_template,
            tokenizer=tokenizer
        )

        logger.info("Training Dataset with Collator")
        debug_completion_only_collator(cfg, train_dataset, tokenizer, collator)
        logger.info("Eval Dataset with Collator")
        debug_completion_only_collator(cfg, eval_dataset, tokenizer, collator)

        sft_trainer_args["data_collator"] = collator

    if cfg.early_stopping_config:
        sft_trainer_args["callbacks"] = [EarlyStoppingCallback(early_stopping_patience=cfg.early_stopping_config.patience,
                                                                early_stopping_threshold=cfg.early_stopping_config.threshold)]

    # If Local training
    if cfg.swarm_one_config is None:
        trainer = SFTTrainer(**sft_trainer_args)
        trainer.train()
        best_checkpoint_path = find_best_checkpoint(cfg.output_dir)
        best_checkpoint_metrics = get_checkpoint_metrics(best_checkpoint_path)
        saved_trained_model_path = best_checkpoint_path

        if cfg.lora_config is None:
            model.save_pretrained(saved_trained_model_path)
            tokenizer.save_pretrained(saved_trained_model_path)

        # clear model and data from GPU
        del trainer
        del model
        del train_dataset
        del eval_dataset

        gc.collect()
        torch.cuda.empty_cache()  # Clear unused memory from PyTorch

        save_model_to_mlflow(cfg, saved_trained_model_path, best_checkpoint_metrics)

        return saved_trained_model_path
  

Possibly this issue…

Thanks @John6666.
I’ve seen the suggestion of splitting the additional layers out into a separate safetensors file.
Do you know what impact this fix could have on the adapter’s performance?
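
For reference, my understanding of that workaround is roughly the sketch below (untested; the checkpoint and output paths are placeholders, adapter_model.safetensors is just PEFT’s default adapter file name, and extra_embeddings.safetensors is an arbitrary name for the split-off weights):

import os
from safetensors.torch import load_file, save_file

adapter_dir = "outputs/checkpoint-best"    # placeholder: adapter saved by the trainer
out_dir = "outputs/adapter_lora_only"      # placeholder: stripped adapter for vLLM
os.makedirs(out_dir, exist_ok=True)

state = load_file(os.path.join(adapter_dir, "adapter_model.safetensors"))

# Keep only the LoRA weights; move the embed_tokens / lm_head copies to a separate file
lora_only = {k: v for k, v in state.items() if "embed_tokens" not in k and "lm_head" not in k}
extras = {k: v for k, v in state.items() if "embed_tokens" in k or "lm_head" in k}

save_file(lora_only, os.path.join(out_dir, "adapter_model.safetensors"))
save_file(extras, os.path.join(out_dir, "extra_embeddings.safetensors"))

(adapter_config.json would also need to be copied next to the stripped adapter_model.safetensors so the adapter can still be loaded.)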


I’ve never used vLLM, but if you just split the safetensors file and load the pieces back together later, it shouldn’t have any particular effect on the model’s behavior. Probably.
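
Also, if you’d rather stop PEFT from writing the embedding weights into the adapter in the first place, PeftModel.save_pretrained accepts a save_embedding_layers argument that you can force to False. Something like this might work (untested sketch, paths are placeholders):

import torch
from peft import AutoPeftModelForCausalLM

# Load the base model together with the trained adapter (placeholder checkpoint path)
model = AutoPeftModelForCausalLM.from_pretrained("outputs/checkpoint-best", torch_dtype=torch.bfloat16)

# Re-save only the LoRA weights, skipping the resized embed_tokens / lm_head copies
model.save_pretrained("outputs/adapter_lora_only", save_embedding_layers=False)

The catch is that if your tokenizer really did add new tokens, the base model you serve with vLLM has to get the matching embeddings from somewhere else, so this only helps if the vocabulary actually stayed compatible with the base model.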
