ValueError: Expected input batch_size (1024) to match target batch_size (4)

I want to perform LoRA fine-tuning on a Llama 3 model, using the following dataset:
sst2_sent.json

sst2_sent.json example

{"sentence": "hide new secretions from the parental units", "label": 0}
{"sentence": "contains no wit , only labored gags", "label": 0}
{"sentence": "that loves its characters and communicates something rather beautiful about human nature", "label": 1}
{"sentence": "remains utterly satisfied to remain the same throughout", "label": 0}
........

I have split the dataset into training and testing data.

But I get the following error while training:

ValueError: Expected input batch_size (1024) to match target batch_size (4).

Here is my code:

from datasets import load_dataset
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from transformers import (AutoModelForTokenClassification, AutoTokenizer,
                          DataCollatorForSeq2Seq, Trainer, TrainingArguments)

def tokenize_dataset(dataset, tokenizer):
    def preprocess_function(examples):
        inputs = tokenizer(examples["sentence"], padding="max_length", truncation=True, max_length=128)
        inputs["labels"] = examples["label"]
        return inputs
    return dataset.map(preprocess_function, batched=True)
def load_model(model_name):
    """Load the model and configure LoRA
    """
    model = AutoModelForTokenClassification.from_pretrained(
        model_name,
        quantization_config=quantization_config,
        device_map='cuda:0',
        num_labels=2
    )

    lora_config = LoraConfig(
        r=16,  
        lora_alpha=8,  
        target_modules=['q_proj', 'k_proj', 'v_proj', 'o_proj'],
        lora_dropout=0.05,  
        bias='none',  # whether to train bias weights; set to 'none' for attention layers
        task_type='SEQ_CLS'
    )
    model = prepare_model_for_kbit_training(model)
    return get_peft_model(model, lora_config)
def train_model(model, train_dataset, test_dataset, tokenizer, output_dir):
    """Set training parameters and fine-tune them"""
    training_args = TrainingArguments(
        output_dir=output_dir,
        per_device_train_batch_size=4,
        gradient_accumulation_steps=4,
        logging_steps=10,
        num_train_epochs=3,
        save_steps=100,
        learning_rate=1e-4,
        save_on_each_node=True,
        gradient_checkpointing=True
    )
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=test_dataset,
        data_collator=DataCollatorForSeq2Seq(tokenizer=tokenizer, padding=True),
    )

    print("Staring to training...")
    trainer.train()
    print("Finished to training...")
    trainer.save_pretrained(output_dir)  # Save model
    tokenizer.save_pretrained(output_dir)  # Save tokenizer

main.py:

# 1. Load dataset
print("Loading and processing dataset...")
dataset = load_dataset("json", data_files={
    "train": "train.json",
    "test": "test.json"})
train_dataset = dataset["train"]
test_dataset = dataset["test"]

# 2. Tokenization
print("------------------------------------------------------------")
print("Start the Tokenization dataset...")
tokenizer = AutoTokenizer.from_pretrained(llm_name, add_prefix_space=True)
tokenizer.pad_token_id = tokenizer.eos_token_id
tokenizer.pad_token = tokenizer.eos_token
tokenized_train = tokenize_dataset(train_dataset, tokenizer)
tokenized_test = tokenize_dataset(test_dataset, tokenizer)
# 3. Model fine-tuning
print("------------------------------------------------------------")
print("Loading model and starting fine-tuning...")
model = load_model("meta-llama/Meta-Llama-3-8B-Instruct")
model.config.pad_token_id = tokenizer.pad_token_id
model.config.use_cache = False
train_model(model, tokenized_train, tokenized_test, tokenizer, "./llm-lora-fine-tuned")

I’ve seen a few similar posts, but none of the proposed answers were useful in this case. Has this ever happened to you?
I’d love some insight.
Thanks in advance.


I tried to reproduce it, but no matter what I did, DataCollatorForSeq2Seq kept failing…
If you use a CausalLM model and DataCollatorForLanguageModeling together, it will work, but it probably won’t be the training you want, since that turns the task into plain language modeling rather than classification; a rough sketch is below.
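A minimal, untested sketch of that combination, reusing tokenizer and tokenized_train from your post; the column names being dropped are assumptions based on your preprocess_function:

from transformers import AutoModelForCausalLM, DataCollatorForLanguageModeling

# Causal LM head instead of a classification head
lm_model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Meta-Llama-3-8B-Instruct",
    device_map="cuda:0",
)

# Drop the class-label columns: with mlm=False the collator builds its own
# labels by copying input_ids, so the integer labels would only get in the way.
lm_train = tokenized_train.remove_columns(["sentence", "label", "labels"])

lm_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)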
The fastest fix might be to inherit from DataCollatorForSeq2Seq and rewrite the part that handles the labels, something like the sketch below.
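A rough sketch of that idea, assuming the batches reaching the collator contain only input_ids, attention_mask, and an integer labels entry (which is what your tokenize_dataset produces once the Trainer drops the unused columns); the class name is illustrative and the code is untested:

import torch
from transformers import DataCollatorForSeq2Seq

class ClassificationCollator(DataCollatorForSeq2Seq):
    """Pad the text fields but pass the integer class labels straight through."""

    def __call__(self, features, return_tensors=None):
        # Pull the class labels out first so the padding logic does not
        # try to treat them as token-id sequences.
        labels = [feature.pop("labels") for feature in features]
        batch = self.tokenizer.pad(
            features,
            padding=self.padding,
            return_tensors=return_tensors or "pt",
        )
        batch["labels"] = torch.tensor(labels, dtype=torch.long)
        return batch

You would then pass ClassificationCollator(tokenizer=tokenizer, padding=True) to the Trainer as data_collator in place of the stock DataCollatorForSeq2Seq.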