I want to perform LoRA fine-tuning on a Llama 3 model using the following dataset:
sst2_sent.json
Example lines from sst2_sent.json:
{"sentence": "hide new secretions from the parental units", "label": 0}
{"sentence": "contains no wit , only labored gags", "label": 0}
{"sentence": "that loves its characters and communicates something rather beautiful about human nature", "label": 1}
{"sentence": "remains utterly satisfied to remain the same throughout", "label": 0}
........
I have split the dataset into training and test sets.
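The split itself was done with the datasets library, roughly like this (a simplified sketch; the exact split fraction and seed are not the issue):
from datasets import load_dataset

# simplified sketch of the split: 90/10, exact fraction/seed may differ
full_dataset = load_dataset("json", data_files="sst2_sent.json")["train"]
split = full_dataset.train_test_split(test_size=0.1, seed=42)
split["train"].to_json("train.json")
split["test"].to_json("test.json")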
But I get the following error while training:
ValueError: Expected input batch_size (1024) to match target batch_size (4).
Here is my code:
from datasets import load_dataset
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from transformers import (AutoModelForTokenClassification, AutoTokenizer,
                          DataCollatorForSeq2Seq, Trainer, TrainingArguments)

def tokenize_dataset(dataset, tokenizer):
    def preprocess_function(examples):
        inputs = tokenizer(examples["sentence"], padding="max_length", truncation=True, max_length=128)
        inputs["labels"] = examples["label"]
        return inputs
    return dataset.map(preprocess_function, batched=True)
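The quantization_config passed to load_model below is a standard 4-bit setup, roughly this (exact settings may differ, but they are not related to the error):
import torch
from transformers import BitsAndBytesConfig

# assumed 4-bit NF4 config so the 8B model fits in GPU memory (exact values may differ)
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True,
)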
def load_model(model_name):
    """Load the model and configure LoRA."""
    model = AutoModelForTokenClassification.from_pretrained(
        model_name,
        quantization_config=quantization_config,
        device_map='cuda:0',
        num_labels=2
    )
    lora_config = LoraConfig(
        r=16,
        lora_alpha=8,
        target_modules=['q_proj', 'k_proj', 'v_proj', 'o_proj'],
        lora_dropout=0.05,
        bias='none',  # whether to train bias weights; 'none' keeps biases frozen
        task_type='SEQ_CLS'
    )
    model = prepare_model_for_kbit_training(model)
    return get_peft_model(model, lora_config)
def train_model(model, train_dataset, test_dataset, tokenizer, output_dir):
    """Set the training arguments and fine-tune the model."""
    training_args = TrainingArguments(
        output_dir=output_dir,
        per_device_train_batch_size=4,
        gradient_accumulation_steps=4,
        logging_steps=10,
        num_train_epochs=3,
        save_steps=100,
        learning_rate=1e-4,
        save_on_each_node=True,
        gradient_checkpointing=True
    )
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=test_dataset,
        data_collator=DataCollatorForSeq2Seq(tokenizer=tokenizer, padding=True),
    )
    print("Staring to training...")
    trainer.train()
    print("Finished to training...")
    trainer.save_pretrained(output_dir)  # Save model
    tokenizer.save_pretrained(output_dir)  # Save tokenizer
main.py:
# 1. Load dataset
print("Loading and processing dataset...")
dataset = load_dataset("json", data_files={
    "train": "train.json",
    "test": "test.json"})
train_dataset = dataset["train"]
test_dataset = dataset["test"]
# 2. Tokenization
print("------------------------------------------------------------")
print("Start the Tokenization dataset...")
llm_name = "meta-llama/Meta-Llama-3-8B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(llm_name, add_prefix_space=True)
# Llama 3 has no pad token, so reuse the EOS token for padding
tokenizer.pad_token_id = tokenizer.eos_token_id
tokenizer.pad_token = tokenizer.eos_token
tokenized_train = tokenize_dataset(train_dataset, tokenizer)
tokenized_test = tokenize_dataset(test_dataset, tokenizer)
# 3. Model fine-tuning
print("------------------------------------------------------------")
print("Loading model and starting fine-tuning...")
model = load_model("meta-llama/Meta-Llama-3-8B-Instruct")
model.config.pad_token_id = tokenizer.pad_token_id
model.config.use_cache = False
train_model(model, tokenized_train, tokenized_test, tokenizer, "./llm-lora-fine-tuned")
I’ve seen a few similar posts, but none of the proposed answers worked for this case. Has anyone run into this?
Would love some insight.
Thanks in advance.