I want to perform LoRA fine-tuning on a Llama 3 model, using the following dataset:
sst2_sent.json
Example lines from sst2_sent.json:
{"sentence": "hide new secretions from the parental units", "label": 0}
{"sentence": "contains no wit , only labored gags", "label": 0}
{"sentence": "that loves its characters and communicates something rather beautiful about human nature", "label": 1}
{"sentence": "remains utterly satisfied to remain the same throughout", "label": 0}
........
I have split the dataset into training data (train.json) and test data (test.json).
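Roughly, train.json and test.json were produced like this (a minimal sketch; the split ratio and seed below are placeholders, not necessarily the exact values I used):

from datasets import load_dataset

# Load the raw JSONL file and write out a train/test split.
# The 90/10 ratio and the seed here are placeholders.
raw = load_dataset("json", data_files="sst2_sent.json")["train"]
split = raw.train_test_split(test_size=0.1, seed=42)
split["train"].to_json("train.json")
split["test"].to_json("test.json")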
But I get the following error while training:
ValueError: Expected input batch_size (1024) to match target batch_size (4).
Here is my code:
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from transformers import (
    AutoModelForTokenClassification,
    DataCollatorForSeq2Seq,
    Trainer,
    TrainingArguments,
)

def tokenize_dataset(dataset, tokenizer):
    """Tokenize the sentences and attach the labels."""
    def preprocess_function(examples):
        inputs = tokenizer(examples["sentence"], padding="max_length", truncation=True, max_length=128)
        inputs["labels"] = examples["label"]
        return inputs
    return dataset.map(preprocess_function, batched=True)
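# The quantization_config used by load_model() below is defined earlier in my
# script; it is a standard 4-bit BitsAndBytesConfig roughly like this
# (a sketch: the exact settings here are placeholders, not necessarily mine).
import torch
from transformers import BitsAndBytesConfig

quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)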
def load_model(model_name):
    """Load the base model and configure LoRA."""
    # quantization_config is defined earlier in my script (see the sketch above).
    model = AutoModelForTokenClassification.from_pretrained(
        model_name,
        quantization_config=quantization_config,
        device_map='cuda:0',
        num_labels=2
    )
    lora_config = LoraConfig(
        r=16,
        lora_alpha=8,
        target_modules=['q_proj', 'k_proj', 'v_proj', 'o_proj'],
        lora_dropout=0.05,
        bias='none',  # whether to train bias weights; 'none' for the attention layers
        task_type='SEQ_CLS'
    )
    model = prepare_model_for_kbit_training(model)
    return get_peft_model(model, lora_config)
def train_model(model, train_dataset, test_dataset, tokenizer, output_dir):
    """Set the training arguments and run the fine-tuning."""
    training_args = TrainingArguments(
        output_dir=output_dir,
        per_device_train_batch_size=4,
        gradient_accumulation_steps=4,
        logging_steps=10,
        num_train_epochs=3,
        save_steps=100,
        learning_rate=1e-4,
        save_on_each_node=True,
        gradient_checkpointing=True
    )
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=test_dataset,
        data_collator=DataCollatorForSeq2Seq(tokenizer=tokenizer, padding=True),
    )
    print("Starting training...")
    trainer.train()
    print("Finished training.")
    trainer.save_model(output_dir)         # Save the fine-tuned model
    tokenizer.save_pretrained(output_dir)  # Save the tokenizer
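For completeness, this is roughly how I plan to reload the fine-tuned adapter afterwards (a sketch: it assumes the default PEFT save layout and simply mirrors the model class, model name, and output path used above):

from peft import PeftModel
from transformers import AutoModelForTokenClassification, AutoTokenizer

# Reload the base model plus the saved LoRA adapter for inference (sketch).
base_model = AutoModelForTokenClassification.from_pretrained(
    "meta-llama/Meta-Llama-3-8B-Instruct",
    num_labels=2,
)
reloaded = PeftModel.from_pretrained(base_model, "./llm-lora-fine-tuned")
reloaded_tokenizer = AutoTokenizer.from_pretrained("./llm-lora-fine-tuned")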
main.py:
# main.py (tokenize_dataset, load_model and train_model are the helpers shown above)
from datasets import load_dataset
from transformers import AutoTokenizer

llm_name = "meta-llama/Meta-Llama-3-8B-Instruct"

# 1. Load dataset
print("Loading and processing dataset...")
dataset = load_dataset("json", data_files={
    "train": "train.json",
    "test": "test.json"
})
train_dataset = dataset["train"]
test_dataset = dataset["test"]

# 2. Tokenization
print("------------------------------------------------------------")
print("Starting dataset tokenization...")
tokenizer = AutoTokenizer.from_pretrained(llm_name, add_prefix_space=True)
tokenizer.pad_token_id = tokenizer.eos_token_id
tokenizer.pad_token = tokenizer.eos_token
tokenized_train = tokenize_dataset(train_dataset, tokenizer)
tokenized_test = tokenize_dataset(test_dataset, tokenizer)

# 3. Model fine-tuning
print("------------------------------------------------------------")
print("Loading model and starting fine-tuning...")
model = load_model(llm_name)
model.config.pad_token_id = tokenizer.pad_token_id
model.config.use_cache = False
train_model(model, tokenized_train, tokenized_test, tokenizer, "./llm-lora-fine-tuned")
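To narrow the error down, here is a minimal shape check that can be run right after the code above (a sketch: it reuses tokenizer, model and train_dataset, and only prints shapes without training):

import torch

# Push a tiny batch of 4 sentences through the model and compare the shape of
# the logits with the shape of the labels the Trainer would pair them with.
texts = [train_dataset[i]["sentence"] for i in range(4)]
labels = torch.tensor([train_dataset[i]["label"] for i in range(4)])
enc = tokenizer(texts, padding="max_length", truncation=True, max_length=128,
                return_tensors="pt").to("cuda:0")
with torch.no_grad():
    out = model(**enc)
print("logits shape:", out.logits.shape)
print("labels shape:", labels.shape)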
I've seen a few similar posts, but none of the proposed answers worked in this case. Has anyone run into this?
I'd love some insight.
Thanks in advance.