I am trying to fine-tune the Llama3-Instruct model. My dataset looks like this (label and text are both free-text descriptions; a minimal sketch of how a dataset with this shape can be built follows the printout):
DatasetDict({
    train: Dataset({
        features: ['label', 'text'],
        num_rows: 30
    })
    valid: Dataset({
        features: ['label', 'text'],
        num_rows: 10
    })
    test: Dataset({
        features: ['label', 'text'],
        num_rows: 10
    })
})
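For reference, a DatasetDict with this shape can be built roughly like this (a minimal sketch using the datasets library; the placeholder strings stand in for my real descriptions):

from datasets import Dataset, DatasetDict

# Placeholder rows; my real rows are longer free-text descriptions.
def make_split(n):
    return Dataset.from_dict({
        "label": ["label description %d" % i for i in range(n)],
        "text": ["text description %d" % i for i in range(n)],
    })

dataset = DatasetDict({
    "train": make_split(30),
    "valid": make_split(10),
    "test": make_split(10),
})
print(dataset)  # prints the structure shown above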
My code:
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling
)
from peft import LoraConfig, prepare_model_for_kbit_training, get_peft_model
import torch

model_name = "meta-llama/Meta-Llama-3-8B-Instruct"

quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type='nf4',
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16
)

lora_config = LoraConfig(
    r=16,
    lora_alpha=8,
    target_modules=['q_proj', 'k_proj', 'v_proj', 'o_proj'],
    lora_dropout=0.05,
    bias='none',
    task_type='CAUSAL_LM'  # use causal language modeling task
)

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=quantization_config
)
model = prepare_model_for_kbit_training(model)
model = get_peft_model(model, lora_config)

tokenizer = AutoTokenizer.from_pretrained(model_name, add_prefix_space=True)
tokenizer.pad_token_id = tokenizer.eos_token_id
tokenizer.pad_token = tokenizer.eos_token
model.config.pad_token_id = tokenizer.pad_token_id
model.config.use_cache = False

def tokenize_function(example):
    example['input_ids'] = tokenizer(example["text"], padding="max_length", max_length=256, truncation=True, return_tensors="pt").input_ids
    return example

tokenized_dataset = dataset.map(tokenize_function, batched=True, remove_columns=['text'])

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False
)

training_args = TrainingArguments(
    output_dir=model_name + "-causal-lm-finetuning",
    learning_rate=1e-4,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=10,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    logging_steps=1
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["valid"],
    tokenizer=tokenizer,
    data_collator=data_collator  # for text generation, use the causal language modeling data collator
)

trainer.train()
Output error:
ValueError: Unable to create tensor, you should probably activate truncation and/or padding with 'padding=True' 'truncation=True' to have batched tensors with the same length. Perhaps your features (label in this case) have excessive nesting (inputs type list where type int is expected).
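For what it's worth, the collator can also be called directly on a couple of mapped rows to inspect what the Trainer hands it (a small sketch reusing the variables above; I assume the last line raises the same ValueError, since only the text column was removed in map() and the string-valued label column is still present):

# Inspect the mapped dataset and feed two rows straight to the collator.
print(tokenized_dataset["train"].column_names)  # expected: ['label', 'input_ids']
sample_features = [tokenized_dataset["train"][i] for i in range(2)]
batch = data_collator(sample_features)  # presumably fails the same way the Trainer call does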