Not able to fine-tune (QLoRA) Llama-3-Instruct model for CausalLM

I am trying to fine-tune the Llama-3-Instruct model. My dataset looks like this (here label and text are both text descriptions):
DatasetDict({
    train: Dataset({
        features: ['label', 'text'],
        num_rows: 30
    })
    valid: Dataset({
        features: ['label', 'text'],
        num_rows: 10
    })
    test: Dataset({
        features: ['label', 'text'],
        num_rows: 10
    })
})

My code:

from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling
)

from peft import LoraConfig, prepare_model_for_kbit_training, get_peft_model
import torch

model_name = "meta-llama/Meta-Llama-3-8B-Instruct"

quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type='nf4',
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16
)

lora_config = LoraConfig(
    r=16,
    lora_alpha=8,
    target_modules=['q_proj', 'k_proj', 'v_proj', 'o_proj'],
    lora_dropout=0.05,
    bias='none',
    task_type='CAUSAL_LM'  # Use causal language modeling task
)


model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=quantization_config
)


model = prepare_model_for_kbit_training(model)
model = get_peft_model(model, lora_config)


tokenizer = AutoTokenizer.from_pretrained(model_name, add_prefix_space=True)

tokenizer.pad_token_id = tokenizer.eos_token_id
tokenizer.pad_token = tokenizer.eos_token

model.config.pad_token_id = tokenizer.pad_token_id
model.config.use_cache = False


def tokenize_function(example):
    example['input_ids'] = tokenizer(example["text"], padding="max_length", max_length=256, truncation=True, return_tensors="pt").input_ids
    return example

tokenized_dataset = dataset.map(tokenize_function, batched=True, remove_columns=['text'])


data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False  
)


training_args = TrainingArguments(
    output_dir=model_name + "-causal-lm-finetuning",
    learning_rate=1e-4,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=10,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    logging_steps=1
)


trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["valid"],
    tokenizer=tokenizer,
    data_collator=data_collator  # for text generation, use causal language modeling data collator
)


trainer.train()

Output error:
ValueError: Unable to create tensor, you should probably activate truncation and/or padding with 'padding=True' 'truncation=True' to have batched tensors with the same length. Perhaps your features (label in this case) have excessive nesting (inputs type list where type int is expected).


You are already passing truncation=True.
This looks like it may be a bug in the library, but it can also be triggered by NumPy version issues.
Please try this first:

pip install "numpy<2"

It looks like the error comes from how the label field is handled: it is still a raw string when the batch reaches the data collator, so it cannot be converted into a tensor. To fix this, tokenize both text and label (or drop label before collation) and pad everything to the same length. Also make sure input_ids and attention_mask come out of the map function as plain lists, without the extra batch dimension that return_tensors="pt" adds. If you are using labels for a supervised objective, they need to be tokenized too. That should resolve the tensor mismatch you are seeing; a rough sketch is below.
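For reference, here is a minimal sketch of how the tokenization step could look for causal-LM fine-tuning. It is untested against your data: the prompt template that joins text and label is an assumption for illustration, so adapt it to whatever format your task expects. The column names match the dataset shown above.

def tokenize_function(examples):
    # Merge text and label into one training string per example
    # (this template is an assumption -- change it to fit your task).
    merged = [
        f"{text}\nLabel: {label}{tokenizer.eos_token}"
        for text, label in zip(examples["text"], examples["label"])
    ]
    # No return_tensors here: datasets.map expects plain Python lists,
    # and the data collator builds the tensors later.
    return tokenizer(
        merged,
        padding="max_length",
        max_length=256,
        truncation=True,
    )

# Drop BOTH original string columns so the collator only sees
# input_ids / attention_mask; with mlm=False it copies input_ids
# into labels itself.
tokenized_dataset = dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=["text", "label"],
)

One caveat: because pad_token is set to eos_token, DataCollatorForLanguageModeling will mask the padding positions (and therefore the EOS tokens) to -100 in the labels, so the model may not learn to emit EOS. Adding a separate pad token avoids that if it matters for your task.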
