Perhaps your features (`output` in this case) have excessive nesting (inputs type `list` where type `int` is expected)

I am also getting a similar issue here.

ValueError: Unable to create tensor, you should probably activate truncation and/or padding with 
'padding=True' 'truncation=True' to have batched tensors with the same length. Perhaps your features
(`output` in this case) have excessive nesting (inputs type `list` where type `int` is expected).
  0% 0/20 [00:05<?, ?it/s]
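As far as I can tell, this ValueError is raised when the collator tries to tensorize sequences of unequal length (or still-nested lists) without padding. A minimal way to reproduce the same message with the tokenizer loaded below (my own sketch, not the exact code path inside the trainer):

# Raises the same "Unable to create tensor ... padding=True / truncation=True" ValueError,
# because the two inputs tokenize to different lengths and no padding is requested.
tokenizer(["short example", "a much longer example sentence"], return_tensors="pt")

# Works once padding is enabled:
tokenizer(["short example", "a much longer example sentence"], return_tensors="pt", padding=True)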

Here are my fine-tuning step details.

The model is loaded with Unsloth, not Hugging Face Transformers directly:

from unsloth import FastLanguageModel, is_bfloat16_supported

max_seq_length = 2048 # Choose any! We auto support RoPE Scaling internally!
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/Llama-3.2-3B", # or choose "unsloth/Llama-3.2-1B"
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
    # token = "hf_...", # use one if using gated models like meta-llama/Llama-2-7b-hf
)
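As a quick sanity check right after loading (my addition, not part of the original script), I also print the special tokens, since the formatting step below appends `eos_token` and batching relies on a pad token:

# Sanity check: confirm EOS / pad tokens are set before formatting and collating
print(tokenizer.eos_token, tokenizer.eos_token_id)
print(tokenizer.pad_token, tokenizer.pad_token_id)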

Dataset preparation

from datasets import load_dataset

def prepare_dataset(tokenizer_data: dict) -> dict:
    alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{}

### Input:
{}

### Response:
{}"""

    def formatting_prompts_func(examples):
        return {"text": [alpaca_prompt.format(inst, inp, out) + tokenizer_data['tokenizer'].eos_token
                         for inst, inp, out in zip(examples["instruction"], examples["input"], examples["output"])]}

    # Load the dataset
    dataset = load_dataset("yahma/alpaca-cleaned")

    # Apply formatting
    dataset = dataset.map(formatting_prompts_func, batched=True)

    # Split the dataset into train, validation, and test sets
    train_valid_test_split = dataset['train'].train_test_split(test_size=0.1, seed=42)
    train_valid_dataset = train_valid_test_split['train']
    test_dataset = train_valid_test_split['test']

    train_valid_split = train_valid_dataset.train_test_split(test_size=0.1, seed=42)
    train_dataset = train_valid_split['train']
    val_dataset = train_valid_split['test']

    return {
        'train_dataset': train_dataset,
        'val_dataset': val_dataset,
        'test_dataset': test_dataset
    }
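For completeness, this is roughly how the function is called and what the resulting splits contain (the call itself is implied rather than shown above):

splits = prepare_dataset({"tokenizer": tokenizer})
train_dataset = splits["train_dataset"]
val_dataset = splits["val_dataset"]

# map() keeps the original columns next to the new "text" column, so the raw
# "instruction", "input" and "output" fields are still present in every split.
print(train_dataset.column_names)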

Here I am passing train_dataset and eval_dataset so that eval_loss is computed during training.
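For context, the training snippet below relies on these imports (they are not shown in the snippet itself):

from transformers import TrainingArguments, IntervalStrategy, EarlyStoppingCallback
from trl import SFTTrainer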

    training_args = TrainingArguments(
        per_device_train_batch_size=2,
        gradient_accumulation_steps=4,
        warmup_steps=5,
        # num_train_epochs = 1, # Set this for 1 full training run.
        max_steps = 20,
        learning_rate=2e-4,
        fp16=not is_bfloat16_supported(),
        bf16=is_bfloat16_supported(),
        logging_steps=1,
        optim="adamw_8bit",
        weight_decay=0.01,
        lr_scheduler_type="linear",
        seed=3407,
        output_dir="outputs",
        report_to="mlflow",
        evaluation_strategy=IntervalStrategy.STEPS,
        eval_steps=20,
        save_total_limit=5,
        load_best_model_at_end=True,
        metric_for_best_model="eval_loss",
        greater_is_better=False,
        remove_unused_columns=False
    )

    trainer = SFTTrainer(
        model=model,
        tokenizer=tokenizer,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        dataset_text_field="text",
        max_seq_length=2048,
        dataset_num_proc=2,
        packing=False,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
        args=training_args
    )
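Training is then started with a plain trainer.train() call (implied, not shown above); the ValueError is raised as soon as the first batch is collated, which matches the 0/20 progress bar in the traceback.

    trainer_stats = trainer.train()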