I am also getting a similar issue here:
ValueError: Unable to create tensor, you should probably activate truncation and/or padding with
'padding=True' 'truncation=True' to have batched tensors with the same length. Perhaps your features
(`output` in this case) have excessive nesting (inputs type `list` where type `int` is expected).
0% 0/20 [00:05<?, ?it/s]
Here are my fine-tuning steps in detail.
Model load (using Unsloth, not Hugging Face Transformers directly):
from unsloth import FastLanguageModel, is_bfloat16_supported
max_seq_length = 2048 # Choose any! We auto support RoPE Scaling internally!
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/Llama-3.2-3B", # or choose "unsloth/Llama-3.2-1B"
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
    # token = "hf_...", # use one if using gated models like meta-llama/Llama-2-7b-hf
)
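Since the base model is loaded in 4-bit, I also attach LoRA adapters before building the trainer. The sketch below follows the standard Unsloth get_peft_model pattern; the rank and target modules are just the notebook defaults and are an assumption here, included only so the steps are complete:
# Sketch: attach LoRA adapters (standard Unsloth pattern; values are assumptions).
model = FastLanguageModel.get_peft_model(
    model,
    r = 16,  # LoRA rank
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj"],
    lora_alpha = 16,
    lora_dropout = 0,
    bias = "none",
    use_gradient_checkpointing = "unsloth",
    random_state = 3407,
)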
Dataset preparation:
from datasets import load_dataset

def prepare_dataset(tokenizer_data: dict) -> dict:
    alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.
### Instruction:
{}
### Input:
{}
### Response:
{}"""

    def formatting_prompts_func(examples):
        return {
            "text": [
                alpaca_prompt.format(inst, inp, out) + tokenizer_data['tokenizer'].eos_token
                for inst, inp, out in zip(examples["instruction"], examples["input"], examples["output"])
            ]
        }

    # Load the dataset
    dataset = load_dataset("yahma/alpaca-cleaned")

    # Apply formatting
    dataset = dataset.map(formatting_prompts_func, batched=True)

    # Split the dataset into train, validation, and test sets
    train_valid_test_split = dataset['train'].train_test_split(test_size=0.1, seed=42)
    train_valid_dataset = train_valid_test_split['train']
    test_dataset = train_valid_test_split['test']

    train_valid_split = train_valid_dataset.train_test_split(test_size=0.1, seed=42)
    train_dataset = train_valid_split['train']
    val_dataset = train_valid_split['test']

    return {
        'train_dataset': train_dataset,
        'val_dataset': val_dataset,
        'test_dataset': test_dataset
    }
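For reference, this is roughly how I wire the pieces together; the dict key for the tokenizer is an assumption based on how prepare_dataset reads it:
# Sketch: unpack the splits returned by prepare_dataset. The function reads
# tokenizer_data['tokenizer'], so the tokenizer is passed under that key.
splits = prepare_dataset({'tokenizer': tokenizer})
train_dataset = splits['train_dataset']
val_dataset = splits['val_dataset']
test_dataset = splits['test_dataset']

# Note: dataset.map() keeps the original columns, so each split still carries
# "instruction", "input", "output" alongside the new "text" field.
print(train_dataset.column_names)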
Training setup; here I pass both train_dataset and eval_dataset so that eval_loss is computed:
from transformers import TrainingArguments, EarlyStoppingCallback, IntervalStrategy
from trl import SFTTrainer

training_args = TrainingArguments(
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    warmup_steps=5,
    # num_train_epochs=1,  # Set this for 1 full training run.
    max_steps=20,
    learning_rate=2e-4,
    fp16=not is_bfloat16_supported(),
    bf16=is_bfloat16_supported(),
    logging_steps=1,
    optim="adamw_8bit",
    weight_decay=0.01,
    lr_scheduler_type="linear",
    seed=3407,
    output_dir="outputs",
    report_to="mlflow",
    evaluation_strategy=IntervalStrategy.STEPS,
    eval_steps=20,
    save_total_limit=5,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    remove_unused_columns=False
)
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    dataset_text_field="text",
    max_seq_length=2048,
    dataset_num_proc=2,
    packing=False,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
    args=training_args
)
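The traceback at the top is raised once training starts (the 0/20 progress bar matches max_steps=20):
# Start fine-tuning; the ValueError shown above is thrown during this call.
trainer_stats = trainer.train()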