Hi, I created a small dataset that mimics the OpenAI-style conversational format recommended on the SFTTrainer docs page.
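Each row follows the conversational (messages) layout from the docs, roughly like this (the content below is just a placeholder to show the shape, not my real data):

{"messages": [{"role": "user", "content": "..."}, {"role": "assistant", "content": "..."}]}

Here is my training script: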
from datasets import load_dataset
from trl import SFTTrainer
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, TrainingArguments
from peft import LoraConfig
import torch, sys
dataset = load_dataset("celsowm/auryn", split="train", download_mode="force_redownload")
# Specify the model
model_name = 'recogna-nlp/bode-7b-alpaca-pt-br-no-peft'
# Load the tokenizer and the model
tokenizer = AutoTokenizer.from_pretrained(model_name)
# BitsAndBytesConfig int-4 config
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    use_cache=False,
    device_map="auto",
    torch_dtype=torch.float16
)
peft_config = LoraConfig(
    lora_alpha=16,
    lora_dropout=0.1,
    r=64,
    bias="none",
    task_type="CAUSAL_LM",
)
training_args = TrainingArguments(
    output_dir='outputs/summarize',
    num_train_epochs=3,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=4,  # increase gradient accumulation to compensate for the small batch size
    gradient_checkpointing=True,
    bf16=True,
    optim="adamw_bnb_8bit",
    #warmup_steps=500,
    weight_decay=0.01,
    #save_steps=500,  # save the model every 500 steps
    save_total_limit=3,  # keep only the last 3 models
)
trainer = SFTTrainer(
    model=model,
    peft_config=peft_config,
    args=training_args,
    train_dataset=dataset,
    packing=True,
    max_seq_length=2048
)
But I get this error:

Make sure that your dataset has enough samples to at least yield one packed sequence

Any hints?