With all other parameters held constant, why is the second formatting_func (formatting_func_multi) slower, and why does it lead to a higher loss, than the first (formatting_func_single)?
def formatting_func_single(example):
    text = f"Quote: {example['quote'][0]}\nAuthor: {example['author'][0]}"
    return [text]

def formatting_func_multi(example):
    output_texts = []
    for i in range(len(example["quote"])):
        text = f"Quote: {example['quote'][i]}\nAuthor: {example['author'][i]}"
        output_texts.append(text)
    return output_texts
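To illustrate the difference in return shape, here is what each function produces when called on a small hand-made batch in the dict-of-lists layout that the loop in formatting_func_multi implies (toy values, purely for illustration):

# Toy batch in a "dict of lists" shape (illustrative values, not from the real dataset).
toy_batch = {
    "quote": ["To be, or not to be.", "I think, therefore I am.", "Know thyself."],
    "author": ["William Shakespeare", "René Descartes", "Socrates"],
}

print(formatting_func_single(toy_batch))  # one string: only the first quote/author pair
print(formatting_func_multi(toy_batch))   # three strings: one per pair in the batch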
Results are in the screenshot (see loss and train_steps_per_second):
Full code (can run in a Colab on a T4):
!pip3 install -q -U huggingface_hub==0.21.4
!pip3 install -q -U bitsandbytes==0.42.0
!pip3 install -q -U peft==0.8.2
!pip3 install -q -U trl==0.7.10
!pip3 install -q -U accelerate==0.27.1
!pip3 install -q -U datasets==2.17.0
!pip3 install -q -U transformers==4.38.1
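Optionally, a quick sanity check that the pinned versions above are the ones actually imported in the runtime (not required for the repro):

import datasets, peft, transformers, trl
# Print the versions picked up after the pip installs above.
print(transformers.__version__, trl.__version__, peft.__version__, datasets.__version__)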
# %
import huggingface_hub
import torch
import os
import transformers
from trl import SFTTrainer
from peft import LoraConfig
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, GemmaTokenizer
from datasets import Dataset, DatasetDict
os.environ["WANDB_DISABLED"] = "true"
# %
model_id = "google/gemma-2b"
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)
model = AutoModelForCausalLM.from_pretrained(
    model_id, quantization_config=bnb_config, device_map={"": 0}
)
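Optional sanity check that the weights really loaded in 4-bit; get_memory_footprint() comes from the transformers PreTrainedModel API, and the exact number will vary:

# A 2B-parameter model loaded in 4-bit should report a footprint well below
# its fp16 size, so this is a rough quantization check.
print(f"Model memory footprint: {model.get_memory_footprint() / 1e9:.2f} GB")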
# %
from datasets import load_dataset
data = load_dataset("Abirate/english_quotes")
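A quick look at the dataset structure the formatting functions below rely on (exact row contents will differ):

# Confirm the "quote" and "author" columns referenced by the formatting functions exist.
print(data["train"].column_names)
print(data["train"][0])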
# %
lora_config = LoraConfig(
    r=8,
    target_modules=["q_proj", "o_proj", "k_proj", "v_proj", "gate_proj", "up_proj", "down_proj"],
    task_type="CAUSAL_LM",
)
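As an optional check that the target_modules names actually match modules in this architecture, the module names can be scanned without wrapping the model (wrapping is left to SFTTrainer via peft_config):

# Count the modules whose leaf name matches an entry in target_modules.
targets = set(lora_config.target_modules)
matched = [name for name, _ in model.named_modules() if name.split(".")[-1] in targets]
print(f"{len(matched)} modules match the LoRA target list")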
# %
def formatting_func_single(example):
    text = f"Quote: {example['quote'][0]}\nAuthor: {example['author'][0]}"
    return [text]

def formatting_func_multi(example):
    output_texts = []
    for i in range(len(example["quote"])):
        text = f"Quote: {example['quote'][i]}\nAuthor: {example['author'][i]}"
        output_texts.append(text)
    return output_texts
# %
# @title Train using formatting_func_single
trainer = SFTTrainer(
    model=model,
    train_dataset=data["train"],
    max_seq_length=128,
    args=transformers.TrainingArguments(
        per_device_train_batch_size=1,
        gradient_accumulation_steps=4,
        warmup_steps=2,
        max_steps=10,
        learning_rate=2e-4,
        fp16=True,
        logging_steps=1,
        output_dir="outputs",
        optim="paged_adamw_8bit",
    ),
    peft_config=lora_config,
    formatting_func=formatting_func_single,
)
trainer.train()
# %
# @title Train using formatting_func_multi
trainer = SFTTrainer(
    model=model,
    train_dataset=data["train"],
    max_seq_length=128,
    args=transformers.TrainingArguments(
        per_device_train_batch_size=1,
        gradient_accumulation_steps=4,
        warmup_steps=2,
        max_steps=10,
        learning_rate=2e-4,
        fp16=True,
        logging_steps=1,
        output_dir="outputs",
        optim="paged_adamw_8bit",
    ),
    peft_config=lora_config,
    formatting_func=formatting_func_multi,
)
trainer.train()
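To compare the two runs numerically rather than from the screenshot, the per-step losses and the end-of-training speed metrics can be read back from the trainer state after each trainer.train() call (with logging_steps=1, every step is logged):

# Per-step losses plus the summary entry that contains train_steps_per_second.
step_losses = [e["loss"] for e in trainer.state.log_history if "loss" in e]
summary = [e for e in trainer.state.log_history if "train_steps_per_second" in e]
print(step_losses)
print(summary)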