Hi, I am new to NLP and I have been following a tutorial to fine-tune Llama 2. I was following the steps and it worked well, but I encountered an IndexError when I try to instantiate SFTTrainer() to train my model. Before I ran into the error, this is what I did in the beginning:
# import libraries
import json
import re
from pprint import pprint
import pandas as pd
import torch
from datasets import Dataset, load_dataset
from huggingface_hub import notebook_login
from peft import LoraConfig, PeftModel
from transformers import (
AutoModelForCausalLM,
AutoTokenizer,
BitsAndBytesConfig,
TrainingArguments
)
from trl import SFTTrainer
# Prefer the first CUDA GPU; fall back to CPU when none is available.
DEVICE = "cuda:0" if torch.cuda.is_available() else "cpu"
# Gated Meta checkpoint — requires an accepted license and a HF login.
MODEL_NAME = "meta-llama/Llama-2-7b-hf"
# import dataset
# DialogStudio "Empathetic" config: DatasetDict with train/validation/test splits.
dataset = load_dataset("Salesforce/dialogstudio", "Empathetic")
dataset  # echo the DatasetDict in the notebook output
# prompt instructions
# System prompt prepended to every training example; .strip() trims the
# newlines introduced by the triple-quoted literal.
DEFAULT_SYSTEM_PROMPT = """
Below is a conversation between 2 users. Explain the feeling that the users felt during this conversation in 1 word.
""".strip()
def generate_training_prompt(
    conversations: str, feeling: str, system_prompt: str = DEFAULT_SYSTEM_PROMPT
) -> str:
    """Assemble one instruction-tuning example as plain text.

    Produces the ### Instruction / ### Input / ### Response layout with
    surrounding whitespace trimmed from the final string.
    """
    sections = [
        f"### Instruction: {system_prompt}",
        "### Input:",
        conversations.strip(),
        "### Response:",
        feeling,
        "",  # trailing newline, as in the original template, removed by strip()
    ]
    return "\n".join(sections).strip()
# clean text
def clean_text(text):
    """Strip URLs, @-mentions and ^-prefixed tokens; collapse whitespace runs.

    The substitution order mirrors the original: whitespace is collapsed
    BEFORE the ^-token pass, so a removed ^token can leave a double space.
    """
    substitutions = (
        (r"http\S+", ""),   # URLs
        (r"@[^\s]+", ""),   # @mentions
        (r"\s+", " "),      # collapse whitespace to single spaces
        (r"\^[^ ]+", ""),   # caret-prefixed tokens
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    return text
def create_conversation_text(data_points):
    """Render the turns in data_points["log"] as a user1:/user2: transcript.

    Each log entry contributes two lines: the cleaned "user utterance" as
    user1 and the cleaned "system response" as user2.
    """
    lines = []
    for turn in data_points["log"]:
        utterance = clean_text(turn["user utterance"]).strip()
        lines.append(f"user1: {utterance}\n")
        response = clean_text(turn["system response"]).strip()
        lines.append(f"user2: {response}\n")
    return "".join(lines)
# generate text
def generate_text(data_point):
    """Build one training record: transcript, gold feeling, and full prompt."""
    # The gold label lives under "context" inside the JSON-encoded dialog info.
    feeling = json.loads(data_point["original dialog info"])["context"]
    conversation = create_conversation_text(data_point)
    prompt = generate_training_prompt(conversation, feeling)
    return {
        "conversation": conversation,
        "feeling": feeling,
        "text": prompt,
    }
# Sanity-check the preprocessing pipeline on a single raw training row.
example = generate_text(dataset["train"][1])
print(example["feeling"])
print(example["conversation"])
print(example["text"])
# process dataset
def process_dataset(data: Dataset):
    """Shuffle, map generate_text over every row, then drop the raw columns."""
    raw_columns = [
        "original dialog id",
        "new dialog id",
        "dialog index",
        "original dialog info",
        "log",
        "prompt",
    ]
    shuffled = data.shuffle(seed=333)
    with_text = shuffled.map(generate_text)
    return with_text.remove_columns(raw_columns)
# Apply the preprocessing to every split in place.
dataset["train"] = process_dataset(dataset["train"])
dataset["validation"] = process_dataset(dataset["validation"])
dataset["test"] = process_dataset(dataset["test"])
dataset  # echo the processed DatasetDict
# log in to HF
# NOTE(review): IPython shell escape (!) — notebook-only; never commit a real token.
!huggingface-cli login --token "redacted"
# create model and tokenizer
def create_model_and_tokenizer():
    """Load the base model in 4-bit NF4 quantization plus its tokenizer.

    Returns:
        (model, tokenizer) tuple ready for QLoRA fine-tuning.

    Fix: the original BitsAndBytesConfig never set load_in_4bit=True, so the
    quantization config was inert, the full fp16 7B model did not fit on the
    GPU, and device_map="auto" offloaded every layer to cpu/disk.  transformers'
    Trainer then builds a list of GPU devices from model.hf_device_map and
    indexes devices[0] — empty list → "IndexError: list index out of range"
    when instantiating SFTTrainer.  Loading in 4 bits keeps the model on GPU.
    """
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,                    # actually enable 4-bit weights
        bnb_4bit_quant_type="nf4",            # NF4 is the QLoRA-recommended dtype
        bnb_4bit_use_double_quant=True,       # small extra memory saving
        bnb_4bit_compute_dtype=torch.float16,
        llm_int8_enable_fp32_cpu_offload=True
    )
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_NAME,
        use_safetensors=True,
        quantization_config=bnb_config,
        trust_remote_code=True,
        device_map="auto"
    )
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    # Llama-2 ships without a pad token: reuse EOS and pad on the right
    # (SFTTrainer warns against left padding in half precision).
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.padding_side = "right"
    return model, tokenizer
model, tokenizer = create_model_and_tokenizer()
# Disable the generation KV-cache while training (standard for fine-tuning;
# re-enable it for inference).
model.config.use_cache = False
# LoRA hyperparameters: rank 16 with alpha 32 (scale alpha/r = 2), light dropout.
lora_alpha = 32
lora_dropout = 0.05
lora_r = 16
# Attach adapters to all attention (q/k/v/o) and MLP (up/down/gate) projections.
lora_target_modules = [
"q_proj",
"up_proj",
"o_proj",
"k_proj",
"down_proj",
"gate_proj",
"v_proj",
]
peft_config = LoraConfig(
lora_alpha=lora_alpha,
lora_dropout=lora_dropout,
r=lora_r,
target_modules=lora_target_modules,
bias="none",  # leave bias terms untouched
task_type="CAUSAL_LM"
)
OUTPUT_DIR = "experiments"
# IPython magics: load the TensorBoard extension and point it at the logs.
# NOTE(review): TrainingArguments defaults logging_dir to <output_dir>/runs,
# so "experiments/run" may be the wrong logdir — confirm where logs land.
%load_ext tensorboard
%tensorboard --logdir experiments/run
training_arguments = TrainingArguments(
per_device_train_batch_size=4,
gradient_accumulation_steps=4,  # effective batch of 16 per device
optim="paged_adamw_32bit",  # paged optimizer suited to quantized training
logging_steps=1,
learning_rate=0.0001,
# fp16=True,
max_grad_norm=0.3,  # gradient clipping
num_train_epochs=2,
evaluation_strategy="steps",
eval_steps=0.2,  # fraction of total steps: evaluate 5 times per run
warmup_ratio=0.05,
save_strategy="epoch",
group_by_length=True,  # bucket similar-length samples to reduce padding
output_dir=OUTPUT_DIR,
report_to="tensorboard",
save_safetensors=True,
lr_scheduler_type="cosine",
seed=333
)
After running the above successfully, running this next chunk of code to instantiate the SFTTrainer gives me an IndexError:
# NOTE(review): the IndexError raised here originates in transformers'
# Trainer.__init__, which inspects model.hf_device_map — when every module
# was offloaded to "cpu"/"disk", the list of GPU devices is empty and
# devices[0] fails.  The root cause is how the model was loaded
# (quantization/device placement), not this call itself.
trainer = SFTTrainer(
model=model,
train_dataset=dataset["train"],
eval_dataset=dataset["validation"],
peft_config=peft_config,
dataset_text_field="text",
max_seq_length=4096,
tokenizer=tokenizer,
args=training_arguments
)
--------------------------------------------------------------------------- IndexError Traceback (most recent call last) Cell In[2], line 1 ----> 1 trainer = SFTTrainer( 2 model=model, 3 train_dataset=dataset["train"], 4 eval_dataset=dataset["validation"], 5 peft_config=peft_config, 6 dataset_text_field="text", 7 max_seq_length=4096, 8 tokenizer=tokenizer, 9 args=training_arguments 10 ) File [c:\Users\wl\AppData\Local\Programs\Python\Python311\Lib\site-packages\trl\trainer\sft_trainer.py:219](file:///C:/Users/wl/AppData/Local/Programs/Python/Python311/Lib/site-packages/trl/trainer/sft_trainer.py:219), in SFTTrainer.__init__(self, model, args, data_collator, train_dataset, eval_dataset, tokenizer, model_init, compute_metrics, callbacks, optimizers, preprocess_logits_for_metrics, peft_config, dataset_text_field, packing, formatting_func, max_seq_length, infinite, num_of_sequences, chars_per_token, dataset_num_proc, dataset_batch_size) [213](file:///c%3A/Users/wl/AppData/Local/Programs/Python/Python311/Lib/site-packages/trl/trainer/sft_trainer.py?line=212) if tokenizer.padding_side is not None and tokenizer.padding_side != "right": [214](file:///c%3A/Users/wl/AppData/Local/Programs/Python/Python311/Lib/site-packages/trl/trainer/sft_trainer.py?line=213) warnings.warn( [215](file:///c%3A/Users/wl/AppData/Local/Programs/Python/Python311/Lib/site-packages/trl/trainer/sft_trainer.py?line=214) "You passed a tokenizer with `padding_side` not equal to `right` to the SFTTrainer. This might lead to some unexpected behaviour due to " [216](file:///c%3A/Users/wl/AppData/Local/Programs/Python/Python311/Lib/site-packages/trl/trainer/sft_trainer.py?line=215) "overflow issues when training a model in half-precision. You might consider adding `tokenizer.padding_side = 'right'` to your code." 
[217](file:///c%3A/Users/wl/AppData/Local/Programs/Python/Python311/Lib/site-packages/trl/trainer/sft_trainer.py?line=216) ) --> [219](file:///c%3A/Users/wl/AppData/Local/Programs/Python/Python311/Lib/site-packages/trl/trainer/sft_trainer.py?line=218) super().__init__( [220](file:///c%3A/Users/wl/AppData/Local/Programs/Python/Python311/Lib/site-packages/trl/trainer/sft_trainer.py?line=219) model=model, [221](file:///c%3A/Users/wl/AppData/Local/Programs/Python/Python311/Lib/site-packages/trl/trainer/sft_trainer.py?line=220) args=args, [222](file:///c%3A/Users/wl/AppData/Local/Programs/Python/Python311/Lib/site-packages/trl/trainer/sft_trainer.py?line=221) data_collator=data_collator, [223](file:///c%3A/Users/wl/AppData/Local/Programs/Python/Python311/Lib/site-packages/trl/trainer/sft_trainer.py?line=222) train_dataset=train_dataset,
...
[400](file:///c%3A/Users/wl/AppData/Local/Programs/Python/Python311/Lib/site-packages/transformers/trainer.py?line=399) "You have loaded a model on multiple GPUs. `is_model_parallel` attribute will be force-set" [401](file:///c%3A/Users/wl/AppData/Local/Programs/Python/Python311/Lib/site-packages/transformers/trainer.py?line=400) " to `True` to avoid any unexpected behavior such as device placement mismatching." [402](file:///c%3A/Users/wl/AppData/Local/Programs/Python/Python311/Lib/site-packages/transformers/trainer.py?line=401) ) IndexError: list index out of range
Output is truncated. View as a [scrollable element](command:cellOutput.enableScrolling?f80cd8a1-e6c5-4e39-ba7e-fd3bf680aa55) or open in a [text editor](command:workbench.action.openLargeOutput?f80cd8a1-e6c5-4e39-ba7e-fd3bf680aa55). Adjust cell output [settings](command:workbench.action.openSettings?%5B%22%40tag%3AnotebookOutputLayout%22%5D)...
I have checked through my datasets and formatting but I am still not able to find anything that is causing this error. Would love it if anyone has an idea why this is happening.