Help using SFTTrainer with a data collator, PEFT, and the tokenizer chat template

import pandas as pd

# The CSV stores the question and its answer together in the 'question' column;
# split on the first '?' ([1:] drops the character right after it, presumably a space)
dataframe = pd.read_csv("answer_response.csv")
dataframe['answer'] = dataframe['question'].apply(lambda x: x.split('?', 1)[1][1:])
dataframe['question'] = dataframe['question'].apply(lambda x: x.split('?', 1)[0] + '?')
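
# e.g. a row like "What is a detection program? A detection program is ..." becomes
# question = "What is a detection program?" and answer = "A detection program is ..."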


def get_chat_format(element):
    system_prompt = (
        "You are a helpful assistant that excels at answering questions regarding detection programs."
    )
    user_prompt = "{question}"
    return [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt.format_map(element)},
        {"role": "assistant", "content": element["answer"]},
    ]


dataframe["messages"] = dataframe.apply(get_chat_format, axis=1)
dataframe.drop(columns=['question', 'label', 'answer'], inplace=True)

dataframe
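
# Quick look at the structure of one formatted row (system / user / assistant dicts)
print(dataframe["messages"].iloc[0])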

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, TrainingArguments
from trl import setup_chat_format, DataCollatorForCompletionOnlyLM, SFTTrainer
from peft import LoraConfig, get_peft_model


device_map = {"": 0}

compute_dtype = torch.float16

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=False,
)



model_name = "TinyLlama/TinyLlama-1.1B-Chat-v0.4"
model = AutoModelForCausalLM.from_pretrained(model_name,
                                             quantization_config=bnb_config,
                                             device_map=device_map)

model.config.use_cache = False  # no KV cache while training
model.config.pretraining_tp = 1

tokenizer = AutoTokenizer.from_pretrained(model_name)  # padding/truncation are per-call options, not from_pretrained kwargs

new_special_tokens = {
    'pad_token': '<pad>',
    'additional_special_tokens': [
        '<maturityScore>', '<feedScore>', '<detectionScore>', '<productivityScore>',
        '<adversary>', '<tech1>', '<tech2>', '<tech3>',
        '<usecase1>', '<usecase2>', '<usecase3>', '<usecase>',
        '<yesOrNo>', '<blankOrNot>',
    ],
}
tokenizer.add_special_tokens(new_special_tokens)
tokenizer.padding_side = "right"

model, tokenizer = setup_chat_format(model, tokenizer)  # applies the ChatML template and resizes the embeddings for the new tokens
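
# Quick check (my understanding is that setup_chat_format switches the tokenizer to ChatML,
# so the rendered text should contain the <|im_start|>/<|im_end|> markers the collator below looks for)
print(tokenizer.apply_chat_template([{"role": "user", "content": "test"}], tokenize=False))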

peft_config = LoraConfig(
    lora_alpha=16,
    lora_dropout=0.1,
    r=64,
    bias="none",
    modules_to_save=["embed_tokens", "lm_head"],  # also train the (resized) token embeddings
    task_type="CAUSAL_LM",
    # q_proj and v_proj in layers 1-21 (note: layer 0 is not targeted)
    target_modules=(
        [f"model.layers.{i}.self_attn.q_proj" for i in range(1, 22)]
        + [f"model.layers.{i}.self_attn.v_proj" for i in range(1, 22)]
    ),
)

peft_model = get_peft_model(model, peft_config)
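
# How many parameters end up trainable with this config (the LoRA adapters plus the full
# embed_tokens / lm_head copies kept by modules_to_save)
peft_model.print_trainable_parameters()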


import datasets
def create_and_save_datasets(
    df, train_ratio=0.8, val_ratio=0.1, test_ratio=0.1
):
    seed = 123
    dataset = datasets.Dataset.from_pandas(df, preserve_index=False)

    # split into training and "the rest"
    train_valtest = dataset.train_test_split(train_size=train_ratio, seed=seed)

    # split "the rest" into validation and testing
    val_test = train_valtest["test"].train_test_split(
        test_size=test_ratio / (test_ratio + val_ratio), seed=seed
    )

    dataset = datasets.DatasetDict(
        {
            "train": train_valtest["train"],
            "valid": val_test["train"],
            "test": val_test["test"],
        }
    )
    return dataset
dataset = create_and_save_datasets(dataframe)
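
# The resulting DatasetDict with train / valid / test splits
print(dataset)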


# TODO: turn the "messages" column into text the trainer can tokenize
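
# My assumption of what that step could look like (not sure this is right, which is partly why
# I'm asking): render each "messages" list into a single string with the chat template and store
# it under "text" so it matches dataset_text_field="text" below.
def to_text(example):
    return {"text": tokenizer.apply_chat_template(example["messages"], tokenize=False)}

dataset = dataset.map(to_text)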

training_arguments = TrainingArguments(
    output_dir="./results",
    remove_unused_columns=False,
    num_train_epochs=4, 
    per_device_train_batch_size=2,
    gradient_accumulation_steps=3,
    optim="paged_adamw_32bit",
    save_steps=25,
    logging_steps=25,
    learning_rate=2e-3,
    weight_decay=0.001,
    fp16=False,
    bf16=False,
    max_grad_norm=0.3,
    max_steps=-1,
    warmup_ratio=0.03,
    group_by_length=True,
    lr_scheduler_type="constant",
    report_to="tensorboard",
)

collator = DataCollatorForCompletionOnlyLM(
    tokenizer.encode("<|im_start|>assistant\n", add_special_tokens=False),
    tokenizer=tokenizer,
    mlm=False,
)
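
# Sanity check on the marker the collator masks on: it should round-trip through the tokenizer
ids = tokenizer.encode("<|im_start|>assistant\n", add_special_tokens=False)
print(ids, repr(tokenizer.decode(ids)))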

trainer = SFTTrainer(
    model=peft_model,
    train_dataset=dataset["train"],
    # eval_dataset=dataset["valid"],
    peft_config=peft_config,
    dataset_text_field="text",
    max_seq_length=1024,
    tokenizer=tokenizer,
    args=training_arguments,
    packing=False,
    data_collator=collator,
)

trainer.train()

I’m getting the following error:


RuntimeError Traceback (most recent call last)
/usr/local/lib/python3.10/dist-packages/transformers/tokenization_utils_base.py in convert_to_tensors(self, tensor_type, prepend_batch_axis)
761 if not is_tensor(value):
--> 762 tensor = as_tensor(value)
763

15 frames
/usr/local/lib/python3.10/dist-packages/transformers/tokenization_utils_base.py in as_tensor(value, dtype)
723 return torch.tensor(np.array(value))
--> 724 return torch.tensor(value)
725

RuntimeError: Could not infer dtype of dict

The above exception was the direct cause of the following exception:

ValueError Traceback (most recent call last)
in <cell line: 44>()
42 )
43
--> 44 trainer.train()

/usr/local/lib/python3.10/dist-packages/trl/trainer/sft_trainer.py in train(self, *args, **kwargs)
449 self.model = self._trl_activate_neftune(self.model)
450
--> 451 output = super().train(*args, **kwargs)
452
453 # After training we make sure to retrieve back the original forward pass method

/usr/local/lib/python3.10/dist-packages/transformers/trainer.py in train(self, resume_from_checkpoint, trial, ignore_keys_for_eval, **kwargs)
1936 hf_hub_utils.enable_progress_bars()
1937 else:
--> 1938 return inner_training_loop(
1939 args=args,
1940 resume_from_checkpoint=resume_from_checkpoint,

/usr/local/lib/python3.10/dist-packages/transformers/trainer.py in _inner_training_loop(self, batch_size, args, resume_from_checkpoint, trial, ignore_keys_for_eval)
2234
2235 step = -1
--> 2236 for step, inputs in enumerate(epoch_iterator):
2237 total_batched_samples += 1
2238

/usr/local/lib/python3.10/dist-packages/accelerate/data_loader.py in __iter__(self)
452 # We iterate one batch ahead to check when we are at the end
453 try:
--> 454 current_batch = next(dataloader_iter)
455 except StopIteration:
456 yield

/usr/local/lib/python3.10/dist-packages/torch/utils/data/dataloader.py in __next__(self)
629 # TODO(https://github.com/pytorch/pytorch/issues/76750)
630 self._reset() # type: ignore[call-arg]
--> 631 data = self._next_data()
632 self._num_yielded += 1
633 if self._dataset_kind == _DatasetKind.Iterable and \

/usr/local/lib/python3.10/dist-packages/torch/utils/data/dataloader.py in _next_data(self)
673 def _next_data(self):
674 index = self._next_index() # may raise StopIteration
--> 675 data = self._dataset_fetcher.fetch(index) # may raise StopIteration
676 if self._pin_memory:
677 data = _utils.pin_memory.pin_memory(data, self._pin_memory_device)

/usr/local/lib/python3.10/dist-packages/torch/utils/data/_utils/fetch.py in fetch(self, possibly_batched_index)
52 else:
53 data = self.dataset[possibly_batched_index]
--> 54 return self.collate_fn(data)

/usr/local/lib/python3.10/dist-packages/transformers/data/data_collator.py in __call__(self, features, return_tensors)
43 return self.tf_call(features)
44 elif return_tensors == "pt":
--> 45 return self.torch_call(features)
46 elif return_tensors == "np":
47 return self.numpy_call(features)

/usr/local/lib/python3.10/dist-packages/trl/trainer/utils.py in torch_call(self, examples)
170
171 def torch_call(self, examples: List[Union[List[int], Any, Dict[str, Any]]]) -> Dict[str, Any]:
--> 172 batch = super().torch_call(examples)
173
174 if self.instruction_template is None:

/usr/local/lib/python3.10/dist-packages/transformers/data/data_collator.py in torch_call(self, examples)
804 # Handle dict or lists with proper padding and conversion to tensor.
805 if isinstance(examples[0], Mapping):
--> 806 batch = pad_without_fast_tokenizer_warning(
807 self.tokenizer, examples, return_tensors="pt", pad_to_multiple_of=self.pad_to_multiple_of
808 )

/usr/local/lib/python3.10/dist-packages/transformers/data/data_collator.py in pad_without_fast_tokenizer_warning(tokenizer, *pad_args, **pad_kwargs)
64
65 try:
--> 66 padded = tokenizer.pad(*pad_args, **pad_kwargs)
67 finally:
68 # Restore the state of the warning.

/usr/local/lib/python3.10/dist-packages/transformers/tokenization_utils_base.py in pad(self, encoded_inputs, padding, max_length, pad_to_multiple_of, return_attention_mask, return_tensors, verbose)
3576 batch_outputs[key].append(value)
3577
--> 3578 return BatchEncoding(batch_outputs, tensor_type=return_tensors)
3579
3580 def create_token_type_ids_from_sequences(

/usr/local/lib/python3.10/dist-packages/transformers/tokenization_utils_base.py in __init__(self, data, encoding, tensor_type, prepend_batch_axis, n_sequences)
225 self._n_sequences = n_sequences
226
--> 227 self.convert_to_tensors(tensor_type=tensor_type, prepend_batch_axis=prepend_batch_axis)
228
229 @property

/usr/local/lib/python3.10/dist-packages/transformers/tokenization_utils_base.py in convert_to_tensors(self, tensor_type, prepend_batch_axis)
776 "Please see if a fast version of this tokenizer is available to have this feature available."
777 ) from e
--> 778 raise ValueError(
779 "Unable to create tensor, you should probably activate truncation and/or padding with"
780 " 'padding=True' 'truncation=True' to have batched tensors with the same length. Perhaps your"

ValueError: Unable to create tensor, you should probably activate truncation and/or padding with 'padding=True' 'truncation=True' to have batched tensors with the same length. Perhaps your features (messages in this case) have excessive nesting (inputs type list where type int is expected).

Any help is appreciated.