Help using SFTTrainer with a data collator, PEFT, and the tokenizer chat template

import pandas as pd

# The CSV stores the question and its answer together in the 'question' column;
# split on the first '?' ([1:] drops the character right after it, presumably a space)
dataframe = pd.read_csv("answer_response.csv")
dataframe['answer'] = dataframe['question'].apply(lambda x: x.split('?', 1)[1][1:])
dataframe['question'] = dataframe['question'].apply(lambda x: x.split('?', 1)[0] + '?')
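
# e.g. a row like "What is a detection program? A detection program is ..." becomes
# question = "What is a detection program?" and answer = "A detection program is ..."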


def get_chat_format(element):
    system_prompt = (
        "You are a helpful assistant that excels at answering questions regarding detection programs."
    )
    user_prompt = "{question}"
    return [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt.format_map(element)},
        {"role": "assistant", "content": element["answer"]},
    ]


dataframe["messages"] = dataframe.apply(get_chat_format, axis=1)
dataframe.drop(columns=['question', 'label', 'answer'], inplace=True)

dataframe
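
# Quick look at the structure of one formatted row (system / user / assistant dicts)
print(dataframe["messages"].iloc[0])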

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, TrainingArguments
from trl import setup_chat_format, DataCollatorForCompletionOnlyLM, SFTTrainer
from peft import LoraConfig, get_peft_model


device_map = {"": 0}

compute_dtype = torch.float16

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=False,
)



model_name = "TinyLlama/TinyLlama-1.1B-Chat-v0.4"
model = AutoModelForCausalLM.from_pretrained(model_name,
                                             quantization_config=bnb_config,
                                             device_map=device_map)

model.config.use_cache = False  # no KV cache while training
model.config.pretraining_tp = 1

tokenizer = AutoTokenizer.from_pretrained(model_name)  # padding/truncation are per-call options, not from_pretrained kwargs

new_special_tokens = {
    'pad_token': '<pad>',
    'additional_special_tokens': [
        '<maturityScore>', '<feedScore>', '<detectionScore>', '<productivityScore>',
        '<adversary>', '<tech1>', '<tech2>', '<tech3>',
        '<usecase1>', '<usecase2>', '<usecase3>', '<usecase>',
        '<yesOrNo>', '<blankOrNot>',
    ],
}
tokenizer.add_special_tokens(new_special_tokens)
tokenizer.padding_side = "right"

model, tokenizer = setup_chat_format(model, tokenizer)  # applies the ChatML template and resizes the embeddings for the new tokens
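
# Quick check (my understanding is that setup_chat_format switches the tokenizer to ChatML,
# so the rendered text should contain the <|im_start|>/<|im_end|> markers the collator below looks for)
print(tokenizer.apply_chat_template([{"role": "user", "content": "test"}], tokenize=False))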

peft_config = LoraConfig(
    lora_alpha=16,
    lora_dropout=0.1,
    r=64,
    bias="none",
    modules_to_save=["embed_tokens", "lm_head"],  # also train the (resized) token embeddings
    task_type="CAUSAL_LM",
    # q_proj and v_proj in layers 1-21 (note: layer 0 is not targeted)
    target_modules=(
        [f"model.layers.{i}.self_attn.q_proj" for i in range(1, 22)]
        + [f"model.layers.{i}.self_attn.v_proj" for i in range(1, 22)]
    ),
)

peft_model = get_peft_model(model, peft_config)
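
# How many parameters end up trainable with this config (the LoRA adapters plus the full
# embed_tokens / lm_head copies kept by modules_to_save)
peft_model.print_trainable_parameters()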


import datasets
def create_and_save_datasets(
    df, train_ratio=0.8, val_ratio=0.1, test_ratio=0.1
):
    seed = 123
    dataset = datasets.Dataset.from_pandas(df, preserve_index=False)

    # split into training and "the rest"
    train_valtest = dataset.train_test_split(train_size=train_ratio, seed=seed)

    # split "the rest" into validation and testing
    val_test = train_valtest["test"].train_test_split(
        test_size=test_ratio / (test_ratio + val_ratio), seed=seed
    )

    dataset = datasets.DatasetDict(
        {
            "train": train_valtest["train"],
            "valid": val_test["train"],
            "test": val_test["test"],
        }
    )
    return dataset
dataset = create_and_save_datasets(dataframe)
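
# The resulting DatasetDict with train / valid / test splits
print(dataset)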


# TODO: turn the "messages" column into text the trainer can tokenize
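
# My assumption of what that step could look like (not sure this is right, which is partly why
# I'm asking): render each "messages" list into a single string with the chat template and store
# it under "text" so it matches dataset_text_field="text" below.
def to_text(example):
    return {"text": tokenizer.apply_chat_template(example["messages"], tokenize=False)}

dataset = dataset.map(to_text)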

training_arguments = TrainingArguments(
    output_dir="./results",
    remove_unused_columns=False,
    num_train_epochs=4, 
    per_device_train_batch_size=2,
    gradient_accumulation_steps=3,
    optim="paged_adamw_32bit",
    save_steps=25,
    logging_steps=25,
    learning_rate=2e-3,
    weight_decay=0.001,
    fp16=False,
    bf16=False,
    max_grad_norm=0.3,
    max_steps=-1,
    warmup_ratio=0.03,
    group_by_length=True,
    lr_scheduler_type="constant",
    report_to="tensorboard",
)

collator = DataCollatorForCompletionOnlyLM(
    tokenizer.encode("<|im_start|>assistant\n", add_special_tokens=False),
    tokenizer=tokenizer,
    mlm=False,
)
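
# Sanity check on the marker the collator masks on: it should round-trip through the tokenizer
ids = tokenizer.encode("<|im_start|>assistant\n", add_special_tokens=False)
print(ids, repr(tokenizer.decode(ids)))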

trainer = SFTTrainer(
    model=peft_model,
    train_dataset=dataset["train"],
    # eval_dataset=dataset["valid"],
    peft_config=peft_config,
    dataset_text_field="text",
    max_seq_length=1024,
    tokenizer=tokenizer,
    args=training_arguments,
    packing=False,
    data_collator=collator,
)

trainer.train()

I’m getting the following error:


RuntimeError Traceback (most recent call last)
/usr/local/lib/python3.10/dist-packages/transformers/tokenization_utils_base.py in convert_to_tensors(self, tensor_type, prepend_batch_axis)
761 if not is_tensor(value):
--> 762 tensor = as_tensor(value)
763

15 frames
/usr/local/lib/python3.10/dist-packages/transformers/tokenization_utils_base.py in as_tensor(value, dtype)
723 return torch.tensor(np.array(value))
--> 724 return torch.tensor(value)
725

RuntimeError: Could not infer dtype of dict

The above exception was the direct cause of the following exception:

ValueError Traceback (most recent call last)
in <cell line: 44>()
42 )
43
--> 44 trainer.train()

/usr/local/lib/python3.10/dist-packages/trl/trainer/sft_trainer.py in train(self, *args, **kwargs)
449 self.model = self._trl_activate_neftune(self.model)
450
--> 451 output = super().train(*args, **kwargs)
452
453 # After training we make sure to retrieve back the original forward pass method

/usr/local/lib/python3.10/dist-packages/transformers/trainer.py in train(self, resume_from_checkpoint, trial, ignore_keys_for_eval, **kwargs)
1936 hf_hub_utils.enable_progress_bars()
1937 else:
--> 1938 return inner_training_loop(
1939 args=args,
1940 resume_from_checkpoint=resume_from_checkpoint,

/usr/local/lib/python3.10/dist-packages/transformers/trainer.py in _inner_training_loop(self, batch_size, args, resume_from_checkpoint, trial, ignore_keys_for_eval)
2234
2235 step = -1
--> 2236 for step, inputs in enumerate(epoch_iterator):
2237 total_batched_samples += 1
2238

/usr/local/lib/python3.10/dist-packages/accelerate/data_loader.py in __iter__(self)
452 # We iterate one batch ahead to check when we are at the end
453 try:
--> 454 current_batch = next(dataloader_iter)
455 except StopIteration:
456 yield

/usr/local/lib/python3.10/dist-packages/torch/utils/data/dataloader.py in __next__(self)
629 # TODO(https://github.com/pytorch/pytorch/issues/76750)
630 self._reset() # type: ignore[call-arg]
--> 631 data = self._next_data()
632 self._num_yielded += 1
633 if self._dataset_kind == _DatasetKind.Iterable and \

/usr/local/lib/python3.10/dist-packages/torch/utils/data/dataloader.py in _next_data(self)
673 def _next_data(self):
674 index = self._next_index() # may raise StopIteration
--> 675 data = self._dataset_fetcher.fetch(index) # may raise StopIteration
676 if self._pin_memory:
677 data = _utils.pin_memory.pin_memory(data, self._pin_memory_device)

/usr/local/lib/python3.10/dist-packages/torch/utils/data/_utils/fetch.py in fetch(self, possibly_batched_index)
52 else:
53 data = self.dataset[possibly_batched_index]
--> 54 return self.collate_fn(data)

/usr/local/lib/python3.10/dist-packages/transformers/data/data_collator.py in __call__(self, features, return_tensors)
43 return self.tf_call(features)
44 elif return_tensors == "pt":
--> 45 return self.torch_call(features)
46 elif return_tensors == "np":
47 return self.numpy_call(features)

/usr/local/lib/python3.10/dist-packages/trl/trainer/utils.py in torch_call(self, examples)
170
171 def torch_call(self, examples: List[Union[List[int], Any, Dict[str, Any]]]) -> Dict[str, Any]:
--> 172 batch = super().torch_call(examples)
173
174 if self.instruction_template is None:

/usr/local/lib/python3.10/dist-packages/transformers/data/data_collator.py in torch_call(self, examples)
804 # Handle dict or lists with proper padding and conversion to tensor.
805 if isinstance(examples[0], Mapping):
--> 806 batch = pad_without_fast_tokenizer_warning(
807 self.tokenizer, examples, return_tensors="pt", pad_to_multiple_of=self.pad_to_multiple_of
808 )

/usr/local/lib/python3.10/dist-packages/transformers/data/data_collator.py in pad_without_fast_tokenizer_warning(tokenizer, *pad_args, **pad_kwargs)
64
65 try:
--> 66 padded = tokenizer.pad(*pad_args, **pad_kwargs)
67 finally:
68 # Restore the state of the warning.

/usr/local/lib/python3.10/dist-packages/transformers/tokenization_utils_base.py in pad(self, encoded_inputs, padding, max_length, pad_to_multiple_of, return_attention_mask, return_tensors, verbose)
3576 batch_outputs[key].append(value)
3577
--> 3578 return BatchEncoding(batch_outputs, tensor_type=return_tensors)
3579
3580 def create_token_type_ids_from_sequences(

/usr/local/lib/python3.10/dist-packages/transformers/tokenization_utils_base.py in __init__(self, data, encoding, tensor_type, prepend_batch_axis, n_sequences)
225 self._n_sequences = n_sequences
226
--> 227 self.convert_to_tensors(tensor_type=tensor_type, prepend_batch_axis=prepend_batch_axis)
228
229 @property

/usr/local/lib/python3.10/dist-packages/transformers/tokenization_utils_base.py in convert_to_tensors(self, tensor_type, prepend_batch_axis)
776 "Please see if a fast version of this tokenizer is available to have this feature available."
777 ) from e
--> 778 raise ValueError(
779 "Unable to create tensor, you should probably activate truncation and/or padding with"
780 " 'padding=True' 'truncation=True' to have batched tensors with the same length. Perhaps your"

ValueError: Unable to create tensor, you should probably activate truncation and/or padding with 'padding=True' 'truncation=True' to have batched tensors with the same length. Perhaps your features (messages in this case) have excessive nesting (inputs type list where type int is expected).

Any help is appreciated.