import pandas as pd

dataframe = pd.read_csv("answer_response.csv")
# the CSV stores question and answer in a single column; split on the first "?"
dataframe['answer'] = dataframe['question'].apply(lambda x: x.split('?', 1)[1][1:])
dataframe['question'] = dataframe['question'].apply(lambda x: x.split('?', 1)[0] + '?')
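# e.g. "What is the maturity score? High" becomes
#   question: "What is the maturity score?"   answer: "High"
# (values made up for illustration)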
def get_chat_format(element):
    """Turn one row into a system/user/assistant message list."""
    system_prompt = (
        "You are a helpful assistant that excels at answering questions regarding detection programs."
    )
    user_prompt = "{question}"
    return [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt.format_map(element)},
        {"role": "assistant", "content": element["answer"]},
    ]
dataframe["messages"] = dataframe.apply(get_chat_format, axis=1)
dataframe.drop(columns=['question', 'label', 'answer'], inplace=True)
dataframe
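# each entry of dataframe["messages"] is now a three-turn chat, e.g.
# (question/answer values made up for illustration):
# [{"role": "system", "content": "You are a helpful assistant that excels at answering questions regarding detection programs."},
#  {"role": "user", "content": "What is the maturity score for this feed?"},
#  {"role": "assistant", "content": "<maturityScore>"}]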
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, TrainingArguments, Trainer
from trl import setup_chat_format, DataCollatorForCompletionOnlyLM, SFTTrainer
from peft import LoraConfig, PeftModel
device_map = {"": 0}
compute_dtype = getattr(torch, "float16")
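# QLoRA-style setup: load the base model in 4-bit NF4 with float16 compute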
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=False,
)
model_name = "TinyLlama/TinyLlama-1.1B-Chat-v0.4"
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map=device_map,
)
model.config.use_cache = False
model.config.pretraining_tp = 1
tokenizer = AutoTokenizer.from_pretrained(model_name, padding=True, truncation=True)
new_special_tokens = {
    'pad_token': '<pad>',
    'additional_special_tokens': [
        '<maturityScore>', '<feedScore>', '<detectionScore>', '<productivityScore>',
        '<adversary>', '<tech1>', '<tech2>', '<tech3>',
        '<usecase1>', '<usecase2>', '<usecase3>', '<usecase>',
        '<yesOrNo>', '<blankOrNot>',
    ],
}
tokenizer.add_special_tokens(new_special_tokens)
tokenizer.padding_side = "right"
model, tokenizer = setup_chat_format(model, tokenizer)  # installs the ChatML chat template, adds <|im_start|>/<|im_end|> tokens, and resizes the embeddings
peft_config = LoraConfig(
    lora_alpha=16,
    lora_dropout=0.1,
    r=64,
    bias="none",
    # keep the (resized) token embeddings and LM head fully trainable for the new special tokens
    modules_to_save=["embed_tokens", "lm_head"],
    task_type="CAUSAL_LM",
    # LoRA on the q/v attention projections of layers 1-21
    target_modules=[
        f"model.layers.{i}.self_attn.{proj}"
        for i in range(1, 22)
        for proj in ("q_proj", "v_proj")
    ],
)
peft_model = PeftModel(model, peft_config)
from datasets import load_dataset
import datasets
def create_and_save_datasets(df, train_ratio=0.8, val_ratio=0.1, test_ratio=0.1):
    seed = 123
    dataset = datasets.Dataset.from_pandas(df, preserve_index=False)
    # split into training and "the rest"
    train_valtest = dataset.train_test_split(train_size=train_ratio, seed=seed)
    # split "the rest" into validation and testing
    val_test = train_valtest["test"].train_test_split(
        test_size=test_ratio / (test_ratio + val_ratio), seed=seed
    )
    dataset = datasets.DatasetDict(
        {
            "train": train_valtest["train"],
            "valid": val_test["train"],
            "test": val_test["test"],
        }
    )
    return dataset
dataset = create_and_save_datasets(dataframe)
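# dataset is now a DatasetDict with 80/10/10 train/valid/test splits;
# each example still holds only the "messages" list built above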
# TODO: the dataset still needs to be converted to plain text / tokenized before training
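# My assumption for what that step would look like (not applied in the run below):
# flatten each "messages" list into a single "text" string with the chat template
# installed by setup_chat_format, so that dataset_text_field="text" in SFTTrainer
# has a real column to read, e.g.
#   dataset = dataset.map(
#       lambda ex: {"text": tokenizer.apply_chat_template(ex["messages"], tokenize=False)}
#   )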
training_arguments = TrainingArguments(
    output_dir="./results",
    remove_unused_columns=False,
    num_train_epochs=4,
    per_device_train_batch_size=2,
    gradient_accumulation_steps=3,
    optim="paged_adamw_32bit",
    save_steps=25,
    logging_steps=25,
    learning_rate=2e-3,
    weight_decay=0.001,
    fp16=False,
    bf16=False,
    max_grad_norm=0.3,
    max_steps=-1,
    warmup_ratio=0.03,
    group_by_length=True,
    lr_scheduler_type="constant",
    report_to="tensorboard",
)
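# completion-only collator: tokens before the "<|im_start|>assistant\n" header are
# masked out of the loss, so training only fits the assistant responses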
collator = DataCollatorForCompletionOnlyLM(
    tokenizer.encode("<|im_start|>assistant\n", add_special_tokens=False),
    tokenizer=tokenizer,
    mlm=False,
)
trainer = SFTTrainer(
    model=peft_model,
    train_dataset=dataset["train"],
    # eval_dataset=dataset["valid"],
    peft_config=peft_config,
    dataset_text_field="text",
    max_seq_length=1024,
    tokenizer=tokenizer,
    args=training_arguments,
    packing=False,
    data_collator=collator,
)
trainer.train()
I’m getting the following error when I run trainer.train():
RuntimeError Traceback (most recent call last)
/usr/local/lib/python3.10/dist-packages/transformers/tokenization_utils_base.py in convert_to_tensors(self, tensor_type, prepend_batch_axis)
761 if not is_tensor(value):
→ 762 tensor = as_tensor(value)
763
15 frames
/usr/local/lib/python3.10/dist-packages/transformers/tokenization_utils_base.py in as_tensor(value, dtype)
723 return torch.tensor(np.array(value))
→ 724 return torch.tensor(value)
725
RuntimeError: Could not infer dtype of dict
The above exception was the direct cause of the following exception:
ValueError Traceback (most recent call last)
in <cell line: 44>()
42 )
43
—> 44 trainer.train()
/usr/local/lib/python3.10/dist-packages/trl/trainer/sft_trainer.py in train(self, *args, **kwargs)
449 self.model = self._trl_activate_neftune(self.model)
450
→ 451 output = super().train(*args, **kwargs)
452
453 # After training we make sure to retrieve back the original forward pass method
/usr/local/lib/python3.10/dist-packages/transformers/trainer.py in train(self, resume_from_checkpoint, trial, ignore_keys_for_eval, **kwargs)
1936 hf_hub_utils.enable_progress_bars()
1937 else:
→ 1938 return inner_training_loop(
1939 args=args,
1940 resume_from_checkpoint=resume_from_checkpoint,
/usr/local/lib/python3.10/dist-packages/transformers/trainer.py in _inner_training_loop(self, batch_size, args, resume_from_checkpoint, trial, ignore_keys_for_eval)
2234
2235 step = -1
→ 2236 for step, inputs in enumerate(epoch_iterator):
2237 total_batched_samples += 1
2238
/usr/local/lib/python3.10/dist-packages/accelerate/data_loader.py in __iter__(self)
452 # We iterate one batch ahead to check when we are at the end
453 try:
→ 454 current_batch = next(dataloader_iter)
455 except StopIteration:
456 yield
/usr/local/lib/python3.10/dist-packages/torch/utils/data/dataloader.py in __next__(self)
629 # TODO(https://github.com/pytorch/pytorch/issues/76750)
630 self._reset() # type: ignore[call-arg]
→ 631 data = self._next_data()
632 self._num_yielded += 1
633 if self._dataset_kind == _DatasetKind.Iterable and \
/usr/local/lib/python3.10/dist-packages/torch/utils/data/dataloader.py in _next_data(self)
673 def _next_data(self):
674 index = self._next_index() # may raise StopIteration
→ 675 data = self._dataset_fetcher.fetch(index) # may raise StopIteration
676 if self._pin_memory:
677 data = _utils.pin_memory.pin_memory(data, self._pin_memory_device)
/usr/local/lib/python3.10/dist-packages/torch/utils/data/_utils/fetch.py in fetch(self, possibly_batched_index)
52 else:
53 data = self.dataset[possibly_batched_index]
—> 54 return self.collate_fn(data)
/usr/local/lib/python3.10/dist-packages/transformers/data/data_collator.py in __call__(self, features, return_tensors)
43 return self.tf_call(features)
44 elif return_tensors == "pt":
—> 45 return self.torch_call(features)
46 elif return_tensors == "np":
47 return self.numpy_call(features)
/usr/local/lib/python3.10/dist-packages/trl/trainer/utils.py in torch_call(self, examples)
170
171 def torch_call(self, examples: List[Union[List[int], Any, Dict[str, Any]]]) -> Dict[str, Any]:
→ 172 batch = super().torch_call(examples)
173
174 if self.instruction_template is None:
/usr/local/lib/python3.10/dist-packages/transformers/data/data_collator.py in torch_call(self, examples)
804 # Handle dict or lists with proper padding and conversion to tensor.
805 if isinstance(examples[0], Mapping):
→ 806 batch = pad_without_fast_tokenizer_warning(
807 self.tokenizer, examples, return_tensors="pt", pad_to_multiple_of=self.pad_to_multiple_of
808 )
/usr/local/lib/python3.10/dist-packages/transformers/data/data_collator.py in pad_without_fast_tokenizer_warning(tokenizer, *pad_args, **pad_kwargs)
64
65 try:
—> 66 padded = tokenizer.pad(*pad_args, **pad_kwargs)
67 finally:
68 # Restore the state of the warning.
/usr/local/lib/python3.10/dist-packages/transformers/tokenization_utils_base.py in pad(self, encoded_inputs, padding, max_length, pad_to_multiple_of, return_attention_mask, return_tensors, verbose)
3576 batch_outputs[key].append(value)
3577
→ 3578 return BatchEncoding(batch_outputs, tensor_type=return_tensors)
3579
3580 def create_token_type_ids_from_sequences(
/usr/local/lib/python3.10/dist-packages/transformers/tokenization_utils_base.py in __init__(self, data, encoding, tensor_type, prepend_batch_axis, n_sequences)
225 self._n_sequences = n_sequences
226
→ 227 self.convert_to_tensors(tensor_type=tensor_type, prepend_batch_axis=prepend_batch_axis)
228
229 @property
/usr/local/lib/python3.10/dist-packages/transformers/tokenization_utils_base.py in convert_to_tensors(self, tensor_type, prepend_batch_axis)
776 "Please see if a fast version of this tokenizer is available to have this feature available."
777 ) from e
→ 778 raise ValueError(
779 "Unable to create tensor, you should probably activate truncation and/or padding with"
780 " 'padding=True' 'truncation=True' to have batched tensors with the same length. Perhaps your"
ValueError: Unable to create tensor, you should probably activate truncation and/or padding with 'padding=True' 'truncation=True' to have batched tensors with the same length. Perhaps your features (`messages` in this case) have excessive nesting (inputs type `list` where type `int` is expected).
Any help is appreciated.