Hi, I was finetuning a DNABERT model using LoRA.
However, there seems to be a data-loading error after applying LoRA.
My data looks like this:

Dataset({
    features: ['binding_energy', 'label', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
    num_rows: 9261
})
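For reference, indexing a row directly shows all the columns (a quick sanity check; train_dataset is the same object loaded with load_from_disk further down):

# sanity check: every expected key is present on a raw row
print(train_dataset[0].keys())
# dict_keys(['binding_energy', 'label', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'])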
and my code is
import torch
from datasets import load_from_disk
from transformers import BertConfig, Trainer
from peft import LoraConfig, get_peft_model
# DNATokenizer and DNABertForSequenceClassification are the custom classes
# from the DNABERT codebase

config = BertConfig.from_pretrained(
    data_training_args.model_path,
    num_labels=1,
)
tokenizer = DNATokenizer.from_pretrained(
    data_training_args.model_path,
    do_lower_case=False,
)
model = DNABertForSequenceClassification.from_pretrained(
    data_training_args.model_path,
    from_tf=bool(".ckpt" in data_training_args.model_path),
    config=config,
)
# Define LoRA config
lora_config = LoraConfig(
    r=LORA_R,
    lora_alpha=LORA_ALPHA,
    lora_dropout=LORA_DROPOUT,
    bias="none",
    inference_mode=False,
)
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()
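As a diagnostic, I also compared the forward signatures of the wrapped and unwrapped models with the standard inspect module (a minimal sketch; I believe PeftModel exposes the original model via get_base_model(), but I may be holding the PEFT API wrong):

import inspect

# keyword names the wrapped (PEFT) model's forward exposes
print(list(inspect.signature(model.forward).parameters))
# keyword names the underlying DNABERT model's forward exposes
print(list(inspect.signature(model.get_base_model().forward).parameters))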
def collate_fn(examples):
    rdict = {}
    print(examples)  # debug: inspect what the Trainer actually hands the collator
    for k in examples[0].keys():
        if k not in [
            "binding_energy",
            "input_ids",
            "token_type_ids",
            "attention_mask",
            "labels",
        ]:
            continue
        rdict[k] = torch.stack([torch.tensor(example[k]) for example in examples])
    rdict["binding_energy"] = torch.unsqueeze(rdict["binding_energy"], 1)
    return rdict
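Calling the collator by hand on two raw rows behaves as I'd expect, which makes me think the columns are being stripped before the batch ever reaches it (a minimal check, run after the load_from_disk calls below):

# manual check: feed two raw dataset rows straight into the collator
batch = collate_fn([train_dataset[0], train_dataset[1]])
print({k: v.shape for k, v in batch.items()})  # binding_energy should come out as [2, 1]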
train_dataset = load_from_disk(data_training_args.train_data_dir)
dev_dataset = load_from_disk(data_training_args.dev_data_dir)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=dev_dataset,
    tokenizer=tokenizer,
    data_collator=collate_fn,
    compute_metrics=compute_metrics,
)
trainer.train()  # the error below is raised once training starts
Then there is a key error:

    rdict["binding_energy"] = torch.unsqueeze(rdict["binding_energy"], 1)
KeyError: 'binding_energy'
When I print examples inside the collator, only label is there, even though the loaded dataset has all the columns shown above. Without LoRA there is no such issue.
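One thing I haven't ruled out is the column-pruning flag on TrainingArguments (remove_unused_columns, which as far as I understand defaults to True and drops dataset columns whose names don't appear in the model's forward signature):

# remove_unused_columns defaults to True in TrainingArguments
print(training_args.remove_unused_columns)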
Wondering why this happens? Thanks!