I am new to Hugging Face and transformers in general. I have worked with tensorflow models before, but not with Pytorch.
I want to create a regression model based on DNA, and have found an interesting model here:armheb/DNA_bert_4.
Loading the model, tokenizing the data etc works, however when I want to train the model, I recive a runtime error leading back to loss.backwards(), stating that it found double but expected float.
I then thought it might be the regression and the custom loss function I wanted to use, so I created random class (0 or 1) for my data, and copied the accuracy loss function from the Hugging Face tutorial, resulting in the same error, this time with long and float.
Therefor i assume I have made a mistake with creating the labels/loss function, however I do not know how to fix it.
Here is my current code.
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer
from transformers import pipeline
import numpy as np
model_name="armheb/DNA_bert_4"
model = AutoModelForSequenceClassification.from_pretrained(model_name,num_labels=1)
tokenizer = AutoTokenizer.from_pretrained(model_name)
#reading in data, truncated for this forum post
df_whole_genome=pd.read_csv("xyz",sep="\t",index_col=0)
batch=tokenizer(list(df_whole_genome["Sequence"].values),padding="max_length")
labels=df_whole_genome.value_of_interest.values
class CycDataset(torch.utils.data.Dataset):
def __init__(self, encodings, labels):
self.encodings = encodings
self.labels = labels
def __getitem__(self, idx):
item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
item['labels'] = torch.tensor(self.labels[idx])
return item
def __len__(self):
return len(self.labels)
dataset=CycDataset(batch,labels)
train, test = train_test_split(dataset, test_size=0.2,random_state=42)
here is the training
from sklearn.metrics import mean_squared_error
def compute_metrics(eval_pred):
predictions, labels = eval_pred
rmse = mean_squared_error(labels, predictions, squared=False)
return {"rmse": rmse}
training_args = TrainingArguments(output_dir="test_trainer",
num_train_epochs=3
)
trainer = Trainer(
model=model,
args=training_args,
train_dataset=train,
eval_dataset=test,
compute_metrics=compute_metrics,
)
trainer.train()
This results in
***** Running training *****
Num examples = 65923
Num Epochs = 3
Instantaneous batch size per device = 8
Total train batch size (w. parallel, distributed & accumulation) = 8
Gradient Accumulation steps = 1
Total optimization steps = 24723
Traceback (most recent call last):
File "/tmp/ipykernel_8483/462224225.py", line 21, in <cell line: 21>
trainer.train()
File "redacted/envs/hugging/lib/python3.8/site-packages/transformers/trainer.py", line 1332, in train
tr_loss_step = self.training_step(model, inputs)
File "redacted/envs/hugging/lib/python3.8/site-packages/transformers/trainer.py", line 1909, in training_step
loss.backward()
File "redacted/anaconda3/envs/hugging/lib/python3.8/site-packages/torch/tensor.py", line 221, in backward
torch.autograd.backward(self, gradient, retain_graph, create_graph)
File "redacted/anaconda3/envs/hugging/lib/python3.8/site-packages/torch/autograd/__init__.py", line 130, in backward
Variable._execution_engine.run_backward(
RuntimeError: Found dtype Double but expected Float
The labels as I call them here are float64