The model did not return a loss from the inputs, only the following keys: logits. For reference, the inputs it received are input_values


Fine-tuning of self-supervised models for audio-based bird detection.
I am trying to add a linear classifier on top of wav2vec2. I want to fine-tune the wav2vec2 model for multi-class classification as shown below.

I followed the tutorial from Hugging Face, but I am still getting the following error:

ValueError: The model did not return a loss from the inputs, only the following keys: logits. For reference, the inputs it received are input_values.
 

"""
import torch
import numpy as np
import pandas as pd 
import librosa 
from sklearn.metrics import confusion_matrix
from sklearn.metrics import confusion_matrix, recall_score, accuracy_score


from transformers import pipeline, AutoFeatureExtractor, Wav2Vec2FeatureExtractor, Wav2Vec2Processor, AutoModelForAudioClassification, TrainingArguments, Trainer
from datasets import load_dataset, Dataset, load_metric, Audio, concatenate_datasets
"""

> Fine-tuning Wav2Vec2 for bird classification

class FineTuneSSLModels():
    """Fine-tune a pretrained wav2vec2 checkpoint for multi-class bird classification.

    Constructing an instance runs the whole experiment: it reads the
    train/val/test CSV manifests, loads every audio file, extracts the model
    inputs, builds the classification model, trains it with the Hugging Face
    ``Trainer`` and finally prints the metrics obtained on the validation
    split.

    The CSV manifests must provide a ``FileName`` and a ``Label`` column; a
    ``SpeakerLabel`` column is dropped when present.
    """

    # Manifest columns that are only bookkeeping and must not reach the model.
    _DROP_COLUMNS = ('FilePath', 'SpeakerLabel', 'FileName')

    def __init__(self):
        """Build the datasets and the model, then train and evaluate."""
        self.categories = {'Pigeon': 0, 'Sparrow': 1, 'Crow': 2, 'Eagle': 3, 'Hawk': 4, 'Parrot': 5, 'Dove': 6, 'Peacock': 7}
        self.freeze_encoder = False
        self.transformer = False
        self.pre_trained_model = "facebook/wav2vec2-base-960h"  # w2v2 base, 960 hours of LibriSpeech

        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        # NOTE(review): `load_metric` is deprecated in recent `datasets`
        # releases in favour of the `evaluate` package -- confirm the
        # installed version still ships it.
        self.f1_metric = load_metric("f1")
        self.recall_metric = load_metric("recall")
        self.target_sr = 16000

        # Read CSV files
        self.path_to_train = 'train.csv'
        self.path_to_val = 'val.csv'
        self.path_to_test = 'test.csv'

        self.df_train = pd.read_csv(self.path_to_train, encoding='utf-8')
        self.df_val = pd.read_csv(self.path_to_val, encoding='utf-8')
        self.df_test = pd.read_csv(self.path_to_test, encoding='utf-8')
        print("self train", self.df_train)

        # Map every label seen in the training split to a contiguous class id.
        self.labels = sorted(self.df_train.Label.unique())
        self.label_dict = {label: index for index, label in enumerate(self.labels)}
        print(self.label_dict)
        self.df_train = self.df_train.replace({"Label": self.label_dict})
        self.df_val = self.df_val.replace({"Label": self.label_dict})
        self.df_test = self.df_test.replace({"Label": self.label_dict})

        # Get full path
        self.df_train['FilePath'] = 'full_path/train/' + self.df_train['FileName']
        self.df_val['FilePath'] = 'full_path/eval/' + self.df_val['FileName']
        # NOTE(review): the original read 'full_pathtest/'; the separator was
        # presumably lost -- confirm against the real directory layout.
        self.df_test['FilePath'] = 'full_path/test/' + self.df_test['FileName']

        # Create datasets and attach the decoded waveform to every row.
        self.train_dataset = Dataset.from_pandas(self.df_train)
        self.val_dataset = Dataset.from_pandas(self.df_val)
        self.test_dataset = Dataset.from_pandas(self.df_test)

        self.train_dataset = self.train_dataset.map(self.preprocess_data)
        self.val_dataset = self.val_dataset.map(self.preprocess_data)
        self.test_dataset = self.test_dataset.map(self.preprocess_data)

        self.feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(self.pre_trained_model, cache_dir="SSLFineTune/")
        self.processor = Wav2Vec2Processor.from_pretrained(self.pre_trained_model, cache_dir="SSLFineTune/")

        self.train_dataset = self._encode_split(self.train_dataset)
        self.val_dataset = self._encode_split(self.val_dataset)
        self.test_dataset = self._encode_split(self.test_dataset)

        num_labels = len(self.labels)
        # `trust_remote_code=True` was dropped: the checkpoint is an official
        # architecture and needs no code downloaded from the Hub.
        self.model = AutoModelForAudioClassification.from_pretrained(
            self.pre_trained_model,
            cache_dir="SSLFineTune/",
            num_labels=num_labels,
            label2id={str(name): index for name, index in self.label_dict.items()},
            id2label={index: str(name) for name, index in self.label_dict.items()},
        )

        # Fresh classification head sized from the data instead of a
        # hard-coded 256 -> 8 layer, so it always agrees with `num_labels`
        # (the model's loss reshapes the logits with `config.num_labels`).
        self.model.classifier = torch.nn.Linear(
            in_features=self.model.classifier.in_features,
            out_features=num_labels,
            bias=True,
        )

        self.freeze_feature_extractor = False
        self.freeze_transformer = False
        if self.freeze_feature_extractor:
            # Current name of the deprecated `freeze_feature_extractor()`.
            self.model.freeze_feature_encoder()
        if self.freeze_transformer:
            # There is no `freeze_transformer()`; `freeze_base_model()` freezes
            # the whole wav2vec2 backbone and leaves only the head trainable.
            self.model.freeze_base_model()

        self.args = TrainingArguments(
            "SSLFineTune/",
            overwrite_output_dir=True,
            evaluation_strategy="epoch",
            save_strategy="epoch",
            learning_rate=3e-5,
            per_device_train_batch_size=8,
            gradient_accumulation_steps=1,
            per_device_eval_batch_size=1,
            num_train_epochs=10,
            warmup_ratio=0.1,
            logging_steps=10,
            load_best_model_at_end=True,
            # Must match a key returned by `compute_metrics`.
            metric_for_best_model="uar",
            push_to_hub=False,
            gradient_checkpointing=True,
            save_total_limit=5,
        )

        self.trainer = Trainer(
            self.model,
            self.args,
            train_dataset=self.train_dataset,
            eval_dataset=self.val_dataset,
            tokenizer=self.feature_extractor,
            compute_metrics=self.compute_metrics,
        )

        self.trainer.train()

        self.predictions = self.trainer.predict(self.val_dataset)
        print(self.compute_metrics(self.predictions))

    def _encode_split(self, dataset):
        """Convert raw audio to model inputs and expose the target as ``label``."""
        unused = [name for name in self._DROP_COLUMNS if name in dataset.column_names]
        dataset = dataset.map(self.prepare_datasets, remove_columns=unused, batched=True, batch_size=1)
        # The Trainer removes dataset columns the model does not accept, so a
        # column called "Label" never reaches the model and no loss can be
        # computed. A column called "label" is kept and handed to the model as
        # `labels` by the data collator.
        return dataset.rename_column("Label", "label")

    def compute_metrics(self, eval_pred):
        """Return the unweighted average recall (macro recall) as ``{"uar": value}``.

        ``eval_pred`` must expose ``predictions`` (per-class scores, one row
        per example) and ``label_ids`` (reference class ids).
        """
        predictions = np.argmax(eval_pred.predictions, axis=1)
        recall = self.recall_metric.compute(
            predictions=predictions,
            references=eval_pred.label_ids,
            average="macro",
        )
        # Keyed as "uar" because TrainingArguments selects the best checkpoint
        # with metric_for_best_model="uar".
        return {"uar": recall["recall"]}

    def preprocess_data(self, audio_example):
        """Load the file at ``FilePath`` and store it under ``audio`` / ``sampling_rate``."""
        # librosa resamples to the requested rate while loading.
        audio_example['audio'], audio_example['sampling_rate'] = librosa.load(
            audio_example["FilePath"], sr=self.target_sr
        )
        return audio_example

    def prepare_datasets(self, audio_example):
        """Run the feature extractor on the ``audio`` entry and return its output."""
        audio_data = audio_example['audio']
        # Pass the rate the audio was actually loaded at so the extractor can
        # reject a mismatch with the rate the checkpoint expects.
        return self.feature_extractor(audio_data, sampling_rate=self.target_sr)

    




# Script entry point: constructing the object runs the complete pipeline
# (data loading, training and evaluation all happen in the constructor).
if __name__ == "__main__":
    initiate = FineTuneSSLModels()

1 Like

Edit: fixed for me by changing BertModel to BertForSequenceClassification.

I’m getting the same error, also following the tutorials in the course, chapter 3: Fine-tuning a model with the Trainer API - Hugging Face Course.

My code:

import evaluate
import numpy as np
from datasets import load_dataset
from transformers import (
    BertForSequenceClassification,
    BertModel,
    BertTokenizerFast,
    TrainingArguments,
    Trainer,
    EvalPrediction,
)

checkpoint = "bert-base-uncased"
# `BertModel` is the bare encoder: it has no classification head and never
# returns a loss, which is what makes the Trainer raise "The model did not
# return a loss from the inputs". The sequence-classification variant adds the
# head and computes the loss whenever `labels` are supplied.
# MRPC is a binary paraphrase task, hence two labels.
model = BertForSequenceClassification.from_pretrained(checkpoint, num_labels=2)
tokenizer = BertTokenizerFast.from_pretrained(checkpoint)
raw_ds = load_dataset("glue", "mrpc")
metric = evaluate.load("glue", "mrpc")

# Tokenize the sentence pairs; padding is left to the Trainer's collator.
dataset = raw_ds.map(
    lambda x: tokenizer(x["sentence1"], x["sentence2"], truncation=True),
    batched=True,
)
# Keep only the columns the model accepts and use the `labels` name it expects.
dataset = dataset.remove_columns(["sentence1", "sentence2", "idx"])
dataset = dataset.rename_column("label", "labels")
dataset = dataset.with_format("torch")

trainer_args = TrainingArguments("test-trainer", evaluation_strategy="epoch")


def compute_metrics(eval_preds: EvalPrediction):
    """Compute the GLUE/MRPC metrics from the logits and reference labels."""
    logits, references = eval_preds
    preds = np.argmax(logits, -1)
    return metric.compute(predictions=preds, references=references)


trainer = Trainer(
    model=model,
    args=trainer_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["validation"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

trainer.train()