Fine-Tuning of Self-Supervised Models for Audio-Based Bird Detection
I am trying to add a linear classifier on top of wav2vec2 and fine-tune the model for multi-class classification, as shown below.
I followed the tutorial from Hugging Face, but I am still getting the following error:
ValueError: The model did not return a loss from the inputs, only the following keys: logits. For reference, the inputs it received are input_values.
"""
import torch
import numpy as np
import pandas as pd
import librosa
from sklearn.metrics import confusion_matrix
from sklearn.metrics import confusion_matrix, recall_score, accuracy_score
from transformers import pipeline, AutoFeatureExtractor, Wav2Vec2FeatureExtractor, Wav2Vec2Processor, AutoModelForAudioClassification, TrainingArguments, Trainer
from datasets import load_dataset, Dataset, load_metric, Audio, concatenate_datasets
"""
# Fine-tuning Wav2Vec2 for bird classification
class FineTuneSSLModels():
    def __init__(self):
        self.categories = {'Pigeon': 0, 'Sparrow': 1, 'Crow': 2, 'Eagle': 3, 'Hawk': 4, 'Parrot': 5, 'Dove': 6, 'Peacock': 7}
        self.freeze_encoder = False
        self.transformer = False
        self.pre_trained_model = "facebook/wav2vec2-base-960h"  # w2v2 base, 960 hours LibriSpeech
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.f1_metric = load_metric("f1")
        self.recall_metric = load_metric("recall")
        self.target_sr = 16000
        # Read CSV files
        self.path_to_train = 'train.csv'
        self.path_to_val = 'val.csv'
        self.path_to_test = 'test.csv'
        self.df_train = pd.read_csv(self.path_to_train, encoding='utf-8')
        self.df_val = pd.read_csv(self.path_to_val, encoding='utf-8')
        self.df_test = pd.read_csv(self.path_to_test, encoding='utf-8')
        print("self train", self.df_train)
        self.labels = sorted(self.df_train.Label.unique())
        self.label_dict = {label: i for i, label in enumerate(self.labels)}
        print(self.label_dict)
        self.df_train = self.df_train.replace({"Label": self.label_dict})
        self.df_val = self.df_val.replace({"Label": self.label_dict})
        self.df_test = self.df_test.replace({"Label": self.label_dict})
        # Build full file paths
        self.df_train['FilePath'] = 'full_path/train/' + self.df_train['FileName']
        self.df_val['FilePath'] = 'full_path/eval/' + self.df_val['FileName']
        self.df_test['FilePath'] = 'full_path/test/' + self.df_test['FileName']
        # Create datasets
        self.train_dataset = Dataset.from_pandas(self.df_train)
        self.val_dataset = Dataset.from_pandas(self.df_val)
        self.test_dataset = Dataset.from_pandas(self.df_test)
        self.train_dataset = self.train_dataset.map(self.preprocess_data)
        self.val_dataset = self.val_dataset.map(self.preprocess_data)
        self.test_dataset = self.test_dataset.map(self.preprocess_data)
        self.feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(self.pre_trained_model, cache_dir="SSLFineTune/")
        self.processor = Wav2Vec2Processor.from_pretrained(self.pre_trained_model, cache_dir="SSLFineTune/")
        self.train_dataset = self.train_dataset.map(self.prepare_datasets, remove_columns=['FilePath', 'SpeakerLabel', 'FileName'], batched=True, batch_size=1)
        self.val_dataset = self.val_dataset.map(self.prepare_datasets, remove_columns=['FilePath', 'SpeakerLabel', 'FileName'], batched=True, batch_size=1)
        self.test_dataset = self.test_dataset.map(self.prepare_datasets, remove_columns=['FilePath', 'SpeakerLabel', 'FileName'], batched=True, batch_size=1)
        self.model = AutoModelForAudioClassification.from_pretrained(
            self.pre_trained_model,
            trust_remote_code=True,
            cache_dir="SSLFineTune/",
            num_labels=len(self.df_train['Label'].unique()),
        )
        # Classification head for 8 classes
        self.model.classifier = torch.nn.Linear(in_features=256, out_features=8, bias=True)
        self.freeze_feature_extractor = False
        self.freeze_transformer = False
        if self.freeze_feature_extractor:
            self.model.freeze_feature_extractor()
        if self.freeze_transformer:
            self.model.freeze_transformer()
        self.args = TrainingArguments(
            "SSLFineTune/",
            overwrite_output_dir=True,
            evaluation_strategy="epoch",
            save_strategy="epoch",
            learning_rate=3e-5,
            per_device_train_batch_size=8,
            gradient_accumulation_steps=1,
            per_device_eval_batch_size=1,
            num_train_epochs=10,
            warmup_ratio=0.1,
            logging_steps=10,
            load_best_model_at_end=True,
            metric_for_best_model="uar",
            push_to_hub=False,
            gradient_checkpointing=True,
            save_total_limit=5
        )
        self.trainer = Trainer(
            self.model,
            self.args,
            train_dataset=self.train_dataset,
            eval_dataset=self.val_dataset,
            tokenizer=self.feature_extractor,
            compute_metrics=self.compute_metrics)
        self.trainer.train()
        self.predictions = self.trainer.predict(self.val_dataset)
        print(self.compute_metrics(self.predictions))
    def compute_metrics(self, eval_pred):
        """Computes recall on a batch of predictions"""
        print(eval_pred)
        predictions = np.argmax(eval_pred.predictions, axis=1)
        recall = self.recall_metric.compute(predictions=predictions, references=eval_pred.label_ids, average="macro")
        # f1 = self.f1_metric.compute(predictions=predictions, references=eval_pred.label_ids, average="macro")
        # return {"f1": f1, "spearmanr": spearmanr}
        return recall

    def preprocess_data(self, audio_example):
        """Adds the audio array and sampling rate as extra columns"""
        audio_example['audio'], audio_example['sampling_rate'] = librosa.load(audio_example["FilePath"], sr=16000)
        # print("Single Audio", np.array(audio_example['audio']).shape, audio_example)
        # exit(0)
        return audio_example

    def prepare_datasets(self, audio_example):
        audio_data = audio_example['audio']
        enc_embeds = self.feature_extractor(audio_data, sampling_rate=self.feature_extractor.sampling_rate)
        return enc_embeds

if __name__ == "__main__":
    initiate = FineTuneSSLModels()
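The error usually means no labels ever reach the model: Trainer drops dataset columns that are not arguments of the model's forward pass (it only keeps columns named label or labels in addition to the model inputs), and in the code above the Label column is neither renamed nor copied into the output of prepare_datasets, so the model only receives input_values and returns logits without a loss. A minimal sketch of one way to carry the class ids through, assuming Label already holds the integer ids built from label_dict (not tested against the rest of this pipeline):
def prepare_datasets(self, audio_example):
    # extract input_values from the raw waveform, as before
    enc_embeds = self.feature_extractor(
        audio_example['audio'],
        sampling_rate=self.feature_extractor.sampling_rate,
    )
    # copy the integer class id into the batch under the name the model's
    # forward() expects, so Trainer can compute a loss from the outputs
    enc_embeds['labels'] = audio_example['Label']
    return enc_embeds
Renaming the column instead, for example self.train_dataset = self.train_dataset.rename_column("Label", "labels") before the prepare_datasets map, should have the same effect, since Trainer keeps columns named label or labels.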
Edit: fixed for me by changing BertModel to BertForSequenceClassification.
I'm getting the same error, also following the tutorial in the course, Chapter 3: Fine-tuning a model with the Trainer API - Hugging Face Course.
My code:
import evaluate
import numpy as np
from datasets import load_dataset
from transformers import (
    BertModel,
    BertTokenizerFast,
    TrainingArguments,
    Trainer,
    EvalPrediction,
)
checkpoint = "bert-base-uncased"
model = BertModel.from_pretrained(checkpoint)
tokenizer = BertTokenizerFast.from_pretrained(checkpoint)
raw_ds = load_dataset("glue", "mrpc")
metric = evaluate.load("glue", "mrpc")
dataset = raw_ds.map(
    lambda x: tokenizer(x["sentence1"], x["sentence2"], truncation=True),
    batched=True,
)
dataset = dataset.remove_columns(["sentence1", "sentence2", "idx"])
dataset = dataset.rename_column("label", "labels")
dataset = dataset.with_format("torch")
trainer_args = TrainingArguments("test-trainer", evaluation_strategy="epoch")
def compute_metrics(eval_preds: EvalPrediction):
    x, y = eval_preds
    preds = np.argmax(x, -1)
    return metric.compute(predictions=preds, references=y)
trainer = Trainer(
    model=model,
    args=trainer_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["validation"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)
trainer.train()
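For the snippet above, the edit at the top is the fix: BertModel is a bare encoder and never computes a loss, while BertForSequenceClassification adds a classification head and returns a loss whenever the batch contains labels. A minimal sketch of the changed lines (num_labels=2 is assumed here because MRPC is a binary task):
from transformers import BertForSequenceClassification

# the sequence-classification head computes a cross-entropy loss from the
# "labels" column that the MRPC preprocessing above already provides
model = BertForSequenceClassification.from_pretrained(checkpoint, num_labels=2)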
I get a similar error when using
# Set DistilBERT tokenizer
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
# Define DistilBERT as our base model:
model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=9)
# Use the data_collator to convert our samples to PyTorch tensors and pad them to the same length
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
Specifically, when using
# Define a new Trainer with all the objects we constructed so far
repo_name = "sentiment-model-amazon-reviews-distilbert"
training_args = TrainingArguments(
    output_dir=repo_name,
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=1,
    weight_decay=0.01,
    save_strategy="epoch",
    push_to_hub=True
)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_cal,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)
# Train and push to hub
trainer.train()
returns
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-50-f0acdc25090a> in <module>
24
25 # Train and push to hub
---> 26 trainer.train()
3 frames
/usr/local/lib/python3.8/dist-packages/transformers/trainer.py in train(self, resume_from_checkpoint, trial, ignore_keys_for_eval, **kwargs)
1541 self._inner_training_loop, self._train_batch_size, args.auto_find_batch_size
1542 )
-> 1543 return inner_training_loop(
1544 args=args,
1545 resume_from_checkpoint=resume_from_checkpoint,
/usr/local/lib/python3.8/dist-packages/transformers/trainer.py in _inner_training_loop(self, batch_size, args, resume_from_checkpoint, trial, ignore_keys_for_eval)
1789 tr_loss_step = self.training_step(model, inputs)
1790 else:
-> 1791 tr_loss_step = self.training_step(model, inputs)
1792
1793 if (
/usr/local/lib/python3.8/dist-packages/transformers/trainer.py in training_step(self, model, inputs)
2537
2538 with self.compute_loss_context_manager():
-> 2539 loss = self.compute_loss(model, inputs)
2540
2541 if self.args.n_gpu > 1:
/usr/local/lib/python3.8/dist-packages/transformers/trainer.py in compute_loss(self, model, inputs, return_outputs)
2582 else:
2583 if isinstance(outputs, dict) and "loss" not in outputs:
-> 2584 raise ValueError(
2585 "The model did not return a loss from the inputs, only the following keys: "
2586 f"{','.join(outputs.keys())}. For reference, the inputs it received are {','.join(inputs.keys())}."
ValueError: The model did not return a loss from the inputs, only the following keys: logits. For reference, the inputs it received are input_ids,attention_mask.
Rename the columns to text and labels for text classification with the distilbert-base-uncased model. This still needs to be checked for other domains and models.
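A minimal sketch of that renaming for the DistilBERT setup above (the column name "sentiment" is a placeholder, since the original dataset's columns are not shown):
# whatever the class-id column is actually called, it has to end up named
# "labels" (or "label") so Trainer keeps it and the model can return a loss
tokenized_train = tokenized_train.rename_column("sentiment", "labels")
tokenized_cal = tokenized_cal.rename_column("sentiment", "labels")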