Fine-tuning Wav2Vec2: loss stays constant

Hi there,
I’m trying to fine-tune Wav2Vec2-base on the UASpeech dataset, which contains atypical speech samples (UASpeech Database). Unfortunately, the loss stays constant over the training epochs and eval_wer remains 1.0, as shown in the logs below:

Any suggestions on how to solve this? My training script is attached at the end of this message.
Thanks in advance,
Davide.

{'loss': 19.5561, 'learning_rate': 9.73816155988858e-05, 'epoch': 1.63}
{'eval_loss': 3.539416790008545, 'eval_wer': 1.0, 'eval_runtime': 8.1628, 'eval_samples_per_second': 26.584, 'eval_steps_per_second': 3.43, 'epoch': 1.63}
{'loss': 3.4805, 'learning_rate': 9.181058495821727e-05, 'epoch': 3.25}
{'eval_loss': 3.531764268875122, 'eval_wer': 1.0, 'eval_runtime': 8.396, 'eval_samples_per_second': 25.846, 'eval_steps_per_second': 3.335, 'epoch': 3.25}
{'loss': 3.4661, 'learning_rate': 8.623955431754876e-05, 'epoch': 4.88}
{'eval_loss': 3.6301000118255615, 'eval_wer': 1.0, 'eval_runtime': 8.4002, 'eval_samples_per_second': 25.833, 'eval_steps_per_second': 3.333, 'epoch': 4.88}
{'loss': 3.4783, 'learning_rate': 8.066852367688023e-05, 'epoch': 6.5}
{'eval_loss': 3.519192695617676, 'eval_wer': 1.0, 'eval_runtime': 8.2413, 'eval_samples_per_second': 26.331, 'eval_steps_per_second': 3.398, 'epoch': 6.5}
{'loss': 3.4622, 'learning_rate': 7.509749303621171e-05, 'epoch': 8.13}
{'eval_loss': 3.5252716541290283, 'eval_wer': 1.0, 'eval_runtime': 8.4598, 'eval_samples_per_second': 25.651, 'eval_steps_per_second': 3.31, 'epoch': 8.13}
{'loss': 3.4772, 'learning_rate': 6.952646239554319e-05, 'epoch': 9.76}
{'eval_loss': 3.5181665420532227, 'eval_wer': 1.0, 'eval_runtime': 8.2436, 'eval_samples_per_second': 26.323, 'eval_steps_per_second': 3.397, 'epoch': 9.76}
{'loss': 3.4638, 'learning_rate': 6.395543175487466e-05, 'epoch': 11.38}
{'eval_loss': 3.527850866317749, 'eval_wer': 1.0, 'eval_runtime': 8.3076, 'eval_samples_per_second': 26.121, 'eval_steps_per_second': 3.37, 'epoch': 11.38}
{'loss': 3.476, 'learning_rate': 5.8384401114206136e-05, 'epoch': 13.01}
{'eval_loss': 3.522648572921753, 'eval_wer': 1.0, 'eval_runtime': 8.4275, 'eval_samples_per_second': 25.749, 'eval_steps_per_second': 3.322, 'epoch': 13.01}
{'loss': 3.4636, 'learning_rate': 5.281337047353761e-05, 'epoch': 14.63}
{'eval_loss': 3.521289825439453, 'eval_wer': 1.0, 'eval_runtime': 8.5234, 'eval_samples_per_second': 25.459, 'eval_steps_per_second': 3.285, 'epoch': 14.63}
{'loss': 3.4722, 'learning_rate': 4.724233983286908e-05, 'epoch': 16.26}
{'eval_loss': 3.5324676036834717, 'eval_wer': 1.0, 'eval_runtime': 8.561, 'eval_samples_per_second': 25.347, 'eval_steps_per_second': 3.271, 'epoch': 16.26}
{'loss': 3.4629, 'learning_rate': 4.167130919220056e-05, 'epoch': 17.89}
{'eval_loss': 3.5295488834381104, 'eval_wer': 1.0, 'eval_runtime': 8.5868, 'eval_samples_per_second': 25.271, 'eval_steps_per_second': 3.261, 'epoch': 17.89}
{'loss': 3.4659, 'learning_rate': 3.6100278551532036e-05, 'epoch': 19.51}
{'eval_loss': 3.5280654430389404, 'eval_wer': 1.0, 'eval_runtime': 8.7034, 'eval_samples_per_second': 24.933, 'eval_steps_per_second': 3.217, 'epoch': 19.51}
{'loss': 3.4656, 'learning_rate': 3.0529247910863515e-05, 'epoch': 21.14}
{'eval_loss': 3.526977777481079, 'eval_wer': 1.0, 'eval_runtime': 8.6447, 'eval_samples_per_second': 25.102, 'eval_steps_per_second': 3.239, 'epoch': 21.14}
{'loss': 3.4603, 'learning_rate': 2.4958217270194986e-05, 'epoch': 22.76}
{'eval_loss': 3.523038625717163, 'eval_wer': 1.0, 'eval_runtime': 8.3862, 'eval_samples_per_second': 25.876, 'eval_steps_per_second': 3.339, 'epoch': 22.76}
{'loss': 3.4666, 'learning_rate': 1.938718662952646e-05, 'epoch': 24.39}
{'eval_loss': 3.5222201347351074, 'eval_wer': 1.0, 'eval_runtime': 8.669, 'eval_samples_per_second': 25.032, 'eval_steps_per_second': 3.23, 'epoch': 24.39}
{'loss': 3.4637, 'learning_rate': 1.381615598885794e-05, 'epoch': 26.02}
{'eval_loss': 3.5271711349487305, 'eval_wer': 1.0, 'eval_runtime': 8.8704, 'eval_samples_per_second': 24.463, 'eval_steps_per_second': 3.157, 'epoch': 26.02}
{'loss': 3.4624, 'learning_rate': 8.245125348189415e-06, 'epoch': 27.64}
{'eval_loss': 3.5230629444122314, 'eval_wer': 1.0, 'eval_runtime': 8.7077, 'eval_samples_per_second': 24.92, 'eval_steps_per_second': 3.216, 'epoch': 27.64}
{'loss': 3.4619, 'learning_rate': 2.6740947075208913e-06, 'epoch': 29.27}
{'eval_loss': 3.524078845977783, 'eval_wer': 1.0, 'eval_runtime': 8.5451, 'eval_samples_per_second': 25.395, 'eval_steps_per_second': 3.277, 'epoch': 29.27}
{'train_runtime': 2438.4702, 'train_samples_per_second': 24.027, 'train_steps_per_second': 1.513, 'train_loss': 4.339348538199737, 'epoch': 30.0}
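
Since the eval WER never moves from 1.0, one sanity check worth running is whether every character in the transcriptions is covered by the pretrained tokenizer's vocabulary; characters outside the vocabulary get mapped to <unk>, which corrupts the labels. The snippet below is only a sketch and reuses the processor and dataset objects defined in the training script further down:

from collections import Counter

# Sketch of a vocabulary-coverage check (not part of the training script).
# The tokenizer encodes spaces as the word-delimiter token "|", and the
# transcriptions are uppercased before training, so compare in that form.
vocab = set(processor.tokenizer.get_vocab().keys())
char_counts = Counter(
    ch
    for text in dataset["train"]["transcription"]
    for ch in text.upper().replace(" ", "|")
)
missing = {ch: n for ch, n in char_counts.items() if ch not in vocab}
print("Characters missing from the tokenizer vocabulary:", missing)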


import torch

from dataclasses import dataclass
from typing import Dict, List, Union

import evaluate
import numpy as np
from datasets import load_dataset
from transformers import AutoProcessor, Trainer, TrainingArguments, Wav2Vec2ForCTC

# Load the audio + transcription pairs and split them into train/test
dataset = load_dataset("audiofolder", data_dir="./mydata")
dataset = dataset["train"]
print("Number of examples in the dataset:", len(dataset))
dataset = dataset.train_test_split(test_size=0.1)
print(dataset)
print(dataset["train"][0])


processor = AutoProcessor.from_pretrained("facebook/wav2vec2-base")

# The pretrained tokenizer's vocabulary is uppercase, so uppercase the transcriptions
def uppercase(example):
    return {"transcription": example["transcription"].upper()}


dataset = dataset.map(uppercase)
print(dataset["train"][0])

def prepare_dataset(batch):
    # Turn the raw waveform into input values and encode the transcription as labels
    audio = batch["audio"]
    batch = processor(audio["array"], sampling_rate=audio["sampling_rate"], text=batch["transcription"])
    batch["input_length"] = len(batch["input_values"][0])
    return batch


encoded_minds = dataset.map(prepare_dataset, num_proc=4)

@dataclass
class DataCollatorCTCWithPadding:

    processor: AutoProcessor
    padding: Union[bool, str] = "longest"

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        # split inputs and labels since they have to be of different lengths and need
        # different padding methods
        input_features = [{"input_values": feature["input_values"][0]} for feature in features]
        label_features = [{"input_ids": feature["labels"]} for feature in features]

        batch = self.processor.pad(input_features, padding=self.padding, return_tensors="pt")

        labels_batch = self.processor.pad(labels=label_features, padding=self.padding, return_tensors="pt")

        # replace padding with -100 to ignore loss correctly
        labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)

        batch["labels"] = labels

        return batch


data_collator = DataCollatorCTCWithPadding(processor=processor, padding="longest")


wer = evaluate.load("wer")


def compute_metrics(pred):
    pred_logits = pred.predictions
    pred_ids = np.argmax(pred_logits, axis=-1)

    # Replace -100 with the pad token id so the labels can be decoded
    pred.label_ids[pred.label_ids == -100] = processor.tokenizer.pad_token_id

    pred_str = processor.batch_decode(pred_ids)
    label_str = processor.batch_decode(pred.label_ids, group_tokens=False)

    return {"wer": wer.compute(predictions=pred_str, references=label_str)}



model = Wav2Vec2ForCTC.from_pretrained(
    "facebook/wav2vec2-base",
    ctc_loss_reduction="mean",
    pad_token_id=processor.tokenizer.pad_token_id,
)
# Keep the convolutional feature encoder frozen during fine-tuning
model.freeze_feature_encoder()

training_args = TrainingArguments(
    output_dir="UAwav2vec2-base",
    group_by_length=True,
    per_device_train_batch_size=16,
    evaluation_strategy="steps",
    num_train_epochs=30,
    fp16=True,
    gradient_checkpointing=True, 
    save_steps=200,
    eval_steps=200,
    logging_steps=200,
    learning_rate=1e-4,
    weight_decay=0.005,
    warmup_steps=100,
    save_total_limit=2,
    push_to_hub=False,
)
# Save the feature extractor and tokenizer alongside the checkpoints
processor.save_pretrained(training_args.output_dir)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=encoded_minds["train"],
    eval_dataset=encoded_minds["test"],
    tokenizer=processor.feature_extractor,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()
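
After training finishes, a quick way to see whether the model has simply collapsed onto the blank/pad token (which decodes to an empty string and would be consistent with a WER stuck at exactly 1.0) is to decode a few evaluation samples by hand. The snippet below is only a rough sketch reusing the model, processor and encoded_minds objects defined above:

# Inspect raw predictions on a few evaluation samples. If every frame is the
# pad/blank token, batch_decode returns an empty string.
model.eval()
for i in range(3):
    sample = encoded_minds["test"][i]
    input_values = torch.tensor(sample["input_values"][0]).unsqueeze(0).to(model.device)
    with torch.no_grad():
        logits = model(input_values).logits
    pred_ids = torch.argmax(logits, dim=-1)
    print("predicted ids:", pred_ids[0].unique().tolist())
    print("prediction:   ", repr(processor.batch_decode(pred_ids)[0]))
    print("reference:    ", sample["transcription"])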

Did you find any solution?