Using Wav2Vec in speech classification/regression problems

Hi @darkcurrent,

Thanks! If you want to use the script, some parts need to be changed. I’ll guide you through the notebook, and if it helps you, please consider contributing to the repo to improve the code for general use.

Suppose you have a schema like this, and the emotion is a list of floats/integers.

Dataset({
    features: ['path', 'emotion'],
    num_rows: xxx
})

Item({
path: "/to/path/idk.wav",
emotion: [1.1, 2.1, 2.4, ..., 1.5]
})

First of all, we need to change the label information:

# Classification setup: the distinct values of the output column are the classes.
label_list = train_dataset.unique(output_column)
label_list.sort()  # Let's sort it for determinism
num_labels = len(label_list)
print(f"A classification problem with {num_labels} classes: {label_list}")

to this (each label is now a list of values, so `num_labels` is the length of one label):

# Regression setup: each example's label is a list of values, so the number
# of outputs is the length of one label (read from the first training example).
num_labels = len(train_dataset[0][output_column])
# One integer name per regression output: 0 .. num_labels - 1.
# (Fixed: the original referenced the misspelled name `nnu_labels`.)
label_list = list(range(num_labels))
print(f"A regression problem with {num_labels} items: {label_list}")
is_regression = True

Then, add the problem type to the config and adjust the preprocessing step and the label type in the collator fn.

# config
# Load the pretrained config and mark the task as regression; the label maps
# translate between entries of `label_list` and their positions.
config = AutoConfig.from_pretrained(
    model_name_or_path,
    num_labels=num_labels,
    label2id={name: index for index, name in enumerate(label_list)},
    id2label=dict(enumerate(label_list)),
    finetuning_task="wav2vec2_clf",
    problem_type="regression",
)
# The pooling mode is stored as an extra attribute on the loaded config.
config.pooling_mode = pooling_mode
# preprocess
def speech_file_to_array_fn(path):
    """Load the audio file at ``path`` and return it as a resampled NumPy array.

    The waveform is resampled from the file's own sampling rate to
    ``target_sampling_rate``, size-1 dimensions are squeezed away, and the
    tensor is converted to a NumPy array.
    """
    waveform, source_rate = torchaudio.load(path)
    to_target_rate = torchaudio.transforms.Resample(source_rate, target_sampling_rate)
    return to_target_rate(waveform).squeeze().numpy()

def preprocess_function(examples):
    """Convert a batch of examples into processor features plus labels.

    Audio paths are read from ``examples[input_column]`` and loaded with
    ``speech_file_to_array_fn``; the targets are copied from
    ``examples[output_column]`` and stored under the ``"labels"`` key of the
    processor output, which is returned.
    """
    waveforms = [speech_file_to_array_fn(audio_path) for audio_path in examples[input_column]]
    # Do any preprocessing on your float/integer data here
    targets = list(examples[output_column])

    features = processor(waveforms, sampling_rate=target_sampling_rate)
    features["labels"] = targets
    return features
# collator
from dataclasses import dataclass
from typing import Dict, List, Optional, Union
import torch

import transformers
from transformers import Wav2Vec2Processor


@dataclass
class DataCollatorCTCWithPadding:
    """Collate examples into a padded batch with float labels.

    Calling the collator pads the ``input_values`` of every example through
    ``processor.pad`` and attaches the examples' ``labels`` as a single
    ``torch.float`` tensor.
    """

    # Processor whose ``pad`` method builds the padded batch.
    processor: Wav2Vec2Processor
    # The next two options are forwarded unchanged to ``processor.pad``.
    padding: Union[bool, str] = True
    max_length: Optional[int] = None
    # Accepted for interface compatibility; not read by ``__call__``.
    max_length_labels: Optional[int] = None
    # Forwarded unchanged to ``processor.pad``.
    pad_to_multiple_of: Optional[int] = None
    # Accepted for interface compatibility; not read by ``__call__``.
    pad_to_multiple_of_labels: Optional[int] = None

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        """Pad the inputs of ``features`` and add their labels as a float tensor.

        Args:
            features: Examples, each providing ``"input_values"`` and ``"labels"``.

        Returns:
            The batch produced by ``processor.pad`` with an added ``"labels"``
            entry of dtype ``torch.float``.
        """
        audio_inputs = [{"input_values": item["input_values"]} for item in features]
        targets = [item["labels"] for item in features]

        padded = self.processor.pad(
            audio_inputs,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors="pt",
        )

        # Labels are always float here; the classification variant chose
        # torch.long when the labels were ints.
        padded["labels"] = torch.tensor(targets, dtype=torch.float)

        return padded

Walking through every remaining step would make this a very long reply :thinking:, so I’ve attached the modified notebook to give you a better intuition.

Google Colab Notebook: Regression Example

1 Like