Error: operands could not be broadcast together

Hi! I'm currently trying to train a SpeechT5 model on an ASR task. I have everything set up, but I still get an error when I try to map my data: ValueError: operands could not be broadcast together with remapped shapes [original->remapped]: (2,2) and requested shape (1,2). I'm using the SpeechT5Processor to prepare and pad my data for the model, but I don't know why this error occurs; I've tried many things to fix it. This is my code — any help would be really appreciated!

# Feature extractor for raw-waveform ASR input.
# NOTE: feature_size must be 1 when the audio input is a raw waveform
# (input_values). Setting it to the number of mel bins (128) makes
# SpeechT5FeatureExtractor.pad() treat each 1-D waveform sample as a
# 128-dim frame, which is what raises the
# "operands could not be broadcast together" ValueError during padding.
feature_extractor = SpeechT5FeatureExtractor(
    sampling_rate=16000,
    num_mel_bins=128,
    feature_size=1,        # was 128 -- the cause of the broadcast error in pad()
    win_length=32,
    padding_value=0.0,     # pad waveforms with silence; 3 is a token id, not audio
    return_attention_mask=True,
    do_normalize=False,
    hop_length=16,
    win_function="hann_window",  # ASCII quotes; curly quotes are a SyntaxError
    fmin=50,
    fmax=7600,
    mel_floor=1e-10,
)

# Bundle the feature extractor and tokenizer into a single processor.
processor = SpeechT5Processor(feature_extractor=feature_extractor, tokenizer=tokenizer)

# Wrap each raw waveform in a one-element list under the 'array' key so the
# downstream (non-batched) preprocessing sees a batch-like input.
# ASCII quotes here -- the pasted curly quotes were a SyntaxError.
valid_dataset = valid_dataset.map(lambda example: {'array': [example['audio']['array']], **example})
print(valid_dataset)

train_dataset = train_dataset.map(lambda example: {'array': [example['audio']['array']], **example})
print(train_dataset)

def preprocess_function(examples):
    """Convert one example's audio and transcription into model inputs.

    Expects the 'array' column added by the earlier .map() call (a
    one-element list holding the raw waveform) and a 'transcription'
    string. Returns the processor output (input_values, attention_mask,
    labels, ...).
    """
    audio_arrays = examples['array']
    text_list = examples['transcription']

    input_data = processor(
        audio=audio_arrays,
        text_target=text_list,
        sampling_rate=16000,
        # return_tensors='pt'  # left off: datasets.map stores plain lists
    )
    return input_data

# Run preprocessing one example at a time, then drop the columns the model
# does not consume directly (the data collator re-pads and rebuilds masks).
valid_dataset = valid_dataset.map(preprocess_function, batched=False)
valid_dataset = valid_dataset.remove_columns(
    ['audio', 'transcription', 'array', 'attention_mask', 'decoder_attention_mask'])

train_dataset = train_dataset.map(preprocess_function, batched=False)
train_dataset = train_dataset.remove_columns(
    ['audio', 'transcription', 'array', 'attention_mask', 'decoder_attention_mask'])

print(train_dataset)

# Model configuration.
config = SpeechT5Config(
    return_dict=False,
    sampling_rate=16000,
    vocab_size=tokenizer.vocab_size,
    use_cache=True,
    activation_function='relu',  # ASCII quotes; curly quotes are a SyntaxError
    max_target_positions=2048,
    max_source_positions=160000,
    # NOTE(review): input_feat_per_channel=80 disagrees with the feature
    # extractor's num_mel_bins=128 -- confirm which value is intended.
    input_feat_per_channel=80,
    pad_token_id=3,
    eos_token_id=2,
    bos_token_id=1,
)

@dataclass
class DataCollatorForSeq2Seq:
    """Dynamically pad audio inputs and tokenized labels into one batch.

    Audio (input_values) and text labels require different padding logic,
    so they are split apart, padded separately via the processor, and the
    padded label positions are replaced with -100 so the loss ignores them.
    """

    # The annotation must be a type, not the processor *instance*
    # (the original `processor: processor` annotated with the object itself).
    processor: SpeechT5Processor
    padding: Union[bool, str] = "longest"

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        # Split inputs and labels: they have different lengths and need
        # different padding methods.
        # [0] unwraps the extra list nesting added by the 'array' map step.
        input_features = [{"input_values": feature["input_values"][0]} for feature in features]
        label_features = [{"input_ids": feature["labels"]} for feature in features]

        batch = self.processor.pad(
            input_values=input_features,
            padding=True,
            return_tensors="pt",
            return_attention_mask=True,
        )
        labels_batch = self.processor.pad(
            labels=label_features,
            padding=True,
            return_tensors="pt",
            return_attention_mask=True,
        )

        # Replace padding with -100 so padded positions are ignored by the loss.
        labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)
        batch["labels"] = labels

        return batch

data_collator = DataCollatorForSeq2Seq(processor=processor, padding=‘longest’)

The error is raised inside the feature_extractor.pad() function, which the processor calls during preprocessing.