Hi! I'm currently trying to train a SpeechT5 model on an ASR task. I have everything set up, but I still get an error when I try to map my data: ValueError: operands could not be broadcast together with remapped shapes [original->remapped]: (2,2) and requested shape (1,2). I'm using the SpeechT5Processor to prepare and pad my data for the model, but I don't know why this error occurs; I've tried many things to fix it. This is my code, any help would be really appreciated!
from dataclasses import dataclass
from typing import Dict, List, Union

import torch
from transformers import (
    SpeechT5Config,
    SpeechT5FeatureExtractor,
    SpeechT5Processor,
)

feature_extractor = SpeechT5FeatureExtractor(
    sampling_rate=16000,
    num_mel_bins=128,
    feature_size=128,
    win_length=32,
    padding_value=3,
    return_attention_mask=True,
    do_normalize=False,
    hop_length=16,
    win_function="hann_window",
    fmin=50,
    fmax=7600,
    mel_floor=1e-10,
)
processor = SpeechT5Processor(feature_extractor=feature_extractor, tokenizer=tokenizer)
valid_dataset = valid_dataset.map(lambda example: {'array': [example['audio']['array']], **example})
print(valid_dataset)
train_dataset = train_dataset.map(lambda example: {'array': [example['audio']['array']], **example})
print(train_dataset)
def preprocess_function(examples):
    audio_arrays = examples['array']
    text_list = examples['transcription']
    input_data = processor(
        audio=audio_arrays,
        text_target=text_list,
        sampling_rate=16000,
        # return_tensors='pt'
    )
    return input_data
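For context, my understanding of the processor output (and the reason I drop attention_mask and decoder_attention_mask below) is that each processed example holds the audio features plus the tokenized transcription. A quick sanity check on a single example would be something like this (a debugging sketch run separately, not part of the training script; the expected keys are an assumption on my part):

sample = processor(
    audio=[valid_dataset[0]['array'][0]],
    text_target=[valid_dataset[0]['transcription']],
    sampling_rate=16000,
)
print(list(sample.keys()))  # I expect: input_values, attention_mask, labels, decoder_attention_mask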
valid_dataset = valid_dataset.map(preprocess_function, batched=False)
valid_dataset = valid_dataset.remove_columns(['audio', 'transcription', 'array', 'attention_mask', 'decoder_attention_mask'])
train_dataset = train_dataset.map(preprocess_function, batched=False)
train_dataset = train_dataset.remove_columns(['audio', 'transcription', 'array', 'attention_mask', 'decoder_attention_mask'])
print(train_dataset)
config = SpeechT5Config(
    return_dict=False,
    sampling_rate=16000,
    vocab_size=tokenizer.vocab_size,
    use_cache=True,
    activation_function='relu',
    max_target_positions=2048,
    max_source_positions=160000,
    input_feat_per_channel=80,
    pad_token_id=3,
    eos_token_id=2,
    bos_token_id=1,
)
@dataclass
class DataCollatorForSeq2Seq:
    processor: SpeechT5Processor
    padding: Union[bool, str] = "longest"

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        # Split inputs and labels, since they have different lengths and need
        # different padding methods.
        input_features = [{"input_values": feature["input_values"][0]} for feature in features]
        label_features = [{"input_ids": feature["labels"]} for feature in features]
        batch = self.processor.pad(
            input_values=input_features,
            padding=True,
            return_tensors="pt",
            return_attention_mask=True,
        )
        labels_batch = self.processor.pad(
            labels=label_features,
            padding=True,
            return_tensors="pt",
            return_attention_mask=True,
        )
        # Replace padding with -100 so the loss ignores padded positions.
        labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)
        batch["labels"] = labels
        return batch
data_collator = DataCollatorForSeq2Seq(processor=processor, padding='longest')
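To double-check the -100 masking step in isolation, here is a toy example (made-up ids; 3 is my pad_token_id) that behaves the way I expect:

ids = torch.tensor([[5, 6, 7, 3], [8, 9, 3, 3]])    # 3 = pad_token_id
mask = torch.tensor([[1, 1, 1, 0], [1, 1, 0, 0]])
print(ids.masked_fill(mask.ne(1), -100))
# tensor([[   5,    6,    7, -100],
#         [   8,    9, -100, -100]])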
The error occurs in the feature_extractor.pad() function.
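For what it's worth, I think the same ValueError can be reproduced directly with np.pad when the pad-width spec is 2-D but the array being padded is 1-D; this is only my guess at what happens inside pad(), not a confirmed trace:

import numpy as np

x = np.zeros(10)             # 1-D waveform-like array
np.pad(x, [(0, 1), (0, 2)])  # pad widths shaped (2, 2)
# ValueError: operands could not be broadcast together with remapped
# shapes [original->remapped]: (2,2) and requested shape (1,2)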