Need help with speech Encoder-Decoder

I’ve been trying to get this snippet to run for days but keep running into the same error so I’ve come here for help. I know there exists a Wav2Vec2-Bert model but I’m trying to learn how to finetune/train a model for educational purposes. I’ve tried setting “padding = True” everywhere, tried to batch them, tried setting “dispatch_batches=False” and “split_batches=True” but no matter what I do, I always get the tensor error when it tries to compute any metrics. Any help is appreciated! (I’m using transformers-4.39.3, tokenizers-0.15.2, and accelerate-0.29.2)

from transformers import AutoTokenizer, Wav2Vec2Processor, SpeechEncoderDecoderModel
from datasets import load_dataset, load_metric, Dataset
from transformers import Trainer, TrainingArguments
import torch
import numpy as np
from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional, Union

encoder_id = “facebook/wav2vec2-base-960h”
decoder_id = “google-bert/bert-base-uncased”

processor = Wav2Vec2Processor.from_pretrained(encoder_id)
tokenizer = AutoTokenizer.from_pretrained(decoder_id)

model = SpeechEncoderDecoderModel.from_encoder_decoder_pretrained(encoder_id, decoder_id)
model.config.decoder_start_token_id = tokenizer.cls_token_id
model.config.pad_token_id = tokenizer.pad_token_id

num_samples = 10

dataset = load_dataset(“librispeech_asr”, split=“train.clean.100”, streaming = True, trust_remote_code=True)
das = list(dataset.take(num_samples))
ds = Dataset.from_dict({‘audio’: [da[‘audio’] for da in das], ‘text’: [da[‘text’] for da in das]})

#ds = load_dataset(“hf-internal-testing/librispeech_asr_demo”, “clean”, split=“validation”)

wer_metric = load_metric(“wer”)

def compute_metrics(pred):
pred_logits = pred.predictions
pred_ids = torch.argmax(pred_logits, dim=-1)

pred.label_ids[pred.label_ids == -100] = processor.tokenizer.pad_token_id

pred_str = processor.batch_decode(pred_ids, group_tokens = False)
label_str = processor.batch_decode(pred.label_ids, group_tokens=False)

wer = wer_metric.compute(predictions=pred_str, references=label_str)

return {"wer": wer}

@dataclass
class DataCollatorCTCWithPadding:

processor: Wav2Vec2Processor
padding: Union[bool, str] = True

def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
    input_features = [{"input_values": feature["input_values"]} for feature in features]
    label_features = [{"input_ids": feature["labels"]} for feature in features]

    batch = self.processor.pad(
        input_features,
        padding=self.padding,
        return_tensors="pt",
    )
    with self.processor.as_target_processor():
        labels_batch = self.processor.pad(
            label_features,
            padding=self.padding,
            return_tensors="pt",
        )

    labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)

    batch["labels"] = labels

    return batch

data_collator = DataCollatorCTCWithPadding(processor=processor, padding=True)

def prepare_dataset(batch):
audio = batch[“audio”]
batch[“input_values”] = processor(audio[“array”], sampling_rate=audio[“sampling_rate”]).input_values[0]
batch[“input_length”] = len(batch[“input_values”])
with processor.as_target_processor():
batch[“labels”] = processor(batch[“text”]).input_ids
return batch

train_dataset = ds.map(prepare_dataset, remove_columns=ds.column_names)

for param in model.encoder.parameters():
param.requires_grad = False

training_args = TrainingArguments(
output_dir=“YousifTest”,
per_device_train_batch_size=4,
split_batches=True,
gradient_accumulation_steps=2,
evaluation_strategy=“epoch”,
num_train_epochs=1,
gradient_checkpointing=True,
fp16=True,
save_steps=400,
eval_steps=1,
logging_steps=1,
learning_rate=3e-4,
save_total_limit=2,
max_steps=24,
)

trainer = Trainer(
model=model,
data_collator=data_collator,
args=training_args,
compute_metrics=compute_metrics,
train_dataset=train_dataset,
eval_dataset=train_dataset,
)

trainer.train()
trainer.save_model(“Test2”)

Traceback (most recent call last): | 0/2 [00:00<?, ?it/s]
File “c:\Users\Yousif\Documents\University\Assignments\Thesis\Work Files\Transformers\Trainer.py”, line 116, in
trainer.train()
File “C:\Users\Yousif\anaconda3\lib\site-packages\transformers\trainer.py”, line 1780, in train
return inner_training_loop(
File “C:\Users\Yousif\anaconda3\lib\site-packages\transformers\trainer.py”, line 2213, in _inner_training_loop
self._maybe_log_save_evaluate(tr_loss, grad_norm, model, trial, epoch, ignore_keys_for_eval)
File “C:\Users\Yousif\anaconda3\lib\site-packages\transformers\trainer.py”, line 2577, in _maybe_log_save_evaluate
metrics = self.evaluate(ignore_keys=ignore_keys_for_eval)
File “C:\Users\Yousif\anaconda3\lib\site-packages\transformers\trainer.py”, line 3365, in evaluate
output = eval_loop(
File “C:\Users\Yousif\anaconda3\lib\site-packages\transformers\trainer.py”, line 3580, in evaluation_loop
preds_host = logits if preds_host is None else nested_concat(preds_host, logits, padding_index=-100)
File “C:\Users\Yousif\anaconda3\lib\site-packages\transformers\trainer_pt_utils.py”, line 138, in nested_concat
return type(tensors)(nested_concat(t, n, padding_index=padding_index) for t, n in zip(tensors, new_tensors))
File “C:\Users\Yousif\anaconda3\lib\site-packages\transformers\trainer_pt_utils.py”, line 138, in
return type(tensors)(nested_concat(t, n, padding_index=padding_index) for t, n in zip(tensors, new_tensors))
File “C:\Users\Yousif\anaconda3\lib\site-packages\transformers\trainer_pt_utils.py”, line 138, in nested_concat
return type(tensors)(nested_concat(t, n, padding_index=padding_index) for t, n in zip(tensors, new_tensors))
File “C:\Users\Yousif\anaconda3\lib\site-packages\transformers\trainer_pt_utils.py”, line 138, in
return type(tensors)(nested_concat(t, n, padding_index=padding_index) for t, n in zip(tensors, new_tensors))
File “C:\Users\Yousif\anaconda3\lib\site-packages\transformers\trainer_pt_utils.py”, line 138, in nested_concat
return type(tensors)(nested_concat(t, n, padding_index=padding_index) for t, n in zip(tensors, new_tensors))
File “C:\Users\Yousif\anaconda3\lib\site-packages\transformers\trainer_pt_utils.py”, line 138, in
return type(tensors)(nested_concat(t, n, padding_index=padding_index) for t, n in zip(tensors, new_tensors))
File “C:\Users\Yousif\anaconda3\lib\site-packages\transformers\trainer_pt_utils.py”, line 140, in nested_concat
return torch_pad_and_concatenate(tensors, new_tensors, padding_index=padding_index)
File “C:\Users\Yousif\anaconda3\lib\site-packages\transformers\trainer_pt_utils.py”, line 99, in torch_pad_and_concatenate
return torch.cat((tensor1, tensor2), dim=0)
RuntimeError: Sizes of tensors must match except in dimension 0. Expected size 227 but got size 214 for tensor number 1 in the list.