RuntimeError: grad can be implicitly created only for scalar outputs

Hi! I am currently trying to train a Speech2Text model from scratch. I have everything set up, and my dataset matches the tensor shapes the documentation expects (there is a quick shape check after the script below). I don't know why I get this error. Is there something wrong with my data or with the model config? Here is my code, followed by the full error message:

import sys
from datasets import load_dataset_builder, load_dataset, dataset_dict
from transformers import (
    Seq2SeqTrainer, Speech2TextForConditionalGeneration, Speech2TextConfig, Speech2TextProcessor,
    Speech2TextTokenizer, Speech2TextFeatureExtractor, trainer_utils,
    Trainer, DataCollatorWithPadding, Speech2TextModel, TrainingArguments, Seq2SeqTrainingArguments,
)
import torch
from sklearn.model_selection import train_test_split
import soundfile as sf
from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional, Union
import torchaudio
import numpy as np
import json

test_dataset = load_dataset('audiofolder', data_dir=r"C:\Users\LuanL\Desktop\Dataset Test")
test_dataset = test_dataset['train']
print(test_dataset)

tokenizer = Speech2TextTokenizer(spm_file=r"C:\Users\LuanL\Desktop\BA\Tokenizer\char.model",
                                 unk_token='', eos_token='', sos_token='',
                                 vocab_file=r"C:\Users\LuanL\Desktop\BA\Tokenizer\char.json", pad_token='')

feature_extractor = Speech2TextFeatureExtractor(sampling_rate=16000, num_mel_bins=80, feature_size=80)

processor = Speech2TextProcessor(feature_extractor=feature_extractor, tokenizer=tokenizer)

def preprocess_function(examples):
    audio_arrays = [x["array"] for x in examples["audio"]]

    inputs_audio = processor(
        audio_arrays,
        sampling_rate=16000,
        padding=True,
        max_length=100000000,
        truncation=True,
    )

    print(inputs_audio)

    return inputs_audio

def preprocess_function_text(examples):
    text = [x for x in examples['transcription']]

    input_text = tokenizer(text, padding=True, truncation=True, return_tensors="np")

    input_ids_list = []
    attention_mask_list = []

    for i in range(len(text)):
        input_ids = input_text.input_ids[i]
        attention_mask = input_text.attention_mask[i]

        input_ids_list.append(input_ids)
        attention_mask_list.append(attention_mask)

    return {
        'decoder_input_ids': input_ids_list,
        'decoder_attention_mask': attention_mask_list
    }

test_dataset = test_dataset.map(preprocess_function, batched=True)
test_dataset = test_dataset.map(preprocess_function_text, batched=True)
test_dataset = test_dataset.remove_columns(['audio', 'transcription'])
test_dataset.set_format(type="pt")

training_args = Seq2SeqTrainingArguments(
    evaluation_strategy="epoch",
    output_dir=r"C:\Users\LuanL\Desktop\BA\Train Loop",
    num_train_epochs=3,
    learning_rate=2e-5,
    # per_device_eval_batch_size=4,
    weight_decay=0.01,
    logging_dir=r"C:\Users\LuanL\Desktop\BA\Train Loop\Log",
    gradient_accumulation_steps=1,
    max_grad_norm=1.0,
)

config = Speech2TextConfig(
    return_dict=False,
    sampling_rate=16000,
    vocab_size=tokenizer.vocab_size,
    use_cache=False,
    activation_function='relu',
    max_target_positions=2048,
    max_source_positions=100000,
    input_feat_per_channel=80,
    pad_token_id=35,
    eos_token_id=2,
    bos_token_id=1,
)

model = Speech2TextModel(config)
model.train()

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=test_dataset,
)

trainer.train()
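
To show what I mean by the shapes matching, this is the kind of quick check I run on the single processed example. It is just a sanity-check sketch; the field names are the ones returned by the processor and tokenizer above, and after set_format(type="pt") they come back as torch tensors:

sample = test_dataset[0]
print(sample['input_features'].shape)          # log-mel features, roughly (num_frames, 80)
print(sample['attention_mask'].shape)          # (num_frames,)
print(sample['decoder_input_ids'].shape)       # (target_length,)
print(sample['decoder_attention_mask'].shape)  # (target_length,)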

Error message:
C:\Users\LuanL\PycharmProjects\Speech2TextNeu\venv\Scripts\python.exe C:\Users\LuanL\PycharmProjects\Speech2TextNeu\main.py
Dataset({
    features: ['audio', 'transcription'],
    num_rows: 1
})
Map:   0%|          | 0/1 [00:00<?, ? examples/s]Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
Map: 100%|██████████| 1/1 [00:00<00:00, 45.45 examples/s]
Dataset({
    features: ['input_features', 'attention_mask', 'decoder_input_ids', 'decoder_attention_mask'],
    num_rows: 1
})
C:\Users\LuanL\PycharmProjects\Speech2TextNeu\venv\lib\site-packages\transformers\optimization.py:411: FutureWarning: This implementation of AdamW is deprecated and will be removed in a future version. Use the PyTorch implementation torch.optim.AdamW instead, or set no_deprecation_warning=True to disable this warning
  warnings.warn(
  0%|          | 0/3 [00:00<?, ?it/s]Traceback (most recent call last):
  File "C:\Users\LuanL\PycharmProjects\Speech2TextNeu\main.py", line 175, in <module>
    trainer.train()
  File "C:\Users\LuanL\PycharmProjects\Speech2TextNeu\venv\lib\site-packages\transformers\trainer.py", line 1539, in train
    return inner_training_loop(
  File "C:\Users\LuanL\PycharmProjects\Speech2TextNeu\venv\lib\site-packages\transformers\trainer.py", line 1809, in _inner_training_loop
    tr_loss_step = self.training_step(model, inputs)
  File "C:\Users\LuanL\PycharmProjects\Speech2TextNeu\venv\lib\site-packages\transformers\trainer.py", line 2665, in training_step
    self.accelerator.backward(loss)
  File "C:\Users\LuanL\PycharmProjects\Speech2TextNeu\venv\lib\site-packages\accelerate\accelerator.py", line 1853, in backward
    loss.backward(**kwargs)
  File "C:\Users\LuanL\PycharmProjects\Speech2TextNeu\venv\lib\site-packages\torch\_tensor.py", line 487, in backward
    torch.autograd.backward(
  File "C:\Users\LuanL\PycharmProjects\Speech2TextNeu\venv\lib\site-packages\torch\autograd\__init__.py", line 193, in backward
    grad_tensors_ = _make_grads(tensors, grad_tensors, is_grads_batched=False)
  File "C:\Users\LuanL\PycharmProjects\Speech2TextNeu\venv\lib\site-packages\torch\autograd\__init__.py", line 88, in _make_grads
    raise RuntimeError("grad can be implicitly created only for scalar outputs")
RuntimeError: grad can be implicitly created only for scalar outputs
  0%|          | 0/3 [00:02<?, ?it/s]

Process finished with exit code 1