Hi! I am currently trying to train a Speech2Text model from scratch. I have everything set up and my dataset matches the requirements of the documentation in terms of the tensor shapes! I dont know why i get this error is there something wrong with my data or the model config?? This is my Code and the whole error message:
import sys
from datasets import load_dataset_builder, load_dataset, dataset_dict
from transformers import Seq2SeqTrainer, Speech2TextForConditionalGeneration, Speech2TextConfig, Speech2TextProcessor,
Speech2TextTokenizer, Speech2TextFeatureExtractor, trainer_utils,
Trainer, DataCollatorWithPadding, Speech2TextModel, TrainingArguments, Seq2SeqTrainingArguments
import torch
from sklearn.model_selection import train_test_split
import soundfile as sf
from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional, Union
import torchaudio
import numpy as np
import json
test_dataset = load_dataset(âaudiofolderâ, data_dir=r"C:\Users\LuanL\Desktop\Dataset Test")
test_dataset = test_dataset[âtrainâ]
print(test_dataset)
tokenizer = Speech2TextTokenizer(spm_file=r"C:\Users\LuanL\Desktop\BA\Tokenizer\char.model",
unk_token=ââ, eos_token=ââ, sos_token=ââ,
vocab_file=r"C:\Users\LuanL\Desktop\BA\Tokenizer\char.json", pad_token=ââ)
feature_extractor = Speech2TextFeatureExtractor(sampling_rate=16000, num_mel_bins=80, feature_size=80)
processor = Speech2TextProcessor(feature_extractor=feature_extractor, tokenizer=tokenizer)
def preprocess_function(examples):
audio_arrays = [x[âarrayâ] for x in examples[âaudioâ]]
inputs_audio = processor(
audio_arrays,
sampling_rate=16000,
padding=True,
max_length=100000000,
truncation=True,
)
print(inputs_audio)
return inputs_audio
def preprocess_function_text(examples):
text = [x for x in examples[âtranscriptionâ]]
input_text = tokenizer(text, padding=True, truncation=True, return_tensors="np")
input_ids_list = []
attention_mask_list = []
for i in range(len(text)):
input_ids = input_text.input_ids[i]
attention_mask = input_text.attention_mask[i]
input_ids_list.append(input_ids)
attention_mask_list.append(attention_mask)
return {
'decoder_input_ids': input_ids_list,
'decoder_attention_mask': attention_mask_list
}
test_dataset = test_dataset.map(preprocess_function, batched=True)
test_dataset = test_dataset.map(preprocess_function_text, batched=True)
test_dataset = test_dataset.remove_columns([âaudioâ, âtranscriptionâ])
test_dataset.set_format(type=âptâ)
training_args = Seq2SeqTrainingArguments(
evaluation_strategy=âepochâ,
output_dir=r"C:\Users\LuanL\Desktop\BA\Train Loop",
num_train_epochs=3,
learning_rate=2e-5,
# per_device_eval_batch_size=4,
weight_decay=0.01,
logging_dir=r"C:\Users\LuanL\Desktop\BA\Train Loop\Log",
gradient_accumulation_steps=1,
max_grad_norm=1.0,
)
config = Speech2TextConfig(
return_dict=False,
sampling_rate=16000,
vocab_size=tokenizer.vocab_size,
use_cache=False,
activation_function=âreluâ,
max_target_positions=2048,
max_source_positions=100000,
input_feat_per_channel=80,
pad_token_id=35,
eos_token_id=2,
bos_token_id=1,
)
model = Speech2TextModel(config)
model.train()
trainer = Seq2SeqTrainer(
model=model,
args=training_args,
train_dataset=test_dataset,
)
trainer.train()
Error message:
C:\Users\LuanL\PycharmProjects\Speech2TextNeu\venv\Scripts\python.exe C:\Users\LuanL\PycharmProjects\Speech2TextNeu\main.py
Dataset({
features: [âaudioâ, âtranscriptionâ],
num_rows: 1
})
Map: 0%| | 0/1 [00:00<?, ? examples/s]Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
Map: 100%|ââââââââââ| 1/1 [00:00<00:00, 45.45 examples/s]
Dataset({
features: [âinput_featuresâ, âattention_maskâ, âdecoder_input_idsâ, âdecoder_attention_maskâ],
num_rows: 1
})
C:\Users\LuanL\PycharmProjects\Speech2TextNeu\venv\lib\site-packages\transformers\optimization.py:411: FutureWarning: This implementation of AdamW is deprecated and will be removed in a future version. Use the PyTorch implementation torch.optim.AdamW instead, or set no_deprecation_warning=True
to disable this warning
warnings.warn(
0%| | 0/3 [00:00<?, ?it/s]Traceback (most recent call last):
File âC:\Users\LuanL\PycharmProjects\Speech2TextNeu\main.pyâ, line 175, in
trainer.train()
File âC:\Users\LuanL\PycharmProjects\Speech2TextNeu\venv\lib\site-packages\transformers\trainer.pyâ, line 1539, in train
return inner_training_loop(
File âC:\Users\LuanL\PycharmProjects\Speech2TextNeu\venv\lib\site-packages\transformers\trainer.pyâ, line 1809, in inner_training_loop
tr_loss_step = self.training_step(model, inputs)
File âC:\Users\LuanL\PycharmProjects\Speech2TextNeu\venv\lib\site-packages\transformers\trainer.pyâ, line 2665, in training_step
self.accelerator.backward(loss)
File âC:\Users\LuanL\PycharmProjects\Speech2TextNeu\venv\lib\site-packages\accelerate\accelerator.pyâ, line 1853, in backward
loss.backward(**kwargs)
File âC:\Users\LuanL\PycharmProjects\Speech2TextNeu\venv\lib\site-packages\torch_tensor.pyâ, line 487, in backward
torch.autograd.backward(
File "C:\Users\LuanL\PycharmProjects\Speech2TextNeu\venv\lib\site-packages\torch\autograd_init.py", line 193, in backward
grad_tensors_ = make_grads(tensors, grad_tensors, is_grads_batched=False)
File âC:\Users\LuanL\PycharmProjects\Speech2TextNeu\venv\lib\site-packages\torch\autograd_init_.pyâ, line 88, in _make_grads
raise RuntimeError(âgrad can be implicitly created only for scalar outputsâ)
RuntimeError: grad can be implicitly created only for scalar outputs
0%| | 0/3 [00:02<?, ?it/s]
Process finished with exit code 1