Hello,
Here I read about setting eval_steps to 10, but that hasn't worked for me. When I do it, I get: ValueError: You should supply an encoding or a list of encodings to this method that includes input_ids, but you provided ['input_values'].
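In other words, the change I'm trying is roughly this (everything else stays exactly as in the code below):

from transformers import TrainingArguments

# only eval_steps is changed (from 500 in the full code below to 10)
training_args = TrainingArguments(
    output_dir='wav2vec2-spanish-demo',
    evaluation_strategy="steps",
    eval_steps=10,
)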
Here is my code:
import os
import torch
print(torch.cuda.is_available())
import transformers
print(transformers.__version__)
import datasets
print(datasets.__version__)
path_audio = "audios/"
path_transcripts = "transcripts/"
i = 0
j = 0
for (root, dirs, file) in os.walk(path_audio):
    if i > 6:
        break
    i = i + 1
    print(i)
    files = file

for (root, dirs, file) in os.walk(path_transcripts):
    if j > 6:
        break
    else:
        text = file
        j = j + 1

texts = []
for t in text:
    with open(path_transcripts + t) as f:
        line = f.readlines()
        if len(line) > 0:
            texts.append(line[0])
import re
chars_to_ignore_regex = '[\,\?\.\!\-\;\:\"\“\%\`\”\�\$\&]'
def remove_special_characters(batch):
    from unidecode import unidecode
    #batch = re.sub(chars_to_ignore_regex, '', batch).lower() + " "
    batch = re.sub(chars_to_ignore_regex, '', batch).lower()
    batch = unidecode(batch)
    if batch[-1] == " ":
        batch = batch[:len(batch)-1]
    return batch
texts = list(map(remove_special_characters,texts))
text_combined = " ".join(texts)
tokens = {}
for token in text_combined:
    if token in tokens.keys():
        tokens[token] = tokens[token] + 1
    else:
        tokens[token] = 0
tokens["|"] = tokens[" "]
del tokens[" "]
import json
with open('tokens.json', 'w') as f:
    json.dump(tokens, f)
data_lst = []
for a, t in zip(files, texts):
    data_lst.append({"path": path_audio + a, "transcription": t})
tokens["[UNK]"] = len(tokens)
tokens["[PAD]"] = len(tokens)
from transformers import Wav2Vec2CTCTokenizer
tokenizer = Wav2Vec2CTCTokenizer("tokens.json",unk_token="[UNK]",pad_token="[PAD]",word_delimiter_token="|")
from transformers import Wav2Vec2FeatureExtractor
feature_extractor = Wav2Vec2FeatureExtractor(feature_size=1, sampling_rate=16000, padding_value=0.0, do_normalize=True, return_attention_mask=True)
from transformers import Wav2Vec2Processor
processor = Wav2Vec2Processor(feature_extractor=feature_extractor, tokenizer=tokenizer)
data_train = data_lst[0:round(len(data_lst)*0.8)]
data_test = data_lst[round(len(data_lst)*0.8):]
import librosa
for d in data_lst:
    d["audio"] = librosa.load(d["path"], sr=16000)
def prepare_dataset(batch):
    audio = batch["audio"]
    # batched output is "un-batched"
    batch["input_values"] = processor(audio[0], sampling_rate=audio[-1]).input_values[0]
    with processor.as_target_processor():
        batch["labels"] = processor(batch["transcription"]).input_ids
    #batch["labels"] = replace_n(batch["labels"], tokens)
    return batch
audio_train = list(map(prepare_dataset, data_train))
audio_test = list(map(prepare_dataset, data_test))
import pandas as pd
import pyarrow as pa
import pyarrow.dataset as ds
from datasets import Dataset
df_train = pd.DataFrame(audio_train).drop(['path', 'audio','transcription'], axis=1)
df_test = pd.DataFrame(audio_test).drop(['path', 'audio','transcription'], axis=1)
train = Dataset(pa.Table.from_pandas(df_train))
test = Dataset(pa.Table.from_pandas(df_test))
import torch
from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional, Union
@dataclass
class DataCollatorCTCWithPadding:
    """
    Data collator that will dynamically pad the inputs received.
    Args:
        processor (:class:`~transformers.Wav2Vec2Processor`)
            The processor used for processing the data.
        padding (:obj:`bool`, :obj:`str` or :class:`~transformers.tokenization_utils_base.PaddingStrategy`, `optional`, defaults to :obj:`True`):
            Select a strategy to pad the returned sequences (according to the model's padding side and padding index)
            among:
            * :obj:`True` or :obj:`'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
              sequence is provided).
            * :obj:`'max_length'`: Pad to a maximum length specified with the argument :obj:`max_length` or to the
              maximum acceptable input length for the model if that argument is not provided.
            * :obj:`False` or :obj:`'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of
              different lengths).
    """

    processor: Wav2Vec2Processor
    padding: Union[bool, str] = True

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        # split inputs and labels since they have to be of different lengths and need
        # different padding methods
        input_features = [{"input_values": feature["input_values"]} for feature in features]
        label_features = [{"input_ids": feature["labels"]} for feature in features]

        batch = self.processor.pad(
            input_features,
            padding=self.padding,
            return_tensors="pt",
        )
        with self.processor.as_target_processor():
            labels_batch = self.processor.pad(
                label_features,
                padding=self.padding,
                return_tensors="pt",
            )

        # replace padding with -100 to ignore loss correctly
        labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)
        batch["labels"] = labels
        return batch
data_collator = DataCollatorCTCWithPadding(processor=processor, padding=True)
wer_metric = datasets.load_metric("wer")
import numpy as np

def compute_metrics(pred):
    pred_logits = pred.predictions
    pred_ids = np.argmax(pred_logits, axis=-1)
    pred.label_ids[pred.label_ids == -100] = processor.tokenizer.pad_token_id
    pred_str = processor.batch_decode(pred_ids)
    # we do not want to group tokens when computing the metrics
    label_str = processor.batch_decode(pred.label_ids, group_tokens=False)
    wer = wer_metric.compute(predictions=pred_str, references=label_str)
    return {"wer": wer}
from transformers import Wav2Vec2ForCTC
model = Wav2Vec2ForCTC.from_pretrained(
    "facebook/wav2vec2-base",
    ctc_loss_reduction="mean",
    pad_token_id=processor.tokenizer.pad_token_id,
)
model.freeze_feature_encoder()
from transformers import TrainingArguments
training_args = TrainingArguments(
    output_dir='wav2vec2-spanish-demo',
    group_by_length=True,
    per_device_train_batch_size=8,
    evaluation_strategy="steps",
    num_train_epochs=30,
    fp16=True,
    gradient_checkpointing=True,
    save_steps=500,
    eval_steps=500,
    logging_steps=500,
    learning_rate=1e-4,
    weight_decay=0.005,
    warmup_steps=1000,
    save_total_limit=2,
)
from transformers import Trainer
trainer = Trainer(
    model=model,
    data_collator=data_collator,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train,
    eval_dataset=test,
    tokenizer=processor.feature_extractor,
)
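I then start training with the call shown at the top of the traceback:

trainer.train()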
Here is the complete error:
/anaconda/envs/transformer/lib/python3.10/site-packages/transformers/optimization.py:306: FutureWarning: This implementation of AdamW is deprecated and will be removed in a future version. Use the PyTorch implementation torch.optim.AdamW instead, or set `no_deprecation_warning=True` to disable this warning
warnings.warn(
***** Running training *****
Num examples = 26
Num Epochs = 30
Instantaneous batch size per device = 8
Total train batch size (w. parallel, distributed & accumulation) = 8
Gradient Accumulation steps = 1
Total optimization steps = 120
/anaconda/envs/transformer/lib/python3.10/site-packages/transformers/models/wav2vec2/processing_wav2vec2.py:154: UserWarning: `as_target_processor` is deprecated and will be removed in v5 of Transformers. You can process your labels by using the argument `text` of the regular `__call__` method (either in the same call as your audio inputs, or in a separate call.
warnings.warn(
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
Cell In[58], line 1
----> 1 trainer.train()
File /anaconda/envs/transformer/lib/python3.10/site-packages/transformers/trainer.py:1500, in Trainer.train(self, resume_from_checkpoint, trial, ignore_keys_for_eval, **kwargs)
1495 self.model_wrapped = self.model
1497 inner_training_loop = find_executable_batch_size(
1498 self._inner_training_loop, self._train_batch_size, args.auto_find_batch_size
1499 )
-> 1500 return inner_training_loop(
1501 args=args,
1502 resume_from_checkpoint=resume_from_checkpoint,
1503 trial=trial,
1504 ignore_keys_for_eval=ignore_keys_for_eval,
1505 )
File /anaconda/envs/transformer/lib/python3.10/site-packages/transformers/trainer.py:1716, in Trainer._inner_training_loop(self, batch_size, args, resume_from_checkpoint, trial, ignore_keys_for_eval)
1713 self._load_rng_state(resume_from_checkpoint)
1715 step = -1
-> 1716 for step, inputs in enumerate(epoch_iterator):
1717
1718 # Skip past any already trained steps if resuming training
1719 if steps_trained_in_current_epoch > 0:
1720 steps_trained_in_current_epoch -= 1
File /anaconda/envs/transformer/lib/python3.10/site-packages/torch/utils/data/dataloader.py:530, in _BaseDataLoaderIter.__next__(self)
528 if self._sampler_iter is None:
529 self._reset()
--> 530 data = self._next_data()
531 self._num_yielded += 1
532 if self._dataset_kind == _DatasetKind.Iterable and \
533 self._IterableDataset_len_called is not None and \
534 self._num_yielded > self._IterableDataset_len_called:
File /anaconda/envs/transformer/lib/python3.10/site-packages/torch/utils/data/dataloader.py:570, in _SingleProcessDataLoaderIter._next_data(self)
568 def _next_data(self):
569 index = self._next_index() # may raise StopIteration
--> 570 data = self._dataset_fetcher.fetch(index) # may raise StopIteration
571 if self._pin_memory:
572 data = _utils.pin_memory.pin_memory(data)
File /anaconda/envs/transformer/lib/python3.10/site-packages/torch/utils/data/_utils/fetch.py:52, in _MapDatasetFetcher.fetch(self, possibly_batched_index)
50 else:
51 data = self.dataset[possibly_batched_index]
---> 52 return self.collate_fn(data)
Cell In[50], line 39, in DataCollatorCTCWithPadding.__call__(self, features)
33 batch = self.processor.pad(
34 input_features,
35 padding=self.padding,
36 return_tensors="pt",
37 )
38 with self.processor.as_target_processor():
---> 39 labels_batch = self.processor.pad(
40 label_features,
41 padding=self.padding,
42 return_tensors="pt",
43 )
45 # replace padding with -100 to ignore loss correctly
46 labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)
File /anaconda/envs/transformer/lib/python3.10/site-packages/transformers/models/wav2vec2/processing_wav2vec2.py:113, in Wav2Vec2Processor.pad(self, *args, **kwargs)
111 # For backward compatibility
112 if self._in_target_context_manager:
--> 113 return self.current_processor.pad(*args, **kwargs)
115 input_features = kwargs.pop("input_features", None)
116 labels = kwargs.pop("labels", None)
File /anaconda/envs/transformer/lib/python3.10/site-packages/transformers/tokenization_utils_base.py:2941, in PreTrainedTokenizerBase.pad(self, encoded_inputs, padding, max_length, pad_to_multiple_of, return_attention_mask, return_tensors, verbose)
2938 encoded_inputs[key] = to_py_obj(value)
2940 # Convert padding_strategy in PaddingStrategy
-> 2941 padding_strategy, _, max_length, _ = self._get_padding_truncation_strategies(
2942 padding=padding, max_length=max_length, verbose=verbose
2943 )
2945 required_input = encoded_inputs[self.model_input_names[0]]
2946 if required_input and not isinstance(required_input[0], (list, tuple)):
File /anaconda/envs/transformer/lib/python3.10/site-packages/transformers/tokenization_utils_base.py:2388, in PreTrainedTokenizerBase._get_padding_truncation_strategies(self, padding, truncation, max_length, pad_to_multiple_of, verbose, **kwargs)
2385 max_length = self.model_max_length
2387 # Test if we have a padding token
-> 2388 if padding_strategy != PaddingStrategy.DO_NOT_PAD and (not self.pad_token or self.pad_token_id < 0):
2389 raise ValueError(
2390 "Asking to pad but the tokenizer does not have a padding token. "
2391 "Please select a token to use as `pad_token` `(tokenizer.pad_token = tokenizer.eos_token e.g.)` "
2392 "or add a new pad token via `tokenizer.add_special_tokens({'pad_token': '[PAD]'})`."
2393 )
2395 # Check that we will truncate to a multiple of pad_to_multiple_of if both are provided
TypeError: '<' not supported between instances of 'NoneType' and 'int'