Thank you for building a great platform!
I am trying to use a custom torch.utils.data.Dataset instead of datasets.Dataset, and I am getting the following error:
ValueError: Caught ValueError in DataLoader worker process 0.
Original Traceback (most recent call last):
File "/usr/local/lib/python3.7/dist-packages/transformers/feature_extraction_utils.py", line 158, in convert_to_tensors
tensor = as_tensor(value)
ValueError: expected sequence of length 105600 at dim 2 (got 92160)
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/usr/local/lib/python3.7/dist-packages/torch/utils/data/_utils/worker.py", line 202, in _worker_loop
data = fetcher.fetch(index)
File "/usr/local/lib/python3.7/dist-packages/torch/utils/data/_utils/fetch.py", line 47, in fetch
return self.collate_fn(data)
File "<ipython-input-8-1e61eb8cbac4>", line 94, in __call__
return_tensors="pt",
File "/usr/local/lib/python3.7/dist-packages/transformers/models/wav2vec2/processing_wav2vec2.py", line 127, in pad
return self.current_processor.pad(*args, **kwargs)
File "/usr/local/lib/python3.7/dist-packages/transformers/feature_extraction_sequence_utils.py", line 225, in pad
return BatchFeature(batch_outputs, tensor_type=return_tensors)
File "/usr/local/lib/python3.7/dist-packages/transformers/feature_extraction_utils.py", line 73, in __init__
self.convert_to_tensors(tensor_type=tensor_type)
File "/usr/local/lib/python3.7/dist-packages/transformers/feature_extraction_utils.py", line 165, in convert_to_tensors
"Unable to create tensor, you should probably activate padding "
ValueError: Unable to create tensor, you should probably activate padding with 'padding=True' to have batched tensors with the same length.
I passed a data_collator which, to my understanding, is responsible for dynamic padding. Here is the code (the collator itself, based on the standard fine-tuning example, is shown after the Trainer setup below):
# -------------- Custom Dataset --------------
import torch
import soundfile as sf

class Dataset(torch.utils.data.Dataset):
    def __init__(self, df):
        self.df = df

    def __len__(self):
        return self.df.shape[0]

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        # load the raw waveform from disk
        speech_array, sampling_rate = sf.read(row['file'])
        # `processor` is the Wav2Vec2Processor created earlier
        input_values = processor(speech_array, sampling_rate=16_000).input_values
        with processor.as_target_processor():
            labels = processor(row["sentence"]).input_ids
        return {
            "input_values": input_values,
            "labels": labels,
        }
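As a sanity check, individual items do come out with different lengths, which is exactly what I expected the collator's dynamic padding to handle. A quick inspection (assuming a pandas DataFrame `df` with 'file' and 'sentence' columns, as in my setup):

import numpy as np

ds = Dataset(df)
for i in range(2):
    item = ds[i]
    # each example has its own length, so batching requires padding
    print(np.shape(item["input_values"]), len(item["labels"]))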
# --------------- Trainer Part --------------
from transformers import Trainer

data_collator = DataCollatorCTCWithPadding(processor=processor, padding=True)

trainer = Trainer(
    model=model,
    data_collator=data_collator,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=all_train,
    eval_dataset=cv_test,
    tokenizer=processor.feature_extractor,
)
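For completeness, here is my DataCollatorCTCWithPadding. It is essentially the one from the official Wav2Vec2 fine-tuning example (a sketch of what I have, in case I copied something wrong):

import torch
from dataclasses import dataclass
from typing import Dict, List, Union
from transformers import Wav2Vec2Processor

@dataclass
class DataCollatorCTCWithPadding:
    """Dynamically pads input_values and labels to the longest item in the batch."""
    processor: Wav2Vec2Processor
    padding: Union[bool, str] = True

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        # split inputs and labels, since they have to be padded by different methods
        input_features = [{"input_values": feature["input_values"]} for feature in features]
        label_features = [{"input_ids": feature["labels"]} for feature in features]

        batch = self.processor.pad(
            input_features,
            padding=self.padding,
            return_tensors="pt",
        )
        with self.processor.as_target_processor():
            labels_batch = self.processor.pad(
                label_features,
                padding=self.padding,
                return_tensors="pt",
            )

        # replace padding with -100 so it is ignored by the CTC loss
        labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)
        batch["labels"] = labels
        return batch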
Has anybody faced the same issue?