I’m getting the following error when trying to train wav2vec2 on SageMaker. I was not getting this error last week with the same code.
It appears to be an issue with the `datasets` library's use of `pyarrow`.
Traceback (most recent call last):
File "pretrain.py", line 48, in <module>
import datasets
File "/opt/conda/lib/python3.6/site-packages/datasets/__init__.py", line 22, in <module>
import pyarrow
File "/opt/conda/lib/python3.6/site-packages/pyarrow/__init__.py", line 63, in <module>
import pyarrow.lib as _lib
File "pyarrow/compat.pxi", line 49, in init pyarrow.lib
File "/opt/conda/lib/python3.6/site-packages/cloudpickle/__init__.py", line 4, in <module>
from cloudpickle.cloudpickle import * # noqa
File "/opt/conda/lib/python3.6/site-packages/cloudpickle/cloudpickle.py", line 63, in <module>
import typing_extensions as _typing_extensions
File "/opt/conda/lib/python3.6/site-packages/typing_extensions.py", line 159, in <module>
class _FinalForm(typing._SpecialForm, _root=True):
AttributeError: module 'typing' has no attribute '_SpecialForm'
The code block below is run first; it then calls `pretrain.py`, which is just a copy of the tutorial script.
import sagemaker
import boto3
from pathlib import Path
from sagemaker.huggingface import HuggingFace
from sagemaker.s3 import S3Downloader
# Open a SageMaker session with default settings (not referenced again in the visible snippet).
sess = sagemaker.Session()
# Execution role handed to the estimator below via `role=`; presumably the IAM role of the
# current notebook/Studio environment — verify when running outside SageMaker.
role = sagemaker.get_execution_role()
# Settings forwarded to the training job's entry point.
# Every value is deliberately kept as a string, including the numeric ones.
hyperparameters = {
    # Data selection
    "dataset_name": "patrickvonplaten/librispeech_asr_dummy",
    "dataset_config_names": "clean",
    "dataset_split_names": "validation",
    "validation_split_percentage": "3",
    # Model and output location
    "model_name_or_path": "patrickvonplaten/wav2vec2-base-v2",
    "output_dir": "/opt/ml/model/artefacts",
    # Schedule and optimisation
    "max_train_steps": "1",
    "num_warmup_steps": "1",
    "gradient_accumulation_steps": "1",
    "learning_rate": "0.005",
    "weight_decay": "0.01",
    # Audio length filter
    "max_duration_in_seconds": "20.0",
    "min_duration_in_seconds": "2.0",
    # Logging / checkpoint cadence
    "logging_steps": "1",
    "saving_steps": "1",
    # Batch sizes
    "per_device_train_batch_size": "8",
    "per_device_eval_batch_size": "8",
    # Adam settings
    "adam_beta1": "0.9",
    "adam_beta2": "0.98",
    "adam_epsilon": "1e-06",
}
# Create the Estimator that describes the training job: which script to run,
# on what hardware, and inside which Hugging Face training container.
#
# The container versions are moved off the Python 3.6 image on purpose. The
# traceback above fails inside `typing_extensions` on `typing._SpecialForm`,
# an attribute that only exists on Python 3.7+, so a `py36` container cannot
# import `datasets`/`pyarrow` once a recent `typing_extensions` is installed.
# NOTE(review): transformers 4.17.0 / pytorch 1.10.2 / py38 is believed to be
# a published Hugging Face training image — confirm it is available in your
# region and SDK version before relying on it.
huggingface_estimator = HuggingFace(
    entry_point="pretrain.py",  # script executed inside the container
    source_dir="./",  # directory uploaded alongside the entry point
    instance_type="ml.p2.xlarge",
    instance_count=1,
    role=role,
    transformers_version="4.17.0",
    pytorch_version="1.10.2",
    py_version="py38",
    hyperparameters=hyperparameters,
)

# Launch the training job with the configuration above.
huggingface_estimator.fit()