I've been working closely with AWS to resolve this issue, and they told me to post here. I'm trying to get multi-instance training working with the SageMaker x Hugging Face estimators. My code works fine for single-instance non-distributed training and single-instance distributed training, but it fails for multi-instance distributed training. I am using the huggingface-pytorch-training:1.7-transformers4.6-gpu-py36-cu110-ubuntu18.04 image; the image lives in our internal ECR because we run in a VPC.
Here is the code I am using. It calls the same train.py from this repo (SageMaker-HuggingFace-Workshop/train.py at main · C24IO/SageMaker-HuggingFace-Workshop · GitHub). After training finishes I get a FileNotFoundError when the script tries to load the model, so I suspect I'm forgetting to set the correct path somewhere.
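Roughly, the end of train.py saves the model and then reloads it. The snippet below is a simplified sketch from memory, not a verbatim copy: the SM_MODEL_DIR lookup is the standard SageMaker environment variable, and the rank guard reflects how I understand the multi-node case is supposed to behave, not something I've confirmed.

import os
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# SageMaker exposes the final-artifact directory as SM_MODEL_DIR on every node;
# whatever is written here gets packaged into model.tar.gz after the job.
model_dir = os.environ.get("SM_MODEL_DIR", "/opt/ml/model")

model_id = "distilbert-base-uncased"
model = AutoModelForSequenceClassification.from_pretrained(model_id, num_labels=2)
tokenizer = AutoTokenizer.from_pretrained(model_id)

# ... training (Trainer.train()) happens here ...

# In a multi-node job every process runs this script, so only one rank
# should write (and later read) the local model directory; the other nodes
# never have these files on disk, which is where I suspect my
# FileNotFoundError is coming from.
if int(os.environ.get("RANK", "0")) == 0:
    model.save_pretrained(model_dir)
    tokenizer.save_pretrained(model_dir)
    reloaded = AutoModelForSequenceClassification.from_pretrained(model_dir)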
import sagemaker
import time
from sagemaker.huggingface import HuggingFace
import logging
import os
from sagemaker.s3 import S3Uploader
role = 'ROLE'
default_bucket = 'BUCKET_NAME'
sess = sagemaker.Session(default_bucket=default_bucket)  # session passed to the estimator below
local_train_dataset = "amazon_us_reviews_apparel_v1_00_train.json"
local_test_dataset = "amazon_us_reviews_apparel_v1_00_test.json"
# s3 uris for datasets
remote_train_dataset = f"s3://{default_bucket}/"
remote_test_dataset = f"s3://{default_bucket}/"
# upload datasets
S3Uploader.upload(local_train_dataset,remote_train_dataset)
S3Uploader.upload(local_test_dataset,remote_test_dataset)
print(f"train dataset uploaded to: {remote_train_dataset}/{local_train_dataset}")
print(f"test dataset uploaded to: {remote_test_dataset}/{local_test_dataset}")
# hyperparameters, which are passed into the training job
hyperparameters = {
    'epochs': 1,                            # number of training epochs
    'train_batch_size': 32,                 # batch size for training
    'eval_batch_size': 64,                  # batch size for evaluation
    'learning_rate': 3e-5,                  # learning rate used during training
    'model_id': 'distilbert-base-uncased',  # pre-trained model
    'fp16': True,                           # whether to use 16-bit (mixed) precision training
    'train_file': local_train_dataset,      # training dataset
    'test_file': local_test_dataset,        # test dataset
}
metric_definitions = [
    {'Name': 'eval_loss',      'Regex': "'eval_loss': ([0-9]+(.|e\-)[0-9]+),?"},
    {'Name': 'eval_accuracy',  'Regex': "'eval_accuracy': ([0-9]+(.|e\-)[0-9]+),?"},
    {'Name': 'eval_f1',        'Regex': "'eval_f1': ([0-9]+(.|e\-)[0-9]+),?"},
    {'Name': 'eval_precision', 'Regex': "'eval_precision': ([0-9]+(.|e\-)[0-9]+),?"},
]
# define Training Job Name
job_name = f'huggingface-workshop-{time.strftime("%Y-%m-%d-%H-%M-%S", time.localtime())}'
discovery_bucket_kms = 'KMS'
subnets = ['subnet-xxx']
security_group_ids = ['sg-xxx','sg-xxz','sg-xxy','sg-xx10']
distribution = {'smdistributed':{'dataparallel':{ 'enabled': True }}}
logging.debug('Creating the Estimator')
# create the Estimator
huggingface_estimator = HuggingFace(
    entry_point          = 'train.py',
    source_dir           = 'scripts',
    instance_type        = 'ml.p3.16xlarge',
    instance_count       = 2,
    base_job_name        = job_name,
    role                 = role,
    transformers_version = '4.6',
    pytorch_version      = '1.7',
    py_version           = 'py36',
    hyperparameters      = hyperparameters,
    metric_definitions   = metric_definitions,
    sagemaker_session    = sess,
    distribution         = distribution,
    # SECURITY CONFIGS
    output_kms_key       = discovery_bucket_kms,
    subnets              = subnets,
    security_group_ids   = security_group_ids,
    enable_network_isolation        = True,
    encrypt_inter_container_traffic = True,
    image_uri            = 'INTERNAL_ECR_URI',
)
# define a data input dictionary with our uploaded s3 uris
training_data = {
    'train': remote_train_dataset,
    'test': remote_test_dataset
}
logging.debug('Running Fit')
huggingface_estimator.fit(training_data)
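In case it helps with debugging, after a successful fit() the artifact location and job name can be read straight off the estimator (standard SDK attributes, nothing custom on my side):

# S3 URI of the model.tar.gz produced by the job (available once fit() returns)
print(huggingface_estimator.model_data)
# name of the underlying training job, handy for pulling the CloudWatch logs
print(huggingface_estimator.latest_training_job.name)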