Hi Mighty HF community,
I am trying to build POC code to fine-tune the text-summarization model sshleifer/distilbart-cnn-12-6 using SageMaker. The training job completes successfully, but I don't see a model.tar.gz file at the destination location, nor any output directory under /opt/ml. I'd appreciate any help you could provide.
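For context, my understanding is that SageMaker only packages into model.tar.gz whatever the entry-point script writes under /opt/ml/model (exposed inside the container as the SM_MODEL_DIR environment variable) and uploads it to the estimator's output_path. A minimal sketch of how that convention could be checked from inside the training container (the print line is purely illustrative):

import os

# SageMaker sets SM_MODEL_DIR (default: /opt/ml/model); everything written here
# is tarred into model.tar.gz and uploaded to the estimator's output_path
model_dir = os.environ.get('SM_MODEL_DIR', '/opt/ml/model')
print(model_dir, os.listdir(model_dir) if os.path.isdir(model_dir) else '(missing)')

Here is my full setup, step by step.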
from transformers import AutoTokenizer

tokenizer_name = 'sshleifer/distilbart-cnn-12-6'
tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
dataset_name = 'ccdv/cnn_dailymail'
I used 5,000 examples for training and 1,000 examples for testing from the ccdv/cnn_dailymail dataset, tokenizing two columns: article and highlights.
max_input_length = 512
max_target_length = 512
def preprocess_function(examples):
    model_inputs = tokenizer(
        examples["article"], max_length=max_input_length, truncation=True
    )
    # Set up the tokenizer for targets
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(
            examples["highlights"], max_length=max_target_length, truncation=True
        )
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs
from datasets import load_dataset

raw_datasets = load_dataset(dataset_name, '3.0.0')

# note: both subsets are drawn from the train split (shuffled independently), so they may overlap
train_dataset1 = raw_datasets['train'].shuffle().select(range(5000))
test_dataset1 = raw_datasets['train'].shuffle().select(range(1000))

train_dataset1_tokenized = train_dataset1.map(preprocess_function, batched=True)
test_dataset1_tokenized = test_dataset1.map(preprocess_function, batched=True)

train_dataset1_tokenized = train_dataset1_tokenized.remove_columns(['article', 'highlights'])
test_dataset1_tokenized = test_dataset1_tokenized.remove_columns(['article', 'highlights'])

train_dataset1_tokenized.set_format('torch', columns=['input_ids', 'attention_mask', 'labels'])
test_dataset1_tokenized.set_format('torch', columns=['input_ids', 'attention_mask', 'labels'])
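A quick sanity check (illustrative) confirms the formatted rows expose only the torch columns:

print(train_dataset1_tokenized)            # features and num_rows
print(train_dataset1_tokenized[0].keys())  # dict_keys(['input_ids', 'attention_mask', 'labels'])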
Then I uploaded the train and test datasets to an S3 bucket.
import sagemaker
from datasets.filesystems import S3FileSystem

sess = sagemaker.Session()
s3 = S3FileSystem()

s3_prefix = f'samples/datasets/{dataset_name}'

training_input_path = f's3://{sess.default_bucket()}/{s3_prefix}/train'
train_dataset1_tokenized.save_to_disk(training_input_path, fs=s3)

test_input_path = f's3://{sess.default_bucket()}/{s3_prefix}/test'
test_dataset1_tokenized.save_to_disk(test_input_path, fs=s3)
print(f'Uploaded training data to {training_input_path}')
print(f'Uploaded testing data to {test_input_path}')
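To rule out a bad upload, the round-trip can be verified with the same older fs= API used above (a quick check against the paths just printed):

from datasets import load_from_disk

check = load_from_disk(training_input_path, fs=s3)
print(check)  # should report the same features and 5000 rows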
Hyperparameter and estimator definition
model_name = 'sshleifer/distilbart-cnn-12-6'
role = sagemaker.get_execution_role()  # assumes a SageMaker execution role is available

hyperparameters = {
    'epochs': 1,
    'train_batch_size': 32,
    'model_name': model_name,
    'tokenizer_name': tokenizer_name,
}
from sagemaker.huggingface import HuggingFace

# git configuration to download our fine-tuning script
git_config = {'repo': 'https://github.com/huggingface/transformers.git', 'branch': 'v4.17.0'}

# create the Hugging Face estimator
huggingface_estimator = HuggingFace(
    entry_point='run_summarization.py',
    source_dir='./examples/pytorch/summarization',
    instance_type='ml.p3.2xlarge',
    instance_count=2,
    role=role,
    git_config=git_config,
    transformers_version='4.17.0',
    pytorch_version='1.10.2',
    py_version='py38',
    hyperparameters=hyperparameters,
)
Model fit
huggingface_estimator.fit(
    {'train': training_input_path, 'test': test_input_path},
    wait=False,
    job_name='finetune-sshleifer-distilbart-cnn-12-6-2022-06-03-22-16-10',
)
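Since fit() runs with wait=False, this is a sketch of how I attach to the job afterwards to look for the artifact (assuming the default output_path):

from sagemaker.huggingface import HuggingFace

estimator = HuggingFace.attach('finetune-sshleifer-distilbart-cnn-12-6-2022-06-03-22-16-10')
print(estimator.model_data)  # expected: s3://<bucket>/<job_name>/output/model.tar.gz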