"No space left on device" when using HuggingFace + SageMaker

You job fails with the following error? Or do you see something different?

Distributed can help you either speed up you training or make it possible to fine-tune models which are not fit onto a single GPU. Since your corpus doesn’t sounds that big there is no need yet to go with distributed training

@philschmid correct, that is the error message I receive. Below is my training script and estimator call

### Estimator

estimator = HuggingFace(
                entry_point          = 'train.py',        # fine-tuning script used in training jon
                source_dir           = 'embed_source',      # directory where fine-tuning script is stored
                instance_type        = instance_type,   # instances type used for the training job
                instance_count       = 1,                 # the number of instances used for training
                role                 = get_execution_role(), # Iam role used in training job to access AWS ressources, 
                transformers_version = '4.6',             # the transformers version used in the training job
                #train_use_spot_instances=True,
                max_run= 36000,
                #max_wait= 36000,
                pytorch_version      = '1.7',             # the pytorch_version version used in the training job
                py_version           = 'py36',            # the python version used in the training job
                hyperparameters      = hyperparameters,   # the hyperparameter used for running the training job
                metric_definitions   = metric_definitions, # the metrics regex definitions to extract logs
                output_path=os.path.join(dataconnector.version_s3_prefix,  "models"),
                code_location=os.path.join(dataconnector.version_s3_prefix,  "models"),
                volume_size = 200,
                checkpoint_s3_uri='s3://kj-temp/checkpoints'
                
            )

### Trainging script (train.py)

import os

os.environ['TRANSFORMERS_CACHE'] = "cache"
os.environ['HF_DATASETS_CACHE'] = "cache"
print(os.system('python -m pip install datasets --upgrade'))

from transformers import (
    AutoModel,
    Trainer,
    TrainingArguments,
    DataCollatorForLanguageModeling,
    AutoTokenizer,
    AutoFeatureExtractor,
    AutoModelForMaskedLM,
    default_data_collator,
    AutoModelForSequenceClassification
)

from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from datasets import load_dataset, Dataset
import random
import logging
import sys
import argparse
import torch
import numpy as np
import pandas as pd
import datasets

import ast

print('datasets.__version__', datasets.__version__)

if __name__ == "__main__":

    parser = argparse.ArgumentParser()

    # hyperparameters sent by the client are passed as command-line arguments to the script.
    parser.add_argument("--epochs", type=int, default=1)
    parser.add_argument("--train_batch_size", type=int, default=32)
    parser.add_argument("--eval_batch_size", type=int, default=64)
    parser.add_argument("--warmup_steps", type=int, default=500)
    parser.add_argument("--model_id", type=str)
    parser.add_argument("--num_labels", type=str)
    parser.add_argument("--labels", type=str)
    parser.add_argument("--learning_rate", type=str, default=5e-5)
    parser.add_argument("--train_file", type=str, default="train.DbEmbeddings")
    parser.add_argument("--test_file", type=str, default="test.DbEmbeddings")
    parser.add_argument("--fp16", type=bool, default=True)

    # Data, model, and output directories
    parser.add_argument("--output_data_dir", type=str, default=os.environ["SM_OUTPUT_DATA_DIR"])
    parser.add_argument("--model_dir", type=str, default=os.environ["SM_MODEL_DIR"])
    parser.add_argument("--n_gpus", type=str, default=os.environ["SM_NUM_GPUS"])
    parser.add_argument("--training_dir", type=str, default=os.environ["SM_CHANNEL_TRAIN"])
    parser.add_argument("--test_dir", type=str, default=os.environ["SM_CHANNEL_TEST"])

    args, _ = parser.parse_known_args()

    # Set up logging
    logger = logging.getLogger(__name__)

    logging.basicConfig(
        level=logging.getLevelName("INFO"),
        handlers=[logging.StreamHandler(sys.stdout)],
        format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
    )
    
          
    raw_train_dataset = load_dataset("json", data_files=os.path.join(args.training_dir, args.train_file), cache_dir="opt/ml/input")["train"]
    raw_test_dataset = load_dataset("json", data_files=os.path.join(args.test_dir, args.test_file), cache_dir="opt/ml/input")["train"]
    
    
    print('\nargs.labels', args.labels)
    print('type args.labels', type(args.labels))

    num_labels = int(args.num_labels)
    labels = ast.literal_eval(args.labels)
    
    print('type(args.num_labels)', type(labels))
    raw_train_dataset = raw_train_dataset.cast_column("label", datasets.ClassLabel(num_classes=num_labels, 
                                                                                   names= labels, names_file=None, id=None))
    
    print('\nraw_train_dataset.features', raw_train_dataset.features)

    # load tokenizer
    tokenizer = AutoTokenizer.from_pretrained(args.model_id)
 

    def tokenize(examples):
        result = tokenizer(examples["source"], padding=True, truncation=True)

        return result


    # Use batched=True to activate fast multithreading!
    train_dataset = raw_train_dataset.map(
        tokenize, batched=True, batch_size=None
    )
    test_dataset = raw_test_dataset.map(
        tokenize, batched=True, batch_size=None
    )


    train_dataset.reset_format()
    test_dataset.reset_format()
    
    train_dataset.set_format("torch",
                                columns=["input_ids", "attention_mask", "label"])

    test_dataset.set_format(type="pandas")
    df = test_dataset[:]
    df_test, df_valid = np.split(df, [int(.5*len(df))])
    test_data = Dataset.from_pandas(df_test)
    valid_data = Dataset.from_pandas(df_valid)

    test_data = test_data.cast_column("label", datasets.ClassLabel(num_classes=num_labels, 
                                                                 names= labels , 
                                                                   names_file=None, id=None))

    valid_data = valid_data.cast_column("label", datasets.ClassLabel(num_classes=num_labels, 
                                                                 names= labels , names_file=None, 
                                                                   id=None))

    test_data.reset_format()
    test_data.set_format("torch",
                                columns=["input_ids", "attention_mask", "label"])

    valid_data.reset_format()
    valid_data.set_format("torch",
                                columns=["input_ids", "attention_mask", "label"])

    from sklearn.metrics import accuracy_score, f1_score

    def compute_metrics(pred):
        labels = pred.label_ids
        preds = pred.predictions.argmax(-1)
        f1 = f1_score(labels, preds, average="weighted")
        acc = accuracy_score(labels, preds)
        return {"accuracy": acc, "f1": f1}    
        # Saves the model to s3



    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    model = (AutoModelForSequenceClassification
             .from_pretrained(args.model_id, num_labels=num_labels)
             .to(device))                      

    batch_size = 64
    logging_steps = len(train_dataset) // batch_size
    model_name = f"{args.model_id}-finetuned-d"
    training_args = TrainingArguments(output_dir=model_name,
                                      num_train_epochs=args.epochs,
                                      learning_rate=2e-5,
                                      per_device_train_batch_size=batch_size,
                                      per_device_eval_batch_size=batch_size,
                                      weight_decay=0.01,
                                      evaluation_strategy="epoch",
                                      disable_tqdm=False,
                                      logging_steps=logging_steps,
                                      push_to_hub=False,
                                     # log_level="error"
                                     )                       


    trainer = Trainer(model=model, args=training_args,
                      compute_metrics=compute_metrics,
                      train_dataset=train_dataset,
                      eval_dataset=valid_data,
                      #eval_dataset=valid_data_down,
                      tokenizer=tokenizer)                       
    trainer.train()
    
    

    preds_output = trainer.predict(test_data)
    
    print('.')
    print('preds_output.metrics:')
    print(preds_output.metrics)
            
    
    trainer.save_model(args.model_dir)
    
                       
    print(f'my_acc: {preds_output.metrics["test_accuracy"]}')

Could try to update to the latest DLC: Reference

Also, you should move the setting to the env after updating the datasets version. Could you also please try /opt/ml/checkpoints/?

@philschmid @MaximusDecimusMeridi

I am still experiencing this issue. I think we should update cache_dir or os.environ[‘TRANSFORMERS_CACHE’] in TrainingArguments to store the checkpoints in cache_dir otherwise there is no use of setting these env variables.

cache_dir = os.makedirs(“cache”, exist_ok=True)
os.environ[‘TRANSFORMERS_CACHE’] = “cache”
os.environ[‘HF_DATASETS_CACHE’] = “cache”

Please check and confirm. Thanks

Can you clarify this part? Will make the other changes as well. Thanks :pray:

sorry for instead of “cache”

like

print(os.system('python -m pip install datasets --upgrade'))
os.environ['TRANSFORMERS_CACHE'] = "/opt/ml/checkpoints/"
os.environ['HF_DATASETS_CACHE'] = "/opt/ml/checkpoints/"

Thanks @philschmid for your reply.

Please clarify where to use those two env variables?

Thanks for clarification. It looks to me under “Training DLC Overview” that the versions should be

estimator = HuggingFace(
                entry_point          = 'train.py',        # fine-tuning script used in training jon
                source_dir           = 'embed_source',      # directory where fine-tuning script is stored
                instance_type        = instance_type,   # instances type used for the training job
                instance_count       = 1,                 # the number of instances used for training
                role                 = get_execution_role(), # Iam role used in training job to access AWS ressources, 
                transformers_version = '4.17.0',             # the transformers version used in the training job
                max_run= 36000,
                pytorch_version      = '1.10.2',             # the pytorch_version version used in the training job
                py_version           = 'py38',            # the python version used in the training job
                hyperparameters      = hyperparameters,   # the hyperparameter used for running the training job
                metric_definitions   = metric_definitions, # the metrics regex definitions to extract logs
                output_path=os.path.join(dataconnector.version_s3_prefix,  "models"),
                code_location=os.path.join(dataconnector.version_s3_prefix,  "models"),
                volume_size = 200,
                checkpoint_s3_uri='s3://kj-temp/checkpoints'

However, this fails with

ClientError: TrainingHostAgent Initialization failed:API error (404): manifest for 763104351884.dkr.ecr.us-west-2.amazonaws.com/huggingface-pytorch-training:1.10.2-transformers4.17.0-gpu-py38-cu110-ubuntu18.04 not found: manifest unknown: Requested image not found

Based on this post I tried passing

image_uri = '763104351884.dkr.ecr.us-west-2.amazonaws.com/huggingface-pytorch-inference:1.10.2-transformers4.17.0-cpu-py38-ubuntu20.04-v1.0',

But now the training job fails with

FileNotFoundError: [Errno 2] No such file or directory: 'train'

What should I be passing to use the latest DLC in training?

@MaximusDecimusMeridi

You need to change – image_uri=‘763104351884.dkr.ecr.us-west-2.amazonaws.com/huggingface-pytorch-inference:1.10.2-transformers4.17.0-cpu-py38-ubuntu20.04-v1.0

to

image_uri=‘763104351884.dkr.ecr.us-west-2.amazonaws.com/huggingface-pytorch-training:1.10.2-transformers4.17.0-gpu-py38-cu113-ubuntu20.04

1 Like

FileNotFoundError: [Errno 2] No such file or directory: ‘train’

Which line is it? how do you call .fit()?

ClientError: TrainingHostAgent Initialization failed:API error (404): manifest for 763104351884.dkr.ecr.us-west-2.amazonaws.com/huggingface-pytorch-training:1.10.2-transformers4.17.0-gpu-py38-cu110-ubuntu18.04 not found: manifest unknown: Requested image not found

I ran a training this morning with the same version and didn’t had and issue can you make sure you have the latest sagemaker version installed and internet access

@philschmid Need your help on this above query.

@philschmid Need your help on this above query.

@Vinayaks117 i have no idea what your issue is or what you are trying to do? If it is not really related to this please open a new thread, explain what you are trying to do, what error you are seeing, what you have tried to solve, and provide code and details to be able to reproduce your issue.

@philschmid

We are also experiencing “No space left on device” when training a BERT model using a HuggingFace estimator in SageMaker pipelines training job.

As requested earlier I did share the training script with you then you suggested make below changes but it didn’t work.

import os
cache_dir = os.makedirs(“cache”,exist_ok=True)
os.environ[‘TRANSFORMERS_CACHE’] = “cache”
os.environ[‘HF_DATASETS_CACHE’] = “cache”

You suggested the below changes in one of your recent comments.

print(os.system(‘python -m pip install datasets --upgrade’))
os.environ[‘TRANSFORMERS_CACHE’] = “/opt/ml/checkpoints/”
os.environ[‘HF_DATASETS_CACHE’] = “/opt/ml/checkpoints/”

Question: Do we need to use above env variables in TrainingArguments to store the checkpoints in “/opt/ml/checkpoints/” or something else.

I request you to go through this thread history.

Thanks

@Vinayaks117 sorry the information you provided is not very helpful and missing a lot of contexts. I have no idea how you created your training job (parameter, instance, configuration etc.), and which script you are using. The error No space left on device can have multiple reasons, e.g. you haven’t adjusted the volume_size or something different.
If you cannot provide public access to the resources to identify the root cause of the issue you can sign up for our premium support here where we can help you in a private dedicated fashion.

Question : Do we need to use above env variables in TrainingArguments to store the checkpoints in “/opt/ml/checkpoints/” or something else.

The environment variables are not related to the TrainingArguments. You can check the documentation here on what they do. The output_dir defines where the checkpoints are stored.

I request you to go through this thread history.

P.S. we are trying to help you for free being unpolite has no place here!

I still get the same error… Here are the estimator settings

          estimator = HuggingFace(
                entry_point          = 'train.py',        # fine-tuning script used in training jon
                source_dir           = 'embed_source',      # directory where fine-tuning script is stored
                instance_type        = 'ml.g4dn.4xlarge',   # instances type used for the training job
                instance_count       = 1,                 # the number of instances used for training
                role                 = get_execution_role(), # Iam role used in training job to access AWS ressources, 
                transformers_version = '4.17.0',             # the transformers version used in the training job
                max_run= 36000,
                pytorch_version      = '1.10.2',             # the pytorch_version version used in the training job
                py_version           = 'py38',            # the python version used in the training job
                image_uri = '763104351884.dkr.ecr.us-west-2.amazonaws.com/huggingface-pytorch-training:1.10.2-transformers4.17.0-gpu-py38-cu113-ubuntu20.04',
                hyperparameters      = hyperparameters,   # the hyperparameter used for running the training job
                metric_definitions   = metric_definitions, # the metrics regex definitions to extract logs
                output_path=os.path.join(dataconnector.version_s3_prefix,  "models"),
                code_location=os.path.join(dataconnector.version_s3_prefix,  "models"),
                volume_size = 200,
                checkpoint_s3_uri='s3://kj-temp/checkpoints'
                
            )

I also updated the code to set the cache envs after the upgrade

print(os.system('python -m pip install datasets --upgrade'))

#os.environ['TRANSFORMERS_CACHE'] = "cache"
#os.environ['HF_DATASETS_CACHE'] = "cache"

os.environ['TRANSFORMERS_CACHE'] = "/opt/ml/checkpoints/"
os.environ['HF_DATASETS_CACHE'] = "/opt/ml/checkpoints/"
File "train.py", line 188, in <module>
trainer.train() File "/opt/conda/lib/python3.8/site-packages/transformers/trainer.py", line 1475, in train

So this is the trainer.train() call in the train.py script I posted above. I realize this thread is becoming quite huge with multiple people posting their issues. Would it help if I started a separate thread of this issue and summarized everything so far? Thanks

@philschmid just checking if you want me to create a separate topic for my particular issue? I can summarize everything so far

@philschmid Sorry, I never meant in that way. Thanks.

Hi All,

My working solution for space issue.

If we don’t want to save all the checkpoints then we can go with an option below.

  1. Increase the “volume_size”, need to set this parameter in HuggingFace estimator.
  2. Set the “save_total_limit” parameter in TrainingArguments.
    Ex: save_total_limit = 2

Which means it will save only 2 checkpoints: best checkpoint and last checkpoint (to make sure we can resume training from it)

Reference

If we want to save all the checkpoints and use it for future analysis then we can go with an option below.

  1. Increase the volume_size, need to set this parameter in HuggingFace estimator.
  2. Need to use checkpointing, which saves all checkpoints in /opt/ml/checkpoints which is in sync to a s3 bucket defined in the HuggingFace estimator.

Set output_dir parameter in hyperparameters → ‘output_dir’:"/opt/ml/checkpoints"
Set checkpoint_s3_uri parameter in HuggingFace estimator → checkpoint_s3_uri=“s3://sm-pipelines/checkpoints”
Set output_dir parameter in TrainingArguments which allows us to save the checkpoints in “/opt/ml/checkpoints” directory which is in sync with s3 bucket → output_dir=args.output_dir

Hope this helps.

1 Like

Yes please, I tried to reproduce but couldn’t. So if you can share the estimator (how you started the training etc.), the training script and the dataset size

1 Like

@philschmid

I was able to figure it out. The training argument save_strategy defaults to every 500 steps. Given my small batch size and large volume that caused a lot of checkpoints to be saved, erroring out before it finished the epoch. By adding load_best_model_at_end=True as a training argument the save_strategy defaults to ‘epoch’ instead and now everything runs fine.


training_args = TrainingArguments(output_dir=model_name,
                                      num_train_epochs=args.epochs,
                                      learning_rate=2e-5,
                                      per_device_train_batch_size=batch_size,
                                      per_device_eval_batch_size=batch_size,
                                      weight_decay=0.01,
                                      evaluation_strategy="epoch",
                                      save_strategy="epoch", # redundant if load best model set to true, but should also work
                                      load_best_model_at_end=True,                                      
                                     disable_tqdm=False,
                                      logging_steps=logging_steps,
                                      push_to_hub=False,
 
                                     )
1 Like

Hey y’all, I’m also getting OSError: [Errno 28] No space left on device after the first epoch and after checkpoints and weights are saved.

I’ve shared snippets from my notebook and script below with links to the code. Does anyone see any problems or suggest knobs to turn? Thanks for taking a look!

For context, the training script code is working in a colab pro instance without issues.

Training script snippet (entire file here):

training_args = Seq2SeqTrainingArguments(
    output_dir=output_dir,
    num_train_epochs=20,
    gradient_accumulation_steps=8,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    fp16=True,          
    save_total_limit=2,
    evaluation_strategy='epoch',
    warmup_steps=100,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_strategy='epoch',
    save_strategy='epoch',
    learning_rate=0.0005,
    load_best_model_at_end=True,
    metric_for_best_model='RougeL' if not EARLY_STOP_ON_VAL_LOSS else 'loss',
    predict_with_generate=True,
    skip_memory_metrics=False,
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    callbacks=[early_stopping],
    compute_metrics=compute_metrics if not EARLY_STOP_ON_VAL_LOSS else None,
)

Notebook snippet (entire file here):

hyperparameters={'epochs': 1, 'model_name':'neuroscience_to_dev_bio'}

huggingface_estimator = HuggingFace(entry_point='neuroscience_to_dev_bio.py',
                                    source_dir='./scripts',
                                    instance_type='ml.g4dn.16xlarge',
                                    instance_count=1,
                                    role=role,
                                    transformers_version='4.12',
                                    pytorch_version='1.9',
                                    py_version='py38',
                                    hyperparameters = hyperparameters,
                                    volume_size=900)

huggingface_estimator.fit()
...
  Saving model checkpoint to ./neuroscience-to-dev-bio-translation/checkpoint-8
  Saving model checkpoint to ./neuroscience-to-dev-bio-translation/checkpoint-8
  Configuration saved in ./neuroscience-to-dev-bio-translation/checkpoint-8/config.json
  Configuration saved in ./neuroscience-to-dev-bio-translation/checkpoint-8/config.json
  Model weights saved in ./neuroscience-to-dev-bio-translation/checkpoint-8/pytorch_model.bin
  Model weights saved in ./neuroscience-to-dev-bio-translation/checkpoint-8/pytorch_model.bin
  tokenizer config file saved in ./neuroscience-to-dev-bio-translation/checkpoint-8/tokenizer_config.json
  tokenizer config file saved in ./neuroscience-to-dev-bio-translation/checkpoint-8/tokenizer_config.json
  Special tokens file saved in ./neuroscience-to-dev-bio-translation/checkpoint-8/special_tokens_map.json
  Special tokens file saved in ./neuroscience-to-dev-bio-translation/checkpoint-8/special_tokens_map.json
...
  OSError:
  [Errno 28] No space left on device
  During handling of the above exception, another exception occurred:
  Traceback (most recent call last):
    File "neuroscience_to_dev_bio.py", line 327, in <module>
  trainer.train()
...