Hello Everyone,
I need to fine-tune the Llama3-8B-instruct model. I have a similar technical requirement to the one described in the document here:
Fine-tune Llama 3 with PyTorch FSDP and Q-Lora on Amazon SageMaker. I have followed all the instructions exactly as written.
The Google Colab notebook is here:
The Python script used in the Google Colab notebook is here:
https://gitlab.com/keerti4p/llama3-finetune/-/blob/main/run_fsdp_qlora.py
I am getting an “OSError: [Errno 28] No space left on device” error. I followed the suggestion mentioned in the discussion here
(however, I am not sure whether I have made the changes correctly as suggested in the discussion).
I have added this code snippet at lines 116 and 239:
os.environ['HF_HOME'] = '/tmp';
I have also set training_args.output_dir = "/tmp";
at line 238.
I made both of the above changes based on the Hugging Face discussion below:
Please suggest a solution or any ideas if you are familiar with this issue.
Error Trace:
```
---------------------------------------------------------------------------
OSError Traceback (most recent call last)
File /opt/conda/lib/python3.10/site-packages/sagemaker/utils.py:461, in create_tar_file(source_files, target)
459 for sf in source_files:
460 # Add all files from the directory into the root of the directory structure of the tar
--> 461 t.add(sf, arcname=os.path.basename(sf))
462 return filename
File /opt/conda/lib/python3.10/tarfile.py:2186, in TarFile.add(self, name, arcname, recursive, filter)
2185 for f in sorted(os.listdir(name)):
-> 2186 self.add(os.path.join(name, f), os.path.join(arcname, f),
2187 recursive, filter=filter)
2189 else:
File /opt/conda/lib/python3.10/tarfile.py:2186, in TarFile.add(self, name, arcname, recursive, filter)
2185 for f in sorted(os.listdir(name)):
-> 2186 self.add(os.path.join(name, f), os.path.join(arcname, f),
2187 recursive, filter=filter)
2189 else:
File /opt/conda/lib/python3.10/tarfile.py:2180, in TarFile.add(self, name, arcname, recursive, filter)
2179 with bltn_open(name, "rb") as f:
-> 2180 self.addfile(tarinfo, f)
2182 elif tarinfo.isdir():
File /opt/conda/lib/python3.10/tarfile.py:2208, in TarFile.addfile(self, tarinfo, fileobj)
2207 if fileobj is not None:
-> 2208 copyfileobj(fileobj, self.fileobj, tarinfo.size, bufsize=bufsize)
2209 blocks, remainder = divmod(tarinfo.size, BLOCKSIZE)
File /opt/conda/lib/python3.10/tarfile.py:255, in copyfileobj(src, dst, length, exception, bufsize)
254 raise exception("unexpected end of data")
--> 255 dst.write(buf)
257 if remainder != 0:
File /opt/conda/lib/python3.10/gzip.py:289, in GzipFile.write(self, data)
288 if length > 0:
--> 289 self.fileobj.write(self.compress.compress(data))
290 self.size += length
OSError: [Errno 28] No space left on device
During handling of the above exception, another exception occurred:
OSError Traceback (most recent call last)
Cell In[9], line 9
2 data = {
3 'train': train_dataset_s3_path,
4 'test': test_dataset_s3_path,
5 'config': train_config_s3_path
6 }
8 # starting the train job with our uploaded datasets as input
----> 9 huggingface_estimator.fit(data, wait=True)
File /opt/conda/lib/python3.10/site-packages/sagemaker/workflow/pipeline_context.py:346, in runnable_by_pipeline.<locals>.wrapper(*args, **kwargs)
342 return context
344 return _StepArguments(retrieve_caller_name(self_instance), run_func, *args, **kwargs)
--> 346 return run_func(*args, **kwargs)
File /opt/conda/lib/python3.10/site-packages/sagemaker/estimator.py:1343, in EstimatorBase.fit(self, inputs, wait, logs, job_name, experiment_config)
1278 @runnable_by_pipeline
1279 def fit(
1280 self,
(...)
1285 experiment_config: Optional[Dict[str, str]] = None,
1286 ):
1287 """Train a model using the input training dataset.
1288
1289 The API calls the Amazon SageMaker CreateTrainingJob API to start
(...)
1341 :class:`~sagemaker.workflow.pipeline_context.PipelineSession`
1342 """
-> 1343 self._prepare_for_training(job_name=job_name)
1345 experiment_config = check_and_get_run_experiment_config(experiment_config)
1346 self.latest_training_job = _TrainingJob.start_new(self, inputs, experiment_config)
File /opt/conda/lib/python3.10/site-packages/sagemaker/estimator.py:3549, in Framework._prepare_for_training(self, job_name)
3541 def _prepare_for_training(self, job_name=None):
3542 """Set hyperparameters needed for training. This method will also validate ``source_dir``.
3543
3544 Args:
(...)
3547 constructor if applicable.
3548 """
-> 3549 super(Framework, self)._prepare_for_training(job_name=job_name)
3551 self._validate_and_set_debugger_configs()
File /opt/conda/lib/python3.10/site-packages/sagemaker/estimator.py:941, in EstimatorBase._prepare_for_training(self, job_name)
939 self.code_uri = self.uploaded_code.s3_prefix
940 else:
--> 941 self.uploaded_code = self._stage_user_code_in_s3()
942 code_dir = self.uploaded_code.s3_prefix
943 script = self.uploaded_code.script_name
File /opt/conda/lib/python3.10/site-packages/sagemaker/estimator.py:1023, in EstimatorBase._stage_user_code_in_s3(self)
1020 output_bucket, _ = parse_s3_url(self.output_path)
1021 kms_key = self.output_kms_key if code_bucket == output_bucket else None
-> 1023 return tar_and_upload_dir(
1024 session=self.sagemaker_session.boto_session,
1025 bucket=code_bucket,
1026 s3_key_prefix=code_s3_prefix,
1027 script=self.entry_point,
1028 directory=self.source_dir,
1029 dependencies=self.dependencies,
1030 kms_key=kms_key,
1031 s3_resource=self.sagemaker_session.s3_resource,
1032 settings=self.sagemaker_session.settings,
1033 )
File /opt/conda/lib/python3.10/site-packages/sagemaker/fw_utils.py:457, in tar_and_upload_dir(session, bucket, s3_key_prefix, script, directory, dependencies, kms_key, s3_resource, settings)
455 try:
456 source_files = _list_files_to_compress(script, directory) + dependencies
--> 457 tar_file = sagemaker.utils.create_tar_file(
458 source_files, os.path.join(tmp, _TAR_SOURCE_FILENAME)
459 )
461 if kms_key:
462 extra_args = {"ServerSideEncryption": "aws:kms", "SSEKMSKeyId": kms_key}
File /opt/conda/lib/python3.10/site-packages/sagemaker/utils.py:458, in create_tar_file(source_files, target)
455 else:
456 _, filename = tempfile.mkstemp()
--> 458 with tarfile.open(filename, mode="w:gz", dereference=True) as t:
459 for sf in source_files:
460 # Add all files from the directory into the root of the directory structure of the tar
461 t.add(sf, arcname=os.path.basename(sf))
File /opt/conda/lib/python3.10/tarfile.py:2770, in TarFile.__exit__(self, type, value, traceback)
2766 else:
2767 # An exception occurred. We must not call close() because
2768 # it would try to write end-of-archive blocks and padding.
2769 if not self._extfileobj:
-> 2770 self.fileobj.close()
2771 self.closed = True
File /opt/conda/lib/python3.10/gzip.py:344, in GzipFile.close(self)
342 if myfileobj:
343 self.myfileobj = None
--> 344 myfileobj.close()
OSError: [Errno 28] No space left on device```