I'm following along with the workshop tutorial on setting up a transformer for text classification on SageMaker. I get the following error when trying to upload the processed train and test datasets to the default S3 bucket with `save_to_disk`. What do I need to do to fix this?
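For context, here's a minimal version of the cell that raises the error. The two `save_to_disk` calls match Cell In[17] in the traceback; the placeholder datasets, the `s3_prefix` value, and the `s3fs.S3FileSystem()` construction just stand in for what the workshop notebook builds in earlier cells, so treat this as a sketch rather than the exact notebook code:

```python
import sagemaker
import s3fs
from datasets import Dataset

# Placeholders standing in for the tokenized train/test splits produced
# earlier in the notebook (assumed shape; the real splits come from .map()).
train_dataset = Dataset.from_dict({"text": ["example review"], "label": [0]})
test_dataset = Dataset.from_dict({"text": ["another review"], "label": [1]})

sess = sagemaker.Session()
s3_prefix = "samples/datasets"   # assumed prefix; the notebook defines its own
s3 = s3fs.S3FileSystem()         # some workshop versions use datasets.filesystems.S3FileSystem instead

# save train_dataset to s3
training_input_path = f"s3://{sess.default_bucket()}/{s3_prefix}/train"
train_dataset.save_to_disk(training_input_path, fs=s3)

# save test_dataset to s3
test_input_path = f"s3://{sess.default_bucket()}/{s3_prefix}/test"
test_dataset.save_to_disk(test_input_path, fs=s3)
```

The full traceback: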
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
File /opt/conda/lib/python3.10/site-packages/datasets/arrow_dataset.py:1458, in Dataset._save_to_disk_single(job_id, shard, fpath, storage_options)
1457 for pa_table in table_iter(shard.data, batch_size=batch_size):
-> 1458 writer.write_table(pa_table)
1459 num_examples_progress_update += len(pa_table)
File /opt/conda/lib/python3.10/site-packages/datasets/arrow_writer.py:573, in ArrowWriter.write_table(self, pa_table, writer_batch_size)
572 self._num_examples += pa_table.num_rows
--> 573 self.pa_writer.write_table(pa_table, writer_batch_size)
File /opt/conda/lib/python3.10/site-packages/pyarrow/ipc.pxi:525, in pyarrow.lib._CRecordBatchWriter.write_table()
File /opt/conda/lib/python3.10/site-packages/fsspec/spec.py:1706, in AbstractBufferedFile.write(self, data)
1705 if self.buffer.tell() >= self.blocksize:
-> 1706 self.flush()
1707 return out
File /opt/conda/lib/python3.10/site-packages/fsspec/spec.py:1742, in AbstractBufferedFile.flush(self, force)
1741 try:
-> 1742 self._initiate_upload()
1743 except: # noqa: E722
File /opt/conda/lib/python3.10/site-packages/s3fs/core.py:1688, in S3File._initiate_upload(self)
1687 self.parts = []
-> 1688 self.mpu = self._call_s3(
1689 "create_multipart_upload",
1690 Bucket=self.bucket,
1691 Key=self.key,
1692 ACL=self.acl,
1693 )
1695 if self.append_block:
1696 # use existing data in key when appending,
1697 # and block is big enough
File /opt/conda/lib/python3.10/site-packages/s3fs/core.py:1680, in S3File._call_s3(self, method, *kwarglist, **kwargs)
1679 def _call_s3(self, method, *kwarglist, **kwargs):
-> 1680 return self.fs.call_s3(method, self.s3_additional_kwargs, *kwarglist, **kwargs)
File /opt/conda/lib/python3.10/site-packages/fsspec/asyn.py:121, in sync_wrapper.<locals>.wrapper(*args, **kwargs)
120 self = obj or args[0]
--> 121 return sync(self.loop, func, *args, **kwargs)
File /opt/conda/lib/python3.10/site-packages/fsspec/asyn.py:106, in sync(loop, func, timeout, *args, **kwargs)
105 elif isinstance(return_result, BaseException):
--> 106 raise return_result
107 else:
File /opt/conda/lib/python3.10/site-packages/fsspec/asyn.py:61, in _runner(event, coro, result, timeout)
60 try:
---> 61 result[0] = await coro
62 except Exception as ex:
File /opt/conda/lib/python3.10/site-packages/s3fs/core.py:228, in S3FileSystem._call_s3(self, method, *akwarglist, **kwargs)
227 async def _call_s3(self, method, *akwarglist, **kwargs):
--> 228 await self._connect()
229 method = getattr(self.s3, method)
File /opt/conda/lib/python3.10/site-packages/s3fs/core.py:365, in S3FileSystem._connect(self, refresh, kwargs)
364 conf = AioConfig(**config_kwargs)
--> 365 self.session = aiobotocore.AioSession(**self.kwargs)
366 s3creator = self.session.create_client(
367 "s3", config=conf, **init_kwargs, **client_kwargs
368 )
AttributeError: module 'aiobotocore' has no attribute 'AioSession'
During handling of the above exception, another exception occurred:
ValueError Traceback (most recent call last)
Cell In[17], line 8
6 # save train_dataset to s3
7 training_input_path = f's3://{sess.default_bucket()}/{s3_prefix}/train'
----> 8 train_dataset.save_to_disk(training_input_path, fs=s3)
10 # save test_dataset to s3
11 test_input_path = f's3://{sess.default_bucket()}/{s3_prefix}/test'
File /opt/conda/lib/python3.10/site-packages/datasets/arrow_dataset.py:1421, in Dataset.save_to_disk(self, dataset_path, fs, max_shard_size, num_shards, num_proc, storage_options)
1419 for kwargs in kwargs_per_job:
1420 with pbar:
-> 1421 for job_id, done, content in Dataset._save_to_disk_single(**kwargs):
1422 if done:
1423 shards_done += 1
File /opt/conda/lib/python3.10/site-packages/datasets/arrow_dataset.py:1466, in Dataset._save_to_disk_single(job_id, shard, fpath, storage_options)
1464 finally:
1465 yield job_id, False, num_examples_progress_update
-> 1466 num_examples, num_bytes = writer.finalize()
1467 writer.close()
1469 yield job_id, True, (num_examples, num_bytes)
File /opt/conda/lib/python3.10/site-packages/datasets/arrow_writer.py:587, in ArrowWriter.finalize(self, close_stream)
585 self._build_writer(self.schema)
586 if self.pa_writer is not None:
--> 587 self.pa_writer.close()
588 self.pa_writer = None
589 if close_stream:
File /opt/conda/lib/python3.10/site-packages/pyarrow/ipc.pxi:533, in pyarrow.lib._CRecordBatchWriter.close()
File /opt/conda/lib/python3.10/site-packages/fsspec/spec.py:1700, in AbstractBufferedFile.write(self, data)
1698 raise ValueError("File not in write mode")
1699 if self.closed:
-> 1700 raise ValueError("I/O operation on closed file.")
1701 if self.forced:
1702 raise ValueError("This file has been force-flushed, can only close")
ValueError: I/O operation on closed file.