I'm following along with the workshop tutorial on setting up a transformer for text classification on SageMaker. I get the following error when trying to upload the processed train and test datasets to the default S3 bucket with `save_to_disk`. What do I need to do to fix this?
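For context, here's a minimal version of the cell that raises the error. The two `save_to_disk` calls match Cell In[17] in the traceback; the placeholder datasets, the `s3_prefix` value, and the `s3fs.S3FileSystem()` construction just stand in for what the workshop notebook builds in earlier cells, so treat this as a sketch rather than the exact notebook code:

```python
import sagemaker
import s3fs
from datasets import Dataset

# Placeholders standing in for the tokenized train/test splits produced
# earlier in the notebook (assumed shape; the real splits come from .map()).
train_dataset = Dataset.from_dict({"text": ["example review"], "label": [0]})
test_dataset = Dataset.from_dict({"text": ["another review"], "label": [1]})

sess = sagemaker.Session()
s3_prefix = "samples/datasets"   # assumed prefix; the notebook defines its own
s3 = s3fs.S3FileSystem()         # some workshop versions use datasets.filesystems.S3FileSystem instead

# save train_dataset to s3
training_input_path = f"s3://{sess.default_bucket()}/{s3_prefix}/train"
train_dataset.save_to_disk(training_input_path, fs=s3)

# save test_dataset to s3
test_input_path = f"s3://{sess.default_bucket()}/{s3_prefix}/test"
test_dataset.save_to_disk(test_input_path, fs=s3)
```

The full traceback: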
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
File /opt/conda/lib/python3.10/site-packages/datasets/arrow_dataset.py:1458, in Dataset._save_to_disk_single(job_id, shard, fpath, storage_options)
1457 for pa_table in table_iter(shard.data, batch_size=batch_size):
-> 1458 writer.write_table(pa_table)
1459 num_examples_progress_update += len(pa_table)
File /opt/conda/lib/python3.10/site-packages/datasets/arrow_writer.py:573, in ArrowWriter.write_table(self, pa_table, writer_batch_size)
572 self._num_examples += pa_table.num_rows
--> 573 self.pa_writer.write_table(pa_table, writer_batch_size)
File /opt/conda/lib/python3.10/site-packages/pyarrow/ipc.pxi:525, in pyarrow.lib._CRecordBatchWriter.write_table()
File /opt/conda/lib/python3.10/site-packages/fsspec/spec.py:1706, in AbstractBufferedFile.write(self, data)
1705 if self.buffer.tell() >= self.blocksize:
-> 1706 self.flush()
1707 return out
File /opt/conda/lib/python3.10/site-packages/fsspec/spec.py:1742, in AbstractBufferedFile.flush(self, force)
1741 try:
-> 1742 self._initiate_upload()
1743 except: # noqa: E722
File /opt/conda/lib/python3.10/site-packages/s3fs/core.py:1688, in S3File._initiate_upload(self)
1687 self.parts = []
-> 1688 self.mpu = self._call_s3(
1689 "create_multipart_upload",
1690 Bucket=self.bucket,
1691 Key=self.key,
1692 ACL=self.acl,
1693 )
1695 if self.append_block:
1696 # use existing data in key when appending,
1697 # and block is big enough
File /opt/conda/lib/python3.10/site-packages/s3fs/core.py:1680, in S3File._call_s3(self, method, *kwarglist, **kwargs)
1679 def _call_s3(self, method, *kwarglist, **kwargs):
-> 1680 return self.fs.call_s3(method, self.s3_additional_kwargs, *kwarglist, **kwargs)
File /opt/conda/lib/python3.10/site-packages/fsspec/asyn.py:121, in sync_wrapper.<locals>.wrapper(*args, **kwargs)
120 self = obj or args[0]
--> 121 return sync(self.loop, func, *args, **kwargs)
File /opt/conda/lib/python3.10/site-packages/fsspec/asyn.py:106, in sync(loop, func, timeout, *args, **kwargs)
105 elif isinstance(return_result, BaseException):
--> 106 raise return_result
107 else:
File /opt/conda/lib/python3.10/site-packages/fsspec/asyn.py:61, in _runner(event, coro, result, timeout)
60 try:
---> 61 result[0] = await coro
62 except Exception as ex:
File /opt/conda/lib/python3.10/site-packages/s3fs/core.py:228, in S3FileSystem._call_s3(self, method, *akwarglist, **kwargs)
227 async def _call_s3(self, method, *akwarglist, **kwargs):
--> 228 await self._connect()
229 method = getattr(self.s3, method)
File /opt/conda/lib/python3.10/site-packages/s3fs/core.py:365, in S3FileSystem._connect(self, refresh, kwargs)
364 conf = AioConfig(**config_kwargs)
--> 365 self.session = aiobotocore.AioSession(**self.kwargs)
366 s3creator = self.session.create_client(
367 "s3", config=conf, **init_kwargs, **client_kwargs
368 )
AttributeError: module 'aiobotocore' has no attribute 'AioSession'
During handling of the above exception, another exception occurred:
ValueError Traceback (most recent call last)
Cell In[17], line 8
6 # save train_dataset to s3
7 training_input_path = f's3://{sess.default_bucket()}/{s3_prefix}/train'
----> 8 train_dataset.save_to_disk(training_input_path, fs=s3)
10 # save test_dataset to s3
11 test_input_path = f's3://{sess.default_bucket()}/{s3_prefix}/test'
File /opt/conda/lib/python3.10/site-packages/datasets/arrow_dataset.py:1421, in Dataset.save_to_disk(self, dataset_path, fs, max_shard_size, num_shards, num_proc, storage_options)
1419 for kwargs in kwargs_per_job:
1420 with pbar:
-> 1421 for job_id, done, content in Dataset._save_to_disk_single(**kwargs):
1422 if done:
1423 shards_done += 1
File /opt/conda/lib/python3.10/site-packages/datasets/arrow_dataset.py:1466, in Dataset._save_to_disk_single(job_id, shard, fpath, storage_options)
1464 finally:
1465 yield job_id, False, num_examples_progress_update
-> 1466 num_examples, num_bytes = writer.finalize()
1467 writer.close()
1469 yield job_id, True, (num_examples, num_bytes)
File /opt/conda/lib/python3.10/site-packages/datasets/arrow_writer.py:587, in ArrowWriter.finalize(self, close_stream)
585 self._build_writer(self.schema)
586 if self.pa_writer is not None:
--> 587 self.pa_writer.close()
588 self.pa_writer = None
589 if close_stream:
File /opt/conda/lib/python3.10/site-packages/pyarrow/ipc.pxi:533, in pyarrow.lib._CRecordBatchWriter.close()
File /opt/conda/lib/python3.10/site-packages/fsspec/spec.py:1700, in AbstractBufferedFile.write(self, data)
1698 raise ValueError("File not in write mode")
1699 if self.closed:
-> 1700 raise ValueError("I/O operation on closed file.")
1701 if self.forced:
1702 raise ValueError("This file has been force-flushed, can only close")
ValueError: I/O operation on closed file.