Image Dataset Generation gets killed

Hello,

I have been trying to create a custom dataset repository to use with SDXL. I have a tar file of about 123 GB and am using the loading script attached below. However, the process gets killed while generating the train split.
How can I make it more efficient?

import datasets
import pandas as pd


class MyDataset(datasets.GeneratorBasedBuilder):

    def _info(self):
        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=datasets.Features(
                {
                    "original_image": datasets.Image(),
                    "edited_image": datasets.Image(),
                    "original_prompt":datasets.Value('string'),
                    "edit_prompt":datasets.Value("string"),
                    "edited_prompt":datasets.Value("string")
                    
                }
            ),
            homepage=_HOMEPAGE,
            citation=_CITATION,
            license=_LICENSE
        )

    def _split_generators(self, dl_manager):
        archive_path = dl_manager.download(_BASE_URL)
        split_metadata_paths = dl_manager.download(_METADATA_URLS)
        return [
            datasets.SplitGenerator(
                name=datasets.Split.TRAIN,
                gen_kwargs={
                    "images": dl_manager.iter_archive(archive_path),
                    "metadata_path": split_metadata_paths["train"],
                },
            ),
            datasets.SplitGenerator(
                name=datasets.Split.VALIDATION,
                gen_kwargs={
                    "images": dl_manager.iter_archive(archive_path),
                    "metadata_path": split_metadata_paths["test"],
                },
            ),
        ]

    def _generate_examples(self, images, metadata_path):
        """Generate images and labels for splits."""
        data = pd.read_csv(metadata_path)
        print(type(images))
        for file_path, file_obj in images:
            yield file_path, {
                "original_image": {"path": file_path, "bytes": file_obj.read()},
                "edited_image": {"path": file_path, "bytes": file_obj.read()},
                "original_prompt": data['input'],
                "edit_prompt": data['edit'],
                "edited_prompt": data['output']
            }

Can you share the error message? There isn’t much to optimize in your script.

Thank you for your attention.

It really does just get killed, with no error message, while generating the splits.

To prevent that, I implemented batching. Currently I am running this test:


from itertools import islice


def batch(iterable, batch_size):
    """Yield lists of at most batch_size items from iterable."""
    iterator = iter(iterable)
    while True:
        batch_chunk = list(islice(iterator, batch_size))
        if not batch_chunk:
            break
        yield batch_chunk

class MyDataset(datasets.GeneratorBasedBuilder):
    def __init__(self, *args, **kwargs):
        # Call the constructor of the parent class
        super().__init__(*args, **kwargs)
        
        # Set the _writer_batch_size to your desired value
        #self._writer_batch_size = 8  # Change this to your desired batch size

    def _info(self):
        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=datasets.Features(
                {
                    "original_image": datasets.Image(),
                    "edited_image": datasets.Image(),
                    "original_prompt":datasets.Value('string'),
                    "edit_prompt":datasets.Value("string"),
                    "edited_prompt":datasets.Value("string")
                    
                }
            ),
            homepage=_HOMEPAGE,
            citation=_CITATION,
            license=_LICENSE
        )

    def _split_generators(self, dl_manager):
        archive_path = dl_manager.download(_BASE_URL)
        split_metadata_paths = dl_manager.download(_METADATA_URLS)
        return [
            datasets.SplitGenerator(
                name=datasets.Split.TRAIN,
                gen_kwargs={
                    "images": dl_manager.iter_archive(archive_path),
                    "metadata_path": split_metadata_paths["train"],
                },
            ),
            datasets.SplitGenerator(
                name=datasets.Split.VALIDATION,
                gen_kwargs={
                    "images": dl_manager.iter_archive(archive_path),
                    "metadata_path": split_metadata_paths["test"],
                },
            ),
        ]

    def _generate_examples(self, images, metadata_path):
        data = pd.read_csv(metadata_path)
        batch_size = 8  # Adjust the batch size based on your available memory

        for batch_files in batch(images, batch_size):
            examples = []
            for file_path, file_obj in batch_files:
                example = {
                    "original_image": {"path": file_path, "bytes": file_obj.read()},
                    "edited_image": {"path": file_path, "bytes": file_obj.read()},
                    "original_prompt": data['input'],
                    "edit_prompt": data['edit'],
                    "edited_prompt": data['output']
                }
                examples.append((file_path, example))

            yield examples

We only support yielding one example at a time, meaning your code should fail.

You can control the number of examples kept in memory before writing them to an Arrow file with the DEFAULT_WRITER_BATCH_SIZE class variable, like so:

class MyDataset(datasets.GeneratorBasedBuilder):
    DEFAULT_WRITER_BATCH_SIZE = 100
    ...
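
For reference, a single-example version of `_generate_examples` could look roughly like the sketch below. It reads each streamed tar member exactly once and looks the prompts up per file instead of assigning whole metadata columns. The `file_name` column and the reuse of the same bytes for both image fields are assumptions based on your snippet, so adjust them to your actual data layout.

import datasets
import pandas as pd


class MyDataset(datasets.GeneratorBasedBuilder):
    # Write accumulated examples to Arrow every 100 examples to bound memory use.
    DEFAULT_WRITER_BATCH_SIZE = 100

    # _info and _split_generators unchanged ...

    def _generate_examples(self, images, metadata_path):
        data = pd.read_csv(metadata_path)
        # Assumption: the CSV has a "file_name" column whose values match
        # the member paths yielded by dl_manager.iter_archive.
        data = data.set_index("file_name")
        for key, (file_path, file_obj) in enumerate(images):
            if file_path not in data.index:
                continue  # archive member without metadata, skip it
            row = data.loc[file_path]
            # Read the streamed member once; it cannot be re-read later.
            image_bytes = file_obj.read()
            yield key, {
                "original_image": {"path": file_path, "bytes": image_bytes},
                "edited_image": {"path": file_path, "bytes": image_bytes},
                "original_prompt": row["input"],
                "edit_prompt": row["edit"],
                "edited_prompt": row["output"],
            }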

Yes it failed with this error:

Traceback (most recent call last):                      
  File "/usr/local/bin/datasets-cli", line 8, in <module>
    sys.exit(main())
  File "/usr/local/lib/python3.9/dist-packages/datasets/commands/datasets_cli.py", line 39, in main
    service.run()
  File "/usr/local/lib/python3.9/dist-packages/datasets/commands/test.py", line 137, in run
    builder.download_and_prepare(
  File "/usr/local/lib/python3.9/dist-packages/datasets/builder.py", line 704, in download_and_prepare
    self._download_and_prepare(
  File "/usr/local/lib/python3.9/dist-packages/datasets/builder.py", line 1227, in _download_and_prepare
    super()._download_and_prepare(dl_manager, verify_infos, check_duplicate_keys=verify_infos)
  File "/usr/local/lib/python3.9/dist-packages/datasets/builder.py", line 793, in _download_and_prepare
    self._prepare_split(split_generator, **prepare_split_kwargs)
  File "/usr/local/lib/python3.9/dist-packages/datasets/builder.py", line 1210, in _prepare_split
    for key, record in logging.tqdm(
  File "/usr/local/lib/python3.9/dist-packages/tqdm/std.py", line 1195, in __iter__
    for obj in iterable:
  File "/root/.cache/huggingface/modules/datasets_modules/datasets/make_dataset/4bbb8fc5763c15089fd4ed23fac05a06ad51d0c3652e1303dfed3c7edf82ede8/make_dataset.py", line 126, in _generate_examples
    "original_image": {"path": file_path, "bytes": file_obj.read()},
  File "/usr/lib/python3.9/tarfile.py", line 683, in read
    self.fileobj.seek(offset + (self.position - start))
  File "/usr/lib/python3.9/tarfile.py", line 515, in seek
    raise StreamError("seeking backwards is not allowed")
tarfile.StreamError: seeking backwards is not allowed

Yes, I realized the batched generator cannot work that way. I will try DEFAULT_WRITER_BATCH_SIZE instead.

Can I use multiple tar part files instead of a single .tar file? I tried with a single part file, but its extension was rejected as illegal. How can I handle this?
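
In case it is useful, one pattern that avoids a single huge archive is to split the data into several independent .tar shards (each a valid archive on its own, rather than the output of `split` on one big tar) and chain them in `_split_generators`. A rough sketch, where the shard URLs and count are placeholders:

import itertools

import datasets

# Placeholder: replace with the actual URLs or repo paths of your tar shards.
_SHARD_URLS = [f"https://example.com/data/images-{i:05d}.tar" for i in range(10)]


class MyDataset(datasets.GeneratorBasedBuilder):
    DEFAULT_WRITER_BATCH_SIZE = 100

    def _split_generators(self, dl_manager):
        # dl_manager.download accepts a list and returns one local path per shard.
        archive_paths = dl_manager.download(_SHARD_URLS)
        split_metadata_paths = dl_manager.download(_METADATA_URLS)

        def iter_all_shards():
            # Yield (path_inside_archive, file_object) pairs from every shard in turn.
            return itertools.chain.from_iterable(
                dl_manager.iter_archive(path) for path in archive_paths
            )

        return [
            datasets.SplitGenerator(
                name=datasets.Split.TRAIN,
                gen_kwargs={
                    "images": iter_all_shards(),
                    "metadata_path": split_metadata_paths["train"],
                },
            ),
            datasets.SplitGenerator(
                name=datasets.Split.VALIDATION,
                gen_kwargs={
                    "images": iter_all_shards(),
                    "metadata_path": split_metadata_paths["test"],
                },
            ),
        ]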