How to tweak a dataset without a loading script?

Copying from the Parquet builder, I wound up with this:

#!/usr/bin/python

import datasets

import pyarrow as pa
import pyarrow.parquet as pq

_DATA_FILES = ['data/combined-00009-of-00013-97a88bccf4215954.parquet',
 'data/combined-00004-of-00013-119d653561443d7b.parquet',
 'data/combined-00007-of-00013-ab54cce4ee6331d0.parquet',
 'data/combined-00002-of-00013-149f5d0d22fe8f52.parquet',
 'data/combined-00003-of-00013-426af6f6064e67dd.parquet',
 'data/combined-00010-of-00013-89d7565c5f0d2e4e.parquet',
 'data/combined-00000-of-00013-36d239509fb9e430.parquet',
 'data/combined-00005-of-00013-363bba92a2b7f737.parquet',
 'data/combined-00006-of-00013-4d4d574c9d87176e.parquet',
 'data/combined-00001-of-00013-d5b44e96ad7d2927.parquet',
 'data/combined-00012-of-00013-84cf41ef75dd5b76.parquet',
 'data/combined-00011-of-00013-4c21766cedd5a4a0.parquet',
 'data/combined-00008-of-00013-674f74b6f2288c61.parquet']

class OOMethodTestDataset(datasets.ArrowBasedBuilder):
    """Arrow-based builder that streams tables from the Parquet shards
    listed in ``_DATA_FILES``.

    Adapted from the generic ``datasets`` Parquet builder; the schema is
    not declared up front but inferred from the Arrow tables yielded by
    ``_generate_tables``.
    """

    def _info(self):
        # Empty DatasetInfo: no explicit features — the library infers the
        # schema from the yielded Arrow tables.
        return datasets.DatasetInfo()

    def _split_generators(self, dl_manager):
        """Download every shard and expose them all as one "combined" split.

        Args:
            dl_manager: the ``datasets`` download manager supplied by the
                library; resolves the relative shard paths.

        Returns:
            A single-element list with the "combined" SplitGenerator, whose
            ``gen_kwargs`` feed ``_generate_tables``.
        """
        downloaded_files = dl_manager.download(_DATA_FILES)
        return [
            datasets.SplitGenerator(
                name="combined",
                gen_kwargs={
                    "files": downloaded_files,
                },
            ),
        ]

    def _generate_tables(self, files):
        """Yield ``(key, pyarrow.Table)`` pairs, one per 10k-row record batch.

        Keys are ``"<file_idx>_<batch_idx>"`` so they stay unique across
        shards. Any read error (e.g. a corrupt shard) propagates to the
        caller unchanged.

        Args:
            files: local paths of the downloaded Parquet shards.
        """
        for file_idx, file in enumerate(files):
            with open(file, "rb") as f:
                parquet_file = pq.ParquetFile(f)
                for batch_idx, record_batch in enumerate(
                    parquet_file.iter_batches(batch_size=10_000)
                ):
                    # ArrowBasedBuilder expects Table values, so wrap each
                    # record batch in a single-batch table.
                    pa_table = pa.Table.from_batches([record_batch])
                    yield f"{file_idx}_{batch_idx}", pa_table

I tried to inherit from the Parquet builder, but it was like swimming upstream.

It really doesn’t seem like it should be this hard to go from a dataset with no loading script to one that has a loading script…