Copying from the Parquet builder, I wound up with this:
#!/usr/bin/python
import datasets
import pyarrow as pa
import pyarrow.parquet as pq
# Parquet shards that make up the dataset, given as paths relative to the
# dataset repository. The order is kept exactly as listed because it is the
# order in which the shards are handed to the download manager and read.
_DATA_FILES = [
    "data/combined-00009-of-00013-97a88bccf4215954.parquet",
    "data/combined-00004-of-00013-119d653561443d7b.parquet",
    "data/combined-00007-of-00013-ab54cce4ee6331d0.parquet",
    "data/combined-00002-of-00013-149f5d0d22fe8f52.parquet",
    "data/combined-00003-of-00013-426af6f6064e67dd.parquet",
    "data/combined-00010-of-00013-89d7565c5f0d2e4e.parquet",
    "data/combined-00000-of-00013-36d239509fb9e430.parquet",
    "data/combined-00005-of-00013-363bba92a2b7f737.parquet",
    "data/combined-00006-of-00013-4d4d574c9d87176e.parquet",
    "data/combined-00001-of-00013-d5b44e96ad7d2927.parquet",
    "data/combined-00012-of-00013-84cf41ef75dd5b76.parquet",
    "data/combined-00011-of-00013-4c21766cedd5a4a0.parquet",
    "data/combined-00008-of-00013-674f74b6f2288c61.parquet",
]
class OOMethodTestDataset(datasets.ArrowBasedBuilder):
    """Arrow-based builder that serves the Parquet shards in ``_DATA_FILES``.

    Every shard is exposed through a single split named ``"combined"``.
    """

    # Number of rows requested per record batch when reading a shard.
    _BATCH_SIZE = 10_000

    def _info(self):
        """Return a ``datasets.DatasetInfo`` constructed with no arguments."""
        return datasets.DatasetInfo()

    def _split_generators(self, dl_manager):
        """Download the shards and declare the single ``"combined"`` split.

        Args:
            dl_manager: Download manager supplied by ``datasets``; its
                ``download`` method is called with the ``_DATA_FILES`` list.

        Returns:
            A one-element list holding the ``SplitGenerator`` for
            ``"combined"``. Its ``files`` keyword argument is whatever the
            download call returned and is later passed to
            ``_generate_tables``.
        """
        downloaded_files = dl_manager.download(_DATA_FILES)
        return [
            datasets.SplitGenerator(
                name="combined",
                gen_kwargs={"files": downloaded_files},
            ),
        ]

    def _generate_tables(self, files):
        """Yield ``(key, table)`` pairs for every record batch of every file.

        Errors raised while opening or reading a file are not handled here
        and propagate to the caller unchanged.

        Args:
            files: Iterable of paths to Parquet files; each one is opened in
                binary mode.

        Yields:
            Tuples of a string key ``"<file index>_<batch index>"`` and a
            ``pyarrow.Table`` built from that single record batch.
        """
        for file_idx, path in enumerate(files):
            with open(path, "rb") as f:
                parquet_file = pq.ParquetFile(f)
                batches = parquet_file.iter_batches(batch_size=self._BATCH_SIZE)
                for batch_idx, record_batch in enumerate(batches):
                    pa_table = pa.Table.from_batches([record_batch])
                    # The file/batch index pair keeps keys unique across shards.
                    yield f"{file_idx}_{batch_idx}", pa_table
I tried to inherit from the Parquet builder, but it was like swimming upstream.
It really doesn’t seem like it should be this hard to go from a non-loader script to a loader script…