Hi,
Instead of generating a dataset with `load_dataset`, it should be easier to create dataset chunks with `Dataset.from_dict`, which we can then save to disk with `save_to_disk`, reload, and concatenate to get a memory-mapped dataset.
The code could look as follows:
```python
# Distribute the files over multiple dirs (chunkify the dir) to avoid loading
# the entire data into a single LineByLineWithSOPTextDataset.
from datasets import Dataset, concatenate_datasets
from transformers import LineByLineWithSOPTextDataset


def list_of_dicts_to_dict_of_lists(d):
    # Turn a list of example dicts into a dict of columns, as expected by Dataset.from_dict
    keys = d[0].keys()
    values = [dic.values() for dic in d]
    return {k: list(v) for k, v in zip(keys, zip(*values))}
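
# For example (toy values, illustrative keys only), this turns
#   [{"input_ids": [1, 2], "sentence_order_label": 0},
#    {"input_ids": [3, 4], "sentence_order_label": 1}]
# into
#   {"input_ids": [[1, 2], [3, 4]], "sentence_order_label": [0, 1]}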

chunks = []
for i, file_dir in enumerate(dirs_with_data_files):
    dset = LineByLineWithSOPTextDataset(<tokenizer>, file_dir)
    examples = list_of_dicts_to_dict_of_lists(dset.examples)
    chunk = Dataset.from_dict(examples)  # at this point `chunk` lives in memory
    chunk.save_to_disk(f"./chunks_dir/{i}")  # so we save it to disk ...
    chunk = Dataset.load_from_disk(f"./chunks_dir/{i}")  # ... and reload it to get a memory-mapped chunk
    chunks.append(chunk)

final_dset = concatenate_datasets(chunks)
```
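
For completeness, here is a minimal sketch of what you could do with the concatenated dataset afterwards; the `./full_dataset` path and the idea of persisting the concatenation are just assumptions for illustration, not part of the recipe above:

```python
# Minimal sketch (paths are placeholders): persist the concatenated dataset and
# reload it later; rows are read from the memory-mapped Arrow files on access,
# so the full data never has to fit in RAM.
final_dset.save_to_disk("./full_dataset")
reloaded = Dataset.load_from_disk("./full_dataset")

print(reloaded)     # number of rows and column names
print(reloaded[0])  # a single example, loaded from disk on access
```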