I am running dataset.map() on a dataset of 160k items, and it stops at around 25.1k with a memory allocation error. Is there a workaround that doesn't require getting more RAM? I'm wondering if there's a way to save_to_disk() every 10k items or so. Is that possible? (I sketched roughly what I have in mind below my current code.) Here's my preprocessing code:
from datasets import Features, Sequence, ClassLabel, Value, Array2D, Array3D
from PIL import Image
import numpy as np

# we need to define custom features
features = Features({
    'image': Array3D(dtype="int64", shape=(3, 224, 224)),
    'input_ids': Sequence(feature=Value(dtype='int64')),
    'attention_mask': Sequence(Value(dtype='int64')),
    'token_type_ids': Sequence(Value(dtype='int64')),
    'bbox': Array2D(dtype="int64", shape=(512, 4)),
    'labels': ClassLabel(num_classes=len(labels), names=labels),
})

def preprocess_data(examples):
    # open a batch of images
    images = [Image.open("images/" + path).convert("RGB") for path in examples['image_path']]
    # processor is a LayoutLMv2Processor (initialized elsewhere); it runs OCR and tokenizes
    encoded_inputs = processor(images, padding="max_length", truncation=True)
    encoded_inputs["image"] = np.array(encoded_inputs["image"])
    # add labels
    encoded_inputs["labels"] = [label for label in examples["label"]]
    return encoded_inputs

encoded_dataset = dataset.map(preprocess_data, remove_columns=dataset.column_names, features=features,
                              batched=True, batch_size=2)
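
Roughly, this is what I have in mind (just a sketch, not something I've gotten working). The chunk size, the output directory, and the reload/concatenate step at the end are placeholders I made up:

import math
from datasets import load_from_disk, concatenate_datasets

chunk_size = 10_000  # placeholder chunk size
num_chunks = math.ceil(len(dataset) / chunk_size)

# process and save one chunk at a time instead of the whole 160k at once
for i in range(num_chunks):
    start = i * chunk_size
    end = min(start + chunk_size, len(dataset))
    chunk = dataset.select(range(start, end))
    encoded_chunk = chunk.map(preprocess_data, remove_columns=dataset.column_names,
                              features=features, batched=True, batch_size=2)
    encoded_chunk.save_to_disk(f"encoded_chunks/chunk_{i}")  # placeholder path

# later, reload the saved pieces and stitch them back together
encoded_dataset = concatenate_datasets(
    [load_from_disk(f"encoded_chunks/chunk_{i}") for i in range(num_chunks)]
)

Would something like this keep memory bounded, or is there a better built-in way to do it?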