K fold cross validation

Very old thread I know, but here’s an alternative to @lewtun’s solution that I like:

import numpy as np
from sklearn.model_selection import StratifiedKFold

from datasets import load_dataset

# Create the stratified k-fold splitter (5 folds)
folds = StratifiedKFold(n_splits=5)

# Load the GLUE MRPC dataset
datasets = load_dataset("glue", "mrpc")

# Build the fold generator from the training labels.
# `np.zeros()` is only a placeholder for the features: the splitter just needs
# something with one entry per row, while the labels drive the stratification.
splits = folds.split(
    X=np.zeros(datasets["train"].num_rows),
    y=datasets["train"]["label"],
)

# Finally, do what you want with each fold.
# Here the train/validation/test entries are overridden for every fold.
for train_idxs, val_idxs in splits:
    # Reload so that `datasets["train"]` is the full training split again
    datasets = load_dataset("glue", "mrpc")
    # The right-hand side is evaluated in full before any entry is replaced,
    # so both `select` calls index the original (full) training split.
    datasets["test"], datasets["validation"], datasets["train"] = (
        datasets["validation"],
        datasets["train"].select(val_idxs),
        datasets["train"].select(train_idxs),
    )

To avoid reloading the dataset on every fold, you can instead build a new `DatasetDict` inside the loop (in place of the loop body above):

from datasets import DatasetDict
# Build the splits for a single fold without reloading the dataset.
# Use this inside the `for train_idxs, val_idxs in splits:` loop, in place of
# the body that reloads and overwrites `datasets`, so that `datasets["train"]`
# is still the full training split when it is indexed here.
fold_dataset = DatasetDict({
    "train": datasets["train"].select(train_idxs),
    "validation": datasets["train"].select(val_idxs),
    # The dataset's own validation split serves as the held-out test set
    "test": datasets["validation"],
})
4 Likes