The following
from datasets import load_dataset
from datasets import interleave_datasets
# Preprocess each dataset
c4 = load_dataset("c4", "en", split="train", streaming=True)
wikitext = load_dataset("wikitext", "wikitext-103-v1", split="train", streaming=True)
# Interleave the preprocessed datasets
datasets = [c4, wikitext]
for dataset in datasets:
print(dataset.description)
interleaved = interleave_datasets(datasets, probabilities=[0.5, 0.5])
print(interleaved)
only samples from one data set, why?
example.keys()=dict_keys(['text', 'timestamp', 'url'])
example.keys()=dict_keys(['text', 'timestamp', 'url'])
example.keys()=dict_keys(['text', 'timestamp', 'url'])
example.keys()=dict_keys(['text', 'timestamp', 'url'])
example.keys()=dict_keys(['text', 'timestamp', 'url'])
example.keys()=dict_keys(['text', 'timestamp', 'url'])
example.keys()=dict_keys(['text', 'timestamp', 'url'])
example.keys()=dict_keys(['text', 'timestamp', 'url'])
example.keys()=dict_keys(['text', 'timestamp', 'url'])
example.keys()=dict_keys(['text', 'timestamp', 'url'])
example.keys()=dict_keys(['text', 'timestamp', 'url'])
example.keys()=dict_keys(['text', 'timestamp', 'url'])
example.keys()=dict_keys(['text', 'timestamp', 'url'])
example.keys()=dict_keys(['text', 'timestamp', 'url'])
example.keys()=dict_keys(['text', 'timestamp', 'url'])
example.keys()=dict_keys(['text', 'timestamp', 'url'])
example.keys()=dict_keys(['text', 'timestamp', 'url'])
example.keys()=dict_keys(['text', 'timestamp', 'url'])
example.keys()=dict_keys(['text', 'timestamp', 'url'])
example.keys()=dict_keys(['text', 'timestamp', 'url'])
example.keys()=dict_keys(['text', 'timestamp', 'url'])
example.keys()=dict_keys(['text', 'timestamp', 'url'])
counts=100
colab: Google Colab
hf dicord: Discord