from datasets import load_dataset
# This takes a few minutes to run, so go grab a tea or coffee while you wait :)
data_files = "https://the-eye.eu/public/AI/pile_preliminary_components/PUBMED_title_abstracts_2019_baseline.jsonl.zst"
pubmed_dataset = load_dataset("json", data_files=data_files, split="train")
pubmed_dataset
---------------------------------------------------------------------------
SchemaInferenceError Traceback (most recent call last)
/usr/local/lib/python3.10/dist-packages/datasets/builder.py in _prepare_split_single(self, gen_kwargs, fpath, file_format, max_shard_size, job_id)
1948 num_shards = shard_id + 1
-> 1949 num_examples, num_bytes = writer.finalize()
1950 writer.close()
6 frames
SchemaInferenceError: Please pass `features` or at least one example when writing data
The above exception was the direct cause of the following exception:
DatasetGenerationError Traceback (most recent call last)
/usr/local/lib/python3.10/dist-packages/datasets/builder.py in _prepare_split_single(self, gen_kwargs, fpath, file_format, max_shard_size, job_id)
1956 if isinstance(e, SchemaInferenceError) and e.__context__ is not None:
1957 e = e.__context__
-> 1958 raise DatasetGenerationError("An error occurred while generating the dataset") from e
1959
1960 yield job_id, True, (total_num_examples, total_num_bytes, writer._features, num_shards, shard_lengths)
DatasetGenerationError: An error occurred while generating the dataset
Hello, I am getting this error on Google Colab and I haven’t found any workaround for this.