ArrowNotImplementedError when loading a JSON dataset

Hello community,

When trying to load a custom JSON dataset based on a Wikipedia dump:

from datasets import load_dataset

wiki_fr_dataset_lp = load_dataset("json", data_files="/media/matthieu/HDD_4T0/Github/semantic-search-through-wikipedia-with-weaviate/step-1/articles.json", split="train")

I got the following error:

---------------------------------------------------------------------------
ArrowNotImplementedError                  Traceback (most recent call last)
<ipython-input-1-f8ba9f3864f3> in <module>
      1 from datasets import load_dataset
      2 
----> 3 wiki_fr_dataset_lp = load_dataset("json", data_files="/media/matthieu/HDD_4T0/Github/semantic-search-through-wikipedia-with-weaviate/step-1/articles.json", split="train")
      4 wiki_fr_dataset_lp

~/anaconda3/envs/sts-transformers-gpu-fresh/lib/python3.8/site-packages/datasets/load.py in load_dataset(path, name, data_dir, data_files, split, cache_dir, features, download_config, download_mode, ignore_verifications, keep_in_memory, save_infos, revision, use_auth_token, task, streaming, script_version, **config_kwargs)
   1630 
   1631     # Download and prepare data
-> 1632     builder_instance.download_and_prepare(
   1633         download_config=download_config,
   1634         download_mode=download_mode,

~/anaconda3/envs/sts-transformers-gpu-fresh/lib/python3.8/site-packages/datasets/builder.py in download_and_prepare(self, download_config, download_mode, ignore_verifications, try_from_hf_gcs, dl_manager, base_path, use_auth_token, **download_and_prepare_kwargs)
    605                             logger.warning("HF google storage unreachable. Downloading and preparing it from source")
    606                     if not downloaded_from_gcs:
--> 607                         self._download_and_prepare(
    608                             dl_manager=dl_manager, verify_infos=verify_infos, **download_and_prepare_kwargs
    609                         )

~/anaconda3/envs/sts-transformers-gpu-fresh/lib/python3.8/site-packages/datasets/builder.py in _download_and_prepare(self, dl_manager, verify_infos, **prepare_split_kwargs)
    695             try:
    696                 # Prepare split will record examples associated to the split
--> 697                 self._prepare_split(split_generator, **prepare_split_kwargs)
    698             except OSError as e:
    699                 raise OSError(

~/anaconda3/envs/sts-transformers-gpu-fresh/lib/python3.8/site-packages/datasets/builder.py in _prepare_split(self, split_generator)
   1157                 generator, unit=" tables", leave=False, disable=True  # bool(logging.get_verbosity() == logging.NOTSET)
   1158             ):
-> 1159                 writer.write_table(table)
   1160             num_examples, num_bytes = writer.finalize()
   1161 

~/anaconda3/envs/sts-transformers-gpu-fresh/lib/python3.8/site-packages/datasets/arrow_writer.py in write_table(self, pa_table, writer_batch_size)
    440         # reorder the arrays if necessary + cast to self._schema
    441         # we can't simply use .cast here because we may need to change the order of the columns
--> 442         pa_table = pa.Table.from_arrays([pa_table[name] for name in self._schema.names], schema=self._schema)
    443         batches: List[pa.RecordBatch] = pa_table.to_batches(max_chunksize=writer_batch_size)
    444         self._num_bytes += sum(batch.nbytes for batch in batches)

~/anaconda3/envs/sts-transformers-gpu-fresh/lib/python3.8/site-packages/pyarrow/table.pxi in pyarrow.lib.Table.from_arrays()

~/anaconda3/envs/sts-transformers-gpu-fresh/lib/python3.8/site-packages/pyarrow/table.pxi in pyarrow.lib._sanitize_arrays()

~/anaconda3/envs/sts-transformers-gpu-fresh/lib/python3.8/site-packages/pyarrow/array.pxi in pyarrow.lib.asarray()

~/anaconda3/envs/sts-transformers-gpu-fresh/lib/python3.8/site-packages/pyarrow/table.pxi in pyarrow.lib.ChunkedArray.cast()

~/anaconda3/envs/sts-transformers-gpu-fresh/lib/python3.8/site-packages/pyarrow/compute.py in cast(arr, target_type, safe)
    295     else:
    296         options = CastOptions.unsafe(target_type)
--> 297     return call_function("cast", [arr], options)
    298 
    299 

~/anaconda3/envs/sts-transformers-gpu-fresh/lib/python3.8/site-packages/pyarrow/_compute.pyx in pyarrow._compute.call_function()

~/anaconda3/envs/sts-transformers-gpu-fresh/lib/python3.8/site-packages/pyarrow/_compute.pyx in pyarrow._compute.Function.call()

~/anaconda3/envs/sts-transformers-gpu-fresh/lib/python3.8/site-packages/pyarrow/error.pxi in pyarrow.lib.pyarrow_internal_check_status()

~/anaconda3/envs/sts-transformers-gpu-fresh/lib/python3.8/site-packages/pyarrow/error.pxi in pyarrow.lib.check_status()

ArrowNotImplementedError: Unsupported cast from struct<title: string, content: string, count: int64> to struct using function cast_struct

Would anyone have any advice on this error?

Thanks!

Hi,

it’s hard to debug these kinds of errors without access to the data. Based on the error message, I’d assume that some fields or subfields are missing in some of the examples (JSON lines). Also, you can try increasing the chunksize argument and see if that works.
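For example, a minimal sketch (the chunksize value here is arbitrary, and I’m assuming the file is in JSON Lines format):

from datasets import load_dataset

# chunksize (in bytes) is passed through to the JSON builder and controls
# how much of the file is read per Arrow table; the default is 10 MiB
wiki_fr_dataset_lp = load_dataset(
    "json",
    data_files="/media/matthieu/HDD_4T0/Github/semantic-search-through-wikipedia-with-weaviate/step-1/articles.json",
    split="train",
    chunksize=100 << 20,  # 100 MiB per chunk
)

You can also scan the file with the standard library to check whether some lines are actually missing fields (the expected field names below are taken from the struct in your error message; adapt them if those fields live under a nested key):

import json

expected = {"title", "content", "count"}
with open("articles.json") as f:  # shortened path for readability
    for i, line in enumerate(f):
        missing = expected - json.loads(line).keys()
        if missing:
            print(f"line {i}: missing {sorted(missing)}")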

Thanks @mariosasko, I will check with chunksize. Isn’t it possible with datasets to exclude data that contains an empty field when loading?

Isn’t it possible with datasets to exclude data that contains an empty field when loading?

Excluding examples at load time isn’t among the options currently supported by the JSON loader, so I’d suggest you use filter for that instead.
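For example (a sketch; the field names come from the struct in your error message, so adjust them to your data):

wiki_fr_dataset_lp = wiki_fr_dataset_lp.filter(
    lambda example: example["title"] is not None
    and example["content"] is not None
    and example["count"] is not None
)

Note that filter runs after the dataset has been loaded, so the cast error needs to be resolved first.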