Hello community,
When trying to load a custom JSON dataset based on a Wikipedia dump:
from datasets import load_dataset
wiki_fr_dataset_lp = load_dataset("json", data_files="/media/matthieu/HDD_4T0/Github/semantic-search-through-wikipedia-with-weaviate/step-1/articles.json", split="train")
I got the following error:
---------------------------------------------------------------------------
ArrowNotImplementedError Traceback (most recent call last)
<ipython-input-1-f8ba9f3864f3> in <module>
1 from datasets import load_dataset
2
----> 3 wiki_fr_dataset_lp = load_dataset("json", data_files="/media/matthieu/HDD_4T0/Github/semantic-search-through-wikipedia-with-weaviate/step-1/articles.json", split="train")
4 wiki_fr_dataset_lp
~/anaconda3/envs/sts-transformers-gpu-fresh/lib/python3.8/site-packages/datasets/load.py in load_dataset(path, name, data_dir, data_files, split, cache_dir, features, download_config, download_mode, ignore_verifications, keep_in_memory, save_infos, revision, use_auth_token, task, streaming, script_version, **config_kwargs)
1630
1631 # Download and prepare data
-> 1632 builder_instance.download_and_prepare(
1633 download_config=download_config,
1634 download_mode=download_mode,
~/anaconda3/envs/sts-transformers-gpu-fresh/lib/python3.8/site-packages/datasets/builder.py in download_and_prepare(self, download_config, download_mode, ignore_verifications, try_from_hf_gcs, dl_manager, base_path, use_auth_token, **download_and_prepare_kwargs)
605 logger.warning("HF google storage unreachable. Downloading and preparing it from source")
606 if not downloaded_from_gcs:
--> 607 self._download_and_prepare(
608 dl_manager=dl_manager, verify_infos=verify_infos, **download_and_prepare_kwargs
609 )
~/anaconda3/envs/sts-transformers-gpu-fresh/lib/python3.8/site-packages/datasets/builder.py in _download_and_prepare(self, dl_manager, verify_infos, **prepare_split_kwargs)
695 try:
696 # Prepare split will record examples associated to the split
--> 697 self._prepare_split(split_generator, **prepare_split_kwargs)
698 except OSError as e:
699 raise OSError(
~/anaconda3/envs/sts-transformers-gpu-fresh/lib/python3.8/site-packages/datasets/builder.py in _prepare_split(self, split_generator)
1157 generator, unit=" tables", leave=False, disable=True # bool(logging.get_verbosity() == logging.NOTSET)
1158 ):
-> 1159 writer.write_table(table)
1160 num_examples, num_bytes = writer.finalize()
1161
~/anaconda3/envs/sts-transformers-gpu-fresh/lib/python3.8/site-packages/datasets/arrow_writer.py in write_table(self, pa_table, writer_batch_size)
440 # reorder the arrays if necessary + cast to self._schema
441 # we can't simply use .cast here because we may need to change the order of the columns
--> 442 pa_table = pa.Table.from_arrays([pa_table[name] for name in self._schema.names], schema=self._schema)
443 batches: List[pa.RecordBatch] = pa_table.to_batches(max_chunksize=writer_batch_size)
444 self._num_bytes += sum(batch.nbytes for batch in batches)
~/anaconda3/envs/sts-transformers-gpu-fresh/lib/python3.8/site-packages/pyarrow/table.pxi in pyarrow.lib.Table.from_arrays()
~/anaconda3/envs/sts-transformers-gpu-fresh/lib/python3.8/site-packages/pyarrow/table.pxi in pyarrow.lib._sanitize_arrays()
~/anaconda3/envs/sts-transformers-gpu-fresh/lib/python3.8/site-packages/pyarrow/array.pxi in pyarrow.lib.asarray()
~/anaconda3/envs/sts-transformers-gpu-fresh/lib/python3.8/site-packages/pyarrow/table.pxi in pyarrow.lib.ChunkedArray.cast()
~/anaconda3/envs/sts-transformers-gpu-fresh/lib/python3.8/site-packages/pyarrow/compute.py in cast(arr, target_type, safe)
295 else:
296 options = CastOptions.unsafe(target_type)
--> 297 return call_function("cast", [arr], options)
298
299
~/anaconda3/envs/sts-transformers-gpu-fresh/lib/python3.8/site-packages/pyarrow/_compute.pyx in pyarrow._compute.call_function()
~/anaconda3/envs/sts-transformers-gpu-fresh/lib/python3.8/site-packages/pyarrow/_compute.pyx in pyarrow._compute.Function.call()
~/anaconda3/envs/sts-transformers-gpu-fresh/lib/python3.8/site-packages/pyarrow/error.pxi in pyarrow.lib.pyarrow_internal_check_status()
~/anaconda3/envs/sts-transformers-gpu-fresh/lib/python3.8/site-packages/pyarrow/error.pxi in pyarrow.lib.check_status()
ArrowNotImplementedError: Unsupported cast from struct<title: string, content: string, count: int64> to struct using function cast_struct
Would anyone have any advice on this error?
Thanks!