Cannot load dataset on Kaggle

This is my code on Kaggle:

from datasets import load_dataset
data = load_dataset('tsdocode/vi_alpaca_clean')

and this is the error I got:


---------------------------------------------------------------------------
ArrowInvalid                              Traceback (most recent call last)
File /opt/conda/lib/python3.10/site-packages/datasets/packaged_modules/json/json.py:122, in Json._generate_tables(self, files)
    121 try:
--> 122     pa_table = paj.read_json(
    123         io.BytesIO(batch), read_options=paj.ReadOptions(block_size=block_size)
    124     )
    125     break

File /opt/conda/lib/python3.10/site-packages/pyarrow/_json.pyx:259, in pyarrow._json.read_json()

File /opt/conda/lib/python3.10/site-packages/pyarrow/error.pxi:144, in pyarrow.lib.pyarrow_internal_check_status()

File /opt/conda/lib/python3.10/site-packages/pyarrow/error.pxi:100, in pyarrow.lib.check_status()

ArrowInvalid: JSON parse error: Column() changed from object to array in row 0

During handling of the above exception, another exception occurred:

AttributeError                            Traceback (most recent call last)
Cell In[2], line 2
      1 from datasets import load_dataset
----> 2 data = load_dataset('tsdocode/vi_alpaca_clean')

File /opt/conda/lib/python3.10/site-packages/datasets/load.py:1691, in load_dataset(path, name, data_dir, data_files, split, cache_dir, features, download_config, download_mode, ignore_verifications, keep_in_memory, save_infos, revision, use_auth_token, task, streaming, **config_kwargs)
   1688 try_from_hf_gcs = path not in _PACKAGED_DATASETS_MODULES
   1690 # Download and prepare data
-> 1691 builder_instance.download_and_prepare(
   1692     download_config=download_config,
   1693     download_mode=download_mode,
   1694     ignore_verifications=ignore_verifications,
   1695     try_from_hf_gcs=try_from_hf_gcs,
   1696     use_auth_token=use_auth_token,
   1697 )
   1699 # Build dataset for splits
   1700 keep_in_memory = (
   1701     keep_in_memory if keep_in_memory is not None else is_small_dataset(builder_instance.info.dataset_size)
   1702 )

File /opt/conda/lib/python3.10/site-packages/datasets/builder.py:605, in DatasetBuilder.download_and_prepare(self, download_config, download_mode, ignore_verifications, try_from_hf_gcs, dl_manager, base_path, use_auth_token, **download_and_prepare_kwargs)
    603         logger.warning("HF google storage unreachable. Downloading and preparing it from source")
    604 if not downloaded_from_gcs:
--> 605     self._download_and_prepare(
    606         dl_manager=dl_manager, verify_infos=verify_infos, **download_and_prepare_kwargs
    607     )
    608 # Sync info
    609 self.info.dataset_size = sum(split.num_bytes for split in self.info.splits.values())

File /opt/conda/lib/python3.10/site-packages/datasets/builder.py:694, in DatasetBuilder._download_and_prepare(self, dl_manager, verify_infos, **prepare_split_kwargs)
    690 split_dict.add(split_generator.split_info)
    692 try:
    693     # Prepare split will record examples associated to the split
--> 694     self._prepare_split(split_generator, **prepare_split_kwargs)
    695 except OSError as e:
    696     raise OSError(
    697         "Cannot find data file. "
    698         + (self.manual_download_instructions or "")
    699         + "\nOriginal error:\n"
    700         + str(e)
    701     ) from None

File /opt/conda/lib/python3.10/site-packages/datasets/builder.py:1151, in ArrowBasedBuilder._prepare_split(self, split_generator)
   1149 generator = self._generate_tables(**split_generator.gen_kwargs)
   1150 with ArrowWriter(features=self.info.features, path=fpath) as writer:
-> 1151     for key, table in logging.tqdm(
   1152         generator, unit=" tables", leave=False, disable=True  # not logging.is_progress_bar_enabled()
   1153     ):
   1154         writer.write_table(table)
   1155     num_examples, num_bytes = writer.finalize()

File /opt/conda/lib/python3.10/site-packages/tqdm/notebook.py:254, in tqdm_notebook.__iter__(self)
    252 try:
    253     it = super(tqdm_notebook, self).__iter__()
--> 254     for obj in it:
    255         # return super(tqdm...) will not catch exception
    256         yield obj
    257 # NB: except ... [ as ...] breaks IPython async KeyboardInterrupt

File /opt/conda/lib/python3.10/site-packages/tqdm/std.py:1166, in tqdm.__iter__(self)
   1163 # If the bar is disabled, then just walk the iterable
   1164 # (note: keep this check outside the loop for performance)
   1165 if self.disable:
-> 1166     for obj in iterable:
   1167         yield obj
   1168     return

File /opt/conda/lib/python3.10/site-packages/datasets/packaged_modules/json/json.py:150, in Json._generate_tables(self, files)
    145     except json.JSONDecodeError:
    146         raise e
    147     raise ValueError(
    148         f"Not able to read records in the JSON file at {file}. "
    149         f"You should probably indicate the field of the JSON file containing your records. "
--> 150         f"This JSON file contain the following fields: {str(list(dataset.keys()))}. "
    151         f"Select the correct one and provide it as `field='XXX'` to the dataset loading method. "
    152     ) from None
    153 # Uncomment for debugging (will print the Arrow table size and elements)
    154 # logger.warning(f"pa_table: {pa_table} num rows: {pa_table.num_rows}")
    155 # logger.warning('\n'.join(str(pa_table.slice(i, 1).to_pydict()) for i in range(pa_table.num_rows)))
    156 yield (file_idx, batch_idx), self._cast_classlabels(pa_table)

AttributeError: 'list' object has no attribute 'keys'

It seems to me that the dataset itself has been downloaded; the failure happens when pyarrow tries to parse the JSON into an Arrow table.
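In the meantime, a manual load along these lines should sidestep the Arrow JSON reader (just a sketch, assuming a reasonably recent datasets release: the filename and the name of the offending column are guesses, so check the repo's file list first):

import json
from datasets import Dataset
from huggingface_hub import hf_hub_download

# Download the raw file from the Hub (the filename is an assumption,
# check the dataset repo's "Files" tab for the real one)
path = hf_hub_download(
    repo_id="tsdocode/vi_alpaca_clean",
    filename="vi_alpaca_clean.json",  # assumption
    repo_type="dataset",
)

# The AttributeError above shows the top level is a list of records
with open(path, encoding="utf-8") as f:
    records = json.load(f)

# Normalize the column whose type flips between string and list,
# so Arrow sees a single consistent type ("input" is a guess)
for r in records:
    if isinstance(r.get("input"), list):
        r["input"] = "\n".join(map(str, r["input"]))

ds = Dataset.from_list(records)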

I ran the following code and it works:

from datasets import load_dataset
ds = load_dataset("tsdocode/vi_alpaca_clean")

The problem may be your datasets version: update it to the latest release and restart the kernel :slight_smile:
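For example, in a notebook cell:

!pip install -U datasets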

So here is how I got through this: put this at the beginning of the notebook and start a new kernel (instead of restarting the old one):

!pip install datasets==2.13.1

Replace 2.13.1 with whatever the latest version is at the moment.

@npvinHnivqn
I have installed the newest version from Git, but it still doesn't work:
!pip install git+https://github.com/huggingface/datasets#egg=datasets

You must restart the kernel after updating the installation for the update to take effect. (Use import datasets; print(datasets.__version__) to check which version is actually loaded.)
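For example, in a fresh cell after the restart:

import datasets
print(datasets.__version__)  # should match the version you just installed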
