DatasetGenerationError. Failed to parse string: as a scalar of type double

I'm trying to load a dataset I uploaded to Hugging Face using this code: ds = load_dataset("ArchaeonSeq/BooksData", on_bad_lines="skip"), and I keep getting this error:

---------------------------------------------------------------------------
ArrowInvalid                              Traceback (most recent call last)
/usr/local/lib/python3.10/dist-packages/datasets/builder.py in _prepare_split_single(self, gen_kwargs, fpath, file_format, max_shard_size, job_id)
   1869                     try:
-> 1870                         writer.write_table(table)
   1871                     except CastError as cast_error:

/usr/local/lib/python3.10/dist-packages/datasets/arrow_writer.py in write_table(self, pa_table, writer_batch_size)
    621         pa_table = pa_table.combine_chunks()
--> 622         pa_table = table_cast(pa_table, self._schema)
    623         if self.embed_local_files:

/usr/local/lib/python3.10/dist-packages/datasets/table.py in table_cast(table, schema)
   2291     if table.schema != schema:
-> 2292         return cast_table_to_schema(table, schema)
   2293     elif table.schema.metadata != schema.metadata:

/usr/local/lib/python3.10/dist-packages/datasets/table.py in cast_table_to_schema(table, schema)
   2244         )
-> 2245     arrays = [
   2246         cast_array_to_feature(

/usr/local/lib/python3.10/dist-packages/datasets/table.py in <listcomp>(.0)
   2245     arrays = [
-> 2246         cast_array_to_feature(
   2247             table[name] if name in table_column_names else pa.array([None] * len(table), type=schema.field(name).type),

/usr/local/lib/python3.10/dist-packages/datasets/table.py in wrapper(array, *args, **kwargs)
   1794         if isinstance(array, pa.ChunkedArray):
-> 1795             return pa.chunked_array([func(chunk, *args, **kwargs) for chunk in array.chunks])
   1796         else:

/usr/local/lib/python3.10/dist-packages/datasets/table.py in <listcomp>(.0)
   1794         if isinstance(array, pa.ChunkedArray):
-> 1795             return pa.chunked_array([func(chunk, *args, **kwargs) for chunk in array.chunks])
   1796         else:

/usr/local/lib/python3.10/dist-packages/datasets/table.py in cast_array_to_feature(array, feature, allow_primitive_to_str, allow_decimal_to_str)
   2101     elif not isinstance(feature, (Sequence, dict, list, tuple)):
-> 2102         return array_cast(
   2103             array,

/usr/local/lib/python3.10/dist-packages/datasets/table.py in wrapper(array, *args, **kwargs)
   1796         else:
-> 1797             return func(array, *args, **kwargs)
   1798 

/usr/local/lib/python3.10/dist-packages/datasets/table.py in array_cast(array, pa_type, allow_primitive_to_str, allow_decimal_to_str)
   1948             raise TypeError(f"Couldn't cast array of type {_short_str(array.type)} to {_short_str(pa_type)}")
-> 1949         return array.cast(pa_type)
   1950     raise TypeError(f"Couldn't cast array of type {_short_str(array.type)} to {_short_str(pa_type)}")

/usr/local/lib/python3.10/dist-packages/pyarrow/array.pxi in pyarrow.lib.Array.cast()

/usr/local/lib/python3.10/dist-packages/pyarrow/compute.py in cast(arr, target_type, safe, options, memory_pool)
    404             options = CastOptions.safe(target_type)
--> 405     return call_function("cast", [arr], options, memory_pool)
    406 

/usr/local/lib/python3.10/dist-packages/pyarrow/_compute.pyx in pyarrow._compute.call_function()

/usr/local/lib/python3.10/dist-packages/pyarrow/_compute.pyx in pyarrow._compute.Function.call()

/usr/local/lib/python3.10/dist-packages/pyarrow/error.pxi in pyarrow.lib.pyarrow_internal_check_status()

/usr/local/lib/python3.10/dist-packages/pyarrow/error.pxi in pyarrow.lib.check_status()

ArrowInvalid: Failed to parse string: 'B0068HLYU4' as a scalar of type double

The above exception was the direct cause of the following exception:

DatasetGenerationError                    Traceback (most recent call last)
<ipython-input-9-87b6ed75995d> in <cell line: 3>()
      1 from datasets import load_dataset
      2 
----> 3 ds = load_dataset("ArchaeonSeq/BooksData", on_bad_lines="skip")

/usr/local/lib/python3.10/dist-packages/datasets/load.py in load_dataset(path, name, data_dir, data_files, split, cache_dir, features, download_config, download_mode, verification_mode, keep_in_memory, save_infos, revision, token, streaming, num_proc, storage_options, trust_remote_code, **config_kwargs)
   2149 
   2150     # Download and prepare data
-> 2151     builder_instance.download_and_prepare(
   2152         download_config=download_config,
   2153         download_mode=download_mode,

/usr/local/lib/python3.10/dist-packages/datasets/builder.py in download_and_prepare(self, output_dir, download_config, download_mode, verification_mode, dl_manager, base_path, file_format, max_shard_size, num_proc, storage_options, **download_and_prepare_kwargs)
    922                     if num_proc is not None:
    923                         prepare_split_kwargs["num_proc"] = num_proc
--> 924                     self._download_and_prepare(
    925                         dl_manager=dl_manager,
    926                         verification_mode=verification_mode,

/usr/local/lib/python3.10/dist-packages/datasets/builder.py in _download_and_prepare(self, dl_manager, verification_mode, **prepare_split_kwargs)
    998             try:
    999                 # Prepare split will record examples associated to the split
-> 1000                 self._prepare_split(split_generator, **prepare_split_kwargs)
   1001             except OSError as e:
   1002                 raise OSError(

/usr/local/lib/python3.10/dist-packages/datasets/builder.py in _prepare_split(self, split_generator, file_format, num_proc, max_shard_size)
   1739             job_id = 0
   1740             with pbar:
-> 1741                 for job_id, done, content in self._prepare_split_single(
   1742                     gen_kwargs=gen_kwargs, job_id=job_id, **_prepare_split_args
   1743                 ):

/usr/local/lib/python3.10/dist-packages/datasets/builder.py in _prepare_split_single(self, gen_kwargs, fpath, file_format, max_shard_size, job_id)
   1895             if isinstance(e, DatasetGenerationError):
   1896                 raise
-> 1897             raise DatasetGenerationError("An error occurred while generating the dataset") from e
   1898 
   1899         yield job_id, True, (total_num_examples, total_num_bytes, writer._features, num_shards, shard_lengths)

DatasetGenerationError: An error occurred while generating the dataset

Is there a way to get it to change the encoding to string for that column? I don't know why Hugging Face converted it to a double dtype, seeing as it has alphabetic characters in some of its rows. Alternatively, is there a way to get load_dataset to ignore those rows?


Is there a way to get it to change the encoding to string for that column?

If you're importing from CSV, you can specify the schema with features=, but as for changing it after it's already uploaded, I don't know…
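For example, a minimal sketch (the column names here are guesses; replace them with the actual columns in BooksData, forcing the ID column that holds values like 'B0068HLYU4' to string):

```python
from datasets import load_dataset, Features, Value

# Hypothetical schema -- list every column in the CSV and choose its type.
# The important part is declaring the ID column as a string so PyArrow never
# tries to cast values like 'B0068HLYU4' to double.
features = Features({
    "asin": Value("string"),    # hypothetical name of the offending ID column
    "title": Value("string"),
    "price": Value("float64"),
})

ds = load_dataset("ArchaeonSeq/BooksData", features=features)
```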


Yeah, thanks, I was able to figure that out, but I would have greatly preferred to be able to change the actual dataset feature on Hugging Face. Still no idea why it decided to encode it as a float instead.


why it decided to encode it as float instead

I think it probably made that decision based on the first few lines… :sweat_smile:
External libraries (Pandas and PyArrow) are used for parsing CSV and JSON, and that's probably where the type inference happens. Options like on_bad_lines="skip" are also passed straight through to them.
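For what it's worth, here is a small sketch (with made-up data) of how chunked CSV parsing can lock in a numeric schema from the first rows and then choke on a later alphanumeric ID; the chunk size and column name are purely illustrative:

```python
import io
import pandas as pd

# Made-up CSV: the first rows of "id" look numeric (one is even missing),
# so the first chunk is inferred as float64; a later chunk then contains an
# alphanumeric ASIN that can no longer be cast to that schema.
csv_text = "id,title\n0143127551,Book A\n,Book B\nB0068HLYU4,Book C\n"

for i, chunk in enumerate(pd.read_csv(io.StringIO(csv_text), chunksize=2)):
    print(f"chunk {i}: id dtype = {chunk['id'].dtype}")
# chunk 0: id dtype = float64  <- schema inferred from the first rows
# chunk 1: id dtype = object   <- casting this to double raises ArrowInvalid
```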
