python 3.12.12
datasets 4.2.0
Hi Everyone!
I was hoping you could help me understand where this error might be coming from and how to resolve it. Here is a code sample:
from datasets import load_dataset
import json
# Create representative data.
# NOTE(review): the two records do not have matching shapes:
#   - "departments" is an empty list in person1 but a list of strings in person2
#   - "divisions" is a list of strings in person1 but a plain string in person2
# Presumably one of these mismatches is what triggers the cast error reported
# when the files are loaded together -- to be confirmed.
person1 = {
    "employee": {"name": "Janice", "age": 25, "departments": []},
    "divisions": ["DivA", "DivB"],
}
person2 = {
    "employee": {"name": "Jake", "age": 30, "departments": ["IT", "Planning"]},
    "divisions": "DivC",
}

# Write each record to its own JSON file (./person1.json, ./person2.json)
# and remember the paths so they can be handed to the loader.
people = [person1, person2]
paths = []
for counter, person in enumerate(people, start=1):
    path = f"./person{counter}.json"
    paths.append(path)
    with open(path, "w") as f:
        json.dump(person, f)

# Create the Hugging Face dataset from the JSON files written above.
hf_ds = load_dataset("json", data_files=paths)
And here is the traceback:
```
Traceback (most recent call last):
File "/home/dirac/mscrw/.venv/lib/python3.12/site-packages/datasets/builder.py", line 1831, in _prepare_split_single
writer.write_table(table)
File "/home/dirac/mscrw/.venv/lib/python3.12/site-packages/datasets/arrow_writer.py", line 714, in write_table
pa_table = table_cast(pa_table, self._schema)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/dirac/mscrw/.venv/lib/python3.12/site-packages/datasets/table.py", line 2272, in table_cast
return cast_table_to_schema(table, schema)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/dirac/mscrw/.venv/lib/python3.12/site-packages/datasets/table.py", line 2224, in cast_table_to_schema
cast_array_to_feature(
File "/home/dirac/mscrw/.venv/lib/python3.12/site-packages/datasets/table.py", line 1795, in wrapper
return pa.chunked_array([func(chunk, *args, **kwargs) for chunk in array.chunks])
^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/dirac/mscrw/.venv/lib/python3.12/site-packages/datasets/table.py", line 2002, in cast_array_to_feature
_c(array.field(name) if name in array_fields else null_array, subfeature)
File "/home/dirac/mscrw/.venv/lib/python3.12/site-packages/datasets/table.py", line 1797, in wrapper
return func(array, *args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/dirac/mscrw/.venv/lib/python3.12/site-packages/datasets/table.py", line 2052, in cast_array_to_feature
casted_array_values = _c(array.values, feature.feature)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/dirac/mscrw/.venv/lib/python3.12/site-packages/datasets/table.py", line 1797, in wrapper
return func(array, *args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/dirac/mscrw/.venv/lib/python3.12/site-packages/datasets/table.py", line 2086, in cast_array_to_feature
return array_cast(
^^^^^^^^^^^
File "/home/dirac/mscrw/.venv/lib/python3.12/site-packages/datasets/table.py", line 1797, in wrapper
return func(array, *args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/dirac/mscrw/.venv/lib/python3.12/site-packages/datasets/table.py", line 1948, in array_cast
raise TypeError(f"Couldn't cast array of type {_short_str(array.type)} to {_short_str(pa_type)}")
TypeError: Couldn't cast array of type string to null
The above exception was the direct cause of the following exception:
Traceback (most recent call last):
File "/home/dirac/mscrw/tests/hf_ds_test.py", line 26, in <module>
hf_ds = load_dataset("json", data_files=paths)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/dirac/mscrw/.venv/lib/python3.12/site-packages/datasets/load.py", line 1417, in load_dataset
builder_instance.download_and_prepare(
File "/home/dirac/mscrw/.venv/lib/python3.12/site-packages/datasets/builder.py", line 894, in download_and_prepare
self._download_and_prepare(
File "/home/dirac/mscrw/.venv/lib/python3.12/site-packages/datasets/builder.py", line 970, in _download_and_prepare
self._prepare_split(split_generator, **prepare_split_kwargs)
File "/home/dirac/mscrw/.venv/lib/python3.12/site-packages/datasets/builder.py", line 1702, in _prepare_split
for job_id, done, content in self._prepare_split_single(
^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/dirac/mscrw/.venv/lib/python3.12/site-packages/datasets/builder.py", line 1858, in _prepare_split_single
raise DatasetGenerationError("An error occurred while generating the dataset") from e
datasets.exceptions.DatasetGenerationError: An error occurred while generating the dataset
```
I have a hunch about what it could be, but I was hoping to get some feedback here. If you need any more information, just let me know. Thanks in advance for your help!