I’m trying to fine-tune a model using my own data on my Windows machine with WSL (Ubuntu).
In my `app.py` file, I have the following code:
from datasets import load_dataset
dataset = load_dataset("json", data_files="sample.json")
The `sample.json` file is stored locally and looks as follows:
[
{
"instruction": "Some string"
"input": "Some string"
"output": "Some string"
},
{
"instruction": "Some string"
"input": "Some string"
"output": "Some string"
}
]
While running `python app.py`, I keep getting `ArrowInvalid: JSON parse error: Column() changed from object to string in row 0`.
Full error below:
Traceback (most recent call last):
File "/home/ptamzz/anaconda3/envs/unsloth_env/lib/python3.10/site-packages/datasets/packaged_modules/json/json.py", line 130, in _generate_tables
pa_table = paj.read_json(
File "pyarrow/_json.pyx", line 308, in pyarrow._json.read_json
File "pyarrow/error.pxi", line 155, in pyarrow.lib.pyarrow_internal_check_status
File "pyarrow/error.pxi", line 92, in pyarrow.lib.check_status
pyarrow.lib.ArrowInvalid: JSON parse error: Column() changed from object to string in row 0
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/home/ptamzz/anaconda3/envs/unsloth_env/lib/python3.10/site-packages/datasets/builder.py", line 1997, in _prepare_split_single
for _, table in generator:
File "/home/ptamzz/anaconda3/envs/unsloth_env/lib/python3.10/site-packages/datasets/packaged_modules/json/json.py", line 153, in _generate_tables
df = pd.read_json(f, dtype_backend="pyarrow")
File "/home/ptamzz/anaconda3/envs/unsloth_env/lib/python3.10/site-packages/pandas/io/json/_json.py", line 815, in read_json
return json_reader.read()
File "/home/ptamzz/anaconda3/envs/unsloth_env/lib/python3.10/site-packages/pandas/io/json/_json.py", line 1025, in read
obj = self._get_object_parser(self.data)
File "/home/ptamzz/anaconda3/envs/unsloth_env/lib/python3.10/site-packages/pandas/io/json/_json.py", line 1051, in _get_object_parser
obj = FrameParser(json, **kwargs).parse()
File "/home/ptamzz/anaconda3/envs/unsloth_env/lib/python3.10/site-packages/pandas/io/json/_json.py", line 1187, in parse
self._parse()
File "/home/ptamzz/anaconda3/envs/unsloth_env/lib/python3.10/site-packages/pandas/io/json/_json.py", line 1402, in _parse
self.obj = DataFrame(
File "/home/ptamzz/anaconda3/envs/unsloth_env/lib/python3.10/site-packages/pandas/core/frame.py", line 851, in __init__
arrays, columns, index = nested_data_to_arrays(
File "/home/ptamzz/anaconda3/envs/unsloth_env/lib/python3.10/site-packages/pandas/core/internals/construction.py", line 520, in nested_data_to_arrays
arrays, columns = to_arrays(data, columns, dtype=dtype)
File "/home/ptamzz/anaconda3/envs/unsloth_env/lib/python3.10/site-packages/pandas/core/internals/construction.py", line 837, in to_arrays
arr, columns = _list_of_dict_to_arrays(data, columns)
File "/home/ptamzz/anaconda3/envs/unsloth_env/lib/python3.10/site-packages/pandas/core/internals/construction.py", line 918, in _list_of_dict_to_arrays
columns = ensure_index(pre_cols)
File "/home/ptamzz/anaconda3/envs/unsloth_env/lib/python3.10/site-packages/pandas/core/indexes/base.py", line 7647, in ensure_index
return Index(index_like, copy=copy, tupleize_cols=False)
File "/home/ptamzz/anaconda3/envs/unsloth_env/lib/python3.10/site-packages/pandas/core/indexes/base.py", line 565, in __new__
arr = sanitize_array(data, None, dtype=dtype, copy=copy)
File "/home/ptamzz/anaconda3/envs/unsloth_env/lib/python3.10/site-packages/pandas/core/construction.py", line 654, in sanitize_array
subarr = maybe_convert_platform(data)
File "/home/ptamzz/anaconda3/envs/unsloth_env/lib/python3.10/site-packages/pandas/core/dtypes/cast.py", line 139, in maybe_convert_platform
arr = lib.maybe_convert_objects(arr)
File "lib.pyx", line 2538, in pandas._libs.lib.maybe_convert_objects
TypeError: Cannot convert numpy.ndarray to numpy.ndarray
The above exception was the direct cause of the following exception:
Traceback (most recent call last):
File "/home/ptamzz/AI/app.py", line 68, in <module>
dataset = load_dataset("json", data_files="sample.json")
File "/home/ptamzz/anaconda3/envs/unsloth_env/lib/python3.10/site-packages/datasets/load.py", line 2616, in load_dataset
builder_instance.download_and_prepare(
File "/home/ptamzz/anaconda3/envs/unsloth_env/lib/python3.10/site-packages/datasets/builder.py", line 1029, in download_and_prepare
self._download_and_prepare(
File "/home/ptamzz/anaconda3/envs/unsloth_env/lib/python3.10/site-packages/datasets/builder.py", line 1124, in _download_and_prepare
self._prepare_split(split_generator, **prepare_split_kwargs)
File "/home/ptamzz/anaconda3/envs/unsloth_env/lib/python3.10/site-packages/datasets/builder.py", line 1884, in _prepare_split
for job_id, done, content in self._prepare_split_single(
File "/home/ptamzz/anaconda3/envs/unsloth_env/lib/python3.10/site-packages/datasets/builder.py", line 2040, in _prepare_split_single
raise DatasetGenerationError("An error occurred while generating the dataset") from e
datasets.exceptions.DatasetGenerationError: An error occurred while generating the dataset
If I reformat `sample.json` to the following, the error seems to go away:
[
"instruction": "Some string"
"input": "Some string"
"output": "Some string"
]
How can I fix this issue? I’ve run the same code on Google Colab and it seems to work fine.