Error loading a dataset from JSON with datasets

I have an NLP chat dataset formatted like ultrachat_200k.

It looks like this:

[
  {
    "prompt": "",
    "messages": [
      {
        "content": "",
        "role": "user"
      },
      {
        "content": "",
        "role": "assistant"
      },
      {
        "content": "",
        "role": "user"
      },
      {
        "content": "",
        "role": "assistant"
      },
      {
        "content": "",
        "role": "user"
      },
      {
        "content": "",
        "role": "assistant"
      }
    ],
    "prompt_id": 0
  }, …
]

I'm trying to load it like this:

from datasets import load_dataset
dataset = load_dataset("json", data_files="UC_db.json", field="messages")

and I'm getting this error:

---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
File ~/anaconda3/envs/btc2/lib/python3.9/site-packages/datasets/builder.py:1917, in ArrowBasedBuilder._prepare_split_single(self, gen_kwargs, fpath, file_format, max_shard_size, job_id)
   1916 _time = time.time()
-> 1917 for _, table in generator:
   1918     if max_shard_size is not None and writer._num_bytes > max_shard_size:

File ~/anaconda3/envs/btc2/lib/python3.9/site-packages/datasets/packaged_modules/json/json.py:85, in Json._generate_tables(self, files)
     84 # We keep only the field we are interested in
---> 85 dataset = dataset[self.config.field]
     87 # We accept two format: a list of dicts or a dict of lists

TypeError: list indices must be integers or slices, not str

The above exception was the direct cause of the following exception:

DatasetGenerationError                    Traceback (most recent call last)
/home/mustafa/Desktop/LaierTwo/database/create_data_hf.ipynb Cell 2 line 2
      1 from datasets import load_dataset
----> 2 dataset = load_dataset("json", data_files="try11.json", field="messages")

File ~/anaconda3/envs/btc2/lib/python3.9/site-packages/datasets/load.py:2152, in load_dataset(path, name, data_dir, data_files, split, cache_dir, features, download_config, download_mode, verification_mode, ignore_verifications, keep_in_memory, save_infos, revision, token, use_auth_token, task, streaming, num_proc, storage_options, **config_kwargs)
   2149 try_from_hf_gcs = path not in _PACKAGED_DATASETS_MODULES
   2151 # Download and prepare data
-> 2152 builder_instance.download_and_prepare(
   2153     download_config=download_config,
   2154     download_mode=download_mode,
   2155     verification_mode=verification_mode,
   2156     try_from_hf_gcs=try_from_hf_gcs,
   2157     num_proc=num_proc,
   2158     storage_options=storage_options,
   2159 )
   2161 # Build dataset for splits
   2162 keep_in_memory = (
   2163     keep_in_memory if keep_in_memory is not None else is_small_dataset(builder_instance.info.dataset_size)
   2164 )

File ~/anaconda3/envs/btc2/lib/python3.9/site-packages/datasets/builder.py:948, in DatasetBuilder.download_and_prepare(self, output_dir, download_config, download_mode, verification_mode, ignore_verifications, try_from_hf_gcs, dl_manager, base_path, use_auth_token, file_format, max_shard_size, num_proc, storage_options, **download_and_prepare_kwargs)
    946     if num_proc is not None:
    947         prepare_split_kwargs["num_proc"] = num_proc
--> 948     self._download_and_prepare(
    949         dl_manager=dl_manager,
    950         verification_mode=verification_mode,
    951         **prepare_split_kwargs,
    952         **download_and_prepare_kwargs,
    953     )
    954 # Sync info
    955 self.info.dataset_size = sum(split.num_bytes for split in self.info.splits.values())

File ~/anaconda3/envs/btc2/lib/python3.9/site-packages/datasets/builder.py:1043, in DatasetBuilder._download_and_prepare(self, dl_manager, verification_mode, **prepare_split_kwargs)
   1039 split_dict.add(split_generator.split_info)
   1041 try:
   1042     # Prepare split will record examples associated to the split
-> 1043     self._prepare_split(split_generator, **prepare_split_kwargs)
   1044 except OSError as e:
   1045     raise OSError(
   1046         "Cannot find data file. "
   1047         + (self.manual_download_instructions or "")
   1048         + "\nOriginal error:\n"
   1049         + str(e)
   1050     ) from None

File ~/anaconda3/envs/btc2/lib/python3.9/site-packages/datasets/builder.py:1805, in ArrowBasedBuilder._prepare_split(self, split_generator, file_format, num_proc, max_shard_size)
   1803 job_id = 0
   1804 with pbar:
-> 1805     for job_id, done, content in self._prepare_split_single(
   1806         gen_kwargs=gen_kwargs, job_id=job_id, **_prepare_split_args
   1807     ):
   1808         if done:
   1809             result = content

File ~/anaconda3/envs/btc2/lib/python3.9/site-packages/datasets/builder.py:1950, in ArrowBasedBuilder._prepare_split_single(self, gen_kwargs, fpath, file_format, max_shard_size, job_id)
   1948     if isinstance(e, SchemaInferenceError) and e.__context__ is not None:
   1949         e = e.__context__
-> 1950     raise DatasetGenerationError("An error occurred while generating the dataset") from e
   1952 yield job_id, True, (total_num_examples, total_num_bytes, writer._features, num_shards, shard_lengths)

DatasetGenerationError: An error occurred while generating the dataset

What is the problem here?

field only works on JSON objects, and this file is a JSON list. Assuming all of the list's items have the same keys (with messages being one of them), you can avoid the error by loading the file without field and then selecting the messages column:

from datasets import load_dataset
dataset = load_dataset("json", data_files="UC_db.json")
dataset = dataset.select_columns("messages")
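
For reference, field expects the file's top level to be a single JSON object ("We accept two format: a list of dicts or a dict of lists" in the traceback above). If you can re-export the file, a wrapped layout like this would also load directly with field (a minimal sketch; the wrapper key "data" is just an example name):

{
  "data": [
    { "prompt": "", "messages": [ … ], "prompt_id": 0 },
    …
  ]
}

from datasets import load_dataset
dataset = load_dataset("json", data_files="UC_db.json", field="data")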

Thanks mario,

When I try to load the whole dataset this way, it gives me another error. It looks like it tries to convert the first dict in the messages to a double, and I don't know why:

Could not convert "what is your name ?" with type str: tried to convert to double
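
In case it helps narrow this down: that message usually means Arrow inferred one type for a column and then hit a value of a different type. A quick check is to scan the raw file for non-string content values (a minimal sketch, assuming the file parses with the standard json module and has the structure shown above):

import json

with open("UC_db.json") as f:
    data = json.load(f)

# Flag any "content" value that is not a string; mixed types in one
# column can make Arrow's type inference fail like this.
for i, row in enumerate(data):
    for msg in row["messages"]:
        if not isinstance(msg["content"], str):
            print(i, type(msg["content"]), msg["content"])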