I have nlp dataset for chat its formated like ultrachat_200k
its formated like this :
[
{
“prompt”: “”: [
{
“content”: “”,
“role”: “user”
},
{
“content”: “”,
“role”: “assistant”
},
{
“content”: “”,
“role”: “user”
},
{
“content”: “”,
“role”: “assistant”
},
{
“content”: “”,
“role”: “user”
},
{
“content”: “”,
“role”: “assistant”
}
],
“prompt_id”: 0
}, …
]
im trying to load it like this :
from datasets import load_dataset
dataset = load_dataset("json", data_files="UC_db.json", field="messages")
and im getting error :
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
File ~/anaconda3/envs/btc2/lib/python3.9/site-packages/datasets/builder.py:1917, in ArrowBasedBuilder._prepare_split_single(self, gen_kwargs, fpath, file_format, max_shard_size, job_id)
1916 _time = time.time()
-> 1917 for _, table in generator:
1918 if max_shard_size is not None and writer._num_bytes > max_shard_size:
File ~/anaconda3/envs/btc2/lib/python3.9/site-packages/datasets/packaged_modules/json/json.py:85, in Json._generate_tables(self, files)
84 # We keep only the field we are interested in
---> 85 dataset = dataset[self.config.field]
87 # We accept two format: a list of dicts or a dict of lists
TypeError: list indices must be integers or slices, not str
The above exception was the direct cause of the following exception:
DatasetGenerationError Traceback (most recent call last)
/home/mustafa/Desktop/LaierTwo/database/create_data_hf.ipynb Cell 2 line 2
1 from datasets import load_dataset
----> 2 dataset = load_dataset("json", data_files="try11.json", field="messages")
File ~/anaconda3/envs/btc2/lib/python3.9/site-packages/datasets/load.py:2152, in load_dataset(path, name, data_dir, data_files, split, cache_dir, features, download_config, download_mode, verification_mode, ignore_verifications, keep_in_memory, save_infos, revision, token, use_auth_token, task, streaming, num_proc, storage_options, **config_kwargs)
2149 try_from_hf_gcs = path not in _PACKAGED_DATASETS_MODULES
2151 # Download and prepare data
-> 2152 builder_instance.download_and_prepare(
2153 download_config=download_config,
2154 download_mode=download_mode,
2155 verification_mode=verification_mode,
2156 try_from_hf_gcs=try_from_hf_gcs,
2157 num_proc=num_proc,
2158 storage_options=storage_options,
2159 )
2161 # Build dataset for splits
2162 keep_in_memory = (
2163 keep_in_memory if keep_in_memory is not None else is_small_dataset(builder_instance.info.dataset_size)
2164 )
File ~/anaconda3/envs/btc2/lib/python3.9/site-packages/datasets/builder.py:948, in DatasetBuilder.download_and_prepare(self, output_dir, download_config, download_mode, verification_mode, ignore_verifications, try_from_hf_gcs, dl_manager, base_path, use_auth_token, file_format, max_shard_size, num_proc, storage_options, **download_and_prepare_kwargs)
946 if num_proc is not None:
947 prepare_split_kwargs["num_proc"] = num_proc
--> 948 self._download_and_prepare(
949 dl_manager=dl_manager,
950 verification_mode=verification_mode,
951 **prepare_split_kwargs,
952 **download_and_prepare_kwargs,
953 )
954 # Sync info
955 self.info.dataset_size = sum(split.num_bytes for split in self.info.splits.values())
File ~/anaconda3/envs/btc2/lib/python3.9/site-packages/datasets/builder.py:1043, in DatasetBuilder._download_and_prepare(self, dl_manager, verification_mode, **prepare_split_kwargs)
1039 split_dict.add(split_generator.split_info)
1041 try:
1042 # Prepare split will record examples associated to the split
-> 1043 self._prepare_split(split_generator, **prepare_split_kwargs)
1044 except OSError as e:
1045 raise OSError(
1046 "Cannot find data file. "
1047 + (self.manual_download_instructions or "")
1048 + "\nOriginal error:\n"
1049 + str(e)
1050 ) from None
File ~/anaconda3/envs/btc2/lib/python3.9/site-packages/datasets/builder.py:1805, in ArrowBasedBuilder._prepare_split(self, split_generator, file_format, num_proc, max_shard_size)
1803 job_id = 0
1804 with pbar:
-> 1805 for job_id, done, content in self._prepare_split_single(
1806 gen_kwargs=gen_kwargs, job_id=job_id, **_prepare_split_args
1807 ):
1808 if done:
1809 result = content
File ~/anaconda3/envs/btc2/lib/python3.9/site-packages/datasets/builder.py:1950, in ArrowBasedBuilder._prepare_split_single(self, gen_kwargs, fpath, file_format, max_shard_size, job_id)
1948 if isinstance(e, SchemaInferenceError) and e.__context__ is not None:
1949 e = e.__context__
-> 1950 raise DatasetGenerationError("An error occurred while generating the dataset") from e
1952 yield job_id, True, (total_num_examples, total_num_bytes, writer._features, num_shards, shard_lengths)
DatasetGenerationError: An error occurred while generating the dataset
what is the problem here ?