### Describe the bug
unicodedecodeerror: 'utf-8' codec can't decode byte 0xac⌠in position 25: invalid start byte
### Steps to reproduce the bug
```
import sys
sys.getdefaultencoding()
'utf-8'
from datasets import load_dataset
print(f"Train dataset size: {len(dataset['train'])}")
print(f"Test dataset size: {len(dataset['test'])}")
Resolving data files: 100%
159/159 [00:00<00:00, 9909.28it/s]
Using custom data configuration samsum-0b1209637541c9e6
Downloading and preparing dataset json/samsum to C:/Users/Administrator/.cache/huggingface/datasets/json/samsum-0b1209637541c9e6/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51...
Downloading data files: 100%
3/3 [00:00<00:00, 119.99it/s]
Extracting data files: 100%
3/3 [00:00<00:00, 9.54it/s]
Generating train split:
88392/0 [00:15<00:00, 86848.17 examples/s]
Generating test split:
0/0 [00:00<?, ? examples/s]
---------------------------------------------------------------------------
ArrowInvalid Traceback (most recent call last)
File ~\AppData\Local\Programs\Python\Python310\lib\site-packages\datasets\packaged_modules\json\json.py:132, in Json._generate_tables(self, files)
131 try:
--> 132 pa_table = paj.read_json(
133 io.BytesIO(batch), read_options=paj.ReadOptions(block_size=block_size)
134 )
135 break
File ~\AppData\Local\Programs\Python\Python310\lib\site-packages\pyarrow\_json.pyx:290, in pyarrow._json.read_json()
File ~\AppData\Local\Programs\Python\Python310\lib\site-packages\pyarrow\error.pxi:144, in pyarrow.lib.pyarrow_internal_check_status()
File ~\AppData\Local\Programs\Python\Python310\lib\site-packages\pyarrow\error.pxi:100, in pyarrow.lib.check_status()
ArrowInvalid: JSON parse error: Invalid value. in row 0
During handling of the above exception, another exception occurred:
UnicodeDecodeError Traceback (most recent call last)
File ~\AppData\Local\Programs\Python\Python310\lib\site-packages\datasets\builder.py:1819, in ArrowBasedBuilder._prepare_split_single(self, gen_kwargs, fpath, file_format, max_shard_size, job_id)
1818 _time = time.time()
-> 1819 for _, table in generator:
1820 if max_shard_size is not None and writer._num_bytes > max_shard_size:
File ~\AppData\Local\Programs\Python\Python310\lib\site-packages\datasets\packaged_modules\json\json.py:153, in Json._generate_tables(self, files)
152 with open(file, encoding="utf-8") as f:
--> 153 dataset = json.load(f)
154 except json.JSONDecodeError:
File ~\AppData\Local\Programs\Python\Python310\lib\json\__init__.py:293, in load(fp, cls, object_hook, parse_float, parse_int, parse_constant, object_pairs_hook, **kw)
276 """Deserialize ``fp`` (a ``.read()``-supporting file-like object containing
277 a JSON document) to a Python object.
278
(...)
291 kwarg; otherwise ``JSONDecoder`` is used.
292 """
--> 293 return loads(fp.read(),
294 cls=cls, object_hook=object_hook,
295 parse_float=parse_float, parse_int=parse_int,
296 parse_constant=parse_constant, object_pairs_hook=object_pairs_hook, **kw)
File ~\AppData\Local\Programs\Python\Python310\lib\codecs.py:322, in BufferedIncrementalDecoder.decode(self, input, final)
321 data = self.buffer + input
--> 322 (result, consumed) = self._buffer_decode(data, self.errors, final)
323 # keep undecoded input until the next call
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xac in position 25: invalid start byte
The above exception was the direct cause of the following exception:
DatasetGenerationError Traceback (most recent call last)
Cell In[81], line 5
1 from datasets import load_dataset
3 # Load dataset from the hub
4 #dataset = load_dataset("json",data_files="C:/Users/Administrator/Desktop/samsum/samsum/data/corpus/train.json",field="data")
----> 5 dataset = load_dataset('json',"samsum")
6 #dataset = load_dataset("samsum")
7 print(f"Train dataset size: {len(dataset['train'])}")
File ~\AppData\Local\Programs\Python\Python310\lib\site-packages\datasets\load.py:1758, in load_dataset(path, name, data_dir, data_files, split, cache_dir, features, download_config, download_mode, ignore_verifications, keep_in_memory, save_infos, revision, use_auth_token, task, streaming, num_proc, **config_kwargs)
1755 try_from_hf_gcs = path not in _PACKAGED_DATASETS_MODULES
1757 # Download and prepare data
-> 1758 builder_instance.download_and_prepare(
1759 download_config=download_config,
1760 download_mode=download_mode,
1761 ignore_verifications=ignore_verifications,
1762 try_from_hf_gcs=try_from_hf_gcs,
1763 num_proc=num_proc,
1764 )
1766 # Build dataset for splits
1767 keep_in_memory = (
1768 keep_in_memory if keep_in_memory is not None else is_small_dataset(builder_instance.info.dataset_size)
1769 )
File ~\AppData\Local\Programs\Python\Python310\lib\site-packages\datasets\builder.py:860, in DatasetBuilder.download_and_prepare(self, output_dir, download_config, download_mode, ignore_verifications, try_from_hf_gcs, dl_manager, base_path, use_auth_token, file_format, max_shard_size, num_proc, storage_options, **download_and_prepare_kwargs)
858 if num_proc is not None:
859 prepare_split_kwargs["num_proc"] = num_proc
--> 860 self._download_and_prepare(
861 dl_manager=dl_manager,
862 verify_infos=verify_infos,
863 **prepare_split_kwargs,
864 **download_and_prepare_kwargs,
865 )
866 # Sync info
867 self.info.dataset_size = sum(split.num_bytes for split in self.info.splits.values())
File ~\AppData\Local\Programs\Python\Python310\lib\site-packages\datasets\builder.py:953, in DatasetBuilder._download_and_prepare(self, dl_manager, verify_infos, **prepare_split_kwargs)
949 split_dict.add(split_generator.split_info)
951 try:
952 # Prepare split will record examples associated to the split
--> 953 self._prepare_split(split_generator, **prepare_split_kwargs)
954 except OSError as e:
955 raise OSError(
956 "Cannot find data file. "
957 + (self.manual_download_instructions or "")
958 + "\nOriginal error:\n"
959 + str(e)
960 ) from None
File ~\AppData\Local\Programs\Python\Python310\lib\site-packages\datasets\builder.py:1708, in ArrowBasedBuilder._prepare_split(self, split_generator, file_format, num_proc, max_shard_size)
1706 gen_kwargs = split_generator.gen_kwargs
1707 job_id = 0
-> 1708 for job_id, done, content in self._prepare_split_single(
1709 gen_kwargs=gen_kwargs, job_id=job_id, **_prepare_split_args
1710 ):
1711 if done:
1712 result = content
File ~\AppData\Local\Programs\Python\Python310\lib\site-packages\datasets\builder.py:1851, in ArrowBasedBuilder._prepare_split_single(self, gen_kwargs, fpath, file_format, max_shard_size, job_id)
1849 if isinstance(e, SchemaInferenceError) and e.__context__ is not None:
1850 e = e.__context__
-> 1851 raise DatasetGenerationError("An error occurred while generating the dataset") from e
1853 yield job_id, True, (total_num_examples, total_num_bytes, writer._features, num_shards, shard_lengths)
DatasetGenerationError: An error occurred while generating the dataset
```
### Expected behavior
can't load dataset
### Environment info
dataset:samsum
system :win10
gpu:m40 24G