Dataset: raygx/NepaliTextCorpus
All packages and modules are up to date.
Error trace below:
Downloading and preparing dataset json/raygx--NepaliTextCorpus to /root/.cache/huggingface/datasets/raygx___json/raygx--NepaliTextCorpus-172878a4edc47604/0.0.0/fe5dd6ea2639a6df622901539cb550cf8797e5a6b2dd7af1cf934bed8e233e6e...
Downloading data files: 100%
1/1 [00:00<00:00, 69.76it/s]
Extracting data files: 100%
1/1 [00:00<00:00, 58.73it/s]
ValueError                                Traceback (most recent call last)
File /usr/local/lib/python3.8/dist-packages/datasets/builder.py:1875, in ArrowBasedBuilder._prepare_split_single(self, gen_kwargs, fpath, file_format, max_shard_size, job_id)
1868 writer = writer_class(
1869 features=writer._features,
1870 path=fpath.replace("SSSSS", f"{shard_id:05d}").replace("JJJJJ", f"{job_id:05d}"),
(...)
1873 embed_local_files=embed_local_files,
1874 )
--> 1875 writer.write_table(table)
1876 num_examples_progress_update += len(table)
File /usr/local/lib/python3.8/dist-packages/datasets/arrow_writer.py:568, in ArrowWriter.write_table(self, pa_table, writer_batch_size)
567 pa_table = pa_table.combine_chunks()
--> 568 pa_table = table_cast(pa_table, self._schema)
569 if self.embed_local_files:
File /usr/local/lib/python3.8/dist-packages/datasets/table.py:2290, in table_cast(table, schema)
2289 if table.schema != schema:
--> 2290 return cast_table_to_schema(table, schema)
2291 elif table.schema.metadata != schema.metadata:
File /usr/local/lib/python3.8/dist-packages/datasets/table.py:2248, in cast_table_to_schema(table, schema)
2247 if sorted(table.column_names) != sorted(features):
--> 2248 raise ValueError(f"Couldn't cast\n{table.schema}\nto\n{features}\nbecause column names don't match")
2249 arrays = [cast_array_to_feature(table[name], feature) for name, feature in features.items()]
ValueError: Couldn't cast
_data_files: list<item: struct<filename: string>>
child 0, item: struct<filename: string>
child 0, filename: string
_fingerprint: string
_format_columns: null
_format_kwargs: struct<>
_format_type: null
_indexes: struct<>
_output_all_columns: bool
_split: null
to
{'builder_name': Value(dtype='null', id=None), 'citation': Value(dtype='string', id=None), 'config_name': Value(dtype='null', id=None), 'dataset_size': Value(dtype='null', id=None), 'description': Value(dtype='string', id=None), 'download_checksums': Value(dtype='null', id=None), 'download_size': Value(dtype='null', id=None), 'features': {'text': {'dtype': Value(dtype='string', id=None), 'id': Value(dtype='null', id=None), '_type': Value(dtype='string', id=None)}}, 'homepage': Value(dtype='string', id=None), 'license': Value(dtype='string', id=None), 'post_processed': Value(dtype='null', id=None), 'post_processing_size': Value(dtype='null', id=None), 'size_in_bytes': Value(dtype='null', id=None), 'splits': Value(dtype='null', id=None), 'supervised_keys': Value(dtype='null', id=None), 'task_templates': Value(dtype='null', id=None), 'version': Value(dtype='null', id=None)}
because column names don't match
The above exception was the direct cause of the following exception:
DatasetGenerationError                    Traceback (most recent call last)
File :4
File /usr/local/lib/python3.8/dist-packages/datasets/load.py:1791, in load_dataset(path, name, data_dir, data_files, split, cache_dir, features, download_config, download_mode, verification_mode, ignore_verifications, keep_in_memory, save_infos, revision, use_auth_token, task, streaming, num_proc, storage_options, **config_kwargs)
1788 try_from_hf_gcs = path not in _PACKAGED_DATASETS_MODULES
1790 # Download and prepare data
--> 1791 builder_instance.download_and_prepare(
1792 download_config=download_config,
1793 download_mode=download_mode,
1794 verification_mode=verification_mode,
1795 try_from_hf_gcs=try_from_hf_gcs,
1796 num_proc=num_proc,
1797 storage_options=storage_options,
1798 )
1800 # Build dataset for splits
1801 keep_in_memory = (
1802 keep_in_memory if keep_in_memory is not None else is_small_dataset(builder_instance.info.dataset_size)
1803 )
File /usr/local/lib/python3.8/dist-packages/datasets/builder.py:891, in DatasetBuilder.download_and_prepare(self, output_dir, download_config, download_mode, verification_mode, ignore_verifications, try_from_hf_gcs, dl_manager, base_path, use_auth_token, file_format, max_shard_size, num_proc, storage_options, **download_and_prepare_kwargs)
889 if num_proc is not None:
890 prepare_split_kwargs["num_proc"] = num_proc
--> 891 self._download_and_prepare(
892 dl_manager=dl_manager,
893 verification_mode=verification_mode,
894 **prepare_split_kwargs,
895 **download_and_prepare_kwargs,
896 )
897 # Sync info
898 self.info.dataset_size = sum(split.num_bytes for split in self.info.splits.values())
File /usr/local/lib/python3.8/dist-packages/datasets/builder.py:986, in DatasetBuilder._download_and_prepare(self, dl_manager, verification_mode, **prepare_split_kwargs)
982 split_dict.add(split_generator.split_info)
984 try:
985 # Prepare split will record examples associated to the split
--> 986 self._prepare_split(split_generator, **prepare_split_kwargs)
987 except OSError as e:
988 raise OSError(
989 "Cannot find data file. "
990 + (self.manual_download_instructions or "")
991 + "\nOriginal error:\n"
992 + str(e)
993 ) from None
File /usr/local/lib/python3.8/dist-packages/datasets/builder.py:1748, in ArrowBasedBuilder._prepare_split(self, split_generator, file_format, num_proc, max_shard_size)
1746 job_id = 0
1747 with pbar:
--> 1748 for job_id, done, content in self._prepare_split_single(
1749 gen_kwargs=gen_kwargs, job_id=job_id, **_prepare_split_args
1750 ):
1751 if done:
1752 result = content
File /usr/local/lib/python3.8/dist-packages/datasets/builder.py:1893, in ArrowBasedBuilder._prepare_split_single(self, gen_kwargs, fpath, file_format, max_shard_size, job_id)
1891 if isinstance(e, SchemaInferenceError) and e.context is not None:
1892 e = e.context
--> 1893 raise DatasetGenerationError("An error occurred while generating the dataset") from e
1895 yield job_id, True, (total_num_examples, total_num_bytes, writer._features, num_shards, shard_lengths)
DatasetGenerationError: An error occurred while generating the dataset