Cannot load Conll2003

I am trying to load the conll2003 dataset the basic way I learned, like this:

from datasets import load_dataset
dataset = load_dataset("conll2003")

But I am running into this error:

---------------------------------------------------------------------------
RuntimeError                              Traceback (most recent call last)
Cell In[15], line 3
      1 from datasets import load_dataset
----> 3 dataset = load_dataset("conll2003")

File ~/.local/lib/python3.12/site-packages/datasets/load.py:1397, in load_dataset(path, name, data_dir, data_files, split, cache_dir, features, download_config, download_mode, verification_mode, keep_in_memory, save_infos, revision, token, streaming, num_proc, storage_options, **config_kwargs)
   1392 verification_mode = VerificationMode(
   1393     (verification_mode or VerificationMode.BASIC_CHECKS) if not save_infos else VerificationMode.ALL_CHECKS
   1394 )
   1396 # Create a dataset builder
-> 1397 builder_instance = load_dataset_builder(
   1398     path=path,
   1399     name=name,
   1400     data_dir=data_dir,
   1401     data_files=data_files,
   1402     cache_dir=cache_dir,
   1403     features=features,
   1404     download_config=download_config,
   1405     download_mode=download_mode,
   1406     revision=revision,
   1407     token=token,
   1408     storage_options=storage_options,
   1409     **config_kwargs,
   1410 )
   1412 # Return iterable dataset in case of streaming
   1413 if streaming:

File ~/.local/lib/python3.12/site-packages/datasets/load.py:1137, in load_dataset_builder(path, name, data_dir, data_files, cache_dir, features, download_config, download_mode, revision, token, storage_options, **config_kwargs)
   1135 if features is not None:
   1136     features = _fix_for_backward_compatible_features(features)
-> 1137 dataset_module = dataset_module_factory(
   1138     path,
   1139     revision=revision,
   1140     download_config=download_config,
   1141     download_mode=download_mode,
   1142     data_dir=data_dir,
   1143     data_files=data_files,
   1144     cache_dir=cache_dir,
   1145 )
   1146 # Get dataset builder class
   1147 builder_kwargs = dataset_module.builder_kwargs

File ~/.local/lib/python3.12/site-packages/datasets/load.py:1036, in dataset_module_factory(path, revision, download_config, download_mode, data_dir, data_files, cache_dir, **download_kwargs)
   1031             if isinstance(e1, FileNotFoundError):
   1032                 raise FileNotFoundError(
   1033                     f"Couldn't find any data file at {relative_to_absolute_path(path)}. "
   1034                     f"Couldn't find '{path}' on the Hugging Face Hub either: {type(e1).__name__}: {e1}"
   1035                 ) from None
-> 1036             raise e1 from None
   1037 else:
   1038     raise FileNotFoundError(f"Couldn't find any data file at {relative_to_absolute_path(path)}.")

File ~/.local/lib/python3.12/site-packages/datasets/load.py:994, in dataset_module_factory(path, revision, download_config, download_mode, data_dir, data_files, cache_dir, **download_kwargs)
    986 try:
    987     api.hf_hub_download(
    988         repo_id=path,
    989         filename=filename,
   (...)
    992         proxies=download_config.proxies,
    993     )
--> 994     raise RuntimeError(f"Dataset scripts are no longer supported, but found {filename}")
    995 except EntryNotFoundError:
    996     # Use the infos from the parquet export except in some cases:
    997     if data_dir or data_files or (revision and revision != "main"):

RuntimeError: Dataset scripts are no longer supported, but found conll2003.py

Could someone tell me what is wrong?

1 Like

Try:

from datasets import load_dataset
dataset = load_dataset("lhoestq/conll2003")

This is because support for trust_remote_code=True was removed in version 4.0.0 of the datasets library, and the original conll2003 repository still relies on a builder script (conll2003.py, as the error message shows). You can work around this by using datasets that don’t rely on builder scripts (like the one shown above) or by downgrading the datasets library to version 3.6.0 or earlier.

1 Like

That works, thank you.
That’s interesting. So I assume support for loading scripts has also been removed, which means that if I want to upload a custom dataset, I will need to manually convert it into a DatasetDict and push it using that class.

1 Like

This topic was automatically closed 12 hours after the last reply. New replies are no longer allowed.