KeyError: 'length' when using load_dataset on SageMaker

Hello everyone,

I am currently working on fine-tuning an object detection model in SageMaker Studio, but I cannot load my dataset with load_dataset. I looked into "KeyError Length during training following workshop MLOps" (Issue #12 in philschmid/huggingface-sagemaker-workshop-series on GitHub) and tried pip install datasets==1.18.4, but that does not work for me.

My environment:

Python 3.8.10
PyTorch 1.10.2+cpu

My requirements.txt

transformers==4.17
datasets==1.18.4
huggingface_hub
evaluate
timm
albumentations
wandb
sagemaker
ipywidgets==7.0

My code:

from datasets import load_dataset

huggingface_dataset = 'oschan77/algae_data_v0'
algae_dataset = load_dataset(huggingface_dataset, split='train', use_auth_token=True)
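
For reference, here is a quick way to check which datasets version the notebook kernel is actually using (just a sanity check on my side, since requirements.txt pins 1.18.4):

import datasets
print(datasets.__version__)  # expected to print 1.18.4 if the pinned install took effect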

The traceback I received:

KeyError                                  Traceback (most recent call last)
<ipython-input-16-e654f64ea3fe> in <module>
      8 
      9 # Load the HuggingFace dataset split called 'train'
---> 10 algae_dataset = load_dataset(huggingface_dataset, split='train', use_auth_token=True)
     11 
     12 # Set up an S3 filesystem

/opt/conda/lib/python3.8/site-packages/datasets/load.py in load_dataset(path, name, data_dir, data_files, split, cache_dir, features, download_config, download_mode, ignore_verifications, keep_in_memory, save_infos, revision, use_auth_token, task, streaming, script_version, **config_kwargs)
   1700 
   1701     # Download and prepare data
-> 1702     builder_instance.download_and_prepare(
   1703         download_config=download_config,
   1704         download_mode=download_mode,

/opt/conda/lib/python3.8/site-packages/datasets/builder.py in download_and_prepare(self, download_config, download_mode, ignore_verifications, try_from_hf_gcs, dl_manager, base_path, use_auth_token, **download_and_prepare_kwargs)
    592                             logger.warning("HF google storage unreachable. Downloading and preparing it from source")
    593                     if not downloaded_from_gcs:
--> 594                         self._download_and_prepare(
    595                             dl_manager=dl_manager, verify_infos=verify_infos, **download_and_prepare_kwargs
    596                         )

/opt/conda/lib/python3.8/site-packages/datasets/builder.py in _download_and_prepare(self, dl_manager, verify_infos, **prepare_split_kwargs)
    681             try:
    682                 # Prepare split will record examples associated to the split
--> 683                 self._prepare_split(split_generator, **prepare_split_kwargs)
    684             except OSError as e:
    685                 raise OSError(

/opt/conda/lib/python3.8/site-packages/datasets/builder.py in _prepare_split(self, split_generator)
   1134                 generator, unit=" tables", leave=False, disable=True  # bool(logging.get_verbosity() == logging.NOTSET)
   1135             ):
-> 1136                 writer.write_table(table)
   1137             num_examples, num_bytes = writer.finalize()
   1138 

/opt/conda/lib/python3.8/site-packages/datasets/arrow_writer.py in write_table(self, pa_table, writer_batch_size)
    508             writer_batch_size = self.writer_batch_size
    509         if self.pa_writer is None:
--> 510             self._build_writer(inferred_schema=pa_table.schema)
    511         pa_table = table_cast(pa_table, self._schema)
    512         batches: List[pa.RecordBatch] = pa_table.to_batches(max_chunksize=writer_batch_size)

/opt/conda/lib/python3.8/site-packages/datasets/arrow_writer.py in _build_writer(self, inferred_schema)
    342     def _build_writer(self, inferred_schema: pa.Schema):
    343         schema = self.schema
--> 344         inferred_features = Features.from_arrow_schema(inferred_schema)
    345         if self._features is not None:
    346             if self.update_features:  # keep original features it they match, or update them

/opt/conda/lib/python3.8/site-packages/datasets/features/features.py in from_arrow_schema(cls, pa_schema)
   1240             metadata = json.loads(pa_schema.metadata["huggingface".encode("utf-8")].decode())
   1241             if "info" in metadata and "features" in metadata["info"] and metadata["info"]["features"] is not None:
-> 1242                 return Features.from_dict(metadata["info"]["features"])
   1243         obj = {field.name: generate_from_arrow_type(field.type) for field in pa_schema}
   1244         return cls(**obj)

/opt/conda/lib/python3.8/site-packages/datasets/features/features.py in from_dict(cls, dic)
   1269             {'_type': Value(dtype='string', id=None)}
   1270         """
-> 1271         obj = generate_from_dict(dic)
   1272         return cls(**obj)
   1273 

/opt/conda/lib/python3.8/site-packages/datasets/features/features.py in generate_from_dict(obj)
   1074     # Otherwise we have a dict or a dataclass
   1075     if "_type" not in obj or isinstance(obj["_type"], dict):
-> 1076         return {key: generate_from_dict(value) for key, value in obj.items()}
   1077     class_type = globals()[obj.pop("_type")]
   1078 

/opt/conda/lib/python3.8/site-packages/datasets/features/features.py in <dictcomp>(.0)
   1074     # Otherwise we have a dict or a dataclass
   1075     if "_type" not in obj or isinstance(obj["_type"], dict):
-> 1076         return {key: generate_from_dict(value) for key, value in obj.items()}
   1077     class_type = globals()[obj.pop("_type")]
   1078 

/opt/conda/lib/python3.8/site-packages/datasets/features/features.py in generate_from_dict(obj)
   1074     # Otherwise we have a dict or a dataclass
   1075     if "_type" not in obj or isinstance(obj["_type"], dict):
-> 1076         return {key: generate_from_dict(value) for key, value in obj.items()}
   1077     class_type = globals()[obj.pop("_type")]
   1078 

/opt/conda/lib/python3.8/site-packages/datasets/features/features.py in <dictcomp>(.0)
   1074     # Otherwise we have a dict or a dataclass
   1075     if "_type" not in obj or isinstance(obj["_type"], dict):
-> 1076         return {key: generate_from_dict(value) for key, value in obj.items()}
   1077     class_type = globals()[obj.pop("_type")]
   1078 

/opt/conda/lib/python3.8/site-packages/datasets/features/features.py in generate_from_dict(obj)
   1078 
   1079     if class_type == Sequence:
-> 1080         return Sequence(feature=generate_from_dict(obj["feature"]), length=obj["length"])
   1081 
   1082     field_names = {f.name for f in fields(class_type)}

KeyError: 'length'

I would greatly appreciate any guidance or advice on how to resolve this issue. Thank you very much in advance for your help!

Is the error appearing in the notebook or in the SageMaker training job?

It appears in the notebook. I have not started the SageMaker training job yet; the error happens before that. Thanks!

Try installing datasets 2.12.0 (a recent version).
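
For example, a minimal sketch in a Studio notebook cell (assuming the notebook kernel is the environment that needs the upgrade; restart the kernel after installing, before re-running the cell):

%pip install -U datasets==2.12.0   # upgrade the datasets library in the notebook environment

# after restarting the kernel:
from datasets import load_dataset
algae_dataset = load_dataset('oschan77/algae_data_v0', split='train', use_auth_token=True)

If the SageMaker training job later loads the same dataset, the datasets pin in requirements.txt would presumably need to be bumped to match.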