Hello everyone,
I am currently fine-tuning an object detection model in SageMaker Studio, but I cannot load my dataset with load_dataset. I looked into "KeyError Length during training following workshop MLOps" (Issue #12 · philschmid/huggingface-sagemaker-workshop-series · GitHub) and tried pinning datasets==1.18.4 with pip, but that did not fix it for me.
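This is roughly the cell I used to apply the pin in the Studio notebook (the exact invocation may have differed slightly):
# Pin datasets to the version suggested in the linked GitHub issue
%pip install "datasets==1.18.4"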
My environment:
Python 3.8.10
PyTorch 1.10.2+cpu
My requirements.txt:
transformers==4.17
datasets==1.18.4
huggingface_hub
evaluate
timm
albumentations
wandb
sagemaker
ipywidgets==7.0
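For completeness, this is a rough sketch of how I sanity-check which versions are actually active in the running kernel (output omitted here):
# Check the versions the kernel actually sees after installing the requirements
import sys
import datasets
import transformers
print(sys.version)            # Python version of the kernel
print(datasets.__version__)   # expected to be 1.18.4 after the pin
print(transformers.__version__)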
My code:
from datasets import load_dataset
# Dataset repo on the Hugging Face Hub
huggingface_dataset = 'oschan77/algae_data_v0'
# Load the 'train' split, authenticating with my Hub token
algae_dataset = load_dataset(huggingface_dataset, split='train', use_auth_token=True)
The traceback I received:
KeyError Traceback (most recent call last)
<ipython-input-16-e654f64ea3fe> in <module>
8
9 # Load the HuggingFace dataset split called 'train'
---> 10 algae_dataset = load_dataset(huggingface_dataset, split='train', use_auth_token=True)
11
12 # Set up an S3 filesystem
/opt/conda/lib/python3.8/site-packages/datasets/load.py in load_dataset(path, name, data_dir, data_files, split, cache_dir, features, download_config, download_mode, ignore_verifications, keep_in_memory, save_infos, revision, use_auth_token, task, streaming, script_version, **config_kwargs)
1700
1701 # Download and prepare data
-> 1702 builder_instance.download_and_prepare(
1703 download_config=download_config,
1704 download_mode=download_mode,
/opt/conda/lib/python3.8/site-packages/datasets/builder.py in download_and_prepare(self, download_config, download_mode, ignore_verifications, try_from_hf_gcs, dl_manager, base_path, use_auth_token, **download_and_prepare_kwargs)
592 logger.warning("HF google storage unreachable. Downloading and preparing it from source")
593 if not downloaded_from_gcs:
--> 594 self._download_and_prepare(
595 dl_manager=dl_manager, verify_infos=verify_infos, **download_and_prepare_kwargs
596 )
/opt/conda/lib/python3.8/site-packages/datasets/builder.py in _download_and_prepare(self, dl_manager, verify_infos, **prepare_split_kwargs)
681 try:
682 # Prepare split will record examples associated to the split
--> 683 self._prepare_split(split_generator, **prepare_split_kwargs)
684 except OSError as e:
685 raise OSError(
/opt/conda/lib/python3.8/site-packages/datasets/builder.py in _prepare_split(self, split_generator)
1134 generator, unit=" tables", leave=False, disable=True # bool(logging.get_verbosity() == logging.NOTSET)
1135 ):
-> 1136 writer.write_table(table)
1137 num_examples, num_bytes = writer.finalize()
1138
/opt/conda/lib/python3.8/site-packages/datasets/arrow_writer.py in write_table(self, pa_table, writer_batch_size)
508 writer_batch_size = self.writer_batch_size
509 if self.pa_writer is None:
--> 510 self._build_writer(inferred_schema=pa_table.schema)
511 pa_table = table_cast(pa_table, self._schema)
512 batches: List[pa.RecordBatch] = pa_table.to_batches(max_chunksize=writer_batch_size)
/opt/conda/lib/python3.8/site-packages/datasets/arrow_writer.py in _build_writer(self, inferred_schema)
342 def _build_writer(self, inferred_schema: pa.Schema):
343 schema = self.schema
--> 344 inferred_features = Features.from_arrow_schema(inferred_schema)
345 if self._features is not None:
346 if self.update_features: # keep original features it they match, or update them
/opt/conda/lib/python3.8/site-packages/datasets/features/features.py in from_arrow_schema(cls, pa_schema)
1240 metadata = json.loads(pa_schema.metadata["huggingface".encode("utf-8")].decode())
1241 if "info" in metadata and "features" in metadata["info"] and metadata["info"]["features"] is not None:
-> 1242 return Features.from_dict(metadata["info"]["features"])
1243 obj = {field.name: generate_from_arrow_type(field.type) for field in pa_schema}
1244 return cls(**obj)
/opt/conda/lib/python3.8/site-packages/datasets/features/features.py in from_dict(cls, dic)
1269 {'_type': Value(dtype='string', id=None)}
1270 """
-> 1271 obj = generate_from_dict(dic)
1272 return cls(**obj)
1273
/opt/conda/lib/python3.8/site-packages/datasets/features/features.py in generate_from_dict(obj)
1074 # Otherwise we have a dict or a dataclass
1075 if "_type" not in obj or isinstance(obj["_type"], dict):
-> 1076 return {key: generate_from_dict(value) for key, value in obj.items()}
1077 class_type = globals()[obj.pop("_type")]
1078
/opt/conda/lib/python3.8/site-packages/datasets/features/features.py in <dictcomp>(.0)
1074 # Otherwise we have a dict or a dataclass
1075 if "_type" not in obj or isinstance(obj["_type"], dict):
-> 1076 return {key: generate_from_dict(value) for key, value in obj.items()}
1077 class_type = globals()[obj.pop("_type")]
1078
/opt/conda/lib/python3.8/site-packages/datasets/features/features.py in generate_from_dict(obj)
1074 # Otherwise we have a dict or a dataclass
1075 if "_type" not in obj or isinstance(obj["_type"], dict):
-> 1076 return {key: generate_from_dict(value) for key, value in obj.items()}
1077 class_type = globals()[obj.pop("_type")]
1078
/opt/conda/lib/python3.8/site-packages/datasets/features/features.py in <dictcomp>(.0)
1074 # Otherwise we have a dict or a dataclass
1075 if "_type" not in obj or isinstance(obj["_type"], dict):
-> 1076 return {key: generate_from_dict(value) for key, value in obj.items()}
1077 class_type = globals()[obj.pop("_type")]
1078
/opt/conda/lib/python3.8/site-packages/datasets/features/features.py in generate_from_dict(obj)
1078
1079 if class_type == Sequence:
-> 1080 return Sequence(feature=generate_from_dict(obj["feature"]), length=obj["length"])
1081
1082 field_names = {f.name for f in fields(class_type)}
KeyError: 'length'
I would greatly appreciate any guidance or advice on how to resolve this issue. Thank you very much in advance for your help!