I’m trying to create a loading script from a tfrecord file. I follow the tutorials and tried with the following code, my issue is in the yield line of the _generate_examples function
import tensorflow as tf
import numpy as np
import datasets
_DESCRIPTION = (
"This dataset consists 90k images of Chest-X-Ray from the Mimic-CXR dataset."
"For each image, we have a consise report obtain from de PRO-CXR dataset"
"All images have a size of 512x512 pixels."
)
_BASE_URL = "https://drive.google.com/file/d/1u27GCgIIRqDz8a5-VTcMJ1pEFQbGv_QB/view?usp=sharing"
FEATURE_DESCRIPTION_TFRECORD = {
'report': tf.io.FixedLenFeature([], tf.string),
'image': tf.io.FixedLenFeature([], tf.string),
}
def _parse_image_function(example_proto):
# Parse the input tf.train.Example proto using the dictionary above.
return tf.io.parse_single_example(example_proto, FEATURE_DESCRIPTION_TFRECORD)
class ReportsCXR(datasets.GeneratorBasedBuilder):
def _info(self):
return datasets.DatasetInfo(
description = _DESCRIPTION,
features = datasets.Features({
'image': datasets.Image(),
'report': datasets.Value(dtype='string')
})
)
def _get_drive_url(self, url):
base_url = 'https://drive.google.com/uc?id='
split_url = url.split('/')
return base_url + split_url[5]
def _split_generators(self, dl_manager):
archive_path = dl_manager.download(self._get_drive_url(_BASE_URL))
return [
datasets.SplitGenerator(name='full', gen_kwargs={'filepath': archive_path})
]
def _generate_examples(self, filepath):
raw_image_dataset = tf.data.TFRecordDataset(filepath)
parsed_image_dataset = raw_image_dataset.map(_parse_image_function)
for image_features in parsed_image_dataset:
image_raw = image_features['image'].numpy()
yield {'image': image_raw, 'report': image_features['report']}
Generating full split: 0 examples [00:00, ? examples/s]Traceback (most recent call last):
File "/home/djm/Documents/Projects/Medical Imaging Jax Diffusers/code/medical-stable-diffusion/.conda/lib/python3.10/site-packages/datasets/builder.py", line 1627, in _prepare_split_single
example = self.info.features.encode_example(record) if self.info.features is not None else record
File "/home/djm/Documents/Projects/Medical Imaging Jax Diffusers/code/medical-stable-diffusion/.conda/lib/python3.10/site-packages/datasets/features/features.py", line 1813, in encode_example
return encode_nested_example(self, example)
File "/home/djm/Documents/Projects/Medical Imaging Jax Diffusers/code/medical-stable-diffusion/.conda/lib/python3.10/site-packages/datasets/features/features.py", line 1212, in encode_nested_example
{
File "/home/djm/Documents/Projects/Medical Imaging Jax Diffusers/code/medical-stable-diffusion/.conda/lib/python3.10/site-packages/datasets/features/features.py", line 1212, in <dictcomp>
{
File "/home/djm/Documents/Projects/Medical Imaging Jax Diffusers/code/medical-stable-diffusion/.conda/lib/python3.10/site-packages/datasets/utils/py_utils.py", line 302, in zip_dict
yield key, tuple(d[key] for d in dicts)
File "/home/djm/Documents/Projects/Medical Imaging Jax Diffusers/code/medical-stable-diffusion/.conda/lib/python3.10/site-packages/datasets/utils/py_utils.py", line 302, in <genexpr>
yield key, tuple(d[key] for d in dicts)
TypeError: string indices must be integers