I am creating a new image dataset for the purpose of fine-tuning a model.
I have a folder called images which contains all the images. I have compressed it as:
“images.tar.gz”
I also have a JSON file containing the COCO-format object classification labels for the images. I have compressed it as:
“metadata.tar.gz”
Below is my dataset builder:
# Write all DEBUG-and-above log records to a local file for troubleshooting.
logging.basicConfig(filename='debug.log', level=logging.DEBUG)
# Base URL of the Hugging Face dataset repo hosting the archives.
# NOTE(review): this constant is never used — _split_generators hard-codes
# full URLs to a different repo (receipts_ko); confirm which repo is intended.
_URL = "https://huggingface.co/datasets/minseochh02/image-demo/resolve/main"
class ImageDatasetBuilder(datasets.GeneratorBasedBuilder):
    """Builder for a COCO-style image dataset.

    Downloads two tar.gz archives (images + COCO JSON metadata) and yields
    one example per image, with its annotations grouped under ``objects``.
    """

    def _info(self):
        """Return the dataset metadata and feature schema.

        A 4-element bbox is a fixed-length 1-D sequence; ``Array2D`` requires
        a two-dimensional shape (e.g. ``(None, 4)``) and is invalid for a
        flat ``[x, y, w, h]`` list, which is what COCO annotations contain.
        """
        return datasets.DatasetInfo(
            features=datasets.Features(
                {
                    "image_id": datasets.Value("int32"),
                    "image": datasets.Image(),
                    "file_name": datasets.Value("string"),
                    "height": datasets.Value("int32"),
                    "width": datasets.Value("int32"),
                    "objects": datasets.Sequence(
                        {
                            "id": datasets.Value("int32"),
                            "area": datasets.Value("int32"),
                            # COCO bbox is [x, y, width, height]: 4 floats.
                            "bbox": datasets.Sequence(
                                datasets.Value("float32"), length=4
                            ),
                            "category_id": datasets.Value("int32"),
                        }
                    ),
                }
            ),
            supervised_keys=None,
            homepage="URL to your dataset's homepage",
            citation=_CITATION,
        )

    def _split_generators(self, dl_manager):
        """Download/extract both archives and declare the TRAIN split.

        Args:
            dl_manager: the ``datasets`` download manager.

        Returns:
            A single-element list with the TRAIN ``SplitGenerator``.
        """
        urls_to_download = {
            "json_file": "https://huggingface.co/datasets/minseochh02/receipts_ko/resolve/main/metadata.tar.gz?download=true",
            "images": "https://huggingface.co/datasets/minseochh02/receipts_ko/resolve/main/images.tar.gz?download=true",
        }
        downloaded_files = dl_manager.download_and_extract(urls_to_download)
        return [
            datasets.SplitGenerator(
                name=datasets.Split.TRAIN,
                gen_kwargs={
                    # os.path.join instead of string '+' so separators are
                    # handled portably.
                    "json_filepath": os.path.join(
                        downloaded_files["json_file"], "20240418_RECEIPTS_COCO.json"
                    ),
                    "images_dir": os.path.join(downloaded_files["images"], "images"),
                },
            )
        ]

    def _generate_examples(self, json_filepath, images_dir):
        """Yield ``(key, example)`` pairs from the COCO JSON and image dir.

        Args:
            json_filepath: path to the extracted COCO JSON file.
            images_dir: directory containing the extracted image files.

        Yields:
            ``(image_id, example_dict)`` for every image whose file exists.
        """
        with open(json_filepath, "r", encoding="utf-8") as f:
            data = json.load(f)

        # Group annotations by image_id once (O(n + m)) instead of rescanning
        # the full annotation list for every image (O(n * m)).
        annotations_by_image = {}
        for annotation in data["annotations"]:
            annotations_by_image.setdefault(annotation["image_id"], []).append(
                {
                    "id": annotation["id"],
                    "area": annotation["area"],
                    "bbox": annotation["bbox"],
                    "category_id": annotation["category_id"],
                }
            )

        for image_info in data["images"]:
            image_id = image_info["id"]
            file_name = image_info["file_name"]
            image_path = os.path.join(images_dir, file_name)
            if not os.path.exists(image_path):
                logging.error(f"Image file not found: {image_path}")
                continue  # Skip this image if the file does not exist
            # Yield the file path: datasets.Image() decodes lazily, so there
            # is no need to open the image with PIL / convert to a NumPy
            # array (the original did both, twice, loading every image into
            # memory during generation).
            yield image_id, {
                "image_id": image_id,
                "image": image_path,
                "file_name": file_name,
                "height": image_info["height"],
                "width": image_info["width"],
                "objects": annotations_by_image.get(image_id, []),
            }
When I try to build the dataset with:
# Set up file-based debug logging.
logging.basicConfig(filename='debug.log', level=logging.DEBUG)

# Initialize before the try block: if load_dataset raises, `dataset` would
# otherwise be unbound and the final logging.debug call would raise a
# NameError of its own.
dataset = None
try:
    dataset = load_dataset("minseochh02/receipts_ko", split="train")
except Exception:
    # logging.exception records the message plus the full traceback.
    logging.exception("Exception occurred")

# Log the dataset (None if loading failed).
logging.debug(f"Dataset: {dataset}")
I get an error as following:
ERROR:root:Exception occurred
Traceback (most recent call last):
File "/Users/vickycha/Library/Python/3.9/lib/python/site-packages/datasets/builder.py", line 1726, in _prepare_split_single
for key, record in generator:
File "/Users/vickycha/Library/Python/3.9/lib/python/site-packages/datasets/packaged_modules/webdataset/webdataset.py", line 109, in _generate_examples
example[field_name] = {"path": example["__key__"] + "." + field_name, "bytes": example[field_name]}
KeyError: '/images/sample1.jpeg'
DEBUG:urllib3.connectionpool:Resetting dropped connection: huggingface.co
DEBUG:urllib3.connectionpool:https://huggingface.co:443 "GET /api/datasets/minseochh02/receipts_ko HTTP/1.1" 200 452
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): s3.amazonaws.com:443
DEBUG:urllib3.connectionpool:https://s3.amazonaws.com:443 "HEAD /datasets.huggingface.co/datasets/datasets/minseochh02/receipts_ko/minseochh02/receipts_ko.py HTTP/1.1" 404 0
DEBUG:urllib3.connectionpool:https://huggingface.co:443 "GET /api/datasets/minseochh02/receipts_ko HTTP/1.1" 200 452
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): huggingface.co:443
DEBUG:urllib3.connectionpool:https://huggingface.co:443 "HEAD /datasets/minseochh02/receipts_ko/resolve/9782c77835476efeea37168ce0b4c8d7612de81c/README.md HTTP/1.1" 404 0
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): huggingface.co:443
DEBUG:urllib3.connectionpool:https://huggingface.co:443 "HEAD /datasets/minseochh02/receipts_ko/resolve/9782c77835476efeea37168ce0b4c8d7612de81c/.huggingface.yaml HTTP/1.1" 404 0
DEBUG:urllib3.connectionpool:https://huggingface.co:443 "GET /api/datasets/minseochh02/receipts_ko/tree/9782c77835476efeea37168ce0b4c8d7612de81c/data?recursive=False&expand=False HTTP/1.1" 404 79
DEBUG:urllib3.connectionpool:https://huggingface.co:443 "GET /api/datasets/minseochh02/receipts_ko/tree/9782c77835476efeea37168ce0b4c8d7612de81c/data?recursive=False&expand=False HTTP/1.1" 404 79
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): huggingface.co:443
DEBUG:urllib3.connectionpool:https://huggingface.co:443 "HEAD /datasets/minseochh02/receipts_ko/resolve/9782c77835476efeea37168ce0b4c8d7612de81c/dataset_infos.json HTTP/1.1" 404 0
DEBUG:filelock:Attempting to acquire lock 4890159616 on /Users/vickycha/.cache/huggingface/datasets/_Users_vickycha_.cache_huggingface_datasets_minseochh02___receipts_ko_default_0.0.0_9782c77835476efeea37168ce0b4c8d7612de81c.lock
DEBUG:filelock:Lock 4890159616 acquired on /Users/vickycha/.cache/huggingface/datasets/_Users_vickycha_.cache_huggingface_datasets_minseochh02___receipts_ko_default_0.0.0_9782c77835476efeea37168ce0b4c8d7612de81c.lock
DEBUG:filelock:Attempting to release lock 4890159616 on /Users/vickycha/.cache/huggingface/datasets/_Users_vickycha_.cache_huggingface_datasets_minseochh02___receipts_ko_default_0.0.0_9782c77835476efeea37168ce0b4c8d7612de81c.lock
DEBUG:filelock:Lock 4890159616 released on /Users/vickycha/.cache/huggingface/datasets/_Users_vickycha_.cache_huggingface_datasets_minseochh02___receipts_ko_default_0.0.0_9782c77835476efeea37168ce0b4c8d7612de81c.lock
DEBUG:filelock:Attempting to acquire lock 4890125216 on /Users/vickycha/.cache/huggingface/datasets/minseochh02___receipts_ko/default/0.0.0/9782c77835476efeea37168ce0b4c8d7612de81c_builder.lock
DEBUG:filelock:Lock 4890125216 acquired on /Users/vickycha/.cache/huggingface/datasets/minseochh02___receipts_ko/default/0.0.0/9782c77835476efeea37168ce0b4c8d7612de81c_builder.lock
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): storage.googleapis.com:443
DEBUG:urllib3.connectionpool:https://storage.googleapis.com:443 "HEAD /huggingface-nlp/cache/datasets/minseochh02___receipts_ko/default/0.0.0/dataset_info.json HTTP/1.1" 404 0
DEBUG:fsspec.local:open file: /Users/vickycha/.cache/huggingface/datasets/minseochh02___receipts_ko/default/0.0.0/9782c77835476efeea37168ce0b4c8d7612de81c.incomplete/receipts_ko-train-00000-00000-of-NNNNN.arrow
DEBUG:filelock:Attempting to release lock 4890125216 on /Users/vickycha/.cache/huggingface/datasets/minseochh02___receipts_ko/default/0.0.0/9782c77835476efeea37168ce0b4c8d7612de81c_builder.lock
DEBUG:filelock:Lock 4890125216 released on /Users/vickycha/.cache/huggingface/datasets/minseochh02___receipts_ko/default/0.0.0/9782c77835476efeea37168ce0b4c8d7612de81c_builder.lock
ERROR:root:Exception occurred
Traceback (most recent call last):
File "/Users/vickycha/Library/Python/3.9/lib/python/site-packages/datasets/builder.py", line 1726, in _prepare_split_single
for key, record in generator:
File "/Users/vickycha/Library/Python/3.9/lib/python/site-packages/datasets/packaged_modules/webdataset/webdataset.py", line 109, in _generate_examples
example[field_name] = {"path": example["__key__"] + "." + field_name, "bytes": example[field_name]}
KeyError: '/images/sample1.jpeg'
The above exception was the direct cause of the following exception:
Traceback (most recent call last):
File "/var/folders/d_/9kbv1x3j5r90450mn9fw7rbm0000gn/T/ipykernel_1031/3551765180.py", line 8, in <module>
dataset = load_dataset("minseochh02/receipts_ko", split="train")
File "/Users/vickycha/Library/Python/3.9/lib/python/site-packages/datasets/load.py", line 2582, in load_dataset
builder_instance.download_and_prepare(
File "/Users/vickycha/Library/Python/3.9/lib/python/site-packages/datasets/builder.py", line 1005, in download_and_prepare
self._download_and_prepare(
File "/Users/vickycha/Library/Python/3.9/lib/python/site-packages/datasets/builder.py", line 1767, in _download_and_prepare
super()._download_and_prepare(
File "/Users/vickycha/Library/Python/3.9/lib/python/site-packages/datasets/builder.py", line 1100, in _download_and_prepare
self._prepare_split(split_generator, **prepare_split_kwargs)
File "/Users/vickycha/Library/Python/3.9/lib/python/site-packages/datasets/builder.py", line 1605, in _prepare_split
for job_id, done, content in self._prepare_split_single(
File "/Users/vickycha/Library/Python/3.9/lib/python/site-packages/datasets/builder.py", line 1762, in _prepare_split_single
raise DatasetGenerationError("An error occurred while generating the dataset") from e
datasets.exceptions.DatasetGenerationError: An error occurred while generating the dataset
DEBUG:root:Dataset: None
I don’t understand why I am getting a KeyError, since nowhere in my code do I retrieve a value from an object using the key “/images/sample1.jpeg”.