Handling non-existent URLs in an image dataset when using cast_column

I am trying to convert the URLs in an image dataset to PIL images. On the Hugging Face platform I can see both the image and the URL, and since I only want the image and not the URL, I tried to download the dataset and use cast_column to convert it as follows:

from datasets import load_dataset
from datasets import Dataset
import datasets
# Load the first 30,000 rows of the "train" split, then re-type the "URL"
# column as an Image feature in a single chained expression.
dataset = (
    load_dataset("mcemilg/laion2B-multi-turkish-subset", split="train[:30000]")
    .cast_column("URL", datasets.Image())
)

# Upload the resulting dataset to the Hub under the target repository id.
dataset.push_to_hub("umarigan/clip_dataset")

Since some URLs don't exist anymore, it returns an error. I couldn't find a way to quickly eliminate such URLs for a bigger image dataset. The error I am facing is as follows:

---------------------------------------------------------------------------
FileNotFoundError                         Traceback (most recent call last)
<ipython-input-19-6c0f55a01058> in <cell line: 1>()
----> 1 dataset['train'][8]

11 frames
/usr/local/lib/python3.10/dist-packages/datasets/arrow_dataset.py in __getitem__(self, key)
   2798     def __getitem__(self, key):  # noqa: F811
   2799         """Can be used to index columns (by string names) or rows (by integer index or iterable of indices or bools)."""
-> 2800         return self._getitem(key)
   2801 
   2802     def __getitems__(self, keys: List) -> List:

/usr/local/lib/python3.10/dist-packages/datasets/arrow_dataset.py in _getitem(self, key, **kwargs)
   2783         formatter = get_formatter(format_type, features=self._info.features, **format_kwargs)
   2784         pa_subtable = query_table(self._data, key, indices=self._indices if self._indices is not None else None)
-> 2785         formatted_output = format_table(
   2786             pa_subtable, key, formatter=formatter, format_columns=format_columns, output_all_columns=output_all_columns
   2787         )

/usr/local/lib/python3.10/dist-packages/datasets/formatting/formatting.py in format_table(table, key, formatter, format_columns, output_all_columns)
    627     python_formatter = PythonFormatter(features=formatter.features)
    628     if format_columns is None:
--> 629         return formatter(pa_table, query_type=query_type)
    630     elif query_type == "column":
    631         if key in format_columns:

/usr/local/lib/python3.10/dist-packages/datasets/formatting/formatting.py in __call__(self, pa_table, query_type)
    394     def __call__(self, pa_table: pa.Table, query_type: str) -> Union[RowFormat, ColumnFormat, BatchFormat]:
    395         if query_type == "row":
--> 396             return self.format_row(pa_table)
    397         elif query_type == "column":
    398             return self.format_column(pa_table)

/usr/local/lib/python3.10/dist-packages/datasets/formatting/formatting.py in format_row(self, pa_table)
    435             return LazyRow(pa_table, self)
    436         row = self.python_arrow_extractor().extract_row(pa_table)
--> 437         row = self.python_features_decoder.decode_row(row)
    438         return row
    439 

/usr/local/lib/python3.10/dist-packages/datasets/formatting/formatting.py in decode_row(self, row)
    213 
    214     def decode_row(self, row: dict) -> dict:
--> 215         return self.features.decode_example(row) if self.features else row
    216 
    217     def decode_column(self, column: list, column_name: str) -> list:

/usr/local/lib/python3.10/dist-packages/datasets/features/features.py in decode_example(self, example, token_per_repo_id)
   1927         """
   1928 
-> 1929         return {
   1930             column_name: decode_nested_example(feature, value, token_per_repo_id=token_per_repo_id)
   1931             if self._column_requires_decoding[column_name]

/usr/local/lib/python3.10/dist-packages/datasets/features/features.py in <dictcomp>(.0)
   1928 
   1929         return {
-> 1930             column_name: decode_nested_example(feature, value, token_per_repo_id=token_per_repo_id)
   1931             if self._column_requires_decoding[column_name]
   1932             else value

/usr/local/lib/python3.10/dist-packages/datasets/features/features.py in decode_nested_example(schema, obj, token_per_repo_id)
   1337         # we pass the token to read and decode files from private repositories in streaming mode
   1338         if obj is not None and schema.decode:
-> 1339             return schema.decode_example(obj, token_per_repo_id=token_per_repo_id)
   1340     return obj
   1341 

/usr/local/lib/python3.10/dist-packages/datasets/features/image.py in decode_example(self, value, token_per_repo_id)
    178                         token = None
    179                     download_config = DownloadConfig(token=token)
--> 180                     with xopen(path, "rb", download_config=download_config) as f:
    181                         bytes_ = BytesIO(f.read())
    182                     image = PIL.Image.open(bytes_)

/usr/local/lib/python3.10/dist-packages/datasets/download/streaming_download_manager.py in xopen(file, mode, download_config, *args, **kwargs)
    504     kwargs = {**kwargs, **(storage_options or {})}
    505     try:
--> 506         file_obj = fsspec.open(file, mode=mode, *args, **kwargs).open()
    507     except ValueError as e:
    508         if str(e) == "Cannot seek streaming HTTP file":

/usr/local/lib/python3.10/dist-packages/fsspec/core.py in open(urlpath, mode, compression, encoding, errors, protocol, newline, **kwargs)
    449     )
    450     if not out:
--> 451         raise FileNotFoundError(urlpath)
    452     return out[0]
    453 

FileNotFoundError: https://i0.wp.com/webadubradio.fr/wp-content/uploads/2018/12/webadubradio.fr-ogc-nice-antillais.png?resize=364%2C205&amp;ssl=1

I want to be able to eliminate URLs that don't exist or can't be cast to the Image type.
datasets version: 2.16.1
Environment: Google Colab

Hi! You can use this code to download the images while ignoring non-existent URLs.

1 Like

Thanks Mario, this solution works. I actually needed to learn the term "Conceptual Captions".