Problem with loading custom dataset from jsonl file

Hi. I created a simple test dataset in a JSONL file and tried to load it.

from datasets import load_dataset
dataset = load_dataset("json", data_files="test_dataset.jsonl", field="label")

dataset

But I get an error:

JSONDecodeError                           Traceback (most recent call last)
File ~/.local/lib/python3.9/site-packages/datasets/builder.py:1860, in ArrowBasedBuilder._prepare_split_single(self, gen_kwargs, fpath, file_format, max_shard_size, job_id)
   1859 _time = time.time()
-> 1860 for _, table in generator:
   1861     if max_shard_size is not None and writer._num_bytes > max_shard_size:

File ~/.local/lib/python3.9/site-packages/datasets/packaged_modules/json/json.py:80, in _generate_tables(self, files)
     79 with open(file, encoding="utf-8") as f:
---> 80     dataset = json.load(f)

File /usr/lib/python3.9/json/__init__.py:293, in load(fp, cls, object_hook, parse_float, parse_int, parse_constant, object_pairs_hook, **kw)
    276 """Deserialize ``fp`` (a ``.read()``-supporting file-like object containing
    277 a JSON document) to a Python object.
    278 
   (...)
    291 kwarg; otherwise ``JSONDecoder`` is used.
    292 """
--> 293 return loads(fp.read(),
    294     cls=cls, object_hook=object_hook,
    295     parse_float=parse_float, parse_int=parse_int,
    296     parse_constant=parse_constant, object_pairs_hook=object_pairs_hook, **kw)

File /usr/lib/python3.9/json/__init__.py:346, in loads(s, cls, object_hook, parse_float, parse_int, parse_constant, object_pairs_hook, **kw)
    343 if (cls is None and object_hook is None and
    344         parse_int is None and parse_float is None and
    345         parse_constant is None and object_pairs_hook is None and not kw):
--> 346     return _default_decoder.decode(s)
    347 if cls is None:

File /usr/lib/python3.9/json/decoder.py:340, in JSONDecoder.decode(self, s, _w)
    339 if end != len(s):
--> 340     raise JSONDecodeError("Extra data", s, end)
    341 return obj

JSONDecodeError: Extra data: line 2 column 1 (char 233)

The above exception was the direct cause of the following exception:

DatasetGenerationError                    Traceback (most recent call last)
Cell In[21], line 2
      1 from datasets import load_dataset
----> 2 dataset = load_dataset("json", data_files="test_dataset.jsonl", field="label")
      4 dataset

File ~/.local/lib/python3.9/site-packages/datasets/load.py:1791, in load_dataset(path, name, data_dir, data_files, split, cache_dir, features, download_config, download_mode, verification_mode, ignore_verifications, keep_in_memory, save_infos, revision, use_auth_token, task, streaming, num_proc, storage_options, **config_kwargs)
   1788 try_from_hf_gcs = path not in _PACKAGED_DATASETS_MODULES
   1790 # Download and prepare data
-> 1791 builder_instance.download_and_prepare(
   1792     download_config=download_config,
   1793     download_mode=download_mode,
   1794     verification_mode=verification_mode,
   1795     try_from_hf_gcs=try_from_hf_gcs,
   1796     num_proc=num_proc,
   1797     storage_options=storage_options,
   1798 )
   1800 # Build dataset for splits
   1801 keep_in_memory = (
   1802     keep_in_memory if keep_in_memory is not None else is_small_dataset(builder_instance.info.dataset_size)
   1803 )

File ~/.local/lib/python3.9/site-packages/datasets/builder.py:891, in DatasetBuilder.download_and_prepare(self, output_dir, download_config, download_mode, verification_mode, ignore_verifications, try_from_hf_gcs, dl_manager, base_path, use_auth_token, file_format, max_shard_size, num_proc, storage_options, **download_and_prepare_kwargs)
    889     if num_proc is not None:
    890         prepare_split_kwargs["num_proc"] = num_proc
--> 891     self._download_and_prepare(
    892         dl_manager=dl_manager,
    893         verification_mode=verification_mode,
    894         **prepare_split_kwargs,
    895         **download_and_prepare_kwargs,
    896     )
    897 # Sync info
    898 self.info.dataset_size = sum(split.num_bytes for split in self.info.splits.values())

File ~/.local/lib/python3.9/site-packages/datasets/builder.py:986, in DatasetBuilder._download_and_prepare(self, dl_manager, verification_mode, **prepare_split_kwargs)
    982 split_dict.add(split_generator.split_info)
    984 try:
    985     # Prepare split will record examples associated to the split
--> 986     self._prepare_split(split_generator, **prepare_split_kwargs)
    987 except OSError as e:
    988     raise OSError(
    989         "Cannot find data file. "
    990         + (self.manual_download_instructions or "")
    991         + "\nOriginal error:\n"
    992         + str(e)
    993     ) from None

File ~/.local/lib/python3.9/site-packages/datasets/builder.py:1748, in ArrowBasedBuilder._prepare_split(self, split_generator, file_format, num_proc, max_shard_size)
   1746 job_id = 0
   1747 with pbar:
-> 1748     for job_id, done, content in self._prepare_split_single(
   1749         gen_kwargs=gen_kwargs, job_id=job_id, **_prepare_split_args
   1750     ):
   1751         if done:
   1752             result = content

File ~/.local/lib/python3.9/site-packages/datasets/builder.py:1893, in ArrowBasedBuilder._prepare_split_single(self, gen_kwargs, fpath, file_format, max_shard_size, job_id)
   1891     if isinstance(e, SchemaInferenceError) and e.__context__ is not None:
   1892         e = e.__context__
-> 1893     raise DatasetGenerationError("An error occurred while generating the dataset") from e
   1895 yield job_id, True, (total_num_examples, total_num_bytes, writer._features, num_shards, shard_lengths)

DatasetGenerationError: An error occurred while generating the dataset

My file is very small - 11 KB. Link to the file: test_dataset.jsonl - Google Drive

What am I doing wrong? And how can I fix this?

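I think your problem has to do with the fact that your “label” field in the JSONL is a list of lists, where the elements in the inner lists are (sometimes) a mix of strings and integers. (Note also that the traceback itself goes through json.load: with field="label" the loader parses the whole file as a single JSON document, which fails at the start of line 2 of a JSON Lines file with “Extra data”. The field argument is meant for a single JSON document whose records are nested under that key, not for selecting a column.)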

For example, the first line of the file has the following label: [[0, 7, 0], [8, 11, 0], [12, 23, "B-LOC"],...], which is problematic because it combines numbers with strings like “B-LOC”.

If you really need those numbers you can simply map them to strings, so that your labels are a list of lists of strings. For instance:

import pandas as pd
from datasets import Dataset

def int2str(labels):
    """Return a copy of ``labels`` in which every non-string entry is a string.

    ``labels`` is an iterable of iterables, e.g.
    ``[[0, 7, 0], [12, 23, "B-LOC"]]``. The result is a list of lists in the
    same order: entries that are already ``str`` are kept unchanged, and all
    other entries are converted with ``str()``.
    """
    return [
        [entry if isinstance(entry, str) else str(entry) for entry in row]
        for row in labels
    ]

# Read the JSON Lines file (one JSON object per line) into a DataFrame.
df = pd.read_json("test_dataset.jsonl", lines=True)
# Normalise the "label" column so that every inner entry is a string.
# Applying int2str to the column itself avoids building a Series for each
# row, which is what DataFrame.apply(..., axis=1) does.
df["label"] = df["label"].apply(int2str)
# Build the Dataset from the cleaned DataFrame and show its summary.
dataset = Dataset.from_pandas(df)
print(dataset)
2 Likes