Add a new column to a dataset

My dataset has 5,000,000 rows, and I would like to add a column called 'embeddings' to it:

dataset = dataset.add_column('embeddings', embeddings)

The variable embeddings is a NumPy memmap array of shape (5000000, 512).

But I get this error:

ArrowInvalid Traceback (most recent call last)
in
----> 1 dataset = dataset.add_column('embeddings', embeddings)

/opt/conda/lib/python3.8/site-packages/datasets/arrow_dataset.py in wrapper(*args, **kwargs)
486 }
487 # apply actual function
---> 488 out: Union["Dataset", "DatasetDict"] = func(self, *args, **kwargs)
489 datasets: List["Dataset"] = list(out.values()) if isinstance(out, dict) else [out]
490 # re-apply format to the output

/opt/conda/lib/python3.8/site-packages/datasets/fingerprint.py in wrapper(*args, **kwargs)
404 # Call actual function
405
---> 406 out = func(self, *args, **kwargs)
407
408 # Update fingerprint of in-place transforms + update in-place history of transforms

/opt/conda/lib/python3.8/site-packages/datasets/arrow_dataset.py in add_column(self, name, column, new_fingerprint)
3346 :class:`Dataset`
3347 """
---> 3348 column_table = InMemoryTable.from_pydict({name: column})
3349 # Concatenate tables horizontally
3350 table = ConcatenationTable.from_tables([self._data, column_table], axis=1)

/opt/conda/lib/python3.8/site-packages/datasets/table.py in from_pydict(cls, *args, **kwargs)
367 @classmethod
368 def from_pydict(cls, *args, **kwargs):
---> 369 return cls(pa.Table.from_pydict(*args, **kwargs))
370
371 @inject_arrow_table_documentation(pa.Table.from_batches)

/opt/conda/lib/python3.8/site-packages/pyarrow/table.pxi in pyarrow.lib.Table.from_pydict()

/opt/conda/lib/python3.8/site-packages/pyarrow/table.pxi in pyarrow.lib._from_pydict()

/opt/conda/lib/python3.8/site-packages/pyarrow/array.pxi in pyarrow.lib.asarray()

/opt/conda/lib/python3.8/site-packages/pyarrow/array.pxi in pyarrow.lib.array()

/opt/conda/lib/python3.8/site-packages/pyarrow/array.pxi in pyarrow.lib._ndarray_to_array()

/opt/conda/lib/python3.8/site-packages/pyarrow/error.pxi in pyarrow.lib.check_status()

ArrowInvalid: only handle 1-dimensional arrays

How can I solve this?

Hi,

it should work if you use concatenate_datasets instead. add_column passes the column straight to pa.Table.from_pydict, which only handles 1-dimensional arrays (that's the ArrowInvalid in your traceback), while Dataset.from_dict can handle multi-dimensional NumPy arrays:

import datasets
dset_embed = datasets.Dataset.from_dict({"embeddings": embeddings})
dset_concat = datasets.concatenate_datasets([dset, dset_embed], axis=1)
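For reference, here is a self-contained toy version of the same pattern (the small sizes and column names are just for illustration, not from your setup):

import numpy as np
import datasets

# toy stand-ins: a 3-row dataset and a (3, 4) embedding matrix
dset = datasets.Dataset.from_dict({"text": ["a", "b", "c"]})
embeddings = np.random.rand(3, 4).astype(np.float32)

# from_dict converts the 2D array into a list-of-lists column
dset_embed = datasets.Dataset.from_dict({"embeddings": embeddings})
# axis=1 concatenates horizontally, so both datasets must have the same number of rows
dset_concat = datasets.concatenate_datasets([dset, dset_embed], axis=1)
print(dset_concat.column_names)  # ['text', 'embeddings']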

I also have the problem that the array 'embeddings' does not fit in RAM, so I suspect the method you're proposing is not actually feasible.

I tried it anyway and got this error:

ArrowInvalid Traceback (most recent call last)
in
1 import datasets
----> 2 dataset_embed = datasets.Dataset.from_dict({"embeddings": embeddings})

/opt/conda/lib/python3.8/site-packages/datasets/arrow_dataset.py in from_dict(cls, mapping, features, info, split)
783 for col, data in mapping.items()
784 }
---> 785 pa_table = InMemoryTable.from_pydict(mapping=mapping)
786 return cls(pa_table, info=info, split=split)
787

/opt/conda/lib/python3.8/site-packages/datasets/table.py in from_pydict(cls, *args, **kwargs)
367 @classmethod
368 def from_pydict(cls, *args, **kwargs):
---> 369 return cls(pa.Table.from_pydict(*args, **kwargs))
370
371 @inject_arrow_table_documentation(pa.Table.from_batches)

/opt/conda/lib/python3.8/site-packages/pyarrow/table.pxi in pyarrow.lib.Table.from_pydict()

/opt/conda/lib/python3.8/site-packages/pyarrow/table.pxi in pyarrow.lib._from_pydict()

/opt/conda/lib/python3.8/site-packages/pyarrow/array.pxi in pyarrow.lib.asarray()

/opt/conda/lib/python3.8/site-packages/pyarrow/array.pxi in pyarrow.lib.array()

/opt/conda/lib/python3.8/site-packages/pyarrow/array.pxi in pyarrow.lib._handle_arrow_array_protocol()

/opt/conda/lib/python3.8/site-packages/datasets/arrow_writer.py in arrow_array(self, type)
111 out = pa.ExtensionArray.from_storage(type, storage)
112 elif isinstance(self.data, np.ndarray):
---> 113 out = numpy_to_pyarrow_listarray(self.data)
114 if type is not None:
115 out = out.cast(type)

/opt/conda/lib/python3.8/site-packages/datasets/features/features.py in numpy_to_pyarrow_listarray(arr, type)
921 n_offsets = reduce(mul, arr.shape[: arr.ndim - i - 1], 1)
922 step_offsets = arr.shape[arr.ndim - i - 1]
---> 923 offsets = pa.array(np.arange(n_offsets + 1) * step_offsets, type=pa.int32())
924 values = pa.ListArray.from_arrays(offsets, values)
925 return values

/opt/conda/lib/python3.8/site-packages/pyarrow/array.pxi in pyarrow.lib.array()

/opt/conda/lib/python3.8/site-packages/pyarrow/array.pxi in pyarrow.lib._ndarray_to_array()

/opt/conda/lib/python3.8/site-packages/pyarrow/error.pxi in pyarrow.lib.check_status()

ArrowInvalid: Integer value 2147483648 not in range: -2147483648 to 2147483647

Hi! I'm not sure whether datasets can convert a memory-mapped NumPy array to an Arrow array without loading it into RAM (I haven't tested this).

However, the error you're getting looks more like an integer overflow issue: the list offsets in the traceback are built with pa.int32(), and your array has 5,000,000 × 512 = 2,560,000,000 elements, which exceeds the int32 maximum of 2,147,483,647. Maybe you could try chunking your embeddings, building a dataset object per chunk, and concatenating them to get the dataset with all the embeddings.
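Here is a rough, untested sketch of that chunking idea (chunk_size is an arbitrary value I picked, and dset stands for your original 5,000,000-row dataset):

import datasets

# embeddings: the np.memmap of shape (5000000, 512) from above
chunk_size = 100_000  # arbitrary; keeps each chunk's element count well below 2**31 - 1

embed_chunks = [
    datasets.Dataset.from_dict({"embeddings": embeddings[i : i + chunk_size]})
    for i in range(0, len(embeddings), chunk_size)
]
# axis=0 (the default) stacks the chunks vertically; axis=1 then attaches the column
dset_embed = datasets.concatenate_datasets(embed_chunks)
dset_concat = datasets.concatenate_datasets([dset, dset_embed], axis=1)

Note that each chunk is still converted to an in-memory Arrow table, so this works around the int32 offset overflow but may not solve the RAM issue on its own.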