I get the error `ValueError: Unable to avoid copy while creating an array as requested`
when using `.set_format(type=torch)`.
I’ve triple-checked my formatting against the tutorials and cannot figure it out.
This is how I’m processing my data:
# not sure why I need this
# Explicit column schema handed to `dataset.map(..., features=features)` below.
# `labels` (used for the ClassLabel names) must already be defined at this point.
features = Features(
    {
        # Variable-length integer sequences.
        "input_ids": Sequence(feature=Value(dtype="int64")),
        "attention_mask": Sequence(feature=Value(dtype="int64")),
        # Fixed-shape arrays: a (512, 4) int64 array and a (3, 224, 224) float32 array.
        "bbox": Array2D(shape=(512, 4), dtype="int64"),
        "pixel_values": Array3D(shape=(3, 224, 224), dtype="float32"),
        # Class label whose names come from `labels`.
        "labels": ClassLabel(names=labels, num_classes=len(labels)),
        "image_path": Value(dtype="string"),
    }
)
# Function to process a batch of examples
def process_batch(batch):
    """Load the images named in ``batch['image_path']`` and encode them.

    Each path is resolved relative to ``root + 'images/'``, opened with PIL,
    converted to RGB, and the resulting list is passed to ``processor`` with
    ``padding="max_length"`` and ``truncation=True``.

    Returns whatever ``processor`` returns; nothing else is added to it.

    NOTE(review): the ``features`` schema in this file declares ``labels`` and
    ``image_path`` columns, but the lines that would add them here are
    commented out -- confirm whether that is intentional.
    """
    rgb_images = []
    for rel_path in batch['image_path']:
        picture = Image.open(root + 'images/' + rel_path)
        rgb_images.append(picture.convert("RGB"))

    encoded_inputs = processor(rgb_images, padding="max_length", truncation=True)

    # Label / path passthrough is currently disabled:
    # encoded_inputs['labels'] = [label for label in batch["label"]]
    # encoded_inputs['image_path'] = [path for path in batch['image_path']]
    return encoded_inputs
# Process each split in the dataset
for split in ('test', 'val', 'train'):
    # Space-separated label file with no header row: "<image_path> <label>".
    labels_data = pd.read_table(
        root + f'labels/{split}.txt',
        sep=" ",
        header=None,
        names=["image_path", "label"],
    )

    # Only the first two rows are used here.
    dataset = Dataset.from_pandas(labels_data.head(2))

    # Encode in batches of two, dropping the original columns and writing the
    # result with the explicit `features` schema.
    processed_split = dataset.map(
        process_batch,
        batched=True,
        batch_size=2,
        remove_columns=dataset.column_names,
        features=features,
    )

    processed_split.save_to_disk(f"layoutlmv3_processed_rvl_cdip_{split}")
And this is the full traceback:
Traceback (most recent call last):
File "/path/to/project/layoutlmv3train.py", line 24, in <module>
batch = next(iter(train_loader))
File "/path/to/project/venv/lib/python3.10/site-packages/torch/utils/data/dataloader.py", line 631, in __next__
data = self._next_data()
File "/path/to/project/venv/lib/python3.10/site-packages/torch/utils/data/dataloader.py", line 675, in _next_data
data = self._dataset_fetcher.fetch(index) # may raise StopIteration
File "/path/to/project/venv/lib/python3.10/site-packages/torch/utils/data/_utils/fetch.py", line 49, in fetch
data = self.dataset.__getitems__(possibly_batched_index)
File "/path/to/project/venv/lib/python3.10/site-packages/datasets/arrow_dataset.py", line 2870, in __getitems__
batch = self.__getitem__(keys)
File "/path/to/project/venv/lib/python3.10/site-packages/datasets/arrow_dataset.py", line 2866, in __getitem__
return self._getitem(key)
File "/path/to/project/venv/lib/python3.10/site-packages/datasets/arrow_dataset.py", line 2851, in _getitem
formatted_output = format_table(
File "/path/to/project/venv/lib/python3.10/site-packages/datasets/formatting/formatting.py", line 641, in format_table
formatted_output = formatter(pa_table_to_format, query_type=query_type)
File "/path/to/project/venv/lib/python3.10/site-packages/datasets/formatting/formatting.py", line 401, in __call__
return self.format_batch(pa_table)
File "/path/to/project/venv/lib/python3.10/site-packages/datasets/formatting/torch_formatter.py", line 110, in format_batch
batch = self.numpy_arrow_extractor().extract_batch(pa_table)
File "/path/to/project/venv/lib/python3.10/site-packages/datasets/formatting/formatting.py", line 165, in extract_batch
return {col: self._arrow_array_to_numpy(pa_table[col]) for col in pa_table.column_names}
File "/path/to/project/venv/lib/python3.10/site-packages/datasets/formatting/formatting.py", line 165, in <dictcomp>
return {col: self._arrow_array_to_numpy(pa_table[col]) for col in pa_table.column_names}
File "/path/to/project/venv/lib/python3.10/site-packages/datasets/formatting/formatting.py", line 197, in _arrow_array_to_numpy
return np.array(array, copy=False)
ValueError: Unable to avoid copy while creating an array as requested.
If using `np.array(obj, copy=False)` replace it with `np.asarray(obj)` to allow a copy when needed (no behavior change in NumPy 1.x).
For more details, see https://numpy.org/devdocs/numpy_2_0_migration_guide.html#adapting-to-changes-in-the-copy-keyword.