I am attempting to fine tune the layoutlmv3-base model and am running into issues when trying to use the model to encode. Any help would be useful.
Here is my code:
from transformers import AutoProcessor
processor = AutoProcessor.from_pretrained("microsoft/layoutlmv3-base", apply_ocr=False)
def convert_ner_tags_to_id(ner_tags):
return [label2id[ner_tag] for ner_tag in ner_tags]
def prepare_dataset(annotations):
images = annotations['image']
boxes = annotations['bboxes']
words = []
for i in range (len(boxes)):
words.append(" ")
ner_tags = [convert_ner_tags_to_id(ner_tags) for ner_tags in annotations['ner_tags']]
encoding = processor(images, words, boxes=boxes, word_labels=ner_tags, truncation=True, padding="max_length")
return encoding
from datasets import Features, Sequence, ClassLabel, Value, Array2D, Array3D
features = Features({
'pixel_values': Array3D(dtype="float32", shape=(3, 224, 224)),
'input_ids': Sequence(feature=Value(dtype='int64')),
'attention_mask': Sequence(Value(dtype='int64')),
'bbox': Array2D(dtype="int64", shape=(512, 4)),
'labels': Sequence(feature=Value(dtype='int64')),
})
train_dataset = layoutlm_ds["train"].map(
prepare_dataset,
batched=True,
remove_columns=column_names,
features=features,
)
eval_dataset = layoutlm_ds["test"].map(
prepare_dataset,
batched=True,
remove_columns=column_names,
features=features,
)
And here is the error that I am getting:
TypeError: Couldn't cast array of type
int64
to
Sequence(feature=Value(dtype='int64', id=None), length=-1, id=None)
File <command-1408067255887537>, line 19
4 features = Features({
5 # 'pixel_values': Array3D(dtype="float32", shape=(3, 224, 224)),
6 # 'input_ids': Value("int64"),
(...)
16 'labels': Sequence(feature=Value(dtype='int64')),
17 })
18 # Prepare our train & eval dataset
---> 19 train_dataset = layoutlm_ds["train"].map(
20 prepare_dataset,
21 batched=True,
22 remove_columns=column_names,
23 features=features,
24 )
25 eval_dataset = layoutlm_ds["test"].map(
26 prepare_dataset,
27 batched=True,
28 remove_columns=column_names,
29 features=features,
30 )
File /local_disk0/.ephemeral_nfs/cluster_libraries/python/lib/python3.10/site-packages/datasets/arrow_dataset.py:602, in transmit_tasks.<locals>.wrapper(*args, **kwargs)
600 self: "Dataset" = kwargs.pop("self")
601 # apply actual function
--> 602 out: Union["Dataset", "DatasetDict"] = func(self, *args, **kwargs)
603 datasets: List["Dataset"] = list(out.values()) if isinstance(out, dict) else [out]
604 for dataset in datasets:
605 # Remove task templates if a column mapping of the template is no longer valid
File /local_disk0/.ephemeral_nfs/cluster_libraries/python/lib/python3.10/site-packages/datasets/arrow_dataset.py:567, in transmit_format.<locals>.wrapper(*args, **kwargs)
560 self_format = {
561 "type": self._format_type,
562 "format_kwargs": self._format_kwargs,
563 "columns": self._format_columns,
564 "output_all_columns": self._output_all_columns,
565 }
566 # apply actual function
--> 567 out: Union["Dataset", "DatasetDict"] = func(self, *args, **kwargs)
568 datasets: List["Dataset"] = list(out.values()) if isinstance(out, dict) else [out]
569 # re-apply format to the output
File /local_disk0/.ephemeral_nfs/cluster_libraries/python/lib/python3.10/site-packages/datasets/arrow_dataset.py:3161, in Dataset.map(self, function, with_indices, with_rank, input_columns, batched, batch_size, drop_last_batch, remove_columns, keep_in_memory, load_from_cache_file, cache_file_name, writer_batch_size, features, disable_nullable, fn_kwargs, num_proc, suffix_template, new_fingerprint, desc)
3155 if transformed_dataset is None:
3156 with hf_tqdm(
3157 unit=" examples",
3158 total=pbar_total,
3159 desc=desc or "Map",
3160 ) as pbar:
-> 3161 for rank, done, content in Dataset._map_single(**dataset_kwargs):
3162 if done:
3163 shards_done += 1
File /local_disk0/.ephemeral_nfs/cluster_libraries/python/lib/python3.10/site-packages/datasets/arrow_dataset.py:3575, in Dataset._map_single(shard, function, with_indices, with_rank, input_columns, batched, batch_size, drop_last_batch, remove_columns, keep_in_memory, cache_file_name, writer_batch_size, features, disable_nullable, fn_kwargs, new_fingerprint, rank, offset)
3573 writer.write_table(batch.to_arrow())
3574 else:
-> 3575 writer.write_batch(batch)
3576 num_examples_progress_update += num_examples_in_batch
3577 if time.time() > _time + config.PBAR_REFRESH_TIME_INTERVAL:
File /local_disk0/.ephemeral_nfs/cluster_libraries/python/lib/python3.10/site-packages/datasets/arrow_writer.py:568, in ArrowWriter.write_batch(self, batch_examples, writer_batch_size)
566 col_try_type = try_features[col] if try_features is not None and col in try_features else None
567 typed_sequence = OptimizedTypedSequence(col_values, type=col_type, try_type=col_try_type, col=col)
--> 568 arrays.append(pa.array(typed_sequence))
569 inferred_features[col] = typed_sequence.get_inferred_type()
570 schema = inferred_features.arrow_schema if self.pa_writer is None else self.schema
File /local_disk0/.ephemeral_nfs/cluster_libraries/python/lib/python3.10/site-packages/pyarrow/array.pxi:247, in pyarrow.lib.array()
File /local_disk0/.ephemeral_nfs/cluster_libraries/python/lib/python3.10/site-packages/pyarrow/array.pxi:112, in pyarrow.lib._handle_arrow_array_protocol()
File /local_disk0/.ephemeral_nfs/cluster_libraries/python/lib/python3.10/site-packages/datasets/arrow_writer.py:208, in TypedSequence.__arrow_array__(self, type)
203 # otherwise we can finally use the user's type
204 elif type is not None:
205 # We use cast_array_to_feature to support casting to custom types like Audio and Image
206 # Also, when trying type "string", we don't want to convert integers or floats to "string".
207 # We only do it if trying_type is False - since this is what the user asks for.
--> 208 out = cast_array_to_feature(
209 out, type, allow_primitive_to_str=not self.trying_type, allow_decimal_to_str=not self.trying_type
210 )
211 return out
212 except (
213 TypeError,
214 pa.lib.ArrowInvalid,
215 pa.lib.ArrowNotImplementedError,
216 ) as e: # handle type errors and overflows
217 # Ignore ArrowNotImplementedError caused by trying type, otherwise re-raise
File /local_disk0/.ephemeral_nfs/cluster_libraries/python/lib/python3.10/site-packages/datasets/table.py:1804, in _wrap_for_chunked_arrays.<locals>.wrapper(array, *args, **kwargs)
1802 return pa.chunked_array([func(chunk, *args, **kwargs) for chunk in array.chunks])
1803 else:
-> 1804 return func(array, *args, **kwargs)
File /local_disk0/.ephemeral_nfs/cluster_libraries/python/lib/python3.10/site-packages/datasets/table.py:2122, in cast_array_to_feature(array, feature, allow_primitive_to_str, allow_decimal_to_str)
2115 elif not isinstance(feature, (Sequence, dict, list, tuple)):
2116 return array_cast(
2117 array,
2118 feature(),
2119 allow_primitive_to_str=allow_primitive_to_str,
2120 allow_decimal_to_str=allow_decimal_to_str,
2121 )
-> 2122 raise TypeError(f"Couldn't cast array of type\n{_short_str(array.type)}\nto\n{_short_str(feature)}")