Hi,
I am trying to fine-tune the LayoutLMv2 model for document image classification, but I am getting an error in the preprocessing stage. Can someone please help me fix it?
dataset = Dataset.from_pandas(df_copy)
dataset
Dataset({
features: ['Image_File_Path', 'Label'],
num_rows: 12
})
this is the code
# we need to define custom features
path = './Image FIle/'
features = Features({
'image': Array3D(dtype="int64", shape=(3, 224, 224)),
'input_ids': Sequence(feature=Value(dtype='int64')),
'attention_mask': Sequence(Value(dtype='int64')),
'token_type_ids': Sequence(Value(dtype='int64')),
'bbox': Array2D(dtype="int64", shape=(512, 4)),
'labels': Sequence(Value(dtype='int64')),
})
def preprocess_data(examples):
# take a batch of images
images = [Image.open(os.path.join(path, file)).convert("RGB") for file in examples['Image_File_Path']]
encoded_inputs = processor(images, padding="max_length", truncation=True)
# add labels
encoded_inputs["labels"] = [label2id[label] for label in examples["Label"]]
return encoded_inputs
encoded_dataset = dataset.map(preprocess_data, remove_columns=dataset.column_names, features=features, batched=True, batch_size=2)
this is the error
**TypeError** Traceback (most recent call last) **d:\Working\Trans_LayoutLMV2_Doc_Classification.ipynb Cell 24** in <cell line: 21>**()** [18]encoded_inputs["labels"] = [label2id[label] for label in examples["Label"]]
[19]return encoded_inputs **--->
[21]encoded_dataset = dataset.map(preprocess_data, remove_columns=dataset.column_names, features=features, batched=True, batch_size=2) File **~\AppData\Roaming\Python\Python310\site-packages\datasets\arrow_dataset.py:2387**, in Dataset.map**(self, function, with_indices, with_rank, input_columns, batched, batch_size, drop_last_batch, remove_columns, keep_in_memory, load_from_cache_file, cache_file_name, writer_batch_size, features, disable_nullable, fn_kwargs, num_proc, suffix_template, new_fingerprint, desc)**
2384 disable_tqdm = not logging.is_progress_bar_enabled()
2386 if num_proc is None or num_proc == 1: **->
2387** return self._map_single(
2388 function=function,
2389 with_indices=with_indices,
2390 with_rank=with_rank,
2391 input_columns=input_columns,
2392 batched=batched,
2393 batch_size=batch_size,
2394 drop_last_batch=drop_last_batch,
2395 remove_columns=remove_columns,
2396 keep_in_memory=keep_in_memory,
2397 load_from_cache_file=load_from_cache_file,
2398 cache_file_name=cache_file_name,
2399 writer_batch_size=writer_batch_size,
2400 features=features,
2401 disable_nullable=disable_nullable,
2402 fn_kwargs=fn_kwargs,
2403 new_fingerprint=new_fingerprint,
2404 disable_tqdm=disable_tqdm,
2405 desc=desc,
2406 )
2407 else:
2409 def format_cache_file_name(cache_file_name, rank): File **~\AppData\Roaming\Python\Python310\site-packages\datasets\arrow_dataset.py:557**, in transmit_tasks.<locals>.wrapper**(*args, **kwargs)** 555 self: "Dataset" = kwargs.pop("self")
...
**TypeError**: Couldn't cast array of type int64 to Sequence(feature=Value(dtype='int64', id=None), length=-1, id=None)