Hi,
I am using the LayoutLMv2 model and trying to fine-tune it on my own dataset, but I am getting an error. The dataset contains file_name, words, boxes, and word_labels, and the error occurs while preprocessing the data. Can someone please help me?
# Processor with built-in OCR disabled: the dataset already supplies
# words and bounding boxes, so apply_ocr=False.
processor = LayoutLMv2Processor.from_pretrained(path, apply_ocr = False)
tokenizer = LayoutLMv2TokenizerFast.from_pretrained(path)
# Token-classification head sized to the label set (label2id is defined
# elsewhere in the notebook; assumed to map label name -> id).
model = LayoutLMv2ForTokenClassification.from_pretrained(path, num_labels=len(label2id))
# Use GPU when available, otherwise fall back to CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
# Target schema for the mapped dataset. The fixed shapes (512 boxes,
# 3x224x224 image) rely on the processor being called with
# padding="max_length" and truncation=True.
features = Features({
'image': Array3D(dtype="int64", shape=(3, 224, 224)),  # resized page image, channels-first
'input_ids': Sequence(feature=Value(dtype='int64')),
'attention_mask': Sequence(Value(dtype='int64')),
'token_type_ids': Sequence(Value(dtype='int64')),
'bbox': Array2D(dtype="int64", shape=(512, 4)),  # one (x0, y0, x1, y1) box per token
# NOTE(review): ClassLabel expects integer ids; the word_labels passed to
# the processor must already be ints consistent with `labels` -- confirm.
'labels': Sequence(ClassLabel(names=labels)),
})
def preprocess_data(examples):
    """Encode one batch of examples into model-ready LayoutLMv2 inputs.

    `examples` is a batched dict of columns ('File_name', 'words', 'boxes',
    'Label'); returns the processor's BatchEncoding (input_ids,
    attention_mask, token_type_ids, bbox, image, labels).
    """
    # BUG FIX: the original wrapped the file names in set(), which both
    # deduplicates and reorders them, so the images list no longer lines up
    # (in length or order) with the words/boxes/labels of the batch. That
    # misalignment makes the processor's output fail to cast to the declared
    # Features ("Couldn't cast array of type int64 to Sequence(...)").
    # Open exactly one image per example, preserving batch order.
    images = [Image.open(file_name).convert("RGB") for file_name in examples['File_name']]
    words = examples['words']
    boxes = examples['boxes']
    # NOTE(review): if 'Label' holds string names rather than integer ids,
    # map them through label2id before passing to the processor -- confirm.
    word_labels = examples['Label']
    encoded_inputs = processor(images, words, boxes=boxes, word_labels=word_labels, padding="max_length", truncation=True)
    return encoded_inputs
train_dataset = dataset.map(preprocess_data, remove_columns=dataset.column_names, features=features, batched=True)
This is the error I am getting:
TypeError Traceback (most recent call last)
d:\Working\Trans_LayoutLMV2_Key-Value.ipynb Cell 21 in <cell line: 20>()
17 encoded_inputs = processor(images, words, boxes=boxes, word_labels=word_labels, padding="max_length", truncation=True)
18 return encoded_inputs
---> 20 train_dataset = dataset.map(preprocess_data, remove_columns=dataset.column_names, features=features, batched=True)
File c:\Users\name\.conda\envs\detectron_env\lib\site-packages\datasets\arrow_dataset.py:2376, in Dataset.map(self, function, with_indices, with_rank, input_columns, batched, batch_size, drop_last_batch, remove_columns, keep_in_memory, load_from_cache_file, cache_file_name, writer_batch_size, features, disable_nullable, fn_kwargs, num_proc, suffix_template, new_fingerprint, desc)
2373 disable_tqdm = not logging.is_progress_bar_enabled()
2375 if num_proc is None or num_proc == 1:
-> 2376 return self._map_single(
2377 function=function,
2378 with_indices=with_indices,
2379 with_rank=with_rank,
2380 input_columns=input_columns,
2381 batched=batched,
2382 batch_size=batch_size,
2383 drop_last_batch=drop_last_batch,
2384 remove_columns=remove_columns,
2385 keep_in_memory=keep_in_memory,
2386 load_from_cache_file=load_from_cache_file,
2387 cache_file_name=cache_file_name,
2388 writer_batch_size=writer_batch_size,
2389 features=features,
2390 disable_nullable=disable_nullable,
2391 fn_kwargs=fn_kwargs,
2392 new_fingerprint=new_fingerprint,
2393 disable_tqdm=disable_tqdm,
2394 desc=desc,
2395 )
2396 else:
2398 def format_cache_file_name(cache_file_name, rank):
File c:\Users\name\.conda\envs\detectron_env\lib\site-packages\datasets\arrow_dataset.py:551, in transmit_tasks.<locals>.wrapper(*args, **kwargs)
549 self: "Dataset" = kwargs.pop("self")
...
TypeError: Couldn't cast array of type
int64
to
Sequence(feature=Value(dtype='int64', id=None), length=-1, id=None)