LayoutLM data format for bounding box classification

I’m following the LayoutLMv3 tutorial on the FUNSD dataset for token classification. However, my use case is classifying bounding boxes from an image (one label per bounding box).

There’s another tutorial for text classification referenced in the LayoutLM docs, but it differs in that each whole document gets classified (as opposed to each bounding box).

In my use case, each bounding box has one or more words associated with it and exactly one label. I don’t want to feed the model a label for every token, and I’m having trouble figuring out how to set up the pre-processing so that the data ends up in this per-box format.
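
To make the target concrete, here’s a rough sketch of the per-example structure I’m after (a purely illustrative, hypothetical dict — the grouping of words per box is the part I can’t produce):

# hypothetical target structure for one document:
# words grouped per bounding box, one label per box
desired_example = {
    'text': [['potato'], ['strange', 'feeling'], ['hydrogen']],   # 3 boxes, variable words each
    'bbox': [[3, 5, 10, 20], [0, 1, 8, 4], [4, 5, 9, 15]],        # one box per group
    'label': ['yes', 'no', 'maybe'],                              # one label per box
}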

Here is an almost reproducible example (you’ll just have to use two dummy images and specify their paths on your machine):


from datasets import Dataset, Features, Sequence, ClassLabel, Value, Array2D, Array3D
import pandas as pd
from PIL import Image
from transformers import AutoProcessor
from transformers.data.data_collator import default_data_collator

img_dict = {}

# specify names of any 2 local jpeg images
files = ['file1', 'file2']

for file in files:
    file_path = f"./your_direc/{file}.jpg"
    image = Image.open(file_path).convert('RGB')
    img_dict[file] = image

# 2 rows - each row has 3 bounding boxes, with 1 label per bounding box and one or more words per bounding box
df = pd.DataFrame({'label': [["yes", "no", "maybe"], ["yes", "yes", "maybe"]],
                   'text': [['potato', 'strange feeling', 'hydrogen'], ['cat', 'man smiles', 'sun rises']],
                   'bbox': [[[3, 5, 10, 20], [0, 1, 8, 4], [4, 5, 9, 15]], [[4, 6, 12, 23], [2, 3, 11, 6], [5, 6, 11, 13]]],
                   'file': files})


data_dict = {}
data_dict['id'] = df.index
data_dict['label'] = df.label
data_dict['text'] = df.text

data_dict['bbox'] = df.bbox
# include the image reference
data_dict['image'] = [img_dict[i] for i in df.file]

data_temp = Dataset.from_dict(data_dict)

full_dataset = data_temp.cast_column('label', Sequence(ClassLabel(names=["yes", "no", "maybe"])))
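
# after the cast, the string labels become class ids (yes=0, no=1, maybe=2), e.g.:
# full_dataset['label']  ->  [[0, 1, 2], [0, 0, 2]]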

features = full_dataset.features
column_names = full_dataset.column_names

image_column_name = "image"
text_column_name = "text"
boxes_column_name = "bbox"
label_column_name = "label"

def get_label_list(labels):
    unique_labels = set()
    for label in labels:
        unique_labels = unique_labels | set(label)
    label_list = list(unique_labels)
    label_list.sort()
    return label_list

# after the cast the column is Sequence(ClassLabel(...)), so check the inner feature
if isinstance(features[label_column_name].feature, ClassLabel):
    label_list = features[label_column_name].feature.names
    # No need to convert the labels since they are already ints.
    id2label = {k: v for k,v in enumerate(label_list)}
    label2id = {v: k for k,v in enumerate(label_list)}
else:
    label_list = get_label_list(full_dataset[label_column_name])
    id2label = {k: v for k,v in enumerate(label_list)}
    label2id = {v: k for k,v in enumerate(label_list)}
num_labels = len(label_list)
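
# for reference, with the ClassLabel cast above this resolves to:
# id2label == {0: 'yes', 1: 'no', 2: 'maybe'}
# label2id == {'yes': 0, 'no': 1, 'maybe': 2}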

processor = AutoProcessor.from_pretrained("microsoft/layoutlmv3-base", apply_ocr=False)

def prepare_examples(examples):
    images = examples[image_column_name]
    words = examples[text_column_name]
    boxes = examples[boxes_column_name]
    word_labels = examples[label_column_name]

    # the processor flattens each page into a single token sequence
    encoding = processor(images, words, boxes=boxes, word_labels=word_labels,
                         truncation=True, padding="max_length")

    return encoding

# we need to define custom features for `set_format` (used later on) to work properly
features = Features({
    'pixel_values': Array3D(dtype="float32", shape=(3, 224, 224)),
    'input_ids': Sequence(feature=Value(dtype='int64')),
    'attention_mask': Sequence(Value(dtype='int64')),
    'bbox': Array2D(dtype="int64", shape=(512, 4)),
    'labels': Sequence(feature=Value(dtype='int64')),
})

train_dataset = full_dataset.map(
    prepare_examples,
    batched=True,
    remove_columns=column_names,
    features=features,
)

If I run the following, the output is 512:
len(train_dataset['labels'][0])

And if I run this:
processor.tokenizer.decode(train_dataset[0]["input_ids"])

I get (padded to 512):
'<s> potato strange feeling hydrogen</s><pad>......<pad>'
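
The labels column gets expanded the same way — as far as I can tell, one label id per token, with -100 on special, continuation, and padding tokens, roughly like:

train_dataset['labels'][0]
# [-100, 0, 1, -100, 2, -100, -100, ..., -100]   # length 512 (illustrative)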

What I would want to see instead is 3 labels for train_dataset['labels'][0] (one per bounding box).
And for train_dataset[0]['input_ids'] I’d want a list of 3 inner lists, each containing a variable number of token ids.
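
In other words, something shaped like this (the token ids below are made up for illustration):

train_dataset['labels'][0]      # [0, 1, 2]  -> one label id per bounding box
train_dataset[0]['input_ids']   # [[0, 9083, 2], [0, 8310, 3038, 2], [0, 10813, 2]]  (hypothetical ids)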

I’ve tried modifying prepare_examples:

def prepare_examples(examples):
    input_ids = []
    attention_masks = []
    pixel_values = []

    texts = examples["text"]
    bboxes = examples["bbox"]
    labels = examples["label"]
    images = examples["image"]

    for text, image in zip(texts, images):
        tokenized_sequences = []
        for text_seq in text:
            # each text_seq is the string for one bounding box (may be several words)
            tokenized_seq = processor.tokenizer(text_seq, truncation=True, max_length=30, padding="max_length", return_tensors="pt")
            tokenized_sequences.append(tokenized_seq)

        input_ids.append([seq["input_ids"].squeeze() for seq in tokenized_sequences])
        attention_masks.append([seq["attention_mask"].squeeze() for seq in tokenized_sequences])

        # encode the image to get the pixel_values via the processor's image processor
        encoded_image = processor.image_processor(image, return_tensors="pt")["pixel_values"][0]
        pixel_values.append(encoded_image)

    return {
        "input_ids": input_ids,
        "attention_mask": attention_masks,
        "labels": labels,
        "bbox": bboxes,
        "pixel_values": pixel_values,
    }

But I get an error when I try to use that one:

ValueError: Words must be of type `List[str]` (single pretokenized example), or `List[List[str]]` (batch of pretokenized examples).