VisualBERT: lower accuracy on the validation dataset

Hi all,

My VQA model gets lower-than-expected accuracy when running inference on the validation dataset. My guess is that the FRCNN image features I extract are not exactly the same as the ones the model was trained on.

Another question concerns the id2label and label2id entries in config.json: they do not contain the real label values, so I copied id2label from the demo notebook https://github.com/huggingface/transformers/blob/main/examples/research_projects/visual_bert/demo.ipynb.

Could the Hugging Face team look into this issue? Much appreciated, thank you very much.
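For reference, this is roughly how I rebuilt the mapping. A minimal sketch, assuming label id i corresponds to line i of the 3129-answer vocabulary file that the demo notebook downloads (the same VQA_URL that appears commented out in my evaluation loop below):

import requests

# assumption: one answer per line, line i is the answer for label id i
VQA_URL = "https://dl.fbaipublicfiles.com/pythia/data/answers_vqa.txt"
vqa_answers = requests.get(VQA_URL).text.splitlines()

id2label = {i: ans for i, ans in enumerate(vqa_answers)}
label2id = {ans: i for i, ans in enumerate(vqa_answers)}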

import torch
from torch.utils.data import DataLoader
from tqdm import tqdm


class VQADataset(torch.utils.data.Dataset):
    """VQA (v2) dataset."""
    def __init__(self, questions, annotations, tokenizer, image_preprocess, frcnn, frcnn_cfg):
        self.questions = questions
        self.annotations = annotations
        self.tokenizer = tokenizer
        self.image_preprocess = image_preprocess
        self.frcnn = frcnn
        self.frcnn_cfg = frcnn_cfg

    def __len__(self):
        return len(self.annotations)

    def __getitem__(self, idx):
        # answer
        annotation = self.annotations[idx]
        # question
        question = self.questions[idx]
        # id_to_filename is a global dict (image_id -> file path) built elsewhere
        image_path = id_to_filename[annotation["image_id"]]
        image_path = image_path.replace("./multimodal_data/vqa2/val2014/.", "", 1)
        text = question['question']

        images, sizes, scales_yx = self.image_preprocess(image_path)
        output_dict = self.frcnn(
            images,
            sizes,
            scales_yx=scales_yx,
            padding="max_detections",
            max_detections=self.frcnn_cfg.max_detections,
            return_tensors="pt")

        # Very important that the boxes are normalized
        # (normalized_boxes is unused below: VisualBERT only consumes roi_features)
        feature = output_dict.get("roi_features")
        normalized_boxes = output_dict.get("normalized_boxes")

        inputs = self.tokenizer(
            text,
            padding="max_length",
            max_length=25,
            truncation=True,
            return_token_type_ids=True,
            return_attention_mask=True,
            add_special_tokens=True,
            return_tensors="pt")

        inputs.update(
            {
                "visual_embeds": feature,
                "visual_attention_mask": torch.ones(feature.shape[:-1], dtype=torch.float),
                "visual_token_type_ids": torch.ones(feature.shape[:-1], dtype=torch.long),
                # "output_attentions": False
            }
        )

        # remove batch dimension
        for k, v in inputs.items():
            if isinstance(v, torch.Tensor):
                inputs[k] = v.squeeze()

        # add labels: soft targets over the answer vocabulary
        # (config is the model config defined below; len(config.id2label) == 3129)
        labels = annotation['labels']
        # print("label candidate:", labels)
        scores = annotation["scores"]

        targets = torch.zeros(len(config.id2label), dtype=torch.float)
        for label, score in zip(labels, scores):
            # print(f"Setting target at index {label} to {score}")
            targets[label] = score
        inputs["labels"] = targets
        inputs["text"] = text
        return inputs


# image input
from visualbert.processing_image import Preprocess
from visualbert.visualizing_image import SingleImageViz
from visualbert.modeling_frcnn import GeneralizedRCNN
from visualbert.utils import Config

frcnn_cfg = Config.from_pretrained("unc-nlp/frcnn-vg-finetuned")
frcnn = GeneralizedRCNN.from_pretrained("unc-nlp/frcnn-vg-finetuned", config=frcnn_cfg)
image_preprocess = Preprocess(frcnn_cfg)

# text input
from transformers import VisualBertForQuestionAnswering, AutoTokenizer, BertTokenizerFast
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
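For completeness, config below is the checkpoint config with the copied mapping patched in; roughly this, assuming the id2label/label2id rebuilt from answers_vqa.txt above:

from transformers import AutoConfig

config = AutoConfig.from_pretrained("uclanlp/visualbert-vqa")
# patch in the mapping rebuilt from answers_vqa.txt above
config.id2label = id2label
config.label2id = label2id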

# pass the patched mapping through so model.config carries the real labels
model = VisualBertForQuestionAnswering.from_pretrained("uclanlp/visualbert-vqa",
                                                       num_labels=len(config.id2label),
                                                       id2label=config.id2label,
                                                       label2id=config.label2id,
                                                       output_hidden_states=True)


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# if cfg.use_multi_gpu:
# model = nn.DataParallel(model)
# model = model.to(device=device)
model.to(device)
model.eval()

dataset = VQADataset(questions=questions[:5000],
                     annotations=annotations[:5000],
                     tokenizer=tokenizer,
                     image_preprocess=image_preprocess,
                     frcnn=frcnn,
                     frcnn_cfg=frcnn_cfg)
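Since I suspect the FRCNN features, a quick shape check on one sample helps rule out an extraction mismatch first. A sketch, assuming the usual values for this checkpoint (max_detections = 36 for unc-nlp/frcnn-vg-finetuned, 2048-dim ROI features, 25 text tokens, 3129 answer classes):

sample = dataset[0]
# visual_embeds should be [max_detections, 2048] after squeezing the batch dim
print(sample["visual_embeds"].shape)   # expected: torch.Size([36, 2048])
print(sample["input_ids"].shape)       # expected: torch.Size([25])
print(sample["labels"].shape)          # expected: torch.Size([3129])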

test_dataloader = DataLoader(dataset, batch_size=1, shuffle=False)

correct = 0.0
total = 0

# loss_function = nn.CrossEntropyLoss()
# optimizer = optim.Adam(model.classifier.parameters(), lr=0.001)

# from visualbert import utils
# VQA_URL = "https://dl.fbaipublicfiles.com/pythia/data/answers_vqa.txt"
# vqa_answers = utils.get_data(VQA_URL)

for batch in tqdm(test_dataloader):
    # keep the raw question text for debugging, then drop it:
    # the model's forward() does not accept a "text" kwarg
    batch_text = batch.copy()
    if "text" in batch:
        del batch["text"]

    batch = {k: v.to(device) for k, v in batch.items()}

    with torch.no_grad():
        outputs = model(**batch)
    logits = outputs.logits  # [batch_size, 3129]
    _, pre = torch.max(logits, 1)
    _, target = torch.max(batch["labels"], 1)

    print("prediction:", pre)
    print("target:", target)

    # print("prediction from VisualBert VQA:", vqa_answers[pre])
    # print("Predicted answer:", model.config.id2label[pre.item()])
    # TODO label not right
    # print("Target answer:", model.config.id2label[target.item()])

    correct += (pre == target).sum()
    total += 1

final_acc = correct / float(len(test_dataloader.dataset))
print('Accuracy of test: %f %%' % (100 * float(final_acc)))
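Side note on the metric itself: comparing argmax(logits) with argmax(labels) is stricter than the official VQA accuracy, which credits the predicted answer with min(#matching annotators / 3, 1). Assuming the "scores" in my annotations follow that usual soft-score convention, the targets built in __getitem__ already store those credits per answer, so the loop could accumulate the official score instead. A minimal sketch with a hypothetical helper:

def vqa_soft_score(logits, labels):
    # official VQA credit for the predicted answer: labels already holds the
    # per-answer min(#matching annotators / 3, 1) scores from the annotations
    pred = logits.argmax(dim=1)
    return labels[torch.arange(len(pred)), pred].sum().item()

# inside the loop: soft_correct += vqa_soft_score(logits, batch["labels"])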