VisualBERT: lower accuracy on the validation dataset

Hi all,

My VQA model gets lower-than-expected accuracy when running inference on the validation dataset. My guess is that the FRCNN image features I extract are not exactly the same as the ones the model was trained on.

Another question concerns the id2label and label2id entries in config.json: they do not contain the real label values, so I copied id2label from the demo notebook https://github.com/huggingface/transformers/blob/main/examples/research_projects/visual_bert/demo.ipynb.

Could the Hugging Face team look into this issue? Much appreciated, thank you very much.
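For reference, this is roughly how I rebuilt the mapping. A minimal sketch, assuming label id i corresponds to line i of the 3129-answer vocabulary file that the demo notebook downloads (the same VQA_URL that appears commented out in my evaluation loop below):

import requests

# assumption: one answer per line, line i is the answer for label id i
VQA_URL = "https://dl.fbaipublicfiles.com/pythia/data/answers_vqa.txt"
vqa_answers = requests.get(VQA_URL).text.splitlines()

id2label = {i: ans for i, ans in enumerate(vqa_answers)}
label2id = {ans: i for i, ans in enumerate(vqa_answers)}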

import torch
from torch.utils.data import DataLoader
from tqdm import tqdm


class VQADataset(torch.utils.data.Dataset):
    """VQA (v2) dataset."""
    def __init__(self, questions, annotations, tokenizer, image_preprocess, frcnn, frcnn_cfg):
        self.questions = questions
        self.annotations = annotations
        self.tokenizer = tokenizer
        self.image_preprocess = image_preprocess
        self.frcnn = frcnn
        self.frcnn_cfg = frcnn_cfg

    def __len__(self):
        return len(self.annotations)

    def __getitem__(self, idx):
        # answer
        annotation = self.annotations[idx]
        # question
        question = self.questions[idx]
        # id_to_filename is a global dict (image_id -> file path) built elsewhere
        image_path = id_to_filename[annotation["image_id"]]
        image_path = image_path.replace("./multimodal_data/vqa2/val2014/.", "", 1)
        text = question['question']

        images, sizes, scales_yx = self.image_preprocess(image_path)
        output_dict = self.frcnn(
            images,
            sizes,
            scales_yx=scales_yx,
            padding="max_detections",
            max_detections=self.frcnn_cfg.max_detections,
            return_tensors="pt")

        # Very important that the boxes are normalized
        # (normalized_boxes is unused below: VisualBERT only consumes roi_features)
        feature = output_dict.get("roi_features")
        normalized_boxes = output_dict.get("normalized_boxes")

        inputs = self.tokenizer(
            text,
            padding="max_length",
            max_length=25,
            truncation=True,
            return_token_type_ids=True,
            return_attention_mask=True,
            add_special_tokens=True,
            return_tensors="pt")

        inputs.update(
            {
                "visual_embeds": feature,
                "visual_attention_mask": torch.ones(feature.shape[:-1], dtype=torch.float),
                "visual_token_type_ids": torch.ones(feature.shape[:-1], dtype=torch.long),
                # "output_attentions": False
            }
        )

        # remove batch dimension
        for k, v in inputs.items():
            if isinstance(v, torch.Tensor):
                inputs[k] = v.squeeze()

        # add labels: soft targets over the answer vocabulary
        # (config is the model config defined below; len(config.id2label) == 3129)
        labels = annotation['labels']
        # print("label candidate:", labels)
        scores = annotation["scores"]

        targets = torch.zeros(len(config.id2label), dtype=torch.float)
        for label, score in zip(labels, scores):
            # print(f"Setting target at index {label} to {score}")
            targets[label] = score
        inputs["labels"] = targets
        inputs["text"] = text
        return inputs


# image input
from visualbert.processing_image import Preprocess
from visualbert.visualizing_image import SingleImageViz
from visualbert.modeling_frcnn import GeneralizedRCNN
from visualbert.utils import Config

frcnn_cfg = Config.from_pretrained("unc-nlp/frcnn-vg-finetuned")
frcnn = GeneralizedRCNN.from_pretrained("unc-nlp/frcnn-vg-finetuned", config=frcnn_cfg)
image_preprocess = Preprocess(frcnn_cfg)

# text input
from transformers import VisualBertForQuestionAnswering, AutoTokenizer, BertTokenizerFast
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
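For completeness, config below is the checkpoint config with the copied mapping patched in; roughly this, assuming the id2label/label2id rebuilt from answers_vqa.txt above:

from transformers import AutoConfig

config = AutoConfig.from_pretrained("uclanlp/visualbert-vqa")
# patch in the mapping rebuilt from answers_vqa.txt above
config.id2label = id2label
config.label2id = label2id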

# pass the patched mapping through so model.config carries the real labels
model = VisualBertForQuestionAnswering.from_pretrained("uclanlp/visualbert-vqa",
                                                       num_labels=len(config.id2label),
                                                       id2label=config.id2label,
                                                       label2id=config.label2id,
                                                       output_hidden_states=True)


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# if cfg.use_multi_gpu:
# model = nn.DataParallel(model)
# model = model.to(device=device)
model.to(device)
model.eval()

dataset = VQADataset(questions=questions[:5000],
                     annotations=annotations[:5000],
                     tokenizer=tokenizer,
                     image_preprocess=image_preprocess,
                     frcnn=frcnn,
                     frcnn_cfg=frcnn_cfg)
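Since I suspect the FRCNN features, a quick shape check on one sample helps rule out an extraction mismatch first. A sketch, assuming the usual values for this checkpoint (max_detections = 36 for unc-nlp/frcnn-vg-finetuned, 2048-dim ROI features, 25 text tokens, 3129 answer classes):

sample = dataset[0]
# visual_embeds should be [max_detections, 2048] after squeezing the batch dim
print(sample["visual_embeds"].shape)   # expected: torch.Size([36, 2048])
print(sample["input_ids"].shape)       # expected: torch.Size([25])
print(sample["labels"].shape)          # expected: torch.Size([3129])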

test_dataloader = DataLoader(dataset, batch_size=1, shuffle=False)

correct = 0.0
total = 0

# loss_function = nn.CrossEntropyLoss()
# optimizer = optim.Adam(model.classifier.parameters(), lr=0.001)

# from visualbert import utils
# VQA_URL = "https://dl.fbaipublicfiles.com/pythia/data/answers_vqa.txt"
# vqa_answers = utils.get_data(VQA_URL)

for batch in tqdm(test_dataloader):
    # keep the raw question text for debugging, then drop it:
    # the model's forward() does not accept a "text" kwarg
    batch_text = batch.copy()
    if "text" in batch:
        del batch["text"]

    batch = {k: v.to(device) for k, v in batch.items()}

    with torch.no_grad():
        outputs = model(**batch)
    logits = outputs.logits  # [batch_size, 3129]
    _, pre = torch.max(logits, 1)
    _, target = torch.max(batch["labels"], 1)

    print("prediction:", pre)
    print("target:", target)

    # print("prediction from VisualBert VQA:", vqa_answers[pre])
    # print("Predicted answer:", model.config.id2label[pre.item()])
    # TODO label not right
    # print("Target answer:", model.config.id2label[target.item()])

    correct += (pre == target).sum()
    total += 1

final_acc = correct / float(len(test_dataloader.dataset))
print('Accuracy of test: %f %%' % (100 * float(final_acc)))
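Side note on the metric itself: comparing argmax(logits) with argmax(labels) is stricter than the official VQA accuracy, which credits the predicted answer with min(#matching annotators / 3, 1). Assuming the "scores" in my annotations follow that usual soft-score convention, the targets built in __getitem__ already store those credits per answer, so the loop could accumulate the official score instead. A minimal sketch with a hypothetical helper:

def vqa_soft_score(logits, labels):
    # official VQA credit for the predicted answer: labels already holds the
    # per-answer min(#matching annotators / 3, 1) scores from the annotations
    pred = logits.argmax(dim=1)
    return labels[torch.arange(len(pred)), pred].sum().item()

# inside the loop: soft_correct += vqa_soft_score(logits, batch["labels"])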