Hi all,
My VQA model gets lower accuracy than expected when running inference on the validation dataset. My guess is that the FRCNN image features are not exact.
Another question concerns the id2label and label2id entries in config.json: they do not contain the real label values, so I copied id2label from the demo notebook at https://github.com/huggingface/transformers/blob/main/examples/research_projects/visual_bert/demo.ipynb.
Could the Hugging Face team help solve this issue? It would be much appreciated. Thank you very much.
class VQADataset(torch.utils.data.Dataset):
    """VQA (v2) dataset yielding model inputs for one question/image pair.

    Each item combines the tokenized question, the detector's region features
    for the associated image, and a soft target vector over the answer labels.

    Args:
        questions: Sequence of dicts; each must provide a ``"question"`` string.
            Items are paired with ``annotations`` by position, so both
            sequences are assumed to be index-aligned (TODO confirm at the
            call site).
        annotations: Sequence of dicts providing ``"image_id"``, ``"labels"``
            (answer indices) and ``"scores"`` (one score per label).
        tokenizer: Callable invoked with the question text and the keyword
            arguments shown in ``__getitem__``; must return a mutable mapping.
        image_preprocess: Callable mapping an image path to
            ``(images, sizes, scales_yx)``.
        frcnn: Callable detector returning a mapping that contains
            ``"roi_features"``.
        frcnn_cfg: Detector configuration; only ``max_detections`` is read.
        id_to_filename: Optional mapping from image id to image path. When
            omitted, the module-level ``id_to_filename`` is looked up at
            item-access time.
        num_labels: Optional length of the target vector. When omitted,
            ``len(config.id2label)`` of the module-level ``config`` is used.
    """

    def __init__(self, questions, annotations, tokenizer, image_preprocess,
                 frcnn, frcnn_cfg, id_to_filename=None, num_labels=None):
        self.questions = questions
        self.annotations = annotations
        self.tokenizer = tokenizer
        self.image_preprocess = image_preprocess
        self.frcnn = frcnn
        self.frcnn_cfg = frcnn_cfg
        self.id_to_filename = id_to_filename
        self.num_labels = num_labels

    def __len__(self):
        """Return the number of annotated questions."""
        return len(self.annotations)

    def __getitem__(self, idx):
        """Return inputs, soft ``labels`` and the raw question ``text`` for ``idx``."""
        annotation = self.annotations[idx]
        question = self.questions[idx]

        # Resolve the image path; fall back to the module-level mapping when
        # no explicit mapping was handed to the constructor.
        filename_lookup = self.id_to_filename
        if filename_lookup is None:
            filename_lookup = id_to_filename
        image_path = filename_lookup[annotation["image_id"]]
        image_path = image_path.replace("./multimodal_data/vqa2/val2014/.", "", 1)
        text = question["question"]

        images, sizes, scales_yx = self.image_preprocess(image_path)
        # The detector is only used as a fixed feature extractor here, so no
        # autograd graph is needed; detached outputs are also cheaper to batch.
        with torch.no_grad():
            output_dict = self.frcnn(
                images,
                sizes,
                scales_yx=scales_yx,
                padding="max_detections",
                max_detections=self.frcnn_cfg.max_detections,
                return_tensors="pt",
            )
        # Only the region features are consumed below; the detector's
        # normalized boxes are not part of the inputs built here.
        feature = output_dict.get("roi_features")

        inputs = self.tokenizer(
            text,
            padding="max_length",
            max_length=25,
            truncation=True,
            return_token_type_ids=True,
            return_attention_mask=True,
            add_special_tokens=True,
            return_tensors="pt",
        )
        inputs.update(
            {
                "visual_embeds": feature,
                "visual_attention_mask": torch.ones(feature.shape[:-1], dtype=torch.float),
                "visual_token_type_ids": torch.ones(feature.shape[:-1], dtype=torch.long),
            }
        )

        # Drop only the leading batch axis. A bare ``squeeze()`` would also
        # collapse any other size-1 axis (e.g. a single detection).
        for key, value in inputs.items():
            if isinstance(value, torch.Tensor):
                inputs[key] = value.squeeze(0)

        # Build the soft target vector: one score per annotated answer index.
        num_labels = self.num_labels
        if num_labels is None:
            num_labels = len(config.id2label)
        targets = torch.zeros(num_labels, dtype=torch.float)
        for label, score in zip(annotation["labels"], annotation["scores"]):
            targets[label] = score

        inputs["labels"] = targets
        inputs["text"] = text
        return inputs
# image input
from visualbert.processing_image import Preprocess
from visualbert.visualizing_image import SingleImageViz
from visualbert.modeling_frcnn import GeneralizedRCNN
from visualbert.utils import Config
# Load the detector configuration and weights from the same checkpoint, then
# build the image preprocessor from that configuration.
frcnn_cfg = Config.from_pretrained("unc-nlp/frcnn-vg-finetuned")
frcnn = GeneralizedRCNN.from_pretrained(
    "unc-nlp/frcnn-vg-finetuned",
    config=frcnn_cfg,
)
image_preprocess = Preprocess(frcnn_cfg)
# text input
from transformers import VisualBertForQuestionAnswering, AutoTokenizer, BertTokenizerFast
# Question tokenizer.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# VQA model; the label maps are taken from the externally defined `config`
# object rather than from the checkpoint's own config.
model = VisualBertForQuestionAnswering.from_pretrained(
    "uclanlp/visualbert-vqa",
    num_labels=len(config.id2label),
    id2label=config.id2label,
    label2id=config.label2id,
    output_hidden_states=True,
)

# NOTE: an nn.DataParallel wrapper was sketched here previously but is not used.
model.to(device)
# Switch to evaluation mode for inference.
model.eval()
# Evaluate on the first 5000 question/annotation pairs only.
# NOTE(review): both lists are sliced independently, which presumes they are
# index-aligned — confirm where `questions` and `annotations` are loaded.
dataset = VQADataset(
    questions=questions[:5000],
    annotations=annotations[:5000],
    tokenizer=tokenizer,
    image_preprocess=image_preprocess,
    frcnn=frcnn,
    frcnn_cfg=frcnn_cfg,
)
# One sample per batch, in dataset order.
test_dataloader = DataLoader(dataset, batch_size=1, shuffle=False)
# Evaluation loop: compare the model's top-1 answer index with the
# highest-scoring annotated answer index for every sample.
#
# NOTE(review): this is arg-max agreement, not the soft VQA accuracy that
# credits the annotated score of the predicted answer. A target row that is
# all zeros resolves to index 0, so such samples are compared against label 0.
correct = 0
total = 0

# Inference only: skip autograd bookkeeping to save memory and time.
with torch.no_grad():
    for batch in tqdm(test_dataloader):
        # Keep a shallow copy that still holds the question text, then drop
        # the text from the batch because it is not a model input and cannot
        # be moved to a device.
        batch_text = batch.copy()
        batch.pop("text", None)
        batch = {k: v.to(device) for k, v in batch.items()}

        outputs = model(**batch)
        logits = outputs.logits  # [batch_size, num_labels]
        _, pre = torch.max(logits, 1)
        _, target = torch.max(batch["labels"], 1)
        print("prediction:", pre)
        print("target:", target)
        # TODO: the checkpoint's id2label does not hold real answer strings,
        # so predictions and targets are reported as indices only.

        # Accumulate plain Python ints so no device tensor is kept alive
        # across iterations, and count samples rather than batches.
        correct += (pre == target).sum().item()
        total += pre.size(0)
        print(total)

final_acc = correct / float(len(test_dataloader.dataset))
print('Accuracy of test: %f %%' % (100 * float(final_acc)))