DocVQA test dataset evaluation on qwen2.5-VL-3B

I want to test my peft lora on DocVQA (Qwen-2.5-VL-3B) but I am unable to do so. here is the code to reproduce error. I tried asking claude and chatGPT but they were saying image token is present/not present. i checked and its present as “151665” token id. (just remove peft and if possible help me running a test/evaluation on docVQA dataset.) Also is there some universal way of handling data on hugging face or it depends on dataset to dataset ?

from peft import get_peft_model, PeftModel, LoraConfig, TaskType
from transformers import AutoProcessor, AutoModelForImageTextToText , BitsAndBytesConfig
from trl import SFTConfig, SFTTrainer
import torch

device = “cuda” if torch.cuda.is_available() else “cpu”
model_id = “Qwen/Qwen2.5-VL-3B-Instruct”

min_pixels = 2242828
max_pixels = 2242828
processor = AutoProcessor.from_pretrained(model_id, min_pixels=min_pixels, max_pixels=max_pixels)
processor.tokenizer.padding_side = “right”

base_model = AutoModelForImageTextToText.from_pretrained(
model_id,
torch_dtype=torch.float16,
load_in_8bit=True, # Load in 8-bit precision
device_map=“auto”
)

from datasets import load_dataset
from tqdm import tqdm
import torch
from collections import defaultdict
from PIL import Image

Assuming base_model and processor are already loaded

model = base_model
processor = processor # Replace with the actual processor instance

def evaluate_docvqa(model, processor, num_samples=None, device=“cuda”):
# Load the DocVQA dataset (test split)
dataset = load_dataset(“lmms-lab/DocVQA”, “DocVQA”, split=“test”)

# Limit the number of samples if num_samples is provided
if num_samples:
    dataset = dataset.select(range(min(num_samples, len(dataset))))

correct = 0
results = defaultdict(list)

model.eval()  # Set the model to evaluation mode

with torch.no_grad():
    for sample in tqdm(dataset):
        # Add the <image> token to the question text
        text_with_image = f"<image>{sample['question']}"

        # Convert the NumPy array to a PIL Image
        image = sample['image']
        image = Image.fromarray(image)  # Convert to PIL Image if it's not already
        image = image.resize((224, 224))  # Resize to a specific dimension (adjust as necessary)

        # Process the image and text
        inputs = processor(
            images=image,  # Use resized image
            text=text_with_image,  # Include <image> token
            return_tensors="pt",
            padding=True,
            truncation=True
        ).to(device)  # Move inputs to the same device as the model

        # Debug: Check processed input tensor dimensions
        print(f"Processed input size: {inputs['pixel_values'].size()}")

        # Generate the output (predicted answer)
        try:
            outputs = model.generate(**inputs, max_length=50, num_beams=5, early_stopping=True)
        except Exception as e:
            print(f"Error during generation: {e}")
            continue  # Skip to the next sample if there's an error

        predicted_answer = processor.batch_decode(outputs, skip_special_tokens=True)[0]

        # Check if the predicted answer matches any ground truth answer
        is_correct = any(predicted_answer.lower() == ans.lower() for ans in sample['answers'])
        correct += int(is_correct)

        # Store results for later analysis
        results['questions'].append(sample['question'])
        results['predictions'].append(predicted_answer)
        results['ground_truth'].append(sample['answers'])
        results['correct'].append(is_correct)

# Calculate the accuracy
accuracy = correct / len(dataset)
return accuracy, results

Call the function and print the results

accuracy, results = evaluate_docvqa(model, processor, num_samples=100, device=“cuda”)
print(f"Test Accuracy: {accuracy:.4f}")

1 Like