I want to test my PEFT LoRA on DocVQA (Qwen2.5-VL-3B) but I am unable to do so; the code to reproduce the error is below. I tried asking Claude and ChatGPT, but they only went back and forth on whether the image token is present or not. I checked, and it is present as token id 151665. (Just remove the PEFT part if needed, and if possible help me run a test/evaluation on the DocVQA dataset; at the end I have included a sketch of what I think the intended input format is.) Also, is there some universal way of handling data on Hugging Face, or does it depend on the dataset?
from peft import get_peft_model, PeftModel, LoraConfig, TaskType
from transformers import AutoProcessor, AutoModelForImageTextToText, BitsAndBytesConfig
from trl import SFTConfig, SFTTrainer
import torch

device = "cuda" if torch.cuda.is_available() else "cpu"

model_id = "Qwen/Qwen2.5-VL-3B-Instruct"
min_pixels = 224 * 28 * 28
max_pixels = 224 * 28 * 28
processor = AutoProcessor.from_pretrained(model_id, min_pixels=min_pixels, max_pixels=max_pixels)
processor.tokenizer.padding_side = "right"

base_model = AutoModelForImageTextToText.from_pretrained(
    model_id,
    torch_dtype=torch.float16,
    quantization_config=BitsAndBytesConfig(load_in_8bit=True),  # load in 8-bit precision
    device_map="auto",
)
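For context, when I am not debugging the plain base model, I attach my LoRA adapter like this (the adapter path is just a placeholder for my checkpoint directory):

# Load the trained LoRA adapter on top of the quantized base model
model = PeftModel.from_pretrained(base_model, "path/to/my-lora-adapter")
model.eval()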
from datasets import load_dataset
from tqdm import tqdm
from collections import defaultdict
from PIL import Image

# Assuming base_model and processor are already loaded (PEFT removed for this repro)
model = base_model
def evaluate_docvqa(model, processor, num_samples=None, device="cuda"):
    # Load the DocVQA dataset (validation split; the test split's answers are withheld,
    # so local scoring against sample['answers'] only works on validation)
    dataset = load_dataset("lmms-lab/DocVQA", "DocVQA", split="validation")

    # Limit the number of samples if num_samples is provided
    if num_samples:
        dataset = dataset.select(range(min(num_samples, len(dataset))))

    correct = 0
    results = defaultdict(list)

    model.eval()  # Set the model to evaluation mode
    with torch.no_grad():
        for sample in tqdm(dataset):
            # Add the <image> token to the question text
            # (this is the part Claude/ChatGPT kept flagging)
            text_with_image = f"<image>{sample['question']}"

            # Ensure we have a PIL Image (datasets usually decodes to PIL already,
            # and Image.fromarray fails on an object that is already a PIL Image)
            image = sample['image']
            if not isinstance(image, Image.Image):
                image = Image.fromarray(image)
            image = image.resize((224, 224))  # Resize to a fixed size (adjust as necessary)

            # Process the image and text
            inputs = processor(
                images=image,          # use the resized image
                text=text_with_image,  # include the <image> token
                return_tensors="pt",
                padding=True,
                truncation=True,
            ).to(device)  # Move inputs to the same device as the model

            # Debug: check processed input tensor dimensions
            print(f"Processed input size: {inputs['pixel_values'].size()}")

            # Generate the output (predicted answer); max_new_tokens caps the answer
            # length instead of the total sequence length
            try:
                outputs = model.generate(**inputs, max_new_tokens=50, num_beams=5, early_stopping=True)
            except Exception as e:
                print(f"Error during generation: {e}")
                continue  # Skip to the next sample if there's an error

            # Strip the prompt tokens before decoding so only the answer remains
            generated = outputs[:, inputs['input_ids'].shape[1]:]
            predicted_answer = processor.batch_decode(generated, skip_special_tokens=True)[0]

            # Check if the predicted answer exactly matches any ground-truth answer
            is_correct = any(predicted_answer.strip().lower() == ans.lower() for ans in sample['answers'])
            correct += int(is_correct)

            # Store results for later analysis
            results['questions'].append(sample['question'])
            results['predictions'].append(predicted_answer)
            results['ground_truth'].append(sample['answers'])
            results['correct'].append(is_correct)

    # Calculate the accuracy
    accuracy = correct / len(dataset)
    return accuracy, results
# Call the function and print the results
accuracy, results = evaluate_docvqa(model, processor, num_samples=100, device="cuda")
print(f"Test Accuracy: {accuracy:.4f}")
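From the Qwen2.5-VL model card, I believe the inputs are supposed to be built with the processor's chat template (which inserts the vision placeholder tokens for you) rather than a raw "<image>" string. A minimal sketch of what I mean, untested end-to-end on my side, where image is a PIL image and sample is one dataset row:

# Build the prompt via the chat template so the vision tokens are inserted correctly
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "image": image},
            {"type": "text", "text": sample["question"]},
        ],
    }
]
prompt = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
inputs = processor(text=[prompt], images=[image], return_tensors="pt").to(device)
output_ids = model.generate(**inputs, max_new_tokens=50)
# Decode only the newly generated tokens, not the prompt
answer = processor.batch_decode(output_ids[:, inputs["input_ids"].shape[1]:], skip_special_tokens=True)[0]

Is this the right way to feed DocVQA samples to this model, or am I still missing something?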