I would like to know how to use a data collator when dealing with text and images. I have a task where I need to input an image and text information to a model. My goal is to create a Pytorch Dataset
which can output images and tokenized texts. This is my Dataset
class, however I don’t know how I can use a data collator with this definition.
class CustomDataset(Dataset):
def __init__(self, dataframe, tokenizer):
self.dataframe = dataframe
self.tokenizer = tokenizer
def __len__(self):
return len(self.dataframe)
def __getitem__(self, index):
idx = self.dataframe['image_id'][index] # Select image id for OOF
image_path = self.dataframe['path'][index] # Get image path
text = self.dataframe['text'][index] # Get image text
image = cv2.imread(image_path) # Read image
image = self.transform(image=image) # Apply transforms
image = image['image'] # Extract image from dictionary
# Tokenize the input text
encoded_text = self.tokenizer.encode_plus(
text,
padding=True,
truncation=True,
return_tensors='pt'
)
# Convert to tensors
for k, v in encoded_text.items():
encoded_text[k] = torch.tensor(v, dtype=torch.long)
print(encoded_text[k].shape)
data = {
"index": idx,
"image": image,
"text": encoded_text
}
return data
I have tried:
tokenizer = AutoTokenizer.from_pretrained(config.TOKENIZER_NAME)
data_collator = DataCollatorWithPadding(tokenizer, padding="longest", return_tensors = 'pt')
train = CustomDataset(train_data, tokenizer)
train_loader = DataLoader(train,
batch_size=config.BATCH_SIZE_TRAIN,
shuffle=True,
num_workers=config.NUM_WORKERS,
drop_last=True,
collate_fn=data_collator)
And get the following error when iterating with the data loader:
ValueError: You should supply an encoding or a list of encodings to this method that includes input_ids, but you provided ['index', 'image', 'text']