I am trying to fine-tune a vision transformer model (Pix2Struct, using the google/deplot checkpoint) on a dataset of (image, text) pairs. I am following this example notebook from Pix2Struct:
https://github.com/huggingface/notebooks/blob/main/examples/image_captioning_pix2struct.ipynb.
This is the dataset and collator function code:
import os
import torch
from PIL import Image
from torch.utils.data import Dataset
from transformers import Pix2StructProcessor, Pix2StructForConditionalGeneration

MAX_PATCHES = 1024  # same value as in the linked notebook

processor = Pix2StructProcessor.from_pretrained('google/deplot')
model = Pix2StructForConditionalGeneration.from_pretrained('google/deplot')

class DeplotDataset(Dataset):
    def __init__(self, image_folder, text_folder, processor, transform=None):
        self.image_folder = image_folder
        self.text_folder = text_folder
        self.processor = processor
        self.transform = transform
        self.image_filenames = sorted(os.listdir(image_folder))
        self.text_filenames = sorted(os.listdir(text_folder))

    def __len__(self):
        return len(self.image_filenames)

    def __getitem__(self, index):
        image_filename = self.image_filenames[index]
        text_filename = self.text_filenames[index]
        image_path = os.path.join(self.image_folder, image_filename)
        text_path = os.path.join(self.text_folder, text_filename)

        image = Image.open(image_path)
        with open(text_path, 'r') as f:
            text = f.read()

        if self.transform:
            image = self.transform(image)

        # The text passed here is the fixed prompt; the target text is attached
        # separately and only tokenized later, in the collator.
        encoding = self.processor(images=image,
                                  text="Generate underlying data table of the figure below:",
                                  return_tensors="pt",
                                  add_special_tokens=True,
                                  max_patches=MAX_PATCHES)
        encoding = {k: v.squeeze() for k, v in encoding.items()}
        encoding["text"] = text
        return encoding

def collator(batch):
    new_batch = {"flattened_patches": [], "attention_mask": []}
    texts = [item["text"] for item in batch]

    text_inputs = processor(text=texts,
                            padding="max_length",
                            return_tensors="pt",
                            add_special_tokens=True,
                            max_length=20)

    new_batch["labels"] = text_inputs.input_ids

    for item in batch:
        new_batch["flattened_patches"].append(item["flattened_patches"])
        new_batch["attention_mask"].append(item["attention_mask"])

    new_batch["flattened_patches"] = torch.stack(new_batch["flattened_patches"])
    new_batch["attention_mask"] = torch.stack(new_batch["attention_mask"])

    return new_batch
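In case it matters, the DataLoader is wired up roughly like this (the folder paths and batch size below are placeholders, not my exact values):

from torch.utils.data import DataLoader

train_dataset = DeplotDataset("images/train", "texts/train", processor)
train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=2,
                              collate_fn=collator)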
The problem is that I keep getting this ValueError:
ValueError Traceback (most recent call last)
File /data_new/robin/Deplot/venv/lib/python3.8/site-packages/transformers/tokenization_utils_base.py:731, in BatchEncoding.convert_to_tensors(self, tensor_type, prepend_batch_axis)
730 if not is_tensor(value):
--> 731 tensor = as_tensor(value)
733 # Removing this for now in favor of controlling the shape with `prepend_batch_axis`
734 # # at-least2d
735 # if tensor.ndim > 2:
736 # tensor = tensor.squeeze(0)
737 # elif tensor.ndim < 2:
738 # tensor = tensor[None, :]
ValueError: expected sequence of length 236 at dim 1 (got 67)
The above exception was the direct cause of the following exception:
ValueError Traceback (most recent call last)
Cell In[15], line 15
13 for epoch in tqdm(range(EPOCHS)):
14 print("Epoch:", epoch)
---> 15 for idx, batch in enumerate(train_dataloader):
16 labels = batch.pop("labels").to(device)
17 flattened_patches = batch.pop("flattened_patches").to(device)
...
Cell In[12], line 5, in collator(batch)
2 new_batch = {"flattened_patches":[], "attention_mask":[]}
3 texts = [item["text"] for item in batch]
----> 5 text_inputs = processor(text=texts, padding="max_length", return_tensors="pt", add_special_tokens=True, max_length=20)
7 new_batch["labels"] = text_inputs.input_ids
9 for item in batch:
...
ValueError: Unable to create tensor, you should probably activate truncation and/or padding with 'padding=True' 'truncation=True' to have batched tensors with the same length. Perhaps your features (`input_ids` in this case) have excessive nesting (inputs type `list` where type `int` is expected).
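If I read the first traceback correctly, the tokenized labels in a batch end up with different lengths (236 and 67 tokens), and padding="max_length" with max_length=20 cannot equalize them because both sequences are already longer than 20 tokens. A ragged nested list like that cannot be converted into a tensor, which this minimal snippet (illustrative only, not my actual data) reproduces:

import torch

# Two label rows of different lengths cannot be stacked into one tensor:
torch.tensor([[1] * 236, [2] * 67])
# ValueError: expected sequence of length 236 at dim 1 (got 67)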
I have tried adding truncation=True next to padding="max_length" in the collator (the exact call I used is shown below), which makes the ValueError go away, but the model then did not seem to converge (maybe because truncating to max_length=20 cuts off most of the target text). Any ideas or guidance on how I could fix this would be appreciated.
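For reference, this is the truncating variant of the processor call I tried inside collator (truncation and max_length are standard tokenizer arguments; nothing else in the function changed):

text_inputs = processor(text=texts,
                        padding="max_length",
                        truncation=True,
                        max_length=20,
                        return_tensors="pt",
                        add_special_tokens=True)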