Pytorch tokenizer unable to create tensor error

I am trying to fine-tune a vision transformer model and my datasets are in (image, text) format. I am following this example code from Pix2Struct:
https://github.com/huggingface/notebooks/blob/main/examples/image_captioning_pix2struct.ipynb.

This is the dataset and collator function code:

processor = Pix2StructProcessor.from_pretrained('google/deplot')
model = Pix2StructForConditionalGeneration.from_pretrained('google/deplot')

class DeplotDataset(Dataset):
    def __init__(self, image_folder, text_folder, processor, transform=None):
        self.image_folder = image_folder
        self.text_folder = text_folder
        self.processor = processor
        self.transform = transform

        self.image_filenames = sorted(os.listdir(image_folder))
        self.text_filenames = sorted(os.listdir(text_folder))

    def __len__(self):
        return len(self.image_filenames)    

    def __getitem__(self, index):
        image_filename = self.image_filenames[index]
        text_filename = self.text_filenames[index]

        image_path = os.path.join(self.image_folder, image_filename)
        text_path = os.path.join(self.text_folder, text_filename)

        image = Image.open(image_path)
        with open(text_path, 'r') as f:
            text = f.read()

        if self.transform:
            image = self.transform(image)

        encoding = self.processor(images=image, text="Generate underlying data table of the figure below:", return_tensors="pt", add_special_tokens=True, max_patches=MAX_PATCHES)
        
        encoding = {k:v.squeeze() for k,v in encoding.items()}
        encoding["text"] = text
        return encoding

def collator(batch):
    new_batch = {"flattened_patches":[], "attention_mask":[]}
    texts = [item["text"] for item in batch]

    text_inputs = processor(text=texts, padding="max_length", return_tensors="pt", add_special_tokens=True, max_length=20)

    new_batch["labels"] = text_inputs.input_ids

    for item in batch:
        new_batch["flattened_patches"].append(item["flattened_patches"])
        new_batch["attention_mask"].append(item["attention_mask"])

    new_batch["flattened_patches"] = torch.stack(new_batch["flattened_patches"])
    new_batch["attention_mask"] = torch.stack(new_batch["attention_mask"])

    return new_batch

The problem is that I keep getting this ValueError

ValueError                                Traceback (most recent call last)
File /data_new/robin/Deplot/venv/lib/python3.8/site-packages/transformers/tokenization_utils_base.py:731, in BatchEncoding.convert_to_tensors(self, tensor_type, prepend_batch_axis)
    730 if not is_tensor(value):
--> 731     tensor = as_tensor(value)
    733     # Removing this for now in favor of controlling the shape with `prepend_batch_axis`
    734     # # at-least2d
    735     # if tensor.ndim > 2:
    736     #     tensor = tensor.squeeze(0)
    737     # elif tensor.ndim < 2:
    738     #     tensor = tensor[None, :]

ValueError: expected sequence of length 236 at dim 1 (got 67)

The above exception was the direct cause of the following exception:
ValueError                                Traceback (most recent call last)
Cell In[15], line 15
     13 for epoch in tqdm(range(EPOCHS)):
     14     print("Epoch:", epoch)
---> 15     for idx, batch in enumerate(train_dataloader):
     16         labels = batch.pop("labels").to(device)
     17         flattened_patches = batch.pop("flattened_patches").to(device)

...

Cell In[12], line 5, in collator(batch)
      2 new_batch = {"flattened_patches":[], "attention_mask":[]}
      3 texts = [item["text"] for item in batch]
----> 5 text_inputs = processor(text=texts, padding="max_length", return_tensors="pt", add_special_tokens=True, max_length=20)
      7 new_batch["labels"] = text_inputs.input_ids
      9 for item in batch:

...

ValueError: Unable to create tensor, you should probably activate truncation and/or padding with 'padding=True' 'truncation=True' to have batched tensors with the same length. Perhaps your features (`input_ids` in this case) have excessive nesting (inputs type `list` where type `int` is expected).

I have tried to fine-tune the model by using padding="max_length" and truncate=True, but it seemed like the model did not converge (maybe due to incomplete tokenization). Any ideas or guidance on how I could fix this would be appreciated.