Changing the hidden size in the CLIP text encoder

I am trying to use the CLIP text encoder with ‘clip-vit-base-patch16’, and it seems that the default hidden size is 512. However, when I try to change the hidden size to 768 using CLIPTextConfig and then load the pretrained weights, it reverts back to the default of 512. Is there any workaround for this? Or is there a set of weights that was trained with a hidden size of 768?

My code is below

import torch
from transformers import AutoTokenizer, CLIPTextModel,CLIPTextConfig
import numpy as np

# Bundles a text model (`self.model`) with its tokenizer (`self.tokenizer`),
# both loaded from the same `pretrained_weights` identifier.
class ClipEmbedder:
    # NOTE(review): the parameter list of __init__ appears to be truncated in
    # this paste. The body reads `pretrained_weights`, `model` and
    # `max_seq_len`, so those were presumably parameters -- confirm against
    # the original file before running.
    def __init__(self,
        # Custom text config requesting hidden_size=768 and projection_dim=768.
        configuration = CLIPTextConfig(hidden_size=768,projection_dim = 768)
        self.pretrained_weights = pretrained_weights
        # First assignment: instantiate `model` from the custom configuration.
        self.model = model(configuration)
        # Second assignment: `self.model` is overwritten with whatever
        # `from_pretrained` returns, then moved to the 'cuda' device. The
        # instance built from `configuration` on the previous line is
        # discarded here.
        # NOTE(review): `configuration` is not passed to `from_pretrained`,
        # so the loaded model presumably uses the checkpoint's own config --
        # consistent with the hidden size "reverting" as described in the
        # question; confirm against the transformers documentation.
        self.model = self.model.from_pretrained(self.pretrained_weights).to('cuda')
        # Stored but not used anywhere else in the visible code.
        self.max_seq_len = max_seq_len
        self.tokenizer = AutoTokenizer.from_pretrained(self.pretrained_weights)
        self.vocab_size = self.tokenizer.vocab_size

    def get_text(self, text):
        """Tokenize ``"a photo of a " + text`` and return ``(text, encoding)``.

        The returned ``text`` is the caller's original string, without the
        prompt prefix; ``encoding`` is the tokenizer output for the prefixed
        string, requested with ``padding=True`` and ``return_tensors="pt"``.
        """
        # The prompt is wrapped in a one-element list before tokenizing.
        encoding=self.tokenizer(["a photo of a "+ text], padding=True, return_tensors="pt")
        return (text, encoding)

    # NOTE(review): this parameter list also appears truncated in the paste.
    # The body reads `inputs`, which was presumably a parameter -- confirm.
    def get_clip_embeddings(self,
        # `inputs` is unpacked as keyword arguments to the model call.
        # NOTE(review): the model was moved to 'cuda' in __init__; nothing in
        # the visible code moves `inputs` to a device -- verify the caller
        # does.
        outputs = self.model(**inputs)
        # Disabled alternatives kept by the author: pick a single field of
        # `outputs` instead of returning the whole object.
        # last_hidden_state = outputs.last_hidden_state
        # pooled_output = outputs.pooler_output
        # import pdb; pdb.set_trace()
        # The model's output object is returned unmodified.
        return outputs