I am trying to use the CLIP text encoder with 'openai/clip-vit-base-patch16', and it seems the default hidden size is 512. However, when I change the hidden size to 768 via CLIPTextConfig and then load the pretrained weights, it reverts back to the default 512. Is there a workaround for this, or a set of weights trained with a hidden size of 768?
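Here is a minimal snippet showing what I mean; as far as I can tell, from_pretrained rebuilds the model from the checkpoint's own config, so the custom configuration gets discarded:

from transformers import CLIPTextConfig, CLIPTextModel

configuration = CLIPTextConfig(hidden_size=768, projection_dim=768)
model = CLIPTextModel(configuration)
print(model.config.hidden_size)  # 768, as requested

# from_pretrained is a classmethod, so this builds a fresh model from the
# checkpoint's own config; the custom configuration above is discarded
model = model.from_pretrained("openai/clip-vit-base-patch16")
print(model.config.hidden_size)  # 512 again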
My full code is below:
import torch
from transformers import AutoTokenizer, CLIPTextModel, CLIPTextConfig
import numpy as np


class ClipEmbedder:
    def __init__(self,
                 pretrained_weights='openai/clip-vit-base-patch16',
                 model=CLIPTextModel,
                 max_seq_len=8,
                 config=None):
        super().__init__()
        configuration = CLIPTextConfig(hidden_size=768, projection_dim=768)
        self.pretrained_weights = pretrained_weights
        self.model = model(configuration)
        # this is where the hidden size reverts to 512: from_pretrained
        # replaces the model built from `configuration` above
        self.model = self.model.from_pretrained(self.pretrained_weights).to('cuda')
        self.max_seq_len = max_seq_len
        self.tokenizer = AutoTokenizer.from_pretrained(self.pretrained_weights)
        self.vocab_size = self.tokenizer.vocab_size

    def get_text(self, text):
        encoding = self.tokenizer(["a photo of a " + text], padding=True, return_tensors="pt")
        return (text, encoding)

    def get_clip_embeddings(self, inputs):
        outputs = self.model(**inputs)
        # last_hidden_state = outputs.last_hidden_state
        # pooled_output = outputs.pooler_output
        return outputs
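For reference, this is roughly how I call it ("cat" is just a placeholder label, and I move the inputs to CUDA to match the model):

embedder = ClipEmbedder()
text, encoding = embedder.get_text("cat")
# the tokenizer output is on CPU, so move the tensors to the model's device
encoding = {k: v.to('cuda') for k, v in encoding.items()}
outputs = embedder.get_clip_embeddings(encoding)
print(outputs.last_hidden_state.shape)  # last dim is 512, not 768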