I want to generate an image using text embeddings instead of a text prompt as input, using CLIP to tokenize and embed the prompt.
The code so far:
import torch
from transformers import AutoTokenizer, CLIPTextModelWithProjection
from diffusers import StableDiffusionPipeline, DDIMScheduler

# CLIP text encoder + tokenizer used to build the embeddings
model = CLIPTextModelWithProjection.from_pretrained("openai/clip-vit-base-patch32")
tokenizer = AutoTokenizer.from_pretrained("openai/clip-vit-base-patch32")

# Stable Diffusion pipeline loaded from a local .safetensors checkpoint
path = "path_to_my_model.safetensors"
pipe = StableDiffusionPipeline.from_single_file(path, torch_dtype=torch.float16, use_safetensors=True, variant="fp16")
pipe.to("cuda")
prompt = "some random prompt"
text_input = tokenizer(prompt, padding="max_length", max_length=tokenizer.model_max_length, truncation=True, return_tensors="pt")
text_embeddings = model(text_input.input_ids)[0]
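# Note: I believe output[0] of CLIPTextModelWithProjection is the pooled/projected
# text_embeds, i.e. a 2-D [batch, 512] tensor rather than per-token hidden states
print(text_embeddings.shape)  # torch.Size([1, 512]) here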
batch_size = len(text_input)
uncond_input = tokenizer(
    [""] * batch_size, padding="max_length", max_length=tokenizer.model_max_length, return_tensors="pt", truncation=True
)
uncond_embeddings = model(uncond_input.input_ids)[0]
text_embeddings = torch.cat([uncond_embeddings, text_embeddings])
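# (I'm not sure this concatenation is right — maybe the unconditional part is
# supposed to be passed separately as negative_prompt_embeds instead of being stacked here?)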
output_image = pipe(prompt_embeds=text_embeddings).images[0]
So far I get:
ValueError: not enough values to unpack (expected 3, got 2)
even though the embedding shape is [3, 512]:
text_embeddings.shape
torch.Size([3, 512])
I can't figure out where the issue is. I also tried not concatenating with uncond_embeddings.
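For reference, my (possibly wrong) understanding is that prompt_embeds is normally a 3-D [batch, seq_len, hidden] tensor like the one the pipeline's own text encoder produces. A minimal sketch of that, reusing the pipe and prompt from above (pipe.tokenizer and pipe.text_encoder are the components bundled with the loaded checkpoint; cond_embeds is just my own variable name):

with torch.no_grad():
    ids = pipe.tokenizer(
        prompt,
        padding="max_length",
        max_length=pipe.tokenizer.model_max_length,
        truncation=True,
        return_tensors="pt",
    ).input_ids.to("cuda")
    # last_hidden_state of the pipeline's CLIP text encoder: [1, 77, 768] for an SD 1.x checkpoint
    cond_embeds = pipe.text_encoder(ids)[0]
print(cond_embeds.shape)

That gives a 3-D tensor, unlike my [3, 512] one, so I suspect the shape is what's going wrong, but I don't see how to feed my externally computed embeddings in correctly.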