Diffusers load custom embedding

I want to generate an image from a text embedding instead of a plain text prompt, using CLIP to tokenize and embed the text.
The code so far:

from transformers import AutoTokenizer, CLIPTextModelWithProjection

# CLIP text model (with projection head) and matching tokenizer
model = CLIPTextModelWithProjection.from_pretrained("openai/clip-vit-base-patch32")
tokenizer = AutoTokenizer.from_pretrained("openai/clip-vit-base-patch32")


from diffusers import StableDiffusionPipeline, DDIMScheduler
import torch


path = "path_to_my_model.safetensors"

# Load the Stable Diffusion checkpoint from a single .safetensors file
pipe = StableDiffusionPipeline.from_single_file(path, torch_dtype=torch.float16, use_safetensors=True, variant="fp16")
pipe.to("cuda")

prompt = "some random prompt"

# Tokenize the prompt and embed it with the CLIP text model
text_input = tokenizer(prompt, padding="max_length", max_length=tokenizer.model_max_length, truncation=True, return_tensors="pt")

text_embeddings = model(text_input.input_ids)[0]

batch_size = len(text_input)

# Embeddings for the unconditional (empty) prompt, for classifier-free guidance
uncond_input = tokenizer(
    [""] * batch_size, padding="max_length", max_length=tokenizer.model_max_length, return_tensors="pt", truncation=True
)

uncond_embeddings = model(uncond_input.input_ids)[0]


# Concatenate unconditional and conditional embeddings and pass them to the pipeline
text_embeddings = torch.cat([uncond_embeddings, text_embeddings])

output_image = pipe(prompt_embeds=text_embeddings).images[0]

So far I get: ValueError: not enough values to unpack (expected 3, got 2)

The embedding shape looks fine to me, though: text_embeddings.shape is torch.Size([3, 512]).
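
A quick way to compare the shapes side by side (a minimal sketch continuing from the code above; pipe.tokenizer and pipe.text_encoder are the pipeline's bundled CLIP tokenizer and text encoder, and the printed hidden size assumes a standard SD 1.x checkpoint):

# Shape check (sketch): compare my embeddings against the pipeline's own text encoder output
ids = pipe.tokenizer(prompt, padding="max_length", max_length=pipe.tokenizer.model_max_length,
                     truncation=True, return_tensors="pt").input_ids.to("cuda")
with torch.no_grad():
    print(pipe.text_encoder(ids)[0].shape)  # per-token hidden states, e.g. torch.Size([1, 77, 768]) for SD 1.x
print(text_embeddings.shape)                # torch.Size([3, 512])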

I can't figure out where the issue is. I also tried not concatenating with uncond_embeddings (see the sketch below).
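
Roughly, that attempt looked like this (a minimal sketch, reusing model, text_input, and pipe from the code above):

# Same as above, but passing only the prompt embeddings (no uncond concat);
# text_embeddings here is the projected output of CLIPTextModelWithProjection, shape [1, 512]
text_embeddings = model(text_input.input_ids)[0]
output_image = pipe(prompt_embeds=text_embeddings).images[0]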
