I'm working from the following code, but I'd like to use the tokenized prompt so that I can interpolate between two prompts and produce a video output. Creating the video from the frames is easy enough, but I'm stuck here at the beginning. Any advice is appreciated.
"""Interpolate between two prompts in SDXL-Turbo embedding space and render one frame per step.

Key point: tokenizing alone is not enough — token ids are discrete integers and cannot be
interpolated meaningfully. You need the *embeddings* produced by the text encoders, which
live in a continuous space. `pipe.encode_prompt(...)` runs both SDXL tokenizers AND both
text encoders for you and returns exactly the tensors the pipeline accepts via its
`prompt_embeds=` / `pooled_prompt_embeds=` arguments.
"""
from diffusers import AutoPipelineForText2Image
import torch

torch.cuda.empty_cache()

pipe = AutoPipelineForText2Image.from_pretrained(
    "stabilityai/sdxl-turbo", torch_dtype=torch.float16, variant="fp16"
)
pipe.to("cuda")

for item in pipe.components:
    print(item)

prompt = "A cinematic shot of a baby racoon wearing an intricate italian priest robe."
prompt_b = "A cinematic shot of a baby racoon wearing a shimmering astronaut suit."


def encode(text):
    """Return (prompt_embeds, pooled_prompt_embeds) for `text`.

    With guidance_scale=0.0 (SDXL-Turbo default usage) classifier-free guidance is off,
    so the negative-embedding outputs are None and can be discarded.
    """
    prompt_embeds, _, pooled_prompt_embeds, _ = pipe.encode_prompt(
        prompt=text,
        device="cuda",
        num_images_per_prompt=1,
        do_classifier_free_guidance=False,
    )
    return prompt_embeds, pooled_prompt_embeds


embeds_a, pooled_a = encode(prompt)
embeds_b, pooled_b = encode(prompt_b)

# Original single-image behavior, now via the embedding path (equivalent to prompt=prompt).
image = pipe(
    prompt_embeds=embeds_a,
    pooled_prompt_embeds=pooled_a,
    num_inference_steps=1,
    guidance_scale=0.0,
).images[0]
image.save("output.png")

# Interpolation: one frame per step between the two prompts' embeddings.
# torch.lerp is plain linear interpolation; for smoother transitions consider slerp
# (spherical interpolation), which better respects the embedding geometry.
num_frames = 30
# Fix the seed so frame-to-frame changes come from the prompt, not from fresh noise.
generator = torch.Generator(device="cuda")
for i in range(num_frames):
    t = i / (num_frames - 1)  # t runs 0.0 -> 1.0 inclusive
    generator.manual_seed(42)
    frame = pipe(
        prompt_embeds=torch.lerp(embeds_a, embeds_b, t),
        pooled_prompt_embeds=torch.lerp(pooled_a, pooled_b, t),
        num_inference_steps=1,
        guidance_scale=0.0,
        generator=generator,
    ).images[0]
    frame.save(f"frame_{i:03d}.png")