I take this NumPy array and convert it to an image with the following function:
def convert_array_to_pil(image_array, mode="L"):
    if image_array.dtype != np.uint8:
        # Ensure values are within [0, 1]. If not, you might need to normalize.
        image_array = np.clip(image_array, 0.0, 1.0)
        image_array = (image_array * 255).astype(np.uint8)
    return Image.fromarray(image_array, mode)
image_pil = convert_array_to_pil(image_np, mode="L")
# Then I convert it to RGB
image_rgb = image_pil.convert("RGB")
# Convert the mask to a PIL Image (the mask is assumed to be already uint8 with values 0 or 255)
mask_pil = Image.fromarray(mask_np.astype(np.uint8), mode="L")
Now I do the inpainting with the following script:
pipe = StableDiffusionInpaintPipeline.from_pretrained(
    "stabilityai/stable-diffusion-2-inpainting",
    torch_dtype=torch.float16,
)
pipe.to("cuda")
# Run the pipeline with the prompt.
result = pipe(
    prompt=prompt,
    image=image_rgb,
    mask_image=mask_pil,
    num_inference_steps=num_inference_steps,
    height=image_np.shape[0],
    width=image_np.shape[1],
)
# The result is a list of PIL Images; take the first result.
inpainted_rgb = result.images[0]
# Convert the resulting RGB image back to grayscale.
inpainted_gray = inpainted_rgb.convert("L")
# Convert the PIL image to a NumPy array (default uint8) and then normalize to [0,1] as float64.
inpainted_uint8 = np.array(inpainted_gray)
inpainted_array = inpainted_uint8.astype(np.float64) / 255.0
The result I get is shown in the middle figure, and it is completely different from what I expected.
What am I doing wrong?
The cause is probably that image_np is not a normal RGB (3-channel) image but a 2D array, which this code path cannot handle. (When I forcibly reproduced the 2D case by converting with "P" (palette) mode, the code itself ran, though with strange output…)
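As a quick illustration (my own sketch, not part of the script below): Image.fromarray only accepts a 2D array for a single-channel mode such as "L", and an (H, W, 3) array for "RGB"; this mismatch is exactly what triggers the error quoted in the script:
import numpy as np
from PIL import Image

gray_2d = np.zeros((512, 512), dtype=np.uint8)
Image.fromarray(gray_2d, mode="L")    # OK: 2D array, single-channel mode
rgb_3d = np.zeros((512, 512, 3), dtype=np.uint8)
Image.fromarray(rgb_3d, mode="RGB")   # OK: (H, W, 3) array, 3-channel mode
# Image.fromarray(rgb_3d, mode="L")   # ValueError: Too many dimensions: 3 > 2.
The full script I used to reproduce the problem: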
import numpy as np
from PIL import Image
from diffusers import StableDiffusionInpaintPipeline
import torch
prompt = "test"
num_inference_steps = 28
image_np_file = "np_image.png"
# image_np = np.array(Image.open(image_np_file).resize((512, 512)).convert("P"))  # this works (with strange output)
image_np = np.array(Image.open(image_np_file).resize((512, 512)).convert("RGB"))  # this raises "ValueError: Too many dimensions: 3 > 2."
mask_np = np.zeros_like(image_np)
# Note: "image_np == np.nan" is always False (NaN never compares equal), and a
# uint8 array loaded from a PNG cannot contain NaN anyway, so the mask stays all zeros.
mask_np[np.isnan(image_np)] = 255
def convert_array_to_pil(image_array, mode="L"):
    if image_array.dtype != np.uint8:
        # Ensure values are within [0, 1]. If not, you might need to normalize.
        image_array = np.clip(image_array, 0.0, 1.0)
        image_array = (image_array * 255).astype(np.uint8)
    return Image.fromarray(image_array, mode)
image_pil = convert_array_to_pil(image_np, mode="L")
# Then I convert it to RGB
image_rgb = image_pil.convert("RGB")
# Convert the mask to a PIL Image (the mask is assumed to be already uint8 with values 0 or 255)
mask_pil = Image.fromarray(mask_np.astype(np.uint8), mode="L")
pipe = StableDiffusionInpaintPipeline.from_pretrained(
    "stabilityai/stable-diffusion-2-inpainting",
    torch_dtype=torch.float16,
)
pipe.to("cuda")
# Run the pipeline with the prompt.
result = pipe(
    prompt=prompt,
    image=image_rgb,
    mask_image=mask_pil,
    num_inference_steps=num_inference_steps,
    height=image_np.shape[0],
    width=image_np.shape[1],
)
# The result is a list of PIL Images; take the first result.
inpainted_rgb = result.images[0]
# Convert the resulting RGB image back to grayscale.
inpainted_gray = inpainted_rgb.convert("L")
# Convert the PIL image to a NumPy array (default uint8) and then normalize to [0,1] as float64.
inpainted_uint8 = np.array(inpainted_gray)
inpainted_array = inpainted_uint8.astype(np.float64) / 255.0
Image.fromarray(image_np).convert("RGB").save("_image_np.png")
Image.fromarray(mask_np).convert("RGB").save("_mask_np.png")
inpainted_rgb.convert("RGB").save("_inpaint_rgb.png")
# Scale the [0, 1] float array back to uint8 first; Image.fromarray on a float64
# array yields a mode "F" image, which does not convert to RGB cleanly.
Image.fromarray((inpainted_array * 255).astype(np.uint8)).convert("RGB").save("_inpaint_array.png")
I tried many approaches to turn the 2D array into an RGB image, but none of them produced reasonable results.
Is this case too simple for this approach?
I think that inpainting and image-to-image in Diffusers can't handle colors well unless the input is RGB; they seem to work as expected with single-channel grayscale or 3-channel RGB input.
It would be more reliable to convert the 2D NumPy array to a 3-channel array first and then pass it to inpainting.
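For example, a minimal sketch of that conversion, assuming image_np is a 2D float array in [0, 1] whose missing pixels are NaN (the variable names here are illustrative):
import numpy as np
from PIL import Image

# Build the inpainting mask from the NaN locations before filling them in.
mask_pil = Image.fromarray(np.where(np.isnan(image_np), 255, 0).astype(np.uint8), mode="L")
# Replace NaNs, scale [0, 1] floats to uint8, and stack the channel three times.
gray_uint8 = (np.clip(np.nan_to_num(image_np, nan=0.0), 0.0, 1.0) * 255).astype(np.uint8)
rgb_array = np.repeat(gray_uint8[:, :, None], 3, axis=2)  # (H, W) -> (H, W, 3)
image_rgb = Image.fromarray(rgb_array, mode="RGB")
# image_rgb and mask_pil can then be passed to the inpainting pipeline as above.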