requirements.txt
torch
transformers
diffusers
gradio>=4.0  # Zero GPU Spaces require the Gradio 4 SDK
gradio-client
spaces  # Zero GPU helper; preinstalled on Spaces, listed for local installs
accelerate
safetensors
huggingface_hub
app.py
import os
import gradio as gr
import spaces  # Import 'spaces' BEFORE 'torch' (required on Zero GPU)
import torch
print(f"CUDA available: {torch.cuda.is_available()}")
print(f"CUDA device count: {torch.cuda.device_count()}")
from diffusers import StableDiffusionControlNetPipeline, ControlNetModel, UniPCMultistepScheduler
from PIL import Image
import numpy as np
from transformers import SegformerFeatureExtractor, SegformerForSemanticSegmentation
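# Flow: input render -> SegFormer semantic segmentation -> color-coded mask ->
# ControlNet (seg) conditioning -> Stable Diffusion v1.5 -> photorealistic output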
# Note: on Zero GPU, @spaces.GPU must decorate the function that actually performs
# the GPU work (see generate() below); a standalone "reserve GPU" function has no effect.
# Load the segmentation model and processor
image_processor = SegformerFeatureExtractor.from_pretrained("nvidia/segformer-b0-finetuned-ade-512-512")
image_segmentor = SegformerForSemanticSegmentation.from_pretrained("nvidia/segformer-b0-finetuned-ade-512-512").to("cuda")
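# Note: recent transformers releases deprecate SegformerFeatureExtractor in favor of
# SegformerImageProcessor; the from_pretrained/call usage here is the same for both.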
# Load ControlNet (semantic segmentation)
controlnet = ControlNetModel.from_pretrained(
    "lllyasviel/sd-controlnet-seg", torch_dtype=torch.float16
).to("cuda")
# Load Stable Diffusion v1.5 with ControlNet
pipe = StableDiffusionControlNetPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5", controlnet=controlnet, safety_checker=None, torch_dtype=torch.float16
).to("cuda")
pipe.scheduler = UniPCMultistepScheduler.from_config(pipe.scheduler.config)
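# UniPC is a fast multistep scheduler; in practice ~20-30 inference steps are often
# enough (the pipeline default is 50), which helps fit inside a 60 s GPU window.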
# pipe.enable_model_cpu_offload()  # Low-VRAM alternative; do not combine with .to("cuda") above
# Color palette for the segmentation mask (covers the first 30 ADE20K classes; ADE20K defines 150)
palette = np.array([[0, 0, 0], [120, 120, 120], [180, 120, 120], [6, 230, 230], [80, 50, 50],
                    [140, 140, 140], [204, 5, 255], [230, 230, 230], [4, 250, 7], [224, 5, 255],
                    [235, 255, 7], [150, 5, 61], [120, 120, 70], [8, 255, 51], [255, 6, 82],
                    [143, 255, 140], [204, 255, 4], [255, 51, 7], [204, 70, 3], [0, 235, 255],
                    [118, 71, 183], [255, 177, 1], [232, 0, 116], [255, 153, 6], [255, 255, 7],
                    [255, 255, 0], [255, 255, 255], [255, 0, 0], [0, 0, 255], [255, 255, 255]])
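# Optional sketch (an assumption, not part of the original app): pad the palette with
# reproducible random colors so all 150 ADE20K labels get a color instead of staying black.
extra_colors = np.random.RandomState(0).randint(0, 256, size=(150 - len(palette), 3))
palette = np.concatenate([palette, extra_colors], axis=0)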
# Inference function; on Zero GPU the decorator attaches a GPU for up to 60 s per call
@spaces.GPU(duration=60)
def generate(image: Image.Image, prompt: str):
    # Process the input image
    image = image.convert("RGB").resize((512, 512))
    pixel_values = image_processor(image, return_tensors="pt").pixel_values.to("cuda")
    with torch.no_grad():
        outputs = image_segmentor(pixel_values).logits
    # Resize the logits to the input image size (PIL size is (W, H), so reverse it)
    upsampled_logits = torch.nn.functional.interpolate(
        outputs, size=image.size[::-1], mode="bilinear", align_corners=False
    )
    # Get the predicted segmentation map
    segmentation_map = upsampled_logits.argmax(dim=1)[0].cpu().numpy()
    # Create a color image from the segmentation map
    color_seg = np.zeros((segmentation_map.shape[0], segmentation_map.shape[1], 3), dtype=np.uint8)
    for label, color in enumerate(palette):
        color_seg[segmentation_map == label, :] = color
    control_image = Image.fromarray(color_seg.astype(np.uint8)).resize((512, 512))  # Resize for ControlNet
    with torch.autocast("cuda"):
        result = pipe(prompt=prompt, image=control_image, guidance_scale=7.5).images[0]
    # Save a copy to disk (optional); the returned PIL image is what Gradio displays
    result.save("/tmp/generated_output.png")
    return result
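# Hypothetical local smoke test (file names are illustrative, not part of the Space):
#   img = Image.open("render.png")
#   generate(img, "Photorealistic architecture with modern materials").save("out.png")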
# Gradio interface setup
demo = gr.Interface(
    fn=generate,
    inputs=[
        gr.Image(type="pil", label="Upload SketchUp Render"),
        gr.Textbox(label="Prompt", value="Photorealistic architecture with modern materials and lighting", type="text")
    ],
    outputs=[
        gr.Image(type="pil", label="Generated Image"),
    ],
    title="SketchUp to Realistic Render with Stable Diffusion 1.5 + ControlNet",
    description="Upload a SketchUp render with materials. This app uses semantic segmentation via SegFormer + ControlNet Segmentation + Stable Diffusion v1.5 to generate photorealistic results. Optimized for Hugging Face GPU Spaces."
)
if __name__ == "__main__":
    demo.launch()