Program not working on GPU but works on CPU

1650Ti doesn’t support fp16…?


So it won’t work, right? Should I just go ahead with the CPU?

import logging
from diffusers import DiffusionPipeline
import torch

# =========================
# STEP 0: Logging Setup
# =========================

# Remove all existing logging handlers
for handler in logging.root.handlers[:]:
    logging.root.removeHandler(handler)

log_file = "generation_log.txt"

# Set up logging to file (and optionally console)
logging.basicConfig(
    level=logging.DEBUG,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler(log_file, mode='w'),
        logging.StreamHandler()  # Remove this line if you want logs only in the file
    ]
)

# Enable DEBUG logging for all major libraries
libraries_to_log = [
    "accelerate", "aiofiles", "annotated_types", "anyio", "certifi", "charset_normalizer",
    "click", "colorama", "deepspeed", "diffusers", "exceptiongroup", "fastapi", "ffmpy",
    "filelock", "flash_attention", "fsspec", "gradio", "gradio_client", "groovy", "h11",
    "hjson", "httpcore", "httpx", "huggingface_hub", "idna", "importlib_metadata", "Jinja2",
    "markdown_it_py", "MarkupSafe", "mdurl", "mpmath", "mypy_extensions", "networkx",
    "ninja", "numpy", "orjson", "packaging", "pandas", "peft", "pillow", "psutil", "py-cpuinfo",
    "pydantic", "pydantic_core", "pydub", "Pygments", "pyre_extensions", "python_dateutil",
    "python_multipart", "pytz", "PyYAML", "regex", "requests", "rich", "ruff", "safehttpx",
    "safetensors", "semantic_version", "setuptools", "shellingham", "six", "sniffio",
    "starlette", "sympy", "tokenizers", "tomlkit", "torch", "torchaudio", "torchvision",
    "tqdm", "transformers", "typer", "typing_extensions", "typing_inspect",
    "typing_inspection", "tzdata", "urllib3", "uvicorn", "websockets", "zipp"
]

for lib in libraries_to_log:
    logging.getLogger(lib).setLevel(logging.DEBUG)

# =========================
# STEP 1: Model Load & Image Generation
# =========================

logger = logging.getLogger(__name__)
logger.info("Loading the diffusion pipeline...")

pipe = DiffusionPipeline.from_pretrained(
    "stabilityai/stable-diffusion-xl-base-1.0",
    torch_dtype=torch.float32
).to("cpu")

logger.info("Pipeline loaded successfully.")

prompt = "A clear sunny landscape with mountains and a river"
logger.info(f"Generating image for prompt: {prompt}")

image = pipe(prompt=prompt).images[0]
image.save("test_image.png")

logger.info("Image saved to test_image.png")


Or maybe we could write some code to make better use of the GPU while still keeping everything in float32…
For example, quantizing, or placing only the VAE on the GPU (a rough sketch follows below)…
Well, if speed becomes an issue, we’ll just do some trial and error.
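
Here is a minimal, untested sketch of the “only the VAE on the GPU” idea: run the denoising loop in float32 on the CPU with output_type="latent", then move only the VAE to CUDA for the decode step. The prompt and step count are just placeholders.

import torch
from diffusers import StableDiffusionXLPipeline

# Rough sketch: denoise in float32 on the CPU, decode with the VAE on the GPU.
pipe = StableDiffusionXLPipeline.from_pretrained(
    "stabilityai/stable-diffusion-xl-base-1.0",
    torch_dtype=torch.float32,
    use_safetensors=True,
)

# Ask the pipeline for raw latents instead of decoded images.
latents = pipe(
    prompt="A clear sunny landscape with mountains and a river",
    num_inference_steps=20,
    output_type="latent",
).images

# Move only the VAE to the GPU and decode there.
pipe.vae.to("cuda")
with torch.no_grad():
    decoded = pipe.vae.decode(
        latents.to("cuda") / pipe.vae.config.scaling_factor
    ).sample

# Convert the decoded tensor back to a PIL image and save it.
image = pipe.image_processor.postprocess(decoded.cpu(), output_type="pil")[0]
image.save("vae_on_gpu.png")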

ChatGPT improved the code, and now there is some result, but it’s imperfect.


Code

from flask import Flask, send_file, request
from diffusers import StableDiffusionXLPipeline
import torch, os, numpy as np, random, logging

app = Flask(__name__)
model_dir = "D:\\Ganu\\AIImage\\apache-webserver\\Apache24\\htdocs"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)
pipeline = None

def load_pipeline():
    global pipeline
    if pipeline is not None:
        return

    print(f"[INFO] Loading full SDXL pipeline on {device} with float32...")

    pipeline_local = StableDiffusionXLPipeline.from_pretrained(
        "stabilityai/stable-diffusion-xl-base-1.0",
        torch_dtype=torch.float32,
        use_safetensors=True
    ).to(device)

    pipeline_local.enable_attention_slicing()
    # Note: the SDXL pipeline has no safety_checker component, so this override is a no-op.
    pipeline_local.safety_checker = lambda images, **kwargs: (images, False)

    pipeline_local.set_progress_bar_config(disable=True)
    pipeline = pipeline_local
    logging.info("[INFO] Full pipeline loaded on GPU.")

@app.route("/")
def home():
    return """
    <html>
        <head><title>SDXL Generator</title></head>
        <body style="font-family: sans-serif;">
            <h1>Stable Diffusion XL (float32)</h1>
            <form action="/generate" method="get">
                Prompt: <input type="text" name="prompt" size="60">
                <input type="submit" value="Generate">
            </form>
            <p>Try: <a href="/generate?prompt=Futuristic+Indian+market">Futuristic Indian market</a></p>
        </body>
    </html>
    """

@app.route("/generate")
def generate_image():
    load_pipeline()

    seed = request.args.get("seed", default=random.randint(0, 9999999), type=int)
    torch.manual_seed(seed)
    np.random.seed(seed)

    prompt = request.args.get("prompt", default="", type=str).strip()
    if not prompt:
        return "Error: prompt required", 400

    width = request.args.get("width", default=512, type=int)
    height = request.args.get("height", default=512, type=int)

    def round_to_64(x): return max(64, int(round(x / 64)) * 64)
    width = min(round_to_64(width), 768)
    height = min(round_to_64(height), 768)

    negative_prompt = (
        "text, watermark, blurry, deformed, double face, bad proportions, UI elements"
    )

    print(f"[INFO] Generating image (seed={seed}, size={width}x{height})")

    result = pipeline(
        prompt=prompt,
        negative_prompt=negative_prompt,
        width=width,
        height=height,
        guidance_scale=7.5,
        num_inference_steps=20
    )

    image = result.images[0]
    output_dir = os.path.join(model_dir, "outputs")
    os.makedirs(output_dir, exist_ok=True)
    output_path = os.path.join(output_dir, f"generated_image_{seed}.png")
    image.save(output_path)

    print(f"[INFO] Image saved: {output_path}")
    return send_file(output_path, mimetype='image/png')

if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    app.run(host="127.0.0.1", port=80, debug=True)


This may reduce VRAM consumption a little:

    pipeline_local.enable_attention_slicing()
    pipeline_local.enable_model_cpu_offload() # this
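
A minimal sketch of how load_pipeline() might look with offloading enabled (it requires the accelerate package): the pipeline is left on the CPU and enable_model_cpu_offload() handles device placement, so the .to(device) call is dropped.

from diffusers import StableDiffusionXLPipeline
import torch

def load_pipeline():
    global pipeline
    if pipeline is not None:
        return

    # Keep float32 weights, but let accelerate move each sub-model
    # (text encoders, UNet, VAE) to the GPU only while it is in use.
    pipeline_local = StableDiffusionXLPipeline.from_pretrained(
        "stabilityai/stable-diffusion-xl-base-1.0",
        torch_dtype=torch.float32,
        use_safetensors=True,
    )  # note: no .to(device) here

    pipeline_local.enable_attention_slicing()
    pipeline_local.enable_model_cpu_offload()  # requires accelerate
    pipeline_local.set_progress_bar_config(disable=True)
    pipeline = pipeline_local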