Hello,
I'm currently trying to implement a latent generative diffusion model using a custom-trained VAE, which maps 3x256x256 inputs to 1x256 latents, and a UNet1DModel with the following parameters:
# 1-D UNet used as the denoiser: one input channel and one output channel,
# five down/up stages with a single attention stage on each side.
UNet1DModel(
    in_channels=1,
    out_channels=1,
    down_block_types=(
        "DownBlock1D",
        "DownBlock1D",
        "DownBlock1D",
        "AttnDownBlock1D",
        "DownBlock1D",
    ),
    up_block_types=(
        "UpBlock1D",
        "UpBlock1D",
        "UpBlock1D",
        "AttnUpBlock1D",
        "UpBlock1D",
    ),
    block_out_channels=(32, 64, 128, 256, 512),
    time_embedding_type="fourier",
    layers_per_block=2,
    act_fn="silu",
)
Training is, however, pretty tough: I can't reach a small enough training loss. I need something of the order of 1e-4.
Here is the training code:
"
# Let's train
from accelerate import Accelerator
from diffusers.hub_utils import init_git_repo, push_to_hub
import os
def train_loop(config, model, vae, noise_scheduler, optimizer, train_dataloader, val_dataloader, lr_scheduler):
    """Train the denoising model on VAE-encoded batches.

    For every batch, the VAE encodes the input and a latent is drawn with
    ``vae.reparameterize``; random noise is added at a random timestep via
    ``noise_scheduler.add_noise`` and the model is trained with an MSE loss
    to predict that noise.

    Args:
        config: Object exposing ``mixed_precision``,
            ``gradient_accumulation_steps``, ``output_dir``, ``num_epochs``
            and ``save_image_epochs``.
        model: Denoising network, called as
            ``model(noisy, timesteps, return_dict=False)[0]``.
        vae: Encoder exposing ``encode(batch) -> (mu, log_var)`` and
            ``reparameterize(mu, log_var)``.
        noise_scheduler: Scheduler exposing ``num_train_timesteps`` and
            ``add_noise``.
        optimizer: Optimizer for ``model``'s parameters.
        train_dataloader: Iterable of training batches (each batch must
            support ``.to(device)``).
        val_dataloader: Validation dataloader; prepared by the accelerator
            but not otherwise used in this function.
        lr_scheduler: Learning-rate scheduler, stepped once per batch.
    """
    # Initialize accelerator and tensorboard logging
    accelerator = Accelerator(
        mixed_precision=config.mixed_precision,
        gradient_accumulation_steps=config.gradient_accumulation_steps,
        log_with="tensorboard",
        logging_dir=os.path.join(config.output_dir, "logs"),
    )
    if accelerator.is_main_process:
        accelerator.init_trackers("train_example")

    model, vae, optimizer, train_dataloader, val_dataloader, lr_scheduler = accelerator.prepare(
        model, vae, optimizer, train_dataloader, val_dataloader, lr_scheduler
    )

    global_step = 0

    # Now you train the model
    for epoch in range(config.num_epochs):
        if epoch % 500 == 0:
            print(f"Epoch : {epoch}", flush=True)

        # Walk the whole dataloader each epoch. The previous
        # `next(iter(train_dataloader))` rebuilt the iterator every epoch and
        # consumed a single batch from it, so most of the data was never seen.
        for batch in train_dataloader:
            # The VAE is only used as a fixed encoder here, so no gradients
            # are tracked through it.
            with torch.no_grad():
                # NOTE(review): `torch_device` is a module-level name defined
                # outside this function; confirm it matches accelerator.device.
                batch = batch.to(torch_device)
                mu, log_var = vae.encode(batch)
                # unsqueeze(1) inserts a size-1 axis at position 1
                # (presumably the channel axis expected by the model).
                clean_images = vae.reparameterize(mu, log_var).unsqueeze(1)

            # Sample noise to add to the images
            noise = torch.randn_like(clean_images)
            bs = clean_images.shape[0]

            # Sample a random timestep for each image
            timesteps = torch.randint(
                0, noise_scheduler.num_train_timesteps, (bs,), device=clean_images.device
            ).long()

            # Add noise to the clean images according to the noise magnitude
            # at each timestep (this is the forward diffusion process)
            noisy_images = noise_scheduler.add_noise(clean_images, noise, timesteps)

            with accelerator.accumulate(model):
                # Predict the noise residual
                noise_pred = model(noisy_images, timesteps, return_dict=False)[0]
                loss = F.mse_loss(noise_pred, noise)
                accelerator.backward(loss)

                accelerator.clip_grad_norm_(model.parameters(), 1.0)
                optimizer.step()
                lr_scheduler.step()
                optimizer.zero_grad()

            logs = {"loss": loss.detach().item(), "lr": lr_scheduler.get_last_lr()[0], "step": global_step}
            accelerator.log(logs, step=global_step)
            global_step += 1

        # After each epoch you optionally sample some demo images with evaluate() and save the model
        if (epoch + 1) % config.save_image_epochs == 0 or epoch == config.num_epochs - 1:
            unwrapped_model = accelerator.unwrap_model(model)
            pipeline_trained = LDMPipeline(
                vae=vae, unet=unwrapped_model, scheduler=noise_scheduler, input_device=torch_device
            )
            evaluate(config, epoch + 1, pipeline_trained)
I'm using a cosine scheduler with a learning rate of 2e-5 (I've tried values from 1e-4 down to 1e-8), and I don't need to normalize the data as they already have mean=0 and std=1 after being mapped by the VAE.
The training loss is :
Do you have any idea what the source of the problem might be?
Thanks