I am trying to do contrastive pre-training of a model using NTXentLoss and microsoft/swin-tiny-patch4-window7-224 with DeepSpeed. The following steps were taken, and I am still trying to learn and figure out the issues. Any pointers and suggestions are welcome.
- The xView2 dataset is different from ImageNet, hence the pretraining normalization uses mean/std computed on xView2 rather than the ImageNet defaults (see the sketch below).
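For reference, a minimal sketch of how such per-channel statistics might be computed and plugged into the image processor; the loader name and the assumption that it yields float tensors in [0, 1] are mine, not from the original code:

```python
import torch
from transformers import AutoImageProcessor

def compute_mean_std(loader):
    """Per-channel mean/std over a loader yielding float tensors in [0, 1], shape (B, 3, H, W)."""
    n_pixels = 0
    channel_sum = torch.zeros(3)
    channel_sq_sum = torch.zeros(3)
    for images in loader:
        b, _, h, w = images.shape
        n_pixels += b * h * w
        channel_sum += images.sum(dim=(0, 2, 3))
        channel_sq_sum += (images ** 2).sum(dim=(0, 2, 3))
    mean = channel_sum / n_pixels
    std = (channel_sq_sum / n_pixels - mean ** 2).sqrt()
    return mean, std

processor = AutoImageProcessor.from_pretrained("microsoft/swin-tiny-patch4-window7-224")
# mean, std = compute_mean_std(xview2_loader)   # xview2_loader is a hypothetical DataLoader
# processor.image_mean = mean.tolist()
# processor.image_std = std.tolist()
```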
- The DeepSpeed config used (a sketch of how it is wired into `deepspeed.initialize` follows the JSON):

```json
{
  "train_batch_size": 192,
  "gradient_clipping": 1.0,
  "train_micro_batch_size_per_gpu": 48,
  "gradient_accumulation_steps": 1,
  "steps_per_print": 100,
  "optimizer": {
    "type": "AdamW",
    "params": {
      "lr": 1e-5,
      "betas": [0.9, 0.98],
      "eps": 1e-8,
      "weight_decay": 1e-5
    }
  },
  "logging": {
    "level": "error"
  },
  "scheduler": {
    "type": "WarmupDecayLR",
    "params": {
      "warmup_min_lr": 1e-6,
      "warmup_max_lr": 1e-5,
      "warmup_num_steps": 1152,
      "total_num_steps": 5760
    }
  },
  "fp16": {
    "enabled": true,
    "loss_scale": 0,
    "initial_scale_power": 14,
    "hysteresis": 2,
    "min_loss_scale": 1
  },
  "activation_checkpointing": {
    "partition_activations": false,
    "contiguous_memory_optimization": false
  },
  "wall_clock_breakdown": true
}
```
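A minimal sketch of how a JSON config like this is typically passed to `deepspeed.initialize`; the stand-in module, file path, and training-loop comments are assumptions, not the actual training script:

```python
import deepspeed
import torch.nn as nn

# Stand-in module for illustration; in the actual setup this would be PreTrainingSimCLRTiny(config).
model = nn.Sequential(nn.Linear(768, 256), nn.GELU(), nn.Linear(256, 768))

# DeepSpeed builds the AdamW optimizer and WarmupDecayLR scheduler from the JSON itself,
# so no optimizer object needs to be constructed here.
model_engine, optimizer, _, lr_scheduler = deepspeed.initialize(
    model=model,
    model_parameters=[p for p in model.parameters() if p.requires_grad],
    config="ds_config.json",  # assumed path to the JSON shown above
)

# Typical step: the engine owns fp16 casting, dynamic loss scaling, and gradient clipping,
# so the usual pattern is model_engine.backward(loss) followed by model_engine.step().
```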
- Over time I consistently get NaNs in Swin's initial patch-embedding layer (see the debugging sketch below).
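To localize where the NaNs first appear, one generic option is to register a forward hook on every submodule rather than only on the patch embedding; this is a debugging sketch I am adding, not part of the original code:

```python
import torch
import torch.nn as nn

def attach_nan_hooks(model: nn.Module):
    """Register forward hooks that flag the first module whose output contains NaN/Inf."""
    def make_hook(name):
        def hook(module, inputs, output):
            tensors = output if isinstance(output, (tuple, list)) else (output,)
            for t in tensors:
                if torch.is_tensor(t) and (torch.isnan(t).any() or torch.isinf(t).any()):
                    raise ValueError(f"NaN/Inf first seen in module '{name}' ({type(module).__name__})")
        return hook

    for name, module in model.named_modules():
        module.register_forward_hook(make_hook(name))

# attach_nan_hooks(model.model)  # e.g. on the SwinModel inside PreTrainingSimCLRTiny
```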
- The model being trained (a sketch of feeding the projector output to NTXentLoss follows the class):

```python
import torch
import torch.nn as nn
import torch.nn.functional as F
from transformers import SwinModel, AutoImageProcessor


class PreTrainingSimCLRTiny(nn.Module):
    def __init__(self, config, patch_resolution=(48, 48), input_resolution=256, pretrained=True):
        super(PreTrainingSimCLRTiny, self).__init__()
        self.model_name = config['training']['model_name_small']
        self.model = SwinModel.from_pretrained(self.model_name)
        self.image_processor = AutoImageProcessor.from_pretrained(self.model_name)

        # Remove classification head to use hidden states
        self.model.classifier = nn.Identity()
        self.patch_resolution = patch_resolution
        self.input_resolution = input_resolution
        self.config = config

        # Dimensionality settings
        d_model = config['training']['d_model_small']
        d_proj = config['training']['proj_d_model']
        self.hidden_size = d_model

        # Normalization, dropout, and projection head
        self.norm = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(p=0.1)

        # Projection head for contrastive learning
        self.projector = nn.Sequential(
            nn.Linear(d_model, d_proj),
            nn.GELU(),
            nn.Linear(d_proj, d_model),
            nn.LayerNorm(d_model)  # Add normalization to projector output
        )
        self.projector.apply(self.init_weights)

        # Forward hook to monitor embedding outputs
        def hook_fn(module, input, output):
            if torch.isnan(output).any() or torch.isinf(output).any():
                print(f"NaN or Inf detected in {module}.")
                print(f"Output stats: mean={output.mean()}, std={output.std()}, "
                      f"min={output.min()}, max={output.max()}")
                raise ValueError("NaN/Inf detected in model embeddings.")

        self.model.embeddings.patch_embeddings.register_forward_hook(hook_fn)

    def init_weights(self, module):
        if isinstance(module, nn.Linear):
            nn.init.xavier_uniform_(module.weight)
            if module.bias is not None:
                nn.init.zeros_(module.bias)

    def forward(self, x, inputs, task="contrastive"):
        """
        Forward method for contrastive learning.

        Args:
            x: Tensor of images.
            inputs: Preprocessed inputs for the Hugging Face model.
            task: Task type (default: "contrastive").

        Returns:
            Projected features for contrastive loss.
        """
        # Ensure inputs are on the same device as the model
        device = next(self.model.parameters()).device

        # Convert inputs to FP16
        inputs = {k: v.half().to(device) for k, v in inputs.items()}

        # Convert model to FP16
        self.model.half()

        # Pass through the model
        outputs = self.model(**inputs)
        hidden_states = outputs.last_hidden_state  # (batch_size, num_patches, hidden_size)

        if task == "contrastive":
            # Normalize and project embeddings
            normalized_embeddings = F.normalize(hidden_states.mean(dim=1), dim=-1)
            normalized_embeddings = self.norm(normalized_embeddings)
            normalized_embeddings = self.dropout(normalized_embeddings)
            projected = self.projector(normalized_embeddings)
            return projected
        else:
            raise ValueError(f"Unknown task type: {task}")
```
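For completeness, a minimal sketch of how the projector output could be fed to `pytorch-metric-learning`'s `NTXentLoss` for two augmented views; the view names, helper function, and temperature value are assumptions, not taken from the original training loop:

```python
import torch
from pytorch_metric_learning.losses import NTXentLoss

loss_fn = NTXentLoss(temperature=0.07)  # assumed temperature

def simclr_loss(z1: torch.Tensor, z2: torch.Tensor) -> torch.Tensor:
    """z1, z2: projector outputs for two augmented views of the same batch, shape (B, d_model)."""
    embeddings = torch.cat([z1, z2], dim=0)                # (2B, d_model)
    labels = torch.arange(z1.size(0), device=z1.device)
    labels = torch.cat([labels, labels], dim=0)            # positives share a label across views
    return loss_fn(embeddings, labels)

# Example (hypothetical variable names):
# projected_1 = model(x1, inputs_1)   # first augmented view
# projected_2 = model(x2, inputs_2)   # second augmented view
# loss = simclr_loss(projected_1, projected_2)
```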