DeepSpeed error: a leaf Variable that requires grad is being used in an in-place operation

Hi!

I’m trying to finetune a Hugging Face model using the Lightning trainer and DeepSpeed. However, when I use multiple GPUs I get the following error:

RuntimeError: a leaf Variable that requires grad is being used in an in-place operation.

Can someone help me solve this issue?
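For context, this minimal plain-PyTorch snippet (unrelated to my model, just to illustrate what the message means) raises the same error, so my guess is that something in the DeepSpeed/gloo code path updates a leaf parameter in place:

import torch

w = torch.zeros(3, requires_grad=True)  # a leaf tensor that requires grad
w += 1.0  # RuntimeError: a leaf Variable that requires grad is being used in an in-place operation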

The code I am using:

import torch
import torch.nn.functional as F
from transformers import AutoTokenizer, GPT2Model
...
model_name = "openai-community/gpt2"
self.model = GPT2Model.from_pretrained(model_name)
self.tokenizer = AutoTokenizer.from_pretrained(model_name)
...
def forward(self, input_ids, attention_mask):
    # Embed the batch, pool the last real token, and L2-normalize.
    embedding = self.model(input_ids=input_ids, attention_mask=attention_mask).last_hidden_state
    last_token = last_token_pool(embedding, attention_mask)
    normalized_embedding = F.normalize(last_token, p=2, dim=1)
    return normalized_embedding
...
def last_token_pool(last_hidden_states: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor:
    # With left padding, the last position is always a real token.
    left_padding = (attention_mask[:, -1].sum() == attention_mask.shape[0])
    if left_padding:
        return last_hidden_states[:, -1]
    else:
        # Otherwise, index the last non-padding position of each sequence.
        sequence_lengths = attention_mask.sum(dim=1) - 1
        batch_size = last_hidden_states.shape[0]
        return last_hidden_states[torch.arange(batch_size, device=last_hidden_states.device), sequence_lengths]
...
def compute_loss(self, input_embeddings, output_embeddings, labels, aggregate=True):
    # Sigmoid contrastive loss: map labels from {0, 1} to {-1, +1}.
    true_labels = 2 * labels - 1
    similarity = torch.matmul(input_embeddings, output_embeddings.T)  # * self._t.exp() + self._b
    loglik = F.logsigmoid(true_labels * similarity)
    nll = -torch.sum(loglik, dim=-1)
    return nll.mean() if aggregate else nll
...
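Outside the trainer, the pooling and loss logic behaves as expected on dummy tensors (hypothetical shapes, only to show the intended inputs):

import torch
import torch.nn.functional as F

batch, seq_len, hidden = 4, 8, 16
hidden_states = torch.randn(batch, seq_len, hidden)
attention_mask = torch.ones(batch, seq_len, dtype=torch.long)
attention_mask[0, 5:] = 0  # one right-padded sequence

pooled = last_token_pool(hidden_states, attention_mask)  # (batch, hidden)
embeddings = F.normalize(pooled, p=2, dim=1)

# same computation as compute_loss, inlined here because it is a method
labels = torch.randint(0, 2, (batch, batch)).float()
true_labels = 2 * labels - 1
similarity = embeddings @ embeddings.T
loss = -torch.sum(F.logsigmoid(true_labels * similarity), dim=-1).mean()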


trainer = L.Trainer(
    max_epochs=MAX_EPOCHS,
    precision="bf16-mixed",
    accumulate_grad_batches=ACCUMULATE_GRAD_BATCHES,
    val_check_interval=0.10,
    callbacks=[checkpoint_callback],
    logger=aim_logger,
    gradient_clip_val=2.0,
    strategy=DeepSpeedStrategy(
        process_group_backend="gloo",
    ),
    devices=1,
)

I only get this error when I set process_group_backend="gloo" in the DeepSpeedStrategy.
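For reference, the same Trainer config runs without the error when I leave the strategy at its default backend (which I believe resolves to nccl on GPU):

trainer = L.Trainer(
    ...
    strategy=DeepSpeedStrategy(),  # default process_group_backend: no error
    devices=1,
)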