Hi!
I’m trying to fine-tune a Hugging Face model using the Lightning Trainer and DeepSpeed. However, when I use multiple GPUs I get the following error:
RuntimeError: a leaf Variable that requires grad is being used in an in-place operation.
Can someone help me solve this issue?
The code I am using:
...
model = "openai-community/gpt2"
self.model = GPT2Model.from_pretrained(model_name)
self.tokenizer = AutoTokenizer.from_pretrained(model_name)
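In case the padding setup matters: GPT-2 ships without a pad token, so the tokenizer needs one for batched padding. My setup is roughly this (padding_side="left" is the part I'm least sure about, since last_token_pool below handles both left and right padding):

# Rough sketch of the padding setup right after loading the tokenizer; GPT-2 has no
# pad token by default, so the EOS token is reused. padding_side="left" is an assumption.
if self.tokenizer.pad_token is None:
    self.tokenizer.pad_token = self.tokenizer.eos_token
self.tokenizer.padding_side = "left"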
...
def forward(self, input_ids, attention_mask):
    # Embed the batch, pool the last real token, and L2-normalize the result.
    embedding = self.model(input_ids=input_ids, attention_mask=attention_mask).last_hidden_state
    last_token = last_token_pool(embedding, attention_mask)
    normalized_embedding = F.normalize(last_token, p=2, dim=1)
    return normalized_embedding
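For context, the training_step that ties forward and compute_loss together is roughly this (simplified; the batch keys are placeholders, not my exact names):

def training_step(self, batch, batch_idx):
    # Simplified sketch; the batch keys are placeholders.
    input_embeddings = self(batch["input_ids"], batch["attention_mask"])
    output_embeddings = self(batch["output_input_ids"], batch["output_attention_mask"])
    loss = self.compute_loss(input_embeddings, output_embeddings, batch["labels"])
    self.log("train_loss", loss)
    return loss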
...
def last_token_pool(last_hidden_states: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor:
    # If every sequence ends with a real token, the batch is left-padded and the last position is valid.
    left_padding = (attention_mask[:, -1].sum() == attention_mask.shape[0])
    if left_padding:
        return last_hidden_states[:, -1]
    else:
        # Right padding: pick the hidden state at each sequence's last non-padded position.
        sequence_lengths = attention_mask.sum(dim=1) - 1
        batch_size = last_hidden_states.shape[0]
        return last_hidden_states[torch.arange(batch_size, device=last_hidden_states.device), sequence_lengths]
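To illustrate what last_token_pool is supposed to return, here is a toy check with made-up values:

import torch

# Toy check: batch of 2, seq len 4, hidden dim 3, right padding in the first row.
hidden = torch.arange(2 * 4 * 3, dtype=torch.float32).reshape(2, 4, 3)
mask = torch.tensor([[1, 1, 1, 0],   # length 3 -> hidden state at position 2
                     [1, 1, 1, 1]])  # length 4 -> hidden state at position 3
pooled = last_token_pool(hidden, mask)
print(pooled.shape)  # torch.Size([2, 3])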
...
def compute_loss(self, input_embeddings, output_embeddings, labels, aggregate=True):
    # Map {0, 1} labels to {-1, +1} and take a pairwise sigmoid log-likelihood over the similarity matrix.
    true_labels = 2 * labels - 1
    similarity = torch.matmul(input_embeddings, output_embeddings.T)  # * self._t.exp() + self._b
    loglik = F.logsigmoid(true_labels * similarity)
    nll = -torch.sum(loglik, dim=-1)
    loss = nll if not aggregate else nll.mean()
    return loss
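In case anyone wants to reproduce the loss outside of Lightning, here is a standalone copy (the identity labels are just a dummy example, and the temperature/bias terms stay commented out as above):

import torch
import torch.nn.functional as F

# Standalone copy of compute_loss for a quick check; identity labels mark
# the diagonal pairs as positives and everything else as negatives.
def sigmoid_pair_loss(input_embeddings, output_embeddings, labels):
    true_labels = 2 * labels - 1
    similarity = input_embeddings @ output_embeddings.T
    return -F.logsigmoid(true_labels * similarity).sum(dim=-1).mean()

inp = F.normalize(torch.randn(2, 4), dim=1)
out = F.normalize(torch.randn(2, 4), dim=1)
print(sigmoid_pair_loss(inp, out, torch.eye(2)).item())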
...
trainer = L.Trainer(
    max_epochs=MAX_EPOCHS,
    precision="bf16-mixed",
    accumulate_grad_batches=ACCUMULATE_GRAD_BATCHES,
    val_check_interval=0.10,
    callbacks=[checkpoint_callback],
    logger=aim_logger,
    gradient_clip_val=2.0,
    strategy=DeepSpeedStrategy(
        process_group_backend="gloo",
    ),
    devices=1,
)
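The snippet above shows devices=1; for the multi-GPU runs where the error appears I change devices to the number of GPUs, e.g.:

# Multi-GPU variant: same Trainer arguments as above, only devices changes
# (2 is just an example).
trainer = L.Trainer(
    # ... same arguments as above ...
    strategy=DeepSpeedStrategy(process_group_backend="gloo"),
    devices=2,
)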