Hi,
I am running some experiments on a multi-GPU cluster using accelerate. I'm trying to calculate some metrics after every batch iteration in the training dataloader. While the training code itself seems to work fine with accelerate, I run into an error when calculating said metrics: it seems that, after the forward pass during evaluation, the output tensors end up on a different device than the input tensors. The code that gives the error is the following:
def calculatePerplexity(sentence, model, tokenizer, accelerator):
    """
    exp(loss)
    """
    input_ids = torch.tensor(sentence).unsqueeze(0)
    print(f"Input ids device: {input_ids.device}")
    model.eval()
    with torch.no_grad():
        outputs = model(input_ids, labels=input_ids)
    outputs = accelerator.gather_for_metrics(outputs)
    loss, logits = outputs[:2]
    loss, logits = accelerator.prepare(loss, logits)
    print(f"Loss device: {loss.device}")
    print(f'Model device: {model.device}')
    print(f'Logits device: {logits.device}')
    probabilities = torch.nn.functional.softmax(logits, dim=-1)
    all_prob = []
    input_ids_processed = input_ids[0][1:]
    for i, token_id in enumerate(input_ids_processed):
        probability = probabilities[0, i, token_id].item()
        all_prob.append(probability)

    # stuff for metric calculation
    probs = torch.nn.functional.softmax(logits[0, :-1], dim=-1)
    log_probs = torch.nn.functional.log_softmax(logits[0, :-1], dim=-1)
    token_log_probs = log_probs.gather(dim=-1, index=input_ids_processed.unsqueeze(-1)).squeeze(-1)
    mu = (probs * log_probs).sum(-1)
    sigma = (probs * torch.square(log_probs)).sum(-1) - torch.square(mu)
    mink_plus = (token_log_probs - mu) / sigma.sqrt()
The output of the debugging statements is as follows:
Input ids device: cpu
Loss device: cuda:0
Model device: cpu
Logits device: cuda:0
The output is the same on all 4 GPUs I'm using, and it results in the following error:
token_log_probs = log_probs.gather(dim=-1, index=input_ids_processed.unsqueeze(-1)).squeeze(-1)
[rank2]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank2]: RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cuda:0 and cpu! (when checking argument for argument index in method wrapper_CUDA_gather)
I'm not sure what I'm doing wrong here. I thought that either calling gather_for_metrics on the outputs or calling accelerator.prepare on the loss and logits would help, but it doesn't (I get the same error when I remove those statements). Any advice would be greatly appreciated!
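The only workaround I can think of is to manually move the inputs onto the accelerator's device before the forward pass, roughly like the sketch below (untested, and I don't know whether this is the intended way to handle it with accelerate):

# Untested workaround I'm considering, not what my code currently does:
input_ids = torch.tensor(sentence).unsqueeze(0).to(accelerator.device)
with torch.no_grad():
    outputs = model(input_ids, labels=input_ids)
# The loss/logits should then be on the same device as input_ids, so the
# later log_probs.gather(...) shouldn't mix cpu and cuda tensors.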
For completeness' sake, here is the rest of the (relevant) code I use when calculating the metrics:
# Training loop
for i, (batch_inputs, batch_labels) in tqdm(enumerate(dataloader)):
    all_labels += batch_labels
    unlearned_model, tokenizer = load_base_model(self.experiment_args.model_dir_prefix, self.experiment_args.model)
    torch.cuda.empty_cache()
    optimizer = torch.optim.Adam(unlearned_model.parameters(), lr=self.unlearning_args.lr)
    unlearned_model, optimizer, batch_inputs = accelerator.prepare(unlearned_model, optimizer, batch_inputs)

    # Unlearn data and calculate PPL values
    for i in range(self.unlearning_args.steps):
        unlearned_model = unlearn_dataslice(unlearned_model, optimizer, batch_inputs, self.unlearning_args, accelerator)
        torch.cuda.empty_cache()

    UL_PPL_vals += calculate_PPL_values(unlearned_model, tokenizer, batch_inputs, accelerator)
def unlearn_dataslice(model, optimizer, sentences, args, accelerator):
    learning_rate = args.lr
    model.train()
    optimizer.zero_grad()
    input_data = sentences.clone().detach()
    output = model(input_data)
    # Add a minus sign to do gradient ascent instead of descent
    loss = -output[0]['logits']
    accelerator.backward(loss.mean())
    torch.cuda.empty_cache()
    optimizer.step()
    del optimizer
    torch.cuda.empty_cache()
    return model
def calculate_PPL_values(model, tokenizer, text_batch, accelerator):
    PPL_values = []
    for text in text_batch:
        PPL = calculatePerplexity(text, model, tokenizer, accelerator)[0]
        PPL_values.append(PPL)
    return PPL_values
In this code I removed a lot of debugging statements that checked that both the unlearned_model and batch_inputs stay on the same device (cpu) throughout the training loop, so I'm pretty sure there's no inconsistency there.
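For reference, those removed checks were just simple prints along these lines (simplified):

# Simplified version of the device checks I removed from the training loop:
print(f"Unlearned model device: {unlearned_model.device}")
print(f"Batch inputs device: {batch_inputs.device}")
# Both consistently report cpu on every iteration.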