I’m trying to train a model on a workstation with 2 GPUs. It’s a basic training loop that has been adapted for Accelerate. However, when I launch it from the terminal with
accelerate launch --multi_gpu main.py
the same code appears to run separately on each GPU. As a result, the output looks like this:
Epoch: 0, accuracy: 78.45345
Epoch: 0, accuracy: 77.22342
Epoch: 1, accuracy: 80.23424
Epoch: 1, accuracy: 80.75432
Is there an explanation for that behavior?
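My guess is that accelerate launch spawns one process per GPU and each process runs the whole script, so each line of output comes from a different process. To check that, I was thinking of tagging the output with the process index, roughly like this (a minimal sketch using the process_index, num_processes and is_main_process attributes of Accelerator; I'm assuming they report what I expect):

from accelerate import Accelerator

accelerator = Accelerator()

# With --multi_gpu on 2 GPUs this should print once per spawned process,
# i.e. two lines in total, each with a different process index.
print(f"process {accelerator.process_index} of {accelerator.num_processes}, "
      f"main process: {accelerator.is_main_process}")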
The main script is the following:
from accelerate import Accelerator
def main():
    import torch
    from torch import nn, optim
    from torch.utils.data import DataLoader
    from torchvision import datasets, transforms
    from datetime import datetime

    start = datetime.now()
    accelerator = Accelerator()

    # transform to normalize the data
    transform = transforms.Compose([transforms.ToTensor(),
                                    transforms.Normalize((0.5,), (0.5,))])

    # Download and load the training data
    trainset = datasets.FashionMNIST('./data', download=True, train=True, transform=transform)
    train_loader = DataLoader(trainset, batch_size=64, shuffle=True)

    # Download and load the test data
    validationset = datasets.FashionMNIST('./data', download=True, train=False, transform=transform)
    val_loader = DataLoader(validationset, batch_size=64, shuffle=True)

    class CustomModel(nn.Module):
        def __init__(self):
            super(CustomModel, self).__init__()
            self.fc1 = nn.Linear(784, 80000)
            self.fc2 = nn.Linear(80000, 2000)
            self.fc3 = nn.Linear(2000, 256)
            self.fc4 = nn.Linear(256, 64)
            self.fc5 = nn.Linear(64, 10)
            self.dropout = nn.Dropout(0.5)
            self.activation = torch.nn.ReLU()

        def forward(self, x):
            x = torch.flatten(x, start_dim=1)
            x = self.dropout(x)
            x = self.fc1(x)
            x = self.activation(x)
            x = self.dropout(x)
            x = self.fc2(x)
            x = self.activation(x)
            x = self.dropout(x)
            x = self.fc3(x)
            x = self.activation(x)
            x = self.dropout(x)
            x = self.fc4(x)
            x = self.activation(x)
            x = self.dropout(x)
            x = self.fc5(x)
            logits = self.activation(x)
            return logits

    # creating model 1
    model = CustomModel()

    learning_rate = 0.0001
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    loss_fn = nn.CrossEntropyLoss()

    def accuracy_fn(y_true, y_pred):
        correct = torch.eq(y_true, y_pred).sum().item()  # torch.eq() calculates where two tensors are equal
        acc = (correct / len(y_pred)) * 100
        return acc

    epochs = 15

    model, optimizer, train_loader, val_loader = accelerator.prepare(model, optimizer, train_loader, val_loader)

    for epoch in range(epochs):
        model.train()
        model_train_acc = 0
        for batch in train_loader:
            data, label = batch
            # 1. Forward pass
            logits = model(data)
            # 2. Calculate loss
            loss = loss_fn(logits, label)
            # 3. Optimizer zero grad
            optimizer.zero_grad()
            # 4. Loss backwards
            accelerator.backward(loss)  # loss.backward()
            # 5. Optimize
            optimizer.step()
            # Calculate accuracy
            y_train = torch.argmax(logits, dim=1)
            train_acc = accuracy_fn(y_true=label, y_pred=y_train)
            model_train_acc += train_acc

        model.eval()
        with torch.inference_mode():
            total_acc = 0
            for batch in val_loader:
                # 1. Forward pass
                data, label = batch
                logits = model(data)
                # 2. Calculate accuracy
                y_pred = torch.argmax(logits, dim=1)
                acc = accuracy_fn(y_true=label, y_pred=y_pred)
                total_acc += acc

        print(f'Epoch {epoch}. Model 1 accuracy: {total_acc/len(val_loader)}.')

    print(datetime.now() - start)
if __name__ == "__main__":
main()
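In case it matters: my understanding is that after accelerator.prepare() each process only sees its own shard of the validation data, so besides printing twice, each process is also reporting accuracy on only part of the samples. Below is a rough sketch of how I imagine the validation part of the loop would combine results across processes (assuming gather_for_metrics and accelerator.print work the way I think they do; this is a guess, not something I have verified):

model.eval()
with torch.inference_mode():
    correct, total = 0, 0
    for batch in val_loader:
        data, label = batch
        logits = model(data)
        y_pred = torch.argmax(logits, dim=1)
        # Collect predictions and labels from all processes
        # (gather_for_metrics also drops samples duplicated to pad the last batch)
        all_preds, all_labels = accelerator.gather_for_metrics((y_pred, label))
        correct += torch.eq(all_labels, all_preds).sum().item()
        total += len(all_preds)
    # accelerator.print() only prints from the main process
    accelerator.print(f'Epoch {epoch}. Model 1 accuracy: {100 * correct / total}.')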