Replicating the same code on both GPUs

I’m trying to train a model on a workstation with 2 GPUs. It’s a basic training loop that has been adapted for Accelerate. However, when I launch it from the terminal with

accelerate launch --multi_gpu main.py

the same code seems to run separately on each GPU, so it prints output like:

Epoch: 0, accuracy: 78.45345
Epoch: 0, accuracy: 77.22342
Epoch: 1, accuracy: 80.23424
Epoch: 1, accuracy: 80.75432

Is there an explanation for that behavior?

The main script is the following:

from accelerate import Accelerator

def main():
    import torch
    from torch import nn, optim
    from torch.utils.data import DataLoader
    from torchvision import datasets, transforms
    from datetime import datetime

    start = datetime.now()
    accelerator = Accelerator()

    # transform to normalize the data
    transform = transforms.Compose([transforms.ToTensor(),
                                    transforms.Normalize((0.5,), (0.5,))])

    # Download and load the training data
    trainset = datasets.FashionMNIST('./data', download=True, train=True, transform=transform)
    train_loader = DataLoader(trainset, batch_size=64, shuffle=True)

    # Download and load the test data
    validationset = datasets.FashionMNIST('./data', download=True, train=False, transform=transform)
    val_loader = DataLoader(validationset, batch_size=64, shuffle=True)

    class CustomModel(nn.Module):

        def __init__(self):
            super(CustomModel, self).__init__()

            self.fc1 = nn.Linear(784, 80000)
            self.fc2 = nn.Linear(80000, 2000)
            self.fc3 = nn.Linear(2000, 256)
            self.fc4 = nn.Linear(256, 64)
            self.fc5 = nn.Linear(64, 10)
            self.dropout = nn.Dropout(0.5)
            self.activation = torch.nn.ReLU()

        def forward(self, x):
            x = torch.flatten(x, start_dim=1)
            x = self.dropout(x)
            x = self.fc1(x)
            x = self.activation(x)
            x = self.dropout(x)
            x = self.fc2(x)
            x = self.activation(x)
            x = self.dropout(x)
            x = self.fc3(x)
            x = self.activation(x)
            x = self.dropout(x)
            x = self.fc4(x)
            x = self.activation(x)
            x = self.dropout(x)
            x = self.fc5(x)
            logits = self.activation(x)

            return logits

    # creating model 1
    model = CustomModel()

    learning_rate = 0.0001
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    loss_fn = nn.CrossEntropyLoss()

    def accuracy_fn(y_true, y_pred):
        correct = torch.eq(y_true, y_pred).sum().item()  # torch.eq() calculates where two tensors are equal
        acc = (correct / len(y_pred)) * 100
        return acc

    epochs = 15

    model, optimizer, train_loader, val_loader = accelerator.prepare(model, optimizer, train_loader, val_loader)

    for epoch in range(epochs):
        model.train()

        model_train_acc = 0

        for batch in train_loader:
            data, label = batch

            # 1. Forward pass
            logits = model(data)

            # 2. Calculate loss
            loss = loss_fn(logits, label)

            # 3. Optimizer zero grad
            optimizer.zero_grad()

            # 4. Loss backwards
            accelerator.backward(loss)
            # loss.backward()

            # 5. Optimizer step
            optimizer.step()

            # Calculate accuracy
            y_train = torch.argmax(logits, dim=1)
            train_acc = accuracy_fn(y_true=label, y_pred=y_train)
            model_train_acc += train_acc

        model.eval()

        with torch.inference_mode():

            total_acc = 0

            for batch in val_loader:

                # 1. Forward pass
                data, label = batch
                logits = model(data)

                # 2. Calculate accuracy
                y_pred = torch.argmax(logits, dim=1)

                acc = accuracy_fn(y_true=label, y_pred=y_pred)

                total_acc += acc

        print(f'Epoch {epoch}. Model 1 accuracy: {total_acc/len(val_loader)}.')

    print(datetime.now() - start)

if __name__ == "__main__":
    main()

Yes, that behavior is expected: with accelerate launch --multi_gpu, each GPU runs its own process and each process executes the entire script. After accelerator.prepare() the dataloaders are sharded, so every process evaluates on its own slice of the validation set, computes its own accuracy, and prints its own line, which is why you see one line per GPU per epoch. You’re not being mindful of your metric calculations and gathering: you need to gather predictions and labels across processes before computing the metric, and print the result only once. See these docs, which cover distributed evaluation: Quick tour
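
For reference, here is a minimal sketch of what the validation part could look like with gathering, reusing the names from your script (accelerator, model, val_loader, accuracy_fn, epoch). It uses Accelerator.gather_for_metrics, which collects tensors from all processes and drops the samples duplicated to pad the last batch, and accelerator.print, which only prints from the main process. Treat it as an illustration rather than a drop-in replacement.

    model.eval()
    with torch.inference_mode():
        all_preds, all_labels = [], []
        for data, label in val_loader:
            logits = model(data)
            preds = torch.argmax(logits, dim=1)
            # gather predictions and labels from every process
            preds, label = accelerator.gather_for_metrics((preds, label))
            all_preds.append(preds)
            all_labels.append(label)

        # the metric is now computed over the full validation set, identically on all processes
        val_acc = accuracy_fn(y_true=torch.cat(all_labels), y_pred=torch.cat(all_preds))

    # accelerator.print only prints on the main process, so you get a single line per epoch
    accelerator.print(f'Epoch {epoch}. Model 1 accuracy: {val_acc}.')

If you want to keep any per-process prints for debugging, you can also guard them with a check on accelerator.is_main_process.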