Low GPU utilization with the Decision Transformer

Hello everyone, I adapted this tutorial into a single script (below). The model does get moved to the GPU, since GPU memory usage increases, but utilization stays at 0% throughout training.
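
As an extra sanity check of the device placement, something along these lines should work (a minimal sketch, using the trainer object created in the script below):

import torch

print(torch.cuda.is_available())                # should be True when CUDA is picked up
print(trainer.args.device)                      # the device the Trainer resolved to
print(next(trainer.model.parameters()).device)  # where the weights actually sit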

I tried the following settings:

  1. Running the script with CUDA available (batch size 64)
  2. Running the script with CUDA available (batch size 1048576)
  3. Running the script with no_cuda=True, i.e. forcing the CPU (batch size 64)

I put all results in one big screenshot since I can’t upload more than one:

The results show no real difference between the three settings in terms of training speed. I had expected GPU utilization to go up when I made the batch size significantly larger (1048576 instead of 64), but that was not the case; only the memory usage went up.

I had also expected the training time to go up when using the CPU instead of CUDA, but that was not the case either. All three settings take roughly 1 hour 50 minutes, as if training were running on the CPU no matter the setting. I also checked htop and don't see a bottleneck anywhere.
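
To rule out the forward/backward pass itself, I figure one can time it in isolation on random data. Here is a rough sketch of what I mean (it assumes the TrainableDT class from the script below and the halfcheetah dimensions: state_dim=17, act_dim=6, context length 20):

import time

import torch
from transformers import DecisionTransformerConfig


def time_steps(model, device, steps=20, batch_size=64,
               state_dim=17, act_dim=6, seq_len=20):
    # average seconds per forward+backward pass on random data
    model = model.to(device)
    batch = {
        "states": torch.randn(batch_size, seq_len, state_dim, device=device),
        "actions": torch.randn(batch_size, seq_len, act_dim, device=device),
        "rewards": torch.randn(batch_size, seq_len, 1, device=device),
        "returns_to_go": torch.randn(batch_size, seq_len, 1, device=device),
        "timesteps": torch.randint(0, 1000, (batch_size, seq_len), device=device),
        "attention_mask": torch.ones(batch_size, seq_len, device=device),
    }
    start = time.time()
    for _ in range(steps):
        loss = model(**batch)["loss"]
        loss.backward()
        model.zero_grad()
    if device == "cuda":
        torch.cuda.synchronize()
    return (time.time() - start) / steps


config = DecisionTransformerConfig(state_dim=17, act_dim=6)
print("cuda:", time_steps(TrainableDT(config), "cuda"))
print("cpu: ", time_steps(TrainableDT(config), "cpu"))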

So my question is: is this expected behavior, or is there something amiss? Could the collator be the cause of this massive overhead? But even if that is the case, shouldn't we see higher GPU utilization once the data is moved to the device for the forward pass? Any thoughts on this odd behavior would be much appreciated.
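
For the collator suspicion, the check I have in mind is simply timing how long one batch takes to build on the CPU, outside the Trainer (a rough sketch, reusing the dataset and collator objects from the script below; the contents of features don't matter here since __call__ only uses len(features)):

import time

features = [dataset["train"][i] for i in range(64)]  # only the length is used by __call__
start = time.time()
for _ in range(10):
    batch = collator(features)
print("seconds per collated batch:", (time.time() - start) / 10)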

Code:

import os
import random
from dataclasses import dataclass

import numpy as np
import torch
from datasets import load_dataset
from torch import nn
from transformers import (
    DecisionTransformerConfig,
    DecisionTransformerModel,
    Trainer,
    TrainingArguments,
)

os.environ["WANDB_DISABLED"] = (
    "true"  # we diable weights and biases logging for this tutorial
)


class TrainableDT(DecisionTransformerModel):
    def __init__(self, config):
        super().__init__(config)

    def forward(self, **kwargs):
        output = super().forward(**kwargs)
        # add the DT loss
        action_preds = output[1]
        action_targets = kwargs["actions"]
        attention_mask = kwargs["attention_mask"]
        act_dim = action_preds.shape[2]
        action_preds = action_preds.reshape(-1, act_dim)[
            attention_mask.reshape(-1) > 0
        ]
        action_targets = action_targets.reshape(-1, act_dim)[
            attention_mask.reshape(-1) > 0
        ]

        loss = torch.mean((action_preds - action_targets) ** 2)

        return {"loss": loss}

    def original_forward(self, **kwargs):
        return super().forward(**kwargs)


@dataclass
class DecisionTransformerGymDataCollator:
    return_tensors: str = "pt"
    max_len: int = 20  # subsets of the episode we use for training
    state_dim: int = 17  # size of state space
    act_dim: int = 6  # size of action space
    max_ep_len: int = 1000  # max episode length in the dataset
    scale: float = 1000.0  # normalization of rewards/returns
    state_mean: np.ndarray = None  # to store state means
    state_std: np.ndarray = None  # to store state stds
    p_sample: np.ndarray = None  # sampling distribution that accounts for trajectory lengths
    n_traj: int = 0  # to store the number of trajectories in the dataset

    def __init__(self, dataset) -> None:
        self.act_dim = len(dataset[0]["actions"][0])
        self.state_dim = len(dataset[0]["observations"][0])
        self.dataset = dataset
        # calculate dataset stats for normalization of states
        states = []
        traj_lens = []
        for obs in dataset["observations"]:
            states.extend(obs)
            traj_lens.append(len(obs))
        self.n_traj = len(traj_lens)
        states = np.vstack(states)
        self.state_mean, self.state_std = (
            np.mean(states, axis=0),
            np.std(states, axis=0) + 1e-6,
        )

        traj_lens = np.array(traj_lens)
        self.p_sample = traj_lens / sum(traj_lens)

    def _discount_cumsum(self, x, gamma):
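        # discounted cumulative sum of x, i.e. the return-to-go at each timestep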
        discount_cumsum = np.zeros_like(x)
        discount_cumsum[-1] = x[-1]
        for t in reversed(range(x.shape[0] - 1)):
            discount_cumsum[t] = x[t] + gamma * discount_cumsum[t + 1]
        return discount_cumsum

    def __call__(self, features):
        batch_size = len(features)
        # this is a bit of a hack to be able to sample from a non-uniform distribution
        batch_inds = np.random.choice(
            np.arange(self.n_traj),
            size=batch_size,
            replace=True,
            p=self.p_sample,  # reweights so we sample according to timesteps
        )
        # a batch of dataset features
        s, a, r, d, rtg, timesteps, mask = [], [], [], [], [], [], []

        for ind in batch_inds:
            # for feature in features:
            feature = self.dataset[int(ind)]
            si = random.randint(0, len(feature["rewards"]) - 1)

            # get sequences from dataset
            s.append(
                np.array(
                    feature["observations"][si : si + self.max_len]
                ).reshape(1, -1, self.state_dim)
            )
            a.append(
                np.array(feature["actions"][si : si + self.max_len]).reshape(
                    1, -1, self.act_dim
                )
            )
            r.append(
                np.array(feature["rewards"][si : si + self.max_len]).reshape(
                    1, -1, 1
                )
            )

            d.append(
                np.array(feature["dones"][si : si + self.max_len]).reshape(
                    1, -1
                )
            )
            timesteps.append(np.arange(si, si + s[-1].shape[1]).reshape(1, -1))
            timesteps[-1][timesteps[-1] >= self.max_ep_len] = (
                self.max_ep_len - 1
            )  # padding cutoff
            rtg.append(
                self._discount_cumsum(
                    np.array(feature["rewards"][si:]), gamma=1.0
                )[
                    : s[-1].shape[1]  # TODO check the +1 removed here
                ].reshape(
                    1, -1, 1
                )
            )
            if rtg[-1].shape[1] < s[-1].shape[1]:
                print("if true")
                rtg[-1] = np.concatenate([rtg[-1], np.zeros((1, 1, 1))], axis=1)

            # padding and state + reward normalization
            tlen = s[-1].shape[1]
            s[-1] = np.concatenate(
                [np.zeros((1, self.max_len - tlen, self.state_dim)), s[-1]],
                axis=1,
            )
            s[-1] = (s[-1] - self.state_mean) / self.state_std
            a[-1] = np.concatenate(
                [
                    np.ones((1, self.max_len - tlen, self.act_dim)) * -10.0,
                    a[-1],
                ],
                axis=1,
            )
            r[-1] = np.concatenate(
                [np.zeros((1, self.max_len - tlen, 1)), r[-1]], axis=1
            )
            d[-1] = np.concatenate(
                [np.ones((1, self.max_len - tlen)) * 2, d[-1]], axis=1
            )
            rtg[-1] = (
                np.concatenate(
                    [np.zeros((1, self.max_len - tlen, 1)), rtg[-1]], axis=1
                )
                / self.scale
            )
            timesteps[-1] = np.concatenate(
                [np.zeros((1, self.max_len - tlen)), timesteps[-1]], axis=1
            )
            mask.append(
                np.concatenate(
                    [np.zeros((1, self.max_len - tlen)), np.ones((1, tlen))],
                    axis=1,
                )
            )

        s = torch.from_numpy(np.concatenate(s, axis=0)).float()
        a = torch.from_numpy(np.concatenate(a, axis=0)).float()
        r = torch.from_numpy(np.concatenate(r, axis=0)).float()
        d = torch.from_numpy(np.concatenate(d, axis=0))
        rtg = torch.from_numpy(np.concatenate(rtg, axis=0)).float()
        timesteps = torch.from_numpy(np.concatenate(timesteps, axis=0)).long()
        mask = torch.from_numpy(np.concatenate(mask, axis=0)).float()

        return {
            "states": s,
            "actions": a,
            "rewards": r,
            "returns_to_go": rtg,
            "timesteps": timesteps,
            "attention_mask": mask,
        }


def set_seed(seed: int):
    """Setting one seed for different libraries (for reproducability)

    Args:
        seed (int): Fixed number to be used for reproducibility
            of results whenever the code runs
    """
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)


def train() -> None:
    cfg = {
        "general": {
            "seed": 42,
        },
        "trainer": {
            "output_dir": "output/",
            "remove_unused_columns": False,
            "num_train_epochs": 120,
            "per_device_train_batch_size": 64,
            "learning_rate": 1e-4,
            "weight_decay": 1e-4,
            "warmup_ratio": 0.1,
            "warmup_steps": 500,
            "optim": "adamw_torch",
            "max_grad_norm": 0.25,
            "no_cuda": False,
        },
        "dataset": {
            "path": "edbeeching/decision_transformer_gym_replay",
            "name": "halfcheetah-expert-v2",
        },
    }

    set_seed(cfg["general"]["seed"])

    dataset = load_dataset(
        cfg["dataset"]["path"],
        cfg["dataset"]["name"],
    )

    collator = DecisionTransformerGymDataCollator(dataset["train"])

    model_config = DecisionTransformerConfig(
        state_dim=collator.state_dim, act_dim=collator.act_dim
    )
    model = TrainableDT(model_config)

    training_args = TrainingArguments(**cfg["trainer"])

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=dataset["train"],
        data_collator=collator,
    )

    trainer.train()


if __name__ == "__main__":
    train()

Requirements:

accelerate==0.30.1
datasets==2.19.1
torch==2.3.0
transformers==4.40.2
tqdm==4.66.4
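
In case it helps, this is the profiling I'm planning to run next to see where the time actually goes (a sketch; I would cap the run by setting max_steps, e.g. max_steps=20, in TrainingArguments so the trace stays small, and it reuses the trainer object from inside train()):

from torch.profiler import ProfilerActivity, profile

# profile a short training run to see whether time is spent in CPU ops
# (e.g. the data collator) or in CUDA kernels (the actual forward/backward)
with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA]) as prof:
    trainer.train()
print(prof.key_averages().table(sort_by="cuda_time_total", row_limit=15))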