Low GPU utilization with the Decision Transformer

Hello everyone, I adapted this tutorial into the single script below. I noticed that the model does get moved to the GPU, since the memory usage increases, but utilization remains at 0% throughout training.

I tried the following settings:

  1. Running the script with CUDA available (batch size 64)
  2. Running the script with CUDA available (batch size 1048576)
  3. Running the script with no_cuda=True (CPU only, batch size 64)

I put all results in one big screenshot since I can’t upload more than one:

The results show no real difference among the three settings in terms of training speed. I had expected GPU utilization to go up when I made the batch size significantly larger (1048576 instead of 64), but that was not the case; only the memory usage went up.

I had also expected the training time to go up when using the CPU instead of CUDA, but that was not the case either: all settings take roughly 1 hour 50 minutes. It's as if training is being done on the CPU no matter the setting. I also checked htop; there isn't a bottleneck anywhere as far as I can see.

So my question is: is this expected behavior, or is there something amiss? Could the collator be the cause of this massive overhead? But even if that is the case, shouldn't we see greater GPU utilization when the data is moved to the device for the forward pass? Any thoughts on this odd behavior would be much appreciated.
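
To try to narrow down whether the collator is the bottleneck, here is a rough timing sketch I put together (my own untested addition, not part of the tutorial; it assumes the collator, model, and dataset objects from the script below have already been constructed and that a CUDA device is available):

import time
import torch

# Rough timing sketch: compare one collator call against one forward/backward
# pass on the GPU. Assumes `collator`, `model`, and `dataset` from the script
# below already exist and CUDA is available.
device = torch.device("cuda")
model = model.to(device)

# The collator only uses len(features) and resamples from its own dataset,
# so the contents of this list don't matter.
features = [dataset["train"][0]] * 64

t0 = time.perf_counter()
batch = collator(features)                       # CPU-side numpy work
t1 = time.perf_counter()

batch = {k: v.to(device) for k, v in batch.items()}
loss = model(**batch)["loss"]                    # forward pass on the GPU
loss.backward()                                  # backward pass on the GPU
torch.cuda.synchronize()                         # wait for the GPU kernels to finish
t2 = time.perf_counter()

print(f"collator: {t1 - t0:.3f}s  forward+backward: {t2 - t1:.3f}s")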

Code:

import os
import random
from dataclasses import dataclass

import numpy as np
import torch
from datasets import load_dataset
from torch import nn
from transformers import (
    DecisionTransformerConfig,
    DecisionTransformerModel,
    Trainer,
    TrainingArguments,
)

os.environ["WANDB_DISABLED"] = (
    "true"  # we diable weights and biases logging for this tutorial
)


class TrainableDT(DecisionTransformerModel):
    def __init__(self, config):
        super().__init__(config)

    def forward(self, **kwargs):
        output = super().forward(**kwargs)
        # add the DT loss
        action_preds = output[1]
        action_targets = kwargs["actions"]
        attention_mask = kwargs["attention_mask"]
        act_dim = action_preds.shape[2]
        action_preds = action_preds.reshape(-1, act_dim)[
            attention_mask.reshape(-1) > 0
        ]
        action_targets = action_targets.reshape(-1, act_dim)[
            attention_mask.reshape(-1) > 0
        ]

        loss = torch.mean((action_preds - action_targets) ** 2)

        return {"loss": loss}

    def original_forward(self, **kwargs):
        return super().forward(**kwargs)


@dataclass
class DecisionTransformerGymDataCollator:
    return_tensors: str = "pt"
    max_len: int = 20  # subsets of the episode we use for training
    state_dim: int = 17  # size of state space
    act_dim: int = 6  # size of action space
    max_ep_len: int = 1000  # max episode length in the dataset
    scale: float = 1000.0  # normalization of rewards/returns
    state_mean: np.array = None  # to store state means
    state_std: np.array = None  # to store state stds
    p_sample: np.array = (
        None  # a distribution to take trajectory lengths into account
    )
    n_traj: int = 0  # to store the number of trajectories in the dataset

    def __init__(self, dataset) -> None:
        self.act_dim = len(dataset[0]["actions"][0])
        self.state_dim = len(dataset[0]["observations"][0])
        self.dataset = dataset
        # calculate dataset stats for normalization of states
        states = []
        traj_lens = []
        for obs in dataset["observations"]:
            states.extend(obs)
            traj_lens.append(len(obs))
        self.n_traj = len(traj_lens)
        states = np.vstack(states)
        self.state_mean, self.state_std = (
            np.mean(states, axis=0),
            np.std(states, axis=0) + 1e-6,
        )

        traj_lens = np.array(traj_lens)
        self.p_sample = traj_lens / sum(traj_lens)

    def _discount_cumsum(self, x, gamma):
        discount_cumsum = np.zeros_like(x)
        discount_cumsum[-1] = x[-1]
        for t in reversed(range(x.shape[0] - 1)):
            discount_cumsum[t] = x[t] + gamma * discount_cumsum[t + 1]
        return discount_cumsum

    def __call__(self, features):
        batch_size = len(features)
        # this is a bit of a hack to be able to sample from a non-uniform distribution
        batch_inds = np.random.choice(
            np.arange(self.n_traj),
            size=batch_size,
            replace=True,
            p=self.p_sample,  # reweights so we sample according to timesteps
        )
        # a batch of dataset features
        s, a, r, d, rtg, timesteps, mask = [], [], [], [], [], [], []

        for ind in batch_inds:
            # for feature in features:
            feature = self.dataset[int(ind)]
            si = random.randint(0, len(feature["rewards"]) - 1)

            # get sequences from dataset
            s.append(
                np.array(
                    feature["observations"][si : si + self.max_len]
                ).reshape(1, -1, self.state_dim)
            )
            a.append(
                np.array(feature["actions"][si : si + self.max_len]).reshape(
                    1, -1, self.act_dim
                )
            )
            r.append(
                np.array(feature["rewards"][si : si + self.max_len]).reshape(
                    1, -1, 1
                )
            )

            d.append(
                np.array(feature["dones"][si : si + self.max_len]).reshape(
                    1, -1
                )
            )
            timesteps.append(np.arange(si, si + s[-1].shape[1]).reshape(1, -1))
            timesteps[-1][timesteps[-1] >= self.max_ep_len] = (
                self.max_ep_len - 1
            )  # padding cutoff
            rtg.append(
                self._discount_cumsum(
                    np.array(feature["rewards"][si:]), gamma=1.0
                )[
                    : s[-1].shape[1]  # TODO check the +1 removed here
                ].reshape(
                    1, -1, 1
                )
            )
            if rtg[-1].shape[1] < s[-1].shape[1]:
                print("if true")
                rtg[-1] = np.concatenate([rtg[-1], np.zeros((1, 1, 1))], axis=1)

            # padding and state + reward normalization
            tlen = s[-1].shape[1]
            s[-1] = np.concatenate(
                [np.zeros((1, self.max_len - tlen, self.state_dim)), s[-1]],
                axis=1,
            )
            s[-1] = (s[-1] - self.state_mean) / self.state_std
            a[-1] = np.concatenate(
                [
                    np.ones((1, self.max_len - tlen, self.act_dim)) * -10.0,
                    a[-1],
                ],
                axis=1,
            )
            r[-1] = np.concatenate(
                [np.zeros((1, self.max_len - tlen, 1)), r[-1]], axis=1
            )
            d[-1] = np.concatenate(
                [np.ones((1, self.max_len - tlen)) * 2, d[-1]], axis=1
            )
            rtg[-1] = (
                np.concatenate(
                    [np.zeros((1, self.max_len - tlen, 1)), rtg[-1]], axis=1
                )
                / self.scale
            )
            timesteps[-1] = np.concatenate(
                [np.zeros((1, self.max_len - tlen)), timesteps[-1]], axis=1
            )
            mask.append(
                np.concatenate(
                    [np.zeros((1, self.max_len - tlen)), np.ones((1, tlen))],
                    axis=1,
                )
            )

        s = torch.from_numpy(np.concatenate(s, axis=0)).float()
        a = torch.from_numpy(np.concatenate(a, axis=0)).float()
        r = torch.from_numpy(np.concatenate(r, axis=0)).float()
        d = torch.from_numpy(np.concatenate(d, axis=0))
        rtg = torch.from_numpy(np.concatenate(rtg, axis=0)).float()
        timesteps = torch.from_numpy(np.concatenate(timesteps, axis=0)).long()
        mask = torch.from_numpy(np.concatenate(mask, axis=0)).float()

        return {
            "states": s,
            "actions": a,
            "rewards": r,
            "returns_to_go": rtg,
            "timesteps": timesteps,
            "attention_mask": mask,
        }


def set_seed(seed: int):
    """Setting one seed for different libraries (for reproducability)

    Args:
        seed (int): Fixed number to be used for reproducibility
            of results whenever the code runs
    """
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)


def train() -> None:
    cfg = {
        "general": {
            "seed": 42,
        },
        "trainer": {
            "output_dir": "output/",
            "remove_unused_columns": False,
            "num_train_epochs": 120,
            "per_device_train_batch_size": 64,
            "learning_rate": 1e-4,
            "weight_decay": 1e-4,
            "warmup_ratio": 0.1,
            "warmup_steps": 500,
            "optim": "adamw_torch",
            "max_grad_norm": 0.25,
            "no_cuda": False,
        },
        "dataset": {
            "path": "edbeeching/decision_transformer_gym_replay",
            "name": "halfcheetah-expert-v2",
        },
    }

    set_seed(cfg["general"]["seed"])

    dataset = load_dataset(
        cfg["dataset"]["path"],
        cfg["dataset"]["name"],
    )

    collator = DecisionTransformerGymDataCollator(dataset["train"])

    model_config = DecisionTransformerConfig(
        state_dim=collator.state_dim, act_dim=collator.act_dim
    )
    model = TrainableDT(model_config)

    training_args = TrainingArguments(**cfg["trainer"])

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=dataset["train"],
        data_collator=collator,
    )

    trainer.train()


if __name__ == "__main__":
    train()

Requirements:

accelerate==0.30.1
datasets==2.19.1
torch==2.3.0
transformers==4.40.2
tqdm==4.66.4

Hi @agkhalil, I’m facing the same issue where the model only utilizes the CPU, even though CUDA is available. I’ve also tried increasing the batch size and confirming that the GPU memory increases, but utilization remains at 0% throughout training. Did you happen to find a solution or any workaround for this? Any insights would be really appreciated!


There are many possible causes, such as CUDA_VISIBLE_DEVICES not being set, torch installed as the CPU build rather than the CUDA build, or a missing CUDA toolkit, but I couldn't say whether it's a Trainer bug. …
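
A quick sanity check for those basics could look something like this (just a sketch):

import os
import torch

# Environment sanity checks (sketch)
print("CUDA_VISIBLE_DEVICES:", os.environ.get("CUDA_VISIBLE_DEVICES"))
print("torch version       :", torch.__version__)    # e.g. 2.3.0+cu121 vs 2.3.0+cpu
print("cuda available      :", torch.cuda.is_available())
print("cuda build          :", torch.version.cuda)    # None for CPU-only builds
if torch.cuda.is_available():
    print("device name         :", torch.cuda.get_device_name(0))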

@John6666 I’ve already checked and confirmed that I’ve set CUDA_VISIBLE_DEVICES, installed PyTorch for CUDA, and have the CUDA toolkit in place. Everything indicates that it’s working fine on my end. If you need me to run any specific code or send over any part of my code for you to review, just let me know!


I sometimes hear about errors related to DeepSpeed, but it isn't used in this code, and the code itself doesn't look particularly problematic…
If the problem lies in Trainer settings I'm not aware of, maybe one of the more knowledgeable HF staff who drop in from time to time will know something about it.
Even without doing the following, torch and the HF libraries try to use CUDA preferentially whenever they can detect it in the environment, so the code below shouldn't make any difference…

#model = TrainableDT(model_config)
model = TrainableDT(model_config).to("cuda")

Edit:
I think torch 2.3.0 was buggy…

torch==2.3.0

Haha, you’re right, it doesn’t make much sense! :smile: My current version of PyTorch is 2.5.1+cu124. I’ve also tried using other versions, but the issue persists, so I don’t think it’s related to PyTorch.

Edit: The hub doesn't allow me to post more replies, so I'll write here, I guess.
@John6666 I hope someone knowledgeable or staff can read this and help us out! Could you share the link to those other complaints you mentioned? It might give us more insight into the issue (hopefully). Thanks!


I saw several people complain about a similar problem with task allocation across multiple GPUs, but I think the cause of that one also remained unknown in the end.
With these inscrutable bugs, the cause is usually somewhere out of sight, anywhere between the hardware, the OS, Python, and the libraries. In such cases there is often no real solution, only a workaround, and sometimes barely that.
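
As one such workaround, a plain PyTorch loop that takes the Trainer out of the equation might look like this (a rough sketch, assuming the dataset, collator, TrainableDT, and model_config objects from the original script are already set up):

import torch
from torch.utils.data import DataLoader

# Workaround sketch: drive the GPU directly with a plain PyTorch loop,
# bypassing Trainer. Assumes `dataset`, `collator`, `TrainableDT`, and
# `model_config` from the original script above.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = TrainableDT(model_config).to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4, weight_decay=1e-4)

# The DataLoader hands a list of dataset rows to the collator, which is all
# the collator needs (it resamples trajectories internally).
loader = DataLoader(dataset["train"], batch_size=64, collate_fn=collator)

model.train()
for step, batch in enumerate(loader):
    batch = {k: v.to(device) for k, v in batch.items()}
    loss = model(**batch)["loss"]
    optimizer.zero_grad()
    loss.backward()
    torch.nn.utils.clip_grad_norm_(model.parameters(), 0.25)
    optimizer.step()
    if step % 100 == 0:
        print(f"step {step}: loss {loss.item():.4f}")  # watch nvidia-smi while this runs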

Edit:
I followed the ironclad rule of “look for a working program first” and tried to imitate it properly, but GPU usage still didn't go up…
(I just made up that ironclad rule.)

import os
import random
from dataclasses import dataclass

import numpy as np
import torch
from datasets import load_dataset
from torch import nn
from transformers import (
    DecisionTransformerConfig,
    DecisionTransformerModel,
    Trainer,
    TrainingArguments,
)

os.environ["WANDB_DISABLED"] = (
    "true"  # we diable weights and biases logging for this tutorial
)

device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Running on {device}.")

class TrainableDT(DecisionTransformerModel):
    def __init__(self, config):
        super().__init__(config)

    def forward(self, **kwargs):
        output = super().forward(**kwargs)
        # add the DT loss
        action_preds = output[1]
        action_targets = kwargs["actions"]
        attention_mask = kwargs["attention_mask"]
        act_dim = action_preds.shape[2]
        action_preds = action_preds.reshape(-1, act_dim)[
            attention_mask.reshape(-1) > 0
        ]
        action_targets = action_targets.reshape(-1, act_dim)[
            attention_mask.reshape(-1) > 0
        ]

        loss = torch.mean((action_preds - action_targets) ** 2)

        return {"loss": loss}

    def original_forward(self, **kwargs):
        return super().forward(**kwargs)


@dataclass
class DecisionTransformerGymDataCollator:
    return_tensors: str = "pt"
    max_len: int = 20  # subsets of the episode we use for training
    state_dim: int = 17  # size of state space
    act_dim: int = 6  # size of action space
    max_ep_len: int = 1000  # max episode length in the dataset
    scale: float = 1000.0  # normalization of rewards/returns
    state_mean: np.array = None  # to store state means
    state_std: np.array = None  # to store state stds
    p_sample: np.array = (
        None  # a distribution to take trajectory lengths into account
    )
    n_traj: int = 0  # to store the number of trajectories in the dataset

    def __init__(self, dataset) -> None:
        self.act_dim = len(dataset[0]["actions"][0])
        self.state_dim = len(dataset[0]["observations"][0])
        self.dataset = dataset
        # calculate dataset stats for normalization of states
        states = []
        traj_lens = []
        for obs in dataset["observations"]:
            states.extend(obs)
            traj_lens.append(len(obs))
        self.n_traj = len(traj_lens)
        states = np.vstack(states)
        self.state_mean, self.state_std = (
            np.mean(states, axis=0),
            np.std(states, axis=0) + 1e-6,
        )

        traj_lens = np.array(traj_lens)
        self.p_sample = traj_lens / sum(traj_lens)

    def _discount_cumsum(self, x, gamma):
        discount_cumsum = np.zeros_like(x)
        discount_cumsum[-1] = x[-1]
        for t in reversed(range(x.shape[0] - 1)):
            discount_cumsum[t] = x[t] + gamma * discount_cumsum[t + 1]
        return discount_cumsum

    def __call__(self, features):
        batch_size = len(features)
        # this is a bit of a hack to be able to sample from a non-uniform distribution
        batch_inds = np.random.choice(
            np.arange(self.n_traj),
            size=batch_size,
            replace=True,
            p=self.p_sample,  # reweights so we sample according to timesteps
        )
        # a batch of dataset features
        s, a, r, d, rtg, timesteps, mask = [], [], [], [], [], [], []

        for ind in batch_inds:
            # for feature in features:
            feature = self.dataset[int(ind)]
            si = random.randint(0, len(feature["rewards"]) - 1)

            # get sequences from dataset
            s.append(
                np.array(
                    feature["observations"][si : si + self.max_len]
                ).reshape(1, -1, self.state_dim)
            )
            a.append(
                np.array(feature["actions"][si : si + self.max_len]).reshape(
                    1, -1, self.act_dim
                )
            )
            r.append(
                np.array(feature["rewards"][si : si + self.max_len]).reshape(
                    1, -1, 1
                )
            )

            d.append(
                np.array(feature["dones"][si : si + self.max_len]).reshape(
                    1, -1
                )
            )
            timesteps.append(np.arange(si, si + s[-1].shape[1]).reshape(1, -1))
            timesteps[-1][timesteps[-1] >= self.max_ep_len] = (
                self.max_ep_len - 1
            )  # padding cutoff
            rtg.append(
                self._discount_cumsum(
                    np.array(feature["rewards"][si:]), gamma=1.0
                )[
                    : s[-1].shape[1]  # TODO check the +1 removed here
                ].reshape(
                    1, -1, 1
                )
            )
            if rtg[-1].shape[1] < s[-1].shape[1]:
                print("if true")
                rtg[-1] = np.concatenate([rtg[-1], np.zeros((1, 1, 1))], axis=1)

            # padding and state + reward normalization
            tlen = s[-1].shape[1]
            s[-1] = np.concatenate(
                [np.zeros((1, self.max_len - tlen, self.state_dim)), s[-1]],
                axis=1,
            )
            s[-1] = (s[-1] - self.state_mean) / self.state_std
            a[-1] = np.concatenate(
                [
                    np.ones((1, self.max_len - tlen, self.act_dim)) * -10.0,
                    a[-1],
                ],
                axis=1,
            )
            r[-1] = np.concatenate(
                [np.zeros((1, self.max_len - tlen, 1)), r[-1]], axis=1
            )
            d[-1] = np.concatenate(
                [np.ones((1, self.max_len - tlen)) * 2, d[-1]], axis=1
            )
            rtg[-1] = (
                np.concatenate(
                    [np.zeros((1, self.max_len - tlen, 1)), rtg[-1]], axis=1
                )
                / self.scale
            )
            timesteps[-1] = np.concatenate(
                [np.zeros((1, self.max_len - tlen)), timesteps[-1]], axis=1
            )
            mask.append(
                np.concatenate(
                    [np.zeros((1, self.max_len - tlen)), np.ones((1, tlen))],
                    axis=1,
                )
            )

        s = torch.from_numpy(np.concatenate(s, axis=0)).float().to(device)
        a = torch.from_numpy(np.concatenate(a, axis=0)).float().to(device)
        r = torch.from_numpy(np.concatenate(r, axis=0)).float().to(device)
        d = torch.from_numpy(np.concatenate(d, axis=0)).to(device)
        rtg = torch.from_numpy(np.concatenate(rtg, axis=0)).float().to(device)
        timesteps = torch.from_numpy(np.concatenate(timesteps, axis=0)).long().to(device)
        mask = torch.from_numpy(np.concatenate(mask, axis=0)).float().to(device)

        return {
            "states": s,
            "actions": a,
            "rewards": r,
            "returns_to_go": rtg,
            "timesteps": timesteps,
            "attention_mask": mask,
        }


def set_seed(seed: int):
    """Setting one seed for different libraries (for reproducability)

    Args:
        seed (int): Fixed number to be used for reproducibility
            of results whenever the code runs
    """
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)


def train() -> None:
    cfg = {
        "general": {
            "seed": 42,
        },
        "trainer": {
            "output_dir": "output/",
            "remove_unused_columns": False,
            "num_train_epochs": 120,
            "per_device_train_batch_size": 64,
            "learning_rate": 1e-4,
            "weight_decay": 1e-4,
            "warmup_ratio": 0.1,
            "warmup_steps": 500,
            "optim": "adamw_torch",
            "max_grad_norm": 0.25,
            "no_cuda": False,
            "use_cpu": False,
            "dataloader_pin_memory": False,
            "fsdp": False,
        },
        "dataset": {
            "path": "edbeeching/decision_transformer_gym_replay",
            "name": "halfcheetah-expert-v2",
        },
    }

    set_seed(cfg["general"]["seed"])

    dataset = load_dataset(
        cfg["dataset"]["path"],
        cfg["dataset"]["name"],
    )

    collator = DecisionTransformerGymDataCollator(dataset["train"])

    model_config = DecisionTransformerConfig(
        state_dim=collator.state_dim, act_dim=collator.act_dim
    )
    model = TrainableDT(model_config).to(device)

    training_args = TrainingArguments(**cfg["trainer"])

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=dataset["train"],
        data_collator=collator,
    )
    trainer.place_model_on_device = True

    trainer.train()


if __name__ == "__main__":
    train()