## ❓ Any advice for freeing up GPU memory after training a large model (e.g., roberta-large)?
### System Info
```
Platform Linux-4.4.0-1096-aws-x86_64-with-debian-stretch-sid
Python 3.6.6 |Anaconda, Inc.| (default, Jun 28 2018, 17:14:51)
[GCC 7.2.0]
PyTorch 1.3.0
AWS EC2 p3.2xlarge (single GPU)
```
My current objective is to run multiple `roberta-large` training jobs sequentially from the same Python script (i.e., for a simple HPO search). Even after deleting all of the training objects and clearing the CUDA cache once the first training job ends, I am still stuck at 41% GPU memory usage (compared to ~15% before the training loop starts).
Here is a reproducible example that triggers the error:
### Here is the relevant code
```python
show_gpu('Initial GPU memory usage:')

for i in range(2):
    model, optimizer, scheduler = get_training_obj(params)
    show_gpu(f'{i}: GPU memory usage after loading training objects:')
    for epoch in range(1):
        epoch_start = time.time()
        model.train()
        for batch in dp.train_dataloader:
            xb, mb, _, yb = tuple(t.to(params['device']) for t in batch)
            outputs = model(input_ids=xb, attention_mask=mb, labels=yb)
            loss = outputs[0]
            loss.backward()
            optimizer.step()
            scheduler.step()
            optimizer.zero_grad()
    show_gpu(f'{i}: GPU memory usage after training model:')
    del model, optimizer, scheduler, loss, outputs
    torch.cuda.empty_cache()
    torch.cuda.synchronize()
    show_gpu(f'{i}: GPU memory usage after clearing cache:')
```
### Here is the output and full traceback
```
Initial GPU memory usage: 0.0% (0 out of 16130)
0: GPU memory usage after loading training objects: 14.7% (2377 out of 16130)
0: GPU memory usage after training model: 70.8% (11415 out of 16130)
0: GPU memory usage after clearing cache: 41.8% (6741 out of 16130)
1: GPU memory usage after loading training objects: 50.2% (8093 out of 16130)
---------------------------------------------------------------------------
RuntimeError Traceback (most recent call last)
<ipython-input-7-20a3bdec1bf4> in <module>()
8 for batch in dp.train_dataloader:
9 xb,mb,_,yb = tuple(t.to(params['device']) for t in batch)
---> 10 outputs = model(input_ids = xb, attention_mask = mb, labels = yb)
11 loss = outputs[0]
12 loss.backward()
~/anaconda3/lib/python3.6/site-packages/torch/nn/modules/module.py in __call__(self, *input, **kwargs)
539 result = self._slow_forward(*input, **kwargs)
540 else:
--> 541 result = self.forward(*input, **kwargs)
542 for hook in self._forward_hooks.values():
543 hook_result = hook(self, input, result)
~/anaconda3/lib/python3.6/site-packages/transformers/modeling_roberta.py in forward(self, input_ids, attention_mask, token_type_ids, position_ids, head_mask, labels)
326 token_type_ids=token_type_ids,
327 position_ids=position_ids,
--> 328 head_mask=head_mask)
329 sequence_output = outputs[0]
330 logits = self.classifier(sequence_output)
~/anaconda3/lib/python3.6/site-packages/torch/nn/modules/module.py in __call__(self, *input, **kwargs)
539 result = self._slow_forward(*input, **kwargs)
540 else:
--> 541 result = self.forward(*input, **kwargs)
542 for hook in self._forward_hooks.values():
543 hook_result = hook(self, input, result)
~/anaconda3/lib/python3.6/site-packages/transformers/modeling_roberta.py in forward(self, input_ids, attention_mask, token_type_ids, position_ids, head_mask)
179 token_type_ids=token_type_ids,
180 position_ids=position_ids,
--> 181 head_mask=head_mask)
182
183
~/anaconda3/lib/python3.6/site-packages/transformers/modeling_bert.py in forward(self, input_ids, attention_mask, token_type_ids, position_ids, head_mask)
625 encoder_outputs = self.encoder(embedding_output,
626 extended_attention_mask,
--> 627 head_mask=head_mask)
628 sequence_output = encoder_outputs[0]
629 pooled_output = self.pooler(sequence_output)
~/anaconda3/lib/python3.6/site-packages/torch/nn/modules/module.py in __call__(self, *input, **kwargs)
539 result = self._slow_forward(*input, **kwargs)
540 else:
--> 541 result = self.forward(*input, **kwargs)
542 for hook in self._forward_hooks.values():
543 hook_result = hook(self, input, result)
~/anaconda3/lib/python3.6/site-packages/transformers/modeling_bert.py in forward(self, hidden_states, attention_mask, head_mask)
346 all_hidden_states = all_hidden_states + (hidden_states,)
347
--> 348 layer_outputs = layer_module(hidden_states, attention_mask, head_mask[i])
349 hidden_states = layer_outputs[0]
350
~/anaconda3/lib/python3.6/site-packages/torch/nn/modules/module.py in __call__(self, *input, **kwargs)
539 result = self._slow_forward(*input, **kwargs)
540 else:
--> 541 result = self.forward(*input, **kwargs)
542 for hook in self._forward_hooks.values():
543 hook_result = hook(self, input, result)
~/anaconda3/lib/python3.6/site-packages/transformers/modeling_bert.py in forward(self, hidden_states, attention_mask, head_mask)
326 attention_outputs = self.attention(hidden_states, attention_mask, head_mask)
327 attention_output = attention_outputs[0]
--> 328 intermediate_output = self.intermediate(attention_output)
329 layer_output = self.output(intermediate_output, attention_output)
330 outputs = (layer_output,) + attention_outputs[1:] # add attentions if we output them
~/anaconda3/lib/python3.6/site-packages/torch/nn/modules/module.py in __call__(self, *input, **kwargs)
539 result = self._slow_forward(*input, **kwargs)
540 else:
--> 541 result = self.forward(*input, **kwargs)
542 for hook in self._forward_hooks.values():
543 hook_result = hook(self, input, result)
~/anaconda3/lib/python3.6/site-packages/transformers/modeling_bert.py in forward(self, hidden_states)
298 def forward(self, hidden_states):
299 hidden_states = self.dense(hidden_states)
--> 300 hidden_states = self.intermediate_act_fn(hidden_states)
301 return hidden_states
302
~/anaconda3/lib/python3.6/site-packages/transformers/modeling_bert.py in gelu(x)
126 Also see https://arxiv.org/abs/1606.08415
127 """
--> 128 return x * 0.5 * (1.0 + torch.erf(x / math.sqrt(2.0)))
129
130 def gelu_new(x):
RuntimeError: CUDA out of memory. Tried to allocate 26.00 MiB (GPU 0; 15.75 GiB total capacity; 14.24 GiB already allocated; 8.88 MiB free; 476.01 MiB cached)
```
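Reading the error message, the second run starts with the ~6.7 GB left over from the first run, so its own activations push the allocator over the 16 GB limit. One variation I have not actually tried yet (so the sketch below is only an assumption on my part, not something I have verified) is to also delete the last batch tensors and force a garbage-collection pass before emptying the cache, in case stale references are what keeps that memory from being released:

```python
# Sketch only (not verified): extend the cleanup at the end of each trial.
import gc

del model, optimizer, scheduler, loss, outputs
del xb, mb, yb               # the final batch tensors still reference GPU memory
gc.collect()                 # drop lingering references before releasing cached blocks
torch.cuda.empty_cache()
torch.cuda.synchronize()
```

I am not sure whether that would be enough, or whether the leftover memory is held somewhere else entirely.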
### Appendix / complete code to reproduce
```python
from __future__ import absolute_import, division, print_function
import platform; print("Platform", platform.platform())
import sys; print("Python", sys.version)
import torch; print("PyTorch", torch.__version__)
import glob
import logging
import os
import time
import json
import random
import numpy as np
import pandas as pd
from random import sample, seed
import torch
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler,TensorDataset
from torch.utils.data.distributed import DistributedSampler
from transformers import RobertaConfig, RobertaForSequenceClassification, RobertaTokenizer
from transformers import DistilBertConfig, DistilBertForSequenceClassification, DistilBertTokenizer
from transformers import BertConfig, BertForSequenceClassification, BertTokenizer
from transformers import AdamW, WarmupLinearSchedule
from transformers import glue_compute_metrics as compute_metrics
from transformers import glue_output_modes as output_modes
from transformers import glue_processors as processors
from transformers import glue_convert_examples_to_features as convert_examples_to_features
import subprocess
params = {
    'num_epochs': 2,
    'warmup_ratio': 0.06,
    'weight_decay': 0.1,
    'adam_epsilon': 1e-6,
    'model_name': 'roberta-large',
    'max_grad_norm': 1.0,
    'lr': 2e-5,
    'bs': 32,
    'device': 'cuda',
    'task': 'cola',
    'data_dir': '/home/ubuntu/glue_data/CoLA',
    'max_seq_length': 50,
    'metric_name': 'mcc',
    'patience': 3,
    'seed': 935,
    'n': -1,
}
class DataProcessor():
    '''Preprocess the data, store data loaders and tokenizer'''
    _TOKEN_TYPES = {
        'roberta': RobertaTokenizer,
        'distilbert': DistilBertTokenizer,
        'bert': BertTokenizer,
    }

    def __init__(self, params):
        model_type = params['model_name'].split('-')[0]
        assert model_type in self._TOKEN_TYPES.keys()
        self.tok = self._TOKEN_TYPES[model_type]
        self.params = params
        self.processor = processors[self.params['task']]()
        self.output_mode = output_modes[self.params['task']]
        self.label_list = self.processor.get_labels()

    @staticmethod
    def _convert_to_tensors(features):
        all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
        all_attention_mask = torch.tensor([f.attention_mask for f in features], dtype=torch.long)
        all_token_type_ids = torch.tensor([f.token_type_ids for f in features], dtype=torch.long)
        all_labels = torch.tensor([f.label for f in features], dtype=torch.long)
        return TensorDataset(all_input_ids, all_attention_mask, all_token_type_ids, all_labels)

    def _load_examples(self, tokenizer, evaluate):
        if evaluate:
            examples = self.processor.get_dev_examples(self.params['data_dir'])
        else:
            examples = self.processor.get_train_examples(self.params['data_dir'])
        if self.params['n'] >= 0:
            examples = sample(examples, self.params['n'])
        features = convert_examples_to_features(examples,
                                                tokenizer,
                                                label_list=self.label_list,
                                                max_length=self.params['max_seq_length'],
                                                output_mode=self.output_mode,
                                                pad_on_left=False,
                                                pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0],
                                                pad_token_segment_id=0)
        return self._convert_to_tensors(features)

    def _define_tokenizer(self):
        return self.tok.from_pretrained(self.params['model_name'], do_lower_case=True)

    def load_data(self):
        tokenizer = self._define_tokenizer()
        self.train_data = self._load_examples(tokenizer, False)
        self.valid_data = self._load_examples(tokenizer, True)
        self.train_n = len(self.train_data)
        self.valid_n = len(self.valid_data)
        self.params['total_steps'] = self.params['num_epochs'] * self.train_n
        return self.params

    def create_loaders(self):
        self.train_dataloader = DataLoader(self.train_data, shuffle=True, batch_size=self.params['bs'])
        self.valid_dataloader = DataLoader(self.valid_data, shuffle=False, batch_size=2 * self.params['bs'])
dp = DataProcessor(params)
params = dp.load_data()
dp.create_loaders()
def show_gpu(msg):
    """
    ref: https://discuss.pytorch.org/t/access-gpu-memory-usage-in-pytorch/3192/4
    """
    def query(field):
        return subprocess.check_output(
            ['nvidia-smi', f'--query-gpu={field}',
             '--format=csv,nounits,noheader'],
            encoding='utf-8')

    def to_int(result):
        return int(result.strip().split('\n')[0])

    used = to_int(query('memory.used'))
    total = to_int(query('memory.total'))
    pct = used / total
    print('\n' + msg, f'{100*pct:2.1f}% ({used} out of {total})')
# ### Running the Training Loop
def get_training_obj(params):
    config = RobertaConfig.from_pretrained(params['model_name'], num_labels=2)
    model = RobertaForSequenceClassification.from_pretrained(params['model_name'], config=config).to(params['device'])
    no_decay = ['bias', 'LayerNorm.weight']
    gpd_params = [
        {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
         'weight_decay': params['weight_decay']},
        {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0}
    ]
    optimizer = AdamW(gpd_params, lr=params['lr'], eps=params['adam_epsilon'])
    warmup_steps = int(params['warmup_ratio'] * params['total_steps'])
    scheduler = WarmupLinearSchedule(optimizer, warmup_steps=warmup_steps, t_total=params['total_steps'])
    return model, optimizer, scheduler
show_gpu('Initial GPU memory usage:')

for i in range(2):
    model, optimizer, scheduler = get_training_obj(params)
    show_gpu(f'{i}: GPU memory usage after loading training objects:')
    for epoch in range(1):
        epoch_start = time.time()
        model.train()
        for batch in dp.train_dataloader:
            xb, mb, _, yb = tuple(t.to(params['device']) for t in batch)
            outputs = model(input_ids=xb, attention_mask=mb, labels=yb)
            loss = outputs[0]
            loss.backward()
            optimizer.step()
            scheduler.step()
            optimizer.zero_grad()
    show_gpu(f'{i}: GPU memory usage after training model:')
    del model, optimizer, scheduler, loss, outputs
    torch.cuda.empty_cache()
    torch.cuda.synchronize()
    show_gpu(f'{i}: GPU memory usage after clearing cache:')
```
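If there is no reliable way to release everything from within a single process, one workaround I am considering for the HPO loop (hypothetical sketch below, untested; `run_trial` and the metric it reports are placeholders of mine, not part of the script above) is to launch each trial in its own process, so that all CUDA memory is returned to the driver when the process exits:

```python
# Hypothetical workaround (untested): one process per trial so the OS reclaims all GPU memory.
import multiprocessing as mp

def run_trial(params, queue):
    """Placeholder: would build the training objects and run the loop shown above."""
    model, optimizer, scheduler = get_training_obj(params)
    # ... training loop exactly as in the script above ...
    queue.put(0.0)  # placeholder for the validation metric

if __name__ == '__main__':
    ctx = mp.get_context('spawn')   # 'spawn' is required when CUDA is used in child processes
    for i in range(2):
        queue = ctx.Queue()
        p = ctx.Process(target=run_trial, args=(params, queue))
        p.start()
        result = queue.get()        # fetch the metric before joining to avoid blocking on a full pipe
        p.join()
        print(f'trial {i} metric: {result}')
```

Is something along these lines the recommended pattern, or is there a way to fully reset the GPU memory state from within one process?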