I have an endpoint that runs inference in batches (currently 100 items per request; see the inference script below). I occasionally get the following error:
2022-05-19 15:25:43,484 [INFO ] W-model-4-stdout com.amazonaws.ml.mms.wlm.WorkerLifeCycle - mms.service.PredictionException: CUDA out of memory. Tried to allocate 94.00 MiB (GPU 0; 14.76 GiB total capacity; 747.06 MiB already allocated; 66.44 MiB free; 1.01 GiB reserved in total by PyTorch) : 400
I am currently using a single g4dn.xlarge instance.
What are my options for addressing this?
- Move to an instance with more GPU memory? It looks like all g4dn instances have the same GPU memory (16 GB) up to g4dn.12xlarge (which has 64 GB across 4 GPUs). What would be the cheapest way to get more GPU memory than a g4dn.xlarge? g5.xlarge looks like it has 24 GB of GPU memory, so that seems like the next step up for inference in terms of GPU memory and cost. EDIT: it looks like g5 is not available for inference endpoints. I also tested g4dn.2xlarge with the same result, so it seems that to get more GPU memory you have to increase instance costs pretty drastically?
- Reduce the max length used for truncation (currently set at 512; sketched below)
- Split the 100-item request into smaller batches of requests, or micro-batch inside the endpoint (see the sketch after this list)
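
To make options 2 and 3 concrete, here is a minimal sketch of what I have in mind (not tested at scale): cap the tokenized length explicitly and run the forward pass in smaller chunks inside the endpoint. It reuses the tokenizer, model and device objects from my script below; max_length=256 and chunk_size=16 are placeholder values I would still have to tune.

def preprocess_function_capped(examples, max_length=256):
    # Placeholder max_length; truncates harder than the model default of 512
    t = tokenizer(examples, padding=True, truncation=True, max_length=max_length)
    t['source'] = examples
    return t

def forward_pass_chunked(batch, model, chunk_size=16):
    # Run the batch through the model chunk_size items at a time so the peak
    # GPU allocation stays small; chunk_size is a placeholder to tune.
    input_ids = torch.tensor(batch["input_ids"])
    attention_mask = torch.tensor(batch["attention_mask"])
    means = []
    with torch.no_grad():
        for start in range(0, input_ids.shape[0], chunk_size):
            ids = input_ids[start:start + chunk_size].to(device)
            mask = attention_mask[start:start + chunk_size].to(device)
            hidden = model(ids, attention_mask=mask).last_hidden_state
            # Average only the unmasked token positions, then move off the GPU
            mask_f = mask.unsqueeze(-1).float()
            means.append(((hidden * mask_f).sum(dim=1) / mask_f.sum(dim=1)).cpu())
    batch["hidden_state"] = torch.cat(means).numpy()
    return batch
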
Do I have any other options for getting around GPU memory issues at inference time?
Here is my inference script (the endpoint returns embeddings for the input text documents):
import subprocess
import sys
import json
import os
import numpy as np
import torch
from transformers import AutoModel, AutoTokenizer, AutoModelForMaskedLM
from importlib import reload
print('\nb3 not loaded')
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print('\ndevice:',device)
print('os getenv response size', os.getenv('TS_MAX_RESPONSE_SIZE'))
print('os getenv response size', os.getenv('MMS_MAX_RESPONSE_SIZE'))
def forward_pass(batch, model):
    input_ids = torch.tensor(batch["input_ids"]).to(device)
    attention_mask = torch.tensor(batch["attention_mask"]).to(device)
    with torch.no_grad():
        last_hidden_state = model(input_ids, attention_mask).last_hidden_state
        last_hidden_state = last_hidden_state.cpu().numpy()
    # Use average of unmasked hidden states for classification
    lhs_shape = last_hidden_state.shape
    # Expand the (batch, seq_len) attention mask to the hidden-state shape
    # (batch, seq_len, hidden_size) so padded positions are excluded from the mean
    boolean_mask = ~np.array(batch["attention_mask"]).astype(bool)
    boolean_mask = np.repeat(boolean_mask, lhs_shape[-1], axis=-1)
    boolean_mask = boolean_mask.reshape(lhs_shape)
    masked_mean = np.ma.array(last_hidden_state, mask=boolean_mask).mean(axis=1)
    batch["hidden_state"] = masked_mean.data
    return batch
def preprocess_function(examples):
    print('Attempting to tokenize')
    t = tokenizer(examples, padding=True, truncation=True)
    t['source'] = examples
    return t
print('\nos.getcwd()', os.getcwd())
print('\nWalk:')
for path, subdirs, files in os.walk('/opt/ml'):
    for name in files:
        print(os.path.join(path, name))
tokenizer = AutoTokenizer.from_pretrained('/opt/ml/model')
def model_fn(model_dir):
    print('\nIn model_fn')
    print('\nmodel_dir::', model_dir)
    model = AutoModel.from_pretrained('/opt/ml/model', output_hidden_states=True).to(device)
    return model
print('\ninference directory', os.listdir(os.curdir))
print('\nWalk:')
for path, subdirs, files in os.walk('/opt/ml'):
    for name in files:
        print(os.path.join(path, name))
def input_fn(data, content_type):
    print('\nin data', data, content_type, type(data))
    request = json.loads(data)
    print('request', request)
    descriptions = request
    print('descriptions', descriptions)
    print('attempting preprocess')
    response = preprocess_function(request)
    print('response', response)
    print('\nfwd pass')
    return response
def predict_fn(data, model):
    print('\nin predict!', data)
    res2 = forward_pass(data, model)
    return res2
def output_fn(prediction, accept):
    print('\nin output', type(prediction))
    j = [{"inputs": prediction.input_ids, "source": prediction.source, "embeddings": prediction.hidden_state.tolist()}]
    return json.dumps(j)
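
For completeness, this is roughly how I invoke the endpoint (sketch only; the endpoint name and documents are placeholders): the request body is a JSON list of strings, which is what input_fn json.loads()es and tokenizes, and the response is the JSON list built by output_fn.

import json
import boto3

runtime = boto3.client("sagemaker-runtime")
docs = ["first example document", "second example document"]  # placeholder texts

response = runtime.invoke_endpoint(
    EndpointName="my-embedding-endpoint",  # hypothetical endpoint name
    ContentType="application/json",
    Body=json.dumps(docs),
)
result = json.loads(response["Body"].read())
# result is the list built in output_fn:
# [{"inputs": [...], "source": [...], "embeddings": [...]}]
print(len(result[0]["embeddings"]), "embedding vectors returned")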