I raised a previous issue here, where it was suggested that the problem was caused by how SageMaker pipelines work. However, I'm now calling a single endpoint directly and getting the same error (I created a separate post since the other one deals mostly with pipelines, and this one is focused on the endpoint). The error is triggered when I make 200 requests at roughly 50-60 requests per second. Adding more instances reduces how often it happens, but it still appears occasionally. Average latency is about 1-2 seconds, with a 2.1-second maximum. I've also tried a larger instance, up to ml.m5.12xlarge, with the same result.
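For reference, this is roughly how I generate the load (a minimal sketch; the endpoint name and payload values are illustrative and just mirror what inference.py parses):

import json
from concurrent.futures import ThreadPoolExecutor

import boto3

runtime = boto3.client("sagemaker-runtime")

def invoke(i):
    # Payload shape matches what input_fn/preprocess_function expect below.
    payload = {"type": "incomeType",
               "income": {"description": f"test transaction {i}",
                          "date": "2021-12-01",
                          "amount": 100.0}}
    response = runtime.invoke_endpoint(
        EndpointName="my-embedding-endpoint",  # illustrative name
        ContentType="application/json",
        Body=json.dumps(payload))
    return response["Body"].read()

# 200 requests total, up to 50 in flight at a time
with ThreadPoolExecutor(max_workers=50) as pool:
    results = list(pool.map(invoke, range(200)))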
Below is my SageMaker deployment code, along with the entry-point script (inference.py).
@philschmid you mentioned you haven't seen this error before, but I am seeing it everywhere, and at fairly low request loads sent directly to the endpoint. This makes me think there must be some underlying issue somewhere, even though this is a pretty basic model deployment. Could there be some issue with the model.tar.gz archive itself? Unless I'm doing something I shouldn't in the inference.py script below, I really don't know what's causing this. Any help is appreciated. Thanks!
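In case the archive is the culprit, this is how I sanity-check its layout (a sketch; the local model.tar.gz path and the files I expect at the root and under code/ are assumptions based on my setup):

import tarfile

# List everything packed into the model archive; I expect the weights and
# config at the root and the entry-point script under code/.
with tarfile.open("model.tar.gz", "r:gz") as tar:
    for member in tar.getmembers():
        print(member.name, member.size)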
model = HuggingFaceModel(
    transformers_version="4.6",   # transformers version used
    pytorch_version="1.7",        # pytorch version used
    py_version="py36",            # python version used
    entry_point="embed_source/inference.py",
    model_data=emb_model.model_artifacts,
    model_server_workers=4,
    sagemaker_session=sagemaker_session,
    name=emb_name,
    role=role,
)
model.deploy(
    initial_instance_count=1,
    instance_type="ml.m5.xlarge",
    endpoint_name=model_name,
    wait=True,
)
I have also tested using the latest container version:
transformers_version="4.12.3", # transformers version used
pytorch_version="1.9.1", # pytorch version used
py_version='py38'
with the same results.
inference.py
import subprocess
import sys
import json
import os

import numpy as np
import torch
import boto3
from transformers import AutoModel, AutoTokenizer, AutoModelForMaskedLM
from importlib import reload

print('\nboto3 loaded')

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print('\ndevice:', device)
def forward_pass(batch, model):
    input_ids = torch.tensor(batch["input_ids"]).to(device)
    attention_mask = torch.tensor(batch["attention_mask"]).to(device)

    with torch.no_grad():
        last_hidden_state = model(input_ids, attention_mask).last_hidden_state
    last_hidden_state = last_hidden_state.cpu().numpy()

    # Use average of unmasked hidden states for classification
    lhs_shape = last_hidden_state.shape
    boolean_mask = ~np.array(batch["attention_mask"]).astype(bool)
    boolean_mask = np.repeat(boolean_mask, lhs_shape[-1], axis=-1)
    boolean_mask = boolean_mask.reshape(lhs_shape)
    masked_mean = np.ma.array(last_hidden_state, mask=boolean_mask).mean(axis=1)
    batch["hidden_state"] = masked_mean.data
    return batch
def preprocess_function(examples):
    print('attempting to tokenize:', examples)
    if examples['type'] == 'incomeType':
        my_type = 'income'
    else:
        my_type = 'expense'
    print('my_type', my_type)

    t = tokenizer([examples[my_type]["description"]], truncation=True)
    t['description'] = examples[my_type]["description"]
    t['date'] = examples[my_type]["date"]
    t['amount'] = examples[my_type]["amount"]
    t['type'] = examples['type']
    print('t', t)
    return t
print('\nos.getcwd()', os.getcwd())

print('\nModel Walk:')
for path, subdirs, files in os.walk('/opt/ml'):
    for name in files:
        print(os.path.join(path, name))

tokenizer = AutoTokenizer.from_pretrained('/opt/ml/model')

def model_fn(model_dir):
    print('\nIn model_fn')
    print('\nmodel_dir:', model_dir)
    # load from the model_dir the toolkit passes in (it resolves to /opt/ml/model)
    model = AutoModel.from_pretrained(model_dir, output_hidden_states=True).to(device)
    return model

print('\ninference directory', os.listdir(os.curdir))

print('\nWalk:')
for path, subdirs, files in os.walk('/opt/ml'):
    for name in files:
        print(os.path.join(path, name))
def input_fn(data, content_type):
    print('\nin data', data, content_type, type(data))
    request = json.loads(data)

    # preprocess dataset
    print('attempting preprocess')
    print('request', request)
    response = preprocess_function(request)
    print('response', response)
    print('\nfwd pass')
    return response

def predict_fn(data, model):
    print('\nin predict:', data)
    res2 = forward_pass(data, model)
    return res2
def output_fn(prediction, accept):
    print('\nin output', type(prediction))
    j = [{"description": prediction.description,
          "amount": prediction.amount,
          "type": prediction.type,
          "date": prediction.date,
          "inputs": prediction.input_ids,
          "embeddings": prediction.hidden_state.tolist()}]
    return json.dumps(j)
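For context, the handlers above expect a request body shaped like this, and return a one-element JSON list with the pooled embedding (the values here are illustrative):

# Request body that input_fn/preprocess_function parse:
request_body = {
    "type": "incomeType",   # anything else is treated as "expense"
    "income": {             # key becomes "expense" when type != "incomeType"
        "description": "monthly salary",
        "date": "2021-12-01",
        "amount": 4200.0,
    },
}

# output_fn then returns JSON like:
# [{"description": ..., "amount": ..., "type": ..., "date": ...,
#   "inputs": [[...token ids...]], "embeddings": [[...pooled vector...]]}]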