I am trying to follow along with this example, but using my own pretrained model.
However, I get the following error:
Incorrect tensor shape at input tensor #0: received 5 128 768, expected 1 128 768.
Incorrect tensor shape at input tensor #1: received 5 1 1 128, expected 1 1 1 128.
I’m sending a list of 5 requests to the endpoint, but it looks like the model is expecting just a single request?
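In case it matters, the request I send looks roughly like this (a sketch; the endpoint name is a placeholder, and the bare JSON list is my reading of what ends up as `data` in predict_fn):

import json
import boto3

client = boto3.client("sagemaker-runtime")
# a batch of 5 texts, sent as one JSON list
payload = json.dumps(["text 1", "text 2", "text 3", "text 4", "text 5"])
resp = client.invoke_endpoint(
    EndpointName="my-neuron-endpoint",  # placeholder name
    ContentType="application/json",
    Body=payload,
)
print(json.loads(resp["Body"].read()))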
Compilation and inference code are below. I am guessing this is because I am compiling the model with a single dummy input value?
I was trying to do something like:
tokenizer([dummy_input]*5, max_length=max_length, padding="max_length", return_tensors="pt")
but I get a compilation failure/trace aborted message. How do I get it to accept batches? I know it doesn’t accept dynamic batching - does this mean every batch request must contain the same number of inputs?
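For context, here is the full version of what I attempted (a sketch; tracing with a batch of 5 identical dummy inputs so the compiled graph expects shape [5, 128] is my assumption of how fixed-batch tracing should work, and this is the trace call that aborts for me):

import torch
import torch.neuron
from transformers import AutoModel, AutoTokenizer

model = AutoModel.from_pretrained("tmp/", torchscript=True)
tokenizer = AutoTokenizer.from_pretrained("tmp/")

max_length = 128
batch_size = 5  # fixed batch size I want baked into the traced graph
dummy_batch = ["dummy input which will be padded later"] * batch_size
embeddings = tokenizer(dummy_batch, max_length=max_length,
                       padding="max_length", return_tensors="pt")

# trace with batch-of-5 example inputs; this is the step that fails
model_neuron = torch.neuron.trace(model, tuple(embeddings.values()))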
Compilation code:
import os
import tensorflow # to workaround a protobuf version conflict issue
import torch
import torch.neuron
from transformers import AutoTokenizer, AutoModel
# load tokenizer and model
model = AutoModel.from_pretrained("tmp/", torchscript=True)
tokenizer = AutoTokenizer.from_pretrained('tmp/')
# create dummy input for max length 128
dummy_input = "dummy input which will be padded later"
max_length = 128
embeddings = tokenizer(dummy_input, max_length=max_length, padding="max_length",return_tensors="pt")
neuron_inputs = tuple(embeddings.values())
# compile model with torch.neuron.trace and update config
model_neuron = torch.neuron.trace(model, neuron_inputs)
model.config.update({"traced_sequence_length": max_length})
# save tokenizer, neuron model and config for later use
save_dir="tmp-neuron"
os.makedirs(save_dir,exist_ok=True)
model_neuron.save(os.path.join(save_dir,"neuron_model.pt"))
tokenizer.save_pretrained(save_dir)
model.config.save_pretrained(save_dir)
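If every request really does have to match the traced batch size, I assume the inference side would need something like this to pad short batches up to the fixed size and trim the output afterwards (a sketch; pad_to_batch and batch_size=5 are my own names and assumptions):

# Sketch: pad a short list of texts up to the fixed traced batch size, and
# remember how many were real so the padded rows can be dropped afterwards.
# Assumes len(texts) <= batch_size and that batch_size matches the trace.
def pad_to_batch(texts, batch_size=5):
    n_real = len(texts)
    padded = texts + [texts[-1]] * (batch_size - n_real)  # repeat last item as filler
    return padded, n_real

# usage: padded, n_real = pad_to_batch(data); then slice results back to n_real rows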
Inference script:
import subprocess
import sys
import json
import os
import numpy as np
import torch
from transformers import AutoModel, AutoTokenizer, AutoConfig
from importlib import reload
import torch.neuron
# To use one neuron core per worker
os.environ["NEURON_RT_NUM_CORES"] = "1"
# saved weights name
AWS_NEURON_TRACED_WEIGHTS_NAME = "neuron_model.pt"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print('\ndevice:',device)
print('os getenv response size', os.getenv('TS_MAX_RESPONSE_SIZE'))
print('os getenv response size', os.getenv('MMS_MAX_RESPONSE_SIZE'))
def forward_pass(token_ids, model):
    print('token_ids', token_ids)
    #input_ids = torch.tensor(batch["input_ids"]).to(device)
    #attention_mask = torch.tensor(batch["attention_mask"]).to(device)
    with torch.no_grad():
        model_out = model(*tuple(token_ids.values()))
    print('model_out!!', model_out)
    print('model out shape', model_out[0].shape)
    last_hidden_state = model_out[0]
    last_hidden_state = last_hidden_state.cpu().numpy()
    # Use average of unmasked hidden states for classification
    lhs_shape = last_hidden_state.shape
    print('lhs_shape', lhs_shape)
    boolean_mask = ~np.array(token_ids["attention_mask"]).astype(bool)
    boolean_mask = np.repeat(boolean_mask, lhs_shape[-1], axis=-1)
    boolean_mask = boolean_mask.reshape(lhs_shape)
    masked_mean = np.ma.array(last_hidden_state, mask=boolean_mask).mean(axis=1)
    res = {}
    res["hidden_state"] = masked_mean.data
    res["input_ids"] = token_ids["input_ids"]
    return res
print('\nos.getcwd()', os.getcwd())
print('\nWalk:')
for path, subdirs, files in os.walk('/opt/ml'):
    for name in files:
        print(os.path.join(path, name))
def model_fn(model_dir):
    print('\nIn model_fn')
    print('\nmodel_dir::', model_dir)
    #model = AutoModel.from_pretrained('/opt/ml/model', output_hidden_states=True).to(device)
    tokenizer = AutoTokenizer.from_pretrained('/opt/ml/model')
    model = torch.jit.load(os.path.join('/opt/ml/model', AWS_NEURON_TRACED_WEIGHTS_NAME))
    model_config = AutoConfig.from_pretrained(model_dir)
    print('\nmodel read in::')
    #tokenizer = AutoTokenizer.from_pretrained(model_dir)
    print('\nmodel_dir', model_dir)
    return model, tokenizer, model_config
def predict_fn(data, model_tokenizer_model_config):
    print('\nin predict!', model_tokenizer_model_config)
    print('\ndata[:5]', data[:5])
    model, tokenizer, model_config = model_tokenizer_model_config
    token_ids = tokenizer(
        data,
        return_tensors="pt",
        max_length=model_config.traced_sequence_length,
        padding="max_length",
        truncation=True,
    )
    # convert to tuple for neuron model
    print('embeddings?', token_ids)
    res2 = forward_pass(token_ids, model)
    print('res2', res2)
    return res2
def output_fn(prediction, accept):
    print('\nin output', type(prediction))
    print('\nin output', prediction)
    j = [{"inputs": prediction["input_ids"].tolist(), "embeddings": prediction["hidden_state"].tolist()}]
    return json.dumps(j)