Hey, I’m trying to deploy the falcon-7b model via Amazon SageMaker.
At first I tried the “Deploy → Amazon SageMaker” code from https://huggingface.co/tiiuae/falcon-7b:
import sagemaker
import boto3
from sagemaker.huggingface import HuggingFaceModel
try:
    role = sagemaker.get_execution_role()
except ValueError:
    iam = boto3.client('iam')
    role = iam.get_role(RoleName='sagemaker_execution_role')['Role']['Arn']

# Hub Model configuration. https://huggingface.co/models
hub = {
    'HF_MODEL_ID': 'tiiuae/falcon-7b',
    'HF_TASK': 'text-generation'
}

# create Hugging Face Model Class
huggingface_model = HuggingFaceModel(
    transformers_version='4.26.0',
    pytorch_version='1.13.1',
    py_version='py39',
    env=hub,
    role=role,
)

# deploy model to SageMaker Inference
predictor = huggingface_model.deploy(
    initial_instance_count=1,       # number of instances
    instance_type='ml.m5.xlarge'    # ec2 instance type
)

predictor.predict({
    "inputs": "Can you please let us know more details about your ",
})
Then I got this error:
ModelError: An error occurred (ModelError) when calling the InvokeEndpoint operation: Received client error (400) from primary with message "{
"code": 400,
"type": "InternalServerException",
"message": "Loading /.sagemaker/mms/models/tiiuae__falcon-7b requires you to execute the configuration file in that repo on your local machine. Make sure you have read the code there to avoid malicious use, then set the option `trust_remote_code\u003dTrue` to remove this error."
}
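If I read that message right, the repo ships custom modelling code, so outside of SageMaker the model has to be loaded with an explicit opt-in, e.g. (a minimal local sketch, not the SageMaker path):

from transformers import AutoTokenizer, AutoModelForCausalLM

# explicitly opt in to the custom modelling code shipped in the repo
tokenizer = AutoTokenizer.from_pretrained("tiiuae/falcon-7b")
model = AutoModelForCausalLM.from_pretrained(
    "tiiuae/falcon-7b",
    trust_remote_code=True,
)

But I couldn’t find a way to pass that flag through the hub config above.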
Then I tried to create the inference endpoint a different way, using a custom inference.py I wrote:
from transformers import AutoTokenizer, AutoModelForCausalLM
import transformers
import torch
model_name = 'tiiuae/falcon-7b'
def model_fn(model_dir):
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    pipeline = transformers.pipeline(
        "text-generation",
        model=model_name,
        tokenizer=tokenizer,
        torch_dtype=torch.bfloat16,
        trust_remote_code=True,
        device_map="auto",
    )
    return pipeline, tokenizer

def predict_fn(data, pipeline_and_tokenizer):
    pipeline, tokenizer = pipeline_and_tokenizer
    sequences = pipeline(
        data,
        max_length=200,
        do_sample=True,
        top_k=10,
        num_return_sequences=1,
        eos_token_id=tokenizer.eos_token_id,
    )
    return sequences
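For context, model_url in the deploy snippet below is just the S3 URI of the packaged model.tar.gz (the archive holds the model files plus the script under code/inference.py); the bucket and key prefix here are placeholders:

import sagemaker

sess = sagemaker.Session()
# upload the packaged archive: model.tar.gz with the model files
# plus code/inference.py (and code/requirements.txt if needed)
model_url = sess.upload_data(
    path="model.tar.gz",
    bucket=sess.default_bucket(),
    key_prefix="falcon-7b",
)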
After deploying it using this code:
from sagemaker.huggingface.model import HuggingFaceModel
# create Hugging Face Model Class
huggingface_model = HuggingFaceModel(
    model_data=model_url,            # path to your model and script
    role=role,                       # iam role with permissions to create an Endpoint
    transformers_version="4.26.0",   # transformers version used
    pytorch_version="1.13.1",        # pytorch version used
    py_version='py39',               # python version used
    model_server_workers=1
)

model_name = 'falcon7bmodel'

# deploy the endpoint
predictor = huggingface_model.deploy(
    initial_instance_count=1,
    instance_type="ml.g5.2xlarge",
    endpoint_name=model_name + "endpoint",
)
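For reference, the request payload has roughly the same shape as in the first attempt:

# same request shape as the hub-based attempt above
predictor.predict({
    "inputs": "Can you please let us know more details about your ",
})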
When I tried to predict, I got this error:
File "/opt/conda/lib/python3.9/site-packages/transformers/pipelines/text_generation.py", line 205, in preprocess
prefix + prompt_text, padding=False, add_special_tokens=False, return_tensors=self.framework
TypeError: can only concatenate str (not "dict") to str
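My guess is that predict_fn receives the whole request body as a dict (e.g. {"inputs": "..."}) while the pipeline expects a plain string, so maybe I need to unpack it first, something like this (just a guess, I haven’t confirmed it):

def predict_fn(data, pipeline_and_tokenizer):
    pipeline, tokenizer = pipeline_and_tokenizer
    # the request body seems to arrive as a dict, so pull out the prompt string
    prompt = data["inputs"] if isinstance(data, dict) else data
    sequences = pipeline(
        prompt,
        max_length=200,
        do_sample=True,
        top_k=10,
        num_return_sequences=1,
        eos_token_id=tokenizer.eos_token_id,
    )
    return sequences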
Has anyone deployed falcon-7b successfully and can help me understand what I’m doing wrong here?
Thanks!