Getting an error when deploying llama2 7B on custom dataset using sagemaker inference endpoint

Hello All,

I'm trying to deploy a Llama 2 7B model fine-tuned on a custom dataset, but I'm unable to successfully create the endpoint. I have shared below the .py file that trains the model on the custom dataset and the .py file that creates the inference endpoint:

"""save_model_Llama2.ipynb

Automatically generated by Colaboratory.

Original file is located at
    Google Colab
"""

# Notebook shell magics: install the Hugging Face libraries used by the cells below.
!pip install -q huggingface_hub
!pip install -q -U trl transformers accelerate peft
!pip install -q -U datasets bitsandbytes einops wandb

# Installs peft again, this time from the GitHub repository (in addition to the PyPI install above).
!pip install git+https://github.com/huggingface/peft.git

# Interactive Hugging Face Hub login; presumably needed for the gated Llama 2
# weights and for `push_to_hub` later on -- confirm.
from huggingface_hub import notebook_login
notebook_login()

from datasets import load_dataset
import torch
from transformers import AutoModelForCausalLM, BitsAndBytesConfig, AutoTokenizer, TrainingArguments
from peft import LoraConfig
from transformers.generation.utils import top_k_top_p_filtering
from trl import SFTTrainer

# Dataset on the Hugging Face Hub used for supervised fine-tuning.
dataset_name = "Harit10/extra_PII_650"
dataset = load_dataset(dataset_name, split="train")

base_model_name = "meta-llama/Llama-2-7b-hf"

# 4-bit NF4 quantization settings.
# NOTE: currently unused, because `quantization_config` is commented out in
# the `from_pretrained` call below.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
)

# Map the root module ("") to device index 0.
device_map = {"": 0}

device_map

base_model = AutoModelForCausalLM.from_pretrained(
    base_model_name,
    # quantization_config=bnb_config,
    device_map=device_map,
    trust_remote_code=True,
    # NOTE(review): recent transformers releases deprecate `use_auth_token`
    # in favour of `token` -- confirm against the installed version.
    use_auth_token=True,
)
# Turn off the generation cache on the model config before training.
base_model.config.use_cache = False

from google.colab import drive
drive.mount('/content/drive')

# More info: [`Llama2`] replace `self.pretraining_tp` with `self.config.pretraining_tp`
# by younesbelkada - Pull Request #24906 - huggingface/transformers - GitHub
base_model.config.pretraining_tp = 1

# LoRA configuration, kept as an inert string literal (i.e. disabled).
# The snippet previously lacked the comma after `lora_alpha=16`, so it would
# have been a syntax error as soon as it was re-enabled.
'''peft_config = LoraConfig(
    lora_alpha=16,
    lora_dropout=0.1,
    r=64,
    bias="none",
    task_type="CAUSAL_LM",
)'''

tokenizer = AutoTokenizer.from_pretrained(base_model_name, trust_remote_code=True)
# Reuse the EOS token for padding.
tokenizer.pad_token = tokenizer.eos_token

# Root directory for training outputs; later cells derive the final
# checkpoint path from it.
output_dir = "./results"

training_args = TrainingArguments(
    output_dir=output_dir,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    learning_rate=2e-4,
    logging_steps=10,
    max_steps=100,
    push_to_hub=True,
    hub_model_id="Llama2-PII_final",
)

max_seq_length = 512

# NOTE: `peft_config` is commented out, so no LoRA configuration is passed to
# the trainer here.
trainer = SFTTrainer(
    model=base_model,
    train_dataset=dataset,
    # peft_config=peft_config,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    tokenizer=tokenizer,
    args=training_args,
)

trainer.train()

# Prepare for merging adapter and base model
import os
from peft import PeftModel

final_checkpoint_path = os.path.join(output_dir, "final_checkpoint")

print(final_checkpoint_path)

# Merge adapter with the base model
# NOTE(review): this expects adapter files to already exist in
# `final_checkpoint_path`. Nothing earlier in this script writes to that
# directory, and `peft_config` is disabled in the SFTTrainer call -- verify
# that an adapter was actually produced and saved before running this cell.
merged_model = PeftModel.from_pretrained(base_model, final_checkpoint_path).merge_and_unload()

# Save the merged model and tokenizer, ensuring config.json is included
merged_model.save_pretrained(final_checkpoint_path)
tokenizer.save_pretrained(final_checkpoint_path)

import os
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel

# Keep the checkpoint location in its own variable instead of rebinding
# `output_dir`, so that any later `os.path.join(output_dir, "final_checkpoint")`
# does not end up with a nested `final_checkpoint/final_checkpoint` path.
final_checkpoint_dir = os.path.join(output_dir, "final_checkpoint")

# Ensure the directory exists before proceeding
if not os.path.exists(final_checkpoint_dir):
    print(f"The directory {final_checkpoint_dir} does not exist. Please check your training checkpoints.")
else:
    # Initialize the base model and tokenizer again for clarity and completeness
    base_model = AutoModelForCausalLM.from_pretrained(
        base_model_name,
        return_dict=True,
        torch_dtype=torch.float16,
    )

    tokenizer = AutoTokenizer.from_pretrained(base_model_name)

    # Load the PEFT model from the final checkpoint
    try:
        ft_model = PeftModel.from_pretrained(base_model, final_checkpoint_dir)

        # Merge the adapter weights back to the base model
        model_merged = ft_model.merge_and_unload()

        # Save the merged model and tokenizer to the final checkpoint directory
        model_merged.save_pretrained(final_checkpoint_dir)
        tokenizer.save_pretrained(final_checkpoint_dir)

        print("Model and tokenizer saved successfully.")

    except Exception as e:
        # Notebook-level boundary: report the failure instead of aborting the cell.
        print(f"An error occurred: {e}")

import os

# Point `output_dir` at the final checkpoint directory exactly once. An
# unconditional join would produce `.../final_checkpoint/final_checkpoint`
# whenever `output_dir` already ends in `final_checkpoint`.
if os.path.basename(os.path.normpath(output_dir)) != "final_checkpoint":
    output_dir = os.path.join(output_dir, "final_checkpoint")

# `merge_and_unload()` only exists on PEFT-wrapped models and returns the
# merged model; its result must be the object that gets saved. When the
# trainer holds a plain model (no `peft_config` was passed), save it as is.
model_to_save = trainer.model
if hasattr(model_to_save, "merge_and_unload"):
    model_to_save = model_to_save.merge_and_unload()

model_to_save.save_pretrained(output_dir)

output_dir

# NOTE(review): confirm that the Hub repository pushed here contains the full
# model (config.json, weights, tokenizer files) and that it is the repository
# the inference endpoint loads.
trainer.push_to_hub("End of training")

# Check if CUDA (GPU support) is available and set the device accordingly
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

device

from peft import AutoPeftModelForCausalLM

# NOTE(review): this loader presumably needs adapter files in the checkpoint
# directory -- confirm that is what 'results/final_checkpoint' contains.
model = AutoPeftModelForCausalLM.from_pretrained('results/final_checkpoint', device_map=device_map, torch_dtype=torch.bfloat16)
#text = “Regarding Amelia Thompson, student ID 654789, SIN 321654987, contactable at 555-678-1234, for her declining grades in advanced calculus. A letter of academic probation is warranted. Task it to Anonymize the personal identifiers.”
#text = “Attention needed for Mason Miller, with student ID 369741, SIN 147852369, and phone number 741-555-8522, due to his poor performance on group assignments in business studies. Issue an academic probation letter. Task is to anonymize the personal identifiers.”
#text = “Evelyn Rodriguez, with student ID 123654, SIN 789123654, and phone 654-555-3211, has been displaying a lack of initiative in volunteer programs. An academic probation letter is needed. Task is to anonymize the personal identifiers.”
#text = “For Jackson Thomas, bearing student ID 987654, SIN 321654987, and can be contacted at 123-555-7890, who has shown insufficient progress in his internship requirements. Please draft an academic probation letter. Task is to anonymize the personal identifiers.”
#text = “Student named Grace Harris, with student ID 264813, SIN 579213648, and phone 852-555-9634, has exhibited poor teamwork skills in project-based learning courses. A letter of academic probation is required. Task is to anonymize the personal identifiers.”
#text = “Anonymize the personal identifiers in the following text: Attention needed for Mason Miller, with student ID 369741, SIN 147852369, and phone number 741-555-8522, due to his poor performance on group assignments in business studies. Issue an academic probation letter.”
#text = “Anonymize the personal identifiers for this: Student named Grace Harris, with student ID 264813, SIN 579213648, and phone 852-555-9634, has exhibited poor teamwork skills in project-based learning courses. A letter of academic probation is required.”
#text = “Anonymize the Personal Identifiable Information in the following text: Evelyn Rodriguez, with student ID 123654, SIN 789123654, and phone 654-555-3211, has been displaying a lack of initiative in volunteer programs.”
#text = “Anonymize the Personal Identifiable Information in the following text: Attention needed for Mason Miller, with student ID 369741, SIN 147852369, and phone number 741-555-8522, due to his poor performance on group assignments in business studies. Issue an academic probation letter.”

#text = “Anonymize the personal identifiers for this - Student named Grace Harris, with student ID 264813, SIN 579213648, and phone 852-555-9634, has exhibited poor teamwork skills in project-based learning courses. A letter of academic probation is required.”
#text = “Anonymize the personal identifiable information in the following text: Attention needed for Mason Miller, with student ID 369741, SIN 147852369, and phone number 741-555-8522, due to his poor performance on group assignments in business studies. Issue an academic probation letter.”
#text = “Anonymize the personal identifiers for in the following text: "
#text = “Anonymize the personal identifiable information in the following text. Student named Grace Harris, with student ID 264813, SIN 579213648, and phone 852-555-9634, has exhibited poor teamwork skills in project-based learning courses. A letter of academic probation is required.”
#text = “Anonymize the personal identifiable information in the following text by replacing the personal identifiers with <> brackets. For Jackson Thomas, bearing student ID 987654, SIN 321654987, and can be contacted at 123-555-7890, who has shown insufficient progress in his internship requirements. Please draft an academic probation letter.”
#text = “Anonymize the personal identifiable information in the following text: For Nathan Zhao, identified by student ID 345678, SIN 987654321, and reachable at 345-555-6789, who has repeatedly missed critical deadlines for project submissions in the software development course. A letter of academic probation is necessary.”
#text = " Anonymize the personal identifiable information in the following text - ‘Concerning Emma Patel, with student ID 123890, SIN 567890123, and phone number 567-555-1234, for her lack of contributions to team projects in the business analytics class. Draft a letter of academic probation.’ and i want the output in this form ‘:Emily Johnson, <STUDENT_ID>:876543, :123456789, :436-555-2345’”
#text = “Anonymize the personal identifiable information in the following text. Addressing the performance of Lucas Smith, student ID 789012, SIN 234567890, and contactable at 678-555-7890, due to his inadequate effort in the creative writing workshops. An academic probation letter needs to be prepared.”
#text = “Anonymize the personal identifiable information in the following text: Regarding Mia Wang, with student ID 234567, SIN 456789012, and reachable at 890-555-2345, for her continuous absence from mandatory lab sessions in the biology course. A letter of academic probation must be drafted.”

#text = “Anonymize the personal identifiable information in the following text and i want the output in this format ‘:Name’ ‘Student named Grace Harris, with student ID 264813, SIN 579213648, and phone 852-555-9634, has exhibited poor teamwork skills in project-based learning courses. A letter of academic probation is required.’”
#text = “Anonymize the personal identifiable information in the following text and i want the output in this format ‘:Name’ - ‘For Nathan Zhao, identified by student ID 345678, SIN 987654321, and reachable at 345-555-6789, who has repeatedly missed critical deadlines for project submissions in the software development course. A letter of academic probation is necessary.’”

#text = “Anonymize the personal identifiable information in the following text and i want the output in this format ‘:Name’ - ‘Addressing the performance of Lucas Smith, student ID 789012, SIN 234567890, and contactable at 678-555-7890, due to his inadequate effort in the creative writing workshops. An academic probation letter needs to be prepared.’”

#text = “Anonymize the personal identifiable information in the following text and i want the output in this format ‘:Name’ - ‘Regarding Mia Wang, with student ID 234567, SIN 456789012, and reachable at 890-555-2345, for her continuous absence from mandatory lab sessions in the biology course. A letter of academic probation must be drafted.’”

#text = “Anonymize the personal information in the following text. Input: ‘Charlotte Davis, student ID 213456, SIN 546789123, with contact at 212-555-1234, has shown a decline in her mathematics course grades. A letter of academic probation should be issued.’ Format: ‘:Name, <STUDENT_ID>:ID, :Number, :Phone Number’.”
#text = “Anonymize the personal information in the following text and output in the specified format. Format: ‘:Name, <STUDENT_ID>:ID, :Number, :Phone Number’. Input: ‘Student named Grace Harris, with student ID 264813, SIN 579213648, and phone 852-555-9634, has exhibited poor teamwork skills in project-based learning courses. A letter of academic probation is required.’”
#text = “Anonymize the personal information in the following text and output in the specified format. Format: ‘:Name, <STUDENT_ID>:ID, :Number, :Phone Number’. Input: ‘Regarding Amelia Thompson, student ID 654789, SIN 321654987, contactable at 555-678-1234, for her declining grades in advanced calculus. A letter of academic probation is warranted.’”
#text = " Task: Anonymize the personal information in the following text and output in the specified format. Input: ‘Regarding Amelia Thompson, student ID 654789, SIN 321654987, contactable at 555-678-1234, for her declining grades in advanced calculus. A letter of academic probation is warranted.’ Format: ‘:Name, <STUDENT_ID>:ID, :Number, :Phone Number’"

#text = " Task: Anonymize the personal information in the following text and output in the specified format. Input: ‘Student named Grace Harris, with student ID 264813, SIN 579213648, and phone 852-555-9634, has exhibited poor teamwork skills in project-based learning courses. A letter of academic probation is required.’ Format: ‘:Name, <STUDENT_ID>:ID, :Number, :Phone Number’"
#text = " Task: Anonymize the personal information in the following text and output in the specified format. Input: ‘For Nathan Zhao, identified by student ID 345678, SIN 987654321, and reachable at 345-555-6789, who has repeatedly missed critical deadlines for project submissions in the software development course. A letter of academic probation is necessary.’"

#text = " Task: Anonymize the personal information in the following text and output in the specified format. Input: 'For Jackson Thomas, bearing student ID 987654, SIN 321654987, and can be contacted at 123-555-7890, who has shown insufficient progress in his internship requirements. Please draft an academic probation letter."

#text = " Task: Anonymize the personal information in the following text and output in the specified format. Input: ‘Addressing the performance of Lucas Smith, student ID 789012, SIN 234567890, and contactable at 678-555-7890, due to his inadequate effort in the creative writing workshops. An academic probation letter needs to be prepared.’"
#text = " Task: Anonymize the personal information in the following text and output in the specified format. Input: ‘Charlotte Davis, student ID 213456, SIN 546789123, with contact at 212-555-1234, has shown a decline in her mathematics course grades. A letter of academic probation should be issued.’"
#text = " Task: Anonymize the personal information in the following text and output in the specified format. Input: ‘Regarding Amelia Thompson, student ID 654789, SIN 321654987, contactable at 555-678-1234, for her declining grades in advanced calculus. A letter of academic probation is warranted.’"
#text = " Task: Anonymize the personal information in the following text and output in the specified format. Input: ‘The student’s name is Mia Wang, and her SIN is 456789012, and her phone number is 890-555-2345 amd her student ID is 234567, for her continuous absence from mandatory lab sessions in the biology course. A letter of academic probation must be drafted.’"
#text = " Task: Anonymize the personal information in the following text and output in the specified format. Input: ‘Attention needed for Mason Miller, with student ID 369741, SIN 147852369, and phone number 741-555-8522, due to his poor performance on group assignments in business studies. Issue an academic probation letter.’"
#text = " Task: Anonymize the personal information in the following text and output in the specified format. Input: ‘Attention needed for Mason Miller, with student ID 369741, SIN 147852369, and phone number 741-555-8522, due to his poor performance on group assignments in business studies. Issue an academic probation letter.’"

#text = " Task: Anonymize the personal information in the following text and output in the specified format. Input: ‘Concerns about Saoirse Íngrid Björnsdóttir, with student ID 597213, SIN 213597846, and phone number 818-555-2134, have been raised due to his consistent failure to meet coursework deadlines. An academic probation letter is necessary.’"
#text = " Task: Anonymize the personal information in the following text and output in the specified format. Input: ‘Julia Morris, student ID 482759, SIN 759482316, and contact at 213-555-4789, has shown a marked decline in class participation and exam scores. The issuance of an academic probation letter is warranted.’"
#text = " Task: Anonymize the personal information in the following text and output in the specified format. Input: ‘X Æ A-12 Musk, carrying student ID 364285, SIN 285364719, and phone number 504-555-6921, has been reported for disruptive behavior and lack of engagement in group projects. Draft an academic probation letter.’"
#text = " Task: Anonymize the personal information in the following text and output in the specified format. Input: ‘Reports indicate that Sarah Chen, identified by student ID 951753, passport number M0969597,SIN 753951462, with a reachable phone number of 707-555-5309, has not achieved the minimum grade requirements in her major subjects. Prepare an academic probation letter.’"
#text = " Task: Anonymize the personal information in the following text and output in the specified format. Input: ‘The academic performance of Chloë Éloïse Müller-Wang, contact number 323-555-7486, SIN 297846513 and student ID 846297, has been unsatisfactory across multiple subjects, leading to a recommendation for academic probation. An official letter must be drafted.’"
#text = " Task: Anonymize the personal information in the following text and output in the specified format. Input: ‘Reports indicate that Alexander-Jonathon Fitzgerald Beauregard-Smith, with phone number 312-555-8264, SIN 639428071, and student ID 428639, has exhibited a lack of respect towards faculty members, leading to multiple complaints. An academic probation letter is deemed necessary.’"

text = " Task: Anonymize the personal information in the following text and output in the specified format. Input: ‘Reports indicate that Alexander-Jonathon Fitzgerald Beauregard-Smith with phone number 312-555-8264 SIN 639428071 and student ID 428639 has exhibited a lack of respect towards faculty members leading to multiple complaints. An academic probation letter is deemed necessary.’"

# Tokenize and move every tensor to `device` in one step. Both tensors passed
# to `generate` now come from the same place, instead of `input_ids` being
# forced to "cuda" while `attention_mask` followed `device`.
inputs = tokenizer(text, return_tensors="pt").to(device)
# NOTE(review): `temperature` is presumably ignored unless sampling is enabled
# (`do_sample=True`) -- confirm against the transformers generation docs.
outputs = model.generate(
    input_ids=inputs["input_ids"],
    attention_mask=inputs["attention_mask"],
    max_new_tokens=100,
    pad_token_id=tokenizer.eos_token_id,
    temperature=0.5,
)

#print(tokenizer.decode(outputs[0], skip_special_tokens=True))

decoded_output = tokenizer.decode(outputs[0], skip_special_tokens=True)

print(decoded_output)

import re

# Define a regex pattern to capture everything after "Expected Output: '"
# up to the closing quote. Both straight and typographic single quotes are
# accepted, so the match does not depend on which style the text uses.
pattern = r"Expected Output: ['‘](.*?)['’]"

# Search for the pattern in the output
match = re.search(pattern, decoded_output)

# Extract the matched part
if match:
    extracted_text = match.group(1)  # group(1) to get the content of the first capturing group
    print("Extracted Part:", extracted_text)
else:
    print("No match found")

Notebook for inference endpoint:

%% [markdown]

# Set up LLaMA 2 on SageMaker

%% [markdown]

## Set up Dependencies

%%

!pip install sagemaker

%%

pip list | grep sagemaker

%%

pip install -U sagemaker

%%

import sagemaker
import boto3
import numpy
import scipy

# Declare the SageMaker execution role variable globally
sagemaker_execution_role = None


# setup sagemaker session
def setup_sagemaker_session(default_bucket=None):
    """Set up a SageMaker session and resolve the IAM execution role.

    Params:
        default_bucket: Default bucket name to use for the session.

    Returns:
        session: SageMaker session object.
        role_arn: ARN of the IAM execution role (also stored in the
            module-level ``sagemaker_execution_role``).
    """
    global sagemaker_execution_role
    session = sagemaker.Session(default_bucket=default_bucket)

    try:
        sagemaker_execution_role = sagemaker.get_execution_role()
    except ValueError:
        # The current identity is not usable as an execution role; fall back
        # to looking up the IAM role named "sagemaker_execution_role".
        iam = boto3.client("iam")
        sagemaker_execution_role = iam.get_role(RoleName="sagemaker_execution_role")["Role"]["Arn"]

    return session, sagemaker_execution_role

def mask_account_id(account_id):
    """Return a string of ``*`` characters with the same length as *account_id*."""
    return "*" * len(account_id)

def main():
    """Create the SageMaker session and print the role ARN and region."""
    sagemaker_session_bucket = None

    session, sagemaker_execution_role = setup_sagemaker_session(default_bucket=sagemaker_session_bucket)

    # Mask with ****
    # The account id is taken from the fifth ":"-separated field of the ARN.
    account_id = sagemaker_execution_role.split(":")[4]
    masked_account_id = mask_account_id(account_id)
    masked_role = sagemaker_execution_role.replace(account_id, masked_account_id)

    print(f"SageMaker role ARN: {masked_role}")
    print(f"SageMaker session region: {session.boto_region_name}")
    # NOTE(review): this prints the unmasked ARN, which defeats the masking
    # above -- remove before sharing the notebook output.
    print(sagemaker_execution_role)


if __name__ == "__main__":
    main()

%%

from sagemaker.huggingface import get_huggingface_llm_image_uri

# Fetch Docker image URI for the Hugging Face DLC
# 1. backend name
# 2. Hugging Face DLC version
llm_image = get_huggingface_llm_image_uri("huggingface", version="0.9.3")

# Log the docker image URI
print(f"llm image uri {llm_image}")

%% [markdown]

## Determine Model Requirements

%% [markdown]

- ml.g5.12xlarge instance

- Meta LLaMa 2 model - request form fulfilled/approved

- Hugging Face account

%%

import json

# confirm instance requirements are met for SageMaker session
with open("/opt/ml/metadata/resource-metadata.json") as f:
    metadata = json.load(f)
    print(metadata["ResourceName"])

%%

# confirm requirements met for kernel
import json


def get_instance_type_from_metadata(metadata_path="/opt/ml/metadata/resource-metadata.json"):
    """Return the ``ResourceName`` entry of a resource metadata JSON file.

    Params:
        metadata_path: Path of the JSON metadata file to read. Defaults to
            the location this notebook has always read from.

    Returns:
        The value stored under ``"ResourceName"``, or ``""`` when the key is
        absent.
    """
    with open(metadata_path, encoding="utf-8") as f:
        metadata = json.load(f)
    return metadata.get("ResourceName", "")

def main():
    """Check the resource name against the supported instance types and report."""
    resource_name = get_instance_type_from_metadata()

    # List valid instance types
    # NOTE(review): these hyphenated names are matched as substrings of the
    # metadata "ResourceName"; presumably the notebook resource is named after
    # its instance type -- confirm.
    valid_instance_types = ["ml-g5-2xlarge", "ml-g5-12xlarge", "ml-g5-48xlarge"]

    if any(instance_type in resource_name for instance_type in valid_instance_types):
        print("Instance configured correctly")
    else:
        print("Need to upgrade to at least ml.g5.2xlarge instance")


if __name__ == "__main__":
    main()

%% [markdown]

## Deploy Meta’s LLaMa model to Amazon SageMaker

%%

import json
import getpass
from sagemaker.huggingface import HuggingFaceModel

def get_sagemaker_config():
    """Build the deployment settings for the Hugging Face LLM container.

    Prompts interactively for the Hugging Face Hub token.

    Returns:
        instance_type: SageMaker instance type to deploy on.
        health_check_timeout: Value later passed as the container startup
            health check timeout.
        config: Environment variables for the container.
    """
    # Configure sagemaker instance details
    instance_type = "ml.g5.12xlarge"
    number_of_gpu = 1
    health_check_timeout = 300

    # Configure Hugging Face details
    # NOTE(review): the training script pushes to the Hub repo
    # "Llama2-PII_final" while this loads "Harit10/Llama2-config" -- confirm
    # that this repo holds the full model (config.json, weights, tokenizer).
    config = {
        "HF_MODEL_ID": "Harit10/Llama2-config",
        "SM_NUM_GPUS": json.dumps(number_of_gpu),
        "MAX_INPUT_TOKENS": json.dumps(2048),
        "MAX_TOTAL_TOKENS": json.dumps(4096),
        "MAX_BATCH_TOTAL_TOKENS": json.dumps(8192),
        "HUGGING_FACE_HUB_TOKEN": getpass.getpass("Enter your Hugging Face Hub Token:"),
    }

    return instance_type, health_check_timeout, config

def create_huggingface_model(instance_type, config, role, image_uri):
    """Create the ``HuggingFaceModel`` used for deployment.

    Params:
        instance_type: Accepted for symmetry with the caller; not used here.
        config: Container environment; must hold a non-empty
            ``HUGGING_FACE_HUB_TOKEN``.
        role: IAM execution role passed to the model.
        image_uri: Container image URI passed to the model.

    Returns:
        The constructed ``HuggingFaceModel``.

    Raises:
        AssertionError: If the Hub token in *config* is empty.
    """
    assert config["HUGGING_FACE_HUB_TOKEN"] != "", "Please set your Hugging Face Hub Token"

    llm_model = HuggingFaceModel(role=role, image_uri=image_uri, env=config)
    return llm_model

def main():
    """Create the Hugging Face model and deploy it to a SageMaker endpoint.

    Reads the module-level ``sagemaker_execution_role`` and ``llm_image``
    set by earlier cells.

    Returns:
        The object returned by ``deploy`` (previously assigned and dropped),
        or ``None`` when no model was created.
    """
    instance_type, health_check_timeout, config = get_sagemaker_config()

    role = sagemaker_execution_role
    llm_image_to_ref = llm_image

    llm_model = create_huggingface_model(instance_type, config, role, llm_image_to_ref)

    if not llm_model:
        return None

    return llm_model.deploy(
        initial_instance_count=1,
        instance_type=instance_type,
        container_startup_health_check_timeout=health_check_timeout,
    )


if __name__ == "__main__":
    main()

%%

import json
import boto3

# Client used to invoke the deployed endpoint.
sagemaker_runtime = boto3.client("sagemaker-runtime")

# Name of the endpoint to call.
endpoint_name = "huggingface-pytorch-tgi-inference-2024-03-15-13-24-31-061"

def build_llama2_prompt(message):
    """Render a list of chat messages as a Llama 2 chat prompt string.

    The ``<s>``, ``</s>`` and ``<<SYS>>`` markers were lost from the pasted
    code and are restored here following the Llama 2 chat format.

    Params:
        message: Sequence of dicts with ``"role"`` (``"system"``, ``"user"``
            or anything else, treated as an assistant turn) and ``"content"``.

    Returns:
        The prompt, starting with ``<s>[INST] `` and ending with `` [/INST]``.
    """
    stopPrompt = "</s>"
    startPrompt = "<s>[INST] "
    endPrompt = " [/INST]"
    conversation = []
    for index, msg in enumerate(message):
        if msg["role"] == "system" and index == 0:
            # Only a system message in first position gets the SYS wrapper.
            conversation.append(f"<<SYS>>\n{msg['content']}\n<</SYS>>\n\n")
        elif msg["role"] == "user":
            conversation.append(msg["content"].strip())
        else:
            # Assistant turn: close the instruction, emit the reply, then end
            # the sequence and open the next instruction.
            conversation.append(f"{endPrompt} {msg['content'].strip()} {stopPrompt}{startPrompt}")
    return startPrompt + "".join(conversation) + endPrompt

# Conversation so far: a single system message.
messages = [
    {
        "role": "system",
        "content": "You are a nonprofit advocate and champion. Your goal is to help entrepreneurs and movers and shakers find their purpose through positivity",
    }
]

instruction = "What does the world need more of right now?"

messages.append({"role": "user", "content": instruction})
prompt = build_llama2_prompt(messages)

# Request payload: the prompt plus generation parameters.
input_data = {
    "inputs": prompt,
    "parameters": {
        "do_sample": True,
        "top_p": 0.6,
        "temperature": 0.9,
        "top_k": 50,
        "max_new_tokens": 512,
        "repetition_penalty": 1.03,
        # The stop sequence was stripped from the pasted code; "</s>" is the
        # Llama 2 end-of-sequence marker.
        "stop": ["</s>"],
    },
}

input_data_json = json.dumps(input_data)

content_type = "application/json"

response = sagemaker_runtime.invoke_endpoint(
    EndpointName=endpoint_name,
    ContentType=content_type,
    Body=input_data_json.encode("utf-8"),
)

response_body = response["Body"].read().decode("utf-8")
response_json = json.loads(response_body)

generated_text = response_json[0]["generated_text"]
# Assumes the endpoint echoes the prompt at the start of `generated_text`,
# so slicing leaves only the completion -- TODO confirm.
print(generated_text[len(prompt):])

%%