I solved it by passing a single string formatted with the official Llama 2 prompt template (see Llama 2 is here - get it on Hugging Face). I don't know why the default SageMaker Llama endpoint doesn't work that way, but this works for me:
import json
import requests

API_URL = "https://api-inference.huggingface.co/models/meta-llama/Llama-2-7b-chat-hf"
headers = {
    "Authorization": "Bearer hf_XXXXXXXXXXXXXXXXXX",
    "Content-Type": "application/json",
}

def query(payload):
    # Wrap the user message in the Llama 2 chat template, with the
    # system prompt between <<SYS>> and <</SYS>>.
    json_body = {
        "inputs": f"[INST] <<SYS>> Your job is to talk like a pirate. Every response must sound like a pirate. <</SYS>> {payload} [/INST] ",
        "parameters": {"max_new_tokens": 256, "top_p": 0.9, "temperature": 0.7},
    }
    data = json.dumps(json_body)
    response = requests.post(API_URL, headers=headers, data=data)
    try:
        return json.loads(response.content.decode("utf-8"))
    except json.JSONDecodeError:
        return response

data = query("Just say hi!")
# generated_text echoes the prompt, so keep only the part after [/INST]
print(data[0]['generated_text'].split('[/INST] ')[1])
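If you want follow-up turns, the same single-string idea applies: flatten the whole conversation into one prompt in the Llama 2 template and send it through the same query function. Below is a minimal sketch of such a helper; build_llama2_prompt is just an illustrative name I made up, and the placement of the <s>/</s> markers follows my reading of the template, so double-check it against the Llama 2 docs before relying on it.

def build_llama2_prompt(system_prompt, turns):
    # turns: list of (user_message, assistant_reply) pairs; the final pair
    # should have assistant_reply=None so the model generates the answer.
    prompt = f"[INST] <<SYS>>\n{system_prompt}\n<</SYS>>\n\n"
    for i, (user_msg, assistant_reply) in enumerate(turns):
        if i > 0:
            # later user turns open a new [INST] block
            prompt += f"<s>[INST] {user_msg} [/INST] "
        else:
            prompt += f"{user_msg} [/INST] "
        if assistant_reply is not None:
            # completed assistant turns are closed with </s>
            prompt += f"{assistant_reply} </s>"
    return prompt

# Example: one completed exchange plus a new question
prompt = build_llama2_prompt(
    "Your job is to talk like a pirate.",
    [("Just say hi!", "Ahoy, matey!"),
     ("How are you today?", None)],
)
data = query(prompt)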