According to the Inference API documentation, setting the option "use_cache" to false should stop the API from serving cached (and therefore identical) results, so with sampling enabled I would expect varied outputs. That is not what I observe: repeated calls with the same prompt keep returning the same completion.
import json
import os

import requests

# MODEL_V2 and HUGGINGFACE_KEY are defined elsewhere in my code/environment.

def huggingface_api_completion(prompt: str) -> str:
    API_URL = f"https://api-inference.huggingface.co/models/{MODEL_V2}"
    headers = {"Authorization": f"Bearer {os.environ.get('HUGGINGFACE_KEY')}"}
    payload = json.dumps(
        {
            "inputs": prompt,
            "parameters": {
                "max_length": round(len(prompt) / 5) + 100,
                "num_return_sequences": 1,
                "return_text": False,
                "return_full_text": False,
                "do_sample": True,  # sampling enabled, so outputs should vary
                "top_k": 50,
                "top_p": 0.95,
                "end_sequence": "\n",  # stop at the first newline
            },
            "options": {
                "wait_for_model": True,
                "use_cache": False,  # should bypass the API-side cache
            },
        }
    )
    response = requests.post(API_URL, headers=headers, data=payload)
    data = response.json()
    completion = data[0]["generated_text"].strip()
    return completion
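Calling the function twice with the same prompt illustrates the problem (the prompt text here is just an example):

completion_a = huggingface_api_completion("Once upon a time")
completion_b = huggingface_api_completion("Once upon a time")
# With do_sample=True and use_cache=False I would expect these to differ,
# but they come back identical every time.
print(completion_a == completion_b)  # prints True for me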
How can I enforce non-deterministic outputs? When running inference locally I can just vary the random seed (see the snippet below), but I don't see how to do the equivalent through the API.
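For reference, this is roughly what I do locally to get varied samples; a minimal sketch, with "gpt2" standing in as a placeholder model:

from transformers import pipeline, set_seed

# Changing the seed between runs gives different samples locally.
set_seed(42)
generator = pipeline("text-generation", model="gpt2")  # placeholder model
result = generator(
    "Once upon a time",
    max_length=50,
    do_sample=True,
    top_k=50,
    top_p=0.95,
)
print(result[0]["generated_text"])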