Hey,
I deployed falcon-7b-instruct on a Hugging Face Inference Endpoint.
The issue I have is that the output it gives me is very short. Even when prompted with ‘Write a 300 word poem’, or given a question plus a document as the prompt, it returns very short answers (60 to 100 characters, i.e. roughly 10-15 words).
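For context, I call the endpoint roughly like this (an illustrative sketch of the request shape; the URL and token below are placeholders, not my real values):

```python
import requests

# Placeholder endpoint URL and token, shown only to illustrate the request shape.
API_URL = "https://<my-endpoint>.endpoints.huggingface.cloud"
HEADERS = {"Authorization": "Bearer <hf_token>", "Content-Type": "application/json"}

payload = {"inputs": "Write a 300 word poem about the sea."}
response = requests.post(API_URL, headers=HEADERS, json=payload)
print(response.json())  # returns [{"generated_text": "..."}], but the text is only a sentence or so
```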
Is there something fundamentally wrong?
Many thanks,
A.
P.S.
I changed nothing in the tiiuae/falcon-7b-instruct repo apart from the below:
- handler.py

```python
import torch
from typing import Any, Dict, List
from transformers import AutoModelForCausalLM, AutoTokenizer


class EndpointHandler:
    def __init__(self, path=""):
        # Load the tokenizer and model from the repo path the endpoint passes in.
        self.tokenizer = AutoTokenizer.from_pretrained(path)
        self.model = AutoModelForCausalLM.from_pretrained(
            path, device_map="auto", torch_dtype=torch.float16, trust_remote_code=True
        )
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        # Generation settings applied to every request.
        self.max_length = 4096
        self.max_new_tokens = 4096
        self.top_k = 100
        self.top_p = 0.95
        self.temperature = 0.9

    def __call__(self, data: Dict[str, Any]) -> List[Dict[str, str]]:
        inputs = data.pop("inputs", data)
        parameters = data.pop("parameters", {})
        # Tokenize the prompt and move it to the model's device.
        inputs = self.tokenizer(inputs, return_tensors="pt").to(self.device)
        # Override any request parameters with the fixed settings above.
        parameters["max_length"] = self.max_length
        parameters["max_new_tokens"] = self.max_new_tokens
        parameters["top_k"] = self.top_k
        parameters["top_p"] = self.top_p
        parameters["temperature"] = self.temperature
        outputs = self.model.generate(**inputs, **parameters)
        # Decode the full sequence (prompt + completion).
        prediction = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
        return [{"generated_text": prediction}]
```
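In case it helps, this is roughly how I test the handler locally (a quick sketch, assuming it is run from the root of the cloned repo on a GPU with enough memory):

```python
# Quick local smoke test of the custom handler.
from handler import EndpointHandler

handler = EndpointHandler(path=".")  # "." = the cloned tiiuae/falcon-7b-instruct repo
result = handler({"inputs": "Write a 300 word poem about the sea."})
print(result[0]["generated_text"])
```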