Simple use of Transformers breaks

A relatively simple program:

The model is chavinlo/alpaca-13b on Hugging Face.

import torch
from transformers import LlamaTokenizer, LlamaForCausalLM

model_folder = "alpaca-13b"
model_path = "./alpaca/" + model_folder

tokenizer = LlamaTokenizer.from_pretrained(model_path)
model = LlamaForCausalLM.from_pretrained(
    model_path,
    load_in_8bit=True,
    torch_dtype=torch.float16,
    device_map="auto"
)

def ask(message):
    print('message: ' + message)
    input_ids = tokenizer(message, return_tensors="pt")#.input_ids.to("cuda")
    print('input_ids: ' + str(input_ids))
    generated_ids = model.generate(input_ids, max_new_tokens=250, do_sample=True)#, repetition_penalty=1.3, temperature=0.8, top_p=0.75, top_k=40)
    print('generated_ids: ' + str(generated_ids))
    # response = tokenizer.batch_decode(generated_ids[0][input_ids.shape[-1]:])
    response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)
    print('response: ' + str(response))
    return response

if __name__ == "__main__":
    while True:
        message = input("Enter your message: ")
        response = ask(message)

I get the following error:

message: asdf
input_ids: {'input_ids': tensor([[   2,  408, 2176]]), 'attention_mask': tensor([[1, 1, 1]])}
/home/nick/Programs/miniconda3/envs/discord-bot/lib/python3.11/site-packages/transformers/generation/utils.py:1255: UserWarning: You have modified the pretrained model configuration to control generation. This is a deprecated strategy to control generation and will be removed soon, in a future version. Please use a generation configuration file (see https://huggingface.co/docs/transformers/main_classes/text_generation)
  warnings.warn(
Traceback (most recent call last):
  File "/home/nick/Programs/miniconda3/envs/discord-bot/lib/python3.11/site-packages/transformers/tokenization_utils_base.py", line 249, in __getattr__
    return self.data[item]
           ~~~~~~~~~^^^^^^
KeyError: 'shape'

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/home/nick/Work/llm/alpaca-discord/alpaca-test.py", line 29, in <module>
    response = ask(message)
               ^^^^^^^^^^^^
  File "/home/nick/Work/llm/alpaca-discord/alpaca-test.py", line 19, in ask
    generated_ids = model.generate(input_ids, max_new_tokens=250, do_sample=True)#, repetition_penalty=1.3, temperature=0.8, top_p=0.75, top_k=40)
                    ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/nick/Programs/miniconda3/envs/discord-bot/lib/python3.11/site-packages/torch/utils/_contextlib.py", line 115, in decorate_context
    return func(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^
  File "/home/nick/Programs/miniconda3/envs/discord-bot/lib/python3.11/site-packages/transformers/generation/utils.py", line 1293, in generate
    batch_size = inputs_tensor.shape[0]
                 ^^^^^^^^^^^^^^^^^^^
  File "/home/nick/Programs/miniconda3/envs/discord-bot/lib/python3.11/site-packages/transformers/tokenization_utils_base.py", line 251, in __getattr__
    raise AttributeError
AttributeError

The tokenizer returns a BatchEncoding dict rather than a tensor, which is why generate() fails looking up 'shape'. I had to uncomment the .input_ids.to("cuda") part so the actual tensor gets passed to the model on the GPU, and then it works. The model seems to just repeat whatever I put in, which is not great, but at least it doesn't break anymore.
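
For reference, a minimal sketch of how I'd rewrite ask(): it passes the actual tensor on the model's device, decodes only the newly generated tokens (the original batch_decode includes the prompt, which accounts for at least part of the echo), and wraps the message in the standard Stanford Alpaca instruction template, which is an assumption about what this particular checkpoint was trained on:

def ask(message):
    # Alpaca is instruction-tuned; wrapping the message in the Stanford
    # Alpaca prompt template (an assumption about this checkpoint's
    # training format) should help it answer instead of echoing.
    prompt = (
        "Below is an instruction that describes a task. "
        "Write a response that appropriately completes the request.\n\n"
        f"### Instruction:\n{message}\n\n### Response:\n"
    )
    # generate() needs the input_ids tensor, not the whole BatchEncoding,
    # and the tensor has to live on the same device as the model.
    input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(model.device)
    generated_ids = model.generate(input_ids, max_new_tokens=250, do_sample=True)
    # Decode only the tokens after the prompt so the response isn't an echo.
    response = tokenizer.decode(
        generated_ids[0][input_ids.shape[-1]:], skip_special_tokens=True
    )
    return response

With the prompt sliced off before decoding, whatever repetition remains comes from the model itself rather than from the decode step.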