RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cuda:1 and cuda:0!
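
In general, PyTorch raises this when a single operation receives tensors living on two different GPUs. A minimal illustration, assuming a machine with at least two CUDA devices:

import torch

# Any binary op between tensors on different devices reproduces the error.
a = torch.ones(2, device="cuda:0")
b = torch.ones(2, device="cuda:1")
a + b  # RuntimeError: Expected all tensors to be on the same device, ...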

Sure thing! Here's the full script:

from transformers import AutoTokenizer, AutoModelForCausalLM
from accelerate import dispatch_model, infer_auto_device_map
from accelerate.utils import get_balanced_memory
from torch.cuda.amp import autocast
import torch

tokenizer = AutoTokenizer.from_pretrained("tiiuae/falcon-7b-instruct")
tokenizer.pad_token = tokenizer.eos_token  # Falcon ships no pad token; reuse EOS

# Load the weights onto the CPU first. Passing device_map='auto' here would
# already shard the model across the GPUs, and dispatching that sharded model
# a second time below is what strands tensors on both cuda:0 and cuda:1.
model = AutoModelForCausalLM.from_pretrained(
    "tiiuae/falcon-7b-instruct",
    trust_remote_code=True,
    torch_dtype=torch.bfloat16,
)

# Compute a per-GPU memory budget that balances the layers across all visible
# devices. The dtype should match the dtype the weights were actually loaded
# in (bfloat16 above) so the size estimates track the real footprint.
max_memory = get_balanced_memory(
    model,
    max_memory=None,
    no_split_module_classes=["DecoderLayer"],  # keep each Falcon block whole;
                                               # its submodules then never split
    dtype=torch.bfloat16,
    low_zero=False,
)
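
# Hypothetical check (not in the original script): max_memory maps each device
# to the byte budget that infer_auto_device_map below is allowed to fill,
# e.g. {0: ..., 1: ..., 'cpu': ...}.
print(max_memory)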

# Turn that budget into an explicit module -> device map.
device_map = infer_auto_device_map(
    model,
    max_memory=max_memory,
    no_split_module_classes=["DecoderLayer"],
    dtype=torch.bfloat16,
)

# Move each module to its assigned GPU and attach the hooks that shuttle
# activations between devices at layer boundaries.
model = dispatch_model(model, device_map=device_map)
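
# Hypothetical sanity check (not in the original script): every DecoderLayer
# should land on exactly one GPU, and the device holding the input embeddings
# is where prompts must be sent before calling generate().
print(device_map)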

generation_kwargs = {
    "min_length": -1,   # disable the absolute-length floor; min_new_tokens rules
    "top_k": 0,         # 0 switches top-k off so nucleus sampling governs alone
    "top_p": 0.85,
    "do_sample": True,
    "pad_token_id": tokenizer.eos_token_id,
    "min_new_tokens": 10,
    "max_new_tokens": 50,
    "eos_token_id": tokenizer.eos_token_id,
}


# tokenizer(...) also returns the attention mask that generate() expects.
# The prompt has to start on the device holding the embedding layer (cuda:0
# with this map); the dispatch hooks move activations onward from there.
inputs = tokenizer("Hello World!", return_tensors="pt").to("cuda:0")

with autocast(dtype=torch.bfloat16):  # match the bfloat16 weights
    output_ids = model.generate(**inputs, **generation_kwargs)

print(tokenizer.decode(output_ids[0]))
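
For what it's worth: if the goal is simply a balanced split across the visible GPUs, the manual get_balanced_memory / infer_auto_device_map / dispatch_model dance can be skipped entirely. A minimal sketch (device_map="auto" is currently equivalent to "balanced" per the Transformers docs):

from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

tokenizer = AutoTokenizer.from_pretrained("tiiuae/falcon-7b-instruct")
tokenizer.pad_token = tokenizer.eos_token

# from_pretrained computes the map and dispatches in one step, so the model
# only ever carries one set of placement hooks.
model = AutoModelForCausalLM.from_pretrained(
    "tiiuae/falcon-7b-instruct",
    trust_remote_code=True,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)

inputs = tokenizer("Hello World!", return_tensors="pt").to("cuda:0")
print(tokenizer.decode(model.generate(**inputs, max_new_tokens=50)[0]))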