When using AutoModelForCausalLM with THUDM/cogagent-vqa-hf and load_in_8bit, I get this error: self and mat2 must have the same dtype, but got Half and Char

the error: self and mat2 must have the same dtype, but got Half and Char

There are no visible errors in the CMD window; this error is returned as the response.

The same code works when loading in 4-bit (load_in_4bit).

I think the issue happens in the unsqueeze / .to(DEVICE) conversions of the inputs, but I don't know how to fix it.

# Device selection: prefer CUDA when available.
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
MODEL_PATH = "THUDM/cogagent-vqa-hf"
# CogAgent reuses the Vicuna v1.5 tokenizer.
tokenizer = LlamaTokenizer.from_pretrained('lmsys/vicuna-7b-v1.5')
# Compute dtype used for the (non-quantized) image tensors fed to the model.
torch_type = torch.float16

# NOTE(review): the original paste contained only a truncated, dangling
# `model = AutoModelForCausalLM.from_pretrained(` line here, which is a
# SyntaxError. Reconstructed below with the arguments the CogAgent demo
# uses — confirm against your actual loading code. With load_in_8bit=True
# the quantized linear weights are int8 ("Char"), which is the likely source
# of the reported "Half and Char" dtype mismatch; presumably the 4-bit path
# casts activations internally, which would explain why load_in_4bit works.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_PATH,
    torch_dtype=torch_type,
    low_cpu_mem_usage=True,
    load_in_8bit=True,
    trust_remote_code=True,
).eval()
def process_image(image, input_text, temperature, top_p, top_k, do_sample):
    """Run one visual-question-answering query against the CogAgent model.

    Args:
        image: image object accepted by ``model.build_conversation_input_ids``
            (presumably a PIL image — confirm against the model's remote code).
        input_text: the user's query string.
        temperature: sampling temperature forwarded to ``model.generate``.
        top_p: nucleus-sampling probability forwarded to ``model.generate``.
        top_k: top-k sampling cutoff forwarded to ``model.generate``.
        do_sample: whether to sample (True) or decode greedily (False).

    Returns:
        The decoded response text, truncated at the first ``</s>`` marker.

    Relies on module-level ``model``, ``tokenizer``, ``DEVICE`` and
    ``torch_type``.
    """
    with torch.no_grad():
        input_by_model = model.build_conversation_input_ids(
            tokenizer, query=input_text, history=[], images=[image],
            template_version='base')
        # Batch dimension added via unsqueeze(0); tensors moved to the model's
        # device. Image tensors are additionally cast to the compute dtype.
        inputs = {
            'input_ids': input_by_model['input_ids'].unsqueeze(0).to(DEVICE),
            'token_type_ids': input_by_model['token_type_ids'].unsqueeze(0).to(DEVICE),
            'attention_mask': input_by_model['attention_mask'].unsqueeze(0).to(DEVICE),
            'images': [[input_by_model['images'][0].to(DEVICE).to(torch_type)]],
        }  # FIX: this closing brace was missing in the original paste (SyntaxError)
        # CogAgent's cross-vision branch only produces cross_images sometimes.
        if 'cross_images' in input_by_model and input_by_model['cross_images']:
            inputs['cross_images'] = [[input_by_model['cross_images'][0].to(DEVICE).to(torch_type)]]

        gen_kwargs = {
            "max_length": 2048,
            "temperature": temperature,
            "do_sample": do_sample,
            "top_p": top_p,
            "top_k": top_k,
        }  # FIX: this closing brace was missing in the original paste (SyntaxError)
        outputs = model.generate(**inputs, **gen_kwargs)
        # Drop the prompt tokens; keep only the newly generated continuation.
        outputs = outputs[:, inputs['input_ids'].shape[1]:]
        response = tokenizer.decode(outputs[0])
        return response.split("</s>")[0]