The error: "self and mat2 must have the same dtype, but got Half and Char".
No traceback is shown in the CMD window; this error text is returned as the model's response instead.
The same code works when the model is loaded in 4-bit.
I think the issue comes from the `unsqueeze` / `.to(DEVICE)` handling, but I don't know how to fix it.
# Device and model configuration for CogAgent VQA inference.
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
MODEL_PATH = "THUDM/cogagent-vqa-hf"
# CogAgent uses the Vicuna/Llama tokenizer rather than one shipped with the model.
tokenizer = LlamaTokenizer.from_pretrained('lmsys/vicuna-7b-v1.5')
# Compute dtype for non-quantized modules and for the image tensors fed to the model.
torch_type = torch.float16
model = AutoModelForCausalLM.from_pretrained(
    MODEL_PATH,
    # FIX for "self and mat2 must have the same dtype, but got Half and Char":
    # without an explicit torch_dtype, the 8-bit load leaves parts of the model
    # (paths that bitsandbytes does not wrap, e.g. the visual-expert matmuls)
    # holding int8 ("Char") weights that are multiplied directly against the
    # fp16 ("Half") inputs. Passing torch_dtype keeps those paths in float16,
    # matching what the working 4-bit configuration does.
    torch_dtype=torch_type,
    low_cpu_mem_usage=True,
    load_in_8bit=True,
    trust_remote_code=True
).eval()
def process_image(image, input_text, temperature, top_p, top_k, do_sample):
    """Answer one visual question with the global CogAgent model.

    Args:
        image: PIL image (or whatever ``build_conversation_input_ids`` accepts).
        input_text: The user's query string.
        temperature, top_p, top_k, do_sample: Sampling controls forwarded
            to ``model.generate``.

    Returns:
        The generated answer text, truncated at the first ``</s>`` token.
    """
    with torch.no_grad():
        features = model.build_conversation_input_ids(
            tokenizer,
            query=input_text,
            history=[],
            images=[image],
            template_version='base',
        )
        # Add a batch dimension to each token tensor and move everything to
        # the target device; image tensors are also cast to the model's
        # compute dtype.
        inputs = {
            'input_ids': features['input_ids'].unsqueeze(0).to(DEVICE),
            'token_type_ids': features['token_type_ids'].unsqueeze(0).to(DEVICE),
            'attention_mask': features['attention_mask'].unsqueeze(0).to(DEVICE),
            'images': [[features['images'][0].to(DEVICE).to(torch_type)]],
        }
        if 'cross_images' in features and features['cross_images']:
            inputs['cross_images'] = [
                [features['cross_images'][0].to(DEVICE).to(torch_type)]
            ]
        gen_kwargs = {
            "max_length": 2048,
            "temperature": temperature,
            "do_sample": do_sample,
            "top_p": top_p,
            "top_k": top_k,
        }
        generated = model.generate(**inputs, **gen_kwargs)
        # Strip the prompt portion so only newly generated tokens remain.
        generated = generated[:, inputs['input_ids'].shape[1]:]
        answer = tokenizer.decode(generated[0])
    return answer.split("</s>")[0]