Hi,
I’m testing a Gradio-based chatbot I’m developing. After converting THUDM/glm-4-9b-chat-hf to float8 with optimum.quanto, I’m running into a snag during generation. Which part should I fix?
Thanks.
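For context, the float8 conversion roughly followed the standard optimum.quanto flow below. This is a minimal sketch rather than my exact conversion script; the script name and output directory are assumptions, and it only shows weight quantization.

# quantize_glm4.py -- minimal sketch of the float8 conversion (file and output names are assumptions)
import torch
from transformers import AutoModelForCausalLM
from optimum.quanto import QuantizedModelForCausalLM, qfloat8

# Load the original bf16 checkpoint
model = AutoModelForCausalLM.from_pretrained(
    "THUDM/glm-4-9b-chat-hf",
    torch_dtype=torch.bfloat16,
)

# Quantize the weights to float8 and serialize the result
qmodel = QuantizedModelForCausalLM.quantize(model, weights=qfloat8)
qmodel.save_pretrained("./glm-4-9b-chat-hf-float8")

The handler below then reloads this checkpoint via QuantizedModelForCausalLM.from_pretrained whenever "float8" appears in the directory name.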
Here is the handler code.
# glm_4_hf.py
import torch
import logging
import traceback
from transformers import AutoTokenizer, AutoModelForCausalLM
from optimum.quanto import QuantizedModelForCausalLM

logger = logging.getLogger(__name__)


class GLM4HfHandler:
    def __init__(self, model_dir):
        self.model_dir = model_dir
        self.tokenizer = None
        self.model = None
        self.load_model()

    def load_model(self):
        try:
            logger.info(f"[*] Loading tokenizer from {self.model_dir}")
            self.tokenizer = AutoTokenizer.from_pretrained(self.model_dir)
            logger.info(f"[*] Loading model from {self.model_dir}")
            if "float8" in self.model_dir:
                # Reload the optimum.quanto-serialized float8 checkpoint
                self.model = QuantizedModelForCausalLM.from_pretrained(
                    self.model_dir,
                    device_map="auto",
                )
            else:
                self.model = AutoModelForCausalLM.from_pretrained(
                    self.model_dir,
                    torch_dtype=torch.bfloat16,
                    device_map="auto",
                )
            logger.info(f"[*] Model loaded successfully: {self.model_dir}")
        except Exception as e:
            logger.error(f"Failed to load GLM4 Model: {str(e)}\n\n{traceback.format_exc()}")
            raise

    def generate_answer(self, history):
        try:
            # Build the prompt messages
            prompt_messages = [{"role": msg['role'], "content": msg['content']} for msg in history]
            logger.info(f"[*] Prompt messages for GLM: {prompt_messages}")
            inputs = self.tokenizer.apply_chat_template(
                prompt_messages,
                add_generation_prompt=True,
                tokenize=True,
                return_tensors="pt",
                return_dict=True
            ).to(self.model.device)
            logger.info("[*] GLM input template applied successfully")
            input_len = inputs['input_ids'].shape[1]
            # Generation settings
            generation_config = {
                "input_ids": inputs['input_ids'],
                "attention_mask": inputs['attention_mask'],
                "max_new_tokens": 128,
                "do_sample": False,
            }
            # Generate text
            outputs = self.model.generate(**generation_config)
            logger.info("[*] GLM model generated the response")
            # Decode only the newly generated tokens
            generated_text = self.tokenizer.decode(
                outputs[0][input_len:],
                skip_special_tokens=True
            )
            logger.info(f"[*] Generated text: {generated_text}")
            return generated_text.strip()
        except Exception as e:
            error_msg = f"Error during GLM answer generation: {str(e)}\n\n{traceback.format_exc()}"
            logger.error(error_msg)
            return error_msg
And this is the error traceback:
Traceback (most recent call last):
  File "/Users/janghyeonbin/easy-llm/model_handlers/glm_4_hf.py", line 67, in generate_answer
    outputs = self.model.generate(**generation_config)
  File "/opt/miniconda3/envs/llm/lib/python3.10/site-packages/torch/utils/_contextlib.py", line 116, in decorate_context
    return func(*args, **kwargs)
  File "/opt/miniconda3/envs/llm/lib/python3.10/site-packages/transformers/generation/utils.py", line 2252, in generate
    result = self._sample(
  File "/opt/miniconda3/envs/llm/lib/python3.10/site-packages/transformers/generation/utils.py", line 3251, in _sample
    outputs = self(**model_inputs, return_dict=True)
  File "/opt/miniconda3/envs/llm/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1736, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/opt/miniconda3/envs/llm/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1747, in _call_impl
    return forward_call(*args, **kwargs)
  File "/opt/miniconda3/envs/llm/lib/python3.10/site-packages/transformers/models/glm/modeling_glm.py", line 1059, in forward
    outputs = self.model(
  File "/opt/miniconda3/envs/llm/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1736, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/opt/miniconda3/envs/llm/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1747, in _call_impl
    return forward_call(*args, **kwargs)
  File "/opt/miniconda3/envs/llm/lib/python3.10/site-packages/transformers/models/glm/modeling_glm.py", line 812, in forward
    layer_outputs = decoder_layer(
  File "/opt/miniconda3/envs/llm/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1736, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/opt/miniconda3/envs/llm/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1747, in _call_impl
    return forward_call(*args, **kwargs)
  File "/opt/miniconda3/envs/llm/lib/python3.10/site-packages/transformers/models/glm/modeling_glm.py", line 536, in forward
    hidden_states, self_attn_weights, present_key_value = self.self_attn(
  File "/opt/miniconda3/envs/llm/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1736, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/opt/miniconda3/envs/llm/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1747, in _call_impl
    return forward_call(*args, **kwargs)
  File "/opt/miniconda3/envs/llm/lib/python3.10/site-packages/transformers/models/glm/modeling_glm.py", line 436, in forward
    query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)
  File "/opt/miniconda3/envs/llm/lib/python3.10/site-packages/transformers/models/glm/modeling_glm.py", line 178, in apply_rotary_pos_emb
    q_embed = (q_rot * cos) + (rotate_half(q_rot) * sin)
  File "/opt/miniconda3/envs/llm/lib/python3.10/site-packages/transformers/models/glm/modeling_glm.py", line 142, in rotate_half
    return torch.stack((-x2, x1), dim=-1).flatten(-2)
  File "/opt/miniconda3/envs/llm/lib/python3.10/site-packages/optimum/quanto/tensor/activations/qbytes.py", line 90, in __torch_dispatch__
    return qdispatch(*args, **kwargs)
  File "/opt/miniconda3/envs/llm/lib/python3.10/site-packages/optimum/quanto/tensor/activations/qbytes_ops.py", line 233, in stack
    return qfallback(inputs, dim)
  File "/opt/miniconda3/envs/llm/lib/python3.10/site-packages/optimum/quanto/tensor/qtensor.py", line 29, in qfallback
    return callable(*args, **kwargs)
TypeError: 'list' object is not callable