TypeError: 'list' object is not callable

Hi,

I'm testing a Gradio-based chatbot I'm developing. After converting THUDM/glm-4-9b-chat-hf to float8 with optimum.quanto, I'm running into a snag during generation. Which part should I fix?

Thanks.

Here is the code:

# glm_4_hf.py
import torch
import logging
import traceback
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

from optimum.quanto import QuantizedModelForCausalLM

logger = logging.getLogger(__name__)

class GLM4HfHandler:
    def __init__(self, model_dir):
        self.model_dir = model_dir
        self.tokenizer = None
        self.model = None
        self.load_model()

    def load_model(self):
        try:
            logger.info(f"[*] Loading tokenizer from {self.model_dir}")
            self.tokenizer = AutoTokenizer.from_pretrained(
                self.model_dir
            )
            
            logger.info(f"[*] Loading model from {self.model_dir}")
            if "float8" in self.model_dir:
                self.model = QuantizedModelForCausalLM.from_pretrained(
                    self.model_dir,
                    device_map="auto",
                )
            else:
                self.model = AutoModelForCausalLM.from_pretrained(
                    self.model_dir,
                    torch_dtype=torch.bfloat16,
                    device_map="auto",
                )
            logger.info(f"[*] Model loaded successfully: {self.model_dir}")
        except Exception as e:
            logger.error(f"Failed to load GLM4 Model: {str(e)}\n\n{traceback.format_exc()}")
            raise

    def generate_answer(self, history):
        try:
            # Build the prompt messages
            prompt_messages = [{"role": msg['role'], "content": msg['content']} for msg in history]
            logger.info(f"[*] Prompt messages for GLM: {prompt_messages}")
            
            inputs = self.tokenizer.apply_chat_template(
                prompt_messages,
                add_generation_prompt=True, 
                tokenize=True, 
                return_tensors="pt",
                return_dict=True
            ).to(self.model.device)
            logger.info("[*] GLM input template applied successfully")
                
            input_len = inputs['input_ids'].shape[1]
                
            # Generation settings
            generation_config = {
                "input_ids": inputs['input_ids'],
                "attention_mask": inputs['attention_mask'],
                "max_new_tokens": 128,
                "do_sample": False,
            }
                
            # Generate the text
            outputs = self.model.generate(**generation_config)
            logger.info("[*] GLM model generated the response")
                
            # Decode the result
            generated_text = self.tokenizer.decode(
                outputs[0][input_len:],
                skip_special_tokens=True
            )
            logger.info(f"[*] Generated text: {generated_text}")
                
            return generated_text.strip()
            
        except Exception as e:
            error_msg = f"Error during GLM answer generation: {str(e)}\n\n{traceback.format_exc()}"
            logger.error(error_msg)
            return error_msg

And this is the error traceback:

Traceback (most recent call last):
  File "/Users/janghyeonbin/easy-llm/model_handlers/glm_4_hf.py", line 67, in generate_answer
    outputs = self.model.generate(**generation_config)
  File "/opt/miniconda3/envs/llm/lib/python3.10/site-packages/torch/utils/_contextlib.py", line 116, in decorate_context
    return func(*args, **kwargs)
  File "/opt/miniconda3/envs/llm/lib/python3.10/site-packages/transformers/generation/utils.py", line 2252, in generate
    result = self._sample(
  File "/opt/miniconda3/envs/llm/lib/python3.10/site-packages/transformers/generation/utils.py", line 3251, in _sample
    outputs = self(**model_inputs, return_dict=True)
  File "/opt/miniconda3/envs/llm/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1736, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/opt/miniconda3/envs/llm/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1747, in _call_impl
    return forward_call(*args, **kwargs)
  File "/opt/miniconda3/envs/llm/lib/python3.10/site-packages/transformers/models/glm/modeling_glm.py", line 1059, in forward
    outputs = self.model(
  File "/opt/miniconda3/envs/llm/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1736, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/opt/miniconda3/envs/llm/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1747, in _call_impl
    return forward_call(*args, **kwargs)
  File "/opt/miniconda3/envs/llm/lib/python3.10/site-packages/transformers/models/glm/modeling_glm.py", line 812, in forward
    layer_outputs = decoder_layer(
  File "/opt/miniconda3/envs/llm/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1736, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/opt/miniconda3/envs/llm/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1747, in _call_impl
    return forward_call(*args, **kwargs)
  File "/opt/miniconda3/envs/llm/lib/python3.10/site-packages/transformers/models/glm/modeling_glm.py", line 536, in forward
    hidden_states, self_attn_weights, present_key_value = self.self_attn(
  File "/opt/miniconda3/envs/llm/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1736, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/opt/miniconda3/envs/llm/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1747, in _call_impl
    return forward_call(*args, **kwargs)
  File "/opt/miniconda3/envs/llm/lib/python3.10/site-packages/transformers/models/glm/modeling_glm.py", line 436, in forward
    query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)
  File "/opt/miniconda3/envs/llm/lib/python3.10/site-packages/transformers/models/glm/modeling_glm.py", line 178, in apply_rotary_pos_emb
    q_embed = (q_rot * cos) + (rotate_half(q_rot) * sin)
  File "/opt/miniconda3/envs/llm/lib/python3.10/site-packages/transformers/models/glm/modeling_glm.py", line 142, in rotate_half
    return torch.stack((-x2, x1), dim=-1).flatten(-2)
  File "/opt/miniconda3/envs/llm/lib/python3.10/site-packages/optimum/quanto/tensor/activations/qbytes.py", line 90, in __torch_dispatch__
    return qdispatch(*args, **kwargs)
  File "/opt/miniconda3/envs/llm/lib/python3.10/site-packages/optimum/quanto/tensor/activations/qbytes_ops.py", line 233, in stack
    return qfallback(inputs, dim)
  File "/opt/miniconda3/envs/llm/lib/python3.10/site-packages/optimum/quanto/tensor/qtensor.py", line 29, in qfallback
    return callable(*args, **kwargs)
TypeError: 'list' object is not callable


In my environment, this worked with a smaller model once device_map="auto" was dropped from the quantized load path:

            if "float8" in self.model_dir:
                self.model=QuantizedModelForCausalLM.from_pretrained(
                    self.model_dir,
                   # device_map="auto", # unsupported in quantizatied model
                )
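
Since device_map="auto" is dropped on the quantized path, the model stays on CPU after loading. If you want it on an accelerator, you can move it explicitly; a minimal sketch, assuming the QuantizedModelForCausalLM wrapper forwards .to() to the wrapped transformers model the same way it already forwards generate() and device in the handler:

                # Hypothetical follow-up inside the same if branch, right after from_pretrained():
                # pick whichever device is available and move the quantized model there explicitly.
                device = "cuda" if torch.cuda.is_available() else ("mps" if torch.backends.mps.is_available() else "cpu")
                self.model.to(device)  # assumption: forwarded to the wrapped PreTrainedModel
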
# for debug
modelname = "THUDM/glm-edge-1.5b-chat"
qmodeldir = "./glm-edge-1.5b-chat-float8"

from transformers import AutoModelForCausalLM, AutoTokenizer
from optimum.quanto import QuantizedModelForCausalLM, qfloat8

model = AutoModelForCausalLM.from_pretrained(modelname)
tokenizer = AutoTokenizer.from_pretrained(modelname)
qmodel = QuantizedModelForCausalLM.quantize(model, weights=qfloat8)
qmodel.save_pretrained(qmodeldir)
tokenizer.save_pretrained(qmodeldir)

from model_handlers.glm_4_hf import GLM4HfHandler  # adjust the import path to your project layout

model = GLM4HfHandler(qmodeldir)
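
With the handler built on the quantized checkpoint, a quick end-to-end check (the prompt is just a placeholder) could look like this:

# Sanity check: one user turn through the handler defined above.
history = [{"role": "user", "content": "Hello, who are you?"}]
print(model.generate_answer(history))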