Some questions about GPT-J inference using int8

Currently huggingface transformers supports loading a model in int8, which saves a lot of GPU VRAM.

I’ve tried it with GPT-J, but found that inference in int8 is much slower — about 8x slower than in the normal float16.

Can somebody tell me why and how can I solve it?

Below is the code I used,

and my gpu is A30


from typing import *

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer


def load_model_tokenizer(
    pretrained_model_name_or_path,
    model_args: Optional[List[Any]] = None,
    tokenizer_args: Optional[List[Any]] = None,
    state_dict: Optional[Dict[str, torch.Tensor]] = None,
    proxies: Optional[Dict[str, str]] = None,
    output_loading_info: bool = False,
    use_auth_token: Union[bool, str] = False,
    revision: str = "main",
    mirror: Optional[str] = None,
    low_cpu_mem_usage: bool = False,
    torch_dtype: Union[str, torch.dtype] = "auto",
    device_map: Union[str, Dict[str, Union[int, str, torch.device]]] = "auto",
    load_in_8bit: bool = False,
    load_in_8bit_threshold: float = 0.6,
    load_in_8bit_skip_modules: Optional[List[str]] = None,
    subfolder: str = "",
    use_fast_tokenizer: bool = False,
    model_kwargs: Optional[Dict[str, Any]] = None,
    tokenizer_kwargs: Optional[Dict[str, Any]] = None,
):
    """Load a causal-LM model and its tokenizer from the same pretrained path.

    Most parameters are forwarded verbatim to
    ``AutoModelForCausalLM.from_pretrained`` / ``AutoTokenizer.from_pretrained``;
    see the transformers documentation for their exact semantics.

    Args:
        pretrained_model_name_or_path: Hub model id or local directory.
        model_args / tokenizer_args: extra positional args for the two
            ``from_pretrained`` calls (default: none).
        model_kwargs / tokenizer_kwargs: extra keyword args for the two
            ``from_pretrained`` calls (default: none).
        load_in_8bit: if True, quantize linear layers to int8 via
            bitsandbytes (reduces VRAM; inference speed may differ).

    Returns:
        A ``(model, tokenizer)`` tuple.
    """
    # Use `is None` (not truthiness) so an explicitly passed empty
    # container is honored as the caller's own object rather than
    # silently replaced with a fresh one.
    if model_args is None:
        model_args = []
    if tokenizer_args is None:
        tokenizer_args = []
    if model_kwargs is None:
        model_kwargs = {}
    if tokenizer_kwargs is None:
        tokenizer_kwargs = {}

    model = AutoModelForCausalLM.from_pretrained(
        pretrained_model_name_or_path,
        *model_args,
        state_dict=state_dict,
        proxies=proxies,
        output_loading_info=output_loading_info,
        use_auth_token=use_auth_token,
        revision=revision,
        mirror=mirror,
        low_cpu_mem_usage=low_cpu_mem_usage,
        torch_dtype=torch_dtype,
        device_map=device_map,
        load_in_8bit=load_in_8bit,
        load_in_8bit_threshold=load_in_8bit_threshold,
        load_in_8bit_skip_modules=load_in_8bit_skip_modules,
        subfolder=subfolder,
        **model_kwargs
    )
    tokenizer = AutoTokenizer.from_pretrained(
        pretrained_model_name_or_path,
        *tokenizer_args,
        proxies=proxies,
        revision=revision,
        subfolder=subfolder,
        use_fast=use_fast_tokenizer,
        **tokenizer_kwargs
    )

    return model, tokenizer


if __name__ == "__main__":
    import time

    @torch.inference_mode()
    @torch.cuda.amp.autocast()
    def generate_text(model, tokenizer):
        """Generate a short continuation of a fixed prompt and print the latency."""
        inputs = tokenizer("HuggingFace Transformers is a", return_tensors='pt').to(model.device)

        # CUDA kernels launch asynchronously: without a synchronize the
        # timer can start/stop while GPU work is still queued, skewing the
        # measured generation latency. Guarded so the script still runs on CPU.
        if torch.cuda.is_available():
            torch.cuda.synchronize()
        start = time.time()
        outputs = model.generate(
            input_ids=inputs['input_ids'],
            attention_mask=inputs['attention_mask'],
            max_length=32,
            min_length=8,
            early_stopping=True
        )
        if torch.cuda.is_available():
            torch.cuda.synchronize()
        end = time.time()
        # Keep only the first line of the decoded text for a compact report.
        generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True).split('\n')[0].strip()
        print(f"generate text: {generated_text}\ntime used: {end-start:.4f}")

    # NOTE(review): local GPT-J checkpoint path — assumed to contain a
    # float16 revision; adjust to your own environment.
    model, tokenizer = load_model_tokenizer(
        pretrained_model_name_or_path="/root/gemsouls_workstation/GemsoulsPersonaChatter/gpc_src/nn_models/core/gptj",
        low_cpu_mem_usage=True,
        device_map="auto",
        revision="float16",
        # load_in_8bit=True,
        torch_dtype="auto"
    )
    model.eval()

    print(f"model memory footprint: {model.get_memory_footprint()}")
    generate_text(model, tokenizer)

Hi PanEa!
Sorry for the delay, yes for now this is expected :slight_smile: Please check the related issue here: Memory Decreases! But Latency Increases.... · Issue #6 · TimDettmers/bitsandbytes · GitHub - speed improvement is on the roadmap of bitsandbytes and should be addressed in the next releases! Stay tuned