ONNX only faster at lower sequence lengths

Hi, I converted the BAAI/bge-base-en-v1.5 model to ONNX format and did some performance testing.

At small sequence lengths the ONNX model seems to be faster than both the transformers and sentence_transformers implementations.

However, as the sequence length increases it goes from being almost twice as fast to actually being slower. Does anyone know why this is?

@JamesXanda Can you share the commands you used to export the model and the script you used to test it please?

Sure, the code I used for exporting to ONNX was:

import logging
import argparse
from huggingface_hub import login
import os
import shutil
from optimum.onnxruntime import ORTModelForFeatureExtraction
from transformers import AutoTokenizer
from optimum.onnxruntime import ORTOptimizer, ORTQuantizer
from optimum.onnxruntime.configuration import AutoOptimizationConfig, AutoQuantizationConfig


def parse_args():
    parser = argparse.ArgumentParser()
    parser.add_argument("--model_id", type=str, default="BAAI/bge-base-en-v1.5")
    parser.add_argument("--hf_token", type=str, default=None)
    parser.add_argument("--quantize", type=bool, default=False)
    parser.add_argument("--optimize", type=bool, default=False)
    parser.add_argument("--full_precision_output_dir", type=str, default="onnx_model")
    parser.add_argument("--optimized_output_dir", type=str, default="onnx_model_optimized")
    parser.add_argument("--quantized_output_dir", type=str, default="onnx_model_quantized")

    return parser.parse_args()


if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)

    logging.info("Parsing arguments...\n")
    args = parse_args()
    model_id = args.model_id
    hf_token = args.hf_token
    quantize = args.quantize
    optimize = args.optimize
    full_precision_output_dir = args.full_precision_output_dir
    optimized_output_dir = args.optimized_output_dir
    quantized_output_dir = args.quantized_output_dir


    logging.info("Logging in to Hugging Face...\n")
    if args.hf_token:
        os.environ["HF_TOKEN"] = hf_token
    login(token=hf_token)

    logging.info("Loading model...\n")
    # model = ORTModelForFeatureExtraction.from_pretrained(model_id, file_name="onnx/model.onnx")
    model = ORTModelForFeatureExtraction.from_pretrained(model_id, export=True)
    tokenizer = AutoTokenizer.from_pretrained(model_id)

    logging.info("Checking if ONNX model directory exists...\n")
    if os.path.exists(full_precision_output_dir):
        logging.info("ONNX model directory already exists. Deleting it...\n")
        shutil.rmtree(full_precision_output_dir)

    logging.info("Exporting model to ONNX...\n")
    # save onnx checkpoint and tokenizer
    model.save_pretrained(full_precision_output_dir)
    tokenizer.save_pretrained(full_precision_output_dir)

    if optimize:
        logging.info("Optimizing...\n")

        logging.info("Checking if optimized model directory exists...\n")
        if os.path.exists(optimized_output_dir):
            logging.info("Optimized model directory already exists. Deleting it...\n")
            shutil.rmtree(optimized_output_dir)

        logging.info("Building optimizer...\n")
        optimizer = ORTOptimizer.from_pretrained(full_precision_output_dir)
        optimization_config = AutoOptimizationConfig.O2()

        logging.info("Optimizing model...\n")
        optimizer.optimize(save_dir=optimized_output_dir, optimization_config=optimization_config)
        tokenizer.save_pretrained(optimized_output_dir)


    if quantize:
        logging.info("Quantizing...\n")

        logging.info("Loading model to quantize...\n")
        if optimize:
            logging.info("Loading optimized model...\n")
            model = ORTModelForFeatureExtraction.from_pretrained(optimized_output_dir)
        else:
            model = ORTModelForFeatureExtraction.from_pretrained(full_precision_output_dir)

        # create ORTQuantizer and define quantization configuration
        logging.info("Creating quantizer...\n")
        quantizer = ORTQuantizer.from_pretrained(model)
        dqconfig = AutoQuantizationConfig.avx512_vnni(is_static=False, per_channel=False)

        logging.info("Quantizing model...\n")
        model_quantized_path = quantizer.quantize(
            save_dir=quantized_output_dir,
            quantization_config=dqconfig,
        )
        tokenizer.save_pretrained(model_quantized_path)

This is run on CPU and, depending on the arguments, it saves up to three versions of the model. The one I am referring to is the full-precision model.
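For reference, loading any of the saved directories back for inference looks like this. A rough sketch, assuming all three exports exist, using the same CLS pooling and L2 normalisation as the benchmark below:

import numpy as np
from transformers import AutoTokenizer
from optimum.onnxruntime import ORTModelForFeatureExtraction

tokenizer = AutoTokenizer.from_pretrained("onnx_model")

for path in ["onnx_model", "onnx_model_optimized", "onnx_model_quantized"]:
    model = ORTModelForFeatureExtraction.from_pretrained(path)
    inputs = tokenizer("a quick smoke test sentence", return_tensors="np")
    output = model(**inputs)
    embedding = output[0][:, 0]  # CLS token
    embedding = embedding / np.linalg.norm(embedding, axis=1, keepdims=True)
    print(path, embedding.shape)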

The performance testing code is:

import os
import argparse
import logging
from huggingface_hub import login
from transformers import AutoTokenizer, AutoModel
from sentence_transformers import SentenceTransformer
from optimum.onnxruntime import ORTModelForFeatureExtraction
import torch
import numpy as np
import time

def parse_args():
    parser = argparse.ArgumentParser()
    parser.add_argument("--model_id", type=str, default="BAAI/bge-base-en-v1.5")
    parser.add_argument("--hf_token", type=str, default=None)
    parser.add_argument("--quantize", type=bool, default=False)
    parser.add_argument("--optimize", type=bool, default=False)
    parser.add_argument("--samples", type=int, default=1000)
    parser.add_argument("--max_length", type=int, default=500)

    return parser.parse_args()
    
def generate_samples(tokenizer, num_samples=100, max_length=500):
    vocab_size = tokenizer.vocab_size

    samples = []

    for _ in range(num_samples):
        length = np.random.randint(1, max_length)
        input_ids = torch.randint(low=1, high=vocab_size, size=(length,)).tolist()
        string = tokenizer.decode(input_ids, skip_special_tokens=True)

        inputs_pt = tokenizer(string, padding=True, truncation=True, return_tensors="pt")
        inputs_np = tokenizer(string, padding=True, truncation=True, return_tensors="np")

        samples.append(
            (inputs_pt, inputs_np, string)
        )

    return samples


def benchmark_transformers(model_id: str, samples: list):
    model = AutoModel.from_pretrained(model_id)
    model.eval()
    model.to("cpu")
    
    start_time = time.time()
    with torch.no_grad():
        for sample, _, _ in samples:
            output = model(**sample)
            sentence_embeddings = output[0][:, 0]
            sentence_embeddings = torch.nn.functional.normalize(sentence_embeddings, p=2, dim=1)
    end_time = time.time()

    logging.info(f" Transformers Time taken: {end_time - start_time} seconds/n")

    info = {
        "Framework": "Transformers",
        "Time": end_time - start_time
    }

    return info

def benchmark_sentence_transformers(model_id: str, samples: list):
    model = SentenceTransformer(model_id)
    model.eval()
    model.to("cpu"  )

    start_time = time.time()
    with torch.no_grad():
        for _ , _, string in samples:
            sentence_embeddings = model.encode(string, normalize_embeddings=True, show_progress_bar=False)
    end_time = time.time()

    logging.info(f" Sentence Transformers Time taken: {end_time - start_time} seconds/n")

    info = {
        "Framework": "Sentence Transformers",
        "Time": end_time - start_time
    }

    return info

def benchmark_onnx(model_id: str, samples: list):
    model = ORTModelForFeatureExtraction.from_pretrained("onnx_model")


    start_time = time.time()
    
    for _, sample, _ in samples:
        output = model(**sample)
        sentence_embeddings = output[0][:, 0]
        sentence_embeddings = sentence_embeddings / np.linalg.norm(sentence_embeddings, axis=1, keepdims=True)
    end_time = time.time()

    logging.info(f" ONNX Time taken: {end_time - start_time} seconds/n")

    info = {
        "Framework": "ONNX",
        "Time": end_time - start_time
    }

    return info

def benchmark_optimized(model_id: str, samples: list):
    model = ORTModelForFeatureExtraction.from_pretrained("onnx_model_optimized")

    start_time = time.time()

    for _, sample, _ in samples:
        output = model(**sample)
        sentence_embeddings = output[0][:, 0]
        sentence_embeddings = sentence_embeddings / np.linalg.norm(sentence_embeddings, axis=1, keepdims=True)
    end_time = time.time()

    logging.info(f" Optimized Time taken: {end_time - start_time} seconds/n")

    info = {
        "Framework": "Optimized",
        "Time": end_time - start_time
    }

    return info

def benchmark_quantized(model_id: str, samples: list):
    model = ORTModelForFeatureExtraction.from_pretrained("onnx_model_quantized")

    start_time = time.time()

    for _, sample, _ in samples:
        output = model(**sample)
        sentence_embeddings = output[0][:, 0]
        sentence_embeddings = sentence_embeddings / np.linalg.norm(sentence_embeddings, axis=1, keepdims=True)
    end_time = time.time()

    logging.info(f" Quantized Time taken: {end_time - start_time} seconds\n")

    info = {
        "Framework": "Quantized",
        "Time": end_time - start_time
    }

    return info



if __name__=="__main__":
    logging.basicConfig(level=logging.INFO)

    logging.info("Parsing arguments...\n")
    args = parse_args()
    model_id = args.model_id
    hf_token = args.hf_token
    quantize = args.quantize
    optimize = args.optimize
    num_samples = args.samples
    max_length = args.max_length

    logging.info("Logging in to Hugging Face...\n")
    if args.hf_token:
        os.environ["HF_TOKEN"] = hf_token
    login(token=hf_token)

    tokenizer = AutoTokenizer.from_pretrained(model_id)

    logging.info("Generating samples...\n")
    samples = generate_samples(tokenizer, num_samples=num_samples, max_length=max_length)

    infos = []


    logging.info("Benchmarking ONNX...\n")
    onnx_info = benchmark_onnx(model_id=model_id, samples=samples)
    infos.append(onnx_info)

    if optimize:
        logging.info("Benchmarking optimized...\n")
        optimized_info = benchmark_optimized(model_id=model_id, samples=samples)
        infos.append(optimized_info)
    
    if quantize:
        logging.info("Benchmarking quantized...\n")
        quantized_info = benchmark_quantized(model_id=model_id, samples=samples)
        infos.append(quantized_info)

    logging.info("Benchmarking transformers...\n")
    transformers_info = benchmark_transformers(model_id=model_id, samples=samples)
    infos.append(transformers_info)

    logging.info("Benchmarking sentence transformers...\n")
    sentence_transformers_info = benchmark_sentence_transformers(model_id=model_id, samples=samples)
    infos.append(sentence_transformers_info)

    
    results = "\n\n"
    for info in infos:
        results += f"{info['Framework']} took {info['Time']} seconds\n\n"

    logging.info(results)

It first generates a set of inputs of random length, capped by max_length. Then it runs each version of the model over these samples. Varying the max_length parameter reproduces the effect I was describing.
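To see the crossover more directly, it can also help to fix the sequence length rather than sampling it randomly, for example by padding or truncating every input to the same length. A minimal sketch of what I mean (it assumes the full-precision export is in onnx_model, as above):

import time
import torch
from transformers import AutoTokenizer, AutoModel
from optimum.onnxruntime import ORTModelForFeatureExtraction

model_id = "BAAI/bge-base-en-v1.5"
tokenizer = AutoTokenizer.from_pretrained(model_id)
pt_model = AutoModel.from_pretrained(model_id).eval()
ort_model = ORTModelForFeatureExtraction.from_pretrained("onnx_model")

text = "some benchmark sentence " * 100  # long enough to survive truncation
for length in [16, 64, 128, 256, 512]:
    # pad/truncate every input to exactly `length` tokens
    inputs_pt = tokenizer(text, padding="max_length", truncation=True,
                          max_length=length, return_tensors="pt")
    inputs_np = tokenizer(text, padding="max_length", truncation=True,
                          max_length=length, return_tensors="np")

    start = time.time()
    with torch.no_grad():
        for _ in range(50):
            pt_model(**inputs_pt)
    pt_time = time.time() - start

    start = time.time()
    for _ in range(50):
        ort_model(**inputs_np)
    ort_time = time.time() - start

    print(f"length={length:4d}  transformers: {pt_time:.2f}s  onnx: {ort_time:.2f}s")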