Sure, the code I used for exporting to ONNX was:
import logging
import argparse
from huggingface_hub import login
import os
from optimum.onnxruntime import ORTModelForFeatureExtraction
from transformers import AutoTokenizer
from optimum.onnxruntime import ORTOptimizer, ORTQuantizer
from optimum.onnxruntime.configuration import AutoOptimizationConfig, AutoQuantizationConfig
def parse_args():
    parser = argparse.ArgumentParser()
    parser.add_argument("--model_id", type=str, default="BAAI/bge-base-en-v1.5")
    parser.add_argument("--hf_token", type=str, default=None)
    # argparse's type=bool treats any non-empty string as True, so use store_true flags instead
    parser.add_argument("--quantize", action="store_true")
    parser.add_argument("--optimize", action="store_true")
    parser.add_argument("--full_precision_output_dir", type=str, default="onnx_model")
    parser.add_argument("--optimized_output_dir", type=str, default="onnx_model_optimized")
    parser.add_argument("--quantized_output_dir", type=str, default="onnx_model_quantized")
    return parser.parse_args()
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    logging.info("Parsing arguments...\n")
    args = parse_args()
    model_id = args.model_id
    hf_token = args.hf_token
    quantize = args.quantize
    optimize = args.optimize
    full_precision_output_dir = args.full_precision_output_dir
    optimized_output_dir = args.optimized_output_dir
    quantized_output_dir = args.quantized_output_dir

    logging.info("Logging in to Hugging Face...\n")
    if args.hf_token:
        os.environ["HF_TOKEN"] = hf_token
        login(token=hf_token)

    logging.info("Loading model...\n")
    # model = ORTModelForFeatureExtraction.from_pretrained(model_id, file_name="onnx/model.onnx")
    model = ORTModelForFeatureExtraction.from_pretrained(model_id, export=True)
    tokenizer = AutoTokenizer.from_pretrained(model_id)

    logging.info("Checking if ONNX model directory exists...\n")
    if os.path.exists(full_precision_output_dir):
        logging.info("ONNX model directory already exists. Deleting it...\n")
        os.system(f"rm -r {full_precision_output_dir}")

    logging.info("Exporting model to ONNX...\n")
    # save onnx checkpoint and tokenizer
    model.save_pretrained(full_precision_output_dir)
    tokenizer.save_pretrained(full_precision_output_dir)
    if optimize:
        logging.info("Optimizing...\n")
        logging.info("Checking if optimized model directory exists...\n")
        if os.path.exists(optimized_output_dir):
            logging.info("Optimized model directory already exists. Deleting it...\n")
            os.system(f"rm -r {optimized_output_dir}")
        logging.info("Building optimizer...\n")
        optimizer = ORTOptimizer.from_pretrained(full_precision_output_dir)
        optimization_config = AutoOptimizationConfig.O2()
        logging.info("Optimizing model...\n")
        optimizer.optimize(save_dir=optimized_output_dir, optimization_config=optimization_config)
        tokenizer.save_pretrained(optimized_output_dir)

    if quantize:
        logging.info("Quantizing...\n")
        logging.info("Loading model to quantize...\n")
        if optimize:
            logging.info("Loading optimized model...\n")
            model = ORTModelForFeatureExtraction.from_pretrained(optimized_output_dir)
        else:
            model = ORTModelForFeatureExtraction.from_pretrained(full_precision_output_dir)
        # create ORTQuantizer and define quantization configuration
        logging.info("Creating quantizer...\n")
        quantizer = ORTQuantizer.from_pretrained(model)
        dqconfig = AutoQuantizationConfig.avx512_vnni(is_static=False, per_channel=False)
        logging.info("Quantizing model...\n")
        model_quantized_path = quantizer.quantize(
            save_dir=quantized_output_dir,
            quantization_config=dqconfig,
        )
        tokenizer.save_pretrained(model_quantized_path)
This runs on CPU and, depending on the arguments, it saves up to three versions of the model. The one I am referencing is the full-precision model.
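Just to illustrate what the export produces, here is a minimal sketch (assuming the default output directories from the script above) that loads the full-precision export back and embeds a sentence, mirroring what benchmark_onnx in the script below does:

import numpy as np
from transformers import AutoTokenizer
from optimum.onnxruntime import ORTModelForFeatureExtraction

# "onnx_model" is the default --full_precision_output_dir from the export script above
tokenizer = AutoTokenizer.from_pretrained("onnx_model")
model = ORTModelForFeatureExtraction.from_pretrained("onnx_model")

inputs = tokenizer("A quick sanity-check sentence", padding=True, truncation=True, return_tensors="np")
output = model(**inputs)
sentence_embedding = output[0][:, 0]  # CLS-token pooling, as in the benchmark code
sentence_embedding = sentence_embedding / np.linalg.norm(sentence_embedding, axis=1, keepdims=True)
print(sentence_embedding.shape)  # (1, 768) for bge-base-en-v1.5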
The performance testing code is:
import os
import argparse
import logging
from huggingface_hub import login
from transformers import AutoTokenizer, AutoModel
from sentence_transformers import SentenceTransformer
from optimum.onnxruntime import ORTModelForFeatureExtraction
import torch
import numpy as np
import time
def parse_args():
    parser = argparse.ArgumentParser()
    parser.add_argument("--model_id", type=str, default="BAAI/bge-base-en-v1.5")
    parser.add_argument("--hf_token", type=str, default=None)
    # store_true flags again, so --quantize / --optimize behave as expected
    parser.add_argument("--quantize", action="store_true")
    parser.add_argument("--optimize", action="store_true")
    parser.add_argument("--samples", type=int, default=1000)
    parser.add_argument("--max_length", type=int, default=500)
    return parser.parse_args()
def generate_samples(tokenizer, num_samples=100, max_length=500):
    # build random-length inputs by sampling token ids and decoding them back to text
    vocab_size = tokenizer.vocab_size
    samples = []
    for _ in range(num_samples):
        length = np.random.randint(1, max_length)
        input_ids = torch.randint(low=1, high=vocab_size, size=(length,)).tolist()
        string = tokenizer.decode(input_ids, skip_special_tokens=True)
        inputs_pt = tokenizer(string, padding=True, truncation=True, return_tensors="pt")
        inputs_np = tokenizer(string, padding=True, truncation=True, return_tensors="np")
        samples.append((inputs_pt, inputs_np, string))
    return samples
def benchmark_transformers(model_id: str, samples: list):
    model = AutoModel.from_pretrained(model_id)
    model.eval()
    model.to("cpu")
    start_time = time.time()
    with torch.no_grad():
        for sample, _, _ in samples:
            output = model(**sample)
            sentence_embeddings = output[0][:, 0]
            sentence_embeddings = torch.nn.functional.normalize(sentence_embeddings, p=2, dim=1)
    end_time = time.time()
    logging.info(f"Transformers time taken: {end_time - start_time} seconds\n")
    info = {
        "Framework": "Transformers",
        "Time": end_time - start_time,
    }
    return info
def benchmark_sentence_transformers(model_id: str, samples: list):
    model = SentenceTransformer(model_id)
    model.eval()
    model.to("cpu")
    start_time = time.time()
    with torch.no_grad():
        for _, _, string in samples:
            sentence_embeddings = model.encode(string, normalize_embeddings=True, show_progress_bar=False)
    end_time = time.time()
    logging.info(f"Sentence Transformers time taken: {end_time - start_time} seconds\n")
    info = {
        "Framework": "Sentence Transformers",
        "Time": end_time - start_time,
    }
    return info
def benchmark_onnx(model_id: str, samples: list):
    model = ORTModelForFeatureExtraction.from_pretrained("onnx_model")
    start_time = time.time()
    for _, sample, _ in samples:
        output = model(**sample)
        sentence_embeddings = output[0][:, 0]
        sentence_embeddings = sentence_embeddings / np.linalg.norm(sentence_embeddings, axis=1, keepdims=True)
    end_time = time.time()
    logging.info(f"ONNX time taken: {end_time - start_time} seconds\n")
    info = {
        "Framework": "ONNX",
        "Time": end_time - start_time,
    }
    return info
def benchmark_optimized(model_id: str, samples: list):
    model = ORTModelForFeatureExtraction.from_pretrained("onnx_model_optimized")
    start_time = time.time()
    for _, sample, _ in samples:
        output = model(**sample)
        sentence_embeddings = output[0][:, 0]
        sentence_embeddings = sentence_embeddings / np.linalg.norm(sentence_embeddings, axis=1, keepdims=True)
    end_time = time.time()
    logging.info(f"Optimized time taken: {end_time - start_time} seconds\n")
    info = {
        "Framework": "Optimized",
        "Time": end_time - start_time,
    }
    return info
def benchmark_quantized(model_id: str, samples: list):
    model = ORTModelForFeatureExtraction.from_pretrained("onnx_model_quantized")
    start_time = time.time()
    for _, sample, _ in samples:
        output = model(**sample)
        sentence_embeddings = output[0][:, 0]
        sentence_embeddings = sentence_embeddings / np.linalg.norm(sentence_embeddings, axis=1, keepdims=True)
    end_time = time.time()
    logging.info(f"Quantized time taken: {end_time - start_time} seconds\n")
    info = {
        "Framework": "Quantized",
        "Time": end_time - start_time,
    }
    return info
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    logging.info("Parsing arguments...\n")
    args = parse_args()
    model_id = args.model_id
    hf_token = args.hf_token
    quantize = args.quantize
    optimize = args.optimize
    samples = args.samples
    max_length = args.max_length

    logging.info("Logging in to Hugging Face...\n")
    if args.hf_token:
        os.environ["HF_TOKEN"] = hf_token
        login(token=hf_token)

    tokenizer = AutoTokenizer.from_pretrained(model_id)

    logging.info("Generating samples...\n")
    samples = generate_samples(tokenizer, num_samples=samples, max_length=max_length)

    infos = []
    logging.info("Benchmarking ONNX...\n")
    onnx_info = benchmark_onnx(model_id=model_id, samples=samples)
    infos.append(onnx_info)

    if optimize:
        logging.info("Benchmarking optimized...\n")
        optimized_info = benchmark_optimized(model_id=model_id, samples=samples)
        infos.append(optimized_info)

    if quantize:
        logging.info("Benchmarking quantized...\n")
        quantized_info = benchmark_quantized(model_id=model_id, samples=samples)
        infos.append(quantized_info)

    logging.info("Benchmarking transformers...\n")
    transformers_info = benchmark_transformers(model_id=model_id, samples=samples)
    infos.append(transformers_info)

    logging.info("Benchmarking sentence transformers...\n")
    sentence_transformers_info = benchmark_sentence_transformers(model_id=model_id, samples=samples)
    infos.append(sentence_transformers_info)

    results = "\n\n"
    for info in infos:
        results += f"{info['Framework']} took {info['Time']} seconds\n\n"
    logging.info(results)
It first generates a set of inputs of random length, up to a maximum defined by max_length. Then it runs each version of the model over these samples. Varying the max_length parameter reproduces the effect I was describing.
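As a rough sketch of how I vary it (assuming the benchmark above is saved as a module, here hypothetically named benchmark_embeddings.py), I just rerun the same functions with different max_length values:

import logging
from transformers import AutoTokenizer
# benchmark_embeddings is only a placeholder name for the benchmark script above
from benchmark_embeddings import generate_samples, benchmark_onnx, benchmark_transformers

logging.basicConfig(level=logging.INFO)
model_id = "BAAI/bge-base-en-v1.5"
tokenizer = AutoTokenizer.from_pretrained(model_id)

# sweep the maximum sample length and compare ONNX vs plain transformers timings
for max_length in (64, 128, 256, 500):
    samples = generate_samples(tokenizer, num_samples=200, max_length=max_length)
    onnx_info = benchmark_onnx(model_id=model_id, samples=samples)
    hf_info = benchmark_transformers(model_id=model_id, samples=samples)
    print(f"max_length={max_length}: ONNX {onnx_info['Time']:.2f}s, Transformers {hf_info['Time']:.2f}s")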