When exporting the question-answering model “deepset/minilm-uncased-squad2” to ONNX and applying dynamic quantization with Optimum, the resulting model is 68 MB.
The same model exported and dynamically quantized with ONNX Runtime directly is 32 MB.
Why is there a difference between the two exported models when the model and the quantization method are the same?
Optimum code to convert the model to ONNX and quantize it
from pathlib import Path
from optimum.onnxruntime import ORTModelForQuestionAnswering, ORTQuantizer
from optimum.onnxruntime.configuration import AutoQuantizationConfig
from optimum.pipelines import pipeline
from transformers import AutoTokenizer
model_checkpoint = "deepset/minilm-uncased-squad2"
save_directory = Path.home()/'onnx/optimum/minilm-uncased-squad2'
save_directory.mkdir(exist_ok=True, parents=True)
file_name = "minilm-uncased-squad2.onnx"
onnx_path = save_directory / file_name
# Load a model from transformers and export it through the ONNX format
model = ORTModelForQuestionAnswering.from_pretrained(model_checkpoint, from_transformers=True)
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
# Save the onnx model and tokenizer
model.save_pretrained(save_directory, file_name=file_name)
tokenizer.save_pretrained(save_directory)
# Define the quantization methodology
qconfig = AutoQuantizationConfig.avx2(is_static=False, per_channel=True)
quantizer = ORTQuantizer.from_pretrained(model_checkpoint, feature="question-answering")
# Apply dynamic quantization on the model
quantizer.export(
    onnx_model_path=onnx_path,
    onnx_quantized_model_output_path=save_directory / "minilm-uncased-squad2-quantized.onnx",
    quantization_config=qconfig,
)
quantizer.model.config.save_pretrained(save_directory)
# Quantized model size in MB
print(Path(save_directory / "minilm-uncased-squad2-quantized.onnx").stat().st_size / 1024**2)
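As a sanity check I reload the quantized file and run one example through the Optimum pipeline imported above (a minimal sketch, assuming from_pretrained accepts a file_name argument the way save_pretrained does):

# Reload the quantized model and answer one question with it
quantized_model = ORTModelForQuestionAnswering.from_pretrained(
    save_directory, file_name="minilm-uncased-squad2-quantized.onnx"
)
qa = pipeline("question-answering", model=quantized_model, tokenizer=tokenizer)
print(qa(question="Where do I live?", context="My name is Tim and I live in Sweden."))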
ONNX Runtime code
from transformers.convert_graph_to_onnx import convert
from transformers import AutoTokenizer
from pathlib import Path
model_ckpt = "deepset/minilm-uncased-squad2"
onnx_model_path = Path("../../onnx/minilm-uncased-squad2.onnx")
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
convert(framework="pt", model=model_ckpt, tokenizer=tokenizer,
        output=onnx_model_path, opset=12, pipeline_name="question-answering")
from onnxruntime.quantization import quantize_dynamic, QuantType
onnx_model_path = Path("../../onnx/minilm-uncased-squad2.onnx")
model_output = Path("../../onnx/minilm-uncased-squad2.quant.onnx")
quantize_dynamic(onnx_model_path, model_output, weight_type=QuantType.QInt8)
# Quantized model size in MB
print(model_output.stat().st_size / 1024**2)
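One difference I can already see: the Optimum config above sets per_channel=True, while onnxruntime's quantize_dynamic defaults to per_channel=False, so the two quantization settings may not be identical. To compare what each file actually stores, the initializer dtypes can be tallied with the onnx package (a diagnostic sketch; the paths assume the outputs of the two scripts above):

from collections import Counter
from pathlib import Path

import onnx
from onnx import TensorProto

for path in [
    Path.home() / "onnx/optimum/minilm-uncased-squad2/minilm-uncased-squad2-quantized.onnx",
    Path("../../onnx/minilm-uncased-squad2.quant.onnx"),
]:
    graph = onnx.load(str(path)).graph
    # Count how many weight tensors each file stores per dtype
    counts = Counter(TensorProto.DataType.Name(t.data_type) for t in graph.initializer)
    print(path.name, dict(counts))

If one file reports many FLOAT initializers alongside the INT8 ones, that would suggest some weights were left in fp32 and would explain the extra size.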
Thank you