I used optimum and onnxruntime to optimize and quantize a RoBERTa SQuAD question-answering model, following the example from this blog.
The code used to work fine until this week, when it suddenly started to break: quantizing the model now fails with an error suggesting that the quantizer cannot look up the data type of an attention-layer tensor.
RuntimeError: Unable to find data type for weight_name='/roberta/encoder/layer.0/attention/output/dense/MatMul_output_0'
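Nothing in my code changed, so I suspect an upgrade of one of the dependencies. As a first check (a sketch, untested; I am only guessing that the regression came in with a recent onnxruntime release), pinning onnxruntime back to an older version should show whether a dependency upgrade is the culprit:
! pip -q install "optimum[exporters]" "onnxruntime<1.16"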
Here is a minimal code example to reproduce the error.
! pip -q install "optimum[exporters,onnxruntime]"
from pathlib import Path
from transformers import AutoTokenizer
from optimum.onnxruntime import ORTModelForQuestionAnswering
model_id = "deepset/roberta-base-squad2"
onnx_path = Path("onnx")
task = "question-answering"
# load vanilla transformers model and export it to ONNX
# (note: from_transformers is deprecated in recent optimum releases in favor of export=True)
model = ORTModelForQuestionAnswering.from_pretrained(model_id, from_transformers=True)
tokenizer = AutoTokenizer.from_pretrained(model_id)
# save onnx checkpoint and tokenizer
model.save_pretrained(onnx_path)
tokenizer.save_pretrained(onnx_path)
from optimum.onnxruntime import ORTOptimizer
from optimum.onnxruntime.configuration import OptimizationConfig
# create ORTOptimizer and define optimization configuration
optimizer = ORTOptimizer.from_pretrained(onnx_path)
optimization_config = OptimizationConfig(optimization_level=99) # enable all optimizations
optimizer.optimize(save_dir=onnx_path, optimization_config=optimization_config)
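# (possibly relevant: optimization_level=99 also enables the transformer-specific
# fusions, and the tensor named in the error below is an attention MatMul output)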
from optimum.onnxruntime import ORTQuantizer
from optimum.onnxruntime.configuration import AutoQuantizationConfig
# create ORTQuantizer and define quantization configuration
quantizer = ORTQuantizer.from_pretrained(onnx_path, file_name="model_optimized.onnx")
# quantizer = ORTQuantizer.from_pretrained(onnx_path, file_name="model.onnx")  # alternative: quantize the unoptimized export
qconfig = AutoQuantizationConfig.avx512_vnni(is_static=False, per_channel=True)
# apply the quantization configuration to the model
quantizer.quantize(save_dir=onnx_path, quantization_config=qconfig)
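For reference, this snippet prints the installed package versions (plain importlib.metadata, nothing specific to my setup), which should help pin down which upgrade triggered the change:
from importlib.metadata import version
# report the versions of the packages involved in the repro
for pkg in ("optimum", "onnxruntime", "transformers"):
    print(pkg, version(pkg))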
Error message:
Creating dynamic quantizer: QOperator (mode: IntegerOps, schema: u8/s8, channel-wise: True)
Quantizing model...
---------------------------------------------------------------------------
RuntimeError Traceback (most recent call last)
Cell In[5], line 11
7 qconfig = AutoQuantizationConfig.avx512_vnni(is_static=False, per_channel=True)
10 # apply the quantization configuration to the model
---> 11 quantizer.quantize(save_dir=onnx_path, quantization_config=qconfig)
File /opt/conda/lib/python3.11/site-packages/optimum/onnxruntime/quantization.py:417, in ORTQuantizer.quantize(self, quantization_config, save_dir, file_suffix, calibration_tensors_range, use_external_data_format, preprocessor)
389 quantizer = quantizer_factory(
390 model=onnx_model,
391 static=quantization_config.is_static,
(...)
413 },
414 )
416 LOGGER.info("Quantizing model...")
--> 417 quantizer.quantize_model()
419 suffix = f"_{file_suffix}" if file_suffix else ""
420 quantized_model_path = save_dir.joinpath(f"{self.onnx_model_path.stem}{suffix}").with_suffix(".onnx")
File /opt/conda/lib/python3.11/site-packages/onnxruntime/quantization/onnx_quantizer.py:403, in ONNXQuantizer.quantize_model(self)
401 number_of_existing_new_nodes = len(self.new_nodes)
402 op_quantizer = CreateOpQuantizer(self, node)
--> 403 op_quantizer.quantize()
404 for i in range(number_of_existing_new_nodes, len(self.new_nodes)):
405 for output_name in self.new_nodes[i].output:
File /opt/conda/lib/python3.11/site-packages/onnxruntime/quantization/operators/matmul.py:78, in MatMulInteger.quantize(self)
76 # Add cast operation to cast matmulInteger output to float.
77 cast_op_output = matmul_integer_output + "_cast_output"
---> 78 otype = self.quantizer.get_tensor_type(node.output[0], mandatory=True)
79 cast_node = onnx.helper.make_node(
80 "Cast",
81 [matmul_integer_output],
(...)
84 to=otype,
85 )
86 nodes.append(cast_node)
File /opt/conda/lib/python3.11/site-packages/onnxruntime/quantization/onnx_quantizer.py:461, in ONNXQuantizer.get_tensor_type(self, tensor_name, mandatory)
459 if (not self.enable_subgraph_quantization) or (self.parent is None):
460 if mandatory:
--> 461 raise RuntimeError(f"Unable to find data type for weight_name={tensor_name!r}")
462 return None
463 otype = self.parent.is_valid_quantize_weight(tensor_name)
RuntimeError: Unable to find data type for weight_name='/roberta/encoder/layer.0/attention/output/dense/MatMul_output_0'
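From the traceback it looks like the quantizer cannot resolve the type of an intermediate MatMul output, so my guess is that the optimized graph is missing shape/type inference information. A possible workaround I am considering (a sketch, untested; quant_pre_process is onnxruntime's quantization pre-processing helper that re-runs shape inference) is to pre-process the optimized model before quantizing:
from onnxruntime.quantization.shape_inference import quant_pre_process
# re-run ONNX/symbolic shape inference so intermediate tensor types are known
quant_pre_process("onnx/model_optimized.onnx", "onnx/model_optimized_prep.onnx")
quantizer = ORTQuantizer.from_pretrained(onnx_path, file_name="model_optimized_prep.onnx")
quantizer.quantize(save_dir=onnx_path, quantization_config=qconfig)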
The full notebook is on GitHub: https://github.com/ddahlmeier/sutd-mlops-course-code/blob/main/03_optimize_onnx.ipynb