How to push a quantized model to the Hub

I quantized my model with PyTorch, but when I try to push model_quantized_qint8 to the Hub it doesn't work.

import torch
from torch import nn
from torch.quantization import quantize_dynamic
from transformers import AutoTokenizer, AutoModelForSequenceClassification

model_ckpt = "Sohaibsoussi/distilbert-base-uncased-distilled-finetuned-clinc"
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
model = AutoModelForSequenceClassification.from_pretrained(model_ckpt).to("cpu")

# Dynamic quantization to INT8 (replaces the nn.Linear layers)
model_quantized_qint8 = quantize_dynamic(model, {nn.Linear}, dtype=torch.qint8)
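
The push step itself isn't shown above; a minimal sketch of what the attempt might look like, assuming push_to_hub is called directly on the quantized model (the repo id below is only a placeholder):

# Hypothetical sketch of the push step (not shown in the original snippet);
# the repo id is a placeholder, not an existing repository.
repo_id = "your-username/distilbert-clinc-quantized-qint8"
model_quantized_qint8.push_to_hub(repo_id)
tokenizer.push_to_hub(repo_id)

Running the pipeline benchmark on the quantized model instead raises the error below.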

{
"name": "TypeError",
"message": "_batch_encode_plus() got an unexpected keyword argument 'dtype'",
"stack": "---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
Cell In[169], line 5
2 optim_type = "Distillation + Static Quantization"
4 pb = PerformanceBenchmark(pipe, clinc["test"], optim_type=optim_type)
----> 5 perf_metrics.update(pb.run_benchmark())

Cell In[3], line 20, in PerformanceBenchmark.run_benchmark(self)
18 metrics = {}
19 metrics[self.optim_type] = self.compute_size()
---> 20 metrics[self.optim_type].update(self.time_pipeline())
21 metrics[self.optim_type].update(self.compute_accuracy())
22 return metrics

Cell In[17], line 8, in time_pipeline(self, query)
6 # Warmup
7 for _ in range(10):
----> 8 _ = self.pipeline(query)
9 # Timed Run
10 for _ in range(100):

File ~/miniconda3/envs/langchain/lib/python3.8/site-packages/transformers/pipelines/text_classification.py:155, in TextClassificationPipeline.__call__(self, *args, **kwargs)
121 def __call__(self, *args, **kwargs):
122 """
123 Classify the text(s) given as inputs.
124
(…)
153 If top_k is used, one such dictionary is returned per label.
154 """
--> 155 result = super().__call__(*args, **kwargs)
156 # TODO try and retrieve it in a nicer way from _sanitize_parameters.
157 _legacy = "top_k" not in kwargs

File ~/miniconda3/envs/langchain/lib/python3.8/site-packages/transformers/pipelines/base.py:1196, in Pipeline.__call__(self, inputs, num_workers, batch_size, *args, **kwargs)
1188 return next(
1189 iter(
1190 self.get_iterator(
(…)
1193 )
1194 )
1195 else:
--> 1196 return self.run_single(inputs, preprocess_params, forward_params, postprocess_params)

File ~/miniconda3/envs/langchain/lib/python3.8/site-packages/transformers/pipelines/base.py:1202, in Pipeline.run_single(self, inputs, preprocess_params, forward_params, postprocess_params)
1201 def run_single(self, inputs, preprocess_params, forward_params, postprocess_params):
--> 1202 model_inputs = self.preprocess(inputs, **preprocess_params)
1203 model_outputs = self.forward(model_inputs, **forward_params)
1204 outputs = self.postprocess(model_outputs, **postprocess_params)

File ~/miniconda3/envs/langchain/lib/python3.8/site-packages/transformers/pipelines/text_classification.py:179, in TextClassificationPipeline.preprocess(self, inputs, **tokenizer_kwargs)
173 elif isinstance(inputs, list):
174 # This is likely an invalid usage of the pipeline attempting to pass text pairs.
175 raise ValueError(
176 "The pipeline received invalid inputs, if you are trying to send text pairs, you can try to send a"
177 ' dictionary {\"text\": \"My text\", \"text_pair\": \"My pair\"} in order to send a text pair.'
178 )
--> 179 return self.tokenizer(inputs, return_tensors=return_tensors, **tokenizer_kwargs)

File ~/miniconda3/envs/langchain/lib/python3.8/site-packages/transformers/tokenization_utils_base.py:2829, in PreTrainedTokenizerBase.__call__(self, text, text_pair, text_target, text_pair_target, add_special_tokens, padding, truncation, max_length, stride, is_split_into_words, pad_to_multiple_of, return_tensors, return_token_type_ids, return_attention_mask, return_overflowing_tokens, return_special_tokens_mask, return_offsets_mapping, return_length, verbose, **kwargs)
2827 if not self._in_target_context_manager:
2828 self._switch_to_input_mode()
--> 2829 encodings = self._call_one(text=text, text_pair=text_pair, **all_kwargs)
2830 if text_target is not None:
2831 self._switch_to_target_mode()

File ~/miniconda3/envs/langchain/lib/python3.8/site-packages/transformers/tokenization_utils_base.py:2935, in PreTrainedTokenizerBase._call_one(self, text, text_pair, add_special_tokens, padding, truncation, max_length, stride, is_split_into_words, pad_to_multiple_of, return_tensors, return_token_type_ids, return_attention_mask, return_overflowing_tokens, return_special_tokens_mask, return_offsets_mapping, return_length, verbose, **kwargs)
2915 return self.batch_encode_plus(
2916 batch_text_or_text_pairs=batch_text_or_text_pairs,
2917 add_special_tokens=add_special_tokens,
(…)
2932 **kwargs,
2933 )
2934 else:
--> 2935 return self.encode_plus(
2936 text=text,
2937 text_pair=text_pair,
2938 add_special_tokens=add_special_tokens,
2939 padding=padding,
2940 truncation=truncation,
2941 max_length=max_length,
2942 stride=stride,
2943 is_split_into_words=is_split_into_words,
2944 pad_to_multiple_of=pad_to_multiple_of,
2945 return_tensors=return_tensors,
2946 return_token_type_ids=return_token_type_ids,
2947 return_attention_mask=return_attention_mask,
2948 return_overflowing_tokens=return_overflowing_tokens,
2949 return_special_tokens_mask=return_special_tokens_mask,
2950 return_offsets_mapping=return_offsets_mapping,
2951 return_length=return_length,
2952 verbose=verbose,
2953 **kwargs,
2954 )

File ~/miniconda3/envs/langchain/lib/python3.8/site-packages/transformers/tokenization_utils_base.py:3008, in PreTrainedTokenizerBase.encode_plus(self, text, text_pair, add_special_tokens, padding, truncation, max_length, stride, is_split_into_words, pad_to_multiple_of, return_tensors, return_token_type_ids, return_attention_mask, return_overflowing_tokens, return_special_tokens_mask, return_offsets_mapping, return_length, verbose, **kwargs)
2998 # Backward compatibility for 'truncation_strategy', 'pad_to_max_length'
2999 padding_strategy, truncation_strategy, max_length, kwargs = self._get_padding_truncation_strategies(
3000 padding=padding,
3001 truncation=truncation,
(…)
3005 **kwargs,
3006 )
--> 3008 return self._encode_plus(
3009 text=text,
3010 text_pair=text_pair,
3011 add_special_tokens=add_special_tokens,
3012 padding_strategy=padding_strategy,
3013 truncation_strategy=truncation_strategy,
3014 max_length=max_length,
3015 stride=stride,
3016 is_split_into_words=is_split_into_words,
3017 pad_to_multiple_of=pad_to_multiple_of,
3018 return_tensors=return_tensors,
3019 return_token_type_ids=return_token_type_ids,
3020 return_attention_mask=return_attention_mask,
3021 return_overflowing_tokens=return_overflowing_tokens,
3022 return_special_tokens_mask=return_special_tokens_mask,
3023 return_offsets_mapping=return_offsets_mapping,
3024 return_length=return_length,
3025 verbose=verbose,
3026 **kwargs,
3027 )

File ~/miniconda3/envs/langchain/lib/python3.8/site-packages/transformers/tokenization_utils_fast.py:576, in PreTrainedTokenizerFast._encode_plus(self, text, text_pair, add_special_tokens, padding_strategy, truncation_strategy, max_length, stride, is_split_into_words, pad_to_multiple_of, return_tensors, return_token_type_ids, return_attention_mask, return_overflowing_tokens, return_special_tokens_mask, return_offsets_mapping, return_length, verbose, **kwargs)
554 def _encode_plus(
555 self,
556 text: Union[TextInput, PreTokenizedInput],
(…)
573 **kwargs,
574 ) -> BatchEncoding:
575 batched_input = [(text, text_pair)] if text_pair else [text]
--> 576 batched_output = self._batch_encode_plus(
577 batched_input,
578 is_split_into_words=is_split_into_words,
579 add_special_tokens=add_special_tokens,
580 padding_strategy=padding_strategy,
581 truncation_strategy=truncation_strategy,
582 max_length=max_length,
583 stride=stride,
584 pad_to_multiple_of=pad_to_multiple_of,
585 return_tensors=return_tensors,
586 return_token_type_ids=return_token_type_ids,
587 return_attention_mask=return_attention_mask,
588 return_overflowing_tokens=return_overflowing_tokens,
589 return_special_tokens_mask=return_special_tokens_mask,
590 return_offsets_mapping=return_offsets_mapping,
591 return_length=return_length,
592 verbose=verbose,
593 **kwargs,
594 )
596 # Return tensor is None, then we can remove the leading batch axis
597 # Overflowing tokens are returned as a batch of output so we keep them in this case
598 if return_tensors is None and not return_overflowing_tokens:

TypeError: _batch_encode_plus() got an unexpected keyword argument 'dtype'"
}
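
For context, the pipe, clinc and PerformanceBenchmark objects in the traceback don't appear in the snippet above. Presumably the quantized model is wrapped in a text-classification pipeline along these lines before benchmarking; this is only an assumed reconstruction (the query string is a made-up example), and it does not show where the extra dtype keyword that reaches the tokenizer comes from:

from transformers import pipeline

# Assumed reconstruction of the `pipe` object used by the benchmark in the
# traceback: wrap the dynamically quantized model and its tokenizer.
pipe = pipeline(
    "text-classification",
    model=model_quantized_qint8,
    tokenizer=tokenizer,
)

# Example query (placeholder text); this call goes through the same
# preprocess -> tokenizer path shown in the traceback.
print(pipe("transfer 100 dollars from checking to savings"))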