How to reduce model latency using quantization

I am using a transformers model for a paraphrasing task, and I am currently limited to CPU inference, so I want to reduce my model's latency (response time). I tried PyTorch quantization (roughly along the lines of the sketch below), but it did not work for me. Does anybody have a better solution?
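
For reference, this is roughly what I attempted (a minimal sketch, assuming dynamic quantization via torch.quantization.quantize_dynamic; the model tag is the same one used in my code below):

    import torch
    from transformers import AutoModelForSequenceClassification

    model = AutoModelForSequenceClassification.from_pretrained(
        'prithivida/parrot_adequacy_on_BART'
    )
    model.eval()

    # Dynamic quantization: weights of the Linear layers are stored as int8
    # and dequantized on the fly during matmul; activations stay float.
    quantized_model = torch.quantization.quantize_dynamic(
        model, {torch.nn.Linear}, dtype=torch.qint8
    )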

My Code:

    import torch
    from transformers import AutoModelForSequenceClassification, AutoTokenizer

    class Adequacy:  # class name assumed; the class line was missing from my paste

        def __init__(self, model_tag='prithivida/parrot_adequacy_on_BART'):
            self.nli_model = AutoModelForSequenceClassification.from_pretrained(model_tag)
            self.tokenizer = AutoTokenizer.from_pretrained(model_tag)

        def filter(self, input_phrase, para_phrases, adequacy_threshold, device="cpu"):
            top_adequacy_phrases = []
            self.nli_model = self.nli_model.to(device)  # move the model once, not per phrase
            for para_phrase in para_phrases:
                x = self.tokenizer.encode(input_phrase, para_phrase,
                                          return_tensors='pt', truncation='only_first')
                with torch.no_grad():  # inference only, no gradients needed
                    logits = self.nli_model(x.to(device))[0]
                # we throw away "neutral" (dim 1) and take the probability of
                # "entailment" (dim 2) as the adequacy score
                entail_contradiction_logits = logits[:, [0, 2]]
                probs = entail_contradiction_logits.softmax(dim=1)
                prob_label_is_true = probs[:, 1]
                adequacy_score = prob_label_is_true[0].item()
                if adequacy_score >= adequacy_threshold:
                    top_adequacy_phrases.append(para_phrase)
            return top_adequacy_phrases
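
For context, this is roughly how I call it (the example phrases and threshold are just illustrative):

    adequacy = Adequacy()
    kept = adequacy.filter(
        "How do I reset my password?",
        ["How can I change my password?", "What is a password?"],
        adequacy_threshold=0.90,
    )
    print(kept)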