Subclassing BertPreTrainedModel or RobertaPreTrainedModel works; however, subclassing the generic PreTrainedModel does not.

    import torch.nn as nn
    from transformers import BertModel, BertPreTrainedModel

    class bert_cls(BertPreTrainedModel):
        def __init__(self, config):
            super(bert_cls, self).__init__(config)

            self.num_labels = config.num_labels
            self.config = config

            # BERT backbone
            self.pre_trained_model = BertModel(config)

            cls_dropout = (config.classifier_dropout
                           if config.classifier_dropout is not None
                           else config.hidden_dropout_prob)

            # Two classification heads on top of the pooled output
            self.ccto_dropout = nn.Dropout(cls_dropout)
            self.ccto_cls = nn.Linear(config.hidden_size, config.num_labels)

            self.switch_dropout = nn.Dropout(cls_dropout)
            self.switch_cls = nn.Linear(config.hidden_size, config.num_labels)

        def forward(self, input_ids, mask_ids, segment_ids, ccto_label, switch_label):
            outputs = self.pre_trained_model(
                input_ids=input_ids,
                attention_mask=mask_ids,
                token_type_ids=segment_ids)

            pooled_output = outputs.pooler_output

            ccto_logits = self.ccto_cls(self.ccto_dropout(pooled_output))
            switch_logits = self.switch_cls(self.switch_dropout(pooled_output))

            if ccto_label is not None and switch_label is not None:
                loss_fct = nn.CrossEntropyLoss()
                ccto_loss = loss_fct(ccto_logits, ccto_label.view(-1))
                switch_loss = loss_fct(switch_logits, switch_label.view(-1))
                return ccto_loss, switch_loss, ccto_logits, switch_logits
            else:
                return ccto_logits, switch_logits

The model can then be instantiated with:

    from transformers import BertConfig, BertTokenizer

    tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
    config = BertConfig.from_pretrained('bert-base-cased')
    config.num_labels = 2
    classifiers_model = bert_cls(config)

This works, and the same pattern also works if every Bert* class is replaced with its Roberta* counterpart.
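For reference, the RoBERTa version would presumably be set up the same way; roberta_cls and the 'roberta-base' checkpoint below are illustrative names only, assuming a class defined analogously on top of RobertaPreTrainedModel and RobertaModel:

    from transformers import RobertaConfig, RobertaTokenizer

    tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
    config = RobertaConfig.from_pretrained('roberta-base')
    config.num_labels = 2
    classifiers_model = roberta_cls(config)  # hypothetical RoBERTa counterpart of bert_cls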
However, the same approach does not work with the following code, which subclasses PreTrainedModel directly so that one class can hold either backbone:

    import torch.nn as nn
    from transformers import BertModel, PreTrainedModel, RobertaModel

    class binary_cls(PreTrainedModel):
        def __init__(self, config):
            super(binary_cls, self).__init__(config)

            self.num_labels = config.num_labels
            self.config = config

            # Pick the backbone; args is defined elsewhere (e.g. an argparse namespace)
            if args.model_name == 'BERT':
                self.pre_trained_model = BertModel(config)
            elif args.model_name == 'RoBERTa':
                self.pre_trained_model = RobertaModel(config)

            cls_dropout = (config.classifier_dropout
                           if config.classifier_dropout is not None
                           else config.hidden_dropout_prob)

            # Same two classification heads as above
            self.ccto_dropout = nn.Dropout(cls_dropout)
            self.ccto_cls = nn.Linear(config.hidden_size, config.num_labels)

            self.switch_dropout = nn.Dropout(cls_dropout)
            self.switch_cls = nn.Linear(config.hidden_size, config.num_labels)

        def forward(self, input_ids, mask_ids, segment_ids, ccto_label, switch_label):
            outputs = self.pre_trained_model(
                input_ids=input_ids,
                attention_mask=mask_ids,
                token_type_ids=segment_ids)

            pooled_output = outputs.pooler_output

            ccto_logits = self.ccto_cls(self.ccto_dropout(pooled_output))
            switch_logits = self.switch_cls(self.switch_dropout(pooled_output))

            if ccto_label is not None and switch_label is not None:
                loss_fct = nn.CrossEntropyLoss()
                ccto_loss = loss_fct(ccto_logits, ccto_label.view(-1))
                switch_loss = loss_fct(switch_logits, switch_label.view(-1))
                return ccto_loss, switch_loss, ccto_logits, switch_logits
            else:
                return ccto_logits, switch_logits

It fails with the following error:

    config, model_kwargs = cls.config_class.from_pretrained(
    AttributeError: 'NoneType' object has no attribute 'from_pretrained'
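For context, config_class (the attribute named in the traceback) is a class-level attribute that from_pretrained() reads; it is filled in on the model-specific base classes but left as None on the generic PreTrainedModel. A quick check, just to illustrate the attribute the traceback points at, not a fix:

    from transformers import BertPreTrainedModel, PreTrainedModel

    print(PreTrainedModel.config_class)      # None on the generic base class
    print(BertPreTrainedModel.config_class)  # BertConfig on the BERT-specific base class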