How to save and load the custom Hugging face model including config.json file using pytorch

Model description

I add simple custom pytorch-crf layer on top of TokenClassification model. It will make the model more robust.

I train the model successfully but when I save the mode. The folder doesn’t have config.json file inside it. How to save the config.json file for this custom model ?

When I load the custom trained model, the last CRF layer was not there?

from torchcrf import CRF

model_checkpoint = "dslim/bert-base-NER"
tokenizer = BertTokenizer.from_pretrained(model_checkpoint,add_prefix_space=True)
bert_model = BertForTokenClassification.from_pretrained(
                        model_checkpoint,id2label=id2label,label2id=label2id)
bert_model.config.output_hidden_states=True


class BERT_CRF(nn.Module):
    
    def __init__(self, bert_model, num_labels):
        super(BERT_CRF, self).__init__()
        self.bert = bert_model
        self.dropout = nn.Dropout(0.25)
        
        self.classifier = nn.Linear(768, num_labels)

        self.crf = CRF(num_labels, batch_first = True)
    
    def forward(self, input_ids, attention_mask,  labels=None, token_type_ids=None):
        outputs = self.bert(input_ids, attention_mask=attention_mask)
        
        sequence_output = torch.stack((outputs[1][-1], outputs[1][-2], outputs[1][-3], outputs[1][-4])).mean(dim=0)
        sequence_output = self.dropout(sequence_output)
        
        emission = self.classifier(sequence_output) # [32,256,17]
        labels=labels.reshape(attention_mask.size()[0],attention_mask.size()[1])
        
        if labels is not None:    
            loss = -self.crf(log_soft(emission, 2), labels, mask=attention_mask.type(torch.uint8), reduction='mean')
            prediction = self.crf.decode(emission, mask=attention_mask.type(torch.uint8))
            return [loss, prediction]
                
        else:         
            prediction = self.crf.decode(emission, mask=attention_mask.type(torch.uint8))
            return prediction


model = BERT_CRF(bert_model, num_labels=len(label2id))
model.to(device)

args = TrainingArguments(
    "spanbert_crf_ner2",
    # evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    num_train_epochs=1,
    weight_decay=0.01,
    per_device_train_batch_size=8,
    # per_device_eval_batch_size=32
    fp16=True
    # bf16=True #Ampere GPU
)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_data,
    # eval_dataset=train_data,
    # data_collator=data_collator,
    # compute_metrics=compute_metrics,
    tokenizer=tokenizer)

trainer.train()
trainer.save_model("model_spanbert_ner")

Saved model

Saving model checkpoint to spanbert_crf_ner2/checkpoint-62500
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
tokenizer config file saved in spanbert_crf_ner2/checkpoint-62500/tokenizer_config.json
Special tokens file saved in spanbert_crf_ner2/checkpoint-62500/special_tokens_map.json


Training completed. Do not forget to share your model on huggingface.co/models =)


100%|██████████| 62500/62500 [15:30:27<00:00,  1.12it/s]
Saving model checkpoint to model_spanbert_ner
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
{'train_runtime': 55837.6817, 'train_samples_per_second': 17.909, 'train_steps_per_second': 1.119, 'train_loss': 1.8942613859863282, 'epoch': 2.0}
tokenizer config file saved in model_spanbert_ner/tokenizer_config.json
Special tokens file saved in model_spanbert_ner/special_tokens_map.json

Trained model last layer

(11): BertLayer(
            (attention): BertAttention(
              (self): BertSelfAttention(
                (query): Linear(in_features=768, out_features=768, bias=True)
                (key): Linear(in_features=768, out_features=768, bias=True)
                (value): Linear(in_features=768, out_features=768, bias=True)
                (dropout): Dropout(p=0.1, inplace=False)
              )
              (output): BertSelfOutput(
                (dense): Linear(in_features=768, out_features=768, bias=True)
                (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
                (dropout): Dropout(p=0.1, inplace=False)
              )
            )
            (intermediate): BertIntermediate(
              (dense): Linear(in_features=768, out_features=3072, bias=True)
              (intermediate_act_fn): GELUActivation()
            )
            (output): BertOutput(
              (dense): Linear(in_features=3072, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
          )
        )
      )
    )
    (dropout): Dropout(p=0.1, inplace=False)
    (classifier): Linear(in_features=768, out_features=21, bias=True)
  )
  (dropout): Dropout(p=0.25, inplace=False)
  (classifier): Linear(in_features=768, out_features=21, bias=True)
  (crf): CRF(num_tags=21)
)

When I loaded it after training:

model = AutoModelForTokenClassification.from_pretrained("model_spanbert_ner",ignore_mismatched_sizes=True)
tokenizer = AutoTokenizer.from_pretrained("model_spanbert_ner",model_max_length=512)



(11): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
          )
          (intermediate): BertIntermediate(
            (dense): Linear(in_features=768, out_features=3072, bias=True)
            (intermediate_act_fn): GELUActivation()
          )
          (output): BertOutput(
            (dense): Linear(in_features=3072, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
        )
      )
    )
  )
  (dropout): Dropout(p=0.1, inplace=False)
  (classifier): Linear(in_features=768, out_features=21, bias=True)

CRF layer was not there??