Share a Multi-Task Model on the huggingface Hub

Hi All
I try to share a multi-task model on the hub but i failed to load it after for inference. My Bert model works by having a shared BERT-style encoder transformer, and two different task heads for each task. The two heads are a binary classification head (num_label =2) and a sentiment classification head (num_label = 5)

Here is the code :

class SequenceClassificationHead(nn.Module):

 def __init__(self, hidden_size, num_labels, dropout_p=0.1):

    super().__init__()
    self.num_labels = num_labels
    self.dropout = nn.Dropout(dropout_p)
    self.classifier = nn.Linear(hidden_size, num_labels)

    self._init_weights()

def _init_weights(self):
  self.classifier.weight.data.normal_(mean=0.0, std=0.02)
  if self.classifier.bias is not None:
      self.classifier.bias.data.zero_()

def forward(self, sequence_output, pooled_output, labels=None, **kwargs):
  pooled_output = self.dropout(pooled_output)
  logits = self.classifier(pooled_output)

  loss = None
  if labels is not None:
      
      loss_fct = nn.CrossEntropyLoss()
      loss = loss_fct(
          logits.view(-1, self.num_labels), labels.long().view(-1)
      )

  return logits, loss

class MultiTaskModel(BertPreTrainedModel):

def __init__(self, checkpoint, tasks: List):
  super().__init__(PretrainedConfig())

  self.encoder = BertModel.from_pretrained(checkpoint)

  self.output_heads = nn.ModuleDict()
  for task in tasks:
      decoder = self._create_output_head(self.encoder.config.hidden_size, task)
      # ModuleDict requires keys to be strings
      self.output_heads[str(task.id)] = decoder

@staticmethod
def _create_output_head(encoder_hidden_size: int, task):
  if task.type == "seq_classification":
      return SequenceClassificationHead(encoder_hidden_size, task.num_labels)
  else:
      raise NotImplementedError()

def forward(
  self,
  input_ids=None,
  attention_mask=None,
  token_type_ids=None,
  position_ids=None,
  head_mask=None,
  inputs_embeds=None,
  labels=None,
  task_ids=None,
  **kwargs,

):

  outputs = self.encoder(
      input_ids=input_ids,
      attention_mask=attention_mask,
      token_type_ids=token_type_ids,
      position_ids=position_ids,
      head_mask=head_mask,
      inputs_embeds=inputs_embeds,
  )

  sequence_output, pooled_output = outputs[:2]
  unique_task_ids_list = torch.unique(task_ids).tolist()

  loss_list = []
  logits = None
  for unique_task_id in unique_task_ids_list:

      task_id_filter = task_ids == unique_task_id
      logits, task_loss = self.output_heads[str(unique_task_id)].forward(
          sequence_output[task_id_filter],
          pooled_output[task_id_filter],
          labels=None if labels is None else labels[task_id_filter],
          attention_mask=attention_mask[task_id_filter],
      )

I train it with the trainer API and share it with the Trainer API. It works but when i want to use it for inference and load from the hub i have this message :

loading file vocab.txt from cache at /root/.cache/huggingface/hub/models–HCKLab–BiBert-MultiTask/snapshots/f3523728d3e144c0b7d262f6ff924cc174bc0d03/vocab.txt
loading file tokenizer.json from cache at /root/.cache/huggingface/hub/models–HCKLab–BiBert-MultiTask/snapshots/f3523728d3e144c0b7d262f6ff924cc174bc0d03/tokenizer.json
loading file added_tokens.json from cache at None
loading file special_tokens_map.json from cache at /root/.cache/huggingface/hub/models–HCKLab–BiBert-MultiTask/snapshots/f3523728d3e144c0b7d262f6ff924cc174bc0d03/special_tokens_map.json
loading file tokenizer_config.json from cache at /root/.cache/huggingface/hub/models–HCKLab–BiBert-MultiTask/snapshots/f3523728d3e144c0b7d262f6ff924cc174bc0d03/tokenizer_config.json
loading configuration file config.json from cache at /root/.cache/huggingface/hub/models–HCKLab–BiBert-MultiTask/snapshots/f3523728d3e144c0b7d262f6ff924cc174bc0d03/config.json
Model config BertConfig {
“architectures”: [
“MultiTaskModel”
],
“attention_probs_dropout_prob”: 0.1,
“classifier_dropout”: null,
“hidden_act”: “gelu”,
“hidden_dropout_prob”: 0.1,
“hidden_size”: 768,
“initializer_range”: 0.02,
“intermediate_size”: 3072,
“layer_norm_eps”: 1e-12,
“max_position_embeddings”: 512,
“model_type”: “bert”,
“num_attention_heads”: 12,
“num_hidden_layers”: 12,
“pad_token_id”: 0,
“position_embedding_type”: “absolute”,
“torch_dtype”: “float32”,
“transformers_version”: “4.22.1”,
“type_vocab_size”: 2,
“use_cache”: true,
“vocab_size”: 30522
}

loading weights file pytorch_model.bin from cache at /root/.cache/huggingface/hub/models–HCKLab–BiBert-MultiTask/snapshots/f3523728d3e144c0b7d262f6ff924cc174bc0d03/pytorch_model.bin
Some weights of the model checkpoint at HCKLab/BiBert-MultiTask were not used when initializing BertModel: [‘encoder.encoder.layer.4.output.LayerNorm.bias’, ‘encoder.encoder.layer.3.attention.self.key.weight’, ‘encoder.encoder.layer.1.attention.self.query.bias’, ‘encoder.encoder.layer.4.attention.self.query.bias’, ‘encoder.encoder.layer.5.output.LayerNorm.bias’, ‘encoder.encoder.layer.4.attention.output.LayerNorm.bias’, ‘encoder.encoder.layer.11.attention.output.dense.bias’, ‘encoder.encoder.layer.2.attention.self.query.bias’, ‘encoder.pooler.dense.weight’, ‘encoder.encoder.layer.6.intermediate.dense.weight’, ‘encoder.encoder.layer.1.attention.self.key.bias’, ‘encoder.encoder.layer.7.attention.output.dense.weight’, ‘encoder.encoder.layer.9.attention.output.LayerNorm.weight’, ‘encoder.embeddings.LayerNorm.bias’, ‘encoder.encoder.layer.8.intermediate.dense.bias’, ‘encoder.encoder.layer.4.attention.output.LayerNorm.weight’, ‘encoder.encoder.layer.4.attention.self.value.weight’, ‘encoder.encoder.layer.5.output.dense.bias’, ‘encoder.encoder.layer.2.output.LayerNorm.weight’, ‘encoder.encoder.layer.5.output.LayerNorm.weight’, ‘encoder.encoder.layer.6.attention.output.LayerNorm.bias’, ‘encoder.encoder.layer.7.output.dense.weight’, ‘encoder.encoder.layer.7.intermediate.dense.bias’, ‘encoder.encoder.layer.9.output.dense.bias’, ‘encoder.encoder.layer.4.output.dense.weight’, ‘encoder.encoder.layer.10.attention.self.key.weight’, ‘encoder.encoder.layer.11.output.dense.bias’, ‘encoder.embeddings.position_embeddings.weight’, ‘encoder.encoder.layer.1.attention.self.value.bias’, ‘encoder.encoder.layer.6.attention.self.value.weight’, ‘encoder.encoder.layer.10.attention.self.value.bias’, ‘encoder.encoder.layer.6.attention.output.dense.bias’, ‘encoder.encoder.layer.5.attention.self.query.weight’, ‘encoder.encoder.layer.11.attention.output.dense.weight’, ‘encoder.encoder.layer.0.attention.output.LayerNorm.weight’, ‘encoder.encoder.layer.0.attention.self.key.weight’, ‘encoder.encoder.layer.11.attention.output.LayerNorm.bias’, ‘encoder.encoder.layer.1.attention.output.LayerNorm.bias’, ‘encoder.encoder.layer.3.output.LayerNorm.bias’, ‘encoder.encoder.layer.0.intermediate.dense.weight’, ‘encoder.encoder.layer.8.attention.self.query.weight’, ‘encoder.encoder.layer.10.attention.output.LayerNorm.bias’, ‘encoder.encoder.layer.3.attention.output.dense.bias’, ‘encoder.encoder.layer.3.output.LayerNorm.weight’, ‘encoder.encoder.layer.10.attention.self.key.bias’, ‘encoder.encoder.layer.1.attention.output.LayerNorm.weight’, ‘encoder.encoder.layer.5.attention.self.key.weight’, ‘encoder.encoder.layer.7.attention.self.key.weight’, ‘encoder.encoder.layer.9.attention.self.key.bias’, ‘encoder.encoder.layer.6.attention.self.query.bias’, ‘encoder.encoder.layer.9.output.LayerNorm.bias’, ‘encoder.encoder.layer.10.attention.output.dense.weight’, ‘encoder.encoder.layer.1.output.LayerNorm.bias’, ‘encoder.encoder.layer.0.output.dense.bias’, ‘encoder.encoder.layer.11.attention.self.value.weight’, ‘encoder.encoder.layer.6.attention.self.query.weight’, ‘encoder.encoder.layer.2.attention.output.LayerNorm.bias’, ‘output_heads.0.classifier.bias’, ‘encoder.encoder.layer.10.output.dense.weight’, ‘encoder.encoder.layer.5.attention.self.query.bias’, ‘encoder.encoder.layer.8.attention.output.dense.weight’, ‘encoder.encoder.layer.8.intermediate.dense.weight’, ‘encoder.encoder.layer.1.intermediate.dense.weight’, ‘encoder.encoder.layer.7.attention.self.query.bias’, ‘encoder.embeddings.token_type_embeddings.weight’, ‘encoder.encoder.layer.5.intermediate.dense.weight’, ‘encoder.encoder.layer.4.attention.output.dense.weight’, ‘encoder.encoder.layer.9.intermediate.dense.weight’, ‘encoder.encoder.layer.7.attention.output.LayerNorm.weight’, ‘encoder.encoder.layer.10.attention.output.dense.bias’, ‘encoder.encoder.layer.3.output.dense.weight’, ‘encoder.encoder.layer.11.attention.self.query.weight’, ‘encoder.encoder.layer.6.attention.self.key.bias’, ‘encoder.encoder.layer.8.output.dense.weight’, ‘encoder.encoder.layer.0.attention.self.value.bias’, ‘encoder.encoder.layer.0.attention.self.query.weight’, ‘encoder.pooler.dense.bias’, ‘encoder.encoder.layer.8.output.LayerNorm.bias’, ‘encoder.encoder.layer.6.attention.output.dense.weight’, ‘encoder.encoder.layer.7.attention.self.value.bias’, ‘encoder.embeddings.position_ids’, ‘encoder.encoder.layer.10.attention.self.value.weight’, ‘encoder.encoder.layer.10.output.dense.bias’, ‘encoder.encoder.layer.7.attention.output.LayerNorm.bias’, ‘output_heads.0.classifier.weight’, ‘encoder.encoder.layer.8.output.LayerNorm.weight’, ‘encoder.encoder.layer.6.attention.self.key.weight’, ‘encoder.encoder.layer.0.intermediate.dense.bias’, ‘encoder.encoder.layer.2.attention.output.LayerNorm.weight’, ‘encoder.embeddings.word_embeddings.weight’, ‘encoder.encoder.layer.4.attention.self.key.bias’, ‘encoder.encoder.layer.6.output.dense.bias’, ‘encoder.encoder.layer.2.attention.self.value.bias’, ‘encoder.encoder.layer.5.attention.self.key.bias’, ‘encoder.encoder.layer.2.attention.self.key.weight’, ‘encoder.encoder.layer.5.attention.output.LayerNorm.weight’, ‘encoder.encoder.layer.11.attention.self.key.bias’, ‘encoder.encoder.layer.1.attention.self.key.weight’, ‘encoder.encoder.layer.0.output.LayerNorm.bias’, ‘encoder.encoder.layer.2.attention.self.value.weight’, ‘encoder.encoder.layer.2.intermediate.dense.weight’, ‘encoder.encoder.layer.4.attention.self.query.weight’, ‘encoder.encoder.layer.5.attention.output.LayerNorm.bias’, ‘encoder.encoder.layer.5.attention.output.dense.weight’, ‘encoder.encoder.layer.9.intermediate.dense.bias’, ‘encoder.encoder.layer.3.attention.self.value.weight’, ‘encoder.encoder.layer.11.output.LayerNorm.weight’, ‘encoder.encoder.layer.6.attention.self.value.bias’, ‘encoder.encoder.layer.7.attention.output.dense.bias’, ‘encoder.encoder.layer.7.attention.self.query.weight’, ‘encoder.encoder.layer.3.intermediate.dense.bias’, ‘encoder.encoder.layer.11.attention.output.LayerNorm.weight’, ‘encoder.encoder.layer.1.attention.output.dense.bias’, ‘encoder.encoder.layer.11.attention.self.query.bias’, ‘encoder.encoder.layer.5.attention.output.dense.bias’, ‘encoder.encoder.layer.8.attention.self.value.bias’, ‘encoder.encoder.layer.7.output.LayerNorm.weight’, ‘output_heads.1.classifier.weight’, ‘encoder.encoder.layer.2.intermediate.dense.bias’, ‘encoder.encoder.layer.10.attention.output.LayerNorm.weight’, ‘encoder.encoder.layer.9.attention.self.value.bias’, ‘encoder.encoder.layer.10.output.LayerNorm.weight’, ‘encoder.encoder.layer.10.output.LayerNorm.bias’, ‘encoder.encoder.layer.5.attention.self.value.bias’, ‘encoder.encoder.layer.9.attention.self.query.bias’, ‘encoder.encoder.layer.8.attention.self.query.bias’, ‘encoder.encoder.layer.11.output.dense.weight’, ‘output_heads.1.classifier.bias’, ‘encoder.encoder.layer.4.attention.output.dense.bias’, ‘encoder.encoder.layer.2.output.dense.weight’, ‘encoder.encoder.layer.1.output.LayerNorm.weight’, ‘encoder.encoder.layer.2.attention.output.dense.bias’, ‘encoder.encoder.layer.9.output.LayerNorm.weight’, ‘encoder.encoder.layer.2.output.dense.bias’, ‘encoder.encoder.layer.9.attention.output.dense.bias’, ‘encoder.encoder.layer.10.attention.self.query.bias’, ‘encoder.encoder.layer.7.intermediate.dense.weight’, ‘encoder.encoder.layer.0.attention.output.dense.bias’, ‘encoder.encoder.layer.11.attention.self.value.bias’, ‘encoder.encoder.layer.3.intermediate.dense.weight’, ‘encoder.encoder.layer.3.attention.self.query.bias’, ‘encoder.encoder.layer.8.attention.self.value.weight’, ‘encoder.encoder.layer.11.intermediate.dense.bias’, ‘encoder.encoder.layer.5.output.dense.weight’, ‘encoder.encoder.layer.2.output.LayerNorm.bias’, ‘encoder.encoder.layer.10.intermediate.dense.weight’, ‘encoder.encoder.layer.11.intermediate.dense.weight’, ‘encoder.encoder.layer.5.attention.self.value.weight’, ‘encoder.encoder.layer.9.attention.output.dense.weight’, ‘encoder.encoder.layer.2.attention.output.dense.weight’, ‘encoder.encoder.layer.6.output.dense.weight’, ‘encoder.encoder.layer.1.output.dense.bias’, ‘encoder.encoder.layer.3.attention.self.value.bias’, ‘encoder.encoder.layer.3.attention.output.dense.weight’, ‘encoder.encoder.layer.4.intermediate.dense.bias’, ‘encoder.encoder.layer.0.attention.self.value.weight’, ‘encoder.encoder.layer.9.attention.output.LayerNorm.bias’, ‘encoder.encoder.layer.7.attention.self.value.weight’, ‘encoder.encoder.layer.10.intermediate.dense.bias’, ‘encoder.encoder.layer.5.intermediate.dense.bias’, ‘encoder.encoder.layer.8.output.dense.bias’, ‘encoder.encoder.layer.3.attention.output.LayerNorm.weight’, ‘encoder.encoder.layer.4.output.dense.bias’, ‘encoder.encoder.layer.4.output.LayerNorm.weight’, ‘encoder.encoder.layer.8.attention.output.LayerNorm.bias’, ‘encoder.encoder.layer.0.attention.output.LayerNorm.bias’, ‘encoder.encoder.layer.4.intermediate.dense.weight’, ‘encoder.encoder.layer.6.output.LayerNorm.weight’, ‘encoder.encoder.layer.9.attention.self.key.weight’, ‘encoder.encoder.layer.3.output.dense.bias’, ‘encoder.encoder.layer.0.attention.output.dense.weight’, ‘encoder.encoder.layer.9.output.dense.weight’, ‘encoder.encoder.layer.0.output.LayerNorm.weight’, ‘encoder.encoder.layer.11.output.LayerNorm.bias’, ‘encoder.encoder.layer.3.attention.self.query.weight’, ‘encoder.encoder.layer.0.attention.self.query.bias’, ‘encoder.encoder.layer.0.attention.self.key.bias’, ‘encoder.encoder.layer.3.attention.self.key.bias’, ‘encoder.encoder.layer.1.attention.output.dense.weight’, ‘encoder.encoder.layer.7.output.dense.bias’, ‘encoder.encoder.layer.9.attention.self.query.weight’, ‘encoder.encoder.layer.8.attention.output.LayerNorm.weight’, ‘encoder.encoder.layer.10.attention.self.query.weight’, ‘encoder.encoder.layer.4.attention.self.value.bias’, ‘encoder.encoder.layer.3.attention.output.LayerNorm.bias’, ‘encoder.encoder.layer.8.attention.output.dense.bias’, ‘encoder.encoder.layer.7.attention.self.key.bias’, ‘encoder.encoder.layer.0.output.dense.weight’, ‘encoder.encoder.layer.11.attention.self.key.weight’, ‘encoder.encoder.layer.8.attention.self.key.bias’, ‘encoder.embeddings.LayerNorm.weight’, ‘encoder.encoder.layer.2.attention.self.query.weight’, ‘encoder.encoder.layer.6.output.LayerNorm.bias’, ‘encoder.encoder.layer.7.output.LayerNorm.bias’, ‘encoder.encoder.layer.2.attention.self.key.bias’, ‘encoder.encoder.layer.6.intermediate.dense.bias’, ‘encoder.encoder.layer.6.attention.output.LayerNorm.weight’, ‘encoder.encoder.layer.9.attention.self.value.weight’, ‘encoder.encoder.layer.1.intermediate.dense.bias’, ‘encoder.encoder.layer.1.attention.self.query.weight’, ‘encoder.encoder.layer.4.attention.self.key.weight’, ‘encoder.encoder.layer.1.output.dense.weight’, ‘encoder.encoder.layer.8.attention.self.key.weight’, ‘encoder.encoder.layer.1.attention.self.value.weight’]

  • This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
  • This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
    Some weights of BertModel were not initialized from the model checkpoint at HCKLab/BiBert-MultiTask and are newly initialized: [‘encoder.layer.0.intermediate.dense.weight’, ‘encoder.layer.7.attention.self.key.weight’, ‘encoder.layer.6.attention.self.query.weight’, ‘encoder.layer.5.output.LayerNorm.bias’, ‘encoder.layer.7.attention.output.LayerNorm.weight’, ‘encoder.layer.6.attention.output.LayerNorm.weight’, ‘encoder.layer.3.output.dense.weight’, ‘encoder.layer.2.attention.output.dense.bias’, ‘encoder.layer.0.attention.output.LayerNorm.weight’, ‘encoder.layer.1.attention.self.key.bias’, ‘encoder.layer.0.attention.output.dense.weight’, ‘encoder.layer.1.attention.self.value.bias’, ‘encoder.layer.4.attention.self.value.weight’, ‘encoder.layer.1.attention.output.dense.bias’, ‘encoder.layer.7.intermediate.dense.bias’, ‘encoder.layer.2.output.LayerNorm.bias’, ‘encoder.layer.8.intermediate.dense.bias’, ‘encoder.layer.0.output.dense.bias’, ‘encoder.layer.10.intermediate.dense.weight’, ‘encoder.layer.5.attention.self.query.bias’, ‘encoder.layer.2.attention.self.query.weight’, ‘encoder.layer.5.attention.self.query.weight’, ‘encoder.layer.0.intermediate.dense.bias’, ‘encoder.layer.8.intermediate.dense.weight’, ‘encoder.layer.10.output.dense.bias’, ‘encoder.layer.0.attention.self.key.weight’, ‘encoder.layer.5.attention.output.dense.bias’, ‘encoder.layer.5.output.LayerNorm.weight’, ‘encoder.layer.7.intermediate.dense.weight’, ‘encoder.layer.8.output.dense.bias’, ‘encoder.layer.9.attention.self.key.bias’, ‘encoder.layer.11.output.dense.weight’, ‘encoder.layer.9.attention.self.key.weight’, ‘embeddings.LayerNorm.bias’, ‘encoder.layer.6.intermediate.dense.weight’, ‘encoder.layer.7.attention.self.query.bias’, ‘encoder.layer.1.intermediate.dense.weight’, ‘encoder.layer.7.attention.self.key.bias’, ‘encoder.layer.11.attention.output.dense.bias’, ‘encoder.layer.4.output.LayerNorm.weight’, ‘encoder.layer.7.attention.output.dense.weight’, ‘encoder.layer.11.attention.output.LayerNorm.bias’, ‘encoder.layer.5.output.dense.bias’, ‘encoder.layer.3.attention.self.query.bias’, ‘encoder.layer.8.attention.self.key.bias’, ‘encoder.layer.11.attention.self.query.bias’, ‘encoder.layer.1.attention.output.LayerNorm.weight’, ‘encoder.layer.4.attention.output.LayerNorm.weight’, ‘pooler.dense.bias’, ‘encoder.layer.3.intermediate.dense.weight’, ‘encoder.layer.10.attention.self.query.bias’, ‘encoder.layer.8.output.LayerNorm.weight’, ‘encoder.layer.7.attention.output.LayerNorm.bias’, ‘encoder.layer.4.output.LayerNorm.bias’, ‘encoder.layer.3.attention.self.query.weight’, ‘encoder.layer.1.output.dense.weight’, ‘encoder.layer.4.output.dense.bias’, ‘encoder.layer.10.attention.self.value.bias’, ‘encoder.layer.4.attention.self.query.weight’, ‘encoder.layer.7.output.dense.weight’, ‘encoder.layer.2.attention.self.query.bias’, ‘encoder.layer.1.intermediate.dense.bias’, ‘encoder.layer.10.output.LayerNorm.weight’, ‘encoder.layer.2.attention.self.value.bias’, ‘encoder.layer.11.attention.self.key.bias’, ‘encoder.layer.4.attention.output.LayerNorm.bias’, ‘encoder.layer.8.attention.output.dense.bias’, ‘encoder.layer.2.attention.self.value.weight’, ‘encoder.layer.6.output.LayerNorm.bias’, ‘encoder.layer.8.attention.self.key.weight’, ‘encoder.layer.0.attention.self.query.weight’, ‘encoder.layer.6.attention.self.query.bias’, ‘encoder.layer.8.attention.self.query.weight’, ‘encoder.layer.4.attention.output.dense.weight’, ‘encoder.layer.6.output.dense.weight’, ‘encoder.layer.11.attention.output.LayerNorm.weight’, ‘encoder.layer.9.attention.output.LayerNorm.weight’, ‘encoder.layer.11.output.dense.bias’, ‘encoder.layer.1.output.LayerNorm.weight’, ‘encoder.layer.1.attention.output.dense.weight’, ‘encoder.layer.6.attention.self.value.bias’, ‘encoder.layer.7.attention.output.dense.bias’, ‘encoder.layer.8.attention.self.value.bias’, ‘encoder.layer.5.attention.self.value.bias’, ‘encoder.layer.3.intermediate.dense.bias’, ‘encoder.layer.11.intermediate.dense.bias’, ‘encoder.layer.9.attention.self.value.bias’, ‘encoder.layer.1.attention.self.key.weight’, ‘encoder.layer.9.attention.self.query.weight’, ‘encoder.layer.9.attention.self.value.weight’, ‘encoder.layer.4.attention.self.key.weight’, ‘embeddings.LayerNorm.weight’, ‘encoder.layer.3.attention.output.LayerNorm.bias’, ‘encoder.layer.2.attention.self.key.weight’, ‘encoder.layer.9.intermediate.dense.weight’, ‘encoder.layer.8.attention.output.LayerNorm.weight’, ‘encoder.layer.5.intermediate.dense.bias’, ‘embeddings.token_type_embeddings.weight’, ‘encoder.layer.7.output.LayerNorm.bias’, ‘encoder.layer.7.attention.self.value.bias’, ‘encoder.layer.9.attention.self.query.bias’, ‘encoder.layer.3.attention.self.key.weight’, ‘encoder.layer.3.attention.output.dense.bias’, ‘encoder.layer.0.output.dense.weight’, ‘encoder.layer.6.attention.self.key.bias’, ‘encoder.layer.4.intermediate.dense.weight’, ‘encoder.layer.8.attention.self.value.weight’, ‘encoder.layer.10.attention.self.key.bias’, ‘encoder.layer.7.attention.self.value.weight’, ‘encoder.layer.11.attention.self.value.weight’, ‘pooler.dense.weight’, ‘encoder.layer.8.attention.self.query.bias’, ‘encoder.layer.0.attention.self.key.bias’, ‘encoder.layer.9.output.dense.weight’, ‘encoder.layer.10.attention.output.LayerNorm.weight’, ‘encoder.layer.9.output.LayerNorm.bias’, ‘encoder.layer.2.intermediate.dense.weight’, ‘encoder.layer.10.attention.self.query.weight’, ‘encoder.layer.11.attention.self.value.bias’, ‘encoder.layer.0.attention.output.dense.bias’, ‘encoder.layer.1.attention.self.value.weight’, ‘encoder.layer.0.output.LayerNorm.bias’, ‘encoder.layer.6.attention.self.key.weight’, ‘encoder.layer.6.attention.output.LayerNorm.bias’, ‘encoder.layer.7.attention.self.query.weight’, ‘encoder.layer.6.attention.output.dense.bias’, ‘encoder.layer.5.attention.self.value.weight’, ‘encoder.layer.3.attention.self.value.weight’, ‘encoder.layer.5.output.dense.weight’, ‘encoder.layer.4.intermediate.dense.bias’, ‘encoder.layer.5.attention.output.LayerNorm.weight’, ‘encoder.layer.1.output.LayerNorm.bias’, ‘encoder.layer.7.output.LayerNorm.weight’, ‘encoder.layer.3.output.LayerNorm.weight’, ‘encoder.layer.5.attention.output.dense.weight’, ‘encoder.layer.11.attention.self.key.weight’, ‘encoder.layer.9.attention.output.dense.bias’, ‘encoder.layer.6.output.dense.bias’, ‘encoder.layer.2.output.dense.weight’, ‘encoder.layer.11.intermediate.dense.weight’, ‘encoder.layer.11.output.LayerNorm.weight’, ‘encoder.layer.1.attention.self.query.bias’, ‘encoder.layer.2.attention.output.dense.weight’, ‘encoder.layer.2.output.LayerNorm.weight’, ‘encoder.layer.0.attention.self.query.bias’, ‘encoder.layer.1.attention.output.LayerNorm.bias’, ‘encoder.layer.9.attention.output.dense.weight’, ‘encoder.layer.10.intermediate.dense.bias’, ‘encoder.layer.9.intermediate.dense.bias’, ‘embeddings.word_embeddings.weight’, ‘encoder.layer.0.attention.output.LayerNorm.bias’, ‘encoder.layer.6.intermediate.dense.bias’, ‘encoder.layer.8.output.LayerNorm.bias’, ‘encoder.layer.4.output.dense.weight’, ‘encoder.layer.10.output.dense.weight’, ‘encoder.layer.9.output.dense.bias’, ‘encoder.layer.10.attention.output.dense.weight’, ‘encoder.layer.6.attention.output.dense.weight’, ‘encoder.layer.4.attention.self.query.bias’, ‘encoder.layer.6.output.LayerNorm.weight’, ‘encoder.layer.11.attention.self.query.weight’, ‘encoder.layer.2.attention.output.LayerNorm.weight’, ‘encoder.layer.1.attention.self.query.weight’, ‘encoder.layer.3.attention.self.key.bias’, ‘encoder.layer.7.output.dense.bias’, ‘encoder.layer.0.output.LayerNorm.weight’, ‘encoder.layer.3.attention.output.LayerNorm.weight’, ‘encoder.layer.5.intermediate.dense.weight’, ‘encoder.layer.6.attention.self.value.weight’, ‘encoder.layer.8.attention.output.dense.weight’, ‘encoder.layer.11.attention.output.dense.weight’, ‘encoder.layer.10.attention.output.LayerNorm.bias’, ‘encoder.layer.3.attention.self.value.bias’, ‘encoder.layer.10.attention.self.key.weight’, ‘encoder.layer.4.attention.output.dense.bias’, ‘encoder.layer.4.attention.self.key.bias’, ‘encoder.layer.5.attention.output.LayerNorm.bias’, ‘encoder.layer.10.output.LayerNorm.bias’, ‘encoder.layer.2.attention.output.LayerNorm.bias’, ‘encoder.layer.0.attention.self.value.bias’, ‘embeddings.position_embeddings.weight’, ‘encoder.layer.2.intermediate.dense.bias’, ‘encoder.layer.9.attention.output.LayerNorm.bias’, ‘encoder.layer.10.attention.output.dense.bias’, ‘encoder.layer.8.output.dense.weight’, ‘encoder.layer.11.output.LayerNorm.bias’, ‘encoder.layer.2.attention.self.key.bias’, ‘encoder.layer.4.attention.self.value.bias’, ‘encoder.layer.5.attention.self.key.weight’, ‘encoder.layer.8.attention.output.LayerNorm.bias’, ‘encoder.layer.9.output.LayerNorm.weight’, ‘encoder.layer.10.attention.self.value.weight’, ‘encoder.layer.1.output.dense.bias’, ‘encoder.layer.3.output.dense.bias’, ‘encoder.layer.3.attention.output.dense.weight’, ‘encoder.layer.2.output.dense.bias’, ‘encoder.layer.3.output.LayerNorm.bias’, ‘encoder.layer.0.attention.self.value.weight’, ‘encoder.layer.5.attention.self.key.bias’]
    You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.

To load it from the hub i do :

from transformers import BertModel
checkpoint =“HCKLab/BiBert-MultiTask”
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

model1 = MultiTaskModel(checkpoint, tasks).to(device)

Could you tell me what i miss and why when i shared the model after training to the hub i dont save the weights?

Thanks for your help.