Hi all,
I am trying to share a multi-task model on the Hub, but I fail to load it back afterwards for inference. My model works by having a shared BERT-style encoder transformer and two different task heads, one per task. The two heads are a binary classification head (num_labels = 2) and a sentiment classification head (num_labels = 5).
Here is the code :
class SequenceClassificationHead(nn.Module):
def __init__(self, hidden_size, num_labels, dropout_p=0.1): super().__init__() self.num_labels = num_labels self.dropout = nn.Dropout(dropout_p) self.classifier = nn.Linear(hidden_size, num_labels) self._init_weights() def _init_weights(self): self.classifier.weight.data.normal_(mean=0.0, std=0.02) if self.classifier.bias is not None: self.classifier.bias.data.zero_() def forward(self, sequence_output, pooled_output, labels=None, **kwargs): pooled_output = self.dropout(pooled_output) logits = self.classifier(pooled_output) loss = None if labels is not None: loss_fct = nn.CrossEntropyLoss() loss = loss_fct( logits.view(-1, self.num_labels), labels.long().view(-1) ) return logits, loss
# NOTE(review): this class arrived with its method bodies collapsed onto single
# lines, and forward() appears truncated (no visible loss aggregation or return
# statement) — reformat and complete it before reuse.
class MultiTaskModel(BertPreTrainedModel):
# NOTE(review): super().__init__(PretrainedConfig()) registers a bare, generic
# config, and all trained parameters live under an "encoder." prefix plus
# "output_heads.*". Reloading the saved checkpoint with plain
# BertModel.from_pretrained — or re-running this __init__, which fetches the
# *base* checkpoint — therefore cannot restore the fine-tuned weights; that is
# the exact symptom in the log below ("weights were not used" /
# "newly initialized"). Presumably the fix is to pass self.encoder.config to
# super().__init__ and to restore via load_state_dict of the full checkpoint —
# TODO confirm.
def __init__(self, checkpoint, tasks: List): super().__init__(PretrainedConfig()) self.encoder = BertModel.from_pretrained(checkpoint) self.output_heads = nn.ModuleDict() for task in tasks: decoder = self._create_output_head(self.encoder.config.hidden_size, task) # ModuleDict requires keys to be strings self.output_heads[str(task.id)] = decoder @staticmethod def _create_output_head(encoder_hidden_size: int, task): if task.type == "seq_classification": return SequenceClassificationHead(encoder_hidden_size, task.num_labels) else: raise NotImplementedError() def forward( self, input_ids=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None, labels=None, task_ids=None, **kwargs,
):
# forward: runs the shared encoder once, then routes each example to its task
# head by task_ids; per-task losses are collected in loss_list (their
# aggregation and the return value are not shown in this snippet).
outputs = self.encoder( input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, position_ids=position_ids, head_mask=head_mask, inputs_embeds=inputs_embeds, ) sequence_output, pooled_output = outputs[:2] unique_task_ids_list = torch.unique(task_ids).tolist() loss_list = [] logits = None for unique_task_id in unique_task_ids_list: task_id_filter = task_ids == unique_task_id logits, task_loss = self.output_heads[str(unique_task_id)].forward( sequence_output[task_id_filter], pooled_output[task_id_filter], labels=None if labels is None else labels[task_id_filter], attention_mask=attention_mask[task_id_filter], )
I train it with the Trainer API and also push it to the Hub with the Trainer API. That works, but when I load it back from the Hub to use it for inference I get this message:
loading file vocab.txt from cache at /root/.cache/huggingface/hub/models--HCKLab--BiBert-MultiTask/snapshots/f3523728d3e144c0b7d262f6ff924cc174bc0d03/vocab.txt
loading file tokenizer.json from cache at /root/.cache/huggingface/hub/models--HCKLab--BiBert-MultiTask/snapshots/f3523728d3e144c0b7d262f6ff924cc174bc0d03/tokenizer.json
loading file added_tokens.json from cache at None
loading file special_tokens_map.json from cache at /root/.cache/huggingface/hub/models--HCKLab--BiBert-MultiTask/snapshots/f3523728d3e144c0b7d262f6ff924cc174bc0d03/special_tokens_map.json
loading file tokenizer_config.json from cache at /root/.cache/huggingface/hub/models--HCKLab--BiBert-MultiTask/snapshots/f3523728d3e144c0b7d262f6ff924cc174bc0d03/tokenizer_config.json
loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--HCKLab--BiBert-MultiTask/snapshots/f3523728d3e144c0b7d262f6ff924cc174bc0d03/config.json
Model config BertConfig {
  "architectures": [
    "MultiTaskModel"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "torch_dtype": "float32",
  "transformers_version": "4.22.1",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}
loading weights file pytorch_model.bin from cache at /root/.cache/huggingface/hub/models--HCKLab--BiBert-MultiTask/snapshots/f3523728d3e144c0b7d262f6ff924cc174bc0d03/pytorch_model.bin
Some weights of the model checkpoint at HCKLab/BiBert-MultiTask were not used when initializing BertModel: [âencoder.encoder.layer.4.output.LayerNorm.biasâ, âencoder.encoder.layer.3.attention.self.key.weightâ, âencoder.encoder.layer.1.attention.self.query.biasâ, âencoder.encoder.layer.4.attention.self.query.biasâ, âencoder.encoder.layer.5.output.LayerNorm.biasâ, âencoder.encoder.layer.4.attention.output.LayerNorm.biasâ, âencoder.encoder.layer.11.attention.output.dense.biasâ, âencoder.encoder.layer.2.attention.self.query.biasâ, âencoder.pooler.dense.weightâ, âencoder.encoder.layer.6.intermediate.dense.weightâ, âencoder.encoder.layer.1.attention.self.key.biasâ, âencoder.encoder.layer.7.attention.output.dense.weightâ, âencoder.encoder.layer.9.attention.output.LayerNorm.weightâ, âencoder.embeddings.LayerNorm.biasâ, âencoder.encoder.layer.8.intermediate.dense.biasâ, âencoder.encoder.layer.4.attention.output.LayerNorm.weightâ, âencoder.encoder.layer.4.attention.self.value.weightâ, âencoder.encoder.layer.5.output.dense.biasâ, âencoder.encoder.layer.2.output.LayerNorm.weightâ, âencoder.encoder.layer.5.output.LayerNorm.weightâ, âencoder.encoder.layer.6.attention.output.LayerNorm.biasâ, âencoder.encoder.layer.7.output.dense.weightâ, âencoder.encoder.layer.7.intermediate.dense.biasâ, âencoder.encoder.layer.9.output.dense.biasâ, âencoder.encoder.layer.4.output.dense.weightâ, âencoder.encoder.layer.10.attention.self.key.weightâ, âencoder.encoder.layer.11.output.dense.biasâ, âencoder.embeddings.position_embeddings.weightâ, âencoder.encoder.layer.1.attention.self.value.biasâ, âencoder.encoder.layer.6.attention.self.value.weightâ, âencoder.encoder.layer.10.attention.self.value.biasâ, âencoder.encoder.layer.6.attention.output.dense.biasâ, âencoder.encoder.layer.5.attention.self.query.weightâ, âencoder.encoder.layer.11.attention.output.dense.weightâ, âencoder.encoder.layer.0.attention.output.LayerNorm.weightâ, âencoder.encoder.layer.0.attention.self.key.weightâ, 
âencoder.encoder.layer.11.attention.output.LayerNorm.biasâ, âencoder.encoder.layer.1.attention.output.LayerNorm.biasâ, âencoder.encoder.layer.3.output.LayerNorm.biasâ, âencoder.encoder.layer.0.intermediate.dense.weightâ, âencoder.encoder.layer.8.attention.self.query.weightâ, âencoder.encoder.layer.10.attention.output.LayerNorm.biasâ, âencoder.encoder.layer.3.attention.output.dense.biasâ, âencoder.encoder.layer.3.output.LayerNorm.weightâ, âencoder.encoder.layer.10.attention.self.key.biasâ, âencoder.encoder.layer.1.attention.output.LayerNorm.weightâ, âencoder.encoder.layer.5.attention.self.key.weightâ, âencoder.encoder.layer.7.attention.self.key.weightâ, âencoder.encoder.layer.9.attention.self.key.biasâ, âencoder.encoder.layer.6.attention.self.query.biasâ, âencoder.encoder.layer.9.output.LayerNorm.biasâ, âencoder.encoder.layer.10.attention.output.dense.weightâ, âencoder.encoder.layer.1.output.LayerNorm.biasâ, âencoder.encoder.layer.0.output.dense.biasâ, âencoder.encoder.layer.11.attention.self.value.weightâ, âencoder.encoder.layer.6.attention.self.query.weightâ, âencoder.encoder.layer.2.attention.output.LayerNorm.biasâ, âoutput_heads.0.classifier.biasâ, âencoder.encoder.layer.10.output.dense.weightâ, âencoder.encoder.layer.5.attention.self.query.biasâ, âencoder.encoder.layer.8.attention.output.dense.weightâ, âencoder.encoder.layer.8.intermediate.dense.weightâ, âencoder.encoder.layer.1.intermediate.dense.weightâ, âencoder.encoder.layer.7.attention.self.query.biasâ, âencoder.embeddings.token_type_embeddings.weightâ, âencoder.encoder.layer.5.intermediate.dense.weightâ, âencoder.encoder.layer.4.attention.output.dense.weightâ, âencoder.encoder.layer.9.intermediate.dense.weightâ, âencoder.encoder.layer.7.attention.output.LayerNorm.weightâ, âencoder.encoder.layer.10.attention.output.dense.biasâ, âencoder.encoder.layer.3.output.dense.weightâ, âencoder.encoder.layer.11.attention.self.query.weightâ, âencoder.encoder.layer.6.attention.self.key.biasâ, 
âencoder.encoder.layer.8.output.dense.weightâ, âencoder.encoder.layer.0.attention.self.value.biasâ, âencoder.encoder.layer.0.attention.self.query.weightâ, âencoder.pooler.dense.biasâ, âencoder.encoder.layer.8.output.LayerNorm.biasâ, âencoder.encoder.layer.6.attention.output.dense.weightâ, âencoder.encoder.layer.7.attention.self.value.biasâ, âencoder.embeddings.position_idsâ, âencoder.encoder.layer.10.attention.self.value.weightâ, âencoder.encoder.layer.10.output.dense.biasâ, âencoder.encoder.layer.7.attention.output.LayerNorm.biasâ, âoutput_heads.0.classifier.weightâ, âencoder.encoder.layer.8.output.LayerNorm.weightâ, âencoder.encoder.layer.6.attention.self.key.weightâ, âencoder.encoder.layer.0.intermediate.dense.biasâ, âencoder.encoder.layer.2.attention.output.LayerNorm.weightâ, âencoder.embeddings.word_embeddings.weightâ, âencoder.encoder.layer.4.attention.self.key.biasâ, âencoder.encoder.layer.6.output.dense.biasâ, âencoder.encoder.layer.2.attention.self.value.biasâ, âencoder.encoder.layer.5.attention.self.key.biasâ, âencoder.encoder.layer.2.attention.self.key.weightâ, âencoder.encoder.layer.5.attention.output.LayerNorm.weightâ, âencoder.encoder.layer.11.attention.self.key.biasâ, âencoder.encoder.layer.1.attention.self.key.weightâ, âencoder.encoder.layer.0.output.LayerNorm.biasâ, âencoder.encoder.layer.2.attention.self.value.weightâ, âencoder.encoder.layer.2.intermediate.dense.weightâ, âencoder.encoder.layer.4.attention.self.query.weightâ, âencoder.encoder.layer.5.attention.output.LayerNorm.biasâ, âencoder.encoder.layer.5.attention.output.dense.weightâ, âencoder.encoder.layer.9.intermediate.dense.biasâ, âencoder.encoder.layer.3.attention.self.value.weightâ, âencoder.encoder.layer.11.output.LayerNorm.weightâ, âencoder.encoder.layer.6.attention.self.value.biasâ, âencoder.encoder.layer.7.attention.output.dense.biasâ, âencoder.encoder.layer.7.attention.self.query.weightâ, âencoder.encoder.layer.3.intermediate.dense.biasâ, 
âencoder.encoder.layer.11.attention.output.LayerNorm.weightâ, âencoder.encoder.layer.1.attention.output.dense.biasâ, âencoder.encoder.layer.11.attention.self.query.biasâ, âencoder.encoder.layer.5.attention.output.dense.biasâ, âencoder.encoder.layer.8.attention.self.value.biasâ, âencoder.encoder.layer.7.output.LayerNorm.weightâ, âoutput_heads.1.classifier.weightâ, âencoder.encoder.layer.2.intermediate.dense.biasâ, âencoder.encoder.layer.10.attention.output.LayerNorm.weightâ, âencoder.encoder.layer.9.attention.self.value.biasâ, âencoder.encoder.layer.10.output.LayerNorm.weightâ, âencoder.encoder.layer.10.output.LayerNorm.biasâ, âencoder.encoder.layer.5.attention.self.value.biasâ, âencoder.encoder.layer.9.attention.self.query.biasâ, âencoder.encoder.layer.8.attention.self.query.biasâ, âencoder.encoder.layer.11.output.dense.weightâ, âoutput_heads.1.classifier.biasâ, âencoder.encoder.layer.4.attention.output.dense.biasâ, âencoder.encoder.layer.2.output.dense.weightâ, âencoder.encoder.layer.1.output.LayerNorm.weightâ, âencoder.encoder.layer.2.attention.output.dense.biasâ, âencoder.encoder.layer.9.output.LayerNorm.weightâ, âencoder.encoder.layer.2.output.dense.biasâ, âencoder.encoder.layer.9.attention.output.dense.biasâ, âencoder.encoder.layer.10.attention.self.query.biasâ, âencoder.encoder.layer.7.intermediate.dense.weightâ, âencoder.encoder.layer.0.attention.output.dense.biasâ, âencoder.encoder.layer.11.attention.self.value.biasâ, âencoder.encoder.layer.3.intermediate.dense.weightâ, âencoder.encoder.layer.3.attention.self.query.biasâ, âencoder.encoder.layer.8.attention.self.value.weightâ, âencoder.encoder.layer.11.intermediate.dense.biasâ, âencoder.encoder.layer.5.output.dense.weightâ, âencoder.encoder.layer.2.output.LayerNorm.biasâ, âencoder.encoder.layer.10.intermediate.dense.weightâ, âencoder.encoder.layer.11.intermediate.dense.weightâ, âencoder.encoder.layer.5.attention.self.value.weightâ, âencoder.encoder.layer.9.attention.output.dense.weightâ, 
âencoder.encoder.layer.2.attention.output.dense.weightâ, âencoder.encoder.layer.6.output.dense.weightâ, âencoder.encoder.layer.1.output.dense.biasâ, âencoder.encoder.layer.3.attention.self.value.biasâ, âencoder.encoder.layer.3.attention.output.dense.weightâ, âencoder.encoder.layer.4.intermediate.dense.biasâ, âencoder.encoder.layer.0.attention.self.value.weightâ, âencoder.encoder.layer.9.attention.output.LayerNorm.biasâ, âencoder.encoder.layer.7.attention.self.value.weightâ, âencoder.encoder.layer.10.intermediate.dense.biasâ, âencoder.encoder.layer.5.intermediate.dense.biasâ, âencoder.encoder.layer.8.output.dense.biasâ, âencoder.encoder.layer.3.attention.output.LayerNorm.weightâ, âencoder.encoder.layer.4.output.dense.biasâ, âencoder.encoder.layer.4.output.LayerNorm.weightâ, âencoder.encoder.layer.8.attention.output.LayerNorm.biasâ, âencoder.encoder.layer.0.attention.output.LayerNorm.biasâ, âencoder.encoder.layer.4.intermediate.dense.weightâ, âencoder.encoder.layer.6.output.LayerNorm.weightâ, âencoder.encoder.layer.9.attention.self.key.weightâ, âencoder.encoder.layer.3.output.dense.biasâ, âencoder.encoder.layer.0.attention.output.dense.weightâ, âencoder.encoder.layer.9.output.dense.weightâ, âencoder.encoder.layer.0.output.LayerNorm.weightâ, âencoder.encoder.layer.11.output.LayerNorm.biasâ, âencoder.encoder.layer.3.attention.self.query.weightâ, âencoder.encoder.layer.0.attention.self.query.biasâ, âencoder.encoder.layer.0.attention.self.key.biasâ, âencoder.encoder.layer.3.attention.self.key.biasâ, âencoder.encoder.layer.1.attention.output.dense.weightâ, âencoder.encoder.layer.7.output.dense.biasâ, âencoder.encoder.layer.9.attention.self.query.weightâ, âencoder.encoder.layer.8.attention.output.LayerNorm.weightâ, âencoder.encoder.layer.10.attention.self.query.weightâ, âencoder.encoder.layer.4.attention.self.value.biasâ, âencoder.encoder.layer.3.attention.output.LayerNorm.biasâ, âencoder.encoder.layer.8.attention.output.dense.biasâ, 
âencoder.encoder.layer.7.attention.self.key.biasâ, âencoder.encoder.layer.0.output.dense.weightâ, âencoder.encoder.layer.11.attention.self.key.weightâ, âencoder.encoder.layer.8.attention.self.key.biasâ, âencoder.embeddings.LayerNorm.weightâ, âencoder.encoder.layer.2.attention.self.query.weightâ, âencoder.encoder.layer.6.output.LayerNorm.biasâ, âencoder.encoder.layer.7.output.LayerNorm.biasâ, âencoder.encoder.layer.2.attention.self.key.biasâ, âencoder.encoder.layer.6.intermediate.dense.biasâ, âencoder.encoder.layer.6.attention.output.LayerNorm.weightâ, âencoder.encoder.layer.9.attention.self.value.weightâ, âencoder.encoder.layer.1.intermediate.dense.biasâ, âencoder.encoder.layer.1.attention.self.query.weightâ, âencoder.encoder.layer.4.attention.self.key.weightâ, âencoder.encoder.layer.1.output.dense.weightâ, âencoder.encoder.layer.8.attention.self.key.weightâ, âencoder.encoder.layer.1.attention.self.value.weightâ]
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertModel were not initialized from the model checkpoint at HCKLab/BiBert-MultiTask and are newly initialized: [âencoder.layer.0.intermediate.dense.weightâ, âencoder.layer.7.attention.self.key.weightâ, âencoder.layer.6.attention.self.query.weightâ, âencoder.layer.5.output.LayerNorm.biasâ, âencoder.layer.7.attention.output.LayerNorm.weightâ, âencoder.layer.6.attention.output.LayerNorm.weightâ, âencoder.layer.3.output.dense.weightâ, âencoder.layer.2.attention.output.dense.biasâ, âencoder.layer.0.attention.output.LayerNorm.weightâ, âencoder.layer.1.attention.self.key.biasâ, âencoder.layer.0.attention.output.dense.weightâ, âencoder.layer.1.attention.self.value.biasâ, âencoder.layer.4.attention.self.value.weightâ, âencoder.layer.1.attention.output.dense.biasâ, âencoder.layer.7.intermediate.dense.biasâ, âencoder.layer.2.output.LayerNorm.biasâ, âencoder.layer.8.intermediate.dense.biasâ, âencoder.layer.0.output.dense.biasâ, âencoder.layer.10.intermediate.dense.weightâ, âencoder.layer.5.attention.self.query.biasâ, âencoder.layer.2.attention.self.query.weightâ, âencoder.layer.5.attention.self.query.weightâ, âencoder.layer.0.intermediate.dense.biasâ, âencoder.layer.8.intermediate.dense.weightâ, âencoder.layer.10.output.dense.biasâ, âencoder.layer.0.attention.self.key.weightâ, âencoder.layer.5.attention.output.dense.biasâ, âencoder.layer.5.output.LayerNorm.weightâ, âencoder.layer.7.intermediate.dense.weightâ, âencoder.layer.8.output.dense.biasâ, âencoder.layer.9.attention.self.key.biasâ, âencoder.layer.11.output.dense.weightâ, âencoder.layer.9.attention.self.key.weightâ, âembeddings.LayerNorm.biasâ, âencoder.layer.6.intermediate.dense.weightâ, âencoder.layer.7.attention.self.query.biasâ, âencoder.layer.1.intermediate.dense.weightâ, âencoder.layer.7.attention.self.key.biasâ, âencoder.layer.11.attention.output.dense.biasâ, âencoder.layer.4.output.LayerNorm.weightâ, âencoder.layer.7.attention.output.dense.weightâ, 
âencoder.layer.11.attention.output.LayerNorm.biasâ, âencoder.layer.5.output.dense.biasâ, âencoder.layer.3.attention.self.query.biasâ, âencoder.layer.8.attention.self.key.biasâ, âencoder.layer.11.attention.self.query.biasâ, âencoder.layer.1.attention.output.LayerNorm.weightâ, âencoder.layer.4.attention.output.LayerNorm.weightâ, âpooler.dense.biasâ, âencoder.layer.3.intermediate.dense.weightâ, âencoder.layer.10.attention.self.query.biasâ, âencoder.layer.8.output.LayerNorm.weightâ, âencoder.layer.7.attention.output.LayerNorm.biasâ, âencoder.layer.4.output.LayerNorm.biasâ, âencoder.layer.3.attention.self.query.weightâ, âencoder.layer.1.output.dense.weightâ, âencoder.layer.4.output.dense.biasâ, âencoder.layer.10.attention.self.value.biasâ, âencoder.layer.4.attention.self.query.weightâ, âencoder.layer.7.output.dense.weightâ, âencoder.layer.2.attention.self.query.biasâ, âencoder.layer.1.intermediate.dense.biasâ, âencoder.layer.10.output.LayerNorm.weightâ, âencoder.layer.2.attention.self.value.biasâ, âencoder.layer.11.attention.self.key.biasâ, âencoder.layer.4.attention.output.LayerNorm.biasâ, âencoder.layer.8.attention.output.dense.biasâ, âencoder.layer.2.attention.self.value.weightâ, âencoder.layer.6.output.LayerNorm.biasâ, âencoder.layer.8.attention.self.key.weightâ, âencoder.layer.0.attention.self.query.weightâ, âencoder.layer.6.attention.self.query.biasâ, âencoder.layer.8.attention.self.query.weightâ, âencoder.layer.4.attention.output.dense.weightâ, âencoder.layer.6.output.dense.weightâ, âencoder.layer.11.attention.output.LayerNorm.weightâ, âencoder.layer.9.attention.output.LayerNorm.weightâ, âencoder.layer.11.output.dense.biasâ, âencoder.layer.1.output.LayerNorm.weightâ, âencoder.layer.1.attention.output.dense.weightâ, âencoder.layer.6.attention.self.value.biasâ, âencoder.layer.7.attention.output.dense.biasâ, âencoder.layer.8.attention.self.value.biasâ, âencoder.layer.5.attention.self.value.biasâ, âencoder.layer.3.intermediate.dense.biasâ, 
âencoder.layer.11.intermediate.dense.biasâ, âencoder.layer.9.attention.self.value.biasâ, âencoder.layer.1.attention.self.key.weightâ, âencoder.layer.9.attention.self.query.weightâ, âencoder.layer.9.attention.self.value.weightâ, âencoder.layer.4.attention.self.key.weightâ, âembeddings.LayerNorm.weightâ, âencoder.layer.3.attention.output.LayerNorm.biasâ, âencoder.layer.2.attention.self.key.weightâ, âencoder.layer.9.intermediate.dense.weightâ, âencoder.layer.8.attention.output.LayerNorm.weightâ, âencoder.layer.5.intermediate.dense.biasâ, âembeddings.token_type_embeddings.weightâ, âencoder.layer.7.output.LayerNorm.biasâ, âencoder.layer.7.attention.self.value.biasâ, âencoder.layer.9.attention.self.query.biasâ, âencoder.layer.3.attention.self.key.weightâ, âencoder.layer.3.attention.output.dense.biasâ, âencoder.layer.0.output.dense.weightâ, âencoder.layer.6.attention.self.key.biasâ, âencoder.layer.4.intermediate.dense.weightâ, âencoder.layer.8.attention.self.value.weightâ, âencoder.layer.10.attention.self.key.biasâ, âencoder.layer.7.attention.self.value.weightâ, âencoder.layer.11.attention.self.value.weightâ, âpooler.dense.weightâ, âencoder.layer.8.attention.self.query.biasâ, âencoder.layer.0.attention.self.key.biasâ, âencoder.layer.9.output.dense.weightâ, âencoder.layer.10.attention.output.LayerNorm.weightâ, âencoder.layer.9.output.LayerNorm.biasâ, âencoder.layer.2.intermediate.dense.weightâ, âencoder.layer.10.attention.self.query.weightâ, âencoder.layer.11.attention.self.value.biasâ, âencoder.layer.0.attention.output.dense.biasâ, âencoder.layer.1.attention.self.value.weightâ, âencoder.layer.0.output.LayerNorm.biasâ, âencoder.layer.6.attention.self.key.weightâ, âencoder.layer.6.attention.output.LayerNorm.biasâ, âencoder.layer.7.attention.self.query.weightâ, âencoder.layer.6.attention.output.dense.biasâ, âencoder.layer.5.attention.self.value.weightâ, âencoder.layer.3.attention.self.value.weightâ, âencoder.layer.5.output.dense.weightâ, 
âencoder.layer.4.intermediate.dense.biasâ, âencoder.layer.5.attention.output.LayerNorm.weightâ, âencoder.layer.1.output.LayerNorm.biasâ, âencoder.layer.7.output.LayerNorm.weightâ, âencoder.layer.3.output.LayerNorm.weightâ, âencoder.layer.5.attention.output.dense.weightâ, âencoder.layer.11.attention.self.key.weightâ, âencoder.layer.9.attention.output.dense.biasâ, âencoder.layer.6.output.dense.biasâ, âencoder.layer.2.output.dense.weightâ, âencoder.layer.11.intermediate.dense.weightâ, âencoder.layer.11.output.LayerNorm.weightâ, âencoder.layer.1.attention.self.query.biasâ, âencoder.layer.2.attention.output.dense.weightâ, âencoder.layer.2.output.LayerNorm.weightâ, âencoder.layer.0.attention.self.query.biasâ, âencoder.layer.1.attention.output.LayerNorm.biasâ, âencoder.layer.9.attention.output.dense.weightâ, âencoder.layer.10.intermediate.dense.biasâ, âencoder.layer.9.intermediate.dense.biasâ, âembeddings.word_embeddings.weightâ, âencoder.layer.0.attention.output.LayerNorm.biasâ, âencoder.layer.6.intermediate.dense.biasâ, âencoder.layer.8.output.LayerNorm.biasâ, âencoder.layer.4.output.dense.weightâ, âencoder.layer.10.output.dense.weightâ, âencoder.layer.9.output.dense.biasâ, âencoder.layer.10.attention.output.dense.weightâ, âencoder.layer.6.attention.output.dense.weightâ, âencoder.layer.4.attention.self.query.biasâ, âencoder.layer.6.output.LayerNorm.weightâ, âencoder.layer.11.attention.self.query.weightâ, âencoder.layer.2.attention.output.LayerNorm.weightâ, âencoder.layer.1.attention.self.query.weightâ, âencoder.layer.3.attention.self.key.biasâ, âencoder.layer.7.output.dense.biasâ, âencoder.layer.0.output.LayerNorm.weightâ, âencoder.layer.3.attention.output.LayerNorm.weightâ, âencoder.layer.5.intermediate.dense.weightâ, âencoder.layer.6.attention.self.value.weightâ, âencoder.layer.8.attention.output.dense.weightâ, âencoder.layer.11.attention.output.dense.weightâ, âencoder.layer.10.attention.output.LayerNorm.biasâ, âencoder.layer.3.attention.self.value.biasâ, 
âencoder.layer.10.attention.self.key.weightâ, âencoder.layer.4.attention.output.dense.biasâ, âencoder.layer.4.attention.self.key.biasâ, âencoder.layer.5.attention.output.LayerNorm.biasâ, âencoder.layer.10.output.LayerNorm.biasâ, âencoder.layer.2.attention.output.LayerNorm.biasâ, âencoder.layer.0.attention.self.value.biasâ, âembeddings.position_embeddings.weightâ, âencoder.layer.2.intermediate.dense.biasâ, âencoder.layer.9.attention.output.LayerNorm.biasâ, âencoder.layer.10.attention.output.dense.biasâ, âencoder.layer.8.output.dense.weightâ, âencoder.layer.11.output.LayerNorm.biasâ, âencoder.layer.2.attention.self.key.biasâ, âencoder.layer.4.attention.self.value.biasâ, âencoder.layer.5.attention.self.key.weightâ, âencoder.layer.8.attention.output.LayerNorm.biasâ, âencoder.layer.9.output.LayerNorm.weightâ, âencoder.layer.10.attention.self.value.weightâ, âencoder.layer.1.output.dense.biasâ, âencoder.layer.3.output.dense.biasâ, âencoder.layer.3.attention.output.dense.weightâ, âencoder.layer.2.output.dense.biasâ, âencoder.layer.3.output.LayerNorm.biasâ, âencoder.layer.0.attention.self.value.weightâ, âencoder.layer.5.attention.self.key.biasâ]
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
To load it from the Hub I do:
# Re-create the architecture, then restore the *trained* weights from the Hub.
# Instantiating MultiTaskModel(checkpoint, tasks) alone only runs
# BertModel.from_pretrained inside __init__, which expects bare BERT-style
# weight names — the fine-tuned checkpoint stores them under "encoder.*" and
# "output_heads.*", hence the "weights were not used / newly initialized"
# warnings. Loading the full state dict onto the wrapper model fixes that.
import torch
from transformers import AutoTokenizer
from transformers.utils import cached_file

checkpoint = "HCKLab/BiBert-MultiTask"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

model1 = MultiTaskModel(checkpoint, tasks)
state_dict = torch.load(
    cached_file(checkpoint, "pytorch_model.bin"), map_location="cpu"
)
model1.load_state_dict(state_dict)
model1 = model1.to(device)
Could you tell me what I am missing, and why the trained weights do not seem to be restored when I share the model to the Hub after training?
Thanks for your help.