Difference between "Auto Model" and "Auto Model For Token Classification" in BERT fine tuning

Hi everyone,

There is a question that has been bugging me and that I cannot resolve on my own. I have two models for fine-tuning a BERT model.

The first model follows the run_ner script exactly and uses from transformers import AutoModelForTokenClassification, where the pretrained model is "Musixmatch/umberto-wikipedia-uncased-v1" (a BERT for Italian, but I don't think it matters which BERT variant I used).
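For context, this is roughly how that first model gets loaded (a minimal sketch of the relevant run_ner lines as I understand them; num_labels is a placeholder):

from transformers import AutoConfig, AutoTokenizer, AutoModelForTokenClassification

model_name = "Musixmatch/umberto-wikipedia-uncased-v1"
num_labels = 10  # placeholder: the number of tags in the dataset

config = AutoConfig.from_pretrained(model_name, num_labels=num_labels)
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(model_name, config=config)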

The second model is the following:

class Bert(object):
    """A facade to Bert model that extracts features for sets of tokens"""

    def __init__(self, layer_indexes, max_seq_length, batch_size, multi_lingual=False, which_cuda = 0):
        pretrained_model =  "Musixmatch/umberto-wikipedia-uncased-v1"

        # ... setting all parameters on self ...

        self.tokenizer = AutoTokenizer.from_pretrained(pretrained_model, do_lower_case=not multi_lingual)
        self.model = AutoModel.from_pretrained(pretrained_model).to(self.device)
        # tells pytorch to run in evaluation mode instead of training
        self.model.eval() 
        
    def get_bert_features(self, sentence):
        ## sentence is in the format ['tok1', 'tok2']
        bert_tokens, map_to_original_tokens = self.convert_to_bert_tokenization(sentence)
        feature = self.from_bert_tokens_to_features(bert_tokens, map_to_original_tokens)
        features = [feature]
        
        # get ids
        all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
        # mask with 0's for placeholders
        all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long)
        # run the encoder; the first output contains the last layer's hidden states
        all_encoder_layers, _ = self.model(all_input_ids, token_type_ids=None, attention_mask=all_input_mask)
        last_layer = all_encoder_layers[-1]
        
        return bert_tokens, map_to_original_tokens, last_layer

    def extract_bert_features(self, conll_dataset):
        sentences = [[e.form for e in sentence] for sentence in conll_dataset]
        # data loading
        features = []
        for sentence in sentences:
            bert_tokens, map_to_original_tokens = self.convert_to_bert_tokenization(sentence)
            feature = self.from_bert_tokens_to_features(bert_tokens, map_to_original_tokens)
            features.append(feature)
        
        all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
        # mask with 0's for placeholders
        all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long)
        # maps from BERT word-piece positions back to the original tokens
        all_token_maps = torch.tensor([f.map_to_original_tokens for f in features], dtype=torch.long)
        # indexes 0...n-1 (n = number of examples) used to map back to the dataset
        all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long)
        
        # create a dataset from the tensors built above
        eval_data = TensorDataset(all_input_ids, all_input_mask, all_token_maps, all_example_index)
        # create a sampler which will be used to create the batches
        eval_sampler = SequentialSampler(eval_data)
        eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=self.batch_size)

        for input_ids, input_mask, token_maps, example_indices in eval_dataloader:
            input_ids = input_ids.to(self.device)
            input_mask = input_mask.to(self.device)
            # first output of the model: the last layer's hidden states for every token
            all_encoder_layers, _ = self.model(input_ids, token_type_ids=None, attention_mask=input_mask)
            averaged_output = all_encoder_layers
           
            for i, idx in enumerate(example_indices):
                for j, coll_entry in enumerate(conll_dataset[idx]):
                    # clamp positions beyond BERT's 512 word-piece limit
                    if token_maps[i,j] < 511:
                        coll_entry.bert = averaged_output[i,token_maps[i,j]].clone().detach().cpu()
                    else:
                        coll_entry.bert = averaged_output[i,token_maps[i,511]].clone().detach().cpu()

where I use the same BERT model, but WITHOUT the final fine-tuning layer:

(dropout): Dropout(p=0.1, inplace=False)
(classifier): Linear(in_features=768, out_features=X, bias=True)

because I define my fine-tuning layer inside ModelPaT in the following way:

class ModelPaT(nn.Module):

    def __init__(self, args, word_vocab, tag_vocab, pos_vocab, deprel_vocab, char_vocab):
        super().__init__()

        # ... setting all parameters on self ...

        if self.bert:
            self.bilstm_input_size = self.bert_hidden_size  # 768

        # FINE TUNING, I think...

        self.hidden2_to_pos = nn.Linear(
            in_features=self.bilstm_input_size, # 768 = BERT layer
            out_features=len(self.pos_vocab),
        )

        self.hidden2_to_dep = nn.Linear(
            in_features=self.bilstm_input_size, # 768 = BERT layer 
            out_features=len(self.deprel_vocab),
        )

    def forward(self, sentences):
        orig_w = [[e.form for e in sentence] for sentence in sentences]  # all tokens from a given sentence
        # print("token: " + str(orig_w))
        w, t, x_lengths = self.sentence2tok_tags(sentences)

        batch_size, seq_len = w.size()
        # (batch_size, seq_len) -> (batch_size, seq_len, embedding_dim)

        if self.bert:
            # get bert features from model
            bert_features_list = [[e.bert for e in sentence] for sentence in sentences]
            # convert list to one tensor
            bert_features_tensor = from_tensor_list_to_one_tensor(bert_features_list, self.bert_hidden_size).to(self.device)

        # (batch_size, seq_len, embedding_dim) -> (batch_size, seq_len, n_lstm_units)
        x = torch.nn.utils.rnn.pack_padded_sequence(bert_features_tensor, x_lengths, batch_first=True)
        x, _ = torch.nn.utils.rnn.pad_packed_sequence(x, batch_first=True)
        # (batch_size, seq_len, n_lstm_units) -> (batch_size * seq_len, n_lstm_units)
        x = x.contiguous()
        x = x.view(-1, x.shape[2])

        y1 = self.hidden2_to_pos(x)
        y2 = self.hidden2_to_dep(x)

        if self.mode == 'evaluation':
            y1 = F.softmax(y1, dim=1)
            y2 = F.softmax(y2, dim=1)

        # (batch_size * seq_len, n_lstm_units) -> (batch_size, seq_len, n_tags)
        y1 = y1.view(batch_size, seq_len, len(self.pos_vocab))
        y2 = y2.view(batch_size, seq_len, len(self.deprel_vocab))

        return y1, y2

where I get:

ModelPat(
  (dropout): Dropout(p=0.1, inplace=False)
  (hidden2_to_pos): Linear(in_features=768, out_features=X, bias=True)
  (hidden2_to_dep): Linear(in_features=768, out_features=Y, bias=True)
)
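For comparison, my understanding is that the classification head of the first model boils down to a dropout followed by a linear layer applied to every token's hidden state, something like this sketch (all sizes here are made up; num_labels plays the role of X):

import torch
import torch.nn as nn

batch, seq_len, hidden_size, num_labels = 2, 16, 768, 5  # made-up sizes
sequence_output = torch.randn(batch, seq_len, hidden_size)  # stand-in for BERT's last hidden states

dropout = nn.Dropout(p=0.1)
classifier = nn.Linear(hidden_size, num_labels)
logits = classifier(dropout(sequence_output))  # (batch, seq_len, num_labels): one score per token and label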

Now these two models seem to have the same architecture for BERT fine-tuning (the same BERT model with a Linear layer added on top as the fine-tuning head), yet I get very different results on the same dataset. After several tests, I suspect that the problem is not so much the architecture, which indeed seems to be the same, but the way the pretrained BERT weights are loaded. Could this be the reason I get different accuracy results?
If so, how could I modify the second model to behave like the first and obtain the same results?
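In case it helps, this is the kind of check I had in mind to test the loading hypothesis (just a sketch; the num_labels value is a placeholder):

import torch
from transformers import AutoModel, AutoModelForTokenClassification

name = "Musixmatch/umberto-wikipedia-uncased-v1"
plain = AutoModel.from_pretrained(name)
with_head = AutoModelForTokenClassification.from_pretrained(name, num_labels=5)  # placeholder

# compare the encoder weights actually loaded by the two entry points
base = with_head.base_model.state_dict()
for key, value in plain.state_dict().items():
    if key in base and not torch.equal(value, base[key]):
        print("weights differ for:", key)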

Thanks a lot to everyone!

P.S. I know that the second model has two Linear layers, but they are used to predict different labels and are evaluated separately, so I doubt that is the problem.

Hi there,

I am doing something similar to you: I wanted to customize AutoModelForTokenClassification, so I loaded AutoModel and added one extra layer for fine-tuning.

To compare the model I implemented with the original AutoModelForTokenClassification, I printed out the model architectures and found that AutoModel actually has one extra BertPooler, which is essentially a fully connected linear layer at the end, probably used for sentence-level tasks such as next-sentence prediction rather than for per-token predictions. I think this is probably why you get different results.
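For example, something along these lines (just a rough sketch with a placeholder label count) makes the difference visible:

from transformers import AutoModel, AutoModelForTokenClassification

name = "Musixmatch/umberto-wikipedia-uncased-v1"
plain = AutoModel.from_pretrained(name)
for_tokens = AutoModelForTokenClassification.from_pretrained(name, num_labels=5)  # placeholder

# top-level sub-modules of each model
print([child for child, _ in plain.named_children()])       # encoder plus the pooler
print([child for child, _ in for_tokens.named_children()])  # encoder plus dropout and classifier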

Hope this helps.