Hi everyone,
I have a question that has been haunting me and that I cannot resolve on my own. I have two models for fine-tuning a BERT model.
The first model follows exactly the run_ner script and uses from transformers import AutoModelForTokenClassification,
where the model is "Musixmatch/umberto-wikipedia-uncased-v1"
(a BERT for Italian, but I don't think the specific BERT variant matters here).
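For reference, the relevant part of the first model is roughly this (a simplified sketch of the run_ner setup, not the full script; num_labels is just a placeholder for my label set):

```python
from transformers import AutoConfig, AutoModelForTokenClassification, AutoTokenizer

model_name = "Musixmatch/umberto-wikipedia-uncased-v1"
num_labels = 10  # placeholder: size of my label set

config = AutoConfig.from_pretrained(model_name, num_labels=num_labels)
tokenizer = AutoTokenizer.from_pretrained(model_name)
# AutoModelForTokenClassification loads the pre-trained encoder and puts a new
# Dropout + Linear classification head on top; the whole stack is then fine-tuned together.
model = AutoModelForTokenClassification.from_pretrained(model_name, config=config)
```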
The second model is the following:
```python
import torch
from torch.utils.data import DataLoader, SequentialSampler, TensorDataset
from transformers import AutoModel, AutoTokenizer


class Bert(object):
    """A facade to the BERT model that extracts features for sets of tokens."""

    def __init__(self, layer_indexes, max_seq_length, batch_size, multi_lingual=False, which_cuda=0):
        pretrained_model = "Musixmatch/umberto-wikipedia-uncased-v1"
        # ... setting all parameters on self ...
        self.tokenizer = AutoTokenizer.from_pretrained(pretrained_model, do_lower_case=not multi_lingual)
        self.model = AutoModel.from_pretrained(pretrained_model).to(self.device)
        # tell pytorch to run in evaluation mode instead of training
        self.model.eval()

    def get_bert_features(self, sentence):
        # sentence is in the format ['tok1', 'tok2']
        bert_tokens, map_to_original_tokens = self.convert_to_bert_tokenization(sentence)
        feature = self.from_bert_tokens_to_features(bert_tokens, map_to_original_tokens)
        features = [feature]
        # get ids
        all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
        # mask with 0's for placeholders
        all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long)
        # run BERT; the first element of the output is the sequence of hidden states
        all_encoder_layers, _ = self.model(all_input_ids, token_type_ids=None, attention_mask=all_input_mask)
        last_layer = all_encoder_layers[-1]
        return bert_tokens, map_to_original_tokens, last_layer

    def extract_bert_features(self, conll_dataset):
        sentences = [[e.form for e in sentence] for sentence in conll_dataset]
        # data loading
        features = []
        for sentence in sentences:
            bert_tokens, map_to_original_tokens = self.convert_to_bert_tokenization(sentence)
            feature = self.from_bert_tokens_to_features(bert_tokens, map_to_original_tokens)
            features.append(feature)
        all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
        # mask with 0's for placeholders
        all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long)
        # map from BERT word pieces back to the original tokens
        all_token_maps = torch.tensor([f.map_to_original_tokens for f in features], dtype=torch.long)
        # indexes 0..n-1 (n = number of examples) that map each example back to the dataset
        all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long)
        # create a dataset with the resources needed
        eval_data = TensorDataset(all_input_ids, all_input_mask, all_token_maps, all_example_index)
        # create a sampler which will be used to create the batches
        eval_sampler = SequentialSampler(eval_data)
        eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=self.batch_size)
        for input_ids, input_mask, token_maps, example_indices in eval_dataloader:
            input_ids = input_ids.to(self.device)
            input_mask = input_mask.to(self.device)
            all_encoder_layers, _ = self.model(input_ids, token_type_ids=None, attention_mask=input_mask)
            averaged_output = all_encoder_layers
            for i, idx in enumerate(example_indices):
                for j, coll_entry in enumerate(conll_dataset[idx]):
                    if token_maps[i, j] < 511:
                        coll_entry.bert = averaged_output[i, token_maps[i, j]].clone().detach().cpu()
                    else:
                        coll_entry.bert = averaged_output[i, token_maps[i, 511]].clone().detach().cpu()
```
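For context, this class is used to precompute the features once and cache them on each token, roughly like this (variable names are illustrative, not my actual driver script):

```python
# illustrative usage, not the actual script
bert = Bert(layer_indexes=[-1], max_seq_length=512, batch_size=32)
bert.extract_bert_features(train_conll)  # fills entry.bert for every token (detached, on CPU)
bert.extract_bert_features(dev_conll)
# ModelPaT below then only reads these cached entry.bert tensors during its own training
```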
In this second model I use the same BERT model, but WITHOUT the final fine-tuning head that the first model has:

```
(dropout): Dropout(p=0.1, inplace=False)
(classifier): Linear(in_features=768, out_features=X, bias=True)
```

because I define my own fine-tuning layers inside ModelPaT, in the following way:
```python
import torch
import torch.nn as nn
import torch.nn.functional as F


class ModelPaT(nn.Module):
    def __init__(self, args, word_vocab, tag_vocab, pos_vocab, deprel_vocab, char_vocab):
        super().__init__()
        # ... setting all parameters on self ...
        if self.bert:
            self.bilstm_input_size = self.bert_hidden_size  # 768
        # FINE TUNING, I think...
        self.hidden2_to_pos = nn.Linear(
            in_features=self.bilstm_input_size,  # 768 = BERT hidden size
            out_features=len(self.pos_vocab),
        )
        self.hidden2_to_dep = nn.Linear(
            in_features=self.bilstm_input_size,  # 768 = BERT hidden size
            out_features=len(self.deprel_vocab),
        )

    def forward(self, sentences):
        orig_w = [[e.form for e in sentence] for sentence in sentences]  # all tokens of each sentence
        # print("token: " + str(orig_w))
        w, t, x_lengths = self.sentence2tok_tags(sentences)
        batch_size, seq_len = w.size()
        # (batch_size, seq_len) -> (batch_size, seq_len, embedding_dim)
        if self.bert:
            # get the cached BERT features from each token
            bert_features_list = [[e.bert for e in sentence] for sentence in sentences]
            # convert the list of tensors into one padded tensor
            bert_features_tensor = from_tensor_list_to_one_tensor(bert_features_list, self.bert_hidden_size).to(self.device)
            # (batch_size, seq_len, embedding_dim) -> (batch_size, seq_len, n_lstm_units)
            x = torch.nn.utils.rnn.pack_padded_sequence(bert_features_tensor, x_lengths, batch_first=True)
            x, _ = torch.nn.utils.rnn.pad_packed_sequence(x, batch_first=True)
        # (batch_size, seq_len, n_lstm_units) -> (batch_size * seq_len, n_lstm_units)
        x = x.contiguous()
        x = x.view(-1, x.shape[2])
        y1 = self.hidden2_to_pos(x)
        y2 = self.hidden2_to_dep(x)
        if self.mode == 'evaluation':
            y1 = F.softmax(y1, dim=1)
            y2 = F.softmax(y2, dim=1)
        # (batch_size * seq_len, n_lstm_units) -> (batch_size, seq_len, n_tags)
        y1 = y1.view(batch_size, seq_len, len(self.pos_vocab))
        y2 = y2.view(batch_size, seq_len, len(self.deprel_vocab))
        return y1, y2
```
and when I print this model I get:

```
ModelPaT(
  (dropout): Dropout(p=0.1, inplace=False)
  (hidden2_to_pos): Linear(in_features=768, out_features=X, bias=True)
  (hidden2_to_dep): Linear(in_features=768, out_features=Y, bias=True)
)
```
Now these two models seem to have the same architecture for BERT fine-tuning (the same BERT model with a Linear layer added on top), yet I get very different results on the same dataset. After running several tests, I suspect the problem is not so much in the architecture, which indeed seems to be the same, but in the way the pre-trained BERT weights are loaded. Could this be the reason I get such different accuracy results?
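For instance, I thought of checking whether the two loading routes actually give the same encoder weights, with something like this (just a sketch, not tested; num_labels is a placeholder):

```python
import torch
from transformers import AutoModel, AutoModelForTokenClassification

name = "Musixmatch/umberto-wikipedia-uncased-v1"
m1 = AutoModelForTokenClassification.from_pretrained(name, num_labels=10)  # 10 is a placeholder
m2 = AutoModel.from_pretrained(name)

# compare the shared encoder parameters (the classification head of m1 has no counterpart in m2)
sd1 = m1.base_model.state_dict()
sd2 = m2.state_dict()
shared = [k for k in sd2 if k in sd1]
print(all(torch.equal(sd1[k], sd2[k]) for k in shared))
```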
If so, how could I modify the second model so that it behaves like the first and reaches the same results?
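For example, I wonder whether the right direction is to run BERT inside ModelPaT's forward pass instead of feeding it the precomputed features, so that its weights are updated together with the heads. A rough sketch of what I mean (the class name and arguments are just illustrative):

```python
import torch.nn as nn
from transformers import AutoModel


class ModelPaTWithBert(nn.Module):
    """Sketch (untested): BERT lives inside the model, so its weights are fine-tuned too."""

    def __init__(self, pretrained_model, n_pos, n_dep):
        super().__init__()
        self.bert = AutoModel.from_pretrained(pretrained_model)  # trained together with the heads
        self.dropout = nn.Dropout(0.1)
        hidden = self.bert.config.hidden_size  # 768
        self.hidden2_to_pos = nn.Linear(hidden, n_pos)
        self.hidden2_to_dep = nn.Linear(hidden, n_dep)

    def forward(self, input_ids, attention_mask):
        sequence_output = self.bert(input_ids, attention_mask=attention_mask)[0]  # (batch, seq_len, 768)
        x = self.dropout(sequence_output)
        return self.hidden2_to_pos(x), self.hidden2_to_dep(x)
```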
Thanks a lot to everyone!
P.S. I know that in the second model I have two Linear layers, but they are used to predict different labels and are evaluated separately (see the sketch below), so I doubt that this is the problem.
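To be concrete, by "evaluated separately" I mean something like this (a simplified sketch; the names are mine, not my exact code):

```python
import torch


def per_head_accuracy(y1, y2, gold_pos, gold_dep, mask):
    """Illustrative sketch: the two heads are scored independently of each other."""
    pred_pos = y1.argmax(dim=-1)  # (batch, seq_len) POS predictions
    pred_dep = y2.argmax(dim=-1)  # (batch, seq_len) deprel predictions
    pos_acc = (pred_pos == gold_pos)[mask].float().mean().item()
    dep_acc = (pred_dep == gold_dep)[mask].float().mean().item()
    return pos_acc, dep_acc  # reported separately, the heads never interact
```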