I want to run some experiments on a pretrained BERT model.
It’s important for me to be able to import already trained models, since I don’t have enough computing power to train my own BERT.
I found “bert-base-uncased” and loaded it into a small script to test its loss on a next sentence prediction task on BookCorpus.
def test_one_epoch():
    # `model` and `data_iter` are globals, set up as shown below
    running_loss = 0.
    last_loss = 0.
    for i, data in data_iter:
        inputs = data["input_ids"]
        # all-(-100) labels: the masked-LM part of the loss is ignored completely
        labels = torch.tensor([[-100 for _ in range(512)] for __ in range(64)])  # data["input_ids"]
        token_type_ids = data["token_type_ids"]
        next_sentence_label = data["sentence_order_label"]
        outputs = model(input_ids=inputs, labels=labels, next_sentence_label=next_sentence_label, token_type_ids=token_type_ids)
        loss = outputs["loss"]
        print(loss)
        running_loss += loss.item()
        if i % 10 == 9:
            last_loss = running_loss / 10  # loss per batch
            print(' batch {} loss: {}'.format(i + 1, last_loss))
            running_loss = 0.
    return last_loss
I’m running this with BertForPreTraining.from_pretrained("bert-base-uncased"), but I’m ignoring the masked token task by setting all labels to -100 (which makes the masked-LM loss vanish, since -100 is the ignore index of the cross-entropy loss).
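For context, the surrounding setup looks roughly like this (BookCorpusNSPDataset is just a placeholder name for my own dataset class, shown further down; batch size 64 and sequence length 512 match the label tensor above):
import torch
from torch.utils.data import DataLoader
from transformers import BertForPreTraining, BertTokenizer

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertForPreTraining.from_pretrained("bert-base-uncased")

# placeholder for my own dataset class (see create_examples_from_document below)
dataset = BookCorpusNSPDataset(tokenizer, seq_length=512)
data_iter = enumerate(DataLoader(dataset, batch_size=64))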
The problem is: BERT’s loss is constantly above 5, while a randomly initialized BERT actually reaches losses of less than 1 on the same task… The masked token task doesn’t work incredibly well either, but at least it’s a lot better than with the randomly initialized BERT.
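By “randomly initialized BERT” I mean a fresh BertForPreTraining built from a default BertConfig (which has the bert-base sizes), roughly:
from transformers import BertConfig, BertForPreTraining

# untrained baseline with the same architecture as bert-base-uncased
random_model = BertForPreTraining(BertConfig())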
Let’s have a look at the seq_relationship_logits for one batch:
tensor([[ 6.4212, -6.2590],
[ 6.0083, -5.4779],
[ 6.4277, -6.2895],
[ 6.2112, -5.8529],
[ 6.3191, -6.0712],
[ 6.3102, -6.0506],
[ 6.4885, -6.4182],
[ 5.6914, -4.8888],
[ 6.5099, -6.4894],
[ 6.3911, -6.1799],
[ 6.4764, -6.4173],
[ 6.3429, -6.1152],
[ 6.4858, -6.3851],
[ 6.4291, -6.2763],
[ 6.1265, -5.7106],
[ 6.5050, -6.4992],
[ 6.5056, -6.4890],
[ 6.4731, -6.3811],
[ 6.4910, -6.4487],
[ 6.5018, -6.4842],
[-2.3667, 4.9492],
[ 6.5090, -6.4459],
[ 6.3827, -6.1506],
[ 6.4682, -6.3783],
[ 6.5164, -6.5216],
[ 6.5035, -6.4689],
[ 6.4965, -6.4503],
[ 6.4984, -6.4535],
[ 6.4865, -6.4586],
[ 6.4062, -6.2351],
[ 6.4701, -6.3769],
[ 6.4516, -6.3252],
[ 6.4676, -6.3869],
[ 3.8467, -2.2755],
[ 5.9528, -5.4762],
[ 6.5151, -6.4800],
[ 5.9447, -5.2477],
[ 6.3066, -5.9782],
[ 6.2029, -5.8381],
[ 6.3937, -6.2184],
[ 4.1738, -3.0530],
[ 6.0311, -5.6101],
[ 6.5017, -6.4415],
[ 6.4324, -6.3031],
[ 4.7542, -3.7830],
[ 6.5074, -6.4641],
[ 6.5162, -6.5034],
[ 6.3883, -6.1977],
[ 6.4859, -6.4148],
[ 6.5182, -6.4970],
[ 5.5407, -4.9238],
[ 6.3657, -6.1207],
[ 6.5092, -6.4875],
[ 6.5023, -6.4323],
[ 6.3180, -6.0144],
[ 6.2822, -6.0021],
[ 6.1782, -5.7572],
[ 6.4781, -6.3932],
[ 6.4970, -6.4233],
[ 6.2929, -6.0152],
[ 6.4421, -6.3096],
[ 6.2474, -5.8956],
[ 6.2515, -5.9005],
        [ 6.1928, -5.7881]])
Almost every row predicts class 0, even though the true label distribution in my data should be close to 50/50.
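A quick way to quantify this, using outputs and next_sentence_label from one batch of the loop above (for BertForPreTraining, label 0 means sequence B is the real continuation of A, and 1 means it is not):
import torch

preds = outputs["seq_relationship_logits"].argmax(dim=-1)  # 0 = "is next", 1 = "not next"
print("predicted:", torch.bincount(preds, minlength=2))
print("labels:   ", torch.bincount(next_sentence_label.view(-1), minlength=2))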
I see two possible reasons for this:
Transformers gives me the following warning when loading BERT:
Some weights of BertForPreTraining were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['cls.predictions.decoder.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
But looking at the implementation reveals that decoder.bias is just a zero parameter anyway:
self.bias = nn.Parameter(torch.zeros(config.vocab_size))
# Need a link between the two variables so that the bias is correctly resized with `resize_token_embeddings`
self.decoder.bias = self.bias
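A quick sanity check for this (assuming the tying above survives from_pretrained):
from transformers import BertForPreTraining

model = BertForPreTraining.from_pretrained("bert-base-uncased")

# decoder.bias should be the very same parameter as cls.predictions.bias,
# which is not listed as newly initialized, so the warning looks harmless
print(model.cls.predictions.decoder.bias is model.cls.predictions.bias)
print(model.cls.predictions.decoder.bias.abs().max().item())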
Second possible reason: I implemented the dataset generation on my own (with a lot of copying), so maybe I made a mistake somewhere?
def create_examples_from_document(self):
    """Creates examples for a single document."""
    block_size = self.seq_length
    tokenizer = self.tokenizer
    short_seq_prob = 0.1
    self.lines = self.lines[:10000]
    self.examples = {}
    self.examples["tokenized_sentences"] = [
        [tokenizer._convert_token_to_id(token) for token in tokenizer._tokenize(line)]
        for line in self.lines
    ]
    document = self.examples["tokenized_sentences"]
    # Account for special tokens
    max_num_tokens = block_size - 3  # tokenizer.num_special_tokens_to_add(pair=True)
    # We *usually* want to fill up the entire sequence since we are padding
    # to `block_size` anyways, so short sequences are generally wasted
    # computation. However, we *sometimes*
    # (i.e., short_seq_prob == 0.1 == 10% of the time) want to use shorter
    # sequences to minimize the mismatch between pretraining and fine-tuning.
    # The `target_seq_length` is just a rough target however, whereas
    # `block_size` is a hard limit.
    target_seq_length = max_num_tokens
    if random.random() < short_seq_prob:
        target_seq_length = random.randint(2, max_num_tokens)
    # We DON'T just concatenate all of the tokens from a document into a long
    # sequence and choose an arbitrary split point because this would make the
    # next sentence prediction task too easy. Instead, we split the input into
    # segments "A" and "B" based on the actual "sentences" provided by the user
    # input.
    self.examples = []
    current_chunk = []  # a buffer storing the current working segments
    current_length = 0
    i = 0
    while i < len(document):
        segment = document[i]  # get a segment
        if not segment:
            i += 1
            continue
        current_chunk.append(segment)  # add a segment to the current chunk
        current_length += len(segment)  # overall token length
        # if the current length reaches the target length or the end of the file, start building tokens a and b
        if i == len(document) - 1 or current_length >= target_seq_length:
            if current_chunk:
                # `a_end` is how many segments from `current_chunk` go into the `A` (first) sentence.
                a_end = 1
                # if the current chunk has more than 2 sentences, pick part of it for the `A` (first) sentence
                if len(current_chunk) >= 2:
                    a_end = random.randint(1, len(current_chunk) - 1)
                # tokens a
                tokens_a = []
                for j in range(a_end):
                    tokens_a.extend(current_chunk[j])
                # tokens b
                tokens_b = []
                for j in range(a_end, len(current_chunk)):
                    tokens_b.extend(current_chunk[j])
                if len(tokens_a) == 0 or len(tokens_b) == 0:
                    continue
                # switch tokens_a and tokens_b randomly
                if random.random() < 0.5:
                    is_next = False
                    tokens_a, tokens_b = tokens_b, tokens_a
                else:
                    is_next = True

                def truncate_seq_pair(tokens_a, tokens_b, max_num_tokens):
                    """Truncates a pair of sequences to a maximum sequence length."""
                    while True:
                        total_length = len(tokens_a) + len(tokens_b)
                        if total_length <= max_num_tokens:
                            break
                        trunc_tokens = tokens_a if len(tokens_a) > len(tokens_b) else tokens_b
                        if not (len(trunc_tokens) >= 1):
                            raise ValueError("Sequence length to be truncated must be no less than one")
                        # We want to sometimes truncate from the front and sometimes from the
                        # back to add more randomness and avoid biases.
                        if random.random() < 0.5:
                            del trunc_tokens[0]
                        else:
                            trunc_tokens.pop()

                truncate_seq_pair(tokens_a, tokens_b, max_num_tokens)
                if not (len(tokens_a) >= 1):
                    raise ValueError(f"Length of sequence a is {len(tokens_a)} which must be no less than 1")
                if not (len(tokens_b) >= 1):
                    raise ValueError(f"Length of sequence b is {len(tokens_b)} which must be no less than 1")
                # add special tokens
                input_ids = tokenizer.build_inputs_with_special_tokens(tokens_a, tokens_b)
                # add token type ids, 0 for sentence a, 1 for sentence b
                token_type_ids = tokenizer.create_token_type_ids_from_sequences(tokens_a, tokens_b)
                example_text = {
                    "input_ids": input_ids,
                    "token_type_ids": token_type_ids,
                    "sentence_order_label": [0 if is_next else 1],  # 0 = B really follows A, 1 = A and B swapped
                }
                # pad everything to `block_size` and convert to tensors
                example = {}
                for key, values in tokenizer.pad(example_text, "max_length", block_size).items():
                    example[key] = torch.tensor(values, dtype=torch.long)
                self.examples.append(example)
            current_chunk = []  # clear the current chunk
            current_length = 0  # reset the current text length
        i += 1  # go to the next line
    return self.examples
I had a good look at the generated examples and I don’t see anything wrong with them.
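For reference, this is roughly how I inspect them (tokenizer and dataset as above, after create_examples_from_document has run):
for ex in dataset.examples[:5]:
    text = tokenizer.decode(ex["input_ids"], skip_special_tokens=False)
    print(ex["sentence_order_label"].item(), text[:300])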
Help would be much appreciated; I’ve spent too much time on this already.