Hello,
I’ve noticed that, when I use the pre-trained GPT2DoubleHeadsModel to process multiple-choice questions, the median of the cross-entropy losses computed for the same set of questions changes when I change the value of my random seed (NOTE: I change the random seed before loading the pre-trained BPE GPT-2 tokenizer and before loading the pre-trained GPT2DoubleHeadsModel, and I also call my_gpt2_model.eval() before evaluating the loss, to disable dropout).
Why does this happen? I thought the parameters of both the pre-trained model and the tokenizer were fixed, so the cross-entropy loss should be the same regardless of the value of the random seed.
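To make the effect concrete, here is a minimal sketch of the behaviour I mean (the inputs are dummies chosen just to have valid shapes, and I’m assuming a transformers version where the forward pass returns a ModelOutput exposing mc_logits):

import torch
from transformers import GPT2Tokenizer, GPT2DoubleHeadsModel

for s in (1, 2):
    torch.manual_seed(s)  # change the seed before loading anything
    tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
    tokenizer.add_special_tokens({'pad_token': '<pad>'})
    model = GPT2DoubleHeadsModel.from_pretrained('gpt2')
    model.resize_token_embeddings(len(tokenizer))
    model.eval()  # dropout disabled
    # identical dummy input both times: (batch=1, num_choices=2, seq_len=5)
    input_ids = torch.full((1, 2, 5), tokenizer.eos_token_id, dtype=torch.long)
    mc_token_ids = torch.tensor([[4, 4]])  # index of the classification token
    with torch.no_grad():
        out = model(input_ids, mc_token_ids=mc_token_ids)
    print(s, out.mc_logits)  # differs across seeds despite identical input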
For more information, below is my code:
# imports needed by the snippets below
# (I'm assuming seed/randint come from Python's stdlib random module)
import gc
import sys
from math import nan
from random import seed, randint

import torch
from torch.nn import CrossEntropyLoss
from transformers import GPT2Tokenizer, GPT2DoubleHeadsModel

# for our main experiment, we use G1G2, G4G5, G7G8, G10G12 files

def fill_MC_loss_tensor(...):
    for m in range(num_mc_questions):
        # make an empty list to store the mc losses
        mc_loss_list = []
        # turn on evaluation mode (disables dropout)
        best_model_gpt2DoubleHeadsModel.eval()
        # run the model once and extract the hidden states
        # that are fed into the first transformer block
        input_hidden_state = best_model_gpt2DoubleHeadsModel(
            input_ids,
            token_type_ids=token_type_ids,
            attention_mask=attention_mask)[3][0].detach()
        # for each layer j = 1, ..., 12, push the hidden states through block j
        for j in range(nlayer):
            layer_hidden_state = best_model_gpt2DoubleHeadsModel.transformer.h[j](
                input_hidden_state)
            # feed the hidden states from each layer directly into the
            # multiple-choice head
            mc_logits = best_model_gpt2DoubleHeadsModel.multiple_choice_head(
                layer_hidden_state[0]).squeeze(-1).detach()
            del layer_hidden_state
            gc.collect()
            # define the loss function
            loss_fct = CrossEntropyLoss()
            # calculate the mc loss
            mc_loss = loss_fct(mc_logits.view(-1, mc_logits.size(-1)),
                               mc_labels.view(-1))
            # store the mc loss in the list
            mc_loss_list = mc_loss_list + [mc_loss.tolist()]
            del mc_logits
            gc.collect()
        mc_loss_tensor[m, :] = torch.tensor(mc_loss_list)
        print('m={}'.format(m))
    return mc_loss_tensor
# main function for the analysis
def main_function(...):
    # set the initial seed
    seed(125)
    num_iter = 200
    # tensor to hold the losses from all iterations, pre-filled with nan
    mc_loss_tensor_num_iter = torch.zeros(num_iter, int(num_mc_questions),
                                          nlayer)
    mc_loss_tensor_num_iter[mc_loss_tensor_num_iter == 0] = nan
    for i in range(num_iter):
        # change the seed at each iteration
        s = randint(1, 999999)
        seed(s)
        # import the pre-trained HuggingFace GPT2Tokenizer
        gpt2_tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
        # make a dictionary of special tokens
        special_tokens_dict = {'pad_token': '<pad>'}
        # add the special tokens to the tokenizer
        gpt2_tokenizer.add_special_tokens(special_tokens_dict)
        assert gpt2_tokenizer.pad_token == '<pad>'
        # get the encodings of the special tokens
        pub2_pad_token_id = gpt2_tokenizer.convert_tokens_to_ids('<pad>')
        pub2_eos_token_id = gpt2_tokenizer.convert_tokens_to_ids(
            gpt2_tokenizer.eos_token)
        # sanity check: original size of the tokenizer is 50257, + '<pad>' = 50258
        len(gpt2_tokenizer)
        # get the pre-trained HuggingFace GPT2DoubleHeadsModel and
        # resize the token embeddings after adding the special token
        best_model_gpt2DoubleHeadsModel = GPT2DoubleHeadsModel.from_pretrained(
            'gpt2', output_hidden_states=True)
        best_model_gpt2DoubleHeadsModel.resize_token_embeddings(
            len(gpt2_tokenizer))
        #######
        # make an empty tensor to store the mc loss, pre-filled with nan
        mc_loss_tensor = torch.zeros(num_mc_questions, nlayer).float()
        mc_loss_tensor[mc_loss_tensor == 0] = nan
        mc_loss_tensor = fill_MC_loss_tensor(...)
        if torch.isnan(mc_loss_tensor).any().tolist():
            sys.exit('nan found in mc_loss_tensor')
        mc_loss_tensor_num_iter[i, :, :] = mc_loss_tensor
        print('i={}'.format(i))
    return mc_loss_tensor_num_iter
# for each of the 200 iterations, the computed median
# (median over all questions) cross-entropy loss
# is different for the same layer
>>> main_function(...)
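For reference, this is how I take the medians I am describing (a sketch using a random stand-in tensor in place of the real return value of main_function):

import torch

# stand-in for the (num_iter, num_mc_questions, nlayer) tensor
# returned by main_function above
mc_losses = torch.rand(200, 100, 12)
# median over the question dimension: one value per (iteration, layer)
per_layer_medians = mc_losses.median(dim=1).values  # shape (200, 12)
print(per_layer_medians[:5, 0])  # layer-1 medians for the first 5 iterations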
Thank you,