Since I last posted, I tried different approaches to fine-tune GPT-2, including the default Hugging Face Trainer and the PyTorch fine-tuning code from the Hugging Face fine-tuning tutorial. Both gave me errors that I tried to resolve, but once I hit an error I couldn't get past, I gave up on them. The approach that did work was basing my code on the Jupyter notebook I found for GPT-2 fine-tuning in PyTorch (I linked to it in my original post, but am providing the link here as well for reference).
While the code based on the above Jupyter notebook does run, it produces some warnings. In particular, the training loop gives me the following output:
======== Epoch 1 / 5 ========
Training...
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
Batch 100 of 2,117,960. Loss: 0.0562039315700531. Elapsed: 0:01:08.
0: I
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
Batch 200 of 2,117,960. Loss: 0.018781626597046852. Elapsed: 0:02:14.
0: I
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
Batch 300 of 2,117,960. Loss: 0.029426338151097298. Elapsed: 0:03:21.
0: ORIGIN |
I'm concerned about the warnings here, in particular:
- The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
- Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
- A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
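From what I understand, the last warning could be addressed by passing padding_side='left' when creating the tokenizer, i.e. something like this (just my guess, I haven't verified that it is actually needed during training):

tokenizer = GPT2Tokenizer.from_pretrained(
    'gpt2',
    bos_token='<|startoftext|>',
    eos_token='<|endoftext|>',
    pad_token='<|pad|>',
    padding_side='left'  # what the warning seems to ask for
)

But I don't know whether that is the right fix or whether the warning even matters here.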
I list my entire code below. If you could double-check it and tell me whether it all makes sense and whether I can ignore the warnings, I'd really appreciate it, since I plan to run the training on a paid Google Cloud instance and I wouldn't like to spend my credits in vain.
The code:
import os
import torch
from torch.utils.data import Dataset
from transformers import GPT2Tokenizer
class GPT2Dataset(Dataset):
def __init__(self, dataset_dir, max_length=768):
# stores each line of the movie script file as a separate sequence
self.tokenizer = GPT2Tokenizer.from_pretrained('gpt2', bos_token='<|startoftext|>', eos_token='<|endoftext|>', pad_token='<|pad|>')
self.dataset_dir = dataset_dir
self.max_length = max_length
genre_subfolders = [subfolder_name for subfolder_name in os.listdir(dataset_dir)]
self.genre_subfolders = genre_subfolders
# indices used to load movie lines on demand (otherwise 25 GB of RAM is not enough)
self.genre_subfolder_index = 0
self.movie_script_in_genre_subfolder_index = 0
self.text_line_in_movie_script_index = 0
# stores the total number of movie lines across all movie scripts
self.total_number_of_movie_lines = 0
current_cumulative_number_of_movie_lines = 0
# a dictionary that maps cumulative ("global") end indices of movie lines (across all genres and all movie scripts) to genre/movie-script pairs
self.genre_movie_script_pairs_global_end_index = {}
for genre_subfolder in self.genre_subfolders:
genre_subfolder_path = self.dataset_dir + "/" + genre_subfolder
txt_files_names = [txt_file_name for txt_file_name in os.listdir(genre_subfolder_path)]
for txt_file_name in txt_files_names:
path_to_movie_script = genre_subfolder_path + "/" + txt_file_name
with open(path_to_movie_script) as movie_script_file:
movie_script_all_lines = movie_script_file.readlines()
genre_movie_script_pair = str(genre_subfolder) + "/" + str(txt_file_name)
self.genre_movie_script_pairs_global_end_index[current_cumulative_number_of_movie_lines + len(movie_script_all_lines)] = genre_movie_script_pair
self.total_number_of_movie_lines += len(movie_script_all_lines)
current_cumulative_number_of_movie_lines = current_cumulative_number_of_movie_lines + len(movie_script_all_lines)
# prints for debugging purposes
#for global_end_index, genre_movie_script_pair in self.genre_movie_script_pairs_global_end_index.items():
#print(str(global_end_index) + ": " + str(genre_movie_script_pair))
def __len__(self):
return self.total_number_of_movie_lines
def __getitem__(self, idx):
target_genre_movie_script_pair = None
previous_global_end_index = None
trail_global_end_index = 0 # stores the global end index of the previous genre movie script pair
for global_end_index, genre_movie_script_pair in self.genre_movie_script_pairs_global_end_index.items():
if (idx < global_end_index):
target_genre_movie_script_pair = genre_movie_script_pair
previous_global_end_index = trail_global_end_index
break
trail_global_end_index = global_end_index
path_to_target_movie_script = self.dataset_dir + "/" + target_genre_movie_script_pair
# prints for debugging purposes
#print("path_to_target_movie_script:")
#print(path_to_target_movie_script)
target_movie_script_line_index = idx - previous_global_end_index
target_movie_script_line = None
with open(path_to_target_movie_script, "r") as movie_script_file:
movie_script_file_all_lines = movie_script_file.readlines()
target_movie_script_line = movie_script_file_all_lines[target_movie_script_line_index]
# prints for debugging purposes
#print("target_movie_script_line_index:")
#print(target_movie_script_line_index)
#print("target_movie_script_line:")
#print(target_movie_script_line)
encoded_movie_script_line = self.tokenizer(target_movie_script_line, truncation=True, max_length=self.max_length, padding="max_length")
return torch.tensor(encoded_movie_script_line['input_ids']), torch.tensor(encoded_movie_script_line['attention_mask'])
from google.colab import drive
drive.mount('/content/drive')
DATASET_DIRECTORY = "/content/drive/MyDrive/Movie Script Generator/dataset/txt_pruned"
from torch.utils.data import random_split
dataset = GPT2Dataset(DATASET_DIRECTORY, max_length=1024)
# Split into training and validation sets
train_size = int(0.9 * len(dataset))
val_size = len(dataset) - train_size
train_dataset, val_dataset = random_split(dataset, [train_size, val_size])
print('{:>5,} training samples'.format(train_size))
print('{:>5,} validation samples'.format(val_size))
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
batch_size = 8
# Create the DataLoaders for our training and validation datasets.
# We'll take training samples in random order.
train_dataloader = DataLoader(
train_dataset, # The training samples.
sampler = RandomSampler(train_dataset), # Select batches randomly
batch_size = batch_size # Trains with this batch size.
)
# For validation the order doesn't matter, so we'll just read them sequentially.
validation_dataloader = DataLoader(
val_dataset, # The validation samples.
sampler = SequentialSampler(val_dataset), # Pull out batches sequentially.
batch_size = batch_size # Evaluate with this batch size.
)
from transformers import GPT2Config, GPT2LMHeadModel
tokenizer = GPT2Tokenizer.from_pretrained('gpt2', bos_token='<|startoftext|>', eos_token='<|endoftext|>', pad_token='<|pad|>')
# I'm not really doing anything with the config here
configuration = GPT2Config.from_pretrained('gpt2', output_hidden_states=False)
# instantiate the model
model = GPT2LMHeadModel.from_pretrained("gpt2", config=configuration)
# this step is necessary because I've added some tokens (bos_token, etc) to the embeddings
# otherwise the tokenizer and model tensors won't match up
model.resize_token_embeddings(len(tokenizer))
# Tell pytorch to run this model on the GPU.
device = torch.device("cuda")
model.cuda()
# Set the seed value all over the place to make this reproducible.
#seed_val = 42
#random.seed(seed_val)
#np.random.seed(seed_val)
#torch.manual_seed(seed_val)
#torch.cuda.manual_seed_all(seed_val)
# some parameters I cooked up that work reasonably well
epochs = 5
learning_rate = 5e-4
warmup_steps = 100
epsilon = 1e-8
# this produces sample output every 100 steps
sample_every = 100
from transformers import AdamW, get_linear_schedule_with_warmup
# Note: AdamW is a class from the huggingface library (as opposed to pytorch)
optimizer = AdamW(model.parameters(),
lr = learning_rate,
eps = epsilon
)
# Total number of training steps is [number of batches] x [number of epochs].
# (Note that this is not the same as the number of training samples).
total_steps = len(train_dataloader) * epochs
# Create the learning rate scheduler.
# This changes the learning rate as the training loop progresses
scheduler = get_linear_schedule_with_warmup(optimizer,
num_warmup_steps = warmup_steps,
num_training_steps = total_steps)
import time
import datetime
import random
def format_time(elapsed):
return str(datetime.timedelta(seconds=int(round((elapsed)))))
total_t0 = time.time()
training_stats = []
model = model.to(device)
for epoch_i in range(0, epochs):
# ========================================
# Training
# ========================================
print("")
print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
print('Training...')
t0 = time.time()
total_train_loss = 0
model.train()
for step, batch in enumerate(train_dataloader):
b_input_ids = batch[0].to(device)
b_labels = batch[0].to(device)
b_masks = batch[1].to(device)
model.zero_grad()
outputs = model( b_input_ids,
labels=b_labels,
attention_mask = b_masks,
token_type_ids=None
)
loss = outputs[0]
batch_loss = loss.item()
total_train_loss += batch_loss
# Get sample every x batches.
if step % sample_every == 0 and not step == 0:
elapsed = format_time(time.time() - t0)
print(' Batch {:>5,} of {:>5,}. Loss: {:>5,}. Elapsed: {:}.'.format(step, len(train_dataloader), batch_loss, elapsed))
model.eval()
sample_outputs = model.generate(
#bos_token_id=random.randint(1,30000),
do_sample=True,
top_k=50,
max_length = 200,
top_p=0.95,
num_return_sequences=1
)
for i, sample_output in enumerate(sample_outputs):
print("{}: {}".format(i, tokenizer.decode(sample_output, skip_special_tokens=True)))
model.train()
loss.backward()
optimizer.step()
scheduler.step()
# Calculate the average loss over all of the batches.
avg_train_loss = total_train_loss / len(train_dataloader)
# Measure how long this epoch took.
training_time = format_time(time.time() - t0)
print("")
print(" Average training loss: {0:.2f}".format(avg_train_loss))
print(" Training epoch took: {:}".format(training_time))
# ========================================
# Validation
# ========================================
print("")
print("Running Validation...")
t0 = time.time()
model.eval()
total_eval_loss = 0
nb_eval_steps = 0
# Evaluate data for one epoch
for batch in validation_dataloader:
b_input_ids = batch[0].to(device)
b_labels = batch[0].to(device)
b_masks = batch[1].to(device)
with torch.no_grad():
outputs = model(b_input_ids,
# token_type_ids=None,
attention_mask = b_masks,
labels=b_labels)
loss = outputs[0]
batch_loss = loss.item()
total_eval_loss += batch_loss
avg_val_loss = total_eval_loss / len(validation_dataloader)
validation_time = format_time(time.time() - t0)
print(" Validation Loss: {0:.2f}".format(avg_val_loss))
print(" Validation took: {:}".format(validation_time))
# Record all statistics from this epoch.
training_stats.append(
{
'epoch': epoch_i + 1,
'Training Loss': avg_train_loss,
'Valid. Loss': avg_val_loss,
'Training Time': training_time,
'Validation Time': validation_time
}
)
print("")
print("Training complete!")
print("Total training took {:} (h:mm:ss)".format(format_time(time.time()-total_t0)))
As I stated, the training code above produces the output with the warnings. Can I safely ignore them? Should I change something in the code? I did Google around, but couldn't find any answers.
Thanks in advance!