As additional information, I am running the following code (the data I am using is available here):
import random
import numpy as np
import pandas as pd
import torch
from accelerate import Accelerator
from torch.utils.data import (
    DataLoader,
    RandomSampler,
    SequentialSampler,
    TensorDataset,
    random_split,
)
from transformers import (
    AdamW,
    GPT2Config,
    GPT2ForSequenceClassification,
    GPT2Tokenizer,
    get_linear_schedule_with_warmup,
)
seed_val = 42
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)
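# Note: accelerate ships a helper that seeds Python, NumPy and torch (incl. CUDA)
# in one call and is safe across processes; an equivalent alternative would be:
#   from accelerate.utils import set_seed
#   set_seed(seed_val)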
def get_data(tokenizer, batch_size):
    # Read data from file using pandas
    data_frame = pd.read_csv(
        "./data/cola_public/raw/in_domain_train.tsv",
        delimiter="\t",
        header=None,
        names=["sentence_source", "label", "label_notes", "sentence"],
    )
    sentences = data_frame.sentence.values
    labels = data_frame.label.values
    input_ids = []
    attention_masks = []
    # Tokenize, pad, truncate and add special tokens to all the samples in the dataset
    for sent in sentences:
        encoded_dict = tokenizer.encode_plus(
            sent,  # Sentence to encode.
            add_special_tokens=True,  # GPT-2 has no [CLS]/[SEP], so this is a no-op here.
            truncation=True,
            max_length=64,  # Pad & truncate all sentences.
            padding="max_length",
            return_attention_mask=True,  # Construct attn. masks.
            return_tensors="pt",  # Return pytorch tensors.
        )
        input_ids.append(encoded_dict["input_ids"])
        attention_masks.append(encoded_dict["attention_mask"])
    # Convert the lists into tensors.
    input_ids = torch.cat(input_ids, dim=0)
    attention_masks = torch.cat(attention_masks, dim=0)
    labels = torch.tensor(labels)
    # Create datasets
    dataset = TensorDataset(input_ids, attention_masks, labels)
    train_size = int(0.9 * len(dataset))
    val_size = len(dataset) - train_size  # remainder, so the two sizes always sum to len(dataset)
    # Divide the dataset into train and validation by randomly selecting samples.
    train_dataset, val_dataset = random_split(dataset, [train_size, val_size])
    # Create dataloaders
    train_dataloader = DataLoader(
        train_dataset,  # The training samples.
        sampler=RandomSampler(train_dataset),  # Select batches randomly
        batch_size=batch_size,  # Trains with this batch size.
    )
    validation_dataloader = DataLoader(
        val_dataset,  # The validation samples.
        sampler=SequentialSampler(val_dataset),  # Pull out batches sequentially.
        batch_size=batch_size,  # Evaluate with this batch size.
    )
    return train_dataloader, validation_dataloader
if __name__ == "__main__":
    accelerator = Accelerator()
    BATCH_SIZE = 128
    MODEL_NAME_OR_PATH = "gpt2"
    EPOCHS = 2
    DEVICE = accelerator.device
    # get tokenizer
    tokenizer = GPT2Tokenizer.from_pretrained(
        pretrained_model_name_or_path=MODEL_NAME_OR_PATH
    )
    tokenizer.pad_token = tokenizer.eos_token
    # get data for training
    train_dataloader, validation_dataloader = get_data(tokenizer, BATCH_SIZE)
    # get model and prepare it
    n_labels = 2
    model_config = GPT2Config.from_pretrained(
        pretrained_model_name_or_path=MODEL_NAME_OR_PATH, num_labels=n_labels
    )
    model = GPT2ForSequenceClassification.from_pretrained(
        pretrained_model_name_or_path=MODEL_NAME_OR_PATH, config=model_config
    )
    model.resize_token_embeddings(len(tokenizer))
    model.config.pad_token_id = model.config.eos_token_id
    model.to(DEVICE)
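    # Setting pad_token_id above matters: GPT2ForSequenceClassification takes its
    # classification logits from the hidden state of the last non-padding token,
    # and it needs pad_token_id to locate that position in each padded row.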
    # get optimizer and scheduler
    optimizer = AdamW(
        model.parameters(),
        lr=2e-5,
        eps=1e-8,  # args.adam_epsilon - default is 1e-8.
        no_deprecation_warning=True,
    )
    total_steps = len(train_dataloader) * EPOCHS
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=0,
        num_training_steps=total_steps,
    )
    model, optimizer, train_dataloader, scheduler = accelerator.prepare(
        model, optimizer, train_dataloader, scheduler
    )
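    # After prepare(), the dataloader is sharded across processes: with
    # num_processes=4, each process iterates over roughly 1/4 of the batches.
    # Note that total_steps above was computed from the un-sharded loader, so
    # the linear schedule spans ~4x the steps each process will actually take.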
    for epoch_i in range(0, EPOCHS):
        total_train_loss = 0
        model.train()
        # iterate over the train dataset
        for step, batch in enumerate(train_dataloader):
            b_input_ids = batch[0].to(DEVICE)
            b_input_mask = batch[1].to(DEVICE)
            b_labels = batch[2].to(DEVICE)
            # forward
            model.zero_grad()
            output = model(
                b_input_ids,
                token_type_ids=None,
                attention_mask=b_input_mask,
                labels=b_labels,
            )
            loss = output.loss
            total_train_loss += loss.item()
            # backward
            accelerator.backward(loss)
            # gradient clipping
            accelerator.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            scheduler.step()
        # Calculate the average loss over all of the batches.
        avg_train_loss = total_train_loss / len(train_dataloader)
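One caveat when comparing runs with different num_processes: avg_train_loss above is computed per process, over that process's shard of the batches only. To get a loss that is comparable across configurations, the per-batch loss has to be gathered and averaged across GPUs first; a minimal sketch (replacing the total_train_loss line in the inner loop) would be:

            # Gather the scalar loss from every process and average it, so the
            # running total reflects all GPUs rather than just the local shard.
            total_train_loss += accelerator.gather(loss.detach().unsqueeze(0)).mean().item()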
And this is the accelerate config file I am using (the only change between runs is num_processes, from 1 to 4):
compute_environment: LOCAL_MACHINE
deepspeed_config: {}
distributed_type: MULTI_GPU
fsdp_config: {}
machine_rank: 0
main_process_ip: null
main_process_port: null
main_training_function: main
mixed_precision: 'no'
num_machines: 1
num_processes: 4
use_cpu: false
I am running this on an AWS p3.8xlarge instance (4 x V100 GPUs).
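In case it is relevant, the script is launched with accelerate launch. Assuming the file is saved as train.py (the name is illustrative) and the config above is the default one written by accelerate config, the invocation is:

accelerate launch train.py

A non-default config path can be passed explicitly with accelerate launch --config_file path/to/config.yaml train.py.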