How do I properly train a GPT-2 (1.5B) model for question answering using the Transformers library?
Is it even possible on a GTX 1080 (8 GB)? (My rough memory estimate for this is at the end of the post.)
When I tried training the 117M model (just as a test) on part of the SQuAD Q&A dataset (39,000 of the ~87,000 samples), the question “who are you?” got the answer “What theHowWho was isInWhenThe of did,WhichWhere what many to a are andWhy in does wereOn. yearA withAlong do’s-During type hasForAtAfterAccording much canFromBy whichWhTo” (that is the exact output GPT-2 produced after training on almost half of the SQuAD samples). Here is my code for training the GPT-2 117M model:
from datasets import load_dataset
from transformers import GPT2Tokenizer, GPT2DoubleHeadsModel, TrainingArguments, Trainer
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset
import time

class MyTrainer(Trainer):
    def __init__(self, model, args, train_dataset, eval_dataset, data_collator, tokenizer):
        super().__init__(model=model, args=args, train_dataset=train_dataset, eval_dataset=eval_dataset, data_collator=data_collator, tokenizer=tokenizer)

    def compute_loss(self, model, inputs, return_outputs=False):
        input_ids = inputs["input_ids"]
        attention_mask = inputs["attention_mask"]
        start_positions = inputs["start_positions"]
        end_positions = inputs["end_positions"]
        outputs = model(input_ids, attention_mask=attention_mask, start_positions=start_positions, end_positions=end_positions)
        loss = outputs[0]
        loss = torch.mean(loss)
        return (loss, outputs) if return_outputs else loss

    def compute_metrics(self, pred):
        start_pred, end_pred = pred
        start_positions = pred.label_ids[:, 0]
        end_positions = pred.label_ids[:, 1]
        start_correct = start_positions.eq(start_pred).sum().item()
        end_correct = end_positions.eq(end_pred).sum().item()
        total = start_positions.numel()
        return {
            "start_acc": start_correct / total,
            "end_acc": end_correct / total,
            "combined_acc": (start_correct + end_correct) / (2 * total),
        }
def get_token_positions(context, answer, encoded_dict):
    input_ids = encoded_dict['input_ids'][0].tolist()
    tokens = tokenizer.decode(input_ids).strip()
    start_idx = tokens.find(answer.strip())
    end_idx = start_idx + len(answer.strip())
    return start_idx, end_idx
time_fst = time.time()
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
tokenizer.add_special_tokens({'pad_token': '[PAD]'})
dataset = load_dataset('squad')
model = GPT2DoubleHeadsModel.from_pretrained('gpt2').to('cuda')
print("time of initialisation of the model gpt-2: ")
print(time.time() - time_fst)
def preprocess_qa_dataset(dataset, tokenizer, split_name, num):
    time_f = time.time()
    input_ids = []
    attention_masks = []
    start_positions = []
    end_positions = []
    time_s = time.time()
    for example in dataset[split_name].select(range(num)):
        question = example['question']
        context = example['context']
        answer = example['answers']['text']
        answer = answer[0]
        encoded_dict = tokenizer.encode_plus(question, context, add_special_tokens=True, max_length=1024, padding='max_length', return_attention_mask=True, return_tensors='pt')
        input_ids.append(encoded_dict['input_ids'])
        attention_masks.append(encoded_dict['attention_mask'])
        start_idx, end_idx = get_token_positions(context, answer, encoded_dict)
        start_positions.append(start_idx)
        end_positions.append(end_idx)
    print("for loop in preprocess_qa_dataset function time: ")
    print(time.time() - time_s)
    input_ids = torch.cat(input_ids, dim=0)
    attention_masks = torch.cat(attention_masks, dim=0)
    start_positions = torch.tensor(start_positions)
    end_positions = torch.tensor(end_positions)
    dataset = TensorDataset(input_ids, attention_masks, start_positions, end_positions)
    print("time of the preprocess_qa_dataset function: ")
    print(time.time() - time_f)
    return dataset
train_dataset = preprocess_qa_dataset(dataset, tokenizer, split_name='train', num=75000)
eval_dataset = preprocess_qa_dataset(dataset, tokenizer, split_name='validation', num=10000)
train_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="steps",
    learning_rate=2e-5,
    do_train=True,
    per_device_train_batch_size=1,
    num_train_epochs=1,
    weight_decay=0.01,
    push_to_hub=False,
    logging_dir="./logs",
    logging_steps=100,
    eval_steps=500,
    label_names=["start_positions", "end_positions"],
    save_strategy="steps",
    load_best_model_at_end=True,
    dataloader_pin_memory=False
)
trainer = MyTrainer(
    model=model,
    args=train_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    data_collator=lambda data: {'input_ids': torch.stack([item[0] for item in data]).to('cuda'),
                                'attention_mask': torch.stack([item[1] for item in data]).to('cuda'),
                                'start_positions': torch.stack([item[2] for item in data]).to('cuda'),
                                'end_positions': torch.stack([item[3] for item in data]).to('cuda')},
    tokenizer=tokenizer
)
time_t = time.time()
model.resize_token_embeddings(len(tokenizer))
trainer.train()
print("time of the \"trainer.train()\": ")
print(time_t)
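One thing I am not sure about: get_token_positions seems to return character offsets into the decoded string, while (as far as I understand) start_positions/end_positions are supposed to be token indices. This is a rough, untested sketch of how I think the mapping could be done with the fast tokenizer's offset mapping (the answer_char_start value would come from SQuAD's 'answers'->'answer_start' field):

from transformers import GPT2TokenizerFast

# Untested sketch: convert SQuAD character offsets to token indices
# using offset_mapping from the fast tokenizer instead of str.find() on decoded text.
fast_tokenizer = GPT2TokenizerFast.from_pretrained('gpt2')
fast_tokenizer.pad_token = fast_tokenizer.eos_token

def char_to_token_positions(question, context, answer_text, answer_char_start):
    enc = fast_tokenizer(question, context, truncation=True, max_length=1024,
                         padding='max_length', return_offsets_mapping=True)
    answer_char_end = answer_char_start + len(answer_text)
    start_token, end_token = 0, 0
    # sequence_ids(): 0 = question tokens, 1 = context tokens, None = padding
    for i, (seq_id, (off_start, off_end)) in enumerate(zip(enc.sequence_ids(), enc['offset_mapping'])):
        if seq_id != 1:
            continue  # only look at tokens that come from the context
        if off_start <= answer_char_start < off_end:
            start_token = i
        if off_start < answer_char_end <= off_end:
            end_token = i
    return start_token, end_token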
And here is the code I use for generating responses with GPT-2:
from transformers import GPT2Tokenizer, GPT2DoubleHeadsModel
import torch
import time
import sys

tokenizer = GPT2Tokenizer.from_pretrained('path_to_trained_model')
model = GPT2DoubleHeadsModel.from_pretrained('path_to_trained_model').to('cuda')

while True:
    input_text = input("Your request here: ")
    if input_text == 'exit' or input_text == 'quit':
        sys.exit(0)
    time_start = time.time()
    input_ids = tokenizer.encode(input_text, return_tensors='pt').to('cuda')
    attention_mask = torch.ones(input_ids.shape, dtype=torch.long, device='cuda')
    output = model.generate(input_ids, attention_mask=attention_mask, max_length=100, do_sample=False, no_repeat_ngram_size=2, early_stopping=True)
    output_text = tokenizer.decode(output[0], skip_special_tokens=True)
    print(output_text)
    print(time.time() - time_start)
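I also wonder whether part of the problem is that at inference time I only pass the bare question, with no context, even though every training example was a question + context pair. Something like this is what I would try instead (untested; the context string is just a stand-in, and model/tokenizer are the ones loaded above):

# Untested idea: give the model the same question + context layout it saw during training.
context = "Some paragraph that actually contains the answer..."  # stand-in text
question = "who are you?"
prompt_ids = tokenizer.encode(question, context, return_tensors='pt').to('cuda')
attention_mask = torch.ones(prompt_ids.shape, dtype=torch.long, device='cuda')
output = model.generate(prompt_ids, attention_mask=attention_mask,
                        max_length=prompt_ids.shape[1] + 50,
                        do_sample=False, no_repeat_ngram_size=2)
print(tokenizer.decode(output[0], skip_special_tokens=True))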
I am only a beginner in AI, machine learning, etc., and I want to do a project for which I need a model trained for my own purposes.
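Coming back to the GTX 1080 question, this is the rough back-of-envelope estimate that makes me doubt the 1.5B model can be fine-tuned on 8 GB at all (assuming plain fp32 training with Adam and ignoring activations, so I may be off):

# Very rough estimate, my assumptions only: fp32 weights + gradients + Adam states, no activations.
params = 1.5e9                 # gpt2-xl parameter count
weights = params * 4           # fp32 weights
grads = params * 4             # gradients
adam_states = params * 8       # Adam keeps two fp32 moments per parameter
total_gb = (weights + grads + adam_states) / 1024**3
print(f"~{total_gb:.0f} GB before activations")  # roughly 22 GB, far above 8 GB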