I am experimenting with GPT-2's question-answering abilities, aiming to build a good task-based chatbot. I trained my model on the air_dialogue dataset from Hugging Face (air_dialogue · Datasets at Hugging Face), using the code from this repo: GitHub - Pawandeep-prog/finetuned-gpt2-convai. I changed a few lines to tailor it to my use case. The full code is shown below:
!pip3 install transformers
!pip3 install datasets
!pip3 install torch
!pip3 install torchvision
from torch.utils.data import Dataset
from transformers import GPT2LMHeadModel, GPT2Tokenizer
from torch.optim import Adam
from torch.utils.data import DataLoader
import tqdm
import torch
from datasets import load_dataset
dataset = load_dataset("air_dialogue")
dataset['train']['dialogue'][1:10]  # peek at a few dialogues

trainDataset = []
for row in dataset['train']['dialogue'][0:2000]:
    for rows in row:
        if rows[0] == "a":  # agent turn: mark the end of the exchange
            trainDataset.append(rows + " <EOL>")
        else:               # customer turn: keep as-is
            trainDataset.append(rows)
trainDataset
class ChatData(Dataset):
    def __init__(self, dataset, tokenizer):
        self.data = dataset
        # tokenize everything up front; return_tensors="pt" so the
        # DataLoader can collate samples into [batch, seq_len] tensors
        self.X_encoded = tokenizer(self.data, truncation=True, padding=True, return_tensors="pt")
        self.input_ids = self.X_encoded['input_ids']
        self.attention_mask = self.X_encoded['attention_mask']

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        input_ids = self.input_ids[idx]
        attention_mask = self.attention_mask[idx]
        return input_ids, attention_mask
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.add_special_tokens({"bos_token": "customer: ",
                              "pad_token": "<pad>",
                              "eos_token": "<EOL>"})
tokenizer.add_tokens(["agent: "])  # add_tokens expects a list of strings, not a set
chatData = ChatData(trainDataset, tokenizer)
chatData = DataLoader(chatData, batch_size=5)
model = GPT2LMHeadModel.from_pretrained("gpt2")
model.resize_token_embeddings(len(tokenizer))
optim = Adam(model.parameters())
device = torch.device("cuda" if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu")
def infer(inp):
    # the prompt must use the same "customer: " / "agent: " markers the tokenizer was given
    inp = "customer: " + inp + " agent: "
    inp = tokenizer(inp, return_tensors="pt")
    X = inp["input_ids"].to(device)
    a = inp["attention_mask"].to(device)
    output = model.generate(X, attention_mask=a)  # greedy decoding with default settings
    output = tokenizer.decode(output[0])
    return output
def train(chatData, model, optim):
    epochs = 25
    for i in tqdm.tqdm(range(epochs)):
        for step, (X, a) in enumerate(chatData):
            X = X.to(device)  # full [batch, seq_len] batch, not just X[0]
            a = a.to(device)
            optim.zero_grad()
            loss = model(X, attention_mask=a, labels=X).loss
            loss.backward()
            optim.step()
        torch.save(model.state_dict(), "model_state.pt")  # checkpoint after each epoch
        print(infer("hi"))
model = model.to(device)
train(chatData, model, optim)
I ran inference on "hi", expecting a meaningful response. However, I got the following:
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
<SOL> hey, i need help with a plane ticket< <SOL> < <SOL> < <SOL> < <SOL> < <SOL>
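For context, generate() is running with its defaults here: greedy decoding, max_length=20, and no pad_token_id, which is what triggers the warning above. Below is a minimal sketch of what passing explicit generation settings would look like; the parameter values are illustrative guesses, not something from the repo:

output = model.generate(
    X,
    attention_mask=a,
    max_new_tokens=40,                    # allow a longer reply than the 20-token default
    pad_token_id=tokenizer.pad_token_id,  # silences the pad_token_id warning
    eos_token_id=tokenizer.eos_token_id,  # stop generating at <EOL>
    no_repeat_ngram_size=3,               # discourage the repeating-token loop
)
print(tokenizer.decode(output[0], skip_special_tokens=True))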
The local machine used for training is a base-model 2020 MacBook Air (M1).
Once I noticed the problem, I lowered the number of epochs to speed up training and get faster feedback while debugging. I also reduced the batch size because training kept using too much RAM.
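If RAM is the constraint, one standard workaround (just a sketch, not part of the repo's code; it reuses the model, optim, chatData, and device variables from above) is gradient accumulation: keep the small batch size but only step the optimizer every few batches, so the effective batch size grows without the memory cost:

epochs = 25
accum_steps = 4  # illustrative; effective batch size becomes 5 * 4 = 20
for i in tqdm.tqdm(range(epochs)):
    optim.zero_grad()
    for step, (X, a) in enumerate(chatData):
        X, a = X.to(device), a.to(device)
        loss = model(X, attention_mask=a, labels=X).loss
        (loss / accum_steps).backward()   # scale so the accumulated gradients average out
        if (step + 1) % accum_steps == 0:
            optim.step()
            optim.zero_grad()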