Non-meaningful response from finetuned GPT-2 model


I am experimenting with GPT-2's question-answering abilities, aiming to build a good task-based chatbot. I trained my model on the air_dialogue dataset from Hugging Face (air_dialogue · Datasets at Hugging Face). I used the code from this repo, GitHub - Pawandeep-prog/finetuned-gpt2-convai, but changed a few lines to tailor it to my use case. The full code is shown below.

!pip3 install transformers
!pip3 install datasets
!pip3 install torch
!pip3 install torchvision

from torch.utils.data import Dataset
from transformers import GPT2LMHeadModel, GPT2Tokenizer
from torch.optim import Adam
from torch.utils.data import DataLoader
import tqdm
import torch

from datasets import load_dataset

dataset = load_dataset("air_dialogue")

dataset['train']['dialogue'][1:10]

trainDataset = []
# take the first 2000 dialogues; append <EOL> to agent turns (lines starting with "a")
for row in dataset['train']['dialogue'][0:2000]:
  for rows in row:
    if rows[0] == "a":
      trainDataset.append(rows + " <EOL>")
    else:
      trainDataset.append(rows)

trainDataset

class ChatData(Dataset):
  def __init__(self, dataset, tokenizer):
    self.data = dataset
    # tokenize the whole dataset up front; pad to the longest example and truncate to the model limit
    self.X_encoded = tokenizer(self.data, truncation=True, padding=True)
    self.input_ids = self.X_encoded['input_ids']
    self.attention_mask = self.X_encoded['attention_mask']

  def __len__(self):
    return len(self.data)

  def __getitem__(self,idx):
    input_ids = self.input_ids[idx]
    attention_mask = self.attention_mask[idx]
    return input_ids, attention_mask

tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.add_special_tokens({"bos_token": "customer: ",
                              "pad_token": "<pad>",
                              "eos_token": "<EOL>"})
tokenizer.add_tokens(["agent: "])

chatData = ChatData(trainDataset, tokenizer)
chatData = DataLoader(chatData, batch_size=5)

model = GPT2LMHeadModel.from_pretrained("gpt2")
model.resize_token_embeddings(len(tokenizer))

optim = Adam(model.parameters())

device = torch.device("cuda" if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu")

def infer(inp):
    inp = "customer:"+inp+" agent: "
    inp = tokenizer(inp, return_tensors="pt")
    X = inp["input_ids"].to(device)
    a = inp["attention_mask"].to(device)
    output = model.generate(X, attention_mask=a )
    output = tokenizer.decode(output[0])
    return output

def train(chatData, model, optim):
    epochs = 25

    for i in tqdm.tqdm(range(epochs)):
        for step, (X, a) in enumerate(chatData):
            X = X[0].to(device)
            a = a[0].to(device)
            optim.zero_grad()
            loss = model(X, attention_mask=a, labels=X).loss
            loss.backward()
            optim.step()
        torch.save(model.state_dict(), "model_state.pt")
        print(infer("hi"))

model = model.to(device)
train(chatData, model, optim)

I ran inference on “hi”, expecting a meaningful response, but got the following:

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
<SOL> hey, i need help with a plane ticket< <SOL> < <SOL> < <SOL> < <SOL> < <SOL>
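
The warning about `pad_token_id` makes me suspect generation is padding with GPT-2's default end-of-text token rather than the `<pad>` token I added. My assumption is that I can pass it explicitly to `generate`; something like this is what I have in mind (the `max_new_tokens` value is just an illustrative choice):

def infer(inp):
    inp = "customer:" + inp + " agent: "
    enc = tokenizer(inp, return_tensors="pt")
    X = enc["input_ids"].to(device)
    a = enc["attention_mask"].to(device)
    # pass the pad token explicitly and cap the length of the reply
    output = model.generate(X,
                            attention_mask=a,
                            pad_token_id=tokenizer.pad_token_id,
                            max_new_tokens=40)
    return tokenizer.decode(output[0])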

The local machine used for training is a base-model 2020 MacBook Air (M1).

Once I noticed the problem, I set the number of epochs to a low value to speed up training and get faster feedback while debugging. The batch size was reduced because training kept using too much RAM.
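
If the small batch size turns out to matter, the workaround I have in mind is gradient accumulation, so the effective batch stays larger without the extra RAM. This is only a rough sketch (accum_steps is a placeholder value, and I'm assuming X and a come out of the DataLoader as (batch, seq_len) tensors):

def train(chatData, model, optim, accum_steps=4):  # accum_steps is a placeholder value
    epochs = 25
    for i in tqdm.tqdm(range(epochs)):
        optim.zero_grad()
        for step, (X, a) in enumerate(chatData):
            # assuming X and a are already (batch, seq_len) tensors here
            X = X.to(device)
            a = a.to(device)
            loss = model(X, attention_mask=a, labels=X).loss
            (loss / accum_steps).backward()  # scale so the summed gradients match one larger batch
            if (step + 1) % accum_steps == 0:
                optim.step()        # one optimizer update per accum_steps mini-batches
                optim.zero_grad()
        torch.save(model.state_dict(), "model_state.pt")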