Problems with gpt-neo training

Hey guys! I’m trying to train a gpt-neo-125M model on approximately 10k articles about psychology. The articles live in the /input/scrape_out dir, basically one .json file per article containing a “text” field. The median length is 5264 words per article, but for now it’s fine to truncate everything to 2048 tokens :innocent:.
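
For context, this is roughly how I peek at a single article (just a sketch, the real filenames are different; the only field I rely on is “text”):

import json
from pathlib import Path

path = Path('/input/scrape_out')
sample_file = sorted(path.glob('*.json'))[0]  # grab any one article

with open(sample_file) as fp:
    article = json.load(fp)

print(article.keys())                # the only field I use is 'text'
print(len(article['text'].split()))  # rough word count for this article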

The problem is: I keep getting an “index out of range in self” error, and I don’t understand why, since the datasets look fine (I think) and the truncation matches the model’s max_length (trying a smaller value like max_length=512 doesn’t help either).

Any suggestions on how to debug this? (I’ve put a sanity check I was planning to try at the very end of the post.)

The script is based on the “Netflix description generator” example; the only difference is the data collator.

!pip install transformers datasets -q

import os
from pathlib import Path
from datasets import load_dataset

path = Path('/input/scrape_out')
file_list = os.listdir(path)
print(len(file_list))

import json
import re
from sklearn.model_selection import train_test_split
from tqdm import tqdm

train_files, test_files = train_test_split(file_list[:1000])  # only the first 1000 articles for now

def text_clean(text):
    temp = text
    temp = temp.replace('\n', ' ')
    temp = temp.replace(u'\xa0', u' ')
    # drop the recurring "Messaggio pubblicitario" (advertisement) boilerplate
    temp = temp.replace('Messaggio pubblicitario', '')
    temp = re.sub(r'\s+', ' ', temp)
    return temp

def build_text_file(file_list, file_name):
    # write one cleaned article per line; the with-blocks close the files,
    # so no explicit close() calls are needed
    with open(file_name, 'w') as fp_out:
        for file in tqdm(file_list):
            with open(path / file) as fp_in:
                data = json.load(fp_in)
                fp_out.write(text_clean(data['text']))
                fp_out.write('\n')

build_text_file(train_files, 'train.txt')
build_text_file(test_files, 'test.txt')

dataset = load_dataset('text',data_files={'train':'train.txt','test':'test.txt'}, encoding='utf-8')
dataset

from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

ckpt = 'EleutherAI/gpt-neo-125M'

tokenizer = AutoTokenizer.from_pretrained(ckpt, 
                                          bos_token='<|startoftext|>',
                                          eos_token='<|endoftext|>', 
                                          pad_token='<|pad|>')
model = AutoModelForCausalLM.from_pretrained(ckpt)

import torch
from torch.utils.data import Dataset

class PsyDataset(Dataset):
    def __init__(self, dataset, tokenizer, max_length):
        self.input_ids = []
        self.attn_masks = []
        for txt in dataset:
            # wrap each article in the custom BOS/EOS tokens, then truncate
            # and pad to a fixed length
            input_string = '<|startoftext|>' + txt['text'] + '<|endoftext|>'
            encodings_dict = tokenizer(
                input_string,
                truncation=True,
                max_length=max_length,
                padding="max_length")
            self.input_ids.append(encodings_dict['input_ids'])
            self.attn_masks.append(encodings_dict['attention_mask'])

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        return {'input_ids': self.input_ids[idx], 
                'attention_mask': self.attn_masks[idx]}

train_ds = PsyDataset(dataset['train'], tokenizer, 2048)
test_ds = PsyDataset(dataset['test'], tokenizer, 2048)

print(len(train_ds))
print(len(train_ds[0]['input_ids']))

from transformers import Trainer, TrainingArguments, DataCollatorForLanguageModeling

dc = DataCollatorForLanguageModeling(tokenizer, mlm=False)  # causal LM: the collator builds labels from input_ids

training_args = TrainingArguments(
    output_dir="./psy-gpt",  # the output directory
    #overwrite_output_dir=True,  # overwrite the content of the output directory
    num_train_epochs=2,  # number of training epochs
    #per_device_train_batch_size=16,  # batch size for training
    #per_device_eval_batch_size=32,  # batch size for evaluation
    #eval_steps=400,  # number of update steps between two evaluations
    save_steps=5000,  # save a checkpoint every 5000 steps
    #warmup_steps=500,  # number of warmup steps for the learning rate scheduler
    #prediction_loss_only=True
    #report_to="tensorboard"
    )

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=dc,
    train_dataset=train_ds,
    eval_dataset=test_ds,
)


os.environ["WANDB_DISABLED"] = "true"  # disable Weights & Biases logging
trainer.train()
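
For what it’s worth, here is the sanity check I was planning to run after building train_ds (just a sketch, assuming the error comes from the embedding lookup, i.e. some token id being larger than the model’s embedding table):

# sanity-check sketch: compare the largest token id the tokenizer produces
# (including the special tokens added above) with the size of the model's
# input embedding matrix and the tokenizer's vocabulary
max_id = max(max(ids) for ids in train_ds.input_ids)
emb_rows = model.get_input_embeddings().num_embeddings

print('largest token id :', max_id)
print('embedding rows   :', emb_rows)
print('len(tokenizer)   :', len(tokenizer))
# an embedding lookup with max_id >= emb_rows would raise
# "index out of range in self"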