Hey guys! I’m trying to train a gpt-neo-125M model on roughly 10k articles about psychology. The articles live in the /input/scrape_out dir, one .json file per article, each containing a “text” field. The median article length is 5,264 words, but for now it’s fine to truncate everything at 2,048 tokens (the model’s max length).
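For context, this is roughly how I inspect a single article (just a sketch; apart from “text” the exact fields in my files may vary):

import json
from pathlib import Path
sample_file = next(Path('/input/scrape_out').glob('*.json'))  # pick any one article
sample = json.loads(sample_file.read_text())
print(sample.keys())                # contains at least the "text" field
print(len(sample['text'].split()))  # word count; the median over all files is ~5264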
The problem is that I keep getting an “index out of range in self” error, and I don’t understand why: the datasets look fine (I think) and the truncation matches the model’s max_length (trying a smaller value like max_length=512 doesn’t help either).
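To be concrete, this is the kind of sanity check behind “the datasets look fine” (a rough sketch, using the train_ds built further down in the script):

sample = train_ds[0]
print(len(sample['input_ids']), len(sample['attention_mask']))  # both come back equal to max_length
print(sample['input_ids'][:10])                                 # first few token ids of the first example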
Any suggestions on how to debug this?
The script is based on the “Netflix description generator” example; the only difference is in the data collator.
!pip install transformers datasets -q
import os
from pathlib import Path
from datasets import load_dataset
path = Path('/input/scrape_out')
file_list = os.listdir(path)
print(len(file_list))
import json
import re
from sklearn.model_selection import train_test_split
from tqdm import tqdm
train_files, test_files = train_test_split(file_list[:1000])  # only the first 1000 articles for now
def text_clean(text):
    # strip newlines, non-breaking spaces and the recurring ad banner, then collapse whitespace
    temp = text
    temp = temp.replace('\n', ' ')
    temp = temp.replace(u'\xa0', u' ')
    temp = temp.replace('Messaggio pubblicitario', '')
    temp = re.sub(r'\s+', ' ', temp)
    return temp
def build_json(file_list, file_name):
    # write one cleaned article per line
    with open(file_name, 'w') as fp_out:
        for file in tqdm(file_list):
            with open(str(path / file)) as fp_in:
                data = json.load(fp_in)
                fp_out.write(text_clean(data['text']))
                fp_out.write('\n')
build_json(train_files,'train.txt')
build_json(test_files,'test.txt')
dataset = load_dataset('text',data_files={'train':'train.txt','test':'test.txt'}, encoding='utf-8')
dataset
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
ckpt = 'EleutherAI/gpt-neo-125M'
tokenizer = AutoTokenizer.from_pretrained(
    ckpt,
    bos_token='<|startoftext|>',
    eos_token='<|endoftext|>',
    pad_token='<|pad|>')
model = AutoModelForCausalLM.from_pretrained(ckpt)
import torch
from torch.utils.data import Dataset
class PsyDataset(Dataset):
    def __init__(self, dataset, tokenizer, max_length):
        self.input_ids = []
        self.attn_masks = []
        self.labels = []
        for txt in dataset:
            # wrap each article in the custom BOS/EOS tokens, truncate and pad to max_length
            input_string = '<|startoftext|>' + txt['text'] + '<|endoftext|>'
            encodings_dict = tokenizer(input_string,
                                       truncation=True,
                                       max_length=max_length,
                                       padding="max_length")
            self.input_ids.append(encodings_dict['input_ids'])
            self.attn_masks.append(encodings_dict['attention_mask'])

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        return {'input_ids': self.input_ids[idx],
                'attention_mask': self.attn_masks[idx]}
train_ds = PsyDataset(dataset['train'], tokenizer, 2048)  # max_length matches the model's 2048-token context
test_ds = PsyDataset(dataset['test'], tokenizer, 2048)
print(len(train_ds))
print(len(train_ds[0]['input_ids']))
from transformers import Trainer, TrainingArguments, DataCollatorForLanguageModeling
dc = DataCollatorForLanguageModeling(tokenizer, mlm=False)  # mlm=False -> causal LM; the collator builds the labels from input_ids
training_args = TrainingArguments(
    output_dir="./psy-gpt",           # the output directory
    #overwrite_output_dir=True,       # overwrite the content of the output directory
    num_train_epochs=2,               # number of training epochs
    #per_device_train_batch_size=16,  # batch size for training
    #per_device_eval_batch_size=32,   # batch size for evaluation
    #eval_steps=400,                  # number of update steps between two evaluations
    save_steps=5000,                  # save the model every # steps
    #warmup_steps=500,                # number of warmup steps for the learning rate scheduler
    #prediction_loss_only=True,
    #report_to="tensorboard",
)
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=dc,
    train_dataset=train_ds,
    eval_dataset=test_ds,
)
import os
os.environ["WANDB_DISABLED"] = "true"
trainer.train()