IndexError: index out of range in self while training a language model from scratch

Hello,

I am getting an “IndexError: index out of range in self” error when I try to train a language model from scratch. I retrained my own tokenizer on the training set, thinking that was the issue, but the error persists.

This is the model, built from an OPTConfig:

from transformers import OPTConfig, OPTForCausalLM

# Initializing an OPT (facebook/opt-large style) configuration
configuration = OPTConfig()

# Shrinking the configuration and initializing a model with random weights
configuration.num_hidden_layers = 1
configuration.ffn_dim = 576
configuration.hidden_size = 192
configuration.max_position_embeddings = 256
configuration.vocab_size = 50272
model = OPTForCausalLM(configuration)
model_size = sum(t.numel() for t in model.parameters())
print(f"Model size: {model_size/1000**2:.1f}M parameters")

# Accessing the model configuration
configuration = model.config
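
For completeness, here is a small sketch (continuing from the snippet above, not part of my original script) of how the embedding table and the position limit could be checked against the config values:

# Sketch (not in my original script): confirm the model's embedding table
# and position limit match the config values set above.
embed = model.get_input_embeddings()
print("token embedding shape:", tuple(embed.weight.shape))   # number of rows should equal vocab_size (50272)
print("max_position_embeddings:", model.config.max_position_embeddings)   # 256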

This is the tokenizer:

from datasets import load_dataset
datasets = load_dataset("text", data_files={"train": "children_stories.train", "val": "children_stories.dev"})


from transformers import AutoTokenizer

context_length = 256
tokenizer = AutoTokenizer.from_pretrained("gpt2")

batch_size = 128
def batch_iterator():
    for i in range(0, len(datasets["train"]), batch_size):
        yield datasets["train"][i : i + batch_size]["text"]

new_tokenizer = tokenizer.train_new_from_iterator(batch_iterator(), vocab_size=50272)

def tokenize_function(examples):
    return new_tokenizer(examples["text"])

tokenized_datasets = datasets.map(tokenize_function, batched=True, num_proc=4, remove_columns=["text"])
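
Since the error message mentions an index being out of range, here is a sketch (again continuing from the snippets above, not something I have in the script yet) of how the maximum token id and the longest tokenized sequence could be compared against the config:

# Sketch: compare token ids and sequence lengths with the model config
# (assumes tokenized_datasets and configuration from above).
train_ids = tokenized_datasets["train"]["input_ids"]
max_id = max(max(ids) for ids in train_ids if ids)   # skip empty lines
max_len = max(len(ids) for ids in train_ids)
print("max token id:", max_id, "vs vocab_size:", configuration.vocab_size)
print("longest sequence:", max_len, "vs max_position_embeddings:", configuration.max_position_embeddings)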

And this is the training:

from transformers import DataCollatorForLanguageModeling

new_tokenizer.pad_token = new_tokenizer.eos_token
data_collator = DataCollatorForLanguageModeling(new_tokenizer, mlm=False)
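
For reference, a small sketch of what I expect the collator to produce from two tokenized examples (assuming the objects defined above):

# Sketch: inspect one collated batch (assumes new_tokenizer, data_collator
# and tokenized_datasets from above).
features = [tokenized_datasets["train"][i] for i in range(2)]
batch = data_collator(features)
print({k: tuple(v.shape) for k, v in batch.items()})   # input_ids, attention_mask, labels
print("pad token id:", new_tokenizer.pad_token_id)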


from transformers import Trainer, TrainingArguments

args = TrainingArguments(
    output_dir="codeparrot-ds",
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    evaluation_strategy="steps",
    eval_steps=5_000,
    logging_steps=5_000,
    # gradient_accumulation_steps=4,
    num_train_epochs=1,
    weight_decay=0.1,
    warmup_steps=2_000,
    lr_scheduler_type="cosine",
    learning_rate=1e-3,
    save_steps=5_000,
    fp16=False,
    push_to_hub=True,
)

trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    args=args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["val"],
)
print("Trainer device:",trainer.args.device)
print("\nTraining starts!\n")
trainer.train()

The error is raised at the trainer.train() line. What could be the reason?
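
In case it helps narrow things down, this is a sketch (not yet part of the script) of how a single collated batch could be pushed through the model outside the Trainer, to see whether the failure happens in the embedding lookup:

import torch

# Sketch: forward one collated batch manually to localize the IndexError
# (assumes model, data_collator and tokenized_datasets from above).
batch = data_collator([tokenized_datasets["train"][0]])
with torch.no_grad():
    out = model(**batch)
print("single-batch loss:", out.loss.item())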

Thank you very much in advance.