Hello,
I am getting an “IndexError: index out of range in self” error while trying to train a language model from scratch. I trained my own tokenizer on the training set, thinking a vocabulary mismatch was the issue, but that did not resolve it.
This is the model, built from an OPTConfig:
from transformers import OPTConfig, OPTForCausalLM

# Initializing an OPT (facebook/opt-large style) configuration
configuration = OPTConfig()
# Overriding a few fields, then initializing a model (with random weights) from the configuration
configuration.num_hidden_layers = 1
configuration.ffn_dim = 576
configuration.hidden_size = 192
configuration.max_position_embeddings = 256
configuration.vocab_size = 50272
model = OPTForCausalLM(configuration)
model_size = sum(t.numel() for t in model.parameters())
print(f"OPT size: {model_size/1000**2:.1f}M parameters")
# Accessing the model configuration
configuration = model.config
This is the tokenizer:
from datasets import load_dataset
datasets = load_dataset("text", data_files={"train": "children_stories.train", "val": "children_stories.dev"})
from transformers import AutoTokenizer
context_length = 256
tokenizer = AutoTokenizer.from_pretrained("gpt2")
batch_size = 128
all_texts = [datasets["train"][i : i + batch_size]["text"] for i in range(0, len(datasets["train"]), batch_size)]
def batch_iterator():
    for i in range(0, len(datasets["train"]), batch_size):
        yield datasets["train"][i : i + batch_size]["text"]
new_tokenizer = tokenizer.train_new_from_iterator(batch_iterator(), vocab_size=50272)
def tokenize_function(examples):
    return new_tokenizer(examples["text"])
tokenized_datasets = datasets.map(tokenize_function, batched=True, num_proc=4, remove_columns=["text"])
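In case it helps, this is a quick sanity check I can run on the tokenized training split (just a throwaway snippet; longest and largest_id are arbitrary names I made up) to compare the data against the limits set in the config:
# Throwaway sanity check: longest sequence and largest token id in the training split
train_ids = tokenized_datasets["train"]["input_ids"]
longest = max(len(ids) for ids in train_ids)
largest_id = max(max(ids) for ids in train_ids if ids)
print("longest sequence:", longest, "| max_position_embeddings:", configuration.max_position_embeddings)
print("largest token id:", largest_id, "| vocab_size:", configuration.vocab_size)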
And this is the training:
from transformers import DataCollatorForLanguageModeling
new_tokenizer.pad_token = new_tokenizer.eos_token
data_collator = DataCollatorForLanguageModeling(new_tokenizer, mlm=False)
from transformers import Trainer, TrainingArguments
args = TrainingArguments(
    output_dir="codeparrot-ds",
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    evaluation_strategy="steps",
    eval_steps=5_000,
    logging_steps=5_000,
    # gradient_accumulation_steps=4,
    num_train_epochs=1,
    weight_decay=0.1,
    warmup_steps=2_000,
    lr_scheduler_type="cosine",
    learning_rate=1e-3,
    save_steps=5_000,
    fp16=False,
    push_to_hub=True,
)
trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    args=args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["val"],
)
print("Trainer device:",trainer.args.device)
print("\nTraining starts!\n")
trainer.train()
The error is raised at trainer.train(). What could be the reason?
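If it is useful, a few examples could also be pulled through the collator by hand before training (again just a throwaway snippet, not something I am sure is the right diagnostic) to look at the batch shapes and token ids:
# Throwaway check: collate a few examples and inspect the resulting shapes and ids
sample = [tokenized_datasets["train"][i] for i in range(4)]
batch = data_collator(sample)
print("batch input_ids shape:", batch["input_ids"].shape)
print("max token id in batch:", batch["input_ids"].max().item())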
Thank you very much in advance.