Hello… It’s my first time using Hugging Face, so I could really use some help. I trained a BPE tokenizer following a Colab tutorial, and it seems to load correctly. In the example below you can see that it adds a start token at the beginning of the string ([CLS] = 0), a mask token in the middle ([MASK] = 4), an end-of-sequence token at the end ([SEP] = 2), and of course the padding tokens to reach the 512 length.
#### Imports (added for completeness; I am assuming Lamb comes from the torch-optimizer package) ####
import numpy as np
import pandas as pd
import torch
import torch_optimizer as optim  # provides optim.Lamb
from tqdm import tqdm
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from transformers import (
    DataCollatorForLanguageModeling, EarlyStoppingCallback, RobertaConfig,
    RobertaForMaskedLM, RobertaTokenizerFast, Trainer, TrainingArguments,
    get_polynomial_decay_schedule_with_warmup,
)
#### Loading Tokenizer ####
tokenizer = RobertaTokenizerFast.from_pretrained('./BPE',max_len=512)
#### Testing Tokenizer ####
string = "MEPTKIVENLYLGNIQNGIRHSNYGFDKIINLTRFNNQYGIPTVWINID<mask>SESSDLYSHLQKVTTLIHDSIE!GNKVLVHCQAGISRSATVVIAYIMRSKRY"
inputs = tokenizer(string, max_length=512, padding="max_length", truncation=True, return_tensors="pt")
inputs
{'input_ids': tensor([[ 0, 142, 228, 4136, 59, 595, 1888, 86, 101, 163, 1844, 127,
1236, 59, 140, 1672, 1847, 106, 198, 3076, 73, 4, 8914, 41,
135, 96, 200, 7849, 192, 112, 70, 11, 216, 33, 154, 426,
4433, 111, 4650, 1307, 307, 111, 115, 28, 2, 1, 1, 1,
1, 1, 1, ... (the padding token 1 repeats out to length 512; output truncated here)
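For completeness, printing the special-token ids confirms the mapping described above (a minimal check against the tokenizer loaded earlier):
print(tokenizer.cls_token_id, tokenizer.mask_token_id, tokenizer.sep_token_id, tokenizer.pad_token_id)
# -> 0 4 2 1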
But when I try to pretrain my RoBERTa model I always get an IndexError. I checked my training and evaluation datasets: they have the correct shapes, and the token ids within them are within the expected range [0, 10001] (I have vocab_size = 10002; see the sanity check after the training code below). This is my code:
# Model Configurations
config = RobertaConfig(
vocab_size=10_002,
max_position_embeddings=512,
num_attention_heads=12,
num_hidden_layers=6,
)
# Create Model
model = RobertaForMaskedLM(config=config)
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)
# BUILD DATASETS
def encode(examples):
    """Tokenize one sequence to a fixed length of 512 with padding and truncation."""
    inputs = tokenizer(examples, max_length=512, padding="max_length", truncation=True, return_tensors="pt")
    # return_tensors="pt" adds a leading batch dimension of 1 (stripped later in __getitem__)
    inputs['labels'] = inputs.input_ids.detach().clone()
    return inputs
def tokenizing_function(dataset):
    """Read one sequence per line from a header-less CSV and tokenize each one."""
    data = pd.read_csv(dataset, header=None)
    result = []
    for i in tqdm(data[0], total=len(data)):
        result.append(encode(i))
    return result
train = tokenizing_function("./dataset/Train")
test = tokenizing_function("./dataset/Test")
val = tokenizing_function("./dataset/Val")
# DATA LOADER
# Despite the name, this is a torch Dataset; the Trainer wraps it in its own DataLoader
class DataLoader(torch.utils.data.Dataset):
    def __init__(self, encodings):
        self.encodings = encodings

    def __getitem__(self, idx):
        # strip the leading batch dimension that return_tensors="pt" added
        return {key: val[0] for key, val in self.encodings[idx].items()}

    def __len__(self):
        return len(self.encodings)
train_dataset = DataLoader(train)
test_dataset = DataLoader(test)
val_dataset = DataLoader(val)
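# Quick shape sanity check (sketch): each item should be a dict of 1-D tensors
# of length 512, since __getitem__ strips the batch dimension
sample = train_dataset[0]
print({k: tuple(v.shape) for k, v in sample.items()})
# expected: {'input_ids': (512,), 'attention_mask': (512,), 'labels': (512,)}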
data_collator = DataCollatorForLanguageModeling(
tokenizer=tokenizer, mlm=True, mlm_probability=0.15
)
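# As far as I understand, with mlm=True the collator masks ~15% of the tokens
# and builds its own labels (non-masked positions become -100), so the manual
# labels clone in encode() gets overwritten here. Sketch of a collated batch:
batch = data_collator([train_dataset[0], train_dataset[1]])
print(batch["input_ids"].shape, batch["labels"].shape)  # torch.Size([2, 512]) each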
# TRAINING WITH LAMB OPTIMIZER
optimizer = optim.Lamb(model.parameters(), lr=0.0025)
scheduler = get_polynomial_decay_schedule_with_warmup(optimizer=optimizer,
num_warmup_steps=3125,
num_training_steps=125000,
power=0.01)
def compute_metrics(p):
    pred, labels = p
    # logits have shape (batch, seq_len, vocab); argmax over the vocab axis
    pred = np.argmax(pred, axis=-1)
    pred, labels = pred.flatten(), labels.flatten()
    # multiclass targets need an explicit average for sklearn
    # (the -100 positions inserted by the collator would also need filtering)
    accuracy = accuracy_score(y_true=labels, y_pred=pred)
    recall = recall_score(y_true=labels, y_pred=pred, average="weighted")
    precision = precision_score(y_true=labels, y_pred=pred, average="weighted")
    f1 = f1_score(y_true=labels, y_pred=pred, average="weighted")
    return {"accuracy": accuracy, "precision": precision, "recall": recall, "f1": f1}
training_args = TrainingArguments(
output_dir="./ROBERTA/model/",
logging_dir='./ROBERTA/logs',
overwrite_output_dir=True,
evaluation_strategy="steps",
do_train=True,
do_eval=True,
num_train_epochs=10,
per_device_train_batch_size=32,
per_device_eval_batch_size=32,
save_steps=500,
logging_steps=1000,
eval_steps=250,
prediction_loss_only=True,  # note: this skips the logits, so compute_metrics above never runs
weight_decay=0.01,
load_best_model_at_end=True,
)
trainer = Trainer(
model=model,
args=training_args,
data_collator=data_collator,
train_dataset=train_dataset,
eval_dataset=val_dataset,
optimizers=(optimizer, scheduler),
compute_metrics=compute_metrics,
callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
)
# Start Training
trainer.train()
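For reference, this is roughly how I verified that every token id stays inside the vocabulary (a sketch over the lists built by tokenizing_function above):
all_ids = torch.cat([enc["input_ids"].flatten() for enc in train + test + val])
print(all_ids.min().item(), all_ids.max().item())  # 0 and 10001, i.e. within [0, 10001]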
I am working on proteins, and each protein is supposed to be a separate document; that is why I am building my own dataset class. When I run it I get this error:
***** Running training *****
Num examples = 5000
Num Epochs = 10
Instantaneous batch size per device = 32
Total train batch size (w. parallel, distributed & accumulation) = 32
Gradient Accumulation steps = 1
Total optimization steps = 1570
0%| | 0/1570 [00:00<?, ?it/s]Traceback (most recent call last):
File "./CLASSIFIER/venv/lib/python3.10/site-packages/transformers/trainer.py", line 2183, in training_step
loss = self.compute_loss(model, inputs)
File "./CLASSIFIER/venv/lib/python3.10/site-packages/transformers/trainer.py", line 2215, in compute_loss
outputs = model(**inputs)
File "./CLASSIFIER/venv/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1110, in _call_impl
return forward_call(*input, **kwargs)
File "./CLASSIFIER/venv/lib/python3.10/site-packages/transformers/models/roberta/modeling_roberta.py", line 1094, in forward
outputs = self.roberta(
File "./CLASSIFIER/venv/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1110, in _call_impl
return forward_call(*input, **kwargs)
File "./CLASSIFIER/venv/lib/python3.10/site-packages/transformers/models/roberta/modeling_roberta.py", line 840, in forward
embedding_output = self.embeddings(
File "./CLASSIFIER/venv/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1110, in _call_impl
return forward_call(*input, **kwargs)
File "./CLASSIFIER/venv/lib/python3.10/site-packages/transformers/models/roberta/modeling_roberta.py", line 133, in forward
position_embeddings = self.position_embeddings(position_ids)
File "./CLASSIFIER/venv/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1110, in _call_impl
return forward_call(*input, **kwargs)
File "./CLASSIFIER/venv/lib/python3.10/site-packages/torch/nn/modules/sparse.py", line 158, in forward
return F.embedding(
File "./CLASSIFIER/venv/lib/python3.10/site-packages/torch/nn/functional.py", line 2183, in embedding
return torch.embedding(weight, input, padding_idx, scale_grad_by_freq, sparse)
IndexError: index out of range in self
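In case it helps with debugging, pushing a single collated batch through the model by hand should hit the same code path as the first training step (a minimal sketch, assuming the objects defined above):
batch = data_collator([train_dataset[0]])
batch = {k: v.to(device) for k, v in batch.items()}
outputs = model(**batch)  # this should reproduce the same IndexError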
Any ideas why I am getting this error?
Thanks