Hi, I was trying to run BERT but was getting the error “IndexError: index out of range in self”. After troubleshooting for a couple of days I figured out it was the word “screwing” that was breaking my code. Is this a bug, or are there certain words you can't use with BERT? Or am I just doing something wrong? Thanks.
Here's the code example:
import transformers
import torch
import torch.nn as nn

class SentimentClassifier(nn.Module):
    def __init__(self, n_classes):
        super(SentimentClassifier, self).__init__()
        # BERT encoder, dropout, and a linear classification head
        self.bert = transformers.BertModel.from_pretrained('bert-base-cased')
        self.drop = nn.Dropout(p=0.3)
        self.out = nn.Linear(self.bert.config.hidden_size, n_classes)

    def forward(self, input_ids, mask):
        output = self.bert(
            input_ids=input_ids,
            attention_mask=mask
        )
        # classify from the pooled [CLS] representation
        output = self.drop(output['pooler_output'])
        return self.out(output)
This doesn't work (it raises the IndexError):
batch_sentences = [
    'screwing',
]
Running this works:
batch_sentences2 = [
    'this is a test sentence',
    'another one'
]
bert_model = transformers.BertModel.from_pretrained('bert-base-uncased')
tokenizer = transformers.BertTokenizer.from_pretrained('bert-base-uncased')
encoded_inputs = tokenizer(batch_sentences, padding=True, truncation=True, add_special_tokens=True)
samples = torch.tensor(encoded_inputs['input_ids'])
targets = torch.zeros(samples.shape[0]).long()
mask = (samples != 0)
print(samples.shape)
model = SentimentClassifier(3)
EPOCHS = 10
optimizer = transformers.AdamW(model.parameters(), lr=2e-5, correct_bias=False)
total_steps = 1 * EPOCHS
scheduler = transformers.get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps
)
criterion = nn.CrossEntropyLoss()
model = model.train()
for i in range(EPOCHS):
    print(i)
    preds = model(input_ids=samples, mask=mask)
    loss = criterion(preds, targets)
    print(loss)
    loss.backward()
    nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
    optimizer.step()
    scheduler.step()
    optimizer.zero_grad()
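In case it helps, here's the check I was going to run next (just a sketch, and I'm not sure it's the right place to look: I noticed my classifier loads 'bert-base-cased' while my tokenizer is 'bert-base-uncased', so maybe the IDs don't line up with the model's vocab):

import transformers

# Compare the IDs the uncased tokenizer produces for the failing word
# against the size of the cased model's embedding table. An ID outside
# that range is what makes nn.Embedding raise
# "IndexError: index out of range in self".
tokenizer = transformers.BertTokenizer.from_pretrained('bert-base-uncased')
model = transformers.BertModel.from_pretrained('bert-base-cased')

ids = tokenizer('screwing')['input_ids']
print(tokenizer.convert_ids_to_tokens(ids))  # how the word actually gets split
print(ids)                                   # the raw IDs fed to the model
print(model.config.vocab_size)               # size of the embedding table
print(max(ids) >= model.config.vocab_size)   # True would mean an out-of-range lookup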