My dataset is only 10 thousand sentences. I run it in batches of 100, and clear the memory on each run. I manually slice the sentences to only 50 characters. After running for 32 minutes, it crashes… On google colab with 25 gigs of ram.
I must be doing something terribly wrong.
I’m using the out-of-the-box model and tokenizer.
def eval_model(model, tokenizer_, X, y, batchsize, maxlength):
assert len(X) == len(y)
labels = ["negative", "positive"]
correctCounter = 0
epochs = int(np.ceil(len(dev_sent) / batchsize))
accuracies = []
for i in range(epochs):
print(f"Epoch {i}")
# slicing up the data into batches
X_ = X[i:((i+1)*100)]
X_ = [x[:maxlength] for x in X_] # make sure sentences are only of maxlength
y_ = y[i:((i+1)*100)]
encoded_input = tokenizer(X_, return_tensors='pt', padding=True, truncation=True, max_length=maxlength)
output = model(**encoded_input)
scores = output[0][0].detach().numpy()
scores = softmax(scores)
for i, scores in enumerate([softmax(logit) for logit in output["logits"].detach().numpy()]):
print("--------------------")
print(f'Sentence no. {len(accuracies)+1}')
print("Sentence: " + X_[i])
print("Score: " + str(scores))
ranking = np.argsort(scores)
print(f"Ranking: {ranking}")
pred = labels[np.argmax(np.argsort(scores))]
print(f"Prediction: {pred}, annotation: {y_[i]}")
if pred == y_[i]:
print("SUCCES!")
correctCounter += 1
else: print("FAILURE!")
# garbage collection (to not run out of ram... Which is shouldn't, it's just a few kb, but it does.... ?!)
del(encoded_input)
del(output)
del(scores)
gc.collect()
accuracies.append(correctCounter / len(y_))
#print(f"current accuracy: {np.mean(np.asarray(accuracies))}")
return np.mean(np.asarray(accuracies))
task='sentiment'
MODEL = f"distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(MODEL)
model = AutoModelForSequenceClassification.from_pretrained(MODEL)
model.save_pretrained(MODEL)
tokenizer.save_pretrained(MODEL)
accuracy = eval_model(model, tokenizer, dev_sent, dev_sentiment, 100, 50)