Solved after some debugging.
Turns out the culprit wasn’t in compute_metrics (although I did need to convert some of its outputs to float, by the way). It came from another innocent-looking block:
max_len_seq = data3.text.str.split("\\s+").str.len().max()  # <--- this thing is a numpy object (numpy.int64), not a Python int!
print(max_len_seq)
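You can reproduce the type problem in isolation. A minimal sketch, where df is a dummy DataFrame standing in for data3:

import json
import pandas as pd

df = pd.DataFrame({"text": ["two words", "three little words"]})
max_len = df.text.str.split("\\s+").str.len().max()
print(type(max_len))                 # <class 'numpy.int64'>, not <class 'int'>
json.dumps({"max_length": max_len})  # TypeError: Object of type int64 is not JSON serializable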
Which is used here:
def preprocess_function(examples):
    padding = "max_length"
    return tokenizer(examples["text"],
                     return_tensors='pt',
                     padding=padding,  # pad_to_max_length is deprecated; padding="max_length" already covers it
                     max_length=max_len_seq,  # default is 6144
                     truncation=True)
from transformers import LongformerTokenizerFast  # was AutoTokenizer, but the code below uses LongformerTokenizerFast

checkpoint = "allenai/longformer-base-4096"
tokenizer = LongformerTokenizerFast.from_pretrained(checkpoint, max_length=max_len_seq)
tokenized_dataset_text = dataset_split.map(preprocess_function, batched=True)
So I cast max_len_seq to an int, and then it worked. Again, it’s strange, because this problem never came up in another notebook that is 99% identical except for using different models.
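For reference, the fix was a one-line cast back to a plain Python int:

max_len_seq = int(data3.text.str.split("\\s+").str.len().max())  # numpy.int64 -> Python int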
Original issue below:
I keep getting this error when I train my model, and I’m not sure why.
Most Stack Overflow answers said it’s because the outputs going into the JSON need to be converted to Python int, so I did that, but I’m still getting this error… any idea why?
My compute_metrics function:
# Then create a function that passes your predictions and labels to compute to calculate the accuracy
import numpy as np
import evaluate
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def compute_metrics(eval_pred):
    labels = eval_pred.label_ids
    predictions = eval_pred.predictions.argmax(-1)
    print(f'\nlabels : {labels}')
    print(f'\npredictions : {predictions}')
    precision, recall, f1, _ = precision_recall_fscore_support(labels,
                                                               predictions,
                                                               average="binary")
    print(f'precision : {precision}')
    # this is the hugging face way, but we need to go further
    # predictions, labels = eval_pred
    # predictions = np.argmax(predictions, axis=1)
    # accuracy = evaluate.load("accuracy")
    # acc_dict = accuracy.compute(predictions=predictions, references=labels)
    # this way uses the sklearn library
    # (NOTE from the fix above: int() truncates fractional scores like 0.87 down to 0 -- these should be float())
    acc = int(accuracy_score(labels, predictions))
    t_idx = (predictions == labels)  # true predicted
    f_idx = np.logical_not(t_idx)    # false predicted
    p_idx = (labels > 0)             # pos targets
    n_idx = np.logical_not(p_idx)    # neg targets
    # true pos and neg
    # need to convert to int, otherwise we risk an error in JSON (JSON doesn't recognize numpy.int64)
    # https://stackoverflow.com/questions/50916422/python-typeerror-object-of-type-int64-is-not-json-serializable
    tp = int(np.sum(np.logical_and(t_idx, p_idx)))
    tn = int(np.sum(np.logical_and(t_idx, n_idx)))
    # false pos: should be neg but labeled as pos
    # false neg: should be pos but labeled as neg
    fp = int(np.sum(n_idx) - tn)
    fn = int(np.sum(p_idx) - tp)
    with np.errstate(divide="ignore"):
        sen = int((1.0 * tp) / (tp + fn))  # same truncation issue here
        spc = int((1.0 * tn) / (tn + fp))
    # D-Index = log_2(1 + accuracy) + log_2(1 + (sensitivity + specificity) / 2)
    d_index = int(np.log2(1 + acc) + np.log2(1 + (sen + spc) / 2))
    return {
        "accuracy" : int(acc),
        "precision" : int(precision),
        "recall" : int(recall),
        "f1" : int(f1),
        "d-index" : int(d_index)
    }
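(In hindsight, a more robust pattern than hand-casting every value is to sanitize the whole dict just before returning it. A minimal sketch; to_python_scalars is a hypothetical helper, not a transformers API:)

import numpy as np

def to_python_scalars(metrics):
    # convert any numpy scalar (np.int64, np.float64, ...) to the equivalent
    # plain Python scalar via .item(), so json.dumps never chokes on it;
    # unlike int(), this also preserves fractional values like 0.87
    return {k: (v.item() if isinstance(v, np.generic) else v)
            for k, v in metrics.items()}

Wrapping the return dict in to_python_scalars(...) would keep precision/recall/f1 intact as floats instead of truncating them.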
My model call:
# load model and tokenizer and define length of the text sequence
model = LongformerForSequenceClassification.from_pretrained(checkpoint,
                                                            gradient_checkpointing=False,  # turn on for less memory usage but longer training time
                                                            num_labels=2,
                                                            id2label=id2label,
                                                            label2id=label2id,
                                                            from_tf=True,
                                                            attention_window=512)  # original 512
My trainer call:
model_name = "legal_bert_test_long"
training_args = TrainingArguments(
output_dir = model_name,
num_train_epochs=20,
learning_rate=5e-5, # this is default, but also works well for text classfication: https://arxiv.org/pdf/1905.05583.pdf
gradient_accumulation_steps = 8, # turn on to save memory, but slower. But we need to!
per_device_train_batch_size = 8,
per_device_eval_batch_size = 16,
warmup_steps = 200,
weight_decay=0.01,
evaluation_strategy="epoch",
save_strategy="epoch",
load_best_model_at_end=True,
push_to_hub=True, # set to true to uploading
)
trainer = Trainer(
model=model,
args=training_args,
train_dataset=tokenized_dataset_text['train'],
eval_dataset=tokenized_dataset_text['test'],
tokenizer=tokenizer,
data_collator=data_collator,
compute_metrics=compute_metrics,
)
trainer.train()
The error msg:
/usr/local/lib/python3.9/dist-packages/sklearn/metrics/_classification.py:1344: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 due to no predicted samples. Use `zero_division` parameter to control this behavior.
_warn_prf(average, modifier, msg_start, len(result))
Saving model checkpoint to legal_bert_test_long/checkpoint-13
Configuration saved in legal_bert_test_long/checkpoint-13/config.json
Model weights saved in legal_bert_test_long/checkpoint-13/pytorch_model.bin
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-54-aa8b4e5908c8> in <module>
26 )
27
---> 28 trainer.train()
12 frames
/usr/lib/python3.9/json/encoder.py in default(self, o)
177
178 """
--> 179 raise TypeError(f'Object of type {o.__class__.__name__} '
180 f'is not JSON serializable')
181
TypeError: Object of type int64 is not JSON serializable
What’s puzzling to me is that I’ve run the exact same compute_metrics in another notebook and it ran fine without any errors… what gives? How do I fix this?