Instead of using a pre-made encoder such as sklearn's, which I found to be slow, I wrote a simple class like this:
class LabelEncoder:
    """Incrementally map hashable labels to dense integer IDs (0, 1, 2, ...).

    The mapping persists across calls to :meth:`encode`, so the encoder can be
    fed data batch by batch (e.g. via ``datasets.Dataset.map(batched=True)``)
    and still assign a unique, stable ID to each distinct label.
    """

    def __init__(self):
        # label -> integer ID; grows as new labels are seen.
        self.labels_to_int = {}

    def encode(self, labels):
        """Return a list of integer IDs for *labels*, assigning new IDs as needed.

        BUG FIX: the next ID is derived from the size of the persistent mapping,
        not from a counter reset to 0 on every call. The old per-call counter
        caused ID collisions when encode() was called once per batch: a new
        label in batch 2 would reuse ID 0 already taken by a label in batch 1.
        """
        encoded_labels = []
        for label in labels:
            if label not in self.labels_to_int:
                # First occurrence: next free ID is the current mapping size.
                self.labels_to_int[label] = len(self.labels_to_int)
            encoded_labels.append(self.labels_to_int[label])
        return encoded_labels
# One shared encoder so label IDs stay consistent across all batches.
label_encoder = LabelEncoder()


def _encode_labels(batch):
    """Add an integer-encoded copy of each batch's labels."""
    return {'labels_encoded': label_encoder.encode(batch['labels'])}


tokenized_dataset = tokenized_dataset.map(
    _encode_labels,
    batched=True,
    load_from_cache_file=False,
)