Converting string label to int

Rather than using a pre-made encoder such as the ones found in sklearn, which I found to be slow, I wrote a simple class like this:

class LabelEncoder:
    """Map hashable labels to consecutive integer ids.

    Ids are handed out in order of first appearance, starting at 0. The
    mapping is kept on the instance, so the same label receives the same id
    on every call to ``encode`` and a label first seen in a later call gets
    the next unused id instead of restarting from 0.
    """

    def __init__(self):
        # label -> integer id, shared by every encode() call on this instance.
        self.labels_to_int = {}

    def encode(self, labels):
        """Return the integer ids for ``labels``, registering unseen labels.

        Args:
            labels: Iterable of hashable labels (e.g. strings).

        Returns:
            A list with one integer id per input label, in input order.
            An empty iterable yields an empty list.
        """
        mapping = self.labels_to_int
        # Ids are consecutive from 0, so the current size of the mapping is
        # always the next free id (assuming the mapping is only filled via
        # encode). len(mapping) is evaluated before setdefault inserts, and
        # setdefault leaves an already-known label untouched.
        return [mapping.setdefault(label, len(mapping)) for label in labels]


# One encoder instance is shared by every invocation of the callback below.
label_encoder = LabelEncoder()


def _with_encoded_labels(batch):
    """Build the 'labels_encoded' column from a batch's 'labels' column."""
    return {'labels_encoded': label_encoder.encode(batch['labels'])}


# NOTE(review): tokenized_dataset is presumably a Hugging Face `datasets`
# object; with batched=True the callback is expected to receive a batch of
# rows per call rather than a single row -- confirm against the caller.
tokenized_dataset = tokenized_dataset.map(
    _with_encoded_labels,
    batched=True,
    load_from_cache_file=False,
)
1 Like