Converting string labels to ints

I’m using a custom dataset from a CSV file where the labels are strings. I’m curious what the best way to encode these labels to integers would be.

Sample code:

from datasets import load_dataset

datasets = load_dataset('csv', data_files={
    'train': 'train.csv',
    'test': 'test.csv'
})

def tokenize(batch):
    # tokenizer is assumed to be a pretrained tokenizer created earlier,
    # e.g. with AutoTokenizer.from_pretrained(...)
    return tokenizer(batch['text'], padding=True, truncation=True, max_length=128)

datasets = datasets.map(tokenize, batched=True)

datasets.set_format('torch', columns=['input_ids', 'attention_mask', 'labels'])

In your tokenize function, you can also add a line to convert your labels to ints:

def tokenize(batch):
    tokenized_batch = tokenizer(batch['text'], padding=True, truncation=True, max_length=128)
    tokenized_batch["labels"] = [str_to_int[label] for label in batch["labels"]]
    return tokenized_batch

where str_to_int is a dictionary mapping each string label to its integer label.
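For reference, a minimal sketch of building str_to_int from the training split (assuming the raw string labels live in the 'labels' column loaded above):

# A sketch: collect the distinct string labels and assign each one an integer id.
unique_labels = sorted(set(datasets['train']['labels']))
str_to_int = {label: i for i, label in enumerate(unique_labels)}
# e.g. {'negative': 0, 'neutral': 1, 'positive': 2} for a hypothetical three-class task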


Thank you for your reply. I ended up using your approach.

from datasets import ClassLabel, load_dataset

labels = ClassLabel(names_file='labels.txt')

datasets = load_dataset('csv', data_files={
    'train': 'train.csv',
    'test': 'test.csv'
})

def tokenize(batch):
    tokens = tokenizer(batch['text'], padding=True, truncation=True, max_length=128)
    tokens['labels'] = labels.str2int(batch['labels'])
    return tokens
    
datasets = datasets.map(tokenize, batched=True)

datasets.set_format('torch', columns=['input_ids', 'attention_mask', 'labels'])
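Just to note for anyone copying this: ClassLabel(names_file='labels.txt') expects a plain-text file with one label name per line, and the same mapping can be built inline with the names argument. A small sketch with hypothetical label names:

from datasets import ClassLabel

# Equivalent to a labels.txt containing one (hypothetical) label name per line:
#   negative
#   neutral
#   positive
labels = ClassLabel(names=['negative', 'neutral', 'positive'])

labels.str2int('neutral')   # -> 1
labels.int2str(2)           # -> 'positive'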

It seems I’m running into the same issue:
https://github.com/huggingface/transformers/issues/16975

Doing as suggested there:

from transformers import AutoTokenizer

#https://huggingface.co/microsoft/xtremedistil-l6-h256-uncased
model_name = 'microsoft/xtremedistil-l6-h256-uncased'
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize(batch):
    tokens = tokenizer(batch['text'], padding="max_length", truncation=True, max_length=128)
    tokens['label'] = features["label"].str2int(batch['label']) if batch["label"] is not None else None
    return tokens
tokenized_datasets = sentences.map(tokenize, batched=True)

I’m getting

---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
<ipython-input-39-3f04e6ec6f6e> in <module>()
     14     tokens['label'] = features["label"].str2int(batch['label']) if batch["label"] is not None else None
     15     return tokens
---> 16 tokenized_datasets = sentences.map(tokenize, batched=True)

10 frames
/usr/local/lib/python3.7/dist-packages/datasets/features/features.py in str2int(self, values)
    852                 if value not in self._str2int:
    853                     value = str(value).strip()
--> 854                 output.append(self._str2int[str(value)])
    855             else:
    856                 # No names provided, try to integerize

KeyError: 'None'

Hi! It seems like there are None values in the label column. To avoid the error, you can either drop such examples with .filter(..) or modify the tokenize function as follows:

def tokenize(batch):
    tokens = tokenizer(batch['text'], padding="max_length", truncation=True, max_length=128)
    tokens["label"] = [features["label"].str2int(label) if label is not None else None for label in batch["label"]]
    return tokens
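For the first option, the filtering could look roughly like this (a sketch, assuming the dataset and column names from the snippet above):

# Drop rows whose label is None before tokenizing, then map as before.
sentences = sentences.filter(lambda example: example['label'] is not None)
tokenized_datasets = sentences.map(tokenize, batched=True)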

As opposed to using pre-made encoders like those found in sklearn, which I found to be slow, I wrote a simple class like this:

class LabelEncoder:
    def __init__(self):
        self.labels_to_int = {}

    def encode(self, labels):
        encoded_labels = []

        for label in labels:
            if label not in self.labels_to_int:
                # Assign the next free integer id; deriving it from the dict size
                # keeps ids consistent across multiple encode() calls (e.g. batched map).
                self.labels_to_int[label] = len(self.labels_to_int)

            encoded_labels.append(self.labels_to_int[label])

        return encoded_labels


label_encoder = LabelEncoder()


tokenized_dataset = tokenized_dataset.map(
    lambda batch: {
        'labels_encoded': label_encoder.encode(batch['labels'])
    },
    load_from_cache_file=False,
    batched=True
)
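A quick sanity check of how the encoder behaves, with hypothetical label strings; the learned mapping can also be inverted to decode predictions back to the original labels:

encoder = LabelEncoder()
print(encoder.encode(['cat', 'dog', 'cat', 'bird']))   # -> [0, 1, 0, 2]

# Invert the learned mapping to decode integer predictions back to strings.
int_to_label = {i: label for label, i in encoder.labels_to_int.items()}
print(int_to_label[1])   # -> 'dog'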