Converting string label to int

I’m using a custom dataset from a CSV file where the labels are strings. I’m curious what the best way to encode these labels to integers would be.

Sample code:

# Load the 'train' and 'test' splits from two local CSV files.
datasets = load_dataset(
    'csv',
    data_files={'train': 'train.csv', 'test': 'test.csv'},
)

def tokenize(batch):
    """Run the tokenizer over batch['text'] (padding and truncation on, max_length=128)."""
    texts = batch['text']
    encoded = tokenizer(texts, padding=True, truncation=True, max_length=128)
    return encoded

# Apply `tokenize` to every split, passing batches of rows rather than single rows.
datasets = datasets.map(function=tokenize, batched=True)

# Switch the output format to 'torch', restricted to the listed columns.
datasets.set_format(
    type='torch',
    columns=['input_ids', 'attention_mask', 'labels'],
)

In your tokenize function, you can also add a line to convert your labels to ints:

def tokenize(batch):
    """Tokenize batch['text'] and replace the string labels with their int ids.

    Each entry of batch["labels"] is looked up in `str_to_int`, so a label
    missing from that mapping raises at lookup time.
    """
    encoded = tokenizer(batch['text'], padding=True, truncation=True, max_length=128)
    # Translate every string label in the batch to its integer id.
    label_ids = [str_to_int[name] for name in batch["labels"]]
    encoded["labels"] = label_ids
    return encoded

where str_to_int is your mapping (e.g. a dict) from each string label to its int label.

Thank you for your reply. I ended up using your approach.

from datasets import ClassLabel

# Build the label encoder from the class names stored in labels.txt
# (presumably one name per line — confirm against the ClassLabel docs).
labels = ClassLabel(names_file='labels.txt')

# Load the 'train' and 'test' splits from two local CSV files.
datasets = load_dataset(
    'csv',
    data_files={'train': 'train.csv', 'test': 'test.csv'},
)

def tokenize(batch):
    """Tokenize batch['text'] and encode batch['labels'] with `labels.str2int`."""
    encoded = tokenizer(batch['text'], padding=True, truncation=True, max_length=128)
    # Convert the whole batch of string labels in a single call.
    label_ids = labels.str2int(batch['labels'])
    encoded['labels'] = label_ids
    return encoded
    
# Apply `tokenize` to every split, passing batches of rows rather than single rows.
datasets = datasets.map(function=tokenize, batched=True)

# Switch the output format to 'torch', restricted to the listed columns.
datasets.set_format(
    type='torch',
    columns=['input_ids', 'attention_mask', 'labels'],
)