I’m using a custom dataset from a CSV file where the labels are strings. I’m curious what the best way to encode these labels to integers would be.
Sample code:
from datasets import load_dataset

datasets = load_dataset('csv', data_files={
    'train': 'train.csv',
    'test': 'test.csv'
})
def tokenize(batch):
    # tokenizer is assumed to be created earlier, e.g. with AutoTokenizer.from_pretrained(...)
    return tokenizer(batch['text'], padding=True, truncation=True, max_length=128)
datasets = datasets.map(tokenize, batched=True)
datasets.set_format('torch', columns=['input_ids', 'attention_mask', 'labels'])
sgugger (December 14, 2020, 2:47pm):
In your tokenize function, you can also add a line to convert your labels to ints:
def tokenize(batch):
    tokenized_batch = tokenizer(batch['text'], padding=True, truncation=True, max_length=128)
    tokenized_batch["labels"] = [str_to_int[label] for label in batch["labels"]]
    return tokenized_batch
where str_to_int is your mapping from string labels to int labels.
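A minimal sketch of building such a mapping from the training split (assuming the CSV's label column is named 'labels', as in the snippet above):

# Collect the unique string labels and assign each a stable integer id
unique_labels = sorted(set(datasets['train']['labels']))
str_to_int = {label: i for i, label in enumerate(unique_labels)}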
Thank you for your reply. I ended up using a variant of your approach, with datasets.ClassLabel handling the string-to-int mapping:
from datasets import ClassLabel, load_dataset

# labels.txt contains one label name per line
labels = ClassLabel(names_file='labels.txt')

datasets = load_dataset('csv', data_files={
    'train': 'train.csv',
    'test': 'test.csv'
})

def tokenize(batch):
    tokens = tokenizer(batch['text'], padding=True, truncation=True, max_length=128)
    tokens['labels'] = labels.str2int(batch['labels'])
    return tokens

datasets = datasets.map(tokenize, batched=True)
datasets.set_format('torch', columns=['input_ids', 'attention_mask', 'labels'])
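As an aside, if you would rather not maintain a labels.txt file, recent versions of datasets can derive the label set from the data itself; a sketch (assuming the string column is named 'labels'):

# Encode the string column in place and attach a ClassLabel feature to the schema
datasets = datasets.class_encode_column('labels')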
It seems I’m running into the same issue:
https://github.com/huggingface/transformers/issues/16975
Doing as suggested:
from transformers import AutoTokenizer

# https://huggingface.co/microsoft/xtremedistil-l6-h256-uncased
model_name = 'microsoft/xtremedistil-l6-h256-uncased'
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize(batch):
    tokens = tokenizer(batch['text'], padding="max_length", truncation=True, max_length=128)
    # features["label"] is a ClassLabel defined earlier; sentences is the raw dataset
    tokens['label'] = features["label"].str2int(batch['label']) if batch["label"] is not None else None
    return tokens

tokenized_datasets = sentences.map(tokenize, batched=True)
I’m getting:
---------------------------------------------------------------------------
KeyError Traceback (most recent call last)
<ipython-input-39-3f04e6ec6f6e> in <module>()
14 tokens['label'] = features["label"].str2int(batch['label']) if batch["label"] is not None else None
15 return tokens
---> 16 tokenized_datasets = sentences.map(tokenize, batched=True)
10 frames
/usr/local/lib/python3.7/dist-packages/datasets/features/features.py in str2int(self, values)
852 if value not in self._str2int:
853 value = str(value).strip()
--> 854 output.append(self._str2int[str(value)])
855 else:
856 # No names provided, try to integerize
KeyError: 'None'
Hi! It seems like there are None values in the label column. To avoid the error, you can either drop such examples with .filter(...) or modify the tokenize function as follows:
def tokenize(batch):
    tokens = tokenizer(batch['text'], padding="max_length", truncation=True, max_length=128)
    # Check each label individually; batch["label"] is a list, so the
    # batch-level None check in the original never triggered
    tokens["label"] = [features["label"].str2int(label) if label is not None else None for label in batch["label"]]
    return tokens
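For the filtering alternative, a minimal sketch (assuming the raw dataset is the sentences object from the snippet above):

# Drop examples with a missing label before tokenizing
sentences = sentences.filter(lambda example: example['label'] is not None)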
As opposed to using pre-made encoders like those found in sklearn, which I found to be slow, I wrote a simple class like this:
class LabelEncoder:
    def __init__(self):
        self.labels_to_int = {}

    def encode(self, labels):
        encoded_labels = []
        for label in labels:
            if label not in self.labels_to_int:
                # Assign the next free id; len() keeps ids unique across
                # batched calls (a per-call counter starting at 0 would
                # collide with ids assigned in earlier batches)
                self.labels_to_int[label] = len(self.labels_to_int)
            encoded_labels.append(self.labels_to_int[label])
        return encoded_labels
label_encoder = LabelEncoder()

tokenized_dataset = tokenized_dataset.map(
    lambda batch: {
        'labels_encoded': label_encoder.encode(batch['labels'])
    },
    # The encoder is stateful, so don't reuse a cached result that was
    # computed with a different (or empty) mapping
    load_from_cache_file=False,
    batched=True
)
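One follow-up worth noting: since the mapping lives on the encoder instance, you may want to persist it so the same ids can be reused at inference time (a sketch; the filename is hypothetical):

import json

# Save the learned string-to-int mapping alongside the model
with open('labels_to_int.json', 'w') as f:
    json.dump(label_encoder.labels_to_int, f)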