Hello.
I have a task in which each record has 6 different labels, and every label can take a value from 0 to 3. The dataset is heavily imbalanced.
text | label_1 | label_2 | label_3 | label_4 | label_5 | label_6 |
---|---|---|---|---|---|---|
… | 0 | 1 | 0 | 2 | 0 | 0 |
… | 0 | 0 | 0 | 0 | 0 | 0 |
… | 2 | 0 | 0 | 0 | 0 | 3 |
I want to solve this task using transformers. Should I set `num_labels` to 24 (6 labels × 4 classes per label) when initializing the model, like this?
```python
from transformers import AutoModelForSequenceClassification

num_labels = 6          # number of labels per record
classes_per_label = 4   # intensity levels per label (0, 1, 2, 3)
total_classes = num_labels * classes_per_label  # 24

model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    problem_type="multi_label_classification",
    ignore_mismatched_sizes=True,
    num_labels=total_classes,
)
```
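If 24 is the right choice, my understanding is that I would then have to regroup the flat logits into 6 groups of 4 myself when computing the loss. Here is a sketch of the custom loss I had in mind (the `MultiHeadTrainer` name is mine; it assumes `classes_per_label` from above and a subclassed `Trainer`):

```python
import torch.nn as nn
from transformers import Trainer

class MultiHeadTrainer(Trainer):
    # Sketch: treat the 24 flat logits as 6 independent 4-class problems
    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        labels = inputs.pop("labels")          # (batch, 6), integer values 0-3
        outputs = model(**inputs)
        # (batch, 24) -> (batch * 6, 4): one 4-way decision per (record, label)
        logits = outputs.logits.view(-1, classes_per_label)
        loss = nn.CrossEntropyLoss()(logits, labels.view(-1))
        return (loss, outputs) if return_outputs else loss
```

Would this be preferable to relying on `problem_type="multi_label_classification"`, which, as far as I understand, applies `BCEWithLogitsLoss` and expects binary float labels?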
In addition, what are best practices for:
1. creating a `Dataset` object from `torch.utils.data.Dataset`,
2. defining a loss function, and
3. choosing thresholds when predicting and evaluating the labels?
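For question 2 specifically, given the imbalance I was considering inverse-frequency class weights computed per label from the training set, along these lines (a sketch; the helper name is mine, and `train_labels` is the `(N, 6)` array produced by `encode_data` below):

```python
import numpy as np
import torch

def per_label_class_weights(labels, num_classes=4):
    # labels: (N, 6) integer array; returns a (6, 4) tensor of
    # inverse-frequency weights so rare intensity levels count more
    weights = []
    for col in labels.T:                                 # one column per label
        counts = np.bincount(col, minlength=num_classes).astype(float)
        w = counts.sum() / np.maximum(counts, 1.0)       # avoid division by zero
        weights.append(w / w.sum())                      # normalize per label
    return torch.tensor(np.stack(weights), dtype=torch.float)
```

Row `i` could then be passed as `nn.CrossEntropyLoss(weight=weights[i])` for label `i`. Is that a sensible way to handle the imbalance?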
Here is my current code:
```python
import torch

def encode_data(df, tokenizer, label_columns):
    encodings = tokenizer(list(df['text']), padding=True, truncation=True, max_length=128)
    labels = df[label_columns].values
    return encodings, labels

class WeightedMultiLabelDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = torch.tensor(labels, dtype=torch.long)

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = self.labels[idx]
        return item

# Prepare datasets
train_encodings, train_labels = encode_data(train_df, tokenizer, label_columns)
dev_encodings, dev_labels = encode_data(dev_df, tokenizer, label_columns)
train_dataset = WeightedMultiLabelDataset(train_encodings, train_labels)
dev_dataset = WeightedMultiLabelDataset(dev_encodings, dev_labels)
```
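Regarding question 1, I also wondered whether tokenizing lazily inside `__getitem__` would be better than pre-tokenizing everything up front, e.g. (a sketch; the class name is mine):

```python
class LazyMultiLabelDataset(torch.utils.data.Dataset):
    # Alternative sketch for question 1: tokenize per item instead of up front
    def __init__(self, texts, labels, tokenizer, max_length=128):
        self.texts = list(texts)
        self.labels = torch.tensor(labels, dtype=torch.long)
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        enc = self.tokenizer(self.texts[idx], padding='max_length',
                             truncation=True, max_length=self.max_length,
                             return_tensors='pt')
        item = {k: v.squeeze(0) for k, v in enc.items()}
        item['labels'] = self.labels[idx]
        return item
```

Is pre-tokenizing (as in my code above) or lazy tokenization with a data collator the more standard practice?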
```python
import torch
from sklearn.metrics import classification_report, average_precision_score

def compute_metrics(pred):
    logits, labels = pred                          # numpy arrays from the Trainer
    logits = torch.tensor(logits).view(-1, num_labels, classes_per_label)  # (N, 6, 4)
    probabilities = torch.softmax(logits, dim=2).numpy()
    predictions = torch.argmax(logits, dim=2).numpy()      # (N, 6)
    labels = labels.reshape(-1, num_labels)                # (N, 6)
    auprc_per_label = []
    for i in range(num_labels):
        # average_precision_score needs binary targets and one score per example,
        # so binarize to "any non-zero intensity" and score with P(intensity > 0)
        auprc = average_precision_score(labels[:, i] > 0, 1.0 - probabilities[:, i, 0])
        auprc_per_label.append(auprc)
    mean_auprc = sum(auprc_per_label) / len(auprc_per_label)
    # classification_report does not support multiclass multi-output,
    # so print one per-label report instead
    for i, name in enumerate(label_columns):
        print(name)
        print(classification_report(labels[:, i], predictions[:, i], zero_division=0))
    return {
        'mean_auprc': mean_auprc,
        'auprc_per_label': auprc_per_label,
    }
```
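And for question 3, this is the kind of thresholding I had in mind at prediction time (a sketch; the 0.5 default is arbitrary, and `probs` would be the `(N, 6, 4)` softmax output as in `compute_metrics` above):

```python
import numpy as np

def predict_with_threshold(probs, threshold=0.5):
    # probs: (N, 6, 4) per-label class probabilities.
    # Default to intensity 0 unless the best non-zero class clears the threshold
    nonzero_best = probs[:, :, 1:].argmax(axis=2) + 1    # (N, 6), values in {1, 2, 3}
    nonzero_conf = probs[:, :, 1:].max(axis=2)           # (N, 6)
    return np.where(nonzero_conf >= threshold, nonzero_best, 0)
```

Does it make sense to tune one threshold per label on the dev set (e.g., maximizing macro-F1), or is plain argmax over the 4 classes the usual practice?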
Thank you!