I’m trying to fine tune distilBert for multi-label classification with KERAS but I’m encountering a lot of problems. To debug my code I replicated it with the 20newsgroup dataset that can be found in sklearn and treating it like a multi-label case (Binary crossentropy instead of Categorical Crossentropy). This is my code:
# load sklearn, pandas and numpy
from sklearn.datasets import fetch_20newsgroups
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
import pandas as pd
import numpy as np
#deep learning stuff
import tensorflow as tf
from tensorflow import keras
from transformers import (
AutoTokenizer,
TrainingArguments,
TFAutoModelForSequenceClassification
)
# download dataset
categories = ['alt.atheism', 'soc.religion.christian','comp.graphics', 'sci.med']
twenty_train = fetch_20newsgroups(
subset='train',
categories=categories,
shuffle=True,
random_state=42
)
data = twenty_train.data
target = OneHotEncoder(sparse=False).fit_transform(twenty_train.target.reshape(-1, 1))
model = TFAutoModelForSequenceClassification.from_pretrained(
"distilbert-base-cased",
from_pt=True,
num_labels=len(categories),
problem_type="multi_label_classification"
)
loss = keras.losses.BinaryCrossentropy(from_logits=True)
optimizer = keras.optimizers.Adam(
learning_rate=5e-05,
epsilon=1e-08,
decay=0.01,
clipnorm=1.0)
model.compile(optimizer=optimizer, loss=loss)
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-cased")
inputs = tokenizer(data, padding="max_length", truncation=True)
train_features = {x: inputs[x] for x in tokenizer.model_input_names}
train_tf = tf.data.Dataset.from_tensor_slices((train_features, target))
model.fit(
train_tf,
batch_size=64,
epochs=10
)
When I try to train I get this error message: ValueError:
logitsand
labels must have the same shape, received ((512, 4) vs (4, 1)).
, what Am I doing wrong?