Hi,
I’m currently building a multiclass classification with BERT and HuggingFace’s Trainer. We have 35 labels to classify. I simply convert these string label to a number label (For example, topic 1 = label 1). Please let me know why i’m encountering shape error when the tutorial said that we only need to convert to numerical labeling instead of one-hot encoding. Would love your feedback on this.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', truncation=True, do_lower_case=True)
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=num_labels)
class CustomDataset(Dataset):
def __init__(self, df, tokenizer, max_len):
self.dataframe = df
self.tokenizer = tokenizer
self.max_len = max_len
self.text = df['text']
self.label = df.label
def __len__(self):
return len(self.text)
def __getitem__(self, index):
text = str(self.text[index])
inputs = self.tokenizer.encode_plus(
text,
None,
add_special_tokens = True,
max_length = self.max_len,
padding = 'max_length',
truncation = True,
return_token_type_ids = True,
return_attention_mask = True,
return_tensors='pt',
)
label = self.label[index]
return {
'input_ids': inputs['input_ids'].flatten(),
'attention_mask': inputs['attention_mask'].flatten(),
'labels': torch.tensor(label, dtype=torch.float)
}
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)
train_df = train_df.reset_index(drop=True)
val_df = val_df.reset_index(drop=True)
train_dataset = CustomDataset (train_df, tokenizer, max_len)
val_dataset = CustomDataset (val_df, tokenizer, max_len)
#==================================================================
# TRAINING MODEL
#==================================================================
def compute_metrics(eval_pred):
labels = eval_pred.label_ids
preds = eval_pred.predictions.argmax(-1)
f1 = f1_score(labels, preds, average='weighted')
return {'f1':f1}
class WeightedTrainer(Trainer):
def compute_loss(self, model, inputs, return_outputs=False):
outputs = model(**inputs)
logits = outputs.get(logits)
labels = outputs.get(labels)
criterion = torch.nn.CrossEntropyLoss()
loss = criterion(logits, labels)
return (loss, outputs) if return_outputs else loss
batch_size=32
logging_steps = len(train_df) // batch_size
training_args = TrainingArguments(
output_dir = 'test_output',
overwrite_output_dir=True,
num_train_epochs = 10,
per_device_train_batch_size = 32,
per_device_eval_batch_size = 32,
warmup_steps = 50,
learning_rate = 0.01,
weight_decay = 0.04,
logging_steps = logging_steps,
evaluation_strategy = 'epoch',
eval_steps=10,
)
trainer.train()```