Multiple layers in the classifier for NER in BERT models

Hello! I am trying to get multiple feed-forward layers working in the classifier head at the end of a BERT model for NER. The model trains and uploads to Hugging Face; you can see it on my profile here: pabRomero/ClinicalBERT-finetuned-ner-just-classification. However, when I try to run inference on it, or download it with AutoModelForTokenClassification.from_pretrained, I always get a warning that the custom classifier weights were not used and the classifier has been replaced with a freshly initialized linear layer, which is not what I want. I know I need to change something in the model's config, but I don't know what or how to approach it, and endless conversations with Claude unfortunately did not help. Here is the code I used to replace the classifier layer:

import torch.nn as nn

model.classifier = nn.Sequential(
    nn.Linear(768, 384),
    nn.LayerNorm(384),
    nn.ReLU(),
    nn.Linear(384, 19),
    nn.LayerNorm(19),
)
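
For reference, this is roughly how the problem shows up once the checkpoint has been pushed (a minimal reproduction sketch; `reloaded` is just an illustrative variable name):

from transformers import AutoModelForTokenClassification

reloaded = AutoModelForTokenClassification.from_pretrained(
    "pabRomero/ClinicalBERT-finetuned-ner-just-classification"
)
# Instead of the Sequential head, this prints a single freshly initialized
# Linear(in_features=768, out_features=19), and the loading warning quoted
# at the end of the post is emitted.
print(reloaded.classifier)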

The overall goal is to train the NER model in two stages: first freeze everything except the classifier and train only the head, then unfreeze and fine-tune the whole model, classifier included. I think this will lead to better and faster training, because the encoder does not receive gradients of noise from a randomly initialized classifier: the head first learns to make the best predictions it can on top of the frozen encoder, and only once it is trained do its gradients become useful signal for the layers underneath.
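
For clarity, this is the two-stage schedule I have in mind, as a minimal untested sketch; `model`, `args`, the tokenized datasets, `data_collator`, `compute_metrics` and `tokenizer` are the objects set up in the full code below, and `stage2_args` is a hypothetical second TrainingArguments with a lower learning rate:

from transformers import Trainer

# Stage 1: freeze the encoder and train only the new classifier head
for param in model.base_model.parameters():
    param.requires_grad = False

trainer = Trainer(model=model, args=args, train_dataset=tokenized_dataset,
                  eval_dataset=tokenized_datasetTest, data_collator=data_collator,
                  compute_metrics=compute_metrics, tokenizer=tokenizer)
trainer.train()

# Stage 2: unfreeze everything and fine-tune end to end
for param in model.parameters():
    param.requires_grad = True

# stage2_args: hypothetical TrainingArguments with a smaller learning rate (e.g. 2e-5)
trainer = Trainer(model=model, args=stage2_args, train_dataset=tokenized_dataset,
                  eval_dataset=tokenized_datasetTest, data_collator=data_collator,
                  compute_metrics=compute_metrics, tokenizer=tokenizer)
trainer.train()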

Here is all the code if it is useful:

#!pip install --upgrade transformers torch datasets torchvision
#!pip install datasets seqeval evaluate huggingface_hub
import torch

device = "cuda" if torch.cuda.is_available() else "cpu"
device
from transformers import AutoTokenizer
from datasets import Dataset

model_checkpoint = "pabRomero/ClinicalBERT-finetuned-ner-just-classification"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
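# Read a whitespace-separated token/tag file in BIO format and split it into
# sequences at blank lines, at full stops, or after 128 tokens.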
def open_file_get_data_bios(filepath):
    words = []
    labels = []
    with open(filepath, 'r', encoding='utf-8') as file:
        word = []
        label = []
        counter = 0
        for line in file:
            split_lines = line.split()
            
            # Close the current sequence on a blank line, at a full stop, or after 128 tokens.
            if len(split_lines) == 0 or counter == 128 or split_lines[0] == ".":
                if len(split_lines) != 0:
                    if split_lines[0] == ".":
                        # Keep the full stop inside the sequence it ends.
                        word.append(split_lines[0])
                        label.append(split_lines[-1])
                    words.append(word)
                    labels.append(label)
                word = []
                label = []
                counter = 0
                if len(split_lines) == 0 or split_lines[0] == ".":
                    continue
                # A token that only overflowed the 128 limit falls through and
                # starts the next sequence instead of being dropped.

            word.append(split_lines[0])
            label.append(split_lines[-1])
            counter += 1
    return words, labels
words, labels = open_file_get_data_bios("train_spacy.txt")
wordsTest, labelsTest = open_file_get_data_bios("test_spacy.txt")
wordsValid, labelsValid = open_file_get_data_bios("valid_spacy.txt")
# For training after hyperparameters have been found, also need to change validation set in the training loop to wordsValid
# words = words + wordsValid
# labels = labels + labelsValid

# Random shuffling of the train dataset if needed. Shuffling `words` on its own
# desynchronises it from `labels`, so shuffle the two lists together:
import random

# paired = list(zip(words, labels))
# random.shuffle(paired)
# words, labels = map(list, zip(*paired))
maxword = 0
for word in words:
    if len(word) > maxword:
        maxword = len(word)
print(maxword)
print(len(words))
for i in range(10):
    print(words[i])
label_to_tag = {
    'O':0,
    'B-Drug':1, 'I-Drug':2,
    'B-Reason':3, 'I-Reason':4,
    'B-Route':5, 'I-Route':6,
    'B-Strength':7, 'I-Strength':8,
    'B-Form':9, 'I-Form':10,
    'B-Dosage':11, 'I-Dosage':12,
    'B-Frequency':13, 'I-Frequency':14,
    'B-Duration':15, 'I-Duration':16,
    'B-ADE':17, 'I-ADE':18,
}

label_names = [name for name in label_to_tag]
labels_num = [[label_to_tag[lab] for lab in label] for label in labels]
labels_numTest = [[label_to_tag[lab] for lab in label] for label in labelsTest]
labels_numValid = [[label_to_tag[lab] for lab in label] for label in labelsValid]
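# Align word-level labels with the tokenizer's subword tokens: special tokens get
# -100 (ignored by the loss), the first subtoken of a word keeps its label, and
# later subtokens map a B- tag (odd id) to the matching I- tag (the next, even id).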
def align_labels_with_tokens(labels, word_ids):
    new_labels = []
    current_word = None
    for word_id in word_ids:
        if word_id is None:
            new_labels.append(-100)
        elif word_id != current_word:
            new_labels.append(labels[word_id])
        else:
            label = labels[word_id]
            new_labels.append(label if label % 2 == 0 else label + 1)
        current_word = word_id
    return new_labels
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)

    tokenized_inputs["labels"] = [align_labels_with_tokens(ner_tags, tokenized_inputs.word_ids(i)) for i, ner_tags in enumerate(examples["ner_tags"])]
    return tokenized_inputs
dataset = Dataset.from_dict({"tokens": words, "ner_tags": labels_num})
datasetTest = Dataset.from_dict({"tokens": wordsTest, "ner_tags": labels_numTest})
datasetValid = Dataset.from_dict({"tokens": wordsValid, "ner_tags": labels_numValid})

tokenized_dataset = dataset.map(tokenize_and_align_labels, batched=True, remove_columns=dataset.column_names)
tokenized_datasetTest = datasetTest.map(tokenize_and_align_labels, batched=True, remove_columns=datasetTest.column_names)
tokenized_datasetValid = datasetValid.map(tokenize_and_align_labels, batched=True, remove_columns=datasetValid.column_names)
from transformers import DataCollatorForTokenClassification

data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)
import numpy as np
import evaluate

metric = evaluate.load("seqeval")

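# seqeval scores per-sequence lists of label strings, so positions labelled -100
# (special tokens) are dropped before computing the metrics.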
def compute_metrics(eval_preds):
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)

    true_labels = [[label_names[l] for l in label if l != -100] for label in labels]
    true_predictions = [[label_names[p] for (p, l) in zip(prediction, label) if l != -100] for prediction, label in zip(predictions, labels)]
    all_metrics = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": all_metrics["overall_precision"],
        "recall": all_metrics["overall_recall"],
        "f1": all_metrics["overall_f1"],
        "accuracy": all_metrics["overall_accuracy"],
    }
id_to_label = {i:v for i, v in enumerate(label_to_tag)}
label_to_tag
from transformers import AutoModelForTokenClassification, AutoModel, AutoConfig

config = AutoConfig.from_pretrained(model_checkpoint)

# Update the model's configuration
config.classifier_dropout = 0.1  # Adjust if needed
config.hidden_sizes = [768, 384]  # Reflect your new layer sizes
config.num_labels = 19  # Confirm this matches your label count
config.id2label = id_to_label
config.label2id = label_to_tag

config.architectures = ["CustomTokenClassificationModel"]  # One of the suggested approaches: point the config at a custom model class
config.custom_classifier = {
    "type": "Sequential",
    "layers": [
        {"type": "Linear", "in_features": config.hidden_size, "out_features": 384},
        {"type": "LayerNorm", "normalized_shape": 384},
        {"type": "ReLU"},
        {"type": "Linear", "in_features": 384, "out_features": config.num_labels}
    ]
}
config.additional_layer = True

model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    config=config,
)

config
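# Stage 1: freeze the whole encoder so that only the new classifier head is trained.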
for param in model.base_model.parameters():
    param.requires_grad = False

import torch.nn as nn

model.classifier = nn.Sequential(
    nn.Linear(768, 384),
    nn.LayerNorm(384),
    nn.ReLU(),
    nn.Linear(384, 19),
    nn.LayerNorm(19),
)

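# Make every parameter tensor contiguous; otherwise saving the checkpoint
# (safetensors) can fail on non-contiguous tensors.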
for param in model.parameters():
    param.data = param.data.contiguous()

for name, param in model.named_parameters():
     if param.requires_grad:
         print(name)
from huggingface_hub import notebook_login

notebook_login()
from transformers import TrainingArguments

args = TrainingArguments(
    "ClinicalBERT-finetuned-pablo-just-classification-ner",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=0.1,
    num_train_epochs=3,
    weight_decay=0.01,
    push_to_hub=True,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    dataloader_num_workers=4,
    dataloader_pin_memory=True,
    optim="adamw_torch",
    tf32=True,
    fp16=True,
    warmup_ratio=0.1,
    gradient_accumulation_steps=2,
    gradient_checkpointing=True,
)
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_dataset,
    eval_dataset=tokenized_datasetTest,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
)

trainer.train() 
trainer.push_to_hub(commit_message="Training complete")

This is the warning I get when the checkpoint is loaded back with AutoModelForTokenClassification.from_pretrained:

Some weights of the model checkpoint at pabRomero/ClinicalBERT-finetuned-ner-just-classification were not used when initializing DistilBertForTokenClassification: ['classifier.0.bias', 'classifier.0.weight', 'classifier.1.bias', 'classifier.1.weight', 'classifier.3.bias', 'classifier.3.weight', 'classifier.4.bias', 'classifier.4.weight']
- This IS expected if you are initializing DistilBertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at pabRomero/ClinicalBERT-finetuned-ner-just-classification and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.

Thank you!