Hi everyone,
I have just fine-tuned and evaluated my first BERT model, but when I started the error analysis to see where the model struggles, I noticed that the tokens in the model's output are not the ones it should have been tested on and differ from the given test data.
I’d like to understand why the tokens in the tokenized_tokens list are not the same as those in test_data, and where the tokens stored in tokenized_tokens come from. Both variables are lists whose items are sentences, each represented as a list of tokens.
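For illustration, this is roughly the kind of difference I am seeing. The sentence is made up and the exact word pieces depend on the tokenizer's vocabulary, so please treat it only as a sketch of the mismatch, not as my real data:

test_data[0]         # e.g. ["Angela", "Merkel", "besuchte", "gestern", "München"]
tokenized_tokens[0]  # e.g. ["Angela", "Merkel", "besuchte", "gestern", "Mün", "##chen"]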
If anyone can help explain this, I would be very grateful.
If you need more details or code comments to understand the problem, please let me know.
Thanks.
# imports needed by the snippet below
from typing import Any

import torch
import torch.nn as nn
from tqdm import tqdm
from transformers import AutoModel, DataCollatorForTokenClassification

# defined elsewhere in my code (not shown here): tokenizer, all_labels,
# and the variables that store the train and test data - both lists
train_data
test_data
# Data collation
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)
NUM_CLASSES = len(all_labels)
# A BERT-model class for NER
class BertNERModel(nn.Module):
    def __init__(
        self,
        model_name_or_path: str = "bert-base-german-cased",
        num_classes: int = NUM_CLASSES,
    ):
        super().__init__()
        # pretrained encoder plus a linear per-token classification head
        self.model = AutoModel.from_pretrained(model_name_or_path)
        self.hidden_size = self.model.config.hidden_size
        self.fc = nn.Linear(self.hidden_size, num_classes)

    def forward(self, x: dict[str, Any]) -> torch.Tensor:
        # shape of embeddings: [batch_size, sequence_length, hidden_size]
        embeddings = self.model(**x).last_hidden_state
        # shape of out: [batch_size, sequence_length, num_classes]
        out = self.fc(embeddings)
        return out
model = BertNERModel("bert-base-german-cased")
LR = 5e-5 # learning rate
NUM_EPOCHS = 5
# set hardware
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# set loss function, optimizer and model device
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.AdamW(params=model.parameters(), lr=LR)
model.to(device)
# set dataloader
def aggregate_batch(batch: list[dict[str, Any]]) -> dict[str, Any]:
    """
    Aggregate all the values of each column into a list of values.
    This step should be done during data loading.
    """
    return {
        column_name: [example[column_name] for example in batch]
        for column_name in batch[0]
    }
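# For illustration only (t0, t1, l0, l1 are placeholders; the column names are the
# ones used in the training loop below): with a batch of two examples,
#   [{"tokenized": t0, "aligned_labels": l0},
#    {"tokenized": t1, "aligned_labels": l1}]
# is regrouped by aggregate_batch into
#   {"tokenized": [t0, t1], "aligned_labels": [l0, l1]}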
train_loader = torch.utils.data.DataLoader(
    train_data,
    batch_size=2,
    shuffle=True,
    pin_memory=True,
    collate_fn=aggregate_batch,
)
test_loader = torch.utils.data.DataLoader(
    test_data,
    batch_size=len(test_data),  # evaluate the whole test set in one batch
    shuffle=False,
    pin_memory=True,
    collate_fn=aggregate_batch,
)
# Training
for epoch in tqdm(range(NUM_EPOCHS)):
    model.train()
    sum_loss = 0
    for batch in train_loader:
        # regroup the per-example tokenizer outputs into one dict of lists
        inputs = {k: [d[k] for d in batch["tokenized"]] for k in batch["tokenized"][0]}
        inputs = {k: torch.tensor(v, device=device) for k, v in inputs.items()}
        labels = torch.tensor(batch["aligned_labels"], device=device)
        outputs = model(inputs)
        # flatten both along the first two dims, and filter out "-100" labels
        outputs, labels = outputs.view(-1, NUM_CLASSES), labels.view(-1)
        outputs, labels = outputs[labels != -100], labels[labels != -100]
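        # For illustration (shapes assume batch_size=2 and a padded length of 16):
        # outputs goes from [2, 16, NUM_CLASSES] to [32, NUM_CLASSES] and labels
        # from [2, 16] to [32]; the boolean mask then drops every position labelled
        # -100, i.e. positions that should not contribute to the loss.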
        loss = criterion(outputs, labels)
        sum_loss += loss.item()
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
# Inference
with torch.no_grad():
    model.eval()
    for batch in test_loader:
        inputs = {k: [d[k] for d in batch["tokenized"]] for k in batch["tokenized"][0]}
        inputs = {k: torch.tensor(v, device=device) for k, v in inputs.items()}
        labels = torch.tensor(batch["aligned_labels"], device=device)
        outputs = model(inputs)
        outputs, labels = outputs.view(-1, NUM_CLASSES), labels.view(-1)
        outputs, labels = outputs[labels != -100], labels[labels != -100]
        preds = torch.argmax(outputs, dim=1).detach().cpu().numpy().tolist()
        labels = labels.detach().cpu().numpy().tolist()

        # convert token ids back into token strings
        tokenized_tokens = list()
        for item in inputs["input_ids"]:
            tokenized_item = tokenizer.convert_ids_to_tokens(item, skip_special_tokens=True)
            tokenized_tokens.append(tokenized_item)
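In case it helps, here is a minimal, self-contained version of the comparison I am doing, using the same made-up sentence as in the sketch above and assuming the same bert-base-german-cased checkpoint; it is only meant to show the kind of check, not my actual pipeline:

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-german-cased")

# a made-up test sentence, already split into whitespace tokens like my test_data
original_tokens = ["Angela", "Merkel", "besuchte", "gestern", "München"]

# tokenize it the way the model sees it, then map the ids back to token strings
encoding = tokenizer(original_tokens, is_split_into_words=True)
recovered = tokenizer.convert_ids_to_tokens(encoding["input_ids"], skip_special_tokens=True)

print(original_tokens)
print(recovered)  # this is where the two lists stop lining up one-to-one for me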