I am attempting to write a custom model where I can add additional features to the embeddings. As a sanity check, I first want to make sure I can at least roughly match the performance I get with the HF Trainer on the same data (76% on a 3-class problem).
However, the accuracy is now just 50%, so I must be doing something wrong. I've commented out the lines that would append the extra data and included the training loop below. Are there any obvious reasons why this would not be able to learn?
I've also included the arguments for Trainer. I realize I haven't perfectly replicated all the settings, but the accuracy is so far off that I'm thinking something else is going on.
from transformers import AutoConfig, AutoModel
import torch


class CustomModel(torch.nn.Module):
    def __init__(self, model_name, num_extra_dims, num_labels):
        # num_extra_dims corresponds to the number of extra dimensions of numerical/categorical data
        super().__init__()
        self.config = AutoConfig.from_pretrained(model_name)
        self.transformer = AutoModel.from_pretrained(model_name, config=self.config)
        num_hidden_size = self.transformer.config.hidden_size
        # self.classifier = torch.nn.Linear(num_hidden_size + num_extra_dims, num_labels)
        self.classifier = torch.nn.Linear(num_hidden_size, num_labels)

    def forward(self, input_ids, extra_data, attention_mask=None):
        """
        extra_data should be of shape [batch_size, dim]
        where dim is the number of additional numerical/categorical dimensions
        """
        hidden_states = self.transformer(input_ids=input_ids, attention_mask=attention_mask)  # [batch size, sequence length, hidden size]
        cls_embeds = hidden_states.last_hidden_state[:, 0, :]  # [batch size, hidden size]
        # concat = torch.cat((cls_embeds, extra_data), dim=-1)  # [batch size, hidden size + num extra dims]
        output = self.classifier(cls_embeds)  # [batch size, num labels]
        return output

custom_model = CustomModel(model_name, num_extra_dims=num_extra_dims, num_labels=num_labels).to(device)
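
For reference, the batches come from a plain PyTorch Dataset/DataLoader that yields tensors in the order the training loop below unpacks them. This is only a simplified sketch, not my exact preprocessing code; the class and variable names (TextWithExtrasDataset, train_encodings, train_extras, train_labels) are illustrative:

import torch
from torch.utils.data import Dataset, DataLoader

class TextWithExtrasDataset(Dataset):
    """Yields (input_ids, extra_data, attention_mask, label) per example."""
    def __init__(self, encodings, extra_features, labels):
        self.encodings = encodings            # dict from tokenizer(texts, padding=True, truncation=True, return_tensors="pt")
        self.extra_features = extra_features  # float tensor [num_examples, num_extra_dims]
        self.labels = labels                  # long tensor [num_examples]

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return (self.encodings["input_ids"][idx],
                self.extra_features[idx],
                self.encodings["attention_mask"][idx],
                self.labels[idx])

train_dataloader = DataLoader(
    TextWithExtrasDataset(train_encodings, train_extras, train_labels),
    batch_size=batch_size, shuffle=True)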
Training loop

import torch.nn as nn
import torch.optim as optim
from datetime import datetime

criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(custom_model.parameters(), lr=2e-5, weight_decay=0.01)

for epoch in range(5):  # loop over the dataset multiple times
    epoch_start = datetime.now()
    loss_check_start = datetime.now()
    custom_model.train()
    running_loss = 0.0
    for i, data in enumerate(train_dataloader, 0):
        # get the inputs; data is a tuple of [input_ids, extra_data, attention_masks, labels]
        input_ids, extra_data, attention_masks, labels = data[0].to(device), data[1].to(device), data[2].to(device), data[3].to(device)

        # zero the parameter gradients
        optimizer.zero_grad()

        # forward + backward + optimize
        outputs = custom_model(input_ids, extra_data, attention_mask=attention_masks)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        # print statistics
        running_loss += loss.item()
        if i % 100 == 99:  # print every 100 mini-batches
            print(f'[{epoch + 1}, {i + 1:5d}] loss: {running_loss / 100:.3f}')
            LOSS = running_loss
            running_loss = 0.0
            print('Loss check time:', datetime.now() - loss_check_start)
            loss_check_start = datetime.now()

    print('Epoch check time:', datetime.now() - epoch_start)

    PATH = f'./custom_model_{epoch}.pth'
    print('Saving to', PATH)
    torch.save({
        'epoch': epoch,
        'model_state_dict': custom_model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'loss': LOSS,
    }, PATH)

print('Finished Training')
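
The 50% accuracy I mentioned comes from an evaluation pass along these lines (again a simplified sketch; valid_dataloader is built the same way as train_dataloader):

custom_model.eval()
correct, total = 0, 0
with torch.no_grad():
    for data in valid_dataloader:
        input_ids, extra_data, attention_masks, labels = [t.to(device) for t in data]
        logits = custom_model(input_ids, extra_data, attention_mask=attention_masks)
        preds = logits.argmax(dim=-1)
        correct += (preds == labels).sum().item()
        total += labels.size(0)
print(f'Validation accuracy: {correct / total:.3f}')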
Trainer
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(output_dir=model_name,
                                  num_train_epochs=args.epochs,
                                  learning_rate=2e-5,
                                  per_device_train_batch_size=batch_size,
                                  per_device_eval_batch_size=batch_size,
                                  weight_decay=0.01,
                                  evaluation_strategy="epoch",
                                  disable_tqdm=False,
                                  logging_steps=logging_steps,
                                  push_to_hub=False,
                                  # log_level="error"
                                  )

trainer = Trainer(model=model, args=training_args,
                  compute_metrics=compute_metrics,
                  train_dataset=train_dataset,
                  eval_dataset=valid_data,
                  # eval_dataset=valid_data_down,
                  tokenizer=tokenizer)
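
compute_metrics is essentially the standard accuracy helper, something like this:

import numpy as np
from sklearn.metrics import accuracy_score

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    return {"accuracy": accuracy_score(labels, preds)}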