Multiple training objectives

In a previous post ("Feed output from one transformer model as input to another") I explained how I am training an automated essay scoring system with two training objectives: scoring the essay and predicting whether each sentence in the essay contains a grammatical error. This means that for each essay I predict a score and a list of labels, one per sentence, so for a single essay the output looks something like this:

{'score_preds': 32, 'labels': [1, 0, 0, 1, 1]}

I would like to record metrics for both of these tasks. However, I can't seem to pass the two sets of logits (one for each task) into the compute_metrics function. Currently my Trainer returns a combined loss for the two tasks when training (which works fine) and a dictionary with the two separate sets of predictions when evaluating (as seen above).
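For reference, this is roughly the compute_metrics I would like to be able to write (a sketch of what I am aiming for, not working code; the unpacking and metric choices are just placeholders):

import numpy as np
from transformers import EvalPrediction

def compute_metrics(eval_pred: EvalPrediction):
    # What I want: one set of predictions and one set of labels per task.
    score_preds, grammar_preds = eval_pred.predictions
    score_labels, grammar_labels = eval_pred.label_ids
    return {
        "score_mse": float(np.mean((score_preds - score_labels) ** 2)),
        "grammar_accuracy": float(np.mean(grammar_preds == grammar_labels)),
    }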

Now I have done some digging and found that in the evaluation loop at the line:

loss, logits, labels = self.prediction_step(model, inputs, prediction_loss_only, ignore_keys=ignore_keys)

the logits come out as:

(tensor(15.3504, device='cuda:0'), tensor([28.1691, 28.1691, 28.1691, 28.1692, 28.1691, 28.1691, 28.1691, 28.1691],
       device='cuda:0'), tensor(0.5999, device='cuda:0'), tensor([1, 1, 0,  ..., 1, 1, 1], device='cuda:0'))

This does not seem to make sense to me, although the four entries look suspiciously like the four values of my model's output dict (score_loss, score_preds, grammar_loss, grammar_preds) flattened into a tuple.
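For reference, the unpacking inside prediction_step looks roughly like this (paraphrased from the transformers source, so treat it as a sketch; details may differ between versions):

# Paraphrased from Trainer.prediction_step, in the branch where labels are present:
loss, outputs = self.compute_loss(model, inputs, return_outputs=True)
if isinstance(outputs, dict):
    # Dict outputs: keep every value except ignored keys and the loss.
    logits = tuple(v for k, v in outputs.items() if k not in ignore_keys + ["loss"])
else:
    # Tuple outputs: everything after the loss is treated as logits.
    logits = outputs[1:]

Since my compute_loss returns (loss, (loss, predictions)), I would have expected logits to come back as a one-element tuple holding my predictions dict, which makes the four-tensor tuple above even more confusing.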

Here's the code for my model and my Trainer's compute_loss function, as well as a sample output from my model.

model and compute_loss:

import pdb

import torch
from torch import nn
from torch.nn import CrossEntropyLoss, MSELoss
from transformers import RobertaModel, RobertaPreTrainedModel, Trainer
from transformers.modeling_outputs import TokenClassifierOutput
from transformers.models.roberta.modeling_roberta import RobertaClassificationHead

class DualModel(RobertaPreTrainedModel):
    
    _keys_to_ignore_on_load_unexpected = [r"pooler"]
    _keys_to_ignore_on_load_missing = [r"position_ids"]

    def __init__(self, config, grammar_labels):
        super().__init__(config)
        self.num_labels = config.num_labels
        self.grammar_loss_layer = config.grammar_loss_layer
        # Build the essay-scoring head first, while config.num_labels is still
        # the score dimension (1 for regression).
        self.scorer = RobertaClassificationHead(config)
        # Then switch num_labels to the grammar label count for the
        # token-level classifier below.
        config.num_labels = grammar_labels
        self.config = config
        self.roberta = RobertaModel(config, add_pooling_layer=False)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)

    def forward(self, **inputs):
        # The grammar head reads an intermediate hidden layer, so hidden states
        # must be returned (either here or via config.output_hidden_states).
        output = self.roberta(
            input_ids=inputs['input_ids'],
            attention_mask=inputs['attention_mask'],
            output_hidden_states=True,
        )
        last_hidden_layer = output.last_hidden_state
        score_loss, score_preds = self.calculate_score_loss(last_hidden_layer, inputs['score'])
        grammar_loss, grammar_preds = self.calculate_grammar_loss(output, inputs['labels'])
        return {'score_loss': score_loss, 'score_preds': score_preds,
                'grammar_loss': grammar_loss, 'grammar_preds': grammar_preds}

    def calculate_score_loss(self, last_hidden_layer, scores):
        # Regression head over the first-token representation, trained with MSE.
        scores_pred = self.scorer(last_hidden_layer)
        loss_fct = MSELoss()
        return loss_fct(scores_pred.squeeze(), scores.float().squeeze()), scores_pred.flatten()

    def calculate_grammar_loss(self, outputs, labels):
        sequence_output = outputs.hidden_states[self.grammar_loss_layer]
        sequence_output = self.dropout(sequence_output)
        logits = self.classifier(sequence_output)
        loss_fct = CrossEntropyLoss()
        # Positions labelled >= 2 carry no grammar annotation and are
        # excluded from the loss.
        active_loss_mask = labels < 2
        active_logits = logits[active_loss_mask]
        active_labels = labels[active_loss_mask]
        loss = loss_fct(active_logits, active_labels)

        output = TokenClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )
        return output.loss, torch.argmax(output.logits, 2).flatten()

class DualModelTrainer(Trainer):

    def compute_loss(self, model, inputs, return_outputs=False):
        bert_output = model(**inputs)

        # Combine the two task losses into a single training loss.
        grammar_loss = bert_output.pop('grammar_loss')
        score_loss = bert_output.pop('score_loss')
        loss = grammar_loss + score_loss
        predictions = bert_output
        pdb.set_trace()  # used to capture the sample output below
        return (loss, (loss, predictions)) if return_outputs else loss

loss (I still need to normalise the essay scores, hence the high value):
tensor(806.9637, device='cuda:0', grad_fn=<AddBackward0>)
bert_output:

{'score_preds': tensor([0.4941, 0.3389, 0.3342, 0.3818, 0.5970, 0.3990, 0.4830, 0.1867],
       device='cuda:0', grad_fn=<ViewBackward>), 'grammar_preds': tensor([0, 0, 0,  ..., 0, 0, 0], device='cuda:0')}

predictions: identical to bert_output above, since predictions is just bert_output after the two losses are popped.