AttributeError: 'ElectraForPreTrainingOutput' object has no attribute 'last_hidden_state'

I’m training a german language Electra model for multi label classification. When I run

# Training model
trainer.fit(model, data_module)

I get the error

---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
<ipython-input-63-67a8decbf039> in <module>()
      1 # Training model
----> 2 trainer.fit(model, data_module)

16 frames
<ipython-input-51-69561934ba6f> in forward(self, input_ids, attention_mask, labels)
     12   def forward(self, input_ids, attention_mask, labels=None):
     13     output = self.electra(input_ids, attention_mask=attention_mask)
---> 14     output = self.classifier(output.last_hidden_state[:, 0]) # output = self.classifier(output.pooler_output)
     15     output = torch.sigmoid(output)
     16     loss = 0

AttributeError: 'ElectraForPreTrainingOutput' object has no attribute 'last_hidden_state'

I define my model parameters like this (in a lightening module):

class CrowdCodedTagger(pl.LightningModule):
#AutoModelForPreTraining.from_pretrained("german-nlp-group/electra-base-german-uncased", return_dict = True)
  def __init__(self, n_classes: int, n_training_steps=None, n_warmup_steps=None):
    super().__init__()
    #self.electra = ElectraModel.from_pretrained(ELECTRA_MODEL_NAME) # changed ElectraModel to ElectraForSequenceClassification. Removed (, return_dict=False)
    self.electra = AutoModelForPreTraining.from_pretrained("german-nlp-group/electra-base-german-uncased")
    self.classifier = nn.Linear(self.electra.config.hidden_size, n_classes)
    self.n_training_steps = n_training_steps
    self.n_warmup_steps = n_warmup_steps
    self.criterion = nn.BCELoss()

  def forward(self, input_ids, attention_mask, labels=None):
    output = self.electra(input_ids, attention_mask=attention_mask)
    output = self.classifier(output.last_hidden_state[:, 0]) # output = self.classifier(output.pooler_output)
    output = torch.sigmoid(output)
    loss = 0
    if labels is not None:
        loss = self.criterion(output, labels)
    return loss, output

  def training_step(self, batch, batch_idx):
    input_ids = batch["input_ids"]
    attention_mask = batch["attention_mask"]
    labels = batch["labels"]
    loss, outputs = self(input_ids, attention_mask, labels)
    self.log("train_loss", loss, prog_bar=True, logger=True)
    return {"loss": loss, "predictions": outputs, "labels": labels}

  def validation_step(self, batch, batch_idx):
    input_ids = batch["input_ids"]
    attention_mask = batch["attention_mask"]
    labels = batch["labels"]
    loss, outputs = self(input_ids, attention_mask, labels)
    self.log("val_loss", loss, prog_bar=True, logger=True)
    return loss

  def test_step(self, batch, batch_idx):
    input_ids = batch["input_ids"]
    attention_mask = batch["attention_mask"]
    labels = batch["labels"]
    loss, outputs = self(input_ids, attention_mask, labels)
    self.log("test_loss", loss, prog_bar=True, logger=True)
    return loss

  def training_epoch_end(self, outputs):
    
    labels = []
    predictions = []
    for output in outputs:
      for out_labels in output["labels"].detach().cpu():
        labels.append(out_labels)
      for out_predictions in output["predictions"].detach().cpu():
        predictions.append(out_predictions)

    labels = torch.stack(labels).int()
    predictions = torch.stack(predictions)

    for i, name in enumerate(LABEL_COLUMNS):
      class_roc_auc = auroc(predictions[:, i], labels[:, i])
      self.logger.experiment.add_scalar(f"{name}_roc_auc/Train", class_roc_auc, self.current_epoch)

  def configure_optimizers(self):

    optimizer = AdamW(self.parameters(), lr=2e-5) #DEFINING LEARNING RATE

    scheduler = get_linear_schedule_with_warmup(
      optimizer,
      num_warmup_steps=self.n_warmup_steps,
      num_training_steps=self.n_training_steps
    )

    return dict(
      optimizer=optimizer,
      lr_scheduler=dict(
        scheduler=scheduler,
        interval='step'
      )
    )

This is the model I use: german-nlp-group/electra-base-german-uncased · Hugging Face

Here is a link to the colab: Google Colab

Can anyone spot the problem? Thanks in advance!