How to save a pretrained model after finetuning?

I am trying to save a model (TensorFlow-based) that I created to S3; it is essentially a fine-tuned version of the pretrained DistilBERT model. I am using the command

model.save_pretrained(mode_path, save_models=True)

and getting the following error:

RuntimeError: Dirty entry flush destroy failed (file write failed: time = Mon Jan 2 02:19:33 2023
, filename = '/dbfs/mnt/spock-root/MODELS/ONE_LAYER_BASELINE/tf_model.h5', file descriptor = 115, errno = 95, error message = 'Operation not supported', buf = 0x11ff9c200, total write size = 4096, bytes this sub-write = 4096, bytes actually written = 18446744073709551615, offset = 0)
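
From what I can tell, errno 95 ("Operation not supported") on a /dbfs/... path seems to indicate that the DBFS FUSE mount does not support the random writes the HDF5 writer performs. A possible workaround (just a sketch; local_path is a placeholder) would be to save to the driver's local disk first and then copy the finished files onto the mount:

import shutil

local_path = "/tmp/one_layer_baseline"  # local disk does support random writes
model.save_pretrained(local_path)       # writes config.json and tf_model.h5 locally

# copy the completed files onto the DBFS mount afterwards
shutil.copytree(local_path, "/dbfs/mnt/spock-root/MODELS/ONE_LAYER_BASELINE", dirs_exist_ok=True)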

Here is the definition of my model class:

from typing import Optional, Tuple, Union

import numpy as np
import tensorflow as tf

from transformers.models.distilbert.modeling_tf_distilbert import TFDistilBertMainLayer, TFDistilBertPreTrainedModel
from transformers.modeling_tf_outputs import (
    TFBaseModelOutput,
    TFMaskedLMOutput,
    TFMultipleChoiceModelOutput,
    TFQuestionAnsweringModelOutput,
    TFTokenClassifierOutput,
)
from transformers.modeling_tf_utils import get_initializer, unpack_inputs

# TFMultiClassClassifierOutput, num_dense_layers and label_cols are defined elsewhere (not shown)

class TFDistilBertForMultilabelClassification(TFDistilBertPreTrainedModel):
    def __init__(self, config, *inputs, **kwargs):
        print("INSIDE TFDistilBertForMultilabelClassification")
        print("config = ", config)
        super().__init__(config, *inputs, **kwargs)
        self.num_labels = config.num_labels
        self.config = config
        self.distilbert = TFDistilBertMainLayer(config, name="distilbert")
        if num_dense_layers > 0:
            self.pre_classifier = tf.keras.layers.Dense(
                config.dim,
                kernel_initializer=get_initializer(config.initializer_range),
                activation="relu",
                name="pre_classifier",
            )
        self.classifier = tf.keras.layers.Dense(
            len(label_cols), kernel_initializer=get_initializer(config.initializer_range), name="classifier"
        )
        self.dropout = tf.keras.layers.Dropout(config.seq_classif_dropout)

    @unpack_inputs
    def call(
        self,
        input_ids=None,
        attention_mask: Optional[Union[np.ndarray, tf.Tensor]] = None,
        head_mask: Optional[Union[np.ndarray, tf.Tensor]] = None,
        inputs_embeds: Optional[Union[np.ndarray, tf.Tensor]] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        labels: Optional[Union[np.ndarray, tf.Tensor]] = None,
        training: Optional[bool] = False,
    ) -> Union[TFMultiClassClassifierOutput, Tuple[tf.Tensor]]:
        r"""
        labels (`tf.Tensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        """
        distilbert_output = self.distilbert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            training=training,
        )
        hidden_state = distilbert_output[0]  # (bs, seq_len, dim)
        pooled_output = hidden_state[:, 0]  # (bs, dim)
        if num_dense_layers > 0:
            pooled_output = self.pre_classifier(pooled_output)  # (bs, dim)

        pooled_output = self.dropout(pooled_output, training=training)  # (bs, dim)
        logits = self.classifier(pooled_output)  # (bs, len(label_cols))

        if labels is None:
            loss = None
        else:
            loss = tf.keras.losses.binary_crossentropy(labels[..., None], logits[..., None], from_logits=True)

        if not return_dict:
            output = (logits,) + distilbert_output[1:]
            return ((loss,) + output) if loss is not None else output

        return TFMultiClassClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=distilbert_output.hidden_states,
            attentions=distilbert_output.attentions,
        )

    def freeze_layers_for_finetuning(self, num_to_freeze_tf_blocks: Optional[int] = 0):
        for i in range(num_to_freeze_tf_blocks):
            self.distilbert.transformer.layer[i].trainable = False

    def serving_output(self, output: TFMultiClassClassifierOutput) -> TFMultiClassClassifierOutput:
        hs = tf.convert_to_tensor(output.hidden_states) if self.config.output_hidden_states else None
        attns = tf.convert_to_tensor(output.attentions) if self.config.output_attentions else None

        return TFMultiClassClassifierOutput(logits=output.logits, hidden_states=hs, attentions=attns)
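
Once the save works, I assume loading the model back would look roughly like this (model_dir is a placeholder; the custom class has to be used so that the pre_classifier/classifier heads are restored):

model_dir = "/tmp/one_layer_baseline"

# save_pretrained writes config.json and tf_model.h5 into model_dir
model.save_pretrained(model_dir)

# reload with the custom class (not the stock TFDistilBertModel) so the extra head weights are restored
reloaded = TFDistilBertForMultilabelClassification.from_pretrained(model_dir)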

Hi! @15saurabh16

Did you end up figuring out how to save the pretrained model after fine-tuning?
I am in a similar predicament: I use torch.save(model.state_dict(), "checkpoint.pth") but it does not seem to load correctly. I was hoping you had some success.
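
My understanding is that a state_dict checkpoint only stores the weights, so the model has to be rebuilt with the same class and config before loading, roughly like this (MyModelClass and config are placeholders for whatever was used in training):

import torch

model = MyModelClass(config)  # placeholder: must match the architecture used for training
state_dict = torch.load("checkpoint.pth", map_location="cpu")
model.load_state_dict(state_dict)
model.eval()  # switch off dropout etc. before evaluation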