How do you load a new model from scratch?

I am trying to learn how to create a model from scratch, not something already on Hugging Face, and then put it on the Hugging Face Hub. The simple model I made looks like:

import torch.nn as nn
from transformers import PreTrainedModel
from transformers.modeling_outputs import CausalLMOutputWithCrossAttentions


class EmbeddingModel(PreTrainedModel):

    def __init__(self, config, loss_fct=nn.CrossEntropyLoss()) -> None:
        super().__init__(config)
        self.loss_fct = loss_fct
        self.embed_dim = config.hidden_size
        self.wte = nn.Embedding(config.vocab_size, self.embed_dim)  # token embedding table
        self.lm_head = nn.Linear(self.embed_dim, config.vocab_size, bias=False)
    
    def forward(self, input_ids, labels=None, attention_mask=None):
        _, T = input_ids.shape
        token_embeddings = self.wte(input_ids)

        x = token_embeddings
        lm_logits = self.lm_head(x)  # (B, T, embed_dim) -> (B, T, vocab_size)

        # Get cross-entropy loss for predicting the next token
        loss = None
        if labels is not None:
            shift_logits = lm_logits[..., :-1, :].contiguous()
            shift_labels = labels[..., 1:].contiguous()
            loss = self.loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))

        # Return in a format Hugging Face trainers understand
        return CausalLMOutputWithCrossAttentions(
            loss=loss,
            logits=lm_logits,
        )

I can create and save the model as:

run_name = 'my_run'
output_dir = 'my_dir'
config = PretrainedConfig(
    name_or_path=run_name,
    vocab_size=vocab_size,
    hidden_size=768 // 2,
    num_hidden_layers=0,
    n_ctx=context_length,
    bos_token_id=tokenizer.bos_token_id,
    eos_token_id=tokenizer.eos_token_id,
    is_decoder=True,
    model_type='my_model',
)
model = EmbeddingModel(config)
model.save_pretrained(output_dir)
-----
Configuration saved in my_dir/config.json
Model weights saved in my_dir/model.safetensors

I am just trying to figure out how to read this back in later. Trying something like:

model2 = EmbeddingModel(config)
model2.from_pretrained(output_dir)

gives the error

'NoneType' object has no attribute 'from_pretrained'

Trying:

model2 = AutoModel.from_pretrained(output_dir)

gives

Unrecognized model in my_dir. Should have a `model_type` key in its config.json

Now I did set model_type in my config above, but it isn't showing up in the generated config.json. If I add it by hand, it says it does not recognize the 'my_model' architecture.
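For reference, the saved file can be inspected directly with something like the snippet below; the model_type key is indeed not there.

import json
import os

# Print whatever actually got serialized to config.json
with open(os.path.join(output_dir, "config.json")) as f:
    print(json.dumps(json.load(f), indent=2))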

The funny thing is I can get the trainer to read it back in after training for a while:

trainer.train(resume_from_checkpoint=True)
model2 = trainer.model

But I would like to just load the model without this Trainer hack. How do you load a pretrained model you have made from scratch?

For posterity, it turns out that you have to define your own config class in addition to the model class, and link the two through the model's config_class attribute. That also explains the errors above: PreTrainedModel.from_pretrained loads the config through cls.config_class, which defaults to None (hence the 'NoneType' error), and the saved model_type comes from the config class's model_type attribute, which is why passing it as a keyword argument to the base PretrainedConfig never made it into config.json. Like so:

import torch.nn as nn
from transformers import GPT2TokenizerFast, PretrainedConfig, PreTrainedModel
from transformers.modeling_outputs import CausalLMOutputWithCrossAttentions


class EmbeddingConfig(PretrainedConfig):
    model_type = "embedding"

    def __init__(self, vocab_size=50257, embedding_dim=768, **kwargs):
        super().__init__(**kwargs)
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim


class EmbeddingModel(PreTrainedModel):
    config_class = EmbeddingConfig
    _tied_weights_keys = ["lm_head.weight"]

    def __init__(self, config, loss_fct=nn.CrossEntropyLoss()):
        super().__init__(config)
        self.loss_fct = loss_fct
        self.wte = nn.Embedding(config.vocab_size, config.embedding_dim)
        self.lm_head = nn.Linear(config.embedding_dim, config.vocab_size, bias=False)

    def forward(self, input_ids, labels=None, attention_mask=None, **kwargs):
        _, T = input_ids.shape
        token_embeddings = self.wte(input_ids)
        # Position embeddings, dropout, and transformer blocks would go here;
        # they are left out of this minimal embedding-only example.
        x = token_embeddings
        logits = self.lm_head(x)  # (B, T, embedding_dim) -> (B, T, vocab_size)

        loss = None
        if labels is not None:
            # Shift so that tokens up to position t predict the token at t+1
            shift_logits = logits[..., :-1, :].contiguous()
            shift_labels = labels[..., 1:].contiguous()
            loss = self.loss_fct(
                shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1)
            )

        return CausalLMOutputWithCrossAttentions(loss=loss, logits=logits)

config = EmbeddingConfig(vocab_size=tokenizer.vocab_size, embedding_dim=768 // 2)  # embedding_dim = 384
model = EmbeddingModel(config)

# Save the model and tokenizer
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)

# Reload the model and tokenizer
loaded_tokenizer = GPT2TokenizerFast.from_pretrained(output_dir)
loaded_model = EmbeddingModel.from_pretrained(output_dir)

This worked:

loading file vocab.json
loading file merges.txt
loading file tokenizer.json
loading file added_tokens.json
loading file special_tokens_map.json
loading file tokenizer_config.json
loading configuration file my_test/config.json
Model config EmbeddingConfig {
  "architectures": [
    "EmbeddingModel"
  ],
  "embedding_dim": 384,
  "model_type": "embedding",
  "torch_dtype": "float32",
  "transformers_version": "4.41.0",
  "vocab_size": 50257
}

loading weights file my_test/model.safetensors
Generate config GenerationConfig {}
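As a follow-up: if you also want the AutoModel.from_pretrained route from the question to work, the custom classes can be registered with the Auto factories. A minimal sketch, assuming EmbeddingConfig and EmbeddingModel are defined as above:

from transformers import AutoConfig, AutoModel

# Map the "embedding" model_type to the custom config, and the config class to the model class
AutoConfig.register("embedding", EmbeddingConfig)
AutoModel.register(EmbeddingConfig, EmbeddingModel)

auto_loaded = AutoModel.from_pretrained(output_dir)

If you want to share the checkpoint on the Hub with the custom code attached, the Transformers docs describe register_for_auto_class and trust_remote_code for that; local registration like the above is enough for reloading your own checkpoints.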