HuggingFace Transformers Error When Saving Model: TypeError: Object of type method is not JSON serializable

I get the following error when trying to save a BERT-based model (astroBERT; also observed with sciBERT):

    ---------------------------------------------------------------------------
    TypeError                                 Traceback (most recent call last)
    Cell In[15], line 1
    ----> 1 trainer.train()
          2 trainer.save_model(f"astrobert-output/ft-{model_name}-{run_name}-final")
    
    File ~/miniconda3/envs/paper-class/lib/python3.11/site-packages/transformers/trainer.py:1624, in Trainer.train(self, resume_from_checkpoint, trial, ignore_keys_for_eval, **kwargs)
       1622         hf_hub_utils.enable_progress_bars()
       1623 else:
    -> 1624     return inner_training_loop(
       1625         args=args,
       1626         resume_from_checkpoint=resume_from_checkpoint,
       1627         trial=trial,
       1628         ignore_keys_for_eval=ignore_keys_for_eval,
       1629     )
    
    File ~/miniconda3/envs/paper-class/lib/python3.11/site-packages/transformers/trainer.py:2029, in Trainer._inner_training_loop(self, batch_size, args, resume_from_checkpoint, trial, ignore_keys_for_eval)
       2026     self.state.epoch = epoch + (step + 1 + steps_skipped) / steps_in_epoch
       2027     self.control = self.callback_handler.on_step_end(args, self.state, self.control)
    -> 2029     self._maybe_log_save_evaluate(tr_loss, grad_norm, model, trial, epoch, ignore_keys_for_eval)
       2030 else:
       2031     self.control = self.callback_handler.on_substep_end(args, self.state, self.control)
    
    File ~/miniconda3/envs/paper-class/lib/python3.11/site-packages/transformers/trainer.py:2423, in Trainer._maybe_log_save_evaluate(self, tr_loss, grad_norm, model, trial, epoch, ignore_keys_for_eval)
       2420         self.lr_scheduler.step(metrics[metric_to_check])
       2422 if self.control.should_save:
    -> 2423     self._save_checkpoint(model, trial, metrics=metrics)
       2424     self.control = self.callback_handler.on_save(self.args, self.state, self.control)
    
    File ~/miniconda3/envs/paper-class/lib/python3.11/site-packages/transformers/trainer.py:2499, in Trainer._save_checkpoint(self, model, trial, metrics)
       2497 else:
       2498     staging_output_dir = os.path.join(run_dir, f"tmp-{checkpoint_folder}")
    -> 2499 self.save_model(staging_output_dir, _internal_call=True)
       2501 if not self.args.save_only_model:
       2502     # Save optimizer and scheduler
       2503     self._save_optimizer_and_scheduler(staging_output_dir)
    
    File ~/miniconda3/envs/paper-class/lib/python3.11/site-packages/transformers/trainer.py:3016, in Trainer.save_model(self, output_dir, _internal_call)
       3013         self.model_wrapped.save_checkpoint(output_dir)
       3015 elif self.args.should_save:
    -> 3016     self._save(output_dir)
       3018 # Push to the Hub when `save_model` is called by the user.
       3019 if self.args.push_to_hub and not _internal_call:
    
    File ~/miniconda3/envs/paper-class/lib/python3.11/site-packages/transformers/trainer.py:3094, in Trainer._save(self, output_dir, state_dict)
       3089     self.model.save_pretrained(
       3090         output_dir, state_dict=state_dict, safe_serialization=self.args.save_safetensors
       3091     )
       3093 if self.tokenizer is not None:
    -> 3094     self.tokenizer.save_pretrained(output_dir)
       3096 # Good practice: save your training arguments together with the trained model
       3097 torch.save(self.args, os.path.join(output_dir, TRAINING_ARGS_NAME))
    
    File ~/miniconda3/envs/paper-class/lib/python3.11/site-packages/transformers/tokenization_utils_base.py:2464, in PreTrainedTokenizerBase.save_pretrained(self, save_directory, legacy_format, filename_prefix, push_to_hub, **kwargs)
       2462     print(" ")
       2463 with open(tokenizer_config_file, "w", encoding="utf-8") as f:
    -> 2464     out_str = json.dumps(tokenizer_config, indent=2, sort_keys=True, ensure_ascii=False) + "\n"
       2465     f.write(out_str)
       2466 logger.info(f"tokenizer config file saved in {tokenizer_config_file}")
    
    File ~/miniconda3/envs/paper-class/lib/python3.11/json/__init__.py:238, in dumps(obj, skipkeys, ensure_ascii, check_circular, allow_nan, cls, indent, separators, default, sort_keys, **kw)
        232 if cls is None:
        233     cls = JSONEncoder
        234 return cls(
        235     skipkeys=skipkeys, ensure_ascii=ensure_ascii,
        236     check_circular=check_circular, allow_nan=allow_nan, indent=indent,
        237     separators=separators, default=default, sort_keys=sort_keys,
    --> 238     **kw).encode(obj)
    
    File ~/miniconda3/envs/paper-class/lib/python3.11/json/encoder.py:202, in JSONEncoder.encode(self, o)
        200 chunks = self.iterencode(o, _one_shot=True)
        201 if not isinstance(chunks, (list, tuple)):
    --> 202     chunks = list(chunks)
        203 return ''.join(chunks)
    
    File ~/miniconda3/envs/paper-class/lib/python3.11/json/encoder.py:432, in _make_iterencode.<locals>._iterencode(o, _current_indent_level)
        430     yield from _iterencode_list(o, _current_indent_level)
        431 elif isinstance(o, dict):
    --> 432     yield from _iterencode_dict(o, _current_indent_level)
        433 else:
        434     if markers is not None:
    
    File ~/miniconda3/envs/paper-class/lib/python3.11/json/encoder.py:406, in _make_iterencode.<locals>._iterencode_dict(dct, _current_indent_level)
        404         else:
        405             chunks = _iterencode(value, _current_indent_level)
    --> 406         yield from chunks
        407 if newline_indent is not None:
        408     _current_indent_level -= 1
    
    File ~/miniconda3/envs/paper-class/lib/python3.11/json/encoder.py:439, in _make_iterencode.<locals>._iterencode(o, _current_indent_level)
        437         raise ValueError("Circular reference detected")
        438     markers[markerid] = o
    --> 439 o = _default(o)
        440 yield from _iterencode(o, _current_indent_level)
        441 if markers is not None:
    
    File ~/miniconda3/envs/paper-class/lib/python3.11/json/encoder.py:180, in JSONEncoder.default(self, o)
        161 def default(self, o):
        162     """Implement this method in a subclass such that it returns
        163     a serializable object for ``o``, or calls the base implementation
        164     (to raise a ``TypeError``).
       (...)
        178 
        179     """
    --> 180     raise TypeError(f'Object of type {o.__class__.__name__} '
        181                     f'is not JSON serializable')
    
    TypeError: Object of type method is not JSON serializable

Here is an MRE excluding data (the problem persists across different datasets):

    model_checkpoint = "adsabs/astroBERT"
    tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, add_special_tokens=True, do_lower_case=False, use_fast=False)
    model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, problem_type="multi_label_classification", num_labels=num_labels, id2label=id2label, label2id=label2id)
    
    trainer = Trainer(
        model=model,
        train_dataset=encoded_dataset["train"],
        eval_dataset=encoded_dataset["valid"],
        tokenizer=tokenizer)
    
    trainer.train()
    trainer.save_model(f"model")

The error occurs whenever the model tries to save, whether via trainer.train() (at a checkpoint) or trainer.save_model(); I expected both to save without error. I dug through the transformers source, namely tokenization_utils_base.py, and found that the 'add_special_tokens' entry of the tokenizer_config dict is of type <class 'method'>. I don't know why this is the case, because when I define the tokenizer above I set add_special_tokens to simply True (the same problem occurs when I set it to False). When I print add_special_tokens within tokenization_utils_base.py, I see this:

    <bound method SpecialTokensMixin.add_special_tokens of BertTokenizer(name_or_path='adsabs/astroBERT', vocab_size=30000, model_max_length=1000000000000000019884624838656, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
    	16338: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
    	16339: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
    	16340: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
    	16341: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
    	16342: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
    }>

Again, not sure where this is coming from. I’m using version 4.38.1 of transformers. What is the cause of this error? Could there be a bug in the transformers package?
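
For what it's worth, the shadowing shows up without editing the library source; this is a quick check on the tokenizer object (a sketch of the same attribute lookup that save_pretrained ends up doing):

    from transformers import AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained(
        "adsabs/astroBERT", add_special_tokens=True, do_lower_case=False, use_fast=False
    )

    # The attribute lookup resolves to the bound SpecialTokensMixin.add_special_tokens
    # method rather than to the boolean kwarg passed to from_pretrained:
    print(type(getattr(tokenizer, "add_special_tokens")))  # <class 'method'>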

I think I figured this out. There's a section of tokenization_utils_base.py, in the save_pretrained method of the PreTrainedTokenizerBase class, that builds the tokenizer config from existing instance attributes:

    tokenizer_config = copy.deepcopy(self.init_kwargs)
    # Let's save the init kwargs
    target_keys = set(self.init_kwargs.keys())
    # Let's save the special tokens map (only the strings)
    target_keys.update(["model_max_length", "clean_up_tokenization_spaces"])
    for k in target_keys:
        if hasattr(self, k):
            tokenizer_config[k] = getattr(self, k)

astroBERT happens to ship 'add_special_tokens' in its tokenizer config (tokenizer_config.json · adsabs/astroBERT at main), so that key ends up in target_keys. The loop above then copies the instance attribute of the same name back into the tokenizer config. However, PreTrainedTokenizerBase also has a method called add_special_tokens, which it inherits from SpecialTokensMixin, so getattr(self, 'add_special_tokens') returns that bound method rather than the boolean. The boolean in the config gets overwritten with the method, and json.dumps then fails with the error posted above. I added a little hack to get around this:

    for k in target_keys:
        if hasattr(self, k):
            # skip the attribute lookup for this one key so the boolean copied
            # from init_kwargs is kept instead of the bound method
            if k == 'add_special_tokens' and tokenizer_config['name_or_path'] == "adsabs/astroBERT":
                continue
            tokenizer_config[k] = getattr(self, k)

which I think works because the astroBERT tokenizer config already has add_special_tokens set to what I want it to be.
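
If editing the installed package is undesirable, something like the following might work instead. This is an untested sketch: it assumes tokenizer.init_kwargs is the same dict that save_pretrained deep-copies (per the snippet above), and that dropping the key is harmless because add_special_tokens=True is the encoding default anyway.

    # Drop the offending key before training so that save_pretrained() never
    # performs the attribute lookup that returns the bound method.
    tokenizer.init_kwargs.pop("add_special_tokens", None)

    trainer.train()
    trainer.save_model("model")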

I didn't observe the original error with another model, bert-base-uncased, because it does not have add_special_tokens in its tokenizer config (tokenizer_config.json · google-bert/bert-base-uncased at main), so save_pretrained never puts add_special_tokens into target_keys and the value is never overwritten with a method instead of a boolean. I think this is a bug in the transformers package, or in astroBERT more specifically.
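
A quick way to confirm the difference between the two checkpoints (again a sketch, relying on init_kwargs being the dict that save_pretrained copies):

    from transformers import AutoTokenizer

    for ckpt in ("adsabs/astroBERT", "google-bert/bert-base-uncased"):
        tok = AutoTokenizer.from_pretrained(ckpt, use_fast=False)
        # True only for astroBERT, whose tokenizer_config.json ships the key;
        # that presence is what lets the key reach target_keys on save.
        print(ckpt, "add_special_tokens" in tok.init_kwargs)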