I get the following error when trying to save a BERT-based model (astroBERT; also observed with sciBERT):
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
Cell In[15], line 1
----> 1 trainer.train()
2 trainer.save_model(f"astrobert-output/ft-{model_name}-{run_name}-final")
File ~/miniconda3/envs/paper-class/lib/python3.11/site-packages/transformers/trainer.py:1624, in Trainer.train(self, resume_from_checkpoint, trial, ignore_keys_for_eval, **kwargs)
1622 hf_hub_utils.enable_progress_bars()
1623 else:
-> 1624 return inner_training_loop(
1625 args=args,
1626 resume_from_checkpoint=resume_from_checkpoint,
1627 trial=trial,
1628 ignore_keys_for_eval=ignore_keys_for_eval,
1629 )
File ~/miniconda3/envs/paper-class/lib/python3.11/site-packages/transformers/trainer.py:2029, in Trainer._inner_training_loop(self, batch_size, args, resume_from_checkpoint, trial, ignore_keys_for_eval)
2026 self.state.epoch = epoch + (step + 1 + steps_skipped) / steps_in_epoch
2027 self.control = self.callback_handler.on_step_end(args, self.state, self.control)
-> 2029 self._maybe_log_save_evaluate(tr_loss, grad_norm, model, trial, epoch, ignore_keys_for_eval)
2030 else:
2031 self.control = self.callback_handler.on_substep_end(args, self.state, self.control)
File ~/miniconda3/envs/paper-class/lib/python3.11/site-packages/transformers/trainer.py:2423, in Trainer._maybe_log_save_evaluate(self, tr_loss, grad_norm, model, trial, epoch, ignore_keys_for_eval)
2420 self.lr_scheduler.step(metrics[metric_to_check])
2422 if self.control.should_save:
-> 2423 self._save_checkpoint(model, trial, metrics=metrics)
2424 self.control = self.callback_handler.on_save(self.args, self.state, self.control)
File ~/miniconda3/envs/paper-class/lib/python3.11/site-packages/transformers/trainer.py:2499, in Trainer._save_checkpoint(self, model, trial, metrics)
2497 else:
2498 staging_output_dir = os.path.join(run_dir, f"tmp-{checkpoint_folder}")
-> 2499 self.save_model(staging_output_dir, _internal_call=True)
2501 if not self.args.save_only_model:
2502 # Save optimizer and scheduler
2503 self._save_optimizer_and_scheduler(staging_output_dir)
File ~/miniconda3/envs/paper-class/lib/python3.11/site-packages/transformers/trainer.py:3016, in Trainer.save_model(self, output_dir, _internal_call)
3013 self.model_wrapped.save_checkpoint(output_dir)
3015 elif self.args.should_save:
-> 3016 self._save(output_dir)
3018 # Push to the Hub when `save_model` is called by the user.
3019 if self.args.push_to_hub and not _internal_call:
File ~/miniconda3/envs/paper-class/lib/python3.11/site-packages/transformers/trainer.py:3094, in Trainer._save(self, output_dir, state_dict)
3089 self.model.save_pretrained(
3090 output_dir, state_dict=state_dict, safe_serialization=self.args.save_safetensors
3091 )
3093 if self.tokenizer is not None:
-> 3094 self.tokenizer.save_pretrained(output_dir)
3096 # Good practice: save your training arguments together with the trained model
3097 torch.save(self.args, os.path.join(output_dir, TRAINING_ARGS_NAME))
File ~/miniconda3/envs/paper-class/lib/python3.11/site-packages/transformers/tokenization_utils_base.py:2464, in PreTrainedTokenizerBase.save_pretrained(self, save_directory, legacy_format, filename_prefix, push_to_hub, **kwargs)
2462 print(" ")
2463 with open(tokenizer_config_file, "w", encoding="utf-8") as f:
-> 2464 out_str = json.dumps(tokenizer_config, indent=2, sort_keys=True, ensure_ascii=False) + "\n"
2465 f.write(out_str)
2466 logger.info(f"tokenizer config file saved in {tokenizer_config_file}")
File ~/miniconda3/envs/paper-class/lib/python3.11/json/__init__.py:238, in dumps(obj, skipkeys, ensure_ascii, check_circular, allow_nan, cls, indent, separators, default, sort_keys, **kw)
232 if cls is None:
233 cls = JSONEncoder
234 return cls(
235 skipkeys=skipkeys, ensure_ascii=ensure_ascii,
236 check_circular=check_circular, allow_nan=allow_nan, indent=indent,
237 separators=separators, default=default, sort_keys=sort_keys,
--> 238 **kw).encode(obj)
File ~/miniconda3/envs/paper-class/lib/python3.11/json/encoder.py:202, in JSONEncoder.encode(self, o)
200 chunks = self.iterencode(o, _one_shot=True)
201 if not isinstance(chunks, (list, tuple)):
--> 202 chunks = list(chunks)
203 return ''.join(chunks)
File ~/miniconda3/envs/paper-class/lib/python3.11/json/encoder.py:432, in _make_iterencode.<locals>._iterencode(o, _current_indent_level)
430 yield from _iterencode_list(o, _current_indent_level)
431 elif isinstance(o, dict):
--> 432 yield from _iterencode_dict(o, _current_indent_level)
433 else:
434 if markers is not None:
File ~/miniconda3/envs/paper-class/lib/python3.11/json/encoder.py:406, in _make_iterencode.<locals>._iterencode_dict(dct, _current_indent_level)
404 else:
405 chunks = _iterencode(value, _current_indent_level)
--> 406 yield from chunks
407 if newline_indent is not None:
408 _current_indent_level -= 1
File ~/miniconda3/envs/paper-class/lib/python3.11/json/encoder.py:439, in _make_iterencode.<locals>._iterencode(o, _current_indent_level)
437 raise ValueError("Circular reference detected")
438 markers[markerid] = o
--> 439 o = _default(o)
440 yield from _iterencode(o, _current_indent_level)
441 if markers is not None:
File ~/miniconda3/envs/paper-class/lib/python3.11/json/encoder.py:180, in JSONEncoder.default(self, o)
161 def default(self, o):
162 """Implement this method in a subclass such that it returns
163 a serializable object for ``o``, or calls the base implementation
164 (to raise a ``TypeError``).
(...)
178
179 """
--> 180 raise TypeError(f'Object of type {o.__class__.__name__} '
181 f'is not JSON serializable')
TypeError: Object of type method is not JSON serializable
Here is an MRE, with the data omitted (the problem persists across different datasets):

from transformers import AutoModelForSequenceClassification, AutoTokenizer, Trainer

# num_labels, id2label, label2id, and encoded_dataset come from my dataset (omitted here)
model_checkpoint = "adsabs/astroBERT"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, add_special_tokens=True, do_lower_case=False, use_fast=False)
model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, problem_type="multi_label_classification", num_labels=num_labels, id2label=id2label, label2id=label2id)
trainer = Trainer(
    model=model,
    train_dataset=encoded_dataset["train"],
    eval_dataset=encoded_dataset["valid"],
    tokenizer=tokenizer,
)
trainer.train()
trainer.save_model("model")
The error occurs whenever the model attempts to save, whether through trainer.train() or trainer.save_model(); I expected both to save without error. Digging through the transformers source code, namely tokenization_utils_base.py, I found that the add_special_tokens entry of the tokenizer_config dict is of type <class 'method'>. I don't know why, since when I define the tokenizer above I pass add_special_tokens=True (the same problem occurs when I pass False). When I print add_special_tokens inside tokenization_utils_base.py, I see this:
<bound method SpecialTokensMixin.add_special_tokens of BertTokenizer(name_or_path='adsabs/astroBERT', vocab_size=30000, model_max_length=1000000000000000019884624838656, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True), added_tokens_decoder={
16338: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
16339: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
16340: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
16341: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
16342: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}>
Again, I'm not sure where this bound method is coming from. I'm using transformers version 4.38.1. What is the cause of this error? Could there be a bug in the transformers package?
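For completeness, the same observation can be made without editing the library file. This is a sketch based on my understanding that kwargs passed to from_pretrained are kept in tokenizer.init_kwargs, and that the getattr lookup below is effectively what save_pretrained does when it rebuilds the tokenizer config:

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("adsabs/astroBERT", add_special_tokens=True, do_lower_case=False, use_fast=False)
print(tokenizer.init_kwargs.get("add_special_tokens"))   # True, the value I passed to from_pretrained
print(type(getattr(tokenizer, "add_special_tokens")))    # <class 'method'>, the SpecialTokensMixin method of the same name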