Hi,
The following code snippet, which pulls a pretrained custom tokenizer from the Hugging Face Hub,
import os
from transformers import AutoTokenizer

# load the tokenizer from the Hub
tokenizer = AutoTokenizer.from_pretrained(
    "smostafanejad/gen-mlm-cismi-bert-wordpiece",
    token=os.environ["HF_TOKEN"],
    cache_dir="./cache",
)
has suddenly been raising the following runtime error since yesterday (05/05/2025).
Cell In[4], line 5
2 from transformers import AutoTokenizer
4 # load the tokenizer
----> 5 tokenizer = AutoTokenizer.from_pretrained("smostafanejad/gen-mlm-cismi-bert-wordpiece",
6 token=os.environ['HF_TOKEN'],
7 cache_dir="./cache"
8 )
File ~/Packages/miniconda3/envs/bertchemai/lib/python3.10/site-packages/transformers/models/auto/tokenization_auto.py:992, in AutoTokenizer.from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs)
989 tokenizer_class_py, tokenizer_class_fast = TOKENIZER_MAPPING[type(config)]
991 if tokenizer_class_fast and (use_fast or tokenizer_class_py is None):
--> 992 return tokenizer_class_fast.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
993 else:
994 if tokenizer_class_py is not None:
File ~/Packages/miniconda3/envs/bertchemai/lib/python3.10/site-packages/transformers/tokenization_utils_base.py:2046, in PreTrainedTokenizerBase.from_pretrained(cls, pretrained_model_name_or_path, cache_dir, force_download, local_files_only, token, revision, trust_remote_code, *init_inputs, **kwargs)
2043 # If one passes a GGUF file path to `gguf_file` there is no need for this check as the tokenizer will be
2044 # loaded directly from the GGUF file.
2045 if all(full_file_name is None for full_file_name in resolved_vocab_files.values()) and not gguf_file:
-> 2046 raise EnvironmentError(
2047 f"Can't load tokenizer for '{pretrained_model_name_or_path}'. If you were trying to load it from "
2048 "'https://huggingface.co/models', make sure you don't have a local directory with the same name. "
2049 f"Otherwise, make sure '{pretrained_model_name_or_path}' is the correct path to a directory "
2050 f"containing all relevant files for a {cls.__name__} tokenizer."
2051 )
2053 for file_id, file_path in vocab_files.items():
2054 if file_id not in resolved_vocab_files:
OSError: Can't load tokenizer for 'smostafanejad/gen-mlm-cismi-bert-wordpiece'. If you were trying to load it from 'https://huggingface.co/models', make sure you don't have a local directory with the same name. Otherwise, make sure 'smostafanejad/gen-mlm-cismi-bert-wordpiece' is the correct path to a directory containing all relevant files for a BertTokenizerFast tokenizer.
I have followed the suggestions in the error message (there is no local directory with the same name, and the repository address on the Hub resolves correctly), but that did not help.
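For completeness, here is how I double-checked the repository and my token, as a minimal sketch using the huggingface_hub client (the repo id and the HF_TOKEN environment variable are the same ones used in the snippet above):

import os
from huggingface_hub import HfApi

api = HfApi(token=os.environ["HF_TOKEN"])

# confirm the token is valid and identifies my account
print(api.whoami()["name"])

# confirm the repository exists and list the files it contains
print(api.list_repo_files("smostafanejad/gen-mlm-cismi-bert-wordpiece"))

The repository listing comes back as expected, which makes the tokenizer failure all the more puzzling.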
I would appreciate any assistance with this, as the same function call worked without issue until yesterday.