Importing a DistilBertTokenizer does not work using AutoTokenizer

Hi, I’m new to Hugging Face and I’m having an issue running the following line to import a tokenizer:

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("Meshwa/Distill-Bert-Automation-Command-Classification")

The error I get is the following:

Exception                                 Traceback (most recent call last)
/Users/username/path/file.ipynb Cell 4 line 6
	  2 import torch
	  4 # Load the pre-trained model and tokenizer
	  5 # tokenizer = DistilBertTokenizer.from_pretrained("Meshwa/Distill-Bert-Automation-Command-Classification")
----> 6 tokenizer = AutoTokenizer.from_pretrained("Meshwa/Distill-Bert-Automation-Command-Classification")
	  9 # model = AutoModelForSequenceClassification.from_pretrained("Meshwa/Distill-Bert-Automation-Command-Classification")
	 10 # model = AutoModel.from_pretrained("Meshwa/Distill-Bert-Automation-Command-Classification")
	 11 
   (...)
	 15 # # Tokenize the text and classify it
	 16 # input_ids = tokenizer(text, padding=True, truncation=True, max_length=512, return_tensors="pt")

File ~/miniconda/envs/conda-pytorch/lib/python3.9/site-packages/transformers/models/auto/tokenization_auto.py:691, in AutoTokenizer.from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs)
	687     if tokenizer_class is None:
	688         raise ValueError(
	689             f"Tokenizer class {tokenizer_class_candidate} does not exist or is not currently imported."
	690         )
--> 691     return tokenizer_class.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
	693 # Otherwise we have to be creative.
	694 # if model is an encoder decoder, the encoder tokenizer class is used by default
	695 if isinstance(config, EncoderDecoderConfig):

File ~/miniconda/envs/conda-pytorch/lib/python3.9/site-packages/transformers/tokenization_utils_base.py:1825, in PreTrainedTokenizerBase.from_pretrained(cls, pretrained_model_name_or_path, *init_inputs, **kwargs)
   1822     else:
   1823         logger.info(f"loading file {file_path} from cache at {resolved_vocab_files[file_id]}")
-> 1825 return cls._from_pretrained(
   1826     resolved_vocab_files,
   1827     pretrained_model_name_or_path,
   1828     init_configuration,
   1829     *init_inputs,
   1830     use_auth_token=use_auth_token,
   1831     cache_dir=cache_dir,
   1832     local_files_only=local_files_only,
   1833     _commit_hash=commit_hash,
   1834     _is_local=is_local,
   1835     **kwargs,
   1836 )

File ~/miniconda/envs/conda-pytorch/lib/python3.9/site-packages/transformers/tokenization_utils_base.py:1988, in PreTrainedTokenizerBase._from_pretrained(cls, resolved_vocab_files, pretrained_model_name_or_path, init_configuration, use_auth_token, cache_dir, local_files_only, _commit_hash, _is_local, *init_inputs, **kwargs)
   1986 # Instantiate tokenizer.
   1987 try:
-> 1988     tokenizer = cls(*init_inputs, **init_kwargs)
   1989 except OSError:
   1990     raise OSError(
   1991         "Unable to load vocabulary from file. "
   1992         "Please check that the provided vocabulary is accessible and not corrupted."
   1993     )

File ~/miniconda/envs/conda-pytorch/lib/python3.9/site-packages/transformers/models/distilbert/tokenization_distilbert_fast.py:145, in DistilBertTokenizerFast.__init__(self, vocab_file, tokenizer_file, do_lower_case, unk_token, sep_token, pad_token, cls_token, mask_token, tokenize_chinese_chars, strip_accents, **kwargs)
	131 def __init__(
	132     self,
	133     vocab_file=None,
   (...)
	143     **kwargs,
	144 ):
--> 145     super().__init__(
	146         vocab_file,
	147         tokenizer_file=tokenizer_file,
	148         do_lower_case=do_lower_case,
	149         unk_token=unk_token,
	150         sep_token=sep_token,
	151         pad_token=pad_token,
	152         cls_token=cls_token,
	153         mask_token=mask_token,
	154         tokenize_chinese_chars=tokenize_chinese_chars,
	155         strip_accents=strip_accents,
	156         **kwargs,
	157     )
	159     normalizer_state = json.loads(self.backend_tokenizer.normalizer.__getstate__())
	160     if (
	161         normalizer_state.get("lowercase", do_lower_case) != do_lower_case
	162         or normalizer_state.get("strip_accents", strip_accents) != strip_accents
	163         or normalizer_state.get("handle_chinese_chars", tokenize_chinese_chars) != tokenize_chinese_chars
	164     ):

File ~/miniconda/envs/conda-pytorch/lib/python3.9/site-packages/transformers/tokenization_utils_fast.py:111, in PreTrainedTokenizerFast.__init__(self, *args, **kwargs)
	108     fast_tokenizer = copy.deepcopy(tokenizer_object)
	109 elif fast_tokenizer_file is not None and not from_slow:
	110     # We have a serialization from tokenizers which let us directly build the backend
--> 111     fast_tokenizer = TokenizerFast.from_file(fast_tokenizer_file)
	112 elif slow_tokenizer is not None:
	113     # We need to convert a slow tokenizer to build the backend
	114     fast_tokenizer = convert_slow_tokenizer(slow_tokenizer)

Exception: No such file or directory (os error 2)

Alternatively, when I import the tokenizer via DistilBertTokenizer directly, it works:

from transformers import DistilBertTokenizer

tokenizer = DistilBertTokenizer.from_pretrained("Meshwa/Distill-Bert-Automation-Command-Classification")

Why is it required in this case to import the tokenizer with the specific DistilBertTokenizer class rather than the general AutoTokenizer class?

Thanks!

1 Like