Hi, I’m new to Hugging Face and I’m having an issue running the following lines to load a tokenizer:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("Meshwa/Distill-Bert-Automation-Command-Classification")
The error I get is the following:
Exception Traceback (most recent call last)
/Users/username/path/file.ipynb Cell 4 line 6
2 import torch
4 # Load the pre-trained model and tokenizer
5 # tokenizer = DistilBertTokenizer.from_pretrained("Meshwa/Distill-Bert-Automation-Command-Classification")
----> 6 tokenizer = AutoTokenizer.from_pretrained("Meshwa/Distill-Bert-Automation-Command-Classification")
9 # model = AutoModelForSequenceClassification.from_pretrained("Meshwa/Distill-Bert-Automation-Command-Classification")
10 # model = AutoModel.from_pretrained("Meshwa/Distill-Bert-Automation-Command-Classification")
11
(...)
15 # # Tokenize the text and classify it
16 # input_ids = tokenizer(text, padding=True, truncation=True, max_length=512, return_tensors="pt")
File ~/miniconda/envs/conda-pytorch/lib/python3.9/site-packages/transformers/models/auto/tokenization_auto.py:691, in AutoTokenizer.from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs)
687 if tokenizer_class is None:
688 raise ValueError(
689 f"Tokenizer class {tokenizer_class_candidate} does not exist or is not currently imported."
690 )
--> 691 return tokenizer_class.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
693 # Otherwise we have to be creative.
694 # if model is an encoder decoder, the encoder tokenizer class is used by default
695 if isinstance(config, EncoderDecoderConfig):
File ~/miniconda/envs/conda-pytorch/lib/python3.9/site-packages/transformers/tokenization_utils_base.py:1825, in PreTrainedTokenizerBase.from_pretrained(cls, pretrained_model_name_or_path, *init_inputs, **kwargs)
1822 else:
1823 logger.info(f"loading file {file_path} from cache at {resolved_vocab_files[file_id]}")
-> 1825 return cls._from_pretrained(
1826 resolved_vocab_files,
1827 pretrained_model_name_or_path,
1828 init_configuration,
1829 *init_inputs,
1830 use_auth_token=use_auth_token,
1831 cache_dir=cache_dir,
1832 local_files_only=local_files_only,
1833 _commit_hash=commit_hash,
1834 _is_local=is_local,
1835 **kwargs,
1836 )
File ~/miniconda/envs/conda-pytorch/lib/python3.9/site-packages/transformers/tokenization_utils_base.py:1988, in PreTrainedTokenizerBase._from_pretrained(cls, resolved_vocab_files, pretrained_model_name_or_path, init_configuration, use_auth_token, cache_dir, local_files_only, _commit_hash, _is_local, *init_inputs, **kwargs)
1986 # Instantiate tokenizer.
1987 try:
-> 1988 tokenizer = cls(*init_inputs, **init_kwargs)
1989 except OSError:
1990 raise OSError(
1991 "Unable to load vocabulary from file. "
1992 "Please check that the provided vocabulary is accessible and not corrupted."
1993 )
File ~/miniconda/envs/conda-pytorch/lib/python3.9/site-packages/transformers/models/distilbert/tokenization_distilbert_fast.py:145, in DistilBertTokenizerFast.__init__(self, vocab_file, tokenizer_file, do_lower_case, unk_token, sep_token, pad_token, cls_token, mask_token, tokenize_chinese_chars, strip_accents, **kwargs)
131 def __init__(
132 self,
133 vocab_file=None,
(...)
143 **kwargs,
144 ):
--> 145 super().__init__(
146 vocab_file,
147 tokenizer_file=tokenizer_file,
148 do_lower_case=do_lower_case,
149 unk_token=unk_token,
150 sep_token=sep_token,
151 pad_token=pad_token,
152 cls_token=cls_token,
153 mask_token=mask_token,
154 tokenize_chinese_chars=tokenize_chinese_chars,
155 strip_accents=strip_accents,
156 **kwargs,
157 )
159 normalizer_state = json.loads(self.backend_tokenizer.normalizer.__getstate__())
160 if (
161 normalizer_state.get("lowercase", do_lower_case) != do_lower_case
162 or normalizer_state.get("strip_accents", strip_accents) != strip_accents
163 or normalizer_state.get("handle_chinese_chars", tokenize_chinese_chars) != tokenize_chinese_chars
164 ):
File ~/miniconda/envs/conda-pytorch/lib/python3.9/site-packages/transformers/tokenization_utils_fast.py:111, in PreTrainedTokenizerFast.__init__(self, *args, **kwargs)
108 fast_tokenizer = copy.deepcopy(tokenizer_object)
109 elif fast_tokenizer_file is not None and not from_slow:
110 # We have a serialization from tokenizers which let us directly build the backend
--> 111 fast_tokenizer = TokenizerFast.from_file(fast_tokenizer_file)
112 elif slow_tokenizer is not None:
113 # We need to convert a slow tokenizer to build the backend
114 fast_tokenizer = convert_slow_tokenizer(slow_tokenizer)
Exception: No such file or directory (os error 2)
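For what it’s worth, the last frame in the traceback is TokenizerFast.from_file, so my guess is that the file it can’t find is the fast tokenizer’s serialized tokenizer.json. Based on that assumption (I haven’t verified it against the repo contents), forcing the slow tokenizer should make AutoTokenizer fall back to DistilBertTokenizer, which is exactly the class that works for me below:

from transformers import AutoTokenizer

# use_fast=False skips the Rust-backed fast tokenizer (and its tokenizer.json)
# and loads the slow Python tokenizer from vocab.txt instead.
tokenizer = AutoTokenizer.from_pretrained(
    "Meshwa/Distill-Bert-Automation-Command-Classification",
    use_fast=False,
)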
Alternatively, loading the tokenizer via the model-specific DistilBertTokenizer class works:
from transformers import DistilBertTokenizer
tokenizer = DistilBertTokenizer.from_pretrained("Meshwa/Distill-Bert-Automation-Command-Classification")
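Loaded this way, the tokenizer handles the call I had commented out in the cell above (the text here is just a placeholder command):

# Tokenize a sample command; the string is made up purely for illustration.
text = "turn on the living room lights"
input_ids = tokenizer(text, padding=True, truncation=True, max_length=512, return_tensors="pt")
print(input_ids["input_ids"].shape)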
Why is it necessary in this case to load the tokenizer with a model-specific class rather than the generic AutoTokenizer?
Thanks!