### System Info
- Code: current `main` branch, installed on 22nd March 2023 via `…pip install git+https://github.com/huggingface/transformers`
### Who can help?
@ArthurZucker @sgugger @zphang
### Information
- [X] The official example scripts
- [ ] My own modified scripts
### Tasks
- [ ] An officially supported task in the `examples` folder (such as GLUE/SQuAD, ...)
- [X] My own task or dataset (give details below)
### Reproduction
- Code to reproduce:
```
from transformers import LlamaTokenizer
tokenizer = LlamaTokenizer.from_pretrained("decapoda-research/llama-7b-hf")
print(repr(tokenizer.pad_token)) ## None
print(repr(tokenizer.bos_token)) ## ''
print(repr(tokenizer.eos_token)) ## ''
```
- Where this causes an issue:
```
batch = tokenizer(
    [
        "Singer Billy Joel yesterday ",
        "The primary use of LLaMA is research on large language "
    ],
    return_tensors="pt",
    padding=True
)
```
The call above raises the following error:
```
Using pad_token, but it is not set yet.
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
Cell In[53], line 1
----> 1 batch = tokenizer(
2 [
3 "Singer Billy Joel yesterday ",
4 "The primary use of LLaMA is research on large language "
5 ],
6 return_tensors="pt",
7 padding=True
8 )
File /home/ec2-user/anaconda3/envs/llm-gen/lib/python3.10/site-packages/transformers/tokenization_utils_base.py:2531, in PreTrainedTokenizerBase.__call__(self, text, text_pair, text_target, text_pair_target, add_special_tokens, padding, truncation, max_length, stride, is_split_into_words, pad_to_multiple_of, return_tensors, return_token_type_ids, return_attention_mask, return_overflowing_tokens, return_special_tokens_mask, return_offsets_mapping, return_length, verbose, **kwargs)
2529 if not self._in_target_context_manager:
2530 self._switch_to_input_mode()
-> 2531 encodings = self._call_one(text=text, text_pair=text_pair, **all_kwargs)
2532 if text_target is not None:
2533 self._switch_to_target_mode()
File /home/ec2-user/anaconda3/envs/llm-gen/lib/python3.10/site-packages/transformers/tokenization_utils_base.py:2617, in PreTrainedTokenizerBase._call_one(self, text, text_pair, add_special_tokens, padding, truncation, max_length, stride, is_split_into_words, pad_to_multiple_of, return_tensors, return_token_type_ids, return_attention_mask, return_overflowing_tokens, return_special_tokens_mask, return_offsets_mapping, return_length, verbose, **kwargs)
2612 raise ValueError(
2613 f"batch length of `text`: {len(text)} does not match batch length of `text_pair`:"
2614 f" {len(text_pair)}."
2615 )
2616 batch_text_or_text_pairs = list(zip(text, text_pair)) if text_pair is not None else text
-> 2617 return self.batch_encode_plus(
2618 batch_text_or_text_pairs=batch_text_or_text_pairs,
2619 add_special_tokens=add_special_tokens,
2620 padding=padding,
2621 truncation=truncation,
2622 max_length=max_length,
2623 stride=stride,
2624 is_split_into_words=is_split_into_words,
2625 pad_to_multiple_of=pad_to_multiple_of,
2626 return_tensors=return_tensors,
2627 return_token_type_ids=return_token_type_ids,
2628 return_attention_mask=return_attention_mask,
2629 return_overflowing_tokens=return_overflowing_tokens,
2630 return_special_tokens_mask=return_special_tokens_mask,
2631 return_offsets_mapping=return_offsets_mapping,
2632 return_length=return_length,
2633 verbose=verbose,
2634 **kwargs,
2635 )
2636 else:
2637 return self.encode_plus(
2638 text=text,
2639 text_pair=text_pair,
(...)
2655 **kwargs,
2656 )
File /home/ec2-user/anaconda3/envs/llm-gen/lib/python3.10/site-packages/transformers/tokenization_utils_base.py:2799, in PreTrainedTokenizerBase.batch_encode_plus(self, batch_text_or_text_pairs, add_special_tokens, padding, truncation, max_length, stride, is_split_into_words, pad_to_multiple_of, return_tensors, return_token_type_ids, return_attention_mask, return_overflowing_tokens, return_special_tokens_mask, return_offsets_mapping, return_length, verbose, **kwargs)
2782 """
2783 Tokenize and prepare for the model a list of sequences or a list of pairs of sequences.
2784
(...)
2795 details in `encode_plus`).
2796 """
2798 # Backward compatibility for 'truncation_strategy', 'pad_to_max_length'
-> 2799 padding_strategy, truncation_strategy, max_length, kwargs = self._get_padding_truncation_strategies(
2800 padding=padding,
2801 truncation=truncation,
2802 max_length=max_length,
2803 pad_to_multiple_of=pad_to_multiple_of,
2804 verbose=verbose,
2805 **kwargs,
2806 )
2808 return self._batch_encode_plus(
2809 batch_text_or_text_pairs=batch_text_or_text_pairs,
2810 add_special_tokens=add_special_tokens,
(...)
2825 **kwargs,
2826 )
File /home/ec2-user/anaconda3/envs/llm-gen/lib/python3.10/site-packages/transformers/tokenization_utils_base.py:2436, in PreTrainedTokenizerBase._get_padding_truncation_strategies(self, padding, truncation, max_length, pad_to_multiple_of, verbose, **kwargs)
2434 # Test if we have a padding token
2435 if padding_strategy != PaddingStrategy.DO_NOT_PAD and (not self.pad_token or self.pad_token_id < 0):
-> 2436 raise ValueError(
2437 "Asking to pad but the tokenizer does not have a padding token. "
2438 "Please select a token to use as `pad_token` `(tokenizer.pad_token = tokenizer.eos_token e.g.)` "
2439 "or add a new pad token via `tokenizer.add_special_tokens({'pad_token': '[PAD]'})`."
2440 )
2442 # Check that we will truncate to a multiple of pad_to_multiple_of if both are provided
2443 if (
2444 truncation_strategy != TruncationStrategy.DO_NOT_TRUNCATE
2445 and padding_strategy != PaddingStrategy.DO_NOT_PAD
(...)
2448 and (max_length % pad_to_multiple_of != 0)
2449 ):
ValueError: Asking to pad but the tokenizer does not have a padding token. Please select a token to use as `pad_token` `(tokenizer.pad_token = tokenizer.eos_token e.g.)` or add a new pad token via `tokenizer.add_special_tokens({'pad_token': '[PAD]'})`.
```
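For reference, here is a workaround sketch based on the error message's own suggestion. Note that the first suggestion (`tokenizer.pad_token = tokenizer.eos_token`) does not help with this checkpoint, because `eos_token` is itself an empty string; registering a dedicated `[PAD]` token is my own assumption and only unblocks tokenization (whether the model's embeddings should be resized for it is a separate question):

```
from transformers import LlamaTokenizer

tokenizer = LlamaTokenizer.from_pretrained("decapoda-research/llama-7b-hf")

# `tokenizer.pad_token = tokenizer.eos_token` does not work here: eos_token is ''
# for this checkpoint, so the "missing pad token" check still fails.
# Registering a dedicated pad token at least lets batched padding proceed.
tokenizer.add_special_tokens({"pad_token": "[PAD]"})

batch = tokenizer(
    [
        "Singer Billy Joel yesterday ",
        "The primary use of LLaMA is research on large language "
    ],
    return_tensors="pt",
    padding=True
)
print(batch["input_ids"].shape)
```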
### Expected behavior
The following code should tokenize and pad the batch without raising an error:
```
from transformers import LlamaTokenizer
tokenizer = LlamaTokenizer.from_pretrained("decapoda-research/llama-7b-hf")
batch = tokenizer(
    [
        "Singer Billy Joel yesterday ",
        "The primary use of LLaMA is research on large language "
    ],
    return_tensors="pt",
    padding=True
)
```
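For comparison, I would expect a correctly configured LLaMA tokenizer to report the standard SentencePiece control tokens rather than empty strings (this is an assumption about the intended configuration; `pad_token` may legitimately stay unset):

```
# Hypothetical expected values, assuming the standard LLaMA special tokens:
print(repr(tokenizer.bos_token))  # '<s>'
print(repr(tokenizer.eos_token))  # '</s>'
print(repr(tokenizer.unk_token))  # '<unk>'
```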