### System Info
- `transformers` version: 4.49.0
- Platform: Linux-5.15.0-131-generic-x86_64-with-glibc2.31
- Python version: 3.11.10
- Huggingface_hub version: 0.28.0
- Safetensors version: 0.5.2
- Accelerate version: 1.3.0
- Accelerate config: not found
- DeepSpeed version: not installed
- PyTorch version (GPU?): 2.5.1+cu124 (False)
- Tensorflow version (GPU?): not installed (NA)
- Flax version (CPU?/GPU?/TPU?): not installed (NA)
- Jax version: not installed
- JaxLib version: not installed
- Using distributed or parallel set-up in script?: no
### Who can help?
After the release of transformers 4.49.0, a smolagents CI test started failing with an error:
- https://github.com/huggingface/smolagents/issues/692
We worked around the issue by pinning `transformers<4.49.0`:
- https://github.com/huggingface/smolagents/pull/693
### Information
- [ ] The official example scripts
- [ ] My own modified scripts
### Tasks
- [ ] An officially supported task in the `examples` folder (such as GLUE/SQuAD, ...)
- [ ] My own task or dataset (give details below)
### Reproduction
The failing test code is: https://github.com/huggingface/smolagents/blob/40d795ddb60808d5094efad8e909f39376896d17/tests/test_models.py#L95-L107
```python
from pathlib import Path

from PIL import Image
from transformers.testing_utils import get_tests_dir  # helper used by the test to locate tests/fixtures

from smolagents import TransformersModel

img = Image.open(Path(get_tests_dir("fixtures")) / "000000039769.png")
model = TransformersModel(
    model_id="llava-hf/llava-interleave-qwen-0.5b-hf",
    max_new_tokens=5,
    device_map="cpu",
    do_sample=False,
)
messages = [{"role": "user", "content": [{"type": "text", "text": "Hello!"}, {"type": "image", "image": img}]}]
output = model(messages, stop_sequences=["great"]).content
```
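For reference, the same `TypeError` can likely be reproduced directly against transformers, without smolagents, by passing an explicit `images` kwarg (even `None`) to `apply_chat_template` while the image is embedded in the message content. A minimal sketch, assuming the checkpoint can be downloaded and a local image file is available:

```python
# Minimal sketch of the suspected trigger, independent of smolagents.
from PIL import Image
from transformers import AutoProcessor

processor = AutoProcessor.from_pretrained("llava-hf/llava-interleave-qwen-0.5b-hf")
img = Image.open("000000039769.png")  # any local RGB image should do here

messages = [
    {
        "role": "user",
        "content": [
            {"type": "text", "text": "Hello!"},
            {"type": "image", "image": img},
        ],
    }
]

# In transformers 4.49.0, apply_chat_template extracts the image from the message
# content itself and forwards it as `images=...` to the processor call; the explicit
# `images` kwarg below (None here, matching the smolagents call) is still forwarded
# via **kwargs, raising "got multiple values for keyword argument 'images'".
processor.apply_chat_template(
    messages,
    tokenize=True,
    return_dict=True,
    return_tensors="pt",
    images=None,
)
```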
The relevant code in smolagents, where we pass the `images` kwarg to `processor.apply_chat_template`, is:
```python
images = [Image.open(image) for image in images] if images else None
prompt_tensor = self.processor.apply_chat_template(
    messages,
    tools=[get_tool_json_schema(tool) for tool in tools_to_call_from] if tools_to_call_from else None,
    return_tensors="pt",
    tokenize=True,
    return_dict=True,
    images=images,
    add_generation_prompt=True if tools_to_call_from else False,
)
```
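One possible interim workaround on the smolagents side (a sketch only, not the fix that was merged, which was the version pin) would be to forward `images` only when there is something to forward, since transformers 4.49.0 already collects images embedded in the message content itself:

```python
# Sketch of a possible workaround: only include the `images` kwarg when it is
# non-empty, to avoid the duplicate keyword in transformers 4.49.0 when the
# conversation already embeds the images (as in the failing test).
template_kwargs = {
    "tools": [get_tool_json_schema(tool) for tool in tools_to_call_from] if tools_to_call_from else None,
    "return_tensors": "pt",
    "tokenize": True,
    "return_dict": True,
    "add_generation_prompt": bool(tools_to_call_from),
}
if images:  # hypothetical guard; the failing test passes the image inside `messages` only
    template_kwargs["images"] = [Image.open(image) for image in images]
prompt_tensor = self.processor.apply_chat_template(messages, **template_kwargs)
```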
The stack trace: https://github.com/huggingface/smolagents/actions/runs/13391491485/job/37400017221
```python
> output = model(messages, stop_sequences=["great"]).content
tests/test_models.py:95:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
src/smolagents/models.py:740: in __call__
prompt_tensor = self.processor.apply_chat_template(
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = LlavaProcessor:
- image_processor: SiglipImageProcessor {
"do_convert_rgb": null,
"do_normalize": true,
"do_resc...ge_tokens": 0,
"patch_size": 14,
"processor_class": "LlavaProcessor",
"vision_feature_select_strategy": "full"
}
conversation = [{'content': [{'text': 'Hello!', 'type': 'text'}, {'image': 'iVBORw0KGgoAAAANSUhEUgAAAoAAAAHgCAIAAAC6s0uzAAEAAElEQVR4n...q8dj+hsTgsx1DdXi+rV9LEk/l9NC3//ef/jNtKWLpyxrhMFRX/n+vEMxdFseMagAAAABJRU5ErkJggg==', 'type': 'image'}], 'role': 'user'}]
chat_template = "{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n'}}{# Render all images first #}{% for content i...{% endif %}{{'<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}"
kwargs = {'images': None, 'return_tensors': 'pt'}
tokenizer_template_kwargs = {'add_generation_prompt': False, 'continue_final_message': False, 'documents': None, 'return_assistant_tokens_mask': False, ...}
tokenizer_key = 'return_assistant_tokens_mask', tokenizer_value = False
value = None
chat_template_kwargs = {'add_generation_prompt': None, 'continue_final_message': None, 'documents': None, 'num_frames': None, ...}
key = 'sample_indices_fn', processor_value = None
def apply_chat_template(
self,
conversation: Union[List[Dict[str, str]], List[List[Dict[str, str]]]],
chat_template: Optional[str] = None,
**kwargs: Unpack[AllKwargsForChatTemplate],
) -> str:
"""
Similar to the `apply_chat_template` method on tokenizers, this method applies a Jinja template to input
conversations to turn them into a single tokenizable string.
The input is expected to be in the following format, where each message content is a list consisting of text and
optionally image or video inputs. One can also provide an image, video, URL or local path which will be used to form
`pixel_values` when `return_dict=True`. If not provided, one will get only the formatted text, optionally tokenized text.
conversation = [
{
"role": "user",
"content": [
{"type": "image", "image": "https://www.ilankelman.org/stopsigns/australia.jpg"},
{"type": "text", "text": "Please describe this image in detail."},
],
},
]
Args:
conversation (`Union[List[Dict, [str, str]], List[List[Dict[str, str]]]]`):
The conversation to format.
chat_template (`Optional[str]`, *optional*):
The Jinja template to use for formatting the conversation. If not provided, the tokenizer's
chat template is used.
"""
if chat_template is None:
if self.chat_template is not None:
chat_template = self.chat_template
else:
raise ValueError(
"No chat template is set for this processor. Please either set the `chat_template` attribute, "
"or provide a chat template as an argument. See "
"https://huggingface.co/docs/transformers/main/en/chat_templating for more information."
)
# Fill two sets of kwargs that should be used by tokenizer's `apply_chat_template`
# and for multimodal chat template
tokenizer_template_kwargs = {}
for tokenizer_key in TokenizerChatTemplateKwargs.__annotations__.keys():
tokenizer_value = getattr(TokenizerChatTemplateKwargs, tokenizer_key, None)
value = kwargs.pop(tokenizer_key, tokenizer_value)
tokenizer_template_kwargs[tokenizer_key] = value
chat_template_kwargs = {}
for key in ProcessorChatTemplateKwargs.__annotations__.keys():
processor_value = getattr(ProcessorChatTemplateKwargs, key, None)
value = kwargs.pop(key, processor_value)
chat_template_kwargs[key] = value
if isinstance(conversation, (list, tuple)) and (
isinstance(conversation[0], (list, tuple)) or hasattr(conversation[0], "content")
):
is_batched = True
conversations = conversation
else:
is_batched = False
conversations = [conversation]
num_frames = chat_template_kwargs.get("num_frames")
video_fps = chat_template_kwargs.get("video_fps")
video_load_backend = chat_template_kwargs.get("video_load_backend")
tokenize = chat_template_kwargs.get("tokenize")
return_dict = chat_template_kwargs.get("return_dict")
sample_indices_fn = chat_template_kwargs.get("sample_indices_fn")
if tokenize:
batch_images, batch_videos = [], []
batch_video_metadata = []
for conversation in conversations:
images, videos = [], []
video_metadata = []
for message in conversation:
visuals = [content for content in message["content"] if content["type"] in ["image", "video"]]
image_fnames = [
vision_info[key]
for vision_info in visuals
for key in ["image", "url", "path", "base64"]
if key in vision_info and vision_info["type"] == "image"
]
video_fnames = [
vision_info[key]
for vision_info in visuals
for key in ["video", "url", "path"]
if key in vision_info and vision_info["type"] == "video"
]
for fname in image_fnames:
images.append(load_image(fname))
for fname in video_fnames:
if isinstance(fname, (list, tuple)) and isinstance(fname[0], str):
video = [np.array(load_image(image_fname)).T for image_fname in fname]
# create a 4D video because `load_video` always returns a 4D array
video = np.stack(video)
metadata = None
logger.warning(
"When loading the video from list of images, we cannot infer metadata such as `fps` or `duration`. "
"If you model applies special processing based on metadata, please load the whole video and let the model sample frames."
)
else:
video, metadata = load_video(
fname,
num_frames=num_frames,
fps=video_fps,
backend=video_load_backend,
sample_indices_fn=sample_indices_fn,
)
videos.append(video)
video_metadata.append(metadata)
# Currently all processors can accept nested list of batches, but not flat list of visuals
# So we'll make a batched list of images and let the processor handle it
if images:
batch_images.append(images)
if videos:
batch_videos.append(videos)
batch_video_metadata.append(video_metadata)
# Process conversation with video/image information if needed. Then convert into a prompt using Jinja template
conversations = self._process_messages_for_chat_template(
conversations,
batch_images=batch_images,
batch_videos=batch_videos,
batch_video_metadata=batch_video_metadata,
**chat_template_kwargs,
)
prompt = self.tokenizer.apply_chat_template(
conversations,
chat_template=chat_template,
tokenize=False,
return_dict=False,
**tokenizer_template_kwargs,
)
if not is_batched:
prompt = prompt[0]
if tokenize:
# Tokenizer's `apply_chat_template` never adds special tokens when tokenizing
# But processor's `apply_chat_template` didn't have an option to tokenize, so users had to format the prompt
# and pass it to the processor. Users thus never worried about special tokens relying on processor hadnling
# everything internally. The below line is to keep BC for that and be able to work with model that have
# special tokens in the template (consistent with tokenizers). We dont want to raise warning, it will flood command line
# without actionable solution for users
single_prompt = prompt[0] if is_batched else prompt
if self.tokenizer.bos_token is not None and single_prompt.startswith(self.tokenizer.bos_token):
kwargs["add_special_tokens"] = False
> out = self(
text=prompt,
images=batch_images if batch_images else None,
videos=batch_videos if batch_videos else None,
**kwargs,
)
E TypeError: LlavaProcessor:
E - image_processor: SiglipImageProcessor {
E "do_convert_rgb": null,
E "do_normalize": true,
E "do_rescale": true,
E "do_resize": true,
E "image_mean": [
E 0.5,
E 0.5,
E 0.5
E ],
E "image_processor_type": "SiglipImageProcessor",
E "image_std": [
E 0.5,
E 0.5,
E 0.5
E ],
E "processor_class": "LlavaProcessor",
E "resample": 3,
E "rescale_factor": 0.00392156862745098,
E "size": {
E "height": 384,
E "width": 384
E }
E }
E
E - tokenizer: Qwen2TokenizerFast(name_or_path='llava-hf/llava-interleave-qwen-0.5b-hf', vocab_size=151643, model_max_length=32768, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'eos_token': '<|im_end|>', 'pad_token': '<|endoftext|>', 'additional_special_tokens': ['<|im_start|>', '<|im_end|>'], 'image_token': '<image>'}, clean_up_tokenization_spaces=False, added_tokens_decoder={
E 151643: AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
E 151644: AddedToken("<|im_start|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
E 151645: AddedToken("<|im_end|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
E 151646: AddedToken("<image>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
E }
E )
E
E {
E "image_token": "<image>",
E "num_additional_image_tokens": 0,
E "patch_size": 14,
E "processor_class": "LlavaProcessor",
E "vision_feature_select_strategy": "full"
E }
E got multiple values for keyword argument 'images'
.venv/lib/python3.10/site-packages/transformers/processing_utils.py:1383: TypeError
```
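From the trace, the conflict seems to come from the new image handling in `apply_chat_template`: with `tokenize=True`, the method extracts the images from the message content into `batch_images` and forwards them as `images=...` to `self(...)`, while the user-supplied `images` kwarg (even when it is `None`, as in `kwargs = {'images': None, 'return_tensors': 'pt'}` above) is still present in `**kwargs`, so the processor call receives `images` twice. Below is a rough sketch, based only on the source shown in the traceback, of how the two sources could be merged rather than collide; it is an illustration, not a proposed patch:

```python
# Sketch only (variable names follow the traceback source): pop any user-supplied
# `images`/`videos` out of kwargs and merge them with the visuals extracted from
# the conversation, so `self(...)` receives a single `images` argument.
user_images = kwargs.pop("images", None)
user_videos = kwargs.pop("videos", None)
out = self(
    text=prompt,
    images=batch_images if batch_images else user_images,
    videos=batch_videos if batch_videos else user_videos,
    **kwargs,
)
```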
### Expected behavior
No error. The same call works with `transformers<4.49.0`, so passing the `images` kwarg to `processor.apply_chat_template` should not raise a `TypeError`.