Dear all, I would like to ask how we can use Qwen3-VL's generate() with num_return_sequences > 1.
Here is my code with num_return_sequences > 1:
from transformers import Qwen3VLForConditionalGeneration, AutoProcessor, GenerationConfig
import torch
from qwen_vl_utils import process_vision_info

model = Qwen3VLForConditionalGeneration.from_pretrained(
    "./Qwen3-VL-8B-Instruct",
    dtype=torch.bfloat16,
    attn_implementation="flash_attention_2",
    device_map="auto",
)
processor = AutoProcessor.from_pretrained("./Qwen3-VL-8B-Instruct")

# num_return_sequences=8 works for Qwen2.5-VL, but raises an error for Qwen3-VL
generation_config = GenerationConfig(
    max_new_tokens=768,
    do_sample=True,
    top_p=0.95,
    temperature=1,  # HACK
    num_return_sequences=8,
    pad_token_id=processor.tokenizer.pad_token_id,
)

messages = [
    {
        "role": "user",
        "content": [
            {
                "type": "video",
                "video": "test.mp4",
            },
            {"type": "text", "text": "Describe this video."},
        ],
    }
]

image_inputs, video_inputs, video_kwargs = process_vision_info(
    messages, image_patch_size=16, return_video_kwargs=True, return_video_metadata=True
)
if video_inputs is not None:
    video_inputs, video_metadatas = zip(*video_inputs)
    video_inputs, video_metadatas = list(video_inputs), list(video_metadatas)
else:
    video_metadatas = None

text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
inputs = processor(
    text=text,
    images=image_inputs,
    videos=video_inputs,
    video_metadata=video_metadatas,
    return_tensors="pt",
    do_resize=False,
    **video_kwargs,
)
inputs = inputs.to(model.device)

# using generation_config with num_return_sequences > 1 raises the error below
generated_ids = model.generate(**inputs, max_new_tokens=128, generation_config=generation_config)
Here is the error info:
`generation_config` default values have been modified to match model-specific defaults: {'temperature': 0.7, 'top_k': 20, 'bos_token_id': 151643, 'eos_token_id': [151645, 151643]}. If this is not desired, please set these values explicitly.
Traceback (most recent call last):
File "video/Qwen3-VL-Inference/test_qwen3vl_infer_video_use_generation_config_num_return_sequences8.py", line 153, in <module>
generated_ids = model.generate(**inputs, max_new_tokens=128, generation_config=generation_config)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "anaconda3/envs/video-r1-qwen3vl/lib/python3.11/site-packages/torch/utils/_contextlib.py", line 116, in decorate_context
return func(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^
File "video/Qwen3-VL-Inference/transformers-main/src/transformers/generation/utils.py", line 2579, in generate
input_ids, model_kwargs = self._expand_inputs_for_generation(
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "video/Qwen3-VL-Inference/transformers-main/src/transformers/models/qwen3_vl/modeling_qwen3_vl.py", line 1607, in _expand_inputs_for_generation
model_kwargs = _expand_dict_for_generation_visual(model_kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "video/Qwen3-VL-Inference/transformers-main/src/transformers/models/qwen3_vl/modeling_qwen3_vl.py", line 1580, in _expand_dict_for_generation_visual
samples = torch.split(video_grid_thw, list(video_nums))
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "anaconda3/envs/video-r1-qwen3vl/lib/python3.11/site-packages/torch/functional.py", line 207, in split
return tensor.split(split_size_or_sections, dim)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "anaconda3/envs/video-r1-qwen3vl/lib/python3.11/site-packages/torch/_tensor.py", line 983, in split
return torch._VF.split_with_sizes(self, split_size, dim)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
RuntimeError: split_with_sizes expects split_sizes to sum exactly to 1 (input tensor's size at dimension 0), but got split_sizes=[8]
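If I read the traceback correctly, the split sizes passed to torch.split inside Qwen3-VL's _expand_dict_for_generation_visual sum to 8 (matching num_return_sequences=8), while video_grid_thw still has only one row for the single input video, so the split fails. A standalone sketch of just that tensor mismatch (my own illustration with a made-up grid, not the library code):

import torch

# Made-up (t, h, w) grid for a single video; only the row count matters here.
video_grid_thw = torch.tensor([[4, 24, 24]])
# Split sizes summing to 8 against a tensor of size 1 at dim 0 reproduces
# the same RuntimeError as in the traceback above.
torch.split(video_grid_thw, [8])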
For reference, I installed the latest transformers (5.0.0.dev0) and qwen_vl_utils 0.0.14.
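In the meantime, the only workaround I can think of is to drop num_return_sequences and sample in a loop instead (a sketch, assuming 8 independent do_sample calls are acceptable; it is slower since the prompt and video are re-processed on every call):

all_generated_ids = []
for _ in range(8):
    # One sampled sequence per call; this avoids the visual-input expansion entirely.
    out = model.generate(
        **inputs,
        max_new_tokens=768,
        do_sample=True,
        top_p=0.95,
        temperature=1,
        pad_token_id=processor.tokenizer.pad_token_id,
    )
    all_generated_ids.append(out)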
Do you have any solution to this? How can we use Qwen3-VL's generate() with num_return_sequences > 1?