After changing the patch_size parameter in qwen-image-edit-2509/processor/preprocessor_config.json from 14 to 7, training works normally, but the following error occurs when I run inference: /pytorch/aten/src/ATen/native/cuda/IndexKernelUtils.cu:16: vectorized_gather_kernel: block: [450,0,0], thread: [0,0,0] Assertion `ind >=0 && ind < ind_dim_size && "vectorized gather kernel index out of bounds"` failed.
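I only edited the preprocessor config, so the vision tower inside the text encoder still declares its original patch_size. Below is a minimal sketch (local path taken from my inference code further down, standard subfolder layout assumed) of how the two values can be compared:

import json
from transformers import AutoConfig

model_dir = "autodl-tmp/qwen-image-edit-2509"  # local checkpoint path used in the inference code below

# patch_size used by the image processor (the value I changed from 14 to 7)
with open(f"{model_dir}/processor/preprocessor_config.json") as f:
    processor_patch_size = json.load(f)["patch_size"]

# patch_size the Qwen2.5-VL vision tower was built with (left untouched)
vision_patch_size = AutoConfig.from_pretrained(f"{model_dir}/text_encoder").vision_config.patch_size

print(processor_patch_size, vision_patch_size)  # I expect 7 vs. 14 here after my edit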
Below is the traceback:
AcceleratorError Traceback (most recent call last)
Cell In[3], line 26
24 image = Image.open(f"{image_name}.png").convert("RGB")
25 width, height = image.size
---> 26 images_out = pipe(image, prompt, negative_prompt="脸部出现红晕", num_inference_steps=15, output_type='pil', true_cfg_scale=4.0).images
27 save_image = images_out[0].resize((width, height))
28 save_image.save(save_image_name)
File ~/miniconda3/lib/python3.12/site-packages/torch/utils/_contextlib.py:120, in context_decorator.<locals>.decorate_context(*args, **kwargs)
117 @functools.wraps(func)
118 def decorate_context(*args, **kwargs):
119 with ctx_factory():
--> 120 return func(*args, **kwargs)
File ~/miniconda3/lib/python3.12/site-packages/diffusers/pipelines/qwenimage/pipeline_qwenimage_edit_plus.py:700, in QwenImageEditPlusPipeline.__call__(self, image, prompt, negative_prompt, true_cfg_scale, height, width, num_inference_steps, sigmas, guidance_scale, num_images_per_prompt, generator, latents, prompt_embeds, prompt_embeds_mask, negative_prompt_embeds, negative_prompt_embeds_mask, output_type, return_dict, attention_kwargs, callback_on_step_end, callback_on_step_end_tensor_inputs, max_sequence_length)
695 logger.warning(
696 " negative_prompt is passed but classifier-free guidance is not enabled since true_cfg_scale <= 1"
697 )
699 do_true_cfg = true_cfg_scale > 1 and has_neg_prompt
--> 700 prompt_embeds, prompt_embeds_mask = self.encode_prompt(
701 image=condition_images,
702 prompt=prompt,
703 prompt_embeds=prompt_embeds,
704 prompt_embeds_mask=prompt_embeds_mask,
705 device=device,
706 num_images_per_prompt=num_images_per_prompt,
707 max_sequence_length=max_sequence_length,
708 )
709 if do_true_cfg:
710 negative_prompt_embeds, negative_prompt_embeds_mask = self.encode_prompt(
711 image=condition_images,
712 prompt=negative_prompt,
(…) 717 max_sequence_length=max_sequence_length,
718 )
File ~/miniconda3/lib/python3.12/site-packages/diffusers/pipelines/qwenimage/pipeline_qwenimage_edit_plus.py:318, in QwenImageEditPlusPipeline.encode_prompt(self, prompt, image, device, num_images_per_prompt, prompt_embeds, prompt_embeds_mask, max_sequence_length)
315 batch_size = len(prompt) if prompt_embeds is None else prompt_embeds.shape[0]
317 if prompt_embeds is None:
--> 318 prompt_embeds, prompt_embeds_mask = self._get_qwen_prompt_embeds(prompt, image, device)
320 _, seq_len, _ = prompt_embeds.shape
321 prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
File ~/miniconda3/lib/python3.12/site-packages/diffusers/pipelines/qwenimage/pipeline_qwenimage_edit_plus.py:262, in QwenImageEditPlusPipeline._get_qwen_prompt_embeds(self, prompt, image, device, dtype)
253 txt = [template.format(base_img_prompt + e) for e in prompt]
255 model_inputs = self.processor(
256 text=txt,
257 images=image,
258 padding=True,
259 return_tensors="pt",
260 ).to(device)
--> 262 outputs = self.text_encoder(
263 input_ids=model_inputs.input_ids,
264 attention_mask=model_inputs.attention_mask,
265 pixel_values=model_inputs.pixel_values,
266 image_grid_thw=model_inputs.image_grid_thw,
267 output_hidden_states=True,
268 )
270 hidden_states = outputs.hidden_states[-1]
271 split_hidden_states = self._extract_masked_hidden(hidden_states, model_inputs.attention_mask)
File ~/miniconda3/lib/python3.12/site-packages/torch/nn/modules/module.py:1775, in Module._wrapped_call_impl(self, *args, **kwargs)
1773 return self._compiled_call_impl(*args, **kwargs) # type: ignore[misc]
1774 else:
--> 1775 return self._call_impl(*args, **kwargs)
File ~/miniconda3/lib/python3.12/site-packages/torch/nn/modules/module.py:1786, in Module._call_impl(self, *args, **kwargs)
1781 # If we don't have any hooks, we want to skip the rest of the logic in
1782 # this function, and just call forward.
1783 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
1784 or _global_backward_pre_hooks or _global_backward_hooks
1785 or _global_forward_hooks or _global_forward_pre_hooks):
--> 1786 return forward_call(*args, **kwargs)
1788 result = None
1789 called_always_called_hooks = set()
File ~/miniconda3/lib/python3.12/site-packages/accelerate/hooks.py:175, in add_hook_to_module.<locals>.new_forward(module, *args, **kwargs)
173 output = module._old_forward(*args, **kwargs)
174 else:
--> 175 output = module._old_forward(*args, **kwargs)
176 return module._hf_hook.post_forward(module, output)
File ~/miniconda3/lib/python3.12/site-packages/transformers/utils/generic.py:959, in can_return_tuple.<locals>.wrapper(self, *args, **kwargs)
957 if return_dict_passed is not None:
958 return_dict = return_dict_passed
--> 959 output = func(self, *args, **kwargs)
960 if not return_dict and not isinstance(output, tuple):
961 output = output.to_tuple()
File ~/miniconda3/lib/python3.12/site-packages/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py:1493, in Qwen2_5_VLForConditionalGeneration.forward(self, input_ids, attention_mask, position_ids, past_key_values, inputs_embeds, labels, use_cache, output_attentions, output_hidden_states, pixel_values, pixel_values_videos, image_grid_thw, video_grid_thw, rope_deltas, cache_position, second_per_grid_ts, logits_to_keep, **kwargs)
1488 output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
1489 output_hidden_states = (
1490 output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
1491 )
--> 1493 outputs = self.model(
1494 input_ids=input_ids,
1495 pixel_values=pixel_values,
1496 pixel_values_videos=pixel_values_videos,
1497 image_grid_thw=image_grid_thw,
1498 video_grid_thw=video_grid_thw,
1499 second_per_grid_ts=second_per_grid_ts,
1500 position_ids=position_ids,
1501 attention_mask=attention_mask,
1502 past_key_values=past_key_values,
1503 inputs_embeds=inputs_embeds,
1504 use_cache=use_cache,
1505 output_attentions=output_attentions,
1506 output_hidden_states=output_hidden_states,
1507 return_dict=True,
1508 cache_position=cache_position,
1509 **kwargs,
1510 )
1512 hidden_states = outputs[0]
1514 # Only compute necessary logits, and do not upcast them to float if we are not computing the loss
File ~/miniconda3/lib/python3.12/site-packages/torch/nn/modules/module.py:1775, in Module._wrapped_call_impl(self, *args, **kwargs)
1773 return self._compiled_call_impl(*args, **kwargs) # type: ignore[misc]
1774 else:
--> 1775 return self._call_impl(*args, **kwargs)
File ~/miniconda3/lib/python3.12/site-packages/torch/nn/modules/module.py:1786, in Module._call_impl(self, *args, **kwargs)
1781 # If we don't have any hooks, we want to skip the rest of the logic in
1782 # this function, and just call forward.
1783 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
1784 or _global_backward_pre_hooks or _global_backward_hooks
1785 or _global_forward_hooks or _global_forward_pre_hooks):
--> 1786 return forward_call(*args, **kwargs)
1788 result = None
1789 called_always_called_hooks = set()
File ~/miniconda3/lib/python3.12/site-packages/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py:1275, in Qwen2_5_VLModel.forward(self, input_ids, attention_mask, position_ids, past_key_values, inputs_embeds, use_cache, output_attentions, output_hidden_states, return_dict, pixel_values, pixel_values_videos, image_grid_thw, video_grid_thw, rope_deltas, cache_position, second_per_grid_ts, **kwargs)
1272 inputs_embeds = self.get_input_embeddings()(input_ids)
1274 if pixel_values is not None:
--> 1275 image_embeds = self.get_image_features(pixel_values, image_grid_thw)
1276 image_embeds = torch.cat(image_embeds, dim=0).to(inputs_embeds.device, inputs_embeds.dtype)
1277 image_mask, _ = self.get_placeholder_mask(
1278 input_ids, inputs_embeds=inputs_embeds, image_features=image_embeds
1279 )
File ~/miniconda3/lib/python3.12/site-packages/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py:1188, in Qwen2_5_VLModel.get_image_features(self, pixel_values, image_grid_thw)
1178 """
1179 Encodes images into continuous embeddings that can be forwarded to the language model.
1180
(...) 1185 The temporal, height and width of feature shape of each image in LLM.
1186 """
1187 pixel_values = pixel_values.type(self.visual.dtype)
--> 1188 image_embeds = self.visual(pixel_values, grid_thw=image_grid_thw)
1189 split_sizes = (image_grid_thw.prod(-1) // self.visual.spatial_merge_size**2).tolist()
1190 image_embeds = torch.split(image_embeds, split_sizes)
File ~/miniconda3/lib/python3.12/site-packages/torch/nn/modules/module.py:1775, in Module._wrapped_call_impl(self, *args, **kwargs)
1773 return self._compiled_call_impl(*args, **kwargs) # type: ignore[misc]
1774 else:
--> 1775 return self._call_impl(*args, **kwargs)
File ~/miniconda3/lib/python3.12/site-packages/torch/nn/modules/module.py:1786, in Module._call_impl(self, *args, **kwargs)
1781 # If we don't have any hooks, we want to skip the rest of the logic in
1782 # this function, and just call forward.
1783 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
1784 or _global_backward_pre_hooks or _global_backward_hooks
1785 or _global_forward_hooks or _global_forward_pre_hooks):
--> 1786 return forward_call(*args, **kwargs)
1788 result = None
1789 called_always_called_hooks = set()
File ~/miniconda3/lib/python3.12/site-packages/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py:459, in Qwen2_5_VisionTransformerPretrainedModel.forward(self, hidden_states, grid_thw, **kwargs)
457 hidden_states = hidden_states.reshape(seq_len, -1)
458 rotary_pos_emb = rotary_pos_emb.reshape(seq_len // self.spatial_merge_unit, self.spatial_merge_unit, -1)
--> 459 rotary_pos_emb = rotary_pos_emb[window_index, :, :]
460 rotary_pos_emb = rotary_pos_emb.reshape(seq_len, -1)
461 emb = torch.cat((rotary_pos_emb, rotary_pos_emb), dim=-1)
AcceleratorError: CUDA error: device-side assert triggered
Search for `cudaErrorAssert' in https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__TYPES.html for more information. CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. For debugging consider passing CUDA_LAUNCH_BLOCKING=1. Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
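As the message suggests, CUDA kernel launches can be made synchronous so the assert is reported at the real call site; setting the variable before anything initializes CUDA is enough, for example at the top of the notebook:

import os
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"  # must be set before the first CUDA call / model load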
Below is my inference code:
import torch
from diffusers import QwenImageEditPlusPipeline

# image and prompt are defined as in the notebook cell shown in the traceback above
pipe = QwenImageEditPlusPipeline.from_pretrained("autodl-tmp/qwen-image-edit-2509", torch_dtype=torch.bfloat16, height=512, width=512, output_type='np', true_cfg_scale=4.0)
#pipe.to("cuda:0")
pipe.load_lora_weights("qwen-image-finetune/output/all/qwen_image_finetune/v1/checkpoint-last-0-120-last")
pipe.enable_model_cpu_offload()
pipe(image, prompt, negative_prompt="", num_inference_steps=15, output_type='pil', true_cfg_scale=4.0).images
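To turn the asynchronous device-side assert into a readable Python exception, I assume the same call can be reproduced on CPU (reloading the pipeline in float32 purely for this check; slow, but the error above is raised inside encode_prompt, before any denoising step):

pipe_cpu = QwenImageEditPlusPipeline.from_pretrained("autodl-tmp/qwen-image-edit-2509", torch_dtype=torch.float32)
pipe_cpu.load_lora_weights("qwen-image-finetune/output/all/qwen_image_finetune/v1/checkpoint-last-0-120-last")
# Same arguments as above; on CPU an out-of-bounds gather surfaces as an ordinary IndexError
pipe_cpu(image, prompt, negative_prompt="", num_inference_steps=15, output_type='pil', true_cfg_scale=4.0).images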