So I'm having an issue and can't work out why I'm getting a certain error. In my example, I want a single call to .generate to return:
- The "normal" logits
- The expert routing logits
- The generated text
However, I can't seem to get all three at once; it looks like I have to run generation twice. Am I missing something here?
Setup code:
from transformers import AutoModelForCausalLM, AutoTokenizer, set_seed
from transformers import BitsAndBytesConfig
import torch
set_seed(1234)
local_dir = "./models/Phi-tiny-MoE-instruct"
torch.cuda.empty_cache()
model = AutoModelForCausalLM.from_pretrained(
local_dir,
torch_dtype="auto",
device_map=generate_device_map(30, (1,2)),
output_router_logits=False, # Temp set for testing
)
tokenizer = AutoTokenizer.from_pretrained(local_dir, model_max_length=4096, padding=True, truncation=True, max_length=4096)
inputs = tokenizer("Hello world!", return_tensors="pt").to(model.device)
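For reference, generate_device_map is a small helper of mine that builds a device_map placing the first 30 decoder layers on GPUs 1 and 2 and offloading the remaining layers to the CPU. A minimal sketch of such a helper (the module names and the even split across GPUs are assumptions, not my exact implementation):

def generate_device_map(n_gpu_layers, gpu_ids, n_layers=32):
    # Keep embeddings, final norm and lm_head on the GPUs
    device_map = {"model.embed_tokens": gpu_ids[0], "model.norm": gpu_ids[-1], "lm_head": gpu_ids[-1]}
    for i in range(n_layers):
        if i < n_gpu_layers:
            # Spread the first n_gpu_layers decoder layers evenly over the listed GPUs
            device_map[f"model.layers.{i}"] = gpu_ids[i * len(gpu_ids) // n_gpu_layers]
        else:
            # Offload the remaining layers to the CPU
            device_map[f"model.layers.{i}"] = "cpu"
    return device_map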
I'd like to do the following:
generation_output = model.generate(**inputs, return_dict_in_generate=True, output_logits=True, output_router_logits=True) # This should contain the router logits
generation_sequences = generation_output.sequences # This is the text, both prompt and generated
generation_logits = generation_output.logits # This is the logits for generation
router_logits = generation_output.router_logits # Logits from the routing layer that decides which experts each token is sent to
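If that call worked, I'd consume the outputs roughly like this (the router_logits layout here is my assumption, based on Mixtral-style MoE models: one tensor per decoder layer, each of shape (batch_size * sequence_length, num_experts)):

print(tokenizer.decode(generation_sequences[0], skip_special_tokens=True))  # prompt + completion as text
print(len(generation_logits), generation_logits[0].shape)  # one (batch, vocab_size) tensor per generated token
print(len(router_logits), router_logits[0].shape)  # expected: one tensor per decoder layer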
But instead, it seems I need to do this:
generation_output = model.generate(**inputs, return_dict_in_generate=True, output_logits=True)
generation_sequences = generation_output.sequences # This is the text, both prompt and generated
generation_logits = generation_output.logits # This is the logits for generation
model_output = model(input_ids=inputs['input_ids'], output_router_logits=True)
router_logits = model_output.router_logits
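For context, this is roughly how I then inspect the routing (a sketch, assuming the Mixtral-style layout of one tensor per decoder layer with shape (batch_size * sequence_length, num_experts), and that the config exposes num_experts_per_tok):

import torch.nn.functional as F

top_k = model.config.num_experts_per_tok
for layer_idx, layer_logits in enumerate(router_logits):
    probs = F.softmax(layer_logits.float(), dim=-1)  # per-token probability over experts
    chosen = probs.topk(top_k, dim=-1).indices       # experts each token is routed to
    print(layer_idx, tuple(layer_logits.shape), chosen[0].tolist())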
Otherwise, the first version errors out like so:
Loading weights: 485/? [00:01<00:00, 790.24it/s, Materializing param=lm_head.bias]
PhimoeForCausalLM LOAD REPORT from: ./models/Phi-tiny-MoE-instruct
Key | Status |
----------------------------------------------------+------------+-
model.layers.{0...31}.mlp.gate.weight | UNEXPECTED |
model.layers.{0...31}.input_layernorm.bias | UNEXPECTED |
model.layers.{0...31}.post_attention_layernorm.bias | UNEXPECTED |
model.layers.{0...31}.mlp.router.weight | MISSING |
Notes:
- UNEXPECTED :can be ignored when loading from different task/architecture; not ok if you expect identical arch.
- MISSING :those params were newly initialized because missing form the checkpoint. Consider training on your downstream task.
Some parameters are on the meta device because they were offloaded to the cpu.
---------------------------------------------------------------------------
RuntimeError Traceback (most recent call last)
Cell In[1], line 49
44 inputs = tokenizer("Hello world!", return_tensors="pt").to(model.device)
46 ##################################
47 ### This doesn't work! ###
48 ##################################
---> 49 generation_output = model.generate(**inputs, return_dict_in_generate=True, output_logits=True, output_router_logits=True) # This should contain the router logits
51 generation_sequences = generation_output.sequences # This is the text, both prompt and generated
52 generation_logits = generation_output.logits # This is the logits for generation
File ~/miniconda3/envs/<env_name>/lib/python3.12/site-packages/torch/utils/_contextlib.py:120, in context_decorator.<locals>.decorate_context(*args, **kwargs)
117 @functools.wraps(func)
118 def decorate_context(*args, **kwargs):
119 with ctx_factory():
--> 120 return func(*args, **kwargs)
File ~/miniconda3/envs/<env_name>/lib/python3.12/site-packages/transformers/generation/utils.py:2678, in GenerationMixin.generate(self, inputs, generation_config, logits_processor, stopping_criteria, prefix_allowed_tokens_fn, synced_gpus, assistant_model, streamer, negative_prompt_ids, negative_prompt_attention_mask, use_model_defaults, custom_generate, **kwargs)
2675 model_kwargs["use_cache"] = generation_config.use_cache
2677 # 9. Call generation mode
-> 2678 result = decoding_method(
2679 self,
2680 input_ids,
2681 logits_processor=prepared_logits_processor,
2682 stopping_criteria=prepared_stopping_criteria,
2683 generation_config=generation_config,
2684 **generation_mode_kwargs,
2685 **model_kwargs,
2686 )
2688 return result
File ~/miniconda3/envs/<env_name>/lib/python3.12/site-packages/transformers/generation/utils.py:2876, in GenerationMixin._sample(self, input_ids, logits_processor, stopping_criteria, generation_config, synced_gpus, streamer, **model_kwargs)
2874 if prefill_consumed:
2875 model_inputs = self.prepare_inputs_for_generation(input_ids, **model_kwargs)
-> 2876 outputs = model_forward(**model_inputs, return_dict=True)
2877 prefill_consumed = True
2878 model_kwargs = self._update_model_kwargs_for_generation(
2879 outputs,
2880 model_kwargs,
2881 is_encoder_decoder=self.config.is_encoder_decoder,
2882 )
File ~/miniconda3/envs/<env_name>/lib/python3.12/site-packages/torch/nn/modules/module.py:1773, in Module._wrapped_call_impl(self, *args, **kwargs)
1771 return self._compiled_call_impl(*args, **kwargs) # type: ignore[misc]
1772 else:
-> 1773 return self._call_impl(*args, **kwargs)
File ~/miniconda3/envs/<env_name>/lib/python3.12/site-packages/torch/nn/modules/module.py:1784, in Module._call_impl(self, *args, **kwargs)
1779 # If we don't have any hooks, we want to skip the rest of the logic in
1780 # this function, and just call forward.
1781 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
1782 or _global_backward_pre_hooks or _global_backward_hooks
1783 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1784 return forward_call(*args, **kwargs)
1786 result = None
1787 called_always_called_hooks = set()
File ~/miniconda3/envs/<env_name>/lib/python3.12/site-packages/accelerate/hooks.py:175, in add_hook_to_module.<locals>.new_forward(module, *args, **kwargs)
173 output = module._old_forward(*args, **kwargs)
174 else:
--> 175 output = module._old_forward(*args, **kwargs)
176 return module._hf_hook.post_forward(module, output)
File ~/miniconda3/envs/<env_name>/lib/python3.12/site-packages/transformers/utils/generic.py:768, in can_return_tuple.<locals>.wrapper(self, *args, **kwargs)
766 if return_dict_passed is not None:
767 return_dict = return_dict_passed
--> 768 output = func(self, *args, **kwargs)
769 if not return_dict and not isinstance(output, tuple):
770 output = output.to_tuple()
File ~/miniconda3/envs/<env_name>/lib/python3.12/site-packages/transformers/models/phimoe/modeling_phimoe.py:885, in PhimoeForCausalLM.forward(self, input_ids, attention_mask, position_ids, past_key_values, inputs_embeds, labels, use_cache, output_router_logits, cache_position, logits_to_keep, **kwargs)
883 aux_loss = None
884 if output_router_logits:
--> 885 aux_loss = load_balancing_loss_func(
886 outputs.router_logits,
887 self.num_experts,
888 self.num_experts_per_tok,
889 attention_mask,
890 )
891 if labels is not None:
892 loss += self.router_aux_loss_coef * aux_loss.to(loss.device) # make sure to reside in the same device
File ~/miniconda3/envs/<env_name>/lib/python3.12/site-packages/transformers/models/phimoe/modeling_phimoe.py:779, in load_balancing_loss_func(gate_logits, num_experts, top_k, attention_mask)
771 expert_attention_mask = (
772 attention_mask[None, :, :, None, None]
773 .expand((num_hidden_layers, batch_size, sequence_length, top_k, num_experts))
774 .reshape(-1, top_k, num_experts)
775 .to(compute_device)
776 )
778 # Compute the percentage of tokens routed to each experts
--> 779 tokens_per_expert = torch.sum(expert_mask.float() * expert_attention_mask, dim=0) / torch.sum(
780 expert_attention_mask, dim=0
781 )
783 # Compute the mask that masks all padding tokens as 0 with the same shape of tokens_per_expert
784 router_per_expert_attention_mask = (
785 attention_mask[None, :, :, None]
786 .expand((num_hidden_layers, batch_size, sequence_length, num_experts))
787 .reshape(-1, num_experts)
788 .to(compute_device)
789 )
RuntimeError: The size of tensor a (32) must match the size of tensor b (30) at non-singleton dimension 0
Why does the first call produce this error?