How to generate without decoding?

For a research project, I need to generate samples without decoding, i.e. without even converting the output tensors back into token ids. I need the samples in the embedding space and with gradients attached. How can I do this?

I’m using GPT2LMHeadModel with PyTorch.

Thanks in advance

Okay, I managed to put something together based on greedy_search. I’d greatly appreciate any feedback. It produces output with the correct shape, but I’m not sure whether the values are correct.

def sample_hidden(
    self,
    input_ids: torch.LongTensor,
    logits_processor=None,
    stopping_criteria=None,
    max_length: Optional[int] = None,
    pad_token_id: Optional[int] = None,
    eos_token_id: Optional[int] = None,
    synced_gpus: Optional[bool] = False,
    hidden_state_index: int = -1,
    **model_kwargs,
):
    """Greedy-decode with ``self`` and return hidden states instead of token ids.

    The loop mirrors a greedy search: at every step the last-position logits
    are passed through ``logits_processor`` and the arg-max token is appended
    to ``input_ids``. Instead of returning the ids, the function collects one
    hidden-state tensor per forward pass and concatenates them along the
    second-to-last (sequence) dimension. No ``torch.no_grad()`` is applied
    here, so the collected tensors stay attached to the autograd graph of the
    forward passes. The token choice itself goes through ``torch.argmax`` and
    therefore carries no gradient.

    Args:
        self: The model. It must be callable and provide ``config``,
            ``prepare_inputs_for_generation`` and
            ``_update_model_kwargs_for_generation``.
        input_ids: Prompt token ids; the first dimension is the batch.
        logits_processor: Callable ``(input_ids, logits) -> scores``.
            Defaults to an empty ``LogitsProcessorList``.
        stopping_criteria: Callable ``(input_ids, scores) -> bool``.
            Defaults to an empty ``StoppingCriteriaList``.
        max_length: Deprecated; folded into ``stopping_criteria``.
        pad_token_id: Id written for already-finished sequences. Defaults to
            ``self.config.pad_token_id``.
        eos_token_id: Id that marks a sequence as finished. Defaults to
            ``self.config.eos_token_id``.
        synced_gpus: Keep running forward passes until every distributed
            peer has finished.
        hidden_state_index: Which entry of the per-layer ``hidden_states``
            tuple to collect at each step (``-1`` selects the last entry).
        **model_kwargs: Forwarded to ``prepare_inputs_for_generation``.

    Returns:
        The collected hidden states concatenated with ``dim=-2``.

    Raises:
        ValueError: If an EOS id is in effect but no pad id is available.

    Note:
        The loop ends only when every sequence has emitted ``eos_token_id``
        or ``stopping_criteria`` returns true, so supply ``max_length`` or a
        stopping criterion when no EOS id is available.
    """
    # init values
    if logits_processor is None:
        logits_processor = LogitsProcessorList()
    if stopping_criteria is None:
        stopping_criteria = StoppingCriteriaList()
    if max_length is not None:
        import warnings  # local import: only needed on this deprecated path

        warnings.warn(
            "`max_length` is deprecated in this function, use `stopping_criteria=StoppingCriteriaList([MaxLengthCriteria(max_length=max_length)])` instead.",
            UserWarning,
        )
        stopping_criteria = validate_stopping_criteria(stopping_criteria, max_length)
    if pad_token_id is None:
        pad_token_id = self.config.pad_token_id
    if eos_token_id is None:
        eos_token_id = self.config.eos_token_id

    # One tensor per collected step; concatenated once at the end.
    collected_hidden_states = []

    # keep track of which sequences are already finished (1 = still running)
    unfinished_sequences = input_ids.new_ones(input_ids.shape[0])

    this_peer_finished = False  # used by synced_gpus only
    first = True
    while True:
        if synced_gpus:
            # Under synced_gpus the `forward` call must continue until all gpus complete their sequence.
            # The following logic allows an early break if all peers finished generating their sequence
            this_peer_finished_flag = torch.tensor(0.0 if this_peer_finished else 1.0).to(input_ids.device)
            # send 0.0 if we finished, 1.0 otherwise
            dist.all_reduce(this_peer_finished_flag, op=dist.ReduceOp.SUM)
            # did all peers finish? the reduced sum will be 0.0 then
            if this_peer_finished_flag.item() == 0.0:
                break

        # prepare model inputs
        model_inputs = self.prepare_inputs_for_generation(input_ids, **model_kwargs)

        # forward pass to get next token; hidden states must be requested
        # explicitly or `outputs.hidden_states` would not be populated
        outputs = self(
            **model_inputs,
            return_dict=True,
            output_hidden_states=True,
        )

        if synced_gpus and this_peer_finished:
            continue  # don't waste resources running the code we don't need

        next_token_logits = outputs.logits[:, -1, :]

        # Pick the per-layer hidden-state tuple for this architecture
        hidden_states = outputs.decoder_hidden_states if self.config.is_encoder_decoder else outputs.hidden_states
        if first:
            # The model predicts the *next* token, so the first prompt token
            # is never itself predicted; entry 0 of the tuple (presumably the
            # embedding output) stands in for it.
            # NOTE(review): this lines up for a one-token prompt. With a
            # longer prompt both entries appended on the first pass span all
            # prompt positions passed to the model -- confirm this is intended.
            collected_hidden_states.append(hidden_states[0])
            first = False
        collected_hidden_states.append(hidden_states[hidden_state_index])

        # pre-process distribution
        next_tokens_scores = logits_processor(input_ids, next_token_logits)

        # argmax (greedy choice; not differentiable)
        next_tokens = torch.argmax(next_tokens_scores, dim=-1)

        # finished sentences should have their next token be a padding token
        if eos_token_id is not None:
            if pad_token_id is None:
                raise ValueError("If `eos_token_id` is defined, make sure that `pad_token_id` is defined.")
            next_tokens = next_tokens * unfinished_sequences + pad_token_id * (1 - unfinished_sequences)

        # update generated ids and model inputs for next step
        input_ids = torch.cat([input_ids, next_tokens[:, None]], dim=-1)
        model_kwargs = self._update_model_kwargs_for_generation(
            outputs, model_kwargs, is_encoder_decoder=self.config.is_encoder_decoder
        )

        # if eos_token was found in one sentence, set sentence to finished
        if eos_token_id is not None:
            unfinished_sequences = unfinished_sequences.mul((next_tokens != eos_token_id).long())

        # stop when each sentence is finished, or if we exceed the maximum length
        if unfinished_sequences.max() == 0 or stopping_criteria(input_ids, None):
            if not synced_gpus:
                break
            # With synced GPUs keep looping until every peer reports done.
            this_peer_finished = True

    return torch.cat(collected_hidden_states, dim=-2)