RNN-T predict only blank

i am training RNN-T model put after training it only predict Blank

import torch
import torch.nn as nn
import torch.nn.functional as F
import pytorch_lightning as pl


class Encoder(pl.LightningModule):
    def __init__(self, inputs, hidden_num, layers):
        super(Encoder, self).__init__()
        self.lstm = nn.LSTM(inputs, hidden_num, layers, bidirectional=True, batch_first=True)
        self.linear = nn.Linear(hidden_num * 2, hidden_num)
        self.batch_norm = nn.BatchNorm2d(1)

    def forward(self, x):
        x = self.batch_norm(x.unsqueeze(1))
        x = x.squeeze(1)
        x, _ = self.lstm(x)
        x = self.linear(x)
        return x


class Decoder(pl.LightningModule):
    def __init__(self, hidden_num):
        super(Decoder, self).__init__()
        self.layer_norm = nn.LayerNorm(80)
        self.embedding = nn.Embedding(80, 80)
        self.lstm = nn.LSTM(80, hidden_num, 8, dropout=.1)
        self.linear = nn.Linear(hidden_num, hidden_num)
        self.embed = nn.Embedding(4048, 80)

    def forward(self, x, h=None):
        embedded = self.embedding(x)  # embedded shape: [1, batch size, emb dim]
        embedded = self.layer_norm(embedded)
        embedded = self.dropout(embedded)

        if h is None:
            output, h = self.lstm(embedded)
        else:
            output, h = self.lstm(embedded, h)

        return output, h


class Joint(pl.LightningModule):
    def __init__(self, hidden_num):
        super(Joint, self).__init__()
        self.dec = nn.Linear(hidden_num, hidden_num)
        self.enc = nn.Linear(hidden_num, hidden_num)
        self.joint = nn.Linear(hidden_num, 52)

    def forward(self, x1, x2):
        input_length = x1.size(1)
        target_length = x2.size(1)
        encoder_outputs = x1.unsqueeze(2)
        decoder_outputs = x2.unsqueeze(1)
        encoder_outputs = encoder_outputs.repeat([1, 1, target_length, 1])
        decoder_outputs = decoder_outputs.repeat([1, input_length, 1, 1])

        x = F.tanh(self.enc(encoder_outputs) + self.dec(decoder_outputs))
        return self.joint(x)


class RNNT(pl.LightningModule):
    def __init__(self, inputs, hidden_num, layers, learning_rate=1e-3):
        super(RNNT, self).__init__()
        self.encoder = Encoder(inputs, hidden_num, layers)
        self.decoder = Decoder(hidden_num)
        self.joint = Joint(hidden_num)
        self.learning_rate = learning_rate
        self.loss = asr.losses.rnnt_pytorch.RNNTLossPytorch(51, "mean")

    def forward(self, x1, x2, h=None):
        x1 = self.encoder(x1)
        x2, h = self.decoder(x2, h)
        return torch.softmax(self.joint(x1, x2), dim=-1), h

    def training_step(self, batch, batch_idx):
        x1, y, x2, _ = batch.values()
        y_hat, h = self(x1, x2)
        b, s, t, d = y_hat.shape        
        loss = self.loss(y_hat, y[:, :-1].int().cuda(), (torch.ones(b) * s).int().cuda(), (torch.ones(b) * (t - 1)).int().cuda())
        self.log("train_loss", loss, prog_bar=True, on_step=True)
        return {"loss": loss}

    def validation_step(self, batch, batch_idx):
        x1, y, x2, _ = batch.values()
        y_hat, h = self(x1, x2)
        b, s, t, d = y_hat.shape
        loss = self.loss(y_hat, y[:, :-1].int().cuda(), (torch.ones(b) * s).int().cuda(), (torch.ones(b) * (t - 1)).int().cuda())
        self.log("val_loss", loss, prog_bar=True, on_step=True)
        return {"loss": loss}

    def on_train_epoch_end(self):
        torch.save(self.state_dict(), "model.pth")
        torch.save(self, "model.pt")

    def configure_optimizers(self):
        self.optim = torch.optim.Adam(self.parameters(), lr=self.learning_rate)
        scheduler = {
            'scheduler': torch.optim.lr_scheduler.ReduceLROnPlateau(self.optim, mode='min', factor=0.1, patience=5, min_lr=1e-6),
            'monitor': 'train_loss',  # This is the metric to monitor
            'interval': 'epoch',     # How often to check the metric
            'frequency': 1           # How often to apply the scheduler
        }
        return [self.optim], [scheduler]

i am using 80 Filter bank size which give me vectors on every sample audio (s,80) but after few steps of training model make probability of blank and space very high i tried to make penalty but not work because model make always make one charterer probability very high on the other i mean if i take argmax(-1) for all samples model predict all in all samples same charchter

(upload://n46IaSoh7UnaWjarC95azRfXi7e.png)