# Build a from-scratch (randomly initialised) T5 encoder-decoder whose
# vocabulary size and special-token ids are derived from VOCAB_SIZE_ZH.
config = T5Config(
    # Architecture: 12 encoder layers and 12 decoder layers.
    num_layers=12,
    num_decoder_layers=12,
    # 12 heads * d_kv 64 = 768 attention inner width, while d_model is 1024.
    num_heads=12,
    d_model=1024,
    d_kv=64,
    d_ff=3072,
    # Vocabulary and special tokens.
    vocab_size=VOCAB_SIZE_ZH,
    pad_token_id=0,
    # Was misspelled ``es_token_id``; config classes accept arbitrary keyword
    # arguments, so the typo was silently kept as an unused attribute and the
    # end-of-sequence id never received this value.
    eos_token_id=VOCAB_SIZE_ZH + 2,
    decoder_start_token_id=VOCAB_SIZE_ZH + 1,
    # NOTE(review): both ids above are >= vocab_size. Unless VOCAB_SIZE_ZH
    # already reserves room for them, they fall outside the embedding table;
    # vocab_size may need to be VOCAB_SIZE_ZH + 3 -- confirm against the
    # tokenizer.
    # NOTE(review): ``model_parallel`` does not appear to be a documented
    # T5Config field; presumably it is only stored on the config object and
    # does not by itself enable model parallelism -- verify before relying on it.
    model_parallel=True,
)
model = T5ForConditionalGeneration(config)