EncoderDecoderModel outputs all pad tokens

Hey, I tried to train a roberta2roberta model like this:

from transformers import EncoderDecoderModel, RobertaTokenizer
import torch
import argparse
from ff_dataloader import CNNDataset, DataLoader, collate_fn
from torch.utils.tensorboard import SummaryWriter


writer = SummaryWriter()
parser = argparse.ArgumentParser()
args = parser.parse_args()
args.max_src_len = 512
args.max_dst_len = 128
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
model = EncoderDecoderModel.from_encoder_decoder_pretrained('roberta-base', 'roberta-base')
model = model.to(0)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)

dataset_path = 'dataset/example.json'
vocab_path = 'dataset/vocab.txt'
dataset = CNNDataset(dataset_path, vocab_path, args)
dataloader = DataLoader(dataset, batch_size=16, collate_fn=collate_fn)
cnt = 0
for epoch in range(20):

    for src, dst in dataloader:
        src = torch.stack(src).to(0)
        # attention mask: 1 for every non-pad position (pad id is assumed to be 0 here)
        mask = (src != 0).long()
        # replace pad tokens in the targets with -100 so the loss ignores them
        labels = [[-100 if token == tokenizer.pad_token_id else token for token in seq] for seq in dst]
        dst = torch.stack(dst).to(0)
        labels = torch.tensor(labels).to(0)

        outputs = model(input_ids=src, attention_mask=mask, decoder_input_ids=dst, labels=labels, return_dict=True)
        loss, logits = outputs.loss, outputs.logits

        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        if cnt % 100 == 0:
            writer.add_scalar('loss', loss.item(), cnt)
        if cnt % 1000 == 0:
            model.save_pretrained('roberta2roberta-mask')

        cnt += 1
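
For context on the labels line: -100 is the default ignore_index of PyTorch's cross-entropy loss, so the padded target positions should contribute nothing to the loss. A minimal standalone sketch of that behaviour, just for illustration:

import torch

loss_fn = torch.nn.CrossEntropyLoss()      # ignore_index defaults to -100
logits = torch.randn(4, 10)                # 4 target positions, vocabulary of 10
labels = torch.tensor([1, -100, 3, -100])  # positions labelled -100 are ignored
print(loss_fn(logits, labels))             # averaged over the two real positions only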

When I tried to test the model like this:

from transformers import EncoderDecoderModel, RobertaTokenizer
from datasets import load_from_disk
import nlp
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
model = EncoderDecoderModel.from_pretrained("roberta2roberta-mask")
model.to("cuda")
test_dataset = load_from_disk("test_dataset")
batch_size = 128
tokenizer.bos_token = tokenizer.cls_token

# SEP token will work as EOS token
tokenizer.eos_token = tokenizer.sep_token
model.config.decoder_start_token_id = tokenizer.bos_token_id
model.config.eos_token_id = tokenizer.eos_token_id
# map data correctly
def generate_summary(batch):
    # the tokenizer automatically wraps the text as <s> <text> </s>
    # truncate to the encoder's 512-token limit
    inputs = tokenizer(batch["article"], padding="max_length", truncation=True, max_length=512, return_tensors="pt")
    input_ids = inputs.input_ids.to("cuda")
    attention_mask = inputs.attention_mask.to("cuda")
    outputs = model.generate(input_ids, attention_mask=attention_mask, decoder_start_token_id=model.config.decoder_start_token_id)
    # all special tokens will be removed from the decoded output
    output_str = tokenizer.batch_decode(outputs, skip_special_tokens=True)
    batch["pred"] = output_str
    return batch
results = test_dataset.map(generate_summary, batched=True, batch_size=batch_size, remove_columns=["article"])
# load the rouge metric for evaluation
rouge = nlp.load_metric("rouge")
pred_str = results["pred"]
label_str = results["highlights"]
rouge_output = rouge.compute(predictions=pred_str, references=label_str)
print('roberta2roberta')
print(rouge_output)

It generated all pad tokens.
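
Decoding a couple of test articles without skipping special tokens shows the same thing; this is just the generate call from above printed raw:

sample = tokenizer(test_dataset[:2]["article"], padding="max_length", truncation=True, max_length=512, return_tensors="pt")
generated = model.generate(sample.input_ids.to("cuda"), attention_mask=sample.attention_mask.to("cuda"),
                           decoder_start_token_id=model.config.decoder_start_token_id)
print(tokenizer.batch_decode(generated, skip_special_tokens=False))  # sequences consist of <pad> tokens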

Hey! I am facing the same problem in a bert2bert model. Did you find the solution? :sweat_smile:
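
For reference, by bert2bert I just mean the analogous setup with the standard BERT checkpoints (my data pipeline omitted), roughly:

from transformers import BertTokenizer, EncoderDecoderModel

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = EncoderDecoderModel.from_encoder_decoder_pretrained("bert-base-uncased", "bert-base-uncased")
# trained and evaluated the same way as the roberta2roberta scripts above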