PEGASUS breaks when using label_smoothing_factor

Hi,

I want to use the label_smoothing_factor parameter when training my PEGASUS model; however, setting this argument results in the following error. Removing the argument avoids the error.

Traceback (most recent call last):
  File "optimize_hp_pegasus.py", line 158, in <module>
    compute_objective=compute_objective
  File "/home/jovyan/.imgenv-hyperopt-pegasus-0/lib/python3.7/site-packages/transformers/trainer.py", line 1791, in hyperparameter_search
    best_run = backend_dict[backend](self, n_trials, direction, **kwargs)
  File "/home/jovyan/.imgenv-hyperopt-pegasus-0/lib/python3.7/site-packages/transformers/integrations.py", line 160, in run_hp_search_optuna
    study.optimize(_objective, n_trials=n_trials, timeout=timeout, n_jobs=n_jobs)
  File "/home/user/conda/lib/python3.7/site-packages/optuna/study/study.py", line 409, in optimize
    show_progress_bar=show_progress_bar,
  File "/home/user/conda/lib/python3.7/site-packages/optuna/study/_optimize.py", line 76, in _optimize
    progress_bar=progress_bar,
  File "/home/user/conda/lib/python3.7/site-packages/optuna/study/_optimize.py", line 163, in _optimize_sequential
    trial = _run_trial(study, func, catch)
  File "/home/user/conda/lib/python3.7/site-packages/optuna/study/_optimize.py", line 264, in _run_trial
    raise func_err
  File "/home/user/conda/lib/python3.7/site-packages/optuna/study/_optimize.py", line 213, in _run_trial
    value_or_values = func(trial)
  File "/home/jovyan/.imgenv-hyperopt-pegasus-0/lib/python3.7/site-packages/transformers/integrations.py", line 150, in _objective
    trainer.train(resume_from_checkpoint=checkpoint, trial=trial)
  File "/home/jovyan/.imgenv-hyperopt-pegasus-0/lib/python3.7/site-packages/transformers/trainer.py", line 1325, in train
    tr_loss_step = self.training_step(model, inputs)
  File "/home/jovyan/.imgenv-hyperopt-pegasus-0/lib/python3.7/site-packages/transformers/trainer.py", line 1884, in training_step
    loss = self.compute_loss(model, inputs)
  File "/home/jovyan/.imgenv-hyperopt-pegasus-0/lib/python3.7/site-packages/transformers/trainer.py", line 1916, in compute_loss
    outputs = model(**inputs)
  File "/home/jovyan/.imgenv-hyperopt-pegasus-0/lib/python3.7/site-packages/torch/nn/modules/module.py", line 727, in _call_impl
    result = self.forward(*input, **kwargs)
  File "/home/jovyan/.imgenv-hyperopt-pegasus-0/lib/python3.7/site-packages/transformers/models/pegasus/modeling_pegasus.py", line 1405, in forward
    return_dict=return_dict,
  File "/home/jovyan/.imgenv-hyperopt-pegasus-0/lib/python3.7/site-packages/torch/nn/modules/module.py", line 727, in _call_impl
    result = self.forward(*input, **kwargs)
  File "/home/jovyan/.imgenv-hyperopt-pegasus-0/lib/python3.7/site-packages/transformers/models/pegasus/modeling_pegasus.py", line 1262, in forward
    return_dict=return_dict,
  File "/home/jovyan/.imgenv-hyperopt-pegasus-0/lib/python3.7/site-packages/torch/nn/modules/module.py", line 727, in _call_impl
    result = self.forward(*input, **kwargs)
  File "/home/jovyan/.imgenv-hyperopt-pegasus-0/lib/python3.7/site-packages/transformers/models/pegasus/modeling_pegasus.py", line 1014, in forward
    raise ValueError("You have to specify either decoder_input_ids or decoder_inputs_embeds")
ValueError: You have to specify either decoder_input_ids or decoder_inputs_embeds

The code to reproduce the bug:

import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

import numpy as np
from pathlib import Path

import transformers as ts
from datasets import load_dataset, load_metric
import torch
import nltk
from shutil import rmtree

# Run everything from a dedicated working directory so that the relative
# cache paths used below resolve inside it.
workdir = Path("hyperopt")
workdir.mkdir(exist_ok=True)
os.chdir(workdir)

# Load the Gigaword dataset
# (pass download_mode="force_redownload" as well to refresh the local copy).
data = load_dataset(
    "gigaword",
    revision="master",
    cache_dir='cache/data',
)

# train_test_split is used only to draw a fixed-size, seeded random subset:
# the 'train' side of each split holds the requested number of examples.
train_split = data['train'].train_test_split(
    train_size=1_000, shuffle=True, seed=42
)
train = train_split['train']

valid_split = data['validation'].train_test_split(
    train_size=500, shuffle=True, seed=42
)
valid = valid_split['train']

# Load the tokenizer
tokenizer = ts.AutoTokenizer.from_pretrained(
    'google/pegasus-large',
    cache_dir='cache/tokenizer',
)

def tokenizing_fn(instances):
    """Tokenize a batch of examples for sequence-to-sequence training.

    The "document" field is encoded as the model input and the "summary"
    field is encoded with the tokenizer switched to target mode; the token
    ids of the summaries are stored under the "labels" key.

    Args:
        instances: batch of examples with "document" and "summary" entries.

    Returns:
        The encoded documents with an extra "labels" entry.
    """
    model_inputs = tokenizer(instances["document"], truncation=True)

    with tokenizer.as_target_tokenizer():
        target_ids = tokenizer(instances["summary"], truncation=True)["input_ids"]

    model_inputs["labels"] = target_ids
    return model_inputs

# Tokenize both subsets and drop the raw text columns afterwards.
raw_text_columns = ['document', 'summary']
tokenized_train = train.map(tokenizing_fn, batched=True).remove_columns(raw_text_columns)
tokenized_val = valid.map(tokenizing_fn, batched=True).remove_columns(raw_text_columns)

# Batches are padded with the "longest" strategy.
data_collator = ts.DataCollatorForSeq2Seq(tokenizer=tokenizer, padding="longest")

# ROUGE is the main metric; the metrics in `additional_metrics` are computed
# on top of it.
metric_cache_dir = "cache/metric"
metric = load_metric("rouge", cache_dir=metric_cache_dir)
additional_metrics = [load_metric("sacrebleu", cache_dir=metric_cache_dir)]

def compute_metrics(eval_preds):
    """Score generated summaries against the reference summaries.

    Args:
        eval_preds: pair ``(predictions, labels)`` of token-id arrays, where
            the predictions are the summaries generated by the model and the
            labels are the reference summaries (ignored positions are -100).

    Returns:
        dict with the ROUGE mid f-measures scaled to 0-100, the mean generated
        length under "gen_len", and every value returned by the metrics in
        ``additional_metrics``.
    """
    predictions, labels = eval_preds

    # Replace -100 in the labels as we can't decode them.
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)

    decoded_preds = [
        text.strip()
        for text in tokenizer.batch_decode(predictions, skip_special_tokens=True)
    ]
    decoded_labels = [
        text.strip()
        for text in tokenizer.batch_decode(labels, skip_special_tokens=True)
    ]

    # Rouge expects a newline after each sentence
    rouge_preds = ["\n".join(nltk.sent_tokenize(text)) for text in decoded_preds]
    rouge_labels = ["\n".join(nltk.sent_tokenize(text)) for text in decoded_labels]

    rouge = metric.compute(
        predictions=rouge_preds, references=rouge_labels, use_stemmer=True
    )
    # Extract a few results
    result = {key: value.mid.fmeasure * 100 for key, value in rouge.items()}

    # Add mean generated length
    prediction_lens = [
        np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions
    ]
    result["gen_len"] = np.mean(prediction_lens)

    # The additional metrics (SacreBLEU) score text, not token ids: they take
    # one decoded string per prediction and a list of reference strings per
    # prediction, so each reference summary is wrapped in its own list.
    text_references = [[text] for text in decoded_labels]
    for add_metric in additional_metrics:
        result.update(
            add_metric.compute(predictions=decoded_preds, references=text_references)
        )

    return result

def init_model():
    """Load and return a new ``google/pegasus-large`` seq2seq model.

    Every call goes through ``from_pretrained`` again, using ``cache/model``
    as the cache directory.
    """
    checkpoint = 'google/pegasus-large'
    return ts.AutoModelForSeq2SeqLM.from_pretrained(checkpoint, cache_dir='cache/model')

# With label smoothing enabled the Trainer removes "labels" from the batch
# before the forward pass, so the model can no longer derive
# decoder_input_ids from them and fails with "You have to specify either
# decoder_input_ids or decoder_inputs_embeds". A collator that knows the
# model adds decoder_input_ids to every batch itself. The collator only uses
# the model to build those ids from the labels, so a separate instance is
# enough; note that it keeps one extra copy of the model in host memory.
data_collator.model = init_model()

training_args = ts.Seq2SeqTrainingArguments(
    output_dir='pegasus_train',
    # Batch size args
    num_train_epochs=5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=75,
    # Optimizer args
    learning_rate=5e-4,
    weight_decay=0.1,
    max_grad_norm=1.,
    # Optional; relies on the collator having a model (see the note above).
    # label_smoothing_factor=0.1,
    # Scheduler args
    warmup_ratio=0.1,
    # Eval args
    metric_for_best_model='eval_rouge1',
    # load_best_model_at_end=True,
    evaluation_strategy='epoch',
    logging_strategy='epoch',
    save_strategy='epoch',
    save_total_limit=0,
    eval_accumulation_steps=1,
    # Seq2Seq args
    generation_max_length=42,
    predict_with_generate=True,
    generation_num_beams=4,
    # General args
    seed=42,
    fp16=True,
    fp16_full_eval=False,
)
# No callbacks by default; early stopping can be added with
# [ts.EarlyStoppingCallback(early_stopping_patience=3)].
callbacks = []

# model_init (rather than a model instance) lets the trainer build the model
# itself by calling init_model.
trainer = ts.Seq2SeqTrainer(
    model_init=init_model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    callbacks=callbacks,
    compute_metrics=compute_metrics,
)

trainer.train()

Library versions:
transformers==4.17.0
datasets==1.18.3, which I installed from git (it is impossible to load gigaword otherwise): pip install git+https://github.com/huggingface/datasets#egg=datasets