KeyError: 'loss' when changing the backbone in OpenDelta

I was working through the OpenDelta demo, which fine-tunes T5 on the trivia_qa dataset. When I simply swap the T5 backbone for RoBERTa, training fails with KeyError: 'loss'.
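One thing I noticed while debugging (my own check, not something from the demo): a bare RobertaModel is a headless encoder, so its forward output does not seem to contain any loss field for the Trainer to read:

from transformers import RobertaTokenizer, RobertaModel

tok = RobertaTokenizer.from_pretrained("roberta-base")
enc = RobertaModel.from_pretrained("roberta-base")
out = enc(**tok("hello world", return_tensors="pt"))
print(out.keys())  # no 'loss' here, only last_hidden_state / pooler_output

I am not sure whether this is the actual cause, though.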

My model code is as follows:

from dataclasses import dataclass, field
from typing import Optional

from transformers import (
    RobertaConfig,
    RobertaModel,
    RobertaTokenizer,
    Seq2SeqTrainingArguments,
    TrainerCallback,
)
from datasets import load_dataset
import torch
import numpy as np
import random

@dataclass
class ModelArguments:
    """
    Arguments pertaining to which model/config/tokenizer we are going to fine-tune from.
    """
    model_name_or_path: str = field(
        metadata={"help": "Path to pretrained model or model identifier from huggingface.co/models"}
    )
    config_name: Optional[str] = field(
        default=None, metadata={"help": "Pretrained config name or path if not the same as model_name"}
    )
    tokenizer_name: Optional[str] = field(
        default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"}
    )
    cache_dir: Optional[str] = field(
        default=None,
        metadata={"help": "Where to store the pretrained models downloaded from huggingface.co"},
    )
    use_fast_tokenizer: bool = field(
        default=True,
        metadata={"help": "Whether to use one of the fast tokenizers (backed by the tokenizers library) or not."},
    )
    model_revision: str = field(
        default="main",
        metadata={"help": "The specific model version to use (can be a branch name, tag name or commit id)."},
    )
    use_auth_token: bool = field(
        default=False,
        metadata={
            "help": "Will use the token generated when running `transformers-cli login` (necessary to use this script "
            "with private models)."
        },
    )

model_args = ModelArguments(model_name_or_path="roberta-base")

config = RobertaConfig.from_pretrained(
    model_args.config_name if model_args.config_name else model_args.model_name_or_path,
    cache_dir=model_args.cache_dir,
    revision=model_args.model_revision,
    use_auth_token=True if model_args.use_auth_token else None,
)
# Note: `dropout_rate` is a T5 config attribute; RoBERTa uses these names instead.
config.hidden_dropout_prob = 0.0
config.attention_probs_dropout_prob = 0.0
tokenizer = RobertaTokenizer.from_pretrained(
    model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path,
    cache_dir=model_args.cache_dir,
    use_fast=model_args.use_fast_tokenizer,
    revision=model_args.model_revision,
    use_auth_token=True if model_args.use_auth_token else None,
)
model = RobertaModel.from_pretrained(
    model_args.model_name_or_path,
    from_tf=bool(".ckpt" in model_args.model_name_or_path),
    config=config,
    cache_dir=model_args.cache_dir,
    revision=model_args.model_revision,
    use_auth_token=True if model_args.use_auth_token else None,
)
model.resize_token_embeddings(len(tokenizer))

My dataset and training code is as follows:

@dataclass
class TrainingArguments(Seq2SeqTrainingArguments):
    print_num_parameters: Optional[bool] = field(default=False, metadata={"help": "If set, print the parameters of "
                                                                                  "the model."})
    do_test: Optional[bool] = field(default=False, metadata={"help": "If set, evaluates the test performance."})
    split_validation_test: Optional[bool] = field(default=False,
                                                  metadata={"help": "If set, for the datasets which do not "
                                                                    "have a test set, we use the validation set as the "
                                                                    "test set and build a validation set by either "
                                                                    "splitting the validation set in half (for datasets "
                                                                    "smaller than 10K samples) or by using 1K examples "
                                                                    "from the training set as the validation set (for "
                                                                    "larger datasets)."})
    compute_time: Optional[bool] = field(default=False, metadata={"help": "If set, measures the time."})
    compute_memory: Optional[bool] = field(default=False, metadata={"help": "If set, measures the memory."})


training_args = TrainingArguments(output_dir="./", 
                                  do_train=True,
                                  do_eval=True,
                                  do_predict=False,
                                  evaluation_strategy="steps",
                                  eval_steps=200,
                                  save_strategy="steps",
                                  save_steps=200,
                                  greater_is_better=True,
                                  load_best_model_at_end=True,
                                  compute_memory=True,
                                  predict_with_generate=True,
                                  push_to_hub=False,
                                  learning_rate=1e-3,
                                  seed=42,
                                  per_device_eval_batch_size=32,
                                  per_device_train_batch_size=32,
                                  num_train_epochs=1,
                                  metric_for_best_model="em",
                                  warmup_steps=0,
                                  save_total_limit=1,
                                  gradient_accumulation_steps=1
                                  )

def compute_metrics(eval_preds):
    # Exact match (EM) between decoded predictions and decoded labels.
    preds, labels = eval_preds
    decoded_preds = [tokenizer.decode(i, skip_special_tokens=True, clean_up_tokenization_spaces=True) for i in preds]
    decoded_labels = [tokenizer.decode(i, skip_special_tokens=True, clean_up_tokenization_spaces=True) for i in labels]
    result_list = [int(i == j) for i, j in zip(decoded_labels, decoded_preds)]
    result = {"em": sum(result_list) / len(result_list)}
    print(result)
    return result

mydataset = load_dataset("trivia_qa", "unfiltered.nocontext")
# Keep only 500 randomly chosen validation examples to speed up evaluation.
validation_index = np.arange(len(mydataset['validation']))
np.random.shuffle(validation_index)
mydataset['validation'] = mydataset['validation'].select(validation_index[:500])
def misspelling(x):
    """Randomly corrupt a sentence with up to two edits:
    word swap, word drop, character replacement, or character swap."""
    replace_time = np.random.randint(3)  # 0, 1 or 2 corruptions
    count = 0

    while count < replace_time:
        randfloat = np.random.rand()
        if randfloat < 0.15:
            # Swap two randomly chosen words.
            x = x.split()
            switch_index = [np.random.randint(low=0, high=len(x)) for i in range(2)]
            x[switch_index[0]], x[switch_index[1]] = x[switch_index[1]], x[switch_index[0]]
            x = " ".join(x)
        elif randfloat < 0.3:
            # Drop a randomly chosen word.
            x = x.split()
            drop_index = np.random.randint(low=0, high=len(x))
            x = x[:drop_index] + x[drop_index + 1:]
            x = " ".join(x)
        elif randfloat < 0.8:
            # Replace one character with one or two random characters.
            replace_str = "".join([random.choice('abcdefghijklmnopqrstuvwxyz!@#$%^&*()') for i in range(np.random.randint(1, 3))])
            rindx = np.random.randint(low=0, high=len(x))  # use the current length, not the original one
            x = x[:rindx] + replace_str + x[rindx + 1:]
        else:
            # Swap two randomly chosen characters.
            x = list(x)
            switch_index = [np.random.randint(low=0, high=len(x)) for i in range(2)]
            x[switch_index[0]], x[switch_index[1]] = x[switch_index[1]], x[switch_index[0]]
            x = "".join(x)
        count += 1
    return x

def tokenize_function(examples):
    # Clean each question, cap it at 20 words, and make sure it ends with "?".
    input_sentences = [" ".join((i.strip("\n").strip().strip("?") + "?").split()[:20]) for i in examples["question"]]
    # Inputs are corrupted questions; labels are the clean originals.
    mis_spellings = [misspelling(x) for x in input_sentences]
    input_ids = [tokenizer.encode(i, padding="max_length", truncation=True, max_length=64) for i in mis_spellings]
    labels = [tokenizer.encode(i, padding="max_length", truncation=True, max_length=64) for i in input_sentences]
    return {"input_ids": input_ids, "labels": labels}

tokenized_datasets = mydataset.map(tokenize_function, remove_columns=['answer', 'question_source',"entity_pages",'search_results'], batched=True)
class MyCallback(TrainerCallback):
    def on_evaluate(self, args, state, control, **kwargs):
        """
        Event called after an evaluation phase.
        """
        sents = ["was Wher Newton bon?", 
                 "In year which Beiiing Olmpic ld?"
                ]
        for sent in sents:
            input_ids = tokenizer(sent, return_tensors="pt").input_ids.cuda()
            answers_ids = model.generate(input_ids=input_ids,
                                         max_length=20,
                                         num_beams=4,
                                         )
            print("{} {}".format(sent, tokenizer.decode(answers_ids[0], skip_special_tokens=True)))
        print("max allocated memory {} GB".format(torch.cuda.max_memory_allocated("cuda:0") / 1024**3))
        
from transformers import Seq2SeqTrainer
training_args.output_dir = "./SpellingCorrection" # to avoid conflict
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['validation'],
    callbacks=[MyCallback],
    compute_metrics=compute_metrics,
)

trainer.train()

Running trainer.train() then raises KeyError: 'loss'. Thanks very much!
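My current guess (unverified) is that RobertaModel has no head, so it never computes a loss for the Trainer to return, and that I need a model that both computes a seq2seq loss from `labels` and supports generate(), like T5 does. Would wrapping RoBERTa in an EncoderDecoderModel be the right direction? A sketch of what I mean (the token-id settings are my assumption of what such a model needs):

from transformers import EncoderDecoderModel

# Untested idea: tie two RoBERTas together as encoder + decoder so the
# model returns a loss when `labels` are passed and supports generate().
model = EncoderDecoderModel.from_encoder_decoder_pretrained("roberta-base", "roberta-base")
model.config.decoder_start_token_id = tokenizer.cls_token_id
model.config.pad_token_id = tokenizer.pad_token_id

Or is there a recommended way to do this within OpenDelta?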