ValueError in Seq2SeqTrainer uses the Whisper model

Code:

import torch
import datasets
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import json
import re
import evaluate
import wandb

from typing import Any, Dict, List
from collections import Counter, defaultdict
from transformers import WhisperFeatureExtractor, WhisperTokenizer, WhisperProcessor, WhisperConfig, AutoModelForSpeechSeq2Seq, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer

dataset_is = datasets.DatasetDict()

# Load the dataset
print("Loading dataset...")
dataset_is['train'] = datasets.load_dataset("google/fleurs", "is_is", split="train+test", trust_remote_code=True)
dataset_is['val'] = datasets.load_dataset("google/fleurs", "is_is", split="validation", trust_remote_code=True)

# Initialize the feature extractor
feature_extractor = WhisperFeatureExtractor.from_pretrained("openai/whisper-small")

# Define a function to process audio data
def preprocess_function(examples):
    audio = examples["audio"]
    # Convert the audio array to the expected format and sample rate for Whisper
    inputs = feature_extractor(audio["array"], sampling_rate=audio["sampling_rate"], return_tensors="np")
    return {"input_features": inputs.input_features[0]}

# Apply the preprocessing function to the dataset
print("Applying feature extraction...")
dataset_is = dataset_is.map(preprocess_function, remove_columns=["audio"], batched=False)

# Rename the 'raw_transcription' column to 'labels'
dataset_is = dataset_is.rename_column("raw_transcription", "labels")

# Initialize the tokenizer
tokenizer = WhisperTokenizer.from_pretrained("openai/whisper-small")

# Define a function to tokenize the transcriptions
def tokenize_function(examples):
    labels = examples["labels"]
    # Tokenize the labels
    tokenized_labels = tokenizer(labels, padding="longest", truncation=True, return_tensors="np")
    return {
        "input_ids": tokenized_labels["input_ids"],
        "attention_mask": tokenized_labels["attention_mask"],
        "labels": tokenized_labels["input_ids"]  # Use input_ids as labels for simplicity
    }

# Apply the tokenization function to the dataset
print("Applying tokenization...")
dataset_is = dataset_is.map(tokenize_function, remove_columns=["labels"], batched=True)

# Initialize the WhisperProcessor
processor = WhisperProcessor(feature_extractor=feature_extractor, tokenizer=tokenizer)

# Define a function to process the dataset with the processor
def process_with_processor(examples):
    labels = example['labels']
    input_features = examples["input_features"]
    labels = examples["input_ids"]
    attention_mask = examples["attention_mask"]

    return {
        "labels": input_ids,
        "input_ids": input_ids,
        "input_features": input_features,
        "attention_mask": attention_mask
    }

# Apply the processor to the dataset
print("Applying processor...")
dataset_is = dataset_is.map(process_with_processor, batched=True, batch_size=8)

# Load the vocabulary to get the vocab size
vocab_size = len(tokenizer.get_vocab())

# Initialize the WhisperConfig
config = WhisperConfig(
    vocab_size=vocab_size,
    num_mel_bins=80,
    encoder_layers=12,
    encoder_attention_heads=12,
    decoder_layers=12,
    decoder_attention_heads=12,
    decoder_ffn_dim=3072,
    encoder_ffn_dim=3072,
   .... etc
)

# Initialize the model using the WhisperConfig
model = AutoModelForSpeechSeq2Seq.from_config(config)

# Load the evaluation metric
metric = evaluate.load("wer")

# Define a function to compute metrics
def compute_metrics(pred):
    pred_ids = pred.predictions
    label_ids = pred.label_ids

    # replace -100 with the pad_token_id
    label_ids[label_ids == -100] = tokenizer.pad_token_id

    # we do not want to group tokens when computing the metrics
    pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    label_str = tokenizer.batch_decode(label_ids, skip_special_tokens=True)

    wer = 100 * metric.compute(predictions=pred_str, references=label_str)

    return {"wer": wer}

data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    model=model,
    padding=True,
    max_length=None,
    pad_to_multiple_of=None,
    return_tensors='np'
)

# Define training arguments
training_args = Seq2SeqTrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=3,
    predict_with_generate=True,
    logging_dir='./logs',
    logging_steps=10,
)

# Initialize the trainer
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=dataset_is["train"],
    eval_dataset=dataset_is["val"],
    tokenizer=processor.tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

ValueError: You should supply an encoding or a list of encodings to this method that includes input_ids, but you provided [‘input_features’, ‘attention_mask’]

I want to train using Whisper model from scratch not from finetuning, but when I go through the Seq2SeqTrainer process and want to train, I get an error. Can you help me to find the correct code and also explain why the error and how to correct it. Thank you.