Failing to Train Model

Hey guys, I'm really new to training models for NLP, and I created my first notebook following this guide.
The only difference is that I want to do language detection, which is why I'm using the papluca/language-identification dataset.
The code always throws an error when trying to train the model, telling me I should implement padding and truncation, which I did.
I hope someone can help me out here.

Here is the code:

!pip install transformers datasets evaluate huggingface_hub

import evaluate
import numpy as np
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import DataCollatorWithPadding
from transformers import Trainer, TrainingArguments

data = load_dataset("papluca/language-identification")

def language_filter(example):
    # keep only the four languages I care about
    wanted_languages = ['de', 'en', 'es', 'fr']
    return example['labels'] in wanted_languages

filtered_data = data.filter(language_filter)
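
To double-check the filter, I also print which labels survive (just a sanity check):

print(set(filtered_data["train"]["labels"]))  # expecting {'de', 'en', 'es', 'fr'}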

for split in filtered_data:
    print(f"There are {len(filtered_data[split])} examples in the {split} split")


filtered_data["test"][0]


# starting from the checkpoint that is already fine-tuned for language detection
tokenizer = AutoTokenizer.from_pretrained("papluca/xlm-roberta-base-language-detection")
model = AutoModelForSequenceClassification.from_pretrained("papluca/xlm-roberta-base-language-detection")

# label maps for my four target languages; so far I don't pass
# these to the model or apply them to the dataset anywhere
id2label = {0: "de", 1: "en", 2: "fr", 3: "es"}
label2id = {"de": 0, "en": 1, "fr": 2, "es": 3}


def tokenize_function(examples):
    # truncate to the model's max length; padding is left to the data
    # collator below, which pads each batch dynamically
    return tokenizer(examples['text'], truncation=True)

tokenized_data = filtered_data.map(tokenize_function, batched=True)

data_collator = DataCollatorWithPadding(tokenizer = tokenizer)
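
If I understand DataCollatorWithPadding correctly, it pads each batch to the longest sequence in that batch at training time. A quick manual check on the first two training rows (taking only input_ids and attention_mask, since the other columns aren't tensors):

features = [
    {k: tokenized_data["train"][i][k] for k in ("input_ids", "attention_mask")}
    for i in range(2)
]
batch = data_collator(features)
print(batch["input_ids"].shape)  # both rows padded to the longer of the two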



accuracy = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=-1)
    return accuracy.compute(predictions=predictions, references=labels)
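
Quick self-test of the metric function with made-up logits, just to convince myself it behaves as expected:

dummy_logits = np.array([[0.1, 0.9], [0.8, 0.2]])  # fake model outputs for 2 examples
dummy_labels = np.array([1, 0])
print(compute_metrics((dummy_logits, dummy_labels)))  # should report accuracy 1.0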



training_args = TrainingArguments(
    output_dir="my_awesome_model",
    learning_rate=2e-5,
    num_train_epochs=3,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)


trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_data["train"],
    eval_dataset=tokenized_data["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)


trainer.train()

Hi, can you share the exact error message and the full stack trace, if there is one?
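
One thing that jumps out even without the trace: the labels column of papluca/language-identification holds strings like "de", and the notebook never converts them to integer ids, so the default collator can't build a tensor from them. In my experience that often surfaces as the misleading "activate truncation and/or padding" error. A minimal sketch of what I'd try, reusing the label2id dict you already defined (untested, and assuming the column really is named labels):

def encode_labels(example):
    # map 'de' -> 0, 'en' -> 1, ... via the label2id dict from the notebook
    example['labels'] = label2id[example['labels']]
    return example

tokenized_data = tokenized_data.map(encode_labels)

You'd probably also want to pass num_labels=4 together with your id2label/label2id to from_pretrained (plus ignore_mismatched_sizes=True, since that checkpoint ships with a 20-language classification head), so the new head matches your four classes.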