Increasing loss during LM fine-tuning on a custom dataset

Hello,
During fine-tuning of camembert-base on a custom dataset with the Trainer API, the training loss quickly rises well above its value at step 0, then decreases only slowly and never gets back to its initial value. Stranger still, lowering the learning rate produces a higher peak, whereas I expected the opposite. Has anyone run into a similar issue?
For context, the dataset is quite small (35k items of 15-300 words each for training, 6k for validation).
I’ve attached a screenshot of the metrics for different learning-rate values (I stopped most of the runs early).
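
For what it’s worth, as far as I know the Trainer’s default schedule is a linear decay of the learning rate with optional warmup. Here is a minimal sketch (placeholder values, not my actual configuration) just to illustrate the schedule the learning rate goes through:

import torch
from torch.optim import AdamW
from transformers import get_linear_schedule_with_warmup

# Dummy parameter and optimizer, used only to print the schedule.
dummy_params = [torch.nn.Parameter(torch.zeros(1))]
optimizer = AdamW(dummy_params, lr=5e-5)  # placeholder peak learning rate
scheduler = get_linear_schedule_with_warmup(
    optimizer, num_warmup_steps=0, num_training_steps=10_000  # placeholder values
)
for step in range(5):
    optimizer.step()
    scheduler.step()
    print(step, scheduler.get_last_lr())  # decays linearly from lr towards 0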

Here is the code I used for training:

import json
import logging
import os

import torch
from torch.utils.data.dataset import Dataset
from transformers import (
    AutoTokenizer,
    CamembertForMaskedLM,
    DataCollatorForLanguageModeling,
    HfArgumentParser,
    PreTrainedTokenizer,
    Trainer,
    TrainingArguments,
)


logger = logging.getLogger(__name__)


class LMJsonDataset(Dataset):
    def __init__(
        self, tokenizer: PreTrainedTokenizer, file_path: str, field_name: str, block_size: int
    ):
        assert os.path.isfile(file_path), f"Input file path {file_path} not found"
        logger.info("Creating features from dataset file at %s", file_path)

        # The input file is expected to be JSON Lines: one object per line with
        # the text stored under `field_name`. Blank lines and empty texts are skipped.
        with open(file_path, encoding="utf-8") as f:
            texts = [
                t
                for t in (json.loads(line).get(field_name) for line in f if line.strip())
                if t
            ]

        batch_encoding = tokenizer(
            texts, add_special_tokens=True, truncation=True, max_length=block_size
        )
        self.examples = batch_encoding["input_ids"]

    def __len__(self):
        return len(self.examples)

    def __getitem__(self, i: int) -> torch.Tensor:
        return torch.tensor(self.examples[i], dtype=torch.long)


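# TrainingArguments (learning rate, batch size, number of epochs, etc.) are
# filled from the command line by HfArgumentParser; the extra script-specific
# flags added below end up in the `remaining_args` namespace returned alongside.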
parser = HfArgumentParser(TrainingArguments)
parser.add_argument("--train_dataset", required=True)
parser.add_argument("--eval_dataset", required=True)
parser.add_argument("--mlm_probability", type=float, default=0.15)
parser.add_argument("--base_model", default="camembert-base")
parser.add_argument("--dataset_json_field_name", default="text")
parser.add_argument("--block_size", type=int, default=128)
training_args, remaining_args = parser.parse_args_into_dataclasses()

model = CamembertForMaskedLM.from_pretrained(remaining_args.base_model)
tokenizer = AutoTokenizer.from_pretrained(remaining_args.base_model)

train_dataset = LMJsonDataset(
    tokenizer=tokenizer,
    file_path=remaining_args.train_dataset,
    field_name=remaining_args.dataset_json_field_name,
    block_size=remaining_args.block_size,
)
eval_dataset = LMJsonDataset(
    tokenizer=tokenizer,
    file_path=remaining_args.eval_dataset,
    field_name=remaining_args.dataset_json_field_name,
    block_size=remaining_args.block_size,
)

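# The collator applies dynamic masking: in every batch, roughly mlm_probability
# (0.15 by default here) of the tokens are selected for the MLM objective, using
# the usual 80% [MASK] / 10% random token / 10% unchanged replacement scheme.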
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=True, mlm_probability=remaining_args.mlm_probability
)


trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    # Note: `prediction_loss_only` is a TrainingArguments field in recent
    # transformers releases, so it is passed on the command line
    # (--prediction_loss_only) rather than to Trainer directly.
)

trainer.train()
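
For completeness, each run is launched with a command along these lines; the script name, paths, and most values below are placeholders (the learning rate is what I varied between runs):

python train_mlm.py \
    --output_dir ./camembert-finetuned \
    --train_dataset ./data/train.jsonl \
    --eval_dataset ./data/valid.jsonl \
    --do_train \
    --do_eval \
    --per_device_train_batch_size 16 \
    --num_train_epochs 3 \
    --learning_rate 5e-5 \
    --logging_steps 50 \
    --prediction_loss_only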

Thank you for your help!

I have a question: do you work with French data?