Can't understand the graphs logged by `wandb`

Hello,

I have a very simple training script built with the `Trainer` class, and I am having a hard time understanding the graphs that `wandb` is tracking.

I have a couple of questions about these graphs:

  1. Why does `train/global_step` drop back to zero partway through training?
  2. Why is the `train/epoch` graph not linear? What causes this? (The sketch below shows how I expected steps and epochs to advance.)
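For reference, this is the bookkeeping I expected the logged curves to follow (a minimal sketch; `num_examples` is a made-up placeholder, not my real dataset size, and it assumes the batch size stays fixed for the whole run):

```python
# Sketch of the step/epoch accounting I expected to see in wandb.
# num_examples is a hypothetical placeholder for the dataset size.
num_examples = 100_000
per_device_batch_size = 2048
grad_accum_steps = 4
num_epochs = 5

effective_batch = per_device_batch_size * grad_accum_steps
steps_per_epoch = num_examples // effective_batch
total_steps = steps_per_epoch * num_epochs

# Expectation: train/global_step climbs monotonically from 0 to total_steps,
# and train/epoch = global_step / steps_per_epoch rises linearly to num_epochs.
for global_step in range(1, total_steps + 1):
    epoch = global_step / steps_per_epoch
    print(global_step, round(epoch, 3))
```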

Here is my training script:

```python
import os

# os.environ['CUDA_LAUNCH_BLOCKING'] = '1'
# os.environ["CUDA_VISIBLE_DEVICES"] = "0"
# import torch
# import torch.nn as nn
# import bitsandbytes as bnb
from transformers import AutoTokenizer, AutoConfig, AutoModelForCausalLM, OPTConfig
from datasets import load_dataset
import transformers


def main():
    # model_name = "facebook/opt-125m"
    # cfg = AutoConfig.from_pretrained(model_name)
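    # fresh OPT config, shrinking the context window to 512 tokens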
    cfg = OPTConfig(max_position_embeddings=512)
    # cfg = OPTConfig()
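    # from_config builds the architecture with randomly initialized weights (no pretrained checkpoint)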
    model = AutoModelForCausalLM.from_config(cfg)
    print(f'Model being used: {model.config}')
    tokenizer = AutoTokenizer.from_pretrained("facebook/opt-125m")

    ds = load_dataset("deven367/babylm-10M-switchboard")
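    # tokenize the raw text column; batched=True maps the tokenizer over batches of rows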
    data = ds.map(lambda samples: tokenizer(samples["text"]), batched=True)

    trainer = transformers.Trainer(
        model=model,
        train_dataset=data["train"],
        eval_dataset=data["valid"],
        args=transformers.TrainingArguments(
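            # start from per_device_train_batch_size=2048; with auto_find_batch_size
            # the Trainer retries with a smaller batch if it hits an out-of-memory error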
            auto_find_batch_size=True,
            per_device_train_batch_size=2048,
            per_device_eval_batch_size=2048,
            gradient_accumulation_steps=4,
            # warmup_steps=100,
            num_train_epochs=5,
            evaluation_strategy='epoch',
            learning_rate=2e-4,
            # fp16=True,
            logging_steps=1,
            output_dir="outputs",
        ),
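        # mlm=False → causal LM objective: the collator copies input_ids into labels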
        data_collator=transformers.DataCollatorForLanguageModeling(
            tokenizer, mlm=False
        ),
    )
    model.config.use_cache = False  # silence the warnings; re-enable for inference!
    trainer.train()

    # save the model
    trainer.save_model("trained-opt125M-5-epochs-switchboard")


if __name__ == "__main__":
    main()
```