Training Transformer XL from scratch

Hello, I am trying to recreate this notebook https://colab.research.google.com/github/huggingface/blog/blob/master/notebooks/01_how_to_train.ipynb for Transformer-XL.
I made the following changes to the tokenizer:

%%time 
from pathlib import Path
from tokenizers import Tokenizer
from tokenizers.models import WordLevel
from tokenizers import normalizers
from tokenizers.normalizers import Lowercase, NFD, StripAccents
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.processors import TemplateProcessing
from tokenizers.trainers import WordLevelTrainer


tokenizer = Tokenizer(WordLevel(unk_token="[UNK]"))
tokenizer.normalizer = normalizers.Sequence([NFD(), Lowercase(), StripAccents()])
tokenizer.pre_tokenizer = Whitespace()


tokenizer.post_processor = TemplateProcessing(
    single="[CLS] $A [SEP]",
    pair="[CLS] $A [SEP] $B:1 [SEP]:1",
    special_tokens=[
        ("[CLS]", 1),
        ("[SEP]", 2),
    ],
)

trainer = WordLevelTrainer(show_progress=True, special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"])

files = [str(x) for x in Path(".").glob("**/*.txt")]

tokenizer.train(files, trainer)

tokenizer.save("espertransXL.json")
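
As a quick sanity check (my own addition, not part of the notebook), I encoded a sample sentence with the freshly trained tokenizer:

# Quick check that the WordLevel tokenizer trained sensibly (sample sentence is my own)
output = tokenizer.encode("mi amas vin")
print(output.tokens)
print(output.ids)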

and then loaded it into PreTrainedTokenizerFast:

from transformers import PreTrainedTokenizerFast

tokenizer = PreTrainedTokenizerFast(tokenizer_file="espertransXL.json")

# tell the wrapper which of the trained tokens play which special role
tokenizer.bos_token = "[CLS]"
tokenizer.eos_token = "[SEP]"
tokenizer.sep_token = "[SEP]"
tokenizer.cls_token = "[CLS]"
tokenizer.unk_token = "[UNK]"
tokenizer.pad_token = "[PAD]"
tokenizer.mask_token = "[MASK]"
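
To confirm the wrapper picked those up (again, just my own check), I looked at the resolved ids:

# The special tokens should map to ids from the trained vocabulary
print(tokenizer.pad_token, tokenizer.pad_token_id)
print(tokenizer.mask_token, tokenizer.mask_token_id)
print(tokenizer("mi amas vin"))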

After that, I instantiated the model:

from transformers import TransfoXLConfig, TransfoXLModel

config = TransfoXLConfig()
model = TransfoXLModel(config=config)
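
For reference, I also printed the model size and vocabulary settings (this check is my own addition; note that TransfoXLConfig() keeps its default vocab_size unless you override it):

# Compare the default config's vocabulary size against the trained tokenizer's
print(model.num_parameters())
print(config.vocab_size)
print(tokenizer.vocab_size)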

Then I set up the data collator:

from transformers import DataCollatorForLanguageModeling

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=True, mlm_probability=0.15
)
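
For completeness, the dataset passed to the trainer below is built the same way as in the original notebook; something along these lines (the file path is just my own corpus, so substitute whatever text you are training on):

from transformers import LineByLineTextDataset

# Same dataset construction as in the how_to_train notebook; file_path is my own corpus
dataset = LineByLineTextDataset(
    tokenizer=tokenizer,
    file_path="./oscar.eo.txt",
    block_size=128,
)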

I set up the trainer as follows:

from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir="./TransfoXL",
    overwrite_output_dir=True,
    num_train_epochs=1,
    per_device_train_batch_size=16,
    save_steps=10_000,
    save_total_limit=2,
    prediction_loss_only=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=dataset,
)
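
Before launching training, I also peeked at what the collator actually produces for a batch (purely my own debugging step, not from the notebook):

# Inspect a collated batch; it contains attention_mask and labels alongside input_ids
batch = data_collator([tokenizer("mi amas vin")])
print(list(batch.keys()))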

When I execute:

%%time
trainer.train()

I get the following error:

TypeError                                 Traceback (most recent call last)
<timed eval> in <module>

/opt/conda/envs/Python-3.7-CUDA/lib/python3.7/site-packages/transformers/trainer.py in train(self, resume_from_checkpoint, trial, **kwargs)
   1270                         tr_loss += self.training_step(model, inputs)
   1271                 else:
-> 1272                     tr_loss += self.training_step(model, inputs)
   1273                 self.current_flos += float(self.floating_point_ops(inputs))
   1274 

/opt/conda/envs/Python-3.7-CUDA/lib/python3.7/site-packages/transformers/trainer.py in training_step(self, model, inputs)
   1732                 loss = self.compute_loss(model, inputs)
   1733         else:
-> 1734             loss = self.compute_loss(model, inputs)
   1735 
   1736         if self.args.n_gpu > 1:

/opt/conda/envs/Python-3.7-CUDA/lib/python3.7/site-packages/transformers/trainer.py in compute_loss(self, model, inputs, return_outputs)
   1764         else:
   1765             labels = None
-> 1766         outputs = model(**inputs)
   1767         # Save past state if it exists
   1768         # TODO: this needs to be fixed and made cleaner later.

/opt/conda/envs/Python-3.7-CUDA/lib/python3.7/site-packages/torch/nn/modules/module.py in _call_impl(self, *input, **kwargs)
    887             result = self._slow_forward(*input, **kwargs)
    888         else:
--> 889             result = self.forward(*input, **kwargs)
    890         for hook in itertools.chain(
    891                 _global_forward_hooks.values(),

TypeError: forward() got an unexpected keyword argument 'attention_mask'
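
To double-check, I printed the model's forward signature (my own debugging step), and it indeed has no attention_mask parameter:

import inspect
from transformers import TransfoXLModel

# TransfoXLModel.forward accepts input_ids, mems, head_mask, ... but no attention_mask
print(inspect.signature(TransfoXLModel.forward))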

Can someone please advise on this or, if they have a working notebook example, point me to it?

Thanks