Hello, I am trying to recreate this notebook https://colab.research.google.com/github/huggingface/blog/blob/master/notebooks/01_how_to_train.ipynb for Transformer-XL.
I made changes to the tokenizer as follows:
%%time
from pathlib import Path
from tokenizers import Tokenizer
from tokenizers.models import WordLevel
from tokenizers import normalizers
from tokenizers.normalizers import Lowercase, NFD, StripAccents
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.processors import TemplateProcessing
from tokenizers.trainers import WordLevelTrainer
tokenizer = Tokenizer(WordLevel(unk_token="[UNK]"))
tokenizer.normalizer = normalizers.Sequence([NFD(), Lowercase(), StripAccents()])
tokenizer.pre_tokenizer = Whitespace()
tokenizer.post_processor = TemplateProcessing(
    single="[CLS] $A [SEP]",
    pair="[CLS] $A [SEP] $B:1 [SEP]:1",
    special_tokens=[
        ("[CLS]", 1),
        ("[SEP]", 2),
    ],
)
trainer = WordLevelTrainer(show_progress=True, special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"])
files = [str(x) for x in Path(".").glob("**/*.txt")]
tokenizer.train(files, trainer)
tokenizer.save("espertransXL.json")
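(As a quick sanity check, the saved tokenizer can be reloaded and run on a throwaway sentence; the sample text below is made up:)
from tokenizers import Tokenizer

# Reload the saved word-level tokenizer and encode a sample sentence
check = Tokenizer.from_file("espertransXL.json")
enc = check.encode("la hundo estas griza")  # hypothetical Esperanto sentence
print(enc.tokens)  # should come out as [CLS] ... [SEP] word-level tokens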
and then loaded it into PreTrainedTokenizerFast:
from transformers import PreTrainedTokenizerFast
tokenizer = PreTrainedTokenizerFast(tokenizer_file="espertransXL.json")
tokenizer.bos_token="[CLS]"
tokenizer.eos_token="[SEP]"
tokenizer.sep_token="[SEP]"
tokenizer.cls_token="[CLS]"
tokenizer.unk_token="[UNK]"
tokenizer.pad_token="[PAD]"
tokenizer.mask_token="[MASK]"
tokenizer._bos_token="[CLS]"
tokenizer._eos_token="[SEP]"
tokenizer._sep_token="[SEP]"
tokenizer._cls_token="[CLS]"
tokenizer._unk_token="[UNK]"
tokenizer._pad_token="[PAD]"
tokenizer._mask_token="[MASK]"
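(Side note: the same special tokens can also be passed straight to the PreTrainedTokenizerFast constructor instead of being assigned one by one; a sketch using the same JSON file:)
from transformers import PreTrainedTokenizerFast

tokenizer = PreTrainedTokenizerFast(
    tokenizer_file="espertransXL.json",
    bos_token="[CLS]",
    eos_token="[SEP]",
    sep_token="[SEP]",
    cls_token="[CLS]",
    unk_token="[UNK]",
    pad_token="[PAD]",
    mask_token="[MASK]",
)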
After that, I instantiated the model:
from transformers import TransfoXLConfig, TransfoXLModel
config = TransfoXLConfig()
model = TransfoXLModel(config=config)
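(Note that TransfoXLConfig() keeps its WikiText-103 defaults, including vocab_size and the adaptive-softmax cutoffs, so with a custom tokenizer those probably need to be aligned; a sketch with illustrative values:)
from transformers import TransfoXLConfig, TransfoXLModel

config = TransfoXLConfig(
    vocab_size=tokenizer.vocab_size,  # match the custom WordLevel vocabulary
    cutoffs=[1000, 5000, 10000],      # illustrative adaptive-softmax cutoffs; must stay below vocab_size
)
model = TransfoXLModel(config=config)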
Then I set up the data collator:
from transformers import DataCollatorForLanguageModeling
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=True, mlm_probability=0.15
)
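(For context on the error further down, the collator's output keys can be inspected directly; the sample sentence here is made up:)
# Rough check of what the MLM collator produces for one example
sample = tokenizer("la hundo estas griza")  # hypothetical sentence
batch = data_collator([sample])
print(batch.keys())  # expected to include input_ids, attention_mask and labels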
Next, I set up the trainer as follows:
from transformers import Trainer, TrainingArguments
training_args = TrainingArguments(
    output_dir="./TransfoXL",
    overwrite_output_dir=True,
    num_train_epochs=1,
    per_gpu_train_batch_size=16,
    save_steps=10_000,
    save_total_limit=2,
    prediction_loss_only=True,
)
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=dataset,
)
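(For completeness, dataset above was built along the lines of the original notebook; a sketch, with an illustrative file path:)
from transformers import LineByLineTextDataset

dataset = LineByLineTextDataset(
    tokenizer=tokenizer,
    file_path="./oscar.eo.txt",  # hypothetical corpus file, same text used to train the tokenizer
    block_size=128,
)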
When I execute:
%%time
trainer.train()
I get the following error:
TypeError Traceback (most recent call last)
<timed eval> in <module>
/opt/conda/envs/Python-3.7-CUDA/lib/python3.7/site-packages/transformers/trainer.py in train(self, resume_from_checkpoint, trial, **kwargs)
1270 tr_loss += self.training_step(model, inputs)
1271 else:
-> 1272 tr_loss += self.training_step(model, inputs)
1273 self.current_flos += float(self.floating_point_ops(inputs))
1274
/opt/conda/envs/Python-3.7-CUDA/lib/python3.7/site-packages/transformers/trainer.py in training_step(self, model, inputs)
1732 loss = self.compute_loss(model, inputs)
1733 else:
-> 1734 loss = self.compute_loss(model, inputs)
1735
1736 if self.args.n_gpu > 1:
/opt/conda/envs/Python-3.7-CUDA/lib/python3.7/site-packages/transformers/trainer.py in compute_loss(self, model, inputs, return_outputs)
1764 else:
1765 labels = None
-> 1766 outputs = model(**inputs)
1767 # Save past state if it exists
1768 # TODO: this needs to be fixed and made cleaner later.
/opt/conda/envs/Python-3.7-CUDA/lib/python3.7/site-packages/torch/nn/modules/module.py in _call_impl(self, *input, **kwargs)
887 result = self._slow_forward(*input, **kwargs)
888 else:
--> 889 result = self.forward(*input, **kwargs)
890 for hook in itertools.chain(
891 _global_forward_hooks.values(),
TypeError: forward() got an unexpected keyword argument 'attention_mask'
Can someone please advise on this, or point me to a working notebook example if they have one?
Thanks