Training RoBERTa from scratch on a pre-tokenized corpus: TypeError in group_texts

I'm trying to run run_mlm.py on a pre-tokenized corpus and keep hitting the same error no matter what I try. I've made the following modifications to the script to accommodate the pre-tokenized input:

--- a/examples/pytorch/language-modeling/run_mlm.py
+++ b/examples/pytorch/language-modeling/run_mlm.py
@@ -332,9 +332,9 @@ def main():
         "use_auth_token": True if model_args.use_auth_token else None,
     }
     if model_args.tokenizer_name:
-        tokenizer = AutoTokenizer.from_pretrained(model_args.tokenizer_name, **tokenizer_kwargs)
+        tokenizer = AutoTokenizer.from_pretrained(model_args.tokenizer_name, **tokenizer_kwargs, add_prefix_space=True)
     elif model_args.model_name_or_path:
-        tokenizer = AutoTokenizer.from_pretrained(model_args.model_name_or_path, **tokenizer_kwargs)
+        tokenizer = AutoTokenizer.from_pretrained(model_args.model_name_or_path, **tokenizer_kwargs, add_prefix_space=True)
     else:
         raise ValueError(
             "You are instantiating a new tokenizer from scratch. This is not supported by this script."
@@ -397,6 +397,7 @@ def main():
                 # We use this option because DataCollatorForLanguageModeling (see below) is more efficient when it
                 # receives the `special_tokens_mask`.
                 return_special_tokens_mask=True,
+                is_split_into_words=True,
             )
 
         with training_args.main_process_first(desc="dataset map tokenization"):
@@ -413,7 +414,8 @@ def main():
         # We use `return_special_tokens_mask=True` because DataCollatorForLanguageModeling (see below) is more
         # efficient when it receives the `special_tokens_mask`.
         def tokenize_function(examples):
-            return tokenizer(examples[text_column_name], return_special_tokens_mask=True)
+            return tokenizer(examples[text_column_name], return_special_tokens_mask=True,
+                             is_split_into_words=True)
 
         with training_args.main_process_first(desc="dataset map tokenization"):
             tokenized_datasets = raw_datasets.map(
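
For reference, this is what I expect the modified tokenize_function to receive and do — a minimal standalone sketch, not part of run_mlm.py, and the example words are made up:

from transformers import AutoTokenizer

# add_prefix_space=True is what the diff above passes; the fast RoBERTa (byte-level BPE)
# tokenizer requires it when the input is already split into words.
tokenizer = AutoTokenizer.from_pretrained("roberta-base", add_prefix_space=True)

# A "batch" of one pre-tokenized example: a list of lists of words.
words = [["The", "quick", "brown", "fox"]]
encoding = tokenizer(words, is_split_into_words=True, return_special_tokens_mask=True)
print(encoding["input_ids"])  # one list of token ids per example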

And this is what I get when I try to run it:

(transformer_source) Singularity> python run_mlm.py --model_type roberta --train_file /scratch/ns4008/wikipedia/short_wikipedia_20200501_en.txt --output_dir /scratch/ns4008/roberta_wiki_test --tokenizer roberta-base --do_train --max_seq_length=512
08/26/2021 12:44:01 - WARNING - __main__ - Process rank: -1, device: cpu, n_gpu: 0distributed training: False, 16-bits training: False
08/26/2021 12:44:01 - INFO - __main__ - Training/evaluation parameters TrainingArguments(
_n_gpu=0,
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloader_pin_memory=True,
ddp_find_unused_parameters=None,
debug=[],
deepspeed=None,
disable_tqdm=False,
do_eval=False,
do_predict=False,
do_train=True,
eval_accumulation_steps=None,
eval_steps=None,
evaluation_strategy=IntervalStrategy.NO,
fp16=False,
fp16_backend=auto,
fp16_full_eval=False,
fp16_opt_level=O1,
gradient_accumulation_steps=1,
greater_is_better=None,
group_by_length=False,
ignore_data_skip=False,
label_names=None,
label_smoothing_factor=0.0,
learning_rate=5e-05,
length_column_name=length,
load_best_model_at_end=False,
local_rank=-1,
log_level=-1,
log_level_replica=-1,
log_on_each_node=True,
logging_dir=/scratch/ns4008/roberta_wiki_test/runs/Aug26_12-44-01_gr038.nyu.cluster,
logging_first_step=False,
logging_steps=500,
logging_strategy=IntervalStrategy.STEPS,
lr_scheduler_type=SchedulerType.LINEAR,
max_grad_norm=1.0,
max_steps=-1,
metric_for_best_model=None,
mp_parameters=,
no_cuda=False,
num_train_epochs=3.0,
output_dir=/scratch/ns4008/roberta_wiki_test,
overwrite_output_dir=False,
past_index=-1,
per_device_eval_batch_size=8,
per_device_train_batch_size=8,
prediction_loss_only=False,
push_to_hub=False,
push_to_hub_model_id=roberta_wiki_test,
push_to_hub_organization=None,
push_to_hub_token=None,
remove_unused_columns=True,
report_to=[],
resume_from_checkpoint=None,
run_name=/scratch/ns4008/roberta_wiki_test,
save_on_each_node=False,
save_steps=500,
save_strategy=IntervalStrategy.STEPS,
save_total_limit=None,
seed=42,
sharded_ddp=[],
skip_memory_metrics=True,
tpu_metrics_debug=False,
tpu_num_cores=None,
use_legacy_prediction_loop=False,
warmup_ratio=0.0,
warmup_steps=0,
weight_decay=0.0,
)
08/26/2021 12:44:01 - WARNING - datasets.builder - Using custom data configuration default-feb453104e2d82a5
08/26/2021 12:44:01 - INFO - datasets.builder - Overwrite dataset info from restored data version.
08/26/2021 12:44:01 - INFO - datasets.info - Loading Dataset info from /home/ns4008/.cache/huggingface/datasets/text/default-feb453104e2d82a5/0.0.0/e16f44aa1b321ece1f87b07977cc5d70be93d69b20486d6dacd62e12cf25c9a5
08/26/2021 12:44:01 - WARNING - datasets.builder - Reusing dataset text (/home/ns4008/.cache/huggingface/datasets/text/default-feb453104e2d82a5/0.0.0/e16f44aa1b321ece1f87b07977cc5d70be93d69b20486d6dacd62e12cf25c9a5)
08/26/2021 12:44:01 - INFO - datasets.info - Loading Dataset info from /home/ns4008/.cache/huggingface/datasets/text/default-feb453104e2d82a5/0.0.0/e16f44aa1b321ece1f87b07977cc5d70be93d69b20486d6dacd62e12cf25c9a5
100%|████████████████████████████████████████████| 1/1 [00:00<00:00, 469.63it/s]
08/26/2021 12:44:01 - WARNING - datasets.builder - Using custom data configuration default-feb453104e2d82a5
08/26/2021 12:44:01 - INFO - datasets.builder - Overwrite dataset info from restored data version.
08/26/2021 12:44:01 - INFO - datasets.info - Loading Dataset info from /home/ns4008/.cache/huggingface/datasets/text/default-feb453104e2d82a5/0.0.0/e16f44aa1b321ece1f87b07977cc5d70be93d69b20486d6dacd62e12cf25c9a5
08/26/2021 12:44:01 - WARNING - datasets.builder - Reusing dataset text (/home/ns4008/.cache/huggingface/datasets/text/default-feb453104e2d82a5/0.0.0/e16f44aa1b321ece1f87b07977cc5d70be93d69b20486d6dacd62e12cf25c9a5)
08/26/2021 12:44:01 - INFO - datasets.info - Loading Dataset info from /home/ns4008/.cache/huggingface/datasets/text/default-feb453104e2d82a5/0.0.0/e16f44aa1b321ece1f87b07977cc5d70be93d69b20486d6dacd62e12cf25c9a5
08/26/2021 12:44:01 - WARNING - datasets.builder - Using custom data configuration default-feb453104e2d82a5
08/26/2021 12:44:01 - INFO - datasets.builder - Overwrite dataset info from restored data version.
08/26/2021 12:44:01 - INFO - datasets.info - Loading Dataset info from /home/ns4008/.cache/huggingface/datasets/text/default-feb453104e2d82a5/0.0.0/e16f44aa1b321ece1f87b07977cc5d70be93d69b20486d6dacd62e12cf25c9a5
08/26/2021 12:44:01 - WARNING - datasets.builder - Reusing dataset text (/home/ns4008/.cache/huggingface/datasets/text/default-feb453104e2d82a5/0.0.0/e16f44aa1b321ece1f87b07977cc5d70be93d69b20486d6dacd62e12cf25c9a5)
08/26/2021 12:44:01 - INFO - datasets.info - Loading Dataset info from /home/ns4008/.cache/huggingface/datasets/text/default-feb453104e2d82a5/0.0.0/e16f44aa1b321ece1f87b07977cc5d70be93d69b20486d6dacd62e12cf25c9a5
08/26/2021 12:44:01 - WARNING - __main__ - You are instantiating a new config instance from scratch.
[INFO|tokenization_auto.py:303] 2021-08-26 12:44:01,264 >> Could not locate the tokenizer configuration file, will try to use the model config instead.
[INFO|configuration_utils.py:545] 2021-08-26 12:44:01,304 >> loading configuration file https://huggingface.co/roberta-base/resolve/main/config.json from cache at /scratch/ns4008/python_cache/.cache/transformers/733bade19e5f0ce98e6531021dd5180994bb2f7b8bd7e80c7968805834ba351e.35205c6cfc956461d8515139f0f8dd5d207a2f336c0c3a83b4bc8dca3518e37b
[INFO|configuration_utils.py:581] 2021-08-26 12:44:01,305 >> Model config RobertaConfig {
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.10.0.dev0",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 50265
}

[INFO|tokenization_utils_base.py:1730] 2021-08-26 12:44:01,598 >> loading file https://huggingface.co/roberta-base/resolve/main/vocab.json from cache at /scratch/ns4008/python_cache/.cache/transformers/d3ccdbfeb9aaa747ef20432d4976c32ee3fa69663b379deb253ccfce2bb1fdc5.d67d6b367eb24ab43b08ad55e014cf254076934f71d832bbab9ad35644a375ab
[INFO|tokenization_utils_base.py:1730] 2021-08-26 12:44:01,598 >> loading file https://huggingface.co/roberta-base/resolve/main/merges.txt from cache at /scratch/ns4008/python_cache/.cache/transformers/cafdecc90fcab17011e12ac813dd574b4b3fea39da6dd817813efa010262ff3f.5d12962c5ee615a4c803841266e9c3be9a691a924f72d395d3a6c6c81157788b
[INFO|tokenization_utils_base.py:1730] 2021-08-26 12:44:01,598 >> loading file https://huggingface.co/roberta-base/resolve/main/tokenizer.json from cache at /scratch/ns4008/python_cache/.cache/transformers/d53fc0fa09b8342651efd4073d75e19617b3e51287c2a535becda5808a8db287.fc9576039592f026ad76a1c231b89aee8668488c671dfbe6616bab2ed298d730
[INFO|tokenization_utils_base.py:1730] 2021-08-26 12:44:01,598 >> loading file https://huggingface.co/roberta-base/resolve/main/added_tokens.json from cache at None
[INFO|tokenization_utils_base.py:1730] 2021-08-26 12:44:01,598 >> loading file https://huggingface.co/roberta-base/resolve/main/special_tokens_map.json from cache at None
[INFO|tokenization_utils_base.py:1730] 2021-08-26 12:44:01,598 >> loading file https://huggingface.co/roberta-base/resolve/main/tokenizer_config.json from cache at None
[INFO|configuration_utils.py:545] 2021-08-26 12:44:01,640 >> loading configuration file https://huggingface.co/roberta-base/resolve/main/config.json from cache at /scratch/ns4008/python_cache/.cache/transformers/733bade19e5f0ce98e6531021dd5180994bb2f7b8bd7e80c7968805834ba351e.35205c6cfc956461d8515139f0f8dd5d207a2f336c0c3a83b4bc8dca3518e37b
[INFO|configuration_utils.py:581] 2021-08-26 12:44:01,640 >> Model config RobertaConfig {
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.10.0.dev0",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 50265
}

08/26/2021 12:44:01 - INFO - __main__ - Training new model from scratch
hello hey
08/26/2021 12:44:05 - WARNING - datasets.arrow_dataset - Loading cached processed dataset at /home/ns4008/.cache/huggingface/datasets/text/default-feb453104e2d82a5/0.0.0/e16f44aa1b321ece1f87b07977cc5d70be93d69b20486d6dacd62e12cf25c9a5/cache-9984398c70daf9fb.arrow
08/26/2021 12:44:05 - WARNING - datasets.arrow_dataset - Loading cached processed dataset at /home/ns4008/.cache/huggingface/datasets/text/default-feb453104e2d82a5/0.0.0/e16f44aa1b321ece1f87b07977cc5d70be93d69b20486d6dacd62e12cf25c9a5/cache-4bdd77f75045dad7.arrow
Grouping texts in chunks of 512:   0%|                 | 0/9286 [00:00<?, ?ba/s]
Traceback (most recent call last):
  File "run_mlm.py", line 552, in <module>
    main()
  File "run_mlm.py", line 455, in main
    tokenized_datasets = tokenized_datasets.map(
  File "/ext3/miniconda3/envs/transformer_source/lib/python3.8/site-packages/datasets/dataset_dict.py", line 471, in map
    {
  File "/ext3/miniconda3/envs/transformer_source/lib/python3.8/site-packages/datasets/dataset_dict.py", line 472, in <dictcomp>
    k: dataset.map(
  File "/ext3/miniconda3/envs/transformer_source/lib/python3.8/site-packages/datasets/arrow_dataset.py", line 1665, in map
    return self._map_single(
  File "/ext3/miniconda3/envs/transformer_source/lib/python3.8/site-packages/datasets/arrow_dataset.py", line 185, in wrapper
    out: Union["Dataset", "DatasetDict"] = func(self, *args, **kwargs)
  File "/ext3/miniconda3/envs/transformer_source/lib/python3.8/site-packages/datasets/fingerprint.py", line 397, in wrapper
    out = func(self, *args, **kwargs)
  File "/ext3/miniconda3/envs/transformer_source/lib/python3.8/site-packages/datasets/arrow_dataset.py", line 2016, in _map_single
    batch = apply_function_on_filtered_inputs(
  File "/ext3/miniconda3/envs/transformer_source/lib/python3.8/site-packages/datasets/arrow_dataset.py", line 1906, in apply_function_on_filtered_inputs
    function(*fn_args, effective_indices, **fn_kwargs) if with_indices else function(*fn_args, **fn_kwargs)
  File "run_mlm.py", line 434, in group_texts
    concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
  File "run_mlm.py", line 434, in <dictcomp>
    concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
TypeError: can only concatenate list (not "int") to list
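
To make sure I understand what that line is complaining about: group_texts flattens each batched column with sum(examples[k], []), which only works if every value in the column is a list of token ids. A standalone illustration of the failure mode (my own snippet, not from run_mlm.py):

batch_ok = {"input_ids": [[0, 100, 2], [0, 200, 300, 2]]}  # a batch of token-id lists
flat = {k: sum(v, []) for k, v in batch_ok.items()}        # flattens fine
print(flat["input_ids"])                                   # [0, 100, 2, 0, 200, 300, 2]

batch_bad = {"input_ids": [0, 100, 2]}                      # bare ints instead of lists
flat = {k: sum(v, []) for k, v in batch_bad.items()}        # TypeError: can only concatenate list (not "int") to list

So it seems my tokenized dataset somehow contains ints where group_texts expects lists of token ids.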

Any ideas why this would happen? Am I missing something? Thanks!