TextDatasetForNextSentencePrediction: can't set max_seq_length?

I’m trying to train bert from scratch, here is my code:

import logging
import sys
import os
from typing import Optional
import code

import datasets
from dataclasses import dataclass, field
import transformers

logger = logging.getLogger(__name__)

@dataclass
class CustomArguments:
  """Script-specific command-line arguments parsed by HfArgumentParser."""

  # Path to the training text file (required at runtime).
  train_file: Optional[str] = None
  # Optional path to a validation text file.
  validation_file: Optional[str] = None
  # Maximum sequence length in tokens.
  max_seq_length: Optional[int] = 128
  # Path to the WordPiece vocabulary file for the tokenizer.
  vocab_path: Optional[str] = None
  # Path to the BERT model configuration (JSON) file.
  model_conf_path: Optional[str] = None

def main():
  """Pretrain BERT from scratch with MLM + NSP objectives on plain text.

  Parses CustomArguments and transformers.TrainingArguments from the
  command line, builds a WordPiece tokenizer and a randomly initialized
  BertForPreTraining model, then trains with transformers.Trainer.

  Raises:
    ValueError: if train_file, vocab_path, or model_conf_path is missing.
  """
  parser = transformers.HfArgumentParser((CustomArguments, transformers.TrainingArguments))
  custom_args, training_args = parser.parse_args_into_dataclasses()

  logging.basicConfig(
      format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
      datefmt="%m/%d/%Y %H:%M:%S",
      handlers=[logging.StreamHandler(sys.stdout)],
  )

  # Align this logger and the library loggers with the per-process
  # verbosity chosen by TrainingArguments (quieter on non-main ranks).
  log_level = training_args.get_process_log_level()
  logger.setLevel(log_level)
  datasets.utils.logging.set_verbosity(log_level)
  transformers.utils.logging.set_verbosity(log_level)
  transformers.utils.logging.enable_default_handler()
  transformers.utils.logging.enable_explicit_format()

  # BUG FIX: the two adjacent f-strings had no separator, producing
  # "...fp16distributed training..." in the log output.
  logger.warning(
      f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}, "
      + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}"
  )
  # Set the verbosity to info of the Transformers logger (on main process only):
  logger.info(f"Training/evaluation parameters {training_args}")

  # Fail fast on all required paths instead of crashing deep inside
  # transformers with an opaque error.
  if custom_args.train_file is None:
    raise ValueError("train_file must be specified!")
  if custom_args.vocab_path is None:
    raise ValueError("vocab_path must be specified!")
  if custom_args.model_conf_path is None:
    raise ValueError("model_conf_path must be specified!")

  # BUG FIX: the original passed max_length=128, which is not a tokenizer
  # __init__ truncation argument and was silently ignored; model_max_length
  # is the attribute that caps sequence length. Also honor the user's
  # --max_seq_length instead of hard-coding 128 (it was declared in
  # CustomArguments but never used).
  max_seq_length = custom_args.max_seq_length
  tokenizer = transformers.BertTokenizerFast(
                      vocab_file=custom_args.vocab_path,
                      do_lower_case=False,
                      model_max_length=max_seq_length)

  model_config = transformers.BertConfig.from_pretrained(
                                            custom_args.model_conf_path)
  # Randomly initialized weights: training from scratch, not fine-tuning.
  model = transformers.BertForPreTraining(config=model_config)
  model.resize_token_embeddings(len(tokenizer))

  # NOTE(review): TextDatasetForNextSentencePrediction expects one
  # sentence per line with a blank line between documents, and it does
  # not truncate over-long sentences — block_size only guides pair
  # packing. Make sure the input is properly sentence-split.
  train_dataset = transformers.TextDatasetForNextSentencePrediction(
                                      tokenizer=tokenizer,
                                      file_path=custom_args.train_file,
                                      block_size=max_seq_length)

  eval_dataset = None
  if custom_args.validation_file is not None:
    eval_dataset = transformers.TextDatasetForNextSentencePrediction(
                                      tokenizer=tokenizer,
                                      file_path=custom_args.validation_file,
                                      block_size=max_seq_length)

  # Masks 15% of tokens for the MLM objective; the NSP labels are
  # supplied by the dataset itself.
  data_collator = transformers.DataCollatorForLanguageModeling(
                                      tokenizer=tokenizer,
                                      mlm=True,
                                      mlm_probability=0.15)

  trainer = transformers.Trainer(model=model,
                                 args=training_args,
                                 data_collator=data_collator,
                                 train_dataset=train_dataset,
                                 eval_dataset=eval_dataset)
  trainer.train()
  if eval_dataset:
    trainer.evaluate()

# Standard entry-point guard: run the script only when executed directly,
# not when imported as a module.
if __name__ == "__main__":
  main()

but this error occurs:

  File "/usr/local/anaconda3/lib/python3.7/site-packages/transformers/models/bert/modeling_bert.py", line 221, in forward
    embeddings += position_embeddings
RuntimeError: The size of tensor a (2190) must match the size of tensor b (512) at non-singleton dimension 1

Then I printed the input shape and found the sequence length is 2190!
It seems block_size has no effect. How can I set max_seq_length in combination with TextDatasetForNextSentencePrediction?

By the way, is there a way to build an NSP dataset in combination with load_dataset?

Is there any advice? @ lhoestq

Hi, I’ve looked into the code: TextDatasetForNextSentencePrediction in combination with a BertModel cannot handle sequences longer than 512 tokens, because TDFNSP does not truncate or pad its inputs.
The input for TDFNSP should be one sentence per line, with an additional blank line between documents. If you preprocess your data this way, I cannot imagine a single sentence being longer than 512 tokens — and if one is, it is not really a sentence :wink:

Your data should look like this:

File

This is a sentence from document 1.
This is the next sentence.

This is a sentence from document 2.
…

If you make sure that you have correct/real sentences, then you can use the code you already have.