I’m trying to train BERT from scratch. Here is my code:
import logging
import sys
import os
from typing import Optional
import code

import datasets
from dataclasses import dataclass, field
import transformers

logger = logging.getLogger(__name__)


@dataclass
class CustomArguments:
    train_file: Optional[str] = field(default=None)
    validation_file: Optional[str] = field(default=None)
    max_seq_length: Optional[int] = field(default=128)
    vocab_path: Optional[str] = field(default=None)
    model_conf_path: Optional[str] = field(default=None)


def main():
    parser = transformers.HfArgumentParser((CustomArguments, transformers.TrainingArguments))
    custom_args, training_args = parser.parse_args_into_dataclasses()

    # Logging setup
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        handlers=[logging.StreamHandler(sys.stdout)],
    )
    log_level = training_args.get_process_log_level()
    logger.setLevel(log_level)
    datasets.utils.logging.set_verbosity(log_level)
    transformers.utils.logging.set_verbosity(log_level)
    transformers.utils.logging.enable_default_handler()
    transformers.utils.logging.enable_explicit_format()

    logger.warning(
        f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}, "
        + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}"
    )
    # Set the verbosity to info of the Transformers logger (on main process only):
    logger.info(f"Training/evaluation parameters {training_args}")

    if custom_args.train_file is None:
        raise ValueError("train_file must be specified!")

    # Tokenizer built from a plain vocab file (training from scratch, no pretrained weights)
    tokenizer = transformers.BertTokenizerFast(
        vocab_file=custom_args.vocab_path,
        do_lower_case=False,
        max_length=128)

    model_config = transformers.BertConfig.from_pretrained(custom_args.model_conf_path)
    model = transformers.BertForPreTraining(config=model_config)
    model.resize_token_embeddings(len(tokenizer))

    # Datasets for the NSP objective
    train_dataset = transformers.TextDatasetForNextSentencePrediction(
        tokenizer=tokenizer,
        file_path=custom_args.train_file,
        block_size=128)
    eval_dataset = None
    if custom_args.validation_file is not None:
        eval_dataset = transformers.TextDatasetForNextSentencePrediction(
            tokenizer=tokenizer,
            file_path=custom_args.validation_file,
            block_size=128)

    # Collator adds the MLM masking on top of the NSP examples
    data_collator = transformers.DataCollatorForLanguageModeling(
        tokenizer=tokenizer,
        mlm=True,
        mlm_probability=0.15)

    trainer = transformers.Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset)
    trainer.train()
    if eval_dataset:
        trainer.evaluate()


if __name__ == "__main__":
    main()
but this error occurs:
File "/usr/local/anaconda3/lib/python3.7/site-packages/transformers/models/bert/modeling_bert.py", line 221, in forward
embeddings += position_embeddings
RuntimeError: The size of tensor a (2190) must match the size of tensor b (512) at non-singleton dimension 1
Then I printed the input shape and found that the sequence length is 2190!
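I checked the lengths roughly like this (assuming each example is a dict with an "input_ids" entry, which may differ slightly across transformers versions):

# Rough length check on a few examples; assumes each item is a dict
# with an "input_ids" tensor/list (may differ across versions).
for i in range(5):
    example = train_dataset[i]
    print(i, len(example["input_ids"]))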
It seems that block_size has no effect? How can I set max_seq_length when using TextDatasetForNextSentencePrediction?
By the way, is there a way to build an NSP dataset using load_dataset?
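Something like the sketch below is what I have in mind; it is untested, and the pairing logic, file name, and column names (train.txt, sentence_a, sentence_b, next_sentence_label) are just my own placeholders, not an existing datasets/transformers feature:

import random
import datasets

# Hypothetical sketch: build sentence pairs for NSP from a plain-text
# file loaded with load_dataset. The pairing logic is a placeholder,
# not a ready-made feature; tokenizer is the one defined above.
raw = datasets.load_dataset("text", data_files={"train": "train.txt"})["train"]
sentences = [s for s in raw["text"] if s.strip()]

def make_nsp_pairs(sents):
    examples = {"sentence_a": [], "sentence_b": [], "next_sentence_label": []}
    for i in range(len(sents) - 1):
        if random.random() < 0.5:
            b, label = sents[i + 1], 0          # actual next sentence
        else:
            b, label = random.choice(sents), 1  # random sentence
        examples["sentence_a"].append(sents[i])
        examples["sentence_b"].append(b)
        examples["next_sentence_label"].append(label)
    return examples

nsp_dataset = datasets.Dataset.from_dict(make_nsp_pairs(sentences))
nsp_dataset = nsp_dataset.map(
    lambda ex: tokenizer(ex["sentence_a"], ex["sentence_b"],
                         truncation=True, max_length=128),
    batched=True,
)

Would that be the right direction, or is there a built-in way I'm missing?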