as the title, the code below:
mport code
import logging
import sys
import os
from typing import Optional
import datasets
from dataclasses import dataclass, field
import transformers
from transformers.utils.dummy_pt_objects import Trainer
logger = logging.getLogger(__name__)
@dataclass
class CustomArguments:
model_name: Optional[str] = field(default="bert-base-uncased")
train_file: Optional[str] = field(default=None)
validation_file: Optional[str] = field(default=None)
max_seq_length: Optional[int] = field(default=128)
preprocessing_num_worker: Optional[int] = field(default=os.cpu_count())
line_by_line: Optional[bool] = field(default=True)
vocab_path: Optional[str] = field(default=None)
model_conf_path: Optional[str] = field(default=None)
def main():
parser = transformers.HfArgumentParser((CustomArguments, transformers.TrainingArguments))
custom_args, training_args = parser.parse_args_into_dataclasses()
logging.basicConfig(
format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
datefmt="%m/%d/%Y %H:%M:%S",
handlers=[logging.StreamHandler(sys.stdout)],
)
log_level = training_args.get_process_log_level()
logger.setLevel(log_level)
datasets.utils.logging.set_verbosity(log_level)
transformers.utils.logging.set_verbosity(log_level)
transformers.utils.logging.enable_default_handler()
transformers.utils.logging.enable_explicit_format()
logger.warning(
f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}"
+ f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}"
)
# Set the verbosity to info of the Transformers logger (on main process only):
logger.info(f"Training/evaluation parameters {training_args}")
tokenizer = transformers.BertTokenizer(
vocab_file=custom_args.vocab_path,
do_lower_case=False,
max_length=custom_args.max_seq_length)
model_config = transformers.BertConfig.from_pretrained(
custom_args.model_conf_path)
model = transformers.BertForPreTraining(config=model_config)
model.resize_token_embeddings(len(tokenizer))
if custom_args.train_file is None:
raise ValueError("train_file must be specified!")
data_files = {"train": [custom_args.train_file]}
raw_datasets = datasets.load_dataset("text", data_files=data_files)
def tokenize_function(examples):
examples["text"] = [
line for line in examples["text"] if len(line) > 0 and not line.isspace()
]
return tokenizer(
examples["text"],
padding=False,
truncation=True,
max_length=custom_args.max_seq_length,
return_special_tokens_mask=True
)
with training_args.main_process_first(desc="dataset map tokenization"):
tokenized_datasets = raw_datasets.map(
tokenize_function,
batched=True,
num_proc=os.cpu_count(),
remove_columns=["text"],
load_from_cache_file=True,
desc="Running tokenizer on dataset line_by_line"
)
train_dataset = tokenized_datasets["train"]
data_collator = transformers.DataCollatorForLanguageModeling(
tokenizer=tokenizer,
mlm_probability=0.15,
pad_to_multiple_of=8
)
trainer = transformers.Trainer(
model=model,
args=training_args,
train_dataset=train_dataset,
eval_dataset=None,
tokenizer=tokenizer,
data_collator=data_collator
)
trainer.train()
if __name__ == "__main__":
main()
but the error occur:
KeyError: 'loss'
How to solve the problem?