Environment info
transformers-4.2.1
PyTorch
tokenizers-0.9.4
sentencepiece-0.1.95
when finetune bert my script was running for half an hour then i got this error
Traceback (most recent call last):
File “/content/run.py”, line 757, in
main()
File “/content/run.py”, line 656, in main
labels=lm_label_ids, next_sentence_label=is_next)
File “/usr/local/lib/python3.6/dist-packages/torch/nn/modules/module.py”, line 727, in _call_impl
result = self.forward(*input, **kwargs)
File “/usr/local/lib/python3.6/dist-packages/transformers/models/bert/modeling_bert.py”, line 1065, in forward
return_dict=return_dict,
File “/usr/local/lib/python3.6/dist-packages/torch/nn/modules/module.py”, line 727, in _call_impl
result = self.forward(*input, **kwargs)
File “/usr/local/lib/python3.6/dist-packages/transformers/models/bert/modeling_bert.py”, line 968, in forward
return_dict=return_dict,
File “/usr/local/lib/python3.6/dist-packages/torch/nn/modules/module.py”, line 727, in _call_impl
result = self.forward(*input, **kwargs)
File “/usr/local/lib/python3.6/dist-packages/transformers/models/bert/modeling_bert.py”, line 566, in forward
output_attentions,
File “/usr/local/lib/python3.6/dist-packages/torch/nn/modules/module.py”, line 727, in _call_impl
result = self.forward(*input, **kwargs)
File “/usr/local/lib/python3.6/dist-packages/transformers/models/bert/modeling_bert.py”, line 460, in forward
past_key_value=self_attn_past_key_value,
File “/usr/local/lib/python3.6/dist-packages/torch/nn/modules/module.py”, line 727, in _call_impl
result = self.forward(*input, **kwargs)
File “/usr/local/lib/python3.6/dist-packages/transformers/models/bert/modeling_bert.py”, line 393, in forward
output_attentions,
File “/usr/local/lib/python3.6/dist-packages/torch/nn/modules/module.py”, line 727, in _call_impl
result = self.forward(*input, **kwargs)
File “/usr/local/lib/python3.6/dist-packages/transformers/models/bert/modeling_bert.py”, line 314, in forward
attention_probs = nn.Softmax(dim=-1)(attention_scores)
File “/usr/local/lib/python3.6/dist-packages/torch/nn/modules/module.py”, line 727, in _call_impl
result = self.forward(*input, **kwargs)
File “/usr/local/lib/python3.6/dist-packages/torch/nn/modules/activation.py”, line 1198, in forward
return F.softmax(input, self.dim, _stacklevel=5)
File “/usr/local/lib/python3.6/dist-packages/torch/nn/functional.py”, line 1512, in softmax
ret = input.softmax(dim)
RuntimeError: CUDA error: device-side assert `triggered
and my code is here***********************************
from future import absolute_import
from future import division
from future import print_function
import os
import logging
import argparse
#from tqdm import tqdm
#from tqdm import trange
from tqdm import notebook , trange
import numpy as np
import torch
from torch.utils.data import DataLoader, RandomSampler , SequentialSampler
from torch.utils.data.distributed import DistributedSampler
#from pytorch_pretrained_bert.tokenization import BertTokenizer
#from pytorch_pretrained_bert.modeling import BertForPreTraining
from transformers import BertTokenizer, BertForPreTraining
#from pytorch_pretrained_bert.optimization import BertAdam
from transformers import XLNetTokenizer
from transformers import AdamW, get_linear_schedule_with_warmup
#from transformers import BertForPreTraining
import sentencepiece as spm
from torch.utils.data import Dataset
import random
logging.basicConfig(format=’%(asctime)s - %(levelname)s - %(name)s - %(message)s’,
datefmt=’%m/%d/%Y %H:%M:%S’,
level=logging.INFO)
logger = logging.getLogger(name)
def warmup_linear(x, warmup=0.002):
if x < warmup:
return x / warmup
return 1.0 - x
def accuracy(out, labels, total_test):
class_preds = out.data.cpu().numpy().argmax(axis=-1)
labels = labels.data.cpu().numpy()
return np.sum(class_preds == labels) / total_test
class BERTDataset(Dataset):
def init(self, corpus_path, tokenizer, seq_len, encoding=“utf-8”, corpus_lines=None, on_memory=True):
self.vocab = tokenizer.get_vocab()
self.tokenizer = tokenizer
self.seq_len = seq_len
self.on_memory = on_memory
self.corpus_lines = corpus_lines # number of non-empty lines in input corpus
self.corpus_path = corpus_path
self.encoding = encoding
self.current_doc = 0 # to avoid random sentence from same doc
# for loading samples directly from file
self.sample_counter = 0 # used to keep track of full epochs on file
self.line_buffer = None # keep second sentence of a pair in memory and use as first sentence in next pair
# for loading samples in memory
self.current_random_doc = 0
self.num_docs = 0
self.sample_to_doc = [] # map sample index to doc and line
# load samples into memory
if on_memory:
self.all_docs = []
doc = []
self.corpus_lines = 0
with open(corpus_path, "r", encoding=encoding) as f:
for line in notebook.tqdm(f, desc="Loading Dataset", total=corpus_lines):
line = line.strip()
if line == "":
self.all_docs.append(doc)
doc = []
# remove last added sample because there won't be a subsequent line anymore in the doc
self.sample_to_doc.pop()
else:
# store as one sample
sample = {"doc_id": len(self.all_docs),
"line": len(doc)}
self.sample_to_doc.append(sample)
doc.append(line)
self.corpus_lines = self.corpus_lines + 1
# if last row in file is not empty
if self.all_docs[-1] != doc:
self.all_docs.append(doc)
self.sample_to_doc.pop()
self.num_docs = len(self.all_docs)
# load samples later lazily from disk
else:
if self.corpus_lines is None:
with open(corpus_path, "r", encoding=encoding) as f:
self.corpus_lines = 0
for line in notebook.tqdm(f, desc="Loading Dataset", total=corpus_lines):
if line.strip() == "":
self.num_docs += 1
else:
self.corpus_lines += 1
# if doc does not end with empty line
if line.strip() != "":
self.num_docs += 1
self.file = open(corpus_path, "r", encoding=encoding)
self.random_file = open(corpus_path, "r", encoding=encoding)
def __len__(self):
# last line of doc won't be used, because there's no "nextSentence". Additionally, we start counting at 0.
return self.corpus_lines - self.num_docs - 1
def __getitem__(self, item):
cur_id = self.sample_counter
self.sample_counter += 1
if not self.on_memory:
# after one epoch we start again from beginning of file
if cur_id != 0 and (cur_id % len(self) == 0):
self.file.close()
self.file = open(self.corpus_path, "r", encoding=self.encoding)
t1, t2, is_next_label = self.random_sent(item)
# tokenize
tokens_a = self.tokenizer.tokenize(t1)
tokens_b = self.tokenizer.tokenize(t2)
# combine to one sample
cur_example = InputExample(guid=cur_id, tokens_a=tokens_a, tokens_b=tokens_b, is_next=is_next_label)
# transform sample to features
cur_features = convert_example_to_features(cur_example, self.seq_len, self.tokenizer)
cur_tensors = (torch.tensor(cur_features.input_ids),
torch.tensor(cur_features.input_mask),
torch.tensor(cur_features.segment_ids),
torch.tensor(cur_features.lm_label_ids),
torch.tensor(cur_features.is_next))
return cur_tensors
def random_sent(self, index):
"""
Get one sample from corpus consisting of two sentences. With prob. 50% these are two subsequent sentences
from one doc. With 50% the second sentence will be a random one from another doc.
:param index: int, index of sample.
:return: (str, str, int), sentence 1, sentence 2, isNextSentence Label
"""
t1, t2 = self.get_corpus_line(index)
if random.random() > 0.5:
label = 0
else:
t2 = self.get_random_line()
label = 1
assert len(t1) > 0
assert len(t2) > 0
return t1, t2, label
def get_corpus_line(self, item):
"""
Get one sample from corpus consisting of a pair of two subsequent lines from the same doc.
:param item: int, index of sample.
:return: (str, str), two subsequent sentences from corpus
"""
t1 = ""
t2 = ""
assert item < self.corpus_lines
if self.on_memory:
sample = self.sample_to_doc[item]
t1 = self.all_docs[sample["doc_id"]][sample["line"]]
t2 = self.all_docs[sample["doc_id"]][sample["line"] + 1]
# used later to avoid random nextSentence from same doc
self.current_doc = sample["doc_id"]
return t1, t2
else:
if self.line_buffer is None:
# read first non-empty line of file
while t1 == "":
t1 = self.file.__next__().strip()
t2 = self.file.__next__().strip()
else:
# use t2 from previous iteration as new t1
t1 = self.line_buffer
t2 = self.file.__next__().strip()
# skip empty rows that are used for separating documents and keep track of current doc id
while t2 == "" or t1 == "":
t1 = self.file.__next__().strip()
t2 = self.file.__next__().strip()
self.current_doc = self.current_doc + 1
self.line_buffer = t2
assert t1 != ""
assert t2 != ""
return t1, t2
def get_random_line(self):
"""
Get random line from another document for nextSentence task.
:return: str, content of one line
"""
# Similar to original tf repo: This outer loop should rarely go for more than one iteration for large
# corpora. However, just to be careful, we try to make sure that
# the random document is not the same as the document we're processing.
for _ in range(10):
if self.on_memory:
rand_doc_idx = random.randint(0, len(self.all_docs) - 1)
rand_doc = self.all_docs[rand_doc_idx]
line = rand_doc[random.randrange(len(rand_doc))]
else:
rand_index = random.randint(1, self.corpus_lines if self.corpus_lines < 1000 else 1000)
# pick random line
for _ in range(rand_index):
line = self.get_next_line()
# check if our picked random line is really from another doc like we want it to be
if self.current_random_doc != self.current_doc:
break
return line
def get_next_line(self):
""" Gets next line of random_file and starts over when reaching end of file"""
try:
line = self.random_file.__next__().strip()
# keep track of which document we are currently looking at to later avoid having the same doc as t1
if line == "":
self.current_random_doc = self.current_random_doc + 1
line = self.random_file.__next__().strip()
except StopIteration:
self.random_file.close()
self.random_file = open(self.corpus_path, "r", encoding=self.encoding)
line = self.random_file.__next__().strip()
return line
class InputExample(object):
“”“A single training/test example for the language model.”""
def __init__(self, guid, tokens_a, tokens_b=None, is_next=None, lm_labels=None):
"""Constructs a InputExample.
Args:
guid: Unique id for the example.
tokens_a: string. The untokenized text of the first sequence. For single
sequence tasks, only this sequence must be specified.
tokens_b: (Optional) string. The untokenized text of the second sequence.
Only must be specified for sequence pair tasks.
label: (Optional) string. The label of the example. This should be
specified for train and dev examples, but not for test examples.
"""
self.guid = guid
self.tokens_a = tokens_a
self.tokens_b = tokens_b
self.is_next = is_next # nextSentence
self.lm_labels = lm_labels # masked words for language model
class InputFeatures(object):
“”“A single set of features of data.”""
def __init__(self, input_ids, input_mask, segment_ids, is_next, lm_label_ids):
self.input_ids = input_ids
self.input_mask = input_mask
self.segment_ids = segment_ids
self.is_next = is_next
self.lm_label_ids = lm_label_ids
def random_word(tokens, tokenizer):
“”"
Masking some random tokens for Language Model task with probabilities as in the original BERT paper.
:param tokens: list of str, tokenized sentence.
:param tokenizer: Tokenizer, object used for tokenization (we need it’s vocab here)
:return: (list of str, list of int), masked tokens and related labels for LM prediction
“”"
output_label = []
for i, token in enumerate(tokens):
prob = random.random()
# mask token with 15% probability
if prob < 0.15:
prob /= 0.15
# 80% randomly change token to mask token
if prob < 0.8:
tokens[i] = "[MASK]"
# 10% randomly change token to random token
elif prob < 0.9:
tokens[i] = random.choice(list(tokenizer.get_vocab()))
# -> rest 10% randomly keep current token
# append current token to output (we will predict these later)
try:
output_label.append(tokenizer.convert_tokens_to_ids(token))
except KeyError:
# For unknown words (should not occur with BPE vocab)
output_label.append(tokenizer.convert_tokens_to_ids("[UNK]"))
logger.warning("Cannot find token '{}' in vocab. Using [UNK] insetad".format(token))
else:
# no masking token (will be ignored by loss function later)
output_label.append(-100)
return tokens, output_label
def convert_example_to_features(example, max_seq_length, tokenizer):
“”"
Convert a raw sample (pair of sentences as tokenized strings) into a proper training sample with
IDs, LM labels, input_mask, CLS and SEP tokens etc.
:param example: InputExample, containing sentence input as strings and is_next label
:param max_seq_length: int, maximum length of sequence.
:param tokenizer: Tokenizer
:return: InputFeatures, containing all inputs and labels of one sample as IDs (as used for model training)
“”"
tokens_a = example.tokens_a
tokens_b = example.tokens_b
# Modifies tokens_a
and tokens_b
in place so that the total
# length is less than the specified length.
# Account for [CLS], [SEP], [SEP] with “- 3”
_truncate_seq_pair(tokens_a, tokens_b, max_seq_length - 3)
t1_random, t1_label = random_word(tokens_a, tokenizer)
t2_random, t2_label = random_word(tokens_b, tokenizer)
# concatenate lm labels and account for CLS, SEP, SEP
cls_id = tokenizer.convert_tokens_to_ids(["[CLS]"])[0]
sep_id = tokenizer.convert_tokens_to_ids(["[SEP]"])[0]
pad_id = tokenizer.convert_tokens_to_ids(["[PAD]"])[0]
lm_label_ids = ([cls_id] + t1_label + [sep_id] + t2_label + [sep_id])
# The convention in BERT is:
# (a) For sequence pairs:
# tokens: [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
# type_ids: 0 0 0 0 0 0 0 0 1 1 1 1 1 1
# (b) For single sequences:
# tokens: [CLS] the dog is hairy . [SEP]
# type_ids: 0 0 0 0 0 0 0
#
# Where "type_ids" are used to indicate whether this is the first
# sequence or the second sequence. The embedding vectors for `type=0` and
# `type=1` were learned during pre-training and are added to the wordpiece
# embedding vector (and position vector). This is not *strictly* necessary
# since the [SEP] token unambigiously separates the sequences, but it makes
# it easier for the model to learn the concept of sequences.
#
# For classification tasks, the first vector (corresponding to [CLS]) is
# used as as the "sentence vector". Note that this only makes sense because
# the entire model is fine-tuned.
tokens = []
segment_ids = []
tokens.append("[CLS]")
segment_ids.append(0)
for token in tokens_a:
tokens.append(token)
segment_ids.append(0)
tokens.append("[SEP]")
segment_ids.append(0)
assert len(tokens_b) > 0
for token in tokens_b:
tokens.append(token)
segment_ids.append(1)
tokens.append("[SEP]")
segment_ids.append(1)
input_ids = tokenizer.convert_tokens_to_ids(tokens)
# The mask has 1 for real tokens and 0 for padding tokens. Only real
# tokens are attended to.
input_mask = [1] * len(input_ids)
# Zero-pad up to the sequence length.
while len(input_ids) < max_seq_length:
input_ids.append(0)
input_mask.append(0)
segment_ids.append(0)
lm_label_ids.append(-100)
assert len(input_ids) == max_seq_length
assert len(input_mask) == max_seq_length
assert len(segment_ids) == max_seq_length
assert len(lm_label_ids) == max_seq_length
if example.guid < 5:
logger.info("*** Example ***")
logger.info("guid: %s" % (example.guid))
logger.info("tokens: %s" % " ".join(
[str(x) for x in tokens]))
logger.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
logger.info("input_mask: %s" % " ".join([str(x) for x in input_mask]))
logger.info(
"segment_ids: %s" % " ".join([str(x) for x in segment_ids]))
logger.info("LM label: %s " % (lm_label_ids))
logger.info("Is next sentence label: %s " % (example.is_next))
features = InputFeatures(input_ids=input_ids,
input_mask=input_mask,
segment_ids=segment_ids,
lm_label_ids=lm_label_ids,
is_next=example.is_next)
return features
def main():
parser = argparse.ArgumentParser()
## Required parameters
parser.add_argument("--train_file",
default=None,
type=str,
required=True,
help="The input train corpus.")
parser.add_argument("--test_file",
default=None,
type=str,
required=True,
help="The input test corpus.")
parser.add_argument("--tokenizer_model", default=None, type=str, required=True,
help="tokenizer pre-trained model selected in the list: bert-base-uncased, "
"bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese.")
parser.add_argument("--bert_model", default=None, type=str, required=True,
help="Bert pre-trained model selected in the list: bert-base-uncased, "
"bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese.")
parser.add_argument("--config_file", default=None, type=str, required=True,
help="Bert pre-trained model selected in the list: bert-base-uncased, "
"bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese.")
parser.add_argument("--output_dir",
default=None,
type=str,
required=True,
help="The output directory where the model checkpoints will be written.")
## Other parameters
parser.add_argument("--max_seq_length",
default=128,
type=int,
help="The maximum total input sequence length after WordPiece tokenization. \n"
"Sequences longer than this will be truncated, and sequences shorter \n"
"than this will be padded.")
parser.add_argument("--train_batch_size",
default=32,
type=int,
help="Total batch size for training.")
parser.add_argument("--eval_batch_size",
default=32,
type=int,
help="Total batch size for eval.")
parser.add_argument("--learning_rate",
default=5e-5,
type=float,
help="The initial learning rate for Adam.")
parser.add_argument("--num_train_epochs",
default=4,
type=float,
help="Total number of training epochs to perform.")
parser.add_argument("--adam_epsilon",
default=1e-8,
type=float,
help="Proportion of training to perform linear learning rate warmup for. "
"E.g., 0.1 = 10%% of training.")
parser.add_argument("--no_cuda",
action='store_true',
help="Whether not to use CUDA when available")
parser.add_argument("--on_memory",
action='store_true',
help="Whether to load train samples into memory or use disk")
parser.add_argument("--do_lower_case",
action='store_true',
help="Whether to lower case the input text. True for uncased models, False for cased models.")
parser.add_argument("--local_rank",
type=int,
default=-1,
help="local_rank for distributed training on gpus")
parser.add_argument('--seed',
type=int,
default=42,
help="random seed for initialization")
parser.add_argument('--gradient_accumulation_steps',
type=int,
default=1,
help="Number of updates steps to accumualte before performing a backward/update pass.")
parser.add_argument('--fp16',
action='store_true',
help="Whether to use 16-bit float precision instead of 32-bit")
parser.add_argument('--loss_scale',
type=float, default=0,
help="Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
"0 (default value): dynamic loss scaling.\n"
"Positive power of 2: static loss scaling value.\n")
args = parser.parse_args()
if args.local_rank == -1 or args.no_cuda:
device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
n_gpu = torch.cuda.device_count()
else:
torch.cuda.set_device(args.local_rank)
device = torch.device("cuda", args.local_rank)
n_gpu = 1
# Initializes the distributed backend which will take care of sychronizing nodes/GPUs
torch.distributed.init_process_group(backend='nccl')
logger.info("device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".format(
device, n_gpu, bool(args.local_rank != -1), args.fp16))
if args.gradient_accumulation_steps < 1:
raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format(
args.gradient_accumulation_steps))
args.train_batch_size = int(args.train_batch_size / args.gradient_accumulation_steps)
random.seed(args.seed)
np.random.seed(args.seed)
torch.manual_seed(args.seed)
if n_gpu > 0:
torch.cuda.manual_seed_all(args.seed)
#if not args.do_train and not args.do_eval:
# raise ValueError("At least one of `do_train` or `do_eval` must be True.")
if os.path.exists(args.output_dir) and os.listdir(args.output_dir):
raise ValueError("Output directory ({}) already exists and is not empty.".format(args.output_dir))
os.makedirs(args.output_dir, exist_ok=True)
# tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case)
tokenizer = XLNetTokenizer.from_pretrained(args.tokenizer_model)
# train_examples = None
num_train_steps = None
print("Loading Train Dataset", args.train_file)
train_dataset = BERTDataset(args.train_file, tokenizer, seq_len=args.max_seq_length,
corpus_lines=None, on_memory=args.on_memory)
print("Loading eval Dataset", args.test_file)
eval_dataset = BERTDataset(args.test_file, tokenizer, seq_len=args.max_seq_length,
corpus_lines=None, on_memory=args.on_memory)
num_train_steps = int(
len(train_dataset) / args.train_batch_size / args.gradient_accumulation_steps * args.num_train_epochs)
# Prepare model
model = BertForPreTraining.from_pretrained(
args.bert_model,
config=args.config_file,
output_attentions=False, # Whether the model returns attentions weights.
output_hidden_states=False, # Whether the model returns all hidden-states.
)
# Tell pytorch to run this model on the GPU.
model.to(device)
if args.fp16:
model.half()
if args.local_rank != -1:
try:
from apex.parallel import DistributedDataParallel as DDP
except ImportError:
raise ImportError(
"Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.")
model = DDP(model)
elif n_gpu > 1:
model = torch.nn.DataParallel(model)
# Prepare optimizer
'''
param_optimizer = list(model.named_parameters())
no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
{'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
{'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
]
if args.fp16:
try:
from apex.optimizers import FP16_Optimizer
from apex.optimizers import FusedAdam
except ImportError:
raise ImportError(
"Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.")
optimizer = FusedAdam(optimizer_grouped_parameters,
lr=args.learning_rate,
bias_correction=False,
max_grad_norm=1.0)
if args.loss_scale == 0:
optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
else:
optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale)
else:
optimizer = AdamW(optimizer_grouped_parameters,
lr=args.learning_rate,
warmup=args.warmup_proportion,
t_total=num_train_steps)
'''
logger.info("***** Running training *****")
logger.info(" Num examples = %d", len(train_dataset))
logger.info(" Batch size = %d", args.train_batch_size)
logger.info(" Num steps = %d", num_train_steps)
if args.local_rank == -1:
train_sampler = SequentialSampler(train_dataset)
eval_sampler = SequentialSampler(eval_dataset)
else:
# TODO: check if this works with current data generator from disk that relies on file.__next__
# (it doesn't return item back by index)
train_sampler = DistributedSampler(train_dataset)
eval_sampler = DistributedSampler(eval_dataset)
train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size)
eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler, batch_size=args.train_batch_size)
# optimizer
t_total = len(train_dataloader) // args.train_batch_size
no_decay = ['bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
{'params': [p for n, p in model.named_parameters() if
not any(nd in n for nd in no_decay)],
'weight_decay': 0.01},
{'params': [p for n, p in model.named_parameters() if any(
nd in n for nd in no_decay)], 'weight_decay': 0.0}
]
optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
scheduler = get_linear_schedule_with_warmup(
optimizer, 0, t_total)
model.train()
tr_loss = 0
global_step = 0
acc = 0
train_loss = 0.0
nb_tr_examples, nb_tr_steps = 0, 0
for _ in trange(int(args.num_train_epochs), desc="Epoch"):
for batch in notebook.tqdm(train_dataloader, desc="Train Iteration"):
batch = tuple(t.to(device) for t in batch)
input_ids, input_mask, segment_ids, lm_label_ids, is_next = batch
outputs = model(input_ids=input_ids, attention_mask=input_mask, token_type_ids=segment_ids,
labels=lm_label_ids, next_sentence_label=is_next)
loss = outputs.loss
'''
if n_gpu > 1:
loss = loss.mean() # mean() to average on multi-gpu.
if args.gradient_accumulation_steps > 1:
loss = loss / args.gradient_accumulation_steps
if args.fp16:
optimizer.backward(outputs.loss)
else:
loss.backward()
'''
print(loss)
loss.backward()
tr_loss += loss.item()
nb_tr_examples += input_ids.size(0)
nb_tr_steps += 1
torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1)
optimizer.step()
scheduler.step()
model.zero_grad()
global_step += 1
'''
if (step + 1) % args.gradient_accumulation_steps == 0:
# modify learning rate with special warm up BERT uses
lr_this_step = args.learning_rate * warmup_linear(global_step / num_train_steps, args.warmup_proportion)
for param_group in optimizer.param_groups:
param_group['lr'] = lr_this_step
optimizer.step()
scheduler.step()
optimizer.zero_grad()
global_step += 1
'''
train_loss = tr_loss / global_step
perplexity = torch.exp(torch.tensor(train_loss)).item()
print("Training loss {} ".format("{:.3f}".format(train_loss)))
print("Training perplexity {}".format("{:.3f}".format(perplexity)))