Fine-tuning LM with NSP

Environment info:
- transformers 4.2.1
- PyTorch
- tokenizers 0.9.4
- sentencepiece 0.1.95

While fine-tuning BERT, my script ran for about half an hour and then failed with this error:
Traceback (most recent call last):
File "/content/run.py", line 757, in <module>
main()
File "/content/run.py", line 656, in main
labels=lm_label_ids, next_sentence_label=is_next)
File "/usr/local/lib/python3.6/dist-packages/torch/nn/modules/module.py", line 727, in _call_impl
result = self.forward(*input, **kwargs)
File "/usr/local/lib/python3.6/dist-packages/transformers/models/bert/modeling_bert.py", line 1065, in forward
return_dict=return_dict,
File "/usr/local/lib/python3.6/dist-packages/torch/nn/modules/module.py", line 727, in _call_impl
result = self.forward(*input, **kwargs)
File "/usr/local/lib/python3.6/dist-packages/transformers/models/bert/modeling_bert.py", line 968, in forward
return_dict=return_dict,
File "/usr/local/lib/python3.6/dist-packages/torch/nn/modules/module.py", line 727, in _call_impl
result = self.forward(*input, **kwargs)
File "/usr/local/lib/python3.6/dist-packages/transformers/models/bert/modeling_bert.py", line 566, in forward
output_attentions,
File "/usr/local/lib/python3.6/dist-packages/torch/nn/modules/module.py", line 727, in _call_impl
result = self.forward(*input, **kwargs)
File "/usr/local/lib/python3.6/dist-packages/transformers/models/bert/modeling_bert.py", line 460, in forward
past_key_value=self_attn_past_key_value,
File "/usr/local/lib/python3.6/dist-packages/torch/nn/modules/module.py", line 727, in _call_impl
result = self.forward(*input, **kwargs)
File "/usr/local/lib/python3.6/dist-packages/transformers/models/bert/modeling_bert.py", line 393, in forward
output_attentions,
File "/usr/local/lib/python3.6/dist-packages/torch/nn/modules/module.py", line 727, in _call_impl
result = self.forward(*input, **kwargs)
File "/usr/local/lib/python3.6/dist-packages/transformers/models/bert/modeling_bert.py", line 314, in forward
attention_probs = nn.Softmax(dim=-1)(attention_scores)
File "/usr/local/lib/python3.6/dist-packages/torch/nn/modules/module.py", line 727, in _call_impl
result = self.forward(*input, **kwargs)
File "/usr/local/lib/python3.6/dist-packages/torch/nn/modules/activation.py", line 1198, in forward
return F.softmax(input, self.dim, _stacklevel=5)
File "/usr/local/lib/python3.6/dist-packages/torch/nn/functional.py", line 1512, in softmax
ret = input.softmax(dim)
RuntimeError: CUDA error: device-side assert triggered
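
(For reference: the device-side assert is reported asynchronously, so the softmax frame above is not necessarily where it actually fired. A generic way to get a traceback that points at the failing operation is to force synchronous CUDA launches via the standard CUDA_LAUNCH_BLOCKING environment variable; the snippet below is a debugging sketch, not part of the script itself.)

import os
# Force synchronous CUDA kernel launches so the Python traceback points at the
# operation that actually raised the device-side assert. This must run before
# the first CUDA call, e.g. at the very top of run.py.
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"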

And here is my code:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import os
import logging
import argparse
#from tqdm import tqdm
#from tqdm import trange
from tqdm import notebook, trange

import numpy as np
import torch
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
from torch.utils.data.distributed import DistributedSampler

#from pytorch_pretrained_bert.tokenization import BertTokenizer
#from pytorch_pretrained_bert.modeling import BertForPreTraining
from transformers import BertTokenizer, BertForPreTraining
#from pytorch_pretrained_bert.optimization import BertAdam
from transformers import XLNetTokenizer
from transformers import AdamW, get_linear_schedule_with_warmup
#from transformers import BertForPreTraining
import sentencepiece as spm

from torch.utils.data import Dataset
import random

logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
                    datefmt='%m/%d/%Y %H:%M:%S',
                    level=logging.INFO)
logger = logging.getLogger(__name__)

def warmup_linear(x, warmup=0.002):
    if x < warmup:
        return x / warmup
    return 1.0 - x

def accuracy(out, labels, total_test):
    class_preds = out.data.cpu().numpy().argmax(axis=-1)
    labels = labels.data.cpu().numpy()
    return np.sum(class_preds == labels) / total_test

class BERTDataset(Dataset):
    def __init__(self, corpus_path, tokenizer, seq_len, encoding="utf-8", corpus_lines=None, on_memory=True):
        self.vocab = tokenizer.get_vocab()
        self.tokenizer = tokenizer
        self.seq_len = seq_len
        self.on_memory = on_memory
        self.corpus_lines = corpus_lines  # number of non-empty lines in input corpus
        self.corpus_path = corpus_path
        self.encoding = encoding
        self.current_doc = 0  # to avoid random sentence from same doc

        # for loading samples directly from file
        self.sample_counter = 0  # used to keep track of full epochs on file
        self.line_buffer = None  # keep second sentence of a pair in memory and use as first sentence in next pair

        # for loading samples in memory
        self.current_random_doc = 0
        self.num_docs = 0
        self.sample_to_doc = []  # map sample index to doc and line

        # load samples into memory
        if on_memory:
            self.all_docs = []
            doc = []
            self.corpus_lines = 0
            with open(corpus_path, "r", encoding=encoding) as f:
                for line in notebook.tqdm(f, desc="Loading Dataset", total=corpus_lines):
                    line = line.strip()
                    if line == "":
                        self.all_docs.append(doc)
                        doc = []
                        # remove last added sample because there won't be a subsequent line anymore in the doc
                        self.sample_to_doc.pop()
                    else:
                        # store as one sample
                        sample = {"doc_id": len(self.all_docs),
                                  "line": len(doc)}
                        self.sample_to_doc.append(sample)
                        doc.append(line)
                        self.corpus_lines = self.corpus_lines + 1

            # if last row in file is not empty
            if self.all_docs[-1] != doc:
                self.all_docs.append(doc)
                self.sample_to_doc.pop()

            self.num_docs = len(self.all_docs)

        # load samples later lazily from disk
        else:
            if self.corpus_lines is None:
                with open(corpus_path, "r", encoding=encoding) as f:
                    self.corpus_lines = 0
                    for line in notebook.tqdm(f, desc="Loading Dataset", total=corpus_lines):
                        if line.strip() == "":
                            self.num_docs += 1
                        else:
                            self.corpus_lines += 1

                    # if doc does not end with empty line
                    if line.strip() != "":
                        self.num_docs += 1

            self.file = open(corpus_path, "r", encoding=encoding)
            self.random_file = open(corpus_path, "r", encoding=encoding)

    def __len__(self):
        # last line of doc won't be used, because there's no "nextSentence". Additionally, we start counting at 0.
        return self.corpus_lines - self.num_docs - 1

    def __getitem__(self, item):
        cur_id = self.sample_counter
        self.sample_counter += 1
        if not self.on_memory:
            # after one epoch we start again from beginning of file
            if cur_id != 0 and (cur_id % len(self) == 0):
                self.file.close()
                self.file = open(self.corpus_path, "r", encoding=self.encoding)

        t1, t2, is_next_label = self.random_sent(item)

        # tokenize
        tokens_a = self.tokenizer.tokenize(t1)
        tokens_b = self.tokenizer.tokenize(t2)

        # combine to one sample
        cur_example = InputExample(guid=cur_id, tokens_a=tokens_a, tokens_b=tokens_b, is_next=is_next_label)

        # transform sample to features
        cur_features = convert_example_to_features(cur_example, self.seq_len, self.tokenizer)

        cur_tensors = (torch.tensor(cur_features.input_ids),
                       torch.tensor(cur_features.input_mask),
                       torch.tensor(cur_features.segment_ids),
                       torch.tensor(cur_features.lm_label_ids),
                       torch.tensor(cur_features.is_next))

        return cur_tensors

    def random_sent(self, index):
        """
        Get one sample from corpus consisting of two sentences. With prob. 50% these are two subsequent sentences
        from one doc. With 50% the second sentence will be a random one from another doc.
        :param index: int, index of sample.
        :return: (str, str, int), sentence 1, sentence 2, isNextSentence Label
        """
        t1, t2 = self.get_corpus_line(index)
        if random.random() > 0.5:
            label = 0
        else:
            t2 = self.get_random_line()
            label = 1

        assert len(t1) > 0
        assert len(t2) > 0
        return t1, t2, label

    def get_corpus_line(self, item):
        """
        Get one sample from corpus consisting of a pair of two subsequent lines from the same doc.
        :param item: int, index of sample.
        :return: (str, str), two subsequent sentences from corpus
        """
        t1 = ""
        t2 = ""
        assert item < self.corpus_lines
        if self.on_memory:
            sample = self.sample_to_doc[item]
            t1 = self.all_docs[sample["doc_id"]][sample["line"]]
            t2 = self.all_docs[sample["doc_id"]][sample["line"] + 1]
            # used later to avoid random nextSentence from same doc
            self.current_doc = sample["doc_id"]
            return t1, t2
        else:
            if self.line_buffer is None:
                # read first non-empty line of file
                while t1 == "":
                    t1 = self.file.__next__().strip()
                    t2 = self.file.__next__().strip()
            else:
                # use t2 from previous iteration as new t1
                t1 = self.line_buffer
                t2 = self.file.__next__().strip()
                # skip empty rows that are used for separating documents and keep track of current doc id
                while t2 == "" or t1 == "":
                    t1 = self.file.__next__().strip()
                    t2 = self.file.__next__().strip()
                    self.current_doc = self.current_doc + 1
            self.line_buffer = t2

        assert t1 != ""
        assert t2 != ""
        return t1, t2

    def get_random_line(self):
        """
        Get random line from another document for nextSentence task.
        :return: str, content of one line
        """
        # Similar to original tf repo: This outer loop should rarely go for more than one iteration for large
        # corpora. However, just to be careful, we try to make sure that
        # the random document is not the same as the document we're processing.
        for _ in range(10):
            if self.on_memory:
                rand_doc_idx = random.randint(0, len(self.all_docs) - 1)
                rand_doc = self.all_docs[rand_doc_idx]
                line = rand_doc[random.randrange(len(rand_doc))]
            else:
                rand_index = random.randint(1, self.corpus_lines if self.corpus_lines < 1000 else 1000)
                # pick random line
                for _ in range(rand_index):
                    line = self.get_next_line()
            # check if our picked random line is really from another doc like we want it to be
            if self.current_random_doc != self.current_doc:
                break
        return line

    def get_next_line(self):
        """ Gets next line of random_file and starts over when reaching end of file"""
        try:
            line = self.random_file.__next__().strip()
            # keep track of which document we are currently looking at to later avoid having the same doc as t1
            if line == "":
                self.current_random_doc = self.current_random_doc + 1
                line = self.random_file.__next__().strip()
        except StopIteration:
            self.random_file.close()
            self.random_file = open(self.corpus_path, "r", encoding=self.encoding)
            line = self.random_file.__next__().strip()
        return line

class InputExample(object):
    """A single training/test example for the language model."""

    def __init__(self, guid, tokens_a, tokens_b=None, is_next=None, lm_labels=None):
        """Constructs an InputExample.
        Args:
            guid: Unique id for the example.
            tokens_a: string. The untokenized text of the first sequence. For single
            sequence tasks, only this sequence must be specified.
            tokens_b: (Optional) string. The untokenized text of the second sequence.
            Only must be specified for sequence pair tasks.
            label: (Optional) string. The label of the example. This should be
            specified for train and dev examples, but not for test examples.
        """
        self.guid = guid
        self.tokens_a = tokens_a
        self.tokens_b = tokens_b
        self.is_next = is_next  # nextSentence
        self.lm_labels = lm_labels  # masked words for language model

class InputFeatures(object):
    """A single set of features of data."""

    def __init__(self, input_ids, input_mask, segment_ids, is_next, lm_label_ids):
        self.input_ids = input_ids
        self.input_mask = input_mask
        self.segment_ids = segment_ids
        self.is_next = is_next
        self.lm_label_ids = lm_label_ids

def random_word(tokens, tokenizer):
    """
    Masking some random tokens for the Language Model task with probabilities as in the original BERT paper.
    :param tokens: list of str, tokenized sentence.
    :param tokenizer: Tokenizer, object used for tokenization (we need its vocab here)
    :return: (list of str, list of int), masked tokens and related labels for LM prediction
    """
    output_label = []

    for i, token in enumerate(tokens):
        prob = random.random()
        # mask token with 15% probability
        if prob < 0.15:
            prob /= 0.15

            # 80% randomly change token to mask token
            if prob < 0.8:
                tokens[i] = "[MASK]"

            # 10% randomly change token to random token
            elif prob < 0.9:
                tokens[i] = random.choice(list(tokenizer.get_vocab()))

            # -> rest 10% randomly keep current token

            # append current token to output (we will predict these later)
            try:
                output_label.append(tokenizer.convert_tokens_to_ids(token))
            except KeyError:
                # For unknown words (should not occur with BPE vocab)
                output_label.append(tokenizer.convert_tokens_to_ids("[UNK]"))
                logger.warning("Cannot find token '{}' in vocab. Using [UNK] instead".format(token))
        else:
            # no masking token (will be ignored by loss function later)
            output_label.append(-100)

    return tokens, output_label
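
# Note: _truncate_seq_pair is called by convert_example_to_features below but was
# missing from the paste; this is the standard helper from the original BERT
# pre-training code, added so the listing is self-contained.
def _truncate_seq_pair(tokens_a, tokens_b, max_length):
    """Truncates a sequence pair in place so the total length is at most max_length."""
    while True:
        total_length = len(tokens_a) + len(tokens_b)
        if total_length <= max_length:
            break
        # Trim the longer of the two sequences one token at a time.
        if len(tokens_a) > len(tokens_b):
            tokens_a.pop()
        else:
            tokens_b.pop()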

def convert_example_to_features(example, max_seq_length, tokenizer):
    """
    Convert a raw sample (pair of sentences as tokenized strings) into a proper training sample with
    IDs, LM labels, input_mask, CLS and SEP tokens etc.
    :param example: InputExample, containing sentence input as strings and is_next label
    :param max_seq_length: int, maximum length of sequence.
    :param tokenizer: Tokenizer
    :return: InputFeatures, containing all inputs and labels of one sample as IDs (as used for model training)
    """
    tokens_a = example.tokens_a
    tokens_b = example.tokens_b
    # Modifies tokens_a and tokens_b in place so that the total
    # length is less than the specified length.
    # Account for [CLS], [SEP], [SEP] with "- 3"
    _truncate_seq_pair(tokens_a, tokens_b, max_seq_length - 3)

    t1_random, t1_label = random_word(tokens_a, tokenizer)
    t2_random, t2_label = random_word(tokens_b, tokenizer)
    # concatenate lm labels and account for CLS, SEP, SEP
    cls_id = tokenizer.convert_tokens_to_ids(["[CLS]"])[0]
    sep_id = tokenizer.convert_tokens_to_ids(["[SEP]"])[0]
    pad_id = tokenizer.convert_tokens_to_ids(["[PAD]"])[0]
    lm_label_ids = ([cls_id] + t1_label + [sep_id] + t2_label + [sep_id])

    # The convention in BERT is:
    # (a) For sequence pairs:
    #  tokens:   [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
    #  type_ids: 0   0  0    0    0     0       0 0    1  1  1  1   1 1
    # (b) For single sequences:
    #  tokens:   [CLS] the dog is hairy . [SEP]
    #  type_ids: 0   0   0   0  0     0 0
    #
    # Where "type_ids" are used to indicate whether this is the first
    # sequence or the second sequence. The embedding vectors for `type=0` and
    # `type=1` were learned during pre-training and are added to the wordpiece
    # embedding vector (and position vector). This is not *strictly* necessary
    # since the [SEP] token unambiguously separates the sequences, but it makes
    # it easier for the model to learn the concept of sequences.
    #
    # For classification tasks, the first vector (corresponding to [CLS]) is
    # used as the "sentence vector". Note that this only makes sense because
    # the entire model is fine-tuned.
    tokens = []
    segment_ids = []
    tokens.append("[CLS]")
    segment_ids.append(0)
    for token in tokens_a:
        tokens.append(token)
        segment_ids.append(0)
    tokens.append("[SEP]")
    segment_ids.append(0)

    assert len(tokens_b) > 0
    for token in tokens_b:
        tokens.append(token)
        segment_ids.append(1)
    tokens.append("[SEP]")
    segment_ids.append(1)

    input_ids = tokenizer.convert_tokens_to_ids(tokens)

    # The mask has 1 for real tokens and 0 for padding tokens. Only real
    # tokens are attended to.
    input_mask = [1] * len(input_ids)

    # Zero-pad up to the sequence length.
    while len(input_ids) < max_seq_length:
        input_ids.append(0)
        input_mask.append(0)
        segment_ids.append(0)
        lm_label_ids.append(-100)

    assert len(input_ids) == max_seq_length
    assert len(input_mask) == max_seq_length
    assert len(segment_ids) == max_seq_length
    assert len(lm_label_ids) == max_seq_length

    if example.guid < 5:
        logger.info("*** Example ***")
        logger.info("guid: %s" % (example.guid))
        logger.info("tokens: %s" % " ".join(
            [str(x) for x in tokens]))
        logger.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
        logger.info("input_mask: %s" % " ".join([str(x) for x in input_mask]))
        logger.info(
            "segment_ids: %s" % " ".join([str(x) for x in segment_ids]))
        logger.info("LM label: %s " % (lm_label_ids))
        logger.info("Is next sentence label: %s " % (example.is_next))

    features = InputFeatures(input_ids=input_ids,
                             input_mask=input_mask,
                             segment_ids=segment_ids,
                             lm_label_ids=lm_label_ids,
                             is_next=example.is_next)
    return features

def main():
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument("--train_file",
                        default=None,
                        type=str,
                        required=True,
                        help="The input train corpus.")
    parser.add_argument("--test_file",
                        default=None,
                        type=str,
                        required=True,
                        help="The input test corpus.")

    parser.add_argument("--tokenizer_model", default=None, type=str, required=True,
                        help="Pre-trained tokenizer model or path to load.")

    parser.add_argument("--bert_model", default=None, type=str, required=True,
                        help="Bert pre-trained model selected in the list: bert-base-uncased, "
                             "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese.")

    parser.add_argument("--config_file", default=None, type=str, required=True,
                        help="Path to the pre-trained model's config file.")

    parser.add_argument("--output_dir",
                        default=None,
                        type=str,
                        required=True,
                        help="The output directory where the model checkpoints will be written.")
    ## Other parameters
    parser.add_argument("--max_seq_length",
                        default=128,
                        type=int,
                        help="The maximum total input sequence length after WordPiece tokenization. \n"
                             "Sequences longer than this will be truncated, and sequences shorter \n"
                             "than this will be padded.")

    parser.add_argument("--train_batch_size",
                        default=32,
                        type=int,
                        help="Total batch size for training.")

    parser.add_argument("--eval_batch_size",
                        default=32,
                        type=int,
                        help="Total batch size for eval.")
    parser.add_argument("--learning_rate",
                        default=5e-5,
                        type=float,
                        help="The initial learning rate for Adam.")

    parser.add_argument("--num_train_epochs",
                        default=4,
                        type=float,
                        help="Total number of training epochs to perform.")

    parser.add_argument("--adam_epsilon",
                        default=1e-8,
                        type=float,
                        help="Epsilon for the Adam optimizer.")

    parser.add_argument("--no_cuda",
                        action='store_true',
                        help="Whether not to use CUDA when available")

    parser.add_argument("--on_memory",
                        action='store_true',
                        help="Whether to load train samples into memory or use disk")

    parser.add_argument("--do_lower_case",
                        action='store_true',
                        help="Whether to lower case the input text. True for uncased models, False for cased models.")

    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="local_rank for distributed training on gpus")

    parser.add_argument('--seed',
                        type=int,
                        default=42,
                        help="random seed for initialization")

    parser.add_argument('--gradient_accumulation_steps',
                        type=int,
                        default=1,
                        help="Number of update steps to accumulate before performing a backward/update pass.")

    parser.add_argument('--fp16',
                        action='store_true',
                        help="Whether to use 16-bit float precision instead of 32-bit")

    parser.add_argument('--loss_scale',
                        type=float, default=0,
                        help="Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
                             "0 (default value): dynamic loss scaling.\n"
                             "Positive power of 2: static loss scaling value.\n")

    args = parser.parse_args()

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')
    logger.info("device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".format(
        device, n_gpu, bool(args.local_rank != -1), args.fp16))

    if args.gradient_accumulation_steps < 1:
        raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format(
            args.gradient_accumulation_steps))

    args.train_batch_size = int(args.train_batch_size / args.gradient_accumulation_steps)

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    #if not args.do_train and not args.do_eval:
    #    raise ValueError("At least one of `do_train` or `do_eval` must be True.")

    if os.path.exists(args.output_dir) and os.listdir(args.output_dir):
        raise ValueError("Output directory ({}) already exists and is not empty.".format(args.output_dir))
    os.makedirs(args.output_dir, exist_ok=True)

    # tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case)
    tokenizer = XLNetTokenizer.from_pretrained(args.tokenizer_model)

    # train_examples = None
    num_train_steps = None

    print("Loading Train Dataset", args.train_file)
    train_dataset = BERTDataset(args.train_file, tokenizer, seq_len=args.max_seq_length,
                                corpus_lines=None, on_memory=args.on_memory)

    print("Loading eval Dataset", args.test_file)
    eval_dataset = BERTDataset(args.test_file, tokenizer, seq_len=args.max_seq_length,
                               corpus_lines=None, on_memory=args.on_memory)

    num_train_steps = int(
        len(train_dataset) / args.train_batch_size / args.gradient_accumulation_steps * args.num_train_epochs)

    # Prepare model
    model = BertForPreTraining.from_pretrained(
        args.bert_model,
        config=args.config_file,
        output_attentions=False,  # Whether the model returns attentions weights.
        output_hidden_states=False,  # Whether the model returns all hidden-states.
    )
    # Tell pytorch to run this model on the GPU.
    model.to(device)

    if args.fp16:
        model.half()
    if args.local_rank != -1:
        try:
            from apex.parallel import DistributedDataParallel as DDP
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.")
        model = DDP(model)
    elif n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Prepare optimizer
    '''
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
    ]
    if args.fp16:
        try:
            from apex.optimizers import FP16_Optimizer
            from apex.optimizers import FusedAdam
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.")

        optimizer = FusedAdam(optimizer_grouped_parameters,
                              lr=args.learning_rate,
                              bias_correction=False,
                              max_grad_norm=1.0)
        if args.loss_scale == 0:
            optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
        else:
            optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale)

    else:
        optimizer = AdamW(optimizer_grouped_parameters,
                          lr=args.learning_rate,
                          warmup=args.warmup_proportion,
                          t_total=num_train_steps)
    '''

    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_dataset))
    logger.info("  Batch size = %d", args.train_batch_size)
    logger.info("  Num steps = %d", num_train_steps)

    if args.local_rank == -1:
        train_sampler = SequentialSampler(train_dataset)
        eval_sampler = SequentialSampler(eval_dataset)

    else:
        # TODO: check if this works with current data generator from disk that relies on file.__next__
        # (it doesn't return item back by index)
        train_sampler = DistributedSampler(train_dataset)
        eval_sampler = DistributedSampler(eval_dataset)
    train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size)
    eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler, batch_size=args.train_batch_size)

    # optimizer
    t_total = len(train_dataloader) // args.train_batch_size
    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in model.named_parameters() if
                    not any(nd in n for nd in no_decay)],
         'weight_decay': 0.01},
        {'params': [p for n, p in model.named_parameters() if any(
            nd in n for nd in no_decay)], 'weight_decay': 0.0}
    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, 0, t_total)

    model.train()
    tr_loss = 0
    global_step = 0
    acc = 0
    train_loss = 0.0
    nb_tr_examples, nb_tr_steps = 0, 0
    for _ in trange(int(args.num_train_epochs), desc="Epoch"):
        for batch in notebook.tqdm(train_dataloader, desc="Train Iteration"):
            batch = tuple(t.to(device) for t in batch)
            input_ids, input_mask, segment_ids, lm_label_ids, is_next = batch
            outputs = model(input_ids=input_ids, attention_mask=input_mask, token_type_ids=segment_ids,
                            labels=lm_label_ids, next_sentence_label=is_next)

            loss = outputs.loss
            '''
            if n_gpu > 1:
                loss = loss.mean()  # mean() to average on multi-gpu.
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps
            if args.fp16:
                optimizer.backward(outputs.loss)
            else:
                loss.backward()
            '''
            print(loss)
            loss.backward()
            tr_loss += loss.item()
            nb_tr_examples += input_ids.size(0)
            nb_tr_steps += 1
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1)
            optimizer.step()
            scheduler.step()
            model.zero_grad()
            global_step += 1

            '''
            if (step + 1) % args.gradient_accumulation_steps == 0:
                # modify learning rate with special warm up BERT uses
                lr_this_step = args.learning_rate * warmup_linear(global_step / num_train_steps, args.warmup_proportion)
                for param_group in optimizer.param_groups:
                    param_group['lr'] = lr_this_step
                optimizer.step()
                scheduler.step()
                optimizer.zero_grad()
                global_step += 1
            '''

    train_loss = tr_loss / global_step
    perplexity = torch.exp(torch.tensor(train_loss)).item()

    print("Training loss {} ".format("{:.3f}".format(train_loss)))
    print("Training perplexity {}".format("{:.3f}".format(perplexity)))