Overfitting with BERT on IMDB 50k

Hi everyone. I am confused about what is wrong with my code.

import torch
import torch.nn as nn


class BERTGRUSentiment(nn.Module):
    def __init__(self, bert, hidden_dim, output_dim, n_layers, bidirectional, dropout):

        super().__init__()

        self.bert = bert

        embedding_dim = bert.config.to_dict()['hidden_size']

        self.rnn = nn.GRU(embedding_dim,
                          hidden_dim,
                          num_layers=n_layers,
                          bidirectional=bidirectional,
                          batch_first=True,
                          dropout=0 if n_layers < 2 else dropout)

        self.out = nn.Linear(hidden_dim * 2 if bidirectional else hidden_dim, output_dim)

        self.dropout = nn.Dropout(dropout)

    def forward(self, ids, mask, token_type_ids):

        # text = [batch size, sent len]

        # with torch.no_grad():
        embedded = self.dropout(self.bert(ids, attention_mask=mask, token_type_ids=token_type_ids)[0])

        # embedded = [batch size, sent len, emb dim]

        _, hidden = self.rnn(embedded)

        # hidden = [n layers * n directions, batch size, emb dim]

        if self.rnn.bidirectional:
            hidden = self.dropout(torch.cat((hidden[-2, :, :], hidden[-1, :, :]), dim=1))
        else:
            hidden = self.dropout(hidden[-1, :, :])

        # hidden = [batch size, hid dim]

        output = self.out(hidden)

        # output = [batch size, out dim]

        return output
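
For reference, I build the model roughly like this (a sketch; the hyperparameter values here are assumptions, not my exact config):

from transformers import BertModel

bert = BertModel.from_pretrained('bert-base-uncased')

# hidden_dim / n_layers / dropout are placeholder values for illustration
model = BERTGRUSentiment(bert,
                         hidden_dim=256,
                         output_dim=1,
                         n_layers=2,
                         bidirectional=True,
                         dropout=0.25)

output_dim is 1 because I train with BCEWithLogitsLoss on a single logit per review.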

My training and evaluation functions:

from tqdm import tqdm

def train_fc(data_loader, model, optimizer, device, scheduler, criterion):
    model.train()
    epoch_loss = 0
    epoch_acc = 0

    for bi, d in tqdm(enumerate(data_loader), total=len(data_loader)):
        ids = d["ids"]
        token_type_ids = d["token_type_ids"]
        mask = d["mask"]
        targets = d["targets"]

        ids = ids.to(device, dtype=torch.long)
        token_type_ids = token_type_ids.to(device, dtype=torch.long)
        mask = mask.to(device, dtype=torch.long)
        targets = targets.to(device, dtype=torch.float).unsqueeze(1)
        
        optimizer.zero_grad()
        outputs = model(ids=ids, mask=mask, token_type_ids=token_type_ids)

        loss = criterion(outputs, targets)
        acc = binary_accuracy(outputs, targets)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        
        epoch_loss += loss.item()
        epoch_acc += acc.item()
    # ReduceLROnPlateau expects an epoch-level metric, not the last batch's loss
    scheduler.step(epoch_loss / len(data_loader))
    return epoch_loss / len(data_loader), epoch_acc / len(data_loader)
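
binary_accuracy is not shown above; it is the usual sigmoid-then-round helper, roughly:

def binary_accuracy(preds, y):
    # preds are raw logits, so threshold the sigmoid at 0.5
    rounded_preds = torch.round(torch.sigmoid(preds))
    correct = (rounded_preds == y).float()
    return correct.sum() / len(correct)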

def eval_fc(valid_loader, model, device, criterion):
    model.eval()
    epoch_loss = 0
    epoch_acc = 0
    with torch.no_grad():
        for bi, d in tqdm(enumerate(valid_loader), total=len(valid_loader)):
            ids = d["ids"]
            token_type_ids = d["token_type_ids"]
            mask = d["mask"]
            targets = d["targets"]

            ids = ids.to(device, dtype=torch.long)
            token_type_ids = token_type_ids.to(device, dtype=torch.long)
            mask = mask.to(device, dtype=torch.long)
            targets = targets.to(device, dtype=torch.float).unsqueeze(1)

            outputs = model(ids=ids, mask=mask, token_type_ids=token_type_ids)
            loss = criterion(outputs, targets)
            acc = binary_accuracy(outputs, targets)

            epoch_loss += loss.item()
            epoch_acc += acc.item()
    return epoch_loss / len(valid_loader), epoch_acc / len(valid_loader)

This is how I set up the loss, optimizer, and scheduler:

from transformers import AdamW  # torch.optim.AdamW should also work here

model.to(device)
criterion = nn.BCEWithLogitsLoss()
criterion = criterion.to(device)

param_optimizer = list(model.named_parameters())
no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
optimizer_parameters = [
    {
        "params": [
            p for n, p in param_optimizer if not any(nd in n for nd in no_decay)
        ],
        "weight_decay": 0.001,
        "lr": 3e-5,
    },
    {
        "params": [
            p for n, p in param_optimizer if any(nd in n for nd in no_decay)
        ],
        "weight_decay": 0.0,
        "lr": 5e-5,
    },
]

num_train_steps = int(len(train_dataset) / batch_size * config.EPOCHS)
optimizer = AdamW(optimizer_parameters, lr=5e-5)
# scheduler = get_linear_schedule_with_warmup(
#     optimizer, num_warmup_steps=0, num_training_steps=num_train_steps
# )
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, factor=0.1, patience=3, eps=1e-8)
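
The outer loop that calls the two functions above looks roughly like this (a sketch; names such as train_loader, valid_loader, and best_valid_loss are assumptions):

best_valid_loss = float('inf')

for epoch in range(config.EPOCHS):
    train_loss, train_acc = train_fc(train_loader, model, optimizer, device, scheduler, criterion)
    valid_loss, valid_acc = eval_fc(valid_loader, model, device, criterion)

    # keep the checkpoint from the best validation epoch
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'best_model.pt')

    print(f'Epoch {epoch + 1}: train loss {train_loss:.3f}, acc {train_acc:.3f} | '
          f'valid loss {valid_loss:.3f}, acc {valid_acc:.3f}')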

I got this result:

[screenshot: per-epoch training/validation loss and accuracy]

I think it is overfitting. I have tried adding dropout and regularization, but the best epoch is still 1 or 2. However, in an example written with torchtext (I use Dataset and DataLoader instead), the validation loss keeps decreasing for several epochs.
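
One difference I am not sure about: I think the torchtext example I followed freezes the BERT weights, while in my code the with torch.no_grad() line is commented out, so BERT is fine-tuned end to end. Freezing would look something like this (just a sketch of what I mean):

# train only the GRU and the linear head, keep BERT fixed
for name, param in model.named_parameters():
    if name.startswith('bert'):
        param.requires_grad = False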
Here is the torchtext version:

TEXT = data.Field(batch_first=True,
                  use_vocab=False,
                  tokenize=tokenize_and_cut,
                  preprocessing=tokenizer.convert_tokens_to_ids,
                  init_token=init_token_idx,
                  eos_token=eos_token_idx,
                  pad_token=pad_token_idx,
                  unk_token=unk_token_idx)

SEED = 321
LABEL = data.LabelField(dtype=torch.float)
train_data, test_data = datasets.IMDB.splits(TEXT, LABEL)
train_data, valid_data = train_data.split(random_state=random.seed(SEED), split_ratio=0.8)

TEXT_b = data.Field(tokenize='spacy', tokenizer_language='en_core_web_sm', include_lengths=True, lower=True)
train_data_b, test_data_b = datasets.IMDB.splits(TEXT_b, LABEL)
train_data_b, valid_data_b = train_data_b.split(random_state=random.seed(SEED), split_ratio=0.8)
LABEL.build_vocab(train_data_b)

The model is the same; in that version the validation loss keeps decreasing (down to about 0.211) and the accuracy keeps going up.
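
For reference, the Dataset in my DataLoader version returns dicts with the ids / mask / token_type_ids / targets keys used in train_fc, roughly like this (a sketch; max_len and the encode_plus arguments are assumptions):

class IMDBDataset(torch.utils.data.Dataset):
    def __init__(self, reviews, targets, tokenizer, max_len=512):
        self.reviews = reviews
        self.targets = targets
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.reviews)

    def __getitem__(self, idx):
        enc = self.tokenizer.encode_plus(
            str(self.reviews[idx]),
            add_special_tokens=True,
            max_length=self.max_len,
            truncation=True,
            padding='max_length',
            return_token_type_ids=True,
        )
        return {
            "ids": torch.tensor(enc["input_ids"], dtype=torch.long),
            "mask": torch.tensor(enc["attention_mask"], dtype=torch.long),
            "token_type_ids": torch.tensor(enc["token_type_ids"], dtype=torch.long),
            "targets": torch.tensor(self.targets[idx], dtype=torch.float),
        }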

I do not know why my Dataset/DataLoader version overfits. Please help me! Thanks.