Hi everyone. I'm confused about what's wrong with my code.
import torch
import torch.nn as nn
from tqdm import tqdm
from transformers import AdamW  # or torch.optim.AdamW

class BERTGRUSentiment(nn.Module):
    def __init__(self, bert, hidden_dim, output_dim, n_layers, bidirectional, dropout):
        super().__init__()
        self.bert = bert
        embedding_dim = bert.config.to_dict()['hidden_size']
        self.rnn = nn.GRU(embedding_dim,
                          hidden_dim,
                          num_layers=n_layers,
                          bidirectional=bidirectional,
                          batch_first=True,
                          dropout=0 if n_layers < 2 else dropout)
        self.out = nn.Linear(hidden_dim * 2 if bidirectional else hidden_dim, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, ids, mask, token_type_ids):
        # ids = [batch size, sent len]
        # with torch.no_grad():
        embedded = self.dropout(self.bert(ids, attention_mask=mask, token_type_ids=token_type_ids)[0])
        # embedded = [batch size, sent len, emb dim]
        _, hidden = self.rnn(embedded)
        # hidden = [n layers * n directions, batch size, hid dim]
        if self.rnn.bidirectional:
            hidden = self.dropout(torch.cat((hidden[-2, :, :], hidden[-1, :, :]), dim=1))
        else:
            hidden = self.dropout(hidden[-1, :, :])
        # hidden = [batch size, hid dim]
        output = self.out(hidden)
        # output = [batch size, out dim]
        return output
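For context, train_fc and eval_fc below expect each DataLoader batch to be a dict with the keys "ids", "mask", "token_type_ids" and "targets". My Dataset is roughly the following minimal sketch (the class name IMDBDataset, the max_len value and the tokenizer variable are placeholders, not my exact code):

from torch.utils.data import Dataset

class IMDBDataset(Dataset):
    # Minimal sketch of the Dataset behind the DataLoader; only the
    # returned dict keys are guaranteed to match my real code.
    def __init__(self, texts, labels, tokenizer, max_len=256):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        enc = self.tokenizer.encode_plus(
            self.texts[idx],
            add_special_tokens=True,
            max_length=self.max_len,
            padding="max_length",
            truncation=True,
            return_token_type_ids=True)
        return {
            "ids": torch.tensor(enc["input_ids"], dtype=torch.long),
            "mask": torch.tensor(enc["attention_mask"], dtype=torch.long),
            "token_type_ids": torch.tensor(enc["token_type_ids"], dtype=torch.long),
            "targets": torch.tensor(self.labels[idx], dtype=torch.float),
        }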
def train_fc(data_loader, model, optimizer, device, scheduler, criterion):
    model.train()
    epoch_loss = 0
    epoch_acc = 0
    for bi, d in tqdm(enumerate(data_loader), total=len(data_loader)):
        ids = d["ids"].to(device, dtype=torch.long)
        token_type_ids = d["token_type_ids"].to(device, dtype=torch.long)
        mask = d["mask"].to(device, dtype=torch.long)
        targets = d["targets"].to(device, dtype=torch.float).unsqueeze(1)

        optimizer.zero_grad()
        outputs = model(ids=ids, mask=mask, token_type_ids=token_type_ids)
        loss = criterion(outputs, targets)
        acc = binary_accuracy(outputs, targets)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        epoch_loss += loss.item()
        epoch_acc += acc.item()
        scheduler.step(loss)
    return epoch_loss / len(data_loader), epoch_acc / len(data_loader)
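binary_accuracy is the usual helper from the torchtext BERT sentiment tutorial; I include it here for completeness (this is the standard version, mine may differ slightly):

def binary_accuracy(preds, y):
    # round the sigmoid of the logits to 0/1 and compare with the labels
    rounded_preds = torch.round(torch.sigmoid(preds))
    correct = (rounded_preds == y).float()
    return correct.sum() / len(correct)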
def eval_fc(valid_loader, model, device, criterion):
    model.eval()
    epoch_loss = 0
    epoch_acc = 0
    with torch.no_grad():
        for bi, d in tqdm(enumerate(valid_loader), total=len(valid_loader)):
            ids = d["ids"].to(device, dtype=torch.long)
            token_type_ids = d["token_type_ids"].to(device, dtype=torch.long)
            mask = d["mask"].to(device, dtype=torch.long)
            targets = d["targets"].to(device, dtype=torch.float).unsqueeze(1)

            outputs = model(ids=ids, mask=mask, token_type_ids=token_type_ids)
            loss = criterion(outputs, targets)
            acc = binary_accuracy(outputs, targets)
            epoch_loss += loss.item()
            epoch_acc += acc.item()
    return epoch_loss / len(valid_loader), epoch_acc / len(valid_loader)
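The model itself is built like this (a minimal sketch: 'bert-base-uncased' and the GRU hyperparameters are what I believe I used, not copied from my script):

from transformers import BertModel

bert = BertModel.from_pretrained('bert-base-uncased')
model = BERTGRUSentiment(bert,
                         hidden_dim=256,
                         output_dim=1,
                         n_layers=2,
                         bidirectional=True,
                         dropout=0.25)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')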
model.to(device)
criterion = nn.BCEWithLogitsLoss()
criterion = criterion.to(device)

param_optimizer = list(model.named_parameters())
no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
optimizer_parameters = [
    {
        "params": [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        "weight_decay": 0.001,
        "lr": 3e-5,
    },
    {
        "params": [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        "weight_decay": 0.0,
        "lr": 5e-5,
    },
]

num_train_steps = int(len(train_dataset) / batch_size * config.EPOCHS)
optimizer = AdamW(optimizer_parameters, lr=5e-5)
# scheduler = get_linear_schedule_with_warmup(
#     optimizer, num_warmup_steps=0, num_training_steps=num_train_steps
# )
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, factor=0.1, patience=3, eps=1e-8)
I got this result. I think the model is overfitting, but even when I add dropout or weight decay, the best epoch is still 1 or 2. However, in an example written with torchtext (I use a Dataset and DataLoader instead), the validation loss keeps decreasing for several epochs.
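For reference, my epoch loop is roughly this minimal sketch (EPOCHS, train_loader, valid_loader and the checkpoint name are placeholders for my actual values):

best_valid_loss = float("inf")
for epoch in range(EPOCHS):
    train_loss, train_acc = train_fc(train_loader, model, optimizer, device, scheduler, criterion)
    valid_loss, valid_acc = eval_fc(valid_loader, model, device, criterion)
    # keep the checkpoint with the lowest validation loss ("best epoch")
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'best_model.pt')
    print(f'Epoch {epoch + 1}: train loss {train_loss:.3f}, acc {train_acc:.3f} | '
          f'valid loss {valid_loss:.3f}, acc {valid_acc:.3f}')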
The torchtext version:
import random
from torchtext import data, datasets  # torchtext.legacy in newer versions

TEXT = data.Field(batch_first=True,
                  use_vocab=False,
                  tokenize=tokenize_and_cut,
                  preprocessing=tokenizer.convert_tokens_to_ids,
                  init_token=init_token_idx,
                  eos_token=eos_token_idx,
                  pad_token=pad_token_idx,
                  unk_token=unk_token_idx)

SEED = 321
LABEL = data.LabelField(dtype=torch.float)
train_data, test_data = datasets.IMDB.splits(TEXT, LABEL)
train_data, valid_data = train_data.split(random_state=random.seed(SEED), split_ratio=0.8)

TEXT_b = data.Field(tokenize='spacy', tokenizer_language='en_core_web_sm', include_lengths=True, lower=True)
train_data_b, test_data_b = datasets.IMDB.splits(TEXT_b, LABEL)
train_data_b, valid_data_b = train_data_b.split(random_state=random.seed(SEED), split_ratio=0.8)
LABEL.build_vocab(train_data_b)
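The torchtext data is then batched in the usual way for that tutorial; roughly like this (BATCH_SIZE is a placeholder and I assume the standard BucketIterator setup):

BATCH_SIZE = 32  # placeholder

train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    (train_data, valid_data, test_data),
    batch_size=BATCH_SIZE,
    device=device)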
The model is the same, but its validation loss keeps decreasing (down to about 0.211) and its accuracy keeps going up. I don't understand why my Dataset/DataLoader version overfits. Please help me, thanks!