Hello.
For a Kaggle competition I decided to use transformers with `distilbert-base-cased` to predict text-readability targets. Below is
an example row from the dataset:
excerpt target
1705 The commutator is peculiar, consisting of only... -3.676268
To check my model, I trained it on a single sample and expected to get a good prediction for it.
But I was surprised to find that the network converges to the result very slowly. Digging into it, I also found that the backpropagated loss (the MSE tensors printed below) barely decreases from one epoch to the next:
```
tensor(12.2468, grad_fn=<MseLossBackward>)
Training RMSE Epoch(0): 3.499540571304336
tensor(11.5582, grad_fn=<MseLossBackward>)
Training RMSE Epoch(1): 3.399740736557078
tensor(11.3492, grad_fn=<MseLossBackward>)
Training RMSE Epoch(2): 3.3688636378441
tensor(10.6344, grad_fn=<MseLossBackward>)
Training RMSE Epoch(3): 3.261047659723675
tensor(9.8555, grad_fn=<MseLossBackward>)
Training RMSE Epoch(4): 3.139343303844696
tensor(9.0270, grad_fn=<MseLossBackward>)
Training RMSE Epoch(5): 3.0044979708165265
tensor(8.5099, grad_fn=<MseLossBackward>)
Training RMSE Epoch(6): 2.9171786994167936
tensor(7.6673, grad_fn=<MseLossBackward>)
Training RMSE Epoch(7): 2.7689972054580028
tensor(6.4712, grad_fn=<MseLossBackward>)
Training RMSE Epoch(8): 2.543863493615884
tensor(5.7926, grad_fn=<MseLossBackward>)
Training RMSE Epoch(9): 2.406785682401822
tensor(4.8025, grad_fn=<MseLossBackward>)
Training RMSE Epoch(10): 2.1914508563447015
tensor(3.9520, grad_fn=<MseLossBackward>)
Training RMSE Epoch(11): 1.9879615391746877
tensor(3.2887, grad_fn=<MseLossBackward>)
Training RMSE Epoch(12): 1.8134857654081245
tensor(2.3853, grad_fn=<MseLossBackward>)
Training RMSE Epoch(13): 1.5444405405115664
tensor(1.8604, grad_fn=<MseLossBackward>)
Training RMSE Epoch(14): 1.3639743212746638
tensor(1.5982, grad_fn=<MseLossBackward>)
Training RMSE Epoch(15): 1.2642145185690903
tensor(0.9373, grad_fn=<MseLossBackward>)
Training RMSE Epoch(16): 0.968127973883182
tensor(1.1406, grad_fn=<MseLossBackward>)
Training RMSE Epoch(17): 1.0679721723388667
tensor(0.7393, grad_fn=<MseLossBackward>)
Training RMSE Epoch(18): 0.859804150326125
tensor(0.7153, grad_fn=<MseLossBackward>)
Training RMSE Epoch(19): 0.8457331486894163
```
Using an alternative network I get good results, which for me is proof that the data itself is fine. It seems I'm doing something
wrong, but I cannot figure out what. Could you look at my code and point out my mistake?
Below is a condensed version of my solution.
def regression_calculate_rmse(big_val, targets):
    """Return the summed squared error between predictions and targets.

    Despite the name, neither a mean nor a square root is taken here: the
    function only adds up ``(target - prediction) ** 2`` over the paired
    elements, leaving any averaging / square root to the caller.

    Args:
        big_val: Predictions; must support ``.cpu().numpy()``.
        targets: Reference values; must support ``.cpu().numpy()``.

    Returns:
        The accumulated squared difference, or ``0`` when there are no pairs.
    """
    predicted = big_val.cpu().numpy()
    expected = targets.cpu().numpy()
    # zip() stops at the shorter input, so unmatched trailing items are ignored.
    return sum(
        (want - got) * (want - got) for got, want in zip(predicted, expected)
    )
class Triage(Dataset):
    """Dataset that tokenizes the ``excerpt`` column and pairs it with ``target``.

    Each item is a dict with:
        ``ids``     - token ids from the tokenizer, as a ``torch.long`` tensor
        ``mask``    - attention mask from the tokenizer, as a ``torch.long`` tensor
        ``targets`` - the row's ``target`` value, as a ``torch.float`` tensor

    Args:
        dataframe: Frame providing ``excerpt`` and ``target`` columns.
        tokenizer: Object exposing ``encode_plus`` that returns a mapping with
            ``input_ids`` and ``attention_mask``.
        max_len: Length every encoded excerpt is padded / truncated to.
        isSubmit: Stored on the instance; ``__getitem__`` does not consult it.
    """

    def __init__(self, dataframe, tokenizer, max_len, isSubmit=False):
        self.len = len(dataframe)
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_len = max_len
        # NOTE(review): kept for callers, but nothing in this class reads it;
        # `targets` is always looked up, even when this flag is True.
        self.isSubmit = isSubmit

    def __getitem__(self, index):
        """Return the encoded excerpt and target for the row at position ``index``."""
        # Positional lookup: a frame that was filtered or sampled keeps its
        # original labels (e.g. a single row labelled 1705), so label-based
        # `excerpt[index]` would raise KeyError for index 0.
        title = str(self.data.excerpt.iloc[index])
        # Collapse every run of whitespace (including newlines) to one space.
        title = " ".join(title.split())
        inputs = self.tokenizer.encode_plus(
            title,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            # `padding='max_length'` is the supported spelling of the
            # deprecated `pad_to_max_length=True`.
            padding='max_length',
            return_token_type_ids=True,
            truncation=True,
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']

        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'targets': torch.tensor(self.data.target.iloc[index], dtype=torch.float),
        }

    def __len__(self):
        """Return the number of rows captured at construction time."""
        return self.len
class DistillBERTClass(torch.nn.Module):
    """Pretrained encoder followed by a two-layer head emitting one value per row.

    The head is ``Linear(768, 768) -> ReLU -> Dropout(0.1) -> Linear(768, 1)``
    applied to the slice at position 0 of the encoder's first output.
    """

    def __init__(self):
        super().__init__()
        # `model_name` is resolved from module scope when the class is instantiated.
        self.l1 = AutoModel.from_pretrained(model_name, output_hidden_states=True)
        self.pre_regressor = torch.nn.Linear(768, 768)
        self.dropout = torch.nn.Dropout(0.1)
        self.regressor = torch.nn.Linear(768, 1)

    def forward(self, input_ids, attention_mask):
        """Return the head's output for a batch of encoded inputs.

        Args:
            input_ids: Forwarded to the encoder as ``input_ids``.
            attention_mask: Forwarded to the encoder as ``attention_mask``.

        Returns:
            Output of the final ``Linear(768, 1)`` layer.
        """
        encoder_out = self.l1(input_ids=input_ids, attention_mask=attention_mask)
        # Element 0 of the encoder output, sliced at index 0 along dim 1
        # (presumably the first-token / [CLS] hidden state - confirm).
        first_position = encoder_out[0][:, 0]
        features = torch.relu(self.pre_regressor(first_position))
        return self.regressor(self.dropout(features))
def train(epoch, lr_scheduler=None):
    """Run one pass over ``training_loader``, updating ``model`` after every batch.

    Relies on the module-level ``model``, ``training_loader``, ``device``,
    ``loss_function`` and ``optimizer``. Prints the loss tensor for every batch
    and, at the end, the RMSE accumulated over the whole pass.

    Args:
        epoch: Epoch number; used only in the printed summary line.
        lr_scheduler: Accepted but not used anywhere in this function.
    """
    squared_error_sum = 0
    seen_examples = 0
    model.train()

    for batch in training_loader:
        input_ids = batch['ids'].to(device, dtype=torch.long)
        attention_mask = batch['mask'].to(device, dtype=torch.long)
        targets = batch['targets'].to(device, dtype=torch.float)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        loss = loss_function(outputs.view(-1), targets.view(-1))

        # Reduce over dim 1 to obtain one value per row for the running tally;
        # `.data` keeps this bookkeeping out of the autograd graph.
        row_values, _ = torch.max(outputs.data, dim=1)
        squared_error_sum += regression_calculate_rmse(row_values, targets)
        seen_examples += targets.size(0)
        print(loss)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    epoch_rmse = math.sqrt(squared_error_sum / seen_examples)
    print(f"Training RMSE Epoch({epoch}): {epoch_rmse}")
# Loader settings. The keys match DataLoader keyword arguments; presumably
# this dict is passed to the DataLoader that builds `training_loader`
# (not shown in this excerpt).
# The typographic quotes from the original paste are replaced with ASCII
# quotes so the literals parse.
train_params = {
    'batch_size': 1,
    'shuffle': True,
    'num_workers': 0,
}

model_name = 'distilbert-base-cased'
tokenizer = AutoTokenizer.from_pretrained(model_name)

LEARNING_RATE = 2e-05
EPOCHS = 20

model = DistillBERTClass()
model.to(device)
optimizer = torch.optim.Adam(params=model.parameters(), lr=LEARNING_RATE)

# One call to train() per epoch; no scheduler is passed.
for epoch in range(EPOCHS):
    train(epoch)