Torch neural network converges very slowly

Hello.
For a Kaggle competition I decided to use transformers with ‘distilbert-base-cased’ to predict text readability targets. Below is an example row from the dataset:

```
      excerpt                                              target
1705  The commutator is peculiar, consisting of only...    -3.676268
```
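
The dataframe itself is just the competition's training CSV read with pandas (the file name is how I have it locally):

```
import pandas as pd

# 'excerpt' holds the text, 'target' the readability score to predict
train_df = pd.read_csv('train.csv')
```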

To check my model, I trained it on a single sample and expected to get a near-perfect prediction.
But I was surprised to find that the network converges to the target very slowly. Digging into it, I also found that the training loss (and with it the backpropagation gradient) goes down only very slowly:

```
tensor(12.2468, grad_fn=<MseLossBackward>)
Training RMSE Epoch(0): 3.499540571304336
tensor(11.5582, grad_fn=<MseLossBackward>)
Training RMSE Epoch(1): 3.399740736557078
tensor(11.3492, grad_fn=<MseLossBackward>)
Training RMSE Epoch(2): 3.3688636378441
tensor(10.6344, grad_fn=<MseLossBackward>)
Training RMSE Epoch(3): 3.261047659723675
tensor(9.8555, grad_fn=<MseLossBackward>)
Training RMSE Epoch(4): 3.139343303844696
tensor(9.0270, grad_fn=<MseLossBackward>)
Training RMSE Epoch(5): 3.0044979708165265
tensor(8.5099, grad_fn=<MseLossBackward>)
Training RMSE Epoch(6): 2.9171786994167936
tensor(7.6673, grad_fn=<MseLossBackward>)
Training RMSE Epoch(7): 2.7689972054580028
tensor(6.4712, grad_fn=<MseLossBackward>)
Training RMSE Epoch(8): 2.543863493615884
tensor(5.7926, grad_fn=<MseLossBackward>)
Training RMSE Epoch(9): 2.406785682401822
tensor(4.8025, grad_fn=<MseLossBackward>)
Training RMSE Epoch(10): 2.1914508563447015
tensor(3.9520, grad_fn=<MseLossBackward>)
Training RMSE Epoch(11): 1.9879615391746877
tensor(3.2887, grad_fn=<MseLossBackward>)
Training RMSE Epoch(12): 1.8134857654081245
tensor(2.3853, grad_fn=<MseLossBackward>)
Training RMSE Epoch(13): 1.5444405405115664
tensor(1.8604, grad_fn=<MseLossBackward>)
Training RMSE Epoch(14): 1.3639743212746638
tensor(1.5982, grad_fn=<MseLossBackward>)
Training RMSE Epoch(15): 1.2642145185690903
tensor(0.9373, grad_fn=<MseLossBackward>)
Training RMSE Epoch(16): 0.968127973883182
tensor(1.1406, grad_fn=<MseLossBackward>)
Training RMSE Epoch(17): 1.0679721723388667
tensor(0.7393, grad_fn=<MseLossBackward>)
Training RMSE Epoch(18): 0.859804150326125
tensor(0.7153, grad_fn=<MseLossBackward>)
Training RMSE Epoch(19): 0.8457331486894163
```


Using an alternative network I get good results, which to me is proof that the data is fine. It seems I'm doing something wrong, but I cannot understand what. Could you look at my code and point out my mistake?

Below is a condensed version of my solution.

```
import math
import torch
from torch.utils.data import Dataset
from transformers import AutoModel, AutoTokenizer

def regression_calculate_rmse(big_val, targets):
    # sum of squared errors for the batch; the square root is taken per epoch in train()
    delta = 0
    for val1, val2 in zip(big_val.cpu().numpy(), targets.cpu().numpy()):
        delta += (val2 - val1) * (val2 - val1)
    return delta
```

```
class Triage(Dataset):

    def __init__(self, dataframe, tokenizer, max_len, isSubmit=False):
        self.len = len(dataframe)
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.isSubmit = isSubmit

    def __getitem__(self, index):
        title = str(self.data.excerpt[index])
        title = " ".join(title.split())
        inputs = self.tokenizer.encode_plus(
            title,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            pad_to_max_length=True,
            return_token_type_ids=True,
            truncation=True
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']

        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'targets': torch.tensor(self.data.target[index], dtype=torch.float)
        }

    def __len__(self):
        return self.len
```
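
The training_loader used in train() below is not part of the condensed code; it is built from this dataset roughly like this (MAX_LEN is a value I picked for the sketch; tokenizer and train_params are defined further down):

```
from torch.utils.data import DataLoader

MAX_LEN = 256                                               # assumed value, not in the condensed code
one_sample_df = train_df.sample(1).reset_index(drop=True)   # the single excerpt I train on
training_set = Triage(one_sample_df, tokenizer, MAX_LEN)
training_loader = DataLoader(training_set, **train_params)
```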

```
class DistillBERTClass(torch.nn.Module):

    def __init__(self):
        super(DistillBERTClass, self).__init__()
        self.l1 = AutoModel.from_pretrained(model_name, output_hidden_states=True)
        self.pre_regressor = torch.nn.Linear(768, 768)
        self.dropout = torch.nn.Dropout(0.1)
        self.regressor = torch.nn.Linear(768, 1)

    def forward(self, input_ids, attention_mask):
        output_1 = self.l1(input_ids=input_ids, attention_mask=attention_mask)
        hidden_state = output_1[0]       # (batch, seq_len, 768)
        pooler = hidden_state[:, 0]      # hidden state at the [CLS] position
        pooler = self.pre_regressor(pooler)
        pooler = torch.nn.ReLU()(pooler)
        pooler = self.dropout(pooler)
        output = self.regressor(pooler)  # one scalar per excerpt
        return output
```
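
To spell out what the regression head sees: output_1[0] is the full hidden-state tensor of shape (batch, seq_len, 768), hidden_state[:, 0] is the vector at the [CLS] position, and the regressor maps it to one scalar per excerpt. A standalone shape check (the dummy sentence and the check_* names are only for illustration):

```
model_name = 'distilbert-base-cased'   # same checkpoint as in the setup below
check_tokenizer = AutoTokenizer.from_pretrained(model_name)
check_model = DistillBERTClass()
enc = check_tokenizer("A short dummy sentence.", return_tensors="pt")
with torch.no_grad():
    out = check_model(input_ids=enc["input_ids"], attention_mask=enc["attention_mask"])
print(out.shape)    # torch.Size([1, 1])
```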

```
def train(epoch, lr_scheduler=None):
    nb_tr_examples = 0
    rmse = 0
    model.train()
    for _, data in enumerate(training_loader, 0):
        ids = data['ids'].to(device, dtype=torch.long)
        mask = data['mask'].to(device, dtype=torch.long)
        targets = data['targets'].to(device, dtype=torch.float)

        outputs = model(input_ids=ids, attention_mask=mask)

        loss = loss_function(outputs.view(-1), targets.view(-1))
        # outputs has a single column, so the max over dim=1 just extracts the prediction
        big_val, big_idx = torch.max(outputs.data, dim=1)
        rmse += regression_calculate_rmse(big_val, targets)

        nb_tr_examples += targets.size(0)
        print(loss)

        optimizer.zero_grad()
        loss.backward()
        # when using GPU
        optimizer.step()

    epoch_rmse = math.sqrt(rmse / nb_tr_examples)
    print(f"Training RMSE Epoch({epoch}): {epoch_rmse}")

    return
```
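
Since there is only the one sample and batch_size is 1, the epoch RMSE printed above is just the square root of the MSE loss of that step, e.g.:

```
print(math.sqrt(12.2468))   # ≈ 3.49954, i.e. the Epoch(0) RMSE up to print rounding
```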

```
train_params = {'batch_size': 1,
                'shuffle': True,
                'num_workers': 0
                }

model_name = 'distilbert-base-cased'
tokenizer = AutoTokenizer.from_pretrained(model_name)

LEARNING_RATE = 2e-05
EPOCHS = 20

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
loss_function = torch.nn.MSELoss()   # MSE, as seen in the MseLossBackward log above

model = DistillBERTClass()
model.to(device)

optimizer = torch.optim.Adam(params=model.parameters(), lr=LEARNING_RATE)

for epoch in range(EPOCHS):
    train(epoch)
```
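
For completeness, a minimal check of the final prediction against the target (it reuses the training_loader sketched above and is not part of the training code itself):

```
model.eval()
with torch.no_grad():
    for data in training_loader:
        ids = data['ids'].to(device, dtype=torch.long)
        mask = data['mask'].to(device, dtype=torch.long)
        pred = model(input_ids=ids, attention_mask=mask)
        print(f"prediction: {pred.item():.4f}   target: {data['targets'].item():.4f}")
```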