Model gives the same output for every input in eval mode but trains fine

Hi, as the title says, my model gives the exact same output for different inputs when evaluating, but it trains fine and gives varied outputs during training. I can't work out why; I've checked, and it isn't that the inputs are the same.

I am also ending up with holes in the outputs when there are no such holes within the input data.

Both errors can be seen in the screenshot of my terminal output: my training loss is 121, whereas the RMSE from my eval function is 23,000??

Dataset and data collator:

import torch
from torch.utils.data import Dataset
from torch.utils.data.dataloader import default_collate
from torch.nn.utils.rnn import pad_sequence

class smiles_dataset(Dataset):
    def __init__(self, smiles, targets):
        self.smiles = smiles
        self.targets = targets

    def __len__(self):
        return len(self.smiles)

    def __getitem__(self, index):
        smiles = str(self.smiles[index])
        targets = float(self.targets[index])

        return {
            'SMILES': smiles,
            'targets': targets,
        }

class SMILESDataCollator():
    def __init__(self, tokenizer):
        self.tokenizer = tokenizer

    def __call__(self, data):
        return self.__collate__(default_collate(data))

    def __collate__(self, data):
        # Tokenize each SMILES string, then pad the batch to a common length
        input_id_sequence = []
        for d in data['SMILES']:
            input_ids = self.tokenizer(d)[0]
            input_id_sequence.append(input_ids)
        input_ids_padded = pad_sequence(input_id_sequence, batch_first=True, padding_value=0)

        target_list = []
        for d in data['targets']:
            target_list.append(d)

        # Attention mask: 1 for real tokens, 0 for padding
        atten_mask_list = []
        for input_ids in input_ids_padded:
            atten_mask = (input_ids != 0).long()
            atten_mask_list.append(atten_mask)

        return {
            "input_ids": input_ids_padded,
            "attention_masks": atten_mask_list,
            "targets": target_list
        }
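
For context, this is roughly how the dataset and collator are wired up (train_smiles, train_targets and tokenizer stand in for the real objects, and the batch size is a placeholder):

from torch.utils.data import DataLoader

# Hypothetical wiring; the real script builds these from the dataset files
train_dataset = smiles_dataset(train_smiles, train_targets)
collator = SMILESDataCollator(tokenizer)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, collate_fn=collator)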

Eval function:

from tqdm import tqdm

def eval_fn(data_loader, model, device):
    model.eval()
    fin_targets = []
    fin_outputs = []
    with torch.no_grad():
        for batch_index, data_set in tqdm(enumerate(data_loader), total=len(data_loader)):
            ids = data_set['input_ids']
            mask = torch.stack(data_set['attention_masks'])
            targets = torch.stack(data_set['targets'])

            ids = ids.to(device)
            mask = mask.to(device)
            targets = targets.to(device)

            # No optimizer or zero_grad() needed here: gradients are
            # disabled under torch.no_grad()
            outputs = model(
                ids=ids,
                mask=mask,
                token_type_ids=None
            )
            fin_outputs.extend(outputs.cpu().detach().numpy().tolist())
            fin_targets.extend(targets.cpu().detach().numpy().tolist())
    return fin_outputs, fin_targets
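
The eval metric is computed from the returned lists roughly like this (a sketch; eval_loader is the eval DataLoader, and the exact metric code isn't shown above):

import numpy as np

# Sketch of the RMSE computation over eval_fn's outputs
outputs, targets = eval_fn(eval_loader, model, device)
outputs = np.array(outputs).squeeze()
targets = np.array(targets)
rmse = np.sqrt(np.mean((outputs - targets) ** 2))
print(f"Eval RMSE: {rmse:.2f}")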

Model definition:

import torch.nn as nn
from transformers import BertConfig, BertForSequenceClassification

config = BertConfig(
    vocab_size=5000,
    max_position_embeddings=224,
    num_attention_heads=12,
    num_hidden_layers=46,
    type_vocab_size=1,
    num_labels=1,
    hidden_dropout_prob=0.1
)

loss = nn.L1Loss()



class regression_model(nn.Module):
    def __init__(self):
        super(regression_model, self).__init__()
        self.bert = BertForSequenceClassification(config=config)
        self.drop = nn.Dropout(p=0.3)

    def forward(self, ids, mask, token_type_ids):
        # [0] is the logits tensor, shape (batch, 1) since num_labels=1
        out = self.bert(input_ids=ids, attention_mask=mask, token_type_ids=token_type_ids)[0]
        # dropout applied to the final regression output (active only in train mode)
        out = self.drop(out)
        return out

Let me know if you need to see any more code. The model uses a tokenizer made specifically for this dataset, which is why I wrote the attention-mask construction into the collator. It is being trained from scratch on this data, as the data is very different from standard language text.
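
To show exactly what I mean by "same output", this is the kind of sanity check I've been running (the token ids here are random placeholders):

import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = regression_model().to(device)

# Two different made-up inputs of the same length
a = torch.randint(1, 5000, (1, 20)).to(device)
b = torch.randint(1, 5000, (1, 20)).to(device)
mask = torch.ones(1, 20, dtype=torch.long).to(device)

model.eval()
with torch.no_grad():
    out_a = model(ids=a, mask=mask, token_type_ids=None)
    out_b = model(ids=b, mask=mask, token_type_ids=None)
print(out_a.item(), out_b.item())  # these come out identical for me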

Thanks
A

Hi, when you say "holes" do you mean zeros? You could try increasing the precision of your values (maybe the numbers get so small that PyTorch treats them as zero).
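
For example, something like this will show whether they are true zeros or just being rounded to zero when printed (assuming outputs is the model's output tensor):

import torch

# Print with more decimal places to reveal very small non-zero values
torch.set_printoptions(precision=10)
print(outputs)

# Or inspect in double precision
print(outputs.double())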

Are you sure you need 46 hidden layers? Seems rather a lot.

What are you passing to your eval_fn? How are you splitting your train/eval data?
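
For example, a typical random split looks something like this (using sklearn just for illustration; smiles and targets stand in for your full dataset):

from sklearn.model_selection import train_test_split

# 80/20 random split; if the eval set comes from a different distribution
# than the training set, a large gap between the two metrics is expected
train_smiles, eval_smiles, train_targets, eval_targets = train_test_split(
    smiles, targets, test_size=0.2, random_state=42
)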

Could you be overfitting to your training data?