Hi, as the title says, my model gives the exact same output for different inputs when evaluating, but trains fine and gives varied outputs during training.
I can't work out why; I know it's not that the inputs are identical.
I am also ending up with holes in the outputs, even though there are no such holes in the input data.
Both errors can be seen in the screenshot of my terminal output here:
My training loss is 121, whereas the RMS error from my eval function is 23,000??
Dataset and data collator:
import torch
from torch.utils.data import Dataset
from torch.utils.data.dataloader import default_collate
from torch.nn.utils.rnn import pad_sequence

class smiles_dataset(Dataset):
    def __init__(self, smiles, targets):
        self.smiles = smiles
        self.targets = targets
        #self.tokenizer = tokenizer
        #self.max_len = max_len

    def __len__(self):
        return len(self.smiles)

    def __getitem__(self, index):
        smiles = str(self.smiles[index])
        targets = float(self.targets[index])
        return {
            'SMILES': smiles,
            'targets': targets,
        }

class SMILESDataCollator():
    def __init__(self, tokenizer):
        self.tokenizer = tokenizer

    def __call__(self, data):
        return self.__collate__(default_collate(data))

    def __collate__(self, data):
        # Not sure why padding it twice works here but it gives the desired output so can't complain
        input_id_sequence = []
        for d in data['SMILES']:
            input_ids = self.tokenizer(d)[0]
            input_id_sequence.append(input_ids)
        input_ids_padded = pad_sequence(input_id_sequence, batch_first=True, padding_value=0)
        # input_ids_padded = [pad_sequence((self.tokenizer(d)), padding_value=0) for d in data['SMILES']]
        # input_ids_padded = pad_sequence(input_ids_padded, padding_value=0)

        # collect the targets for all the d in data
        target_list = []
        for d in data['targets']:
            target_list.append(d)

        # attention mask: 0 wherever the input id is the padding value (0), 1 elsewhere
        atten_mask_list = []
        for input_ids in input_ids_padded:
            atten_mask = torch.where(input_ids.eq(torch.zeros(1)), torch.tensor([0]), torch.tensor([1]))
            atten_mask_list.append(atten_mask)

        return {
            "input_ids": input_ids_padded,
            "attention_masks": atten_mask_list,
            "targets": target_list,
        }
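For context, the dataset and collator are plugged into the loaders roughly like this (a simplified sketch; train_df / eval_df and the column names are just placeholders for my actual dataframes):

from torch.utils.data import DataLoader

# rough sketch of how the loaders are built; names are placeholders
train_dataset = smiles_dataset(smiles=train_df['SMILES'].values, targets=train_df['target'].values)
eval_dataset = smiles_dataset(smiles=eval_df['SMILES'].values, targets=eval_df['target'].values)
collator = SMILESDataCollator(tokenizer)
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True, collate_fn=collator)
eval_loader = DataLoader(eval_dataset, batch_size=16, shuffle=False, collate_fn=collator)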
Eval function:
from tqdm import tqdm

def eval_fn(data_loader, model, device, optimizer):
    model.eval()
    fin_targets = []
    fin_outputs = []
    with torch.no_grad():
        for batch_index, data_set in tqdm(enumerate(data_loader), total=len(data_loader)):
            ids = data_set['input_ids']
            mask = torch.stack(data_set['attention_masks'])
            targets = torch.stack(data_set['targets'])
            #token_type_ids = data_set['token_type_ids']

            ids = ids.to(device)
            mask = mask.to(device)
            targets = targets.to(device)
            #token_type_ids = token_type_ids.to(device, dtype=torch.long)

            optimizer.zero_grad()
            outputs = model(
                ids=ids,
                mask=mask,
                token_type_ids=None
            )
            # print(outputs)
            # print(targets)
            fin_outputs.extend(outputs.cpu().detach().numpy().tolist())
            fin_targets.extend(targets.cpu().detach().numpy().tolist())
    return fin_outputs, fin_targets
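The 23,000 figure quoted at the top is the RMS error I compute from eval_fn's return values, roughly like this (a sketch assuming sklearn's mean_squared_error; the exact cell in my notebook may differ slightly):

import numpy as np
from sklearn.metrics import mean_squared_error

# outputs come back as [batch, 1]-shaped nested lists, so flatten before scoring
outputs, targets = eval_fn(eval_loader, model, device, optimizer)
rmse = np.sqrt(mean_squared_error(np.array(targets).reshape(-1), np.array(outputs).reshape(-1)))
print(f"Eval RMSE: {rmse:.1f}")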
Model definition:
import torch.nn as nn
from transformers import BertConfig, BertForSequenceClassification

config = BertConfig(
    vocab_size=5000,
    max_positional_embeddings=224,
    num_attention_heads=12,
    num_hidden_layers=46,
    type_vocab_size=1,
    num_labels=1,
    hidden_dropout_prob=0.1
)

loss = nn.L1Loss()

class regression_model(nn.Module):
    def __init__(self):
        super(regression_model, self).__init__()
        self.bert = BertForSequenceClassification(config=config)
        self.drop = nn.Dropout(p=0.3)
        #self.out = nn.Linear(self.bert.config.hidden_size, 1)

    def forward(self, ids, mask, token_type_ids):
        # BertForSequenceClassification with num_labels=1 returns (logits, ...), so [0] is the regression output
        out = self.bert(input_ids=ids, attention_mask=mask, token_type_ids=token_type_ids)[0]
        out = self.drop(out)
        return out
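The training loop itself isn't shown here, but for completeness it looks roughly like this (a simplified sketch; model, optimizer and device are set up in the usual way elsewhere):

def train_fn(data_loader, model, device, optimizer):
    # simplified sketch of the training step; the real loop mirrors eval_fn
    model.train()
    for data_set in data_loader:
        ids = data_set['input_ids'].to(device)
        mask = torch.stack(data_set['attention_masks']).to(device)
        targets = torch.stack(data_set['targets']).to(device)
        optimizer.zero_grad()
        outputs = model(ids=ids, mask=mask, token_type_ids=None)
        batch_loss = loss(outputs.squeeze(-1), targets.float())  # L1 loss defined above
        batch_loss.backward()
        optimizer.step()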
Let me know if you need to see any more code. The model uses a tokenizer built specifically for this dataset, which is why I had to write the attention_mask part into the collator. It is being trained from scratch on this data since it's very different from standard language data.
Thanks
A