Using a SMILES string to predict a float
I’ve been learning how to use this library over the past few weeks and getting stuck into it. I don’t have a lot of experience with NNs, but I have some understanding. I want to use RoBERTa to build a regression model that predicts the CCS (collisional cross section) of a molecule given its structure as a SMILES string (a text representation of the molecule).
I’ve run into IndexError: index out of range in self inside the model, and I’m wondering if anyone has any suggestions. I’m assuming it’s a problem with the forward method of my model or with my Dataset. At the moment I can’t train the model at all.
Any suggestions would be amazing. Thank you.
A SMILES string looks like this:
COC1=CC=CC=C1CNCCC2=CC(=C(C=C2OC)Cl)OC
The error
Traceback (most recent call last):
File "c:/Users/ktzd064/Documents/Python/CCS_Prediction/CCSround2.py", line 165, in <module>
trainer.train()
File "C:\ProgramData\Anaconda3\envs\tf-transformer\lib\site-packages\transformers\trainer.py", line 707, in train
tr_loss += self.training_step(model, inputs)
File "C:\ProgramData\Anaconda3\envs\tf-transformer\lib\site-packages\transformers\trainer.py", line 994, in training_step
outputs = model(**inputs)
File "C:\ProgramData\Anaconda3\envs\tf-transformer\lib\site-packages\torch\nn\modules\module.py", line 722, in _call_impl
result = self.forward(*input, **kwargs)
File "c:/Users/ktzd064/Documents/Python/CCS_Prediction/CCSround2.py", line 21, in forward
out, _ = self.bert(input_ids, token_type_ids, attention_mask)
File "C:\ProgramData\Anaconda3\envs\tf-transformer\lib\site-packages\torch\nn\modules\module.py", line 722, in _call_impl
result = self.forward(*input, **kwargs)
File "C:\ProgramData\Anaconda3\envs\tf-transformer\lib\site-packages\transformers\modeling_roberta.py", line 479, in forward
return_dict=return_dict,
File "C:\ProgramData\Anaconda3\envs\tf-transformer\lib\site-packages\torch\nn\modules\module.py", line 722, in _call_impl
result = self.forward(*input, **kwargs)
File "C:\ProgramData\Anaconda3\envs\tf-transformer\lib\site-packages\transformers\modeling_bert.py", line 825, in forward
input_ids=input_ids, position_ids=position_ids, token_type_ids=token_type_ids, inputs_embeds=inputs_embeds
File "C:\ProgramData\Anaconda3\envs\tf-transformer\lib\site-packages\torch\nn\modules\module.py", line 722, in _call_impl
result = self.forward(*input, **kwargs)
File "C:\ProgramData\Anaconda3\envs\tf-transformer\lib\site-packages\transformers\modeling_roberta.py", line 82, in forward
input_ids, token_type_ids=token_type_ids, position_ids=position_ids, inputs_embeds=inputs_embeds
File "C:\ProgramData\Anaconda3\envs\tf-transformer\lib\site-packages\transformers\modeling_bert.py", line 209, in forward
token_type_embeddings = self.token_type_embeddings(token_type_ids)
File "C:\ProgramData\Anaconda3\envs\tf-transformer\lib\site-packages\torch\nn\modules\module.py", line 722, in _call_impl
result = self.forward(*input, **kwargs)
File "C:\ProgramData\Anaconda3\envs\tf-transformer\lib\site-packages\torch\nn\modules\sparse.py", line 126, in forward
self.norm_type, self.scale_grad_by_freq, self.sparse)
File "C:\ProgramData\Anaconda3\envs\tf-transformer\lib\site-packages\torch\nn\functional.py", line 1814, in embedding
return torch.embedding(weight, input, padding_idx, scale_grad_by_freq, sparse)
IndexError: index out of range in self
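In case it’s useful, I’ve worked out that this particular IndexError from nn.Embedding means one of the ids being looked up is >= the size of the embedding table. A minimal reproduction of that failure mode (made-up numbers; num_embeddings=1 mirrors my type_vocab_size=1):

import torch
import torch.nn as nn

emb = nn.Embedding(num_embeddings=1, embedding_dim=8)  # only id 0 is valid
ids = torch.tensor([[0, 1, 0]])                        # the 1 is out of range
emb(ids)  # raises IndexError: index out of range in self

So somewhere I must be feeding the model an input id, position id, or token type id that is too large for the corresponding table.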
The model
class regression_model(nn.Module):
    def __init__(self):
        super(regression_model, self).__init__()
        self.bert = RobertaForSequenceClassification(config=config)
        self.drop = nn.Dropout(p=0.3)
        self.out = nn.Linear(self.bert.config.hidden_size, 1)  # defined but not used yet

    def forward(self, input_ids, attention_mask, targets, token_type_ids):
        out, _ = self.bert(input_ids, token_type_ids, attention_mask)
        out = self.drop(out)
        loss = nn.MSELoss()
        output = loss(out, targets)
        return output
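One thing I’m now second-guessing is the positional argument order in that self.bert call. As far as I can tell, the transformers 3.x signature is forward(input_ids=None, attention_mask=None, token_type_ids=None, ...), so my call above may be handing token_type_ids to attention_mask and vice versa. If the attention mask (full of 1s) ended up being used as token type ids, any 1 would be out of range for type_vocab_size=1, which looks a lot like the traceback above. A keyword-argument version of the call that would rule that out (sketch):

outputs = self.bert(
    input_ids=input_ids,
    attention_mask=attention_mask,
    token_type_ids=token_type_ids,
)
logits = outputs[0]  # (batch_size, num_labels) from the classification head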
The Dataset
class smiles_dataset(Dataset):
    def __init__(self, smiles, targets, tokenizer, max_len):
        self.smiles = smiles
        self.targets = targets
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.smiles)

    def __getitem__(self, index):
        smiles = str(self.smiles[index])
        targets = float(self.targets[index])
        encoding = self.tokenizer.encode(smiles)
        return {
            'SMILES': smiles,
            'input_ids': encoding.ids,
            'attention_mask': encoding.attention_mask,
            'targets': torch.tensor(targets, dtype=torch.float32),
            'token_type_ids': encoding.type_ids,
        }
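To sanity-check what the dataset emits, I’ve been comparing the ids it produces against the config defined further down (sketch; train_dataset and config are created in the training-config section below):

item = train_dataset[0]
print(max(item['input_ids']), 'vs', config.vocab_size)              # every id must be < vocab_size
print(max(item['token_type_ids']), 'vs', config.type_vocab_size)    # every type id must be < type_vocab_size
print(len(item['input_ids']), 'vs', config.max_position_embeddings) # padded length must fit the position table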
Tokenizer
I have trained the tokenizer on the SMILES dataset I am using:
tokenizer = ByteLevelBPETokenizer()
tokenizer.train('SMILES.txt', vocab_size=800, min_frequency=1,
                special_tokens=["<s>", "<PAD>", "<MASK>"])
tokenizer.save_model("CCSround2")
from tokenizers.implementations import ByteLevelBPETokenizer
from tokenizers.processors import BertProcessing

tokenizer = ByteLevelBPETokenizer(
    "./CCSround2/vocab.json",
    "./CCSround2/merges.txt",
)
tokenizer._tokenizer.post_processor = BertProcessing(
    ("<PAD>", tokenizer.token_to_id("<PAD>")),
    ("<MASK>", tokenizer.token_to_id("<MASK>")),
)
tokenizer.enable_padding(length=300)
tokenizer.save("./CCSround2/tokenizer.json")
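For reference, this is roughly how I’ve been inspecting what the tokenizer produces for a single molecule (sketch):

enc = tokenizer.encode("COC1=CC=CC=C1CNCCC2=CC(=C(C=C2OC)Cl)OC")
print(enc.tokens[:10])    # first few BPE tokens
print(enc.ids[:10])       # their ids in the 800-token vocab
print(set(enc.type_ids))  # should only ever contain values < type_vocab_size
print(len(enc.ids))       # should be 300 after padding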
The training config
from sklearn.model_selection import train_test_split

train, test = train_test_split(SMILESandCCS, test_size=0.2)
train_dataset = smiles_dataset(train['SMILES'].values, train['CCS'].values, tokenizer, 300)
test_dataset = smiles_dataset(test['SMILES'].values, test['CCS'].values, tokenizer, 300)
from transformers import RobertaConfig, RobertaTokenizerFast, RobertaForSequenceClassification

config = RobertaConfig(
    vocab_size=800,
    max_position_embeddings=302,  # must cover the padded length of 300 plus RoBERTa's 2-position offset
    num_attention_heads=12,
    num_hidden_layers=6,
    type_vocab_size=1,
)
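And a quick check that the config actually lines up with the tokenizer (sketch; tokenizer here is still the ByteLevelBPETokenizer from above, before it is reassigned just below):

print(tokenizer.get_vocab_size(), '==', config.vocab_size)  # both should be 800
print(config.max_position_embeddings, '>=', 300 + 2)        # padded length plus RoBERTa's offset
print(config.type_vocab_size)                               # 1, so the only legal token type id is 0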
tokenizer = RobertaTokenizerFast.from_pretrained('./CCSround2', max_len=300)
from transformers import Trainer, TrainingArguments

model = regression_model()
training_args = TrainingArguments(
    output_dir='/results',
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    weight_decay=0.01,
    logging_dir='/logs',
)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
)
trainer.train()
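I also tried pushing a single batch through the model by hand, outside of Trainer, to reproduce the error in isolation (sketch; I’m assuming transformers’ default_data_collator here, which as far as I can tell skips the string 'SMILES' field and tensorizes the rest):

from torch.utils.data import DataLoader
from transformers import default_data_collator

loader = DataLoader(train_dataset, batch_size=2, collate_fn=default_data_collator)
batch = next(iter(loader))
print({k: v.shape for k, v in batch.items()})
model(**batch)  # hits the same IndexError without Trainer in the way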