Skip to content

Commit

Permalink
Merge branch 'master' of https://github.com/plpla/DeepCCS
Browse files Browse the repository at this point in the history
  • Loading branch information
ElinaFF committed Dec 19, 2018
2 parents bc14095 + f5eda47 commit b0bc643
Show file tree
Hide file tree
Showing 3 changed files with 10 additions and 4 deletions.
2 changes: 2 additions & 0 deletions core/DeepCCS/model/DeepCCS.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,8 @@ def create_model(self):
"""
Builds a neural net using a set of arguments
"""
if len(self.smiles_encoder.converter) == 0 or len(self.adduct_encoder.converter) == 0:
raise ValueError("Encoders must be fit before creating a model.")
smile_input_layer = Input(shape=(250, len(self.smiles_encoder.converter)), name="smile")
conv = Conv1D(64, kernel_size=4, activation='relu', kernel_initializer='normal')(smile_input_layer)

Expand Down
7 changes: 5 additions & 2 deletions core/DeepCCS/model/splitter.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,10 @@ class SMILESsplitter:
def split(self, smiles):
"""
Split a single SMILES using chemical symbols and characters.
Two-letter chemical symbols that end with a 'c' might not be handled properly.
Nitrogen, Sulfur and Oxygen can be mishandled if they are at the beginning of an aromatic structure (ex: Coccc).
As and Se will be split into two characters if they are found in an aromatic structure.
Only Co is seen in the current dataset and it is handled properly. TODO: better splitting.
:param smiles: The SMILES to split
:return: A list of chemical symbols/characters, in the order they appear in the SMILES
"""
Expand All @@ -35,7 +39,6 @@ def split(self, smiles):
splitted_smiles.append(k + smiles[j + 1])
else:
splitted_smiles.append(k)

elif j != 0 and j < len(smiles) - 1:
if k.isupper() and smiles[j + 1].islower() and smiles[j + 1] != "c":
splitted_smiles.append(k + smiles[j + 1])
Expand All @@ -49,4 +52,4 @@ def split(self, smiles):
pass
else:
splitted_smiles.append(k)
return splitted_smiles
return splitted_smiles
5 changes: 3 additions & 2 deletions core/DeepCCS/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@
import h5py as h5
import sys
from sklearn.metrics import r2_score, mean_absolute_error, median_absolute_error

from .model.splitter import SMILESsplitter

def filter_data(data_table):
"""
Expand All @@ -38,7 +38,8 @@ def filter_data(data_table):
# Remove smiles that are too long
logging.debug("{} items before filtering".format(len(data_table)))
pre_filter = len(data_table)
data = data_table[np.array([len(str(i)) for i in data_table["SMILES"]]) <= MAX_SMILES_LENGTH]
smiles_splitter = SMILESsplitter()
data = data_table[np.array([len(smiles_splitter.split(i)) for i in data_table["SMILES"]]) <= MAX_SMILES_LENGTH]
# Remove empty smiles
data = data[np.array([len(str(i)) for i in data["SMILES"]]) > 0]
data = data.dropna(axis=0, how="any", subset=["SMILES"])
Expand Down

0 comments on commit b0bc643

Please sign in to comment.