Skip to content

Commit

Permalink
commented code
Browse files Browse the repository at this point in the history
  • Loading branch information
PaulaKramer committed Feb 1, 2025
1 parent 368be76 commit b1cf797
Show file tree
Hide file tree
Showing 2 changed files with 30 additions and 170 deletions.
40 changes: 29 additions & 11 deletions kinfraglib/filters/plots.py
Original file line number Diff line number Diff line change
Expand Up @@ -311,6 +311,16 @@ def retro_routes_fragments(fragment_library, evaluate, subpocket, molsPerRow=10)


def sample_subset(fragment_library, sample):
"""
Samples a random subset of the given fragment library
----------
fragment_library : dict
fragment library organized in subpockets
sample : float
fraction of data to be sampled
"""

fragment_library_subset = {}
if sample != 1.0:
for subpocket in fragment_library.keys():
Expand All @@ -323,18 +333,10 @@ def sample_subset(fragment_library, sample):

def create_tsne_embeddings(fragment_library):
"""
Creates t-SNE plots comparing
a) pre-filtered and reduced fragment library
b) pre-filtered and custom filtered fragment library
c) pre-filtered, reduced and custom fragment library
and prints number of fragments in the subsets.
Creates the t-SNE embedding for all following t-SNE plots
----------
fragment_library : dict
fragment library organized in subpockets containing boolean columns `bool_reduced` and
`bool_custom` defining if the fragments are part of the subsets
sample : float
fraction of dataset to be sampled, `1.0` if the whole dataset should be plotted
fragment library organized in subpockets
"""

Expand All @@ -353,7 +355,21 @@ def create_tsne_embeddings(fragment_library):


def create_tsne_plots(crds_embedded, fragment_library):
"""
Creates t-SNE plots comparing
a) pre-filtered and reduced fragment library
b) pre-filtered and custom filtered fragment library
c) pre-filtered, reduced and custom fragment library
and prints number of fragments in the subsets.
----------
crds_embedded : list
t-SNE embedding of the fragment library
fragment_library : dict
fragment library organized in subpockets containing boolean columns `bool_reduced` and
`bool_custom` defining if the fragments are part of the subsets
"""
fragment_library_concat = pd.concat(fragment_library).reset_index(drop=True)
tsne_df = pd.DataFrame(crds_embedded, columns=["X", "Y"])
# add bool column from filtering steps here
Expand Down Expand Up @@ -472,8 +488,10 @@ def create_tsne_plots_filters(crds_embedded, fragment_library, saved_filter_resu
Creates t-SNE plots with accepted (green) and rejected (red) fragments for each filtering step.
----------
crds_embedded : list
t-SNE embeddings of the fragment library
fragment_library : dict
    fragment library organized in subpockets containing boolean columuns
fragment library organized in subpockets containing boolean columns
saved_filter_results : dataframe
loaded file with saved filter results
Expand Down
160 changes: 1 addition & 159 deletions notebooks/custom_kinfraglib/2_2_custom_filters_analysis.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -1312,164 +1312,6 @@
"### 2.1. Comparing fragment library sets"
]
},
{
"cell_type": "code",
"execution_count": 16,
"id": "315f9035",
"metadata": {},
"outputs": [],
"source": [
"from sklearn.decomposition import PCA\n",
"from sklearn.manifold import TSNE\n",
"from rdkit.Chem import Draw, MACCSkeys\n",
"import seaborn as sns \n",
"import matplotlib.pyplot as plt \n",
"\n",
"def debug_tsne(fragment_library, sample=1.0):\n",
" \"\"\"\n",
" Creates t-SNE plots comparing\n",
" a) pre-filtered and reduced fragment library\n",
" b) pre-filtered and custom filtered fragment library\n",
" c) pre-filtered, reduced and custom fragment library\n",
"\n",
" and prints number of fragments in the subsets.\n",
" ----------\n",
" fragment_library : dict\n",
" fragment library organized in subpockets containing boolean columuns `bool_reduced`and\n",
" `bool_custom`defining if the fragments are part of the subsets\n",
"\n",
" \"\"\"\n",
"\n",
" fragment_library_subset = {}\n",
" if sample != 1.0: \n",
" for subpocket in fragment_library.keys(): \n",
" sample_num = int(len(fragment_library[subpocket]) * sample)\n",
" fragment_library_subset[subpocket] = fragment_library[subpocket].sample(sample_num, random_state=1)\n",
"\n",
" fragment_library_concat = pd.concat(fragment_library_subset).reset_index(drop=True)\n",
" fragment_library_concat[\"maccs\"] = fragment_library_concat.ROMol.apply(\n",
" MACCSkeys.GenMACCSKeys\n",
" )\n",
"\n",
" pca = PCA(n_components=50)\n",
" crds = pca.fit_transform(list(fragment_library_concat[\"maccs\"]))[:1500]\n",
"\n",
" crds_embedded = TSNE(\n",
" n_components=2, init=\"pca\", learning_rate=\"auto\", random_state=0\n",
" ).fit_transform(crds)\n",
"\n",
" tsne_df = pd.DataFrame(crds_embedded, columns=[\"X\", \"Y\"])\n",
" # add bool column from filtering steps here\n",
" tsne_df[\"reduced\"] = fragment_library_concat[\"bool_reduced\"]\n",
" tsne_df[\"custom\"] = fragment_library_concat[\"bool_custom\"]\n",
" # create column defining if fragment is\n",
" # *excluded in both subsets (0)\n",
" # *included in custom (1)\n",
" # *included in reduced (2)\n",
" # *included in both subsets (3)\n",
" bool_compare = []\n",
" for i, row in fragment_library_concat.iterrows():\n",
" if row[\"bool_reduced\"] == 0 and row[\"bool_custom\"] == 0:\n",
" bool_compare.append(0)\n",
" elif row[\"bool_reduced\"] == 0 and row[\"bool_custom\"] == 1:\n",
" bool_compare.append(1)\n",
" elif row[\"bool_reduced\"] == 1 and row[\"bool_custom\"] == 0:\n",
" bool_compare.append(2)\n",
" elif row[\"bool_reduced\"] == 1 and row[\"bool_custom\"] == 1:\n",
" bool_compare.append(3)\n",
" tsne_df[\"compare\"] = bool_compare\n",
" num0 = len(tsne_df[tsne_df[\"compare\"] == 0])\n",
" num1 = len(tsne_df[tsne_df[\"compare\"] == 1])\n",
" num2 = len(tsne_df[tsne_df[\"compare\"] == 2])\n",
" num3 = len(tsne_df[tsne_df[\"compare\"] == 3])\n",
"\n",
" # create tsne plots\n",
" plt.figure(figsize=(18, 10))\n",
" plt.subplot(2, 2, 1)\n",
" sns.scatterplot(\n",
" data=tsne_df.query(\"reduced == 0\"),\n",
" x=\"X\",\n",
" y=\"Y\",\n",
" color=\"lightcoral\",\n",
" alpha=0.5,\n",
" label=\"excluded\",\n",
" ).set_title(\"pre_filtered vs. reduced\")\n",
" sns.scatterplot(\n",
" data=tsne_df.query(\"reduced == 1\"),\n",
" x=\"X\",\n",
" y=\"Y\",\n",
" color=\"green\",\n",
" alpha=0.5,\n",
" label=\"included\",\n",
" )\n",
"\n",
" plt.subplot(2, 2, 2)\n",
" sns.scatterplot(\n",
" data=tsne_df.query(\"custom == 0\"),\n",
" x=\"X\",\n",
" y=\"Y\",\n",
" color=\"lightcoral\",\n",
" alpha=0.5,\n",
" label=\"excluded\",\n",
" ).set_title(\"pre-filtered vs. custom\")\n",
" sns.scatterplot(\n",
" data=tsne_df.query(\"custom == 1\"),\n",
" x=\"X\",\n",
" y=\"Y\",\n",
" color=\"green\",\n",
" alpha=0.5,\n",
" label=\"included\",\n",
" )\n",
"\n",
" plt.subplot(2, 2, 3)\n",
" sns.scatterplot(\n",
" data=tsne_df.query(\"compare == 0\"),\n",
" x=\"X\",\n",
" y=\"Y\",\n",
" color=\"lightcoral\",\n",
" alpha=0.5,\n",
" label=\"excluded in both subsets\",\n",
" ).set_title(\"pre-filtered vs. reduced vs. custom\")\n",
" sns.scatterplot(\n",
" data=tsne_df.query(\"compare == 1\"),\n",
" x=\"X\",\n",
" y=\"Y\",\n",
" color=\"orange\",\n",
" alpha=0.5,\n",
" label=\"included in custom subset\",\n",
" )\n",
" sns.scatterplot(\n",
" data=tsne_df.query(\"compare == 2\"),\n",
" x=\"X\",\n",
" y=\"Y\",\n",
" color=\"lightblue\",\n",
" alpha=0.5,\n",
" label=\"included in reduced subset\",\n",
" )\n",
" sns.scatterplot(\n",
" data=tsne_df.query(\"compare == 3\"),\n",
" x=\"X\",\n",
" y=\"Y\",\n",
" color=\"green\",\n",
" alpha=0.5,\n",
" label=\"included in both subsets\",\n",
" )\n",
" plt.legend(loc=\"upper right\", bbox_to_anchor=(1.425, 1), ncol=1)\n",
"\n",
" plt.show()\n",
" num_lists = (len(tsne_df[\"compare\"]), num0, num1, num2, num3)\n",
" print(\n",
" \"\"\"%s Pre-filtered fragments.\n",
" Number of fragments excluded in both datasets: %s\n",
" Number of fragments included in the custom dataset and excluded in the reduced dataset: %s\n",
" Number of fragments included in the reduced dataset and excluded in the custom dataset: %s\n",
" Number of fragments in both datasets: %s \"\"\"\n",
" % (num_lists)\n",
" )\n",
" tsne_df[\"smiles\"] = fragment_library_concat[\"smiles\"]\n",
" return tsne_df"
]
},
{
"cell_type": "code",
"execution_count": 17,
Expand Down Expand Up @@ -1499,7 +1341,7 @@
}
],
"source": [
"fragment_library_subset = filters.plots.sample_subset(fragment_library, 0.8)\n",
"fragment_library_subset = filters.plots.sample_subset(fragment_library, 0.6)\n",
"crds_embedded = filters.plots.create_tsne_embeddings(fragment_library_subset)\n",
"tsne_df = filters.plots.create_tsne_plots(crds_embedded, fragment_library_subset)"
]
Expand Down

0 comments on commit b1cf797

Please sign in to comment.