cole-group · jthorton · Jan 17, 2024 · Jan 18, 2024 · Jan 18, 2024 · Feb 7, 2024
diff --git a/fegrow/cli/cli.py b/fegrow/cli/cli.py
@@ -3,6 +3,7 @@
 import click
 from fegrow.cli.scoring import score
 from fegrow.cli.utils import Settings
+from fegrow.cli.report import report
 
 
 @click.group()
@@ -11,20 +12,21 @@ def cli():
 
 
 cli.add_command(score)
+cli.add_command(report)
 
 
 @cli.command()
 @click.option(
     "-g",
     "--gnina-path",
-    type=click.Path(exists=True, file_okay=True, dir_okay=False, executable=True),
+    type=click.Path(exists=True, file_okay=True, dir_okay=False, executable=True, resolve_path=True),
     help="The path to the gnina executable which will override the settings.",
 )
 def settings(gnina_path: pathlib.Path):
     """
     Create a runtime settings object for scoring runs which can be configured.
     """
     config = Settings(gnina_path=gnina_path)
-    with open("settings.json") as output:
+    with open("settings.json", "w") as output:
         output.write(config.json(indent=2))
 
diff --git a/fegrow/cli/report.py b/fegrow/cli/report.py
@@ -0,0 +1,105 @@
+import click
+import pathlib
+from typing import Optional
+
+
+@click.command()
+@click.option(
+    '-l',
+    '--ligands',
+    help='The sdf file which contains the scored ligands to create the report for.',
+    type=click.Path(file_okay=True, dir_okay=False, path_type=pathlib.Path, exists=True),
+    multiple=True
+)
+@click.option(
+    "-o",
+    "--output",
+    type=click.Path(dir_okay=True, path_type=pathlib.Path),
+    help="The name of the output folder.",
+)
+@click.option(
+    '-t',
+    '--title',
+    type=click.STRING,
+    help='The title of the dataset which will be used for the report and sdf file names'
+)
+@click.option(
+    '-tn',
+    '--top-n',
+    help='The top N molecules to keep after sorting by predicted affinity',
+    type=click.INT
+)
+def report(
+        ligands: list[pathlib.Path],
+        output: pathlib.Path,
+        title: str,
+        top_n: Optional[int] = None
+):
+    """
+    Generate an interactive HTML report of the molecules from the scored SDF files and an SDF file ordered by the top
+    performing molecules. Use top_n to restrict the report and sdf file to only include n molecules.
+    """
+    from rdkit import Chem
+    from fegrow.cli.utils import draw_mol, has_bad_geometry
+    import pandas as pd
+    import tqdm
+    import panel
+    import bokeh.models.widgets.tables
+
+    output.mkdir(exist_ok=True, parents=True)
+    # load all best scoring ligands
+    molecules_and_affinities = []
+    for file in ligands:
+        # load the low energy ligands and the predicted affinity
+        supplier = Chem.SDMolSupplier(file.as_posix(), removeHs=True)
+        for molecule in supplier:
+            if not has_bad_geometry(molecule):
+                continue
+            try:
+                molecules_and_affinities.append((molecule, float(molecule.GetProp('cnnaffinity'))))
+            except KeyError:
+                # catch cases where the sd tags were not writen
+                continue
+
+    click.echo(f'Total number of molecules and predictions: {len(molecules_and_affinities)}')
+
+    # sort all molecules by predicted affinity
+    ranked_all_mols = sorted(molecules_and_affinities, key=lambda x: x[1])
+    # write out all molecules to a single sdf sorted by affinity
+    output_file = output.joinpath(f'{title}.sdf')
+    with Chem.SDWriter(output_file.as_posix()) as sdf_out:
+        rows = []
+        for mol, affinity in tqdm.tqdm(ranked_all_mols[:top_n], desc="Creating report...", ncols=80):
+            try:
+                smiles = mol.GetProp('smiles')
+            except KeyError:
+                smiles = Chem.MolToSmiles(mol)
+                mol.SetProp('smiles', smiles)
+            rows.append(
+                {
+                    "Smiles": smiles,
+                    "Molecule": draw_mol(smiles),
+                    "CNNaffinity": affinity,
+                    "IC50 (nM)": mol.GetProp('cnnaffinityIC50').split()[0],
+                }
+            )
+            sdf_out.write(mol)
+    # create the report
+    df = pd.DataFrame(rows)
+
+    number_format = bokeh.models.widgets.tables.NumberFormatter(format="0.0000")
+    layout = panel.Column(
+        panel.widgets.Tabulator(
+            df,
+            show_index=False,
+            selectable=False,
+            disabled=True,
+            formatters={"Smiles": "html", "Molecule": "html", "CNNaffinity": number_format,
+                        "IC50 (nM)": number_format},
+            configuration={"rowHeight": 400},
+        ),
+        sizing_mode="stretch_width",
+        scroll=True
+    )
+
+    layout.save(output.joinpath(f'{title}.html').as_posix(), title=title, embed=True)
diff --git a/fegrow/cli/scoring.py b/fegrow/cli/scoring.py
@@ -1,5 +1,4 @@
 import pathlib
-import time
 
 import click
 from typing import Optional
@@ -9,125 +8,157 @@
 @click.option(
     "-c",
     "--core-ligand",
-    type=click.Path(exists=True, file_okay=True, dir_okay=False),
+    type=click.Path(exists=True, file_okay=True, dir_okay=False, path_type=pathlib.Path),
     help="The path to the SDF file of the core ligand which will be used to restrain the geometries of the scored ligands this should be a substructure of the ligands to be scored.",
 )
 @click.option(
-    "-l" "--ligands",
-    type=click.Path(exists=True, file_okay=True, dir_okay=False),
+    "-l",
+    "--ligands",
+    type=click.Path(exists=True, file_okay=True, dir_okay=False, path_type=pathlib.Path),
     help="The path to the ligands to be scored can be in any supported format by RDKit such as csv. smiles or SDF.",
 )
 @click.option(
     "-r",
     "--receptor",
-    type=click.Path(exists=True, file_okay=True, dir_okay=False),
+    type=click.Path(exists=True, file_okay=True, dir_okay=False, path_type=pathlib.Path),
     help="The path of the receptor PDB file, this should only contain the receptor and not the reference ligand.",
 )
 @click.option(
     "-s",
     "--settings",
-    type=click.Path(exists=True, file_okay=True, dir_okay=False),
+    type=click.Path(exists=True, file_okay=True, dir_okay=False, path_type=pathlib.Path),
     help="The path of the settings file which configures the scoring run.",
 )
 @click.option(
     "-o",
     "--output",
-    type=click.Path(dir_okay=True),
+    type=click.Path(dir_okay=True, path_type=pathlib.Path),
     help="The name of the output folder.",
 )
 @click.option(
     "-g",
     "--gnina-path",
-    type=click.Path(exists=True, file_okay=True, dir_okay=False, executable=True),
+    type=click.Path(exists=True, file_okay=True, dir_okay=False, executable=True, resolve_path=True, path_type=pathlib.Path),
     help="The path to the gnina executable which will override the settings.",
 )
+@click.option(
+    '-t',
+    '--threads',
+    default=1,
+    show_default=True,
+    type=click.INT,
+    help='The number of threads per worker.'
+)
+@click.option(
+    '-w',
+    '--workers',
+    default=4,
+    show_default=True,
+    type=click.INT,
+    help='The number of workers to use.'
+
+)
 def score(
     core_ligand: pathlib.Path,
     ligands: pathlib.Path,
     receptor: pathlib.Path,
     output: pathlib.Path,
     settings: Optional[pathlib.Path] = None,
     gnina_path: Optional[pathlib.Path] = None,
+    threads: int = 1,
+    workers: int = 4,
 ):
     """
-    Score the list of input ligands using Gnina after optimising in the receptor.
+    Score the list of input ligands using Gnina after optimising in the receptor. Tasks are distributed using Dask.
     """
     from dask.distributed import Client
     from rdkit import Chem
     import traceback
     import dask
     import tqdm
-
+    import warnings
+    import os
+    import logging
     from fegrow.cli.utils import score_ligand, Settings, load_target_ligands
-
-    try:
-        from mycluster import create_cluster
-    except ImportError:
-
-        def create_cluster():
-            from dask.distributed import LocalCluster
-
-            return LocalCluster()
-
-    client = Client(create_cluster())
-    # create the cluster
-    click.echo(f"Client created {client}")
-
-    if settings is not None:
-        config = Settings.parse_file(settings)
-        if gnina_path is not None:
-            config.gnina_path = gnina_path
-    else:
-        # build the base settings object this needs the gnina path
-        config = Settings(gnina_path=gnina_path)
-
-    click.echo(f"Loading core ligand from {core_ligand}")
-    # we remove all Hs rather than specific ones at the attachment point
-    core = Chem.MolFromMolFile(core_ligand, removeHs=True)
-    core_dask = dask.delayed(core)
-
-    # load the target ligands
-    target_smiles = load_target_ligands(ligand_file=ligands)
-
-    # build a list of tasks to submit
-    for_submission = [
-        score_ligand(
-            core_ligand=core_dask,
-            target_smiles=smiles,
-            receptor=receptor,
-            settings=config,
-        )
-        for smiles in target_smiles
-    ]
-
-    submitted = client.compute(for_submission)
-    jobs = dict((job, i) for i, job in enumerate(submitted))
-
-    output_path = pathlib.Path(output)
-    output_path.mkdir(exist_ok=True)
-
-    molecule_output = Chem.SDWriter(output_path.joinpath("scored_molecules.sdf"))
-    with tqdm.tqdm(total=len(submitted), desc="Scoring molecules...", ncols=80) as pbar:
-        while len(jobs) > 0:
-            for job, index in jobs.items():
-                if not job.done():
-                    continue
-
-                # remove the job
-                del jobs[job]
-                pbar.update(1)
-
-                try:
-                    mol_data = job.result()
-                    rmol = mol_data.pop("molecule")
-                    # recover the properties (they are not passed with serialisation)
-                    [rmol.SetProp(k, str(v)) for k, v in mol_data.items()]
-                    # write the molecule out when they complete incase we crash
-                    molecule_output.write(rmol)
-                except Exception as E:
-                    print("error for index, ", index)
-                    traceback.print_exc()
-
-            time.sleep(5)
-
-    click.echo("All molecules scored")
+    from fegrow.package import RMol
+    from dask.distributed import LocalCluster
+    import time
+
+
+    # hide warnings and logs from openff-toolkit
+    with warnings.catch_warnings():
+        warnings.simplefilter("ignore")
+        os.environ["PYTHONWARNINGS"] = "ignore"
+        logging.getLogger('fegrow').setLevel(logging.ERROR)
+
+        client = Client(LocalCluster(threads_per_worker=threads, n_workers=workers))
+        # create the cluster
+        click.echo(f"Client created {client}")
+
+        if settings is not None:
+            config = Settings.parse_file(settings)
+            if gnina_path is not None:
+                config.gnina_path = gnina_path.as_posix()
+        else:
+            # build the base settings object this needs the gnina path
+            config = Settings(gnina_path=gnina_path.as_posix())
+        # set the gnina directory
+        RMol.set_gnina(loc=config.gnina_path)
+        click.echo(f'Setting Gnina path to: {config.gnina_path}')
+
+        click.echo(f"Loading core ligand from {core_ligand}")
+        # we remove all Hs rather than specific ones at the attachment point
+        core = Chem.MolFromMolFile(core_ligand.as_posix(), removeHs=True)
+        core_dask = dask.delayed(core)
+
+        # load the target ligands
+        target_smiles = load_target_ligands(ligand_file=ligands)
+
+        # build a list of tasks to submit
+        for_submission = [
+            score_ligand(
+                core_ligand=core_dask,
+                target_smiles=smiles,
+                receptor=receptor,
+                settings=config,
+            )
+            for smiles in target_smiles
+        ]
+
+        submitted = client.compute(for_submission)
+
+        output.mkdir(exist_ok=True)
+
+        molecule_output = Chem.SDWriter(output.joinpath("scored_molecules.sdf").as_posix())
+        with tqdm.tqdm(total=len(submitted), desc="Scoring molecules...", ncols=80) as pbar:
+            while len(submitted) > 0:
+                to_remove = []
+                for job in submitted:
+                    if not job.done():
+                        continue
+
+                    # remove the job
+                    to_remove.append(job)
+                    pbar.update(1)
+
+                    try:
+                        mol_data = job.result()
+                        # if the molecule could not be scored as it had no conformers do not save the geometry
+                        if mol_data['cnnaffinity'] == 0:
+                            continue
+
+                        rmol = mol_data.pop("molecule")
+                        # recover the properties (they are not passed with serialisation)
+                        [rmol.SetProp(k, str(v)) for k, v in mol_data.items()]
+                        # write the molecule out when they complete incase we crash
+                        molecule_output.write(rmol)
+                    except Exception as E:
+                        print("error for molecule")
+                        traceback.print_exc()
+
+                # remove jobs before next round
+                for job in to_remove:
+                    submitted.remove(job)
+                time.sleep(5)
+
+        click.echo("All molecules scored")