Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Cli cleanup #65

Draft
wants to merge 5 commits into
base: master
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 4 additions & 2 deletions fegrow/cli/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import click
from fegrow.cli.scoring import score
from fegrow.cli.utils import Settings
from fegrow.cli.report import report


@click.group()
Expand All @@ -11,20 +12,21 @@ def cli():


cli.add_command(score)
cli.add_command(report)


@cli.command()
@click.option(
"-g",
"--gnina-path",
type=click.Path(exists=True, file_okay=True, dir_okay=False, executable=True),
type=click.Path(exists=True, file_okay=True, dir_okay=False, executable=True, resolve_path=True),
help="The path to the gnina executable which will override the settings.",
)
def settings(gnina_path: pathlib.Path):
"""
Create a runtime settings object for scoring runs which can be configured.
"""
config = Settings(gnina_path=gnina_path)
with open("settings.json") as output:
with open("settings.json", "w") as output:
output.write(config.json(indent=2))

105 changes: 105 additions & 0 deletions fegrow/cli/report.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,105 @@
import click
import pathlib
from typing import Optional


@click.command()
@click.option(
'-l',
'--ligands',
help='The sdf file which contains the scored ligands to create the report for.',
type=click.Path(file_okay=True, dir_okay=False, path_type=pathlib.Path, exists=True),
multiple=True
)
@click.option(
"-o",
"--output",
type=click.Path(dir_okay=True, path_type=pathlib.Path),
help="The name of the output folder.",
)
@click.option(
'-t',
'--title',
type=click.STRING,
help='The title of the dataset which will be used for the report and sdf file names'
)
@click.option(
'-tn',
'--top-n',
help='The top N molecules to keep after sorting by predicted affinity',
type=click.INT
)
def report(
ligands: list[pathlib.Path],
output: pathlib.Path,
title: str,
top_n: Optional[int] = None
):
"""
Generate an interactive HTML report of the molecules from the scored SDF files and an SDF file ordered by the top
performing molecules. Use top_n to restrict the report and sdf file to only include n molecules.
"""
from rdkit import Chem
from fegrow.cli.utils import draw_mol, has_bad_geometry
import pandas as pd
import tqdm
import panel
import bokeh.models.widgets.tables

output.mkdir(exist_ok=True, parents=True)
# load all best scoring ligands
molecules_and_affinities = []
for file in ligands:
# load the low energy ligands and the predicted affinity
supplier = Chem.SDMolSupplier(file.as_posix(), removeHs=True)
for molecule in supplier:
if not has_bad_geometry(molecule):
continue
try:
molecules_and_affinities.append((molecule, float(molecule.GetProp('cnnaffinity'))))
except KeyError:
# catch cases where the sd tags were not writen
continue

click.echo(f'Total number of molecules and predictions: {len(molecules_and_affinities)}')

# sort all molecules by predicted affinity
ranked_all_mols = sorted(molecules_and_affinities, key=lambda x: x[1])
# write out all molecules to a single sdf sorted by affinity
output_file = output.joinpath(f'{title}.sdf')
with Chem.SDWriter(output_file.as_posix()) as sdf_out:
rows = []
for mol, affinity in tqdm.tqdm(ranked_all_mols[:top_n], desc="Creating report...", ncols=80):
try:
smiles = mol.GetProp('smiles')
except KeyError:
smiles = Chem.MolToSmiles(mol)
mol.SetProp('smiles', smiles)
rows.append(
{
"Smiles": smiles,
"Molecule": draw_mol(smiles),
"CNNaffinity": affinity,
"IC50 (nM)": mol.GetProp('cnnaffinityIC50').split()[0],
}
)
sdf_out.write(mol)
# create the report
df = pd.DataFrame(rows)

number_format = bokeh.models.widgets.tables.NumberFormatter(format="0.0000")
layout = panel.Column(
panel.widgets.Tabulator(
df,
show_index=False,
selectable=False,
disabled=True,
formatters={"Smiles": "html", "Molecule": "html", "CNNaffinity": number_format,
"IC50 (nM)": number_format},
configuration={"rowHeight": 400},
),
sizing_mode="stretch_width",
scroll=True
)

layout.save(output.joinpath(f'{title}.html').as_posix(), title=title, embed=True)
195 changes: 113 additions & 82 deletions fegrow/cli/scoring.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
import pathlib
import time

import click
from typing import Optional
Expand All @@ -9,125 +8,157 @@
@click.option(
"-c",
"--core-ligand",
type=click.Path(exists=True, file_okay=True, dir_okay=False),
type=click.Path(exists=True, file_okay=True, dir_okay=False, path_type=pathlib.Path),
help="The path to the SDF file of the core ligand which will be used to restrain the geometries of the scored ligands this should be a substructure of the ligands to be scored.",
)
@click.option(
"-l" "--ligands",
type=click.Path(exists=True, file_okay=True, dir_okay=False),
"-l",
"--ligands",
type=click.Path(exists=True, file_okay=True, dir_okay=False, path_type=pathlib.Path),
help="The path to the ligands to be scored can be in any supported format by RDKit such as csv. smiles or SDF.",
)
@click.option(
"-r",
"--receptor",
type=click.Path(exists=True, file_okay=True, dir_okay=False),
type=click.Path(exists=True, file_okay=True, dir_okay=False, path_type=pathlib.Path),
help="The path of the receptor PDB file, this should only contain the receptor and not the reference ligand.",
)
@click.option(
"-s",
"--settings",
type=click.Path(exists=True, file_okay=True, dir_okay=False),
type=click.Path(exists=True, file_okay=True, dir_okay=False, path_type=pathlib.Path),
help="The path of the settings file which configures the scoring run.",
)
@click.option(
"-o",
"--output",
type=click.Path(dir_okay=True),
type=click.Path(dir_okay=True, path_type=pathlib.Path),
help="The name of the output folder.",
)
@click.option(
"-g",
"--gnina-path",
type=click.Path(exists=True, file_okay=True, dir_okay=False, executable=True),
type=click.Path(exists=True, file_okay=True, dir_okay=False, executable=True, resolve_path=True, path_type=pathlib.Path),
help="The path to the gnina executable which will override the settings.",
)
@click.option(
'-t',
'--threads',
default=1,
show_default=True,
type=click.INT,
help='The number of threads per worker.'
)
@click.option(
'-w',
'--workers',
default=4,
show_default=True,
type=click.INT,
help='The number of workers to use.'

)
def score(
core_ligand: pathlib.Path,
ligands: pathlib.Path,
receptor: pathlib.Path,
output: pathlib.Path,
settings: Optional[pathlib.Path] = None,
gnina_path: Optional[pathlib.Path] = None,
threads: int = 1,
workers: int = 4,
):
"""
Score the list of input ligands using Gnina after optimising in the receptor.
Score the list of input ligands using Gnina after optimising in the receptor. Tasks are distributed using Dask.
"""
from dask.distributed import Client
from rdkit import Chem
import traceback
import dask
import tqdm

import warnings
import os
import logging
from fegrow.cli.utils import score_ligand, Settings, load_target_ligands

try:
from mycluster import create_cluster
except ImportError:

def create_cluster():
from dask.distributed import LocalCluster

return LocalCluster()

client = Client(create_cluster())
# create the cluster
click.echo(f"Client created {client}")

if settings is not None:
config = Settings.parse_file(settings)
if gnina_path is not None:
config.gnina_path = gnina_path
else:
# build the base settings object this needs the gnina path
config = Settings(gnina_path=gnina_path)

click.echo(f"Loading core ligand from {core_ligand}")
# we remove all Hs rather than specific ones at the attachment point
core = Chem.MolFromMolFile(core_ligand, removeHs=True)
core_dask = dask.delayed(core)

# load the target ligands
target_smiles = load_target_ligands(ligand_file=ligands)

# build a list of tasks to submit
for_submission = [
score_ligand(
core_ligand=core_dask,
target_smiles=smiles,
receptor=receptor,
settings=config,
)
for smiles in target_smiles
]

submitted = client.compute(for_submission)
jobs = dict((job, i) for i, job in enumerate(submitted))

output_path = pathlib.Path(output)
output_path.mkdir(exist_ok=True)

molecule_output = Chem.SDWriter(output_path.joinpath("scored_molecules.sdf"))
with tqdm.tqdm(total=len(submitted), desc="Scoring molecules...", ncols=80) as pbar:
while len(jobs) > 0:
for job, index in jobs.items():
if not job.done():
continue

# remove the job
del jobs[job]
pbar.update(1)

try:
mol_data = job.result()
rmol = mol_data.pop("molecule")
# recover the properties (they are not passed with serialisation)
[rmol.SetProp(k, str(v)) for k, v in mol_data.items()]
# write the molecule out when they complete incase we crash
molecule_output.write(rmol)
except Exception as E:
print("error for index, ", index)
traceback.print_exc()

time.sleep(5)

click.echo("All molecules scored")
from fegrow.package import RMol
from dask.distributed import LocalCluster
import time


# hide warnings and logs from openff-toolkit
with warnings.catch_warnings():
warnings.simplefilter("ignore")
os.environ["PYTHONWARNINGS"] = "ignore"
logging.getLogger('fegrow').setLevel(logging.ERROR)

client = Client(LocalCluster(threads_per_worker=threads, n_workers=workers))
# create the cluster
click.echo(f"Client created {client}")

if settings is not None:
config = Settings.parse_file(settings)
if gnina_path is not None:
config.gnina_path = gnina_path.as_posix()
else:
# build the base settings object this needs the gnina path
config = Settings(gnina_path=gnina_path.as_posix())
# set the gnina directory
RMol.set_gnina(loc=config.gnina_path)
click.echo(f'Setting Gnina path to: {config.gnina_path}')

click.echo(f"Loading core ligand from {core_ligand}")
# we remove all Hs rather than specific ones at the attachment point
core = Chem.MolFromMolFile(core_ligand.as_posix(), removeHs=True)
core_dask = dask.delayed(core)

# load the target ligands
target_smiles = load_target_ligands(ligand_file=ligands)

# build a list of tasks to submit
for_submission = [
score_ligand(
core_ligand=core_dask,
target_smiles=smiles,
receptor=receptor,
settings=config,
)
for smiles in target_smiles
]

submitted = client.compute(for_submission)

output.mkdir(exist_ok=True)

molecule_output = Chem.SDWriter(output.joinpath("scored_molecules.sdf").as_posix())
with tqdm.tqdm(total=len(submitted), desc="Scoring molecules...", ncols=80) as pbar:
while len(submitted) > 0:
to_remove = []
for job in submitted:
if not job.done():
continue

# remove the job
to_remove.append(job)
pbar.update(1)

try:
mol_data = job.result()
# if the molecule could not be scored as it had no conformers do not save the geometry
if mol_data['cnnaffinity'] == 0:
continue

rmol = mol_data.pop("molecule")
# recover the properties (they are not passed with serialisation)
[rmol.SetProp(k, str(v)) for k, v in mol_data.items()]
# write the molecule out when they complete incase we crash
molecule_output.write(rmol)
except Exception as E:
print("error for molecule")
traceback.print_exc()

# remove jobs before next round
for job in to_remove:
submitted.remove(job)
time.sleep(5)

click.echo("All molecules scored")
Loading
Loading