-
Notifications
You must be signed in to change notification settings - Fork 3
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Framework example for non-star shaped analysis #55
base: master
Are you sure you want to change the base?
Changes from 5 commits
1702e98
4d50746
bc85711
07f35e5
15055df
5c5bdfc
ad16020
be93581
85d4233
5f78439
0ab2c69
19082a0
0c7c5b9
016dc38
9796d19
8a98eb9
759023c
8143267
08bd9a8
46da12e
1cedc1b
f9d0b07
eaf2c05
f94502b
870c4bb
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Large diffs are not rendered by default.
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,151 @@ | ||
from arsenic import stats | ||
import networkx as nx | ||
import numpy as np | ||
import logging | ||
|
||
_logger = logging.getLogger("DiffNet") | ||
|
||
|
||
EXPERIMENTAL_ERROR = 0.5 ## pIC50 | ||
|
||
def _ic50_to_dG(ic50, s_conc=375E-9, Km=40E-6): | ||
"""Converts IC50 (in M units) to DG | ||
|
||
Parameters | ||
---------- | ||
ic50 : float | ||
IC50 in M | ||
s_conc : float, default=375E-9 | ||
Substrate concentration in M | ||
Km : float, default=40E-6 | ||
Substrate concentration for half-maximal enzyme activity | ||
|
||
Returns | ||
------- | ||
type | ||
Description of returned object. | ||
|
||
""" | ||
if ic50 > 1E-5: | ||
_logger.warning('Expecting IC50 in M units. Please check.') | ||
Ki = ic50 / (1 + (s_conc/Km)) | ||
return np.log(Ki) | ||
|
||
|
||
def separate_graphs(graphs): | ||
graphs_to_replace = [] | ||
new_graphs = [] | ||
# now need to check that all of the graphs are weakly connected | ||
# and if not, split them up | ||
for name, g in graphs.items(): | ||
if not nx.is_weakly_connected(g): | ||
graphs_to_replace.append(name) | ||
for j, subgraph in enumerate(nx.weakly_connected_components(g)): | ||
# this should perseve all necessary nodes/edges | ||
new_graphs[name+f'_{j}'] = reduce_graph(g, subgraph) | ||
# now get rid of non-connected graphs | ||
for name in graphs_to_replace: | ||
del graphs[name] | ||
|
||
# and add the new graphs | ||
graphs.update(new_graphs) | ||
|
||
return graphs | ||
|
||
|
||
def reduce_graph(g, nodes_to_keep): | ||
""" Takes a big graph and retains only required nodes | ||
This can be useful for minimising large graphs, | ||
or breaking down unconnected graphs | ||
|
||
Any information of the graph, nodes (to be kept), and edges (where both nodes are to be kept) are retained | ||
|
||
Parameters | ||
---------- | ||
g : nx.DiGraph | ||
Large graph to simplify | ||
nodes_to_keep : list | ||
List of nodes that should be retained | ||
|
||
Returns | ||
------- | ||
nx.DiGraph | ||
Smaller subset graph of g | ||
|
||
""" | ||
import copy | ||
new_g = copy.deepcopy(g) | ||
new_g.remove_nodes_from([i for i in new_g.nodes() | ||
if i not in nodes_to_keep]) | ||
return new_g | ||
|
||
|
||
def combine_relative_free_energies(results): | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think we'll eventually want to transition this to taking a There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. (I think we will want to merge #65 before doing this anyway) |
||
graphs = {} | ||
for result in results: | ||
protein = result['details']['protein'].split('/')[-1] | ||
if protein not in graphs: | ||
graphs[protein] = nx.DiGraph() | ||
graph = graphs[protein] | ||
# check for empty results | ||
if result['analysis']['binding']['delta_f'] is not None and \ | ||
result['analysis']['binding']['ddelta_f'] is not None: | ||
# could just add all information from the JSON onto the nodes/edges | ||
graph.add_edge(result['details']['start_title'], | ||
result['details']['end_title'], | ||
f_ij=result['analysis']['binding']['delta_f'], | ||
f_dij=result['analysis']['binding']['ddelta_f'], | ||
complex_overlap=result['analysis']['complex_phase']['free_energy']['bar_overlap'], | ||
solvent_overlap=result['analysis']['solvent_phase']['free_energy']['bar_overlap']) | ||
|
||
# see if either ligand has an experimental affinity associated node | ||
for state in ['start', 'end']: | ||
if f'{state}_pIC50' in result['details'].keys(): | ||
for graph in graphs.values(): | ||
pIC50 = float(result['details'][f'{state}_pIC50']) | ||
title = result['details'][f'{state}_title'] | ||
if graph.has_node(result['details'][f'{state}_title']): | ||
# see if it already has an pIC50 | ||
if 'pIC50' in graph.nodes[title]: | ||
assert(pIC50 == graph.nodes[title]['pIC50']), \ | ||
f"Trying to add an pIC50, {float(result['details'][f'{state}_pIC50'])}, \ | ||
that disagrees with a previous value for \ | ||
node {graph.nodes[title]}" | ||
else: | ||
# or add the pIC50 if it's not available | ||
graph.nodes[title]['pIC50'] = pIC50 | ||
ic50 = 10**(-pIC50) | ||
dg = _ic50_to_dG(ic50) | ||
graph.nodes[title]['exp_DG'] = dg | ||
exp_dDG = _ic50_to_dG(10**(-EXPERIMENTAL_ERROR)) | ||
graph.nodes[title]['exp_dDG'] = exp_dDG | ||
|
||
# TODO this can be wrapped in a function | ||
_logger.info(f'There are {len(graphs)} unique protein files used') | ||
graphs = separate_graphs(graphs) | ||
_logger.info(f'There are {len(graphs)} graphs \ | ||
after splitting into weakly connected sets') | ||
|
||
# now combine the relative simulations to get a per-ligand estimate | ||
for name, graph in graphs.items(): | ||
f_i, C = stats.mle(graph, factor='f_ij', node_factor='exp_DG') | ||
errs = np.diag(C) | ||
|
||
exp_values = [n[1]['exp_DG'] for n in graph.nodes(data=True) | ||
if 'exp_DG' in n[1]] | ||
if len(exp_values) == 0: | ||
_logger.warning(f'No experimental value in graph {name}. \ | ||
Results are only useful for relative comparisions \ | ||
within this set, NOT with other sets and should not \ | ||
be considered as absolute predictions') | ||
for dg, dg_err, node in zip(f_i, errs, graph.nodes(data=True)): | ||
node[1]['rel_f_i'] = dg | ||
node[1]['rel_df_i'] = dg_err | ||
else: | ||
for dg, dg_err, node in zip(f_i, errs, graph.nodes(data=True)): | ||
# shift the calc RBFE to the known experimental values | ||
node[1]['f_i'] = dg | ||
node[1]['df_i'] = dg_err | ||
|
||
# not sure how to best return from this script | ||
return graphs | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think we'll want to return both the graph (for plotting purposes) and updated key entries to a There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. (or a " There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Agreed that it would be useful to return both the graph and some json-compatible structure for inclusion in the analysis.json! For the latter, I've been following the convention of describing the structure as a dataclass, e.g. like https://github.com/choderalab/covid-moonshot-infra/blob/266612d6ceef234ad6b77840f7e23cb65700dc64/covid_moonshot/core.py#L34-L40 This is nice as a form of documentation and also allows using something like mypy to check that we're not accessing fields incorrectly in downstream code. For this case, would something like the following work? @dataclass_json
@dataclass
class MoleculeAnalysis:
molecule_id: str
absolute_free_energy: FreeEnergy
def combine_relative_free_energies(results: Analysis) -> List[MoleculeAnalysis]:
# ... |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
this returns a float (?), need to update the docstring here