Initial commit
tantrev committed Aug 27, 2024
0 parents commit 4c61883
Showing 40 changed files with 1,210,473 additions and 0 deletions.
50 changes: 50 additions & 0 deletions AF_confidence.py
import pandas as pd
import numpy as np
import glob
import os

from alphafold_err import getGoods

def getPercent(input_x2c, input_df_goods):
    percentList = list()
    for index, row in input_x2c.iterrows():
        # Parse the site's residue numbers: comma-separated integers after the colon in the "residues" column
        df_site = pd.DataFrame(pd.DataFrame(row["residues"].split(":")[1].strip().split(","))[0].astype('str').str.strip().astype('int'))  #s_sitemap_
        # Fraction of the site's residues that are high-confidence (i.e. present in input_df_goods)
        truth = df_site.merge(input_df_goods, on=0, how='outer', indicator=True)
        counts = truth["_merge"].value_counts()
        true_denom = counts.sum() - counts[counts.index == "right_only"].iloc[0]
        percent = counts[counts.index == "both"].iloc[0] / true_denom
        # Same calculation over the full residue range spanned by the site (min..max)
        minny = df_site[0].min()
        maxxy = df_site[0].max()
        checky = pd.DataFrame(np.arange(minny, maxxy + 1))
        checky2 = checky.merge(input_df_goods, indicator=True, how='left')
        all_good_percent = (checky2["_merge"] == "both").sum() / checky2.shape[0]
        left_over = checky2.shape[0] - (checky2["_merge"] == "both").sum()
        percentList.append((percent, all_good_percent, left_over, checky2.shape[0]))
    return percentList

sitemap_file = "outputs/sitemap_results2.csv"
sitemap_base_dir = "pdb_processed/"

df = pd.read_csv(sitemap_file)
cols = ["Entry",]+df.columns[+df.columns.str.contains("sitemap")].tolist() #s_m_entry_name
df["Entry"] = df["Entry Name"].str.split("-").str[1].astype('str')

subList = list()
for i, file in enumerate(glob.glob(sitemap_base_dir + "/*.pdb")):
    try:
        # Pull the UniProt accession from the AlphaFold-style filename (e.g. AF-<accession>-F1-model_vN.pdb)
        sub_entry = os.path.split(file)[1].split("-")[1]
        sub_df = df[df["Entry"] == sub_entry]
        pathy = file
        goods = getGoods(pathy)
        percents = pd.DataFrame(getPercent(sub_df, goods))
        sub_df["Percent High Quality"] = percents[0].values
        sub_df["Entire Region Percent High Quality"] = percents[1].values
        sub_df["Number Non-High Quality Residues in Entire Region"] = percents[2].values
        sub_df["Total Number of Residues in Entire Region"] = percents[3].values
        subList.append(sub_df)
        print("SUCCESS:", file, i)
    except Exception:
        print("FAILED:", file, i)

end_df = pd.concat(subList)
end_df.to_csv("outputs/sitemap_results_with_quality_metrics.csv", index=None)
103 changes: 103 additions & 0 deletions Bioinformatics.nf
import org.apache.commons.lang.RandomStringUtils

input_pdb_dir = "/root/Downloads/paper/pdb_processed/*.pdb"
pdb2fasta_path = "/root/Downloads/pdb2fasta-master/pdb2fasta"
target_fasta_dir = "/root/Downloads/paper/out_fasta"
target_search_dir = "/root/Downloads/paper/out_fasta_hits"
target_search_dir_msa = "/root/Downloads/paper/out_fasta_hits_msa_input"
target_search_dir_msa_out = "/root/Downloads/paper/out_fasta_hits_msa_output"
filter_blast_script_path = "/root/Downloads/paper/scripts/filter_blast.py"
homo_fasta_path = "/root/Downloads/paper/data/UP000005640_9606.fasta"
extract_script_path = "/root/Downloads/paper/scripts/extract.sh"
homology_count_output_path = "/root/Downloads/paper/homology_counts.txt"


ch1 = Channel.fromPath(input_pdb_dir)

// Convert each input PDB structure to a FASTA sequence with pdb2fasta
process fasta_extractor
{
    errorStrategy 'retry'
    maxErrors 1000
    maxForks 2

    input:
    val(id) from ch1

    output:
    val(id) into ch2

    script:
    def base_filename = id.toString().split("/")[-1]
    def base = "mkdir -p ${target_fasta_dir} && ${pdb2fasta_path} ${id} > ${target_fasta_dir}/${base_filename}.fa"
    """
    ${base}
    """
}


// BLAST each extracted FASTA against the human reference proteome (UP000005640)
process fasta_search
{
    errorStrategy 'ignore'

    input:
    val(id) from ch2

    output:
    val(id) into ch3

    script:
    def base_filename = id.toString().split("/")[-1]
    def base = """mkdir -p ${target_search_dir} && blastp -query ${target_fasta_dir}/${base_filename}.fa -db /root/Downloads/UP000005640_9606.fasta -out ${target_search_dir}/${base_filename}.blast-out.b6 -outfmt "6 std qseq sseq" -word_size 6 -evalue 0.05 -gapopen 11 -gapextend 1 -max_target_seqs 150"""
    """
    ${base}
    """
}

// Filter the BLAST hits and build the MSA input with filter_blast.py
process filter_blast_results
{
    errorStrategy 'ignore'

    input:
    val(id) from ch3

    output:
    val(id) into ch4

    script:
    def base_filename = id.toString().split("/")[-1]
    def base = """mkdir -p ${target_search_dir_msa} && python ${filter_blast_script_path} ${target_fasta_dir}/${base_filename}.fa ${target_search_dir}/${base_filename}.blast-out.b6 ${homo_fasta_path} ${target_search_dir_msa}/${base_filename}.msa.fa"""
    """
    ${base}
    """
}


// Align each filtered hit set with MAFFT
process align_with_mafft
{
    errorStrategy 'ignore'

    input:
    val(id) from ch4

    output:
    val(id) into ch5

    script:
    def base_filename = id.toString().split("/")[-1]
    def base = """mkdir -p ${target_search_dir_msa_out} && mafft ${target_search_dir_msa}/${base_filename}.msa.fa > ${target_search_dir_msa_out}/${base_filename}.msa.fa"""
    """
    ${base}
    """
}

// After every alignment finishes, tally homologous hits per protein with extract.sh
process final_step {
    input:
    val _ from ch5.collect()

    script:
    """
    cd ${target_search_dir_msa_out} && ${extract_script_path} > ${homology_count_output_path}
    """
}
25 changes: 25 additions & 0 deletions README.md
# drug-target-pocket-funnel

This repository contains the scripts, data, and results from the paper "In Silico Target Identification Reveals IL12B as a High-Potential Candidate for Small Molecule Drug Development". It serves as a record of the computational methods used to identify IL12B as a potential drug target for small molecule development.

Contents:
- Custom scripts used for data processing and analysis
- Databases utilized in the filtering and analysis process
- Intermediate and final processed data
- Nextflow pipeline for specific parts of the analysis
- FTMap simulation results of the 1F42 PDB structure

## Technical Details

- The Nextflow script uses absolute paths and was run with Nextflow version 22.04.4 (pinned via `NXF_VER=22.04.4`); an example launch command is shown below
- Miniconda was used to install Python, the dependent packages, and the programs required to run the Nextflow pipeline
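
For reference, the pipeline can typically be launched as follows. This is an illustrative sketch only; the absolute paths defined at the top of `Bioinformatics.nf` must first be edited to match the local environment, and the command assumes it is run from the repository root:

```bash
# Pin the Nextflow version used for the paper and launch the pipeline
NXF_VER=22.04.4 nextflow run Bioinformatics.nf
```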

## Disclaimer

Complete reproduction of the paper's pipeline involves manual steps and requires the Schrödinger Small-Molecule Drug Discovery Suite (and an appropriate license). These are not included in this repository.

Please note that some scripts in this repository may not function properly without the appropriate "sitemap_results.csv", "sitemap_results_with_quality_metrics.csv", and "final_filtered_sites.csv" files. These files contain Schrödinger-specific outputs that have been excluded from the public repository to comply with the terms of Schrödinger's End User License Agreement. The affected scripts are provided for reference purposes only.

Accessing the complete data and code may require a Schrödinger software license. We have endeavored to provide all other non-Schrödinger files necessary to understand and reproduce our work.

Note: This repository is provided for transparency and reproducibility purposes related to the published paper. The scripts and data are specific to this project and its unique workflow.
87 changes: 87 additions & 0 deletions Stage1.py
import pandas as pd
import numpy as np

# Load DGIdb interactions and perform initial filtering
dgidb_interactions = pd.read_table("data/DGIdb_interactions_12_2023.tsv")
dgidb_interactions["Gene name"] = dgidb_interactions["gene_claim_name"]
dgidb_interactions = dgidb_interactions.dropna(subset=["Gene name"])
inhibitor_interactions = dgidb_interactions[dgidb_interactions["interaction_type"] == "inhibitor"]
inhibitor_interactions = inhibitor_interactions[inhibitor_interactions["approved"]==True]
print("DGIdb filter - unique genes remaining:", inhibitor_interactions["Gene name"].unique().shape[0])

# Load UniProt data and merge with inhibitor interactions
uniprot_data = pd.read_table("data/uniprot-compressed_true_download_true_fields_accession_2Creviewed_2C-2022.08.31-21.45.11.90.tsv")
uniprot_data["Gene name"] = uniprot_data["Gene Names (primary)"]
uniprot_merged = uniprot_data.merge(inhibitor_interactions, on="Gene name") #inhibitor_interactions
print("UniProt structure filter - unique genes remaining:", uniprot_merged["Gene name"].unique().shape[0])

# Load Probe Miner data, find targets without any probes, and merge with UniProt data
#probe_miner_data = pd.read_table("data/probeminer_datadump_2021-06-20.txt")
#probe_miner_data["Entry"] = probe_miner_data["UNIPROT_ACCESSION"]
#unique_probes = pd.DataFrame(probe_miner_data["Entry"].drop_duplicates())
#unique_probes.to_csv("probeminer_2021-06-20_unique_probes.txt")
unique_probes = pd.read_csv("data/probeminer_2021-06-20_unique_probes.csv")
merged_probes = unique_probes.merge(uniprot_merged, on="Entry", indicator=True, how='outer')
right_only_probes = merged_probes[merged_probes["_merge"] == "right_only"]
print("Probe Miner filter - unique genes remaining:", right_only_probes["Gene name"].unique().shape[0])

# Load gnomAD data and merge with probes data
gnomad_data = pd.read_table("data/gnomad.v2.1.1.lof_metrics.by_gene.txt")
gnomad_data["Gene name"] = gnomad_data["gene"]
gnomad_filtered = gnomad_data[gnomad_data['pLI'] <= .50]
gnomad_merged = right_only_probes.merge(gnomad_filtered, on="Gene name")
print("gnomAD filter - unique genes remaining:", gnomad_merged["Gene name"].unique().shape[0])

# Load and filter OpenTargets data, then merge with gnomAD data
opentargets_data = pd.read_csv("data/BigQuery OpenTarget locus2gene results -12_22_2023.csv")
opentargets_filtered = opentargets_data[opentargets_data["max_y_proba_full_model"] >= 0.5]
opentargets_merged = opentargets_filtered.merge(gnomad_merged, on="gene_id")
print("OpenTargets filter - unique genes remaining:", opentargets_merged["gene_id"].unique().shape[0])

#Filter out proteins with metals
metals = pd.read_table("data/uniprot-annotation_(type_metal)-filtered-proteome_UP000005640+AND+orga--.tab")
metals["Gene name"] = metals["Gene names (primary )"]
opentargets_merged_no_metals = opentargets_merged.drop("_merge", axis=1).merge(metals, on="Gene name", how='outer', indicator=True)
opentargets_merged_no_metals = opentargets_merged_no_metals[opentargets_merged_no_metals["_merge"]=="left_only"]
print("Metals filter - unique genes remaining:", opentargets_merged_no_metals["Gene name"].unique().shape[0])

# Load mouse map and embryonic lethality data, then perform final merge
mouse_gene_map = pd.read_table("data/HOM_MouseHumanSequence.rpt")
mouse_gene_map["Gene name"] = mouse_gene_map["Symbol"]
embryonic_lethality = pd.read_table("data/embryonic_lethality_MGIhdpQuery_markers_20230416_020549.txt")
embryonic_lethality["Gene name"] = embryonic_lethality["Gene Symbol"]
lethality_merged = embryonic_lethality.merge(mouse_gene_map, on="Gene name")
lethality_final = lethality_merged.merge(mouse_gene_map, on="DB Class Key")
lethality_final["Gene name"] = lethality_final["Gene name_y"]
embryo_phenotypes = lethality_final[lethality_final["Abnormal Mouse Phenotypes"].str.contains("embryo")]
prefinal_merge = opentargets_merged_no_metals.drop("_merge", axis=1).merge(embryo_phenotypes, on="Gene name", how='outer', indicator=True)
prefinal_filtered = prefinal_merge[prefinal_merge["_merge"] == "left_only"]
print("JAX embryonic lethality filter - unique genes remaining:", prefinal_filtered["Gene name"].unique().shape[0])

#Limit to genes with abnormal inflammation in mice
inflam_df = pd.read_table("data/MGIhdpQuery_markers_20240430_202514_abnormal_inflammatory_response.txt")
inflam_df2 = inflam_df[inflam_df["Organism"]=="mouse"]
inflam_df2["Gene name"] = inflam_df2["Gene Symbol"]
inflam_merged = inflam_df2.merge(mouse_gene_map, on="Gene name")
inflam_final = inflam_merged.merge(mouse_gene_map, on="DB Class Key")
inflam_final["Gene name"] = inflam_final["Gene name_y"]
inflam_merged = inflam_final.merge(prefinal_filtered, on="Gene name")
print("JAX abnormal inflammation filter - unique genes remaining:", inflam_merged["Gene name"].unique().shape[0])

#Bioinformatics filter - only looks at proteins where no other human protein has significant primary sequence homology
#Also filter out proteins that are longer than AlphaFold's max length for 1 protein prediction (and thus split the prediction into multiple files)
bio_df = pd.read_csv("outputs/homology_counts.txt", header=None) #22_with_eval_cut.txt
bio_df["Entry_x"] = bio_df[0].str.split("-").str[1].astype('str')
bio_df["Count"] = bio_df[0].str.split(":").str[-1].astype('int')
bio_df2 = bio_df[bio_df["Count"]==1] #Make sure there are no homologous proteins
bio_df3 = bio_df2["Entry_x"].value_counts()
bio_df4 = bio_df3[bio_df3 == 1].reset_index() #Make sure the protein is not longer than AlphaFold's max prediction length
bio_df4.columns = ["Entry_x", "Count2"]
bio_df5 = bio_df4.merge(bio_df2, on="Entry_x")
final_filtered = inflam_merged.merge(bio_df5, on="Entry_x")
print("Bioinformatics filter - unique genes remaining:", final_filtered["Gene name"].unique().shape[0])

# Save final filtered
final_filtered = final_filtered.copy()
final_filtered["Entry"] = final_filtered["Entry_x"]
final_filtered[["Gene name", "Entry"]].drop_duplicates().to_csv("outputs/stage1_filtering_gene_candidates.csv", index=None)
83 changes: 83 additions & 0 deletions Stage2.py
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score, average_precision_score, precision_recall_curve, matthews_corrcoef
from sklearn.neighbors import LocalOutlierFactor
from scipy.stats import norm
from imblearn.ensemble import BalancedRandomForestClassifier

df = pd.read_table("data/Halgreen 2009 - Table 6.txt", sep=" ")

# Cross-validate the balanced random forest at two labelling thresholds of the training table
for threshold in [0, 1]:
    df["label"] = (df["categorya"] > threshold).astype(bool)

    features = ["Dscore", "SScore", "size", "enclosure", "philic", "phobic"]
    X = df[features]
    y = df["label"]

    # Remove outliers with Local Outlier Factor before cross-validation
    lof = LocalOutlierFactor()
    outlier_scores = lof.fit_predict(X.values)
    non_outlier_indices = [i for i, score in enumerate(outlier_scores) if score == 1]
    X_cleaned = X.iloc[non_outlier_indices]
    y_cleaned = y.iloc[non_outlier_indices]

    kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    metrics = []

    for train_index, test_index in kf.split(X_cleaned, y_cleaned):
        X_train, X_test = X_cleaned.iloc[train_index], X_cleaned.iloc[test_index]
        y_train, y_test = y_cleaned.iloc[train_index], y_cleaned.iloc[test_index]

        rf = BalancedRandomForestClassifier(random_state=42)
        rf.fit(X_train, y_train)

        y_pred_rf = rf.predict(X_test)
        y_pred_rf_proba = rf.predict_proba(X_test)[:, 1]

        f1_rf = f1_score(y_test, y_pred_rf)
        mcc_rf = matthews_corrcoef(y_test, y_pred_rf)
        # Convert the MCC to an approximate z-score and two-sided p-value
        z_score = mcc_rf / (1 - mcc_rf**2) ** 0.5
        mcc_rf_pvalue = 2 * (1 - norm.cdf(abs(z_score)))
        app_rf = average_precision_score(y_test, y_pred_rf_proba)

        metrics.append([mcc_rf, mcc_rf_pvalue, f1_rf, app_rf, threshold, "RF"])

    metrics_df = pd.DataFrame(metrics, columns=["MCC", "MCC p-value", "F1 Score", "Average Precision Score", "Threshold", "Model"])
    metrics_df[["MCC", "Threshold"]].to_csv(f"outputs/cross_validation_data_threshold_{threshold}.csv")

confidence_cut_level = 0.5
df["y"] = df["categorya"]>0

# Refit the outlier filter on the full feature matrix, then train the final classifier on the cleaned data
lof = LocalOutlierFactor()
outlier_scores = lof.fit_predict(X.values)
non_outlier_indices = [i for i, score in enumerate(outlier_scores) if score == 1]
X_cleaned = X.iloc[non_outlier_indices]
y_cleaned = y.iloc[non_outlier_indices]

rf = BalancedRandomForestClassifier(random_state=42)
rf.fit(X_cleaned.values, y_cleaned)

# Novelty-mode LOF so candidate SiteMap feature vectors can be screened against the training distribution
lof2 = LocalOutlierFactor(novelty=True)
lof2.fit(X_cleaned.values)

df_targets = pd.read_csv("outputs/sitemap_results_with_quality_metrics.csv")
df_targets["UniProtKB Gene Name ID"] = df_targets["Entry"]

# Keep only sites whose residues (and the full region they span) are mostly high-confidence per AF_confidence.py
df_targets2 = df_targets[(df_targets["Entire Region Percent High Quality"] >= confidence_cut_level) &
                         (df_targets["Percent High Quality"] >= confidence_cut_level)]

X_targets = df_targets2[["Dscore", "SiteScore", "size", "enclosure", "philic", "phobic"]]
# Drop candidate sites that the novelty detector flags as outliers (LOF predict returns -1 for outliers)
outlier_scores = lof2.predict(X_targets.values)
X_good_targets = X_targets[outlier_scores > -1]

rf_probs = rf.predict_proba(X_good_targets.values)

df_targets2 = df_targets2[outlier_scores > -1]
df_targets2["rf_probs"] = rf_probs[:,1]
df_targets3 = df_targets2[df_targets2["rf_probs"]>=confidence_cut_level]

df_gene_map = pd.read_csv("data/uniprot_ensembl_map.txt")
df_merged = df_targets3.merge(df_gene_map, on="UniProtKB Gene Name ID")
df_hq = df_merged.drop_duplicates()

df_hq[["Gene name","residues", "Entire Region Percent High Quality", "Percent High Quality"]].drop_duplicates().to_csv("outputs/final_filtered_sites.csv", index=None)
20 changes: 20 additions & 0 deletions alphafold_err.py
import Bio.PDB
import pandas as pd
import os

def getGoods(inputPath):
    namey = os.path.splitext(os.path.split(inputPath)[1])[0]
    p = Bio.PDB.PDBParser()
    structure = p.get_structure(namey, inputPath)

    # In AlphaFold model PDBs the B-factor column stores the per-residue pLDDT, and every
    # atom of a residue carries the same value, so one entry is kept per residue
    listy = list()
    for thing in structure.get_residues():
        res_id = thing.get_id()[1]
        for sub_thing in thing.get_atoms():
            val = sub_thing.get_bfactor()
        listy.append((res_id, val))

    df = pd.DataFrame(listy)

    # Keep residues with pLDDT >= 90 ("very high" confidence)
    goods = df[df[1] >= 90]
    return goods