Commit 4c61883 (initial commit, 0 parents): showing 40 changed files with 1,210,473 additions and 0 deletions.
@@ -0,0 +1,50 @@
import pandas as pd
import numpy as np
import glob
import os

from alphafold_err import getGoods

def getPercent(input_x2c, input_df_goods):
    percentList = list()
    for index, row in input_x2c.iterrows():
        # Parse the site's residue numbers out of the "residues" column
        df_site = pd.DataFrame(pd.DataFrame(row["residues"].split(":")[1].strip().split(","))[0].astype('str').str.strip().astype('int'))
        # Estimate the fraction of the site's residues that are high quality;
        # "right_only" rows are high-quality residues outside the site, so
        # they are excluded from the denominator
        truth = df_site.merge(input_df_goods, on=0, how='outer', indicator=True)
        counts = truth["_merge"].value_counts()
        true_denom = counts.sum() - counts[counts.index == "right_only"].iloc[0]
        percent = counts[counts.index == "both"].iloc[0] / true_denom
        # Estimate the fraction of the site's full residue span (min..max)
        # that is high quality
        minny = df_site[0].min()
        maxxy = df_site[0].max()
        checky = pd.DataFrame(np.arange(minny, maxxy + 1))
        checky2 = checky.merge(input_df_goods, indicator=True, how='left')
        all_good_percent = (checky2["_merge"] == "both").sum() / checky2.shape[0]
        left_over = checky2.shape[0] - (checky2["_merge"] == "both").sum()
        percentList.append((percent, all_good_percent, left_over, checky2.shape[0]))
    return percentList

sitemap_file = "outputs/sitemap_results2.csv"
sitemap_base_dir = "pdb_processed/"

df = pd.read_csv(sitemap_file)
cols = ["Entry"] + df.columns[df.columns.str.contains("sitemap")].tolist()
df["Entry"] = df["Entry Name"].str.split("-").str[1].astype('str')

subList = list()
for i, file in enumerate(glob.glob(sitemap_base_dir + "/*.pdb")):
    try:
        sub_entry = os.path.split(file)[1].split("-")[1]
        sub_df = df[df["Entry"] == sub_entry].copy()
        goods = getGoods(file)
        percents = pd.DataFrame(getPercent(sub_df, goods))
        sub_df["Percent High Quality"] = percents[0].values
        sub_df["Entire Region Percent High Quality"] = percents[1].values
        sub_df["Number Non-High Quality Residues in Entire Region"] = percents[2].values
        sub_df["Total Number of Residues in Entire Region"] = percents[3].values
        subList.append(sub_df)
        print("SUCCESS:", file, i)
    except Exception:
        print("FAILED:", file, i)

end_df = pd.concat(subList)
end_df.to_csv("outputs/sitemap_results_with_quality_metrics.csv", index=None)
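For orientation, the two coverage numbers from getPercent come from pandas' merge-indicator bookkeeping. A minimal sketch with hypothetical residue numbers (de-duplicated lists for clarity; the real getGoods output has one row per atom):

import pandas as pd
import numpy as np

site_residues = pd.DataFrame([3, 5, 8])        # residues reported for one site
high_quality = pd.DataFrame([3, 4, 5, 6, 9])   # residues passing the pLDDT cut

# "both" = site residue that is high quality, "left_only" = site residue that
# is not; "right_only" rows (high-quality residues outside the site) are
# removed from the denominator.
truth = site_residues.merge(high_quality, on=0, how="outer", indicator=True)
counts = truth["_merge"].value_counts()
denom = counts.sum() - counts.get("right_only", 0)
print(counts.get("both", 0) / denom)           # 2/3 of the site is high quality

# Whole-region coverage: every residue in the site's min..max span.
region = pd.DataFrame(np.arange(3, 8 + 1))
region_merged = region.merge(high_quality, indicator=True, how="left")
print((region_merged["_merge"] == "both").mean())  # 4/6 of the span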
@@ -0,0 +1,103 @@
// DSL1 pipeline (from/into channel syntax); recent Nextflow releases default
// to DSL2, so this may need nextflow.enable.dsl=1 (it was run with NXF_VER=22.04.4).

input_pdb_dir = "/root/Downloads/paper/pdb_processed/*.pdb"
pdb2fasta_path = "/root/Downloads/pdb2fasta-master/pdb2fasta"
target_fasta_dir = "/root/Downloads/paper/out_fasta"
target_search_dir = "/root/Downloads/paper/out_fasta_hits"
target_search_dir_msa = "/root/Downloads/paper/out_fasta_hits_msa_input"
target_search_dir_msa_out = "/root/Downloads/paper/out_fasta_hits_msa_output"
filter_blast_script_path = "/root/Downloads/paper/scripts/filter_blast.py"
homo_fasta_path = "/root/Downloads/paper/data/UP000005640_9606.fasta"
extract_script_path = "/root/Downloads/paper/scripts/extract.sh"
homology_count_output_path = "/root/Downloads/paper/homology_counts.txt"

ch1 = Channel.fromPath(input_pdb_dir)

// Extract a FASTA sequence from each input PDB file
process fasta_extractor {
    errorStrategy 'retry'
    maxErrors 1000
    maxForks 2

    input:
    val(id) from ch1

    output:
    val(id) into ch2

    script:
    def base_filename = id.toString().split("/")[-1]
    """
    mkdir -p ${target_fasta_dir} && ${pdb2fasta_path} ${id} > ${target_fasta_dir}/${base_filename}.fa
    """
}

// BLAST each sequence against the human proteome (note: the database path
// here is hard-coded rather than taken from homo_fasta_path)
process fasta_search {
    errorStrategy 'ignore'

    input:
    val(id) from ch2

    output:
    val(id) into ch3

    script:
    def base_filename = id.toString().split("/")[-1]
    """
    mkdir -p ${target_search_dir} && blastp -query ${target_fasta_dir}/${base_filename}.fa -db /root/Downloads/UP000005640_9606.fasta -out ${target_search_dir}/${base_filename}.blast-out.b6 -outfmt "6 std qseq sseq" -word_size 6 -evalue 0.05 -gapopen 11 -gapextend 1 -max_target_seqs 150
    """
}

// Filter the BLAST hits into an MSA input FASTA
process filter_blast_results {
    errorStrategy 'ignore'

    input:
    val(id) from ch3

    output:
    val(id) into ch4

    script:
    def base_filename = id.toString().split("/")[-1]
    """
    mkdir -p ${target_search_dir_msa} && python ${filter_blast_script_path} ${target_fasta_dir}/${base_filename}.fa ${target_search_dir}/${base_filename}.blast-out.b6 ${homo_fasta_path} ${target_search_dir_msa}/${base_filename}.msa.fa
    """
}

// Align each filtered hit set with MAFFT
process align_with_mafft {
    errorStrategy 'ignore'

    input:
    val(id) from ch4

    output:
    val(id) into ch5

    script:
    def base_filename = id.toString().split("/")[-1]
    """
    mkdir -p ${target_search_dir_msa_out} && mafft ${target_search_dir_msa}/${base_filename}.msa.fa > ${target_search_dir_msa_out}/${base_filename}.msa.fa
    """
}

// Count homologs across all alignments once every file has been processed
process final_step {
    input:
    val _ from ch5.collect()

    script:
    """
    cd ${target_search_dir_msa_out} && ${extract_script_path} > ${homology_count_output_path}
    """
}
@@ -0,0 +1,25 @@
# drug-target-pocket-funnel

This repository contains the scripts, data, and results from the paper "In Silico Target Identification Reveals IL12B as a High-Potential Candidate for Small Molecule Drug Development". It serves as a record of the computational methods used to identify IL12B as a potential drug target for small-molecule development.

Contents:
- Custom scripts used for data processing and analysis
- Databases used in the filtering and analysis process
- Intermediate and final processed data
- A Nextflow pipeline for specific parts of the analysis
- FTMap simulation results for the 1F42 PDB structure

## Technical Details

- The Nextflow script uses absolute paths and was run with Nextflow version 22.04.4 (NXF_VER=22.04.4)
- Miniconda was used to install Python, dependent packages, and the programs required to run the Nextflow pipeline

## Disclaimer

Complete reproduction of the paper's pipeline involves manual steps and requires the Schrödinger Small-Molecule Drug Discovery Suite (and an appropriate license). These are not included in this repository.

Please note that some scripts in this repository may not function properly without the appropriate "sitemap_results.csv", "sitemap_results_with_quality_metrics.csv", and "final_filtered_sites.csv" files. These files contain Schrödinger-specific outputs that have been excluded from the public repository to comply with the terms of Schrödinger's End User License Agreement. The affected scripts are provided for reference purposes only.

Accessing the complete data and code may require a Schrödinger software license. We have endeavored to provide all other non-Schrödinger files necessary to understand and reproduce our work.

Note: This repository is provided for transparency and reproducibility related to the published paper. The scripts and data are specific to this project and its unique workflow.
@@ -0,0 +1,87 @@
import pandas as pd
import numpy as np

# Load DGIdb interactions and keep approved inhibitor interactions
dgidb_interactions = pd.read_table("data/DGIdb_interactions_12_2023.tsv")
dgidb_interactions["Gene name"] = dgidb_interactions["gene_claim_name"]
dgidb_interactions = dgidb_interactions.dropna(subset=["Gene name"])
inhibitor_interactions = dgidb_interactions[dgidb_interactions["interaction_type"] == "inhibitor"]
inhibitor_interactions = inhibitor_interactions[inhibitor_interactions["approved"] == True]
print("DGIdb filter - unique genes remaining:", inhibitor_interactions["Gene name"].unique().shape[0])

# Load UniProt data and merge with the inhibitor interactions
uniprot_data = pd.read_table("data/uniprot-compressed_true_download_true_fields_accession_2Creviewed_2C-2022.08.31-21.45.11.90.tsv")
uniprot_data["Gene name"] = uniprot_data["Gene Names (primary)"]
uniprot_merged = uniprot_data.merge(inhibitor_interactions, on="Gene name")
print("UniProt structure filter - unique genes remaining:", uniprot_merged["Gene name"].unique().shape[0])

# Load Probe Miner data and keep targets without any chemical probes
# (the de-duplicated probe list was precomputed once from the full dump):
# probe_miner_data = pd.read_table("data/probeminer_datadump_2021-06-20.txt")
# probe_miner_data["Entry"] = probe_miner_data["UNIPROT_ACCESSION"]
# unique_probes = pd.DataFrame(probe_miner_data["Entry"].drop_duplicates())
# unique_probes.to_csv("probeminer_2021-06-20_unique_probes.txt")
unique_probes = pd.read_csv("data/probeminer_2021-06-20_unique_probes.csv")
merged_probes = unique_probes.merge(uniprot_merged, on="Entry", indicator=True, how='outer')
right_only_probes = merged_probes[merged_probes["_merge"] == "right_only"]
print("Probe Miner filter - unique genes remaining:", right_only_probes["Gene name"].unique().shape[0])

# Load gnomAD constraint data and keep genes with pLI <= 0.50
gnomad_data = pd.read_table("data/gnomad.v2.1.1.lof_metrics.by_gene.txt")
gnomad_data["Gene name"] = gnomad_data["gene"]
gnomad_filtered = gnomad_data[gnomad_data['pLI'] <= .50]
gnomad_merged = right_only_probes.merge(gnomad_filtered, on="Gene name")
print("gnomAD filter - unique genes remaining:", gnomad_merged["Gene name"].unique().shape[0])

# Load and filter OpenTargets locus-to-gene results, then merge
opentargets_data = pd.read_csv("data/BigQuery OpenTarget locus2gene results -12_22_2023.csv")
opentargets_filtered = opentargets_data[opentargets_data["max_y_proba_full_model"] >= 0.5]
opentargets_merged = opentargets_filtered.merge(gnomad_merged, on="gene_id")
print("OpenTargets filter - unique genes remaining:", opentargets_merged["gene_id"].unique().shape[0])

# Filter out metal-binding proteins
metals = pd.read_table("data/uniprot-annotation_(type_metal)-filtered-proteome_UP000005640+AND+orga--.tab")
metals["Gene name"] = metals["Gene names (primary )"]
opentargets_merged_no_metals = opentargets_merged.drop("_merge", axis=1).merge(metals, on="Gene name", how='outer', indicator=True)
opentargets_merged_no_metals = opentargets_merged_no_metals[opentargets_merged_no_metals["_merge"] == "left_only"]
print("Metals filter - unique genes remaining:", opentargets_merged_no_metals["Gene name"].unique().shape[0])

# Map mouse genes to human and drop genes whose knockout is embryonic lethal
mouse_gene_map = pd.read_table("data/HOM_MouseHumanSequence.rpt")
mouse_gene_map["Gene name"] = mouse_gene_map["Symbol"]
embryonic_lethality = pd.read_table("data/embryonic_lethality_MGIhdpQuery_markers_20230416_020549.txt")
embryonic_lethality["Gene name"] = embryonic_lethality["Gene Symbol"]
lethality_merged = embryonic_lethality.merge(mouse_gene_map, on="Gene name")
lethality_final = lethality_merged.merge(mouse_gene_map, on="DB Class Key")
lethality_final["Gene name"] = lethality_final["Gene name_y"]
embryo_phenotypes = lethality_final[lethality_final["Abnormal Mouse Phenotypes"].str.contains("embryo")]
prefinal_merge = opentargets_merged_no_metals.drop("_merge", axis=1).merge(embryo_phenotypes, on="Gene name", how='outer', indicator=True)
prefinal_filtered = prefinal_merge[prefinal_merge["_merge"] == "left_only"]
print("JAX embryonic lethality filter - unique genes remaining:", prefinal_filtered["Gene name"].unique().shape[0])

# Limit to genes with an abnormal inflammatory response phenotype in mice
inflam_df = pd.read_table("data/MGIhdpQuery_markers_20240430_202514_abnormal_inflammatory_response.txt")
inflam_df2 = inflam_df[inflam_df["Organism"] == "mouse"].copy()
inflam_df2["Gene name"] = inflam_df2["Gene Symbol"]
inflam_merged = inflam_df2.merge(mouse_gene_map, on="Gene name")
inflam_final = inflam_merged.merge(mouse_gene_map, on="DB Class Key")
inflam_final["Gene name"] = inflam_final["Gene name_y"]
inflam_merged = inflam_final.merge(prefinal_filtered, on="Gene name")
print("JAX abnormal inflammation filter - unique genes remaining:", inflam_merged["Gene name"].unique().shape[0])

# Bioinformatics filter: keep only proteins where no other human protein has
# significant primary-sequence homology, and drop proteins longer than
# AlphaFold's single-prediction maximum (whose predictions span multiple files)
bio_df = pd.read_csv("outputs/homology_counts.txt", header=None)
bio_df["Entry_x"] = bio_df[0].str.split("-").str[1].astype('str')
bio_df["Count"] = bio_df[0].str.split(":").str[-1].astype('int')
bio_df2 = bio_df[bio_df["Count"] == 1]  # no homologous proteins beyond the query itself
bio_df3 = bio_df2["Entry_x"].value_counts()
bio_df4 = bio_df3[bio_df3 == 1].reset_index()  # exactly one prediction file per entry
bio_df4.columns = ["Entry_x", "Count2"]
bio_df5 = bio_df4.merge(bio_df2, on="Entry_x")
final_filtered = inflam_merged.merge(bio_df5, on="Entry_x")
print("Bioinformatics filter - unique genes remaining:", final_filtered["Gene name"].unique().shape[0])

# Save the final candidate list
final_filtered = final_filtered.copy()
final_filtered["Entry"] = final_filtered["Entry_x"]
final_filtered[["Gene name", "Entry"]].drop_duplicates().to_csv("outputs/stage1_filtering_gene_candidates.csv", index=None)
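Each exclusion step above relies on the same pandas idiom: an anti-join via merge(..., how='outer', indicator=True), keeping only the "left_only" rows. A minimal sketch with made-up gene names:

import pandas as pd

candidates = pd.DataFrame({"Gene name": ["GENE_A", "GENE_B", "GENE_C"]})
exclusions = pd.DataFrame({"Gene name": ["GENE_B"]})  # e.g. metal binders

merged = candidates.merge(exclusions, on="Gene name", how="outer", indicator=True)
kept = merged[merged["_merge"] == "left_only"]  # candidates not on the exclusion list
print(kept["Gene name"].tolist())               # ['GENE_A', 'GENE_C']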
@@ -0,0 +1,83 @@
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score, average_precision_score, matthews_corrcoef
from sklearn.neighbors import LocalOutlierFactor
from scipy.stats import norm
from imblearn.ensemble import BalancedRandomForestClassifier

df = pd.read_table("data/Halgreen 2009 - Table 6.txt", sep=" ")

features = ["Dscore", "SScore", "size", "enclosure", "philic", "phobic"]

# Cross-validate a balanced random forest at two druggability thresholds
for threshold in [0, 1]:
    df["label"] = (df["categorya"] > threshold).astype(bool)

    X = df[features]
    y = df["label"]

    # Drop outliers from the training table
    lof = LocalOutlierFactor()
    outlier_scores = lof.fit_predict(X.values)
    non_outlier_indices = [i for i, score in enumerate(outlier_scores) if score == 1]
    X_cleaned = X.iloc[non_outlier_indices]
    y_cleaned = y.iloc[non_outlier_indices]

    kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    metrics = []

    for train_index, test_index in kf.split(X_cleaned, y_cleaned):
        X_train, X_test = X_cleaned.iloc[train_index], X_cleaned.iloc[test_index]
        y_train, y_test = y_cleaned.iloc[train_index], y_cleaned.iloc[test_index]

        rf = BalancedRandomForestClassifier(random_state=42)
        rf.fit(X_train, y_train)

        y_pred_rf = rf.predict(X_test)
        y_pred_rf_proba = rf.predict_proba(X_test)[:, 1]

        f1_rf = f1_score(y_test, y_pred_rf)
        mcc_rf = matthews_corrcoef(y_test, y_pred_rf)
        # Approximate significance of the MCC via a normal z-score
        z_score = mcc_rf / (1 - mcc_rf**2) ** 0.5
        mcc_rf_pvalue = 2 * (1 - norm.cdf(abs(z_score)))
        app_rf = average_precision_score(y_test, y_pred_rf_proba)

        metrics.append([mcc_rf, mcc_rf_pvalue, f1_rf, app_rf, threshold, "RF"])

    metrics_df = pd.DataFrame(metrics, columns=["MCC", "MCC p-value", "F1 Score", "Average Precision Score", "Threshold", "Model"])
    metrics_df[["MCC", "Threshold"]].to_csv(f"outputs/cross_validation_data_threshold_{threshold}.csv")

# Fit the final model on the whole table, labeling sites with categorya > 0 as positive
confidence_cut_level = 0.5
X = df[features]
y = df["categorya"] > 0

lof = LocalOutlierFactor()
outlier_scores = lof.fit_predict(X.values)
non_outlier_indices = [i for i, score in enumerate(outlier_scores) if score == 1]
X_cleaned = X.iloc[non_outlier_indices]
y_cleaned = y.iloc[non_outlier_indices]

rf = BalancedRandomForestClassifier(random_state=42)
rf.fit(X_cleaned.values, y_cleaned)

# Novelty-mode LOF, fitted on the cleaned training data, screens unseen targets
lof2 = LocalOutlierFactor(novelty=True)
lof2.fit(X_cleaned.values)

df_targets = pd.read_csv("outputs/sitemap_results_with_quality_metrics.csv")
df_targets["UniProtKB Gene Name ID"] = df_targets["Entry"]

# Keep only sites whose AlphaFold model quality passes the confidence cut
df_targets2 = df_targets[(df_targets["Entire Region Percent High Quality"] >= confidence_cut_level) &
                         (df_targets["Percent High Quality"] >= confidence_cut_level)].copy()

# "SiteScore" in the SiteMap output corresponds to "SScore" in the training table
X_targets = df_targets2[["Dscore", "SiteScore", "size", "enclosure", "philic", "phobic"]]
outlier_scores = lof2.predict(X_targets.values)
X_good_targets = X_targets[outlier_scores > -1]

rf_probs = rf.predict_proba(X_good_targets.values)

df_targets2 = df_targets2[outlier_scores > -1].copy()
df_targets2["rf_probs"] = rf_probs[:, 1]
df_targets3 = df_targets2[df_targets2["rf_probs"] >= confidence_cut_level]

df_gene_map = pd.read_csv("data/uniprot_ensembl_map.txt")
df_merged = df_targets3.merge(df_gene_map, on="UniProtKB Gene Name ID")
df_hq = df_merged.drop_duplicates()

df_hq[["Gene name", "residues", "Entire Region Percent High Quality", "Percent High Quality"]].drop_duplicates().to_csv("outputs/final_filtered_sites.csv", index=None)
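Note that the script uses LocalOutlierFactor in its two distinct modes: the default mode (fit_predict) flags outliers within the fitted data, while novelty mode (fit, then predict) scores unseen points against the fitted data. A small sketch on synthetic data, for orientation:

import numpy as np
from sklearn.neighbors import LocalOutlierFactor

rng = np.random.default_rng(0)
X_train = rng.normal(size=(100, 2))
X_new = np.array([[0.0, 0.0], [8.0, 8.0]])

# Default mode: label each fitted row 1 (inlier) or -1 (outlier)
inlier_mask = LocalOutlierFactor().fit_predict(X_train) == 1
X_clean = X_train[inlier_mask]

# Novelty mode: fit on the cleaned data, then score unseen points
lof = LocalOutlierFactor(novelty=True).fit(X_clean)
print(lof.predict(X_new))  # e.g. [ 1 -1]: the far-away point is flagged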
@@ -0,0 +1,20 @@
import Bio.PDB
import pandas as pd
import os

def getGoods(inputPath):
    namey = os.path.splitext(os.path.split(inputPath)[1])[0]
    p = Bio.PDB.PDBParser()
    structure = p.get_structure(namey, inputPath)

    # Collect (residue number, B-factor) for every atom; in AlphaFold models
    # the B-factor column stores the per-residue pLDDT score
    listy = list()
    for thing in structure.get_residues():
        res_id = thing.get_id()[1]
        for sub_thing in thing.get_atoms():
            val = sub_thing.get_bfactor()
            listy.append((res_id, val))

    df = pd.DataFrame(listy)

    # Keep atoms with pLDDT >= 90 ("very high" confidence); column 0 holds
    # the residue number, so the result has one row per qualifying atom
    goods = df[df[1] >= 90]
    return goods
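A hypothetical usage example (the file name below follows AlphaFold DB naming for IL12B's UniProt accession and is illustrative only, not asserted to be present in pdb_processed/):

from alphafold_err import getGoods

# B-factors in AlphaFold model PDBs hold per-residue pLDDT, so getGoods
# returns the atoms of residues predicted with "very high" confidence.
goods = getGoods("pdb_processed/AF-P29460-F1-model_v4.pdb")  # illustrative path
high_quality_residues = goods[0].drop_duplicates()  # collapse atoms to residues
print(len(high_quality_residues), "residues with pLDDT >= 90")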