Commit 4c61883 (initial commit, 0 parents): showing 40 changed files with 1,210,473 additions and 0 deletions.
@@ -0,0 +1,50 @@
import pandas as pd
import numpy as np
import glob
import os

from alphafold_err import getGoods

def getPercent(input_x2c, input_df_goods):
    percentList = list()
    for index, row in input_x2c.iterrows():
        # Parse the site's residue numbers out of the "residues" column
        df_site = pd.DataFrame(pd.DataFrame(row["residues"].split(":")[1].strip().split(","))[0].astype('str').str.strip().astype('int'))
        # Estimate the fraction of the site's residues that are high quality;
        # "right_only" rows are high-quality residues outside the site, so
        # they are excluded from the denominator
        truth = df_site.merge(input_df_goods, on=0, how='outer', indicator=True)
        counts = truth["_merge"].value_counts()
        true_denom = counts.sum() - counts[counts.index == "right_only"].iloc[0]
        percent = counts[counts.index == "both"].iloc[0] / true_denom
        # Estimate the fraction of the site's full residue span (min..max)
        # that is high quality
        minny = df_site[0].min()
        maxxy = df_site[0].max()
        checky = pd.DataFrame(np.arange(minny, maxxy + 1))
        checky2 = checky.merge(input_df_goods, indicator=True, how='left')
        all_good_percent = (checky2["_merge"] == "both").sum() / checky2.shape[0]
        left_over = checky2.shape[0] - (checky2["_merge"] == "both").sum()
        percentList.append((percent, all_good_percent, left_over, checky2.shape[0]))
    return percentList

sitemap_file = "outputs/sitemap_results2.csv"
sitemap_base_dir = "pdb_processed/"

df = pd.read_csv(sitemap_file)
cols = ["Entry"] + df.columns[df.columns.str.contains("sitemap")].tolist()
df["Entry"] = df["Entry Name"].str.split("-").str[1].astype('str')

subList = list()
for i, file in enumerate(glob.glob(sitemap_base_dir + "/*.pdb")):
    try:
        sub_entry = os.path.split(file)[1].split("-")[1]
        sub_df = df[df["Entry"] == sub_entry].copy()
        goods = getGoods(file)
        percents = pd.DataFrame(getPercent(sub_df, goods))
        sub_df["Percent High Quality"] = percents[0].values
        sub_df["Entire Region Percent High Quality"] = percents[1].values
        sub_df["Number Non-High Quality Residues in Entire Region"] = percents[2].values
        sub_df["Total Number of Residues in Entire Region"] = percents[3].values
        subList.append(sub_df)
        print("SUCCESS:", file, i)
    except Exception:
        print("FAILED:", file, i)

end_df = pd.concat(subList)
end_df.to_csv("outputs/sitemap_results_with_quality_metrics.csv", index=None)
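For orientation, the two coverage numbers from getPercent come from pandas' merge-indicator bookkeeping. A minimal sketch with hypothetical residue numbers (de-duplicated lists for clarity; the real getGoods output has one row per atom):

import pandas as pd
import numpy as np

site_residues = pd.DataFrame([3, 5, 8])        # residues reported for one site
high_quality = pd.DataFrame([3, 4, 5, 6, 9])   # residues passing the pLDDT cut

# "both" = site residue that is high quality, "left_only" = site residue that
# is not; "right_only" rows (high-quality residues outside the site) are
# removed from the denominator.
truth = site_residues.merge(high_quality, on=0, how="outer", indicator=True)
counts = truth["_merge"].value_counts()
denom = counts.sum() - counts.get("right_only", 0)
print(counts.get("both", 0) / denom)           # 2/3 of the site is high quality

# Whole-region coverage: every residue in the site's min..max span.
region = pd.DataFrame(np.arange(3, 8 + 1))
region_merged = region.merge(high_quality, indicator=True, how="left")
print((region_merged["_merge"] == "both").mean())  # 4/6 of the span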
@@ -0,0 +1,103 @@
// DSL1 pipeline (from/into channel syntax); recent Nextflow releases default
// to DSL2, so this may need nextflow.enable.dsl=1 (it was run with NXF_VER=22.04.4).

input_pdb_dir = "/root/Downloads/paper/pdb_processed/*.pdb"
pdb2fasta_path = "/root/Downloads/pdb2fasta-master/pdb2fasta"
target_fasta_dir = "/root/Downloads/paper/out_fasta"
target_search_dir = "/root/Downloads/paper/out_fasta_hits"
target_search_dir_msa = "/root/Downloads/paper/out_fasta_hits_msa_input"
target_search_dir_msa_out = "/root/Downloads/paper/out_fasta_hits_msa_output"
filter_blast_script_path = "/root/Downloads/paper/scripts/filter_blast.py"
homo_fasta_path = "/root/Downloads/paper/data/UP000005640_9606.fasta"
extract_script_path = "/root/Downloads/paper/scripts/extract.sh"
homology_count_output_path = "/root/Downloads/paper/homology_counts.txt"

ch1 = Channel.fromPath(input_pdb_dir)

// Extract a FASTA sequence from each input PDB file
process fasta_extractor {
    errorStrategy 'retry'
    maxErrors 1000
    maxForks 2

    input:
    val(id) from ch1

    output:
    val(id) into ch2

    script:
    def base_filename = id.toString().split("/")[-1]
    """
    mkdir -p ${target_fasta_dir} && ${pdb2fasta_path} ${id} > ${target_fasta_dir}/${base_filename}.fa
    """
}

// BLAST each sequence against the human proteome (note: the database path
// here is hard-coded rather than taken from homo_fasta_path)
process fasta_search {
    errorStrategy 'ignore'

    input:
    val(id) from ch2

    output:
    val(id) into ch3

    script:
    def base_filename = id.toString().split("/")[-1]
    """
    mkdir -p ${target_search_dir} && blastp -query ${target_fasta_dir}/${base_filename}.fa -db /root/Downloads/UP000005640_9606.fasta -out ${target_search_dir}/${base_filename}.blast-out.b6 -outfmt "6 std qseq sseq" -word_size 6 -evalue 0.05 -gapopen 11 -gapextend 1 -max_target_seqs 150
    """
}

// Filter the BLAST hits into an MSA input FASTA
process filter_blast_results {
    errorStrategy 'ignore'

    input:
    val(id) from ch3

    output:
    val(id) into ch4

    script:
    def base_filename = id.toString().split("/")[-1]
    """
    mkdir -p ${target_search_dir_msa} && python ${filter_blast_script_path} ${target_fasta_dir}/${base_filename}.fa ${target_search_dir}/${base_filename}.blast-out.b6 ${homo_fasta_path} ${target_search_dir_msa}/${base_filename}.msa.fa
    """
}

// Align each filtered hit set with MAFFT
process align_with_mafft {
    errorStrategy 'ignore'

    input:
    val(id) from ch4

    output:
    val(id) into ch5

    script:
    def base_filename = id.toString().split("/")[-1]
    """
    mkdir -p ${target_search_dir_msa_out} && mafft ${target_search_dir_msa}/${base_filename}.msa.fa > ${target_search_dir_msa_out}/${base_filename}.msa.fa
    """
}

// Count homologs across all alignments once every file has been processed
process final_step {
    input:
    val _ from ch5.collect()

    script:
    """
    cd ${target_search_dir_msa_out} && ${extract_script_path} > ${homology_count_output_path}
    """
}
@@ -0,0 +1,25 @@
# drug-target-pocket-funnel

This repository contains the scripts, data, and results from the paper "In Silico Target Identification Reveals IL12B as a High-Potential Candidate for Small Molecule Drug Development". It serves as a record of the computational methods used to identify IL12B as a potential drug target for small-molecule development.

Contents:
- Custom scripts used for data processing and analysis
- Databases used in the filtering and analysis process
- Intermediate and final processed data
- A Nextflow pipeline for specific parts of the analysis
- FTMap simulation results for the 1F42 PDB structure

## Technical Details

- The Nextflow script uses absolute paths and was run with Nextflow version 22.04.4 (NXF_VER=22.04.4)
- Miniconda was used to install Python, dependent packages, and the programs required to run the Nextflow pipeline

## Disclaimer

Complete reproduction of the paper's pipeline involves manual steps and requires the Schrödinger Small-Molecule Drug Discovery Suite (and an appropriate license). These are not included in this repository.

Please note that some scripts in this repository may not function properly without the appropriate "sitemap_results.csv", "sitemap_results_with_quality_metrics.csv", and "final_filtered_sites.csv" files. These files contain Schrödinger-specific outputs that have been excluded from the public repository to comply with the terms of Schrödinger's End User License Agreement. The affected scripts are provided for reference purposes only.

Accessing the complete data and code may require a Schrödinger software license. We have endeavored to provide all other non-Schrödinger files necessary to understand and reproduce our work.

Note: This repository is provided for transparency and reproducibility related to the published paper. The scripts and data are specific to this project and its unique workflow.
@@ -0,0 +1,87 @@
import pandas as pd
import numpy as np

# Load DGIdb interactions and keep approved inhibitor interactions
dgidb_interactions = pd.read_table("data/DGIdb_interactions_12_2023.tsv")
dgidb_interactions["Gene name"] = dgidb_interactions["gene_claim_name"]
dgidb_interactions = dgidb_interactions.dropna(subset=["Gene name"])
inhibitor_interactions = dgidb_interactions[dgidb_interactions["interaction_type"] == "inhibitor"]
inhibitor_interactions = inhibitor_interactions[inhibitor_interactions["approved"] == True]
print("DGIdb filter - unique genes remaining:", inhibitor_interactions["Gene name"].unique().shape[0])

# Load UniProt data and merge with the inhibitor interactions
uniprot_data = pd.read_table("data/uniprot-compressed_true_download_true_fields_accession_2Creviewed_2C-2022.08.31-21.45.11.90.tsv")
uniprot_data["Gene name"] = uniprot_data["Gene Names (primary)"]
uniprot_merged = uniprot_data.merge(inhibitor_interactions, on="Gene name")
print("UniProt structure filter - unique genes remaining:", uniprot_merged["Gene name"].unique().shape[0])

# Load Probe Miner data and keep targets without any chemical probes
# (the de-duplicated probe list was precomputed once from the full dump):
# probe_miner_data = pd.read_table("data/probeminer_datadump_2021-06-20.txt")
# probe_miner_data["Entry"] = probe_miner_data["UNIPROT_ACCESSION"]
# unique_probes = pd.DataFrame(probe_miner_data["Entry"].drop_duplicates())
# unique_probes.to_csv("probeminer_2021-06-20_unique_probes.txt")
unique_probes = pd.read_csv("data/probeminer_2021-06-20_unique_probes.csv")
merged_probes = unique_probes.merge(uniprot_merged, on="Entry", indicator=True, how='outer')
right_only_probes = merged_probes[merged_probes["_merge"] == "right_only"]
print("Probe Miner filter - unique genes remaining:", right_only_probes["Gene name"].unique().shape[0])

# Load gnomAD constraint data and keep genes with pLI <= 0.50
gnomad_data = pd.read_table("data/gnomad.v2.1.1.lof_metrics.by_gene.txt")
gnomad_data["Gene name"] = gnomad_data["gene"]
gnomad_filtered = gnomad_data[gnomad_data['pLI'] <= .50]
gnomad_merged = right_only_probes.merge(gnomad_filtered, on="Gene name")
print("gnomAD filter - unique genes remaining:", gnomad_merged["Gene name"].unique().shape[0])

# Load and filter OpenTargets locus-to-gene results, then merge
opentargets_data = pd.read_csv("data/BigQuery OpenTarget locus2gene results -12_22_2023.csv")
opentargets_filtered = opentargets_data[opentargets_data["max_y_proba_full_model"] >= 0.5]
opentargets_merged = opentargets_filtered.merge(gnomad_merged, on="gene_id")
print("OpenTargets filter - unique genes remaining:", opentargets_merged["gene_id"].unique().shape[0])

# Filter out metal-binding proteins
metals = pd.read_table("data/uniprot-annotation_(type_metal)-filtered-proteome_UP000005640+AND+orga--.tab")
metals["Gene name"] = metals["Gene names (primary )"]
opentargets_merged_no_metals = opentargets_merged.drop("_merge", axis=1).merge(metals, on="Gene name", how='outer', indicator=True)
opentargets_merged_no_metals = opentargets_merged_no_metals[opentargets_merged_no_metals["_merge"] == "left_only"]
print("Metals filter - unique genes remaining:", opentargets_merged_no_metals["Gene name"].unique().shape[0])

# Map mouse genes to human and drop genes whose knockout is embryonic lethal
mouse_gene_map = pd.read_table("data/HOM_MouseHumanSequence.rpt")
mouse_gene_map["Gene name"] = mouse_gene_map["Symbol"]
embryonic_lethality = pd.read_table("data/embryonic_lethality_MGIhdpQuery_markers_20230416_020549.txt")
embryonic_lethality["Gene name"] = embryonic_lethality["Gene Symbol"]
lethality_merged = embryonic_lethality.merge(mouse_gene_map, on="Gene name")
lethality_final = lethality_merged.merge(mouse_gene_map, on="DB Class Key")
lethality_final["Gene name"] = lethality_final["Gene name_y"]
embryo_phenotypes = lethality_final[lethality_final["Abnormal Mouse Phenotypes"].str.contains("embryo")]
prefinal_merge = opentargets_merged_no_metals.drop("_merge", axis=1).merge(embryo_phenotypes, on="Gene name", how='outer', indicator=True)
prefinal_filtered = prefinal_merge[prefinal_merge["_merge"] == "left_only"]
print("JAX embryonic lethality filter - unique genes remaining:", prefinal_filtered["Gene name"].unique().shape[0])

# Limit to genes with an abnormal inflammatory response phenotype in mice
inflam_df = pd.read_table("data/MGIhdpQuery_markers_20240430_202514_abnormal_inflammatory_response.txt")
inflam_df2 = inflam_df[inflam_df["Organism"] == "mouse"].copy()
inflam_df2["Gene name"] = inflam_df2["Gene Symbol"]
inflam_merged = inflam_df2.merge(mouse_gene_map, on="Gene name")
inflam_final = inflam_merged.merge(mouse_gene_map, on="DB Class Key")
inflam_final["Gene name"] = inflam_final["Gene name_y"]
inflam_merged = inflam_final.merge(prefinal_filtered, on="Gene name")
print("JAX abnormal inflammation filter - unique genes remaining:", inflam_merged["Gene name"].unique().shape[0])

# Bioinformatics filter: keep only proteins where no other human protein has
# significant primary-sequence homology, and drop proteins longer than
# AlphaFold's single-prediction maximum (whose predictions span multiple files)
bio_df = pd.read_csv("outputs/homology_counts.txt", header=None)
bio_df["Entry_x"] = bio_df[0].str.split("-").str[1].astype('str')
bio_df["Count"] = bio_df[0].str.split(":").str[-1].astype('int')
bio_df2 = bio_df[bio_df["Count"] == 1]  # no homologous proteins beyond the query itself
bio_df3 = bio_df2["Entry_x"].value_counts()
bio_df4 = bio_df3[bio_df3 == 1].reset_index()  # exactly one prediction file per entry
bio_df4.columns = ["Entry_x", "Count2"]
bio_df5 = bio_df4.merge(bio_df2, on="Entry_x")
final_filtered = inflam_merged.merge(bio_df5, on="Entry_x")
print("Bioinformatics filter - unique genes remaining:", final_filtered["Gene name"].unique().shape[0])

# Save the final candidate list
final_filtered = final_filtered.copy()
final_filtered["Entry"] = final_filtered["Entry_x"]
final_filtered[["Gene name", "Entry"]].drop_duplicates().to_csv("outputs/stage1_filtering_gene_candidates.csv", index=None)
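Each exclusion step above relies on the same pandas idiom: an anti-join via merge(..., how='outer', indicator=True), keeping only the "left_only" rows. A minimal sketch with made-up gene names:

import pandas as pd

candidates = pd.DataFrame({"Gene name": ["GENE_A", "GENE_B", "GENE_C"]})
exclusions = pd.DataFrame({"Gene name": ["GENE_B"]})  # e.g. metal binders

merged = candidates.merge(exclusions, on="Gene name", how="outer", indicator=True)
kept = merged[merged["_merge"] == "left_only"]  # candidates not on the exclusion list
print(kept["Gene name"].tolist())               # ['GENE_A', 'GENE_C']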
@@ -0,0 +1,83 @@
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score, average_precision_score, matthews_corrcoef
from sklearn.neighbors import LocalOutlierFactor
from scipy.stats import norm
from imblearn.ensemble import BalancedRandomForestClassifier

df = pd.read_table("data/Halgreen 2009 - Table 6.txt", sep=" ")

features = ["Dscore", "SScore", "size", "enclosure", "philic", "phobic"]

# Cross-validate a balanced random forest at two druggability thresholds
for threshold in [0, 1]:
    df["label"] = (df["categorya"] > threshold).astype(bool)

    X = df[features]
    y = df["label"]

    # Drop outliers from the training table
    lof = LocalOutlierFactor()
    outlier_scores = lof.fit_predict(X.values)
    non_outlier_indices = [i for i, score in enumerate(outlier_scores) if score == 1]
    X_cleaned = X.iloc[non_outlier_indices]
    y_cleaned = y.iloc[non_outlier_indices]

    kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    metrics = []

    for train_index, test_index in kf.split(X_cleaned, y_cleaned):
        X_train, X_test = X_cleaned.iloc[train_index], X_cleaned.iloc[test_index]
        y_train, y_test = y_cleaned.iloc[train_index], y_cleaned.iloc[test_index]

        rf = BalancedRandomForestClassifier(random_state=42)
        rf.fit(X_train, y_train)

        y_pred_rf = rf.predict(X_test)
        y_pred_rf_proba = rf.predict_proba(X_test)[:, 1]

        f1_rf = f1_score(y_test, y_pred_rf)
        mcc_rf = matthews_corrcoef(y_test, y_pred_rf)
        # Approximate significance of the MCC via a normal z-score
        z_score = mcc_rf / (1 - mcc_rf**2) ** 0.5
        mcc_rf_pvalue = 2 * (1 - norm.cdf(abs(z_score)))
        app_rf = average_precision_score(y_test, y_pred_rf_proba)

        metrics.append([mcc_rf, mcc_rf_pvalue, f1_rf, app_rf, threshold, "RF"])

    metrics_df = pd.DataFrame(metrics, columns=["MCC", "MCC p-value", "F1 Score", "Average Precision Score", "Threshold", "Model"])
    metrics_df[["MCC", "Threshold"]].to_csv(f"outputs/cross_validation_data_threshold_{threshold}.csv")

# Fit the final model on the whole table, labeling sites with categorya > 0 as positive
confidence_cut_level = 0.5
X = df[features]
y = df["categorya"] > 0

lof = LocalOutlierFactor()
outlier_scores = lof.fit_predict(X.values)
non_outlier_indices = [i for i, score in enumerate(outlier_scores) if score == 1]
X_cleaned = X.iloc[non_outlier_indices]
y_cleaned = y.iloc[non_outlier_indices]

rf = BalancedRandomForestClassifier(random_state=42)
rf.fit(X_cleaned.values, y_cleaned)

# Novelty-mode LOF, fitted on the cleaned training data, screens unseen targets
lof2 = LocalOutlierFactor(novelty=True)
lof2.fit(X_cleaned.values)

df_targets = pd.read_csv("outputs/sitemap_results_with_quality_metrics.csv")
df_targets["UniProtKB Gene Name ID"] = df_targets["Entry"]

# Keep only sites whose AlphaFold model quality passes the confidence cut
df_targets2 = df_targets[(df_targets["Entire Region Percent High Quality"] >= confidence_cut_level) &
                         (df_targets["Percent High Quality"] >= confidence_cut_level)].copy()

# "SiteScore" in the SiteMap output corresponds to "SScore" in the training table
X_targets = df_targets2[["Dscore", "SiteScore", "size", "enclosure", "philic", "phobic"]]
outlier_scores = lof2.predict(X_targets.values)
X_good_targets = X_targets[outlier_scores > -1]

rf_probs = rf.predict_proba(X_good_targets.values)

df_targets2 = df_targets2[outlier_scores > -1].copy()
df_targets2["rf_probs"] = rf_probs[:, 1]
df_targets3 = df_targets2[df_targets2["rf_probs"] >= confidence_cut_level]

df_gene_map = pd.read_csv("data/uniprot_ensembl_map.txt")
df_merged = df_targets3.merge(df_gene_map, on="UniProtKB Gene Name ID")
df_hq = df_merged.drop_duplicates()

df_hq[["Gene name", "residues", "Entire Region Percent High Quality", "Percent High Quality"]].drop_duplicates().to_csv("outputs/final_filtered_sites.csv", index=None)
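Note that the script uses LocalOutlierFactor in its two distinct modes: the default mode (fit_predict) flags outliers within the fitted data, while novelty mode (fit, then predict) scores unseen points against the fitted data. A small sketch on synthetic data, for orientation:

import numpy as np
from sklearn.neighbors import LocalOutlierFactor

rng = np.random.default_rng(0)
X_train = rng.normal(size=(100, 2))
X_new = np.array([[0.0, 0.0], [8.0, 8.0]])

# Default mode: label each fitted row 1 (inlier) or -1 (outlier)
inlier_mask = LocalOutlierFactor().fit_predict(X_train) == 1
X_clean = X_train[inlier_mask]

# Novelty mode: fit on the cleaned data, then score unseen points
lof = LocalOutlierFactor(novelty=True).fit(X_clean)
print(lof.predict(X_new))  # e.g. [ 1 -1]: the far-away point is flagged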
@@ -0,0 +1,20 @@
import Bio.PDB
import pandas as pd
import os

def getGoods(inputPath):
    namey = os.path.splitext(os.path.split(inputPath)[1])[0]
    p = Bio.PDB.PDBParser()
    structure = p.get_structure(namey, inputPath)

    # Collect (residue number, B-factor) for every atom; in AlphaFold models
    # the B-factor column stores the per-residue pLDDT score
    listy = list()
    for thing in structure.get_residues():
        res_id = thing.get_id()[1]
        for sub_thing in thing.get_atoms():
            val = sub_thing.get_bfactor()
            listy.append((res_id, val))

    df = pd.DataFrame(listy)

    # Keep atoms with pLDDT >= 90 ("very high" confidence); column 0 holds
    # the residue number, so the result has one row per qualifying atom
    goods = df[df[1] >= 90]
    return goods
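A hypothetical usage example (the file name below follows AlphaFold DB naming for IL12B's UniProt accession and is illustrative only, not asserted to be present in pdb_processed/):

from alphafold_err import getGoods

# B-factors in AlphaFold model PDBs hold per-residue pLDDT, so getGoods
# returns the atoms of residues predicted with "very high" confidence.
goods = getGoods("pdb_processed/AF-P29460-F1-model_v4.pdb")  # illustrative path
high_quality_residues = goods[0].drop_duplicates()  # collapse atoms to residues
print(len(high_quality_residues), "residues with pLDDT >= 90")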