From 2bd7f6c68aaec010ce1c942d059bb70a1ba9dcc0 Mon Sep 17 00:00:00 2001
From: ctrlaltaf <altafayyubibarelvi@gmail.com>
Date: Wed, 28 Aug 2024 13:28:56 -0700
Subject: [PATCH] added go term sampling

---
 main.py           |   8 +-
 tools/workflow.py | 372 +++++++++++++++++++++++++++++++++-------------
 2 files changed, 274 insertions(+), 106 deletions(-)

diff --git a/main.py b/main.py
index 40bf7a6..1129d43 100644
--- a/main.py
+++ b/main.py
@@ -80,7 +80,7 @@ def main():
     repeats = 1
     new_random_lists = True
     print_graphs = True
-    no_inferred_edges = True
+    no_inferred_edges = False
     go_term_type = [namespace[0],namespace[1],namespace[2]]
     # sample_size: number of samples chosen for positive/negative lists (total is 2xsample_size)
     # repeats: number of times to run all algorithms to obtain an average
@@ -123,15 +123,15 @@ def main():
             short_name = short_name + "_cel"
 
     interactome_columns = [0, 1]
-    interactome = read_specific_columns(elegans_interactome_path, interactome_columns, ",")
-    regulatory_interactome = read_specific_columns(elegans_reg_path, interactome_columns, ",")
+    interactome = read_specific_columns(fly_interactome_path, interactome_columns, ",")
+    regulatory_interactome = read_specific_columns(fly_reg_path, interactome_columns, ",")
     go_inferred_columns = [0, 2, 3]
     #Adds relationship_type column
     if no_inferred_edges: 
         go_inferred_columns.append(1)
         
     go_protein_pairs = read_pro_go_data(
-        elegans_go_association_mixed_path, go_inferred_columns, go_term_type, ","
+        fly_go_association_mixed_path, go_inferred_columns, go_term_type, ","
     )
     #Uses relationship_type column to sort through which proGO edges are inferred 
     if no_inferred_edges:
diff --git a/tools/workflow.py b/tools/workflow.py
index 644511b..b710b9c 100644
--- a/tools/workflow.py
+++ b/tools/workflow.py
@@ -22,7 +22,6 @@
 import networkx as nx
 
 
-
 def run_workflow(
     algorithm_classes,
     go_protein_pairs,
@@ -51,7 +50,7 @@ def run_workflow(
     output_data_path {Path} : path of the output data
     output_image_path {Path} : path of the output image
     repeats {int} : the number of experiment repetitions
-    new_random_list {bool} : flag True to generate completely new pos/neg lists, False to use pre-existing ones 
+    new_random_list {bool} : flag True to generate completely new pos/neg lists, False to use pre-existing ones
     name {str} : a string of namespaces chosen to be used in the sample
     figure {bool} : true if graphs should be printed (any), false if not
 
@@ -61,7 +60,7 @@ def run_workflow(
     G = import_graph_from_pickle(graph_file_path)
     x = repeats  # Number of replicates
     print_graphs = figure
-    
+
     if x > 1:
         print_graphs = False
     auc = {}
@@ -69,7 +68,7 @@ def run_workflow(
     for i in algorithm_classes.keys():
         auc[i] = [[], []]
 
-    #Sorts through replicates in directory and returns number of dataset pairs, needs to be formatted and ordered corectly
+    # Sorts through replicates in directory and returns number of dataset pairs, needs to be formatted and ordered corectly
     if new_random_lists == False:
         x = use_existing_samples(dataset_directory_path)
         if x == 1:
@@ -77,21 +76,26 @@ def run_workflow(
         else:
             print_graphs = False
 
-    #Remvoes all old positive/negative lists and generates completely new positive and negative lists for every replicate, regardless of if the file already exists or not
+    # Remvoes all old positive/negative lists and generates completely new positive and negative lists for every replicate, regardless of if the file already exists or not
     else:
         remove_samples(dataset_directory_path)
         for i in range(x):
-            positive_dataset, negative_dataset = sample_data_neighbor(
-                go_protein_pairs, sample_size, protein_list, G, dataset_directory_path, i, name
+            positive_dataset, negative_dataset = sample_data_signalling(
+                go_protein_pairs,
+                sample_size,
+                protein_list,
+                G,
+                dataset_directory_path,
+                i,
+                name,
             )
 
-    
     for i in range(
         x
     ):  # Creates a pos/neg list each replicate then runs workflow like normal
         if x > 1:
             print("\n\nReplicate: " + str(i) + "\n")
-    
+
         results = run_experiement(
             algorithm_classes,
             dataset_directory_path,
@@ -103,25 +107,25 @@ def run_workflow(
             i,
             name,
         )
-        
+
         # each loop adds the roc and pr values for the algorithm, index 0 for roc and 1 for pr
         for i in algorithm_classes.keys():
             auc[i][0].append(results[i]["roc_auc"])
             auc[i][1].append(results[i]["pr_auc"])
 
-    #Creates a dictionary for all pr values and all roc values and fills it with auc values 
+    # Creates a dictionary for all pr values and all roc values and fills it with auc values
     roc = {}
     pr = {}
 
     for i in algorithm_classes.keys():
         roc[i] = auc[i][0]
         pr[i] = auc[i][1]
-    
+
     if x > 1:
         cols = []
         for i in range(x):
             cols.append("Replicate " + str(i))
-        
+
         # Finds mean and sd of values, ROC mean index 0, ROC sd index 1, PR mean index 2, and PR sd index 3
         for i in auc.keys():
             meanROC = round(stat.mean(auc[i][0]), 5)
@@ -151,31 +155,15 @@ def run_workflow(
     else:
         cols = ["AUC"]
 
-    #Below saves full list of roc and pr values
-    dfr = pd.DataFrame.from_dict(
-        roc,
-        orient = 'index',
-        columns = cols
-    )
+    # Below saves full list of roc and pr values
+    dfr = pd.DataFrame.from_dict(roc, orient="index", columns=cols)
 
-    dfp = pd.DataFrame.from_dict(
-        pr,
-        orient = 'index',
-        columns = cols
-    )
-        
-    dfr.to_csv(
-        Path(output_data_path, "roc_auc_results.csv"),
-        index = True,
-        sep = "\t"
-    )
-    
-    dfp.to_csv(
-        Path(output_data_path, "pr_auc_results.csv"),
-        index = True,
-        sep = "\t"
-    )
-    #prints boxplots for replicate values and barplots for replicates with only one algorithm
+    dfp = pd.DataFrame.from_dict(pr, orient="index", columns=cols)
+
+    dfr.to_csv(Path(output_data_path, "roc_auc_results.csv"), index=True, sep="\t")
+
+    dfp.to_csv(Path(output_data_path, "pr_auc_results.csv"), index=True, sep="\t")
+    # prints boxplots for replicate values and barplots for replicates with only one algorithm
     if x > 1 & figure == True:
         if len(roc.keys()) == 1:
             replicate_barplot_only_one_algorithm(roc, output_image_path, True)
@@ -184,6 +172,7 @@ def run_workflow(
             replicate_boxplot(roc, output_image_path, True)
             replicate_boxplot(pr, output_image_path, False)
 
+
 def run_experiement(
     algorithm_classes,
     input_directory_path,
@@ -220,7 +209,12 @@ def run_experiement(
         print("")
         print(f"{i} / {len(algorithm_classes)}: {algorithm_name} Algorithm")
         current = run_algorithm(
-            algorithm_class, input_directory_path, graph_file_path, output_data_path, rep_num, name,
+            algorithm_class,
+            input_directory_path,
+            graph_file_path,
+            output_data_path,
+            rep_num,
+            name,
         )
         current = run_metrics(current)
         results[algorithm_name] = current
@@ -255,7 +249,7 @@ def run_algorithm(
     output_data_path {Path} : path of the output data
     rep_num {int} : replicate number to use associated pos/neg dataset
     name {str} : namespaces used to create the sample datasets
-    
+
 
     Returns:
     Result {dict} : a dictionary that stores the y_true and y_score values of the algorithm
@@ -265,7 +259,11 @@ def run_algorithm(
 
     # Predict using the algorithm
     y_score, y_true = algorithm.predict(
-        input_directory_path, graph_file_path, output_data_path, rep_num, name,
+        input_directory_path,
+        graph_file_path,
+        output_data_path,
+        rep_num,
+        name,
     )
 
     # Access y_true and y_score attributes for evaluation
@@ -383,10 +381,32 @@ def generate_figures(algorithm_classes, results, output_image_path, output_data_
 
     # colors = generate_random_colors(len(algorithm_classes))
     col_names = ["ON", "ON2", "ON3", "PD", "PD2", "PD3", "SA", "HD", "HD2"]
-    colors = ["lightcoral", "indianred", "firebrick", "peachpuff", "sandybrown", "peru", "gold", "goldenrod", "darkgoldenrod", "yellowgreen", "olivedrab", "darkolivegreen", "paleturquoise", "mediumturquoise", "darkcyan", "mediumpurple", "darkviolet", "rebeccapurple", "hotpink", "deeppink", "mediumvioletred"]
+    colors = [
+        "lightcoral",
+        "indianred",
+        "firebrick",
+        "peachpuff",
+        "sandybrown",
+        "peru",
+        "gold",
+        "goldenrod",
+        "darkgoldenrod",
+        "yellowgreen",
+        "olivedrab",
+        "darkolivegreen",
+        "paleturquoise",
+        "mediumturquoise",
+        "darkcyan",
+        "mediumpurple",
+        "darkviolet",
+        "rebeccapurple",
+        "hotpink",
+        "deeppink",
+        "mediumvioletred",
+    ]
     len_keys = len(algorithm_classes)
-    ran = random.randrange(len(colors)-len_keys)
-    colors = colors[ran:ran+len_keys]
+    ran = random.randrange(len(colors) - len_keys)
+    colors = colors[ran : ran + len_keys]
 
     sorted_results = sort_results_by(results, "roc_auc", output_data_path)
     # Initialize your parameters
@@ -437,7 +457,9 @@ def generate_figures(algorithm_classes, results, output_image_path, output_data_
     plt.show()
 
 
-def sample_data(go_protein_pairs, sample_size, protein_list, G, input_directory_path, num, name):
+def sample_data(
+    go_protein_pairs, sample_size, protein_list, G, input_directory_path, num, name
+):
     """
     Given a sample size, generate positive nad negative datasets.
 
@@ -475,21 +497,30 @@ def sample_data(go_protein_pairs, sample_size, protein_list, G, input_directory_
         i += 1
     positive_df = pd.DataFrame(positive_dataset)
     negative_df = pd.DataFrame(negative_dataset)
-    
+
     positive_df.to_csv(
-        Path(input_directory_path, "rep_" + str(num) + "_positive_protein_go_term_pairs" + name + ".csv"),
+        Path(
+            input_directory_path,
+            "rep_" + str(num) + "_positive_protein_go_term_pairs" + name + ".csv",
+        ),
         index=False,
         sep="\t",
     )
     negative_df.to_csv(
-        Path(input_directory_path, "rep_" + str(num) + "_negative_protein_go_term_pairs" + name + ".csv"),
+        Path(
+            input_directory_path,
+            "rep_" + str(num) + "_negative_protein_go_term_pairs" + name + ".csv",
+        ),
         index=False,
         sep="\t",
     )
 
     return positive_dataset, negative_dataset
 
-def sample_data_neighbor(go_protein_pairs, sample_size, protein_list, G, input_directory_path, num, name):
+
+def sample_data_neighbor(
+    go_protein_pairs, sample_size, protein_list, G, input_directory_path, num, name
+):
     """
     Given a sample size, generate positive nad negative datasets.
 
@@ -524,14 +555,92 @@ def sample_data_neighbor(go_protein_pairs, sample_size, protein_list, G, input_d
         i += 1
     positive_df = pd.DataFrame(positive_dataset)
     negative_df = pd.DataFrame(negative_dataset)
-    
+
     positive_df.to_csv(
-        Path(input_directory_path, "rep_" + str(num) + "_positive_protein_go_term_pairs" + name + ".csv"),
+        Path(
+            input_directory_path,
+            "rep_" + str(num) + "_positive_protein_go_term_pairs" + name + ".csv",
+        ),
         index=False,
         sep="\t",
     )
     negative_df.to_csv(
-        Path(input_directory_path, "rep_" + str(num) + "_negative_protein_go_term_pairs" + name + ".csv"),
+        Path(
+            input_directory_path,
+            "rep_" + str(num) + "_negative_protein_go_term_pairs" + name + ".csv",
+        ),
+        index=False,
+        sep="\t",
+    )
+
+    return positive_dataset, negative_dataset
+
+
+def sample_data_signalling(
+    go_protein_pairs, sample_size, protein_list, G, input_directory_path, num, name
+):
+    """
+    Given a sample size, generate positive nad negative datasets.
+
+    Parameters:
+
+    go_protein_pairs {list} : a list containing the edge between a protein and a go-term e.g. [[protein1, go_term1], [protein2, go_term2], ...]
+    sample_size {int} : the size of a positive/negative dataset to be sampled
+    protein_list {list} : a list of all proteins in the graph
+    G {nx.Graph} : graph that represents the interactome and go term connections
+    input_directory_path {Path} : Path to directory of the datasets
+    num {int} : Number of positive/negative dataset
+    name {str} : shorthand for all namespaces used to generate datasets, adds shorthand to .csv name
+
+    Returns:
+    positive_dataset, negative_dataset
+
+    """
+    positive_dataset = {"protein": [], "go": []}
+    negative_dataset = {"protein": [], "go": []}
+    # sample the data
+    for edge in sample(list(go_protein_pairs), sample_size):
+        positive_dataset["protein"].append(edge[0])
+        positive_dataset["go"].append(edge[1])
+
+    signalling_go_term_list = [
+        "GO:0004888",
+        "GO:0038023",
+        "GO:0005102",
+        "GO:0023051",
+        "GO:0030546",
+        "GO:0030545",
+    ]
+
+    for go_term in signalling_go_term_list:
+        neighbor_list = get_neighbors(G, go_term, ["protein_go_term"])
+        for protein in neighbor_list:  
+            positive_dataset["protein"].append(protein)
+            positive_dataset["go"].append(go_term)
+    i = 1
+
+    for protein, go in zip(positive_dataset["protein"], positive_dataset["go"]):
+        closest_neighbor = find_closest_neighbor_without_edge(G, protein, go)
+        negative_dataset["protein"].append(closest_neighbor)
+        negative_dataset["go"].append(go)
+        print_progress(i, sample_size)
+        i += 1
+    positive_df = pd.DataFrame(positive_dataset)
+    negative_df = pd.DataFrame(negative_dataset)
+
+    positive_df.to_csv(
+        Path(
+            input_directory_path,
+            "rep_" + str(num) + "_positive_protein_go_term_pairs" + name + ".csv",
+        ),
+        index=False,
+        sep="\t",
+    )
+    negative_df.to_csv(
+        Path(
+            input_directory_path,
+            "rep_" + str(num) + "_negative_protein_go_term_pairs" + name + ".csv",
+        ),
         index=False,
         sep="\t",
     )
@@ -557,7 +666,15 @@ def get_datasets(input_directory_path, rep_num, name):
     negative_dataset = {"protein": [], "go": []}
     try:
         with open(
-            Path(input_directory_path, "rep_" + str(rep_num) + "_positive_protein_go_term_pairs" + name + ".csv"), "r"
+            Path(
+                input_directory_path,
+                "rep_"
+                + str(rep_num)
+                + "_positive_protein_go_term_pairs"
+                + name
+                + ".csv",
+            ),
+            "r",
         ) as file:
             next(file)
             for line in file:
@@ -565,11 +682,18 @@ def get_datasets(input_directory_path, rep_num, name):
                 positive_dataset["protein"].append(parts[0])
                 positive_dataset["go"].append(parts[1])
     except:
-        print(Fore.RED + "\nFile not found: ensure given namespaces match positive and negative file namespaces\n")
+        print(
+            Fore.RED
+            + "\nFile not found: ensure given namespaces match positive and negative file namespaces\n"
+        )
         sys.exit(1)
 
     with open(
-        Path(input_directory_path, "rep_" + str(rep_num) + "_negative_protein_go_term_pairs" + name + ".csv"), "r"
+        Path(
+            input_directory_path,
+            "rep_" + str(rep_num) + "_negative_protein_go_term_pairs" + name + ".csv",
+        ),
+        "r",
     ) as file:
         next(file)
         for line in file:
@@ -614,6 +738,7 @@ def sort_results_by(results, key, output_path):
         sorted_results[algorithm[0]] = results[algorithm[0]]
     return sorted_results
 
+
 def remove_samples(dataset_directory_path):
     """
     Removes all old samples before creating new ones (to ensure no issues when changing namespaces)
@@ -622,29 +747,27 @@ def remove_samples(dataset_directory_path):
 
     x {int}: the number of replicates
     dataset_directory_path {Path}: path to the dataset directory where samples are stored
-    
+
     Returns:
     Null
 
     """
     data_dir = sorted(os.listdir(dataset_directory_path))
-    remove = [] 
-    files = {'positive': {},
-       'negative': {}
-        } 
+    remove = []
+    files = {"positive": {}, "negative": {}}
     for i in data_dir:
         temp = i.split("_")
-        if temp[0] == 'rep':
+        if temp[0] == "rep":
             rep_num = int(temp[1])
-            if temp[2] == 'positive':
-                files['positive'][rep_num] = i
+            if temp[2] == "positive":
+                files["positive"][rep_num] = i
                 remove.append(rep_num)
-            elif temp[2] == 'negative':
-                files['negative'][rep_num] = i
-                
+            elif temp[2] == "negative":
+                files["negative"][rep_num] = i
+
     for i in remove:
-        del_file_path_pos = Path(dataset_directory_path, files['positive'][i])
-        del_file_path_neg = Path(dataset_directory_path, files['negative'][i])
+        del_file_path_pos = Path(dataset_directory_path, files["positive"][i])
+        del_file_path_neg = Path(dataset_directory_path, files["negative"][i])
         os.remove(del_file_path_pos)
         os.remove(del_file_path_neg)
 
@@ -656,23 +779,24 @@ def use_existing_samples(dataset_directory_path):
     Parameters:
 
     dataset_directory_path {Path}: path to the dataset directory where samples are stored
-    
+
     Returns:
     the number of repititions in the directory {int}
 
     """
     data_dir = sorted(os.listdir(dataset_directory_path))
-    nums = [] 
+    nums = []
     for i in data_dir:
         temp = i.split("_")
-        if temp[0] == 'rep':
+        if temp[0] == "rep":
             rep_num = int(temp[1])
-            if temp[2] == 'positive':
+            if temp[2] == "positive":
                 nums.append(rep_num)
     nums = sorted(nums)
     n = len(nums)
     return n
 
+
 def replicate_boxplot(auc_list, output_image_path, curve):
     """
     Creates a boxplot using replicates of the AUC value for ROC or PR curves
@@ -681,8 +805,8 @@ def replicate_boxplot(auc_list, output_image_path, curve):
 
     auc_list {dict}: either ROC or PR dictionary containing the list of AUC values for each replicate
     output_image_path {Path} : output path to save the graph
-    curve {bool} : either True for ROC or False for PR 
-    
+    curve {bool} : either True for ROC or False for PR
+
     Returns:
     NULL
 
@@ -690,26 +814,46 @@ def replicate_boxplot(auc_list, output_image_path, curve):
     graph = []
     col_names = []
     for i in list(auc_list.keys()):
-        #Pulled Code from w3r
-        remove_lower = lambda text: re.sub('[a-z]', '', text)
+        # Pulled Code from w3r
+        remove_lower = lambda text: re.sub("[a-z]", "", text)
         col_names.append(remove_lower(i))
-    colors = ["lightcoral", "indianred", "firebrick", "peachpuff", "sandybrown", "peru", "gold", "goldenrod", "darkgoldenrod", "yellowgreen", "olivedrab", "darkolivegreen", "paleturquoise", "mediumturquoise", "darkcyan", "mediumpurple", "darkviolet", "rebeccapurple", "hotpink", "deeppink", "mediumvioletred"]
+    colors = [
+        "lightcoral",
+        "indianred",
+        "firebrick",
+        "peachpuff",
+        "sandybrown",
+        "peru",
+        "gold",
+        "goldenrod",
+        "darkgoldenrod",
+        "yellowgreen",
+        "olivedrab",
+        "darkolivegreen",
+        "paleturquoise",
+        "mediumturquoise",
+        "darkcyan",
+        "mediumpurple",
+        "darkviolet",
+        "rebeccapurple",
+        "hotpink",
+        "deeppink",
+        "mediumvioletred",
+    ]
     len_keys = len(auc_list.keys())
-    ran = random.randrange(len(colors)-len_keys)
-    colors = colors[ran:ran+len_keys]
+    ran = random.randrange(len(colors) - len_keys)
+    colors = colors[ran : ran + len_keys]
     for i in auc_list:
         graph.append(auc_list[i])
-    
+
     fig, ax = plt.subplots()
     ax.set_ylabel("AUC")
-    
-    plot = ax.boxplot(graph, 
-                      patch_artist = True,
-                      labels = col_names)
-    
-    for patch, color in zip(plot['boxes'], colors):
+
+    plot = ax.boxplot(graph, patch_artist=True, labels=col_names)
+
+    for patch, color in zip(plot["boxes"], colors):
         patch.set_facecolor(color)
-    
+
     if curve == True:
         plt.title("ROC replicates")
         plt.savefig(Path(output_image_path, "roc_replicate_boxplot.png"))
@@ -727,8 +871,8 @@ def replicate_barplot_only_one_algorithm(auc_list, output_image_path, curve):
 
     auc_list {dict}: either ROC or PR dictionary containing the list of AUC values for each replicate
     output_image_path {Path} : output path to save the graph
-    curve {bool} : either True for ROC or False for PR 
-    
+    curve {bool} : either True for ROC or False for PR
+
     Returns:
     NULL
 
@@ -739,18 +883,38 @@ def replicate_barplot_only_one_algorithm(auc_list, output_image_path, curve):
     col_names = []
     for i in range(len(graph)):
         col_names.append("Rep " + str(i))
-    colors = ["lightcoral", "indianred", "firebrick", "peachpuff", "sandybrown", "peru", "gold", "goldenrod", "darkgoldenrod", "yellowgreen", "olivedrab", "darkolivegreen", "paleturquoise", "mediumturquoise", "darkcyan", "mediumpurple", "darkviolet", "rebeccapurple", "hotpink", "deeppink", "mediumvioletred"]
+    colors = [
+        "lightcoral",
+        "indianred",
+        "firebrick",
+        "peachpuff",
+        "sandybrown",
+        "peru",
+        "gold",
+        "goldenrod",
+        "darkgoldenrod",
+        "yellowgreen",
+        "olivedrab",
+        "darkolivegreen",
+        "paleturquoise",
+        "mediumturquoise",
+        "darkcyan",
+        "mediumpurple",
+        "darkviolet",
+        "rebeccapurple",
+        "hotpink",
+        "deeppink",
+        "mediumvioletred",
+    ]
     len_keys = len(graph)
-    ran = random.randrange(len(colors)-len_keys)
-    colors = colors[ran:ran+len_keys]
-    
+    ran = random.randrange(len(colors) - len_keys)
+    colors = colors[ran : ran + len_keys]
+
     fig, ax = plt.subplots()
     ax.set_ylabel("AUC")
     my_cmap = plt.get_cmap("viridis")
-    plot = ax.bar(col_names,
-                  graph, 
-                  color = colors)
-    
+    plot = ax.bar(col_names, graph, color=colors)
+
     if curve == True:
         plt.title("ROC replicates")
         plt.savefig(Path(output_image_path, "roc_replicate_barplot.png"))
@@ -759,23 +923,27 @@ def replicate_barplot_only_one_algorithm(auc_list, output_image_path, curve):
         plt.savefig(Path(output_image_path, "pr_replicate_barplot.png"))
     plt.show()
 
+
 def get_neighbors(G: nx.DiGraph, node, edgeTypes):
     res = G.edges(node, data=True)
     neighbors = []
     for edge in res:
         if edge[2]["type"] in edgeTypes:
             neighborNode = [edge[1], edge[2]]
-            neighbors.append(neighborNode) 
+            neighbors.append(neighborNode[0])
 
     return neighbors
 
+
 def find_closest_neighbor_without_edge(G, protein1, go_term1):
     # Use BFS to explore neighbors of protein1
     for neighbor in nx.bfs_tree(G, protein1):
-        if (neighbor != protein1 and 
-            G.nodes[neighbor].get('type') == 'protein' and 
-            not G.has_edge(neighbor, go_term1)):
+        if (
+            neighbor != protein1
+            and G.nodes[neighbor].get("type") == "protein"
+            and not G.has_edge(neighbor, go_term1)
+        ):
             return neighbor
-    
+
     # If all neighbors have an edge to go_term1
-    return None
\ No newline at end of file
+    return None