Initial release for Pip and BioConda

clinte14 · Jan 25, 2024 · 654cdcd · 654cdcd
1 parent 976deac
commit 654cdcd
Show file tree

Hide file tree

Showing 13 changed files with 91 additions and 81 deletions.
diff --git a/__pycache__/blast_parse.cpython-310.pyc b/__pycache__/blast_parse.cpython-310.pyc
diff --git a/__pycache__/blast_parse.cpython-39.pyc b/__pycache__/blast_parse.cpython-39.pyc
diff --git a/__pycache__/correlation_calcs.cpython-310.pyc b/__pycache__/correlation_calcs.cpython-310.pyc
diff --git a/__pycache__/correlation_calcs.cpython-39.pyc b/__pycache__/correlation_calcs.cpython-39.pyc
diff --git a/__pycache__/create_visuals.cpython-310.pyc b/__pycache__/create_visuals.cpython-310.pyc
diff --git a/__pycache__/create_visuals.cpython-39.pyc b/__pycache__/create_visuals.cpython-39.pyc
diff --git a/__pycache__/housekeeping.cpython-310.pyc b/__pycache__/housekeeping.cpython-310.pyc
diff --git a/__pycache__/housekeeping.cpython-39.pyc b/__pycache__/housekeeping.cpython-39.pyc
diff --git a/blast_parse.py b/blast_parse.py
@@ -80,7 +80,7 @@ def create_pa(hits, flag_values):
     # create 'merged_BLAST.csv' file in /02_PA_Matrix/ containing all values in 'hits' dictionary from parse_merge_BLAST()
     merged_df = pd.DataFrame({ key:pd.Series(value) for key, value in hits.items() })
     merged_df = merged_df.reindex(sorted(merged_df.columns), axis=1)
-    merged_df_dir=os.path.join(flag_values['output'],'02_PA_matrix','merged_BLAST.csv')
+    merged_df_dir=os.path.join(flag_values['output'],'01_PA_matrix','merged_BLAST.csv')
     merged_df.to_csv(merged_df_dir)
 
     print("  --->Wrote 'merged_BLAST.csv' to '{}'".format(merged_df_dir + '\n'))
@@ -106,6 +106,6 @@ def create_pa(hits, flag_values):
             pa_df.at[value,column_name] = 1      
     # Replace nan with zero
     pa_df = pa_df.fillna(value=0)
-    pa_dir = os.path.join(flag_values['output'] ,'02_PA_matrix','pa_matrix.csv')
+    pa_dir = os.path.join(flag_values['output'] ,'01_PA_matrix','pa_matrix.csv')
     pa_df.to_csv(pa_dir)
     print("  --->Wrote 'pa_matrix.csv' to '{}'".format(pa_dir + '\n'))
diff --git a/correlation_calcs.py b/correlation_calcs.py
@@ -9,15 +9,15 @@
 def correlation_calcs(flag_values):
     '''
     '''
-    # Open prescence/abscence matrix from /02_PA_matrix/ directory 
-    with open(os.path.join(flag_values['output'],'02_PA_matrix','pa_matrix.csv'), 'r') as infile:
+    # Open prescence/abscence matrix from /01_PA_matrix/ directory 
+    with open(os.path.join(flag_values['output'],'01_PA_matrix','pa_matrix.csv'), 'r') as infile:
             # Create dataframe from BLAST results, header = row# w/ column names
             pa_df = pd.read_csv(infile, sep=',', header=0)
+    #clean up indexing and names in our pa_df (presence/absence dataframe)
+    first_col = pa_df.columns[0]
+    pa_df = pa_df.rename(columns={first_col : "species"})
 
-    #clean up indexing and names in our pa_df (prescence/abscence dataframe)
-    pa_df = pa_df.rename(columns={"Unnamed: 0" : "species"})
-
-    #an index of gene_names. Do not include indexer row '0' or 'species' column '1' of pa_df (prescence/abscence dataframe)
+    #an index of gene_names. Do not include indexer row '0' or 'species' column '1' of pa_df (presence/abscence dataframe)
     gene_names =  pa_df.dtypes.index[1:]
 
     #this function calculates all the terms needed for pearson correlation
@@ -50,7 +50,7 @@ def correlation_calcs(flag_values):
             Pjj = Rij_inverse_df.loc[column_to_right_name, column_to_right_name]
             #print("Pij=", Pij, "Pii=", Pii, "Pjj=", Pjj)
             Wij_df.at[column_name, column_to_right_name] = -((Pij) / (math.sqrt(Pii * Pjj)))
-    Wij_df.to_csv(os.path.join(flag_values['output'],'03_correlation_calcs','Wij_df_triangle.csv'))
+    Wij_df.to_csv(os.path.join(flag_values['output'],'02_correlation','Wij_df_triangle.csv'))
     print("->Wrote 'Wij_df_triangle.csv' to '{}".format(os.path.join(flag_values['output']),"04_correlog_values..") + '\n') 
 
 
@@ -64,7 +64,7 @@ def correlation_calcs(flag_values):
             else:
                 Wij_df.loc[row, column] = Wij_df.loc[column, row]
 
-    Wij_df.to_csv(os.path.join(flag_values['output'],'03_correlation_calcs','Wij_df.csv'))
+    Wij_df.to_csv(os.path.join(flag_values['output'],'02_correlation','Wij_df.csv'))
 
     #need print statement here to console
     return Wij_df
@@ -73,7 +73,7 @@ def calc_mrs(Wij_df, flag_values):
     Wij_outerDict = {}
     gene_names =  Wij_df.dtypes.index
 
-    f = open(os.path.join(flag_values['output'],'04_correlog_values',"Wij_matrix.csv"), "w")
+    f = open(os.path.join(flag_values['output'],'02_correlation',"Wij_matrix.csv"), "w")
     writer = csv.writer(f)
 
     network_list=[]
@@ -136,7 +136,7 @@ def calc_pearson_terms(flag_values, pa_df, gene_names):
             print("  --->", column_to_right_name, Cij)
             Cij_df.at[column_name, column_to_right_name] = Cij
     # write Cij_df to disk
-    cij_dir=os.path.join(flag_values['output'],'03_correlation_calcs','Cij_df.csv')
+    cij_dir=os.path.join(flag_values['output'],'02_correlation','Cij_df.csv')
     Cij_df.to_csv(cij_dir)
     print("Wrote 'Cij_df.csv' aka # species with both gene 'i' & gene 'j' to '{}".format(cij_dir + '\n')) 
     # Debug print(Cij, index, df.loc[index, "species"]) 
@@ -163,7 +163,7 @@ def pearson_corr_calc(flag_values, pa_df, gene_names, E_i, N, Cij_df):
             Rij_df.at[column_name, column_to_right_name] = ((current_Cij * N) - (current_E_i * current_E_j)) / (
                     math.sqrt(current_E_i) * math.sqrt(current_E_j) * math.sqrt(N-current_E_i) * math.sqrt(N-current_E_j))
     # write Rij_df to disk
-    rij_triangle_dir=os.path.join(flag_values['output'],'03_correlation_calcs','Rij_df_triangle.csv')
+    rij_triangle_dir=os.path.join(flag_values['output'],'02_correlation','Rij_df_triangle.csv')
     Rij_df.to_csv(rij_triangle_dir)
     print("->Wrote 'Rij_df_triangle.csv' aka Pearson Correlations to '{}".format(rij_triangle_dir + '\n'))
 
@@ -177,15 +177,15 @@ def pearson_corr_calc(flag_values, pa_df, gene_names, E_i, N, Cij_df):
             else:
                 Rij_df.loc[row, column] = Rij_df.loc[column, row]
     # write Rij_df symmetrical to disk
-    rij_sym_dir=os.path.join(flag_values['output'], '03_correlation_calcs', 'Rij_df_symmetrical.csv')
+    rij_sym_dir=os.path.join(flag_values['output'], '02_correlation', 'Rij_df_symmetrical.csv')
     Rij_df.to_csv(rij_sym_dir)
     print("->Wrote 'Rij_df_symmetrical.csv' to '{}".format(rij_sym_dir + '\n'))
 
     # Create inverse matrix from symmetrical Rij matrix
     Rij_inverse_df = pd.DataFrame(np.linalg.pinv(Rij_df.values), Rij_df.columns, Rij_df.index)
 
     # write Rij_inverse_df to disk
-    Rij_inverse_dir=os.path.join(flag_values['output'],'03_correlation_calcs','Rij_inverse_df.csv')
+    Rij_inverse_dir=os.path.join(flag_values['output'],'02_correlation','Rij_inverse_df.csv')
     Rij_inverse_df.to_csv(Rij_inverse_dir)
     print("->Wrote 'Rij_inverse_df.csv' to '{}".format(Rij_inverse_dir + '\n'))
 

diff --git a/create_visuals.py b/create_visuals.py
@@ -20,42 +20,51 @@
 def create_network_map(flag_values, network_list):
     #breakpoint()
     for count, value in enumerate(network_list):
-        network_list[count][2] = '{:.4f}'.format(value[2])
-
-    # Create 'dot' engine style graphviz network visual
-    network = graphviz.Digraph('Maximum_Related_Networks_dot', engine='dot', comment='')
-
-    for i in network_list:
-        edge_thickness = str(i[2]*5)
-        correlog_score = i[2]
-        network.edge(i[0], i[1], penwidth=edge_thickness, xlabel=correlog_score)
-
-    network.render(directory=os.path.join(flag_values['output'], '05_final_outputs'))
+        network_list[count][2] = '{:.3f}'.format(value[2])
 
     #DG = nx.DiGraph()
     #DG.add_weighted_edges_from(network_list)
     nx_graph = nx.DiGraph()
     for item in network_list:
         nx_graph.add_node(item[0])
     for item in network_list:
-        nx_graph.add_edge(item[0],item[1],label=item[2])
+        #add weighted line
+        if float(item[2]) > 0:
+            nx_graph.add_edge(item[0],item[1],label=item[2],weight=float(item[2])*10)
+        else:
+            nx_graph.add_edge(item[0],item[1],label=item[2],weight=1)
 
     nt = Network('800px', '1080px')
     nt.from_nx(nx_graph)
     nt.toggle_physics(True)
     nt.show_buttons(True)
-    os.chdir(os.path.join(flag_values['output'], '05_final_outputs'))
+    os.chdir(os.path.join(flag_values['output'], '03_visual_output'))
     nt.write_html("MSR_network.html",notebook=False)
 
+    # Create 'dot' engine style graphviz network visual
+    network = graphviz.Digraph('Maximum_Related_Networks_dot', engine='dot', comment='')
+
+    for i in network_list:
+        co_score = i[2]
+        if float(i[2]) <= 0:
+            edge_thickness = 1      
+        else:
+            edge_thickness = int(i[2].split('.')[-1][0])
+        network.edge(i[0], i[1], penwidth=str(edge_thickness), xlabel=co_score, dir='none')
+    network.render(directory=os.path.join(flag_values['output'], '03_visual_output'))
+
     # Create 'circo' engine style graphviz network visual
     network = graphviz.Digraph('Maximum_Related_Networks_circo', engine='circo', comment='') 
 
     for i in network_list:
-        edge_thickness = str(i[2]*5)
-        correlog_score = correlog_score = i[2]
-        network.edge(i[0], i[1], penwidth=edge_thickness, xlabel=correlog_score)
+        co_score = i[2]
+        if float(i[2]) <= 0:
+            edge_thickness = 1      
+        else:
+            edge_thickness = int(i[2].split('.')[-1][0])
+        network.edge(i[0], i[1], penwidth=str(edge_thickness), xlabel=co_score, dir='none')
 
-    network.render(directory=os.path.join(flag_values['output'], '05_final_outputs'))
+    network.render(directory=os.path.join(flag_values['output'], '03_visual_output'))
 
 
     #need print statement here to console
@@ -92,16 +101,17 @@ def create_heatmap(Wij_df, flag_values):
         vmax=1,       # The maximum value of the legend. All higher vals will be same color
         vmin=-1,      # The minimum value of the legend. All lower vals will be same color
         center=0,      # The center value of the legend. With divergent cmap, where white is
-        square=True,   # Force cells to be square
+        square=False,   # Force cells to be square
         linewidths=.5, # Width of lines that divide cells
         cbar_kws={'label' : 'Co-Occurrence Value','shrink': .75}  # Extra kwargs for the legend; 
-        # in this case, shrink by 50%
+        # in this case, shrink
     )
 
     res.set_xticklabels(res.get_xmajorticklabels(), fontsize = 12)
     res.set_yticklabels(res.get_ymajorticklabels(), fontsize = 12)
     res.yaxis.label.set_size(20)
 
-    heatmap_dir = os.path.join(flag_values['output'], '05_final_outputs','heatmap.png')
+    heatmap_dir = os.path.join(flag_values['output'], '03_visual_output','heatmap.png')
+    fig.tight_layout()
     fig.savefig(heatmap_dir)
     print('Heatmap written to: {}'.format(heatmap_dir) + '\n')
diff --git a/housekeeping.py b/housekeeping.py
@@ -3,16 +3,21 @@
 import pandas as pd
 import sys
 def create_folders(dir_path):
-    dirs = ['00_log',
-            '01_blast_input', 
-            '02_PA_matrix', 
-            '03_correlation_calcs', 
-            '04_correlog_values', 
-            '05_final_outputs']
+    dirs = ['00_pre_process',
+            '01_PA_matrix', 
+            '02_correlation', 
+            '03_visual_output']
+
+    if not os.path.exists(dir_path):
+        os.mkdir(dir_path)
+        print("Checking directory '{}'... ".format(dir_path))
+        print('  --->Directory Created')
+    else:
+        print("Checking directory '{}'... ".format(dir_path))
+        print('  --->Directory Already Exists')
 
     for current_dir in dirs:
         current_path = os.path.join(dir_path,current_dir)
-
         if not os.path.exists(current_path):
             os.mkdir(current_path)
             print("Checking directory '{}'... ".format(current_path))
@@ -26,11 +31,11 @@ def create_folders(dir_path):
 def parse_flags():
     dir_path = os.path.dirname(os.path.realpath('__file__'))  
 
-    parser = argparse.ArgumentParser(description="GeneCoOccurence", prog='gco')
+    parser = argparse.ArgumentParser(description="GeneCoOccurence v0.1. Please see https://github.com/clinte14/GeneCoOccurrence", prog='gco')
 
     parser.add_argument("-i", "--input", 
         required=True, 
-        help="File with BLAST results containing homologues in .xml format. REQUIRED.")
+        help="REQUIRED. Input file with either BLAST results in .xml format OR a comma-seperated presence/absence .csv file.")
 
     parser.add_argument("-o", "--output", 
         default=dir_path,
@@ -54,19 +59,21 @@ def parse_flags():
         arg_dict['common_name'] = os.path.abspath(arg_dict['common_name'])
 
     print("dir_path: " + str(dir_path))
+    arg_dict['command'] = " ".join(sys.argv)
+
     return arg_dict
 
 
-# save flags options and paramaters to "command.txt" file in 00_Settings folder
+# save flags options and paramaters to "log.txt" file in 00_Settings folder
 def save_flags(flags):
-    with open(os.path.join(flags['output'],"00_log",'command.txt'), 'w') as file:
+    with open(os.path.join(flags['output'],"00_pre_process",'log.txt'), 'w') as file:
 
         currentDT = datetime.datetime.now()
 
         file.write("Executed the following in " + flags['output'] + " at " \
-            + str(currentDT) + ":\n" + json.dumps(flags))
+            + str(currentDT) + ":\n\n" + flags['command'])
 
-    print("Writing flags to 'command.txt' in '{}'... ".\
+    print("Writing flags to 'log.txt' in '{}'... ".\
         format(os.path.join(flags['output'],"00_settings")) )
 
     print("  --->Flags written\n")

diff --git a/main.py b/main.py
@@ -1,9 +1,14 @@
 #!/usr/bin/env python3
 # -*- coding: utf-8 -*-
 """   
-See README.md @ https://github.com/clinte14/correlogy
+See README.md @ https://github.com/clinte14/GeneCoOccurrence
 Contact: [email protected] or [email protected]
 """
+# Fixing text so it's not cutoff on the heatmap is on my to do list as well
+# Fix log to show actual command line entry, not a list
+import sys
+import os
+import shutil
 import pandas as pd
 from housekeeping import create_folders, parse_flags, save_flags, convert_protID_to_common_names
 from blast_parse import parse_merge_BLAST, create_pa, clean_BLAST_xml
@@ -17,49 +22,37 @@ def main():
 
     # Create folders for project in path defined by -o flag. Default is current directory.
     create_folders(flag_values['output'])
-
-    # Save/record flags options and paramaters to "command.txt" file in 00_settings folder
+    # Save/record flags options and parameters to "command.txt" file in 00_settings folder
     save_flags(flag_values)
 
-    '''
-    Bug workaround for NCBIWWW. Occasionally NCBI BLAST online writes 'CREATE_VIEW' into 
-    the BLAST .xml output with no associated '<>' tags, and this causes xml parsing in 
-    Biopython to fail. See: https://github.com/biopython/biopython/issues/3342
-    '''
-    clean_BLAST_xml(flag_values)
-
-    # Parse XML files located in 01_BLAST_results folder.
-    hits = parse_merge_BLAST(flag_values)
+    if flag_values['input'].split('.')[-1]=='xml':    
+        # Parse XML files located in 01_BLAST_results folder.
+        hits = parse_merge_BLAST(flag_values)
 
-
-    # Converts BLAST query protein ID's to common gene names using .csv if -c flag was invoked.
-    convert_protID_to_common_names(flag_values, hits)
+        # Converts BLAST query protein ID's to common gene names using .csv if -c flag was invoked.
+        convert_protID_to_common_names(flag_values, hits)
 
-    #write hits dictionary to Pandas DF, query gene is column and hits are rows. Write dataframe to 02_PA_matrix      
-    #folder as PA_matrix.csv.
-    create_pa(hits, flag_values)
+        #write hits dictionary to Pandas DF, query gene is column and hits are rows. Write dataframe to 02_PA_matrix      
+        #folder as PA_matrix.csv.
+        create_pa(hits, flag_values)
 
+    elif flag_values['input'].split('.')[-1]=='csv':
+
+        pa_dir = os.path.join(flag_values['output'] ,'01_PA_matrix','pa_matrix.csv')
+        shutil.copyfile(flag_values['input'],pa_dir)
+        print("  --->Copied custom matrix '{}' to '{}'".format(flag_values['input'],pa_dir + '\n'))
+
+    else:
+        print("FATAL ERROR. Input must be either BLAST formatted '.xml' file or comma-seperated presence/absence '.csv' file")
+        sys.exit(0)
 
     #calculate Cij (# species common to gene i & j), Rij (Pearson Correlation)
     #and Wij (Partial Correlation)
-    Wij_df= correlation_calcs(flag_values)
+    Wij_df = correlation_calcs(flag_values)
     network_list = calc_mrs(Wij_df, flag_values)
 
     create_network_map(flag_values, network_list)
     create_heatmap(Wij_df, flag_values)
 
 if __name__ == "__main__":
-    main()
-
-'''
-To do:
-In command.txt add full keyboard input command
-In command.txt clean up writing of input, output, -c file
-Copy .xml input file to 01 folder
-For two nodes that point to each other, only show one correlog score
-In MRS visuals: Add color that corresponds with heatmap colors/ improve edge thickness
-Clean up notes to PEP-8 standard
-Add help info for each function
-Standardize output text
-Print statements relating to graphical MRS creation/generation
-'''
+    main()