Skip to content

Commit

Permalink
Initial release for Pip and BioConda
Browse files Browse the repository at this point in the history
  • Loading branch information
clinte14 committed Jan 25, 2024
1 parent 976deac commit 654cdcd
Show file tree
Hide file tree
Showing 13 changed files with 91 additions and 81 deletions.
Binary file removed __pycache__/blast_parse.cpython-310.pyc
Binary file not shown.
Binary file removed __pycache__/blast_parse.cpython-39.pyc
Binary file not shown.
Binary file removed __pycache__/correlation_calcs.cpython-310.pyc
Binary file not shown.
Binary file removed __pycache__/correlation_calcs.cpython-39.pyc
Binary file not shown.
Binary file removed __pycache__/create_visuals.cpython-310.pyc
Binary file not shown.
Binary file removed __pycache__/create_visuals.cpython-39.pyc
Binary file not shown.
Binary file removed __pycache__/housekeeping.cpython-310.pyc
Binary file not shown.
Binary file removed __pycache__/housekeeping.cpython-39.pyc
Binary file not shown.
4 changes: 2 additions & 2 deletions blast_parse.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,7 @@ def create_pa(hits, flag_values):
# create 'merged_BLAST.csv' file in /02_PA_Matrix/ containing all values in 'hits' dictionary from parse_merge_BLAST()
merged_df = pd.DataFrame({ key:pd.Series(value) for key, value in hits.items() })
merged_df = merged_df.reindex(sorted(merged_df.columns), axis=1)
merged_df_dir=os.path.join(flag_values['output'],'02_PA_matrix','merged_BLAST.csv')
merged_df_dir=os.path.join(flag_values['output'],'01_PA_matrix','merged_BLAST.csv')
merged_df.to_csv(merged_df_dir)

print(" --->Wrote 'merged_BLAST.csv' to '{}'".format(merged_df_dir + '\n'))
Expand All @@ -106,6 +106,6 @@ def create_pa(hits, flag_values):
pa_df.at[value,column_name] = 1
# Replace nan with zero
pa_df = pa_df.fillna(value=0)
pa_dir = os.path.join(flag_values['output'] ,'02_PA_matrix','pa_matrix.csv')
pa_dir = os.path.join(flag_values['output'] ,'01_PA_matrix','pa_matrix.csv')
pa_df.to_csv(pa_dir)
print(" --->Wrote 'pa_matrix.csv' to '{}'".format(pa_dir + '\n'))
26 changes: 13 additions & 13 deletions correlation_calcs.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,15 +9,15 @@
def correlation_calcs(flag_values):
'''
'''
# Open prescence/abscence matrix from /02_PA_matrix/ directory
with open(os.path.join(flag_values['output'],'02_PA_matrix','pa_matrix.csv'), 'r') as infile:
# Open prescence/abscence matrix from /01_PA_matrix/ directory
with open(os.path.join(flag_values['output'],'01_PA_matrix','pa_matrix.csv'), 'r') as infile:
# Create dataframe from BLAST results, header = row# w/ column names
pa_df = pd.read_csv(infile, sep=',', header=0)
#clean up indexing and names in our pa_df (presence/absence dataframe)
first_col = pa_df.columns[0]
pa_df = pa_df.rename(columns={first_col : "species"})

#clean up indexing and names in our pa_df (prescence/abscence dataframe)
pa_df = pa_df.rename(columns={"Unnamed: 0" : "species"})

#an index of gene_names. Do not include indexer row '0' or 'species' column '1' of pa_df (prescence/abscence dataframe)
#an index of gene_names. Do not include indexer row '0' or 'species' column '1' of pa_df (presence/abscence dataframe)
gene_names = pa_df.dtypes.index[1:]

#this function calculates all the terms needed for pearson correlation
Expand Down Expand Up @@ -50,7 +50,7 @@ def correlation_calcs(flag_values):
Pjj = Rij_inverse_df.loc[column_to_right_name, column_to_right_name]
#print("Pij=", Pij, "Pii=", Pii, "Pjj=", Pjj)
Wij_df.at[column_name, column_to_right_name] = -((Pij) / (math.sqrt(Pii * Pjj)))
Wij_df.to_csv(os.path.join(flag_values['output'],'03_correlation_calcs','Wij_df_triangle.csv'))
Wij_df.to_csv(os.path.join(flag_values['output'],'02_correlation','Wij_df_triangle.csv'))
print("->Wrote 'Wij_df_triangle.csv' to '{}".format(os.path.join(flag_values['output']),"04_correlog_values..") + '\n')


Expand All @@ -64,7 +64,7 @@ def correlation_calcs(flag_values):
else:
Wij_df.loc[row, column] = Wij_df.loc[column, row]

Wij_df.to_csv(os.path.join(flag_values['output'],'03_correlation_calcs','Wij_df.csv'))
Wij_df.to_csv(os.path.join(flag_values['output'],'02_correlation','Wij_df.csv'))

#need print statement here to console
return Wij_df
Expand All @@ -73,7 +73,7 @@ def calc_mrs(Wij_df, flag_values):
Wij_outerDict = {}
gene_names = Wij_df.dtypes.index

f = open(os.path.join(flag_values['output'],'04_correlog_values',"Wij_matrix.csv"), "w")
f = open(os.path.join(flag_values['output'],'02_correlation',"Wij_matrix.csv"), "w")
writer = csv.writer(f)

network_list=[]
Expand Down Expand Up @@ -136,7 +136,7 @@ def calc_pearson_terms(flag_values, pa_df, gene_names):
print(" --->", column_to_right_name, Cij)
Cij_df.at[column_name, column_to_right_name] = Cij
# write Cij_df to disk
cij_dir=os.path.join(flag_values['output'],'03_correlation_calcs','Cij_df.csv')
cij_dir=os.path.join(flag_values['output'],'02_correlation','Cij_df.csv')
Cij_df.to_csv(cij_dir)
print("Wrote 'Cij_df.csv' aka # species with both gene 'i' & gene 'j' to '{}".format(cij_dir + '\n'))
# Debug print(Cij, index, df.loc[index, "species"])
Expand All @@ -163,7 +163,7 @@ def pearson_corr_calc(flag_values, pa_df, gene_names, E_i, N, Cij_df):
Rij_df.at[column_name, column_to_right_name] = ((current_Cij * N) - (current_E_i * current_E_j)) / (
math.sqrt(current_E_i) * math.sqrt(current_E_j) * math.sqrt(N-current_E_i) * math.sqrt(N-current_E_j))
# write Rij_df to disk
rij_triangle_dir=os.path.join(flag_values['output'],'03_correlation_calcs','Rij_df_triangle.csv')
rij_triangle_dir=os.path.join(flag_values['output'],'02_correlation','Rij_df_triangle.csv')
Rij_df.to_csv(rij_triangle_dir)
print("->Wrote 'Rij_df_triangle.csv' aka Pearson Correlations to '{}".format(rij_triangle_dir + '\n'))

Expand All @@ -177,15 +177,15 @@ def pearson_corr_calc(flag_values, pa_df, gene_names, E_i, N, Cij_df):
else:
Rij_df.loc[row, column] = Rij_df.loc[column, row]
# write Rij_df symmetrical to disk
rij_sym_dir=os.path.join(flag_values['output'], '03_correlation_calcs', 'Rij_df_symmetrical.csv')
rij_sym_dir=os.path.join(flag_values['output'], '02_correlation', 'Rij_df_symmetrical.csv')
Rij_df.to_csv(rij_sym_dir)
print("->Wrote 'Rij_df_symmetrical.csv' to '{}".format(rij_sym_dir + '\n'))

# Create inverse matrix from symmetrical Rij matrix
Rij_inverse_df = pd.DataFrame(np.linalg.pinv(Rij_df.values), Rij_df.columns, Rij_df.index)

# write Rij_inverse_df to disk
Rij_inverse_dir=os.path.join(flag_values['output'],'03_correlation_calcs','Rij_inverse_df.csv')
Rij_inverse_dir=os.path.join(flag_values['output'],'02_correlation','Rij_inverse_df.csv')
Rij_inverse_df.to_csv(Rij_inverse_dir)
print("->Wrote 'Rij_inverse_df.csv' to '{}".format(Rij_inverse_dir + '\n'))

Expand Down
50 changes: 30 additions & 20 deletions create_visuals.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,42 +20,51 @@
def create_network_map(flag_values, network_list):
#breakpoint()
for count, value in enumerate(network_list):
network_list[count][2] = '{:.4f}'.format(value[2])

# Create 'dot' engine style graphviz network visual
network = graphviz.Digraph('Maximum_Related_Networks_dot', engine='dot', comment='')

for i in network_list:
edge_thickness = str(i[2]*5)
correlog_score = i[2]
network.edge(i[0], i[1], penwidth=edge_thickness, xlabel=correlog_score)

network.render(directory=os.path.join(flag_values['output'], '05_final_outputs'))
network_list[count][2] = '{:.3f}'.format(value[2])

#DG = nx.DiGraph()
#DG.add_weighted_edges_from(network_list)
nx_graph = nx.DiGraph()
for item in network_list:
nx_graph.add_node(item[0])
for item in network_list:
nx_graph.add_edge(item[0],item[1],label=item[2])
#add weighted line
if float(item[2]) > 0:
nx_graph.add_edge(item[0],item[1],label=item[2],weight=float(item[2])*10)
else:
nx_graph.add_edge(item[0],item[1],label=item[2],weight=1)

nt = Network('800px', '1080px')
nt.from_nx(nx_graph)
nt.toggle_physics(True)
nt.show_buttons(True)
os.chdir(os.path.join(flag_values['output'], '05_final_outputs'))
os.chdir(os.path.join(flag_values['output'], '03_visual_output'))
nt.write_html("MSR_network.html",notebook=False)

# Create 'dot' engine style graphviz network visual
network = graphviz.Digraph('Maximum_Related_Networks_dot', engine='dot', comment='')

for i in network_list:
co_score = i[2]
if float(i[2]) <= 0:
edge_thickness = 1
else:
edge_thickness = int(i[2].split('.')[-1][0])
network.edge(i[0], i[1], penwidth=str(edge_thickness), xlabel=co_score, dir='none')
network.render(directory=os.path.join(flag_values['output'], '03_visual_output'))

# Create 'circo' engine style graphviz network visual
network = graphviz.Digraph('Maximum_Related_Networks_circo', engine='circo', comment='')

for i in network_list:
edge_thickness = str(i[2]*5)
correlog_score = correlog_score = i[2]
network.edge(i[0], i[1], penwidth=edge_thickness, xlabel=correlog_score)
co_score = i[2]
if float(i[2]) <= 0:
edge_thickness = 1
else:
edge_thickness = int(i[2].split('.')[-1][0])
network.edge(i[0], i[1], penwidth=str(edge_thickness), xlabel=co_score, dir='none')

network.render(directory=os.path.join(flag_values['output'], '05_final_outputs'))
network.render(directory=os.path.join(flag_values['output'], '03_visual_output'))


#need print statement here to console
Expand Down Expand Up @@ -92,16 +101,17 @@ def create_heatmap(Wij_df, flag_values):
vmax=1, # The maximum value of the legend. All higher vals will be same color
vmin=-1, # The minimum value of the legend. All lower vals will be same color
center=0, # The center value of the legend. With divergent cmap, where white is
square=True, # Force cells to be square
square=False, # Force cells to be square
linewidths=.5, # Width of lines that divide cells
cbar_kws={'label' : 'Co-Occurrence Value','shrink': .75} # Extra kwargs for the legend;
# in this case, shrink by 50%
# in this case, shrink
)

res.set_xticklabels(res.get_xmajorticklabels(), fontsize = 12)
res.set_yticklabels(res.get_ymajorticklabels(), fontsize = 12)
res.yaxis.label.set_size(20)

heatmap_dir = os.path.join(flag_values['output'], '05_final_outputs','heatmap.png')
heatmap_dir = os.path.join(flag_values['output'], '03_visual_output','heatmap.png')
fig.tight_layout()
fig.savefig(heatmap_dir)
print('Heatmap written to: {}'.format(heatmap_dir) + '\n')
33 changes: 20 additions & 13 deletions housekeeping.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,16 +3,21 @@
import pandas as pd
import sys
def create_folders(dir_path):
dirs = ['00_log',
'01_blast_input',
'02_PA_matrix',
'03_correlation_calcs',
'04_correlog_values',
'05_final_outputs']
dirs = ['00_pre_process',
'01_PA_matrix',
'02_correlation',
'03_visual_output']

if not os.path.exists(dir_path):
os.mkdir(dir_path)
print("Checking directory '{}'... ".format(dir_path))
print(' --->Directory Created')
else:
print("Checking directory '{}'... ".format(dir_path))
print(' --->Directory Already Exists')

for current_dir in dirs:
current_path = os.path.join(dir_path,current_dir)

if not os.path.exists(current_path):
os.mkdir(current_path)
print("Checking directory '{}'... ".format(current_path))
Expand All @@ -26,11 +31,11 @@ def create_folders(dir_path):
def parse_flags():
dir_path = os.path.dirname(os.path.realpath('__file__'))

parser = argparse.ArgumentParser(description="GeneCoOccurence", prog='gco')
parser = argparse.ArgumentParser(description="GeneCoOccurence v0.1. Please see https://github.com/clinte14/GeneCoOccurrence", prog='gco')

parser.add_argument("-i", "--input",
required=True,
help="File with BLAST results containing homologues in .xml format. REQUIRED.")
help="REQUIRED. Input file with either BLAST results in .xml format OR a comma-seperated presence/absence .csv file.")

parser.add_argument("-o", "--output",
default=dir_path,
Expand All @@ -54,19 +59,21 @@ def parse_flags():
arg_dict['common_name'] = os.path.abspath(arg_dict['common_name'])

print("dir_path: " + str(dir_path))
arg_dict['command'] = " ".join(sys.argv)

return arg_dict


# save flags options and paramaters to "command.txt" file in 00_Settings folder
# save flags options and paramaters to "log.txt" file in 00_Settings folder
def save_flags(flags):
with open(os.path.join(flags['output'],"00_log",'command.txt'), 'w') as file:
with open(os.path.join(flags['output'],"00_pre_process",'log.txt'), 'w') as file:

currentDT = datetime.datetime.now()

file.write("Executed the following in " + flags['output'] + " at " \
+ str(currentDT) + ":\n" + json.dumps(flags))
+ str(currentDT) + ":\n\n" + flags['command'])

print("Writing flags to 'command.txt' in '{}'... ".\
print("Writing flags to 'log.txt' in '{}'... ".\
format(os.path.join(flags['output'],"00_settings")) )

print(" --->Flags written\n")
Expand Down
59 changes: 26 additions & 33 deletions main.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,14 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
See README.md @ https://github.com/clinte14/correlogy
See README.md @ https://github.com/clinte14/GeneCoOccurrence
Contact: [email protected] or [email protected]
"""
# Fixing text so it's not cutoff on the heatmap is on my to do list as well
# Fix log to show actual command line entry, not a list
import sys
import os
import shutil
import pandas as pd
from housekeeping import create_folders, parse_flags, save_flags, convert_protID_to_common_names
from blast_parse import parse_merge_BLAST, create_pa, clean_BLAST_xml
Expand All @@ -17,49 +22,37 @@ def main():

# Create folders for project in path defined by -o flag. Default is current directory.
create_folders(flag_values['output'])

# Save/record flags options and paramaters to "command.txt" file in 00_settings folder
# Save/record flags options and parameters to "command.txt" file in 00_settings folder
save_flags(flag_values)

'''
Bug workaround for NCBIWWW. Occasionally NCBI BLAST online writes 'CREATE_VIEW' into
the BLAST .xml output with no associated '<>' tags, and this causes xml parsing in
Biopython to fail. See: https://github.com/biopython/biopython/issues/3342
'''
clean_BLAST_xml(flag_values)

# Parse XML files located in 01_BLAST_results folder.
hits = parse_merge_BLAST(flag_values)
if flag_values['input'].split('.')[-1]=='xml':
# Parse XML files located in 01_BLAST_results folder.
hits = parse_merge_BLAST(flag_values)


# Converts BLAST query protein ID's to common gene names using .csv if -c flag was invoked.
convert_protID_to_common_names(flag_values, hits)
# Converts BLAST query protein ID's to common gene names using .csv if -c flag was invoked.
convert_protID_to_common_names(flag_values, hits)

#write hits dictionary to Pandas DF, query gene is column and hits are rows. Write dataframe to 02_PA_matrix
#folder as PA_matrix.csv.
create_pa(hits, flag_values)
#write hits dictionary to Pandas DF, query gene is column and hits are rows. Write dataframe to 02_PA_matrix
#folder as PA_matrix.csv.
create_pa(hits, flag_values)

elif flag_values['input'].split('.')[-1]=='csv':

pa_dir = os.path.join(flag_values['output'] ,'01_PA_matrix','pa_matrix.csv')
shutil.copyfile(flag_values['input'],pa_dir)
print(" --->Copied custom matrix '{}' to '{}'".format(flag_values['input'],pa_dir + '\n'))

else:
print("FATAL ERROR. Input must be either BLAST formatted '.xml' file or comma-seperated presence/absence '.csv' file")
sys.exit(0)

#calculate Cij (# species common to gene i & j), Rij (Pearson Correlation)
#and Wij (Partial Correlation)
Wij_df= correlation_calcs(flag_values)
Wij_df = correlation_calcs(flag_values)
network_list = calc_mrs(Wij_df, flag_values)

create_network_map(flag_values, network_list)
create_heatmap(Wij_df, flag_values)

if __name__ == "__main__":
main()

'''
To do:
In command.txt add full keyboard input command
In command.txt clean up writing of input, output, -c file
Copy .xml input file to 01 folder
For two nodes that point to each other, only show one correlog score
In MRS visuals: Add color that corresponds with heatmap colors/ improve edge thickness
Clean up notes to PEP-8 standard
Add help info for each function
Standardize output text
Print statements relating to graphical MRS creation/generation
'''
main()

0 comments on commit 654cdcd

Please sign in to comment.