Skip to content

Commit

Permalink
Added support for blitzgsea reactome, KEGG and WikiPathways
Browse files Browse the repository at this point in the history
  • Loading branch information
robinpaul85 committed Feb 5, 2025
1 parent 7e2b214 commit 23bb082
Show file tree
Hide file tree
Showing 2 changed files with 65 additions and 55 deletions.
2 changes: 2 additions & 0 deletions client/plots/gsea.js
Original file line number Diff line number Diff line change
Expand Up @@ -109,6 +109,8 @@ class gsea {
later a proper fix is to add a radio toggle of Blitzgsea versus MSigDB, and do not use such hardcode
*/
{ label: 'REACTOME (blitzgsea)', value: 'REACTOME--blitzgsea' },
{ label: 'KEGG (blitzgsea)', value: 'KEGG--blitzgsea' },
{ label: 'WikiPathways (blitzgsea)', value: 'WikiPathways--blitzgsea' },
{ label: 'H: hallmark gene sets', value: 'H: hallmark gene sets' }
]
}
Expand Down
118 changes: 63 additions & 55 deletions server/utils/gsea.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
# Test syntax: cat ~/sjpp/test.txt | time python gsea.py
# test.txt contains the json string autogenerated by the commented out nodejs code.
import blitzgsea as blitz
import json
import time
Expand Down Expand Up @@ -33,11 +35,40 @@ def extract_symbols(x):
# Connect to the SQLite database
conn = sqlite3.connect(db) # Connect to the SQLite database
cursor = conn.cursor() # Create a cursor object

# Query to get gene set IDs
query = f"SELECT id FROM terms WHERE parent_id='{table_name}'" # SQL query to get gene set IDs
cursor.execute(query) # Execute the query


msigdb_library = {} # Initialize an empty dictionary for the gene set library
if table_name == "REACTOME--blitzgsea": # Parse from blitzgsea reactome library
msigdb_library = blitz.enrichr.get_library("Reactome_2022")
elif table_name == "KEGG--blitzgsea": # Parse from blitzgsea KEGG library
msigdb_library = blitz.enrichr.get_library("KEGG_2021_Human")
elif table_name == "WikiPathways--blitzgsea": # Parse from blitzgsea WikiPathways library
msigdb_library = blitz.enrichr.get_library("WikiPathways_2019_Human")
else: # Use geneset groups from msigdb
# Query to get gene set IDs
query = f"SELECT id FROM terms WHERE parent_id='{table_name}'" # SQL query to get gene set IDs
cursor.execute(query) # Execute the query

# Fetch all gene set IDs
rows = cursor.fetchall() # Fetch all rows from the executed query

start_loop_time = time.time() # Record the start time of the loop

# Iterate over gene set IDs and fetch corresponding genes
for row in rows:
query2 = f"SELECT genes FROM term2genes WHERE id='{row[0]}'" # SQL query to get genes for a gene set ID
cursor.execute(query2) # Execute the query
rows2 = cursor.fetchall() # Fetch all rows from the executed query
row3 = json.loads(rows2[0][0]) # Parse the JSON data
msigdb_library[row[0]] = list(map(extract_symbols, row3)) # Extract gene symbols and add to the library

# Close the cursor and connection to the database
cursor.close() # Close the cursor
conn.close() # Close the connection

stop_loop_time = time.time() # Record the stop time of the loop
execution_time = stop_loop_time - start_loop_time # Calculate the execution time
print(f"Execution time: {execution_time} seconds") # Print the execution time

# Filter out non-coding genes if specified
if filter_non_coding_genes:
coding_genes_query = "SELECT * FROM codingGenes" # SQL query to get coding genes
Expand All @@ -49,35 +80,13 @@ def extract_symbols(x):
coding_genes_list = list(map(lambda x: x[0], coding_genes_list)) # Extract the gene symbols
signature = signature[signature['Genes'].isin(coding_genes_list)] # Filter the signature to include only coding genes

# Fetch all gene set IDs
rows = cursor.fetchall() # Fetch all rows from the executed query

start_loop_time = time.time() # Record the start time of the loop
msigdb_library = {} # Initialize an empty dictionary for the gene set library

# Iterate over gene set IDs and fetch corresponding genes
for row in rows:
query2 = f"SELECT genes FROM term2genes WHERE id='{row[0]}'" # SQL query to get genes for a gene set ID
cursor.execute(query2) # Execute the query
rows2 = cursor.fetchall() # Fetch all rows from the executed query
row3 = json.loads(rows2[0][0]) # Parse the JSON data
msigdb_library[row[0]] = list(map(extract_symbols, row3)) # Extract gene symbols and add to the library

# Close the cursor and connection to the database
cursor.close() # Close the cursor
conn.close() # Close the connection

stop_loop_time = time.time() # Record the stop time of the loop
execution_time = stop_loop_time - start_loop_time # Calculate the execution time
print(f"Execution time: {execution_time} seconds") # Print the execution time

try:
# Check if geneset_name and pickle_file are present for generating the plot
geneset_name = json_object['geneset_name'] # Get the gene set name from the JSON object
pickle_file = json_object['pickle_file'] # Get the pickle file name from the JSON object
result = pd.read_pickle(os.path.join(cachedir, pickle_file)) # Load the result from the pickle file
fig = blitz.plot.running_sum(signature, geneset_name, msigdb_library, result=result.T, compact=True) # Generate the running sum plot
random_num = np.random.rand() # Generate a random number
random_num = np.random.rand() # Generate a random number for unique png filename
png_filename = f"gsea_plot_{random_num}.png" # Create a filename for the plot
fig.savefig(os.path.join(cachedir, png_filename), bbox_inches='tight') # Save the plot as a PNG file
print(f'image: {{"image_file": "{png_filename}"}}') # Print the image file path in JSON format
Expand All @@ -86,7 +95,7 @@ def extract_symbols(x):
start_gsea_time = time.time() # Record the start time of GSEA
if __name__ == "__main__":
result = blitz.gsea(signature, msigdb_library, permutations=1000).T # Perform GSEA and transpose the result
random_num = np.random.rand() # Generate a random number
random_num = np.random.rand() # Generate a random number for unique pickle filename
pickle_filename = f"gsea_result_{random_num}.pkl" # Create a filename for the pickle file
result.to_pickle(os.path.join(cachedir, pickle_filename)) # Save the result to the pickle file
gsea_str = f'{{"data": {result.to_json()}}}' # Convert the result to JSON format
Expand All @@ -103,29 +112,28 @@ def extract_symbols(x):
except (EOFError, IOError):
pass # Handle EOFError and IOError exceptions gracefully

# Function to extract plot data for GSEA visualization
# def extract_plot_data(signature, geneset, library, result, center=True):

# print("signature", signature)
# print("result", result)
# print("geneset", geneset)
# print("library", library)
# signature = signature.copy() # Create a copy of the signature DataFrame
# signature.columns = ["i", "v"] # Rename columns to 'i' and 'v'
# signature = signature.sort_values("v", ascending=False).set_index("i") # Sort by 'v' in descending order and set 'i' as index
# signature = signature[~signature.index.duplicated(keep='first')] # Remove duplicate indices, keeping the first occurrence

# if center:
# signature.loc[:, "v"] -= np.mean(signature.loc[:, "v"]) # Center the signature values by subtracting the mean

# signature_map = {h: i for i, h in enumerate(signature.index)} # Create a mapping of signature indices

# gs = set(library[geneset]) # Get the gene set from the library
# hits = [i for i, x in enumerate(signature.index) if x in gs] # Find the indices of hits in the signature

# running_sum, es = blitz.enrichment_score(np.array(np.abs(signature.iloc[:, 0])), signature_map, gs) # Compute running sum and enrichment score
# running_sum = list(running_sum) # Convert running sum to a list
# nn = np.where(np.abs(running_sum) == np.max(np.abs(running_sum)))[0][0] # Find the index of the maximum absolute running sum

# running_sum_str = [str(elem) for elem in running_sum] # Convert running sum elements to strings
# print(f'result: {{"nn": {nn}, "running_sum": "{",".join(running_sum_str)}", "es": {es}}}') # Print the result in JSON format
# Function to extract plot data for GSEA visualization (NOT currently being used, but will be used for generating client side gsea plots)
def extract_plot_data(signature, geneset, library, result, center=True):
print("signature", signature)
print("result", result)
print("geneset", geneset)
print("library", library)
signature = signature.copy() # Create a copy of the signature DataFrame
signature.columns = ["i", "v"] # Rename columns to 'i' and 'v'
signature = signature.sort_values("v", ascending=False).set_index("i") # Sort by 'v' in descending order and set 'i' as index
signature = signature[~signature.index.duplicated(keep='first')] # Remove duplicate indices, keeping the first occurrence

if center:
signature.loc[:, "v"] -= np.mean(signature.loc[:, "v"]) # Center the signature values by subtracting the mean

signature_map = {h: i for i, h in enumerate(signature.index)} # Create a mapping of signature indices

gs = set(library[geneset]) # Get the gene set from the library
hits = [i for i, x in enumerate(signature.index) if x in gs] # Find the indices of hits in the signature

running_sum, es = blitz.enrichment_score(np.array(np.abs(signature.iloc[:, 0])), signature_map, gs) # Compute running sum and enrichment score
running_sum = list(running_sum) # Convert running sum to a list
nn = np.where(np.abs(running_sum) == np.max(np.abs(running_sum)))[0][0] # Find the index of the maximum absolute running sum

running_sum_str = [str(elem) for elem in running_sum] # Convert running sum elements to strings
print(f'result: {{"nn": {nn}, "running_sum": "{",".join(running_sum_str)}", "es": {es}}}') # Print the result in JSON format

0 comments on commit 23bb082

Please sign in to comment.