Added support for blitzgsea reactome, KEGG and WikiPathways

stjude · Feb 5, 2025 · 23bb082 · 23bb082
1 parent 7e2b214
commit 23bb082
Show file tree

Hide file tree

Showing 2 changed files with 65 additions and 55 deletions.
diff --git a/client/plots/gsea.js b/client/plots/gsea.js
@@ -109,6 +109,8 @@ class gsea {
 				later a proper fix is to add a radio toggle of Blitzgsea versus MSigDB, and do not use such hardcode
 				*/
 				{ label: 'REACTOME (blitzgsea)', value: 'REACTOME--blitzgsea' },
+				{ label: 'KEGG (blitzgsea)', value: 'KEGG--blitzgsea' },
+				{ label: 'WikiPathways (blitzgsea)', value: 'WikiPathways--blitzgsea' },
 				{ label: 'H: hallmark gene sets', value: 'H: hallmark gene sets' }
 			]
 		}

diff --git a/server/utils/gsea.py b/server/utils/gsea.py
@@ -1,3 +1,5 @@
+# Test syntax: cat ~/sjpp/test.txt | time python gsea.py
+# test.txt contains the json string autogenerated by the commented out nodejs code.
 import blitzgsea as blitz  
 import json 
 import time  
@@ -33,11 +35,40 @@ def extract_symbols(x):
             # Connect to the SQLite database
             conn = sqlite3.connect(db)  # Connect to the SQLite database
             cursor = conn.cursor()  # Create a cursor object
-
-            # Query to get gene set IDs
-            query = f"SELECT id FROM terms WHERE parent_id='{table_name}'"  # SQL query to get gene set IDs
-            cursor.execute(query)  # Execute the query
-
+
+            msigdb_library = {}  # Initialize an empty dictionary for the gene set library
+            if table_name == "REACTOME--blitzgsea": # Parse from blitzgsea reactome library
+              msigdb_library = blitz.enrichr.get_library("Reactome_2022")
+            elif table_name == "KEGG--blitzgsea": # Parse from blitzgsea KEGG library
+              msigdb_library = blitz.enrichr.get_library("KEGG_2021_Human")
+            elif table_name == "WikiPathways--blitzgsea": # Parse from blitzgsea WikiPathways library
+              msigdb_library = blitz.enrichr.get_library("WikiPathways_2019_Human")              
+            else: # Use geneset groups from msigdb    
+              # Query to get gene set IDs
+              query = f"SELECT id FROM terms WHERE parent_id='{table_name}'"  # SQL query to get gene set IDs
+              cursor.execute(query)  # Execute the query
+
+              # Fetch all gene set IDs
+              rows = cursor.fetchall()  # Fetch all rows from the executed query
+
+              start_loop_time = time.time()  # Record the start time of the loop
+
+              # Iterate over gene set IDs and fetch corresponding genes
+              for row in rows:
+                  query2 = f"SELECT genes FROM term2genes WHERE id='{row[0]}'"  # SQL query to get genes for a gene set ID
+                  cursor.execute(query2)  # Execute the query
+                  rows2 = cursor.fetchall()  # Fetch all rows from the executed query
+                  row3 = json.loads(rows2[0][0])  # Parse the JSON data
+                  msigdb_library[row[0]] = list(map(extract_symbols, row3))  # Extract gene symbols and add to the library
+
+              # Close the cursor and connection to the database
+              cursor.close()  # Close the cursor
+              conn.close()  # Close the connection
+
+              stop_loop_time = time.time()  # Record the stop time of the loop
+              execution_time = stop_loop_time - start_loop_time  # Calculate the execution time
+              print(f"Execution time: {execution_time} seconds")  # Print the execution time
+
             # Filter out non-coding genes if specified
             if filter_non_coding_genes:
                 coding_genes_query = "SELECT * FROM codingGenes"  # SQL query to get coding genes
@@ -49,35 +80,13 @@ def extract_symbols(x):
                 coding_genes_list = list(map(lambda x: x[0], coding_genes_list))  # Extract the gene symbols
                 signature = signature[signature['Genes'].isin(coding_genes_list)]  # Filter the signature to include only coding genes
 
-            # Fetch all gene set IDs
-            rows = cursor.fetchall()  # Fetch all rows from the executed query
-
-            start_loop_time = time.time()  # Record the start time of the loop
-            msigdb_library = {}  # Initialize an empty dictionary for the gene set library
-
-            # Iterate over gene set IDs and fetch corresponding genes
-            for row in rows:
-                query2 = f"SELECT genes FROM term2genes WHERE id='{row[0]}'"  # SQL query to get genes for a gene set ID
-                cursor.execute(query2)  # Execute the query
-                rows2 = cursor.fetchall()  # Fetch all rows from the executed query
-                row3 = json.loads(rows2[0][0])  # Parse the JSON data
-                msigdb_library[row[0]] = list(map(extract_symbols, row3))  # Extract gene symbols and add to the library
-
-            # Close the cursor and connection to the database
-            cursor.close()  # Close the cursor
-            conn.close()  # Close the connection
-
-            stop_loop_time = time.time()  # Record the stop time of the loop
-            execution_time = stop_loop_time - start_loop_time  # Calculate the execution time
-            print(f"Execution time: {execution_time} seconds")  # Print the execution time
-
             try:
                 # Check if geneset_name and pickle_file are present for generating the plot
                 geneset_name = json_object['geneset_name']  # Get the gene set name from the JSON object
                 pickle_file = json_object['pickle_file']  # Get the pickle file name from the JSON object
                 result = pd.read_pickle(os.path.join(cachedir, pickle_file))  # Load the result from the pickle file
                 fig = blitz.plot.running_sum(signature, geneset_name, msigdb_library, result=result.T, compact=True)  # Generate the running sum plot
-                random_num = np.random.rand()  # Generate a random number
+                random_num = np.random.rand()  # Generate a random number for unique png filename
                 png_filename = f"gsea_plot_{random_num}.png"  # Create a filename for the plot
                 fig.savefig(os.path.join(cachedir, png_filename), bbox_inches='tight')  # Save the plot as a PNG file
                 print(f'image: {{"image_file": "{png_filename}"}}')  # Print the image file path in JSON format
@@ -86,7 +95,7 @@ def extract_symbols(x):
                 start_gsea_time = time.time()  # Record the start time of GSEA
                 if __name__ == "__main__":
                     result = blitz.gsea(signature, msigdb_library, permutations=1000).T  # Perform GSEA and transpose the result
-                    random_num = np.random.rand()  # Generate a random number
+                    random_num = np.random.rand()  # Generate a random number for unique pickle filename
                     pickle_filename = f"gsea_result_{random_num}.pkl"  # Create a filename for the pickle file
                     result.to_pickle(os.path.join(cachedir, pickle_filename))  # Save the result to the pickle file
                     gsea_str = f'{{"data": {result.to_json()}}}'  # Convert the result to JSON format
@@ -103,29 +112,28 @@ def extract_symbols(x):
 except (EOFError, IOError):
     pass  # Handle EOFError and IOError exceptions gracefully
 
-# Function to extract plot data for GSEA visualization
-# def extract_plot_data(signature, geneset, library, result, center=True):
-
-#     print("signature", signature)
-#     print("result", result)
-#     print("geneset", geneset)
-#     print("library", library)
-#     signature = signature.copy()  # Create a copy of the signature DataFrame
-#     signature.columns = ["i", "v"]  # Rename columns to 'i' and 'v'
-#     signature = signature.sort_values("v", ascending=False).set_index("i")  # Sort by 'v' in descending order and set 'i' as index
-#     signature = signature[~signature.index.duplicated(keep='first')]  # Remove duplicate indices, keeping the first occurrence
-
-#     if center:
-#         signature.loc[:, "v"] -= np.mean(signature.loc[:, "v"])  # Center the signature values by subtracting the mean
-
-#     signature_map = {h: i for i, h in enumerate(signature.index)}  # Create a mapping of signature indices
-
-#     gs = set(library[geneset])  # Get the gene set from the library
-#     hits = [i for i, x in enumerate(signature.index) if x in gs]  # Find the indices of hits in the signature
-
-#     running_sum, es = blitz.enrichment_score(np.array(np.abs(signature.iloc[:, 0])), signature_map, gs)  # Compute running sum and enrichment score
-#     running_sum = list(running_sum)  # Convert running sum to a list
-#     nn = np.where(np.abs(running_sum) == np.max(np.abs(running_sum)))[0][0]  # Find the index of the maximum absolute running sum
-
-#     running_sum_str = [str(elem) for elem in running_sum]  # Convert running sum elements to strings
-#     print(f'result: {{"nn": {nn}, "running_sum": "{",".join(running_sum_str)}", "es": {es}}}')  # Print the result in JSON format
+# Function to extract plot data for GSEA visualization (NOT currently being used, but will be used for generating client side gsea plots)
+def extract_plot_data(signature, geneset, library, result, center=True):
+     print("signature", signature)
+     print("result", result)
+     print("geneset", geneset)
+     print("library", library)
+     signature = signature.copy()  # Create a copy of the signature DataFrame
+     signature.columns = ["i", "v"]  # Rename columns to 'i' and 'v'
+     signature = signature.sort_values("v", ascending=False).set_index("i")  # Sort by 'v' in descending order and set 'i' as index
+     signature = signature[~signature.index.duplicated(keep='first')]  # Remove duplicate indices, keeping the first occurrence
+
+     if center:
+         signature.loc[:, "v"] -= np.mean(signature.loc[:, "v"])  # Center the signature values by subtracting the mean
+
+     signature_map = {h: i for i, h in enumerate(signature.index)}  # Create a mapping of signature indices
+
+     gs = set(library[geneset])  # Get the gene set from the library
+     hits = [i for i, x in enumerate(signature.index) if x in gs]  # Find the indices of hits in the signature
+
+     running_sum, es = blitz.enrichment_score(np.array(np.abs(signature.iloc[:, 0])), signature_map, gs)  # Compute running sum and enrichment score
+     running_sum = list(running_sum)  # Convert running sum to a list
+     nn = np.where(np.abs(running_sum) == np.max(np.abs(running_sum)))[0][0]  # Find the index of the maximum absolute running sum
+
+     running_sum_str = [str(elem) for elem in running_sum]  # Convert running sum elements to strings
+     print(f'result: {{"nn": {nn}, "running_sum": "{",".join(running_sum_str)}", "es": {es}}}')  # Print the result in JSON format