diff --git a/client/plots/DEanalysis.js b/client/plots/DEanalysis.js index caa58859c4..0a8cb6b725 100644 --- a/client/plots/DEanalysis.js +++ b/client/plots/DEanalysis.js @@ -493,7 +493,7 @@ add: value: sample_size1 }, { - label: self.config.samplelst.groups[1].name + ' sample size (treatment group)', + label: self.config.samplelst.groups[1].name + ' sample size (case group)', value: sample_size2 } ] diff --git a/client/plots/geneORA.js b/client/plots/geneORA.js index 97bede8817..fed9090278 100644 --- a/client/plots/geneORA.js +++ b/client/plots/geneORA.js @@ -6,6 +6,7 @@ import { getCompInit, copyMerge } from '#rx' import { Menu } from '../dom/menu' import { newSandboxDiv } from '../dom/sandbox.ts' import { select, pointer } from 'd3-selection' +import { downloadTable } from '../dom/table' import { roundValueAuto } from '#shared/roundValue.js' const hlcolor = '#ffa200' @@ -114,7 +115,7 @@ class geneORA { }) } this.components.controls.on('downloadClick.geneORA', () => { - downloadTable(this.table_rows, this.table_cols) + downloadTable(this.gene_ora_table_rows, this.gene_ora_table_cols) }) } getState(appState) { diff --git a/client/plots/gsea.js b/client/plots/gsea.js index 23087874f1..efbe66577b 100644 --- a/client/plots/gsea.js +++ b/client/plots/gsea.js @@ -7,22 +7,11 @@ import { controlsInit } from './controls' import { getCompInit, copyMerge } from '#rx' import { Menu } from '../dom/menu' import { scaleLog, scaleLinear } from 'd3-scale' +import { downloadTable } from '../dom/table' import { roundValueAuto } from '#shared/roundValue.js' const tip = new Menu() -// table columns showing analysis results for each gene set -const gsea_table_cols = [ - { label: 'Pathway name' }, - { label: 'enrichment score' }, - { label: 'normalized enrichment score' }, - { label: 'Geneset size' }, - { label: 'pvalue' }, - { label: 'sidak' }, - { label: 'FDR' }, - { label: 'Leading edge' } -] - class gsea { constructor() { this.type = 'gsea' @@ -53,7 +42,7 @@ class gsea { 
this.dom.controlsDiv.selectAll('*').remove() const inputs = [ { - label: 'P-value filter cutoff (linear scale)', + label: 'P-value Filter Cutoff (Linear Scale)', type: 'number', chartType: 'gsea', settingsKey: 'pvalue', @@ -62,18 +51,18 @@ class gsea { max: 1 }, { - label: 'P-value filter type', + label: 'P-value Filter Type', type: 'radio', chartType: 'gsea', settingsKey: 'adjusted_original_pvalue', title: 'Toggle between original and adjusted pvalues for volcano plot', options: [ - { label: 'adjusted', value: 'adjusted' }, - { label: 'original', value: 'original' } + { label: 'Adjusted', value: 'adjusted' }, + { label: 'Original', value: 'original' } ] }, { - label: 'Gene set size filter cutoff', + label: 'Gene Set Size Filter Cutoff', type: 'number', chartType: 'gsea', settingsKey: 'gene_set_size_cutoff', @@ -82,7 +71,7 @@ class gsea { max: 20000 }, { - label: 'Filter non-coding genes', + label: 'Filter Non-coding Genes', type: 'checkbox', chartType: 'gsea', settingsKey: 'filter_non_coding_genes', @@ -92,7 +81,7 @@ class gsea { ] const geneSet = { - label: 'Gene set group', + label: 'Gene Set Group', type: 'dropdown', chartType: 'gsea', settingsKey: 'pathway', @@ -104,9 +93,23 @@ class gsea { { label: 'CC: subset of GO', value: 'CC: subset of GO' }, { label: 'WikiPathways subset of CP', value: 'WikiPathways subset of CP' }, { label: 'REACTOME subset of CP', value: 'REACTOME subset of CP' }, + /* QUICK FIX + geneset name ending in "--blitzgsea" signals to use built-in genesets but not msigdb + later a proper fix is to add a radio toggle of Blitzgsea versus MSigDB, and do not use such hardcode + */ { label: 'H: hallmark gene sets', value: 'H: hallmark gene sets' } ] } + + // Now blitzgsea geneSets are inside serverconfig flag; optional chaining guards against the key being absent from sessionStorage (JSON.parse(null) -> null) + if (JSON.parse(sessionStorage.getItem('optionalFeatures'))?.gsea_test === true) { + geneSet.options.push( + { label: 'REACTOME (blitzgsea)', value: 'REACTOME--blitzgsea' }, + { label: 'KEGG (blitzgsea)', value: 'KEGG--blitzgsea' }, + { label: 
'WikiPathways (blitzgsea)', value: 'WikiPathways--blitzgsea' } ) } + if (!this.settings.pathway) { geneSet.options.unshift({ label: '-', value: '-' }) this.settings.pathway = '-' @@ -122,7 +125,7 @@ class gsea { }) } this.components.controls.on('downloadClick.gsea', () => { - downloadTable(this.table_rows, this.table_cols) + downloadTable(this.gsea_table_rows, this.gsea_table_cols) }) } getState(appState) { @@ -194,6 +197,7 @@ add: // Generating the table self.gsea_table_rows = [] + output.data = Object.fromEntries(Object.entries(output.data).sort((a, b) => Number(a[1].fdr) - Number(b[1].fdr))) // Sort pathways in ascending order of FDR; rebuilt object preserves this key order for the loop below for (const pathway_name of Object.keys(output.data)) { const pathway = output.data[pathway_name] if ( @@ -241,8 +245,19 @@ add: self.dom.tableDiv.selectAll('*').remove() const d_gsea = self.dom.tableDiv.append('div') + // table columns showing analysis results for each gene set + self.gsea_table_cols = [ + { label: 'Geneset' }, + { label: 'Enrichment Score' }, + { label: 'Normalized Enrichment Score' }, + { label: 'Geneset Size' }, + { label: 'P value' }, + { label: 'Sidak' }, + { label: 'FDR' }, + { label: 'Leading Edge' } + ] renderTable({ - columns: gsea_table_cols, + columns: self.gsea_table_cols, rows: self.gsea_table_rows, div: d_gsea, showLines: true, diff --git a/server/utils/gsea.py b/server/utils/gsea.py index 244c87e429..83d238f37e 100755 --- a/server/utils/gsea.py +++ b/server/utils/gsea.py @@ -1,129 +1,139 @@ -# cat ~/sjpp/test.txt | python gsea.py +# Test syntax: cat ~/sjpp/test.txt | time python gsea.py +# test.txt contains the json string autogenerated by the commented out nodejs code. 
+import blitzgsea as blitz +import json +import time +import sys +import sqlite3 +import os +import numpy as np +import pandas as pd -import blitzgsea as blitz -import json -import time -import sys -import sqlite3 -import os -import numpy as np -import pandas as pd - +# Helper function to extract gene symbols from a dictionary def extract_symbols(x): - return x['symbol'] - -def extract_plot_data(signature, geneset, library, result, center=True): - signature = signature.copy() - signature.columns = ["i","v"] - signature = signature.sort_values("v", ascending=False).set_index("i") - signature = signature[~signature.index.duplicated(keep='first')] - if center: - signature.loc[:,"v"] -= np.mean(signature.loc[:,"v"]) - signature_map = {} - for i,h in enumerate(signature.index): - signature_map[h] = i - - gs = set(library[geneset]) - hits = [i for i,x in enumerate(signature.index) if x in gs] - - running_sum, es = blitz.enrichment_score(np.array(np.abs(signature.iloc[:,0])), signature_map, gs) - running_sum = list(running_sum) - nn = np.where(np.abs(running_sum)==np.max(np.abs(running_sum)))[0][0] - #print ("nn:",nn) - #print ("running_sum:",running_sum) - #print ("es:",es) - running_sum_str=[str(elem) for elem in running_sum] - print ('result: {"nn":'+str(nn)+',"running_sum":"'+",".join(running_sum_str)+'","es":'+str(es)+'}') + return x['symbol'] # Return the 'symbol' field from the dictionary - -# Main function +# Main function try: - # Try to read a single character from stdin without blocking + # Check if there is input from stdin if sys.stdin.read(1): - # Read from stdin + # Read each line from stdin for line in sys.stdin: - # Process each line + # Parse the JSON input json_object = json.loads(line) - cachedir=json_object['cachedir'] - genes=json_object['genes'] - fold_change=json_object['fold_change'] - table_name=json_object['geneset_group'] - filter_non_coding_genes=json_object['filter_non_coding_genes'] - df = {'Genes': genes, 'fold_change': fold_change} - 
signature=pd.DataFrame(df) - db=json_object['db'] - # Connect to the SQLite database - conn = sqlite3.connect(db) - - # Create a cursor object using the cursor() method - cursor = conn.cursor() + cachedir = json_object['cachedir'] # Get the cache directory from the JSON object + genes = json_object['genes'] # Get the genes from the JSON object + fold_change = json_object['fold_change'] # Get the fold change values from the JSON object + table_name = json_object['geneset_group'] # Get the gene set group from the JSON object + filter_non_coding_genes = json_object['filter_non_coding_genes'] # Get the filter_non_coding_genes flag from the JSON object + db = json_object['db'] # Get the database path from the JSON object - # SQL query to select all data from the table - query = f"select id from terms where parent_id='" + table_name + "'" - # Execute the SQL query - cursor.execute(query) - if filter_non_coding_genes == True: - # SQL query to code all the protein coding genes - coding_genes_query = f"select * from codingGenes" - genedb = json_object['genedb'] - gene_conn = sqlite3.connect(genedb) - gene_cursor = gene_conn.cursor() - gene_cursor.execute(coding_genes_query) - coding_genes_list=gene_cursor.fetchall() - coding_genes_list=list(map(lambda x: x[0],coding_genes_list)) - signature=signature[signature['Genes'].isin(coding_genes_list)] - - # Fetch all rows from the executed SQL query - rows = cursor.fetchall() + # Create a DataFrame for the signature + df = {'Genes': genes, 'fold_change': fold_change} # Create a dictionary with genes and fold change + signature = pd.DataFrame(df) # Convert the dictionary to a DataFrame - start_loop_time = time.time() - msigdb_library={} - # Iterate over the rows and print them - for row in rows: - #print(row[0]) - query2=f"select genes from term2genes where id='" + row[0] + "'" - cursor.execute(query2) - rows2 = cursor.fetchall() - row3=json.loads(rows2[0][0]) - msigdb_library[row[0]] = list(map(extract_symbols,row3)) - - #print 
("msigdb_library:",msigdb_library) - # Close the cursor and connection to the database - cursor.close() - conn.close() - stop_loop_time = time.time() - execution_time = stop_loop_time - start_loop_time - print(f"Execution time: {execution_time} seconds") - try: # Extract ES data to be plotted on client side - geneset_name=json_object['geneset_name'] # Checks if geneset_name is present, if yes it indicates the server request is for generating the image. It retrieves the result.pkl file and generates the image without having to recompute gsea again. - pickle_file=json_object['pickle_file'] - result = pd.read_pickle(os.path.join(cachedir,pickle_file)) - fig = blitz.plot.running_sum(signature, geneset_name, msigdb_library, result=result.T, compact=True) - random_num = np.random.rand() - png_filename = "gsea_plot_" + str(random_num) + ".png" - fig.savefig(os.path.join(cachedir,png_filename), bbox_inches='tight') - #extract_plot_data(signature, geneset_name, msigdb_library, result) # This returns raw data to client side, not currently used - print ('image: {"image_file":"' + png_filename + '"}') - except KeyError: #Initial GSEA calculation, result saved to a result.pkl pickle file - # run enrichment analysis - start_gsea_time = time.time() - if __name__ == "__main__": - result = blitz.gsea(signature, msigdb_library).T - random_num = np.random.rand() - pickle_filename="gsea_result_"+ str(random_num) +".pkl" - result.to_pickle(os.path.join(cachedir,pickle_filename)) - gsea_str='{"data":' + result.to_json() + '}' - pickle_str='{"pickle_file":"' + pickle_filename + '"}' - #print ("pickle_file:",pickle_str) - gsea_dict = json.loads(gsea_str) - pickle_dict = json.loads(pickle_str) - result_dict = {**gsea_dict, **pickle_dict} - print ("result:",json.dumps(result_dict)) - stop_gsea_time = time.time() - gsea_time = stop_gsea_time - start_gsea_time - print (f"GSEA time: {gsea_time} seconds") + # Connect to the SQLite database + conn = sqlite3.connect(db) # Connect to the SQLite 
database + cursor = conn.cursor() # Create a cursor object + + msigdb_library = {} # Initialize an empty dictionary for the gene set library + if table_name == "REACTOME--blitzgsea": # Parse from blitzgsea reactome library + msigdb_library = blitz.enrichr.get_library("Reactome_2022") + elif table_name == "KEGG--blitzgsea": # Parse from blitzgsea KEGG library + msigdb_library = blitz.enrichr.get_library("KEGG_2021_Human") + elif table_name == "WikiPathways--blitzgsea": # Parse from blitzgsea WikiPathways library + msigdb_library = blitz.enrichr.get_library("WikiPathways_2019_Human") + else: # Use geneset groups from msigdb + # Query to get gene set IDs + query = f"SELECT id FROM terms WHERE parent_id='{table_name}'" # SQL query to get gene set IDs + cursor.execute(query) # Execute the query + + # Fetch all gene set IDs + rows = cursor.fetchall() # Fetch all rows from the executed query + + start_loop_time = time.time() # Record the start time of the loop + + # Iterate over gene set IDs and fetch corresponding genes + for row in rows: + query2 = f"SELECT genes FROM term2genes WHERE id='{row[0]}'" # SQL query to get genes for a gene set ID + cursor.execute(query2) # Execute the query + rows2 = cursor.fetchall() # Fetch all rows from the executed query + row3 = json.loads(rows2[0][0]) # Parse the JSON data + msigdb_library[row[0]] = list(map(extract_symbols, row3)) # Extract gene symbols and add to the library + + # Close the cursor and connection to the database + cursor.close() # Close the cursor + conn.close() # Close the connection + + stop_loop_time = time.time() # Record the stop time of the loop + execution_time = stop_loop_time - start_loop_time # Calculate the execution time + print(f"Execution time: {execution_time} seconds") # Print the execution time + + # Filter out non-coding genes if specified + if filter_non_coding_genes: + coding_genes_query = "SELECT * FROM codingGenes" # SQL query to get coding genes + genedb = json_object['genedb'] # Get the gene 
database path from the JSON object + gene_conn = sqlite3.connect(genedb) # Connect to the gene database + gene_cursor = gene_conn.cursor() # Create a cursor object for the gene database + gene_cursor.execute(coding_genes_query) # Execute the query to get coding genes + coding_genes_list = gene_cursor.fetchall() # Fetch all coding genes + coding_genes_list = list(map(lambda x: x[0], coding_genes_list)) # Extract the gene symbols + signature = signature[signature['Genes'].isin(coding_genes_list)] # Filter the signature to include only coding genes + try: + # Check if geneset_name and pickle_file are present for generating the plot + geneset_name = json_object['geneset_name'] # Get the gene set name from the JSON object + pickle_file = json_object['pickle_file'] # Get the pickle file name from the JSON object + result = pd.read_pickle(os.path.join(cachedir, pickle_file)) # Load the result from the pickle file + fig = blitz.plot.running_sum(signature, geneset_name, msigdb_library, result=result.T, compact=True) # Generate the running sum plot + random_num = np.random.rand() # Generate a random number for unique png filename + png_filename = f"gsea_plot_{random_num}.png" # Create a filename for the plot + fig.savefig(os.path.join(cachedir, png_filename), bbox_inches='tight') # Save the plot as a PNG file + print(f'image: {{"image_file": "{png_filename}"}}') # Print the image file path in JSON format + except KeyError: + # Initial GSEA calculation and save the result to a pickle file + start_gsea_time = time.time() # Record the start time of GSEA + if __name__ == "__main__": + result = blitz.gsea(signature, msigdb_library, permutations=1000).T # Perform GSEA and transpose the result + random_num = np.random.rand() # Generate a random number for unique pickle filename + pickle_filename = f"gsea_result_{random_num}.pkl" # Create a filename for the pickle file + result.to_pickle(os.path.join(cachedir, pickle_filename)) # Save the result to the pickle file + gsea_str = 
f'{{"data": {result.to_json()}}}' # Convert the result to JSON format + pickle_str = f'{{"pickle_file": "{pickle_filename}"}}' # Create a JSON string for the pickle file + gsea_dict = json.loads(gsea_str) # Parse the JSON string + pickle_dict = json.loads(pickle_str) # Parse the JSON string + result_dict = {**gsea_dict, **pickle_dict} # Merge the dictionaries + print(f"result: {json.dumps(result_dict)}") # Print the result in JSON format + stop_gsea_time = time.time() # Record the stop time of GSEA + gsea_time = stop_gsea_time - start_gsea_time # Calculate the GSEA execution time + print(f"GSEA time: {gsea_time} seconds") # Print the GSEA execution time else: - pass + pass # Do nothing if there is no input from stdin except (EOFError, IOError): - pass + pass # Handle EOFError and IOError exceptions gracefully + +# Function to extract plot data for GSEA visualization (NOT currently being used, but will be used for generating client side gsea plots) +def extract_plot_data(signature, geneset, library, result, center=True): + print("signature", signature) + print("result", result) + print("geneset", geneset) + print("library", library) + signature = signature.copy() # Create a copy of the signature DataFrame + signature.columns = ["i", "v"] # Rename columns to 'i' and 'v' + signature = signature.sort_values("v", ascending=False).set_index("i") # Sort by 'v' in descending order and set 'i' as index + signature = signature[~signature.index.duplicated(keep='first')] # Remove duplicate indices, keeping the first occurrence + + if center: + signature.loc[:, "v"] -= np.mean(signature.loc[:, "v"]) # Center the signature values by subtracting the mean + + signature_map = {h: i for i, h in enumerate(signature.index)} # Create a mapping of signature indices + + gs = set(library[geneset]) # Get the gene set from the library + hits = [i for i, x in enumerate(signature.index) if x in gs] # Find the indices of hits in the signature + + running_sum, es = 
blitz.enrichment_score(np.array(np.abs(signature.iloc[:, 0])), signature_map, gs) # Compute running sum and enrichment score + running_sum = list(running_sum) # Convert running sum to a list + nn = np.where(np.abs(running_sum) == np.max(np.abs(running_sum)))[0][0] # Find the index of the maximum absolute running sum + + running_sum_str = [str(elem) for elem in running_sum] # Convert running sum elements to strings + print(f'result: {{"nn": {nn}, "running_sum": "{",".join(running_sum_str)}", "es": {es}}}') # Print the result in JSON format