Pathway GSEA script

ding-lab · Dec 9, 2023 · e725d6b · e725d6b
1 parent 681c887
commit e725d6b
Show file tree

Hide file tree

Showing 6 changed files with 787 additions and 0 deletions.
diff --git a/Figure3/3_Pathway/script/GSEA_analysis.r b/Figure3/3_Pathway/script/GSEA_analysis.r
@@ -0,0 +1,193 @@
+# conda activate clusterprofiler
+# 2023/09/13 Simon Mo
+### ------------------ Load data ------------------ ###
+# Load samples using script.
+# NOTE(review): the rest of this script uses `st_list`, `tumor_list` and tidyverse
+# helpers (str_glue, write_tsv, %>%, imap, ...) without loading them here, so they
+# are presumably provided by load_samples.r -- confirm.
+source('/PATH/TO/0_Load_All_Sample/script/load_samples.r')
+
+# Parameters
+# Root output directory; every result path below is built from it.
+out_path = ''
+
+### ------------------ Function ------------------ ###
+# Directory holding the helper scripts sourced below.
+# NOTE(review): confirm this points at the folder that actually contains the
+# function_*.r files.
+src_path = "/PATH/TO/3_Pathway/script"
+# DEG helpers. The path must be built from `src_path`; `str_path` was never defined.
+source(str_glue('{src_path}/function_findmarker_enhanced.r'))
+# Load GSEA pathway analysis scripts
+source(str_glue('{src_path}/function_runGSEA.r'))
+# Heatmap function
+source(str_glue('{src_path}/function_AnnDotPlot.r'))
+# Module score
+source(str_glue('{src_path}/function_TestAndFixAddModuleScore.r'))
+# Add average expression of genetic subclone 
+source(str_glue('{src_path}/function_CalculateCloneExpressionLevel.r'))
+# Load data base
+LoadMSigDBHuman()
+
+### ------------------ Analysis ------------------ ###
+### ------------------ Find DEG, GSEA ------------------ ###
+# Plotting dependencies
+library(Seurat)
+library(enrichplot)
+library(patchwork)
+
+# All outputs of this analysis are written below this directory
+out_path_analysis = str_glue('{out_path}/1A_subclone_vs_TME_plot')
+#ST_microregion_deg_list
+# Metadata column used both as identity class and as DEG grouping
+st_groupby = 'genetic_clone'
+
+## -------- This GSEA workflow is generic and can be applied to many datasets -------- ##
+# Get DEGs, GSEA for each sample
+sample_use_all = names(st_list)
+for(sample_use in sample_use_all){
+    # 0. select sample
+    message("Processing sample:", sample_use)
+    ST_use = st_list[[sample_use]]
+    Idents(ST_use) = st_groupby
+    #ST_use = subset(ST_use, downsample = 5) # For Testing
+
+    # 1. Run DEG analysis
+    # TME as ref, run A. microregion vs TME, B. TME vs NotTME
+    # (ident '0' is passed as the reference group)
+    deg_all_df = FindMarkersEachVsRefComplete(ST_use, group.by = st_groupby, ident_ref = '0')
+
+    # save result; the per-sample directory is created once and reused below
+    sample_out_dir = str_glue('{out_path_analysis}/{sample_use}')
+    dir.create(sample_out_dir, recursive = TRUE, showWarnings = FALSE)
+    write_tsv(deg_all_df, str_glue('{sample_out_dir}/0_DEG_result.tsv'))
+
+    # 2. get GSEA result for TME vs Tumor for multiple genesets,
+    #    dropping gene set collections that returned nothing
+    gsea_genesets_list = FindAllMarkerTable2GSEAresult_MsigDB(deg_all_df , genesets_include = c("H","C6")) %>%
+        discard(.p = ~length(.)==0)
+
+    # 3. plot GSEA result
+    # The spatial overview does not depend on the gene set, so build it once per sample
+    p_st = SpatialDimPlot(ST_use, group.by = st_groupby, stroke = NA, image.alpha = 0, label = TRUE)
+    iwalk(gsea_genesets_list, function(result_list, geneset_name){
+        # save result
+        saveRDS(result_list, str_glue('{sample_out_dir}/1_GSEA_result_{geneset_name}.rds'))
+        # plot
+        pdf(str_glue('{sample_out_dir}/1_dotplot_{geneset_name}.pdf'), height = 12, width = 10)
+        # Close the device even if plotting fails, so later plots do not end up in this file
+        on.exit(dev.off(), add = TRUE)
+        MakeGeneSetDotplot(result_list) %>% print()
+        print(p_st)
+    })
+}
+
+# ------------------ Set up parameters ------------------ #
+# Gene set collection whose saved GSEA result is plotted below.
+# It selects the file 1_GSEA_result_<markerset_name>.rds written by the GSEA step.
+markerset_name = "H"
+
+# ------------------ PLOT ------------------ #
+# 1. Plot DEG heatmap
+# B. Extract top GSEA pathways for nonTME and plot top features
+sample_use_all = names(st_list)
+
+n_genes_plot = Inf # set Inf to plot all genes
+for(sample_use_name in sample_use_all){
+    # Parameters
+    # Build the path from this loop's own variables. The previous version used
+    # `sample_use` (left over from the GSEA loop) and `geneset_name` (only defined
+    # inside a callback), so it never pointed at the current sample's file.
+    gsea_file_path = str_glue('{out_path_analysis}/{sample_use_name}/1_GSEA_result_{markerset_name}.rds')
+    if(!file.exists(gsea_file_path)) next
+    message("Processing sample:", sample_use_name)
+    gsea_use_list = readRDS(gsea_file_path)
+    st_obj_use = st_list[[sample_use_name]]
+
+    # Get GSEA core genes for a list of GSEA results
+    # Keep at most `n_genes_plot` distinct core genes per gene set ID
+    gsea_genes_df = imap(gsea_use_list, function(gsea_use, ident){
+        GetGSEAgenes(gsea_use) %>% mutate(ident = ident)
+    }) %>% bind_rows() %>% 
+        distinct(ID, core_enrichment) %>% 
+        group_by(ID) %>% 
+        slice_head(n=n_genes_plot) %>% 
+        ungroup()
+    # One character vector of genes per gene set ID
+    gsea_genes_plt_list = split(gsea_genes_df$core_enrichment, gsea_genes_df$ID)
+
+    # PreCheck if will cause issue when running AddModuleScore
+    st_obj_use = TestAndFixSeuratForAddModuleScore(st_obj_use)
+
+    # Annotation heatmap
+    dir.create(str_glue('{out_path_analysis}/{sample_use_name}/2_ExpHeatmap/{markerset_name}/'), recursive = TRUE, showWarnings = FALSE)
+    # Best effort: a gene set whose plot fails is skipped, but the reason is reported
+    # instead of being silently discarded.
+    p_list = imap(gsea_genes_plt_list, function(features_plt, geneset_id){
+        tryCatch(
+            AnnoDotPlot(st_obj_use, group.by = 'Filtered_tumor_regions', features = features_plt, 
+                annotation_idents = c('genetic_clone'), label_ident = TRUE,
+                cluster_row = TRUE,
+                cluster_col = TRUE,
+                mode = 'Heatmap',
+                title = geneset_id,
+                subtitle = sample_use_name,
+                ModuleScoreHeight = 3,
+                highlight_tiles = FALSE,
+                highlight_cutoff_quantile = 0.6,
+                highlight_color = "#333333",
+                highlight_thickness = 0.5
+            ),
+            error = function(e){
+                message("Skipping ", geneset_id, " for sample ", sample_use_name, ": ", conditionMessage(e))
+                NULL
+            }
+        )
+    }) %>% discard(.p = ~is.null(.x)) 
+
+    # Plot
+    iwalk(p_list, function(p, geneset_id){
+        message("Plotting:", geneset_id)
+        pdf(str_glue('{out_path_analysis}/{sample_use_name}/2_ExpHeatmap/{markerset_name}/2_annoDotplot_{geneset_id}.pdf'), height = 8, width = 8)
+        # Close the device even if printing fails
+        on.exit(dev.off(), add = TRUE)
+        print(p)
+    })
+}
+
+## -----    Plot tumor region expression plot for each pathway    ----- ##
+## Next Plot tumor region expression plot for each pathway
+# Version 2 - 20231010
+
+# Extract the genes list and save
+# For loop and split gene by panel
+samples_use = names(st_list)
+
+for(sample_use_name in samples_use){
+    # Parameters
+    # Build the path from this loop's own variables. The previous version used
+    # `sample_use` (left over from the GSEA loop) and `geneset_name` (only defined
+    # inside a callback), so it never pointed at the current sample's file.
+    gsea_file_path = str_glue('{out_path_analysis}/{sample_use_name}/1_GSEA_result_{markerset_name}.rds')
+    if(!file.exists(gsea_file_path)) next
+    message("Processing sample:", sample_use_name)
+    gsea_use_list = readRDS(gsea_file_path)
+    st_obj_use = st_list[[sample_use_name]]
+    st_tumor_use = tumor_list[[sample_use_name]]
+
+    # Number of genes (one panel each) written to a single PDF
+    panels_per_file = 9
+    gsea_genes_df = imap(gsea_use_list, function(gsea_use, ident){
+        GetGSEAgenes(gsea_use) %>% mutate(ident = ident)
+    }) %>% bind_rows() %>% 
+        #filter(ID %in% gsea_id_plt) %>%  # Use all geneset
+        distinct(ID, core_enrichment) %>% 
+        group_by(ID) %>% 
+        # Split by number of panels
+        mutate(ID_split = ceiling(seq_along(core_enrichment)/panels_per_file)) %>% 
+        mutate(ID_split_full = str_c(ID, '_', ID_split)) 
+
+    # Add average expression of genetic subclone 
+    # The previous code passed `obj_use`, which is not defined in this script.
+    # NOTE(review): the whole-sample object is used because the helper divides by
+    # the `0` (TME) column -- confirm this is the intended object.
+    clone_exp_df = CalculateCloneExpressionLevel(st_obj_use, features = unique(gsea_genes_df$core_enrichment))
+    gsea_genes_df = gsea_genes_df %>% 
+        left_join(y = clone_exp_df, by = c('core_enrichment' = 'Gene')) %>% 
+        # rearrange by clone group
+        arrange(ID, max_tumor)
+    write_tsv(gsea_genes_df, str_glue('{out_path_analysis}/{sample_use_name}/4_GSEA_result_long_{markerset_name}.tsv'))
+
+    # filtered version: keep genes whose top clone exceeds both the lowest clone
+    # and the TME by more than 1.5 fold
+    gsea_genes_filtered_df = gsea_genes_df %>% filter(min_max_tumor_ratio > 1.5, max_tme_ratio > 1.5) %>%
+        # Rearrange
+        arrange(ID, max_tumor) %>%
+        # Redo splitting 
+        group_by(ID) %>% 
+        # Split by number of panels
+        mutate(ID_split = ceiling(seq_along(core_enrichment)/panels_per_file)) %>% 
+        mutate(ID_split_full = str_c(ID, '_', ID_split)) %>%
+        mutate(ID_split_full = str_c(ID_split_full, '_', max_tumor)) 
+
+    write_tsv(gsea_genes_filtered_df, str_glue('{out_path_analysis}/{sample_use_name}/4_GSEA_result_long_{markerset_name}_filtered.tsv'))
+
+
+    # Plot
+    # One gene vector per output file, keyed by <ID>_<split index>_<top clone>
+    gsea_genes_plt_list = split(gsea_genes_filtered_df$core_enrichment, gsea_genes_filtered_df$ID_split_full)
+    # SpatialPlot
+    dir.create(str_glue('{out_path_analysis}/{sample_use_name}/5_SpatialPltPathwayGenesFiltered/{markerset_name}/'), recursive = TRUE, showWarnings = FALSE)
+    iwalk(gsea_genes_plt_list, function(features_plt, panel_id){
+        p = SpatialPlot(st_tumor_use, features = features_plt, stroke = NA, image.alpha = 0.4)
+        pwhole = SpatialPlot(st_obj_use, features = features_plt, stroke = NA, image.alpha = 0.4)
+        message("Plotting:", panel_id)
+        pdf(str_glue('{out_path_analysis}/{sample_use_name}/5_SpatialPltPathwayGenesFiltered/{markerset_name}/3_SpatialFeature_{panel_id}.pdf'), height = 8, width = 8)
+        # Close the device even if printing fails
+        on.exit(dev.off(), add = TRUE)
+        print(p)
+        print(pwhole)
+    })
+}
diff --git a/Figure3/3_Pathway/script/src/function_AnnDotPlot.r b/Figure3/3_Pathway/script/src/function_AnnDotPlot.r
@@ -0,0 +1,143 @@
+
+library(patchwork)
+
+# External functions
+source('/diskmnt/Datasets/Spatial_Transcriptomics/Analysis/Shared_resource/script_git/Clustering/function_reoderbyhcluster.r')
+source('/diskmnt/Datasets/Spatial_Transcriptomics/Analysis/ST_subclone/38-GenomicSubclone/5_GSEA/script/src/function_TestAndFixAddModuleScore.r')
+## ---------- Plot Function ---------- ##
+# 1. Dotplot/Heatmap with annotation
+#
+# Builds a stacked patchwork figure for a set of features, from top to bottom:
+#   (optional) module score boxplot, one annotation bar per entry of
+#   `annotation_idents`, and the main dot plot / heatmap of scaled average expression.
+#
+# Arguments:
+#   obj                        object passed to Seurat's DotPlot() and FetchData()
+#   group.by                   name (string) of the grouping variable shown on the x axis
+#   features                   features to plot; also used for the module score
+#   annotation_idents          variables fetched with FetchData() and drawn as annotation bars
+#   label_ident                if TRUE, print the annotation value on each bar tile
+#   mode                       "Dot" (colour + size) or "Heatmap" (tiles)
+#   cluster_row, cluster_col   if TRUE, reorder features / groups with ReorderByHCluster()
+#   title, subtitle            passed to patchwork::plot_annotation()
+#   AddModuleScore             if TRUE, add a ModuleScoreBoxplot() panel on top
+#   ModuleScoreHeight          relative height of the module score panel
+#   highlight_tiles            if TRUE, outline tiles above the quantile cutoff
+#   highlight_cutoff_quantile  quantile of avg.exp.scaled used as the cutoff
+#   highlight_color, highlight_thickness  outline colour and line width
+#   split_column, ...          accepted but not used by this function
+#
+# Returns the combined patchwork object.
+AnnoDotPlot = function(obj, group.by = 'seurat_clusters', features, annotation_idents, label_ident = TRUE, 
+    mode = c("Dot","Heatmap"),
+    cluster_row = TRUE,
+    cluster_col = TRUE,
+    # title
+    title = NULL, subtitle = NULL,
+    # Module score
+    AddModuleScore = TRUE, ModuleScoreHeight = 2,
+    # Highlight based on value cutoff
+    highlight_tiles = FALSE,
+    highlight_cutoff_quantile = 0.75, 
+    highlight_color = '#333333',
+    highlight_thickness = 1,
+    # Column to split
+    split_column = NULL,
+    ...){
+    # Resolve the mode first so an invalid value fails before any computation
+    mode = match.arg(mode)
+    message("annotation_idents: ", paste(annotation_idents, collapse = ", "), ". Take values from @meta.data")
+    # A. Main Dot/Heatmap plot
+    # Only the summarised data of DotPlot() is used; the plot itself is rebuilt below
+    pdata = DotPlot(obj, group.by = group.by, features = features) %>% .$data
+    # A0. Filter out no expression gene
+    pdata = pdata %>% filter(!is.nan(avg.exp.scaled))
+    pdata = pdata %>% filter(!is.na(features.plot)) # This is weird need check
+    # A1. Hierarchical clustering
+    if(cluster_row) pdata = pdata %>% ReorderByHCluster(ident_column = 'id', groupby_column = 'features.plot', value_column = 'avg.exp.scaled')
+    if(cluster_col) pdata = pdata %>% ReorderByHCluster(ident_column = 'features.plot', groupby_column = 'id', value_column = 'avg.exp.scaled')
+    # test = pdata %>% ReorderSplitByHCluster(ident_column = 'id', groupby_column = 'features.plot', value_column = 'avg.exp.scaled', split_by_vector = )
+    # return(test) 
+
+    p_dot_exp = pdata %>% ggplot(aes(x = id, y = features.plot)) 
+    # A2. Dot or Heatmap plot
+    if(mode == 'Dot'){
+        p_dot_exp = p_dot_exp + 
+            geom_point(aes(color = avg.exp.scaled, size = pct.exp)) + 
+            scale_color_gradient2(low = '#3333DD', mid = '#E0E0E0', high = '#DD3333', midpoint = 0) 
+    }else if(mode == 'Heatmap'){
+        #Heatmap
+        p_dot_exp = p_dot_exp +
+        geom_tile(aes(fill = avg.exp.scaled)) +
+        scale_fill_gradient2(low = '#3333DD', mid = '#E0E0E0', high = '#DD3333', midpoint = 0) 
+    }
+    # Highlight specific tiles
+    if(highlight_tiles){
+        highlight_cutoff_values = quantile(pdata$avg.exp.scaled, probs = highlight_cutoff_quantile)
+        pdata_highlight = pdata %>% filter(avg.exp.scaled > highlight_cutoff_values)
+        p_dot_exp = p_dot_exp +
+            geom_tile(data = pdata_highlight, aes(fill = avg.exp.scaled), color = highlight_color, linejoin= "round", linewidth = highlight_thickness) 
+    }
+    # Theme Adjustments
+    p_dot_exp = p_dot_exp + 
+        theme_bw() + 
+        theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5)) 
+    # Get groupby orders (reused so every stacked panel shares the same x order)
+    order_group_by = levels(pdata$id)
+    # Get final feature count (drives the relative height of the main panel)
+    p_dot_exp_nrow = length(unique(pdata$features.plot))
+
+
+    # B. Create column annotation bars
+    # Use idents in meta.data
+    annotation_df = FetchData(obj, vars = c(group.by, annotation_idents))
+    # One row per group; several annotation values in a group are pasted together
+    annotation_collapsed_df = annotation_df %>% 
+        group_by(.data[[group.by]]) %>% 
+        summarize(across(all_of(annotation_idents), ~paste(sort(unique(.)), collapse = ' '))) %>% 
+        # `group.by` holds a column name as a string, so the output name is injected with glue syntax
+        mutate("{group.by}" := factor(.data[[group.by]], levels = order_group_by)) # Reorder groupby
+    # Create bar/column plot
+    p_bar_list = map(annotation_idents, function(anno_ident){
+        p_bar = annotation_collapsed_df[, c(group.by, anno_ident)] %>% 
+            ggplot(aes(x = .data[[group.by]], y = anno_ident, fill = .data[[anno_ident]])) + 
+            geom_tile() +
+            theme_void() + 
+            #theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5))  
+            # Remove x axis 
+            theme(axis.text.x = element_blank(), axis.ticks.x = element_blank(), axis.title.x = element_blank()) +
+            # Add y text back
+            theme(axis.text.y = element_text(angle = 0, hjust = 1, vjust = 0.5)) + 
+            # Use color scale colorspace::rainbow_hcl(n)
+            scale_fill_manual(values = colorspace::rainbow_hcl(n = length(unique(annotation_collapsed_df[[anno_ident]]))))
+        if(label_ident) p_bar = p_bar + geom_text(aes(label = str_wrap(.data[[anno_ident]], width = 4))) 
+        return(p_bar)
+    }) %>% setNames(annotation_idents)
+
+    # B1. Make current height arrangement
+    # One unit per bar actually built, so heights always match the number of panels
+    p_bar_nrow = length(p_bar_list)
+    plot_height_arrangement = c(rep(1,p_bar_nrow), p_dot_exp_nrow)
+
+    # B2. Add module score
+    if(AddModuleScore){
+        # First test if need to fix object
+        obj = TestAndFixSeuratForAddModuleScore(obj)
+        # Plot
+        p_modulescore = ModuleScoreBoxplot(obj, group.by = group.by, features_plt = features) + 
+            theme_bw() + 
+            # Remove x axis
+            theme(axis.text.x = element_blank(), axis.ticks.x = element_blank(), axis.title.x = element_blank()) +
+            # Remove minor grid
+            theme(panel.grid.minor = element_blank()) +
+            # Add y text back
+            theme(axis.title.y = element_text(angle = 0, hjust = 1, vjust = 0.5))
+        # Reorder column
+        p_modulescore$data = p_modulescore$data %>% 
+            mutate("{group.by}" := factor(.data[[group.by]], levels = order_group_by)) # Reorder groupby
+        # update plot arrangement
+        p_bar_list = c(list(ModuleScore = p_modulescore), p_bar_list)
+        plot_height_arrangement = c(ModuleScoreHeight, plot_height_arrangement) # Append height of module score to top 
+    }
+
+    # C. Combined
+    p_all = wrap_plots(c(p_bar_list, list(Dotplot= p_dot_exp)), ncol = 1, heights = plot_height_arrangement, guides = "collect") &
+         # put annotation to bottom
+        theme(legend.position = 'bottom')  
+    # D. Titles
+    p_all = p_all + plot_annotation(title = title, subtitle = subtitle, 
+        theme = theme(
+            plot.title = element_text(hjust = 0.5, face = 'bold'),
+            plot.subtitle = element_text(hjust = 0.5, face = 'italic'))
+        )
+    return(p_all)
+}
+
+# Module score boxplot
+#
+# Scores `features_plt` as a single module with Seurat's AddModuleScore() and
+# returns a ggplot boxplot of that score, one box per value of `group.by`.
+#
+# Arguments:
+#   obj           object passed to AddModuleScore() and FetchData()
+#   group.by      name (string) of the grouping variable
+#   features_plt  features that make up the module
+#
+# Returns a ggplot object; `obj` itself is not returned or modified.
+ModuleScoreBoxplot = function(obj, group.by = 'seurat_clusters', features_plt){
+    message("Calculating Module score")
+    # A single feature list with name = 'Cluster' is stored as column 'Cluster1';
+    # the name is spelled out so the lookup below does not rely on a default.
+    obj_tmp = AddModuleScore(obj, features = list(ModuleScore=features_plt), name = 'Cluster')
+    obj_tmp@meta.data[['ModuleScore']] = obj_tmp@meta.data[['Cluster1']]
+
+    # Module score boxplot
+    score_df = FetchData(obj_tmp, vars = c(group.by, 'ModuleScore')) %>% 
+        # `group.by` holds a column name as a string, so the output name is injected
+        # with glue syntax; the grouping values are shown as discrete labels
+        mutate("{group.by}" := as.character(.data[[group.by]]))
+
+    ggplot(score_df, aes(x = .data[[group.by]], y = ModuleScore, fill = .data[[group.by]], group = .data[[group.by]])) +
+        # Outliers are hidden (outlier.shape = NA)
+        geom_boxplot(width = 0.4, alpha = 0.5, outlier.shape = NA) +
+        theme_bw() +
+        theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5)) 
+}
diff --git a/Figure3/3_Pathway/script/src/function_CalculateCloneExpressionLevel.r b/Figure3/3_Pathway/script/src/function_CalculateCloneExpressionLevel.r
@@ -0,0 +1,30 @@
+# Add average expression of genetic subclone 
+#
+# Averages expression per `group.by` level with Seurat's AverageExpression() and,
+# for every gene, records which 'clone' column is highest / lowest plus two ratios.
+#
+# Arguments:
+#   obj       object passed to AverageExpression()
+#   features  genes to summarise
+#   assay     assay to average (also used to pick the result matrix)
+#   group.by  grouping variable; the result must contain columns whose names
+#             contain 'clone' and a column named `0`, which is used as TME
+#
+# Returns an ungrouped tibble with one row per gene: `Gene`, the per-group averages,
+#   max_tumor_value / max_tumor   highest clone average and its column name
+#   min_tumor_value / min_tumor   lowest clone average and its column name
+#   min_max_tumor_ratio           max_tumor_value / min_tumor_value
+#   max_tme_ratio                 max_tumor_value / `0`
+# Ratios are plain divisions, so a zero denominator yields Inf or NaN.
+CalculateCloneExpressionLevel = function(obj, features, assay = 'SCT', group.by = 'genetic_clone'){
+    message("Note, currently use 0 as TME and clone to detect tumor columns")
+    message("Assay = ", assay)
+    # Calculate average expression of genetic subclone
+    exp_df = obj %>% 
+        AverageExpression(
+            assays = assay, slot = 'data', 
+            group.by =  group.by,
+            features = features) %>% 
+            .[[assay]] %>% 
+        as.data.frame %>%
+        rownames_to_column('Gene')
+
+    # Resolve the tumor (clone) columns once, with the same matching rule as before
+    clone_cols = colnames(dplyr::select(exp_df, contains('clone')))
+    # Fail with a clear message instead of an opaque error further down
+    if(length(clone_cols) == 0){
+        stop("No column containing 'clone' found after averaging by '", group.by, "'", call. = FALSE)
+    }
+    if(!('0' %in% colnames(exp_df))){
+        stop("No '0' (TME) column found after averaging by '", group.by, "'", call. = FALSE)
+    }
+
+    # Add Min, max and difference 
+    exp_df %>% 
+        rowwise() %>% 
+        # Get Min and Max; c_across() keeps the column order of `clone_cols`,
+        # so the position of the extreme value indexes straight into the names
+        mutate(
+            max_tumor_value = max(c_across(all_of(clone_cols))),
+            max_tumor = clone_cols[which.max(c_across(all_of(clone_cols)))],
+            min_tumor_value = min(c_across(all_of(clone_cols))),
+            min_tumor = clone_cols[which.min(c_across(all_of(clone_cols)))]
+            ) %>% 
+        # Drop the rowwise grouping so callers receive a plain tibble
+        ungroup() %>% 
+        # Add difference
+        mutate(
+            min_max_tumor_ratio = max_tumor_value / min_tumor_value,
+            max_tme_ratio = max_tumor_value / `0`
+        )
+}