Skip to content

Commit

Permalink
Pathway GSEA script
Browse files Browse the repository at this point in the history
  • Loading branch information
simoncmo committed Dec 9, 2023
1 parent 681c887 commit e725d6b
Show file tree
Hide file tree
Showing 6 changed files with 787 additions and 0 deletions.
193 changes: 193 additions & 0 deletions Figure3/3_Pathway/script/GSEA_analysis.r
Original file line number Diff line number Diff line change
@@ -0,0 +1,193 @@
# conda activate clusterprofiler
# 2023/09/13 Simon Mo
### ------------------ Load data ------------------ ###
# Load samples using script
source('/PATH/TO/0_Load_All_Sample/script/load_samples.r')

# Parameters
out_path = ''

### ------------------ Function ------------------ ###
src_path="/PATH/TO/3_Pathway/script"
source(str_glue('{str_path}/function_findmarker_enhanced.r'))
# Load GSEA pathway analysis scripts
source(str_glue('{src_path}/function_runGSEA.r'))
# Heatmap function
source(str_glue('{src_path}/function_AnnDotPlot.r'))
# Module score
source(str_glue('{src_path}/function_TestAndFixAddModuleScore.r'))
# Add average expression of genetic subclone
source(str_glue('{src_path}/function_CalculateCloneExpressionLevel.r'))
# Load data base
LoadMSigDBHuman()

### ------------------ Analysis ------------------ ###
### ------------------ Find DEG, GSEA ------------------ ###
# plot
library(Seurat)
library(enrichplot)
library(patchwork)

out_path_analysis = str_glue('{out_path}/1A_subclone_vs_TME_plot')
#ST_microregion_deg_list
st_groupby = 'genetic_clone'

## -------- This GSEA workflow is generic and can be apply to many datasets -------- ##
# Get DEGs, GSEA for each sample
sample_use_all = names(st_list)
for(sample_use in sample_use_all){
# 0. select sample
message("Processing sample:", sample_use)
ST_use = st_list[[sample_use]]
Idents(ST_use) = st_groupby
#ST_use = subset(ST_use, downsample = 5) # For Testing

# 1. Run DEG analysis
# TME as ref, run A. miroregion vs TME, B. TME vs NotTME
deg_all_df = FindMarkersEachVsRefComplete(ST_use, group.by = st_groupby, ident_ref = '0')

# save result
dir.create(str_glue('{out_path_analysis}/{sample_use}'), recursive = TRUE, showWarnings = FALSE)
write_tsv(deg_all_df, str_glue('{out_path_analysis}/{sample_use}/0_DEG_result.tsv'))

# 2. get GSEA result for TME vs Tumor for multiple genesets
gsea_genesets_list = FindAllMarkerTable2GSEAresult_MsigDB(deg_all_df , genesets_include = c("H","C6")) %>%
discard(.p = ~length(.)==0)

# 3. plot GSEA result
iwalk(gsea_genesets_list, function(result_list, geneset_name){
# save result
dir.create(str_glue('{out_path_analysis}/{sample_use}'), recursive = TRUE, showWarnings = FALSE)
saveRDS(result_list, str_glue('{out_path_analysis}/{sample_use}/1_GSEA_result_{geneset_name}.rds'))
# plot
p_st = SpatialDimPlot(ST_use, group.by = st_groupby, stroke = NA, image.alpha = 0, label = T)
pdf(str_glue('{out_path_analysis}/{sample_use}/1_dotplot_{geneset_name}.pdf'), height = 12, width = 10)
MakeGeneSetDotplot(result_list) %>% print()
print(p_st)
dev.off()
})
}

# ------------------ Set up parameters ------------------ #
markerset_name = "H"

# ------------------ PLOT ------------------ #
# 1. Plot DEG heatmap
# B. Extract top GSEA pathways for nonTME and plot top features
sample_use_all = names(st_list)

n_genes_plot = Inf # set Inf to plot all genes
for(sample_use_name in sample_use_all){
# Parameters
gsea_file_path = str_glue('{out_path_analysis}/{sample_use}/1_GSEA_result_{geneset_name}.rds')
if(!file.exists(gsea_file_path)) next
message("Processing sample:", sample_use_name)
gsea_use_list = readRDS(gsea_file_path)
st_obj_use = st_list[[sample_use_name]]

# Get GSEA core genes for a list of GSEA results
# Select 15 genes from each select GSEA result to plot
gsea_genes_df = imap(gsea_use_list, function(gsea_use, ident){
GetGSEAgenes(gsea_use) %>% mutate(ident = ident)
}) %>% bind_rows() %>%
distinct(ID, core_enrichment) %>%
group_by(ID) %>%
slice_head(n=n_genes_plot)
gsea_genes_plt_list = split(gsea_genes_df$core_enrichment, gsea_genes_df$ID)

# PreCheck if will cause issue when running AddModuleScore
st_obj_use = TestAndFixSeuratForAddModuleScore(st_obj_use)

# Annotation heatmap
dir.create(str_glue('{out_path_analysis}/{sample_use_name}/2_ExpHeatmap/{markerset_name}/'), recursive = TRUE, showWarnings = FALSE)
p_list = imap(gsea_genes_plt_list, possibly(function(features_plt, geneset_id){
AnnoDotPlot(st_obj_use, group.by = 'Filtered_tumor_regions', features = features_plt,
annotation_idents = c('genetic_clone'), label_ident= T,
cluster_row = T,
cluster_col = T,
mode = 'Heatmap',
title = geneset_id,
subtitle = sample_use_name,
ModuleScoreHeight = 3,
highlight_tiles = F,
highlight_cutoff_quantile = 0.6,
highlight_color = "#333333",
highlight_thickness = 0.5,
)

}, otherwise = NULL)) %>% discard(.p = ~is.null(.x))

# Plot
iwalk(p_list, function(p, geneset_id){
message("Plotting:", geneset_id)
pdf(str_glue('{out_path_analysis}/{sample_use_name}/2_ExpHeatmap/{markerset_name}/2_annoDotplot_{geneset_id}.pdf'), height = 8, width = 8)
print(p)
dev.off()
})
}

## ----- Plot tumor region expression plot for each pathway ----- ##
## Next Plot tumor region expression plot for each pathway
# Version 2 - 20231010

# Extract the genes list and save
# For loop and split gene by panel
samples_use = names(st_list)

for(sample_use_name in samples_use){
# Parameters
gsea_file_path = str_glue('{out_path_analysis}/{sample_use}/1_GSEA_result_{geneset_name}.rds')
if(!file.exists(gsea_file_path)) next
message("Processing sample:", sample_use_name)
gsea_use_list = readRDS(gsea_file_path)
st_obj_use = st_list[[sample_use_name]]
st_tumor_use = tumor_list[[sample_use_name]]

panels_per_file = 9
gsea_genes_df = imap(gsea_use_list, function(gsea_use, ident){
GetGSEAgenes(gsea_use) %>% mutate(ident = ident)
}) %>% bind_rows() %>%
#filter(ID %in% gsea_id_plt) %>% # Use all geneset
distinct(ID, core_enrichment) %>%
group_by(ID) %>%
# Split by number of panels
mutate(ID_split = ceiling(seq_along(core_enrichment)/panels_per_file)) %>%
mutate(ID_split_full = str_c(ID, '_', ID_split))

# Add average expression of genetic subclone
gsea_genes_df = gsea_genes_df %>%
left_join(
y = CalculateCloneExpressionLevel(obj_use, features = unique(.$core_enrichment)),
by = c('core_enrichment' = 'Gene')
) %>% # rearrange by clone group
arrange(ID, max_tumor)
write_tsv(gsea_genes_df, str_glue('{out_path_analysis}/{sample_use_name}/4_GSEA_result_long_{markerset_name}.tsv'))

# filtered version
gsea_genes_filtered_df = gsea_genes_df %>% filter(min_max_tumor_ratio > 1.5, max_tme_ratio > 1.5) %>%
# Rearrange
arrange(ID, max_tumor) %>%
# Redo splitting
group_by(ID) %>%
# Split by number of panels
mutate(ID_split = ceiling(seq_along(core_enrichment)/panels_per_file)) %>%
mutate(ID_split_full = str_c(ID, '_', ID_split)) %>%
mutate(ID_split_full = str_c(ID_split_full, '_', max_tumor))

write_tsv(gsea_genes_filtered_df, str_glue('{out_path_analysis}/{sample_use_name}/4_GSEA_result_long_{markerset_name}_filtered.tsv'))


# Plot
gsea_genes_plt_list = split(gsea_genes_filtered_df$core_enrichment, gsea_genes_filtered_df$ID_split_full)
# SpatialPlot
dir.create(str_glue('{out_path_analysis}/{sample_use_name}/5_SpatialPltPathwayGenesFiltered/{markerset_name}/'), recursive = TRUE, showWarnings = FALSE)
iwalk(gsea_genes_plt_list, function(features_plt, geneset_name){
p = SpatialPlot(st_tumor_use, features = features_plt, stroke = NA, image.alpha = 0.4)
pwhole = SpatialPlot(st_obj_use, features = features_plt, stroke = NA, image.alpha = 0.4)
message("Plotting:", geneset_name)
pdf(str_glue('{out_path_analysis}/{sample_use_name}/5_SpatialPltPathwayGenesFiltered/{markerset_name}/3_SpatialFeature_{geneset_name}.pdf'), height = 8, width = 8)
print(p)
print(pwhole)
dev.off()
})
}
143 changes: 143 additions & 0 deletions Figure3/3_Pathway/script/src/function_AnnDotPlot.r
Original file line number Diff line number Diff line change
@@ -0,0 +1,143 @@

library(patchwork)

# External functions
source('/diskmnt/Datasets/Spatial_Transcriptomics/Analysis/Shared_resource/script_git/Clustering/function_reoderbyhcluster.r')
source('/diskmnt/Datasets/Spatial_Transcriptomics/Analysis/ST_subclone/38-GenomicSubclone/5_GSEA/script/src/function_TestAndFixAddModuleScore.r')
## ---------- Plot Function ---------- ##
# 1. Dotplot/Heatmap with annotation
AnnoDotPlot = function(obj, group.by = 'seurat_clusters', features, annotation_idents, label_ident = T,
mode = c("Dot","Heatmap"),
cluster_row = T,
cluster_col = T,
# title
title = NULL, subtitle = NULL,
# Module score
AddModuleScore = T, ModuleScoreHeight = 2,
# Highlight based on value cutoff
highlight_tiles = F,
highlight_cutoff_quantile = 0.75,
highlight_color = '#333333',
highlight_thickness = 1,
# Column to splot
split_column = NULL,
...){
message("annotation_idents:", annotation_idents, "Take values from @meta.data")
# A. Main Dot/Heatmap plot
pdata = DotPlot(obj, group.by = group.by, features = features) %>% .$data
# A0. Filter out no expression gene
pdata = pdata %>% filter(!is.nan(avg.exp.scaled))
pdata = pdata %>% filter(!is.na(features.plot)) # This is weird need check
# A1. Hierarchical clustering
if(cluster_row) pdata = pdata %>% ReorderByHCluster(ident_column = 'id', groupby_column = 'features.plot', value_column = 'avg.exp.scaled')
if(cluster_col) pdata = pdata %>% ReorderByHCluster(ident_column = 'features.plot', groupby_column = 'id', value_column = 'avg.exp.scaled')
# test = pdata %>% ReorderSplitByHCluster(ident_column = 'id', groupby_column = 'features.plot', value_column = 'avg.exp.scaled', split_by_vector = )
# return(test)

p_dot_exp = pdata %>% ggplot(aes(x = id, y = features.plot))
# A2. Dot of Heatmap plot
mode = match.arg(mode)
if(mode == 'Dot'){
p_dot_exp = p_dot_exp +
geom_point(aes(color = avg.exp.scaled, size = pct.exp)) +
scale_color_gradient2(low = '#3333DD', mid = '#E0E0E0', high = '#DD3333', midpoint = 0)
}else if(mode == 'Heatmap'){
#Heatmap
p_dot_exp = p_dot_exp +
geom_tile(aes(fill = avg.exp.scaled)) +
scale_fill_gradient2(low = '#3333DD', mid = '#E0E0E0', high = '#DD3333', midpoint = 0)
}
# Highlight specific tiles
if(highlight_tiles){
highlight_cutoff_values = quantile(pdata$avg.exp.scaled, probs = highlight_cutoff_quantile)
pdata_highlight = pdata %>% filter(avg.exp.scaled > highlight_cutoff_values)
p_dot_exp = p_dot_exp +
geom_tile(data = pdata_highlight, aes(fill = avg.exp.scaled), color = highlight_color, linejoin= "round", linewidth = highlight_thickness)
}
# Theme Adjustments
p_dot_exp = p_dot_exp +
theme_bw() +
theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5))
# Get groupby orders
order_group_by = levels(pdata$id)
# Get final feature count
p_dot_exp_nrow = length(unique(pdata$features.plot))


# B. Create column annotation bars
# Use idetns in meta.data
annotation_df = FetchData(obj, vars = c(group.by, annotation_idents))
annotation_collapsed_df = annotation_df %>%
group_by(.data[[group.by]]) %>%
summarize(across(all_of(annotation_idents), ~paste(sort(unique(.)), collapse = ' '))) %>%
mutate({{group.by}} := factor(.data[[group.by]], levels = order_group_by)) # Reoder groupby
# # Create bar/column plot
p_bar_nrow = length(unique(annotation_idents))
p_bar_list = map(annotation_idents, function(anno_ident){
p_bar = annotation_collapsed_df[, c(group.by, anno_ident)] %>%
ggplot(aes(x = .data[[group.by]], y = anno_ident, fill = .data[[anno_ident]])) +
geom_tile() +
theme_void() +
#theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5))
# Remove x axis
theme(axis.text.x = element_blank(), axis.ticks.x = element_blank(), axis.title.x = element_blank()) +
# Add y text back
theme(axis.text.y = element_text(angle = 0, hjust = 1, vjust = 0.5)) +
# Use color scale colorspace::rainbow_hcl(n)
scale_fill_manual(values = colorspace::rainbow_hcl(n = length(unique(annotation_collapsed_df[[anno_ident]]))))
if(label_ident) p_bar = p_bar + geom_text(aes(label = str_wrap(.data[[anno_ident]], width = 4)))
return(p_bar)
}) %>% setNames(annotation_idents)

# B1. Make current height arragment
plot_height_arrangement = c(rep(1,p_bar_nrow), p_dot_exp_nrow)

# B. Add module score
if(AddModuleScore){
# First test if need to fix object
obj = TestAndFixSeuratForAddModuleScore(obj)
# Plot
p_modulescore = ModuleScoreBoxplot(obj, group.by = group.by, features_plt = features) +
theme_bw() +
# Remove x axis
theme(axis.text.x = element_blank(), axis.ticks.x = element_blank(), axis.title.x = element_blank()) +
# Remove minor grid
theme(panel.grid.minor = element_blank()) +
# Add y text back
theme(axis.title.y = element_text(angle = 0, hjust = 1, vjust = 0.5))
# Reorder column
p_modulescore$data = p_modulescore$data %>%
mutate({{group.by}} := factor(.data[[group.by]], levels = order_group_by)) # Reoder groupby
# update plot arrangement
p_bar_list = c(list(ModuleScore = p_modulescore), p_bar_list)
plot_height_arrangement = c(ModuleScoreHeight, plot_height_arrangement) # Append height of module score to top
}

# C. Combined
p_all = wrap_plots(c(p_bar_list, list(Dotplot= p_dot_exp)), ncol = 1, heights = plot_height_arrangement, guides = "collect") &
# put annotation to bottom
theme(legend.position = 'bottom')
# D, Titles
p_all = p_all + plot_annotation(title = title, subtitle = subtitle,
theme = theme(
plot.title = element_text(hjust = 0.5, face = 'bold'),
plot.subtitle = element_text(hjust = 0.5, face = 'italic'))
)
return(p_all)
}

# Module score boxplot
ModuleScoreBoxplot = function(obj, group.by = 'seurat_clusters', features_plt){
#browser()
message("Calculating Module score")
obj_tmp = AddModuleScore(obj, features = list(ModuleScore=features_plt))
obj_tmp@meta.data[['ModuleScore']] = obj_tmp@meta.data[['Cluster1']]

# Module score boxplot
FetchData(obj_tmp, vars = c(group.by, 'ModuleScore')) %>%
mutate({{group.by}} := as.character(.data[[group.by]])) %>%
ggplot(aes(x = .data[[group.by]], y = ModuleScore, fill = .data[[group.by]], group = .data[[group.by]])) +
geom_boxplot(width = 0.4, alpha = 0.5, outlier.shape = NA) +
theme_bw() +
theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5))
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
# Add average expression of genetic subclone
CalculateCloneExpressionLevel = function(obj, features, assay = 'SCT', group.by = 'genetic_clone'){
message("Note, currently use 0 as TME and clone to detect tumor columns")
message("Assay = ", assay)
# Calculate average expression of genetic subclone
exp_df = obj %>%
AverageExpression(
assays = assay, slot = 'data',
group.by = group.by,
features = features) %>%
.[[assay]] %>%
as.data.frame %>%
rownames_to_column('Gene')
# Add Min, max and difference
exp_df %>%
rowwise() %>%
# Get Min and Max
mutate(
max_tumor_value = max(c_across(contains('clone'))),
max_tumor = unlist(pmap(across(contains('clone')), ~names(c(...)[which.max(c(...))]))),
min_tumor_value = min(c_across(contains('clone'))),
min_tumor = unlist(pmap(across(contains('clone')), ~names(c(...)[which.min(c(...))])))
# ^^ https://stackoverflow.com/questions/17735859/for-each-row-return-the-column-name-of-the-largest-value
) %>%
# Add difference
mutate(
min_max_tumor_ratio = max_tumor_value / min_tumor_value,
max_tme_ratio = max_tumor_value / `0`
)
}
Loading

0 comments on commit e725d6b

Please sign in to comment.