Skip to content

Commit

Permalink
added gene set selection to celltype_GSVA
Browse files Browse the repository at this point in the history
  • Loading branch information
MatthiasLienhard committed Nov 7, 2024
1 parent 775e973 commit 1d12966
Show file tree
Hide file tree
Showing 3 changed files with 143 additions and 10 deletions.
12 changes: 7 additions & 5 deletions workflow/rules/scAmpi_basic_rules.smk
Original file line number Diff line number Diff line change
Expand Up @@ -279,7 +279,7 @@ rule sctransform_preprocessing:
conda:
"../envs/sctransform_preprocessing.yaml"
resources:
mem_mb=lambda wc, input: min(8000 + 20 * input.size_mb, max_mem_mb),
mem_mb=config["computingResources"]["mem_mb"]["medium"],
runtime=config["computingResources"]["runtime"]["medium"],
threads: config["computingResources"]["threads"]["medium"]
log:
Expand Down Expand Up @@ -480,11 +480,11 @@ rule celltype_gsva:
input:
"results/atypical_removed/{sample}.atypical_removed.RDS"
output:
outfile="results/celltype_gsva_c6/{sample}_celltype_GSVA.tsv"
outfile="results/celltype_gsva/{gene_set}/{sample}_celltype_GSVA.tsv"
params:
outdir=lambda w, output: dirname(output.outfile),
sampleName="{sample}",
genesets=config["resources"]["genesets_c6"],
genesets=lambda w: config["tools"]["celltype_gsva"]["gene_set"][w.gene_set],
tumor_celltypes=config["tools"]["celltype_gsva"]["celltype_set"],
min_set_size=config["tools"]["celltype_gsva"]["min_set_size"],
method="gsva",
Expand All @@ -496,11 +496,13 @@ rule celltype_gsva:
runtime=config["computingResources"]["runtime"]["medium"],
threads: config["computingResources"]["threads"]["medium"],
log:
"logs/celltype_gsva_c6/{sample}.log",
"logs/celltype_gsva_c6/{sample}_{gene_set}.log",
benchmark:
"logs/benchmark/celltype_gsva_c6/{sample}.benchmark"
"logs/benchmark/celltype_gsva_c6/{sample}_{gene_set}.benchmark"
script:
"../scripts/compute_gsva_scores_per_celltype.R"


# generate plots about sample composition and gene expression
rule plotting:
input:
Expand Down
132 changes: 132 additions & 0 deletions workflow/scripts/2024-01_compute_gsva_scores_per_celltype.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,132 @@
################################################################################
## Run GSVA
################################################################################
# Run GSVA analysis on an SCE object in RDS file.
# Script does not generate any plots.
# Anne Bertolini
# Jan 2024

library(optparse)
library(reshape2)
library(GSVA)
library(limma)
library(SingleCellExperiment)
library(stringr)

options(stringsAsFactors = FALSE)

# command line arguments are parsed
option_list <- list(
make_option("--SCE", type = "character", help = "Path to SCE object file with input data."),
make_option("--subset_celltype", type = "character", help = "Celltype(s) that will be included in the analysis. Several cell types can be specified, comma separated."),
make_option("--outdir", type = "character", help = "Path to the directory where output files will be written."),
make_option("--geneset", type = "character", help = "Geneset library gmt file."),
make_option("--sample", type = "character", help = "Sample identifier. Attached to each output name."),
make_option("--min_set_size", type = "integer", help = "Minimum gene set size for it to be considered."),
make_option("--method", type = "character", help = "Method to calculate GSVA score. See GSVA package documentation.")
)
opt_parser <- OptionParser(option_list = option_list)
opt <- parse_args(opt_parser)


# give out date, time, session info, and input files
cat("\n\n")
print(Sys.time())
cat("\n\n\nPrint sessionInfo:\n\n")
print(sessionInfo())
cat("\n\n\n")
cat("\nInput files:\n\n")
print(opt)
cat("\n\n")


## load input data
my_sce <- readRDS(opt$SCE)
print(my_sce)

## identify tumour cells only
keep_celltypes <- unlist(stringr::str_split(opt$subset_celltype, pattern = ","))
cat("\n\n\n### Only processing cells of type:\n\n")
print(keep_celltypes)
cat("\n\n\n")

# subset SCE object to contain only tumour cells
mask_celltype <- colData(my_sce)$celltype_major %in% keep_celltypes
cat("### Cells included in analysis:\n")
print(table(mask_celltype))
my_sce <- my_sce[, mask_celltype]

# report number of cells
if (as.integer(dim(my_sce)[2]) == 0) {
cat("\n### Sample", opt$sample, ": Zero cells\n\n")
} else {
cat("\n### Sample", opt$sample, ":", dim(my_sce)[2], "cells\n\n")
}

################################################################################

# read gmt file with gene sets
# make sure all columns are captured when reading in the gene set lists in gmt format
# readLines results in a character vector with one string per gene set

tmp <- readLines(opt$geneset)
# turn tmp into list of character vectors
tmp <- lapply(tmp, function(x) strsplit(x, "\\\t")[[1]])
names(tmp) <- lapply(tmp, function(x) x[1])
# remove gene set names from gene list
gset <- lapply(tmp, function(x) x[-1])
# remove gene set description from gene list
gset <- lapply(gset, function(x) x[-1])
cat("\n### Gene sets analysed:\n\n")
names(gset)
cat("\n\n\n")

# check how many genes of the gene sets are found in the matrix
gset_symbols <- unique(unlist(gset))

# check overlap and differences
setdiff_sce_gset <- setdiff(rownames(my_sce), gset_symbols)
setdiff_gset_sce <- setdiff(gset_symbols, rownames(my_sce))
stopifnot(length(intersect(setdiff_sce_gset, setdiff_gset_sce)) == 0)


cat("\n\n### Number of genes in count matrix but not in gene sets:", length(setdiff(rownames(my_sce), gset_symbols)), "\n\n")
cat("\n\n### Number of genes in gene sets but not in count matrix:", length(setdiff(gset_symbols, rownames(my_sce))), "\n\n")
print(setdiff(gset_symbols, rownames(my_sce)))

# have list of gene sets, HGNC symbol strings replaced with index of respective gene in SCE object
idxs <- limma::ids2indices(gset, rownames(my_sce))
# get unique numeric vector of all indices, occurring in gset and my_sce
all_genes <- unique(as.numeric(unlist(idxs)))
cat("\n### Number of unique genes in all gene sets:", length(gset_symbols), "\n\n")
cat("\n### Number of genes found in matrix:", length(all_genes), "\n\n")

# it was decided to not reduce the matrix to the sum of genes of all gene sets and keep all genes
#t.m <- assay(my_sce, "normcounts")[all_genes, ]

count_matrix <- assay(my_sce, "normcounts")
cat("\n### Number of genes in gene sets with non-zero expression in matrix:", table(rowSums(count_matrix) == 0)[1], "\n\n")
cat("\n### Number of genes in gene sets with zero expression in all cells:", table(rowSums(count_matrix) == 0)[2], "\n\n")

# remove genes that have zero counts in all cells
mask_genes_all_zero <- rowSums(count_matrix) != 0
count_matrix <- count_matrix[mask_genes_all_zero, ]

# estimate geneset-sample matrix from gene-sample matrix
# GSVA
gsva_result <- gsva(count_matrix,
gset,
method = opt$method,
min.sz = opt$min_set_size)

# reformat results
gsva_long <- melt(gsva_result)
names(gsva_long) <- c("gene_set", "barcode", "value")
print("str(gsva_long):")
print(head(gsva_long))

cat("\n### Number of gene sets with results available for sample ", opt$sample, ":", length(unique(gsva_long$gene_set)), "\n\n")

# write results into file
filename <- paste0(opt$outdir,'/', opt$sample, "_celltype_GSVA.tsv")
write.table(gsva_long, file = filename, sep = "\t", row.names = FALSE, col.names = TRUE, quote = FALSE)
9 changes: 4 additions & 5 deletions workflow/snakefile_seacells.smk
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,9 @@ localrules:
scAmpi_seacells,


# gene sets for celltype GSVA
GENE_SETS=set(config["tools"]["celltype_gsva"]["gene_set"].keys())

# final rule of pipeline
# defines output of scampi seacells
rule scAmpi_seacells:
Expand All @@ -54,9 +57,6 @@ rule scAmpi_seacells:
"results/plotting/{sample}_seacells.first_celltype.png",
sample=sample_ids,
),



expand("results/gsva/{sample}.gsetscore_hm.png", sample=sample_ids),
#expand("results/diff_exp_analysis/{sample}/", sample=sample_ids),
expand(
Expand All @@ -65,6 +65,5 @@ rule scAmpi_seacells:
),
expand("results/gsva/{sample}_seacells.gsetscore_hm.png", sample=sample_ids),
"results/metacells/seacells_celltype_purity.png",
expand("results/celltype_gsva_c6/{sample}_seacells_celltype_GSVA.tsv", sample=sample_ids),
expand("results/celltype_gsva_c6/{sample}_celltype_GSVA.tsv", sample=sample_ids),
expand("results/celltype_gsva/{gene_set}/{sample}_seacells_celltype_GSVA.tsv", sample=sample_ids, gene_set=GENE_SETS),

0 comments on commit 1d12966

Please sign in to comment.