Skip to content

Commit

Permalink
ettore 1.1.4
Browse files Browse the repository at this point in the history
  • Loading branch information
emosca-cnr committed Nov 24, 2023
1 parent 76a239d commit d0f46a9
Show file tree
Hide file tree
Showing 126 changed files with 752 additions and 555 deletions.
7 changes: 4 additions & 3 deletions DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
Package: scMuffin
Title: MUlti-Features INtegrative approach for single-cell data analysis
Version: 1.1.3
Date: 2023-11-13
Version: 1.1.4
Date: 2023-11-22
Authors@R:
c(person(given = "Valentina",
family = "Nale",
Expand Down Expand Up @@ -52,7 +52,8 @@ Imports:
circlize,
Matrix,
pals,
plotrix
plotrix,
qvalue
Suggests:
rmarkdown,
knitr,
Expand Down
2 changes: 2 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ export(create_scMuffinList)
export(csea)
export(detect_CNV_regions)
export(diff_map)
export(es)
export(extract_cluster_enrichment_table)
export(extract_cluster_enrichment_tags)
export(filter_gsl)
Expand Down Expand Up @@ -90,6 +91,7 @@ importFrom(org.Hs.eg.db,org.Hs.egSYMBOL)
importFrom(pals,alphabet)
importFrom(parallel,mclapply)
importFrom(plotrix,thigmophobe.labels)
importFrom(qvalue,qvalue)
importFrom(stats,dhyper)
importFrom(stats,glm)
importFrom(stats,median)
Expand Down
2 changes: 2 additions & 0 deletions NEWS
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
version 1.1.4
- updtaes over csea and vignette
version 1.1.3
- improved point text in boxplot_cluster and barplot_cluster
version 1.1.2
Expand Down
28 changes: 19 additions & 9 deletions R/assess_cluster_enrichment.R
Original file line number Diff line number Diff line change
Expand Up @@ -6,16 +6,17 @@
#' @param min.cells.cluster minimum number of cells of a cluster
#' @param mc.cores number of cores
#' @param csea.k number of CSEA permutations
#' @param min.k minimum number of valid permutations to support empirical nulls
#' @description Assess cluster enrichment using ORA for categorical features and CSEA for numeric features.
#' @details
#' The output of CSEA is a table with statistics for every tested gene set (gs_table element) and a table with the results of the leading edge (leading_edge element).
#' The output of CSEA is a table with statistics for every tested gene set.
#' The output of ORA is composed of a series of tables with enrichment results, one for every possible categorical value. See [extract_cluster_enrichment_table] to extract summary table from CSEA and ORA results.
#'
#' @return scMuffinList with CSEA or ORA elements under scMuffinList$cluster_data for the considered partition
#' @importFrom stats setNames
#' @export

assess_cluster_enrichment <- function(scMuffinList=NULL, feature_name=NULL, partition_id=NULL, min.cells.feature=100, min.cells.cluster=10, mc.cores=1, csea.k=99){
assess_cluster_enrichment <- function(scMuffinList=NULL, feature_name=NULL, partition_id=NULL, min.cells.feature=100, min.cells.cluster=10, mc.cores=1, csea.k=99, min.k=10){

if(length(scMuffinList[[feature_name]]$summary) == 0){
stop("Can't find scMuffinList[[feature_name]]$summary\n")
Expand Down Expand Up @@ -55,7 +56,7 @@ assess_cluster_enrichment <- function(scMuffinList=NULL, feature_name=NULL, part

cat("CSEA for ", ncol(X), "features\n")

ans$cluster_csea_res <- cluster_csea(X, cell_clusters = X_clusters, min.cells.feature=min.cells.feature, min.cells.cluster=min.cells.cluster, mc.cores=mc.cores, csea.k=csea.k)
ans$cluster_csea_res <- cluster_csea(X, cell_clusters = X_clusters, min.cells.feature=min.cells.feature, min.cells.cluster=min.cells.cluster, mc.cores=mc.cores, csea.k=csea.k, min.k=min.k)

if(length(scMuffinList$cluster_data[partition_id])==0){ # nor ORA neither CSEA

Expand All @@ -69,15 +70,24 @@ assess_cluster_enrichment <- function(scMuffinList=NULL, feature_name=NULL, part

}

#shared_names <- which(names(scMuffinList$cluster_data[[partition_id]]$CSEA$gs_table) %in% names(ans$cluster_csea_res$gs_table))

# if(length(shared_names)>0){
# scMuffinList$cluster_data[[partition_id]]$CSEA$gs_table[shared_names] <- NULL
# scMuffinList$cluster_data[[partition_id]]$CSEA$leading_edge[shared_names] <- NULL
# }
#
# scMuffinList$cluster_data[[partition_id]]$CSEA$gs_table <- c(scMuffinList$cluster_data[[partition_id]]$CSEA$gs_table, ans$cluster_csea_res$gs_table)
# scMuffinList$cluster_data[[partition_id]]$CSEA$leading_edge <- c(scMuffinList$cluster_data_full[[partition_id]]$CSEA$leading_edge, ans$cluster_csea_res$leading_edge) ##append

shared_names <- which(names(scMuffinList$cluster_data[[partition_id]]$CSEA) %in% names(ans$cluster_csea_res))

shared_names <- which(names(scMuffinList$cluster_data[[partition_id]]$CSEA$gs_table) %in% names(ans$cluster_csea_res$gs_table))
if(length(shared_names)>0){
scMuffinList$cluster_data[[partition_id]]$CSEA$gs_table[shared_names] <- NULL
scMuffinList$cluster_data[[partition_id]]$CSEA$leading_edge[shared_names] <- NULL
scMuffinList$cluster_data[[partition_id]]$CSEA[shared_names] <- NULL
}
scMuffinList$cluster_data[[partition_id]]$CSEA$gs_table <- c(scMuffinList$cluster_data[[partition_id]]$CSEA$gs_table, ans$cluster_csea_res$gs_table)
scMuffinList$cluster_data[[partition_id]]$CSEA$leading_edge <- c(scMuffinList$cluster_data_full[[partition_id]]$CSEA$leading_edge, ans$cluster_csea_res$leading_edge) ##append

scMuffinList$cluster_data[[partition_id]]$CSEA <- c(scMuffinList$cluster_data[[partition_id]]$CSEA, ans$cluster_csea_res)

}

}
Expand Down
6 changes: 2 additions & 4 deletions R/calc_gs_perm.R
Original file line number Diff line number Diff line change
Expand Up @@ -4,14 +4,12 @@
#' @param gs gene set
#' @description Calculate permutations

calc_gs_perm <- function(rll, perm, gs){

#out <- unlist(lapply(rll, function(x) es(which(perm %in% gs), array(x, dimnames=list(perm)))))
calc_gs_perm <- function(rll=NULL, perm=NULL, gs=NULL){

out <- setNames(numeric(length(rll)), names(rll))

for(i in 1:length(rll)){
out[i] <- es(which(perm[[i]] %in% gs), array(rll[[i]], dimnames=list(perm[[i]])))
out[i] <- es(which(perm[[i]] %in% gs), array(rll[[i]], dimnames=list(perm[[i]])))[, 1]
}

return(out)
Expand Down
21 changes: 4 additions & 17 deletions R/cluster_csea.R
Original file line number Diff line number Diff line change
Expand Up @@ -6,13 +6,13 @@
#' @param min.cells.cluster minimum number of cells of a cluster
#' @param mc.cores number of cores
#' @param csea.k number of permutations
#' @param min.k.nes minimum number of not null NES values
#' @param min.k minimum number of valid permutations to support empirical nulls
#' @return list with two elements: gs_table and leading_edge. See [csea()]
#' @export
#' @description Calculate cluster enrichment by csea approach


cluster_csea <- function(feature_values=NULL, cell_clusters=NULL, min.cells.feature=100, min.cells.cluster=10, mc.cores=1, csea.k=99, min.k.nes=10){
cluster_csea <- function(feature_values=NULL, cell_clusters=NULL, min.cells.feature=100, min.cells.cluster=10, mc.cores=1, csea.k=99, min.k=10){


X <- as.matrix(feature_values)
Expand All @@ -35,25 +35,12 @@ cluster_csea <- function(feature_values=NULL, cell_clusters=NULL, min.cells.feat

#CSEA process on features-by-cells
gsl <- lapply(split(cell_cluster_ok, cell_cluster_ok), function(x) names(x))
csea_res <- csea(X, gsl, mc_cores_perm = mc.cores, ord.mode = rep(-1, ncol(X)), k = csea.k, min.size = min.cells.feature, min.k.nes = min.k.nes)
csea_res <- csea(X, gsl, mc_cores_perm = mc.cores, ord.mode = rep(-1, ncol(X)), k = csea.k, min.size = min.cells.feature, min.k = min.k)

# insert a row for excluded clusters
if(length(excluded_clusters)>0){

#debug
cat(">>>>>>DEBUG<<<<<<")

#gs table
#csea_res$gs_table <- rbind(csea_res$gs_table, data.frame(id=excluded_clusters, es=0, p_val=1, adj_p_val=1, nes=0, FDRq=1, row.names=excluded_clusters, stringsAsFactors = F)) #old
csea_res$gs_table <- lapply(csea_res$gs_table, function(x) rbind(x, data.frame(id=excluded_clusters, es=0, p_val=1, adj_p_val=1, nes=0, FDRq=1, n_pos_perm=0, n_neg_perm=0, row.names=excluded_clusters, stringsAsFactors = F)))

csea_res$gs_table <- lapply(csea_res$gs_table, function(x) x[match(names(cluster_size), rownames(x)), ])

#leading_edge
csea_res$leading_edge <- lapply(csea_res$leading_edge, function(x) rbind(x, data.frame(tags=0, tags_perc=0, list_top=0, list_top_perc=0, lead_edge=0, lead_edge_subset="", row.names=excluded_clusters, stringsAsFactors = F)))

#csea_res$leading_edge <- csea_res$leading_edge[match(names(cluster_size), rownames(csea_res$leading_edge)), ] #old
csea_res$leading_edge <- lapply(csea_res$leading_edge, function(x) x[match(names(cluster_size), rownames(x)), ])
csea_res <- lapply(csea_res, function(x) rbind(x, data.frame(id=excluded_clusters, es=NA, p_val=NA, adj_p_val=NA, nes=NA, FDRq=NA, nperm=NA, tags=NA, tags_perc=NA, list_top=NA, list_top_perc=NA, lead_edge=NA, row.names=excluded_clusters, stringsAsFactors = F)))

}

Expand Down
143 changes: 77 additions & 66 deletions R/csea.R
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,11 @@
#' @param ord.mode ordering mode: -1 -> descending; 1 ascending; must be of length equal to `ncol(rl)`
#' @param mc_cores_path number of cores to use for parallel calculation of gene set lists; the total number of cpu used will be mc_cores_path * mc_cores_perm
#' @param mc_cores_perm number of cores to use for parallel calculation of ranked list permutations; the total number of cpu used will be mc_cores_path * mc_cores_perm
#' @param min.k.nes minimum number of not null NES values
#' @param min.k minimum number of valid permutations to support empirical nulls
#' @param min.size minimum number of cells with a not null value
#' @import parallel
#' @importFrom stats p.adjust
#' @importFrom qvalue qvalue
#' @return list with two data.frames, gs_table and leading_edge.
#` \enumerate{
#` \item {gs_table} a data.frame with:
Expand All @@ -31,7 +32,7 @@
#' @export


csea <- function(rl, gsl, k=100, min.size=100, ord.mode=-1, min.k.nes=10, mc_cores_path=1, mc_cores_perm=1){
csea <- function(rl=NULL, gsl=NULL, k=100, min.size=100, ord.mode=-1, min.k=10, mc_cores_path=1, mc_cores_perm=1){

#cheks
if(!is.matrix(rl) | !is.numeric(rl)){
Expand All @@ -42,7 +43,7 @@ csea <- function(rl, gsl, k=100, min.size=100, ord.mode=-1, min.k.nes=10, mc_cor
stop("length(ord.mode) must be equal to ncol(rl)")
}

min.k.nes <- min(min.k.nes, k)
min.k <- min(min.k, k)

#create the list of ranked vectors
rll <- vector('list', ncol(rl))
Expand All @@ -57,17 +58,17 @@ csea <- function(rl, gsl, k=100, min.size=100, ord.mode=-1, min.k.nes=10, mc_cor
cat("Ranked list that passed the checks:", names(rll), "\n")

#real es
print("ES...")
#real_es <- lapply(gsl, function(x) lapply(rll, function(y) es(which(names(y) %in% x), y)))
#real_es <- do.call(rbind, real_es)

real_es_data <- lapply(gsl, function(x) lapply(rll, function(y) es(which(names(y) %in% x), y, le=T)))
cat("ES of input data...\n")

gsl_size <- lengths(gsl)
real_es_data <- lapply(gsl, function(x) lapply(rll, function(y) es(which(names(y) %in% x), y)))
real_es <- do.call(rbind, lapply(real_es_data, function(x) unlist(lapply(x, function(y) y$es))))

leading_edge <- vector("list", length(rll))
names(leading_edge) <- names(rll)
for(i in 1:length(leading_edge)){
leading_edge[[i]] <- do.call(rbind, lapply(real_es_data, function(x) x[[i]]$lea))
leading_edge[[i]] <- do.call(rbind, lapply(real_es_data, function(x) x[[i]][, -1]))
}

#permutations
Expand All @@ -81,37 +82,9 @@ csea <- function(rl, gsl, k=100, min.size=100, ord.mode=-1, min.k.nes=10, mc_cor
x_perm[[i]] <- lapply(rll, function(x) sample(rownames(x), length(x)))
}


print("calculating permutations")
if(mc_cores_path==1){
if(mc_cores_perm == 1){
#res <- lapply(gsl, function(x) unlist(lapply(x_perm, function(y) calc_gs_perm(rl, y, x))))
res <- lapply(gsl, function(x) do.call(rbind, lapply(x_perm, function(y) calc_gs_perm(rll, y, x))))
}else{
cat(k, "permutations on", mc_cores_perm, "cores\n")
#res <- lapply(gsl, function(x) unlist(mclapply(x_perm, function(y) calc_gs_perm(rl, y, x), mc.cores=mc_cores_perm)))
res <- lapply(gsl, function(x) do.call(rbind, mclapply(x_perm, function(y) calc_gs_perm(rll, y, x), mc.cores=mc_cores_perm)))
}
}else{
cat(length(gsl), "gene sets on", mc_cores_path, "cores\n")
if(mc_cores_perm == 1){
#res <- parallel::mclapply(gsl, function(x) unlist(lapply(x_perm, function(y) calc_gs_perm(rl, y, x))), mc.cores = mc_cores_path)
res <- mclapply(gsl, function(x) do.call(rbind, lapply(x_perm, function(y) calc_gs_perm(rll, y, x))), mc.cores = mc_cores_path)
}else{
cat(k, "permutations on", mc_cores_perm, "cores\n")
#res <- parallel::mclapply(gsl, function(x) unlist(parallel::mclapply(x_perm, function(y) calc_gs_perm(rl, y, x), mc.cores=mc_cores_perm)), mc.cores = mc_cores_path)
res <- mclapply(gsl, function(x) do.call(rbind, mclapply(x_perm, function(y) calc_gs_perm(rll, y, x), mc.cores=mc_cores_perm)), mc.cores = mc_cores_path)
}
}


#list of pathwa-by-k matrices of permuted es
#res <- do.call(rbind, res)
# if(nrow(res) != length(gsl) | ncol(res) != k){
# stop("not all the permutations returned a correct value\n")
# }
#res <- cbind(real_es, res)

cat("ES of permutations...\n")
res <- mclapply(gsl, function(x) do.call(rbind, mclapply(x_perm, function(y) calc_gs_perm(rll, y, x), mc.cores=mc_cores_perm)), mc.cores = mc_cores_path)

temp <- vector('list', length(rll))
names(temp) <- colnames(rl)
for(i in 1:length(rll)){
Expand All @@ -121,8 +94,6 @@ csea <- function(rl, gsl, k=100, min.size=100, ord.mode=-1, min.k.nes=10, mc_cor
rm(temp)




#statistics
print("calculating statistics...")

Expand All @@ -133,54 +104,94 @@ csea <- function(rl, gsl, k=100, min.size=100, ord.mode=-1, min.k.nes=10, mc_cor

for(i in 1:length(rll)){

p_val <- apply(res[[i]], 1, function(x) ifelse(x[1] >= 0, sum(x >= x[1]) / length(x), sum(x <= x[1]) / length(x)))
n_pos_perm <- rowSums(res[[i]]>0)
n_neg_perm <- rowSums(res[[i]]<0)

p_val <- apply(res[[i]], 1, function(x) ifelse(x[1] >= 0, sum(x >= x[1]) / length(x[x>=0]), sum(x <= x[1]) / length(x[x<=0])))

#p-values for real ES == 0 are set to 1
p_val[res[[i]][, 1] == 0] <- 1
#p_val[res[[i]][, 1] == 0] <- 1

idx_na <- which(res[[i]][, 1] == 0)
if(length(idx_na)>0){
cat("\tES==0:\n")
cat("\t", rownames(res[[i]])[idx_na], "\n")
p_val[idx_na] <- NA ###
}
idx_na <- which(res[[i]][, 1] > 0 & n_pos_perm < min.k)
if(length(idx_na)>0){
cat("\tES>0 but less than", min.k, " positive ES in permutations\n")
cat("\t", rownames(res[[i]])[idx_na], "\n")
p_val[idx_na] <- NA ###
}
idx_na <- which(res[[i]][, 1] < 0 & n_neg_perm < min.k)
if(length(idx_na)>0){
cat("\tES<0 but less than", min.k, " negative ES in permutations\n")
cat("\t", rownames(res[[i]])[idx_na], "\n")
p_val[idx_na] <- NA ###
}

#normalized ES
n_pos_perm <- rowSums(res[[i]]>0)
n_neg_perm <- rowSums(res[[i]]<0)
means <- t(apply(res[[i]], 1, function(x) c(mean(x[x>0]), abs(mean(x[x<0]))))) #positive, negative
means[is.nan(means)] <- 0 #NaN values are caused by the absence of any positive or negative value
nes <- res[[i]] / means[, 1]
nes_neg <- res[[i]] / means[, 2]
nes[res[[i]] < 0] <- nes_neg[res[[i]] < 0]
nes[is.nan(nes)] <- 0 #NaN values are caused by 0/0
nes[is.nan(nes)] <- NA #NaN values are caused by 0/0
rm(means, nes_neg)

#if there are not at least min.k.nes the NES is unrelieable
nes[nes>0 & n_pos_perm < min.k.nes] <- 0
nes[nes<0 & n_neg_perm < min.k.nes] <- 0
#if there are not at least min.k the NES is unrelieable
nes[nes>0 & n_pos_perm < min.k] <- NA
nes[nes<0 & n_neg_perm < min.k] <- NA

#calculate FDR
all_nes <- as.numeric(nes)
n_nes_pos <- sum(nes>0)
n_nes_neg <- sum(nes<0)
n_real_nes_pos <- sum(nes[,1] > 0)
n_real_nes_neg <- sum(nes[,1] < 0)
if((n_nes_pos + n_nes_neg) != (k*length(gsl) + nrow(nes))){
warning('some nes value is equal to zero')
}

n_nes_pos <- sum(nes>0, na.rm = T)
n_nes_neg <- sum(nes<0, na.rm = T)
n_real_nes_pos <- sum(nes[,1] > 0, na.rm = T)
n_real_nes_neg <- sum(nes[,1] < 0, na.rm = T)

#FDR: NES* > 0: fdrq = #(all positive NESp >= NES*) / #(all positive NESp) / [ #(all NES* >= NES*) / (all positive NES*) ]
#FDR: NES* < 0: fdrq = #(all negative NESp <= NES*) / #(all negative NESp) / [ #(all NES* <= NES*) / (all negative NES*) ]

fdrq <- sapply(nes[, 1], function(x) ifelse(x>0, sum(all_nes >= x) / n_nes_pos, sum(all_nes <= x) / n_nes_neg))
fdrq <- fdrq / sapply(nes[, 1], function(x) ifelse(x>0, sum(nes[, 1] >= x) / n_real_nes_pos, sum(nes[, 1] <= x) / n_real_nes_neg))
fdrq <- sapply(nes[, 1], function(x) ifelse(x>0, sum(all_nes >= x, na.rm = T) / n_nes_pos, sum(all_nes <= x, na.rm = T) / n_nes_neg))
fdrq <- fdrq / sapply(nes[, 1], function(x) ifelse(x>0, sum(nes[, 1] >= x, na.rm = T) / n_real_nes_pos, sum(nes[, 1] <= x, na.rm = T) / n_real_nes_neg))

#q of nes that are equal to 0 are set to 1
fdrq[nes[, 1] == 0] <- 1
#fdrq[nes[, 1] == 0] <- 1

rm(all_nes)
fdrq[fdrq>1] <- 1

idx_replace <- which(fdrq<p_val)
fdrq[idx_replace] <- p_val[idx_replace]

#### FROM DOSE
qobj <- tryCatch(qvalue(p_val, lambda=0.05, pi0.method="bootstrap"), error=function(e) NULL)
if (inherits(qobj, "qvalue")) {
qvalues <- qobj$qvalues
} else {
qvalues <- NA
}

nperm <- n_pos_perm
nperm[which(res[[i]][, 1]<0)] <- n_neg_perm[which(res[[i]][, 1]<0)]
nperm[which(res[[i]][, 1] == 0)] <- NA

#out table
out[[i]] <- data.frame(id=rownames(res[[i]]), es=res[[i]][, 1], p_val=p_val, adj_p_val=p.adjust(p_val, method='fdr'), n_pos_perm=n_pos_perm, n_neg_perm=n_neg_perm, nes=nes[, 1], FDRq=fdrq, stringsAsFactors=FALSE)

#out[[i]] <- data.frame(id=rownames(res[[i]]), es=res[[i]][, 1], p_val=p_val, adj_p_val=p.adjust(p_val, method='fdr'), n_pos_perm=n_pos_perm, n_neg_perm=n_neg_perm, nes=nes[, 1], FDRq=fdrq, stringsAsFactors=FALSE)

#out table
out[[i]] <- data.frame(id=rownames(res[[i]]), size=gsl_size[match(rownames(res[[i]]), names(gsl_size))], es=res[[i]][, 1], nes=nes[, 1], nperm=nperm, p_val=p_val, adj_p_val=p.adjust(p_val, method='fdr'), q_val=qvalues, FDRq=fdrq, stringsAsFactors=FALSE)

out[[i]] <- merge(out[[i]], leading_edge[[i]], by.x=1, by.y=0, sort=F)

out[[i]] <- out[[i]][, -which(colnames(out[[i]]) == "lead_edge_subset")]

}

print("done")

return(list(gs_table=out, leading_edge=leading_edge))
#return(list(gs_table=out, leading_edge=leading_edge))
return(out)

}
Loading

0 comments on commit d0f46a9

Please sign in to comment.