ettore 1.1.4

emosca-cnr · Nov 24, 2023 · d0f46a9 · d0f46a9
1 parent 76a239d
commit d0f46a9
Show file tree

Hide file tree

Showing 126 changed files with 752 additions and 555 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,7 +1,7 @@
 Package: scMuffin
 Title: MUlti-Features INtegrative approach for single-cell data analysis
-Version: 1.1.3
-Date: 2023-11-13
+Version: 1.1.4
+Date: 2023-11-22
 Authors@R: 
      c(person(given = "Valentina",
            family = "Nale",
@@ -52,7 +52,8 @@ Imports:
     circlize,
     Matrix,
     pals,
-    plotrix
+    plotrix,
+    qvalue
 Suggests: 
     rmarkdown,
     knitr,

diff --git a/NAMESPACE b/NAMESPACE
@@ -20,6 +20,7 @@ export(create_scMuffinList)
 export(csea)
 export(detect_CNV_regions)
 export(diff_map)
+export(es)
 export(extract_cluster_enrichment_table)
 export(extract_cluster_enrichment_tags)
 export(filter_gsl)
@@ -90,6 +91,7 @@ importFrom(org.Hs.eg.db,org.Hs.egSYMBOL)
 importFrom(pals,alphabet)
 importFrom(parallel,mclapply)
 importFrom(plotrix,thigmophobe.labels)
+importFrom(qvalue,qvalue)
 importFrom(stats,dhyper)
 importFrom(stats,glm)
 importFrom(stats,median)

diff --git a/NEWS b/NEWS
@@ -1,3 +1,5 @@
+version 1.1.4
+- updtaes over csea and vignette
 version 1.1.3
 - improved point text in boxplot_cluster and barplot_cluster 
 version 1.1.2

diff --git a/R/assess_cluster_enrichment.R b/R/assess_cluster_enrichment.R
@@ -6,16 +6,17 @@
 #' @param min.cells.cluster minimum number of cells of a cluster
 #' @param mc.cores number of cores
 #' @param csea.k number of CSEA permutations
+#' @param min.k minimum number of valid permutations to support empirical nulls
 #' @description Assess cluster enrichment using ORA for categorical features and CSEA for numeric features.
 #' @details
-#' The output of CSEA is a table with statistics for every tested gene set (gs_table element) and a table with the results of the leading edge (leading_edge element). 
+#' The output of CSEA is a table with statistics for every tested gene set. 
 #' The output of ORA is composed of a series of tables with enrichment results, one for every possible categorical value. See [extract_cluster_enrichment_table] to extract summary table from CSEA and ORA results.
 #'
 #' @return scMuffinList with CSEA or ORA elements under scMuffinList$cluster_data for the considered partition
 #' @importFrom stats setNames
 #' @export
 
-assess_cluster_enrichment <- function(scMuffinList=NULL, feature_name=NULL, partition_id=NULL, min.cells.feature=100, min.cells.cluster=10, mc.cores=1, csea.k=99){
+assess_cluster_enrichment <- function(scMuffinList=NULL, feature_name=NULL, partition_id=NULL, min.cells.feature=100, min.cells.cluster=10, mc.cores=1, csea.k=99, min.k=10){
 
   if(length(scMuffinList[[feature_name]]$summary) == 0){
     stop("Can't find scMuffinList[[feature_name]]$summary\n")
@@ -55,7 +56,7 @@ assess_cluster_enrichment <- function(scMuffinList=NULL, feature_name=NULL, part
 
     cat("CSEA for ", ncol(X), "features\n")
 
-    ans$cluster_csea_res <- cluster_csea(X, cell_clusters = X_clusters, min.cells.feature=min.cells.feature, min.cells.cluster=min.cells.cluster, mc.cores=mc.cores, csea.k=csea.k)
+    ans$cluster_csea_res <- cluster_csea(X, cell_clusters = X_clusters, min.cells.feature=min.cells.feature, min.cells.cluster=min.cells.cluster, mc.cores=mc.cores, csea.k=csea.k, min.k=min.k)
 
     if(length(scMuffinList$cluster_data[partition_id])==0){ # nor ORA neither CSEA
 
@@ -69,15 +70,24 @@ assess_cluster_enrichment <- function(scMuffinList=NULL, feature_name=NULL, part
 
       }
 
+      #shared_names <- which(names(scMuffinList$cluster_data[[partition_id]]$CSEA$gs_table) %in% names(ans$cluster_csea_res$gs_table))
+
+      # if(length(shared_names)>0){
+      #   scMuffinList$cluster_data[[partition_id]]$CSEA$gs_table[shared_names] <- NULL
+      #   scMuffinList$cluster_data[[partition_id]]$CSEA$leading_edge[shared_names] <- NULL
+      # }
+      # 
+      # scMuffinList$cluster_data[[partition_id]]$CSEA$gs_table <- c(scMuffinList$cluster_data[[partition_id]]$CSEA$gs_table, ans$cluster_csea_res$gs_table)
+      # scMuffinList$cluster_data[[partition_id]]$CSEA$leading_edge <- c(scMuffinList$cluster_data_full[[partition_id]]$CSEA$leading_edge, ans$cluster_csea_res$leading_edge) ##append 
+
+      shared_names <- which(names(scMuffinList$cluster_data[[partition_id]]$CSEA) %in% names(ans$cluster_csea_res))
 
-      shared_names <- which(names(scMuffinList$cluster_data[[partition_id]]$CSEA$gs_table) %in% names(ans$cluster_csea_res$gs_table))
       if(length(shared_names)>0){
-        scMuffinList$cluster_data[[partition_id]]$CSEA$gs_table[shared_names] <- NULL
-        scMuffinList$cluster_data[[partition_id]]$CSEA$leading_edge[shared_names] <- NULL
+        scMuffinList$cluster_data[[partition_id]]$CSEA[shared_names] <- NULL
       }
-      
-      scMuffinList$cluster_data[[partition_id]]$CSEA$gs_table <- c(scMuffinList$cluster_data[[partition_id]]$CSEA$gs_table, ans$cluster_csea_res$gs_table)
-      scMuffinList$cluster_data[[partition_id]]$CSEA$leading_edge <- c(scMuffinList$cluster_data_full[[partition_id]]$CSEA$leading_edge, ans$cluster_csea_res$leading_edge) ##append 
+
+      scMuffinList$cluster_data[[partition_id]]$CSEA <- c(scMuffinList$cluster_data[[partition_id]]$CSEA, ans$cluster_csea_res)
+
     }
 
   }

diff --git a/R/calc_gs_perm.R b/R/calc_gs_perm.R
@@ -4,14 +4,12 @@
 #' @param gs gene set
 #' @description Calculate permutations
 
-calc_gs_perm <- function(rll, perm, gs){
-
-  #out <- unlist(lapply(rll, function(x) es(which(perm %in% gs), array(x, dimnames=list(perm)))))
+calc_gs_perm <- function(rll=NULL, perm=NULL, gs=NULL){
 
   out <- setNames(numeric(length(rll)), names(rll))
 
   for(i in 1:length(rll)){
-    out[i] <- es(which(perm[[i]] %in% gs), array(rll[[i]], dimnames=list(perm[[i]])))
+    out[i] <- es(which(perm[[i]] %in% gs), array(rll[[i]], dimnames=list(perm[[i]])))[, 1]
   }
 
   return(out)

diff --git a/R/cluster_csea.R b/R/cluster_csea.R
@@ -6,13 +6,13 @@
 #' @param min.cells.cluster minimum number of cells of a cluster
 #' @param mc.cores number of cores
 #' @param csea.k number of permutations
-#' @param min.k.nes minimum number of not null NES values
+#' @param min.k minimum number of valid permutations to support empirical nulls
 #' @return list with two elements: gs_table and leading_edge. See [csea()]
 #' @export
 #' @description Calculate cluster enrichment by csea approach
 
 
-cluster_csea <- function(feature_values=NULL, cell_clusters=NULL, min.cells.feature=100, min.cells.cluster=10, mc.cores=1, csea.k=99, min.k.nes=10){
+cluster_csea <- function(feature_values=NULL, cell_clusters=NULL, min.cells.feature=100, min.cells.cluster=10, mc.cores=1, csea.k=99, min.k=10){
 
 
   X <- as.matrix(feature_values)
@@ -35,25 +35,12 @@ cluster_csea <- function(feature_values=NULL, cell_clusters=NULL, min.cells.feat
 
   #CSEA process on features-by-cells
   gsl <- lapply(split(cell_cluster_ok, cell_cluster_ok), function(x) names(x))
-  csea_res <- csea(X, gsl, mc_cores_perm = mc.cores, ord.mode = rep(-1, ncol(X)), k = csea.k, min.size = min.cells.feature, min.k.nes = min.k.nes)
+  csea_res <- csea(X, gsl, mc_cores_perm = mc.cores, ord.mode = rep(-1, ncol(X)), k = csea.k, min.size = min.cells.feature, min.k = min.k)
 
   # insert a row for excluded clusters
   if(length(excluded_clusters)>0){
 
-    #debug
-    cat(">>>>>>DEBUG<<<<<<")
-
-    #gs table
-    #csea_res$gs_table <- rbind(csea_res$gs_table, data.frame(id=excluded_clusters, es=0, p_val=1, adj_p_val=1, nes=0, FDRq=1, row.names=excluded_clusters, stringsAsFactors = F)) #old
-    csea_res$gs_table <- lapply(csea_res$gs_table, function(x) rbind(x, data.frame(id=excluded_clusters, es=0, p_val=1, adj_p_val=1, nes=0, FDRq=1, n_pos_perm=0, n_neg_perm=0, row.names=excluded_clusters, stringsAsFactors = F)))
-
-    csea_res$gs_table <- lapply(csea_res$gs_table, function(x) x[match(names(cluster_size), rownames(x)), ])
-
-    #leading_edge
-    csea_res$leading_edge <- lapply(csea_res$leading_edge, function(x) rbind(x, data.frame(tags=0, tags_perc=0, list_top=0, list_top_perc=0, lead_edge=0, lead_edge_subset="", row.names=excluded_clusters, stringsAsFactors = F)))
-
-    #csea_res$leading_edge <- csea_res$leading_edge[match(names(cluster_size), rownames(csea_res$leading_edge)), ] #old
-    csea_res$leading_edge <- lapply(csea_res$leading_edge, function(x) x[match(names(cluster_size), rownames(x)), ])
+     csea_res <- lapply(csea_res, function(x) rbind(x, data.frame(id=excluded_clusters, es=NA, p_val=NA, adj_p_val=NA, nes=NA, FDRq=NA, nperm=NA, tags=NA, tags_perc=NA, list_top=NA, list_top_perc=NA, lead_edge=NA, row.names=excluded_clusters, stringsAsFactors = F)))
 
   }
 

diff --git a/R/csea.R b/R/csea.R
@@ -5,10 +5,11 @@
 #' @param ord.mode ordering mode: -1 -> descending; 1 ascending; must be of length equal to `ncol(rl)`
 #' @param mc_cores_path number of cores to use for parallel calculation of gene set lists; the total number of cpu used will be mc_cores_path * mc_cores_perm
 #' @param mc_cores_perm number of cores to use for parallel calculation of ranked list permutations; the total number of cpu used will be mc_cores_path * mc_cores_perm
-#' @param min.k.nes minimum number of not null NES values
+#' @param min.k minimum number of valid permutations to support empirical nulls
 #' @param min.size minimum number of cells with a not null value
 #' @import parallel
 #' @importFrom stats p.adjust
+#' @importFrom qvalue qvalue
 #' @return list with two data.frames, gs_table and leading_edge. 
 #` \enumerate{
 #`  \item {gs_table} a data.frame with:
@@ -31,7 +32,7 @@
 #' @export
 
 
-csea <- function(rl, gsl, k=100, min.size=100, ord.mode=-1, min.k.nes=10, mc_cores_path=1, mc_cores_perm=1){
+csea <- function(rl=NULL, gsl=NULL, k=100, min.size=100, ord.mode=-1, min.k=10, mc_cores_path=1, mc_cores_perm=1){
 
   #cheks
   if(!is.matrix(rl) | !is.numeric(rl)){
@@ -42,7 +43,7 @@ csea <- function(rl, gsl, k=100, min.size=100, ord.mode=-1, min.k.nes=10, mc_cor
     stop("length(ord.mode) must be equal to ncol(rl)")
   }
 
-  min.k.nes <- min(min.k.nes, k)
+  min.k <- min(min.k, k)
 
   #create the list of ranked vectors
   rll <- vector('list', ncol(rl))
@@ -57,17 +58,17 @@ csea <- function(rl, gsl, k=100, min.size=100, ord.mode=-1, min.k.nes=10, mc_cor
   cat("Ranked list that passed the checks:", names(rll), "\n")
 
   #real es
-  print("ES...")
-  #real_es <- lapply(gsl, function(x) lapply(rll, function(y) es(which(names(y) %in% x), y)))
-  #real_es <- do.call(rbind, real_es)
-
-  real_es_data <- lapply(gsl, function(x) lapply(rll, function(y) es(which(names(y) %in% x), y, le=T)))
+  cat("ES of input data...\n")
+
+  gsl_size <- lengths(gsl)
+  
+  real_es_data <- lapply(gsl, function(x) lapply(rll, function(y) es(which(names(y) %in% x), y)))
   real_es <- do.call(rbind, lapply(real_es_data, function(x) unlist(lapply(x, function(y) y$es))))
-
+  
   leading_edge <- vector("list", length(rll))
   names(leading_edge) <- names(rll)
   for(i in 1:length(leading_edge)){
-    leading_edge[[i]] <- do.call(rbind, lapply(real_es_data, function(x) x[[i]]$lea))
+    leading_edge[[i]] <- do.call(rbind, lapply(real_es_data, function(x) x[[i]][, -1]))
   }
 
   #permutations
@@ -81,37 +82,9 @@ csea <- function(rl, gsl, k=100, min.size=100, ord.mode=-1, min.k.nes=10, mc_cor
     x_perm[[i]] <- lapply(rll, function(x) sample(rownames(x), length(x)))
   }
 
-
-  print("calculating permutations")
-  if(mc_cores_path==1){
-    if(mc_cores_perm == 1){
-      #res <- lapply(gsl, function(x) unlist(lapply(x_perm, function(y) calc_gs_perm(rl, y, x))))
-      res <- lapply(gsl, function(x) do.call(rbind, lapply(x_perm, function(y) calc_gs_perm(rll, y, x))))
-    }else{
-      cat(k, "permutations on", mc_cores_perm, "cores\n")
-      #res <- lapply(gsl, function(x) unlist(mclapply(x_perm, function(y) calc_gs_perm(rl, y, x), mc.cores=mc_cores_perm)))
-      res <- lapply(gsl, function(x) do.call(rbind, mclapply(x_perm, function(y) calc_gs_perm(rll, y, x), mc.cores=mc_cores_perm)))
-    }
-  }else{
-    cat(length(gsl), "gene sets on", mc_cores_path, "cores\n")
-    if(mc_cores_perm == 1){
-      #res <- parallel::mclapply(gsl, function(x) unlist(lapply(x_perm, function(y) calc_gs_perm(rl, y, x))), mc.cores = mc_cores_path)
-      res <- mclapply(gsl, function(x) do.call(rbind, lapply(x_perm, function(y) calc_gs_perm(rll, y, x))), mc.cores = mc_cores_path)
-    }else{
-      cat(k, "permutations on", mc_cores_perm, "cores\n")
-      #res <- parallel::mclapply(gsl, function(x) unlist(parallel::mclapply(x_perm, function(y) calc_gs_perm(rl, y, x), mc.cores=mc_cores_perm)), mc.cores = mc_cores_path)
-      res <- mclapply(gsl, function(x) do.call(rbind, mclapply(x_perm, function(y) calc_gs_perm(rll, y, x), mc.cores=mc_cores_perm)), mc.cores = mc_cores_path)
-    }
-  }
-
-
-  #list of pathwa-by-k matrices of permuted es
-  #res <- do.call(rbind, res)
-  #   if(nrow(res) != length(gsl) | ncol(res) != k){
-  #     stop("not all the permutations returned a correct value\n")
-  #   }
-  #res <- cbind(real_es, res)
-
+  cat("ES of permutations...\n")
+  res <- mclapply(gsl, function(x) do.call(rbind, mclapply(x_perm, function(y) calc_gs_perm(rll, y, x), mc.cores=mc_cores_perm)), mc.cores = mc_cores_path)
+
   temp <- vector('list', length(rll))
   names(temp) <- colnames(rl)
   for(i in 1:length(rll)){
@@ -121,8 +94,6 @@ csea <- function(rl, gsl, k=100, min.size=100, ord.mode=-1, min.k.nes=10, mc_cor
   rm(temp)
 
 
-
-
   #statistics
   print("calculating statistics...")
 
@@ -133,54 +104,94 @@ csea <- function(rl, gsl, k=100, min.size=100, ord.mode=-1, min.k.nes=10, mc_cor
 
   for(i in 1:length(rll)){
 
-    p_val <- apply(res[[i]], 1, function(x) ifelse(x[1] >= 0, sum(x >= x[1]) / length(x), sum(x <= x[1]) / length(x)))
+    n_pos_perm <- rowSums(res[[i]]>0)
+    n_neg_perm <- rowSums(res[[i]]<0)
+
+    p_val <- apply(res[[i]], 1, function(x) ifelse(x[1] >= 0, sum(x >= x[1]) / length(x[x>=0]), sum(x <= x[1]) / length(x[x<=0])))
+
     #p-values for real ES == 0 are set to 1
-    p_val[res[[i]][, 1] == 0] <- 1
+    #p_val[res[[i]][, 1] == 0] <- 1
+
+    idx_na <- which(res[[i]][, 1] == 0)
+    if(length(idx_na)>0){
+      cat("\tES==0:\n")
+      cat("\t", rownames(res[[i]])[idx_na], "\n")
+      p_val[idx_na] <- NA ### 
+    }
+    idx_na <- which(res[[i]][, 1] > 0 & n_pos_perm < min.k)
+    if(length(idx_na)>0){
+      cat("\tES>0 but less than", min.k, " positive ES in permutations\n")
+      cat("\t", rownames(res[[i]])[idx_na], "\n")
+      p_val[idx_na] <- NA ### 
+    }
+    idx_na <- which(res[[i]][, 1] < 0 & n_neg_perm < min.k)
+    if(length(idx_na)>0){
+      cat("\tES<0 but less than", min.k, " negative ES in permutations\n")
+      cat("\t", rownames(res[[i]])[idx_na], "\n")
+      p_val[idx_na] <- NA ### 
+    }
 
     #normalized ES
-    n_pos_perm <- rowSums(res[[i]]>0)
-    n_neg_perm <- rowSums(res[[i]]<0)
     means <- t(apply(res[[i]], 1, function(x) c(mean(x[x>0]), abs(mean(x[x<0]))))) #positive, negative
     means[is.nan(means)] <- 0 #NaN values are caused by the absence of any positive or negative value
     nes <- res[[i]] / means[, 1]
     nes_neg <- res[[i]] / means[, 2]
     nes[res[[i]] < 0] <- nes_neg[res[[i]] < 0]
-    nes[is.nan(nes)] <- 0 #NaN values are caused by 0/0
+    nes[is.nan(nes)] <- NA #NaN values are caused by 0/0
     rm(means, nes_neg)
 
-    #if there are not at least min.k.nes the NES is unrelieable
-    nes[nes>0 & n_pos_perm < min.k.nes] <- 0
-    nes[nes<0 & n_neg_perm < min.k.nes] <- 0
+    #if there are not at least min.k the NES is unrelieable
+    nes[nes>0 & n_pos_perm < min.k] <- NA
+    nes[nes<0 & n_neg_perm < min.k] <- NA
 
     #calculate FDR
     all_nes <- as.numeric(nes)
-    n_nes_pos <- sum(nes>0)
-    n_nes_neg <- sum(nes<0)
-    n_real_nes_pos <- sum(nes[,1] > 0)
-    n_real_nes_neg <- sum(nes[,1] < 0)
-    if((n_nes_pos + n_nes_neg) != (k*length(gsl) + nrow(nes))){
-      warning('some nes value is equal to zero')
-    }
-
+    n_nes_pos <- sum(nes>0, na.rm = T)
+    n_nes_neg <- sum(nes<0, na.rm = T)
+    n_real_nes_pos <- sum(nes[,1] > 0, na.rm = T)
+    n_real_nes_neg <- sum(nes[,1] < 0, na.rm = T)
+
     #FDR: NES* > 0: fdrq = #(all positive NESp >= NES*) / #(all positive NESp) / [ #(all NES* >= NES*) / (all positive NES*) ]
     #FDR: NES* < 0: fdrq = #(all negative NESp <= NES*) / #(all negative NESp) / [ #(all NES* <= NES*) / (all negative NES*) ]
 
-    fdrq <- sapply(nes[, 1], function(x) ifelse(x>0, sum(all_nes >= x) / n_nes_pos, sum(all_nes <= x) / n_nes_neg))
-    fdrq <- fdrq / sapply(nes[, 1], function(x) ifelse(x>0, sum(nes[, 1] >= x) / n_real_nes_pos, sum(nes[, 1] <= x) / n_real_nes_neg))
+    fdrq <- sapply(nes[, 1], function(x) ifelse(x>0, sum(all_nes >= x, na.rm = T) / n_nes_pos, sum(all_nes <= x, na.rm = T) / n_nes_neg))
+    fdrq <- fdrq / sapply(nes[, 1], function(x) ifelse(x>0, sum(nes[, 1] >= x, na.rm = T) / n_real_nes_pos, sum(nes[, 1] <= x, na.rm = T) / n_real_nes_neg))
 
     #q of nes that are equal to 0 are set to 1
-    fdrq[nes[, 1] == 0] <- 1
+    #fdrq[nes[, 1] == 0] <- 1
 
     rm(all_nes)
     fdrq[fdrq>1] <- 1
-
+    idx_replace <- which(fdrq<p_val)
+    fdrq[idx_replace] <- p_val[idx_replace]
+
+    #### FROM DOSE
+    qobj <- tryCatch(qvalue(p_val, lambda=0.05, pi0.method="bootstrap"), error=function(e) NULL)
+    if (inherits(qobj, "qvalue")) {
+      qvalues <- qobj$qvalues
+    } else {
+      qvalues <- NA
+    }
+
+    nperm <- n_pos_perm
+    nperm[which(res[[i]][, 1]<0)] <- n_neg_perm[which(res[[i]][, 1]<0)]
+    nperm[which(res[[i]][, 1] == 0)] <- NA
+
     #out table
-    out[[i]] <- data.frame(id=rownames(res[[i]]), es=res[[i]][, 1], p_val=p_val, adj_p_val=p.adjust(p_val, method='fdr'), n_pos_perm=n_pos_perm, n_neg_perm=n_neg_perm, nes=nes[, 1], FDRq=fdrq, stringsAsFactors=FALSE)
-
+    #out[[i]] <- data.frame(id=rownames(res[[i]]), es=res[[i]][, 1], p_val=p_val, adj_p_val=p.adjust(p_val, method='fdr'), n_pos_perm=n_pos_perm, n_neg_perm=n_neg_perm, nes=nes[, 1], FDRq=fdrq, stringsAsFactors=FALSE)
+
+    #out table
+    out[[i]] <- data.frame(id=rownames(res[[i]]), size=gsl_size[match(rownames(res[[i]]), names(gsl_size))], es=res[[i]][, 1], nes=nes[, 1], nperm=nperm, p_val=p_val, adj_p_val=p.adjust(p_val, method='fdr'), q_val=qvalues, FDRq=fdrq, stringsAsFactors=FALSE)
+
+    out[[i]] <- merge(out[[i]], leading_edge[[i]], by.x=1, by.y=0, sort=F)
+
+    out[[i]] <- out[[i]][, -which(colnames(out[[i]]) == "lead_edge_subset")]
+
   }
 
   print("done")
 
-  return(list(gs_table=out, leading_edge=leading_edge))
+  #return(list(gs_table=out, leading_edge=leading_edge))
+  return(out)
 
 }