Skip to content

Commit

Permalink
v1.1.2
Browse files Browse the repository at this point in the history
  • Loading branch information
emosca-cnr committed Oct 20, 2023
1 parent b97bece commit 8b0d6d9
Show file tree
Hide file tree
Showing 109 changed files with 673 additions and 509 deletions.
7 changes: 4 additions & 3 deletions DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
Package: scMuffin
Title: MUlti-Features INtegrative approach for single-cell data analysis
Version: 1.1.1
Date: 2023-09-05
Version: 1.1.2
Date: 2023-10-12
Authors@R:
c(person(given = "Valentina",
family = "Nale",
Expand Down Expand Up @@ -57,5 +57,6 @@ Suggests:
rmarkdown,
knitr,
BiocStyle,
devtools
devtools,
sf
VignetteBuilder: knitr
25 changes: 16 additions & 9 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,6 @@ export(plot_umap)
export(plot_umap_colored_features)
export(prepare_cluster_markers_list)
export(prepare_gsls)
export(preprocess_for_heatmap_CNV)
export(preprocess_object_for_CNV)
export(process_GTEx_gene_reads)
export(proliferation_analysis)
Expand All @@ -53,33 +52,42 @@ export(show_tissues)
export(transcr_compl)
import(ComplexHeatmap)
import(Seurat)
import(ggplot2)
import(grDevices)
import(graphics)
import(grid)
import(msigdbr)
import(org.Hs.eg.db)
import(pals)
import(parallel)
importFrom(Matrix,Matrix)
importFrom(Matrix,colMeans)
importFrom(Matrix,colSums)
importFrom(Matrix,rowSums)
importFrom(Seurat,AddMetaData)
importFrom(Seurat,DimPlot)
importFrom(Seurat,FeaturePlot)
importFrom(Seurat,FetchData)
importFrom(Seurat,NormalizeData)
importFrom(circlize,colorRamp2)
importFrom(destiny,DPT)
importFrom(destiny,DiffusionMap)
importFrom(destiny,random_root)
importFrom(ggplot2,annotate)
importFrom(ggplot2,cut_interval)
importFrom(ggplot2,cut_number)
importFrom(ggplot2,element_text)
importFrom(ggplot2,theme)
importFrom(grDevices,boxplot.stats)
importFrom(grDevices,jpeg)
importFrom(grDevices,png)
importFrom(grid,gpar)
importFrom(methods,is)
importFrom(msigdbr,msigdbr)
importFrom(org.Hs.eg.db,org.Hs.egCHRLOC)
importFrom(org.Hs.eg.db,org.Hs.egCHRLOCEND)
importFrom(org.Hs.eg.db,org.Hs.egENSEMBL)
importFrom(org.Hs.eg.db,org.Hs.egSYMBOL)
importFrom(pals,alphabet)
importFrom(pals,brewer.purples)
importFrom(pals,brewer.rdylbu)
importFrom(pals,brewer.ylorrd)
importFrom(parallel,mclapply)
importFrom(plotrix,thigmophobe.labels)
importFrom(stats,as.hclust)
importFrom(stats,dhyper)
importFrom(stats,glm)
importFrom(stats,median)
Expand All @@ -93,4 +101,3 @@ importFrom(stats,wilcox.test)
importFrom(utils,data)
importFrom(utils,read.delim)
importFrom(utils,read.table)
importFrom(utils,write.table)
2 changes: 2 additions & 0 deletions NEWS
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
version 1.1.2
- solved issue in CNV analysis, arising when number of chromosome genes is equal to wnd_size + 1
version 1.1.1
- various improvements: mainly documentation, checks on input and some errors arising whit particular input data
version 1.1.0
Expand Down
13 changes: 8 additions & 5 deletions R/CNV_analysis.R
Original file line number Diff line number Diff line change
Expand Up @@ -6,21 +6,24 @@
#' @param min_genes minimun number of genes expressed in a cell;
#' @param min_cells minimum numbver of cells in which a gene must be expressed;
#' @param expr_lim min and max values of relative expression; by default, lower and higher whiskers returned by grDevices::boxplot.stats will be used. Set to NA or anything other value such that length(expr_lim) != 2 to disable the removal of outliers.
#' @param scale_cells whether to scale cells
#' @param center_cells whether to center cells. Default is TRUE
#' @param na.rm whether to remove 0 values in CNV estimation
#' @param center_genes whether to center genes or not
#' @param center_genes whether to center genes or not. Default is FALSE. When using a reference it is useful to set this to TRUE
#' @param method "mean": subtract the average profile of the reference cluster to every cell (default); "min_max" (from Tirosh et al., DOI: 10.1126/science.aad0501) can be used but is not tested yet
#' @param z.score whether to use z-scores of the cluster median CNV profile, instead of the median itself.
#' @param eps absolute threshold to call CNV regions.
#' @param ... arguments passed to Seurat::FindClusters()
#' @param n_comp number of PCA components used for clustering
#' @param gene_ann optional data.frame with gene annotation with mandatory columns "Chromosome", "symbol" and "pos" (genomic location). If NULL gene locations will be collected from org.Hs.eg.db
#' @export

CNV_analysis <- function(scMuffinList=NULL, mc.cores=1, reference = NULL, min_cells = 100, min_genes = 100, wnd_size = 100, scale_cells = TRUE, center_genes = FALSE, expr_lim = FALSE, method="mean", na.rm=FALSE, z.score=FALSE, eps=NULL){
CNV_analysis <- function(scMuffinList=NULL, mc.cores=1, reference = NULL, min_cells = 100, min_genes = 200, wnd_size = 100, center_cells = TRUE, center_genes = FALSE, expr_lim = FALSE, method="mean", na.rm=FALSE, z.score=FALSE, eps=NULL, n_comp=10, gene_ann=NULL, ...){

cat("IMPORTANT: CNV analysis requires gene symbols as gene identifiers!\n")

cnv_res <- calculate_CNV(scMuffinList$normalized, mc.cores = mc.cores, reference = reference, min_cells = min_cells, min_genes = min_genes, wnd_size = wnd_size, scale_cells = scale_cells, center_genes = center_genes, expr_lim = expr_lim, na.rm=na.rm)
cnv_res <- calculate_CNV(scMuffinList$normalized, mc.cores = mc.cores, reference = reference, min_cells = min_cells, min_genes = min_genes, wnd_size = wnd_size, center_cells = center_cells, center_genes = center_genes, expr_lim = expr_lim, na.rm=na.rm, gene_ann=gene_ann)

cnv_clustering <- cluster_by_features(cnv_res$CNV, cnv=TRUE)
cnv_clustering <- cluster_by_features(cnv_res$CNV, n_comp=n_comp, ...)

if(!is.null(reference)){
cnv_res$CNV <- apply_CNV_reference(cnv = cnv_res$CNV, cnv_clustering = cnv_clustering, reference="reference", method=method)
Expand Down
7 changes: 4 additions & 3 deletions R/barplot_cluster.R
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
#' @importFrom grDevices jpeg
#' @importFrom stats setNames
#' @importFrom plotrix thigmophobe.labels
#' @import pals
#' @description Produce barplots (1 for each cluster) of distribution of cells associated with the values of the selected feature. A png figure for each cluster is saved in dir_out.
#' @export

Expand Down Expand Up @@ -52,7 +53,7 @@ barplot_cluster <- function(scMuffinList=NULL, feature_name=NULL, feature_id=NUL
#boxplot for each cluster
for(cl in 1:length(cell_clusters_set)){ ###for each cluster

grDevices::png(paste0(dir_out, "/cluster_", cell_clusters_set[cl],".png"), width=width, height=height, units=units, res=res)
png(paste0(dir_out, "/cluster_", cell_clusters_set[cl],".png"), width=width, height=height, units=units, res=res)
layout(matrix(c(1, 1, 2, 3), nrow = 2, byrow = T))
par(mar = c(3, 3, 3, 1))
par(mgp = c(1.5, .5, 0))
Expand All @@ -67,11 +68,11 @@ barplot_cluster <- function(scMuffinList=NULL, feature_name=NULL, feature_id=NUL
cell_category_cluster <- cell_category_cluster[c(2,1), ] #remove the other clusters
colnames(cell_category_cluster) <- cell_category_names[match(colnames(cell_category_cluster), names(cell_category_names))]

barplot(cell_category_cluster, beside = T, legend.text = T, main=paste("Cluster", cell_clusters_set[cl]), col=pals::alphabet2(2), cex.axis=cex.axis, cex.names=cex.axis, cex.lab=cex.axis, cex.main=cex.axis, args.legend=list(cex=cex.axis))
barplot(cell_category_cluster, beside = T, legend.text = T, main=paste("Cluster", cell_clusters_set[cl]), col=alphabet2(2), cex.axis=cex.axis, cex.names=cex.axis, cex.lab=cex.axis, cex.main=cex.axis, args.legend=list(cex=cex.axis))


plot(en_res_er[rownames(en_res_er)==cell_clusters_set[cl], ], -log10(en_res_p[rownames(en_res_er)==cell_clusters_set[cl], ]), pch=16, xlab="ER", ylab="-log10(p)", cex.axis=cex.axis, cex.lab=cex.axis, cex=cex.axis)
plotrix::thigmophobe.labels(en_res_er[rownames(en_res_er)==cell_clusters_set[cl], ], -log10(en_res_p[rownames(en_res_er)==cell_clusters_set[cl], ]), cell_category_names, cex=cex.axis)
thigmophobe.labels(en_res_er[rownames(en_res_er)==cell_clusters_set[cl], ], -log10(en_res_p[rownames(en_res_er)==cell_clusters_set[cl], ]), cell_category_names, cex=cex.axis)

plot.new()
legend(x = "center", legend=paste(cell_category_names, ":", names(cell_category_names)), cex = cex.axis, bty = "n", title = "LEGEND")
Expand Down
6 changes: 3 additions & 3 deletions R/boxplot_cluster.R
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
#' @param res image resolution
#' @param feature_name the names of the feature that should be considered. It must be one of names(scMuffinList)
#' @description Produce boxplots to visualize the distribution of cell values according to the selected figure. A png figure for each cluster is saved in dir_out.
#' @importFrom grDevices jpeg
#' @importFrom grDevices png
#' @importFrom plotrix thigmophobe.labels
#' @import graphics
#' @export
Expand Down Expand Up @@ -88,7 +88,7 @@ boxplot_cluster <- function(scMuffinList=NULL, feature_name=NULL, partition_id=N
cbf_cl <- cells_by_features[, match(names(top_features$fdr), colnames(cells_by_features)), drop=FALSE]


grDevices::png(paste0(dir_out, "/cluster_", cell_clusters_set[cl], ".png"), width=width, height=height, units=units, res=res)
png(paste0(dir_out, "/cluster_", cell_clusters_set[cl], ".png"), width=width, height=height, units=units, res=res)
layout(matrix(c(1, 1, 2, 3), nrow = 2, byrow = T))

par(mar = c(3, 3, 2, 1))
Expand Down Expand Up @@ -128,7 +128,7 @@ boxplot_cluster <- function(scMuffinList=NULL, feature_name=NULL, partition_id=N
#barplot(rev(-log10(top_features$fdr)), horiz = T, names.arg = "", main = "FDR")

plot(top_features$nes, -log10(top_features$fdr), pch=16, xlab="NES", ylab="-log10(q)", cex.axis=cex.axis, cex.lab=cex.axis, cex=cex.axis)
plotrix::thigmophobe.labels(top_features$nes, -log10(top_features$fdr), box_names, cex=cex.axis)
thigmophobe.labels(top_features$nes, -log10(top_features$fdr), box_names, cex=cex.axis)

plot.new()
legend(x = "center", legend=paste(box_names, ":", colnames(cbf_cl)), cex = cex.axis, bty = "n", title = "LEGEND")
Expand Down
3 changes: 2 additions & 1 deletion R/boxplot_points.R
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
#' @param image_format png or jpeg
#' @param ... further argument to boxplot
#' @importFrom pals alphabet
#' @importFrom grDevices png jpeg
#' @export

boxplot_points <- function(x=NULL, f=NULL, col=NULL, amount=0.2, adj.col=1, pch=16, cex=0.6, ylim=NULL, file=NULL, width=180, height=180, units="mm", res=300, image_format="png", ...){
Expand All @@ -23,7 +24,7 @@ boxplot_points <- function(x=NULL, f=NULL, col=NULL, amount=0.2, adj.col=1, pch=
f_lev <- levels(f)

if(is.null(col)){
col <- pals::alphabet(length(f_lev))
col <- alphabet(length(f_lev))
}

if(is.null(ylim)){
Expand Down
55 changes: 32 additions & 23 deletions R/calculate_CNV.R
Original file line number Diff line number Diff line change
Expand Up @@ -6,28 +6,36 @@
#' @param min_genes minimun number of genes expressed in a cell;
#' @param min_cells minimum number of cells in which a gene must be expressed;
#' @param expr_lim min and max values of relative expression; by default, lower and higher whiskers returned by grDevices::boxplot.stats will be used. Set to NA or anything other value such that length(expr_lim) != 2 to disbale the removal of outliers.
#' @param scale_cells whether to scale cells
#' @param center_cells whether to center cells
#' @param na.rm whether to remove 0 values in CNV estimation
#' @param center_genes whether to center genes or not
#' @param gene_ann optional data.frame with gene annotation with mandatory columns "Chromosome", "symbol" and "pos" (genomic location). If NULL gene locations will be collected from org.Hs.eg.db
#' @importFrom grDevices boxplot.stats
#' @importFrom Matrix Matrix rowSums colSums
#' @description Calculate CNV using the moving average approach firstly described in Patel et al., 2014 Science (DOI: 10.1126/science)
#' @export
#'
calculate_CNV <- function(genes_by_cells, reference=NULL, mc.cores=1, wnd_size=100, min_genes=1000, min_cells=100, expr_lim=NULL, scale_cells=TRUE, na.rm=FALSE, center_genes=FALSE) {
calculate_CNV <- function(genes_by_cells, reference=NULL, mc.cores=1, wnd_size=100, min_genes=200, min_cells=100, expr_lim=NULL, center_cells=TRUE, na.rm=FALSE, center_genes=FALSE, gene_ann=NULL) {

genes_by_cells[is.na(genes_by_cells)] <- 0

cat("Filtering...")
idx_keep <- Matrix::rowSums(genes_by_cells !=0) >= min_cells #genes not missing in at least...
cat("Genes-by-cells matrix size", dim(genes_by_cells), "\n")

cat("Filtering...")
idx_keep <- rowSums(genes_by_cells !=0) >= min_cells #genes not missing in at least...
#print(table(idx_keep))
genes_by_cells <- genes_by_cells[idx_keep, ]

if(nrow(genes_by_cells) < min_genes){
stop("Less than ", min_genes, "available: considering reducing the parameter min_genes.\n")
}

idx_keep <- Matrix::colSums(genes_by_cells !=0) >= min(min_genes, nrow(genes_by_cells))
idx_keep <- colSums(genes_by_cells !=0) >= min(min_genes, nrow(genes_by_cells))
#print(table(idx_keep))
genes_by_cells <- genes_by_cells[, idx_keep]
cat(" done\n")
cat("genes-by-cells:", dim(genes_by_cells), "\n")

cat("Genes-by-cells matrix size", dim(genes_by_cells), "\n")
# ****************************************************************
if (!is.null(reference)) {
cat("Adding reference vector...\n")
Expand Down Expand Up @@ -59,7 +67,7 @@ calculate_CNV <- function(genes_by_cells, reference=NULL, mc.cores=1, wnd_size=1
rm(temp)

if(is.null(expr_lim)){
expr_lim <- grDevices::boxplot.stats(as.numeric(genes_by_cells))$stats[c(1, 5)]
expr_lim <- boxplot.stats(as.numeric(genes_by_cells))$stats[c(1, 5)]
}

#constraints over relative expression Tirosh et al.
Expand All @@ -77,37 +85,38 @@ calculate_CNV <- function(genes_by_cells, reference=NULL, mc.cores=1, wnd_size=1
# ***************************************************************

cat("Preprocess object to calculate CNV...\n")
ans <- preprocess_object_for_CNV(genes_by_cells)
ans <- preprocess_object_for_CNV(genes_by_cells = genes_by_cells, gene_ann=gene_ann)

temp <- factor(names(ans), levels = c(1, 2, 3, 4 ,5 ,6 ,7 ,8 , 9, 10, 11, 12 , 13, 14 ,15, 16, 17, 18, 19, 20, 21, 22, "X", "Y"))
temp <- as.character(sort(temp))

ans <- ans[match(temp, names(ans))]

#exclude chrom with < wnd_size genes
ans <- ans[unlist(lapply(ans, nrow)) > wnd_size]

for(i in 1:length(ans)){
rownames(ans[[i]]) <- paste(rownames(ans[[i]]), ans[[i]]$pos, sep="_")
ans[[i]] <- ans[[i]][, -c(1:3)]
chr_size <- unlist(lapply(ans, nrow))
cat("Genes in chromosomes:\n")
print(chr_size)
if(any(chr_size <= wnd_size)){
cat("Excluding chromosomes", names(chr_size)[chr_size<=wnd_size], "\n")
cat("Consider reducing wnd_size parameter.\n")
}
ans <- ans[chr_size > wnd_size]

#calculate CNV
cat("Calculating CNV...\n")
CNV_data_in <- lapply(ans, function(x) Matrix::Matrix(as.matrix(x), sparse = TRUE))
ans <- mclapply(ans, function(x) apply(x, 2, function(y) CNV(setNames(y, rownames(x)), wnd_size=wnd_size, na.rm = na.rm)), mc.cores = mc.cores)
CNV_data_in <- lapply(ans, function(x) Matrix(as.matrix(x), sparse = TRUE))

#merging chromosomes
for(i in 1:length(ans)){
if(!is.matrix(ans[[i]])){
ans[[i]] <- matrix(ans[[i]], nrow = 1)
}
rownames(ans[[i]]) <- paste0("chr", names(ans)[i], "__", rownames(ans[[i]]))
}

cat("Chromosome", names(ans)[i], "...\n")
ans[[i]] <- mclapply(setNames(1:ncol(ans[[i]]), colnames(ans[[i]])), function(j_column) CNV(setNames(ans[[i]][, j_column], rownames(ans[[i]])), wnd_size=wnd_size, na.rm = na.rm), mc.cores = mc.cores)
ans[[i]] <- do.call(cbind, ans[[i]])

}

ans <- do.call(rbind, ans)

if(scale_cells){
if(center_cells){
temp <- apply(ans, 2, scale, scale=F)
rownames(temp) <- rownames(ans)
ans <- temp
Expand Down
2 changes: 1 addition & 1 deletion R/calculate_gs_scores.R
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,7 @@ calculate_gs_scores <- function(scMuffinList=NULL, gs_list=NULL, mc.cores=1, nbi

}else{

gs_scores <- parallel::mclapply(gs_list, function(i_marker_set) gs_score(i_marker_set, genes_by_cells = as.matrix(scMuffinList$normalized), bins = data_bins, k=k, nmark_min = nmark_min, ncells_min = ncells_min, null_model=null_model, kmin = kmin, verbose=verbose, na.rm=na.rm), mc.cores = mc.cores)
gs_scores <- mclapply(gs_list, function(i_marker_set) gs_score(i_marker_set, genes_by_cells = as.matrix(scMuffinList$normalized), bins = data_bins, k=k, nmark_min = nmark_min, ncells_min = ncells_min, null_model=null_model, kmin = kmin, verbose=verbose, na.rm=na.rm), mc.cores = mc.cores)

}

Expand Down
60 changes: 17 additions & 43 deletions R/cluster_by_features.R
Original file line number Diff line number Diff line change
@@ -1,62 +1,36 @@
#' Cluster by features
#'
#' @description Clustering of the features_by_cell matrix
#' @param features features object or result of CNV calculation
#' @param features_by_cells features by cells
#' @param n_comp numeric, Dimensions of reduction to use as input
#' @param cnv TRUE/FALSE set it to TRUE for clustering CNV results
#' @param plot_umap whether to plot the UMAP
#' @param out_dir output directory
#' @param scale_features whether to scale features
#' @param ... arguments passed to Seurat::DimPlot
#' @param ... arguments passed to Seurat::FindCLusters
#' @return features_by_cells Seurat Object, object with saved dimension reduction components calculate on features by cells matrix
#' @import Seurat
#' @importFrom stats as.hclust
#' @export

cluster_by_features <- function(features, n_comp = 10, cnv=FALSE, plot_umap=FALSE, out_dir="./", scale_features=TRUE, ...){
cluster_by_features <- function(features_by_cells=NULL, n_comp = 10, ...){

if(!dir.exists(out_dir)){
dir.create(out_dir)
}

if(cnv){
features_by_cells <- features
}else{
features_by_cells <- t(features$df[, features$type != "factor", drop=F])
features_by_cells[is.na(features_by_cells)] <- 0
}


features_by_cells <- Seurat::CreateSeuratObject(counts = features_by_cells, min.cells = 0, min.features = 0)
features_by_cells <- CreateSeuratObject(counts = features_by_cells, min.cells = 0, min.features = 0)
all.genes <- rownames(features_by_cells)
#scale data
if(scale_features){
features_by_cells <- Seurat::ScaleData(features_by_cells, features = all.genes)
}

features_by_cells <- Seurat::RunPCA(features_by_cells, features = all.genes)
#scale data
features_by_cells <- ScaleData(features_by_cells, features = all.genes)
features_by_cells <- RunPCA(features_by_cells, npcs=n_comp, features = all.genes)

n_comp <- min(n_comp, ncol(features_by_cells@reductions$pca))

features_by_cells <- Seurat::FindNeighbors(features_by_cells, dims = 1:n_comp)
features_by_cells <- Seurat::FindClusters(features_by_cells)
#features_by_cells <- Seurat::RunUMAP(features_by_cells, dims = 1:n_comp)
features_by_cells <- FindNeighbors(features_by_cells, dims = 1:n_comp)
features_by_cells <- FindClusters(features_by_cells, ...)

if(plot_umap){
grDevices::jpeg(paste0(out_dir, "/feature_umap.jpg"), width = 180, height = 180, res=300, units="mm")
res <- DimPlot(features_by_cells, ..., combine = F)
plot(res[[1]])
dev.off()
}
if(cnv){
#features_by_cells <- Seurat::BuildClusterTree(features_by_cells, features = all.genes, reorder = T)
#hc_cells <- stats::as.hclust(features_by_cells@tools$BuildClusterTree)
#ans <- list([email protected], hc=hc_cells, sobj=features_by_cells)
ans <- list(clusters=features_by_cells@active.ident, sobj=features_by_cells)

}else{
# if(cnv){
# #features_by_cells <- Seurat::BuildClusterTree(features_by_cells, features = all.genes, reorder = T)
# #hc_cells <- stats::as.hclust(features_by_cells@tools$BuildClusterTree)
# #ans <- list([email protected], hc=hc_cells, sobj=features_by_cells)
# ans <- list([email protected], sobj=features_by_cells)
#
# }else{
ans <- list(clusters=features_by_cells@active.ident, sobj=features_by_cells)
}
# }

return(ans)

Expand Down
Loading

0 comments on commit 8b0d6d9

Please sign in to comment.