diff --git a/docs/bookdown/404.html b/docs/bookdown/404.html index 159ea79..8d337ab 100644 --- a/docs/bookdown/404.html +++ b/docs/bookdown/404.html @@ -23,7 +23,7 @@ - + diff --git a/docs/bookdown/actvity-based-analysis.html b/docs/bookdown/actvity-based-analysis.html index 259ed4b..88e63e3 100644 --- a/docs/bookdown/actvity-based-analysis.html +++ b/docs/bookdown/actvity-based-analysis.html @@ -23,7 +23,7 @@ - + @@ -262,15 +262,15 @@

10.1.1 Calculate activities per g

getActivity_individual() is designed to calculate the activities per group. It takes the network files as the input:

## let's use B cell as an example
 activity_B.eset <- getActivity_individual(input_eset = pbmc14k_log2cpm.eset[, pData(pbmc14k_log2cpm.eset)$trueLabel == "B"],
-                                          network_file.tf = system.file("extdata/demo_pbmc14k/PBMC14k/SJARACNe/B/TF/bt100_pc001/consensus_network_ncol_.txt", package = "scMINER"),
-                                          network_file.sig = system.file("extdata/demo_pbmc14k/PBMC14k/SJARACNe/B/SIG/bt100_pc001/consensus_network_ncol_.txt", package = "scMINER"),
+                                          network_file.tf = system.file("extdata/demo_pbmc14k/SJARACNe/B/TF/bt100_pc001/consensus_network_ncol_.txt", package = "scMINER"),
+                                          network_file.sig = system.file("extdata/demo_pbmc14k/SJARACNe/B/SIG/bt100_pc001/consensus_network_ncol_.txt", package = "scMINER"),
                                           driver_type = "TF_SIG")

10.1.2 Calculate activities in batch

If you need to calculate the activity for multiple groups, this is usually the case, you can do it using getActivity_individual() as shown above one by one and merge the esets after that. Or, scMINER privides another function, getActivity_inBatch(), to calculate the activity in batch:

## let's use B cell as an example
-activity.eset <- getActivity_inBatch(input_eset = pbmc14k_log2cpm.eset, sjaracne_dir = system.file("extdata/demo_pbmc14k/PBMC14k/SJARACNe", package = "scMINER"),  group_name = "trueLabel", driver_type = "TF_SIG", activity_method = "mean", do.z_normalization = TRUE)
+activity.eset <- getActivity_inBatch(input_eset = pbmc14k_log2cpm.eset, sjaracne_dir = system.file("extdata/demo_pbmc14k/SJARACNe", package = "scMINER"), group_name = "trueLabel", driver_type = "TF_SIG", activity_method = "mean", do.z_normalization = TRUE)
## 7 groups were found in trueLabel ...
 ## Checking network files for each group ...
 ##  Group 1 / 7 : Monocyte ...
diff --git a/docs/bookdown/cell-type-annotation.html b/docs/bookdown/cell-type-annotation.html
index 57ba575..3e5c658 100644
--- a/docs/bookdown/cell-type-annotation.html
+++ b/docs/bookdown/cell-type-annotation.html
@@ -23,7 +23,7 @@
 
 
 
-
+
 
   
   
diff --git a/docs/bookdown/create-sparseeset-object.html b/docs/bookdown/create-sparseeset-object.html
index 0baafbf..834ff47 100644
--- a/docs/bookdown/create-sparseeset-object.html
+++ b/docs/bookdown/create-sparseeset-object.html
@@ -23,7 +23,7 @@
 
 
 
-
+
 
   
   
diff --git a/docs/bookdown/data-filtration.html b/docs/bookdown/data-filtration.html
index 254b95f..d681a38 100644
--- a/docs/bookdown/data-filtration.html
+++ b/docs/bookdown/data-filtration.html
@@ -23,7 +23,7 @@
 
 
 
-
+
 
   
   
diff --git a/docs/bookdown/data-normalization.html b/docs/bookdown/data-normalization.html
index 1bac59e..05ba786 100644
--- a/docs/bookdown/data-normalization.html
+++ b/docs/bookdown/data-normalization.html
@@ -23,7 +23,7 @@
 
 
 
-
+
 
   
   
diff --git a/docs/bookdown/generate-gene-expresion-matrix.html b/docs/bookdown/generate-gene-expresion-matrix.html
index b81d8fe..e06f651 100644
--- a/docs/bookdown/generate-gene-expresion-matrix.html
+++ b/docs/bookdown/generate-gene-expresion-matrix.html
@@ -23,7 +23,7 @@
 
 
 
-
+
 
   
   
diff --git a/docs/bookdown/get-started.html b/docs/bookdown/get-started.html
index 35da70f..4176eab 100644
--- a/docs/bookdown/get-started.html
+++ b/docs/bookdown/get-started.html
@@ -23,7 +23,7 @@
 
 
 
-
+
 
   
   
@@ -262,7 +262,7 @@ 

2.1 Installation

The scMINER R package requires R 4.2.3 or newer, and can be installed from GitHub with:

# install.packages("devtools")
-devtools::install_github("https://github.com/jyyulab/scMINER.git@dev-qpan")
+devtools::install_github("https://github.com/jyyulab/scMINER.git")

Install MICA and SJARACNe

diff --git a/docs/bookdown/index.html b/docs/bookdown/index.html index f831d7a..df71da8 100644 --- a/docs/bookdown/index.html +++ b/docs/bookdown/index.html @@ -23,7 +23,7 @@ - + @@ -254,7 +254,7 @@

diff --git a/docs/bookdown/introduction.html b/docs/bookdown/introduction.html index 81af5bd..385d2b0 100644 --- a/docs/bookdown/introduction.html +++ b/docs/bookdown/introduction.html @@ -23,7 +23,7 @@ - + diff --git a/docs/bookdown/mi-based-clustering-analysis.html b/docs/bookdown/mi-based-clustering-analysis.html index a3b7ee7..fcc7232 100644 --- a/docs/bookdown/mi-based-clustering-analysis.html +++ b/docs/bookdown/mi-based-clustering-analysis.html @@ -23,7 +23,7 @@ - + @@ -291,7 +291,7 @@

7.3 Run MICA

7.4 Integrate MICA outputs into SparseEset object

MICA generates several files and save all of them in the output directory specified by the user with -o argument. The core, and only, output file we need for subsequent analysis is the clustering label file named in the format of ProjectName_clustering_VisualizeMethod_euclidean_NumberOfDimensions_Resolution.txt. In this case, since we used a range of resolutions, there are several clustering label files generated, one for each resolution. Based on the knowledge about PBMC14k dataset, we compared the results of different resolutions and picked clustering_UMAP_euclidean_20_2.05.txt for subsequent analysis.

-
micaOutput <- read.table(system.file("extdata/demo_pbmc14k/PBMC14k/MICA/clustering_UMAP_euclidean_20_2.05.txt", package = "scMINER"), header = TRUE, sep = "\t", quote = "", stringsAsFactors = F)
+
micaOutput <- read.table(system.file("extdata/demo_pbmc14k/MICA/clustering_UMAP_euclidean_20_2.05.txt", package = "scMINER"), header = TRUE, sep = "\t", quote = "", stringsAsFactors = F)
 head(micaOutput)
##               ID        X        Y label
 ## 1 CACTTTGACGCAAT 14.91650 13.04096     6
@@ -308,7 +308,7 @@ 

7.4 Integrate MICA outputs into S
  • label: labels of predicted clusters.
  • The clustering result can be easily easily added to the SparseExpressionSet object by addMICAoutput():

    -
    pbmc14k_log2cpm.eset <- addMICAoutput(pbmc14k_log2cpm.eset, mica_output_file = system.file("extdata/demo_pbmc14k/PBMC14k/MICA/clustering_UMAP_euclidean_20_2.05.txt", package = "scMINER"), visual_method = "umap")
    +
    pbmc14k_log2cpm.eset <- addMICAoutput(pbmc14k_log2cpm.eset, mica_output_file = system.file("extdata/demo_pbmc14k/MICA/clustering_UMAP_euclidean_20_2.05.txt", package = "scMINER"), visual_method = "umap")
     head(pData(pbmc14k_log2cpm.eset))
    ##                trueLabel_full trueLabel projectID nUMI nFeature    pctMito
     ## CACTTTGACGCAAT CD14+ Monocyte  Monocyte   PBMC14k  764      354 0.01832461
    diff --git a/docs/bookdown/network-inference.html b/docs/bookdown/network-inference.html
    index 45271e3..e119787 100644
    --- a/docs/bookdown/network-inference.html
    +++ b/docs/bookdown/network-inference.html
    @@ -23,7 +23,7 @@
     
     
     
    -
    +
     
       
       
    @@ -264,10 +264,10 @@ 

    9.1 Generate SJARACNe input files generateSJARACNeInput(input_eset = pbmc14k_log2cpm.eset, group_name = "trueLabel", sjaracne_dir = "/work-path/PBMC14k/SJARACNe", species_type = "hg", driver_type = "TF_SIG", downSample_N = NULL)

    For big datasets, generateSJARACNeInput() provides an argument, downSample_N, to allow users to down sample size of each group. The default value of downSample_N is 1,000, any group with >= 1,000 cells will be down-sample to 1,000.

    ## one folder for each group
    -list.dirs(system.file("extdata/demo_pbmc14k/PBMC14k/SJARACNe", package = "scMINER"), full.names = FALSE, recursive = FALSE)
    +list.dirs(system.file("extdata/demo_pbmc14k/SJARACNe", package = "scMINER"), full.names = FALSE, recursive = FALSE)

    ## [1] "B"        "CD4TCM"   "CD4TN"    "CD4Treg"  "CD8TN"    "Monocyte" "NK"
    ## file structure of each folder
    -list.files(system.file("extdata/demo_pbmc14k/PBMC14k/SJARACNe/B", package = "scMINER"), full.names = FALSE, recursive = TRUE, include.dirs = FALSE, pattern = "[^consensus_network_ncol_.txt]")
    +list.files(system.file("extdata/demo_pbmc14k/SJARACNe/B", package = "scMINER"), full.names = FALSE, recursive = TRUE, include.dirs = FALSE, pattern = "[^consensus_network_ncol_.txt]")

    ## [1] "B.8572_1902.exp.txt"     "config_cwlexec.json"    
     ## [3] "runSJARACNe.sh"          "SIG/B.4148_1902.sig.txt"
     ## [5] "TF/B.835_1902.tf.txt"
    @@ -324,7 +324,7 @@

    9.3.1 Introduction to the network
  • slope: slop of the regression line, returned by stats.linregression()
  • p.value: p-value for a hypothesis test whose null hypothesis is that the slope is zero, using Wald Test with t-distribution of the test statistic
  • -
    network_format <- read.table(system.file("extdata/demo_pbmc14k/PBMC14k/SJARACNe/B/TF/bt100_pc001/consensus_network_ncol_.txt", package = "scMINER"),
    +
    network_format <- read.table(system.file("extdata/demo_pbmc14k/SJARACNe/B/TF/bt100_pc001/consensus_network_ncol_.txt", package = "scMINER"),
                                  header = T, sep = "\t", quote = "", stringsAsFactors = F)
     head(network_format)
    ##   source     target source.symbol target.symbol     MI pearson spearman   slope
    @@ -347,7 +347,7 @@ 

    9.3.2 Generate network QC report<

    There is no simple standards to tell the reliability of networks. Empirically, a network with 50-300 target size is good. scMINER provides a function, drawNetworkQC(), to quickly assess the quality of networks in batch.

    network_stats <- drawNetworkQC(sjaracne_dir = "/work-path/PBMC14K/SJARACNe", generate_html = FALSE) # Set `generate_html = TRUE` to generate html-format QC report for each network file
    ## The network QC statistics table is saved seperately, for demonstration purposes.
    -network_stats <- readRDS(system.file("extdata/demo_pbmc14k/PBMC14k/SJARACNe/network_stats.rds", package = "scMINER"))
    +network_stats <- readRDS(system.file("extdata/demo_pbmc14k/SJARACNe/network_stats.rds", package = "scMINER"))
     head(network_stats)
    ##              network_tag network_node network_edge driver_count targetSize_mean
     ## 1      B.SIG.bt100_pc001         8572       391889         4148        94.47662
    diff --git a/docs/bookdown/search_index.json b/docs/bookdown/search_index.json
    index c13b85a..ffc1a4b 100644
    --- a/docs/bookdown/search_index.json
    +++ b/docs/bookdown/search_index.json
    @@ -1 +1 @@
    -[["index.html", "scMINER: a mutual information-based framework for identifying hidden drivers from single-cell omics data ", " scMINER: a mutual information-based framework for identifying hidden drivers from single-cell omics data John Doe 2024-08-04 Welcome to scMINER documentation! scMINER (single-cell Mutual Information-based Network Engineering Ranger) is a computational framework designed for end-to-end analysis of single cell RNA-seq data. Using mutual information to measure cell-cell similarities and gene-gene correlations, scMINER is widely applicable and highly accurate in unsupervised clustering and gene activity inference of scRNA-seq data. In this documentation, we will walk you through every analysis that scMINER can do and introduce you more about the concepts related to scMINER framework. "],["introduction.html", "Chapter 1 Introduction 1.1 A few concepts 1.2 Why use scMINER 1.3 Citation 1.4 Support", " Chapter 1 Introduction This chapter will introduce some principal concepts and unique features of scMINER. 1.1 A few concepts There are a few concepts that may help you understand scMINER better. SparseEset The SparseExpressionSet (or SparseEset for short) is a new class created by scMINER to handle the sparsity in scRNA-seq data. It is derived from ExpressionSet, and enables to compress, store and access efficiently and conveniently. The SparseEset object is the center of scRNA-seq data analysis by scMINER. Mutual Information Mutual information is a measure of the mutual dependence between two random variables. It quantifies the amount of information obtained about one variable through the other variable. In other words, it measures how much knowing the value of one variable reduces uncertainty about the value of the other variable. It’s widely used in probability theory and information theory. Compared with the linear correlation that used by most existing tools for scRNA-seq data clustering, mutual information provides a more general measure of dependence that can capture both linear and non-linear relationships, and hence may increases the accuracy and sensitivity of scRNA-seq data clustering. Comparison of Linear Correlation and Mutual Information (powered by ChatGPT) Linear Correlation Mutual Information Definition Measures linear relationship Measures mutual dependence (both linear and non-linear) Range -1 to 1 0 to Inf Sensitivity to outliers Sensitive Less sensitive Captures Non-linear Relationships No Yes Common Applications Regression, finance, science Feature selection, clustering, network inference Gene Activity The gene activity estimation is one of the most important features of scMINER. Mathematically, the activity of one gene is a type of mean of the expressions of its targets. And biologically, the activity can be interpreted as a measure that describes how actively the driver functions, like the enzymes in digesting their subtracts, kinase in activating their downstream genes. Given the gene expression profiles and networks, scMINER can estimate the activities of some predefined drivers, including not only transcription factors (TFs) but also signaling genes (SIGs). 1.2 Why use scMINER (more details to be added) scMINER includes the following key functions: Mutual information-based clustering: scMINER measures the cell-cell similarities with full feature-derived mutual information. It can catch both linear and non-linear correlations and performs better in cell clustering, especially for those of close states. Gene activity estimation: scMINER rewires the cell-type specific gene networks solely from the scRNA-seq data, and then estimates the gene activities of not only transcription factors (TFs) but also signaling genes (SIGs). The gene activity-based analysis can expose the main regulators of various biological activities, like cellular linage differentiation and tissue specificity. SparseEset-centered full-feature tool: scMINER provides a wide range of functions for data intake, quality control and filtration, MI-based clustering, network inference, gene activity estimation, cell type annotation, differential expression/activity analysis, and data visualization and sharing. Most of these functions are developed in an object-oriented manner for the SparseEset object. 1.3 Citation Please consider citing this paper if you find scMINER useful in your research. 1.4 Support We welcome your feedback! The scMINER software is developed and maintained by the Yu Lab @ St. Jude Children’s Research Hospital and is released under the Apache License (Version 2.0). Feel free to open an issue, or send us an email if you encounter a bug, need our help or just want to make a comment/suggestion. "],["get-started.html", "Chapter 2 Get started 2.1 Installation 2.2 Demo data 2.3 Create project space", " Chapter 2 Get started This chapter will walk you through how to install scMINER, create a project space for your study and prepare the demo data used in this tutorial. 2.1 Installation scMINER framework is mainly developed with R for its advantages in statistical analysis and data visualization. It also includes two components, MICA and SJARACNe, that are developed with Python to take its strengths in calculation speed and memory consumption, since mutual information estimation of large-scale scRNA-seq data is usually compute-intensive. Please install all three components for the full access to scMINER framework. Install scMINER R package The scMINER R package requires R 4.2.3 or newer, and can be installed from GitHub with: # install.packages("devtools") devtools::install_github("https://github.com/jyyulab/scMINER.git@dev-qpan") Install MICA and SJARACNe The recommended method to install MICA and SJARACNe is to use conda dependency manager: ## setup conda env conda create -n scminer python=3.9.2 # Create a python virtual environment source activate scminer # Activate the virtual environment ## install MICA git clone https://github.com/jyyulab/MICA # Clone the MICA repo cd MICA # Switch to the MICA root directory pip install . # Install MICA and its dependencies mica -h # Check if MICA works ## install SJARACNE cd .. # Switch to conda env folder git clone https://github.com/jyyulab/SJARACNe.git # Clone the SJARACNe repo cd SJARACNe # Switch to the MICA root directory python setup.py build # Build SJARACNe binary python setup.py install # Build SJARACNe binary sjaracne -h # Check if SJARACNe works 2.2 Demo data In this tutorial, we will use a ground truth dataset called PBMC14k for demonstration purposes. It was generated from a Peripheral Blood Mononuclear Cells (PBMCs) dataset containing 10 known cell types, with 2,000 cells per type [Zheng et al., 2017]: We first rectified the gene symbol issues of the original dataset, including the dash-dot conversion (e.g. “RP11-34P13.7” changed to “RP11.34P13.7”) and “X” added to those started with numbers (e.g. “7SK” changed to “X7SK”), by referring to the gene annotation file (GRCh37.82) used in the original study. Then we removed 3 cell populations, CD34+ cells, CD4+ helper T cells, and total CD8+ cytotoxic cells, from the dataset because of either low sorting purity or a significant overlap with other immune cells based on the sorting strategy, and created a new dataset with seven known cell types and 14k cells in total. The original dataset is freely available under this accession number SRP073767 and Zenodo. How was the PBMC14K dataset generated from the original dataset? ## Step 1: rectify the invalid gene symbols # "Filtered_DownSampled_SortedPBMC_data.csv" is the raw count matrix directly downloaded from Zenodo counts <- read.csv("Filtered_DownSampled_SortedPBMC_data.csv", row.names = 1) d <- t(counts); dim(d) # it includes 21592 genes and 20000 cells # "genesymbol_from_GTF_GRCh37.txt" contains the official gene ids and symbols extracted from GTF file downloaded from officialGene <- read.table("genesymbol_from_GTF_GRCh37.txt", header = T, sep = "\\t", quote = "", stringsAsFactors = F); head(officialGene) https://ftp.ensembl.org/pub/grch37/current/gtf/homo_sapiens/ officialGene$dotted_symbol <- gsub("-", "\\\\.", officialGene$gene_name); officialGene$dotted_symbol <- make.unique(officialGene$dotted_symbol) table(row.names(d) %in% officialGene$dotted_symbol); row.names(d)[! row.names(d) %in% officialGene$dotted_symbol] # two genes are not in: X7SK.1 and X7SK.2 row.names(d) <- gsub("X7SK.1", "7SK", row.names(d)); row.names(d) <- gsub("X7SK.2", "7SK.1", row.names(d)) table(row.names(d) %in% officialGene$dotted_symbol) # all true row.names(officialGene) <- officialGene$dotted_symbol officialGene <- officialGene[row.names(d),] row.names(d) <- make.unique(officialGene$gene_name) # "Labels.csv" contains the true labels of cell types and was directly downloaded from Zenodo celltype <- read.csv("Labels.csv"); head(celltype); table(celltype$x) # 2000 cells for each of 10 cell types: CD14+ Monocyte, CD19+ B, CD34+, CD4+ T Helper2, CD4+/CD25 T Reg, CD4+/CD45RA+/CD25- Naive T, CD4+/CD45RO+ Memory, CD56+ NK, CD8+ Cytotoxic T, CD8+/CD45RA+ Naive Cytotoxic df <- data.frame(cell_barcode = colnames(d), trueLabel_full = celltype$x); dim(df) truelabel_map <- c(`CD14+ Monocyte`="Monocyte", `CD19+ B`="B", `CD34+`="CD34pos", `CD4+ T Helper2`="CD4Th2", `CD4+/CD25 T Reg`="CD4Treg", `CD4+/CD45RA+/CD25- Naive T`="CD4TN", `CD4+/CD45RO+ Memory`="CD4TCM", `CD56+ NK`="NK", `CD8+ Cytotoxic T`="CD8CTL", `CD8+/CD45RA+ Naive Cytotoxic`="CD8TN") df$trueLabel <- as.character(truelabel_map[df$trueLabel_full]) ## Step 2: extract 7 populations df.14k <- df[df$trueLabel_full %in% c("CD14+ Monocyte", "CD19+ B", "CD4+/CD25 T Reg", "CD4+/CD45RA+/CD25- Naive T", "CD4+/CD45RO+ Memory", "CD56+ NK", "CD8+/CD45RA+ Naive Cytotoxic"),] write.table(df.14k, file = "PBMC14k_trueLabel.txt", sep = "\\t", row.names = TRUE, col.names = TRUE, quote = FALSE, append = FALSE) d.14k <- d[,df.14k$cell_barcode] d.14k <- d.14k[rowSums(d.14k) > 0,] write.table(d.14k, file = "PBMC14k_rawCount.txt", sep = "\\t", row.names = FALSE, col.names = TRUE, quote = FALSE, append = FALSE) # 17986 genes, 14000 cells The PBMC14k dataset is embeded in scMINER R package and can be easily loaded by: library(scMINER) data("pbmc14k_rawCount") dim(pbmc14k_rawCount) ## [1] 17986 14000 pbmc14k_rawCount[1:5,1:5] ## 5 x 5 sparse Matrix of class "dgCMatrix" ## CACTTTGACGCAAT GTTACGGAAACGAA AGTCACGACAGGAG TTCGAGGACCAGTA ## AL627309.1 . . . . ## AP006222.2 . . . . ## RP11-206L10.3 . . . . ## RP11-206L10.2 . . . . ## RP11-206L10.9 . . . . ## CACTTATGAGTCGT ## AL627309.1 . ## AP006222.2 . ## RP11-206L10.3 . ## RP11-206L10.2 . ## RP11-206L10.9 . 2.3 Create project space The project space created by scMINER is a folder of specified name in specified directory. It contains 4 subfolders: DATA: to save the sparse eSet objects and other files; MICA: to save the inputs and outputs of mutual information-based clustering analysis; SJARACNe: to save the inputs and outputs of network inference and quality control; PLOT: to save the files of data visualization. The project space can not only keep your data centralized and organized, but also make the scMINER pipeline more smooth and robust. We strongly recommend you to create a project space for each of your studies. This can be easily done by createProjectSpace() in scMINER: scminer_dir <- createProjectSpace(project_dir = "/your-path", project_name = "PBMC14k") The command above will create a folder named PBMC14k in /your-path, and save the path to the project space (/your-path/PBMC14k) to scminer_dir. Can I add, delete or modify files in project space folder? Yes, you can. There are two functions, drawNetworkQC() and getActivityBatch(), that take directories as inputs, and both of them can validate the inputs. For all the rest functions, the inputs are specific files. So adding files in project space never affect the scMINER analysis. Deleting or modifying files in project spare is also safe. The input validation features of scMINER functions can help locate the files with issues. All output files of scMINER are reproducible and can be re-generated quickly. Just be careful with the clustering results in MICA and network files in SJARACNe, ad regerating them can take some time. "],["generate-gene-expresion-matrix.html", "Chapter 3 Generate gene expresion matrix 3.1 From data directory by 10x Genomics 3.2 From text-table file 3.3 From HDF5 file by 10x Genomics 3.4 From H5AD file", " Chapter 3 Generate gene expresion matrix In this chapter, we will generate the gene expression matrix, genes by cells, from multiple input formats commonly used in single-cell RNA sequencing, including sparse matrix by 10x Genomics, text-table file, HDF5 file by 10x Genomics and H5AD file. For demonstration purposes, scMINER embedded four datasets in extdata/demoData_readInput, with one for each input format. All of these four samples were generated by downsampling the real scRNA-seq data. 3.1 From data directory by 10x Genomics This is the most popular input format of scRNA-seq data generated by 10x Genomics. Usually, the data directory contains three files: matrix.mtx: a sparse matrix format containing the raw UMI count per cell-gene combination barcodes.tsv: a tab-separated matrix containing the cell barcodes features.tsv: a tab-separated matrix containing the features/genes and their annotations For more details about this format, please check out here. data_dir <- system.file("extdata/demo_inputs/cell_matrix_10x", package = "scMINER") list.files(path = data_dir, full.names = FALSE) ## [1] "barcodes.tsv.gz" "features.tsv.gz" "matrix.mtx.gz" demo1_mtx <- readInput_10x.dir(input_dir = data_dir, featureType = "gene_symbol", removeSuffix = TRUE, addPrefix = "demo1") ## Reading 10x Genomcis data from: /Library/Frameworks/R.framework/Versions/4.3-arm64/Resources/library/scMINER/extdata/demo_inputs/cell_matrix_10x ... ## Multiple data modalities were found: Gene Expression, Peaks . Only the gene expression data (under "Gene Expression") was kept. ## Done! The sparse gene expression matrix has been generated: 500 genes, 100 cells. demo1_mtx[1:5,1:5] ## 5 x 5 sparse Matrix of class "dgTMatrix" ## demo1_AAACAGCCAAACGGGC demo1_AAACAGCCAACTAGCC demo1_AAACAGCCAATTAGGA ## AL590822.3 . . . ## MORN1 . . . ## AL589739.1 . . . ## AL513477.2 . . . ## RER1 . . . ## demo1_AAACAGCCAGCCAGTT demo1_AAACATGCAAAGCTCC ## AL590822.3 . . ## MORN1 . . ## AL589739.1 . . ## AL513477.2 . . ## RER1 1 . The readInput_10x.dir() function can handle these conditions: Alternative file names for feature data: for the datasets generated by CellRanger 3.0 or earlier, the file name is genes.tsv; Compressed input files: one or more input files are compressed, usually in “.gz” format; Data with multiple modalities: like the single cell multiome data. In this case, readInput_10x.dir() only retains the data of “Gene Expression” by default. 3.2 From text-table file This is definitely the most compatible text format for scRNA-seq data. which can be used by all single-cell RNA-seq technologies, like 10x Genomics, Smart-Seq, Drop-Seq and more. The commonly used text table file formats include txt (text file format), csv (comma-separated values) and tsv (tab-separated values). table_file <- system.file("extdata/demo_inputs/table_file/demoData2.txt.gz", package = "scMINER") demo3_mtx <- readInput_table(table_file = table_file, sep = "\\t", is.geneBYcell = TRUE, removeSuffix = TRUE, addPrefix = "demo3") # set is.geneBYcell = FALSE to read features in columns and cell in in rows ## Reading table file: /Library/Frameworks/R.framework/Versions/4.3-arm64/Resources/library/scMINER/extdata/demo_inputs/table_file/demoData2.txt.gz ... ## Suffix removal was specified but skipped, since some barcodes do not carry "-1" suffix. ## Done! The sparse gene expression matrix has been generated: 1000 genes, 100 cells. NOTE: A major concern to read the gene expression matrix from text-table files is that the special characters in column names might change to dots (“.”), especially when the matrix is organized in cells by genes. This may cause failures in the identification of mitochondrial genes (usually defined by “MT-|mt-”) or spike-in RNAs (usually defined by “ERCC-|Ercc-”). The readInput_table() function has set check.names = FALSE to avoid this issue. However, if this issue already exists in the source data, you will have to fix it manually. 3.3 From HDF5 file by 10x Genomics This is another popular input format of scRNA-seq data generated by 10x Genomics. The Hierarchical Data Format version 5 (HDF5 or H5) is a binary format that can compress and access data much more efficiently than text formats. It’s super useful when dealing with large datasets. For more details about this format, please check out here. library(hdh5r) h5_file <- system.file("extdata/demo_inputs/hdf5_10x/demoData3.h5", package = "scMINER") demo2_mtx <- readInput_10x.h5(h5_file = h5_file, featureType = "gene_symbol", removeSuffix = TRUE, addPrefix = "demo2") NOTE: The readInput_10x.h5() function is developed exclusively for the HDF5 file generated by CellRanger of 10x Genomics. The HDF5 files from other source may have different hierarchical structures and can not be read by this function. 3.4 From H5AD file The H5AD file is another well-used format for scRNA-seq data. Derived from HDF5 file format, the H5AD file is designed to store large amounts of data efficiently and to facilitate fast access to subsets of the data. Now it’s getting more and more popular in scRNA-seq data analysis, visualization and sharing. For more details about this format, please check out here. library(anndata) h5ad_file <- system.file("extdata/demo_inputs/h5ad_file/demoData4.h5ad", package = "scMINER") demo4_obj <- readInput_h5ad(h5ad_file = h5ad_file, removeSuffix = TRUE, addPrefix = "demo4") # set is.geneBYcell = FALSE to read features in columns and cell in in rows ## Reading h5ad file: /Library/Frameworks/R.framework/Versions/4.3-arm64/Resources/library/scMINER/extdata/demo_inputs/h5ad_file/demoData4.h5ad ... ## Suffix removal was specified but skipped, since some barcodes do not carry "-1" suffix. ## Done! The sparse gene expression matrix has been generated: 1000 genes, 100 cells. NOTE: Unlike the other three readInput functions which return a gene expression matrx, the readInput_h5ad() returns an AnnData object. Here are the key components of an AnnData object: X: the primary data matrix, cells by genes; obs: observations (cells) metadata; var: variables (features/genes) metadata. "],["create-sparseeset-object.html", "Chapter 4 Create SparseEset object 4.1 Solely from the gene expression matrix 4.2 Using self-customized meta data 4.3 From multiple samples", " Chapter 4 Create SparseEset object In this chapter, we will introduce you how to create the parseExpressionSet(SparseEset) objects from gene expression matrix, genes by cells. 4.1 Solely from the gene expression matrix This is the most commonly used way to create the sparse eSet object with scMINER: pbmc14k_raw.eset <- createSparseEset(input_matrix = pbmc14k_rawCount, projectID = "PBMC14k", addMetaData = TRUE) ## Creating sparse eset from the input_matrix ... ## Adding meta data based on input_matrix ... ## Done! The sparse eset has been generated: 17986 genes, 14000 cells. pbmc14k_raw.eset ## SparseExpressionSet (storageMode: environment) ## assayData: 17986 features, 14000 samples ## element names: exprs ## protocolData: none ## phenoData ## sampleNames: CACTTTGACGCAAT GTTACGGAAACGAA ... ACGTGCCTTAAAGG (14000 ## total) ## varLabels: CellID projectID ... pctSpikeIn (6 total) ## varMetadata: labelDescription ## featureData ## featureNames: AL627309.1 AP006222.2 ... SRSF10.1 (17986 total) ## fvarLabels: GeneSymbol nCell ## fvarMetadata: labelDescription ## experimentData: use 'experimentData(object)' ## Annotation: input_matrix: it’s usually but not limited to a sparse matrix of raw UMI count. As for the data format, it accepts dgCMatrix, dgTMatrix, dgeMatrix, matrix, data.frame. As for the type of quantification measures, it takes raw counts, normalized counts (e.g. CPM or CP10k), TPM (Transcripts Per Million), FPKM/RPKM (Fragments/Reads Per Kilobase of transcript per Million) and others. What if a data frame object is given to it? When a non-matrix table is passed to input_matrix argument, the createSparseEset() function will automatically convert it to a matrix. And it the matrix, either converted from other format or directly passed from users, is not sparse. createSparseEset() will automatically convert it into sparse matrix, by default. This is controlled by another argument called do.sparseConversion, the default of which is TRUE. It’s not recommended but the users can set it as FALSE to disable the conversion. Then createSparseEset() will create the eSet based on the regular matrix. addMetaData: when this argument is set TRUE (this is the default), createSparseEset() will automatically generate 5 statistics, 4 for cells and 1 for features, and add them into the phenoData and featureData slots. These 5 statistics will be used in quality control and data filtration. ## check the phenoData: metadata of cells head(pData(pbmc14k_raw.eset)) ## CellID projectID nUMI nFeature pctMito pctSpikeIn ## CACTTTGACGCAAT CACTTTGACGCAAT PBMC14k 764 354 0.01832461 0 ## GTTACGGAAACGAA GTTACGGAAACGAA PBMC14k 956 442 0.01569038 0 ## AGTCACGACAGGAG AGTCACGACAGGAG PBMC14k 7940 2163 0.01977330 0 ## TTCGAGGACCAGTA TTCGAGGACCAGTA PBMC14k 4177 1277 0.01149150 0 ## CACTTATGAGTCGT CACTTATGAGTCGT PBMC14k 629 323 0.02066773 0 ## GCATGTGATTCTGT GCATGTGATTCTGT PBMC14k 875 427 0.02628571 0 ## check the featureData: metadata of features head(fData(pbmc14k_raw.eset)) ## GeneSymbol nCell ## AL627309.1 AL627309.1 50 ## AP006222.2 AP006222.2 2 ## RP11-206L10.3 RP11-206L10.3 1 ## RP11-206L10.2 RP11-206L10.2 33 ## RP11-206L10.9 RP11-206L10.9 17 ## LINC00115 LINC00115 115 4.2 Using self-customized meta data In some cases, you may have more meta data of either cells (e.g. sample id, treatment condition) or features (e.g. gene full name, gene type, genome location) which will be used in downstream analysis and you do want to add them into the sparse eSet object. the createSparseEset() function provides another two arguments, cellData and featureData, to take the self-customized meta data. For the PBMC14k dataset, we have the true labels of cell type and would like to add them to the sparse eSet object. ## read the true labels of cell type for PBMC14k dataset true_label <- read.table(system.file("extdata/demo_pbmc14k/PBMC14k_trueLabel.txt.gz", package = "scMINER"), header = T, row.names = 1, sep = "\\t", quote = "", stringsAsFactors = FALSE) head(true_label) ## trueLabel_full trueLabel ## CACTTTGACGCAAT CD14+ Monocyte Monocyte ## GTTACGGAAACGAA CD14+ Monocyte Monocyte ## AGTCACGACAGGAG CD14+ Monocyte Monocyte ## TTCGAGGACCAGTA CD14+ Monocyte Monocyte ## CACTTATGAGTCGT CD14+ Monocyte Monocyte ## GCATGTGATTCTGT CD14+ Monocyte Monocyte table(true_label$trueLabel_full) ## ## CD14+ Monocyte CD19+ B ## 2000 2000 ## CD4+/CD25 T Reg CD4+/CD45RA+/CD25- Naive T ## 2000 2000 ## CD4+/CD45RO+ Memory CD56+ NK ## 2000 2000 ## CD8+/CD45RA+ Naive Cytotoxic ## 2000 ## the true_label much cover all cells in the expression matrix table(colnames(pbmc14k_rawCount) %in% row.names(true_label)) ## ## TRUE ## 14000 ## create the sparse eSet object using the true_label pbmc14k_raw.eset <- createSparseEset(input_matrix = pbmc14k_rawCount, cellData = true_label, featureData = NULL, projectID = "PBMC14k", addMetaData = TRUE) ## Creating sparse eset from the input_matrix ... ## Adding meta data based on input_matrix ... ## Done! The sparse eset has been generated: 17986 genes, 14000 cells. ## check the true labels of cell type from sparse eSet object head(pData(pbmc14k_raw.eset)) ## trueLabel_full trueLabel projectID nUMI nFeature pctMito ## CACTTTGACGCAAT CD14+ Monocyte Monocyte PBMC14k 764 354 0.01832461 ## GTTACGGAAACGAA CD14+ Monocyte Monocyte PBMC14k 956 442 0.01569038 ## AGTCACGACAGGAG CD14+ Monocyte Monocyte PBMC14k 7940 2163 0.01977330 ## TTCGAGGACCAGTA CD14+ Monocyte Monocyte PBMC14k 4177 1277 0.01149150 ## CACTTATGAGTCGT CD14+ Monocyte Monocyte PBMC14k 629 323 0.02066773 ## GCATGTGATTCTGT CD14+ Monocyte Monocyte PBMC14k 875 427 0.02628571 ## pctSpikeIn CellID ## CACTTTGACGCAAT 0 CACTTTGACGCAAT ## GTTACGGAAACGAA 0 GTTACGGAAACGAA ## AGTCACGACAGGAG 0 AGTCACGACAGGAG ## TTCGAGGACCAGTA 0 TTCGAGGACCAGTA ## CACTTATGAGTCGT 0 CACTTATGAGTCGT ## GCATGTGATTCTGT 0 GCATGTGATTCTGT table(pData(pbmc14k_raw.eset)$trueLabel_full) ## ## CD14+ Monocyte CD19+ B ## 2000 2000 ## CD4+/CD25 T Reg CD4+/CD45RA+/CD25- Naive T ## 2000 2000 ## CD4+/CD45RO+ Memory CD56+ NK ## 2000 2000 ## CD8+/CD45RA+ Naive Cytotoxic ## 2000 4.3 From multiple samples What if you have multiple samples for one project? Now it’s pretty common to profile multiple samples of the same tissue but under different conditions (e.g. drug treatment) in one project. Analyzing these samples one by one is crucial, and analyzing them in a combined manner may give you more prospects. For this purpose, scMINER provides a function, combineSparseEset(), to easily combine the sparse eSet objects of multiple samples. ## create a sparse eSet object of each sample to combined demo1_mtx <- readInput_10x.dir(input_dir = system.file("extdata/demo_inputs/cell_matrix_10x", package = "scMINER"), featureType = "gene_symbol", removeSuffix = TRUE) ## Reading 10x Genomcis data from: /Library/Frameworks/R.framework/Versions/4.3-arm64/Resources/library/scMINER/extdata/demo_inputs/cell_matrix_10x ... ## Multiple data modalities were found: Gene Expression, Peaks . Only the gene expression data (under "Gene Expression") was kept. ## Done! The sparse gene expression matrix has been generated: 500 genes, 100 cells. demo1.eset <- createSparseEset(input_matrix = demo1_mtx, projectID = "demo1", addMetaData = TRUE) ## Creating sparse eset from the input_matrix ... ## Adding meta data based on input_matrix ... ## Done! The sparse eset has been generated: 500 genes, 100 cells. demo2_mtx <- readInput_table(table_file = system.file("extdata/demo_inputs/table_file/demoData2.txt.gz", package = "scMINER"), sep = "\\t", is.geneBYcell = TRUE, removeSuffix = TRUE) ## Reading table file: /Library/Frameworks/R.framework/Versions/4.3-arm64/Resources/library/scMINER/extdata/demo_inputs/table_file/demoData2.txt.gz ... ## Suffix removal was specified but skipped, since some barcodes do not carry "-1" suffix. ## Done! The sparse gene expression matrix has been generated: 1000 genes, 100 cells. demo2.eset <- createSparseEset(input_matrix = demo2_mtx, projectID = "demo2", addMetaData = TRUE) ## Creating sparse eset from the input_matrix ... ## Adding meta data based on input_matrix ... ## Done! The sparse eset has been generated: 1000 genes, 100 cells. ## combine the 4 sparse eSet objects library(dplyr) ## ## Attaching package: 'dplyr' ## The following object is masked from 'package:Biobase': ## ## combine ## The following objects are masked from 'package:BiocGenerics': ## ## combine, intersect, setdiff, union ## The following objects are masked from 'package:stats': ## ## filter, lag ## The following objects are masked from 'package:base': ## ## intersect, setdiff, setequal, union combined.eset <- combineSparseEset(eset_list = c(demo1.eset, demo2.eset), projectID = c("sample1", "sample2"), addPrefix = c("demo1", "demo2"), addSurfix = NULL, addMetaData = TRUE, imputeNA = TRUE) ## Combining the input sparse eSets ... ## NA values were found in the merged matrix and have been replaced by the minimum value: 0 . ## Adding meta data based on merged data matrix ... ## Done! The combined sparse eset has been generated: 1500 genes, 200 cells. dim(combined.eset) ## Features Samples ## 1500 200 A few questions you may have about the combineSparseEset() function: What if the input eSets have different features? combineSparseEset() ALWAYS keep all features from the input eSets, and generate NA values wherever the data is not available. By default, this function impute the NA values with the minimum value of the combined matrix, which is usually but not always zero. If this imputation method doesn’t fit your study, you can set imputeNA to FALSE to disable it. If so, the NAs will retain in the eSet object, and you can manually impute them with your own method. What if the input eSets have some same cell barcodes? combineSparseEset() ALWAYS keep all cells from the input eSets, and will report an error when same barcodes are found in different input eSets. This function provides two arguments, addPrefix and addSurfix, to solve this issue. You can easily avoid the same barcodes of different input eSets by adding a eSet-specific prefix and/or surfix to the barcodes. head(pData(combined.eset)) ## CellID projectID nUMI nFeature pctMito ## demo1_AAACAGCCAAACGGGC demo1_AAACAGCCAAACGGGC sample1 119 43 0 ## demo1_AAACAGCCAACTAGCC demo1_AAACAGCCAACTAGCC sample1 55 28 0 ## demo1_AAACAGCCAATTAGGA demo1_AAACAGCCAATTAGGA sample1 45 20 0 ## demo1_AAACAGCCAGCCAGTT demo1_AAACAGCCAGCCAGTT sample1 175 44 0 ## demo1_AAACATGCAAAGCTCC demo1_AAACATGCAAAGCTCC sample1 51 31 0 ## demo1_AAACATGCAATAGCCC demo1_AAACATGCAATAGCCC sample1 121 44 0 ## pctSpikeIn ## demo1_AAACAGCCAAACGGGC 0 ## demo1_AAACAGCCAACTAGCC 0 ## demo1_AAACAGCCAATTAGGA 0 ## demo1_AAACAGCCAGCCAGTT 0 ## demo1_AAACATGCAAAGCTCC 0 ## demo1_AAACATGCAATAGCCC 0 I have some customized column in the phenoData and/or featureData slots. How does combineSparseEset() handle them? combineSparseEset() only keep the columns of phenoData and featureData that shared by all input eSets. Your customized columns would be kept only when they are available in all input eSets. Are the 5 meta data statistics in the combined eSet still same with those generated in each eSet? No. By default, combineSparseEset() will update (add, if they are not available in input eSets) these 5 meta data statistics based on the combined matrix. It’s not recommended but you can disable it by setting addMataData to FALSE. "],["data-filtration.html", "Chapter 5 Data filtration 5.1 QC report 5.2 Filter the sparse eset object", " Chapter 5 Data filtration In this chapter, we will introduce you how scMINER assess the scRNA-seq data quality, estimate the cutoffs for data filtration, and remove the cells and features of low quality from the SparseEset object. ## QC metrics in scMINER As we mentioned before, scMINER can automatically generate 5 meta data statistics and add them to the SparseEset object. These 5 meta data statistics are the metrics scMINER uses to assess the quality of cells and features: For cell quality assessment, scMINER provides 4 metrics that commonly used by the community: nUMI: number of total UMIs in each cell. Cells with abnormally high nUMI usually indicate doublets, while those with abnormally low nUMI usually indicate poorly sequenced cells or empty droplets. nFeature: number of expressed features/genes in each cell. Similar to nUMI. pctMito: percentage of UMIs of mitochondrial genes (defined by “mt-|MT-”) in each cell. Cells with aberrantly high pctMito usually indicate dying cells. pctSpikeIn: percentage of UMIs of spike-in RNAs (defined by “ERCC-|Ercc-”)) in each cell. This is used to estimate the normalization factor. Cells with extremely high or low pctSpikeIn need to be removed. For feature quality assessment, scMINER provides one metrics: nCell: number of cells expressing the features/genes. Genes with extremely low nCell are poorly sequenced and are usually of low variance. 5.1 QC report To help assess the data quality and determine the cutoffs for data filtration, scMINER provides a function, drawSparseEsetQC(), to generate a html-format quality control report: ## To generate the drawSparseEsetQC(input_eset = pbmc14k_raw.eset, output_html_file = "/your-path/PBMC14k/PLOT/pbmc14k_rawCount.html", overwrite = TRUE) ## scMINER supports group-specific QC highlights drawSparseEsetQC(input_eset = pbmc14k_raw.eset, output_html_file = "/your-path/PBMC14k/PLOT/pbmc14k_rawCount.html", overwrite = FALSE, group_by = "true_label") The quality control report consists of 4 parts: Key Statistics: it highlights 5 key statistics of given eset object, including number of cells, number of genes, mean of genes per cell, mean of UMIs per cell and mean of cells per gene. Detailed statistics of key metrics: it summarizes and visualizes the detailed statistics of 5 key metrics that scMINER uses for filtration: nUMI, nFeature, pctMito, pctSpikeIn, nCell. Detailed statistics per cell and gene: it lists the detailed statistics of each gene and cell. Filtration cutoffs by scMINER: it provides the cutoffs estimated automatically by scMINER based on Median ± 3 * MAD (maximum absolute deviance), and the pseudo-filtration statistics on both genes and cells with these cutoffs. 5.2 Filter the sparse eset object From the quality control report generated above, we have got a better sense about the data quality and the cutoffs to use for filtration. scMINER provides a function, filterSparseEset() for this purpose, and it can work in two modes: auto: in this mode, scMINER will use the cutoffs estimated by Median ± 3*MAD (maximum absolute deviation). Based on our tests, in most cases, this mode works well with the matrix of both raw UMI counts and TPM values. manual: in this mode, the users can manually specify the cutoffs, both low and high, of all 5 metrics: nUMI, nFeature, pctMito, pctSpikeIn for cells, and nCell for genes. No cells or features would be removed under the default cutoffs of each metrics. No matter which mode to use, filterSparseEset() returns a summary table with detailed information of filtration statistics. You can refer to it and adjust the cutoffs accordingly. 5.2.1 Data filtration with auto mode To conduct the filtering using the cutoffs recommended by scMINER: ## Filter eSet under the auto mode pbmc14k_filtered.eset <- filterSparseEset(pbmc14k_raw.eset, filter_mode = "auto", filter_type = "both") ## Checking the availability of the 5 metrics ('nCell', 'nUMI', 'nFeature', 'pctMito', 'pctSpikeIn') used for filtration ... ## Checking passed! All 5 metrics are available. ## Filtration is done! ## Filtration Summary: ## 8846/17986 genes passed! ## 13605/14000 cells passed! ## ## For more details: ## Gene filtration statistics: ## Metrics nCell ## Cutoff_Low 70 ## Cutoff_High Inf ## Gene_total 17986 ## Gene_passed 8846(49.18%) ## Gene_failed 9140(50.82%) ## ## Cell filtration statistics: ## Metrics nUMI nFeature pctMito pctSpikeIn Combined ## Cutoff_Low 458 221 0 0 NA ## Cutoff_High 3694 Inf 0.0408 0.0000 NA ## Cell_total 14000 14000 14000 14000 14000 ## Cell_passed 13826(98.76%) 14000(100.00%) 13778(98.41%) 14000(100.00%) 13605(97.18%) ## Cell_failed 174(1.24%) 0(0.00%) 222(1.59%) 0(0.00%) 395(2.82%) In some cases, you may find that most of the cutoffs generated by the auto mode are good, except one or two. Though there is no ‘hybrid’ mode, scMINER does allow you to customize some of the cutoffs generated by the auto mode. This can be easily done by adding the cutoffs you would customize under the auto mode: ## Filter eSet under the auto mode, with customized values pbmc14k_filtered.eset <- filterSparseEset(pbmc14k_raw.eset, filter_mode = "auto", filter_type = "both", gene.nCell_min = 5) With the code above, scMINER will filter the eSet using all of the cutoffs generated by auto mode, except gene.nCell_min. 5.2.2 Data filtration with manual mode To apply the self-customized cutoffs: ## Filter eSet under the manual mode pbmc14k_filtered.eset <- filterSparseEset(pbmc14k_raw.eset, filter_mode = "manual", filter_type = "both", gene.nCell_min = 10, cell.nUMI_min = 500, cell.nUMI_max = 6500, cell.nFeature_min = 200, cell.nFeature_max = 2500, cell.pctMito_max = 0.1) For any unspecified cutoff arguments, like gene.nCell_max, filterSparseEset() will automatically assign the default values to them. The default values of any cutoff argument would not filter out any cells or features. So, if you want to skip some metrics, just leave the cutoffs of them unspecified. For example, in the codes above, gene.nCell_max is unspecified. Then filterSparseEset() wil assign the default value, which is Inf, to it. No features would be filtered out by this argument. "],["data-normalization.html", "Chapter 6 Data normalization", " Chapter 6 Data normalization In this chapter, we will introduce the method of data normalization in scMINER. We recommend to use log2CPM method for normalization: the raw counts in each cell are normalized to a library size of 1 million, followed by log2 transformation. pbmc14k_log2cpm.eset <- normalizeSparseEset(pbmc14k_filtered.eset, scale_factor = 1000000, log_base = 2, log_pseudoCount = 1) ## Done! The data matrix of eset has been normalized and log-transformed! ## The returned eset contains: 8846 genes, 13605 cells. exprs(pbmc14k_log2cpm.eset)[1:5,1:5] ## 5 x 5 Matrix of class "dgeMatrix" ## CACTTTGACGCAAT GTTACGGAAACGAA CACTTATGAGTCGT GCATGTGATTCTGT ## LINC00115 0 0.00000 0 0 ## NOC2L 0 0.00000 0 0 ## HES4 0 0.00000 0 0 ## ISG15 0 10.05794 0 0 ## C1orf159 0 0.00000 0 0 ## TAGAATACGTATCG ## LINC00115 0 ## NOC2L 0 ## HES4 0 ## ISG15 0 ## C1orf159 0 This normalized and log-transformed SparseEset object can be directly used for Mutual Information-based clustering, network inference and other downstream analysis. Don’t forget to save the SparseEset object after data normalization. saveRDS(pbmc14k_log2cpm.eset, file = "/your-path/PBMC14k/DATA/pbmc14k_log2CPM_annotated.rds") "],["mi-based-clustering-analysis.html", "Chapter 7 MI-based clustering analysis 7.1 Introduction to MICA 7.2 Generate MICA input 7.3 Run MICA 7.4 Integrate MICA outputs into SparseEset object 7.5 Visualize the MICA output", " Chapter 7 MI-based clustering analysis In this chapter, we will introduce more about the MICA component, and walk you through the MICA workflow, including preparing inputs, runing MICA, visualizing and integrationg MICA outputs into SparseEset object. 7.1 Introduction to MICA MICA (Mutual Information-based Clustering Analysis) is a clustering tool designed for single cell genomics data. Compared to most existing single-cell clustering algorithms, MICA has two unique features: MICA uses mutual information to measure cell-cell similarity for unsupervised clustering analysis, while most existing tools employ linear-transformation of PCA and/or co-expression analysis using linear Pearson or Spearman correlations that may not capture the nonlinear cell-cell distance. MICA uses all high-quality features for clustering, while most existing tools select the top highly variable features to improve the clustering speed. This is arbitrary and may lose the information that can distinguish close cell states. MICA is developed using Python framework, to take its strengths in calculation speed and memory consumption. A lot of effort has been made to improve the interoperability between Python and R. Now MICA works seamlessly with the SparseExpressionSet object. The input of MICA can be easily generated from the SparseExpressionSet object by generateMICAinput(), and the output of MICA, the clustering results, can be effortlessly visualized by MICAplot() and integrated into SparseExpressionSet object by addMICAoutput(). 7.2 Generate MICA input The standard input of MICA is a normalized and log-transformed gene expression matrix. scMINER can generate this matrix from sparse eSet object and save it into a file that can be directly read by MICA. MICA accepts .h5ad or .txt format as the input file, which can be easily generated by embedded function generateMICAinput(): ## generate MICA input in txt format generateMICAinput(input_eset = pbmc14k_log2cpm.eset, output_file = "/work-path/PBMC14k/MICA/micaInput.txt", overwrite = FALSE) ## check the format of MICA input mica_input <- read.delim(system.file("extdata/demo_pbmc14k/PBMC14k/MICA/micaInput.txt", package = "scMINER"), header = T, sep = "\\t", row.names = 1) mica_input[1:5,1:5] To use the .h5ad format, run the codes below. ## generate MICA input in h5ad format: anndata package is needed generateMICAinput(input_eset = pbmc14k_log2cpm.eset, output_file = "/work-path/PBMC14k/MICA/micaInput.h5ad", overwrite = FALSE) In addition to generating the standard MICA input file, generateMICAinput() also returns the recommended commands of running MICA. You can copy the commands, modify according and run them. 7.3 Run MICA MICA features two different modes named by their different dimension reduction techniques: Multi-Dimensional Scaling (MDS) mode: this mode is more accurate and robust for small datasets (less than 5,000 cells, be default) due to its global dimension reduction nature; Graph Embedding (GE) mode: this mode works better with large datasets (more than 5,000 cells, by default) using a graph embedding approach to explore distant neighbor cells. In this case, since there are 13,605 cells, we will use the MICA GE mode for the clustering: mica ge -i /work-path/PBMC14k/MICA/micaInput.txt -o /work-path/PBMC14k/MICA/micaOutput -minr 0.1 -maxr 9.0 -ss 0.05 -nw 40 This command will generate the clustering results of multiple resolutions, from 0.1 to 9.0, with a step size of 0.05. 7.4 Integrate MICA outputs into SparseEset object MICA generates several files and save all of them in the output directory specified by the user with -o argument. The core, and only, output file we need for subsequent analysis is the clustering label file named in the format of ProjectName_clustering_VisualizeMethod_euclidean_NumberOfDimensions_Resolution.txt. In this case, since we used a range of resolutions, there are several clustering label files generated, one for each resolution. Based on the knowledge about PBMC14k dataset, we compared the results of different resolutions and picked clustering_UMAP_euclidean_20_2.05.txt for subsequent analysis. micaOutput <- read.table(system.file("extdata/demo_pbmc14k/PBMC14k/MICA/clustering_UMAP_euclidean_20_2.05.txt", package = "scMINER"), header = TRUE, sep = "\\t", quote = "", stringsAsFactors = F) head(micaOutput) ## ID X Y label ## 1 CACTTTGACGCAAT 14.91650 13.04096 6 ## 2 GTTACGGAAACGAA 14.57031 10.27093 6 ## 3 CACTTATGAGTCGT 14.28869 13.61674 6 ## 4 GCATGTGATTCTGT 14.12546 13.36319 6 ## 5 TAGAATACGTATCG 14.91227 11.19407 6 ## 6 CAAGAAGACCCTCA 15.34154 12.25821 6 As shown above, the clustering label file contains four columns: ID: cell barcodes; X: coordinates of UMAP_1 or tSNE_1; Y: coordinates of UMAP_2 or tSNE_2; label: labels of predicted clusters. The clustering result can be easily easily added to the SparseExpressionSet object by addMICAoutput(): pbmc14k_log2cpm.eset <- addMICAoutput(pbmc14k_log2cpm.eset, mica_output_file = system.file("extdata/demo_pbmc14k/PBMC14k/MICA/clustering_UMAP_euclidean_20_2.05.txt", package = "scMINER"), visual_method = "umap") head(pData(pbmc14k_log2cpm.eset)) ## trueLabel_full trueLabel projectID nUMI nFeature pctMito ## CACTTTGACGCAAT CD14+ Monocyte Monocyte PBMC14k 764 354 0.01832461 ## GTTACGGAAACGAA CD14+ Monocyte Monocyte PBMC14k 956 442 0.01569038 ## CACTTATGAGTCGT CD14+ Monocyte Monocyte PBMC14k 629 323 0.02066773 ## GCATGTGATTCTGT CD14+ Monocyte Monocyte PBMC14k 875 427 0.02628571 ## TAGAATACGTATCG CD14+ Monocyte Monocyte PBMC14k 1060 445 0.03207547 ## CAAGAAGACCCTCA CD14+ Monocyte Monocyte PBMC14k 849 384 0.01531213 ## pctSpikeIn CellID UMAP_1 UMAP_2 clusterID ## CACTTTGACGCAAT 0 CACTTTGACGCAAT 14.91650 13.04096 6 ## GTTACGGAAACGAA 0 GTTACGGAAACGAA 14.57031 10.27093 6 ## CACTTATGAGTCGT 0 CACTTATGAGTCGT 14.28869 13.61674 6 ## GCATGTGATTCTGT 0 GCATGTGATTCTGT 14.12546 13.36319 6 ## TAGAATACGTATCG 0 TAGAATACGTATCG 14.91227 11.19407 6 ## CAAGAAGACCCTCA 0 CAAGAAGACCCTCA 15.34154 12.25821 6 7.5 Visualize the MICA output scMINER provides a function, MICAplot() to easily visualize the clustering results on a 2D plot, UMAP or tSNE. And it can be colored by multiple variables, including cluster label, sample source, nUMI, nFeature, pctMito and more. 7.5.1 Color-coded by cluster labels library(ggplot2) MICAplot(input_eset = pbmc14k_log2cpm.eset, color_by = "clusterID", X = "UMAP_1", Y = "UMAP_2", point.size = 0.1, fontsize.cluster_label = 6) 7.5.2 Color-coded by true label of cell types MICAplot(input_eset = pbmc14k_log2cpm.eset, color_by = "trueLabel", X = "UMAP_1", Y = "UMAP_2", point.size = 0.1, fontsize.cluster_label = 4) 7.5.3 Color-coded by nUMI, for QC purpose MICAplot(input_eset = pbmc14k_log2cpm.eset, color_by = "nUMI", do.logTransform = TRUE, point.size = 0.1) ## The values in "nUMI" have been transformed by log2(value + 1). To turn transformation off, set do.logTransform = FALSE. 7.5.4 Color-coded by nFeature, for QC purpose MICAplot(input_eset = pbmc14k_log2cpm.eset, color_by = "nFeature", do.logTransform = TRUE, point.size = 0.1) ## The values in "nFeature" have been transformed by log2(value + 1). To turn transformation off, set do.logTransform = FALSE. ### Color-coded by pctMito, for QC purpose MICAplot(input_eset = pbmc14k_log2cpm.eset, color_by = "pctMito", do.logTransform = FALSE, point.size = 0.1) "],["cell-type-annotation.html", "Chapter 8 Cell type annotation 8.1 Supervised cell type annotation 8.2 Unsupervised cell type annotation 8.3 Add cell type annotations to SparseExpressionSet object", " Chapter 8 Cell type annotation Currently, there are two types of strategies to annotate the clusters: supervised and unsupervised. The supervised methods use a list of known markers of potential cell types curated from somoe existing studies of the same/similar contexts. While in contrast, the unsupervised methods are usually based on the differentially expressed genes. scMINER provide several useful functions to support both types of strategies. 8.1 Supervised cell type annotation In the past a few years, the scRNA-seq data has dramatically increased in both quality and quantity. For the majority of tissue types, some existing studies on the same/similar tissue type are most likely available, and from these existing studies, we can figure out a list of candidate cell types to expect and curate a list of markers for each of them. In this case, we know the 7 cell types involved in the dataset, and curated a marker list from some existing PBMCs studies. 8.1.1 Annotate using signature scores Given a marker list of candidate cell types, scMINER can estimate a signature score, which is mathematically the weighted mean of the expression of marker genes involved, for each candidate cell type across all cell cluster. To do so, you will need to generate a signature table with three columns: signature_name: name of cell types/signatures; signature_feature: markers genes/features of corresponding cell type/signature; weight: weight of corresponding maker/feature in corresponding cell type/signature. It ranges from -1 to 1, so both positive and negtive markers are supoorted. ## Signature table of PBMC14k dataset signature_table <- read.table(system.file("extdata/demo_pbmc14k/PBMC14k_signatureTable.txt", package = "scMINER"), header = TRUE, sep = "\\t", quote = "", stringsAsFactors = FALSE) head(signature_table) ## signature_name signature_feature weight ## 1 Monocyte CD14 1 ## 2 Monocyte LYZ 1 ## 3 Monocyte S100A8 1 ## 4 Monocyte S100A9 1 ## 5 Monocyte S100A12 1 ## 6 NK FCGR3A 1 With this signature table, draw_bubbleplot() can estimate the signature scores and visualize them using bubble plot: ## Violin plot of marker genes across clusters draw_bubbleplot(input_eset = pbmc14k_log2cpm.eset, signature_table = signature_table, group_by = "clusterID") ## 31 features of 7 signatures were found in the input eset and will be used in calculation. In the bubble plot above, the color of the bubbles is proportional to the mean of signature score, and the size of the bubbles is proportional to the percentage of cells with higher signature score than mean. The cell type of each cluster is clear, except the cluster 7, which shows equally-high signature score of both CD4+ TCM and CD4+ Reg and higher percentage of CD4+ TCM cells. 8.1.2 Annotate using individual marker genes scMINER also provides a variety of functions to visualize the selected features: ## For the demonstration purposes, we picked two well known markers for each of the 7 known cell types, plus "CD3D" and "CD4". genes_of_interest <-c("CD14", "LYZ", "GZMB", "NKG7", "CD19", "MS4A1", "CD8A", "CD8B", "SELL", "CCR7", "IL2RA", "FOXP3", "IL7R", "S100A4", "CD3D", "CD4") 8.1.2.1 feature visualization: violin plot ## Violin plot of marker genes across clusters feature_vlnplot(input_eset = pbmc14k_log2cpm.eset, features = genes_of_interest, group_by = "clusterID", ncol = 4) 8.1.2.2 feature visualization: box plot ## Box plot of marker genes across clusters feature_boxplot(input_eset = pbmc14k_log2cpm.eset, features = genes_of_interest, group_by = "clusterID", ncol = 4) 8.1.2.3 feature visualization: scatter plot ## UMAP scatter plot of marker genes feature_scatterplot(input_eset = pbmc14k_log2cpm.eset, features = genes_of_interest, ncol = 4, location_x = "UMAP_1", location_y = "UMAP_2", point.size = 0.5, legend.key_height = 0.3, legend.key_width = 0.2, fontsize.legend_title = 8, fontsize.legend_text = 6, fontsize.axis_title = 8, legend.position = "none") 8.1.2.4 feature visualization: bubble plot ## Bubble plot of marker genes across clusters feature_bubbleplot(input_eset = pbmc14k_log2cpm.eset, features = genes_of_interest, group_by = "clusterID", xlabel.angle = 45) 8.1.2.5 feature visualization: heatmap ## Heatmap of marker genes across clusters feature_heatmap(input_eset = pbmc14k_log2cpm.eset, features = genes_of_interest, group_by = "clusterID", scale_method = "none", annotation_columns = c("trueLabel")) 8.2 Unsupervised cell type annotation Existing studies in the same or similar contexts are not always available, and there is a significant concern regarding the reliability of reference studies. This reliability largely depends on the expertise of the original authors who defined the markers and assigned the cell types. Therefore, we strongly encourage users to also try unsupervised methods, which can serve as a means of cross-validation. scMINER provides a function, getDE(), to perform the differential expression analysis and identify the markers of each cluster. The getDE() function supports three different methods to perform the differential expression analysis, limma, wilcoxon and t.test. And it allows the users to define the groups to compare in a flexible way: ## 1. To perform differential expression analysis in a 1-vs-rest manner for all groups de_res1 <- getDE(input_eset = pbmc14k_log2cpm.eset[500,], group_by = "clusterID", use_method = "limma") ## 7 groups were found in group_by column [ clusterID ]. ## Since no group was specified, the differential analysis will be conducted among all groups in the group_by column [ clusterID ] in the 1-vs-rest manner. ## 1 / 7 : group 1 ( 1 ) vs the rest... ## 2505 cells were found for g1. ## 11100 cells were found for g0. ## 2 / 7 : group 1 ( 2 ) vs the rest... ## 2022 cells were found for g1. ## 11583 cells were found for g0. ## 3 / 7 : group 1 ( 3 ) vs the rest... ## 2014 cells were found for g1. ## 11591 cells were found for g0. ## 4 / 7 : group 1 ( 4 ) vs the rest... ## 1918 cells were found for g1. ## 11687 cells were found for g0. ## 5 / 7 : group 1 ( 5 ) vs the rest... ## 1912 cells were found for g1. ## 11693 cells were found for g0. ## 6 / 7 : group 1 ( 6 ) vs the rest... ## 1786 cells were found for g1. ## 11819 cells were found for g0. ## 7 / 7 : group 1 ( 7 ) vs the rest... ## 1448 cells were found for g1. ## 12157 cells were found for g0. head(de_res1) ## [1] feature g1_tag g0_tag g1_avg g0_avg g1_pct g0_pct log2FC Pval ## [10] FDR Zscore ## <0 rows> (or 0-length row.names) Here is an brief introduction to the results of getDE(): feature: feature name; g1_tag: a vector of clusters or subgroups involved in g1, the fore-ground group; g0_tag: a vector of clusters or subgroups involved in g0, the back-ground group; g1_avg: mean of gene expression of cells in g1; g0_tag: mean of gene expression of cells in g0; g1_pct: percentage of cells expressing the corresponding genes in group 1; g0_pct: percentage of cells expressing the corresponding genes in group 0; log2FC: log2Fold change of gene expression between g1 and g0; Pval: P values of g1-g0 comparison; FDR: FDR of g1-g0 comparison; Zscore: Z score of g1-g0 comparison, signed by log2FC; ## 2. To perform differential expression analysis in a 1-vs-rest manner for one specific group de_res2 <- getDE(input_eset = pbmc14k_log2cpm.eset, group_by = "clusterID", g1 = c("1"), use_method = "limma") ## 3. To perform differential expression analysis in a rest-vs-1 manner for one specific group de_res3 <- getDE(input_eset = pbmc14k_log2cpm.eset, group_by = "clusterID", g0 = c("1"), use_method = "limma") ## 4. To perform differential expression analysis in a 1-vs-1 manner for any two groups de_res4 <- getDE(input_eset = pbmc14k_log2cpm.eset, group_by = "clusterID", g1 = c("1"), g0 = c("3"), use_method = "limma") scMINER also provides a function, getTopFeatures(), to easily extract the group-specific markers from the differential expression result: cluster_markers <- getTopFeatures(input_table = de_res1, number = 10, group_by = "g1_tag", sort_by = "log2FC", sort_decreasing = TRUE) dim(cluster_markers) ## [1] 0 11 head(cluster_markers) ## [1] feature g1_tag g0_tag g1_avg g0_avg g1_pct g0_pct log2FC Pval ## [10] FDR Zscore ## <0 rows> (or 0-length row.names) 8.3 Add cell type annotations to SparseExpressionSet object Based on the supervised and unsupervised methods, we have annotated the cell types for each cluster. To add the cell type annotation information into the sparse eset object: celltype_map <- c(`1`="CD4TN", `2`="CD4TCM", `3`="CD8TN", `4`="NK", `5`="B", `6`="Monocyte", `7`="CD4Treg") pbmc14k_log2cpm.eset$cell_type <- as.character(celltype_map[pbmc14k_log2cpm.eset$clusterID]) head(pData(pbmc14k_log2cpm.eset)) ## trueLabel_full trueLabel projectID nUMI nFeature pctMito ## CACTTTGACGCAAT CD14+ Monocyte Monocyte PBMC14k 764 354 0.01832461 ## GTTACGGAAACGAA CD14+ Monocyte Monocyte PBMC14k 956 442 0.01569038 ## CACTTATGAGTCGT CD14+ Monocyte Monocyte PBMC14k 629 323 0.02066773 ## GCATGTGATTCTGT CD14+ Monocyte Monocyte PBMC14k 875 427 0.02628571 ## TAGAATACGTATCG CD14+ Monocyte Monocyte PBMC14k 1060 445 0.03207547 ## CAAGAAGACCCTCA CD14+ Monocyte Monocyte PBMC14k 849 384 0.01531213 ## pctSpikeIn CellID UMAP_1 UMAP_2 clusterID cell_type ## CACTTTGACGCAAT 0 CACTTTGACGCAAT 14.91650 13.04096 6 Monocyte ## GTTACGGAAACGAA 0 GTTACGGAAACGAA 14.57031 10.27093 6 Monocyte ## CACTTATGAGTCGT 0 CACTTATGAGTCGT 14.28869 13.61674 6 Monocyte ## GCATGTGATTCTGT 0 GCATGTGATTCTGT 14.12546 13.36319 6 Monocyte ## TAGAATACGTATCG 0 TAGAATACGTATCG 14.91227 11.19407 6 Monocyte ## CAAGAAGACCCTCA 0 CAAGAAGACCCTCA 15.34154 12.25821 6 Monocyte The draw_barplot() function can visualize the cell composition of self-defined groups. We can use it to show the purity of MICA clusters: ## Violin plot of marker genes across clusters draw_barplot(input_eset = pbmc14k_log2cpm.eset, group_by = "cell_type", color_by = "trueLabel_full", xlabel.angle = 45) Don’t forget to save the SparseEset object after the cell type annotation added. saveRDS(pbmc14k_log2cpm.eset, file = "/your-path/PBMC14k/DATA/pbmc14k_log2CPM_annotated.rds") "],["network-inference.html", "Chapter 9 Network inference 9.1 Generate SJARACNe input files 9.2 Run SJARACNe 9.3 Assess the quality of networks", " Chapter 9 Network inference SJARACNe is a scalable software tool for gene network reverse engineering from big data. As an improved implementation of the ARACNe, SJARACNe achieves a dramatic improvement in computational performance in both time and memory usage and implements new features while preserving the network inference accuracy of the original algorithm. Similar to MICA, SJARACNe is also a component of scMINER framework, and can work seamlessly with the SparseExpressionSet object. The input for MICA can be easily generated from the SparseExpressionSet object by generateSJARACNeInput(), and the output of SJARACNe, the networks, can be effortlessly assessed by drawNetworkQC() and directly taken for driver activity estimation by getActivity_individual() and getActivity_inBatch(). 9.1 Generate SJARACNe input files The network inference is usually conducted in a cluster- or cell type-specific basis. Given the column names for grouping, generateSJARACNeInput() will create a folder for each group named by the group label. IMPORTANT NOTE: Any illegal characters in path in group labels may cause issues in subsequent analysis. To avoid it, scMINER only accept letters(A-Za-z), numbers(0-9), underscores(’_‘) and periods(’.’). In this case, the true labels of cell type are available, so we use them to define the groups for network inference. ## Columns with any illegal characters can not be used for groupping generateSJARACNeInput(input_eset = pbmc14k_log2cpm.eset, group_name = "trueLabel", sjaracne_dir = "/work-path/PBMC14k/SJARACNe", species_type = "hg", driver_type = "TF_SIG", downSample_N = NULL) For big datasets, generateSJARACNeInput() provides an argument, downSample_N, to allow users to down sample size of each group. The default value of downSample_N is 1,000, any group with >= 1,000 cells will be down-sample to 1,000. ## one folder for each group list.dirs(system.file("extdata/demo_pbmc14k/PBMC14k/SJARACNe", package = "scMINER"), full.names = FALSE, recursive = FALSE) ## [1] "B" "CD4TCM" "CD4TN" "CD4Treg" "CD8TN" "Monocyte" "NK" ## file structure of each folder list.files(system.file("extdata/demo_pbmc14k/PBMC14k/SJARACNe/B", package = "scMINER"), full.names = FALSE, recursive = TRUE, include.dirs = FALSE, pattern = "[^consensus_network_ncol_.txt]") ## [1] "B.8572_1902.exp.txt" "config_cwlexec.json" ## [3] "runSJARACNe.sh" "SIG/B.4148_1902.sig.txt" ## [5] "TF/B.835_1902.tf.txt" The standard input files of SJARACNe, for each group, include: a “.exp.txt” file: a tab-separated genes/transcripts/proteins by cells/samples expression matrix with the first two columns being ID and symbol. a “TF” folder containing a “.tf.txt” file: a list of significant gene/transcript/protein IDs of TF drivers. a “SIG” folder containing a “.sig.txt” file: a list of significant gene/transcript/protein IDs of SIG drivers. a bash script (runSJARACNe.sh) to run SJARACNe. Further modification is needed to run it. a json file (config_cwlexec.json) containing parameters to run SJARACNe. Usually, the ground truth of cell types is not available. Then the cluster labels, or cell type annotations of the clusters, can be used for grouping in network rewiring, since it’s expected that cells with same cluster label/annotated cell type are of similar gene expression profiles. To generate from annotated cell types, you can run: generateSJARACNeInput(input_eset = pbmc14k_log2cpm.eset, group_name = "cell_type", sjaracne_dir = "/work-path/PBMC14k/SJARACNe/bycelltype", species_type = "hg", driver_type = "TF_SIG") 9.2 Run SJARACNe By default, the generateSJARACNeInput() function also generates a runSJARACNe.sh file in the folder of each group. This file much be modified before you can run it: removed unneeded lines: There are usually 4 lines in this file: the lines starting with “sjaracne lsf” are the command lines to run on IBM LSF cluster, while the lines starting with “sjaracne local” are the command lines runing on a single machine (Linux/OSX). Please select the lines based on your situation and remove the others. -n: number of bootstrap networks to generate. Default: 100. -pc: p value threshold to select edges in building consensus network. Default: e-2 for single-cell data, e-3 for meta-cell data, and e-5 for bulk sample data. Please use “sjaracne lsf -h” or “sjaracne local -h” to check more details of arguments available in SJARACNe. There is another file, config_cwlexec.json, available in the folder. It contains the information (e.g. memory request for each step of SJARACNe run) used for LSF job submission. This file is only needed for LSF runs and the default values works well in most cases. If you are running SJARACNe on a big dataset, you may need to request more memory from it. In this case, we use LSF to run the SJARACNe: ## let's use B cell as an example # for TF sjaracne lsf -e /work-path/PBMC14k/SJARACNe/B/B.8572_1902.exp.txt -g /work-path/PBMC14k/SJARACNe/B/TF/B.835_1902.tf.txt -o /work-path/PBMC14k/SJARACNe/B/TF/bt100_pc001 -n 100 -pc 0.01 -j /work-path/PBMC14k/SJARACNe/B/config_cwlexec.json # for SIG sjaracne lsf -e /work-path/PBMC14k/SJARACNe/B/B.8572_1902.exp.txt -g /work-path/PBMC14k/SJARACNe/B/SIG/B.4148_1902.sig.txt -o /work-path/PBMC14k/SJARACNe/B/SIG/bt100_pc001 -n 100 -pc 0.01 -j /work-path/PBMC14k/SJARACNe/B/config_cwlexec.json We created a folder named “bt100_pc001” in both TF and SIG folders of each group, to save the networks generated under 100 bootstraps (-n 100) and 0.01 consensus p value (-pc 0.01). To run SJARACNe on a local machine: ## let's use B cell as an example # for TF sjaracne local -e /work-path/PBMC14k/SJARACNe/B/B.8572_1902.exp.txt -g /work-path/PBMC14k/SJARACNe/B/TF/B.835_1902.tf.txt -o /work-path/PBMC14k/SJARACNe/B/TF/bt100_pc001 -n 100 -pc 0.01 # for SIG sjaracne local -e /work-path/PBMC14k/SJARACNe/B/B.8572_1902.exp.txt -g /work-path/PBMC14k/SJARACNe/B/SIG/B.4148_1902.sig.txt -o /work-path/PBMC14k/SJARACNe/B/SIG/bt100_pc001 -n 100 -pc 0.01 9.3 Assess the quality of networks 9.3.1 Introduction to the network file by SJARACNe The core output of SJARACNe is the network file named consensus_network_ncol_.txt. It contains 9 columns: source: ID of the source gene, can be the gene symbol; target: ID of the target gene, can be the gene symbol; source.symbol: symbol of the source gene; target.symbol: symbol of the target gene; MI: mutual information of source-gene pair; pearson: Pearson correlation coefficient, [-1,1] pearson: Spearman correlation coefficient, [-1,1] slope: slop of the regression line, returned by stats.linregression() p.value: p-value for a hypothesis test whose null hypothesis is that the slope is zero, using Wald Test with t-distribution of the test statistic network_format <- read.table(system.file("extdata/demo_pbmc14k/PBMC14k/SJARACNe/B/TF/bt100_pc001/consensus_network_ncol_.txt", package = "scMINER"), header = T, sep = "\\t", quote = "", stringsAsFactors = F) head(network_format) ## source target source.symbol target.symbol MI pearson spearman slope ## 1 AATF ACBD3 AATF ACBD3 0.0509 -0.0310 -0.0311 -0.0193 ## 2 AATF ADD3 AATF ADD3 0.0486 0.0228 0.0258 0.0307 ## 3 AATF AES AATF AES 0.0511 0.0311 0.0289 0.0668 ## 4 AATF AKR7A2 AATF AKR7A2 0.0498 0.0319 0.0366 0.0421 ## 5 AATF AL928768.3 AATF AL928768.3 0.0447 0.0247 0.0293 0.0335 ## 6 AATF ALG8 AATF ALG8 0.0479 0.0358 0.0373 0.0234 ## p.value ## 1 0.1761 ## 2 0.3204 ## 3 0.1756 ## 4 0.1646 ## 5 0.2815 ## 6 0.1183 9.3.2 Generate network QC report There is no simple standards to tell the reliability of networks. Empirically, a network with 50-300 target size is good. scMINER provides a function, drawNetworkQC(), to quickly assess the quality of networks in batch. network_stats <- drawNetworkQC(sjaracne_dir = "/work-path/PBMC14K/SJARACNe", generate_html = FALSE) # Set `generate_html = TRUE` to generate html-format QC report for each network file ## The network QC statistics table is saved seperately, for demonstration purposes. network_stats <- readRDS(system.file("extdata/demo_pbmc14k/PBMC14k/SJARACNe/network_stats.rds", package = "scMINER")) head(network_stats) ## network_tag network_node network_edge driver_count targetSize_mean ## 1 B.SIG.bt100_pc001 8572 391889 4148 94.47662 ## 2 B.TF.bt100_pc001 8572 95341 835 114.18084 ## 3 CD4TCM.SIG.bt100_pc001 8660 382153 4209 90.79425 ## 4 CD4TCM.TF.bt100_pc001 8660 94319 838 112.55251 ## 5 CD4TN.SIG.bt100_pc001 8612 401658 4180 96.09043 ## 6 CD4TN.TF.bt100_pc001 8612 95152 831 114.50301 ## targetSize_median targetSize_minimum targetSize_maximum ## 1 94.0 33 396 ## 2 96.0 64 913 ## 3 91.0 31 281 ## 4 95.5 60 689 ## 5 95.0 43 303 ## 6 99.0 64 743 ## network_path ## 1 /Volumes/projects/scRNASeq/yu3grp/scMINER/NG_Revision/QPan/scminer_R/Datasets/PBMC14K/SJARACNe/B/SIG/bt100_pc001/sjaracne_workflow-df798096-8dee-4baf-8f70-891c689dc769/consensus_network_ncol_.txt ## 2 /Volumes/projects/scRNASeq/yu3grp/scMINER/NG_Revision/QPan/scminer_R/Datasets/PBMC14K/SJARACNe/B/TF/bt100_pc001/sjaracne_workflow-fb2a69b9-f98e-47ff-87a0-6d538822fc6e/consensus_network_ncol_.txt ## 3 /Volumes/projects/scRNASeq/yu3grp/scMINER/NG_Revision/QPan/scminer_R/Datasets/PBMC14K/SJARACNe/CD4TCM/SIG/bt100_pc001/sjaracne_workflow-424f1068-13d1-4f0e-9c26-56acd9a2027c/consensus_network_ncol_.txt ## 4 /Volumes/projects/scRNASeq/yu3grp/scMINER/NG_Revision/QPan/scminer_R/Datasets/PBMC14K/SJARACNe/CD4TCM/TF/bt100_pc001/sjaracne_workflow-52b3cdf5-5914-4c8c-a77a-05f17c755d83/consensus_network_ncol_.txt ## 5 /Volumes/projects/scRNASeq/yu3grp/scMINER/NG_Revision/QPan/scminer_R/Datasets/PBMC14K/SJARACNe/CD4TN/SIG/bt100_pc001/sjaracne_workflow-7b5bb68e-1de5-4d0e-80ec-8d8aa037866f/consensus_network_ncol_.txt ## 6 /Volumes/projects/scRNASeq/yu3grp/scMINER/NG_Revision/QPan/scminer_R/Datasets/PBMC14K/SJARACNe/CD4TN/TF/bt100_pc001/sjaracne_workflow-89716541-eb53-435c-8a45-bab63d6b5198/consensus_network_ncol_.txt "],["actvity-based-analysis.html", "Chapter 10 Actvity-based analysis 10.1 Calculate the activities 10.2 Differential activity analysis", " Chapter 10 Actvity-based analysis The driver activity estimation is one of the most important features of scMINER. Mathematically, the activity of one driver is a type of mean of the expressions of its targets. And biologically, the activity can be interpreted as a measure that describes how actively the driver functions, like the enzymes in digesting their subtracts, kinase in activating their downstream genes. Given the gene expression profiles and networks, scMINER can estimate the activities of some predefined drivers, including not only transcription factors (TFs) but also signaling genes (SIGs). scMINER provides a few functions to effortlessly calculate the activities, identify the hidden drivers and visualize them in multiple ways. 10.1 Calculate the activities scMINER provides two functions, getActivity_individual() and getActivity_inBatch(), to calculate the driver activities. 10.1.1 Calculate activities per group getActivity_individual() is designed to calculate the activities per group. It takes the network files as the input: ## let's use B cell as an example activity_B.eset <- getActivity_individual(input_eset = pbmc14k_log2cpm.eset[, pData(pbmc14k_log2cpm.eset)$trueLabel == "B"], network_file.tf = system.file("extdata/demo_pbmc14k/PBMC14k/SJARACNe/B/TF/bt100_pc001/consensus_network_ncol_.txt", package = "scMINER"), network_file.sig = system.file("extdata/demo_pbmc14k/PBMC14k/SJARACNe/B/SIG/bt100_pc001/consensus_network_ncol_.txt", package = "scMINER"), driver_type = "TF_SIG") 10.1.2 Calculate activities in batch If you need to calculate the activity for multiple groups, this is usually the case, you can do it using getActivity_individual() as shown above one by one and merge the esets after that. Or, scMINER privides another function, getActivity_inBatch(), to calculate the activity in batch: ## let's use B cell as an example activity.eset <- getActivity_inBatch(input_eset = pbmc14k_log2cpm.eset, sjaracne_dir = system.file("extdata/demo_pbmc14k/PBMC14k/SJARACNe", package = "scMINER"), group_name = "trueLabel", driver_type = "TF_SIG", activity_method = "mean", do.z_normalization = TRUE) ## 7 groups were found in trueLabel ... ## Checking network files for each group ... ## Group 1 / 7 : Monocyte ... ## TF network check passed! ## SIG network check passed! ## Group 2 / 7 : B ... ## TF network check passed! ## SIG network check passed! ## Group 3 / 7 : CD4Treg ... ## TF network check passed! ## SIG network check passed! ## Group 4 / 7 : CD4TN ... ## TF network check passed! ## SIG network check passed! ## Group 5 / 7 : CD4TCM ... ## TF network check passed! ## SIG network check passed! ## Group 6 / 7 : NK ... ## TF network check passed! ## SIG network check passed! ## Group 7 / 7 : CD8TN ... ## TF network check passed! ## SIG network check passed! ## Calculating activity for each group ... ## Group 1 / 7 : Monocyte ... ## Activity calculation is completed successfully! ## Group 2 / 7 : B ... ## Activity calculation is completed successfully! ## Group 3 / 7 : CD4Treg ... ## Activity calculation is completed successfully! ## Group 4 / 7 : CD4TN ... ## Activity calculation is completed successfully! ## Group 5 / 7 : CD4TCM ... ## Activity calculation is completed successfully! ## Group 6 / 7 : NK ... ## Activity calculation is completed successfully! ## Group 7 / 7 : CD8TN ... ## Activity calculation is completed successfully! ## NAs were found in the activity matrix and have been replaced by the minimum value: -0.3968794 . 10.1.3 Save activity eset object saveRDS(activity.eset, file = "/your-path/PBMC14k/DATA/activity.eset") 10.2 Differential activity analysis Similar to getDE(), scMINER provides a function, getDA(), to perform the differential activity analysis and identify the group-specific drivers. ## 1. To perform differential expression analysis in a 1-vs-rest manner for all groups da_res1 <- getDA(input_eset = activity.eset, group_by = "cell_type", use_method = "t.test") ## 7 groups were found in group_by column [ cell_type ]. ## Since no group was specified, the differential analysis will be conducted among all groups in the group_by column [ cell_type ] in the 1-vs-rest manner. ## 1 / 7 : group 1 ( B ) vs the rest... ## 1912 cells were found for g1. ## 11693 cells were found for g0. ## 2 / 7 : group 1 ( CD4TCM ) vs the rest... ## 2022 cells were found for g1. ## 11583 cells were found for g0. ## 3 / 7 : group 1 ( CD4TN ) vs the rest... ## 2505 cells were found for g1. ## 11100 cells were found for g0. ## 4 / 7 : group 1 ( CD4Treg ) vs the rest... ## 1448 cells were found for g1. ## 12157 cells were found for g0. ## 5 / 7 : group 1 ( CD8TN ) vs the rest... ## 2014 cells were found for g1. ## 11591 cells were found for g0. ## 6 / 7 : group 1 ( Monocyte ) vs the rest... ## 1786 cells were found for g1. ## 11819 cells were found for g0. ## 7 / 7 : group 1 ( NK ) vs the rest... ## 1918 cells were found for g1. ## 11687 cells were found for g0. head(da_res1) ## feature g1_tag g0_tag g1_avg ## 4 AASDH_SIG B CD4TCM,CD4TN,CD4Treg,CD8TN,Monocyte,NK -0.008071658 ## 6 AATF_SIG B CD4TCM,CD4TN,CD4Treg,CD8TN,Monocyte,NK -0.051767485 ## 12 ABCB8_SIG B CD4TCM,CD4TN,CD4Treg,CD8TN,Monocyte,NK -0.077615607 ## 8 ABCA2_SIG B CD4TCM,CD4TN,CD4Treg,CD8TN,Monocyte,NK -0.081643603 ## 10 ABCB1_SIG B CD4TCM,CD4TN,CD4Treg,CD8TN,Monocyte,NK -0.134357577 ## 3 AARSD1_SIG B CD4TCM,CD4TN,CD4Treg,CD8TN,Monocyte,NK -0.126010447 ## g0_avg g1_pct g0_pct log2FC Pval FDR ## 4 -0.1025141 0.43043933 0.13991277 0.094442475 2.225074e-308 0.000000e+00 ## 6 -0.1084165 0.21652720 0.08757376 0.056649005 3.918924e-189 5.878386e-189 ## 12 -0.1094585 0.35251046 0.14153767 0.031842866 3.623209e-12 3.952592e-12 ## 8 -0.1101867 0.10198745 0.15676045 0.028543082 1.914570e-58 2.418404e-58 ## 10 -0.1559384 0.04393305 0.06114770 0.021580866 8.079661e-27 9.233898e-27 ## 3 -0.1225746 0.04184100 0.08192936 -0.003435892 4.213744e-02 4.213744e-02 ## Zscore ## 4 37.53784 ## 6 29.33316 ## 12 6.95115 ## 8 16.11775 ## 10 10.72137 ## 3 -2.03216 ## 2. To perform differential expression analysis in a 1-vs-rest manner for one specific group da_res2 <- getDA(input_eset = activity.eset, group_by = "cell_type", g1 = c("B"), use_method = "t.test") ## 3. To perform differential expression analysis in a rest-vs-1 manner for one specific group da_res3 <- getDA(input_eset = activity.eset, group_by = "cell_type", g0 = c("B"), use_method = "t.test") ## 4. To perform differential expression analysis in a 1-vs-1 manner for any two groups da_res4 <- getDA(input_eset = activity.eset, group_by = "cell_type", g1 = c("CD4Treg"), g0 = c("CD4TCM"), use_method = "t.test") The getTopFeatures() function can aslo be used to easily extract the group-specific markers from the differential expression result: top_drivers <- getTopFeatures(input_table = da_res1, number = 10, group_by = "g1_tag", sort_by = "log2FC", sort_decreasing = TRUE) dim(top_drivers) ## [1] 16 11 head(top_drivers) ## feature g1_tag g0_tag g1_avg ## 4 AASDH_SIG B CD4TCM,CD4TN,CD4Treg,CD8TN,Monocyte,NK -0.008071658 ## 6 AATF_SIG B CD4TCM,CD4TN,CD4Treg,CD8TN,Monocyte,NK -0.051767485 ## 12 ABCB8_SIG B CD4TCM,CD4TN,CD4Treg,CD8TN,Monocyte,NK -0.077615607 ## 8 ABCA2_SIG B CD4TCM,CD4TN,CD4Treg,CD8TN,Monocyte,NK -0.081643603 ## 10 ABCB1_SIG B CD4TCM,CD4TN,CD4Treg,CD8TN,Monocyte,NK -0.134357577 ## 3 AARSD1_SIG B CD4TCM,CD4TN,CD4Treg,CD8TN,Monocyte,NK -0.126010447 ## g0_avg g1_pct g0_pct log2FC Pval FDR ## 4 -0.1025141 0.43043933 0.13991277 0.094442475 2.225074e-308 0.000000e+00 ## 6 -0.1084165 0.21652720 0.08757376 0.056649005 3.918924e-189 5.878386e-189 ## 12 -0.1094585 0.35251046 0.14153767 0.031842866 3.623209e-12 3.952592e-12 ## 8 -0.1101867 0.10198745 0.15676045 0.028543082 1.914570e-58 2.418404e-58 ## 10 -0.1559384 0.04393305 0.06114770 0.021580866 8.079661e-27 9.233898e-27 ## 3 -0.1225746 0.04184100 0.08192936 -0.003435892 4.213744e-02 4.213744e-02 ## Zscore ## 4 37.53784 ## 6 29.33316 ## 12 6.95115 ## 8 16.11775 ## 10 10.72137 ## 3 -2.03216 "],["session-information.html", "Chapter 11 Session information", " Chapter 11 Session information sessioninfo::session_info() ## ─ Session info ─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── ## setting value ## version R version 4.3.3 (2024-02-29) ## os macOS Sonoma 14.5 ## system aarch64, darwin20 ## ui X11 ## language (EN) ## collate en_US.UTF-8 ## ctype en_US.UTF-8 ## tz America/Chicago ## date 2024-08-04 ## pandoc 3.1.11 @ /Applications/RStudio.app/Contents/Resources/app/quarto/bin/tools/aarch64/ (via rmarkdown) ## ## ─ Packages ─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── ## package * version date (UTC) lib source ## anndata * 0.7.5.6 2023-03-17 [1] CRAN (R 4.3.0) ## assertthat 0.2.1 2019-03-21 [1] CRAN (R 4.3.0) ## Biobase * 2.62.0 2023-10-26 [1] Bioconductor ## BiocGenerics * 0.48.1 2023-11-02 [1] Bioconductor ## bookdown 0.40 2024-07-02 [1] CRAN (R 4.3.3) ## bslib 0.7.0 2024-03-29 [1] CRAN (R 4.3.1) ## cachem 1.1.0 2024-05-16 [1] CRAN (R 4.3.3) ## cli 3.6.3 2024-06-21 [1] CRAN (R 4.3.3) ## colorspace 2.1-0 2023-01-23 [1] CRAN (R 4.3.0) ## digest 0.6.36 2024-06-23 [1] CRAN (R 4.3.3) ## dplyr * 1.1.4 2023-11-17 [1] CRAN (R 4.3.1) ## evaluate 0.24.0 2024-06-10 [1] CRAN (R 4.3.3) ## fansi 1.0.6 2023-12-08 [1] CRAN (R 4.3.1) ## farver 2.1.2 2024-05-13 [1] CRAN (R 4.3.3) ## fastmap 1.2.0 2024-05-15 [1] CRAN (R 4.3.3) ## generics 0.1.3 2022-07-05 [1] CRAN (R 4.3.0) ## ggplot2 * 3.5.1 2024-04-23 [1] CRAN (R 4.3.1) ## glue 1.7.0 2024-01-09 [1] CRAN (R 4.3.1) ## gridExtra 2.3 2017-09-09 [1] CRAN (R 4.3.0) ## gtable 0.3.5 2024-04-22 [1] CRAN (R 4.3.1) ## highr 0.11 2024-05-26 [1] CRAN (R 4.3.3) ## htmltools 0.5.8.1 2024-04-04 [1] CRAN (R 4.3.1) ## jquerylib 0.1.4 2021-04-26 [1] CRAN (R 4.3.0) ## jsonlite 1.8.8 2023-12-04 [1] CRAN (R 4.3.1) ## knitr 1.48 2024-07-07 [1] CRAN (R 4.3.3) ## labeling 0.4.3 2023-08-29 [1] CRAN (R 4.3.0) ## lattice 0.22-6 2024-03-20 [1] CRAN (R 4.3.1) ## lifecycle 1.0.4 2023-11-07 [1] CRAN (R 4.3.1) ## limma 3.58.1 2023-11-02 [1] Bioconductor ## magrittr 2.0.3 2022-03-30 [1] CRAN (R 4.3.0) ## Matrix * 1.6-5 2024-01-11 [1] CRAN (R 4.3.3) ## munsell 0.5.1 2024-04-01 [1] CRAN (R 4.3.1) ## pheatmap 1.0.12 2019-01-04 [1] CRAN (R 4.3.0) ## pillar 1.9.0 2023-03-22 [1] CRAN (R 4.3.0) ## pkgconfig 2.0.3 2019-09-22 [1] CRAN (R 4.3.0) ## plyr 1.8.9 2023-10-02 [1] CRAN (R 4.3.1) ## png 0.1-8 2022-11-29 [1] CRAN (R 4.3.0) ## R6 2.5.1 2021-08-19 [1] CRAN (R 4.3.0) ## RColorBrewer 1.1-3 2022-04-03 [1] CRAN (R 4.3.0) ## Rcpp 1.0.13 2024-07-17 [1] CRAN (R 4.3.3) ## reshape2 1.4.4 2020-04-09 [1] CRAN (R 4.3.0) ## reticulate 1.38.0.9000 2024-07-26 [1] Github (rstudio/reticulate@740169a) ## rlang 1.1.4 2024-06-04 [1] CRAN (R 4.3.3) ## rmarkdown 2.27 2024-05-17 [1] CRAN (R 4.3.3) ## rstudioapi 0.16.0 2024-03-24 [1] CRAN (R 4.3.1) ## sass 0.4.9 2024-03-15 [1] CRAN (R 4.3.1) ## scales 1.3.0 2023-11-28 [1] CRAN (R 4.3.1) ## scMINER * 1.1.0 2024-08-02 [1] Github (jyyulab/scMINER@79eb712) ## sessioninfo 1.2.2 2021-12-06 [1] CRAN (R 4.3.0) ## statmod 1.5.0 2023-01-06 [1] CRAN (R 4.3.0) ## stringi 1.8.4 2024-05-06 [1] CRAN (R 4.3.1) ## stringr 1.5.1 2023-11-14 [1] CRAN (R 4.3.1) ## tibble 3.2.1 2023-03-20 [1] CRAN (R 4.3.0) ## tidyselect 1.2.1 2024-03-11 [1] CRAN (R 4.3.1) ## utf8 1.2.4 2023-10-22 [1] CRAN (R 4.3.1) ## vctrs 0.6.5 2023-12-01 [1] CRAN (R 4.3.1) ## withr 3.0.0 2024-01-16 [1] CRAN (R 4.3.1) ## xfun 0.46 2024-07-18 [1] CRAN (R 4.3.3) ## yaml 2.3.9 2024-07-05 [1] CRAN (R 4.3.3) ## ## [1] /Library/Frameworks/R.framework/Versions/4.3-arm64/Resources/library ## ## ─ Python configuration ─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── ## python: /Users/qpan/.virtualenvs/r-reticulate/bin/python ## libpython: /Library/Developer/CommandLineTools/Library/Frameworks/Python3.framework/Versions/3.9/lib/python3.9/config-3.9-darwin/libpython3.9.dylib ## pythonhome: /Users/qpan/.virtualenvs/r-reticulate:/Users/qpan/.virtualenvs/r-reticulate ## version: 3.9.6 (default, Mar 29 2024, 10:51:09) [Clang 15.0.0 (clang-1500.3.9.4)] ## numpy: /Users/qpan/.virtualenvs/r-reticulate/lib/python3.9/site-packages/numpy ## numpy_version: 1.26.4 ## anndata: /Users/qpan/.virtualenvs/r-reticulate/lib/python3.9/site-packages/anndata ## ## NOTE: Python version was forced by VIRTUAL_ENV ## ## ────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── "],["404.html", "Page not found", " Page not found The page you requested cannot be found (perhaps it was moved or renamed). You may want to try searching to find the page's new location, or use the table of contents to find the page you are looking for. "]]
    +[["index.html", "scMINER: a mutual information-based framework for identifying hidden drivers from single-cell omics data ", " scMINER: a mutual information-based framework for identifying hidden drivers from single-cell omics data John Doe 2024-08-05 Welcome to scMINER documentation! scMINER (single-cell Mutual Information-based Network Engineering Ranger) is a computational framework designed for end-to-end analysis of single cell RNA-seq data. Using mutual information to measure cell-cell similarities and gene-gene correlations, scMINER is widely applicable and highly accurate in unsupervised clustering and gene activity inference of scRNA-seq data. In this documentation, we will walk you through every analysis that scMINER can do and introduce you more about the concepts related to scMINER framework. "],["introduction.html", "Chapter 1 Introduction 1.1 A few concepts 1.2 Why use scMINER 1.3 Citation 1.4 Support", " Chapter 1 Introduction This chapter will introduce some principal concepts and unique features of scMINER. 1.1 A few concepts There are a few concepts that may help you understand scMINER better. SparseEset The SparseExpressionSet (or SparseEset for short) is a new class created by scMINER to handle the sparsity in scRNA-seq data. It is derived from ExpressionSet, and enables to compress, store and access efficiently and conveniently. The SparseEset object is the center of scRNA-seq data analysis by scMINER. Mutual Information Mutual information is a measure of the mutual dependence between two random variables. It quantifies the amount of information obtained about one variable through the other variable. In other words, it measures how much knowing the value of one variable reduces uncertainty about the value of the other variable. It’s widely used in probability theory and information theory. Compared with the linear correlation that used by most existing tools for scRNA-seq data clustering, mutual information provides a more general measure of dependence that can capture both linear and non-linear relationships, and hence may increases the accuracy and sensitivity of scRNA-seq data clustering. Comparison of Linear Correlation and Mutual Information (powered by ChatGPT) Linear Correlation Mutual Information Definition Measures linear relationship Measures mutual dependence (both linear and non-linear) Range -1 to 1 0 to Inf Sensitivity to outliers Sensitive Less sensitive Captures Non-linear Relationships No Yes Common Applications Regression, finance, science Feature selection, clustering, network inference Gene Activity The gene activity estimation is one of the most important features of scMINER. Mathematically, the activity of one gene is a type of mean of the expressions of its targets. And biologically, the activity can be interpreted as a measure that describes how actively the driver functions, like the enzymes in digesting their subtracts, kinase in activating their downstream genes. Given the gene expression profiles and networks, scMINER can estimate the activities of some predefined drivers, including not only transcription factors (TFs) but also signaling genes (SIGs). 1.2 Why use scMINER (more details to be added) scMINER includes the following key functions: Mutual information-based clustering: scMINER measures the cell-cell similarities with full feature-derived mutual information. It can catch both linear and non-linear correlations and performs better in cell clustering, especially for those of close states. Gene activity estimation: scMINER rewires the cell-type specific gene networks solely from the scRNA-seq data, and then estimates the gene activities of not only transcription factors (TFs) but also signaling genes (SIGs). The gene activity-based analysis can expose the main regulators of various biological activities, like cellular linage differentiation and tissue specificity. SparseEset-centered full-feature tool: scMINER provides a wide range of functions for data intake, quality control and filtration, MI-based clustering, network inference, gene activity estimation, cell type annotation, differential expression/activity analysis, and data visualization and sharing. Most of these functions are developed in an object-oriented manner for the SparseEset object. 1.3 Citation Please consider citing this paper if you find scMINER useful in your research. 1.4 Support We welcome your feedback! The scMINER software is developed and maintained by the Yu Lab @ St. Jude Children’s Research Hospital and is released under the Apache License (Version 2.0). Feel free to open an issue, or send us an email if you encounter a bug, need our help or just want to make a comment/suggestion. "],["get-started.html", "Chapter 2 Get started 2.1 Installation 2.2 Demo data 2.3 Create project space", " Chapter 2 Get started This chapter will walk you through how to install scMINER, create a project space for your study and prepare the demo data used in this tutorial. 2.1 Installation scMINER framework is mainly developed with R for its advantages in statistical analysis and data visualization. It also includes two components, MICA and SJARACNe, that are developed with Python to take its strengths in calculation speed and memory consumption, since mutual information estimation of large-scale scRNA-seq data is usually compute-intensive. Please install all three components for the full access to scMINER framework. Install scMINER R package The scMINER R package requires R 4.2.3 or newer, and can be installed from GitHub with: # install.packages("devtools") devtools::install_github("https://github.com/jyyulab/scMINER.git") Install MICA and SJARACNe The recommended method to install MICA and SJARACNe is to use conda dependency manager: ## setup conda env conda create -n scminer python=3.9.2 # Create a python virtual environment source activate scminer # Activate the virtual environment ## install MICA git clone https://github.com/jyyulab/MICA # Clone the MICA repo cd MICA # Switch to the MICA root directory pip install . # Install MICA and its dependencies mica -h # Check if MICA works ## install SJARACNE cd .. # Switch to conda env folder git clone https://github.com/jyyulab/SJARACNe.git # Clone the SJARACNe repo cd SJARACNe # Switch to the MICA root directory python setup.py build # Build SJARACNe binary python setup.py install # Build SJARACNe binary sjaracne -h # Check if SJARACNe works 2.2 Demo data In this tutorial, we will use a ground truth dataset called PBMC14k for demonstration purposes. It was generated from a Peripheral Blood Mononuclear Cells (PBMCs) dataset containing 10 known cell types, with 2,000 cells per type [Zheng et al., 2017]: We first rectified the gene symbol issues of the original dataset, including the dash-dot conversion (e.g. “RP11-34P13.7” changed to “RP11.34P13.7”) and “X” added to those started with numbers (e.g. “7SK” changed to “X7SK”), by referring to the gene annotation file (GRCh37.82) used in the original study. Then we removed 3 cell populations, CD34+ cells, CD4+ helper T cells, and total CD8+ cytotoxic cells, from the dataset because of either low sorting purity or a significant overlap with other immune cells based on the sorting strategy, and created a new dataset with seven known cell types and 14k cells in total. The original dataset is freely available under this accession number SRP073767 and Zenodo. How was the PBMC14K dataset generated from the original dataset? ## Step 1: rectify the invalid gene symbols # "Filtered_DownSampled_SortedPBMC_data.csv" is the raw count matrix directly downloaded from Zenodo counts <- read.csv("Filtered_DownSampled_SortedPBMC_data.csv", row.names = 1) d <- t(counts); dim(d) # it includes 21592 genes and 20000 cells # "genesymbol_from_GTF_GRCh37.txt" contains the official gene ids and symbols extracted from GTF file downloaded from officialGene <- read.table("genesymbol_from_GTF_GRCh37.txt", header = T, sep = "\\t", quote = "", stringsAsFactors = F); head(officialGene) https://ftp.ensembl.org/pub/grch37/current/gtf/homo_sapiens/ officialGene$dotted_symbol <- gsub("-", "\\\\.", officialGene$gene_name); officialGene$dotted_symbol <- make.unique(officialGene$dotted_symbol) table(row.names(d) %in% officialGene$dotted_symbol); row.names(d)[! row.names(d) %in% officialGene$dotted_symbol] # two genes are not in: X7SK.1 and X7SK.2 row.names(d) <- gsub("X7SK.1", "7SK", row.names(d)); row.names(d) <- gsub("X7SK.2", "7SK.1", row.names(d)) table(row.names(d) %in% officialGene$dotted_symbol) # all true row.names(officialGene) <- officialGene$dotted_symbol officialGene <- officialGene[row.names(d),] row.names(d) <- make.unique(officialGene$gene_name) # "Labels.csv" contains the true labels of cell types and was directly downloaded from Zenodo celltype <- read.csv("Labels.csv"); head(celltype); table(celltype$x) # 2000 cells for each of 10 cell types: CD14+ Monocyte, CD19+ B, CD34+, CD4+ T Helper2, CD4+/CD25 T Reg, CD4+/CD45RA+/CD25- Naive T, CD4+/CD45RO+ Memory, CD56+ NK, CD8+ Cytotoxic T, CD8+/CD45RA+ Naive Cytotoxic df <- data.frame(cell_barcode = colnames(d), trueLabel_full = celltype$x); dim(df) truelabel_map <- c(`CD14+ Monocyte`="Monocyte", `CD19+ B`="B", `CD34+`="CD34pos", `CD4+ T Helper2`="CD4Th2", `CD4+/CD25 T Reg`="CD4Treg", `CD4+/CD45RA+/CD25- Naive T`="CD4TN", `CD4+/CD45RO+ Memory`="CD4TCM", `CD56+ NK`="NK", `CD8+ Cytotoxic T`="CD8CTL", `CD8+/CD45RA+ Naive Cytotoxic`="CD8TN") df$trueLabel <- as.character(truelabel_map[df$trueLabel_full]) ## Step 2: extract 7 populations df.14k <- df[df$trueLabel_full %in% c("CD14+ Monocyte", "CD19+ B", "CD4+/CD25 T Reg", "CD4+/CD45RA+/CD25- Naive T", "CD4+/CD45RO+ Memory", "CD56+ NK", "CD8+/CD45RA+ Naive Cytotoxic"),] write.table(df.14k, file = "PBMC14k_trueLabel.txt", sep = "\\t", row.names = TRUE, col.names = TRUE, quote = FALSE, append = FALSE) d.14k <- d[,df.14k$cell_barcode] d.14k <- d.14k[rowSums(d.14k) > 0,] write.table(d.14k, file = "PBMC14k_rawCount.txt", sep = "\\t", row.names = FALSE, col.names = TRUE, quote = FALSE, append = FALSE) # 17986 genes, 14000 cells The PBMC14k dataset is embeded in scMINER R package and can be easily loaded by: library(scMINER) data("pbmc14k_rawCount") dim(pbmc14k_rawCount) ## [1] 17986 14000 pbmc14k_rawCount[1:5,1:5] ## 5 x 5 sparse Matrix of class "dgCMatrix" ## CACTTTGACGCAAT GTTACGGAAACGAA AGTCACGACAGGAG TTCGAGGACCAGTA ## AL627309.1 . . . . ## AP006222.2 . . . . ## RP11-206L10.3 . . . . ## RP11-206L10.2 . . . . ## RP11-206L10.9 . . . . ## CACTTATGAGTCGT ## AL627309.1 . ## AP006222.2 . ## RP11-206L10.3 . ## RP11-206L10.2 . ## RP11-206L10.9 . 2.3 Create project space The project space created by scMINER is a folder of specified name in specified directory. It contains 4 subfolders: DATA: to save the sparse eSet objects and other files; MICA: to save the inputs and outputs of mutual information-based clustering analysis; SJARACNe: to save the inputs and outputs of network inference and quality control; PLOT: to save the files of data visualization. The project space can not only keep your data centralized and organized, but also make the scMINER pipeline more smooth and robust. We strongly recommend you to create a project space for each of your studies. This can be easily done by createProjectSpace() in scMINER: scminer_dir <- createProjectSpace(project_dir = "/your-path", project_name = "PBMC14k") The command above will create a folder named PBMC14k in /your-path, and save the path to the project space (/your-path/PBMC14k) to scminer_dir. Can I add, delete or modify files in project space folder? Yes, you can. There are two functions, drawNetworkQC() and getActivityBatch(), that take directories as inputs, and both of them can validate the inputs. For all the rest functions, the inputs are specific files. So adding files in project space never affect the scMINER analysis. Deleting or modifying files in project spare is also safe. The input validation features of scMINER functions can help locate the files with issues. All output files of scMINER are reproducible and can be re-generated quickly. Just be careful with the clustering results in MICA and network files in SJARACNe, ad regerating them can take some time. "],["generate-gene-expresion-matrix.html", "Chapter 3 Generate gene expresion matrix 3.1 From data directory by 10x Genomics 3.2 From text-table file 3.3 From HDF5 file by 10x Genomics 3.4 From H5AD file", " Chapter 3 Generate gene expresion matrix In this chapter, we will generate the gene expression matrix, genes by cells, from multiple input formats commonly used in single-cell RNA sequencing, including sparse matrix by 10x Genomics, text-table file, HDF5 file by 10x Genomics and H5AD file. For demonstration purposes, scMINER embedded four datasets in extdata/demoData_readInput, with one for each input format. All of these four samples were generated by downsampling the real scRNA-seq data. 3.1 From data directory by 10x Genomics This is the most popular input format of scRNA-seq data generated by 10x Genomics. Usually, the data directory contains three files: matrix.mtx: a sparse matrix format containing the raw UMI count per cell-gene combination barcodes.tsv: a tab-separated matrix containing the cell barcodes features.tsv: a tab-separated matrix containing the features/genes and their annotations For more details about this format, please check out here. data_dir <- system.file("extdata/demo_inputs/cell_matrix_10x", package = "scMINER") list.files(path = data_dir, full.names = FALSE) ## [1] "barcodes.tsv.gz" "features.tsv.gz" "matrix.mtx.gz" demo1_mtx <- readInput_10x.dir(input_dir = data_dir, featureType = "gene_symbol", removeSuffix = TRUE, addPrefix = "demo1") ## Reading 10x Genomcis data from: /Library/Frameworks/R.framework/Versions/4.3-arm64/Resources/library/scMINER/extdata/demo_inputs/cell_matrix_10x ... ## Multiple data modalities were found: Gene Expression, Peaks . Only the gene expression data (under "Gene Expression") was kept. ## Done! The sparse gene expression matrix has been generated: 500 genes, 100 cells. demo1_mtx[1:5,1:5] ## 5 x 5 sparse Matrix of class "dgTMatrix" ## demo1_AAACAGCCAAACGGGC demo1_AAACAGCCAACTAGCC demo1_AAACAGCCAATTAGGA ## AL590822.3 . . . ## MORN1 . . . ## AL589739.1 . . . ## AL513477.2 . . . ## RER1 . . . ## demo1_AAACAGCCAGCCAGTT demo1_AAACATGCAAAGCTCC ## AL590822.3 . . ## MORN1 . . ## AL589739.1 . . ## AL513477.2 . . ## RER1 1 . The readInput_10x.dir() function can handle these conditions: Alternative file names for feature data: for the datasets generated by CellRanger 3.0 or earlier, the file name is genes.tsv; Compressed input files: one or more input files are compressed, usually in “.gz” format; Data with multiple modalities: like the single cell multiome data. In this case, readInput_10x.dir() only retains the data of “Gene Expression” by default. 3.2 From text-table file This is definitely the most compatible text format for scRNA-seq data. which can be used by all single-cell RNA-seq technologies, like 10x Genomics, Smart-Seq, Drop-Seq and more. The commonly used text table file formats include txt (text file format), csv (comma-separated values) and tsv (tab-separated values). table_file <- system.file("extdata/demo_inputs/table_file/demoData2.txt.gz", package = "scMINER") demo3_mtx <- readInput_table(table_file = table_file, sep = "\\t", is.geneBYcell = TRUE, removeSuffix = TRUE, addPrefix = "demo3") # set is.geneBYcell = FALSE to read features in columns and cell in in rows ## Reading table file: /Library/Frameworks/R.framework/Versions/4.3-arm64/Resources/library/scMINER/extdata/demo_inputs/table_file/demoData2.txt.gz ... ## Suffix removal was specified but skipped, since some barcodes do not carry "-1" suffix. ## Done! The sparse gene expression matrix has been generated: 1000 genes, 100 cells. NOTE: A major concern to read the gene expression matrix from text-table files is that the special characters in column names might change to dots (“.”), especially when the matrix is organized in cells by genes. This may cause failures in the identification of mitochondrial genes (usually defined by “MT-|mt-”) or spike-in RNAs (usually defined by “ERCC-|Ercc-”). The readInput_table() function has set check.names = FALSE to avoid this issue. However, if this issue already exists in the source data, you will have to fix it manually. 3.3 From HDF5 file by 10x Genomics This is another popular input format of scRNA-seq data generated by 10x Genomics. The Hierarchical Data Format version 5 (HDF5 or H5) is a binary format that can compress and access data much more efficiently than text formats. It’s super useful when dealing with large datasets. For more details about this format, please check out here. library(hdh5r) h5_file <- system.file("extdata/demo_inputs/hdf5_10x/demoData3.h5", package = "scMINER") demo2_mtx <- readInput_10x.h5(h5_file = h5_file, featureType = "gene_symbol", removeSuffix = TRUE, addPrefix = "demo2") NOTE: The readInput_10x.h5() function is developed exclusively for the HDF5 file generated by CellRanger of 10x Genomics. The HDF5 files from other source may have different hierarchical structures and can not be read by this function. 3.4 From H5AD file The H5AD file is another well-used format for scRNA-seq data. Derived from HDF5 file format, the H5AD file is designed to store large amounts of data efficiently and to facilitate fast access to subsets of the data. Now it’s getting more and more popular in scRNA-seq data analysis, visualization and sharing. For more details about this format, please check out here. library(anndata) h5ad_file <- system.file("extdata/demo_inputs/h5ad_file/demoData4.h5ad", package = "scMINER") demo4_obj <- readInput_h5ad(h5ad_file = h5ad_file, removeSuffix = TRUE, addPrefix = "demo4") # set is.geneBYcell = FALSE to read features in columns and cell in in rows ## Reading h5ad file: /Library/Frameworks/R.framework/Versions/4.3-arm64/Resources/library/scMINER/extdata/demo_inputs/h5ad_file/demoData4.h5ad ... ## Suffix removal was specified but skipped, since some barcodes do not carry "-1" suffix. ## Done! The sparse gene expression matrix has been generated: 1000 genes, 100 cells. NOTE: Unlike the other three readInput functions which return a gene expression matrx, the readInput_h5ad() returns an AnnData object. Here are the key components of an AnnData object: X: the primary data matrix, cells by genes; obs: observations (cells) metadata; var: variables (features/genes) metadata. "],["create-sparseeset-object.html", "Chapter 4 Create SparseEset object 4.1 Solely from the gene expression matrix 4.2 Using self-customized meta data 4.3 From multiple samples", " Chapter 4 Create SparseEset object In this chapter, we will introduce you how to create the parseExpressionSet(SparseEset) objects from gene expression matrix, genes by cells. 4.1 Solely from the gene expression matrix This is the most commonly used way to create the sparse eSet object with scMINER: pbmc14k_raw.eset <- createSparseEset(input_matrix = pbmc14k_rawCount, projectID = "PBMC14k", addMetaData = TRUE) ## Creating sparse eset from the input_matrix ... ## Adding meta data based on input_matrix ... ## Done! The sparse eset has been generated: 17986 genes, 14000 cells. pbmc14k_raw.eset ## SparseExpressionSet (storageMode: environment) ## assayData: 17986 features, 14000 samples ## element names: exprs ## protocolData: none ## phenoData ## sampleNames: CACTTTGACGCAAT GTTACGGAAACGAA ... ACGTGCCTTAAAGG (14000 ## total) ## varLabels: CellID projectID ... pctSpikeIn (6 total) ## varMetadata: labelDescription ## featureData ## featureNames: AL627309.1 AP006222.2 ... SRSF10.1 (17986 total) ## fvarLabels: GeneSymbol nCell ## fvarMetadata: labelDescription ## experimentData: use 'experimentData(object)' ## Annotation: input_matrix: it’s usually but not limited to a sparse matrix of raw UMI count. As for the data format, it accepts dgCMatrix, dgTMatrix, dgeMatrix, matrix, data.frame. As for the type of quantification measures, it takes raw counts, normalized counts (e.g. CPM or CP10k), TPM (Transcripts Per Million), FPKM/RPKM (Fragments/Reads Per Kilobase of transcript per Million) and others. What if a data frame object is given to it? When a non-matrix table is passed to input_matrix argument, the createSparseEset() function will automatically convert it to a matrix. And it the matrix, either converted from other format or directly passed from users, is not sparse. createSparseEset() will automatically convert it into sparse matrix, by default. This is controlled by another argument called do.sparseConversion, the default of which is TRUE. It’s not recommended but the users can set it as FALSE to disable the conversion. Then createSparseEset() will create the eSet based on the regular matrix. addMetaData: when this argument is set TRUE (this is the default), createSparseEset() will automatically generate 5 statistics, 4 for cells and 1 for features, and add them into the phenoData and featureData slots. These 5 statistics will be used in quality control and data filtration. ## check the phenoData: metadata of cells head(pData(pbmc14k_raw.eset)) ## CellID projectID nUMI nFeature pctMito pctSpikeIn ## CACTTTGACGCAAT CACTTTGACGCAAT PBMC14k 764 354 0.01832461 0 ## GTTACGGAAACGAA GTTACGGAAACGAA PBMC14k 956 442 0.01569038 0 ## AGTCACGACAGGAG AGTCACGACAGGAG PBMC14k 7940 2163 0.01977330 0 ## TTCGAGGACCAGTA TTCGAGGACCAGTA PBMC14k 4177 1277 0.01149150 0 ## CACTTATGAGTCGT CACTTATGAGTCGT PBMC14k 629 323 0.02066773 0 ## GCATGTGATTCTGT GCATGTGATTCTGT PBMC14k 875 427 0.02628571 0 ## check the featureData: metadata of features head(fData(pbmc14k_raw.eset)) ## GeneSymbol nCell ## AL627309.1 AL627309.1 50 ## AP006222.2 AP006222.2 2 ## RP11-206L10.3 RP11-206L10.3 1 ## RP11-206L10.2 RP11-206L10.2 33 ## RP11-206L10.9 RP11-206L10.9 17 ## LINC00115 LINC00115 115 4.2 Using self-customized meta data In some cases, you may have more meta data of either cells (e.g. sample id, treatment condition) or features (e.g. gene full name, gene type, genome location) which will be used in downstream analysis and you do want to add them into the sparse eSet object. the createSparseEset() function provides another two arguments, cellData and featureData, to take the self-customized meta data. For the PBMC14k dataset, we have the true labels of cell type and would like to add them to the sparse eSet object. ## read the true labels of cell type for PBMC14k dataset true_label <- read.table(system.file("extdata/demo_pbmc14k/PBMC14k_trueLabel.txt.gz", package = "scMINER"), header = T, row.names = 1, sep = "\\t", quote = "", stringsAsFactors = FALSE) head(true_label) ## trueLabel_full trueLabel ## CACTTTGACGCAAT CD14+ Monocyte Monocyte ## GTTACGGAAACGAA CD14+ Monocyte Monocyte ## AGTCACGACAGGAG CD14+ Monocyte Monocyte ## TTCGAGGACCAGTA CD14+ Monocyte Monocyte ## CACTTATGAGTCGT CD14+ Monocyte Monocyte ## GCATGTGATTCTGT CD14+ Monocyte Monocyte table(true_label$trueLabel_full) ## ## CD14+ Monocyte CD19+ B ## 2000 2000 ## CD4+/CD25 T Reg CD4+/CD45RA+/CD25- Naive T ## 2000 2000 ## CD4+/CD45RO+ Memory CD56+ NK ## 2000 2000 ## CD8+/CD45RA+ Naive Cytotoxic ## 2000 ## the true_label much cover all cells in the expression matrix table(colnames(pbmc14k_rawCount) %in% row.names(true_label)) ## ## TRUE ## 14000 ## create the sparse eSet object using the true_label pbmc14k_raw.eset <- createSparseEset(input_matrix = pbmc14k_rawCount, cellData = true_label, featureData = NULL, projectID = "PBMC14k", addMetaData = TRUE) ## Creating sparse eset from the input_matrix ... ## Adding meta data based on input_matrix ... ## Done! The sparse eset has been generated: 17986 genes, 14000 cells. ## check the true labels of cell type from sparse eSet object head(pData(pbmc14k_raw.eset)) ## trueLabel_full trueLabel projectID nUMI nFeature pctMito ## CACTTTGACGCAAT CD14+ Monocyte Monocyte PBMC14k 764 354 0.01832461 ## GTTACGGAAACGAA CD14+ Monocyte Monocyte PBMC14k 956 442 0.01569038 ## AGTCACGACAGGAG CD14+ Monocyte Monocyte PBMC14k 7940 2163 0.01977330 ## TTCGAGGACCAGTA CD14+ Monocyte Monocyte PBMC14k 4177 1277 0.01149150 ## CACTTATGAGTCGT CD14+ Monocyte Monocyte PBMC14k 629 323 0.02066773 ## GCATGTGATTCTGT CD14+ Monocyte Monocyte PBMC14k 875 427 0.02628571 ## pctSpikeIn CellID ## CACTTTGACGCAAT 0 CACTTTGACGCAAT ## GTTACGGAAACGAA 0 GTTACGGAAACGAA ## AGTCACGACAGGAG 0 AGTCACGACAGGAG ## TTCGAGGACCAGTA 0 TTCGAGGACCAGTA ## CACTTATGAGTCGT 0 CACTTATGAGTCGT ## GCATGTGATTCTGT 0 GCATGTGATTCTGT table(pData(pbmc14k_raw.eset)$trueLabel_full) ## ## CD14+ Monocyte CD19+ B ## 2000 2000 ## CD4+/CD25 T Reg CD4+/CD45RA+/CD25- Naive T ## 2000 2000 ## CD4+/CD45RO+ Memory CD56+ NK ## 2000 2000 ## CD8+/CD45RA+ Naive Cytotoxic ## 2000 4.3 From multiple samples What if you have multiple samples for one project? Now it’s pretty common to profile multiple samples of the same tissue but under different conditions (e.g. drug treatment) in one project. Analyzing these samples one by one is crucial, and analyzing them in a combined manner may give you more prospects. For this purpose, scMINER provides a function, combineSparseEset(), to easily combine the sparse eSet objects of multiple samples. ## create a sparse eSet object of each sample to combined demo1_mtx <- readInput_10x.dir(input_dir = system.file("extdata/demo_inputs/cell_matrix_10x", package = "scMINER"), featureType = "gene_symbol", removeSuffix = TRUE) ## Reading 10x Genomcis data from: /Library/Frameworks/R.framework/Versions/4.3-arm64/Resources/library/scMINER/extdata/demo_inputs/cell_matrix_10x ... ## Multiple data modalities were found: Gene Expression, Peaks . Only the gene expression data (under "Gene Expression") was kept. ## Done! The sparse gene expression matrix has been generated: 500 genes, 100 cells. demo1.eset <- createSparseEset(input_matrix = demo1_mtx, projectID = "demo1", addMetaData = TRUE) ## Creating sparse eset from the input_matrix ... ## Adding meta data based on input_matrix ... ## Done! The sparse eset has been generated: 500 genes, 100 cells. demo2_mtx <- readInput_table(table_file = system.file("extdata/demo_inputs/table_file/demoData2.txt.gz", package = "scMINER"), sep = "\\t", is.geneBYcell = TRUE, removeSuffix = TRUE) ## Reading table file: /Library/Frameworks/R.framework/Versions/4.3-arm64/Resources/library/scMINER/extdata/demo_inputs/table_file/demoData2.txt.gz ... ## Suffix removal was specified but skipped, since some barcodes do not carry "-1" suffix. ## Done! The sparse gene expression matrix has been generated: 1000 genes, 100 cells. demo2.eset <- createSparseEset(input_matrix = demo2_mtx, projectID = "demo2", addMetaData = TRUE) ## Creating sparse eset from the input_matrix ... ## Adding meta data based on input_matrix ... ## Done! The sparse eset has been generated: 1000 genes, 100 cells. ## combine the 4 sparse eSet objects library(dplyr) ## ## Attaching package: 'dplyr' ## The following object is masked from 'package:Biobase': ## ## combine ## The following objects are masked from 'package:BiocGenerics': ## ## combine, intersect, setdiff, union ## The following objects are masked from 'package:stats': ## ## filter, lag ## The following objects are masked from 'package:base': ## ## intersect, setdiff, setequal, union combined.eset <- combineSparseEset(eset_list = c(demo1.eset, demo2.eset), projectID = c("sample1", "sample2"), addPrefix = c("demo1", "demo2"), addSurfix = NULL, addMetaData = TRUE, imputeNA = TRUE) ## Combining the input sparse eSets ... ## NA values were found in the merged matrix and have been replaced by the minimum value: 0 . ## Adding meta data based on merged data matrix ... ## Done! The combined sparse eset has been generated: 1500 genes, 200 cells. dim(combined.eset) ## Features Samples ## 1500 200 A few questions you may have about the combineSparseEset() function: What if the input eSets have different features? combineSparseEset() ALWAYS keep all features from the input eSets, and generate NA values wherever the data is not available. By default, this function impute the NA values with the minimum value of the combined matrix, which is usually but not always zero. If this imputation method doesn’t fit your study, you can set imputeNA to FALSE to disable it. If so, the NAs will retain in the eSet object, and you can manually impute them with your own method. What if the input eSets have some same cell barcodes? combineSparseEset() ALWAYS keep all cells from the input eSets, and will report an error when same barcodes are found in different input eSets. This function provides two arguments, addPrefix and addSurfix, to solve this issue. You can easily avoid the same barcodes of different input eSets by adding a eSet-specific prefix and/or surfix to the barcodes. head(pData(combined.eset)) ## CellID projectID nUMI nFeature pctMito ## demo1_AAACAGCCAAACGGGC demo1_AAACAGCCAAACGGGC sample1 119 43 0 ## demo1_AAACAGCCAACTAGCC demo1_AAACAGCCAACTAGCC sample1 55 28 0 ## demo1_AAACAGCCAATTAGGA demo1_AAACAGCCAATTAGGA sample1 45 20 0 ## demo1_AAACAGCCAGCCAGTT demo1_AAACAGCCAGCCAGTT sample1 175 44 0 ## demo1_AAACATGCAAAGCTCC demo1_AAACATGCAAAGCTCC sample1 51 31 0 ## demo1_AAACATGCAATAGCCC demo1_AAACATGCAATAGCCC sample1 121 44 0 ## pctSpikeIn ## demo1_AAACAGCCAAACGGGC 0 ## demo1_AAACAGCCAACTAGCC 0 ## demo1_AAACAGCCAATTAGGA 0 ## demo1_AAACAGCCAGCCAGTT 0 ## demo1_AAACATGCAAAGCTCC 0 ## demo1_AAACATGCAATAGCCC 0 I have some customized column in the phenoData and/or featureData slots. How does combineSparseEset() handle them? combineSparseEset() only keep the columns of phenoData and featureData that shared by all input eSets. Your customized columns would be kept only when they are available in all input eSets. Are the 5 meta data statistics in the combined eSet still same with those generated in each eSet? No. By default, combineSparseEset() will update (add, if they are not available in input eSets) these 5 meta data statistics based on the combined matrix. It’s not recommended but you can disable it by setting addMataData to FALSE. "],["data-filtration.html", "Chapter 5 Data filtration 5.1 QC report 5.2 Filter the sparse eset object", " Chapter 5 Data filtration In this chapter, we will introduce you how scMINER assess the scRNA-seq data quality, estimate the cutoffs for data filtration, and remove the cells and features of low quality from the SparseEset object. ## QC metrics in scMINER As we mentioned before, scMINER can automatically generate 5 meta data statistics and add them to the SparseEset object. These 5 meta data statistics are the metrics scMINER uses to assess the quality of cells and features: For cell quality assessment, scMINER provides 4 metrics that commonly used by the community: nUMI: number of total UMIs in each cell. Cells with abnormally high nUMI usually indicate doublets, while those with abnormally low nUMI usually indicate poorly sequenced cells or empty droplets. nFeature: number of expressed features/genes in each cell. Similar to nUMI. pctMito: percentage of UMIs of mitochondrial genes (defined by “mt-|MT-”) in each cell. Cells with aberrantly high pctMito usually indicate dying cells. pctSpikeIn: percentage of UMIs of spike-in RNAs (defined by “ERCC-|Ercc-”)) in each cell. This is used to estimate the normalization factor. Cells with extremely high or low pctSpikeIn need to be removed. For feature quality assessment, scMINER provides one metrics: nCell: number of cells expressing the features/genes. Genes with extremely low nCell are poorly sequenced and are usually of low variance. 5.1 QC report To help assess the data quality and determine the cutoffs for data filtration, scMINER provides a function, drawSparseEsetQC(), to generate a html-format quality control report: ## To generate the drawSparseEsetQC(input_eset = pbmc14k_raw.eset, output_html_file = "/your-path/PBMC14k/PLOT/pbmc14k_rawCount.html", overwrite = TRUE) ## scMINER supports group-specific QC highlights drawSparseEsetQC(input_eset = pbmc14k_raw.eset, output_html_file = "/your-path/PBMC14k/PLOT/pbmc14k_rawCount.html", overwrite = FALSE, group_by = "true_label") The quality control report consists of 4 parts: Key Statistics: it highlights 5 key statistics of given eset object, including number of cells, number of genes, mean of genes per cell, mean of UMIs per cell and mean of cells per gene. Detailed statistics of key metrics: it summarizes and visualizes the detailed statistics of 5 key metrics that scMINER uses for filtration: nUMI, nFeature, pctMito, pctSpikeIn, nCell. Detailed statistics per cell and gene: it lists the detailed statistics of each gene and cell. Filtration cutoffs by scMINER: it provides the cutoffs estimated automatically by scMINER based on Median ± 3 * MAD (maximum absolute deviance), and the pseudo-filtration statistics on both genes and cells with these cutoffs. 5.2 Filter the sparse eset object From the quality control report generated above, we have got a better sense about the data quality and the cutoffs to use for filtration. scMINER provides a function, filterSparseEset() for this purpose, and it can work in two modes: auto: in this mode, scMINER will use the cutoffs estimated by Median ± 3*MAD (maximum absolute deviation). Based on our tests, in most cases, this mode works well with the matrix of both raw UMI counts and TPM values. manual: in this mode, the users can manually specify the cutoffs, both low and high, of all 5 metrics: nUMI, nFeature, pctMito, pctSpikeIn for cells, and nCell for genes. No cells or features would be removed under the default cutoffs of each metrics. No matter which mode to use, filterSparseEset() returns a summary table with detailed information of filtration statistics. You can refer to it and adjust the cutoffs accordingly. 5.2.1 Data filtration with auto mode To conduct the filtering using the cutoffs recommended by scMINER: ## Filter eSet under the auto mode pbmc14k_filtered.eset <- filterSparseEset(pbmc14k_raw.eset, filter_mode = "auto", filter_type = "both") ## Checking the availability of the 5 metrics ('nCell', 'nUMI', 'nFeature', 'pctMito', 'pctSpikeIn') used for filtration ... ## Checking passed! All 5 metrics are available. ## Filtration is done! ## Filtration Summary: ## 8846/17986 genes passed! ## 13605/14000 cells passed! ## ## For more details: ## Gene filtration statistics: ## Metrics nCell ## Cutoff_Low 70 ## Cutoff_High Inf ## Gene_total 17986 ## Gene_passed 8846(49.18%) ## Gene_failed 9140(50.82%) ## ## Cell filtration statistics: ## Metrics nUMI nFeature pctMito pctSpikeIn Combined ## Cutoff_Low 458 221 0 0 NA ## Cutoff_High 3694 Inf 0.0408 0.0000 NA ## Cell_total 14000 14000 14000 14000 14000 ## Cell_passed 13826(98.76%) 14000(100.00%) 13778(98.41%) 14000(100.00%) 13605(97.18%) ## Cell_failed 174(1.24%) 0(0.00%) 222(1.59%) 0(0.00%) 395(2.82%) In some cases, you may find that most of the cutoffs generated by the auto mode are good, except one or two. Though there is no ‘hybrid’ mode, scMINER does allow you to customize some of the cutoffs generated by the auto mode. This can be easily done by adding the cutoffs you would customize under the auto mode: ## Filter eSet under the auto mode, with customized values pbmc14k_filtered.eset <- filterSparseEset(pbmc14k_raw.eset, filter_mode = "auto", filter_type = "both", gene.nCell_min = 5) With the code above, scMINER will filter the eSet using all of the cutoffs generated by auto mode, except gene.nCell_min. 5.2.2 Data filtration with manual mode To apply the self-customized cutoffs: ## Filter eSet under the manual mode pbmc14k_filtered.eset <- filterSparseEset(pbmc14k_raw.eset, filter_mode = "manual", filter_type = "both", gene.nCell_min = 10, cell.nUMI_min = 500, cell.nUMI_max = 6500, cell.nFeature_min = 200, cell.nFeature_max = 2500, cell.pctMito_max = 0.1) For any unspecified cutoff arguments, like gene.nCell_max, filterSparseEset() will automatically assign the default values to them. The default values of any cutoff argument would not filter out any cells or features. So, if you want to skip some metrics, just leave the cutoffs of them unspecified. For example, in the codes above, gene.nCell_max is unspecified. Then filterSparseEset() wil assign the default value, which is Inf, to it. No features would be filtered out by this argument. "],["data-normalization.html", "Chapter 6 Data normalization", " Chapter 6 Data normalization In this chapter, we will introduce the method of data normalization in scMINER. We recommend to use log2CPM method for normalization: the raw counts in each cell are normalized to a library size of 1 million, followed by log2 transformation. pbmc14k_log2cpm.eset <- normalizeSparseEset(pbmc14k_filtered.eset, scale_factor = 1000000, log_base = 2, log_pseudoCount = 1) ## Done! The data matrix of eset has been normalized and log-transformed! ## The returned eset contains: 8846 genes, 13605 cells. exprs(pbmc14k_log2cpm.eset)[1:5,1:5] ## 5 x 5 Matrix of class "dgeMatrix" ## CACTTTGACGCAAT GTTACGGAAACGAA CACTTATGAGTCGT GCATGTGATTCTGT ## LINC00115 0 0.00000 0 0 ## NOC2L 0 0.00000 0 0 ## HES4 0 0.00000 0 0 ## ISG15 0 10.05794 0 0 ## C1orf159 0 0.00000 0 0 ## TAGAATACGTATCG ## LINC00115 0 ## NOC2L 0 ## HES4 0 ## ISG15 0 ## C1orf159 0 This normalized and log-transformed SparseEset object can be directly used for Mutual Information-based clustering, network inference and other downstream analysis. Don’t forget to save the SparseEset object after data normalization. saveRDS(pbmc14k_log2cpm.eset, file = "/your-path/PBMC14k/DATA/pbmc14k_log2CPM_annotated.rds") "],["mi-based-clustering-analysis.html", "Chapter 7 MI-based clustering analysis 7.1 Introduction to MICA 7.2 Generate MICA input 7.3 Run MICA 7.4 Integrate MICA outputs into SparseEset object 7.5 Visualize the MICA output", " Chapter 7 MI-based clustering analysis In this chapter, we will introduce more about the MICA component, and walk you through the MICA workflow, including preparing inputs, runing MICA, visualizing and integrationg MICA outputs into SparseEset object. 7.1 Introduction to MICA MICA (Mutual Information-based Clustering Analysis) is a clustering tool designed for single cell genomics data. Compared to most existing single-cell clustering algorithms, MICA has two unique features: MICA uses mutual information to measure cell-cell similarity for unsupervised clustering analysis, while most existing tools employ linear-transformation of PCA and/or co-expression analysis using linear Pearson or Spearman correlations that may not capture the nonlinear cell-cell distance. MICA uses all high-quality features for clustering, while most existing tools select the top highly variable features to improve the clustering speed. This is arbitrary and may lose the information that can distinguish close cell states. MICA is developed using Python framework, to take its strengths in calculation speed and memory consumption. A lot of effort has been made to improve the interoperability between Python and R. Now MICA works seamlessly with the SparseExpressionSet object. The input of MICA can be easily generated from the SparseExpressionSet object by generateMICAinput(), and the output of MICA, the clustering results, can be effortlessly visualized by MICAplot() and integrated into SparseExpressionSet object by addMICAoutput(). 7.2 Generate MICA input The standard input of MICA is a normalized and log-transformed gene expression matrix. scMINER can generate this matrix from sparse eSet object and save it into a file that can be directly read by MICA. MICA accepts .h5ad or .txt format as the input file, which can be easily generated by embedded function generateMICAinput(): ## generate MICA input in txt format generateMICAinput(input_eset = pbmc14k_log2cpm.eset, output_file = "/work-path/PBMC14k/MICA/micaInput.txt", overwrite = FALSE) ## check the format of MICA input mica_input <- read.delim(system.file("extdata/demo_pbmc14k/PBMC14k/MICA/micaInput.txt", package = "scMINER"), header = T, sep = "\\t", row.names = 1) mica_input[1:5,1:5] To use the .h5ad format, run the codes below. ## generate MICA input in h5ad format: anndata package is needed generateMICAinput(input_eset = pbmc14k_log2cpm.eset, output_file = "/work-path/PBMC14k/MICA/micaInput.h5ad", overwrite = FALSE) In addition to generating the standard MICA input file, generateMICAinput() also returns the recommended commands of running MICA. You can copy the commands, modify according and run them. 7.3 Run MICA MICA features two different modes named by their different dimension reduction techniques: Multi-Dimensional Scaling (MDS) mode: this mode is more accurate and robust for small datasets (less than 5,000 cells, be default) due to its global dimension reduction nature; Graph Embedding (GE) mode: this mode works better with large datasets (more than 5,000 cells, by default) using a graph embedding approach to explore distant neighbor cells. In this case, since there are 13,605 cells, we will use the MICA GE mode for the clustering: mica ge -i /work-path/PBMC14k/MICA/micaInput.txt -o /work-path/PBMC14k/MICA/micaOutput -minr 0.1 -maxr 9.0 -ss 0.05 -nw 40 This command will generate the clustering results of multiple resolutions, from 0.1 to 9.0, with a step size of 0.05. 7.4 Integrate MICA outputs into SparseEset object MICA generates several files and save all of them in the output directory specified by the user with -o argument. The core, and only, output file we need for subsequent analysis is the clustering label file named in the format of ProjectName_clustering_VisualizeMethod_euclidean_NumberOfDimensions_Resolution.txt. In this case, since we used a range of resolutions, there are several clustering label files generated, one for each resolution. Based on the knowledge about PBMC14k dataset, we compared the results of different resolutions and picked clustering_UMAP_euclidean_20_2.05.txt for subsequent analysis. micaOutput <- read.table(system.file("extdata/demo_pbmc14k/MICA/clustering_UMAP_euclidean_20_2.05.txt", package = "scMINER"), header = TRUE, sep = "\\t", quote = "", stringsAsFactors = F) head(micaOutput) ## ID X Y label ## 1 CACTTTGACGCAAT 14.91650 13.04096 6 ## 2 GTTACGGAAACGAA 14.57031 10.27093 6 ## 3 CACTTATGAGTCGT 14.28869 13.61674 6 ## 4 GCATGTGATTCTGT 14.12546 13.36319 6 ## 5 TAGAATACGTATCG 14.91227 11.19407 6 ## 6 CAAGAAGACCCTCA 15.34154 12.25821 6 As shown above, the clustering label file contains four columns: ID: cell barcodes; X: coordinates of UMAP_1 or tSNE_1; Y: coordinates of UMAP_2 or tSNE_2; label: labels of predicted clusters. The clustering result can be easily easily added to the SparseExpressionSet object by addMICAoutput(): pbmc14k_log2cpm.eset <- addMICAoutput(pbmc14k_log2cpm.eset, mica_output_file = system.file("extdata/demo_pbmc14k/MICA/clustering_UMAP_euclidean_20_2.05.txt", package = "scMINER"), visual_method = "umap") head(pData(pbmc14k_log2cpm.eset)) ## trueLabel_full trueLabel projectID nUMI nFeature pctMito ## CACTTTGACGCAAT CD14+ Monocyte Monocyte PBMC14k 764 354 0.01832461 ## GTTACGGAAACGAA CD14+ Monocyte Monocyte PBMC14k 956 442 0.01569038 ## CACTTATGAGTCGT CD14+ Monocyte Monocyte PBMC14k 629 323 0.02066773 ## GCATGTGATTCTGT CD14+ Monocyte Monocyte PBMC14k 875 427 0.02628571 ## TAGAATACGTATCG CD14+ Monocyte Monocyte PBMC14k 1060 445 0.03207547 ## CAAGAAGACCCTCA CD14+ Monocyte Monocyte PBMC14k 849 384 0.01531213 ## pctSpikeIn CellID UMAP_1 UMAP_2 clusterID ## CACTTTGACGCAAT 0 CACTTTGACGCAAT 14.91650 13.04096 6 ## GTTACGGAAACGAA 0 GTTACGGAAACGAA 14.57031 10.27093 6 ## CACTTATGAGTCGT 0 CACTTATGAGTCGT 14.28869 13.61674 6 ## GCATGTGATTCTGT 0 GCATGTGATTCTGT 14.12546 13.36319 6 ## TAGAATACGTATCG 0 TAGAATACGTATCG 14.91227 11.19407 6 ## CAAGAAGACCCTCA 0 CAAGAAGACCCTCA 15.34154 12.25821 6 7.5 Visualize the MICA output scMINER provides a function, MICAplot() to easily visualize the clustering results on a 2D plot, UMAP or tSNE. And it can be colored by multiple variables, including cluster label, sample source, nUMI, nFeature, pctMito and more. 7.5.1 Color-coded by cluster labels library(ggplot2) MICAplot(input_eset = pbmc14k_log2cpm.eset, color_by = "clusterID", X = "UMAP_1", Y = "UMAP_2", point.size = 0.1, fontsize.cluster_label = 6) 7.5.2 Color-coded by true label of cell types MICAplot(input_eset = pbmc14k_log2cpm.eset, color_by = "trueLabel", X = "UMAP_1", Y = "UMAP_2", point.size = 0.1, fontsize.cluster_label = 4) 7.5.3 Color-coded by nUMI, for QC purpose MICAplot(input_eset = pbmc14k_log2cpm.eset, color_by = "nUMI", do.logTransform = TRUE, point.size = 0.1) ## The values in "nUMI" have been transformed by log2(value + 1). To turn transformation off, set do.logTransform = FALSE. 7.5.4 Color-coded by nFeature, for QC purpose MICAplot(input_eset = pbmc14k_log2cpm.eset, color_by = "nFeature", do.logTransform = TRUE, point.size = 0.1) ## The values in "nFeature" have been transformed by log2(value + 1). To turn transformation off, set do.logTransform = FALSE. ### Color-coded by pctMito, for QC purpose MICAplot(input_eset = pbmc14k_log2cpm.eset, color_by = "pctMito", do.logTransform = FALSE, point.size = 0.1) "],["cell-type-annotation.html", "Chapter 8 Cell type annotation 8.1 Supervised cell type annotation 8.2 Unsupervised cell type annotation 8.3 Add cell type annotations to SparseExpressionSet object", " Chapter 8 Cell type annotation Currently, there are two types of strategies to annotate the clusters: supervised and unsupervised. The supervised methods use a list of known markers of potential cell types curated from somoe existing studies of the same/similar contexts. While in contrast, the unsupervised methods are usually based on the differentially expressed genes. scMINER provide several useful functions to support both types of strategies. 8.1 Supervised cell type annotation In the past a few years, the scRNA-seq data has dramatically increased in both quality and quantity. For the majority of tissue types, some existing studies on the same/similar tissue type are most likely available, and from these existing studies, we can figure out a list of candidate cell types to expect and curate a list of markers for each of them. In this case, we know the 7 cell types involved in the dataset, and curated a marker list from some existing PBMCs studies. 8.1.1 Annotate using signature scores Given a marker list of candidate cell types, scMINER can estimate a signature score, which is mathematically the weighted mean of the expression of marker genes involved, for each candidate cell type across all cell cluster. To do so, you will need to generate a signature table with three columns: signature_name: name of cell types/signatures; signature_feature: markers genes/features of corresponding cell type/signature; weight: weight of corresponding maker/feature in corresponding cell type/signature. It ranges from -1 to 1, so both positive and negtive markers are supoorted. ## Signature table of PBMC14k dataset signature_table <- read.table(system.file("extdata/demo_pbmc14k/PBMC14k_signatureTable.txt", package = "scMINER"), header = TRUE, sep = "\\t", quote = "", stringsAsFactors = FALSE) head(signature_table) ## signature_name signature_feature weight ## 1 Monocyte CD14 1 ## 2 Monocyte LYZ 1 ## 3 Monocyte S100A8 1 ## 4 Monocyte S100A9 1 ## 5 Monocyte S100A12 1 ## 6 NK FCGR3A 1 With this signature table, draw_bubbleplot() can estimate the signature scores and visualize them using bubble plot: ## Violin plot of marker genes across clusters draw_bubbleplot(input_eset = pbmc14k_log2cpm.eset, signature_table = signature_table, group_by = "clusterID") ## 31 features of 7 signatures were found in the input eset and will be used in calculation. In the bubble plot above, the color of the bubbles is proportional to the mean of signature score, and the size of the bubbles is proportional to the percentage of cells with higher signature score than mean. The cell type of each cluster is clear, except the cluster 7, which shows equally-high signature score of both CD4+ TCM and CD4+ Reg and higher percentage of CD4+ TCM cells. 8.1.2 Annotate using individual marker genes scMINER also provides a variety of functions to visualize the selected features: ## For the demonstration purposes, we picked two well known markers for each of the 7 known cell types, plus "CD3D" and "CD4". genes_of_interest <-c("CD14", "LYZ", "GZMB", "NKG7", "CD19", "MS4A1", "CD8A", "CD8B", "SELL", "CCR7", "IL2RA", "FOXP3", "IL7R", "S100A4", "CD3D", "CD4") 8.1.2.1 feature visualization: violin plot ## Violin plot of marker genes across clusters feature_vlnplot(input_eset = pbmc14k_log2cpm.eset, features = genes_of_interest, group_by = "clusterID", ncol = 4) 8.1.2.2 feature visualization: box plot ## Box plot of marker genes across clusters feature_boxplot(input_eset = pbmc14k_log2cpm.eset, features = genes_of_interest, group_by = "clusterID", ncol = 4) 8.1.2.3 feature visualization: scatter plot ## UMAP scatter plot of marker genes feature_scatterplot(input_eset = pbmc14k_log2cpm.eset, features = genes_of_interest, ncol = 4, location_x = "UMAP_1", location_y = "UMAP_2", point.size = 0.5, legend.key_height = 0.3, legend.key_width = 0.2, fontsize.legend_title = 8, fontsize.legend_text = 6, fontsize.axis_title = 8, legend.position = "none") 8.1.2.4 feature visualization: bubble plot ## Bubble plot of marker genes across clusters feature_bubbleplot(input_eset = pbmc14k_log2cpm.eset, features = genes_of_interest, group_by = "clusterID", xlabel.angle = 45) 8.1.2.5 feature visualization: heatmap ## Heatmap of marker genes across clusters feature_heatmap(input_eset = pbmc14k_log2cpm.eset, features = genes_of_interest, group_by = "clusterID", scale_method = "none", annotation_columns = c("trueLabel")) 8.2 Unsupervised cell type annotation Existing studies in the same or similar contexts are not always available, and there is a significant concern regarding the reliability of reference studies. This reliability largely depends on the expertise of the original authors who defined the markers and assigned the cell types. Therefore, we strongly encourage users to also try unsupervised methods, which can serve as a means of cross-validation. scMINER provides a function, getDE(), to perform the differential expression analysis and identify the markers of each cluster. The getDE() function supports three different methods to perform the differential expression analysis, limma, wilcoxon and t.test. And it allows the users to define the groups to compare in a flexible way: ## 1. To perform differential expression analysis in a 1-vs-rest manner for all groups de_res1 <- getDE(input_eset = pbmc14k_log2cpm.eset[500,], group_by = "clusterID", use_method = "limma") ## 7 groups were found in group_by column [ clusterID ]. ## Since no group was specified, the differential analysis will be conducted among all groups in the group_by column [ clusterID ] in the 1-vs-rest manner. ## 1 / 7 : group 1 ( 1 ) vs the rest... ## 2505 cells were found for g1. ## 11100 cells were found for g0. ## 2 / 7 : group 1 ( 2 ) vs the rest... ## 2022 cells were found for g1. ## 11583 cells were found for g0. ## 3 / 7 : group 1 ( 3 ) vs the rest... ## 2014 cells were found for g1. ## 11591 cells were found for g0. ## 4 / 7 : group 1 ( 4 ) vs the rest... ## 1918 cells were found for g1. ## 11687 cells were found for g0. ## 5 / 7 : group 1 ( 5 ) vs the rest... ## 1912 cells were found for g1. ## 11693 cells were found for g0. ## 6 / 7 : group 1 ( 6 ) vs the rest... ## 1786 cells were found for g1. ## 11819 cells were found for g0. ## 7 / 7 : group 1 ( 7 ) vs the rest... ## 1448 cells were found for g1. ## 12157 cells were found for g0. head(de_res1) ## [1] feature g1_tag g0_tag g1_avg g0_avg g1_pct g0_pct log2FC Pval ## [10] FDR Zscore ## <0 rows> (or 0-length row.names) Here is an brief introduction to the results of getDE(): feature: feature name; g1_tag: a vector of clusters or subgroups involved in g1, the fore-ground group; g0_tag: a vector of clusters or subgroups involved in g0, the back-ground group; g1_avg: mean of gene expression of cells in g1; g0_tag: mean of gene expression of cells in g0; g1_pct: percentage of cells expressing the corresponding genes in group 1; g0_pct: percentage of cells expressing the corresponding genes in group 0; log2FC: log2Fold change of gene expression between g1 and g0; Pval: P values of g1-g0 comparison; FDR: FDR of g1-g0 comparison; Zscore: Z score of g1-g0 comparison, signed by log2FC; ## 2. To perform differential expression analysis in a 1-vs-rest manner for one specific group de_res2 <- getDE(input_eset = pbmc14k_log2cpm.eset, group_by = "clusterID", g1 = c("1"), use_method = "limma") ## 3. To perform differential expression analysis in a rest-vs-1 manner for one specific group de_res3 <- getDE(input_eset = pbmc14k_log2cpm.eset, group_by = "clusterID", g0 = c("1"), use_method = "limma") ## 4. To perform differential expression analysis in a 1-vs-1 manner for any two groups de_res4 <- getDE(input_eset = pbmc14k_log2cpm.eset, group_by = "clusterID", g1 = c("1"), g0 = c("3"), use_method = "limma") scMINER also provides a function, getTopFeatures(), to easily extract the group-specific markers from the differential expression result: cluster_markers <- getTopFeatures(input_table = de_res1, number = 10, group_by = "g1_tag", sort_by = "log2FC", sort_decreasing = TRUE) dim(cluster_markers) ## [1] 0 11 head(cluster_markers) ## [1] feature g1_tag g0_tag g1_avg g0_avg g1_pct g0_pct log2FC Pval ## [10] FDR Zscore ## <0 rows> (or 0-length row.names) 8.3 Add cell type annotations to SparseExpressionSet object Based on the supervised and unsupervised methods, we have annotated the cell types for each cluster. To add the cell type annotation information into the sparse eset object: celltype_map <- c(`1`="CD4TN", `2`="CD4TCM", `3`="CD8TN", `4`="NK", `5`="B", `6`="Monocyte", `7`="CD4Treg") pbmc14k_log2cpm.eset$cell_type <- as.character(celltype_map[pbmc14k_log2cpm.eset$clusterID]) head(pData(pbmc14k_log2cpm.eset)) ## trueLabel_full trueLabel projectID nUMI nFeature pctMito ## CACTTTGACGCAAT CD14+ Monocyte Monocyte PBMC14k 764 354 0.01832461 ## GTTACGGAAACGAA CD14+ Monocyte Monocyte PBMC14k 956 442 0.01569038 ## CACTTATGAGTCGT CD14+ Monocyte Monocyte PBMC14k 629 323 0.02066773 ## GCATGTGATTCTGT CD14+ Monocyte Monocyte PBMC14k 875 427 0.02628571 ## TAGAATACGTATCG CD14+ Monocyte Monocyte PBMC14k 1060 445 0.03207547 ## CAAGAAGACCCTCA CD14+ Monocyte Monocyte PBMC14k 849 384 0.01531213 ## pctSpikeIn CellID UMAP_1 UMAP_2 clusterID cell_type ## CACTTTGACGCAAT 0 CACTTTGACGCAAT 14.91650 13.04096 6 Monocyte ## GTTACGGAAACGAA 0 GTTACGGAAACGAA 14.57031 10.27093 6 Monocyte ## CACTTATGAGTCGT 0 CACTTATGAGTCGT 14.28869 13.61674 6 Monocyte ## GCATGTGATTCTGT 0 GCATGTGATTCTGT 14.12546 13.36319 6 Monocyte ## TAGAATACGTATCG 0 TAGAATACGTATCG 14.91227 11.19407 6 Monocyte ## CAAGAAGACCCTCA 0 CAAGAAGACCCTCA 15.34154 12.25821 6 Monocyte The draw_barplot() function can visualize the cell composition of self-defined groups. We can use it to show the purity of MICA clusters: ## Violin plot of marker genes across clusters draw_barplot(input_eset = pbmc14k_log2cpm.eset, group_by = "cell_type", color_by = "trueLabel_full", xlabel.angle = 45) Don’t forget to save the SparseEset object after the cell type annotation added. saveRDS(pbmc14k_log2cpm.eset, file = "/your-path/PBMC14k/DATA/pbmc14k_log2CPM_annotated.rds") "],["network-inference.html", "Chapter 9 Network inference 9.1 Generate SJARACNe input files 9.2 Run SJARACNe 9.3 Assess the quality of networks", " Chapter 9 Network inference SJARACNe is a scalable software tool for gene network reverse engineering from big data. As an improved implementation of the ARACNe, SJARACNe achieves a dramatic improvement in computational performance in both time and memory usage and implements new features while preserving the network inference accuracy of the original algorithm. Similar to MICA, SJARACNe is also a component of scMINER framework, and can work seamlessly with the SparseExpressionSet object. The input for MICA can be easily generated from the SparseExpressionSet object by generateSJARACNeInput(), and the output of SJARACNe, the networks, can be effortlessly assessed by drawNetworkQC() and directly taken for driver activity estimation by getActivity_individual() and getActivity_inBatch(). 9.1 Generate SJARACNe input files The network inference is usually conducted in a cluster- or cell type-specific basis. Given the column names for grouping, generateSJARACNeInput() will create a folder for each group named by the group label. IMPORTANT NOTE: Any illegal characters in path in group labels may cause issues in subsequent analysis. To avoid it, scMINER only accept letters(A-Za-z), numbers(0-9), underscores(’_‘) and periods(’.’). In this case, the true labels of cell type are available, so we use them to define the groups for network inference. ## Columns with any illegal characters can not be used for groupping generateSJARACNeInput(input_eset = pbmc14k_log2cpm.eset, group_name = "trueLabel", sjaracne_dir = "/work-path/PBMC14k/SJARACNe", species_type = "hg", driver_type = "TF_SIG", downSample_N = NULL) For big datasets, generateSJARACNeInput() provides an argument, downSample_N, to allow users to down sample size of each group. The default value of downSample_N is 1,000, any group with >= 1,000 cells will be down-sample to 1,000. ## one folder for each group list.dirs(system.file("extdata/demo_pbmc14k/SJARACNe", package = "scMINER"), full.names = FALSE, recursive = FALSE) ## [1] "B" "CD4TCM" "CD4TN" "CD4Treg" "CD8TN" "Monocyte" "NK" ## file structure of each folder list.files(system.file("extdata/demo_pbmc14k/SJARACNe/B", package = "scMINER"), full.names = FALSE, recursive = TRUE, include.dirs = FALSE, pattern = "[^consensus_network_ncol_.txt]") ## [1] "B.8572_1902.exp.txt" "config_cwlexec.json" ## [3] "runSJARACNe.sh" "SIG/B.4148_1902.sig.txt" ## [5] "TF/B.835_1902.tf.txt" The standard input files of SJARACNe, for each group, include: a “.exp.txt” file: a tab-separated genes/transcripts/proteins by cells/samples expression matrix with the first two columns being ID and symbol. a “TF” folder containing a “.tf.txt” file: a list of significant gene/transcript/protein IDs of TF drivers. a “SIG” folder containing a “.sig.txt” file: a list of significant gene/transcript/protein IDs of SIG drivers. a bash script (runSJARACNe.sh) to run SJARACNe. Further modification is needed to run it. a json file (config_cwlexec.json) containing parameters to run SJARACNe. Usually, the ground truth of cell types is not available. Then the cluster labels, or cell type annotations of the clusters, can be used for grouping in network rewiring, since it’s expected that cells with same cluster label/annotated cell type are of similar gene expression profiles. To generate from annotated cell types, you can run: generateSJARACNeInput(input_eset = pbmc14k_log2cpm.eset, group_name = "cell_type", sjaracne_dir = "/work-path/PBMC14k/SJARACNe/bycelltype", species_type = "hg", driver_type = "TF_SIG") 9.2 Run SJARACNe By default, the generateSJARACNeInput() function also generates a runSJARACNe.sh file in the folder of each group. This file much be modified before you can run it: removed unneeded lines: There are usually 4 lines in this file: the lines starting with “sjaracne lsf” are the command lines to run on IBM LSF cluster, while the lines starting with “sjaracne local” are the command lines runing on a single machine (Linux/OSX). Please select the lines based on your situation and remove the others. -n: number of bootstrap networks to generate. Default: 100. -pc: p value threshold to select edges in building consensus network. Default: e-2 for single-cell data, e-3 for meta-cell data, and e-5 for bulk sample data. Please use “sjaracne lsf -h” or “sjaracne local -h” to check more details of arguments available in SJARACNe. There is another file, config_cwlexec.json, available in the folder. It contains the information (e.g. memory request for each step of SJARACNe run) used for LSF job submission. This file is only needed for LSF runs and the default values works well in most cases. If you are running SJARACNe on a big dataset, you may need to request more memory from it. In this case, we use LSF to run the SJARACNe: ## let's use B cell as an example # for TF sjaracne lsf -e /work-path/PBMC14k/SJARACNe/B/B.8572_1902.exp.txt -g /work-path/PBMC14k/SJARACNe/B/TF/B.835_1902.tf.txt -o /work-path/PBMC14k/SJARACNe/B/TF/bt100_pc001 -n 100 -pc 0.01 -j /work-path/PBMC14k/SJARACNe/B/config_cwlexec.json # for SIG sjaracne lsf -e /work-path/PBMC14k/SJARACNe/B/B.8572_1902.exp.txt -g /work-path/PBMC14k/SJARACNe/B/SIG/B.4148_1902.sig.txt -o /work-path/PBMC14k/SJARACNe/B/SIG/bt100_pc001 -n 100 -pc 0.01 -j /work-path/PBMC14k/SJARACNe/B/config_cwlexec.json We created a folder named “bt100_pc001” in both TF and SIG folders of each group, to save the networks generated under 100 bootstraps (-n 100) and 0.01 consensus p value (-pc 0.01). To run SJARACNe on a local machine: ## let's use B cell as an example # for TF sjaracne local -e /work-path/PBMC14k/SJARACNe/B/B.8572_1902.exp.txt -g /work-path/PBMC14k/SJARACNe/B/TF/B.835_1902.tf.txt -o /work-path/PBMC14k/SJARACNe/B/TF/bt100_pc001 -n 100 -pc 0.01 # for SIG sjaracne local -e /work-path/PBMC14k/SJARACNe/B/B.8572_1902.exp.txt -g /work-path/PBMC14k/SJARACNe/B/SIG/B.4148_1902.sig.txt -o /work-path/PBMC14k/SJARACNe/B/SIG/bt100_pc001 -n 100 -pc 0.01 9.3 Assess the quality of networks 9.3.1 Introduction to the network file by SJARACNe The core output of SJARACNe is the network file named consensus_network_ncol_.txt. It contains 9 columns: source: ID of the source gene, can be the gene symbol; target: ID of the target gene, can be the gene symbol; source.symbol: symbol of the source gene; target.symbol: symbol of the target gene; MI: mutual information of source-gene pair; pearson: Pearson correlation coefficient, [-1,1] pearson: Spearman correlation coefficient, [-1,1] slope: slop of the regression line, returned by stats.linregression() p.value: p-value for a hypothesis test whose null hypothesis is that the slope is zero, using Wald Test with t-distribution of the test statistic network_format <- read.table(system.file("extdata/demo_pbmc14k/SJARACNe/B/TF/bt100_pc001/consensus_network_ncol_.txt", package = "scMINER"), header = T, sep = "\\t", quote = "", stringsAsFactors = F) head(network_format) ## source target source.symbol target.symbol MI pearson spearman slope ## 1 AATF ACBD3 AATF ACBD3 0.0509 -0.0310 -0.0311 -0.0193 ## 2 AATF ADD3 AATF ADD3 0.0486 0.0228 0.0258 0.0307 ## 3 AATF AES AATF AES 0.0511 0.0311 0.0289 0.0668 ## 4 AATF AKR7A2 AATF AKR7A2 0.0498 0.0319 0.0366 0.0421 ## 5 AATF AL928768.3 AATF AL928768.3 0.0447 0.0247 0.0293 0.0335 ## 6 AATF ALG8 AATF ALG8 0.0479 0.0358 0.0373 0.0234 ## p.value ## 1 0.1761 ## 2 0.3204 ## 3 0.1756 ## 4 0.1646 ## 5 0.2815 ## 6 0.1183 9.3.2 Generate network QC report There is no simple standards to tell the reliability of networks. Empirically, a network with 50-300 target size is good. scMINER provides a function, drawNetworkQC(), to quickly assess the quality of networks in batch. network_stats <- drawNetworkQC(sjaracne_dir = "/work-path/PBMC14K/SJARACNe", generate_html = FALSE) # Set `generate_html = TRUE` to generate html-format QC report for each network file ## The network QC statistics table is saved seperately, for demonstration purposes. network_stats <- readRDS(system.file("extdata/demo_pbmc14k/SJARACNe/network_stats.rds", package = "scMINER")) head(network_stats) ## network_tag network_node network_edge driver_count targetSize_mean ## 1 B.SIG.bt100_pc001 8572 391889 4148 94.47662 ## 2 B.TF.bt100_pc001 8572 95341 835 114.18084 ## 3 CD4TCM.SIG.bt100_pc001 8660 382153 4209 90.79425 ## 4 CD4TCM.TF.bt100_pc001 8660 94319 838 112.55251 ## 5 CD4TN.SIG.bt100_pc001 8612 401658 4180 96.09043 ## 6 CD4TN.TF.bt100_pc001 8612 95152 831 114.50301 ## targetSize_median targetSize_minimum targetSize_maximum ## 1 94.0 33 396 ## 2 96.0 64 913 ## 3 91.0 31 281 ## 4 95.5 60 689 ## 5 95.0 43 303 ## 6 99.0 64 743 ## network_path ## 1 /Volumes/projects/scRNASeq/yu3grp/scMINER/NG_Revision/QPan/scminer_R/Datasets/PBMC14K/SJARACNe/B/SIG/bt100_pc001/sjaracne_workflow-df798096-8dee-4baf-8f70-891c689dc769/consensus_network_ncol_.txt ## 2 /Volumes/projects/scRNASeq/yu3grp/scMINER/NG_Revision/QPan/scminer_R/Datasets/PBMC14K/SJARACNe/B/TF/bt100_pc001/sjaracne_workflow-fb2a69b9-f98e-47ff-87a0-6d538822fc6e/consensus_network_ncol_.txt ## 3 /Volumes/projects/scRNASeq/yu3grp/scMINER/NG_Revision/QPan/scminer_R/Datasets/PBMC14K/SJARACNe/CD4TCM/SIG/bt100_pc001/sjaracne_workflow-424f1068-13d1-4f0e-9c26-56acd9a2027c/consensus_network_ncol_.txt ## 4 /Volumes/projects/scRNASeq/yu3grp/scMINER/NG_Revision/QPan/scminer_R/Datasets/PBMC14K/SJARACNe/CD4TCM/TF/bt100_pc001/sjaracne_workflow-52b3cdf5-5914-4c8c-a77a-05f17c755d83/consensus_network_ncol_.txt ## 5 /Volumes/projects/scRNASeq/yu3grp/scMINER/NG_Revision/QPan/scminer_R/Datasets/PBMC14K/SJARACNe/CD4TN/SIG/bt100_pc001/sjaracne_workflow-7b5bb68e-1de5-4d0e-80ec-8d8aa037866f/consensus_network_ncol_.txt ## 6 /Volumes/projects/scRNASeq/yu3grp/scMINER/NG_Revision/QPan/scminer_R/Datasets/PBMC14K/SJARACNe/CD4TN/TF/bt100_pc001/sjaracne_workflow-89716541-eb53-435c-8a45-bab63d6b5198/consensus_network_ncol_.txt "],["actvity-based-analysis.html", "Chapter 10 Actvity-based analysis 10.1 Calculate the activities 10.2 Differential activity analysis", " Chapter 10 Actvity-based analysis The driver activity estimation is one of the most important features of scMINER. Mathematically, the activity of one driver is a type of mean of the expressions of its targets. And biologically, the activity can be interpreted as a measure that describes how actively the driver functions, like the enzymes in digesting their subtracts, kinase in activating their downstream genes. Given the gene expression profiles and networks, scMINER can estimate the activities of some predefined drivers, including not only transcription factors (TFs) but also signaling genes (SIGs). scMINER provides a few functions to effortlessly calculate the activities, identify the hidden drivers and visualize them in multiple ways. 10.1 Calculate the activities scMINER provides two functions, getActivity_individual() and getActivity_inBatch(), to calculate the driver activities. 10.1.1 Calculate activities per group getActivity_individual() is designed to calculate the activities per group. It takes the network files as the input: ## let's use B cell as an example activity_B.eset <- getActivity_individual(input_eset = pbmc14k_log2cpm.eset[, pData(pbmc14k_log2cpm.eset)$trueLabel == "B"], network_file.tf = system.file("extdata/demo_pbmc14k/SJARACNe/B/TF/bt100_pc001/consensus_network_ncol_.txt", package = "scMINER"), network_file.sig = system.file("extdata/demo_pbmc14k/SJARACNe/B/SIG/bt100_pc001/consensus_network_ncol_.txt", package = "scMINER"), driver_type = "TF_SIG") 10.1.2 Calculate activities in batch If you need to calculate the activity for multiple groups, this is usually the case, you can do it using getActivity_individual() as shown above one by one and merge the esets after that. Or, scMINER privides another function, getActivity_inBatch(), to calculate the activity in batch: ## let's use B cell as an example activity.eset <- getActivity_inBatch(input_eset = pbmc14k_log2cpm.eset, sjaracne_dir = system.file("extdata/demo_pbmc14k/SJARACNe", package = "scMINER"), group_name = "trueLabel", driver_type = "TF_SIG", activity_method = "mean", do.z_normalization = TRUE) ## 7 groups were found in trueLabel ... ## Checking network files for each group ... ## Group 1 / 7 : Monocyte ... ## TF network check passed! ## SIG network check passed! ## Group 2 / 7 : B ... ## TF network check passed! ## SIG network check passed! ## Group 3 / 7 : CD4Treg ... ## TF network check passed! ## SIG network check passed! ## Group 4 / 7 : CD4TN ... ## TF network check passed! ## SIG network check passed! ## Group 5 / 7 : CD4TCM ... ## TF network check passed! ## SIG network check passed! ## Group 6 / 7 : NK ... ## TF network check passed! ## SIG network check passed! ## Group 7 / 7 : CD8TN ... ## TF network check passed! ## SIG network check passed! ## Calculating activity for each group ... ## Group 1 / 7 : Monocyte ... ## Activity calculation is completed successfully! ## Group 2 / 7 : B ... ## Activity calculation is completed successfully! ## Group 3 / 7 : CD4Treg ... ## Activity calculation is completed successfully! ## Group 4 / 7 : CD4TN ... ## Activity calculation is completed successfully! ## Group 5 / 7 : CD4TCM ... ## Activity calculation is completed successfully! ## Group 6 / 7 : NK ... ## Activity calculation is completed successfully! ## Group 7 / 7 : CD8TN ... ## Activity calculation is completed successfully! ## NAs were found in the activity matrix and have been replaced by the minimum value: -0.3968794 . 10.1.3 Save activity eset object saveRDS(activity.eset, file = "/your-path/PBMC14k/DATA/activity.eset") 10.2 Differential activity analysis Similar to getDE(), scMINER provides a function, getDA(), to perform the differential activity analysis and identify the group-specific drivers. ## 1. To perform differential expression analysis in a 1-vs-rest manner for all groups da_res1 <- getDA(input_eset = activity.eset, group_by = "cell_type", use_method = "t.test") ## 7 groups were found in group_by column [ cell_type ]. ## Since no group was specified, the differential analysis will be conducted among all groups in the group_by column [ cell_type ] in the 1-vs-rest manner. ## 1 / 7 : group 1 ( B ) vs the rest... ## 1912 cells were found for g1. ## 11693 cells were found for g0. ## 2 / 7 : group 1 ( CD4TCM ) vs the rest... ## 2022 cells were found for g1. ## 11583 cells were found for g0. ## 3 / 7 : group 1 ( CD4TN ) vs the rest... ## 2505 cells were found for g1. ## 11100 cells were found for g0. ## 4 / 7 : group 1 ( CD4Treg ) vs the rest... ## 1448 cells were found for g1. ## 12157 cells were found for g0. ## 5 / 7 : group 1 ( CD8TN ) vs the rest... ## 2014 cells were found for g1. ## 11591 cells were found for g0. ## 6 / 7 : group 1 ( Monocyte ) vs the rest... ## 1786 cells were found for g1. ## 11819 cells were found for g0. ## 7 / 7 : group 1 ( NK ) vs the rest... ## 1918 cells were found for g1. ## 11687 cells were found for g0. head(da_res1) ## feature g1_tag g0_tag g1_avg ## 4 AASDH_SIG B CD4TCM,CD4TN,CD4Treg,CD8TN,Monocyte,NK -0.008071658 ## 6 AATF_SIG B CD4TCM,CD4TN,CD4Treg,CD8TN,Monocyte,NK -0.051767485 ## 12 ABCB8_SIG B CD4TCM,CD4TN,CD4Treg,CD8TN,Monocyte,NK -0.077615607 ## 8 ABCA2_SIG B CD4TCM,CD4TN,CD4Treg,CD8TN,Monocyte,NK -0.081643603 ## 10 ABCB1_SIG B CD4TCM,CD4TN,CD4Treg,CD8TN,Monocyte,NK -0.134357577 ## 3 AARSD1_SIG B CD4TCM,CD4TN,CD4Treg,CD8TN,Monocyte,NK -0.126010447 ## g0_avg g1_pct g0_pct log2FC Pval FDR ## 4 -0.1025141 0.43043933 0.13991277 0.094442475 2.225074e-308 0.000000e+00 ## 6 -0.1084165 0.21652720 0.08757376 0.056649005 3.918924e-189 5.878386e-189 ## 12 -0.1094585 0.35251046 0.14153767 0.031842866 3.623209e-12 3.952592e-12 ## 8 -0.1101867 0.10198745 0.15676045 0.028543082 1.914570e-58 2.418404e-58 ## 10 -0.1559384 0.04393305 0.06114770 0.021580866 8.079661e-27 9.233898e-27 ## 3 -0.1225746 0.04184100 0.08192936 -0.003435892 4.213744e-02 4.213744e-02 ## Zscore ## 4 37.53784 ## 6 29.33316 ## 12 6.95115 ## 8 16.11775 ## 10 10.72137 ## 3 -2.03216 ## 2. To perform differential expression analysis in a 1-vs-rest manner for one specific group da_res2 <- getDA(input_eset = activity.eset, group_by = "cell_type", g1 = c("B"), use_method = "t.test") ## 3. To perform differential expression analysis in a rest-vs-1 manner for one specific group da_res3 <- getDA(input_eset = activity.eset, group_by = "cell_type", g0 = c("B"), use_method = "t.test") ## 4. To perform differential expression analysis in a 1-vs-1 manner for any two groups da_res4 <- getDA(input_eset = activity.eset, group_by = "cell_type", g1 = c("CD4Treg"), g0 = c("CD4TCM"), use_method = "t.test") The getTopFeatures() function can aslo be used to easily extract the group-specific markers from the differential expression result: top_drivers <- getTopFeatures(input_table = da_res1, number = 10, group_by = "g1_tag", sort_by = "log2FC", sort_decreasing = TRUE) dim(top_drivers) ## [1] 16 11 head(top_drivers) ## feature g1_tag g0_tag g1_avg ## 4 AASDH_SIG B CD4TCM,CD4TN,CD4Treg,CD8TN,Monocyte,NK -0.008071658 ## 6 AATF_SIG B CD4TCM,CD4TN,CD4Treg,CD8TN,Monocyte,NK -0.051767485 ## 12 ABCB8_SIG B CD4TCM,CD4TN,CD4Treg,CD8TN,Monocyte,NK -0.077615607 ## 8 ABCA2_SIG B CD4TCM,CD4TN,CD4Treg,CD8TN,Monocyte,NK -0.081643603 ## 10 ABCB1_SIG B CD4TCM,CD4TN,CD4Treg,CD8TN,Monocyte,NK -0.134357577 ## 3 AARSD1_SIG B CD4TCM,CD4TN,CD4Treg,CD8TN,Monocyte,NK -0.126010447 ## g0_avg g1_pct g0_pct log2FC Pval FDR ## 4 -0.1025141 0.43043933 0.13991277 0.094442475 2.225074e-308 0.000000e+00 ## 6 -0.1084165 0.21652720 0.08757376 0.056649005 3.918924e-189 5.878386e-189 ## 12 -0.1094585 0.35251046 0.14153767 0.031842866 3.623209e-12 3.952592e-12 ## 8 -0.1101867 0.10198745 0.15676045 0.028543082 1.914570e-58 2.418404e-58 ## 10 -0.1559384 0.04393305 0.06114770 0.021580866 8.079661e-27 9.233898e-27 ## 3 -0.1225746 0.04184100 0.08192936 -0.003435892 4.213744e-02 4.213744e-02 ## Zscore ## 4 37.53784 ## 6 29.33316 ## 12 6.95115 ## 8 16.11775 ## 10 10.72137 ## 3 -2.03216 "],["session-information.html", "Chapter 11 Session information", " Chapter 11 Session information sessioninfo::session_info() ## ─ Session info ──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── ## setting value ## version R version 4.3.3 (2024-02-29) ## os macOS Sonoma 14.5 ## system aarch64, darwin20 ## ui X11 ## language (EN) ## collate en_US.UTF-8 ## ctype en_US.UTF-8 ## tz America/Chicago ## date 2024-08-05 ## pandoc 3.1.11 @ /Applications/RStudio.app/Contents/Resources/app/quarto/bin/tools/aarch64/ (via rmarkdown) ## ## ─ Packages ──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── ## package * version date (UTC) lib source ## anndata * 0.7.5.6 2023-03-17 [1] CRAN (R 4.3.0) ## assertthat 0.2.1 2019-03-21 [1] CRAN (R 4.3.0) ## Biobase * 2.62.0 2023-10-26 [1] Bioconductor ## BiocGenerics * 0.48.1 2023-11-02 [1] Bioconductor ## bookdown 0.40 2024-07-02 [1] CRAN (R 4.3.3) ## bslib 0.8.0 2024-07-29 [1] CRAN (R 4.3.3) ## cachem 1.1.0 2024-05-16 [1] CRAN (R 4.3.3) ## cli 3.6.3 2024-06-21 [1] CRAN (R 4.3.3) ## colorspace 2.1-1 2024-07-26 [1] CRAN (R 4.3.3) ## digest 0.6.36 2024-06-23 [1] CRAN (R 4.3.3) ## dplyr * 1.1.4 2023-11-17 [1] CRAN (R 4.3.1) ## evaluate 0.24.0 2024-06-10 [1] CRAN (R 4.3.3) ## fansi 1.0.6 2023-12-08 [1] CRAN (R 4.3.1) ## farver 2.1.2 2024-05-13 [1] CRAN (R 4.3.3) ## fastmap 1.2.0 2024-05-15 [1] CRAN (R 4.3.3) ## generics 0.1.3 2022-07-05 [1] CRAN (R 4.3.0) ## ggplot2 * 3.5.1 2024-04-23 [1] CRAN (R 4.3.1) ## glue 1.7.0 2024-01-09 [1] CRAN (R 4.3.1) ## gridExtra 2.3 2017-09-09 [1] CRAN (R 4.3.0) ## gtable 0.3.5 2024-04-22 [1] CRAN (R 4.3.1) ## highr 0.11 2024-05-26 [1] CRAN (R 4.3.3) ## htmltools 0.5.8.1 2024-04-04 [1] CRAN (R 4.3.1) ## jquerylib 0.1.4 2021-04-26 [1] CRAN (R 4.3.0) ## jsonlite 1.8.8 2023-12-04 [1] CRAN (R 4.3.1) ## knitr 1.48 2024-07-07 [1] CRAN (R 4.3.3) ## labeling 0.4.3 2023-08-29 [1] CRAN (R 4.3.0) ## lattice 0.22-6 2024-03-20 [1] CRAN (R 4.3.1) ## lifecycle 1.0.4 2023-11-07 [1] CRAN (R 4.3.1) ## limma 3.58.1 2023-11-02 [1] Bioconductor ## magrittr 2.0.3 2022-03-30 [1] CRAN (R 4.3.0) ## Matrix * 1.6-5 2024-01-11 [1] CRAN (R 4.3.3) ## munsell 0.5.1 2024-04-01 [1] CRAN (R 4.3.1) ## pheatmap 1.0.12 2019-01-04 [1] CRAN (R 4.3.0) ## pillar 1.9.0 2023-03-22 [1] CRAN (R 4.3.0) ## pkgconfig 2.0.3 2019-09-22 [1] CRAN (R 4.3.0) ## plyr 1.8.9 2023-10-02 [1] CRAN (R 4.3.1) ## png 0.1-8 2022-11-29 [1] CRAN (R 4.3.0) ## R6 2.5.1 2021-08-19 [1] CRAN (R 4.3.0) ## RColorBrewer 1.1-3 2022-04-03 [1] CRAN (R 4.3.0) ## Rcpp 1.0.13 2024-07-17 [1] CRAN (R 4.3.3) ## reshape2 1.4.4 2020-04-09 [1] CRAN (R 4.3.0) ## reticulate 1.38.0.9000 2024-08-05 [1] Github (rstudio/reticulate@4e55e7c) ## rlang 1.1.4 2024-06-04 [1] CRAN (R 4.3.3) ## rmarkdown 2.27 2024-05-17 [1] CRAN (R 4.3.3) ## rstudioapi 0.16.0 2024-03-24 [1] CRAN (R 4.3.1) ## sass 0.4.9 2024-03-15 [1] CRAN (R 4.3.1) ## scales 1.3.0 2023-11-28 [1] CRAN (R 4.3.1) ## scMINER * 1.1.0 2024-08-05 [1] Github (jyyulab/scMINER@8180107) ## sessioninfo 1.2.2 2021-12-06 [1] CRAN (R 4.3.0) ## statmod 1.5.0 2023-01-06 [1] CRAN (R 4.3.0) ## stringi 1.8.4 2024-05-06 [1] CRAN (R 4.3.1) ## stringr 1.5.1 2023-11-14 [1] CRAN (R 4.3.1) ## tibble 3.2.1 2023-03-20 [1] CRAN (R 4.3.0) ## tidyselect 1.2.1 2024-03-11 [1] CRAN (R 4.3.1) ## utf8 1.2.4 2023-10-22 [1] CRAN (R 4.3.1) ## vctrs 0.6.5 2023-12-01 [1] CRAN (R 4.3.1) ## withr 3.0.1 2024-07-31 [1] CRAN (R 4.3.3) ## xfun 0.46 2024-07-18 [1] CRAN (R 4.3.3) ## yaml 2.3.10 2024-07-26 [1] CRAN (R 4.3.3) ## ## [1] /Library/Frameworks/R.framework/Versions/4.3-arm64/Resources/library ## ## ─ Python configuration ──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── ## python: /Users/qpan/.virtualenvs/r-reticulate/bin/python ## libpython: /Library/Developer/CommandLineTools/Library/Frameworks/Python3.framework/Versions/3.9/lib/python3.9/config-3.9-darwin/libpython3.9.dylib ## pythonhome: /Users/qpan/.virtualenvs/r-reticulate:/Users/qpan/.virtualenvs/r-reticulate ## version: 3.9.6 (default, Mar 29 2024, 10:51:09) [Clang 15.0.0 (clang-1500.3.9.4)] ## numpy: /Users/qpan/.virtualenvs/r-reticulate/lib/python3.9/site-packages/numpy ## numpy_version: 1.26.4 ## anndata: /Users/qpan/.virtualenvs/r-reticulate/lib/python3.9/site-packages/anndata ## ## NOTE: Python version was forced by VIRTUAL_ENV ## ## ─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── "],["404.html", "Page not found", " Page not found The page you requested cannot be found (perhaps it was moved or renamed). You may want to try searching to find the page's new location, or use the table of contents to find the page you are looking for. "]]
    diff --git a/docs/bookdown/session-information.html b/docs/bookdown/session-information.html
    index dcb16bc..392b36f 100644
    --- a/docs/bookdown/session-information.html
    +++ b/docs/bookdown/session-information.html
    @@ -23,7 +23,7 @@
     
     
     
    -
    +
     
       
       
    @@ -254,7 +254,7 @@ 

    Chapter 11 Session information

    sessioninfo::session_info()
    -
    ## ─ Session info ───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
    +
    ## ─ Session info ────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
     ##  setting  value
     ##  version  R version 4.3.3 (2024-02-29)
     ##  os       macOS Sonoma 14.5
    @@ -264,20 +264,20 @@ 

    Chapter 11 Session informationChapter 11 Session informationChapter 11 Session informationChapter 11 Session information

    +## ───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────