diff --git a/.github/workflows/ci-test.yml b/.github/workflows/ci-test.yml
index 41d69118..a501f27c 100644
--- a/.github/workflows/ci-test.yml
+++ b/.github/workflows/ci-test.yml
@@ -14,7 +14,7 @@ jobs:
     strategy:
       matrix:
         os: [ubuntu-latest]
-        python-version: ['3.7', '3.8', '3.9']
+        python-version: ['3.8', '3.9', '3.10', '3.11']
 
     steps:
     - uses: actions/checkout@v2
@@ -48,10 +48,6 @@ jobs:
     - name: One sample input test
       run: |
         bash tests/run_one_sample.sh
-    - name: Hashing CITE-Seq pipeline test
-      run: |
-        bash tests/run_hashing_citeseq.sh
-        pytest tests/test_hashing_citeseq.py
     - name: iNMF test
       run: |
         bash tests/run_inmf.sh
diff --git a/.readthedocs.yml b/.readthedocs.yml
index 7e053cac..ae4157db 100644
--- a/.readthedocs.yml
+++ b/.readthedocs.yml
@@ -1,6 +1,35 @@
+# Read the Docs configuration file for Sphinx projects
+# See https://docs.readthedocs.io/en/stable/config-file/v2.html for details
+
+# Required
+version: 2
+
+# Set the OS, Python version and other tools you might need
 build:
-  image: latest
+  os: ubuntu-22.04
+  tools:
+    python: "3.9"
+    # You can also specify other tool versions:
+    # nodejs: "20"
+    # rust: "1.70"
+    # golang: "1.20"
+
+# Build documentation in the "docs/" directory with Sphinx
 sphinx:
   configuration: docs/conf.py
+  # You can configure Sphinx to use a different builder, for instance use the dirhtml builder for simpler URLs
+  # builder: "dirhtml"
+  # Fail on all warnings to avoid broken references
+  # fail_on_warning: true
+
+# Optionally build your docs in additional formats such as PDF and ePub
+# formats:
+#    - pdf
+#    - epub
+
+# Optional but recommended, declare the Python requirements required
+# to build your documentation
+# See https://docs.readthedocs.io/en/stable/guides/reproducible-builds.html
 python:
-  version: 3.8
+  install:
+    - requirements: docs/requirements.txt
\ No newline at end of file
diff --git a/docs/api/index.rst b/docs/api/index.rst
index 4311a096..540af78e 100644
--- a/docs/api/index.rst
+++ b/docs/api/index.rst
@@ -85,6 +85,7 @@ Cluster Algorithms
     cluster
     louvain
     leiden
+    split_one_cluster
     spectral_louvain
     spectral_leiden
 
diff --git a/docs/conf.py b/docs/conf.py
index 5ca8a416..fc3d4cdf 100644
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -23,22 +23,22 @@
 # -- Project information -----------------------------------------------------
 
 project = "Pegasus"
-copyright = "2022 Genentech, Inc. All rights reserved."
+copyright = "2024 Genentech, Inc. All rights reserved."
 author = (
     "Yiming Yang, Joshua Gould and Bo Li"
 )
 
 # The short X.Y version
-version = "1.7"
+version = "1.9"
 # The full version, including alpha/beta/rc tags
-release = "1.7.1"
+release = "1.9.0"
 
 
 # -- General configuration ---------------------------------------------------
 
 # If your documentation needs a minimal Sphinx version, state it here.
 #
-#needs_sphinx = '1.7'
+#needs_sphinx = '1.8'
 
 # Add any Sphinx extension module names here, as strings. They can be
 # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
diff --git a/docs/index.rst b/docs/index.rst
index 9289f3c9..c8d37d11 100644
--- a/docs/index.rst
+++ b/docs/index.rst
@@ -10,7 +10,7 @@
 Release Highlights in Current Stable
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
-.. include:: release_notes/version_1_7.rst
+.. include:: release_notes/version_1_9.rst
 
 .. toctree::
    :maxdepth: 1
diff --git a/docs/release_notes.rst b/docs/release_notes.rst
index 407a6f5c..7a2690a8 100644
--- a/docs/release_notes.rst
+++ b/docs/release_notes.rst
@@ -6,6 +6,16 @@ Release Notes
 .. note::
     Also see the release notes of `PegasusIO <https://pegasusio.readthedocs.io/en/stable/release_notes.html>`__.
 
+Version 1.9
+~~~~~~~~~~~~~
+
+.. include:: release_notes/version_1_9.rst
+
+Version 1.8
+~~~~~~~~~~~~~
+
+.. include:: release_notes/version_1_8.rst
+
 Version 1.7
 ~~~~~~~~~~~~~
 
diff --git a/docs/release_notes/version_1_8.rst b/docs/release_notes/version_1_8.rst
new file mode 100644
index 00000000..4b0947ae
--- /dev/null
+++ b/docs/release_notes/version_1_8.rst
@@ -0,0 +1,21 @@
+1.8.1 :small:`August 23, 2023`
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+* Bug fix in cell marker JSON files for ``infer_cell_types`` function.
+
+1.8.0 :small:`July 21, 2023`
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+**New Feature and Improvement**
+
+* Update ``human_immune`` and ``human_lung`` marker sets.
+* Add ``mouse_liver`` marker set.
+* Add `split_one_cluster <./api/pegasus.split_one_cluster.html>`_ function to subcluster one cluster into a specified number of subclusters.
+* Update **neighbors** function to set ``use_cache=False`` by default, and adjust K to ``min(K, int(sqrt(n_samples)))``. [PR `272 <https://github.com/lilab-bcb/pegasus/pull/272>`_]
+* In **infer_doublets** function, argument ``manual_correction`` now accepts a float number threshold specified by users for cut-off. [PR `275 <https://github.com/lilab-bcb/pegasus/pull/275>`_]
+
+**Bug Fix**
+
+* Fix divide by zero issue in ``integrative_nmf`` function. [PR `258 <https://github.com/lilab-bcb/pegasus/pull/258>`_]
+* Compatibility with Pandas v2.0. [PR `261 <https://github.com/lilab-bcb/pegasus/pull/261>`_]
+* Allow ``infer_doublets`` to use any count matrix with key name specified by users. [PR `268 <https://github.com/lilab-bcb/pegasus/pull/268>`_ Thanks to `Donghoon Lee <https://github.com/hoondy>`_]
diff --git a/docs/release_notes/version_1_9.rst b/docs/release_notes/version_1_9.rst
new file mode 100644
index 00000000..fa61e2f3
--- /dev/null
+++ b/docs/release_notes/version_1_9.rst
@@ -0,0 +1,14 @@
+1.9.0 :small:`January 19, 2024`
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+**New Feature and Improvement**
+
+* ``calculate_z_score`` works with sparse count matrix. [PR `276 <https://github.com/lilab-bcb/pegasus/pull/276>`_ Thanks to `Jayaram Kancherla <https://github.com/jkanche>`_]
+* Plotting functions (``scatter``, ``dotplot``, ``violin``, ``heatmap``) now give warnings on genes/attributes not existing in the data, and skip them in the plots.
+* Improve ``heatmap``:
+
+  * Add ``show_sample_name`` parameter for cases of pseudo-bulk data, nanoString DSP data, etc.
+  * Use SciPy's linkage (``scipy.cluster.hierarchy.linkage``) for dendrograms to use its optimal ordering feature for better results (see ``groupby_optimal_ordering`` parameter).
+
+* Update human lung and mouse immune markers used by ``infer_cell_types`` function.
+* Expose ``online_batch_size`` parameter in ``nmf`` and ``integrative_nmf`` functions.
diff --git a/docs/requirements.txt b/docs/requirements.txt
index f6857eea..e714db1c 100644
--- a/docs/requirements.txt
+++ b/docs/requirements.txt
@@ -14,7 +14,6 @@ natsort
 joblib
 psutil
 numba
-importlib_metadata; python_version < '3.8'
 umap-learn
 forceatlas2-python
 pyarrow
diff --git a/pegasus/__init__.py b/pegasus/__init__.py
index ae574a32..3e0d62bc 100644
--- a/pegasus/__init__.py
+++ b/pegasus/__init__.py
@@ -65,6 +65,7 @@
     de_analysis,
     markers,
     write_results_to_excel,
+    cluster_specific_markers,
     find_markers,
     infer_path,
     calc_signature_score,
diff --git a/pegasus/annotate_cluster/annotate_cluster.py b/pegasus/annotate_cluster/annotate_cluster.py
index 5ce05357..22caea43 100644
--- a/pegasus/annotate_cluster/annotate_cluster.py
+++ b/pegasus/annotate_cluster/annotate_cluster.py
@@ -278,7 +278,9 @@ def infer_cell_types(
             * ``'mouse_immune'`` for mouse immune cells;
             * ``'human_brain'`` for human brain cells;
             * ``'mouse_brain'`` for mouse brain cells;
-            * ``'human_lung'`` for human lung cells.
+            * ``'human_lung'`` for human lung cells;
+            * ``'mouse_lung'`` for mouse lung cells;
+            * ``'mouse_liver'`` for mouse liver cells.
         * If ``Dict``, it refers to a Python dictionary describing the markers.
 
     de_test: ``str``, optional, default: ``"mwu"``
@@ -320,6 +322,8 @@ def infer_cell_types(
         human_brain="human_brain_cell_markers.json",
         mouse_brain="mouse_brain_cell_markers.json",
         human_lung="human_lung_cell_markers.json",
+        mouse_lung="mouse_lung_cell_markers.json",
+        mouse_liver="mouse_liver_cell_markers.json",
     )
 
     if isinstance(markers, str):
diff --git a/pegasus/annotate_cluster/human_immune_cell_markers.json b/pegasus/annotate_cluster/human_immune_cell_markers.json
index 60eaa54c..a4592a6f 100644
--- a/pegasus/annotate_cluster/human_immune_cell_markers.json
+++ b/pegasus/annotate_cluster/human_immune_cell_markers.json
@@ -19,52 +19,117 @@
 				"title" : "T cell subtype markers",
 				"cell_types" : [
 					{
-						"name" : "T helper cell",
+						"name" : "CD4 Naive T cell",
 						"markers" : [
 							{
-								"genes" : ["CD4+"],
+								"genes" : ["CD4+", "CCR7+", "SELL+", "LEF1+", "FHIT+", "ACTN1+", "LDLRAP1+", "TMIGD2+", "TRABD2A+", "LRRN3+"],
 								"weight" : 1.0,
-								"comment" : "CD4+ T cell"
+								"comment" : "Markers derived from Immune Cell Atlas PBMC data"
 							}
-						]
+ 						]
 					},
 					{
-						"name" : "Cytotoxic T cell",
+						"name" : "CD4 TCM",
 						"markers" : [
 							{
-								"genes" : ["CD8A+", "CD8B+"],
+								"genes" : ["CD4+", "GPR183+", "CD69+", "PASK+", "LIMS1+", "LPAR6+", "SLC2A3+", "SOCS3+"],
 								"weight" : 1.0,
-								"comment" : "CD8+ T cell"
+								"comment" : "Markers derived from Immune Cell Atlas PBMC data"
 							}
-						]
+						] 
+					},
+					{
+						"name" : "CD4 TEM",
+						"markers" : [
+							{
+								"genes" : ["CD4+", "KLRB1+", "ANXA2+", "LGALS1+", "TIMP1+", "PTGER2+", "AHNAK+", "TNFRSF4+", "YWHAH+", "CD63+"],
+								"weight" : 1.0,
+								"comment" : "Markers derived from Immune Cell Atlas PBMC data"
+							}
+						] 
 					},
 					{
 						"name" : "T regulatory cell",
 						"markers" : [
 							{
-								"genes" : ["FOXP3+", "IL2RA+"],
-								"weight" : 0.7,
-								"comments" : "key T reg markers"
-							},
+								"genes" : ["RTKN2+", "FOXP3+", "IL2RA+", "HACD1+", "AC133644.2+", "FANK1+", "DUSP4+", "STAM+", "CCR10+", "CTLA4+"],
+								"weight" : 1.0,
+								"comments" : "Markers derived from Immune Cell Atlas PBMC data"
+							}
+						]
+					},
+					{
+						"name" : "CD4 CTL",
+						"markers" : [
 							{
-								"genes" : ["CD4+"],
+								"genes" : ["CD4+", "CD8A-", "CD8B-"],
 								"weight" : 0.3,
-								"comment" : "key markers that do not express heavily in droplet-based RNA-Seq"
+								"comments" : "Must be CD4 T"
+							},
+							{
+								"genes" : ["GNLY+", "AGAP1+", "ZNF683+", "RGS9+", "IL5RA+", "LAIR2+", "MTERF2+", "SH3RF2+", "RGS17+"],
+								"weight" : 0.7,
+								"comments" : "CD4 CTL markers that might also be expressed by CD8 TEM"
 							}
 						]
 					},
 					{
-						"name" : "Naive T cell",
+						"name" : "T follicular helper cell",
 						"markers" : [
 							{
-								"genes" : ["CCR7+", "SELL+", "IL7R+", "TCF7+", "CD27+"],
-								"weight" : 0.7,
-								"comment" : "positive markers"
-							},
+								"genes" : ["CD4+", "ST8SIA1+", "PDCD1+", "TIGIT+", "TOX2+", "ICOS+", "SH2D1A+", "IL21+"],
+								"weight" : 1.0,
+								"comments" : "Tfh markers"
+							}
+						]
+					},										
+					{
+						"name" : "CD8 Naive T cell",
+						"markers" : [
 							{
-								"genes" : ["IL2RA-", "CD44-", "CD69-"],
-								"weight" : 0.3,
-								"comment" : "negative markers"
+								"genes" : ["CD8A+", "CD8B+", "CCR7+", "SELL+", "LEF1+", "ACTN1+", "TRABD2A+", "LRRN3+", "LINC02446+", "S100B+", "CLEC11A+", "NELL2+", "PASK+", "APBA2+"],
+								"weight" : 1.0,
+								"comment" : "Markers derived from Immune Cell Atlas PBMC data"
+							}
+						]
+					},
+					{
+						"name" : "CD8 TCM",
+						"markers" : [
+							{
+								"genes" : ["CD8A+", "CD8B+", "GZMK+", "DUSP2+", "LTB+", "CD27+", "IL7R+", "GPR183+", "RGS1+", "CXCR3+"],
+								"weight" : 1.0,
+								"comment" : "Markers derived from Immune Cell Atlas PBMC data; CD8A & CD8B are CD8 markers; All others are CD8 TCM specific markers"
+							}
+						]
+					},
+					{
+						"name" : "CD8 TEM",
+						"markers" : [
+							{
+								"genes" : ["CD8A+", "CD8B+", "FGFBP2+", "GZMB+", "GZMH+", "GNLY+", "PRF1+", "KLRD1+", "FCGR3A+", "TBX21+", "CX3CR1+", "ASCL2+", "SPON2+", "ADGRG1+", "PRSS23+"],
+								"weight" : 1.0,
+								"comment" : "Markers derived from Immune Cell Atlas PBMC data; FGFBP2, GZMB, GZMH, GNLY, PRF1, KLRD1, FCGR3A are pan TEM markers; TBX21, CX3CR1 and ASCL2 are Temra markers; the last three are purely data driven markers"
+							}
+						]
+					},
+					{
+						"name" : "MAIT",
+						"markers" : [
+							{
+								"genes" : ["SLC4A10+", "KLRB1+", "NCR3+", "CEBPD+", "GPR65+", "LST1+", "CXCR6+", "TRAV1-2+"],
+								"weight" : 1.0,
+								"comment" : "Markers derived from Immune Cell Atlas PBMC data"
+							}
+ 						]
+					},
+					{
+						"name" : "Gamma-delta T cell",
+						"markers" : [
+							{
+								"genes" : ["TRDC+", "TRGC1+", "TRGC2+", "KLRC1+", "KLRD1+", "GNLY+"],
+								"weight" : 1.0,
+								"comment" : "Markers derived from Immune Cell Atlas PBMC data"
 							}
  						]
 					}
@@ -73,24 +138,66 @@
 		},
 
 		{
-			"name" : "B cell",
+			"name" : "Natural killer cell",
 			"markers" : [
 				{
-					"genes" : ["CD19+", "MS4A1+", "CD79A+", "CD79B+"],
-					"weight" : 0.7,
-					"comment" : "CD19, CD20 and CD79"
+					"genes" : ["GNLY+", "KLRF1+", "KLRD1+", "TRDC+", "IL2RB+", "KLRC1+", "NCR1+"],
+					"weight" : 0.6,
+					"comment" : "General NK cell markers also cover some T cells; derived from Immune Cell Atlas PBMC, BMMC and CB-MNC data; Added NCR1, a pan NK cell marker"
 				},
 				{
-					"genes" : ["BANK1+", "BLK+"],
+					"genes" : ["NCAM1+", "FCGR3A+"],
 					"weight" : 0.2,
-					"comment" : "Extra B cell markers"
+					"comment" : "NK subtype markers"
 				},
 				{
-					"genes" : ["CD74+", "HLA-DRA+", "HLA-DRB1+", "HLA-DPA1+", "HLA-DPB1+", "HLA-DQA1+", "HLA-DQB1+"],
-					"weight" : 0.1,
-					"comment" : "MHC II"
+					"genes" : ["CD3D-", "CD3E-", "CD3G-"],
+					"weight" : 0.2,
+					"comment" : "No T cell markers"
 				}
 			],
+			"subtypes" : {
+				"title" : "NK cell subtype markers",
+				"cell_types" : [
+					{
+						"name" : "CD56-dim NK cell",
+						"markers" : [
+							{
+								"genes" : ["FCGR3A+", "FGFBP2+", "SPON2+", "MYOM2+", "S1PR5+", "CX3CR1+", "AKR1C3+", "FCRL6+", "LAIR2+", "PRSS23+"],
+								"weight" : 1.0,
+								"comment" : "Cytotoxic NK cell markers derived from Immune Cell Atlas PBMC, BMMC and CB-MNC data"
+							}
+						]
+					},
+					{
+						"name" : "CD56-bright NK cell",
+						"markers" : [
+							{
+								"genes" : ["NCAM1+", "GZMK+", "XCL1+", "SPTSSB+", "CAPG+", "IL7R+", "GPR183+", "IGFBP4+", "SPINK2+", "FUT7+"],
+								"weight" : 1.0,
+								"comment" : "Regulatory NK cell markers derived from Immune Cell Atlas PBMC, BMMC and CB-MNC data; CD56bright develops into CD56dim"
+							}
+						]
+					}
+				],
+				"comment": "There is also a CD56_dim CD16_dim population between the CD56-dim and CD56-bright subtypes."
+			}
+		},
+
+		{
+			"name" : "B cell",
+			"markers" : [
+				{
+					"genes" : ["MS4A1+", "CD79A+", "CD79B+", "CD19+", "BANK1+", "TNFRSF13C+", "CD22+", "BLK+", "FCRLA+", "HLA-DOB+"],
+					"weight" : 0.9,
+					"comment" : "Human and mouse shared B cell markers derived from Immune Cell Atlas PBMC, BMMC and CB-MNC data; TNFRSF13C (BAFF receptor); CD79A, CD79B, CD19, BLK, FCRLA and HLA-DOB are also expressed in Plasma cells; CD79B in addition is expressed in CD16+ monocytes & HSCs; BANK1 & BLK are expressed higher in memory B"
+				},
+				{
+					"genes" : ["LINC00926+", "VPREB3+"],
+					"weight" : 0.1,
+					"comment" : "B cell markers derived from Immune Cell Atlas PBMC, BMMC and CB-MNC data"
+				}			
+			],
 			"subtypes" : {
 				"title" : "B cell subtype markers",
 				"cell_types" : [
@@ -123,9 +230,9 @@
 						"name" : "Naive B cell",
 						"markers" : [
 							{
-								"genes" : ["IGHD+", "TCL1A+", "FCER2+"],
+								"genes" : ["IGHD+", "TCL1A+", "FCER2+", "IL4R+", "PLPP5+"],
 								"weight" : 1.0,
-								"comments" : "markers for naive B cell, collected from Fig. 4B of Massoni-Badosa et al. Tonsil Atlas paper. Validated using ICA pbmc data"
+								"comments" : "Markers for naive B cell derived from Immune Cell Atlas PBMC, BMMC and CB-MNC data; IGHD & FCER2 are shared with mouse"
 							}
 						]
 					},
@@ -133,9 +240,9 @@
 						"name" : "Memory B cell",
 						"markers" : [
 							{
-								"genes" : ["CD27+", "TNFRSF13B+"],
+								"genes" : ["IGHA1+", "IGHG1+", "CD27+", "TNFRSF13B+", "CLECL1P+", "AIM2+", "LGALS1+", "CRIP1+"],
 								"weight" : 1.0,
-								"comments" : "markers for memory B cell, collected from Fig. 4B of Massoni-Badosa et al. Tonsil Atlas paper. Validated using ICA pbmc data"
+								"comments" : "Markers for memory B cell derived from Immune Cell Atlas PBMC, BMMC and CB-MNC data"
 							}
 						]
 					}
@@ -147,19 +254,9 @@
 			"name" : "Germinal Center B cell",
 			"markers" : [
 				{
-					"genes" : ["CD19+", "MS4A1+", "CD79A+", "CD79B+"],
-					"weight" : 0.3,
-					"comment" : "CD19, CD20 and CD79"
-				},
-				{
-					"genes" : ["CD38+", "BCL6+", "BCL2-", "RGS13+"],
-					"weight" : 0.6,
-					"comment" : "First 3 markers are from Klein et al. PNAS 2003 https://doi.org/10.1073/pnas.0437996100 (Fig. 1 & 2). The last marker is from XXX"
-				},
-				{
-					"genes" : ["PCNA+", "MKI67+"],
-					"weight" : 0.1,
-					"comment" : "From Klein et al. PNAS 2003 https://doi.org/10.1073/pnas.0437996100 (Fig. 2)"
+					"genes" : ["MEF2B+", "NEIL1+", "RGS13+", "ELL3+", "BCL7A+", "BCL6+", "NUGGC+", "MYBL1+", "EML6+", "FANCA+"],
+					"weight" : 1.0,
+					"comment" : "GC B cell markers"
 				}
 			],
 			"subtypes" : {
@@ -169,9 +266,9 @@
 						"name" : "Dark zone B cell",
 						"markers" : [
 							{
-								"genes" : ["CXCR4+", "AICDA+", "FOXP1+", "MME+"],
+								"genes" : ["NUSAP1+", "NCAPG+", "AURKB+", "HMGB2+", "HJURP+", "TOP2A+"],
 								"weight" : 1.0,
-								"comment" : "Fig. 4B of Massoni-Badosa et al. Tonsil Atlas paper"
+								"comment" : "DZ B cell markers"
 							}
 						]
 					},
@@ -190,38 +287,28 @@
 		},
 
 		{
-			"name" : "Natural killer cell",
+			"name" : "Plasma cell",
 			"markers" : [
 				{
-					"genes" : ["NCAM1+"],
-					"weight" : 0.2,
-					"comment" : "CD56"
+					"genes" : ["TNFRSF17+", "PRDM1+", "SLAMF7+", "IRF4+", "SDC1+"],
+					"weight" : 0.5,
+					"comment" : "Human and mouse shared markers derived from Immune Cell Atlas PBMC, BMMC and CB-MNC data; TNFRSF17 (BCMA), PRDM1 (BLIMP1); SDC1 is highly expressed in BMMC but not PBMC"
 				},
 				{
-					"genes" : ["NKG7+"],
+					"genes" : ["IGHA1+", "IGHG1+", "TNFRSF13B+"],
 					"weight" : 0.2,
-					"comment" : "natural killer cell granule protein 7"
-				},
-				{
-					"genes" : ["KLRB1+", "KLRD1+", "KLRF1+", "KLRC1+", "KLRC2+", "KLRC3+", "KLRC4+"],
-					"weight" : 0.25,
-					"comment" : "killer cell lectin like receptors"
-				},
-				{
-					"genes" : ["CD3D-", "CD3E-", "CD3G-"],
-					"weight" : 0.15,
-					"comment" : "not T cell"
+					"comment" : "Markers expressed by both plasma and memory B cells, derived from Immune Cell Atlas PBMC, BMMC and CB-MNC data; IGHA1 & IGHG1 indicate class switch"
 				},
 				{
-					"genes" : ["FCGR3A+"],
-					"weight" : 0.1,
-					"comment" : "CD16a"
+					"genes" : ["CD38+", "ABCB9+", "CHPF+", "PLAAT2+"],
+					"weight" : 0.2,
+					"comment" : "Human-specific plasma markers derived from Immune Cell Atlas PBMC, BMMC and CB-MNC data; PLAAT2 is highly expressed in PBMC but not BMMC"
 				},
 				{
-					"genes" : ["ITGAL+", "ITGAM+"],
+					"genes" : ["MS4A1-"],
 					"weight" : 0.1,
-					"comment" : "CD11a,CD11b"
-				}
+					"comment" : "not B cell, doi: https://doi.org/10.1182/bloodadvances.2017004481, long-live plasma can still express CD19"
+				}				
 			]
 		},
 
@@ -270,42 +357,40 @@
 		},
 
 		{
-			"name" : "Plasmacytoid dendritic cell",
+			"name" : "Migratory dendritic cell",
 			"markers" : [
 				{
-					"genes" : ["JCHAIN+", "LILRA4+", "GZMB+", "MZB1+", "IL3RA+", "SERPINF1+", "ITM2C+", "IRF7+"],
+					"genes" : ["FSCN1+", "CCR7+", "LAMP3+", "CCL19+", "CCL22+", "CD40+", "BIRC3+"],
 					"weight" : 1.0,
-					"comment" : "important pDC markers"
+					"comment" : "Xing et al. Science Advances 2021 Table S2 (DCs-C3)"
 				}
 			]
 		},
 
 		{
-			"name" : "Plasma cell",
+			"name" : "Plasmacytoid dendritic cell",
 			"markers" : [
 				{
-					"genes" : ["CD38+", "XBP1+", "CD27+", "SLAMF7+"],
-					"weight" : 0.4,
-					"comment" : "important markers"
-				},
-				{
-					"genes" : ["TNFRSF17+", "TNFRSF13B+"],
-					"weight" : 0.2,
-					"comment" : "TNF-receptor superfamily"
-				},
-				{
-					"genes" : ["IGHA1+", "IGHG1+"],
-					"weight" : 0.2,
-					"comment" : "class switching happened"
-				},
+					"genes" : ["LILRA4+", "SERPINF1+", "IL3RA+", "TPM2+", "SCT+", "UGCG+", "CLEC4C+", "LRRC26+", "SMPD3+", "AC119428.2+"],
+					"weight" : 1.0,
+					"comment" : "Markers derived from Immune Cell Atlas PBMC, BM and CB data"
+				}
+			]
+		},
+
+
+		{
+			"name" : "Follicular dendritic cell",
+			"markers" : [
 				{
-					"genes" : ["MS4A1-"],
-					"weight" : 0.2,
-					"comment" : "not B cell, doi: https://doi.org/10.1182/bloodadvances.2017004481, long-live plasma can still express CD19"
-				}				
+					"genes" : ["CXCL13+", "FCAMR+", "FDCSP+", "SERPINE2+", "PAPPA+", "NPHS1+", "PKDCC+", "SYNM+", "NRG2+", "CDC42EP4+", "MUC3A+", "PRUNE2+", "B4GALNT4+", "NPPC+", "SLC1A2+", "TMEM150C+"],
+					"weight" : 1.0,
+					"comment" : "fDC markers"
+				}
 			]
 		},
 
+
 		{
 			"name" : "Hematopoietic stem cell",
 			"markers" : [
@@ -318,7 +403,7 @@
 		},
 
 		{
-			"name" : "Erythroid cells",
+			"name" : "Erythroid cell",
 			"markers" : [
 				{
 					"genes" : ["GYPA+"], 
@@ -344,7 +429,7 @@
 		},
 
 		{
-			"name" : "Megakaryocyte",
+			"name" : "Platelet",
 			"markers" : [
 				{
 					"genes" : ["PF4+", "PPBP+", "GP5+"],
@@ -359,13 +444,46 @@
 			]
 		},
 
+		{
+			"name" : "Pro-Neutrophil",
+			"markers" : [
+				{
+					"genes" : ["DEFA3+", "DEFA4+", "AZU1+", "MS4A3+", "ELANE+", "SLPI+", "CEACAM6+", "RNASE3+", "PRTN3+", "MPO+", "AC104232.1+", "CTSG+"],
+					"weight" : 1.0,
+					"comment" : "Pro-Neutrophil markers validated using 10x public whole blood dataset"
+				}
+			]
+		},
+
+		{
+			"name" : "Pre-Neutrophil",
+			"markers" : [
+				{
+					"genes" : ["LTF+", "LCN2+", "MMP8+", "CRISP3+", "CAMP+", "PGLYRP1+", "CD177+", "HP+"],
+					"weight" : 1.0,
+					"comment" : "Pre-Neutrophil markers validated using 10x public whole blood dataset"
+				}
+			]
+		},
+
 		{
 			"name" : "Neutrophil",
 			"markers" : [
 				{
-					"genes" : ["FUT4+", "MPO+", "CEACAM8+", "ELANE+", "CXCR1+", "CXCR2+", "LY6G6D+"],
+					"genes" : ["CSF3R+", "G0S2+", "LUCAT1+", "EPHB1+", "TNFRSF10C+", "IL1R2+", "KCNJ15+", "FCGR3B+", "AC007032.1+", "HSD11B1-AS1+"],
 					"weight" : 1.0,
-					"comment" : "key markers"
+					"comment" : "Neutrophil markers validated using 10x public whole blood dataset"
+				}
+			]
+		},
+
+		{
+			"name" : "Basophil",
+			"markers" : [
+				{
+					"genes" : ["AKAP12+", "HDC+", "GATA2+", "ENPP3+", "CA8+", "ITGB8+", "GCSAML+", "CRPPA+", "AC111000.4+", "LINC02223+"],
+					"weight" : 1.0,
+					"comment" : "Basophil markers validated using 10x public whole blood dataset"
 				}
 			]
 		},
diff --git a/pegasus/annotate_cluster/human_lung_cell_markers.json b/pegasus/annotate_cluster/human_lung_cell_markers.json
index 4e18b1ad..42cc1eb8 100644
--- a/pegasus/annotate_cluster/human_lung_cell_markers.json
+++ b/pegasus/annotate_cluster/human_lung_cell_markers.json
@@ -5,9 +5,9 @@
 			"name" : "Alveolar type I cell",
 			"markers" : [
 				{
-					"genes" : ["AGER+", "CAV1+", "RTKN2+", "MYL9+", "SPOCK2+", "ANXA3+", "TIMP3+", "CAV2+", "ST6GALNAC5+", "MYRF+"],
+					"genes" : ["AGER+", "SPOCK2+", "RTKN2+", "TNNC1+", "SCEL+", "CLIC5+", "NCKAP5+", "ARHGEF26+", "GGTLC1+", "ITLN2+", "MS4A15+"],
 					"weight" : 1.0,
-					"comment" : "AT1 markers from Schupp et al., Travaglini et al. and Tony et al."
+					"comment" : "Markers inferred from Travaglini et al. Nature 2020 and confirmed by Schupp et al. Circulation 2021"
 				}
 			]
 		},
@@ -16,9 +16,9 @@
 			"name" : "Alveolar type II cell",
 			"markers" : [
 				{
-					"genes" : ["SFTPA1+", "SFTPA2+", "SFTPC+", "ETV5+", "TTN+", "PLA2G4F+", "CCDC141+", "LAMP3+", "ABCA3+", "HHIP+"],
+					"genes" : ["SFTPA1+", "SFTPA2+", "SFTPC+", "PGC+", "LAMP3+", "FASN+", "HHIP+", "ETV5+", "RASGRF1+", "ABCA3+"],
 					"weight" : 1.0,
-					"comment" : "AT2 markers from Schupp et al., Travaglini et al. and Tony et al."
+					"comment" : "Markers inferred from Travaglini et al. Nature 2020 and confirmed by Schupp et al. Circulation 2021"
 				}
 			]
 		},
@@ -27,9 +27,9 @@
 			"name" : "Basal cell",
 			"markers" : [
 				{
-					"genes" : ["KRT5+", "KRT15+", "KRT17+", "TP63+", "S100A2+", "TNS4+"],
+					"genes" : ["KRT17+", "S100A2+", "MIR205HG+", "KRT15+", "KRT5+", "DLK2+", "CDH3+", "TP63+", "TNS4+"],
 					"weight" : 1.0,
-					"comment" : "Basal cell markers from Schupp et al., Travaglini et al. and Tony et al."
+					"comment" : "Markers inferred from Travaglini et al. Nature 2020 and confirmed by Schupp et al. Circulation 2021"
 				}
 			]
 		},
@@ -38,9 +38,9 @@
 			"name" : "Club cell",
 			"markers" : [
 				{
-					"genes" : ["SCGB3A2+", "MGP+", "VIM+", "CST3+"],
+					"genes" : ["SCGB3A2+", "MGP+", "CTSE+"],
 					"weight" : 1.0,
-					"comment" : "Club cell markers from Schupp et al., Travaglini et al. and Tony et al."
+					"comment" : "Markers inferred from Travaglini et al. Nature 2020 and confirmed by Schupp et al. Circulation 2021"
 				}
 			]
 		},
@@ -49,9 +49,9 @@
 			"name" : "Ciliated cell",
 			"markers" : [
 				{
-					"genes" : ["ERICH3+", "SNTN+", "CCDC78+", "SNTN+", "ZBBX+", "DNAI1+", "ARMC3+", "CFAP157+", "TTC29+", "CFAP73+"],
+					"genes" : ["ERICH3+", "ARMC3+", "DNAI2+", "ZBBX+", "VWA3B+", "RGS22+", "TTC29+", "CDHR4+", "PPP1R42+", "CFAP46+", "CFAP52+", "CFAP73+", "CFAP77+", "CFAP157+", "DNAH3+", "DNAH9+", "ADGB+", "SNTN+", "CCDC170+", "C6orf118+"],
 					"weight" : 1.0,
-					"comment" : "Ciliated cell markers from Schupp et al., Travaglini et al. and Tony et al."
+					"comment" : "Markers inferred from Travaglini et al. Nature 2020 and confirmed by Schupp et al. Circulation 2021"
 				}
 			]
 		},
@@ -60,9 +60,9 @@
 			"name" : "Goblet cell",
 			"markers" : [
 				{
-					"genes" : ["MUC5AC+", "MUC5B+", "BPIFB1+", "MSMB+", "FAM3D+", "SERPINB11+", "CXCL6+", "SCGB1A1+", "FAM3D+", "SERPINB3+"],
+					"genes" : ["MUC5AC+", "MUC5B+", "BPIFB1+", "MSMB+", "SERPINB11+", "CYP2F1+"],
 					"weight" : 1.0,
-					"comment" : "Goblet cell markers from Schupp et al., Travaglini et al. and Tony et al."
+					"comment" : "Markers inferred from Travaglini et al. Nature 2020 and confirmed by Schupp et al. Circulation 2021"
 				}
 			]
 		},
@@ -71,9 +71,9 @@
 			"name" : "Ionocyte",
 			"markers" : [
 				{
-					"genes" : ["FOXI1+", "ASCL3+", "CLDN25+", "ATP6V1G3+", "LINC01187+"],
+					"genes" : ["ASCL3+", "CLCNKB+", "FOXI1+", "ATP6V1G3+", "TMPRSS11E+", "BSND+", "LINC01187+", "CLDN25+"],
 					"weight" : 1.0,
-					"comment" : "Ionocyte markers from Travaglini et al."
+					"comment" : "Markers inferred from Travaglini et al. Nature 2020 and confirmed by Schupp et al. Circulation 2021"
 				}
 			]
 		},
@@ -82,9 +82,20 @@
 			"name" : "Plumonary neuroendocrine cell",
 			"markers" : [
 				{
-					"genes" : ["CALCA+", "CHGA+", "ASCL1+", "SLC35D3+", "KIF1A+"],
+					"genes" : ["CHGA+", "CHGB+", "SCGN+", "SCG5+", "CPLX2+", "GRP+", "ASCL1+", "INSM1+"],
 					"weight" : 1.0,
-					"comment" : "Plumonary neuroendocrien cell markers from Travaglini et al."
+					"comment" : "Markers inferred from Travaglini et al. Nature 2020 and confirmed by Schupp et al. Circulation 2021"
+				}
+			]
+		},
+
+		{
+			"name" : "Submucosal gland serous cell",
+			"markers" : [
+				{
+					"genes" : ["PRR4+", "TCN1+", "C6orf58+", "PRB3+", "LPO+", "PRB1+", "PRH2+", "PRH1+", "ODAM+"],
+					"weight" : 1.0,
+					"comment" : "Markers inferred from Travaglini et al. Nature 2020"
 				}
 			]
 		},
@@ -102,42 +113,35 @@
 
 
 
-
-
 		{
 			"name" : "Vascular endothelial cell",
 			"markers" : [
 				{
-					"genes" : ["PECAM1+", "CLDN5+", "CDH5+", "ERG+"],
-					"weight" : 0.2,
-					"comment" : "Markers for endothelial cells, from Schupp et al. Circulation 2021"
-				},
-				{
-					"genes" : [ "ENG+", "PCDH17+", "CLEC14A+", "ESAM+", "ITM2A+", "BMPR2+", "FLT1+", "ADGRL4+", "SLCO2A1+", "AQP1+", "EPAS1+", "ADGRL2+", "IFI27+"],
-					"weight" : 0.8,
-					"comment" : "Common vascular EC markers from Schupp et al. Circulation 2021 and ADGRL2"
+					"genes" : ["PECAM1+", "CLDN5+", "CDH5+", "ERG+", "ICAM2+", "CLEC14A+", "ITM2A+", "ADGRL4+", "SLCO2A1+", "IFI27+"],
+					"weight" : 1.0,
+					"comment" : "Markers for vascular endothelial cells, validated using Travaglini et al. Nature 2020 and Schupp et al. Circulation 2021 data"
 				}
 			],
 			"subtypes" : {
 				"title" : "Vascular endothelial cell subtype markers",
 				"cell_types" : [
 					{
-						"name" : "Aerocyte",
+						"name" : "EC artery",
 						"markers" : [
 							{
-								"genes" : ["EDNRB+", "TBX2+", "EDA+", "HPGD+", "PRKG1+", "RCSD1+", "CYP3A5+", "VWF-"],
+								"genes" : ["CXCL12+", "GJA5+", "DKK2+", "HEY1+", "IGFBP3+", "SERPINE2+", "EFNB2+", "BMX+"],
 								"weight" : 1.0,
-								"comment" : "Markers inferred from Xing et al. Science Advances 2021, Tony et al. Nature 2021 and Schupp et al. Circulation 2021"
+								"comment" : "Markers inferred from Travaglini et al. Nature 2020 and confirmed by Schupp et al. Circulation 2021"
 							}
 						]
 					},
 					{
-						"name" : "EC general capillary",
+						"name" : "EC vein",
 						"markers" : [
 							{
-								"genes" : ["VWF+", "EDN1+", "FCN3+", "CD36+", "GPIHBP1+", "NRXN3+", "BTNL8+"],
+								"genes" : ["CPE+", "C7+", "IL1R1+", "PLA1A+", "PTGIS+", "ABI3BP+", "CYP1B1+", "ADGRG6+"],
 								"weight" : 1.0,
-								"comment" : "Markers inferred from Xing et al. Science Advances 2021, Tony et al. Nature 2021 and Schupp et al. Circulation 2021"
+								"comments" : "Markers inferred from Travaglini et al. Nature 2020 and confirmed by Schupp et al. Circulation 2021"
 							}
 						]
 					},
@@ -145,79 +149,73 @@
 						"name" : "EC bronchial vessel",
 						"markers" : [
 							{
-								"genes" : ["SPRY1+", "PLVAP+", "VWA1+", "MPZL2+", "ESM1+"],
+								"genes" : ["SPRY1+", "PLVAP+", "VWA1+", "ABCB1+", "COL15A1+", "RUNDC3B+"],
 								"weight" : 1.0,
-								"comment" : "Markers from Travaglini et al. Nature 2020"
+								"comment" : "Markers inferred from Travaglini et al. Nature 2020 and confirmed by Schupp et al. Circulation 2021"
 							}
 						]
 					},
 					{
-						"name" : "EC pulmonary-venous",
+						"name" : "Aerocyte",
 						"markers" : [
 							{
-								"genes" : ["COL15A1+", "ZNF385D+", "EBF1+", "CPXM2+", "PLVAP+", "VWA1+", "SPRY1+"],
+								"genes" : ["HPGD+", "EDNRB+", "SOSTDC1+", "B3GALNT1+", "CYP3A5+", "TBX2+", "S100A3+", "IL1RL1+", "PRKG1+", "EXPH5+"],
 								"weight" : 1.0,
-								"comments" : "Markers inferred from Xing et al. Science Advances 2021, Tony et al. Nature 2021 and Schupp et al. Circulation 2021"
+								"comment" : "Markers inferred from Travaglini et al. Nature 2020 and confirmed by Schupp et al. Circulation 2021"
 							}
 						]
 					},
 					{
-						"name" : "EC systemic-venous",
+						"name" : "EC general capillary",
 						"markers" : [
 							{
-								"genes" : ["COL15A1-", "CPE+", "DKK3+", "EFEMP1+", "CDH11+", "PLAT+"],
+								"genes" : ["FCN3+", "IL7R+", "EDN1+", "GPIHBP1+", "SLC6A4+", "NTRK2+", "IL18R1+", "NRXN3+"],
 								"weight" : 1.0,
-								"comments" : "Markers inferred from Xing et al. Science Advances 2021, Tony et al. Nature 2021 and Schupp et al. Circulation 2021"
+								"comment" : "Markers inferred from Travaglini et al. Nature 2020 and confirmed by Schupp et al. Circulation 2021"
 							}
 						]
 					}
 				]
 			}
 		},
+
 		{
 			"name" : "Lymphatic endothelial cell",
 			"markers" : [
 				{
-					"genes" : ["PECAM1+", "CLDN5+", "CDH5+", "ERG+"],
+					"genes" : ["PECAM1+", "CLDN5+", "ERG+", "CDH5+"],
 					"weight" : 0.2,
-					"comment" : "Markers for endothelial cells, from Schupp et al. Circulation 2021"
+					"comment" : "Pan endothelial cell markers, validated using Travaglini et al. Nature 2020 and Schupp et al. Circulation 2021 data"
 				},
 				{
-					"genes" : ["CCL21+", "SEMA3D+", "PROX1+", "PDPN+", "MMRN1+", "RELN+", "PKHD1L1+", "TFF3+", "LYVE1+", "FLT4+", "TBX1+"],
+					"genes" : ["CCL21+", "TFF3+", "PDPN+", "PROX1+", "LYVE1+", "FLT4+", "GPM6A+", "SEMA3D+", "TBX1+", "RELN+"],
 					"weight" : 0.8,
-					"comment" : "Lymphatic-specific markers, from Schupp et al. Circulation 2021"
+					"comment" : "LEC markers inferred from Travaglini et al. Nature 2020 and confirmed by Schupp et al. Circulation 2021"
 				}
 			]
 		},
 
 
 
-
-
 		{
 			"name" : "Smooth muscle cell",
 			"markers" : [
 				{
-					"genes" : ["MYH11+", "TAGLN+", "ACTG2+", "CNN1+", "PLN+"],
-					"weight" : 0.8,
-					"comment" : "Markers from Muus et al., Braga et al. and Schupp et al."
-				},
-				{
-					"genes" : ["MYL9+", "TPM2+", "ACTA2+"],
-					"weight" : 0.2,
-					"comment" : "Markers that might also expressed in other stromal cell types"
+					"genes" : ["MYH11+", "ACTG2+", "CNN1+", "PLN+"],
+					"weight" : 1.0,
+					"comment" : "Markers inferred from Travaglini et al. Nature 2020 and confirmed by Schupp et al. Circulation 2021"
 				}
 			],
 			"subtypes" : {
 				"title" : "SMC subtype markers",
 				"cell_types" : [
 					{
-						"name" : "DES+",
+						"name" : "Airway smooth muscle cell",
 						"markers" : [
 							{
-								"genes" : ["DES+"],
+								"genes" : ["DES+", "TNNT2+", "RERGL+"],
 								"weight" : 1.0,
-								"comment" : "DES+ SMC"
+								"comment" : "Markers inferred from Travaglini et al. Nature 2020"
 							}
 						]
 					}
@@ -229,21 +227,10 @@
 			"name" : "Pericyte",
 			"markers" : [
 				{
-					"genes" : ["TRPC6+", "CSPG4+", "FAM162B+", "GJA4+", "GJC1+", "HIGD1B+", "CDH6+", "LAMC3+", "FHL5+"],
-					"weight" : 0.8,
-					"comment" : "Markers from Schupp et al. Circulation 2021 and Travaglini et al. Nature 2020"
-				},
-				{
-					"genes" : ["PDGFRB+", "TBX2+", "EBF1+"],
-					"weight" : 0.1,
-					"comment" : "Markers that are highly expressed in Pericytes but also expressed in fibroblast"
-				},
-				{
-					"genes" : [ "LGI4+", "KCNK17+", "CACNA1H+", "PTN+", "TESC+"],
-					"weight" : 0.1,
-					"comment" : "Markers that are lowly expressed"
+					"genes" : ["COX4I2+", "HIGD1B+", "NDUFA4L2+", "FAM162B+", "LAMC3+", "KCNK3+", "GJA4+", "GJC1+", "CSPG4+"],
+					"weight" : 1.0,
+					"comment" : "Markers inferred from Travaglini et al. Nature 2020 and confirmed by Schupp et al. Circulation 2021"
 				}
-
 			]
 		},
 
@@ -251,15 +238,15 @@
 			"name" : "Mesothelial cell",
 			"markers" : [
 				{
-					"genes" : ["WT1+", "VIPR2+", "ITLN1+", "LINC02360+", "BNC1+",  "AP000561.1+", "CALB2+", "HAS1+", "LINC01133+", "GALNT9+"],
+					"genes" : ["CPA4+", "ITLN1+", "GALNT9+", "BNC1+", "CALB2+", "WT1+", "UPK3B+"],
 					"weight" : 1.0,
-					"comment" : "Markers from Schupp et al. and Travaglini et al."
+					"comment" : "Markers inferred from Travaglini et al. Nature 2020 and confirmed by Schupp et al. Circulation 2021"
 				}
 			]
 		},
 
 		{
-			"name" : "Fibroblast/Myofibroblast",
+			"name" : "Fibroblast",
 			"markers" : [
 				{
 					"genes" : ["COL1A1+", "COL1A2+", "PDGFRA+", "ELN+", "BGN+"],
@@ -268,15 +255,15 @@
 				}
 			],
 			"subtypes" : {
-				"title" : "Fibro/Myofib subtype markers",
+				"title" : "Fibroblast subtype markers",
 				"cell_types" : [
 					{
 						"name" : "Adventitial fibroblast",
 						"markers" : [
 							{
-								"genes" : ["PTGIS+", "SFRP2+", "PDGFRL+", "SCARA5+", "MFAP5+", "PI16+", "AOX1+", "GAS1+", "IGFBP6+", "CXCL14+"],
+								"genes" : ["SFRP2+", "SFRP4+", "PDGFRL+", "PI16+",  "MFAP5+", "SCARA5+"],
 								"weight" : 1.0,
-								"comment" : "Markers from Schupp et al. and Travaglini et al."
+								"comment" : "Markers inferred from Travaglini et al. Nature 2020 and confirmed by Schupp et al. Circulation 2021"
 							}
 						]
 					},
@@ -284,24 +271,46 @@
 						"name" : "Alveolar fibroblast",
 						"markers" : [
 							{
-								"genes" : ["NKD1+", "FGFR4+", "GPM6B+", "SPINT2+", "SCN7A+", "TCF21+", "CAMK2N1+", "ADAMTS8+"],
+								"genes" : ["GPC3+", "FMO2+", "SCN7A+", "FGFR4+", "NKD2+", "ADAMTS8+"],
 								"weight" : 1.0,
-								"comment" : "Markers from Schupp et al. and Travaglini et al."
+								"comment" : "Markers inferred from Travaglini et al. Nature 2020 and confirmed by Schupp et al. Circulation 2021"
 							}
 						]
 					},
 					{
-						"name" : "Myofibroblast",
+						"name" : "Lipofibroblast",
 						"markers" : [
 							{
-								"genes" : ["ACTA2+", "MYL9+", "MT2A+", "EEF1A1+", "TMSB10+", "FAU+", "UBA52+", "SERF2+", "PTMA+", "S100A6+"],
+								"genes" : ["MLLT11+", "HAS2+", "SEMA6A+", "LONRF2+", "HOMER1+", "PWWP3B+"],
 								"weight" : 1.0,
-								"comment" : "Markers from Schupp et al. and Travaglini et al."
+								"comment" : "Markers inferred from Travaglini et al. Nature 2020"
 							}
 						]
 					}
 				]
 			}
+		},
+
+		{
+			"name" : "Myofibroblast",
+			"markers" : [
+				{
+					"genes" : ["ASPN+", "SCARA3+", "WIF1+", "ANGPTL2+", "ITGBL1+"],
+					"weight" : 1.0,
+					"comment" : "Markers inferred from Travaglini et al. Nature 2020"
+				}
+			]
+		},
+
+		{
+			"name" : "Fibromyocyte",
+			"markers" : [
+				{
+					"genes" : ["SBSPON+", "SCX+", "GREM2+", "KCNMB1+", "LGR6+"],
+					"weight" : 1.0,
+					"comment" : "Markers inferred from Travaglini et al. Nature 2020"
+				}
+			]
 		}
 	]
 }
diff --git a/pegasus/annotate_cluster/mouse_brain_cell_markers.json b/pegasus/annotate_cluster/mouse_brain_cell_markers.json
index ebfc0eac..1bdfc86c 100644
--- a/pegasus/annotate_cluster/mouse_brain_cell_markers.json
+++ b/pegasus/annotate_cluster/mouse_brain_cell_markers.json
@@ -1,5 +1,6 @@
 {
 	"title" : "Mouse brain cell markers",
+	"comment": "Yao et al. Nature 2021 Allen Mouse Brain Map is a great resource for markers; Map: https://celltypes.brain-map.org/rnaseq/mouse_ctx-hpf_10x?selectedVisualization=Heatmap&colorByFeature=Cell+Type&colorByFeatureValue=Gad1; Cell type metadata: https://brainpalmseq.med.ubc.ca/brain-regions/neocortex-allen-brain-atlas-rnaseq/search-allen-brain-map-by-all-cell-types/; Extended Data Fig 2 & Supp Table 1 of Zhang et al. Nature 2021 are also used in marker selection",
 	"cell_types" : [
 		{
 			"name" : "Glutamatergic neuron",
@@ -168,28 +169,60 @@
 			"name" : "Oligodendrocyte",
 			"markers" : [
 				{
-					"genes" : ["Mbp+", "Plp1+"],
-					"weight" : 0.6,
-					"comment" : "Oligo specific markers (Allen Brain Map)"
-				},
-				{
-					"genes" : ["Mog+"],
-					"weight" : 0.15,
-					"comment" : "Oligo specific markers, but not expressed in all Oligo cells (Allen Brain Map)"
+					"genes" : ["Plp1+", "Cnp+", "Fa2h+", "St18+", "Mbp+"],
+					"weight" : 0.8,
+					"comment" : "Oligo specific markers from Yao et al. Nature 2021 (Allen Brain Map)"
 				},
 				{
 					"genes" : ["Olig1+", "Olig2+", "Sox10+"],
-					"weight" : 0.25,
+					"weight" : 0.2,
 					"comment" : "Expressed in both Oligo and OPC (Allen Brain Map)"
 				}
-			]
+			],
+			"subtypes" : {
+				"title" : "Oligodendrocyte subtype markers",
+				"cell_types" : [
+					{
+						"name" : "Opalin+ Oligodendrocyte",
+						"markers" : [
+							{
+								"genes" : ["Opalin+", "Mog+", "Plekhh1+", "Ermn+"],
+								"weight" : 1.0,
+								"comment": "Opalin+ markers from Yao et al. Nature 2021"
+							}
+						]
+					},
+					{
+						"name" : "Enpp6+ Oligodendrocyte",
+						"markers" : [
+							{
+								"genes" : ["Enpp6+", "Pik3r3+", "Cnksr3+", "Parvb+", "Dusp15+"],
+								"weight" : 1.0,
+								"comment": "Enpp6+ markers from Yao et al. Nature 2021"	
+							}
+						]
+					},
+					{
+						"name" : "Neu4+ Oligodendrocyte",
+						"markers" : [
+							{
+								"genes" : ["Neu4+"],
+								"weight" : 1.0,
+								"comment": "Neu4+ markers from Yao et al. Nature 2021"	
+							}
+						]
+					}
+
+				]
+			}
 		},
 		{
 			"name" : "OPC",
 			"markers" : [
 				{
-					"genes" : ["Pdgfra+", "Cspg4+"],
-					"weight" : 1.0
+					"genes" : ["Pdgfra+", "Cspg4+", "Emid1+", "Fabp7+"],
+					"weight" : 1.0,
+					"comment": "Oligodendrocyte progenitor cell markers from Yao et al. Nature 2021"
 				}
 			]
 		},
@@ -197,71 +230,94 @@
 			"name" : "Astrocyte",
 			"markers" : [
 				{
-					"genes" : ["Aqp4+", "Gja1+", "F3+", "Prex2+"],
-					"weight" : 1.0
+					"genes" : ["Mt2+", "Gja1+", "Prdx6+", "Htra1+", "Ntsr2+", "Aldoc+", "Apoe+", "Prex2+", "Aqp4+", "Gpr37l1+"],
+					"weight" : 1.0,
+					"comment": "Astrocyte markers from Yao et al. Nature 2021"
 				}
-			]
+			],
+			"subtypes" : {
+				"title" : "Astrocyte subtype markers",
+				"cell_types" : [
+					{
+						"name" : "Gfap+ Astrocyte",
+						"markers" : [
+							{
+								"genes" : ["Gfap+", "Aqp4+", "Tmem47+", "Id4+", "Mlc1+", "Sdc4+", "Gstm1+"],
+								"weight" : 1.0,
+								"comment": "Gfap+ markers from Yao et al. Nature 2021"
+							}
+						]
+					},
+					{
+						"name" : "Slc7a10+ Astrocyte",
+						"markers" : [
+							{
+								"genes" : ["Slc7a10+", "Grm3+", "Trpm3+", "Phkg1+", "Cdh10+", "Luzp2+", "Gria2+", "Slc6a1+"],
+								"weight" : 1.0,
+								"comment": "Slc7a10+ markers from Yao et al. Nature 2021"	
+							}
+						]
+					}
+				]
+			}
 		},
 		{
 			"name" : "Microglia",
 			"markers" : [
 				{
-					"genes" : ["C1qb+", "P2ry12+", "Ctss+", "Csf1r+", "Hmha1+"],
-					"weight" : 1.0
+					"genes" : ["Hexb+", "Siglech+", "Selplg+", "Tmem119+", "Ctss+", "P2ry12+", "Cx3cr1+", "Trem2+", "Fcrls+", "Csf1r+"],
+					"weight" : 1.0,
+					"comment": "Microglia specific markers from Yao et al. Nature 2021"
 				}
 			]			
 		},
 		{
-			"name" : "Endothelial",
-			"markers" : [
-				{
-					"genes" : ["Flt1+", "Dcn+", "Xdh+", "Id1+"],
-					"weight" : 1.0
-				}
-			]
-		},
-		{
-			"name" : "Fibroblast",
+			"name" : "Perivascular macrophage",
 			"markers" : [
 				{
-					"genes" : ["Igfbp1+", "Dcn+"],
-					"weight" : 1.0
+					"genes" : ["Mrc1+", "Stab1+", "Lyz2+", "Ms4a6c+", "F13a1+", "Pf4+"],
+					"weight" : 1.0,
+					"comment": "PVM specific markers from Yao et al. Nature 2021"
 				}
 			]			
 		},
 		{
-			"name" : "Mural",
+			"name" : "Endothelial cell",
 			"markers" : [
 				{
-					"genes" : ["Rgs5+", "Acta2+"],
-					"weight" : 1.0
+					"genes" : ["Flt1+", "Pecam1+", "Ly6a+", "Slco1a4+", "Mecom+", "Ptprb+", "Id1+"],
+					"weight" : 1.0,
+					"comment" : "Endo specific markers from Yao et al. Nature 2021; Slco1a4 is specific to mouse brain: see https://journals.plos.org/plosone/article/figures?id=10.1371/journal.pone.0013741"
 				}
-			]			
+			]
 		},
 		{
-			"name" : "Choroid Coch",
+			"name" : "Vascular leptomeningeal cell",
 			"markers" : [
 				{
-					"genes" : ["Tgfbi+"],
-					"weight" : 1.0
+					"genes" : ["Slc7a11+", "Slc6a13+", "Bmp6+", "Igfbp2+", "Fmod+", "Ranbp3l+"],
+					"weight" : 1.0,
+					"comment" : "VLMC specific markers from Yao et al. Nature 2021"
 				}
-			]			
+			]
 		},
 		{
-			"name" : "Ependyma",
+			"name" : "Smooth muscle cell",
 			"markers" : [
 				{
-					"genes" : ["Ccdc153+"],
-					"weight" : 1.0
+					"genes" : ["Acta2+", "Myh11+", "Tagln+", "Pln+", "Mylk+"],
+					"weight" : 1.0,
+					"comment" : "SMC specific markers from Yao et al. Nature 2021"
 				}
 			]			
 		},
 		{
-			"name" : "Smooth muscle cell",
+			"name" : "Pericyte",
 			"markers" : [
 				{
-					"genes" : ["Vtn+", "Colec12+"],
-					"weight" : 1.0
+					"genes" : ["Vtn+", "Atp13a5+", "Abcc9+", "Kcnj8+", "Art3+"],
+					"weight" : 1.0,
+					"comment" : "Pericyte specific markers from Yao et al. Nature 2021"
 				}
 			]			
 		}
diff --git a/pegasus/annotate_cluster/mouse_immune_cell_markers.json b/pegasus/annotate_cluster/mouse_immune_cell_markers.json
index 64e0fd8a..9b9095eb 100644
--- a/pegasus/annotate_cluster/mouse_immune_cell_markers.json
+++ b/pegasus/annotate_cluster/mouse_immune_cell_markers.json
@@ -5,8 +5,9 @@
 			"name" : "T cell",
 			"markers" : [
 				{
-					"genes" : ["Cd28+", "Cd3d+", "Cd3e+", "Cd4+", "Cd8a+"],
-					"weight" : 1.0
+					"genes" : ["Cd3d+", "Cd3e+", "Lat+", "Thy1+", "Lef1+", "Trac+", "Cd28+"],
+					"weight" : 1.0,
+					"comment" : "T cell markers inferred from Kaptein et al. Cell 2022 and Hurskainen et al. Nat. Commun. 2021"
 				}
 			],
 			"subtypes" : {
@@ -51,47 +52,245 @@
 				]
 			}
 		},
+
 		{
-			"name" : "Monocyte",
+			"name" : "Immature B cell",
 			"markers" : [
 				{
-					"genes" : ["Lyz2+", "Lyz1+", "S100a4+", "Itgam+"],
-					"weight" : 0.8
+					"genes" : ["Tifa+", "Cecr2+", "Rag1+", "Atp1b1+", "Myb+", "Irf4+", "Fam129c+"],
+					"weight" : 1.0,
+					"comment" : "Immature B cell markers from Hurskainen et al. Nat. Commun. 2021"
+				}
+			]
+		},
+
+		{
+			"name" : "B cell",
+			"markers" : [
+				{
+					"genes" : ["Cd79a+", "Cd79b+", "Ms4a1+", "Cd19+", "H2-Ob+", "Tnfrsf13c+", "Bank1+", "Blk+", "Fcrla+", "Cd22+"],
+					"weight" : 0.91,
+					"comment" : "Human and mouse shared B cell markers; validated using Tabula Muris marrow (Tabula Muris Consortium et al. Nature 2020), Kaptein et al. Cell 2022 and Hurskainen et al. Nat. Commun. 2021 data; Ebf1, Pax5 and Fcmr are good markers for mouse lung and liver but not marrow, tissue-specific marker?"
+				},
+				{
+					"genes" : ["Cxcr5+"],
+					"weight" : 0.09,
+					"comment" : "CXCR5 is constantly expressed by mature B cells and helps to guide B cells to the follicle; fDC expresses CXCL13, the ligand for CXCR5; this marker is lowly expressed in human but more highly expressed in mouse"
+				}
+			],
+			"subtypes" : {
+				"title" : "B cell subtype markers",
+				"cell_types" : [
+					{
+						"name" : "Naive B cell",
+						"markers" : [
+							{
+								"genes" : ["Ighd+", "Fcer2a+", "Vpreb3+", "Fcrl1+", "Chchd10+"],
+								"weight" : 1.0,
+								"comment" : "Markers for naive B cell derived from Tabula Muris marrow (Tabula Muris Consortium et al. Nature 2020) & Kaptein et al. Cell 2022; Ighd & Fcer2a are shared with human"
+							}
+						]
+					},
+					{
+						"name" : "Memory B cell",
+						"markers" : [
+							{
+								"genes" : ["Zbtb32+", "C130026I21Rik+", "Pdlim1+", "Hepacam2+", "Igha+"],
+								"weight" : 0.8,
+								"comment" : "Markers for memory B cell derived from Tabula Muris marrow (Tabula Muris Consortium et al. Nature 2020) data; need to check and add Ighg related genes"
+							},
+							{
+								"genes" : ["Nt5e+", "Cd80+", "Fas+", "Pdcd1lg2+"],
+								"weight" : 0.2,
+								"comment" : "Traditional mouse memory B cell markers validated by Tabula Muris marrow (Tabula Muris Consortium et al. Nature 2020) data; all lowly expressed; Nt5e (5' Nucleotidase/CD73), Fas (CD95), Pdcd1lg2 (PD-L2/CD273)"
+							}
+						]
+					}
+				]
+			}
+		},
+
+		{
+			"name" : "Plasma cell",
+			"markers" : [
+				{
+					"genes" : ["Sdc1+", "Slamf7+", "Tnfrsf17+", "Irf4+", "Prdm1+"],
+					"weight" : 0.5,
+					"comment" : "Plasma cell markers shared with human and validated using Tabula Muris marrow data (Tabula Muris Consortium et al. Nature 2020)"
 				},
 				{
-					"genes" : ["C1qb+", "C1qc+", "Mrc1+", "Cd52+"],
-					"weight" : 0.2
+					"genes" : ["Derl3+", "Chst1+", "Eaf2+", "Oosp1+", "Cacna1s+"],
+					"weight" : 0.4,
+					"comment" : "Mouse-specific plasma cell markers validated using Tabula Muris marrow data (Tabula Muris Consortium et al. Nature 2020)"
+				},
+				{
+					"genes" : ["Xbp1+", "Slc3a2+", "Ly6k+"],
+					"weight" : 0.1,
+					"comment" : "Traditional mouse plasma markers (not ideal) validated using Tabula Muris marrow data (Tabula Muris Consortium et al. Nature 2020); Xbp1 & Slc3a2 (CD98) expressed highest in plasma but also expressed in other cell types"
 				}
 			]
 		},
+
 		{
-			"name" : "B cell",
+			"name" : "Natural killer cell",
 			"markers" : [
 				{
-					"genes" : ["Cd19+", "Cd79b+", "Cd74+", "Igkc+", "Ighm+", "Iglc2+", "Ms4a1+"],
-					"weight" : 1.0
+					"genes" : ["Gzma+", "Klrb1c+", "Ncr1+", "Klre1+", "Klrc2+"],
+					"weight" : 0.6,
+					"comment" : "NK & ILC1 shared markers from Hurskainen et al. Nat. Commun. 2021 data and confirmed on Kaptein et al. Cell 2022 data"
+				},
+				{
+					"genes" : ["Eomes+", "Cma1+", "Klra4+", "Klra7+", "Klra8+"],
+					"weight" : 0.4,
+					"comment" : "NK cell specific markers (compared to ILC1) from Kaptein et al. Cell 2022; these markers do not have high expressions in Hurskainen et al. Nat. Commun. 2021 data"
 				}
 			]
 		},
+
 		{
-			"name" : "Neutrophil",
+			"name" : "Classical monocyte",
+			"markers" : [
+				{
+					"genes" : ["Ly6c2+", "F13a1+", "Ccr2+", "Ms4a4c+", "Gm9733+", "Mcub+", "S100a4+"],
+					"weight" : 1.0,
+					"comment" : "Classical monocyte markers (except S100a4) inferred from Kaptein et al. Cell 2022 and Hurskainen et al. Nat. Commun. 2021; Ly6c2, F13a1, Ccr2 and Ms4a4c (in Fig. 1b) are Group III markers from Casanova-Acebes et al. Nature 2021. S100a4 is less specific to classical monocyte."
+				}
+			]
+		},
+
+		{
+			"name" : "Patrolling monocyte",
 			"markers" : [
 				{
-					"genes" : ["Mmp9+", "S100a8+", "S100a9+", "Il1b+", "Retnlg+", "Lcn2+"],
-					"weight" : 1.0
+					"genes" : ["Eno3+", "Cd300e+", "Ace+", "Treml4+", "Spn+", "Adgre4+", "Lair1+", "Fcgr4+", "Ear2+", "Cd300ld+"],
+					"weight" : 1.0,
+					"comment" : "Patrolling monocyte markers inferred from Kaptein et al. Cell 2022 and Hurskainen et al. Nat. Commun. 2021; First 6 markers are Group IV markers in Fig. 1b of Casanova-Acebes et al. Nature 2021; Eno3, Cd300e, Ace and Lair1 are very specific; Related papers: Domingo-Gonzalez et al. Elife 2020, Thomas et al. Arterioscler Thromb Vasc Biol. 2015, and Schyns et al. Nat. Commun. 2019."
 				}
 			]
 		},
+
 		{
-			"name" : "NK cell",
+			"name" : "Macrophage",
 			"markers" : [
 				{
-					"genes" : ["Nkg7+"],
-					"weight" : 0.55
+					"genes" : ["Cd14+", "Ms4a7+", "Cx3cr1+", "Trem2+", "Hpgds+"],
+					"weight" : 1.0,
+					"comment" : "Macrophage markers from Kaptein et al. Cell 2022"
+				}
+			]
+		},
+		
+		{
+			"name" : "Conventional type 1 dendritic cell",
+			"markers" : [
+				{
+					"genes" : ["Xcr1+", "Ifi205+", "Rab7b+", "Tlr3+", "Sept3+", "Hepacam2+"],
+					"weight" : 0.7,
+					"comment" : "cDC1 markers shared between Kaptein et al. Cell 2022 and Hurskainen et al. Nat. Commun. 2021"
 				},
 				{
-					"genes" : ["Cd3d-", "Cd3e-"],
-					"weight" : 0.45
+					"genes" : ["Gcsam+", "Snx22+", "Itgae+", "Xlr+"],
+					"weight" : 0.3,
+					"comment" : "cDC1 markers expressed highly in one of Kaptein et al. Cell 2022 and Hurskainen et al. Nat. Commun. 2021, but not both"
+				}
+			]
+		},
+
+		{
+			"name" : "Conventional type 2 dendritic cell",
+			"markers" : [
+				{
+					"genes" : ["Cd209a+","Ltb4r1+", "Mgl2+", "Tnip3+", "Bex6+"],
+					"weight" : 1.0,
+					"comment" : "cDC2 markers from Kaptein et al. Cell 2022"
+				}
+			]
+		},
+
+		{
+			"name" : "Migratory dendritic cell",
+			"markers" : [
+				{
+					"genes" : ["Cacnb3+", "Nudt17+", "Ccl22+", "Apol7c+", "Slco5a1+", "Ccr7+", "Fscn1+", "Il4i1+", "Mreg+", "Bcl2l14+"],
+					"weight" : 1.0,
+					"comment" : "Migratory DC markers shared between Kaptein et al. Cell 2022 and Hurskainen et al. Nat. Commun. 2021"
+				}
+			]
+		},
+
+		{
+			"name" : "Plasmacytoid dendritic cell",
+			"markers" : [
+				{
+					"genes" : ["Siglech+", "Ccr9+", "Cox6a2+", "Cd300c+", "Klk1+"],
+					"weight" : 1.0,
+					"comment" : "pDC markers from Kaptein et al. Cell 2022"
+				}
+			]
+		},
+
+		{
+			"name" : "Neutrophil",
+			"markers" : [
+				{
+					"genes" : ["S100a8+", "S100a9+", "Retnlg+", "Mmp9+", "Csf3r+", "Wfdc21+", "Il1r2+", "Cxcr2+"],
+					"weight" : 1.0,
+					"comment" : "Neutrophil markers inferred from Kaptein et al. Cell 2022 and Hurskainen et al. Nat. Commun. 2021; related paper: Grieshaber-Bouyer et al. Nat. Commun. 2021"
+				}
+			]
+		},
+
+		{
+			"name" : "Immature neutrophil",
+			"markers" : [
+				{
+					"genes" : ["Ngp+", "Camp+", "Ltf+", "Ly6g+", "Cebpe+"],
+					"weight" : 1.0,
+					"comment" : "Immature Neutrophil markers inferred from Hurskainen et al. Nat. Commun. 2021 and checked using Evrard et al. Immunity 2018 Fig. 5"
+				}
+			]
+		},
+
+		{
+			"name" : "Basophil",
+			"markers" : [
+				{
+					"genes" : ["Cd200r3+", "Aqp9+", "Il6+", "Hgf+", "Adora2b+", "Il4+", "L1cam+", "Grm6+"],
+					"weight" : 1.0,
+					"comment" : "Basophil markers inferred from Matsumura et al. Nat. Commun. 2022 and confirmed using data from Kaptein et al. Cell 2022 and Hurskainen et al. Nat. Commun. 2021"
+				}
+			]
+		},
+
+		{
+			"name" : "Eosinophil",
+			"markers" : [
+				{
+					"genes" : ["Epx+", "Prg3+", "Eml5+", "Il5ra+", "Qsox2+", "L2hgdh+"],
+					"weight" : 1.0,
+					"comment" : "Eosinophil markers inferred from Matsumura et al. Nat. Commun. 2022"
+				}
+			]
+		},
+
+		{
+			"name" : "Mast cell",
+			"markers" : [
+				{
+					"genes" : ["Tph1+", "Clnk+", "Hs6st2+", "Plcg1+"],
+					"weight" : 1.0,
+					"comment" : "Mast cell markers inferred from Matsumura et al. Nat. Commun. 2022"
+				}
+			]
+		}, 
+
+		{
+			"name" : "Red blood cell",
+			"markers" : [
+				{
+					"genes" : ["Hba-a1+", "Hba-a2+", "Hbb-bs+", "Hbb-bt+"],
+					"weight" : 1.0,
+					"comment" : "Hemoglobin genes"
 				}
 			]
 		}
diff --git a/pegasus/annotate_cluster/mouse_liver_cell_markers.json b/pegasus/annotate_cluster/mouse_liver_cell_markers.json
new file mode 100644
index 00000000..f40427b3
--- /dev/null
+++ b/pegasus/annotate_cluster/mouse_liver_cell_markers.json
@@ -0,0 +1,246 @@
+{
+	"title" : "Mouse liver cell type markers",
+	"comment": "Markers are collected from Kaptein et al. Cell 2022",
+	"cell_types" : [
+		{
+			"name" : "Hepatocyte",
+			"markers" : [
+				{
+					"genes" : ["Acaa1b+", "Arg1+", "Sult2a8+", "Hgd+", "Otc+"],
+					"weight" : 1.0,
+					"comment" : "Hepatocyte markers from Kaptein et al. Cell 2022"
+				}
+			]
+		},
+
+		{
+			"name" : "Cholangiocyte",
+			"markers" : [
+				{
+					"genes" : ["Spp1+", "Ddit4l+", "Sox9+", "Fgfr3+", "Plet1+"],
+					"weight" : 1.0,
+					"comment" : "Cholangiocyte markers from Kaptein et al. Cell 2022"
+				}
+			]
+		},
+
+		{
+			"name" : "HsPC",
+			"markers" : [
+				{
+					"genes" : ["Chrm3+", "Dmbt1+", "Slc4a4+", "Parm1+", "Pcdh11x+"],
+					"weight" : 1.0,
+					"comment" : "Hepatic stem and progenitor cell markers from Kaptein et al. Cell 2022"
+				}
+			]
+		},
+
+
+		{
+			"name" : "ILC1",
+			"markers" : [
+				{
+					"genes" : ["Xcl1+", "Cd160+", "Klrc1+", "Cd200r2+", "Gzmc+"],
+					"weight" : 1.0,
+					"comment" : "Innate lymphoid cell type 1 markers from Kaptein et al. Cell 2022"
+				}
+			]
+		},
+
+
+		{
+			"name" : "Kupffer cell",
+			"markers" : [
+				{
+					"genes" : ["Cd5l+", "Clec4f+", "Vsig4+", "Folr2+", "Timd4+"],
+					"weight" : 1.0,
+					"comment" : "Kupffer cell markers from Kaptein et al. Cell 2022"
+				}
+			]
+		},
+
+		{
+			"name" : "Peritoneal macrophage",
+			"markers" : [
+				{
+					"genes" : ["Lyz1+", "Saa3+", "Prg4+", "Retnla+", "Cbr2+"],
+					"weight" : 1.0,
+					"comment" : "Peritoneal macrophage markers from Kaptein et al. Cell 2022; Note that Lyve1 is also a good marker but it is also expressed in endothelial cells"
+				}
+			]
+		},
+
+		{
+			"name" : "Macrophage",
+			"markers" : [
+				{
+					"genes" : ["Cd14+", "Ms4a7+", "Cx3cr1+", "Trem2+", "Hpgds+"],
+					"weight" : 1.0,
+					"comment" : "Macrophage markers from Kaptein et al. Cell 2022"
+				}
+			],
+			"subtypes" : {
+				"title" : "Macrophage subtype markers",
+				"cell_types" : [
+					{
+						"name" : "Cd207+ macrophage",
+						"markers" : [
+							{
+								"genes" : ["Cd207+", "Tmem119+", "Olfml3+", "Mmp13+"],
+								"weight" : 1.0,
+								"comment" : "Cd207+ macrophage markers from Kaptein et al. Cell 2022"
+							}
+						]
+					},
+					{
+						"name" : "Bile-duct lipid-associated macrophage",
+						"markers" : [
+							{
+								"genes" : ["Gpnmb+", "Spp1+", "Syngr1+", "Cd93+"],
+								"weight" : 1.0,
+								"comment" : "Bile-duct LAM markers from Kaptein et al. Cell 2022"
+							}
+						]
+					}
+				]
+			}
+		},
+
+
+		{
+			"name" : "Endothelial cell",
+			"markers" : [
+				{
+					"genes" : ["Mmrn2+", "Cldn5+", "Adgrl4+", "Tek+", "Myct1+"],
+					"weight" : 1.0,
+					"comment" : "Endothelial cell markers from Kaptein et al. Cell 2022"
+				}
+			],
+			"subtypes" : {
+				"title" : "Endothelial cell subtype markers",
+				"cell_types" : [
+					{
+						"name" : "Liver sinusoidal endothelial cell",
+						"markers" : [
+							{
+								"genes" : ["Lyve1+", "Clec1b+", "Chst2+", "Wisp1+"],
+								"weight" : 1.0,
+								"comment" : "LSEC markers from Kaptein et al. Cell 2022"
+							}
+						]
+					},
+					{
+						"name" : "Central vein endothelial cell",
+						"markers" : [
+							{
+								"genes" : ["Rspo3+", "Lhx6+", "Wnt9b+", "Plppr5+"],
+								"weight" : 1.0,
+								"comment" : "CV EC markers from Kaptein et al. Cell 2022"
+							}
+						]
+					},
+					{
+						"name" : "Portal Vein endothelial cell",
+						"markers" : [
+							{
+								"genes" : ["Adgrg6+", "Nrg1+", "Gja5+","Cmklr1+"],
+								"weight" : 1.0,
+								"comment" : "PV EC markers from Kaptein et al. Cell 2022"
+							}
+						]
+					},
+					{
+						"name" : "Lymphatic Endothelial cell",
+						"markers" : [
+							{
+								"genes" : ["Mmrn1+", "Pard6g+", "Nts+", "Ccl21a+"],
+								"weight" : 1.0,
+								"comment" : "LEC markers from Kaptein et al. Cell 2022"
+							}
+						]
+					}
+				]
+			}
+		},
+
+
+		{
+			"name" : "Stellate cell",
+			"markers" : [
+				{
+					"genes" : ["Colec10+", "Rspo3+", "Mapt+", "Lama1+", "Bmp10+"],
+					"weight" : 1.0,
+					"comment" : "Stellate cell markers from Kaptein et al. Cell 2022"
+				}
+			]
+		},
+
+		{
+			"name" : "Vascular smooth muscle cell",
+			"markers" : [
+				{
+					"genes" : ["Cacna1c+", "Myh11+", "Notch3+", "Lmod1+", "Tagln+"],
+					"weight" : 1.0,
+					"comment" : "VSMC markers from Kaptein et al. Cell 2022"
+				}
+			]
+		},
+
+		{
+			"name" : "Mesothelial cell",
+			"markers" : [
+				{
+					"genes" : ["Ephb1+", "Cadm2+", "Prss12+", "Myl7+", "Prph+"],
+					"weight" : 1.0,
+					"comment" : "Mesothelial cell markers from Kaptein et al. Cell 2022"
+				}
+			]
+		},
+
+		{
+			"name" : "Fibroblast",
+			"markers" : [
+				{
+					"genes" : ["Col1a1+", "Mrc2+", "Plcxd3+", "Fndc1+", "Cpxm1+"],
+					"weight" : 1.0,
+					"comment" : "Fibroblast markers from Kaptein et al. Cell 2022"
+				}
+			],
+			"subtypes" : {
+				"title" : "Fibro subtype markers",
+				"cell_types" : [
+					{
+						"name" : "Capsule fibroblast",
+						"markers" : [
+							{
+								"genes" : ["Osr1+", "Cldn10+", "Lgals7+", "Spock3+"],
+								"weight" : 1.0,
+								"comment" : "Capsule fibroblast markers from Kaptein et al. Cell 2022"
+							}
+						]
+					},
+					{
+						"name" : "Central vein fibroblast",
+						"markers" : [
+							{
+								"genes" : ["Dpt+", "Pcolce2+", "Ntrk2+", "Pi16+"],
+								"weight" : 1.0,
+								"comment" : "Central vein fibroblast markers from Kaptein et al. Cell 2022"
+							}
+						]
+					},
+					{
+						"name" : "Bile-duct fibroblast",
+						"markers" : [
+							{
+								"genes" : ["Itgbl1+", "Plcxd3+", "Nkain3+", "Clic5+"],
+								"weight" : 1.0,
+								"comment" : "Bile-duct fibroblast markers from Kaptein et al. Cell 2022"
+							}
+						]
+					}
+				]
+			}
+		}
+	]
+}
diff --git a/pegasus/annotate_cluster/mouse_lung_cell_markers.json b/pegasus/annotate_cluster/mouse_lung_cell_markers.json
new file mode 100644
index 00000000..543c3cff
--- /dev/null
+++ b/pegasus/annotate_cluster/mouse_lung_cell_markers.json
@@ -0,0 +1,313 @@
+{
+	"title" : "Mouse lung cell type markers",
+	"cell_types" : [
+		{
+			"name" : "Alveolar type I cell",
+			"markers" : [
+				{
+					"genes" : ["Akap5+", "Rtkn2+", "Ndnf+", "Col4a3+", "Spock2+"],
+					"weight" : 1.0,
+					"comment" : "AT1 markers from Hurskainen et al. Nat. Commun. 2021 and Negretti et al. Development 2021"
+				}
+			]
+		},
+
+		{
+			"name" : "Alveolar type II cell",
+			"markers" : [
+				{
+					"genes" : ["Sftpc+", "Sftpa1+", "Lamp3+", "Hc+", "Slc34a2+"],
+					"weight" : 1.0,
+					"comment" : "AT2 markers from Hurskainen et al. Nat. Commun. 2021 and Negretti et al. Development 2021"
+				}
+			]
+		},
+
+		{
+			"name" : "Ciliated cell",
+			"markers" : [
+				{
+					"genes" : ["Dynlrb2+", "Tmem212+", "Foxj1+", "Ccdc153+", "Nme5+"],
+					"weight" : 1.0,
+					"comment" : "Ciliated cell markers from Hurskainen et al. Nat. Commun. 2021 and Negretti et al. Development 2021"
+				}
+			]
+		},
+
+		{
+			"name" : "Club cell",
+			"markers" : [
+				{
+					"genes" : ["Scgb1a1+", "Scgb3a2+", "Cckar+", "Gabrp+", "Slc16a11+"],
+					"weight" : 1.0,
+					"comment" : "Club cell markers from Hurskainen et al. Nat. Commun. 2021 and Negretti et al. Development 2021"
+				}
+			]
+		},
+
+		{
+			"name" : "Basal cell",
+			"markers" : [
+				{
+					"genes" : ["Aqp3+", "Krt5+", "Dapl1+", "Hspa1a+", "Trp63+"],
+					"weight" : 1.0,
+					"comment" : "Basal cell markers from Montoro et al. Nature 2018 Extended Data Fig. 1d"
+				}
+			]
+		},
+
+		{
+			"name" : "Goblet cell",
+			"markers" : [
+				{
+					"genes" : ["Scgb3a1+", "Muc5b+", "Serpinb11+", "Gp2+", "Dmbt1+"],
+					"weight" : 1.0,
+					"comment" : "Goblet cell markers from Montoro et al. Nature 2018 Supp Table 1"
+				}
+			]
+		},
+
+		{
+			"name" : "Tuft cell",
+			"markers" : [
+				{
+					"genes" : ["Pou2f3+", "Ascl2+", "Dclk1+", "Lrmp+", "Ltc4s+", "Trpm5+", "Gnb3+", "Rgs13+"],
+					"weight" : 1.0,
+					"comment" : "Tuft cell markers from Sun et al. Dev. Cell 2022 and Montoro et al. Nature 2018 Extended Data Fig. 3b; first 3 markers are mainly suggested by Sun et al. the CellCards."
+				}
+			]
+		},
+
+		{
+			"name" : "Pulmonary neuroendocrine cell",
+			"markers" : [
+				{
+					"genes" : ["Ascl1+", "Chga+", "Calca+", "Scg2+", "Scg5+"],
+					"weight" : 1.0,
+					"comment" : "Pulmonary neuroendocrine cell markers from Montoro et al. Nature 2018 Extended Data Fig. 3b & 3c"
+				}
+			]
+		},
+
+		{
+			"name" : "Ionocyte",
+			"markers" : [
+				{
+					"genes" : ["Foxi1+", "Ascl3+", "Smbd1+", "Moxd1+", "Atp6v0d2+"],
+					"weight" : 1.0,
+					"comment" : "Ionocyte markers from Montoro et al. Nature 2018 Fig. 5a"
+				}
+			]
+		},
+
+
+
+		{
+			"name" : "Endothelial cell",
+			"markers" : [
+				{
+					"genes" : ["Egfl7+", "Cldn5+", "Cdh5+", "Pecam1+", "Calcrl+", "Ecscr+", "Icam2+"],
+					"weight" : 1.0,
+					"comment" : "Endothelial cell markers inferred from Hurskainen et al. Nat. Commun. 2021 data"
+				}
+			],
+			"subtypes" : {
+				"title" : "Endothelial cell subtype markers (Main and Capillary, see https://lungmap.net/cell-cards/)",
+				"cell_types" : [
+					{
+						"name" : "Aerocyte",
+						"markers" : [
+							{
+								"genes" : ["Emp2+", "Car4+", "Tbx2+", "Apln+"],
+								"weight" : 1.0,
+								"comment" : "Markers inferred from Hurskainen et al. Nat. Commun. 2021 data and confirmed on Negretti et al. Development 2021 data"
+							}
+						]
+					},
+					{
+						"name" : "EC general capillary",
+						"markers" : [
+							{
+								"genes" : ["Gpihbp1+", "Kit+", "Nckap5+", "Aplnr+"],
+								"weight" : 1.0,
+								"comment" : "Markers inferred from Hurskainen et al. Nat. Commun. 2021 data and confirmed on Negretti et al. Development 2021 data"
+							}
+						]
+					},
+					{
+						"name" : "EC lymphatic",
+						"markers" : [
+							{
+								"genes" : ["Mmrn1+", "Ccl21a+", "Prox1+", "Nts+"],
+								"weight" : 1.0,
+								"comment" : "Markers inferred from Hurskainen et al. Nat. Commun. 2021 data and confirmed on Negretti et al. Development 2021 data"
+							}
+						]
+					},
+					{
+						"name" : "EC venous",
+						"markers" : [
+							{
+								"genes" : ["Slc6a2+", "Vegfc+", "Ackr3+", "Fabp4+"],
+								"weight" : 1.0,
+								"comment" : "Markers inferred from Hurskainen et al. Nat. Commun. 2021 data and confirmed on Negretti et al. Development 2021 data"
+							}
+						]
+					},
+					{
+						"name" : "EC arterial",
+						"markers" : [
+							{
+								"genes" : ["Gja5+", "Cxcl12+", "Pcsk5+", "Thsd7a+"],
+								"weight" : 1.0,
+								"comment" : "Markers inferred from Hurskainen et al. Nat. Commun. 2021 data and confirmed on Negretti et al. Development 2021 data"
+							}
+						]
+					}
+				]
+			}
+		},
+
+
+
+		{
+			"name" : "Mesothelial cell",
+			"markers" : [
+				{
+					"genes" : ["Wt1+", "Upk3b+", "Rspo1+", "C2+", "Sbsn+", "Aldh1a2+", "Lrrn4+", "Cldn15+"],
+					"weight" : 1.0,
+					"comment" : "Markers inferred from Hurskainen et al. Nat. Commun. 2021 data and confirmed on Negretti et al. Development 2021 data"
+				}
+			]
+		},
+
+		{
+			"name" : "Pericyte",
+			"markers" : [
+				{
+					"genes" : ["Notch3+", "Heyl+", "Parm1+", "Ndufa4l2+", "Cox4i2+"],
+					"weight" : 1.0,
+					"comment" : "Markers inferred from Hurskainen et al. Nat. Commun. 2021 data and confirmed on Negretti et al. Development 2021 data"
+				}
+			],
+			"subtypes" : {
+				"title" : "Pericyte subtype markers",
+				"cell_types" : [
+					{
+						"name" : "Pericyte 1",
+						"markers" : [
+							{
+								"genes" : ["Gpc6+", "Cxcl12+", "Wisp2+", "Map3k7cl+"],
+								"weight" : 1.0,
+								"comment" : "Markers inferred from Hurskainen et al. Nat. Commun. 2021 data and confirmed on Negretti et al. Development 2021 data"
+							}
+						]
+					},
+					{
+						"name" : "Pericyte 2",
+						"markers" : [
+							{
+								"genes" : ["Higd1b+", "Pcdh18+", "Trpc6+", "Fam162b+", "Clstn2+"],
+								"weight" : 1.0,
+								"comment" : "Markers inferred from Hurskainen et al. Nat. Commun. 2021 data and confirmed on Negretti et al. Development 2021 data"
+							}
+						]
+					}
+				]
+			}
+		},
+
+		{
+			"name" : "Fibroblast",
+			"markers" : [
+				{
+					"genes" : ["Dpt+", "Clec3b+", "Pcolce2+", "Vegfd+", "Vcam1+"],
+					"weight" : 1.0,
+					"comment" : "Markers inferred from Hurskainen et al. Nat. Commun. 2021 data and confirmed on Negretti et al. Development 2021 data"
+				}
+			],
+			"subtypes" : {
+				"title" : "Fibro subtype markers",
+				"cell_types" : [
+					{
+						"name" : "Adventitial fibroblast",
+						"markers" : [
+							{
+								"genes" : ["Mfap5+", "Serpinf1+", "Abca8a+", "Twist2+"],
+								"weight" : 1.0,
+								"comment" : "Markers from Schupp et al. and Travaglini et al."
+							}
+						]
+					},
+					{
+						"name" : "Alveolar fibroblast",
+						"markers" : [
+							{
+								"genes" : ["Slit2+", "Col13a1+", "Wnt2+", "Slc38a5+", "Slc27a6+", "Frem1+"],
+								"weight" : 1.0,
+								"comment" : "Markers inferred from Hurskainen et al. Nat. Commun. 2021 data and confirmed on Negretti et al. Development 2021 data"
+							}
+						]
+					}
+				]
+			}
+		},
+
+		{
+			"name" : "Myofibroblast",
+			"markers" : [
+				{
+					"genes" : ["Egfem1+", "Agt+", "Prag1+", "Etv1+", "Trim67+"],
+					"weight" : 1.0,
+					"comment" : "Markers from Schupp et al. and Travaglini et al."
+				}
+			]
+		},
+
+		{
+			"name" : "Smooth muscle cell",
+			"markers" : [
+				{
+					"genes" : ["Tnnt2+", "Sgcg+", "Sntg2+", "Nrtn+", "Mrvi1+", "Sbspon+"],
+					"weight" : 1.0,
+					"comment" : "Markers inferred from Hurskainen et al. Nat. Commun. 2021 data"
+				}
+			]
+		},
+
+
+
+		{
+			"name" : "ILC2",
+			"markers" : [
+				{
+					"genes" : ["Gata3+", "Il1rl1+", "Arg1+", "Areg+", "Il2ra+", "Csf2+", "Ccl1+", "Ccdc184+", "Calca+", "Il5+"],
+					"weight" : 1.0,
+					"comment" : "Innate lymphoid cell type 2 markers inferred from Hurskainen et al. Nat. Commun. 2021 data"
+				}
+			]
+		},
+
+		{
+			"name" : "Alveolar macrophage",
+			"markers" : [
+				{
+					"genes" : ["Ear1+", "Marco+", "Atp6v0d2+", "Olr1+", "F7+", "Tfec+", "Gpnmb+", "Lrp12+", "Pparg+", "Car4+", "Krt19+", "Plet1+"],
+					"weight" : 1.0,
+					"comment" : "First 8 markers are Alveolar macrophage markers inferred from Hurskainen et al. Nat. Commun. 2021 data; Ear1 and Marco also show in Casanova-Acebes et al. Nature 2021; Last 4 are markers from Casanova-Acebes et al. Nature 2021 that are validated using Hurskainen et al. Nat. Commun. 2021 data"
+				}
+			]
+		},
+
+		{
+			"name" : "Interstitial macrophage",
+			"markers" : [
+				{
+					"genes" : ["C1qa+", "C1qb+", "C1qc+", "Pf4+", "Ms4a7+", "Fcrls+"],
+					"weight" : 1.0,
+					"comment" : "Interstitial macrophage markers inferred from Hurskainen et al. Nat. Commun. 2021 data"
+				}
+			]
+		}
+	]
+}
diff --git a/pegasus/commands/Clustering.py b/pegasus/commands/Clustering.py
index 9d8611b2..7b6d7e90 100644
--- a/pegasus/commands/Clustering.py
+++ b/pegasus/commands/Clustering.py
@@ -68,6 +68,7 @@ class Clustering(Base):
   --nmf-n <number>                                 Number of NMF components. IF iNMF is used for batch correction, this parameter also sets iNMF number of components. [default: 20]
 
   --knn-K <number>                                 Number of nearest neighbors for building kNN graph. [default: 100]
+  --exact-K                                        Use exactly the K passed to the function; without this option, K is determined as min(K, sqrt(X.shape[0])).
   --knn-full-speed                                 For the sake of reproducibility, we only run one thread for building kNN indices. Turn on this option will allow multiple threads to be used for index building. However, it will also reduce reproducibility due to the racing between multiple threads.
 
   --kBET                                           Calculate kBET.
@@ -210,6 +211,7 @@ def execute(self):
             "nmf": self.args["--nmf"],
             "nmf_n": int(self.args["--nmf-n"]),
             "K": int(self.args["--knn-K"]),
+            "exact_K": self.args["--exact-K"],
             "full_speed": self.args["--knn-full-speed"],
             "kBET": self.args["--kBET"],
             "kBET_batch": self.args["--kBET-batch"],
diff --git a/pegasus/data_files/emt_human.gmt b/pegasus/data_files/emt_human.gmt
new file mode 100644
index 00000000..dfec37a7
--- /dev/null
+++ b/pegasus/data_files/emt_human.gmt
@@ -0,0 +1,2 @@
+Epithelial-like	Signatures from Gibbons and Creighton Dev. Dyn. 2018	CDH1	DSP	OCLN
+Mesenchymal-like	Signatures from Gibbons and Creighton Dev. Dyn. 2018	VIM	CDH2	FOXC2	SNAI1	SNAI2	TWIST1	FN1	ITGB6	MMP2	MMP3	MMP9	SOX10	GSC
diff --git a/pegasus/data_files/human_lung.gmt b/pegasus/data_files/human_lung.gmt
new file mode 100644
index 00000000..6f488d48
--- /dev/null
+++ b/pegasus/data_files/human_lung.gmt
@@ -0,0 +1,23 @@
+Epithelial	Epithelial markers from HTAPP paper	KRT8	KRT18	EPCAM	CD24
+VEC	Vascular endothelial cell markers from Travaglini et al. Nature 2020 and Schupp et al. Circulation 2021	PECAM1	CLDN5	CDH5	ERG	ICAM2	CLEC14A	ITM2A	ADGRL4	SLCO2A1	IFI27
+LEC	Lymphatic endothelial cell markers from Travaglini et al. Nature 2020 and Schupp et al. Circulation 2021	PECAM1	CLDN5	ERG	CDH5	CCL21	TFF3	PDPN	PROX1	LYVE1	FLT4	GPM6A	SEMA3D	TBX1	RELN	
+Fibroblast	Fibroblast/Myofibroblast shared markers from Travaglini et al.	COL1A1	COL1A2	PDGFRA	ELN	BGN
+Macrophage	Macro	CD68	CD163	C1QA	MRC1	MS4A6A	MSR1	MERTK
+SMC	SMC from Muus et al., Braga et al. and Schupp et al.	MYH11	TAGLN	ACTG2	CNN1	PLN
+Pericyte	Pericyte from Schupp et al. and Travaglini et al.	TRPC6	CSPG4	FAM162B	GJA4	GJC1	HIGD1B	CDH6	LAMC3	FHL5
+T cell	T cell markers	CD3D	CD3E	CD3G	TRAC
+B cell	B cell markers	CD19	MS4A1	CD79A	CD79B
+Plasma cell	Plasma cell markers from ICA	TNFRSF17	PRDM1	SLAMF7	IRF4	SDC1	IGHA1	IGHG1	TNFRSF13B	CD38	ABCB9	CHPF	PLAAT2
+Mast cell	Mast cell markers	KIT	CPA3	TPSB2	TPSAB1	AREG	RGS1	RGS2
+ProNeu	Pro-Neutrophil markers validated using 10x public whole blood dataset	DEFA3	DEFA4	AZU1	MS4A3	ELANE	SLPI	CEACAM6	RNASE3	PRTN3	MPO	AC104232.1	CTSG
+PreNeu	Pre-Neutrophil markers validated using 10x public whole blood dataset	LTF	LCN2	MMP8	CRISP3	CAMP	PGLYRP1	CD177	HP
+Neutrophil	Neutrophil markers	CSF3R	G0S2	LUCAT1	EPHB1	TNFRSF10C	IL1R2	KCNJ15	FCGR3B	AC007032.1	HSD11B1-AS1
+AT1	AT1 markers inferred from Travaglini et al. Nature 2020 and confirmed by Schupp et al. Circulation 2021	AGER	SPOCK2	RTKN2	TNNC1	SCEL	CLIC5	NCKAP5	ARHGEF26	GGTLC1	ITLN2	MS4A15
+AT2	AT2 markers inferred from Travaglini et al. Nature 2020 and confirmed by Schupp et al. Circulation 2021	SFTPA1	SFTPA2	SFTPC	PGC	LAMP3	FASN	HHIP	ETV5	RASGRF1	ABCA3
+Basal	Basal cell markers inferred from Travaglini et al. Nature 2020 and confirmed by Schupp et al. Circulation 2021	KRT17	S100A2	MIR205HG	KRT15	KRT5	DLK2	CDH3	TP63	TNS4
+Club	Club cell markers inferred from Travaglini et al. Nature 2020 and confirmed by Schupp et al. Circulation 2021	SCGB3A2	MGP	CTSE
+Ciliated	Ciliated cell markers inferred from Travaglini et al. Nature 2020 and confirmed by Schupp et al. Circulation 2021	ERICH3	ARMC3	DNAI2	ZBBX	VWA3B	RGS22	TTC29	CDHR4	PPP1R42	CFAP46	CFAP52	CFAP73	CFAP77	CFAP157	DNAH3	DNAH9	ADGB	SNTN	CCDC170	C6orf118
+Goblet	Goblet cell markers inferred from Travaglini et al. Nature 2020 and confirmed by Schupp et al. Circulation 2021	MUC5AC	MUC5B	BPIFB1	MSMB	SERPINB11	CYP2F1
+Ionocyte	Ionocyte markers inferred from Travaglini et al. Nature 2020 and confirmed by Schupp et al. Circulation 2021	ASCL3	CLCNKB	FOXI1	ATP6V1G3	TMPRSS11E	BSND	LINC01187	CLDN25
+PNEC	Pulmonary neuroendocrine cell markers inferred from Travaglini et al. Nature 2020 and confirmed by Schupp et al. Circulation 2021	CHGA	CHGB	SCGN	SCG5	CPLX2	GRP	ASCL1	INSM1
+SMG	SMG serous cell markers inferred from Travaglini et al. Nature 2020	PRR4	TCN1	C6orf58	PRB3	LPO	PRB1	PRH2	PRH1	ODAM
diff --git a/pegasus/data_files/human_t_cell_markers.gmt b/pegasus/data_files/human_t_cell_markers.gmt
new file mode 100644
index 00000000..aac1cda8
--- /dev/null
+++ b/pegasus/data_files/human_t_cell_markers.gmt
@@ -0,0 +1,11 @@
+CD4_Naive	CD4 Naive T	CD4	CCR7	SELL	LEF1	FHIT	ACTN1	LDLRAP1	TMIGD2	TRABD2A	LRRN3
+CD4_TCM	CD4 TCM	CD4	GPR183	CD69	PASK	LIMS1	LPAR6	SLC2A3	SOCS3
+CD4_TEM	CD4 TEM	CD4	KLRB1	ANXA2	LGALS1	TIMP1	PTGER2	AHNAK	TNFRSF4	YWHAH	CD63
+Treg	Treg	RTKN2	FOXP3	IL2RA	HACD1	AC133644.2	FANK1	DUSP4	STAM	CCR10	CTLA4
+CD4_CTL	CD4 Cytotoxic Lymphocyte	CD4	GNLY	AGAP1	ZNF683	RGS9	IL5RA	LAIR2	MTERF2	SH3RF2	RGS17
+Tfh	T follicular helper	CD4	ST8SIA1	PDCD1	TIGIT	TOX2	ICOS	SH2D1A	IL21
+CD8_Naive	CD8 Naive T	CD8A	CD8B	CCR7	SELL	LEF1	ACTN1	TRABD2A	LRRN3	LINC02446	S100B	CLEC11A	NELL2	PASK	APBA2
+CD8_TCM	CD8 TCM	CD8A	CD8B	GZMK	DUSP2	RGS1	CXCR3	CMC1	TIGIT	CST7	NKG7
+CD8_TEM	CD8 TEM	CD8A	CD8B	FGFBP2	GZMB	FCGR3A	SPON2	ADGRG1	CX3CR1	ASCL2	PRSS23
+MAIT	MAIT	SLC4A10	KLRB1	NCR3	CEBPD	GPR65	LST1	CXCR6	TRAV1-2
+gdT	gdT	TRDC	TRGC1	TRGC2	KLRC1	KLRD1	GNLY
diff --git a/pegasus/data_files/mouse_brain.gmt b/pegasus/data_files/mouse_brain.gmt
new file mode 100644
index 00000000..fa32ef25
--- /dev/null
+++ b/pegasus/data_files/mouse_brain.gmt
@@ -0,0 +1,11 @@
+GlutamatergicNeuron	Glutamatergic neuron	Slc17a7	Slc17a6	Neurod6	Neurod2	
+GABAergicNeuron	GABAergic neuron	Gad1	Gad2	Slc32a1
+Oligodendrocyte	Oligodendrocyte	Plp1	Cnp	Fa2h	St18	Mbp
+OPC	Oligodendrocyte progenitor cell	Pdgfra	Cspg4	Emid1	Fabp7
+SMC	Smooth muscle cell	Acta2	Myh11	Tagln	Pln	Mylk
+Pericyte	Pericyte	Vtn	Atp13a5	Abcc9	Kcnj8	Art3
+Endo	Endothelial cell	Flt1	Pecam1	Ly6a	Slco1a4	Mecom	Ptprb	Id1
+Microglia	Microglia cell	Hexb	Siglech	Selplg	Tmem119	Ctss	P2ry12	Cx3cr1	Trem2	Fcrls	Csf1r
+Astrocyte	Astrocyte	Mt2	Gja1	Prdx6	Htra1	Ntsr2	Aldoc	Apoe	Prex2	Aqp4	Gpr37l1
+PVM	Perivascular macrophages	Mrc1	Stab1	Lyz2	Ms4a6c	F13a1	Pf4
+VLMC	Vascular leptomeningeal cells	Slc7a11	Slc6a13	Bmp6	Igfbp2	Fmod	Ranbp3l
diff --git a/pegasus/data_files/mouse_liver.gmt b/pegasus/data_files/mouse_liver.gmt
new file mode 100644
index 00000000..d9c8bb4b
--- /dev/null
+++ b/pegasus/data_files/mouse_liver.gmt
@@ -0,0 +1,28 @@
+Endo	Endothelial cell	Mmrn2	Cldn5	Adgrl4	Tek	Myct1
+Stellate	Stellate cell	Colec10	Rspo3	Mapt	Lama1	Bmp10
+VSMC	Vascular smooth muscle cell	Cacna1c	Myh11	Notch3	Lmod1	Tagln
+Meso	Mesothelial cell	Ephb1	Cadm2	Prss12	Myl7	Prph
+Fibro	Fibroblast	Col1a1	Mrc2	Plcxd3	Fndc1	Cpxm1
+Hepatocyte	Hepatocyte	Acaa1b	Arg1	Sult2a8	Hgd	Otc
+Cholangiocyte	Cholangiocyte	Spp1	Ddit4l	Sox9	Fgfr3	Plet1
+HSPC	Hepatic stem and progenitor cell	Chrm3	Dmbt1	Slc4a4	Parm1	Pcdh11x	
+T	T cell	Cd3d	Cd3e	Lat	Thy1	Lef1	Trac	Cd28
+B	B cell	Cd19	Ms4a1	Cd79a	Cd79b	Ebf1	Pax5	Fcmr	Bank1
+NK	NK cell	Eomes	Cma1	Klra4	Klra7	Klra8
+ILC1	Innate lymphoid cell type 1	Xcl1	Cd160	Klrc1	Cd200r2	Gzmc
+cDC1	cDC1	Xcr1	Ifi205	Rab7b	Tlr3	Sept3	Hepacam2	Gcsam	Snx22	Itgae	Xlr
+cDC2	cDC2	Cd209a	Ltb4r1	Mgl2	Tnip3	Bex6
+migDC	Migratory DC	Cacnb3	Nudt17	Ccl22	Apol7c	Slco5a1	Ccr7	Fscn1	Il4i1	Mreg	Bcl2l14
+pDC	Plasmacytoid dendritic cell	Siglech	Ccr9	Cox6a2	Cd300c	Klk1
+MonoI	Inflammatory monocyte	Ly6c2	F13a1	Ms4a4c	Ccr2	Gm9733	Mcub
+MonoP	Patrolling monocyte	Ace	Eno3	Ear2	Treml4	Spn	Fcgr4	Lair1	Cd300e	Cd300ld	Adgre4
+PeriMac	Peritoneal macrophage	Lyz1	Saa3	Prg4	Retnla	Cbr2
+Mac	Macrophage	Cd14	Ms4a7	Cx3cr1	Trem2	Hpgds
+Kupffer	Kupffer cell	Cd5l	Clec4f	Vsig4	Folr2	Timd4
+Neutrophil	Neutrophil	S100a8	S100a9	Retnlg	Mmp9	Csf3r	Wfdc21	Il1r2	Cxcr2
+Basophil	Basophil	Cd200r3	Aqp9	Il6	Hgf	Adora2b	Il4	L1cam	Grm6
+Eosinophil	Eosinophil	Epx	Prg3	Eml5	Il5ra	Qsox2	L2hgdh
+Mast	Mast cell	Tph1	Clnk	Hs6st2	Plcg1
+Pericentral	Pericentral liver zonation markers from Halpern et al. Nature 2017 and Guilliams et al. Cell 2022	Mup11	Oat	Rgn	Glul	Cyp2e1	Axin2	Cyp1a2	Gstm3	Psmd4
+Periportal	Periportal liver zonation markers from Halpern et al. Nature 2017 and Guilliams et al. Cell 2022	Cyp2f2	Hal	Sds	Ass1	Asl	Alb	Arg1	Pck1	C2	Sdhd
+Midlobular	Mid-lobular liver zonation markers picked from Fig. 3 and Extended Data Fig 10a of Halpern et al. Nature 2017	Hamp	Igfbp2	Cyp8b1	Mup3	Hamp2	Hsbp8	Ces1d	Cebpa	Fkbp8	Clpp
diff --git a/pegasus/data_files/mouse_lung.gmt b/pegasus/data_files/mouse_lung.gmt
new file mode 100644
index 00000000..0ed0bc5b
--- /dev/null
+++ b/pegasus/data_files/mouse_lung.gmt
@@ -0,0 +1,31 @@
+AT1	AT1 markers from Hurskainen et al. Nat. Commun. 2021 and Negretti et al. Development 2021	Akap5	Rtkn2	Ndnf	Col4a3	Spock2
+AT2	AT2 markers from Hurskainen et al. Nat. Commun. 2021 and Negretti et al. Development 2021	Sftpc	Sftpa1	Lamp3	Hc	Slc34a2 
+Ciliated	Ciliated cell markers from Hurskainen et al. Nat. Commun. 2021 and Negretti et al. Development 2021	Dynlrb2	Tmem212	Foxj1	Ccdc153	Nme5
+Club	Club cell markers from Hurskainen et al. Nat. Commun. 2021 and Negretti et al. Development 2021	Scgb1a1	Scgb3a2	Cckar	Gabrp	Slc16a11
+Basal	Basal cell markers from Montoro et al. Nature 2018 Extended Data Fig. 1d	Aqp3	Krt5	Dapl1	Hspa1a	Trp63
+Goblet	Goblet cell markers from Montoro et al. Nature 2018 Supp Table 1	Scgb3a1	Muc5b	Serpinb11	Gp2	Dmbt1
+Tuft	Tuft cell markers from Sun et al. Dev. Cell 2022 and Montoro et al. Nature 2018 Extended Data Fig. 3b	Pou2f3	Ascl2	Dclk1	Lrmp	Ltc4s	Trpm5	Gnb3	Rgs13
+PNEC	Pulmonary neuroendocrine cell markers from Montoro et al. Nature 2018 Extended Data Fig. 3b & 3c	Ascl1	Chga	Calca	Scg2	Scg5
+Ionocyte	Ionocyte markers from Montoro et al. Nature 2018 Fig. 5a	Foxi1	Ascl3	Smbd1	Moxd1	Atp6v0d2
+Endothelial	Endothelial cell markers inferred from Hurskainen et al. Nat. Commun. 2021 data	Egfl7	Cldn5	Cdh5	Pecam1	Calcrl	Ecscr	Icam2
+Mesothelial	Mesothelial cell markers inferred from Hurskainen et al. Nat. Commun. 2021	Wt1	Upk3b	Rspo1	C2	Sbsn	Aldh1a2	Lrrn4	Cldn15
+Pericyte	Pericyte markers inferred from Hurskainen et al. Nat. Commun. 2021 data	Notch3	Heyl	Parm1	Ndufa4l2	Cox4i2
+Fibroblast	Fibroblast markers inferred from Hurskainen et al. Nat. Commun. 2021 data	Dpt	Clec3b	Pcolce2	Vegfd	Vcam1
+Myofibroblast	Myofibroblast markers inferred from Hurskainen et al. Nat. Commun. 2021 data	Egfem1	Agt	Prag1	Etv1	Trim67
+SMC	Smooth muscle cell markers inferred from Hurskainen et al. Nat. Commun. 2021 data	Tnnt2	Sgcg	Sntg2	Nrtn	Mrvi1	Sbspon
+AlvMf	Alveolar macrophage markers inferred from Hurskainen et al. Nat. Commun. 2021	Atp6v0d2	Olr1	F7	Ear1	Tfec	Gpnmb	Lrp12	Marco
+IntMf	Interstitial macrophage markers inferred from Hurskainen et al. Nat. Commun. 2021	C1qa	C1qb	C1qc	Pf4	Ms4a7	Fcrls
+ILC2	Innate lymphoid cell type 2 markers inferred from Hurskainen et al. Nat. Commun. 2021 data	Gata3	Il1rl1	Arg1	Areg	Il2ra	Csf2	Ccl1	Ccdc184	Calca	Il5
+T	T cell markers inferred from Kaptein et al. Cell 2022 and Hurskainen et al. Nat. Commun. 2021 data	Cd3d	Cd3e	Lat	Thy1	Lef1	Trac	Cd28
+B	B cell markers inferred from Kaptein et al. Cell 2022 and Hurskainen et al. Nat. Commun. 2021 data	Cd19	Ms4a1	Cd79a	Cd79b	Ebf1	Pax5	Fcmr	Bank1
+NK	NK cell markers inferred from Kaptein et al. Cell 2022 and Hurskainen et al. Nat. Commun. 2021 data	Gzma	Klrb1c	Ncr1	Klre1	Klrc2	Eomes	Cma1	Klra4	Klra7	Klra8
+cDC1	cDC1 markers inferred from Kaptein et al. Cell 2022 and Hurskainen et al. Nat. Commun. 2021 data	Xcr1	Ifi205	Rab7b	Tlr3	Sept3	Hepacam2	Gcsam	Snx22	Itgae	Xlr
+cDC2	cDC2 markers from Kaptein et al. Cell 2022	Cd209a	Ltb4r1	Mgl2	Tnip3	Bex6
+migDC	Migratory DC markers inferred from Kaptein et al. Cell 2022 and Hurskainen et al. Nat. Commun. 2021 data	Cacnb3	Nudt17	Ccl22	Apol7c	Slco5a1	Ccr7	Fscn1	Il4i1	Mreg	Bcl2l14
+pDC	Plasmacytoid dendritic cell markers from Kaptein et al. Cell 2022	Siglech	Ccr9	Cox6a2	Cd300c	Klk1
+MonoI	Inflammatory monocyte markers inferred from Kaptein et al. Cell 2022 and Hurskainen et al. Nat. Commun. 2021 data	Ly6c2	F13a1	Ms4a4c	Ccr2	Gm9733	Mcub 
+MonoP	Patrolling monocyte markers inferred from Kaptein et al. Cell 2022 and Hurskainen et al. Nat. Commun. 2021 data	Ace	Eno3	Ear2	Treml4	Spn	Fcgr4	Lair1	Cd300e	Cd300ld	Adgre4
+Neutrophil	Neutrophil markers inferred from Kaptein et al. Cell 2022 and Hurskainen et al. Nat. Commun. 2021 data	S100a8	S100a9	Retnlg	Mmp9	Csf3r	Wfdc21	Il1r2	Cxcr2
+Basophil	Basophil markers inferred from Matsumara et al. Nat. Commun. 2022 data	Cd200r3	Aqp9	Il6	Hgf	Adora2b	Il4	L1cam	Grm6
+Eosinophil	Eosinophil markers inferred from Matsumara et al. Nat. Commun. 2022 data	Epx	Prg3	Eml5	Il5ra	Qsox2	L2hgdh
+Mast	Mast cell markers inferred from Matsumara et al. Nat. Commun. 2022 data	Tph1	Clnk	Hs6st2	Plcg1
diff --git a/pegasus/data_files/tonsil_markers.gmt b/pegasus/data_files/tonsil_markers.gmt
new file mode 100644
index 00000000..bfefe13c
--- /dev/null
+++ b/pegasus/data_files/tonsil_markers.gmt
@@ -0,0 +1,18 @@
+Skeletal muscle cells	Skeletal muscle cells	MYBPC1	TNNT1	TNNC1	MYL1	MYBPH	TNNC2	TNNI1	MYH7	MYL2
+Tfh	T Follicular helper markers (one reference point is https://www.thermofisher.com/us/en/home/life-science/cell-analysis/cell-analysis-learning-center/immunology-at-work/t-follicular-helper-cell-overview.html)	CD4	ST8SIA1	PDCD1	TIGIT	TOX2	ICOS	SH2D1A	IL21
+Tregs	Tregs	CTLA4	TIGIT	IL2RA	FOXP3	CCR8	BATF
+T_Naive	Naive T cell	CCR7	SELL	IL7R	TCF7	CD27
+DC_Migratory	Migratory Conventional Dendritic cell	FSCN1	CCR7	LAMP3	CCL19	CCL22	CD40	BIRC3
+MAIT	MAIT	SLC4A10
+EC lymphatic	Schupp et al. Circulation 2021	PECAM1	CLDN5	CDH5	ERG	CCL21	SEMA3D	PROX1	PDPN	MMRN1	RELN	PKHD1L1	TFF3	LYVE1	FLT4	TBX1
+fDC	Follicular dendritic cell	CXCL13	FCAMR	FDCSP	SERPINE2	PAPPA	NPHS1	PKDCC	SYNM	NRG2	CDC42EP4	MUC3A	PRUNE2	B4GALNT4	NPPC	SLC1A2	TMEM150C
+DCs_CLEC9A	Conventional Dendritic cell type 1	CLEC9A	BATF3	IRF8	CPVL	CADM1
+DCs_CD1C	Conventional Dendritic cell type 2	CD1C	FCER1A	FCGBP	CD1A	CD207	HLA-DQB2
+pDCs	Plasmacytoid Dendritic cell	IRF4	LILRA4	TCF4	MZB1
+B_Naive	Naïve B cell	MS4A1	IGHD	TCL1A	FCER2
+B_Memory	Memory B cell	MS4A1	CD27	TNFRSF13B
+B_Germinal_Center	Germinal center B cell	MEF2B	NEIL1	RGS13	ELL3	BCL7A	BCL6	NUGGC	MYBL1	EML6	FANCA
+B_light_zone	Light Zone	CD83	LMO2
+B_dark_zone	Dark Zone	CXCR4	AICDA	FOXP1	MME
+Mono_DCs	Monocytes Derived DC	CD14	FCGR2B	CCL17	CLEC10A
+MyoF	Myofibroblast from Travaglini et al. and Tony et al.	ACTA2	MYL9	MT2A	EEF1A1	TMSB10	FAU	UBA52	SERF2	PTMA	S100A6
diff --git a/pegasus/pipeline/pipeline.py b/pegasus/pipeline/pipeline.py
index 34626967..5ede69b3 100644
--- a/pegasus/pipeline/pipeline.py
+++ b/pegasus/pipeline/pipeline.py
@@ -92,6 +92,7 @@ def analyze_one_modality(unidata: UnimodalData, output_name: str, is_raw: bool,
         tools.neighbors(
             unidata,
             K=kwargs["K"],
+            exact_k=kwargs["exact_K"],
             rep=dim_key,
             n_jobs=kwargs["n_jobs"],
             random_state=kwargs["random_state"],
diff --git a/pegasus/plotting/plot_library.py b/pegasus/plotting/plot_library.py
index f4a95549..edce18d9 100644
--- a/pegasus/plotting/plot_library.py
+++ b/pegasus/plotting/plot_library.py
@@ -1,6 +1,7 @@
 import numpy as np
 import pandas as pd
 import seaborn as sns
+import matplotlib
 import matplotlib.pyplot as plt
 
 from scipy.sparse import issparse
@@ -31,6 +32,7 @@
     _generate_categories,
     _plot_corners,
     _plot_spots,
+    _get_valid_attrs,
 )
 
 
@@ -45,7 +47,7 @@ def scatter(
     fix_corners: Optional[bool] = True,
     alpha: Optional[Union[float, List[float]]] = 1.0,
     legend_loc: Optional[Union[str, List[str]]] = "right margin",
-    legend_fontsize: Optional[Union[int, List[int]]] = 10, 
+    legend_fontsize: Optional[Union[int, List[int]]] = 10,
     legend_ncol: Optional[str] = None,
     palettes: Optional[Union[str, List[str]]] = None,
     cmaps: Optional[Union[str, List[str]]] = "YlOrRd",
@@ -152,6 +154,9 @@ def scatter(
     elif not is_list_like(attrs):
         attrs = [attrs]
 
+    # Select only valid attributes
+    attrs = _get_valid_attrs(data, attrs)
+
     if isinstance(basis, str):
         basis = [basis]
     if isinstance(components, tuple):
@@ -214,7 +219,7 @@ def scatter(
 
             if global_marker_size == None:
                 global_marker_size = _get_marker_size(x.size) if marker_size is None else marker_size
-            
+
             x_label = f"{basis_}{comp_key[0]}"
             y_label = f"{basis_}{comp_key[1]}"
 
@@ -236,8 +241,6 @@ def scatter(
                     values = slicing(data.X, col = loc)
                 else:
                     obsm_key, sep, component = attr.partition("@")
-                    if (sep != "@") or (obsm_key not in data.obsm) or (not component.isdigit()):
-                        raise KeyError(f"{attr} is not in data.obs, data.var_names or data.obsm!")
                     values = data.obsm[obsm_key][:, int(component)]
 
                 selected = restr_obj.get_satisfied(data, attr)
@@ -864,6 +867,7 @@ def violin(
     hue: Optional[str] = None,
     matkey: Optional[str] = None,
     stripplot: Optional[bool] = False,
+    stripsize: int = 1,
     inner: Optional[str] = None,
     scale: Optional[str] = 'width',
     panel_size: Optional[Tuple[float, float]] = (8, 0.5),
@@ -943,6 +947,9 @@ def violin(
         assert not isinstance(data, anndata.AnnData)
         data.select_matrix(matkey)
 
+    # Filter out attributes not existing in the data
+    attrs = _get_valid_attrs(data, attrs)
+
     nrows = len(attrs)
     fig, axes = _get_subplot_layouts(nrows=nrows, ncols=1, panel_size=panel_size, dpi=dpi, left=left, bottom=bottom, wspace=wspace, hspace=0, squeeze=False, sharey=False)
 
@@ -954,15 +961,20 @@ def violin(
             assert is_numeric_dtype(data.obs[key])
             obs_keys.append(key)
         else:
-            if key not in data.var_names:
-                logger.warning(f"Cannot find gene {key}. Please make sure all genes are included in data.var_names before running this function!")
-                return None
             genes.append(key)
 
     df_list = [pd.DataFrame({"label": data.obs[groupby].values})]
+
     if hue is not None:
         df_list.append(pd.DataFrame({hue: data.obs[hue].values}))
         stripplot = False
+        kwargs['hue'] = hue
+        kwargs['split'] = True
+    else:
+        kwargs['hue'] = "label"
+        kwargs['legend'] = False
+        kwargs['split'] = False
+
     if len(obs_keys) > 0:
         df_list.append(data.obs[obs_keys].reset_index(drop=True))
     if len(genes) > 0:
@@ -973,8 +985,8 @@ def violin(
     for i in range(nrows):
         ax = axes[i, 0]
         if stripplot:
-            sns.stripplot(x="label", y=attrs[i], hue = hue, data=df, ax=ax, size=1, color="k", jitter=True)
-        sns.violinplot(x="label", y=attrs[i], hue = hue, data=df, inner=inner, linewidth=1, ax=ax, cut=0, scale=scale, split=True, palette=palette, **kwargs)
+            sns.stripplot(x="label", y=attrs[i], hue = hue, data=df, ax=ax, size=stripsize, color="k", jitter=True)
+        sns.violinplot(x="label", y=attrs[i], data=df, inner=inner, linewidth=1, ax=ax, cut=0, density_norm=scale, palette=palette, **kwargs)
         ax.grid(False)
 
         if hue is not None:
@@ -987,6 +999,7 @@ def violin(
             ax.set_xlabel("")
         else:
             ax.set_xlabel(groupby)
+            ax.set_xticks(ax.get_xticks())  # Get rid of the UserWarning: set_ticklabels() should only be used with a fixed number of ticks
             ax.set_xticklabels(ax.get_xticklabels(), rotation=90)
         ax.set_ylabel(attrs[i], labelpad=8, rotation=0, horizontalalignment='right', fontsize='medium')
         ax.tick_params(axis='y', right=True, left=False, labelright=True, labelleft=False, labelsize='small')
@@ -1005,16 +1018,26 @@ def violin(
 def heatmap(
     data: Union[MultimodalData, UnimodalData, anndata.AnnData],
     attrs: Union[str, List[str]],
-    groupby: str,
+    groupby: Optional[str] = None,
     matkey: Optional[str] = None,
-    on_average: bool = True,
-    switch_axes: bool = False,
+    gene_zscore: Optional[bool] = True,
+    on_average: Optional[bool] = True,
+    switch_axes: Optional[bool] = False,
     attrs_cluster: Optional[bool] = False,
     attrs_dendrogram: Optional[bool] = True,
+    attrs_method: Optional[bool] = 'ward',
+    attrs_optimal_ordering: Optional[bool] = True,
+    xlabel_size: Optional[float] = 10.0,
+    ylabel_size: Optional[float] = 10.0,
+    legend_fontsize: Optional[float] = 10.0,
+    xlabel_rotation: Optional[float] = 90.0,
+    ylabel_rotation: Optional[float] = 0.0,
     groupby_cluster: Optional[bool] = True,
     groupby_dendrogram: Optional[bool] = True,
-    attrs_labelsize: Optional[float] = 10.0,
-    groupby_labelsize: Optional[float] = 10.0,
+    groupby_method: Optional[bool] = 'ward',
+    groupby_optimal_ordering: Optional[bool] = True,
+    groupby_precomputed_linkage: Optional[np.array] = None,
+    show_sample_name: Optional[bool] = None,
     cbar_labelsize: Optional[float] = 10.0,
     panel_size: Tuple[float, float] = (10, 10),
     return_fig: Optional[bool] = False,
@@ -1026,7 +1049,6 @@ def heatmap(
 
     Parameters
     -----------
-
     data: ``AnnData`` or ``MultimodalData`` or ``UnimodalData`` object
         Single-cell expression data.
     attrs: ``str`` or ``List[str]``
@@ -1034,13 +1056,16 @@ def heatmap(
         Cell attributes must exist in ``data.obs`` and must be numeric.
         Features must exist in ``data.var``.
         By default, attrs are plotted as columns.
-    groupby: ``str``
+    groupby: ``str``, optional, default: ``None``
         A categorical variable in data.obs that is used to categorize the cells, e.g. Clusters.
         By default, data.obs['groupby'] is plotted as rows.
+        If ``None``, use data.obs_names instead.
     matkey: ``str``, optional, default: ``None``
         If matkey is set, select matrix with matkey as keyword in the current modality. Only works for MultimodalData or UnimodalData objects.
+    gene_zscore: ``bool``, optional, default: ``True``
+        If ``True``, compute and then plot z scores for gene expression.
     on_average: ``bool``, optional, default: ``True``
-        If ``True``, plot cluster average gene expression (i.e. show a Matrixplot); otherwise, plot a general heatmap.
+        If ``True``, plot cluster average gene expression or z score (i.e. show a Matrixplot); otherwise, plot a general heatmap.
     switch_axes: ``bool``, optional, default: ``False``
         By default, X axis is for attributes, and Y axis for clusters. If this parameter is ``True``, switch the axes.
         Moreover, with ``on_average`` being ``False``, if ``switch_axes`` is ``False``, ``row_cluster`` is enforced to be ``False``; if ``switch_axes`` is ``True``, ``col_cluster`` is enforced to be ``False``.
@@ -1048,14 +1073,32 @@ def heatmap(
         Cluster attributes and generate a attribute-wise dendrogram.
     attrs_dendrogram: ``bool``, optional, default: ``True``
         Only matters if attrs_cluster is True. Show the dendrogram if this option is True.
+    attrs_method: ``str``, optional, default: ``ward``
+        Linkage method for attrs, choosing from ``single``, ``complete``, ``average``, ``weighted``, ``centroid``, ``median`` and ``ward``.
+    attrs_optimal_ordering: ``bool``, optional, default: ``True``
+        Parameter for scipy.cluster.hierarchy.linkage. If ``True``, the attrs linkage matrix will be reordered so that the distance between successive leaves is minimal.
+    xlabel_size: ``float``, optional, default: 10.0
+        Fontsize for x-axis labels.
+    ylabel_size: ``float``, optional, default: 10.0
+        Fontsize for y-axis labels.
+    legend_fontsize: ``float``, optional, default: 10.0
+        Fontsize for legend labels.
+    xlabel_rotation: ``float``, optional, default: 90.0
+        Rotation of x-axis labels.
+    ylabel_rotation: ``float``, optional, default: 0.0
+        Rotation of y-axis labels.
     groupby_cluster: ``bool``, optional, default: ``True``
         Cluster data.obs['groupby'] and generate a cluster-wise dendrogram.
     groupby_dendrogram: ``bool``, optional, default: ``True``
         Only matters if groupby_cluster is True. Show the dendrogram if this option is True.
-    attrs_labelsize: ``float``, optional, default: 10.0
-        Fontsize for labels of attrs.
-    groupby_labelsize: ``float``, optional, default: 10.0
-        Fontsize for labels of data.obs['groupby'].
+    groupby_method: ``str``, optional, default: ``ward``
+        Linkage method for groupby, choosing from ``single``, ``complete``, ``average``, ``weighted``, ``centroid``, ``median`` and ``ward``.
+    groupby_optimal_ordering: ``bool``, optional, default: ``True``
+        Parameter for scipy.cluster.hierarchy.linkage. If ``True``, the groupby linkage matrix will be reordered so that the distance between successive leaves is minimal.
+    groupby_precomputed_linkage: ``np.array``, optional, default: ``None``
+        Pass a precomputed linkage.
+    show_sample_name: ``bool``, optional, default: ``None``
+        Whether to show sample names as tick labels. If ``None``, show_sample_name is set to ``True`` when groupby is ``None`` and to ``False`` otherwise.
     cbar_labelsize: ``float``, optional, default: 10.0
         Fontsize of the color bar.
     panel_size: ``Tuple[float, float]``, optional, default: ``(10, 10)``
@@ -1072,11 +1115,11 @@ def heatmap(
     -------
 
     ``Figure`` object
-        A ``matplotlib.figure.Figure`` object containing the dot plot if ``return_fig == True``
+        A ``matplotlib.figure.Figure`` object containing the heatmap if ``return_fig == True``; otherwise, a ``seaborn.matrix.ClusterGrid`` object is returned.
 
     Examples
     --------
-    >>> pg.heatmap(data, genes=['CD14', 'TRAC', 'CD34'], groupby='louvain_labels')
+    >>> pg.heatmap(data, attrs=['CD14', 'TRAC', 'CD34'], groupby='leiden_labels')
 
     """
     if not isinstance(data, anndata.AnnData):
@@ -1088,6 +1131,9 @@ def heatmap(
     if isinstance(attrs, str):
         attrs = [attrs]
 
+    # Filter out attributes not existing in the data
+    attrs = _get_valid_attrs(data, attrs)
+
     obs_keys = []
     genes = []
     for key in attrs:
@@ -1100,71 +1146,99 @@ def heatmap(
                 return None
             genes.append(key)
 
-    clusters = data.obs[groupby].values
-    if not is_categorical_dtype(clusters):
-        clusters = pd.Categorical(clusters)
-    else:
-        clusters = clusters.remove_unused_categories()
-    df_list = [pd.DataFrame({'cluster_name': clusters})]
-
+    df_list = []
     if len(obs_keys) > 0:
         df_list.append(data.obs[obs_keys].reset_index(drop=True))
     if len(genes) > 0:
         expr_mat = slicing(data[:, genes].X)
+        if gene_zscore:
+            from scipy.stats import zscore
+            expr_mat = zscore(expr_mat, ddof=1)
         df_list.append(pd.DataFrame(data=expr_mat, columns=genes))
     df = pd.concat(df_list, axis = 1)
-    attr_names = df.columns[1:].values
+    df.index = data.obs_names
+    attr_names = df.columns.values
+
+    if show_sample_name is None:
+        show_sample_name = True if groupby is None else False
+    groupby_tick_labels = df.index if show_sample_name else []
+
+    cluster_ids = None
+    cell_colors = None
+    if groupby is not None:
+        cluster_ids = data.obs[groupby].values
+        if not is_categorical_dtype(cluster_ids):
+            cluster_ids = pd.Categorical(cluster_ids)
+        else:
+            cluster_ids = cluster_ids.remove_unused_categories()
+
+        if on_average:
+            if not 'cmap' in kwargs.keys():
+                kwargs['cmap'] = 'Reds'
+            df['cluster_name'] = cluster_ids
+            df = df.groupby(by='cluster_name', observed=True).mean()
+            cluster_ids = df.index
+            groupby_tick_labels = cluster_ids
+        else:
+            if not groupby_cluster:
+                idx = cluster_ids.argsort(kind = 'mergesort')
+                df = df.iloc[idx, :]  # organize df by category order
+                cluster_ids = cluster_ids[idx]
 
-    if on_average:
-        if not 'cmap' in kwargs.keys():
-            kwargs['cmap'] = 'Reds'
-        df = df.groupby('cluster_name').mean()
-        cluster_ids = df.index
-    else:
-        cluster_ids = df.pop('cluster_name').values
-        if not groupby_cluster:
-            idx = cluster_ids.argsort(kind = 'mergesort')
-            df = df.iloc[idx, :]  # organize df by category order
-            cluster_ids = cluster_ids[idx]
+            cell_colors = np.zeros(df.shape[0], dtype=object)
+            palette = _get_palette(cluster_ids.categories.size)
+
+            for k, cat in enumerate(cluster_ids.categories):
+                cell_colors[cluster_ids == cat] = palette[k]
 
-        cell_colors = np.zeros(df.shape[0], dtype=object)
-        palette = _get_palette(cluster_ids.categories.size)
+    from scipy.cluster.hierarchy import linkage
 
-        for k, cat in enumerate(cluster_ids.categories):
-            cell_colors[cluster_ids == cat] = palette[k]
+    groupby_linkage = None
+    if groupby_cluster:
+        if groupby_precomputed_linkage is not None:
+            groupby_linkage = groupby_precomputed_linkage
+        else:
+            groupby_linkage = linkage(df, groupby_method, optimal_ordering = groupby_optimal_ordering)
+    attrs_linkage = None
+    if attrs_cluster:
+        attrs_linkage = linkage(df.T, attrs_method, optimal_ordering = attrs_optimal_ordering)
 
     if not switch_axes:
         cg = sns.clustermap(
             data=df,
-            row_colors=cell_colors if not on_average else None,
+            row_colors=cell_colors,
             col_colors=None,
             row_cluster=groupby_cluster,
             col_cluster=attrs_cluster,
+            row_linkage=groupby_linkage,
+            col_linkage=attrs_linkage,
             linewidths=0,
-            yticklabels=cluster_ids if on_average else [],
+            yticklabels=groupby_tick_labels,
             xticklabels=attr_names,
             figsize=panel_size,
             **kwargs,
         )
         cg.ax_heatmap.set_ylabel("")
-        if attrs_labelsize is not None:
-            cg.ax_heatmap.tick_params(axis='x', labelsize=attrs_labelsize, labelrotation=75)
+        cg.ax_heatmap.tick_params(axis='x', labelsize=xlabel_size, labelrotation=xlabel_rotation)
+        cg.ax_heatmap.tick_params(axis='y', labelsize=ylabel_size, labelrotation=ylabel_rotation)
     else:
         cg = sns.clustermap(
             data=df.T,
             row_colors=None,
-            col_colors=cell_colors if not on_average else None,
+            col_colors=cell_colors,
             row_cluster=attrs_cluster,
             col_cluster=groupby_cluster,
+            row_linkage=attrs_linkage,
+            col_linkage=groupby_linkage,
             linewidths=0,
             yticklabels=attr_names,
-            xticklabels=cluster_ids if on_average else [],
+            xticklabels=groupby_tick_labels,
             figsize=panel_size,
             **kwargs,
         )
         cg.ax_heatmap.set_xlabel("")
-        if attrs_labelsize is not None:
-            cg.ax_heatmap.tick_params(axis='y', labelsize=attrs_labelsize)
+        cg.ax_heatmap.tick_params(axis='y', labelsize=ylabel_size, labelrotation=ylabel_rotation)
+        cg.ax_heatmap.tick_params(axis='x', labelsize=xlabel_size, labelrotation=xlabel_rotation)
 
     show_row_dendrogram = (attrs_cluster and attrs_dendrogram) if switch_axes else (groupby_cluster and groupby_dendrogram)
     show_col_dendrogram = (groupby_cluster and groupby_dendrogram) if switch_axes else (attrs_cluster and attrs_dendrogram)
@@ -1193,21 +1267,17 @@ def heatmap(
         cg.ax_cbar.yaxis.set_ticks_position("right")
 
 
-    if show_col_dendrogram:
-        cg.ax_heatmap.xaxis.tick_bottom()
-        cg.ax_col_dendrogram.set_visible(True)
-    else:
-        cg.ax_heatmap.xaxis.tick_top()
-        cg.ax_col_dendrogram.set_visible(False)
+    cg.ax_heatmap.xaxis.tick_bottom()
+    cg.ax_col_dendrogram.set_visible(show_col_dendrogram)
 
     cg.ax_cbar.tick_params(labelsize=cbar_labelsize)
     cg.fig.dpi = dpi
 
-    if not on_average:
+    if (groupby is not None) and (not on_average):
         if groupby_cluster:
             from matplotlib.patches import Patch
             legend_elements = [Patch(color = color, label = label) for color, label in zip(palette, cluster_ids.categories)]
-            cg.ax_heatmap.legend(handles=legend_elements, loc='lower left', bbox_to_anchor = (1.02, 1.02), fontsize = groupby_labelsize)
+            cg.ax_heatmap.legend(handles=legend_elements, loc='lower left', bbox_to_anchor = (1.02, 1.02), fontsize = legend_fontsize)
         else:
             values = cluster_ids.value_counts().values
             ticks = np.cumsum(values) - values / 2
@@ -1221,20 +1291,20 @@ def heatmap(
                 cg.ax_col_colors.xaxis.tick_top()
                 cg.ax_col_colors.set_xticks(ticks)
                 cg.ax_col_colors.set_xticklabels(labels, rotation=45)
-                cg.ax_col_colors.tick_params(axis='x', top = False, labelsize = groupby_labelsize, length=10)
+                cg.ax_col_colors.tick_params(axis='x', top = False, labelsize = xlabel_size, length=10)
 
     if not isinstance(data, anndata.AnnData):
         if cur_matkey != data.current_matrix():
             data.select_matrix(cur_matkey)
 
-    return cg.fig if return_fig else None
+    return cg.fig if return_fig else cg
 
 
 def dotplot(
     data: Union[MultimodalData, UnimodalData, anndata.AnnData],
     genes: Union[str, List[str]],
     groupby: str,
-    reduce_function: Callable[[np.ndarray], float] = np.mean,
+    reduce_function: Union[str, Callable[[np.ndarray], float]] = "mean",
     fraction_min: float = 0,
     fraction_max: float = None,
     dot_min: int = 0,
@@ -1259,7 +1329,7 @@ def dotplot(
         Features to plot.
     groupby: ``str``
         A categorical variable in data.obs that is used to categorize the cells, e.g. Clusters.
-    reduce_function: ``Callable[[np.ndarray], float]``, optional, default: ``np.mean``
+    reduce_function: ``Union[str, Callable[[np.ndarray], float]]``, optional, default: ``"mean"``
         Function to calculate statistic on expression data. Default is mean.
     fraction_min: ``float``, optional, default: ``0``.
         Minimum fraction of expressing cells to consider.
@@ -1298,12 +1368,14 @@ def dotplot(
     sns.set(font_scale=0.7, style='whitegrid')
 
     if not is_list_like(genes):
-        geness = [genes]
+        genes = [genes]
+
+    # Select only genes existing in the data
+    genes = _get_valid_attrs(data, genes)
 
     keywords = dict(cmap=cmap)
     keywords.update(kwds)
 
-    from scipy.sparse import issparse
     X = slicing(data[:, genes].X)
     df = pd.DataFrame(data=X, columns=genes)
     df[groupby] = data.obs[groupby].values
@@ -1316,12 +1388,12 @@ def dotplot(
     idx = series == 0
     if idx.sum() > 0:
         logger.warning(f"The following categories contain no cells and are removed: {','.join(list(series.index[idx]))}.")
-        df[groupby] = df[groupby].cat.remove_unused_categories()
 
     def non_zero(g):
         return np.count_nonzero(g) / g.shape[0]
 
-    summarized_df = df.groupby(groupby).aggregate([reduce_function, non_zero])
+    # Set observed=True to suppress warnings.
+    summarized_df = df.groupby(by=groupby, observed=True).aggregate([reduce_function, non_zero])
 
     row_indices = summarized_df.index.tolist()
     if sort_function == "natsorted":
@@ -1359,9 +1431,9 @@ def non_zero(g):
     yticks = summarized_df.index.map(str).values
 
     if switch_axes:
-        x, y = y, x
-        xlabel, ylabel = ylabel, xlabel
-        xticks, yticks = yticks, xticks
+        x, y = y[::-1], x[::-1]
+        xlabel, ylabel = ylabel[::-1], xlabel[::-1]
+        xticks, yticks = yticks[::-1], xticks[::-1]
 
     dotplot_df = pd.DataFrame(data=dict(x=x, y=y, value=summary_values, pixels=pixels, fraction=fraction,
                     xlabel=np.array(xlabel)[x], ylabel=np.array(ylabel)[y]))
@@ -1440,7 +1512,7 @@ def non_zero(g):
     size_legend.grid(False)
 
     # Reset global settings.
-    sns.reset_orig()
+    matplotlib.rc_file_defaults()
 
     return fig if return_fig else None
 
@@ -1497,7 +1569,7 @@ def dendrogram(
     linkage: ``str``, optional, default: ``complete``
         Which linkage criterion to use, used by hierarchical clustering. Below are available options:
             - ``ward`` minimizes the variance of the clusters being merged.
-            - ``avarage`` uses the average of the distances of each observation of the two sets.
+            - ``average`` uses the average of the distances of each observation of the two sets.
             - ``complete`` uses the maximum distances between all observations of the two sets. (Default)
             - ``single`` uses the minimum of the distances between all observations of the two sets.
 
diff --git a/pegasus/plotting/plot_utils.py b/pegasus/plotting/plot_utils.py
index 0c61cb58..e0b6fde8 100644
--- a/pegasus/plotting/plot_utils.py
+++ b/pegasus/plotting/plot_utils.py
@@ -9,6 +9,9 @@
 from matplotlib.patches import Circle
 from matplotlib.collections import PatchCollection
 
+import logging
+logger = logging.getLogger(__name__)
+
 
 def _transform_basis(basis: str) -> str:
     if basis == "tsne":
@@ -435,3 +438,24 @@ def _plot_spots(x: np.ndarray, y: np.ndarray, c: Union[str, np.ndarray], s: floa
         spots.set_clim(vmin, vmax)
     ax.add_collection(spots)
     return spots
+
+
+def _get_valid_attrs(data:Union[MultimodalData, UnimodalData], attrs: List[str]) -> List[str]:
+    attrs_filt = []
+    attrs_drop = []
+    for attr in attrs:
+        if (attr == '_all') or (attr in data.obs) or (attr in data.var_names) or ('@' in attr):
+            if not '@' in attr:
+                attrs_filt.append(attr)
+            else:
+                obsm_key, sep, component = attr.partition("@")
+                if (sep != "@") or (obsm_key not in data.obsm) or (not component.isdigit()):
+                    attrs_drop.append(attr)
+                else:
+                    attrs_filt.append(attr)
+        else:
+            attrs_drop.append(attr)
+    if len(attrs_drop) > 0:
+        logger.warning(f"Attributes {attrs_drop} are not in data.obs, data.var_names or data.obsm!")
+
+    return attrs_filt
diff --git a/pegasus/tools/__init__.py b/pegasus/tools/__init__.py
index ac0f149e..3b5eebd3 100644
--- a/pegasus/tools/__init__.py
+++ b/pegasus/tools/__init__.py
@@ -56,7 +56,7 @@
     net_umap,
     net_fle,
 )
-from .diff_expr import de_analysis, markers, write_results_to_excel, run_de_analysis
+from .diff_expr import de_analysis, markers, write_results_to_excel, cluster_specific_markers, run_de_analysis
 from .gradient_boosting import find_markers, run_find_markers
 from .subcluster_utils import clone_subset
 from .signature_score import calc_signature_score, calculate_z_score
diff --git a/pegasus/tools/clustering.py b/pegasus/tools/clustering.py
index 29c50d5b..432129e5 100644
--- a/pegasus/tools/clustering.py
+++ b/pegasus/tools/clustering.py
@@ -1,6 +1,7 @@
 import time
 import numpy as np
 import pandas as pd
+from pandas.api.types import is_categorical_dtype
 from pegasusio import MultimodalData
 from natsort import natsorted
 
@@ -643,10 +644,11 @@ def split_one_cluster(
     n_clust: int,
     res_label: str,
     rep: str = "pca",
+    n_comps: int = None,
     random_state: int = 0,
 ) -> None:
     """
-    Use Leiden algorithm to split 'clust_id' in 'clust_label' into 'n_components' clusters and write the new clusting results to 'res_label'. Assume 'clust_label' named clusters as numbers (in str format).
+    Use Leiden algorithm to split 'clust_id' in 'clust_label' into 'n_clust' sub-clusters and write the new clustering results to 'res_label'. The sub-cluster names are the concatenation of the original cluster name and the sub-cluster id (e.g. 'T' -> 'T-1', 'T-2').
 
     Parameters
     ----------
@@ -663,11 +665,14 @@ def split_one_cluster(
         Split 'clust_id' into `n_clust' subclusters.
 
     res_label: `str`,
-        Write new clustering in data.obs['res_label']. The largest subcluster will use 'clust_id' as its cluster ID, while other subclusters will be numbered after existing clusters.
+        Write new clustering in data.obs['res_label']. The sub-cluster names are the concatenation of original cluster name and the subcluster id (e.g. 'T' -> 'T-1', 'T-2').
 
     rep: ``str``, optional, default: ``"pca"``
         The embedding representation used for Kmeans clustering. Keyword ``'X_' + rep`` must exist in ``data.obsm``. By default, use PCA coordinates.
 
+    n_comps: `int`, optional (default: None)
+        Number of components to be used in the `rep`. If n_comps == None, use all components; otherwise, use the minimum of n_comps and rep's dimensions.
+
     n_jobs : `int`, optional (default: -1)
         Number of threads to use for the KMeans step in 'spectral_louvain' and 'spectral_leiden'. -1 refers to using all physical CPU cores.
 
@@ -685,16 +690,35 @@ def split_one_cluster(
     --------
     >>> pg.split_one_cluster(data, 'leiden_labels', '15', 2, 'leiden_labels_split')
     """
-    idx = np.where(data.obs[clust_label] == clust_id)[0]
+    cats = None
+    if is_categorical_dtype(data.obs[clust_label]):
+        cats = data.obs[clust_label].cat.categories.values
+    else:
+        cats = pd.Categorical(data.obs[clust_label]).categories.values
+        if cats.dtype.kind not in {'S', 'U'}:
+            cats = cats.astype(str)
+    idx_cat = np.nonzero(cats==clust_id)[0]
+
+    if idx_cat.size == 0:
+        raise ValueError(f"{clust_id} is not in {clust_label}!")
+    elif idx_cat.size > 1:
+        raise ValueError(f"Detected more than one categories in {clust_label} with name {clust_id}!")
+    else:
+        idx_cat = idx_cat[0]
+
+    idx = np.nonzero((data.obs[clust_label] == clust_id).values)[0]
     tmpdat = data[idx].copy()
     from pegasus.tools import neighbors
-    neighbors(tmpdat, rep=rep, use_cache=False)
+    neighbors(tmpdat, rep=rep, n_comps=n_comps, use_cache=False)
     leiden(tmpdat, rep=rep, resolution=None, n_clust=n_clust, random_state=random_state)
-    new_clust = data.obs[clust_label].values.astype(int)
-    new_label = new_clust.max() + 1
-    for label in tmpdat.obs['leiden_labels'].value_counts().index[1:]:
-        new_clust[idx[(tmpdat.obs['leiden_labels'] == label).values]] = new_label
-        new_label += 1
-    data.obs[res_label] = pd.Categorical(values = new_clust.astype(str), categories = np.array(range(1, new_label)).astype(str))
+
+    new_clust = data.obs[clust_label].values.astype(object)
+    cats_sub = []
+    for i, label in enumerate(tmpdat.obs['leiden_labels'].value_counts().index):
+        sub_id = f"{clust_id}-{i+1}"
+        new_clust[idx[(tmpdat.obs['leiden_labels'] == label).values]] = sub_id
+        cats_sub.append(sub_id)
+
+    data.obs[res_label] = pd.Categorical(values = new_clust, categories = np.concatenate((cats[0:idx_cat], np.array(cats_sub), cats[idx_cat+1:])))
     data.register_attr(res_label, "cluster")
     del tmpdat
diff --git a/pegasus/tools/diff_expr.py b/pegasus/tools/diff_expr.py
index 45461628..f83c01e1 100644
--- a/pegasus/tools/diff_expr.py
+++ b/pegasus/tools/diff_expr.py
@@ -419,7 +419,7 @@ def de_analysis(
     n_jobs: ``int``, optional, default: ``-1``
         Number of threads to use. If ``-1``, use all available threads.
 
-    t: ``bool``, optional, default: ``True``
+    t: ``bool``, optional, default: ``False``
         If ``True``, calculate Welch's t test.
 
     fisher: ``bool``, optional, default: ``False``
@@ -756,6 +756,62 @@ def add_worksheet(
     logger.info("Excel spreadsheet is written.")
 
 
+def cluster_specific_markers(
+    markers: Dict[str, Dict[str, pd.DataFrame]],
+    clust_id: str,
+    min_auroc: float = 0.7,
+    expected_pfc: float = 10.0,
+    n_lo: int = 25,
+    n_up: int = 50,
+) -> pd.DataFrame:
+    """ Extract cluster-specific markers from DE results ``markers``.
+
+    This function extracts cluster-specific markers (e.g. with auroc >= min_auroc and high in percentage fold change). The extracted markers can be screened for signatures representing the cluster.
+
+    The selection procedure is as follows: First, pick genes with AUROC >= min_auroc and pfc (percentage fold change) >= expected_pfc. If the number is between [n_lo, n_up], return the subset of markers containing only these genes. Otherwise, if the number < n_lo, extend the gene set to include up to n_lo genes in descending order of their pfc. If the number > n_up, truncate the set by keeping only n_up genes with highest pfc.
+
+    Parameters
+    ----------
+    markers: ``Dict[str, Dict[str, pd.DataFrame]]``
+        Markers from `de_analysis`.
+
+    clust_id: ``str``
+        Cluster ID to tell which cluster to focus on.
+
+    min_auroc: ``float``, optional, default: ``0.7``
+        Minimum AUROC for a gene.
+
+    expected_pfc: ``float``, optional, default: ``10.0``
+        Expected percentage fold change for a gene.
+
+    n_lo: ``int``, optional, default: ``25``
+        Lower bound (inclusive) on the number of genes to return.
+
+    n_up: ``int``, optional, default: ``50``
+        Upper bound (inclusive) on the number of genes to return.
+
+    Returns
+    -------
+    results: ``pd.DataFrame``
+        A Python dataframe containing selected markers, ranking in descending order with respect to AUROC.
+
+    Examples
+    --------
+    >>> candidates = pg.cluster_specific_markers(markers, 'Mono')
+    """
+    df = markers[clust_id]['up']
+    idx_auc = df['auroc'] >= min_auroc
+    idx_epf = df['percentage_fold_change'] >= expected_pfc
+    idx = idx_auc & idx_epf
+    n = idx.sum()
+    if n >= n_lo and n <= n_up:
+        return df[idx]
+    else:
+        res = df[idx_auc].sort_values('percentage_fold_change', ascending=False)
+        res = res.iloc[0:(n_lo if n < n_lo else n_up)].sort_values('auroc', ascending=False)
+        return res
+
+
 @timer(logger=logger)
 def run_de_analysis(
     input_file: str,
diff --git a/pegasus/tools/doublet_detection.py b/pegasus/tools/doublet_detection.py
index 26aadd4a..f9f69393 100644
--- a/pegasus/tools/doublet_detection.py
+++ b/pegasus/tools/doublet_detection.py
@@ -267,7 +267,7 @@ def _run_scrublet(
         If True, plot diagnostic histograms. Each sample would have a figure consisting of 4 panels showing histograms of doublet scores for observed cells (panel 1, density in log scale), simulated doublets (panel 2, density in log scale), KDE plot (panel 3) and signed curvature plot (panel 4) of log doublet scores for simulated doublets.
 
     manual_correction: ``str``, optional, default: ``None``
-        If present, use human guide provided in manual_correction to select threshold. Currently support 'peak' and 'expected'. 'peak' means cutting at the center of the peak and 'expected' means cutting at the expected doublet rate.
+        If present, use human guide provided in manual_correction to select threshold. Currently supports 'peak', 'expected' and a numeric threshold. 'peak' means cutting at the center of the peak and 'expected' means cutting at the expected doublet rate. If the guide is neither of the two, it is converted to float and used as the user-specified threshold.
 
     Returns
     --------
@@ -349,7 +349,7 @@ def _run_scrublet(
     if k is None:
         k = int(round(0.5 * np.sqrt(obsX.shape[0])))
     k_adj = int(round(k * (1.0 + r)))
-    indices, _ = calculate_nearest_neighbors(pc_coords, K = k_adj + 1, n_jobs = n_jobs)
+    indices, _, _ = calculate_nearest_neighbors(pc_coords, K=k_adj + 1, n_jobs=n_jobs, exact_k=True)
 
     # Calculate scrublet-like doublet score
     k_d = is_doublet[indices].sum(axis = 1)
@@ -420,6 +420,8 @@ def _run_scrublet(
             threshold = np.exp(x[maxima_by_x[-1]])
         elif manual_correction == "expected":
             threshold = threshold_theory
+        else:
+            threshold = float(manual_correction)
 
     data.obs["doublet_score"] = obs_scores.astype(np.float32)
     data.obs["pred_dbl"] = obs_scores > threshold
@@ -474,7 +476,7 @@ def infer_doublets(
     data: MultimodalData,
     channel_attr: Optional[str] = None,
     clust_attr: Optional[str] = None,
-    raw_mat_key: Optional[str] = 'counts',
+    raw_mat_key: Optional[str] = None,
     min_cell: Optional[int] = 100,
     expected_doublet_rate: Optional[float] = None,
     sim_doublet_ratio: Optional[float] = 2.0,
@@ -501,6 +503,9 @@ def infer_doublets(
     clust_attr: ``str``, optional, default: None
         Attribute indicating cluster labels. If set, estimate proportion of doublets in each cluster and statistical significance.
 
+    raw_mat_key: ``str``, optional, default: None
+        The key for raw count matrix. By default, Pegasus will first try "counts" and then try "raw.X".
+
     min_cell: ``int``, optional, default: 100
         Minimum number of cells per sample to calculate doublet scores. For samples having less than 'min_cell' cells, doublet score calculation will be skipped.
 
@@ -529,7 +534,7 @@ def infer_doublets(
         If not None, plot diagnostic histograms using ``plot_hist`` as the prefix. If `channel_attr` is None, ``plot_hist.dbl.png`` is generated; Otherwise, ``plot_hist.channel_name.dbl.png`` files are generated. Each figure consists of 4 panels showing histograms of doublet scores for observed cells (panel 1, density in log scale), simulated doublets (panel 2, density in log scale), KDE plot (panel 3) and signed curvature plot (panel 4) of log doublet scores for simulated doublets. Each plot contains two dashed lines. The red dashed line represents the theoretical cutoff (calucalted based on number of cells and 10x doublet table) and the black dashed line represents the cutof inferred from the data.
     
     manual_correction: ``str``, optional, default: ``None``
-        Use human guide to correct doublet threshold for certain channels. This is string representing a comma-separately list. Each item in the list represent one sample and the sample name and correction guide are separated using ':'. The orrection guides supported are 'peak' and 'expected'. 'peak' means cutting at the center of the peak and 'expected' means cutting at the expected doublet rate. If only one sample available, use '' as the sample name.
+        Use human guide to correct doublet threshold for certain channels. This is a string representing a comma-separated list. Each item in the list represents one sample, and the sample name and correction guide are separated using ':'. The correction guides supported are 'peak', 'expected' and threshold. 'peak' means cutting at the center of the peak; 'expected' means cutting at the expected doublet rate; threshold is the user-specified doublet threshold; if the guide is neither 'peak' nor 'expected', pegasus will try to convert the string into float and use it as the doublet threshold. If only one sample is available, there is no need to specify the sample name.
 
     Returns
     -------
@@ -545,6 +550,11 @@ def infer_doublets(
     >>> pg.infer_doublets(data, channel_attr = 'Channel', clust_attr = 'Annotation')
     """
     assert data.get_modality() == "rna"
+
+    if raw_mat_key is None:
+        raw_mat_key = 'counts'
+        if raw_mat_key not in data.list_keys():
+            raw_mat_key = 'raw.X'
     try:
         rawX = data.get_matrix(raw_mat_key)
     except ValueError:
@@ -554,10 +564,13 @@ def infer_doublets(
 
     mancor = {}
     if manual_correction is not None:
-        for item in manual_correction.split(','):
-            name, action = item.split(':')
-            mancor[name] = action
-
+        if channel_attr is None:
+            mancor[''] = manual_correction
+        else:
+            for item in manual_correction.split(','):
+                name, action = item.split(':')
+                mancor[name] = action
+            
     if channel_attr is None:
         if data.shape[0] >= min_cell:
             fig = _run_scrublet(data, raw_mat_key, expected_doublet_rate = expected_doublet_rate, sim_doublet_ratio = sim_doublet_ratio, \
@@ -586,9 +599,9 @@ def infer_doublets(
             if idx.size >= min_cell:
                 unidata = UnimodalData({"barcodekey": data.obs_names[idx]}, 
                                        {"featurekey": data.var_names},
-                                       {"counts": rawX[idx]},
+                                       {raw_mat_key: rawX[idx]},
                                        {"genome": genome, "modality": modality},
-                                       cur_matrix = "counts")
+                                       cur_matrix = raw_mat_key)
                 # Identify robust genes, count and log normalized and select top 2,000 highly variable features
                 identify_robust_genes(unidata)
                 log_norm(unidata)
diff --git a/pegasus/tools/nearest_neighbors.py b/pegasus/tools/nearest_neighbors.py
index fdc16109..fd95e4ca 100644
--- a/pegasus/tools/nearest_neighbors.py
+++ b/pegasus/tools/nearest_neighbors.py
@@ -34,27 +34,61 @@ def calculate_nearest_neighbors(
     K: int = 100,
     n_jobs: int = -1,
     method: str = "hnsw",
+    exact_k: bool = False,
     M: int = 20,
     efC: int = 200,
     efS: int = 200,
     random_state: int = 0,
     full_speed: int = False,
     dist: str = 'l2',
-):
-    """Calculate nearest neighbors
-    X is the sample by feature matrix
-    Return K -1 neighbors, the first one is the point itself and thus omitted.
-    TODO: Documentation
-    """
+) -> Tuple[List[int], List[float], int]:
+    """Find K nearest neighbors for each data point in the matrix and return the indices and distances arrays.
+
+    K is determined by min(K, int(sqrt(X.shape[0]))) if exact_k == False.
+
+    Parameters
+    ----------
+
+    X : `np.array`
+        An array of n_samples by n_features.
+    K : `int`, optional (default: 100)
+        Number of neighbors, including the data point itself. If K is None, determine K by sqrt(X.shape[0]).
+    n_jobs : `int`, optional (default: -1)
+        Number of threads to use. -1 refers to using all physical CPU cores.
+    method: `str`, optional (default: 'hnsw')
+        Choosing from 'hnsw' for approximate nearest neighbor search or 'sklearn' for exact nearest neighbor search. If X.shape[0] <= 1000, method will be automatically set to "sklearn" for exact KNN search
+    exact_k: `bool`, optional (default: False)
+        If True, use exactly the K passed to the function; otherwise K is determined as min(K, sqrt(X.shape[0])).
+    M, efC, efS: `int`, optional (20, 200, 200)
+        HNSW algorithm parameters.
+    random_state: `int`, optional (default: 0)
+        Random seed for random number generator.
+    full_speed: `bool`, optional (default: False)
+        If full_speed, use multiple threads in constructing hnsw index. However, the kNN results are not reproducible. If not full_speed, use only one thread to make sure results are reproducible.
+    dist: `str`, optional (default: 'l2')
+        Distance metric to use. By default, use squared L2 distance. Available options, 'l2', inner product 'ip' or cosine similarity 'cosine'.
+
+    Returns
+    -------
 
+    kNN indices array, distances array and adjusted K.
+
+    Examples
+    --------
+    >>> indices, distances, K = calculate_nearest_neighbors(X)
+    """
     nsample = X.shape[0]
 
     if nsample <= 1000:
         method = "sklearn"
 
-    if nsample < K:
-        logger.warning(f"Warning: in calculate_nearest_neighbors, number of samples = {nsample} < K = {K}!\n Set K to {nsample}.")
-        K = nsample
+    k_rot = int(nsample ** 0.5) # rot, rule of thumb
+    if (K is None) or (K > k_rot and (not exact_k)):
+        K = k_rot
+        logger.info(f"in calculate_nearest_neighbors, K is adjusted to {K}.")
+
+    if K == 1:
+        return np.zeros((nsample, 0), dtype=int), np.zeros((nsample, 0), dtype=np.float32), K
 
     n_jobs = eff_n_jobs(n_jobs)
 
@@ -91,7 +125,7 @@ def calculate_nearest_neighbors(
         knn.fit(X)
         distances, indices = knn.kneighbors()
 
-    return indices, distances
+    return indices, distances, K
 
 
 def knn_is_cached(
@@ -114,11 +148,15 @@ def get_neighbors(
     n_jobs: int = -1,
     random_state: int = 0,
     full_speed: bool = False,
-    use_cache: bool = True,
+    use_cache: bool = False,
     dist: str = "l2",
-) -> Tuple[List[int], List[float]]:
+    method: str = "hnsw",
+    exact_k: bool = False,
+) -> Tuple[List[int], List[float], int]:
     """Find K nearest neighbors for each data point and return the indices and distances arrays.
 
+    K is determined by min(K, int(sqrt(data.shape[0]))) if exact_k == False.
+
     Parameters
     ----------
 
@@ -136,34 +174,44 @@ def get_neighbors(
         Random seed for random number generator.
     full_speed: `bool`, optional (default: False)
         If full_speed, use multiple threads in constructing hnsw index. However, the kNN results are not reproducible. If not full_speed, use only one thread to make sure results are reproducible.
-    use_cache: `bool`, optional (default: True)
+    use_cache: `bool`, optional (default: False)
         If use_cache and found cached knn results, will not recompute.
     dist: `str`, optional (default: 'l2')
-        Distance metric to use. By default, use squared L2 distance. Available options, inner product 'ip' or cosine similarity 'cosine'.
+        Distance metric to use. By default, use squared L2 distance. Available options, 'l2' or inner product 'ip' or cosine similarity 'cosine'.
+    method: `str`, optional (default: 'hnsw')
+        Choosing from 'hnsw' for approximate nearest neighbor search or 'sklearn' for exact nearest neighbor search.
+    exact_k: `bool`, optional (default: False)
+        If True, use exactly the K passed to the function; otherwise K is determined as min(K, sqrt(X.shape[0])).
 
     Returns
     -------
 
-    kNN indices and distances arrays.
+    kNN indices array, distances array, and adjusted K.
 
     Examples
     --------
-    >>> indices, distances = tools.get_neighbors(data)
+    >>> indices, distances, K = tools.get_neighbors(data)
     """
-
     rep = update_rep(rep)
     indices_key = rep + "_knn_indices"
     distances_key = rep + "_knn_distances"
 
+    k_rot = int(data.shape[0] ** 0.5) # rot, rule of thumb
+    if (K is None) or (K > k_rot and (not exact_k)):
+        K = k_rot
+        logger.info(f"in get_neighbors, K is adjusted to {K}.")
+
     if use_cache and knn_is_cached(data, indices_key, distances_key, K):
         indices = data.obsm[indices_key]
         distances = data.obsm[distances_key]
         logger.info("Found cached kNN results, no calculation is required.")
     else:
-        indices, distances = calculate_nearest_neighbors(
+        indices, distances, _ = calculate_nearest_neighbors(
             X_from_rep(data, rep, n_comps),
             K=K,
             n_jobs=eff_n_jobs(n_jobs),
+            method=method,
+            exact_k=exact_k,
             random_state=random_state,
             full_speed=full_speed,
             dist=dist,
@@ -173,7 +221,7 @@ def get_neighbors(
         data.obsm[distances_key] = distances
         data.register_attr(distances_key, "knn")
 
-    return indices, distances
+    return indices, distances, K
 
 
 def get_symmetric_matrix(csr_mat: "csr_matrix") -> "csr_matrix":
@@ -235,13 +283,17 @@ def neighbors(
     n_jobs: int = -1,
     random_state: int = 0,
     full_speed: bool = False,
-    use_cache: bool = True,
+    use_cache: bool = False,
     dist: str = "l2",
+    method: str = "hnsw",
+    exact_k: bool = False,
 ) -> None:
     """Compute k nearest neighbors and affinity matrix, which will be used for diffmap and graph-based community detection algorithms.
 
     The kNN calculation uses `hnswlib <https://github.com/nmslib/hnswlib>`_ introduced by [Malkov16]_.
 
+    K is determined by min(K, int(sqrt(data.shape[0]))) if exact_k == False.
+
     Parameters
     ----------
 
@@ -267,12 +319,18 @@ def neighbors(
         * If ``True``, use multiple threads in constructing ``hnsw`` index. However, the kNN results are not reproducible.
         * Otherwise, use only one thread to make sure results are reproducible.
 
-    use_cache: ``bool``, optional, default: ``True``
+    use_cache: ``bool``, optional, default: ``False``
         * If ``True`` and found cached knn results, Pegasus will use cached results and do not recompute.
         * Otherwise, compute kNN irrespective of caching status.
 
     dist: ``str``, optional (default: ``"l2"``)
-        Distance metric to use. By default, use squared L2 distance. Available options, inner product ``"ip"`` or cosine similarity ``"cosine"``.
+        Distance metric to use. By default, use squared L2 distance. Available options, ``"l2"`` or inner product ``"ip"`` or cosine similarity ``"cosine"``.
+
+    method: ``str``, optional (default: ``"hnsw"``)
+        Choose from "hnsw" or "sklearn". "hnsw" uses HNSW algorithm for approximate nearest neighbor search and "sklearn" uses sklearn package for exact nearest neighbor search.
+
+    exact_k: ``bool``, optional (default: ``False``)
+        If True, use exactly the K passed to the function; otherwise K is determined as min(K, sqrt(X.shape[0])).
 
     Returns
     -------
@@ -292,7 +350,7 @@ def neighbors(
 
     # calculate kNN
     rep = update_rep(rep)
-    indices, distances = get_neighbors(
+    indices, distances, K = get_neighbors(
         data,
         K=K,
         rep=rep,
@@ -302,6 +360,8 @@ def neighbors(
         full_speed=full_speed,
         use_cache=use_cache,
         dist=dist,
+        method=method,
+        exact_k=exact_k,
     )
 
     # calculate affinity matrix
@@ -408,7 +468,7 @@ def calc_kBET(
     attr_values = data.obs[attr].values.copy()
     attr_values.categories = range(nbatch)
 
-    indices, distances = get_neighbors(
+    indices, distances, K = get_neighbors(
         data, K=K, rep=rep, n_jobs=n_jobs, random_state=random_state, use_cache=use_cache,
     )
     knn_indices = np.concatenate(
@@ -499,7 +559,7 @@ def calc_kSIM(
     assert attr in data.obs
     nsample = data.shape[0]
 
-    indices, distances = get_neighbors(
+    indices, distances, K = get_neighbors(
         data, K=K, rep=rep, n_jobs=n_jobs, random_state=random_state, use_cache=use_cache,
     )
     knn_indices = np.concatenate(
diff --git a/pegasus/tools/nmf.py b/pegasus/tools/nmf.py
index a37399d2..8cf2cb08 100644
--- a/pegasus/tools/nmf.py
+++ b/pegasus/tools/nmf.py
@@ -81,6 +81,7 @@ def nmf(
     alpha_H: float = 0.0,
     l1_ratio_H: float = 0.0,
     fp_precision: str = "float",
+    online_chunk_size: int = 5000,
     n_jobs: int = -1,
     random_state: int = 0,
 ) -> None:
@@ -137,6 +138,9 @@ def nmf(
     fp_precision: ``str``, optional, default: ``float``
         The numeric precision on the results. Choose from ``float`` and ``double``.
 
+    online_chunk_size: ``int``, optional, default: ``5000``
+        The chunk / mini-batch size for online learning. Only works when ``mode='online'``.
+
     n_jobs : `int`, optional (default: -1)
         Number of threads to use. -1 refers to using all physical CPU cores.
 
@@ -189,6 +193,7 @@ def nmf(
         alpha_H=alpha_H,
         l1_ratio_H=l1_ratio_H,
         fp_precision=fp_precision,
+        online_chunk_size=online_chunk_size,
     )
 
     data.uns["nmf_features"] = features # record which feature to use
@@ -285,6 +290,7 @@ def integrative_nmf(
     use_gpu: bool = False,
     lam: float = 5.0,
     fp_precision: str = "float",
+    online_chunk_size: int = 5000,
     n_jobs: int = -1,
     random_state: int = 0,
     quantile_norm: bool = True,
@@ -334,6 +340,9 @@ def integrative_nmf(
     fp_precision: ``str``, optional, default: ``float``
         The numeric precision on the results. Choose from ``float`` and ``double``.
 
+    online_chunk_size: ``int``, optional, default: ``5000``
+        The chunk / mini-batch size for online learning. Only works when ``mode='online'``.
+
     n_jobs : `int`, optional (default: -1)
         Number of threads to use. -1 refers to using all physical CPU cores.
 
@@ -394,6 +403,7 @@ def integrative_nmf(
         use_gpu=use_gpu,
         lam=lam,
         fp_precision=fp_precision,
+        online_chunk_size=online_chunk_size,
     )
 
     # Implementation of algo 3, quantile normalization
@@ -406,14 +416,19 @@ def integrative_nmf(
     seeds = rg.integers(4294967295, size=nbatch)
     ref_batch = max_size = -1
     for i in range(nbatch):
-        H_new = np.ascontiguousarray(Hs[i] / np.linalg.norm(Hs[i], axis=0), dtype=np.float32) # Scale H
+        h_norm = np.linalg.norm(Hs[i], axis=0)
+        idx_h_zeros = np.where(h_norm==0)[0]
+        if idx_h_zeros.size > 0:
+            # Set norm 0 to 1 to avoid divide by zero issue
+            h_norm[idx_h_zeros] = 1.0
+        H_new = np.ascontiguousarray(Hs[i] / h_norm, dtype=np.float32) # Scale H
         Hs_new.append(H_new) # Append scaled H
 
         if not quantile_norm:
             continue
 
         clusters = np.argmax(H_new, axis=1) # Assign cluster
-        indices, _ = calculate_nearest_neighbors(H_new, K=20, n_jobs=n_jobs, random_state=seeds[i]) # KNN with K=20
+        indices, _, _ = calculate_nearest_neighbors(H_new, K=20, n_jobs=n_jobs, random_state=seeds[i]) # KNN with K=20
         clusters, csum = _refine_cluster(clusters, indices, n_components) # Refine cluster
         csums.append(csum)
         ids_by_clusts.append(np.argsort(clusters, kind='stable'))
diff --git a/pegasus/tools/preprocessing.py b/pegasus/tools/preprocessing.py
index 71c78b3b..dd105f74 100644
--- a/pegasus/tools/preprocessing.py
+++ b/pegasus/tools/preprocessing.py
@@ -276,10 +276,9 @@ def _run_filter_data(
 
         if output_filt is not None:
             group_key = unidata.get_uid()
-            writer = pd.ExcelWriter(f"{output_filt}.{group_key}.filt.xlsx", engine="xlsxwriter")
-            df_cells = get_filter_stats(unidata, min_genes_before_filt = min_genes_before_filt)
-            df_cells.to_excel(writer, sheet_name="Cell filtration stats")
-            writer.save()
+            with pd.ExcelWriter(f"{output_filt}.{group_key}.filt.xlsx", engine="xlsxwriter") as writer:
+                df_cells = get_filter_stats(unidata, min_genes_before_filt = min_genes_before_filt)
+                df_cells.to_excel(writer, sheet_name="Cell filtration stats")
             logger.info(f"Filtration results for {group_key} are written.")
 
         if plot_filt is not None:
@@ -347,7 +346,7 @@ def _set_target_mat(data, X, target_matrix, select, base_matrix, suffix):
     if target_matrix in data.matrices:
        logger.warning(f"{target_matrix} is in data's matrices. It will be rewritten.")
 
-    data.add_matrix(target_matrix, X)
+    data.update_matrix(target_matrix, X)
 
     if select:
         data.select_matrix(target_matrix)
diff --git a/pegasus/tools/scvitools.py b/pegasus/tools/scvitools.py
index a01c0a8e..20dfd1c3 100644
--- a/pegasus/tools/scvitools.py
+++ b/pegasus/tools/scvitools.py
@@ -190,9 +190,14 @@ def run_scvi(
     scvi.settings.num_threads = eff_n_jobs(n_jobs) # set n_jobs
     scvi.settings.seed = random_state # set random_state, see [here](https://docs.scvi-tools.org/en/stable/_modules/scvi/_settings.html) for more details.
 
+    print(max_epochs)
+    
     if max_epochs is None:
         max_epochs = np.min([round((20000 / len(adata.obs)) * 400), 400])
 
+    print(type(max_epochs))
+    print(max_epochs)
+
     scvi.model.SCVI.setup_anndata(adata,
         batch_key=batch,
         categorical_covariate_keys=categorical_covariate_keys,
diff --git a/pegasus/tools/signature_score.py b/pegasus/tools/signature_score.py
index ee14c446..1f055d39 100644
--- a/pegasus/tools/signature_score.py
+++ b/pegasus/tools/signature_score.py
@@ -1,4 +1,5 @@
 import numpy as np
+import scipy.sparse as sp
 import pandas as pd
 
 from typing import Dict, List, Union
@@ -30,7 +31,7 @@ def _check_and_calc_sig_background(data: UnimodalData, n_bins: int) -> bool:
             bins = pd.qcut(mean_vec, n_bins, duplicates = "drop")
         if bins.value_counts().min() == 1:
             logger.warning("Detected bins with only 1 gene!")
-        bins.categories = bins.categories.astype(str)
+        bins = bins.rename_categories(dict(zip(bins.categories, bins.categories.astype(str))))
         data.var["bins"] = bins
 
         # calculate background expectations
@@ -89,7 +90,11 @@ def calculate_z_score(
     if not _check_and_calc_sig_background(data, n_bins):
         return None
 
-    z_score_mat = (data.X.toarray().astype(np.float32) - data.var["mean"].values.astype(np.float32) - data.obsm["sig_bkg_mean"][:, data.var["bins"].cat.codes].astype(np.float32)) / data.obsm["sig_bkg_std"][:, data.var["bins"].cat.codes].astype(np.float32)
+    mat = data.X
+    if sp.issparse(mat):
+        mat = mat.toarray()
+
+    z_score_mat = (mat.astype(np.float32) - data.var["mean"].values.astype(np.float32) - data.obsm["sig_bkg_mean"][:, data.var["bins"].cat.codes].astype(np.float32)) / data.obsm["sig_bkg_std"][:, data.var["bins"].cat.codes].astype(np.float32)
 
     return z_score_mat
 
@@ -100,6 +105,7 @@ def calc_signature_score(
     signatures: Union[Dict[str, List[str]], str],
     n_bins: int = 50,
     show_omitted_genes: bool = False,
+    skip_threshold: int = 1,
     random_state: int = 0
 ) -> None:
     """Calculate signature / gene module score. [Li20-1]_
@@ -124,12 +130,21 @@ def calc_signature_score(
             * ``apoptosis_human`` contains one signature, ``apoptosis``, which includes apoptosis-related genes from the KEGG pathway.
             * ``cell_cycle_mouse``, ``gender_mouse``, ``mitochondrial_genes_mouse``, ``ribosomal_genes_mouse`` and ``apoptosis_mouse`` are the corresponding signatures for mouse. Gene symbols are directly translated from human genes.
 
+        In addition, Pegasus provides the following 4 curated signature panels:
+            * ``emt_human``, the Epithelial-Mesenchymal Transition signature from Gibbons and Creighton Dev. Dyn. 2018.
+            * ``human_lung``, human lung cell type markers.
+            * ``mouse_brain``, mouse brain cell type markers.
+            * ``mouse_liver``, mouse liver cell type markers.
+
     n_bins: ``int``, optional, default: 50
         Number of bins on expression levels for grouping genes.
 
     show_omitted_genes: ``bool``, optional, default False
         Signature genes that are not expressed in the data will be omitted. By default, pegasus does not report which genes are omitted. If this option is turned on, report omitted genes.
 
+    skip_threshold: ``int``, optional, default 1
+        Skip signature calculation if the number of kept genes is less than skip_threshold.
+
     random_state: ``int``, optional, default: 0
         Random state used by KMeans if signature == ``gender_human`` or ``gender_mouse``.
 
@@ -170,16 +185,22 @@ def calc_signature_score(
         sig_string = signatures
         if sig_string in predefined_signatures:
             signatures = load_signatures_from_file(predefined_signatures[sig_string])
-            from threadpoolctl import threadpool_limits
+
+            if sig_string.startswith("mitochondrial_genes"):
+                del signatures["mito_noncoding"]
+            elif sig_string.startswith("ribosomal_genes"):
+                del signatures["ribo_like"]
+            
+            _calc_sig_scores(data, signatures, show_omitted_genes=show_omitted_genes, skip_threshold=skip_threshold)
 
             if sig_string.startswith("cell_cycle"):
-                _calc_sig_scores(data, signatures, show_omitted_genes = show_omitted_genes)
                 data.obs["cycle_diff"] = data.obs["G2/M"] - data.obs["G1/S"]
 
                 values = data.obs[["G1/S", "G2/M"]].values
                 maxvalues = values.max(axis = 1)
                 data.obs["cycling"] = maxvalues
 
+                from threadpoolctl import threadpool_limits
                 kmeans = KMeans(n_clusters=2, random_state=random_state)
                 with threadpool_limits(limits = 1):
                     kmeans.fit(maxvalues.reshape(-1, 1))
@@ -191,9 +212,9 @@ def calc_signature_score(
 
                 data.obs["predicted_phase"] = pd.Categorical.from_codes(codes, categories = ["G0", "G1/S", "G2/M"])
             elif sig_string.startswith("gender"):
-                _calc_sig_scores(data, signatures, show_omitted_genes = show_omitted_genes, skip_threshold = 1)
                 data.obs["gender_score"] = data.obs["male_score"] - data.obs["female_score"]
 
+                from threadpoolctl import threadpool_limits
                 kmeans = KMeans(n_clusters=3, random_state=random_state)
                 with threadpool_limits(limits = 1):
                     kmeans.fit(data.obs["gender_score"].values.reshape(-1, 1))
@@ -201,18 +222,10 @@ def calc_signature_score(
                 codes = list(map(lambda x: reorg_dict[x], kmeans.labels_))
 
                 data.obs["predicted_gender"] = pd.Categorical.from_codes(codes, categories = ["female", "uncertain", "male"])
-            elif sig_string.startswith("mitochondrial_genes"):
-                del signatures["mito_noncoding"]
-                _calc_sig_scores(data, signatures, show_omitted_genes = show_omitted_genes, skip_threshold = 1)
-            elif sig_string.startswith("ribosomal_genes"):
-                del signatures["ribo_like"]
-                _calc_sig_scores(data, signatures, show_omitted_genes = show_omitted_genes, skip_threshold = 1)
-            elif sig_string.startswith("apoptosis"):
-                _calc_sig_scores(data, signatures, show_omitted_genes = show_omitted_genes, skip_threshold = 1)
-            else:
-                assert False
+            elif sig_string == "emt_human":
+                data.obs["EMT_score"] = data.obs["Mesenchymal-like"] - data.obs["Epithelial-like"]
         else:
             signatures = load_signatures_from_file(sig_string)
-            _calc_sig_scores(data, signatures, show_omitted_genes = show_omitted_genes)
+            _calc_sig_scores(data, signatures, show_omitted_genes=show_omitted_genes, skip_threshold=skip_threshold)
     else:
-        _calc_sig_scores(data, signatures, show_omitted_genes = show_omitted_genes)
+        _calc_sig_scores(data, signatures, show_omitted_genes=show_omitted_genes, skip_threshold=skip_threshold)
diff --git a/pegasus/tools/utils.py b/pegasus/tools/utils.py
index d3970847..fb739456 100644
--- a/pegasus/tools/utils.py
+++ b/pegasus/tools/utils.py
@@ -192,6 +192,11 @@ def check_batch_key(data: Union[MultimodalData, UnimodalData], batch: Union[str,
     ribosomal_genes_mouse=pkg_resources.resource_filename("pegasus", "data_files/ribosomal_genes_mouse.gmt"),
     apoptosis_human=pkg_resources.resource_filename("pegasus", "data_files/apoptosis_human.gmt"),
     apoptosis_mouse=pkg_resources.resource_filename("pegasus", "data_files/apoptosis_mouse.gmt"),
+    human_lung=pkg_resources.resource_filename("pegasus", "data_files/human_lung.gmt"),
+    mouse_lung=pkg_resources.resource_filename("pegasus", "data_files/mouse_lung.gmt"),
+    mouse_brain=pkg_resources.resource_filename("pegasus", "data_files/mouse_brain.gmt"),
+    mouse_liver=pkg_resources.resource_filename("pegasus", "data_files/mouse_liver.gmt"),
+    emt_human=pkg_resources.resource_filename("pegasus", "data_files/emt_human.gmt"),
 )
 
 predefined_pathways = dict(
diff --git a/pegasus/tools/visualization.py b/pegasus/tools/visualization.py
index c1dab252..1d660ada 100644
--- a/pegasus/tools/visualization.py
+++ b/pegasus/tools/visualization.py
@@ -276,6 +276,7 @@ def umap(
     dens_var_shift: float = 0.1,
     n_jobs: int = -1,
     full_speed: bool = False,
+    use_cache: bool = True,
     random_state: int = 0,
     out_basis: str = "umap",
 ) -> None:
@@ -334,6 +335,9 @@ def umap(
         * If ``True``, use multiple threads in constructing ``hnsw`` index. However, the kNN results are not reproducible.
         * Otherwise, use only one thread to make sure results are reproducible.
 
+    use_cache: ``bool``, optional, default: ``True``
+        If use_cache and found cached knn results, will not recompute.
+
     random_state: ``int``, optional, default: ``0``
         Random seed set for reproducing results.
 
@@ -354,11 +358,7 @@ def umap(
     rep = update_rep(rep)
     X = X_from_rep(data, rep, rep_ncomps)
 
-    if data.shape[0] < n_neighbors:
-        logger.warning(f"Warning: Number of samples = {data.shape[0]} < K = {n_neighbors}!\n Set K to {data.shape[0]}.")
-        n_neighbors = data.shape[0]
-
-    knn_indices, knn_dists = get_neighbors(data, K = n_neighbors, rep = rep, n_jobs = n_jobs, random_state = random_state, full_speed = full_speed)
+    knn_indices, knn_dists, n_neighbors = get_neighbors(data, K = n_neighbors, rep = rep, n_jobs = n_jobs, random_state = random_state, full_speed = full_speed, use_cache = use_cache)
     knn_indices = np.insert(knn_indices[:, 0 : n_neighbors - 1], 0, range(data.shape[0]), axis=1)
     knn_dists = np.insert(knn_dists[:, 0 : n_neighbors - 1], 0, 0.0, axis=1)
 
@@ -539,6 +539,7 @@ def net_umap(
     select_K: int = 25,
     select_alpha: float = 1.0,
     full_speed: bool = False,
+    use_cache: bool = True,
     net_alpha: float = 0.1,
     polish_learning_rate: float = 10.0,
     polish_n_epochs: int = 30,
@@ -612,6 +613,9 @@ def net_umap(
         * If ``True``, use multiple threads in constructing ``hnsw`` index. However, the kNN results are not reproducible.
         * Otherwise, use only one thread to make sure results are reproducible.
 
+    use_cache: ``bool``, optional, default: ``True``
+        If use_cache and found cached knn results, will not recompute.
+
     net_alpha: ``float``, optional, default: ``0.1``
         L2 penalty (regularization term) parameter of the deep regressor.
 
@@ -641,7 +645,7 @@ def net_umap(
 
     rep = update_rep(rep)
     n_jobs = eff_n_jobs(n_jobs)
-    knn_indices, knn_dists = get_neighbors(data, K = select_K, rep = rep, n_jobs = n_jobs, random_state = random_state, full_speed = full_speed)
+    knn_indices, knn_dists, select_K = get_neighbors(data, K = select_K, rep = rep, n_jobs = n_jobs, random_state = random_state, full_speed = full_speed, use_cache = use_cache)
 
     selected = select_cells(
         knn_dists,
@@ -659,7 +663,7 @@ def net_umap(
 
     ds_indices_key = "ds_" + rep + "_knn_indices"  # ds refers to down-sampling
     ds_distances_key = "ds_" + rep + "_knn_distances"
-    indices, distances = calculate_nearest_neighbors(
+    indices, distances, n_neighbors = calculate_nearest_neighbors(
         X,
         K=n_neighbors,
         n_jobs=n_jobs,
@@ -702,7 +706,7 @@ def net_umap(
 
     data.obsm["X_" + out_basis + "_pred"] = Y_init
 
-    knn_indices, knn_dists = get_neighbors(data, K = n_neighbors, rep = rep, n_jobs = n_jobs, random_state = random_state, full_speed = full_speed)
+    knn_indices, knn_dists, n_neighbors = get_neighbors(data, K = n_neighbors, rep = rep, n_jobs = n_jobs, random_state = random_state, full_speed = full_speed, use_cache = use_cache)
     knn_indices = np.insert(knn_indices[:, 0 : n_neighbors - 1], 0, range(data.shape[0]), axis=1)
     knn_dists = np.insert(knn_dists[:, 0 : n_neighbors - 1], 0, 0.0, axis=1)
 
@@ -735,6 +739,7 @@ def net_fle(
     rep: str = "diffmap",
     K: int = 50,
     full_speed: bool = False,
+    use_cache: bool = True,
     target_change_per_node: float = 2.0,
     target_steps: int = 5000,
     is3d: bool = False,
@@ -778,6 +783,9 @@ def net_fle(
         * If ``True``, use multiple threads in constructing ``hnsw`` index. However, the kNN results are not reproducible.
         * Otherwise, use only one thread to make sure results are reproducible.
 
+    use_cache: ``bool``, optional, default: ``True``
+        If use_cache and found cached knn results, will not recompute.
+
     target_change_per_node: ``float``, optional, default: ``2.0``
         Target change per node to stop ForceAtlas2.
 
@@ -845,7 +853,7 @@ def net_fle(
             full_speed=full_speed,
         )
 
-    knn_indices, knn_dists = get_neighbors(data, K = select_K, rep = rep, n_jobs = n_jobs, random_state = random_state, full_speed = full_speed)
+    knn_indices, knn_dists, select_K = get_neighbors(data, K = select_K, rep = rep, n_jobs = n_jobs, random_state = random_state, full_speed = full_speed, use_cache = use_cache)
 
     selected = select_cells(
         knn_dists,
@@ -860,7 +868,7 @@ def net_fle(
 
     ds_indices_key = "ds_" + rep + "_knn_indices"
     ds_distances_key = "ds_" + rep + "_knn_distances"
-    indices, distances = calculate_nearest_neighbors(
+    indices, distances, K = calculate_nearest_neighbors(
         X, K=K, n_jobs=n_jobs, random_state=random_state, full_speed=full_speed
     )
     data.uns[ds_indices_key] = indices
diff --git a/requirements.txt b/requirements.txt
index f9a154c7..5458ce4d 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -4,7 +4,6 @@ Cython
 docopt
 demuxEM
 hnswlib
-importlib_metadata>=0.7; python_version < '3.8'
 psutil
 threadpoolctl
 joblib>=0.14
@@ -16,15 +15,15 @@ natsort
 numba
 numpy
 pandas>=1.2.0
-pegasusio>=0.5.1
+pegasusio>=0.9.0
 pybind11
 scikit-learn>=0.23.2
 scikit-misc
 scipy
-seaborn
+seaborn>=0.13.0
 setuptools
 statsmodels
 umap-learn>=0.5.2
 wordcloud
 xlsxwriter
-igraph<=0.9.10
+igraph
diff --git a/setup.py b/setup.py
index 9cd66105..087a27bc 100644
--- a/setup.py
+++ b/setup.py
@@ -36,9 +36,10 @@
         "Topic :: Software Development :: Build Tools",
         "Topic :: Scientific/Engineering :: Bio-Informatics",
         "Programming Language :: Python :: 3",
-        "Programming Language :: Python :: 3.7",
         "Programming Language :: Python :: 3.8",
         "Programming Language :: Python :: 3.9",
+        "Programming Language :: Python :: 3.10",
+        "Programming Language :: Python :: 3.11",
     ],
     keywords="single cell/nucleus genomics analysis",
     packages=find_packages(),
@@ -58,14 +59,15 @@
         scvi=["scvi-tools"],
         all=["fitsne", "louvain", "scanorama", "torch", "harmony-pytorch", "nmf-torch", "rpy2", "forceatlas2-python", "scvi-tools"]
     ),
-    python_requires="~=3.7",
+    python_requires="~=3.8",
     package_data={
         "pegasus.annotate_cluster": [
             "human_immune_cell_markers.json",
-            "mouse_immune_cell_markers.json",
-            "mouse_brain_cell_markers.json",
             "human_brain_cell_markers.json",
             "human_lung_cell_markers.json",
+            "mouse_immune_cell_markers.json",
+            "mouse_brain_cell_markers.json",
+            "mouse_liver_cell_markers.json",
         ],
         "pegasus.check_sample_indexes": ["chromium-shared-sample-indexes-plate.json", "Chromium-i7-Multiplex-Kit-N-Set-A-sample-indexes-plate.json"],
         "pegasus": ["data_files/*.gmt"],
diff --git a/tests/run_hashing_citeseq.sh b/tests/run_hashing_citeseq.sh
deleted file mode 100644
index e546244e..00000000
--- a/tests/run_hashing_citeseq.sh
+++ /dev/null
@@ -1,13 +0,0 @@
-pegasus demuxEM -p 2 --generate-diagnostic-plots tests/data/hashing_citeseq/cb_cc_raw_gene_bc_matrices_h5.h5 tests/data/hashing_citeseq/cb_cell_hashing.csv tests/cb_cc
-if [ -f "tests/cb_cc_demux.zarr.zip" ]; then
-    pegasus aggregate_matrix --select-only-singlets --min-genes 100 tests/data/sample_hashing_citeseq.csv tests/cb_cc_citeseq
-
-    if [ -f "tests/cb_cc_citeseq.zarr.zip" ]; then
-        pegasus cluster -p 2 --min-genes 500 --max-genes 6000 --mito-prefix MT- --percent-mito 20 --louvain --umap --citeseq --citeseq-umap --citeseq-umap-exclude Mouse_IgG1,Mouse_IgG2a,Mouse_IgG2b,Rat_IgG2b tests/cb_cc_citeseq.zarr.zip tests/citeseq_result
-
-        if [ -f "tests/citeseq_result.zarr.zip" ]; then
-            pegasus plot scatter --basis umap --attributes louvain_labels,assignment tests/citeseq_result.zarr.zip tests/citeseq_result.umap.pdf
-            pegasus plot scatter --basis citeseq_umap --attributes louvain_labels,assignment tests/citeseq_result.zarr.zip tests/citeseq_result.citeseq_umap.pdf
-        fi
-    fi
-fi
diff --git a/tests/run_pipeline.sh b/tests/run_pipeline.sh
index c6e0d2eb..8c516f2f 100644
--- a/tests/run_pipeline.sh
+++ b/tests/run_pipeline.sh
@@ -1,14 +1,14 @@
 pegasus aggregate_matrix tests/data/count_matrix.csv tests/aggr
 
 if [ -f "tests/aggr.zarr.zip" ]; then
-    pegasus cluster -p 2 --min-genes 500 --max-genes 6000 --percent-mito 20.0 --output-filtration-results --output-h5ad --output-loom --plot-filtration-results --plot-hvf --correct-batch-effect --nmf --louvain --leiden --tsne --umap --fle --infer-doublets --dbl-cluster-attr louvain_labels --calc-signature-scores cell_cycle_mouse tests/aggr.zarr.zip tests/result
+    pegasus cluster -p 2 --min-genes 500 --max-genes 6000 --percent-mito 20.0 --output-filtration-results --output-h5ad --output-loom --plot-filtration-results --plot-hvf --exact-K --correct-batch-effect --nmf --leiden --tsne --umap --fle --infer-doublets --dbl-cluster-attr leiden_labels --calc-signature-scores cell_cycle_mouse tests/aggr.zarr.zip tests/result
 fi
 
 if [ -f "tests/result.zarr.zip" ]; then
-    pegasus de_analysis -p 2 --labels louvain_labels --t --fisher tests/result.zarr.zip tests/result.de.xlsx
+    pegasus de_analysis -p 2 --labels leiden_labels --t --fisher tests/result.zarr.zip tests/result.de.xlsx
     pegasus annotate_cluster --markers mouse_immune,mouse_brain tests/result.zarr.zip tests/result.anno.txt
     pegasus plot compo --groupby leiden_labels --condition Channel tests/result.zarr.zip tests/result.compo.pdf
-    pegasus plot scatter --basis umap --attributes louvain_labels,Channel tests/result.zarr.zip tests/result.louvain_labels.umap.pdf
+    pegasus plot scatter --basis umap --attributes leiden_labels,Channel tests/result.zarr.zip tests/result.leiden_labels.umap.pdf
     pegasus plot scatter --basis tsne --attributes leiden_labels,Channel tests/result.zarr.zip tests/result.leiden_labels.tsne.pdf
-    pegasus plot scatter --basis fle --attributes louvain_labels,Channel tests/result.zarr.zip tests/result.louvain_labels.fle.pdf
+    pegasus plot scatter --basis fle --attributes leiden_labels,Channel tests/result.zarr.zip tests/result.leiden_labels.fle.pdf
 fi
diff --git a/tests/test_hashing_citeseq.py b/tests/test_hashing_citeseq.py
deleted file mode 100644
index e30cdff7..00000000
--- a/tests/test_hashing_citeseq.py
+++ /dev/null
@@ -1,50 +0,0 @@
-"""
-Unittest module for hashing_citeseq
-"""
-
-import os
-import glob
-import unittest
-
-import numpy as np
-import pandas as pd
-import pegasus as pg
-
-
-class TestPipeline(unittest.TestCase):
-    def test_demux(self):
-        data = pg.read_input("tests/cb_cc_demux.zarr.zip")
-        self.assertEqual(data.shape, (737280, 33694), "Demux data shape differs!")
-        self.assertIn('demux_type', data.obs.columns, "Demux type is lost!")
-        self.assertIn('assignment', data.obs.columns, "Cell assignment is lost!")
-        f_list = glob.glob("tests/cb_cc.*.pdf")
-        self.assertEqual(len(f_list), 4, "Demux diagnosis plots are missing!")
-        self.assertIn('cb_cc.out.demuxEM.zarr.zip', os.listdir('tests'), "Demultiplexed RNA matrix is lost!")
-
-    def test_citeseq(self):
-        data = pg.read_input("tests/cb_cc_citeseq.zarr.zip")
-        self.assertSetEqual(set(data.list_data()), set(['GRCh38-citeseq', 'GRCh38-rna']), "Some modality is missing!")
-        self.assertNotIn('demux_type', data.obs.columns, "Demux type is not removed!")
-        self.assertIn('assignment', data.obs.columns, "Cell assignment is lost!")
-        self.assertEqual(data.shape, (14363, 33694), "RNA data shape differs!")
-        data.select_data('GRCh38-citeseq')
-        self.assertEqual(data.shape, (14363, 31), "CITE-Seq data shape differs!")
-
-    def test_clustering(self):
-        data = pg.read_input("tests/citeseq_result.zarr.zip")
-        self.assertSetEqual(set(data.list_data()), set(['GRCh38-citeseq', 'GRCh38-rna']), "Some modality is missing!")
-        n_rna_cells = data.shape[0]
-        self.assertNotIn('demux_type', data.obs.columns, "Demux type is not removed!")
-        self.assertEqual(data.obs['assignment'].cat.categories.size, 7, "Not all cells are demultiplexed singlets!")
-        self.assertIn('X_citeseq', data.obsm.keys(), "CITE-Seq coordinates are lost!")
-        self.assertEqual(data.obsm['X_citeseq_umap'].shape[1], data.obsm['X_umap'].shape[1], "Some of UMAP embeddings is lost!")
-        data.select_data('GRCh38-citeseq')
-        n_citeseq_cells = data.shape[0]
-        self.assertEqual(n_rna_cells, n_citeseq_cells, "Two modalities have inconsistent number of cells!")
-
-    def test_plot(self):
-        self.assertIn('citeseq_result.citeseq_umap.pdf', os.listdir('tests'), "CITE-Seq UMAP plot is lost!")
-        self.assertIn('citeseq_result.umap.pdf', os.listdir('tests'), "RNA UMAP plot is lost!")
-
-if __name__ == "__main__":
-    unittest.main()
diff --git a/tests/test_pipeline.py b/tests/test_pipeline.py
index 4b1da926..3e8b7f3b 100644
--- a/tests/test_pipeline.py
+++ b/tests/test_pipeline.py
@@ -27,7 +27,6 @@ def test_qc(self):
     def test_clustering(self):
         self.assertEqual(self.data.obsm['pca_harmony_knn_indices'].shape, (1043, 99), "KNN graph shape differs!")
         self.assertEqual(self.data.obsm['pca_harmony_knn_distances'].shape, (1043, 99), "KNN distance matrix shape differs!")
-        self.assertIn('louvain_labels', self.data.obs.columns, "Louvain result is lost!")
         self.assertIn('leiden_labels', self.data.obs.columns, "Leiden result is lost!")
 
     def test_doublet_detection(self):
@@ -60,9 +59,9 @@ def test_annotation(self):
 
     def test_plot(self):
         self.assertIn('result.compo.pdf', os.listdir('tests'), "Composition plot is lost!")
-        self.assertIn('result.louvain_labels.umap.pdf', os.listdir('tests'), "UMAP plot is lost!")
+        self.assertIn('result.leiden_labels.umap.pdf', os.listdir('tests'), "UMAP plot is lost!")
         self.assertIn('result.leiden_labels.tsne.pdf', os.listdir('tests'), "tSNE plot is lost!")
-        self.assertIn('result.louvain_labels.fle.pdf', os.listdir('tests'), 'FLE plot is lost!')
+        self.assertIn('result.leiden_labels.fle.pdf', os.listdir('tests'), 'FLE plot is lost!')
 
     def test_output(self):
         data_h5ad = pg.read_input("tests/result.mm10-rna.h5ad")
diff --git a/wheel_build/build_wheel_for_linux.sh b/wheel_build/build_wheel_for_linux.sh
index 792a1d48..98d6dc96 100755
--- a/wheel_build/build_wheel_for_linux.sh
+++ b/wheel_build/build_wheel_for_linux.sh
@@ -11,7 +11,7 @@ function repair_wheel {
     fi
 }
 
-declare -a PythonVersions=("cp37-cp37m" "cp38-cp38" "cp39-cp39")
+declare -a PythonVersions=("cp38-cp38" "cp39-cp39" "cp310-cp310" "cp311-cp311")
 
 for val in ${PythonVersions[@]}; do
     /opt/python/$val/bin/pip install -r /src/requirements.txt