From 1234a12ad943f6f1944902c78fb5828ded42db51 Mon Sep 17 00:00:00 2001
From: Bo Li <li.bo@gene.com>
Date: Sun, 4 Dec 2022 13:15:49 -0800
Subject: [PATCH 01/57] Added n_comps to split_one_cluster function

---
 pegasus/tools/clustering.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/pegasus/tools/clustering.py b/pegasus/tools/clustering.py
index 29c50d5b..89d41895 100644
--- a/pegasus/tools/clustering.py
+++ b/pegasus/tools/clustering.py
@@ -643,6 +643,7 @@ def split_one_cluster(
     n_clust: int,
     res_label: str,
     rep: str = "pca",
+    n_comps: int = None,
     random_state: int = 0,
 ) -> None:
     """
@@ -668,6 +669,9 @@ def split_one_cluster(
     rep: ``str``, optional, default: ``"pca"``
         The embedding representation used for Kmeans clustering. Keyword ``'X_' + rep`` must exist in ``data.obsm``. By default, use PCA coordinates.
 
+    n_comps: `int`, optional (default: None)
+        Number of components to be used in the `rep`. If n_comps == None, use all components; otherwise, use the minimum of n_comps and rep's dimensions.
+
     n_jobs : `int`, optional (default: -1)
         Number of threads to use for the KMeans step in 'spectral_louvain' and 'spectral_leiden'. -1 refers to using all physical CPU cores.
 
@@ -688,7 +692,7 @@ def split_one_cluster(
     idx = np.where(data.obs[clust_label] == clust_id)[0]
     tmpdat = data[idx].copy()
     from pegasus.tools import neighbors
-    neighbors(tmpdat, rep=rep, use_cache=False)
+    neighbors(tmpdat, rep=rep, n_comps=n_comps, use_cache=False)
     leiden(tmpdat, rep=rep, resolution=None, n_clust=n_clust, random_state=random_state)
     new_clust = data.obs[clust_label].values.astype(int)
     new_label = new_clust.max() + 1

From 92c9ae51972d5082734217a3d041e93ba122c14c Mon Sep 17 00:00:00 2001
From: Bo Li <li.bo@gene.com>
Date: Sun, 4 Dec 2022 15:04:47 -0800
Subject: [PATCH 02/57] Added Migratory DC markers

---
 .../human_immune_cell_markers.json            | 105 ++++++++++--------
 1 file changed, 58 insertions(+), 47 deletions(-)

diff --git a/pegasus/annotate_cluster/human_immune_cell_markers.json b/pegasus/annotate_cluster/human_immune_cell_markers.json
index 60eaa54c..9e307620 100644
--- a/pegasus/annotate_cluster/human_immune_cell_markers.json
+++ b/pegasus/annotate_cluster/human_immune_cell_markers.json
@@ -72,6 +72,42 @@
 			}
 		},
 
+		{
+			"name" : "Natural killer cell",
+			"markers" : [
+				{
+					"genes" : ["NCAM1+"],
+					"weight" : 0.2,
+					"comment" : "CD56"
+				},
+				{
+					"genes" : ["NKG7+"],
+					"weight" : 0.2,
+					"comment" : "natural killer cell granule protein 7"
+				},
+				{
+					"genes" : ["KLRB1+", "KLRD1+", "KLRF1+", "KLRC1+", "KLRC2+", "KLRC3+", "KLRC4+"],
+					"weight" : 0.25,
+					"comment" : "killer cell lectin like receptors"
+				},
+				{
+					"genes" : ["CD3D-", "CD3E-", "CD3G-"],
+					"weight" : 0.15,
+					"comment" : "not T cell"
+				},
+				{
+					"genes" : ["FCGR3A+"],
+					"weight" : 0.1,
+					"comment" : "CD16a"
+				},
+				{
+					"genes" : ["ITGAL+", "ITGAM+"],
+					"weight" : 0.1,
+					"comment" : "CD11a,CD11b"
+				}
+			]
+		},
+
 		{
 			"name" : "B cell",
 			"markers" : [
@@ -154,7 +190,7 @@
 				{
 					"genes" : ["CD38+", "BCL6+", "BCL2-", "RGS13+"],
 					"weight" : 0.6,
-					"comment" : "First 3 markers are from Klein et al. PNAS 2003 https://doi.org/10.1073/pnas.0437996100 (Fig. 1 & 2). The last marker is from XXX"
+					"comment" : "First 3 markers are from Klein et al. PNAS 2003 https://doi.org/10.1073/pnas.0437996100 (Fig. 1 & 2). The last marker is from Xing et al. Science Advances 2021 Table S2 (Germinal center B)"
 				},
 				{
 					"genes" : ["PCNA+", "MKI67+"],
@@ -190,38 +226,28 @@
 		},
 
 		{
-			"name" : "Natural killer cell",
+			"name" : "Plasma cell",
 			"markers" : [
 				{
-					"genes" : ["NCAM1+"],
-					"weight" : 0.2,
-					"comment" : "CD56"
+					"genes" : ["CD38+", "XBP1+", "CD27+", "SLAMF7+"],
+					"weight" : 0.4,
+					"comment" : "important markers"
 				},
 				{
-					"genes" : ["NKG7+"],
+					"genes" : ["TNFRSF17+", "TNFRSF13B+"],
 					"weight" : 0.2,
-					"comment" : "natural killer cell granule protein 7"
-				},
-				{
-					"genes" : ["KLRB1+", "KLRD1+", "KLRF1+", "KLRC1+", "KLRC2+", "KLRC3+", "KLRC4+"],
-					"weight" : 0.25,
-					"comment" : "killer cell lectin like receptors"
-				},
-				{
-					"genes" : ["CD3D-", "CD3E-", "CD3G-"],
-					"weight" : 0.15,
-					"comment" : "not T cell"
+					"comment" : "TNF-receptor superfamily"
 				},
 				{
-					"genes" : ["FCGR3A+"],
-					"weight" : 0.1,
-					"comment" : "CD16a"
+					"genes" : ["IGHA1+", "IGHG1+"],
+					"weight" : 0.2,
+					"comment" : "class switching happened"
 				},
 				{
-					"genes" : ["ITGAL+", "ITGAM+"],
-					"weight" : 0.1,
-					"comment" : "CD11a,CD11b"
-				}
+					"genes" : ["MS4A1-"],
+					"weight" : 0.2,
+					"comment" : "not B cell, doi: https://doi.org/10.1182/bloodadvances.2017004481, long-live plasma can still express CD19"
+				}				
 			]
 		},
 
@@ -270,39 +296,24 @@
 		},
 
 		{
-			"name" : "Plasmacytoid dendritic cell",
+			"name" : "Migratory dendritic cell",
 			"markers" : [
 				{
-					"genes" : ["JCHAIN+", "LILRA4+", "GZMB+", "MZB1+", "IL3RA+", "SERPINF1+", "ITM2C+", "IRF7+"],
+					"genes" : ["FSCN1+", "CCR7+", "LAMP3+", "CCL19+", "CCL22+", "CD40+", "BIRC3+"],
 					"weight" : 1.0,
-					"comment" : "important pDC markers"
+					"comment" : "Xing et al. Science Advances 2021 Table S2 (DCs-C3)"
 				}
 			]
 		},
 
 		{
-			"name" : "Plasma cell",
+			"name" : "Plasmacytoid dendritic cell",
 			"markers" : [
 				{
-					"genes" : ["CD38+", "XBP1+", "CD27+", "SLAMF7+"],
-					"weight" : 0.4,
-					"comment" : "important markers"
-				},
-				{
-					"genes" : ["TNFRSF17+", "TNFRSF13B+"],
-					"weight" : 0.2,
-					"comment" : "TNF-receptor superfamily"
-				},
-				{
-					"genes" : ["IGHA1+", "IGHG1+"],
-					"weight" : 0.2,
-					"comment" : "class switching happened"
-				},
-				{
-					"genes" : ["MS4A1-"],
-					"weight" : 0.2,
-					"comment" : "not B cell, doi: https://doi.org/10.1182/bloodadvances.2017004481, long-live plasma can still express CD19"
-				}				
+					"genes" : ["JCHAIN+", "LILRA4+", "GZMB+", "MZB1+", "IL3RA+", "SERPINF1+", "ITM2C+", "IRF7+"],
+					"weight" : 1.0,
+					"comment" : "important pDC markers"
+				}
 			]
 		},
 

From 5d72a0ee5ad9547535d3a42c008d17ccce4898e6 Mon Sep 17 00:00:00 2001
From: Bo Li <li.bo@gene.com>
Date: Fri, 23 Dec 2022 14:49:20 -0500
Subject: [PATCH 03/57] Updated markers

---
 .../human_immune_cell_markers.json            |  11 +-
 .../mouse_brain_cell_markers.json             | 142 ++++++++++++------
 pegasus/data_files/human_lung.gmt             |  19 +++
 pegasus/data_files/mouse_brain.gmt            |  11 ++
 pegasus/tools/signature_score.py              |   4 +-
 pegasus/tools/utils.py                        |   2 +
 6 files changed, 137 insertions(+), 52 deletions(-)
 create mode 100644 pegasus/data_files/human_lung.gmt
 create mode 100644 pegasus/data_files/mouse_brain.gmt

diff --git a/pegasus/annotate_cluster/human_immune_cell_markers.json b/pegasus/annotate_cluster/human_immune_cell_markers.json
index 9e307620..cc153e55 100644
--- a/pegasus/annotate_cluster/human_immune_cell_markers.json
+++ b/pegasus/annotate_cluster/human_immune_cell_markers.json
@@ -188,14 +188,9 @@
 					"comment" : "CD19, CD20 and CD79"
 				},
 				{
-					"genes" : ["CD38+", "BCL6+", "BCL2-", "RGS13+"],
-					"weight" : 0.6,
-					"comment" : "First 3 markers are from Klein et al. PNAS 2003 https://doi.org/10.1073/pnas.0437996100 (Fig. 1 & 2). The last marker is from Xing et al. Science Advances 2021 Table S2 (Germinal center B)"
-				},
-				{
-					"genes" : ["PCNA+", "MKI67+"],
-					"weight" : 0.1,
-					"comment" : "From Klein et al. PNAS 2003 https://doi.org/10.1073/pnas.0437996100 (Fig. 2)"
+					"genes" : ["CD38+", "BCL6+", "BCL2-", "RGS13+", "MEF2B+"],
+					"weight" : 0.7,
+					"comment" : "First 3 markers are from Klein et al. PNAS 2003 https://doi.org/10.1073/pnas.0437996100 (Fig. 1 & 2). The last 2 markers are from Fig. 4B of Massoni-Badosa et al. Tonsil Atlas paper"
 				}
 			],
 			"subtypes" : {
diff --git a/pegasus/annotate_cluster/mouse_brain_cell_markers.json b/pegasus/annotate_cluster/mouse_brain_cell_markers.json
index ebfc0eac..f7895afe 100644
--- a/pegasus/annotate_cluster/mouse_brain_cell_markers.json
+++ b/pegasus/annotate_cluster/mouse_brain_cell_markers.json
@@ -1,5 +1,6 @@
 {
 	"title" : "Mouse brain cell markers",
+	"comments": "Yao et al. Nature 2021 Allen Mouse Brain Map is a great resource for markers; Map: https://celltypes.brain-map.org/rnaseq/mouse_ctx-hpf_10x?selectedVisualization=Heatmap&colorByFeature=Cell+Type&colorByFeatureValue=Gad1; Cell type metadata: https://brainpalmseq.med.ubc.ca/brain-regions/neocortex-allen-brain-atlas-rnaseq/search-allen-brain-map-by-all-cell-types/; Extended Data Fig 2 & Supp Table 1 of Zhang et al. Nature 2021 is also used in marker selection",
 	"cell_types" : [
 		{
 			"name" : "Glutamatergic neuron",
@@ -168,28 +169,60 @@
 			"name" : "Oligodendrocyte",
 			"markers" : [
 				{
-					"genes" : ["Mbp+", "Plp1+"],
-					"weight" : 0.6,
-					"comment" : "Oligo specific markers (Allen Brain Map)"
-				},
-				{
-					"genes" : ["Mog+"],
-					"weight" : 0.15,
-					"comment" : "Oligo specific markers, but not expressed in all Oligo cells (Allen Brain Map)"
+					"genes" : ["Plp1+", "Cnp+", "Fa2h+", "St18+", "Mbp+"],
+					"weight" : 0.8,
+					"comment" : "Oligo specific markers from Yao et al. Nature 2021 (Allen Brain Map)"
 				},
 				{
 					"genes" : ["Olig1+", "Olig2+", "Sox10+"],
-					"weight" : 0.25,
+					"weight" : 0.2,
 					"comment" : "Expressed in both Oligo and OPC (Allen Brain Map)"
 				}
-			]
+			],
+			"subtypes" : {
+				"title" : "Oligodendrocyte subtype markers",
+				"cell_types" : [
+					{
+						"name" : "Opalin+ Oligodendrocyte",
+						"markers" : [
+							{
+								"genes" : ["Opalin+", "Mog+", "Plekhh1+", "Ermn+"],
+								"weight" : 1.0,
+								"comment": "Opalin+ markers from Yao et al. Nature 2021"
+							}
+						]
+					},
+					{
+						"name" : "Enpp6+ Oligodendrocyte",
+						"markers" : [
+							{
+								"genes" : ["Enpp6+", "Pik3r3+", "Cnksr3+", "Parvb+", "Dusp15+"],
+								"weight" : 1.0,
+								"comment": "Enpp6+ markers from Yao et al. Nature 2021"	
+							}
+						]
+					},
+					{
+						"name" : "Neu4+ Oligodendrocyte",
+						"markers" : [
+							{
+								"genes" : ["Neu4+"],
+								"weight" : 1.0,
+								"comment": "Neu4+ markers from Yao et al. Nature 2021"	
+							}
+						]
+					}
+
+				]
+			}
 		},
 		{
 			"name" : "OPC",
 			"markers" : [
 				{
-					"genes" : ["Pdgfra+", "Cspg4+"],
-					"weight" : 1.0
+					"genes" : ["Pdgfra+", "Cspg4+", "Emid1+", "Fabp7+"],
+					"weight" : 1.0,
+					"comment": "Oligodendrocyte progenitor cell markers from Yao et al. Nature 2021"
 				}
 			]
 		},
@@ -197,71 +230,94 @@
 			"name" : "Astrocyte",
 			"markers" : [
 				{
-					"genes" : ["Aqp4+", "Gja1+", "F3+", "Prex2+"],
-					"weight" : 1.0
+					"genes" : ["Mt2+", "Gja1+", "Prdx6+", "Htra1+", "Ntsr2+", "Aldoc+", "Apoe+", "Prex2+", "Aqp4+", "Gpr37l1+"],
+					"weight" : 1.0,
+					"comment": "Astrocyte markers from Yao et al. Nature 2021"
 				}
-			]
+			],
+			"subtypes" : {
+				"title" : "Astrocyte subtype markers",
+				"cell_types" : [
+					{
+						"name" : "Gfap+ Astrocyte",
+						"markers" : [
+							{
+								"genes" : ["Gfap+", "Aqp4+", "Tmem47+", "Id4+", "Mlc1+", "Sdc4+", "Gstm1+"],
+								"weight" : 1.0,
+								"comment": "Gfap+ markers from Yao et al. Nature 2021"
+							}
+						]
+					},
+					{
+						"name" : "Slc7a10+ Astrocyte",
+						"markers" : [
+							{
+								"genes" : ["Slc7a10+", "Grm3+", "Trpm3+", "Phkg1+", "Cdh10+", "Luzp2+", "Gria2+", "Slc6a1+"],
+								"weight" : 1.0,
+								"comment": "Slc7a10+ markers from Yao et al. Nature 2021"	
+							}
+						]
+					}
+				]
+			}
 		},
 		{
 			"name" : "Microglia",
 			"markers" : [
 				{
-					"genes" : ["C1qb+", "P2ry12+", "Ctss+", "Csf1r+", "Hmha1+"],
-					"weight" : 1.0
+					"genes" : ["Hexb+", "Siglech+", "Selplg+", "Tmem119+", "Ctss+", "P2ry12+", "Cx3cr1+", "Trem2+", "Fcrls+", "Csf1r+"],
+					"weight" : 1.0,
+					"comment": "Microglia specific markers from Yao et al. Nature 2021"
 				}
 			]			
 		},
 		{
-			"name" : "Endothelial",
-			"markers" : [
-				{
-					"genes" : ["Flt1+", "Dcn+", "Xdh+", "Id1+"],
-					"weight" : 1.0
-				}
-			]
-		},
-		{
-			"name" : "Fibroblast",
+			"name" : "Perivascular macrophage",
 			"markers" : [
 				{
-					"genes" : ["Igfbp1+", "Dcn+"],
-					"weight" : 1.0
+					"genes" : ["Mrc1+", "Stab1+", "Lyz2+", "Ms4a6c+", "F13a1+", "Pf4+"],
+					"weight" : 1.0,
+					"comment": "PVM specific markers from Yao et al. Nature 2021"
 				}
 			]			
 		},
 		{
-			"name" : "Mural",
+			"name" : "Endothelial cell",
 			"markers" : [
 				{
-					"genes" : ["Rgs5+", "Acta2+"],
-					"weight" : 1.0
+					"genes" : ["Flt1+", "Pecam1+", "Ly6a+", "Slco1a4+", "Mecom+", "Ptprb+", "Id1+"],
+					"weight" : 1.0,
+					"comment" : "Endo specific markers from Yao et al. Nature 2021"
 				}
-			]			
+			]
 		},
 		{
-			"name" : "Choroid Coch",
+			"name" : "Vascular leptomeningeal cell",
 			"markers" : [
 				{
-					"genes" : ["Tgfbi+"],
-					"weight" : 1.0
+					"genes" : ["Slc7a11+", "Slc6a13+", "Bmp6+", "Igfbp2+", "Fmod+", "Ranbp3l+"],
+					"weight" : 1.0,
+					"comment" : "VLMC specific markers from Yao et al. Nature 2021"
 				}
-			]			
+			]
 		},
 		{
-			"name" : "Ependyma",
+			"name" : "Smooth muscle cell",
 			"markers" : [
 				{
-					"genes" : ["Ccdc153+"],
-					"weight" : 1.0
+					"genes" : ["Acta2+", "Myh11+", "Tagln+", "Pln+", "Mylk+"],
+					"weight" : 1.0,
+					"comment" : "SMC specific markers from Yao et al. Nature 2021"
 				}
 			]			
 		},
 		{
-			"name" : "Smooth muscle cell",
+			"name" : "Pericyte",
 			"markers" : [
 				{
-					"genes" : ["Vtn+", "Colec12+"],
-					"weight" : 1.0
+					"genes" : ["Vtn+", "Atp13a5+", "Abcc9+", "Kcnj8+", "Art3+"],
+					"weight" : 1.0,
+					"comment" : "Pericyte specific markers from Yao et al. Nature 2021"
 				}
 			]			
 		}
diff --git a/pegasus/data_files/human_lung.gmt b/pegasus/data_files/human_lung.gmt
new file mode 100644
index 00000000..726c485b
--- /dev/null
+++ b/pegasus/data_files/human_lung.gmt
@@ -0,0 +1,19 @@
+Epithelial	Epithelial markers from HTAPP paper	KRT8	KRT18	EPCAM	CD24
+Endothelial	Endothelial shared markers from Xing et al. Science Advances 2021, Tony et al. Nature 2021 and Schupp et al. Circulation 2021	PECAM1	CLDN5	CDH5	ERG
+Fibroblast	Fibroblast/Myofibroblast shared markers from Travaglini et al.	COL1A1	COL1A2	PDGFRA	ELN	BGN
+Macrophage	Macro	CD68	CD163	C1QA	MRC1	MS4A6A	MSR1	MERTK
+SMC	SMC from Muus et al., Braga et al. and Schupp et al.	MYH11	TAGLN	ACTG2	CNN1	PLN
+Pericyte	Pericyte from Schupp et al. and Travaglini et al.	TRPC6	CSPG4	FAM162B	GJA4	GJC1	HIGD1B	CDH6	LAMC3	FHL5
+T cell	T cell markers	CD3D	CD3E	CD3G	TRAC
+B cell	B cell markers	CD19	MS4A1	CD79A	CD79B
+Plasma cell	Plasma cell markers	CD38	XBP1	CD27	SLAMF7	TNFRSF17	TNFRSF13B
+Mast cell	Mast cell markers	KIT	CPA3	TPSB2	TPSAB1	AREG	RGS1	RGS2
+Neutrophil	Neutrophil markers	FUT4	MPO	CEACAM8	ELANE	CXCR1	CXCR2	LY6G6D
+AT1	AT1 markers from Schupp et al., Travaglini et al. and Tony et al.	AGER	CAV1	RTKN2	MYL9	SPOCK2	ANXA3	TIMP3	CAV2	ST6GALNAC5	MYRF
+AT2	AT2 markers from Schupp et al., Travaglini et al. and Tony et al.	SFTPA1	SFTPA2	SFTPC	ETV5	TTN	PLA2G4F	CCDC141	LAMP3	ABCA3	HHIP
+Basal	Basal cell markers from Schupp et al., Travaglini et al. and Tony et al.	KRT5	KRT15	KRT17	TP63	S100A2	TNS4
+Ciliated	Ciliated cell markers from Schupp et al., Travaglini et al. and Tony et al.	ERICH3	SNTN	CCDC78	SNTN	ZBBX	DNAI1	ARMC3	CFAP157	TTC29	CFAP73
+Club	Club cell markers from Schupp et al., Travaglini et al. and Tony et al.	SCGB3A2	MGP	VIM	CST3
+Goblet	Goblet cell markers from Schupp et al., Travaglini et al. and Tony et al.	MUC5AC	MUC5B	BPIFB1	MSMB	FAM3D	SERPINB11	CXCL6	SCGB1A1	FAM3D	SERPINB3
+Ionocyte	Ionocyte markers from Travaglini et al.	FOXI1	ASCL3	CLDN25	ATP6V1G3	LINC01187
+PNEC	Pulmonary neuroendocrine cell markers from Travaglini et al.	CALCA	CHGA	ASCL1	SLC35D3	KIF1A
diff --git a/pegasus/data_files/mouse_brain.gmt b/pegasus/data_files/mouse_brain.gmt
new file mode 100644
index 00000000..fa32ef25
--- /dev/null
+++ b/pegasus/data_files/mouse_brain.gmt
@@ -0,0 +1,11 @@
+GlutamatergicNeuron	Glutamatergic neuron	Slc17a7	Slc17a6	Neurod6	Neurod2	
+GABAergicNeuron	GABAergic neuron	Gad1	Gad2	Slc32a1
+Oligodendrocyte	Oligodendrocyte	Plp1	Cnp	Fa2h	St18	Mbp
+OPC	Oligodendrocyte progenitor cell	Pdgfra	Cspg4	Emid1	Fabp7
+SMC	Smooth muscle cell	Acta2	Myh11	Tagln	Pln	Mylk
+Pericyte	Pericyte	Vtn	Atp13a5	Abcc9	Kcnj8	Art3
+Endo	Endothelial cell	Flt1	Pecam1	Ly6a	Slco1a4	Mecom	Ptprb	Id1
+Microglia	Microglia cell	Hexb	Siglech	Selplg	Tmem119	Ctss	P2ry12	Cx3cr1	Trem2	Fcrls	Csf1r
+Astrocyte	Astrocyte	Mt2	Gja1	Prdx6	Htra1	Ntsr2	Aldoc	Apoe	Prex2	Aqp4	Gpr37l1
+PVM	Perivascular macrophages	Mrc1	Stab1	Lyz2	Ms4a6c	F13a1	Pf4
+VLMC	Vascular leptomeningeal cells	Slc7a11	Slc6a13	Bmp6	Igfbp2	Fmod	Ranbp3l
diff --git a/pegasus/tools/signature_score.py b/pegasus/tools/signature_score.py
index ee14c446..965e4129 100644
--- a/pegasus/tools/signature_score.py
+++ b/pegasus/tools/signature_score.py
@@ -210,7 +210,9 @@ def calc_signature_score(
             elif sig_string.startswith("apoptosis"):
                 _calc_sig_scores(data, signatures, show_omitted_genes = show_omitted_genes, skip_threshold = 1)
             else:
-                assert False
+                assert sig_string in predefined_signatures
+                signatures = load_signatures_from_file(predefined_signatures[sig_string])
+                _calc_sig_scores(data, signatures, show_omitted_genes = show_omitted_genes)                
         else:
             signatures = load_signatures_from_file(sig_string)
             _calc_sig_scores(data, signatures, show_omitted_genes = show_omitted_genes)
diff --git a/pegasus/tools/utils.py b/pegasus/tools/utils.py
index ebaf11bc..2b80d819 100644
--- a/pegasus/tools/utils.py
+++ b/pegasus/tools/utils.py
@@ -187,6 +187,8 @@ def check_batch_key(data: Union[MultimodalData, UnimodalData], batch: str, warni
     ribosomal_genes_mouse=pkg_resources.resource_filename("pegasus", "data_files/ribosomal_genes_mouse.gmt"),
     apoptosis_human=pkg_resources.resource_filename("pegasus", "data_files/apoptosis_human.gmt"),
     apoptosis_mouse=pkg_resources.resource_filename("pegasus", "data_files/apoptosis_mouse.gmt"),
+    human_lung=pkg_resources.resource_filename("pegasus", "data_files/human_lung.gmt"),
+    mouse_brain=pkg_resources.resource_filename("pegasus", "data_files/mouse_brain.gmt"),
 )
 
 predefined_pathways = dict(

From 6f35adf7178a4a8429ae16a7066fd2fc02af485c Mon Sep 17 00:00:00 2001
From: Bo Li <li.bo@gene.com>
Date: Thu, 29 Dec 2022 01:33:05 -0500
Subject: [PATCH 04/57] Added mouse liver markers

---
 .../mouse_brain_cell_markers.json             |   2 +-
 .../mouse_immune_cell_markers.json            | 145 ++++++++++--
 .../mouse_liver_cell_markers.json             | 209 ++++++++++++++++++
 pegasus/data_files/mouse_liver.gmt            |  23 ++
 setup.py                                      |   5 +-
 5 files changed, 367 insertions(+), 17 deletions(-)
 create mode 100644 pegasus/annotate_cluster/mouse_liver_cell_markers.json
 create mode 100644 pegasus/data_files/mouse_liver.gmt

diff --git a/pegasus/annotate_cluster/mouse_brain_cell_markers.json b/pegasus/annotate_cluster/mouse_brain_cell_markers.json
index f7895afe..1bdfc86c 100644
--- a/pegasus/annotate_cluster/mouse_brain_cell_markers.json
+++ b/pegasus/annotate_cluster/mouse_brain_cell_markers.json
@@ -287,7 +287,7 @@
 				{
 					"genes" : ["Flt1+", "Pecam1+", "Ly6a+", "Slco1a4+", "Mecom+", "Ptprb+", "Id1+"],
 					"weight" : 1.0,
-					"comment" : "Endo specific markers from Yao et al. Nature 2021"
+					"comment" : "Endo specific markers from Yao et al. Nature 2021; Slco1a4 is specific to mouse brain: see https://journals.plos.org/plosone/article/figures?id=10.1371/journal.pone.0013741"
 				}
 			]
 		},
diff --git a/pegasus/annotate_cluster/mouse_immune_cell_markers.json b/pegasus/annotate_cluster/mouse_immune_cell_markers.json
index 64e0fd8a..4f462f13 100644
--- a/pegasus/annotate_cluster/mouse_immune_cell_markers.json
+++ b/pegasus/annotate_cluster/mouse_immune_cell_markers.json
@@ -5,8 +5,9 @@
 			"name" : "T cell",
 			"markers" : [
 				{
-					"genes" : ["Cd28+", "Cd3d+", "Cd3e+", "Cd4+", "Cd8a+"],
-					"weight" : 1.0
+					"genes" : ["Cd3d+", "Cd3e+", "Cd3g+", "Trac+", "Cd28+"],
+					"weight" : 1.0,
+					"comment" : "T cell markers from Kaptein et al. Cell 2022"
 				}
 			],
 			"subtypes" : {
@@ -51,6 +52,7 @@
 				]
 			}
 		},
+
 		{
 			"name" : "Monocyte",
 			"markers" : [
@@ -64,36 +66,151 @@
 				}
 			]
 		},
+
 		{
 			"name" : "B cell",
 			"markers" : [
 				{
-					"genes" : ["Cd19+", "Cd79b+", "Cd74+", "Igkc+", "Ighm+", "Iglc2+", "Ms4a1+"],
-					"weight" : 1.0
+					"genes" : ["Cd19+", "Ms4a1+", "Cd79a+", "Cd79b+", "Ebf1+"],
+					"weight" : 1.0,
+					"comment" : "B cell markers from Kaptein et al. Cell 2022"
 				}
 			]
 		},
+
 		{
-			"name" : "Neutrophil",
+			"name" : "Natural killer cell",
 			"markers" : [
 				{
-					"genes" : ["Mmp9+", "S100a8+", "S100a9+", "Il1b+", "Retnlg+", "Lcn2+"],
-					"weight" : 1.0
+					"genes" : ["Eomes+", "Cma1+", "Klra4+", "Klra7+", "Klra8+"],
+					"weight" : 1.0,
+					"comment" : "NK cell markers from Kaptein et al. Cell 2022"
 				}
 			]
 		},
+
 		{
-			"name" : "NK cell",
+			"name" : "Inflammatory monocyte",
 			"markers" : [
 				{
-					"genes" : ["Nkg7+"],
-					"weight" : 0.55
-				},
+					"genes" : ["Ly6c2+", "F13a1+", "Chil3+", "Ms4a4c+", "Ccr2+"],
+					"weight" : 1.0,
+					"comment" : "Inflammatory monocyte markers from Kaptein et al. Cell 2022"
+				}
+			]
+		},
+
+		{
+			"name" : "Patrolling monocyte",
+			"markers" : [
 				{
-					"genes" : ["Cd3d-", "Cd3e-"],
-					"weight" : 0.45
+					"genes" : ["Ace+", "Eno3+", "Ear2+", "Treml4+", "Fabp4+"],
+					"weight" : 1.0,
+					"comment" : "Patrolling monocyte markers from Kaptein et al. Cell 2022"
 				}
 			]
-		}
+		},
+
+		{
+			"name" : "Macrophage",
+			"markers" : [
+				{
+					"genes" : ["Cd14+", "Ms4a7+", "Cx3cr1+", "Trem2+", "Hpgds+"],
+					"weight" : 1.0,
+					"comment" : "Macrophage markers from Kaptein et al. Cell 2022"
+				}
+			],
+			"subtypes" : {
+				"title" : "Macrophage subtype markers",
+				"cell_types" : [
+					{
+						"name" : "Cd207+ macrophage",
+						"markers" : [
+							{
+								"genes" : ["Cd207+", "Tmem119+", "Olfml3+", "Mmp13+"],
+								"weight" : 1.0,
+								"comments" : "Cd207+ macrophage markers from Kaptein et al. Cell 2022"
+							}
+						]
+					},
+					{
+						"name" : "Bile-duct lipid-associated macrophage",
+						"markers" : [
+							{
+								"genes" : ["Gpnmb+", "Spp1+", "Syngr1+", "Cd93+"],
+								"weight" : 1.0,
+								"comments" : "Bile-duct LAM markers from Kaptein et al. Cell 2022"
+							}
+						]
+					}
+				]
+			}
+		},
+
+		{
+			"name" : "Conventional type 1 dendritic cell",
+			"markers" : [
+				{
+					"genes" : ["Xcr1+", "Gcsam+", "Snx22+", "Rab7b+", "Ifi205+"],
+					"weight" : 1.0,
+					"comment" : "cDC1 markers from Kaptein et al. Cell 2022"
+				}
+			]
+		},
+
+		{
+			"name" : "Conventional type 2 dendritic cell",
+			"markers" : [
+				{
+					"genes" : ["Cd209a+","Ltb4r1+", "Mgl2+", "Tnip3+", "Bex6+"],
+					"weight" : 1.0,
+					"comment" : "cDC2 markers from Kaptein et al. Cell 2022"
+				}
+			]
+		},
+
+		{
+			"name" : "Migratory dendritic cell",
+			"markers" : [
+				{
+					"genes" : ["Cacnb3+", "Nudt17+", "Ccl22+", "Apol7c+", "Slco5a1+"],
+					"weight" : 1.0,
+					"comment" : "Migratory DC markers from Kaptein et al. Cell 2022"
+				}
+			]
+		},
+
+		{
+			"name" : "Plasmacytoid dendritic cell",
+			"markers" : [
+				{
+					"genes" : ["Siglech+", "Ccr9+", "Cox6a2+", "Cd300c+", "Klk1+"],
+					"weight" : 1.0,
+					"comment" : "pDC markers from Kaptein et al. Cell 2022"
+				}
+			]
+		},
+
+		{
+			"name" : "Neutrophil",
+			"markers" : [
+				{
+					"genes" : ["S100a8+", "S100a9+", "Retnlg+", "Mmp9+", "Clec4d+"],
+					"weight" : 1.0,
+					"comment" : "Neutrophil markers from Kaptein et al. Cell 2022"
+				}
+			]
+		},
+
+		{
+			"name" : "Basophil",
+			"markers" : [
+				{
+					"genes" : ["Fcer1a+", "Cyp11a+", "Cd200r3+", "Il6+", "Ms4a2+"],
+					"weight" : 1.0,
+					"comment" : "Basophil markers from Kaptein et al. Cell 2022"
+				}
+			]
+		}
 	]
 }
diff --git a/pegasus/annotate_cluster/mouse_liver_cell_markers.json b/pegasus/annotate_cluster/mouse_liver_cell_markers.json
new file mode 100644
index 00000000..b8d40786
--- /dev/null
+++ b/pegasus/annotate_cluster/mouse_liver_cell_markers.json
@@ -0,0 +1,209 @@
+{
+	"title" : "Mouse liver cell type markers",
+	"comment": "Markers are collected from Kaptein et al. Cell 2022",
+	"cell_types" : [
+		{
+			"name" : "Hepatocyte",
+			"markers" : [
+				{
+					"genes" : ["Acaa1b+", "Arg1+", "Sult2a8+", "Hgd+", "Otc+"],
+					"weight" : 1.0,
+					"comment" : "Hepatocyte markers from Kaptein et al. Cell 2022"
+				}
+			]
+		},
+
+		{
+			"name" : "Cholangiocyte",
+			"markers" : [
+				{
+					"genes" : ["Spp1+", "Ddit4l+", "Sox9+", "Fgfr3+", "Plet1+"],
+					"weight" : 1.0,
+					"comment" : "Cholangiocyte markers from Kaptein et al. Cell 2022"
+				}
+			]
+		},
+
+		{
+			"name" : "HsPC",
+			"markers" : [
+				{
+					"genes" : ["Chrm3+", "Dmbt1+", "Slc4a4+", "Parm1+", "Pcdh11x+"],
+					"weight" : 1.0,
+					"comment" : "Hepatic stem and progenitor cell markers from Kaptein et al. Cell 2022"
+				}
+			]
+		},
+
+
+		{
+			"name" : "ILC1",
+			"markers" : [
+				{
+					"genes" : ["Xcl1+", "Cd160+", "Klrc1+", "Cd200r2+", "Gzmc+"],
+					"weight" : 1.0,
+					"comment" : "Innate lymphoid cell type 1 markers from Kaptein et al. Cell 2022"
+				}
+			]
+		},
+
+		{
+			"name" : "Kupffer cell",
+			"markers" : [
+				{
+					"genes" : ["Cd5l+", "Clec4f+", "Vig4+", "Folr2+", "Timd4+"],
+					"weight" : 1.0,
+					"comment" : "Kupffer cell markers from Kaptein et al. Cell 2022"
+				}
+			]
+		},
+
+		{
+			"name" : "Peritoneal macrophage",
+			"markers" : [
+				{
+					"genes" : ["Lyz1+", "Saa3+", "Prg4+", "Retnla+", "Cbr2+"],
+					"weight" : 1.0,
+					"comment" : "Peritoneal macrophage markers from Kaptein et al. Cell 2022; Note that Lyve1 is also a good marker but it is also expressed in endothelial cells"
+				}
+			]
+		},
+
+
+		{
+			"name" : "Endothelial cell",
+			"markers" : [
+				{
+					"genes" : ["Mmrn2+", "Cldn5+", "Adgrl4+", "Tek+", "Myct1+"],
+					"weight" : 1.0,
+					"comment" : "Endothelial cell markers from Kaptein et al. Cell 2022"
+				}
+			],
+			"subtypes" : {
+				"title" : "Endothelial cell subtype markers",
+				"cell_types" : [
+					{
+						"name" : "Liver sinusoidal endothelial cell",
+						"markers" : [
+							{
+								"genes" : ["Lyve1+", "Clec1b+", "Chst2+", "Wisp1+"],
+								"weight" : 1.0,
+								"comment" : "LSEC markers from Kaptein et al. Cell 2022"
+							}
+						]
+					},
+					{
+						"name" : "Central vein endothelial cell",
+						"markers" : [
+							{
+								"genes" : ["Rspo3+", "Lhx6+", "Wnt9b+", "Plppr5+"],
+								"weight" : 1.0,
+								"comment" : "CV EC markers from Kaptein et al. Cell 2022"
+							}
+						]
+					},
+					{
+						"name" : "Portal Vein endothelial cell",
+						"markers" : [
+							{
+								"genes" : ["Adgrg6+", "Nrg1+", "Gja5+","Cmklr1+"],
+								"weight" : 1.0,
+								"comment" : "PV EC markers from Kaptein et al. Cell 2022"
+							}
+						]
+					},
+					{
+						"name" : "Lymphatic Endothelial cell",
+						"markers" : [
+							{
+								"genes" : ["Mmrn1+", "Pard6g+", "Nts+", "Ccl21a+"],
+								"weight" : 1.0,
+								"comments" : "LEC markers from Kaptein et al. Cell 2022"
+							}
+						]
+					}
+				]
+			}
+		},
+
+
+		{
+			"name" : "Stellate cell",
+			"markers" : [
+				{
+					"genes" : ["Colec10+", "Rspo3+", "Mapt+", "Lama1+", "Bmp10+"],
+					"weight" : 1.0,
+					"comment" : "Stellate cell markers from Kaptein et al. Cell 2022"
+				}
+			]
+		},
+
+		{
+			"name" : "Vascular smooth muscle cell",
+			"markers" : [
+				{
+					"genes" : ["Cacna1c+", "Myh11+", "Notch3+", "Lmod1+", "Tagln+"],
+					"weight" : 1.0,
+					"comment" : "VSMC markers from Kaptein et al. Cell 2022"
+				}
+			]
+		},
+
+		{
+			"name" : "Mesothelial cell",
+			"markers" : [
+				{
+					"genes" : ["Ephb1+", "Cadm2+", "Prss12+", "Myl7+", "Prph+"],
+					"weight" : 1.0,
+					"comment" : "Mesothelial cell markers from Kaptein et al. Cell 2022"
+				}
+			]
+		},
+
+		{
+			"name" : "Fibroblast",
+			"markers" : [
+				{
+					"genes" : ["Col1a1+", "Mrc2+", "Plcxd3+", "Fndc1+", "Cpxm1+"],
+					"weight" : 1.0,
+					"comment" : "Fibroblast markers from Kaptein et al. Cell 2022"
+				}
+			],
+			"subtypes" : {
+				"title" : "Fibro subtype markers",
+				"cell_types" : [
+					{
+						"name" : "Capsule fibroblast",
+						"markers" : [
+							{
+								"genes" : ["Osr1+", "Cldn10+", "Lgals7+", "Spock3+"],
+								"weight" : 1.0,
+								"comment" : "Capsule fibroblast markers from Kaptein et al. Cell 2022"
+							}
+						]
+					},
+					{
+						"name" : "Central vein fibroblast",
+						"markers" : [
+							{
+								"genes" : ["Dpt+", "Pcolce2+", "Ntrk2+", "Pi16+"],
+								"weight" : 1.0,
+								"comment" : "Central vein fibroblast markers from Kaptein et al. Cell 2022"
+							}
+						]
+					},
+					{
+						"name" : "Bile-duct fibroblast",
+						"markers" : [
+							{
+								"genes" : ["Itgbl1+", "Plcxd3+", "Nkain3+", "Clic5+"],
+								"weight" : 1.0,
+								"comment" : "Bile-duct fibroblast markers from Kaptein et al. Cell 2022"
+							}
+						]
+					}
+				]
+			}
+		}
+	]
+}
diff --git a/pegasus/data_files/mouse_liver.gmt b/pegasus/data_files/mouse_liver.gmt
new file mode 100644
index 00000000..5229ce86
--- /dev/null
+++ b/pegasus/data_files/mouse_liver.gmt
@@ -0,0 +1,23 @@
+Endo	Endothelial cell	Mmrn2	Cldn5	Adgrl4	Tek	Myct1
+Stellate	Stellate cell	Colec10	Rspo3	Mapt	Lama1	Bmp10
+VSMC	Vascular smooth muscle cell	Cacna1c	Myh11	Notch3	Lmod1	Tagln
+Meso	Mesothelial cell	Ephb1	Cadm2	Prss12	Myl7	Prph
+Fibro	Fibroblast	Col1a1	Mrc2	Plcxd3	Fndc1	Cpxm1
+Hepatocyte	Hepatocyte	Acaa1b	Arg1	Sult2a8	Hgd	Otc
+Cholangiocyte	Cholangiocyte	Spp1	Ddit4l	Sox9	Fgfr3	Plet1
+HSPC	Hepatic stem and progenitor cell	Chrm3	Dmbt1	Slc4a4	Parm1	Pcdh11x	
+T	T cell	Cd3d	Cd3e	Cd3g	Trac	Cd28
+B	B cell	Cd19	Ms4a1	Cd79a	Cd79b	Ebf1	
+NK	NK cell	Eomes	Cma1	Klra4	Klra7	Klra8
+ILC1	Innate lymphoid cell type 1	Xcl1	Cd160	Klrc1	Cd200r2	Gzmc
+cDC1	cDC1	Xcr1	Gcsam	Snx22	Rab7b	Ifi205
+cDC2	cDC2	Cd209a	Ltb4r1	Mgl2	Tnip3	Bex6
+Mig_cDC	Migratory cDC	Cacnb3	Nudt17	Ccl22	Apol7c	Slco5a1
+pDC	Plasmacytoid dendritic cell	Siglech	Ccr9	Cox6a2	Cd300c	Klk1
+MonoI	Inflammatory monocyte	Ly6c2	F13a1	Chil3	Ms4a4c	Ccr2
+MonoP	Patrolling monocyte	Ace	Eno3	Ear2	Treml4	Fabp4
+PeriMac	Peritoneal macrophage	Lyz1	Saa3	Prg4	Retnla	Cbr2
+Mac	Macrophage	Cd14	Ms4a7	Cx3cr1	Trem2	Hpgds
+Kupffer	Kupffer cell	Cd5l	Clec4f	Vig4	Folr2	Timd4
+Neutrophil	Neutrophil	S100a8	S100a9	Retnlg	Mmp9	Clec4d
+Basophil	Basophil	Fcer1a	Cyp11a	Cd200r3	Il6	Ms4a2
diff --git a/setup.py b/setup.py
index 9cd66105..b353e796 100644
--- a/setup.py
+++ b/setup.py
@@ -62,10 +62,11 @@
     package_data={
         "pegasus.annotate_cluster": [
             "human_immune_cell_markers.json",
-            "mouse_immune_cell_markers.json",
-            "mouse_brain_cell_markers.json",
             "human_brain_cell_markers.json",
             "human_lung_cell_markers.json",
+            "mouse_immune_cell_markers.json",
+            "mouse_brain_cell_markers.json",
+            "mouse_liver_cell_markers.json",
         ],
         "pegasus.check_sample_indexes": ["chromium-shared-sample-indexes-plate.json", "Chromium-i7-Multiplex-Kit-N-Set-A-sample-indexes-plate.json"],
         "pegasus": ["data_files/*.gmt"],

From 94867ebb783d7cb0acdd54b35107c171bb6dbda8 Mon Sep 17 00:00:00 2001
From: Bo Li <li.bo@gene.com>
Date: Thu, 29 Dec 2022 08:01:08 -0500
Subject: [PATCH 05/57] Fixed typos in mouse liver and immune cell types

---
 pegasus/annotate_cluster/annotate_cluster.py            | 4 +++-
 pegasus/annotate_cluster/mouse_immune_cell_markers.json | 4 ++--
 pegasus/annotate_cluster/mouse_liver_cell_markers.json  | 2 +-
 pegasus/data_files/mouse_liver.gmt                      | 4 ++--
 pegasus/tools/utils.py                                  | 1 +
 5 files changed, 9 insertions(+), 6 deletions(-)

diff --git a/pegasus/annotate_cluster/annotate_cluster.py b/pegasus/annotate_cluster/annotate_cluster.py
index 5ce05357..4b0b6fce 100644
--- a/pegasus/annotate_cluster/annotate_cluster.py
+++ b/pegasus/annotate_cluster/annotate_cluster.py
@@ -278,7 +278,8 @@ def infer_cell_types(
             * ``'mouse_immune'`` for mouse immune cells;
             * ``'human_brain'`` for human brain cells;
             * ``'mouse_brain'`` for mouse brain cells;
-            * ``'human_lung'`` for human lung cells.
+            * ``'human_lung'`` for human lung cells;
+            * ``'mouse_liver'`` for mouse liver cells.
         * If ``Dict``, it refers to a Python dictionary describing the markers.
 
     de_test: ``str``, optional, default: ``"mwu"``
@@ -320,6 +321,7 @@ def infer_cell_types(
         human_brain="human_brain_cell_markers.json",
         mouse_brain="mouse_brain_cell_markers.json",
         human_lung="human_lung_cell_markers.json",
+        mouse_liver="mouse_liver_cell_markers.json",
     )
 
     if isinstance(markers, str):
diff --git a/pegasus/annotate_cluster/mouse_immune_cell_markers.json b/pegasus/annotate_cluster/mouse_immune_cell_markers.json
index 4f462f13..f330b19c 100644
--- a/pegasus/annotate_cluster/mouse_immune_cell_markers.json
+++ b/pegasus/annotate_cluster/mouse_immune_cell_markers.json
@@ -206,11 +206,11 @@
 			"name" : "Basophil",
 			"markers" : [
 				{
-					"genes" : ["Fcer1a+", "Cyp11a+", "Cd200r3+", "Il6+", "Ms4a2+"],
+					"genes" : ["Fcer1a+", "Cyp11a1+", "Cd200r3+", "Il6+", "Ms4a2+"],
 					"weight" : 1.0,
 					"comment" : "Basophil markers from Kaptein et al. Cell 2022"
 				}
 			]
-		},
+		}
 	]
 }
diff --git a/pegasus/annotate_cluster/mouse_liver_cell_markers.json b/pegasus/annotate_cluster/mouse_liver_cell_markers.json
index b8d40786..e1350ff5 100644
--- a/pegasus/annotate_cluster/mouse_liver_cell_markers.json
+++ b/pegasus/annotate_cluster/mouse_liver_cell_markers.json
@@ -51,7 +51,7 @@
 			"name" : "Kupffer cell",
 			"markers" : [
 				{
-					"genes" : ["Cd5l+", "Clec4f+", "Vig4+", "Folr2+", "Timd4+"],
+					"genes" : ["Cd5l+", "Clec4f+", "Vsig4+", "Folr2+", "Timd4+"],
 					"weight" : 1.0,
 					"comment" : "Kupffer cell markers from Kaptein et al. Cell 2022"
 				}
diff --git a/pegasus/data_files/mouse_liver.gmt b/pegasus/data_files/mouse_liver.gmt
index 5229ce86..a4a5f070 100644
--- a/pegasus/data_files/mouse_liver.gmt
+++ b/pegasus/data_files/mouse_liver.gmt
@@ -18,6 +18,6 @@ MonoI	Inflammatory monocyte	Ly6c2	F13a1	Chil3	Ms4a4c	Ccr2
 MonoP	Patrolling monocyte	Ace	Eno3	Ear2	Treml4	Fabp4
 PeriMac	Peritoneal macrophage	Lyz1	Saa3	Prg4	Retnla	Cbr2
 Mac	Macrophage	Cd14	Ms4a7	Cx3cr1	Trem2	Hpgds
-Kupffer	Kupffer cell	Cd5l	Clec4f	Vig4	Folr2	Timd4
+Kupffer	Kupffer cell	Cd5l	Clec4f	Vsig4	Folr2	Timd4
 Neutrophil	Neutrophil	S100a8	S100a9	Retnlg	Mmp9	Clec4d
-Basophil	Basophil	Fcer1a	Cyp11a	Cd200r3	Il6	Ms4a2
+Basophil	Basophil	Fcer1a	Cyp11a1	Cd200r3	Il6	Ms4a2
diff --git a/pegasus/tools/utils.py b/pegasus/tools/utils.py
index 2b80d819..2410d547 100644
--- a/pegasus/tools/utils.py
+++ b/pegasus/tools/utils.py
@@ -189,6 +189,7 @@ def check_batch_key(data: Union[MultimodalData, UnimodalData], batch: str, warni
     apoptosis_mouse=pkg_resources.resource_filename("pegasus", "data_files/apoptosis_mouse.gmt"),
     human_lung=pkg_resources.resource_filename("pegasus", "data_files/human_lung.gmt"),
     mouse_brain=pkg_resources.resource_filename("pegasus", "data_files/mouse_brain.gmt"),
+    mouse_liver=pkg_resources.resource_filename("pegasus", "data_files/mouse_liver.gmt"),
 )
 
 predefined_pathways = dict(

From 9a15fcbbdb455b16069243c9e5a200dec6275e67 Mon Sep 17 00:00:00 2001
From: Bo Li <li.bo@gene.com>
Date: Sun, 15 Jan 2023 13:06:19 -0800
Subject: [PATCH 06/57] Fixed switched_axes for dotplot

---
 pegasus/plotting/plot_library.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/pegasus/plotting/plot_library.py b/pegasus/plotting/plot_library.py
index f4a95549..f9874be7 100644
--- a/pegasus/plotting/plot_library.py
+++ b/pegasus/plotting/plot_library.py
@@ -1359,9 +1359,9 @@ def non_zero(g):
     yticks = summarized_df.index.map(str).values
 
     if switch_axes:
-        x, y = y, x
-        xlabel, ylabel = ylabel, xlabel
-        xticks, yticks = yticks, xticks
+        x, y = y[::-1], x[::-1]
+        xlabel, ylabel = ylabel[::-1], xlabel[::-1]
+        xticks, yticks = yticks[::-1], xticks[::-1]
 
     dotplot_df = pd.DataFrame(data=dict(x=x, y=y, value=summary_values, pixels=pixels, fraction=fraction,
                     xlabel=np.array(xlabel)[x], ylabel=np.array(ylabel)[y]))

From 766d4f93d50ba69c7dcfdc18b39fb313b3e36aa6 Mon Sep 17 00:00:00 2001
From: Bo Li <li.bo@gene.com>
Date: Tue, 17 Jan 2023 22:07:48 -0800
Subject: [PATCH 07/57] Fixed a typo in the docstring of plot_heatmap

---
 pegasus/plotting/plot_library.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pegasus/plotting/plot_library.py b/pegasus/plotting/plot_library.py
index f9874be7..824cefaa 100644
--- a/pegasus/plotting/plot_library.py
+++ b/pegasus/plotting/plot_library.py
@@ -1076,7 +1076,7 @@ def heatmap(
 
     Examples
     --------
-    >>> pg.heatmap(data, genes=['CD14', 'TRAC', 'CD34'], groupby='louvain_labels')
+    >>> pg.heatmap(data, attrs=['CD14', 'TRAC', 'CD34'], groupby='louvain_labels')
 
     """
     if not isinstance(data, anndata.AnnData):

From ce70e86d6dbe0ba897b94230bd72de7cc27b1ebe Mon Sep 17 00:00:00 2001
From: Bo Li <li.bo@gene.com>
Date: Thu, 19 Jan 2023 23:44:49 -0800
Subject: [PATCH 08/57] Simplified sig score calculation and added emt score

---
 pegasus/data_files/emt_human.gmt |  2 ++
 pegasus/tools/signature_score.py | 40 ++++++++++++++++++--------------
 pegasus/tools/utils.py           |  1 +
 3 files changed, 26 insertions(+), 17 deletions(-)
 create mode 100644 pegasus/data_files/emt_human.gmt

diff --git a/pegasus/data_files/emt_human.gmt b/pegasus/data_files/emt_human.gmt
new file mode 100644
index 00000000..dfec37a7
--- /dev/null
+++ b/pegasus/data_files/emt_human.gmt
@@ -0,0 +1,2 @@
+Epithelial-like	Signatures from Gibbons and Creighton Dev. Dyn. 2018	CDH1	DSP	OCLN
+Mesenchymal-like	Signatures from Gibbons and Creighton Dev. Dyn. 2018	VIM	CDH2	FOXC2	SNAI1	SNAI2	TWIST1	FN1	ITGB6	MMP2	MMP3	MMP9	SOX10	GCS
diff --git a/pegasus/tools/signature_score.py b/pegasus/tools/signature_score.py
index 965e4129..b41071c0 100644
--- a/pegasus/tools/signature_score.py
+++ b/pegasus/tools/signature_score.py
@@ -100,6 +100,7 @@ def calc_signature_score(
     signatures: Union[Dict[str, List[str]], str],
     n_bins: int = 50,
     show_omitted_genes: bool = False,
+    skip_threshold: int = 1,
     random_state: int = 0
 ) -> None:
     """Calculate signature / gene module score. [Li20-1]_
@@ -124,12 +125,21 @@ def calc_signature_score(
             * ``apoptosis_human`` contains one signature, ``apoptosis``, which includes apoptosis-related genes from the KEGG pathway.
             * ``cell_cycle_mouse``, ``gender_mouse``, ``mitochondrial_genes_mouse``, ``ribosomal_genes_mouse`` and ``apoptosis_mouse`` are the corresponding signatures for mouse. Gene symbols are directly translated from human genes.
 
+        In addition, Pegasus provides the following 4 curated signature panels:
+            * ``emt_human``, the Epithelial-Mesenchymal Transition signature from Gibbons and Creighton Dev. Dyn. 2018.
+            * ``human_lung``, human lung cell type markers.
+            * ``mouse_brain``, mouse brain cell type markers.
+            * ``mouse_liver``, mouse liver cell type markers.
+
     n_bins: ``int``, optional, default: 50
         Number of bins on expression levels for grouping genes.
 
     show_omitted_genes: ``bool``, optional, default False
         Signature genes that are not expressed in the data will be omitted. By default, pegasus does not report which genes are omitted. If this option is turned on, report omitted genes.
 
+    skip_threshold: ``int``, optional, default 1
+        Skip signature calculation if the number of kept genes is less than skip_threshold.
+
     random_state: ``int``, optional, default: 0
         Random state used by KMeans if signature == ``gender_human`` or ``gender_mouse``.
 
@@ -170,16 +180,22 @@ def calc_signature_score(
         sig_string = signatures
         if sig_string in predefined_signatures:
             signatures = load_signatures_from_file(predefined_signatures[sig_string])
-            from threadpoolctl import threadpool_limits
+
+            if sig_string.startswith("mitochondrial_genes"):
+                del signatures["mito_noncoding"]
+            elif sig_string.startswith("ribosomal_genes"):
+                del signatures["ribo_like"]
+            
+            _calc_sig_scores(data, signatures, show_omitted_genes=show_omitted_genes, skip_threshold=skip_threshold)
 
             if sig_string.startswith("cell_cycle"):
-                _calc_sig_scores(data, signatures, show_omitted_genes = show_omitted_genes)
                 data.obs["cycle_diff"] = data.obs["G2/M"] - data.obs["G1/S"]
 
                 values = data.obs[["G1/S", "G2/M"]].values
                 maxvalues = values.max(axis = 1)
                 data.obs["cycling"] = maxvalues
 
+                from threadpoolctl import threadpool_limits
                 kmeans = KMeans(n_clusters=2, random_state=random_state)
                 with threadpool_limits(limits = 1):
                     kmeans.fit(maxvalues.reshape(-1, 1))
@@ -191,9 +207,9 @@ def calc_signature_score(
 
                 data.obs["predicted_phase"] = pd.Categorical.from_codes(codes, categories = ["G0", "G1/S", "G2/M"])
             elif sig_string.startswith("gender"):
-                _calc_sig_scores(data, signatures, show_omitted_genes = show_omitted_genes, skip_threshold = 1)
                 data.obs["gender_score"] = data.obs["male_score"] - data.obs["female_score"]
 
+                from threadpoolctl import threadpool_limits
                 kmeans = KMeans(n_clusters=3, random_state=random_state)
                 with threadpool_limits(limits = 1):
                     kmeans.fit(data.obs["gender_score"].values.reshape(-1, 1))
@@ -201,20 +217,10 @@ def calc_signature_score(
                 codes = list(map(lambda x: reorg_dict[x], kmeans.labels_))
 
                 data.obs["predicted_gender"] = pd.Categorical.from_codes(codes, categories = ["female", "uncertain", "male"])
-            elif sig_string.startswith("mitochondrial_genes"):
-                del signatures["mito_noncoding"]
-                _calc_sig_scores(data, signatures, show_omitted_genes = show_omitted_genes, skip_threshold = 1)
-            elif sig_string.startswith("ribosomal_genes"):
-                del signatures["ribo_like"]
-                _calc_sig_scores(data, signatures, show_omitted_genes = show_omitted_genes, skip_threshold = 1)
-            elif sig_string.startswith("apoptosis"):
-                _calc_sig_scores(data, signatures, show_omitted_genes = show_omitted_genes, skip_threshold = 1)
-            else:
-                assert sig_string in predefined_signatures
-                signatures = load_signatures_from_file(predefined_signatures[sig_string])
-                _calc_sig_scores(data, signatures, show_omitted_genes = show_omitted_genes)                
+            elif sig_string == "emt_human":
+                data.obs["EMT_score"] = data.obs["Mesenchymal-like"] - data.obs["Epithelial-like"]
         else:
             signatures = load_signatures_from_file(sig_string)
-            _calc_sig_scores(data, signatures, show_omitted_genes = show_omitted_genes)
+            _calc_sig_scores(data, signatures, show_omitted_genes=show_omitted_genes, skip_threshold=skip_threshold)
     else:
-        _calc_sig_scores(data, signatures, show_omitted_genes = show_omitted_genes)
+        _calc_sig_scores(data, signatures, show_omitted_genes=show_omitted_genes, skip_threshold=skip_threshold)
diff --git a/pegasus/tools/utils.py b/pegasus/tools/utils.py
index 2410d547..09aa0f69 100644
--- a/pegasus/tools/utils.py
+++ b/pegasus/tools/utils.py
@@ -190,6 +190,7 @@ def check_batch_key(data: Union[MultimodalData, UnimodalData], batch: str, warni
     human_lung=pkg_resources.resource_filename("pegasus", "data_files/human_lung.gmt"),
     mouse_brain=pkg_resources.resource_filename("pegasus", "data_files/mouse_brain.gmt"),
     mouse_liver=pkg_resources.resource_filename("pegasus", "data_files/mouse_liver.gmt"),
+    emt_human=pkg_resources.resource_filename("pegasus", "data_files/emt_human.gmt"),
 )
 
 predefined_pathways = dict(

From 4793f788f22c1680d571c475e43abba3ea89ef7f Mon Sep 17 00:00:00 2001
From: Yiming Yang <yang.yihming@gmail.com>
Date: Fri, 10 Feb 2023 13:43:04 -0800
Subject: [PATCH 09/57] Add stripsize to violin function

---
 pegasus/plotting/plot_library.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/pegasus/plotting/plot_library.py b/pegasus/plotting/plot_library.py
index 824cefaa..f5f236e8 100644
--- a/pegasus/plotting/plot_library.py
+++ b/pegasus/plotting/plot_library.py
@@ -45,7 +45,7 @@ def scatter(
     fix_corners: Optional[bool] = True,
     alpha: Optional[Union[float, List[float]]] = 1.0,
     legend_loc: Optional[Union[str, List[str]]] = "right margin",
-    legend_fontsize: Optional[Union[int, List[int]]] = 10, 
+    legend_fontsize: Optional[Union[int, List[int]]] = 10,
     legend_ncol: Optional[str] = None,
     palettes: Optional[Union[str, List[str]]] = None,
     cmaps: Optional[Union[str, List[str]]] = "YlOrRd",
@@ -214,7 +214,7 @@ def scatter(
 
             if global_marker_size == None:
                 global_marker_size = _get_marker_size(x.size) if marker_size is None else marker_size
-            
+
             x_label = f"{basis_}{comp_key[0]}"
             y_label = f"{basis_}{comp_key[1]}"
 
@@ -864,6 +864,7 @@ def violin(
     hue: Optional[str] = None,
     matkey: Optional[str] = None,
     stripplot: Optional[bool] = False,
+    stripsize: int = 1,
     inner: Optional[str] = None,
     scale: Optional[str] = 'width',
     panel_size: Optional[Tuple[float, float]] = (8, 0.5),
@@ -973,7 +974,7 @@ def violin(
     for i in range(nrows):
         ax = axes[i, 0]
         if stripplot:
-            sns.stripplot(x="label", y=attrs[i], hue = hue, data=df, ax=ax, size=1, color="k", jitter=True)
+            sns.stripplot(x="label", y=attrs[i], hue = hue, data=df, ax=ax, size=stripsize, color="k", jitter=True)
         sns.violinplot(x="label", y=attrs[i], hue = hue, data=df, inner=inner, linewidth=1, ax=ax, cut=0, scale=scale, split=True, palette=palette, **kwargs)
         ax.grid(False)
 

From 89c10a8066b4d75a08d53b0ad2f4f51d6859e348 Mon Sep 17 00:00:00 2001
From: Bo Li <li.bo@gene.com>
Date: Mon, 6 Mar 2023 10:33:31 -0800
Subject: [PATCH 10/57] Added more marker files and cluster_specific_marker
 function

---
 pegasus/__init__.py                           |   1 +
 .../human_lung_cell_markers.json              |   2 +-
 .../mouse_immune_cell_markers.json            | 113 +++----
 .../mouse_liver_cell_markers.json             |  41 ++-
 .../mouse_lung_cell_markers.json              | 291 ++++++++++++++++++
 pegasus/data_files/human_lung.gmt             |   2 +-
 pegasus/data_files/mouse_liver.gmt            |  19 +-
 pegasus/data_files/mouse_lung.gmt             |  38 +++
 pegasus/tools/__init__.py                     |   2 +-
 pegasus/tools/clustering.py                   |  38 ++-
 pegasus/tools/diff_expr.py                    |  59 +++-
 11 files changed, 531 insertions(+), 75 deletions(-)
 create mode 100644 pegasus/annotate_cluster/mouse_lung_cell_markers.json
 create mode 100644 pegasus/data_files/mouse_lung.gmt

diff --git a/pegasus/__init__.py b/pegasus/__init__.py
index ae574a32..3e0d62bc 100644
--- a/pegasus/__init__.py
+++ b/pegasus/__init__.py
@@ -65,6 +65,7 @@
     de_analysis,
     markers,
     write_results_to_excel,
+    cluster_specific_markers,
     find_markers,
     infer_path,
     calc_signature_score,
diff --git a/pegasus/annotate_cluster/human_lung_cell_markers.json b/pegasus/annotate_cluster/human_lung_cell_markers.json
index 4e18b1ad..c787456e 100644
--- a/pegasus/annotate_cluster/human_lung_cell_markers.json
+++ b/pegasus/annotate_cluster/human_lung_cell_markers.json
@@ -84,7 +84,7 @@
 				{
 					"genes" : ["CALCA+", "CHGA+", "ASCL1+", "SLC35D3+", "KIF1A+"],
 					"weight" : 1.0,
-					"comment" : "Plumonary neuroendocrien cell markers from Travaglini et al."
+					"comment" : "Plumonary neuroendocrine cell markers from Travaglini et al."
 				}
 			]
 		},
diff --git a/pegasus/annotate_cluster/mouse_immune_cell_markers.json b/pegasus/annotate_cluster/mouse_immune_cell_markers.json
index f330b19c..240615d3 100644
--- a/pegasus/annotate_cluster/mouse_immune_cell_markers.json
+++ b/pegasus/annotate_cluster/mouse_immune_cell_markers.json
@@ -71,9 +71,9 @@
 			"name" : "B cell",
 			"markers" : [
 				{
-					"genes" : ["Cd19+", "Ms4a1+", "Cd79a+", "Cd79b+", "Ebf1+"],
+					"genes" : ["Cd19+", "Ms4a1+", "Cd79a+", "Cd79b+", "Ebf1+", "Pax5+", "Fcmr+", "Bank1+"],
 					"weight" : 1.0,
-					"comment" : "B cell markers from Kaptein et al. Cell 2022"
+					"comment" : "B cell markers from Kaptein et al. Cell 2022 and Hurskainen et al. Nat. Commun. 2021"
 				}
 			]
 		},
@@ -81,10 +81,15 @@
 		{
 			"name" : "Natural killer cell",
 			"markers" : [
+				{
+					"genes" : ["Gzma+", "Klrb1c+", "Ncr1+", "Klre1+", "Klrc2+"],
+					"weight" : 0.6,
+					"comment" : "NK & ILC1 shared markers from Hurskainen et al. Nat. Commun. 2021 data and confirmed on Kaptein et al. Cell 2022 data"
+				},
 				{
 					"genes" : ["Eomes+", "Cma1+", "Klra4+", "Klra7+", "Klra8+"],
-					"weight" : 1.0,
-					"comment" : "NK cell markers from Kaptein et al. Cell 2022"
+					"weight" : 0.4,
+					"comment" : "NK cell specific markers (compared to ILC1) from Kaptein et al. Cell 2022; these markers do not have high expressions in Hurskainen et al. Nat. Commun. 2021 data"
 				}
 			]
 		},
@@ -93,9 +98,9 @@
 			"name" : "Inflammatory monocyte",
 			"markers" : [
 				{
-					"genes" : ["Ly6c2+", "F13a1+", "Chil3+", "Ms4a4c+", "Ccr2+"],
+					"genes" : ["Ly6c2+", "F13a1+", "Ms4a4c+", "Ccr2+", "Gm9733+", "Mcub+"],
 					"weight" : 1.0,
-					"comment" : "Inflammatory monocyte markers from Kaptein et al. Cell 2022"
+					"comment" : "Inflammatory monocyte markers inferred from Kaptein et al. Cell 2022 and Hurskainen et al. Nat. Commun. 2021"
 				}
 			]
 		},
@@ -104,56 +109,25 @@
 			"name" : "Patrolling monocyte",
 			"markers" : [
 				{
-					"genes" : ["Ace+", "Eno3+", "Ear2+", "Treml4+", "Fabp4+"],
+					"genes" : ["Ace+", "Eno3+", "Ear2+", "Treml4+", "Spn+", "Fcgr4+", "Lair1+", "Cd300e+", "Cd300ld+", "Adgre4+"],
 					"weight" : 1.0,
-					"comment" : "Patrolling monocyte markers from Kaptein et al. Cell 2022"
+					"comment" : "Patrolling monocyte markers inferred from Kaptein et al. Cell 2022 and Hurskainen et al. Nat. Commun. 2021; Related papers: Domingo-Gonzalez et al. Elife 2020, Thomas et al. Arterioscler Thromb Vasc Biol. 2015, and Schyns et al. Nat. Commun. 2019."
 				}
 			]
 		},
 
-		{
-			"name" : "Macrophage",
-			"markers" : [
-				{
-					"genes" : ["Cd14+", "Ms4a7+", "Cx3cr1+", "Trem2+", "Hpgds+"],
-					"weight" : 1.0,
-					"comment" : "Machrophage markers from Kaptein et al. Cell 2022"
-				}
-			],
-			"subtypes" : {
-				"title" : "Macrophage subtype markers",
-				"cell_types" : [
-					{
-						"name" : "Cd207+ macrophage",
-						"markers" : [
-							{
-								"genes" : ["Cd207+", "Tmem119+", "Olfml3+", "Mmp13+"],
-								"weight" : 1.0,
-								"comments" : "Cd207+ macrophage markers from Kaptein et al. Cell 2022"
-							}
-						]
-					},
-					{
-						"name" : "Bile-duct lipid-associated macrophage",
-						"markers" : [
-							{
-								"genes" : ["Gpnmb+", "Spp1+", "Syngr1+", "Cd93+"],
-								"weight" : 1.0,
-								"comments" : "Bile-duct LAM markers from Kaptein et al. Cell 2022"
-							}
-						]
-					}
-				]
-			}
-		},
-
 		{
 			"name" : "Conventional type 1 dendritic cell",
 			"markers" : [
 				{
-					"genes" : ["Xcr1+", "Gcsam+", "Snx22+", "Rab7b+", "Ifi205+"],
-					"weight" : 1.0,
-					"comment" : "cDC1 markers from Kaptein et al. Cell 2022"
+					"genes" : ["Xcr1+", "Ifi205+", "Rab7b+", "Tlr3+", "Sept3+", "Hepacam2+"],
+					"weight" : 0.7,
+					"comment" : "cDC1 markers shared between Kaptein et al. Cell 2022 and Hurskainen et al. Nat. Commun. 2021"
+				},
+				{
+					"genes" : ["Gcsam+", "Snx22+", "Itgae+", "Xlr+"],
+					"weight" : 0.3,
+					"comment" : "cDC1 markers expressed highly in one of Kaptein et al. Cell 2022 and Hurskainen et al. Nat. Commun. 2021, but not both"
 				}
 			]
 		},
@@ -173,9 +147,9 @@
 			"name" : "Migratory dendritic cell",
 			"markers" : [
 				{
-					"genes" : ["Cacnb3+", "Nudt17+", "Ccl22+", "Apol7c+", "Slco5a1+"],
+					"genes" : ["Cacnb3+", "Nudt17+", "Ccl22+", "Apol7c+", "Slco5a1+", "Ccr7+", "Fscn1+", "Il4i1+", "Mreg+", "Bcl2l14+"],
 					"weight" : 1.0,
-					"comment" : "Migratory DC markers from Kaptein et al. Cell 2022"
+					"comment" : "Migratory DC markers shared between Kaptein et al. Cell 2022 and Hurskainen et al. Nat. Commun. 2021"
 				}
 			]
 		},
@@ -195,9 +169,20 @@
 			"name" : "Neutrophil",
 			"markers" : [
 				{
-					"genes" : ["S100a8+", "S100a9+", "Retnlg+", "Mmp9+", "Clec4d+"],
+					"genes" : ["S100a8+", "S100a9+", "Retnlg+", "Mmp9+", "Csf3r+", "Wfdc21+", "Il1r2+", "Cxcr2+"],
+					"weight" : 1.0,
+					"comment" : "Neutrophil markers inferred from Kaptein et al. Cell 2022 and Hurskainen et al. Nat. Commun. 2021; related paper: Grieshaber-Bouyer et al. Nat. Commun. 2021"
+				}
+			]
+		},
+
+		{
+			"name" : "Immature neutrophil",
+			"markers" : [
+				{
+					"genes" : ["Ngp+", "Camp+", "Ltf+", "Ly6g+", "Cebpe+"],
 					"weight" : 1.0,
-					"comment" : "Neutrophil markers from Kaptein et al. Cell 2022"
+					"comment" : "Immature Neutrophil markers inferred from Hurskainen et al. Nat. Commun. 2021 and checked using Evrard et al. Immunity 2018 Fig. 5"
 				}
 			]
 		},
@@ -206,9 +191,31 @@
 			"name" : "Basophil",
 			"markers" : [
 				{
-					"genes" : ["Fcer1a+", "Cyp11a1+", "Cd200r3+", "Il6+", "Ms4a2+"],
+					"genes" : ["Cd200r3+", "Aqp9+", "Il6+", "Hgf+", "Adora2b+", "Il4+", "L1cam+", "Grm6+"],
+					"weight" : 1.0,
+					"comment" : "Basophil markers inferred from Matsumara et al. Nat. Commun. 2022 and confirmed using data from Kaptein et al. Cell 2022  and Hurskainen et al. Nat. Commun. 2021"
+				}
+			]
+		},
+
+		{
+			"name" : "Eosinophil",
+			"markers" : [
+				{
+					"genes" : ["Epx+", "Prg3+", "Eml5+", "Il5ra+", "Qsox2+", "L2hgdh+"],
+					"weight" : 1.0,
+					"comment" : "Eosinophil markers inferred from Matsumara et al. Nat. Commun. 2022"
+				}
+			]
+		},
+
+		{
+			"name" : "Mast cell",
+			"markers" : [
+				{
+					"genes" : ["Tph1+", "Clnk+", "Hs6st2+", "Plcg1+"],
 					"weight" : 1.0,
-					"comment" : "Basophil markers from Kaptein et al. Cell 2022"
+					"comment" : "Mast cell markers inferred from Matsumara et al. Nat. Commun. 2022"
 				}
 			]
 		}
diff --git a/pegasus/annotate_cluster/mouse_liver_cell_markers.json b/pegasus/annotate_cluster/mouse_liver_cell_markers.json
index e1350ff5..f40427b3 100644
--- a/pegasus/annotate_cluster/mouse_liver_cell_markers.json
+++ b/pegasus/annotate_cluster/mouse_liver_cell_markers.json
@@ -3,12 +3,12 @@
 	"comment": "Markers are collected from Kaptein et al. Cell 2022",
 	"cell_types" : [
 		{
-			"name" : "Hepatocye",
+			"name" : "Hepatocyte",
 			"markers" : [
 				{
 					"genes" : ["Acaa1b+", "Arg1+", "Sult2a8+", "Hgd+", "Otc+"],
 					"weight" : 1.0,
-					"comment" : "Hepatocye markers from Kaptein et al. Cell 2022"
+					"comment" : "Hepatocyte markers from Kaptein et al. Cell 2022"
 				}
 			]
 		},
@@ -47,6 +47,7 @@
 			]
 		},
 
+
 		{
 			"name" : "Kupffer cell",
 			"markers" : [
@@ -69,6 +70,42 @@
 			]
 		},
 
+		{
+			"name" : "Macrophage",
+			"markers" : [
+				{
+					"genes" : ["Cd14+", "Ms4a7+", "Cx3cr1+", "Trem2+", "Hpgds+"],
+					"weight" : 1.0,
+					"comment" : "Machrophage markers from Kaptein et al. Cell 2022"
+				}
+			],
+			"subtypes" : {
+				"title" : "Macrophage subtype markers",
+				"cell_types" : [
+					{
+						"name" : "Cd207+ macrophage",
+						"markers" : [
+							{
+								"genes" : ["Cd207+", "Tmem119+", "Olfml3+", "Mmp13+"],
+								"weight" : 1.0,
+								"comments" : "Cd207+ macrophage markers from Kaptein et al. Cell 2022"
+							}
+						]
+					},
+					{
+						"name" : "Bile-duct lipid-associated macrophage",
+						"markers" : [
+							{
+								"genes" : ["Gpnmb+", "Spp1+", "Syngr1+", "Cd93+"],
+								"weight" : 1.0,
+								"comments" : "Bile-duct LAM markers from Kaptein et al. Cell 2022"
+							}
+						]
+					}
+				]
+			}
+		},
+
 
 		{
 			"name" : "Endothelial cell",
diff --git a/pegasus/annotate_cluster/mouse_lung_cell_markers.json b/pegasus/annotate_cluster/mouse_lung_cell_markers.json
new file mode 100644
index 00000000..2c81e13f
--- /dev/null
+++ b/pegasus/annotate_cluster/mouse_lung_cell_markers.json
@@ -0,0 +1,291 @@
+{
+	"title" : "Mouse lung cell type markers",
+	"cell_types" : [
+		{
+			"name" : "Alveolar type I cell",
+			"markers" : [
+				{
+					"genes" : ["Akap5+", "Rtkn2+", "Ndnf+", "Col4a3+", "Spock2+"],
+					"weight" : 1.0,
+					"comment" : "AT1 markers from Hurskainen et al. Nat. Commun. 2021 and Negretti et al. Development 2021"
+				}
+			]
+		},
+
+		{
+			"name" : "Alveolar type II cell",
+			"markers" : [
+				{
+					"genes" : ["Sftpc+", "Sftpa1+", "Lamp3+", "Hc+", "Slc34a2+"],
+					"weight" : 1.0,
+					"comment" : "AT2 markers from Hurskainen et al. Nat. Commun. 2021 and Negretti et al. Development 2021"
+				}
+			]
+		},
+
+		{
+			"name" : "Ciliated cell",
+			"markers" : [
+				{
+					"genes" : ["Dynlrb2+", "Tmem212+", "Foxj1+", "Ccdc153+", "Nme5+"],
+					"weight" : 1.0,
+					"comment" : "Ciliated cell markers from Hurskainen et al. Nat. Commun. 2021 and Negretti et al. Development 2021"
+				}
+			]
+		},
+
+		{
+			"name" : "Club cell",
+			"markers" : [
+				{
+					"genes" : ["Scgb1a1+", "Scgb3a2+", "Cckar+", "Gabrp+", "Slc16a11+"],
+					"weight" : 1.0,
+					"comment" : "Club cell markers from Hurskainen et al. Nat. Commun. 2021 and Negretti et al. Development 2021"
+				}
+			]
+		},
+
+		{
+			"name" : "Basal cell",
+			"markers" : [
+				{
+					"genes" : ["Aqp3+", "Krt5+", "Dapl1+", "Hspa1a+", "Trp63+"],
+					"weight" : 1.0,
+					"comment" : "Basal cell markers from Montoro et al. Nature 2018 Extended Data Fig. 1d"
+				}
+			]
+		},
+
+		{
+			"name" : "Goblet cell",
+			"markers" : [
+				{
+					"genes" : ["Scgb3a1+", "Muc5b+", "Serpinb11+", "Gp2+", "Dmbt1+"],
+					"weight" : 1.0,
+					"comment" : "Goblet cell markers from Montoro et al. Nature 2018 Supp Table 1"
+				}
+			]
+		},
+
+		{
+			"name" : "Tuft cell",
+			"markers" : [
+				{
+					"genes" : ["Pou2f3+", "Ascl2+", "Dclk1+", "Lrmp+", "Ltc4s+", "Trpm5+", "Gnb3+", "Rgs13+"],
+					"weight" : 1.0,
+					"comment" : "Tuft cell markers from Sun et al. Dev. Cell 2022 and Montoro et al. Nature 2018 Extended Data Fig. 3b; first 3 markers are mainly suggested by Sun et al. the CellCards."
+				}
+			]
+		},
+
+		{
+			"name" : "Plumonary neuroendocrine cell",
+			"markers" : [
+				{
+					"genes" : ["Ascl1+", "Chga+", "Calca+", "Scg2+", "Scg5+"],
+					"weight" : 1.0,
+					"comment" : "Plumonary neuroendocrine cell markers from Montoro et al. Nature 2018 Extended Data Fig. 3b & 3c"
+				}
+			]
+		},
+
+		{
+			"name" : "Ionocyte",
+			"markers" : [
+				{
+					"genes" : ["Foxi1+", "Ascl3+", "Smbd1+", "Moxd1+", "Atp6v0d2+"],
+					"weight" : 1.0,
+					"comment" : "Ionocyte markers from Montoro et al. Nature 2018 Fig. 5a"
+				}
+			]
+		},
+
+
+
+		{
+			"name" : "Endothelial cell",
+			"markers" : [
+				{
+					"genes" : ["Egfl7+", "Cldn5+", "Cdh5+", "Pecam1+", "Calcrl+", "Ecscr+", "Icam2+"],
+					"weight" : 1.0,
+					"comment" : "Endothelial cell markers inferred from Hurskainen et al. Nat. Commun. 2021 data"
+				}
+			],
+			"subtypes" : {
+				"title" : "Endothelial cell subtype markers (Main and Capillary, see https://lungmap.net/cell-cards/)",
+				"cell_types" : [
+					{
+						"name" : "Aerocyte",
+						"markers" : [
+							{
+								"genes" : ["Emp2+", "Car4+", "Tbx2+", "Apln+"],
+								"weight" : 1.0,
+								"comment" : "Markers inferred from Hurskainen et al. Nat. Commun. 2021 data and confirmed on Negretti et al. Development 2021 data"
+							}
+						]
+					},
+					{
+						"name" : "EC general capillary",
+						"markers" : [
+							{
+								"genes" : ["Gpihbp1+", "Kit+", "Nckap5+", "Aplnr+"],
+								"weight" : 1.0,
+								"comment" : "Markers inferred from Hurskainen et al. Nat. Commun. 2021 data and confirmed on Negretti et al. Development 2021 data"
+							}
+						]
+					},
+					{
+						"name" : "EC lymphatic",
+						"markers" : [
+							{
+								"genes" : ["Mmrn1+", "Ccl21a+", "Prox1+", "Nts+"],
+								"weight" : 1.0,
+								"comment" : "Markers inferred from Hurskainen et al. Nat. Commun. 2021 data and confirmed on Negretti et al. Development 2021 data"
+							}
+						]
+					},
+					{
+						"name" : "EC venous",
+						"markers" : [
+							{
+								"genes" : ["Slc6a2+", "Vegfc+", "Ackr3+", "Fabp4+"],
+								"weight" : 1.0,
+								"comments" : "Markers inferred from Hurskainen et al. Nat. Commun. 2021 data and confirmed on Negretti et al. Development 2021 data"
+							}
+						]
+					},
+					{
+						"name" : "EC arterial",
+						"markers" : [
+							{
+								"genes" : ["Gja5+", "Cxcl12+", "Pcsk5+", "Thsd7a+"],
+								"weight" : 1.0,
+								"comments" : "Markers inferred from Hurskainen et al. Nat. Commun. 2021 data and confirmed on Negretti et al. Development 2021 data"
+							}
+						]
+					}
+				]
+			}
+		},
+
+
+
+		{
+			"name" : "Mesothelial cell",
+			"markers" : [
+				{
+					"genes" : ["Wt1+", "Upk3b+", "Rspo1+", "C2+", "Sbsn+", "Aldh1a2+", "Lrrn4+", "Cldn15+"],
+					"weight" : 1.0,
+					"comment" : "Markers inferred from Hurskainen et al. Nat. Commun. 2021 data and confirmed on Negretti et al. Development 2021 data"
+				}
+			]
+		},
+
+		{
+			"name" : "Pericyte",
+			"markers" : [
+				{
+					"genes" : ["Notch3+", "Heyl+", "Parm1+", "Ndufa4l2+", "Cox4i2+"],
+					"weight" : 1.0,
+					"comment" : "Markers inferred from Hurskainen et al. Nat. Commun. 2021 data and confirmed on Negretti et al. Development 2021 data"
+				}
+			],
+			"subtypes" : {
+				"title" : "Pericyte subtype markers",
+				"cell_types" : [
+					{
+						"name" : "Pericyte 1",
+						"markers" : [
+							{
+								"genes" : ["Gpc6+", "Cxcl12+", "Wisp2+", "Map3k7cl+"],
+								"weight" : 1.0,
+								"comment" : "Markers inferred from Hurskainen et al. Nat. Commun. 2021 data and confirmed on Negretti et al. Development 2021 data"
+							}
+						]
+					},
+					{
+						"name" : "Pericyte 2",
+						"markers" : [
+							{
+								"genes" : ["Higd1b+", "Pcdh18+", "Trpc6+", "Fam162b+", "Clstn2+"],
+								"weight" : 1.0,
+								"comment" : "Markers inferred from Hurskainen et al. Nat. Commun. 2021 data and confirmed on Negretti et al. Development 2021 data"
+							}
+						]
+					}
+				]
+			}
+		},
+
+		{
+			"name" : "Fibroblast",
+			"markers" : [
+				{
+					"genes" : ["Dpt+", "Clec3b+", "Pcolce2+", "Vegfd+", "Vcam1+"],
+					"weight" : 1.0,
+					"comment" : "Markers inferred from Hurskainen et al. Nat. Commun. 2021 data and confirmed on Negretti et al. Development 2021 data"
+				}
+			],
+			"subtypes" : {
+				"title" : "Fibro subtype markers",
+				"cell_types" : [
+					{
+						"name" : "Adventitial fibroblast",
+						"markers" : [
+							{
+								"genes" : ["Mfap5+", "Serpinf1+", "Abca8a+", "Twist2+"],
+								"weight" : 1.0,
+								"comment" : "Markers from Schupp et al. and Travaglini et al."
+							}
+						]
+					},
+					{
+						"name" : "Alveolar fibroblast",
+						"markers" : [
+							{
+								"genes" : ["Slit2+", "Col13a1+", "Wnt2+", "Slc38a5+", "Slc27a6+", "Frem1+"],
+								"weight" : 1.0,
+								"comment" : "Markers inferred from Hurskainen et al. Nat. Commun. 2021 data and confirmed on Negretti et al. Development 2021 data"
+							}
+						]
+					}
+				]
+			}
+		},
+
+		{
+			"name" : "Myofibroblast",
+			"markers" : [
+				{
+					"genes" : ["Egfem1+", "Agt+", "Prag1+", "Etv1+", "Trim67+"],
+					"weight" : 1.0,
+					"comment" : "Markers from Schupp et al. and Travaglini et al."
+				}
+			]
+		},
+
+		{
+			"name" : "Smooth muscle cell",
+			"markers" : [
+				{
+					"genes" : ["Tnnt2+", "Sgcg+", "Sntg2+", "Nrtn+", "Mrvi1+", "Sbspon+"],
+					"weight" : 1.0,
+					"comment" : "Markers inferred from Hurskainen et al. Nat. Commun. 2021 data"
+				}
+			]
+		},
+
+
+
+		{
+			"name" : "ILC2",
+			"markers" : [
+				{
+					"genes" : ["Gata3+", "Il1rl1+", "Arg1+", "Areg+", "Il2ra+", "Csf2+", "Ccl1+", "Ccdc184+", "Calca+", "Il5+"],
+					"weight" : 1.0,
+					"comment" : "Innate lymphoid cell type 2 markers inferred from Hurskainen et al. Nat. Commun. 2021 data"
+				}
+			]
+		}
+	]
+}
diff --git a/pegasus/data_files/human_lung.gmt b/pegasus/data_files/human_lung.gmt
index 726c485b..871c9dec 100644
--- a/pegasus/data_files/human_lung.gmt
+++ b/pegasus/data_files/human_lung.gmt
@@ -16,4 +16,4 @@ Ciliated	Ciliated cell markers from Schupp et al., Travaglini et al. and Tony et
 Club	Club cell markers from Schupp et al., Travaglini et al. and Tony et al.	SCGB3A2	MGP	VIM	CST3
 Goblet	Goblet cell markers from Schupp et al., Travaglini et al. and Tony et al.	MUC5AC	MUC5B	BPIFB1	MSMB	FAM3D	SERPINB11	CXCL6	SCGB1A1	FAM3D	SERPINB3
 Ionocyte	Ionocyte markers from Travaglini et al.	FOXI1	ASCL3	CLDN25	ATP6V1G3	LINC01187
-PNEC	Plumonary neuroendocrien cell markers from Travaglini et al.	CALCA	CHGA	ASCL1	SLC35D3	KIF1A
+PNEC	Pulmonary neuroendocrine cell markers from Travaglini et al.	CALCA	CHGA	ASCL1	SLC35D3	KIF1A
diff --git a/pegasus/data_files/mouse_liver.gmt b/pegasus/data_files/mouse_liver.gmt
index a4a5f070..ad62209e 100644
--- a/pegasus/data_files/mouse_liver.gmt
+++ b/pegasus/data_files/mouse_liver.gmt
@@ -7,17 +7,22 @@ Hepatocyte	Hepatocye	Acaa1b	Arg1	Sult2a8	Hgd	Otc
 Cholangiocyte	Cholangiocyte	Spp1	Ddit4l	Sox9	Fgfr3	Plet1
 HSPC	Hepatic stem and progenitor cell	Chrm3	Dmbt1	Slc4a4	Parm1	Pcdh11x	
 T	T cell	Cd3d	Cd3e	Cd3g	Trac	Cd28
-B	B cell	Cd19	Ms4a1	Cd79a	Cd79b	Ebf1	
+B	B cell	Cd19	Ms4a1	Cd79a	Cd79b	Ebf1	Pax5	Fcmr	Bank1
 NK	NK cell	Eomes	Cma1	Klra4	Klra7	Klra8
 ILC1	Innate lymphoid cell type 1	Xcl1	Cd160	Klrc1	Cd200r2	Gzmc
-cDC1	cDC1	Xcr1	Gcsam	Snx22	Rab7b	Ifi205
+cDC1	cDC1	Xcr1	Ifi205	Rab7b	Tlr3	Sept3	Hepacam2	Gcsam	Snx22	Itgae	Xlr
 cDC2	cDC2	Cd209a	Ltb4r1	Mgl2	Tnip3	Bex6
-Mig_cDC	Migoritory cDC	Cacnb3	Nudt17	Ccl22	Apol7c	Slco5a1
+migDC	Migratory DC	Cacnb3	Nudt17	Ccl22	Apol7c	Slco5a1	Ccr7	Fscn1	Il4i1	Mreg	Bcl2l14
 pDC	Plasmacytoid dendritic cell	Siglech	Ccr9	Cox6a2	Cd300c	Klk1
-MonoI	Inflammatory monocyte	Ly6c2	F13a1	Chil3	Ms4a4c	Ccr2
-MonoP	Patrolling monocyte	Ace	Eno3	Ear2	Treml4	Fabp4
+MonoI	Inflammatory monocyte	Ly6c2	F13a1	Ms4a4c	Ccr2	Gm9733	Mcub
+MonoP	Patrolling monocyte	Ace	Eno3	Ear2	Treml4	Spn	Fcgr4	Lair1	Cd300e	Cd300ld	Adgre4
 PeriMac	Peritoneal macrophage	Lyz1	Saa3	Prg4	Retnla	Cbr2
 Mac	Macrophage	Cd14	Ms4a7	Cx3cr1	Trem2	Hpgds
 Kupffer	Kupffer cell	Cd5l	Clec4f	Vsig4	Folr2	Timd4
-Neutrophil	Neutrophil	S100a8	S100a9	Retnlg	Mmp9	Clec4d
-Basophil	Basophil	Fcer1a	Cyp11a1	Cd200r3	Il6	Ms4a2
+Neutrophil	Neutrophil	S100a8	S100a9	Retnlg	Mmp9	Csf3r	Wfdc21	Il1r2	Cxcr2
+Basophil	Basophil	Cd200r3	Aqp9	Il6	Hgf	Adora2b	Il4	L1cam	Grm6
+Eosinophil	Eosinophil	Epx	Prg3	Eml5	Il5ra	Qsox2	L2hgdh
+Mast	Mast cell	Tph1	Clnk	Hs6st2	Plcg1
+Pericentral	Pericentral liver zonation markers from Halpern et al. Nature 2017 and Guilliams et al. Cell 2022	Mup11	Oat	Rgn	Glul	Cyp2e1	Axin2	Cyp1a2	Gstm3	Psmd4
+Periportal	Periportal liver zonation markers from Halpern et al. Nature 2017 and Guilliams et al. Cell 2022	Cyp2f2	Hal	Sds	Ass1	Asl	Alb	Arg1	Pck1	C2	Sdhd
+Midlobular	Mid-lobular liver zonation markers picked from Fig. 3 and Extended Data Fig 10a of Halpern et al. Nature 2017	Hamp	Igfbp2	Cyp8b1	Mup3	Hamp2	Hsbp8	Ces1d	Cebpa	Fkbp8	Clpp
diff --git a/pegasus/data_files/mouse_lung.gmt b/pegasus/data_files/mouse_lung.gmt
new file mode 100644
index 00000000..f3ab9d0f
--- /dev/null
+++ b/pegasus/data_files/mouse_lung.gmt
@@ -0,0 +1,38 @@
+AT1	AT1 markers from Hurskainen et al. Nat. Commun. 2021 and Negretti et al. Development 2021	Akap5	Rtkn2	Ndnf	Col4a3	Spock2
+AT2	AT2 markers from Hurskainen et al. Nat. Commun. 2021 and Negretti et al. Development 2021	Sftpc	Sftpa1	Lamp3	Hc	Slc34a2 
+Ciliated	Ciliated cell markers from Hurskainen et al. Nat. Commun. 2021 and Negretti et al. Development 2021	Dynlrb2	Tmem212	Foxj1	Ccdc153	Nme5
+Club	Club cell markers from Hurskainen et al. Nat. Commun. 2021 and Negretti et al. Development 2021	Scgb1a1	Scgb3a2	Cckar	Gabrp	Slc16a11
+Basal	Basal cell markers from Montoro et al. Nature 2018 Extended Data Fig. 1d	Aqp3	Krt5	Dapl1	Hspa1a	Trp63
+Goblet	Goblet cell markers from Montoro et al. Nature 2018 Supp Table 1	Scgb3a1	Muc5b	Serpinb11	Gp2	Dmbt1
+Tuft	Tuft cell markers from Sun et al. Dev. Cell 2022 and Montoro et al. Nature 2018 Extended Data Fig. 3b	Pou2f3	Ascl2	Dclk1	Lrmp	Ltc4s	Trpm5	Gnb3	Rgs13
+PNEC	Pulmonary neuroendocrine cell markers from Montoro et al. Nature 2018 Extended Data Fig. 3b & 3c	Ascl1	Chga	Calca	Scg2	Scg5
+Ionocyte	Ionocyte markers from Montoro et al. Nature 2018 Fig. 5a	Foxi1	Ascl3	Smbd1	Moxd1	Atp6v0d2
+Endothelial	Endothelial cell markers inferred from Hurskainen et al. Nat. Commun. 2021 data	Egfl7	Cldn5	Cdh5	Pecam1	Calcrl	Ecscr	Icam2
+Mesothelial	Mesothelial cell markers inferred from Hurskainen et al. Nat. Commun. 2021	Wt1	Upk3b	Rspo1	C2	Sbsn	Aldh1a2	Lrrn4	Cldn15
+Pericyte	Pericyte markers inferred from Hurskainen et al. Nat. Commun. 2021 data	Notch3	Heyl	Parm1	Ndufa4l2	Cox4i2
+Fibroblast	Fibroblast markers inferred from Hurskainen et al. Nat. Commun. 2021 data	Dpt	Clec3b	Pcolce2	Vegfd	Vcam1
+Myofibroblast	Myofibroblast markers inferred from Hurskainen et al. Nat. Commun. 2021 data	Egfem1	Agt	Prag1	Etv1	Trim67
+SMC	Smooth muscle cell markers inferred from Hurskainen et al. Nat. Commun. 2021 data	Tnnt2	Sgcg	Sntg2	Nrtn	Mrvi1	Sbspon
+ILC2	Innate lymphoid cell type 2 markers inferred from Hurskainen et al. Nat. Commun. 2021 data	Gata3	Il1rl1	Arg1	Areg	Il2ra	Csf2	Ccl1	Ccdc184	Calca	Il5
+B	B cell markers inferred from Kaptein et al. Cell 2022 and Hurskainen et al. Nat. Commun. 2021 data	Cd19	Ms4a1	Cd79a	Cd79b	Ebf1	Pax5	Fcmr	Bank1
+NK	NK cell markers inferred from Kaptein et al. Cell 2022 and Hurskainen et al. Nat. Commun. 2021 data	Gzma	Klrb1c	Ncr1	Klre1	Klrc2	Eomes	Cma1	Klra4	Klra7	Klra8
+cDC1	cDC1 markers inferred from Kaptein et al. Cell 2022 and Hurskainen et al. Nat. Commun. 2021 data	Xcr1	Ifi205	Rab7b	Tlr3	Sept3	Hepacam2	Gcsam	Snx22	Itgae	Xlr
+cDC2	cDC2 markers from Kaptein et al. Cell 2022	Cd209a	Ltb4r1	Mgl2	Tnip3	Bex6
+migDC	Migratory DC markers inferred from Kaptein et al. Cell 2022 and Hurskainen et al. Nat. Commun. 2021 data	Cacnb3	Nudt17	Ccl22	Apol7c	Slco5a1	Ccr7	Fscn1	Il4i1	Mreg	Bcl2l14
+pDC	Plasmacytoid dendritic cell markers from Kaptein et al. Cell 2022	Siglech	Ccr9	Cox6a2	Cd300c	Klk1
+MonoI	Inflammatory monocyte markers inferred from Kaptein et al. Cell 2022 and Hurskainen et al. Nat. Commun. 2021 data	Ly6c2	F13a1	Ms4a4c	Ccr2	Gm9733	Mcub 
+MonoP	Patrolling monocyte markers inferred from Kaptein et al. Cell 2022 and Hurskainen et al. Nat. Commun. 2021 data	Ace	Eno3	Ear2	Treml4	Spn	Fcgr4	Lair1	Cd300e	Cd300ld	Adgre4
+Neutrophil	Neutrophil markers inferred from Kaptein et al. Cell 2022 and Hurskainen et al. Nat. Commun. 2021 data	S100a8	S100a9	Retnlg	Mmp9	Csf3r	Wfdc21	Il1r2	Cxcr2
+Basophil	Basophil markers inferred from Matsumara et al. Nat. Commun. 2022 data	Cd200r3	Aqp9	Il6	Hgf	Adora2b	Il4	L1cam	Grm6
+Eosinophil	Eosinophil markers inferred from Matsumara et al. Nat. Commun. 2022 data	Epx	Prg3	Eml5	Il5ra	Qsox2	L2hgdh
+Mast	Mast cell markers inferred from Matsumara et al. Nat. Commun. 2022 data	Tph1	Clnk	Hs6st2	Plcg1
+
+
+
+Macrophage	Macro	CD68	CD163	C1QA	MRC1	MS4A6A	MSR1	MERTK
+T cell	T cell markers	CD3D	CD3E	CD3G	TRAC
+
+Plasma cell	Plasma cell markers	CD38	XBP1	CD27	SLAMF7	TNFRSF17	TNFRSF13B
+
+
+Neutrophil	Neutrophil markers	FUT4	MPO	CEACAM8	ELANE	CXCR1	CXCR2	LY6G6D
diff --git a/pegasus/tools/__init__.py b/pegasus/tools/__init__.py
index ac0f149e..3b5eebd3 100644
--- a/pegasus/tools/__init__.py
+++ b/pegasus/tools/__init__.py
@@ -56,7 +56,7 @@
     net_umap,
     net_fle,
 )
-from .diff_expr import de_analysis, markers, write_results_to_excel, run_de_analysis
+from .diff_expr import de_analysis, markers, write_results_to_excel, cluster_specific_markers, run_de_analysis
 from .gradient_boosting import find_markers, run_find_markers
 from .subcluster_utils import clone_subset
 from .signature_score import calc_signature_score, calculate_z_score
diff --git a/pegasus/tools/clustering.py b/pegasus/tools/clustering.py
index 89d41895..aa16291b 100644
--- a/pegasus/tools/clustering.py
+++ b/pegasus/tools/clustering.py
@@ -1,6 +1,7 @@
 import time
 import numpy as np
 import pandas as pd
+from pandas.api.types import is_categorical_dtype
 from pegasusio import MultimodalData
 from natsort import natsorted
 
@@ -647,7 +648,7 @@ def split_one_cluster(
     random_state: int = 0,
 ) -> None:
     """
-    Use Leiden algorithm to split 'clust_id' in 'clust_label' into 'n_components' clusters and write the new clusting results to 'res_label'. Assume 'clust_label' named clusters as numbers (in str format).
+    Use Leiden algorithm to split 'clust_id' in 'clust_label' into 'n_clust' sub-clusters and write the new clustering results to 'res_label'. The sub-cluster names are the concatenation of original cluster name and the subcluster id (e.g. 'T' -> 'T-1', 'T-2').
 
     Parameters
     ----------
@@ -664,7 +665,7 @@ def split_one_cluster(
         Split 'clust_id' into `n_clust' subclusters.
 
     res_label: `str`,
-        Write new clustering in data.obs['res_label']. The largest subcluster will use 'clust_id' as its cluster ID, while other subclusters will be numbered after existing clusters.
+        Write new clustering in data.obs['res_label']. The sub-cluster names are the concatenation of original cluster name and the subcluster id (e.g. 'T' -> 'T-1', 'T-2').
 
     rep: ``str``, optional, default: ``"pca"``
         The embedding representation used for Kmeans clustering. Keyword ``'X_' + rep`` must exist in ``data.obsm``. By default, use PCA coordinates.
@@ -689,16 +690,35 @@ def split_one_cluster(
     --------
     >>> pg.split_one_cluster(data, 'leiden_labels', '15', 2, 'leiden_labels_split')
     """
-    idx = np.where(data.obs[clust_label] == clust_id)[0]
+    cats = None
+    if is_categorical_dtype(data.obs[clust_label]):
+        cats = data.obs[clust_label].cat.categories.values
+    else:
+        cats = pd.Categorical(data.obs[clust_label]).categories.values
+        if cats.dtype.kind not in {'S', 'U'}:
+            cats = cats.astype(str)
+    idx_cat = np.nonzero(cats==clust_id)[0]
+
+    if idx_cat.size == 0:
+        raise ValueError(f"{clust_id} is not in {clust_label}!")
+    elif idx_cat.size > 1:
+        raise ValueError(f"Detected more than one categories in {clust_label} with name {clust_id}!")
+    else:
+        idx_cat = idx_cat[0]
+
+    idx = np.nonzero((data.obs[clust_label] == clust_id).values)[0]
     tmpdat = data[idx].copy()
     from pegasus.tools import neighbors
     neighbors(tmpdat, rep=rep, n_comps=n_comps, use_cache=False)
     leiden(tmpdat, rep=rep, resolution=None, n_clust=n_clust, random_state=random_state)
-    new_clust = data.obs[clust_label].values.astype(int)
-    new_label = new_clust.max() + 1
-    for label in tmpdat.obs['leiden_labels'].value_counts().index[1:]:
-        new_clust[idx[(tmpdat.obs['leiden_labels'] == label).values]] = new_label
-        new_label += 1
-    data.obs[res_label] = pd.Categorical(values = new_clust.astype(str), categories = np.array(range(1, new_label)).astype(str))
+
+    new_clust = data.obs[clust_label].values.astype(str)
+    cats_sub = []
+    for i, label in enumerate(tmpdat.obs['leiden_labels'].value_counts().index):
+        sub_id = f"{clust_id}-{i+1}"
+        new_clust[idx[(tmpdat.obs['leiden_labels'] == label).values]] = sub_id
+        cats_sub.append(sub_id)
+
+    data.obs[res_label] = pd.Categorical(values = new_clust, categories = np.concatenate((cats[0:idx_cat], np.array(cats_sub), cats[idx_cat+1:])))
     data.register_attr(res_label, "cluster")
     del tmpdat
diff --git a/pegasus/tools/diff_expr.py b/pegasus/tools/diff_expr.py
index 45461628..53366c93 100644
--- a/pegasus/tools/diff_expr.py
+++ b/pegasus/tools/diff_expr.py
@@ -419,7 +419,7 @@ def de_analysis(
     n_jobs: ``int``, optional, default: ``-1``
         Number of threads to use. If ``-1``, use all available threads.
 
-    t: ``bool``, optional, default: ``True``
+    t: ``bool``, optional, default: ``False``
         If ``True``, calculate Welch's t test.
 
     fisher: ``bool``, optional, default: ``False``
@@ -756,6 +756,63 @@ def add_worksheet(
     logger.info("Excel spreadsheet is written.")
 
 
+def cluster_specific_markers(
+    markers: Dict[str, Dict[str, pd.DataFrame]],
+    clust_id: str,
+    min_auroc: float = 0.7,
+    expected_pfc: float = 10.0,
+    n_lo: int = 25,
+    n_up: int = 50,
+) -> pd.DataFrame:
+    """ Extract cluster-specific markers from DE results ``markers``.
+
+    This function extracts cluster-specific markers (e.g. with auroc >= min_auroc and high in percentage fold change). The extracted markers can be screened for signatures representing the cluster.
+
+    The selection procedure is as follows: First, pick genes with AUROC >= min_auroc and pfc (percentage fold change) >= expected_pfc. If the number is between [n_lo, n_up], return the subset of markers containing only these genes. Otherwise, if the number < n_lo, extend the gene set to include up to n_lo genes in descending order of their pfc. If the number > n_up, truncate the set by keeping only n_up genes with highest pfc.
+
+    Parameters
+    ----------
+    markers: ``Dict[str, Dict[str, pd.DataFrame]]``
+        Markers from `de_analysis`.
+
+    clust_id: ``str``
+        Cluster ID to tell which cluster to focus on.
+
+    min_auroc: ``float``, optional, default: ``0.7``
+        Minimum AUROC for a gene.
+
+    expected_pfc: ``float``, optional, default: ``10.0``
+        Expected percentage fold change for a gene.
+
+    n_lo: ``int``, optional, default: ``25``
+        Lower bound (inclusive) on the number of genes to return.
+
+    n_up: ``int``, optional, default: ``50``
+        Upper bound (inclusive) on the number of genes to return.
+
+    Returns
+    -------
+    results: ``pd.DataFrame``
+        A Python dataframe containing selected markers, ranking in descending order with respect to AUROC.
+
+    Examples
+    --------
+    >>> candidates = pg.cluster_specific_markers(markers, 'Mono')
+    """
+    df = markers[clust_id]['up']
+    idx_auc = df['auroc'] >= min_auroc
+    idx_epf = df['percentage_fold_change'] >= expected_pfc
+    idx = idx_auc & idx_epf
+    n = idx.sum()
+    if n >= n_lo and n <= n_up:
+        return df[idx]
+    elif n < n_lo:
+        res = df[idx_auc].sort_values('percentage_fold_change', ascending=False)
+        return res.iloc[0:n_lo]
+    else:
+        return df[idx].iloc[0:n_up]
+
+
 @timer(logger=logger)
 def run_de_analysis(
     input_file: str,

From 8d2b3fb14b54b8e79b3a90f8b90efccc8f780edd Mon Sep 17 00:00:00 2001
From: Bo Li <li.bo@gene.com>
Date: Tue, 7 Mar 2023 18:27:24 -0800
Subject: [PATCH 11/57] Updated markers

---
 .../mouse_immune_cell_markers.json            | 15 +++++++++++--
 .../mouse_lung_cell_markers.json              | 22 +++++++++++++++++++
 pegasus/data_files/mouse_liver.gmt            |  2 +-
 pegasus/data_files/mouse_lung.gmt             | 13 +++--------
 pegasus/tools/diff_expr.py                    |  7 +++---
 5 files changed, 42 insertions(+), 17 deletions(-)

diff --git a/pegasus/annotate_cluster/mouse_immune_cell_markers.json b/pegasus/annotate_cluster/mouse_immune_cell_markers.json
index 240615d3..bb3ac649 100644
--- a/pegasus/annotate_cluster/mouse_immune_cell_markers.json
+++ b/pegasus/annotate_cluster/mouse_immune_cell_markers.json
@@ -5,9 +5,9 @@
 			"name" : "T cell",
 			"markers" : [
 				{
-					"genes" : ["Cd3d+", "Cd3e+", "Cd3g+", "Trac+", "Cd28+"],
+					"genes" : ["Cd3d+", "Cd3e+", "Lat+", "Thy1+", "Lef1+", "Trac+", "Cd28+"],
 					"weight" : 1.0,
-					"comment" : "T cell markers from Kaptein et al. Cell 2022"
+					"comment" : "T cell markers inferred from Kaptein et al. Cell 2022 and Hurskainen et al. Nat. Commun. 2021"
 				}
 			],
 			"subtypes" : {
@@ -67,6 +67,17 @@
 			]
 		},
 
+		{
+			"name" : "Immature B cell",
+			"markers" : [
+				{
+					"genes" : ["Tifa+", "Cecr2+", "Rag1+", "Atp1b1+", "Myb+", "Irf4+", "Fam129c+"],
+					"weight" : 1.0,
+					"comment" : "Immature B cell markers from Hurskainen et al. Nat. Commun. 2021"
+				}
+			]
+		},
+
 		{
 			"name" : "B cell",
 			"markers" : [
diff --git a/pegasus/annotate_cluster/mouse_lung_cell_markers.json b/pegasus/annotate_cluster/mouse_lung_cell_markers.json
index 2c81e13f..cb091482 100644
--- a/pegasus/annotate_cluster/mouse_lung_cell_markers.json
+++ b/pegasus/annotate_cluster/mouse_lung_cell_markers.json
@@ -287,5 +287,27 @@
 				}
 			]
 		},
+
+		{
+			"name" : "Alveolar macrophage",
+			"markers" : [
+				{
+					"genes" : ["Atp6v0d2+", "Olr1+", "F7+", "Ear1+", "Tfec+", "Gpnmb+", "Lrp12+", "Marco+"],
+					"weight" : 1.0,
+					"comment" : "Alveolar macrophage markers inferred from Hurskainen et al. Nat. Commun. 2021 data"
+				}
+			]
+		},
+
+		{
+			"name" : "Interstitial macrophage",
+			"markers" : [
+				{
+					"genes" : ["C1qa+", "C1qb+", "C1qc+", "Pf4+", "Ms4a7+", "Fcrls+"],
+					"weight" : 1.0,
+					"comment" : "Interstitial macrophage markers inferred from Hurskainen et al. Nat. Commun. 2021 data"
+				}
+			]
+		}
 	]
 }
diff --git a/pegasus/data_files/mouse_liver.gmt b/pegasus/data_files/mouse_liver.gmt
index ad62209e..d9c8bb4b 100644
--- a/pegasus/data_files/mouse_liver.gmt
+++ b/pegasus/data_files/mouse_liver.gmt
@@ -6,7 +6,7 @@ Fibro	Fibroblast	Col1a1	Mrc2	Plcxd3	Fndc1	Cpxm1
 Hepatocyte	Hepatocye	Acaa1b	Arg1	Sult2a8	Hgd	Otc
 Cholangiocyte	Cholangiocyte	Spp1	Ddit4l	Sox9	Fgfr3	Plet1
 HSPC	Hepatic stem and progenitor cell	Chrm3	Dmbt1	Slc4a4	Parm1	Pcdh11x	
-T	T cell	Cd3d	Cd3e	Cd3g	Trac	Cd28
+T	T cell	Cd3d	Cd3e	Lat	Thy1	Lef1	Trac	Cd28
 B	B cell	Cd19	Ms4a1	Cd79a	Cd79b	Ebf1	Pax5	Fcmr	Bank1
 NK	NK cell	Eomes	Cma1	Klra4	Klra7	Klra8
 ILC1	Innate lymphoid cell type 1	Xcl1	Cd160	Klrc1	Cd200r2	Gzmc
diff --git a/pegasus/data_files/mouse_lung.gmt b/pegasus/data_files/mouse_lung.gmt
index f3ab9d0f..0ed0bc5b 100644
--- a/pegasus/data_files/mouse_lung.gmt
+++ b/pegasus/data_files/mouse_lung.gmt
@@ -13,7 +13,10 @@ Pericyte	Pericyte markers inferred from Hurskainen et al. Nat. Commun. 2021 data
 Fibroblast	Fibroblast markers inferred from Hurskainen et al. Nat. Commun. 2021 data	Dpt	Clec3b	Pcolce2	Vegfd	Vcam1
 Myofibroblast	Myofibroblast markers inferred from Hurskainen et al. Nat. Commun. 2021 data	Egfem1	Agt	Prag1	Etv1	Trim67
 SMC	Smooth muscle cell markers inferred from Hurskainen et al. Nat. Commun. 2021 data	Tnnt2	Sgcg	Sntg2	Nrtn	Mrvi1	Sbspon
+AlvMf	Alveolar macrophage markers inferred from Hurskainen et al. Nat. Commun. 2021	Atp6v0d2	Olr1	F7	Ear1	Tfec	Gpnmb	Lrp12	Marco
+IntMf	Interstitial macrophage markers inferred from Hurskainen et al. Nat. Commun. 2021	C1qa	C1qb	C1qc	Pf4	Ms4a7	Fcrls
 ILC2	Innate lymphoid cell type 2 markers inferred from Hurskainen et al. Nat. Commun. 2021 data	Gata3	Il1rl1	Arg1	Areg	Il2ra	Csf2	Ccl1	Ccdc184	Calca	Il5
+T	T cell markers inferred from Kaptein et al. Cell 2022 and Hurskainen et al. Nat. Commun. 2021 data	Cd3d	Cd3e	Lat	Thy1	Lef1	Trac	Cd28
 B	B cell markers inferred from Kaptein et al. Cell 2022 and Hurskainen et al. Nat. Commun. 2021 data	Cd19	Ms4a1	Cd79a	Cd79b	Ebf1	Pax5	Fcmr	Bank1
 NK	NK cell markers inferred from Kaptein et al. Cell 2022 and Hurskainen et al. Nat. Commun. 2021 data	Gzma	Klrb1c	Ncr1	Klre1	Klrc2	Eomes	Cma1	Klra4	Klra7	Klra8
 cDC1	cDC1 markers inferred from Kaptein et al. Cell 2022 and Hurskainen et al. Nat. Commun. 2021 data	Xcr1	Ifi205	Rab7b	Tlr3	Sept3	Hepacam2	Gcsam	Snx22	Itgae	Xlr
@@ -26,13 +29,3 @@ Neutrophil	Neutrophil markers inferred from Kaptein et al. Cell 2022 and Hurskai
 Basophil	Basophil markers inferred from Matsumara et al. Nat. Commun. 2022 data	Cd200r3	Aqp9	Il6	Hgf	Adora2b	Il4	L1cam	Grm6
 Eosinophil	Eosinophil markers inferred from Matsumara et al. Nat. Commun. 2022 data	Epx	Prg3	Eml5	Il5ra	Qsox2	L2hgdh
 Mast	Mast cell markers inferred from Matsumara et al. Nat. Commun. 2022 data	Tph1	Clnk	Hs6st2	Plcg1
-
-
-
-Macrophage	Macro	CD68	CD163	C1QA	MRC1	MS4A6A	MSR1	MERTK
-T cell	T cell markers	CD3D	CD3E	CD3G	TRAC
-
-Plasma cell	Plasma cell markers	CD38	XBP1	CD27	SLAMF7	TNFRSF17	TNFRSF13B
-
-
-Neutrophil	Neutrophil markers	FUT4	MPO	CEACAM8	ELANE	CXCR1	CXCR2	LY6G6D
diff --git a/pegasus/tools/diff_expr.py b/pegasus/tools/diff_expr.py
index 53366c93..f83c01e1 100644
--- a/pegasus/tools/diff_expr.py
+++ b/pegasus/tools/diff_expr.py
@@ -806,11 +806,10 @@ def cluster_specific_markers(
     n = idx.sum()
     if n >= n_lo and n <= n_up:
         return df[idx]
-    elif n < n_lo:
-        res = df[idx_auc].sort_values('percentage_fold_change', ascending=False)
-        return res.iloc[0:n_lo]
     else:
-        return df[idx].iloc[0:n_up]
+        res = df[idx_auc].sort_values('percentage_fold_change', ascending=False)
+        res = res.iloc[0:(n_lo if n < n_lo else n_up)].sort_values('auroc', ascending=False)
+        return res
 
 
 @timer(logger=logger)

From e312855c42c0489ffa4d679cb6c5f6aa6ac0f138 Mon Sep 17 00:00:00 2001
From: Yiming Yang <yang.yihming@gmail.com>
Date: Fri, 7 Apr 2023 17:02:56 -0700
Subject: [PATCH 12/57] fix divide by zero issue in quantile normalization step
 of integrative_nmf

---
 pegasus/tools/nmf.py | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/pegasus/tools/nmf.py b/pegasus/tools/nmf.py
index a37399d2..4cc6f270 100644
--- a/pegasus/tools/nmf.py
+++ b/pegasus/tools/nmf.py
@@ -406,7 +406,12 @@ def integrative_nmf(
     seeds = rg.integers(4294967295, size=nbatch)
     ref_batch = max_size = -1
     for i in range(nbatch):
-        H_new = np.ascontiguousarray(Hs[i] / np.linalg.norm(Hs[i], axis=0), dtype=np.float32) # Scale H
+        h_norm = np.linalg.norm(Hs[i], axis=0)
+        idx_h_zeros = np.where(h_norm==0)[0]
+        if idx_h_zeros.size > 0:
+            # Set norm 0 to 1 to avoid divide by zero issue
+            h_norm[idx_h_zeros] = 1.0
+        H_new = np.ascontiguousarray(Hs[i] / h_norm, dtype=np.float32) # Scale H
         Hs_new.append(H_new) # Append scaled H
 
         if not quantile_norm:

From 8f29e1a9c154b0c08990a1fa17cc626a4f8373c2 Mon Sep 17 00:00:00 2001
From: Bo Li <li.bo@gene.com>
Date: Mon, 10 Apr 2023 00:02:08 -0700
Subject: [PATCH 13/57] Updated human lung and immune markers

---
 pegasus/annotate_cluster/annotate_cluster.py  |   2 +
 .../human_immune_cell_markers.json            |   4 +-
 .../human_lung_cell_markers.json              | 135 +++++++++---------
 pegasus/tools/utils.py                        |   1 +
 4 files changed, 71 insertions(+), 71 deletions(-)

diff --git a/pegasus/annotate_cluster/annotate_cluster.py b/pegasus/annotate_cluster/annotate_cluster.py
index 4b0b6fce..22caea43 100644
--- a/pegasus/annotate_cluster/annotate_cluster.py
+++ b/pegasus/annotate_cluster/annotate_cluster.py
@@ -279,6 +279,7 @@ def infer_cell_types(
             * ``'human_brain'`` for human brain cells;
             * ``'mouse_brain'`` for mouse brain cells;
             * ``'human_lung'`` for human lung cells;
+            * ``'mouse_lung'`` for mouse lung cells;
             * ``'mouse_liver'`` for mouse liver cells.
         * If ``Dict``, it refers to a Python dictionary describing the markers.
 
@@ -321,6 +322,7 @@ def infer_cell_types(
         human_brain="human_brain_cell_markers.json",
         mouse_brain="mouse_brain_cell_markers.json",
         human_lung="human_lung_cell_markers.json",
+        mouse_lung="mouse_lung_cell_markers.json",
         mouse_liver="mouse_liver_cell_markers.json",
     )
 
diff --git a/pegasus/annotate_cluster/human_immune_cell_markers.json b/pegasus/annotate_cluster/human_immune_cell_markers.json
index cc153e55..94e21464 100644
--- a/pegasus/annotate_cluster/human_immune_cell_markers.json
+++ b/pegasus/annotate_cluster/human_immune_cell_markers.json
@@ -305,9 +305,9 @@
 			"name" : "Plasmacytoid dendritic cell",
 			"markers" : [
 				{
-					"genes" : ["JCHAIN+", "LILRA4+", "GZMB+", "MZB1+", "IL3RA+", "SERPINF1+", "ITM2C+", "IRF7+"],
+					"genes" : ["LILRA4+", "SERPINF1+", "IL3RA+", "TPM2+", "SCT+", "UGCG+", "CLEC4C+", "LRRC26+", "SMPD3+", "AC119428.2+"],
 					"weight" : 1.0,
-					"comment" : "important pDC markers"
+					"comment" : "Markers derived from Immune Cell Atlas PBMC, BM and CB data"
 				}
 			]
 		},
diff --git a/pegasus/annotate_cluster/human_lung_cell_markers.json b/pegasus/annotate_cluster/human_lung_cell_markers.json
index c787456e..c2e9f6c4 100644
--- a/pegasus/annotate_cluster/human_lung_cell_markers.json
+++ b/pegasus/annotate_cluster/human_lung_cell_markers.json
@@ -102,8 +102,6 @@
 
 
 
-
-
 		{
 			"name" : "Vascular endothelial cell",
 			"markers" : [
@@ -122,22 +120,22 @@
 				"title" : "Vascular endothelial cell subtype markers",
 				"cell_types" : [
 					{
-						"name" : "Aerocyte",
+						"name" : "EC artery",
 						"markers" : [
 							{
-								"genes" : ["EDNRB+", "TBX2+", "EDA+", "HPGD+", "PRKG1+", "RCSD1+", "CYP3A5+", "VWF-"],
+								"genes" : ["CXCL12+", "GJA5+", "DKK2+", "HEY1+", "IGFBP3+", "SERPINE2+", "EFNB2+", "BMX+"],
 								"weight" : 1.0,
-								"comment" : "Markers inferred from Xing et al. Science Advances 2021, Tony et al. Nature 2021 and Schupp et al. Circulation 2021"
+								"comment" : "Markers inferred from Travaglini et al. Nature 2020 and confirmed by Schupp et al. Circulation 2021"
 							}
 						]
 					},
 					{
-						"name" : "EC general capillary",
+						"name" : "EC vein",
 						"markers" : [
 							{
-								"genes" : ["VWF+", "EDN1+", "FCN3+", "CD36+", "GPIHBP1+", "NRXN3+", "BTNL8+"],
+								"genes" : ["CPE+", "C7+", "IL1R1+", "PLA1A+", "PTGIS+", "ABI3BP+", "CYP1B1+", "ADGRG6+"],
 								"weight" : 1.0,
-								"comment" : "Markers inferred from Xing et al. Science Advances 2021, Tony et al. Nature 2021 and Schupp et al. Circulation 2021"
+								"comment" : "Markers inferred from Travaglini et al. Nature 2020 and confirmed by Schupp et al. Circulation 2021"
 							}
 						]
 					},
@@ -145,52 +143,45 @@
 						"name" : "EC bronchial vessel",
 						"markers" : [
 							{
-								"genes" : ["SPRY1+", "PLVAP+", "VWA1+", "MPZL2+", "ESM1+"],
+								"genes" : ["SPRY1+", "PLVAP+", "VWA1+", "ABCB1+", "COL15A1+", "RUNDC3B+"],
+								"weight" : 1.0,
+								"comment" : "Markers inferred from Travaglini et al. Nature 2020 and confirmed by Schupp et al. Circulation 2021"
+							}
+						]
+					},
+					{
+						"name" : "Aerocyte",
+						"markers" : [
+							{
+								"genes" : ["HPGD+", "EDNRB+", "SOSTDC1+", "B3GALNT1+", "CYP3A5+", "TBX2+", "S100A3+", "IL1RL1+", "PRKG1+", "EXPH5+"],
 								"weight" : 1.0,
-								"comment" : "Markers from Travaglini et al. Nature 2020"
+								"comment" : "Markers inferred from Travaglini et al. Nature 2020 and confirmed by Schupp et al. Circulation 2021"
 							}
 						]
 					},
 					{
-						"name" : "EC pulmonary-venous",
+						"name" : "EC general capillary",
 						"markers" : [
 							{
-								"genes" : ["COL15A1+", "ZNF385D+", "EBF1+", "CPXM2+", "PLVAP+", "VWA1+", "SPRY1+"],
+								"genes" : ["FCN3+", "IL7R+", "EDN1+", "GPIHBP1+", "SLC6A4+", "NTRK2+", "IL18R1+", "NRXN3+"],
 								"weight" : 1.0,
-								"comments" : "Markers inferred from Xing et al. Science Advances 2021, Tony et al. Nature 2021 and Schupp et al. Circulation 2021"
+								"comment" : "Markers inferred from Travaglini et al. Nature 2020 and confirmed by Schupp et al. Circulation 2021"
 							}
 						]
 					},
 					{
-						"name" : "EC systemic-venous",
+						"name" : "EC lymphatic",
 						"markers" : [
 							{
-								"genes" : ["COL15A1-", "CPE+", "DKK3+", "EFEMP1+", "CDH11+", "PLAT+"],
+								"genes" : ["CCL21+", "TFF3+", "PDPN+", "PROX1+", "GPM6A+", "SEMA3D+", "TBX1+", "RELN+"],
 								"weight" : 1.0,
-								"comments" : "Markers inferred from Xing et al. Science Advances 2021, Tony et al. Nature 2021 and Schupp et al. Circulation 2021"
+								"comment" : "Markers inferred from Travaglini et al. Nature 2020 and confirmed by Schupp et al. Circulation 2021"
 							}
 						]
 					}
 				]
 			}
 		},
-		{
-			"name" : "Lymphatic endothelial cell",
-			"markers" : [
-				{
-					"genes" : ["PECAM1+", "CLDN5+", "CDH5+", "ERG+"],
-					"weight" : 0.2,
-					"comment" : "Markers for endothelial cells, from Schupp et al. Circulation 2021"
-				},
-				{
-					"genes" : ["CCL21+", "SEMA3D+", "PROX1+", "PDPN+", "MMRN1+", "RELN+", "PKHD1L1+", "TFF3+", "LYVE1+", "FLT4+", "TBX1+"],
-					"weight" : 0.8,
-					"comment" : "Lymphatic-specific markers, from Schupp et al. Circulation 2021"
-				}
-			]
-		},
-
-
 
 
 
@@ -198,26 +189,21 @@
 			"name" : "Smooth muscle cell",
 			"markers" : [
 				{
-					"genes" : ["MYH11+", "TAGLN+", "ACTG2+", "CNN1+", "PLN+"],
-					"weight" : 0.8,
-					"comment" : "Markers from Muus et al., Braga et al. and Schupp et al."
-				},
-				{
-					"genes" : ["MYL9+", "TPM2+", "ACTA2+"],
-					"weight" : 0.2,
-					"comment" : "Markers that might also expressed in other stromal cell types"
+					"genes" : ["MYH11+", "ACTG2+", "CNN1+", "PLN+"],
+					"weight" : 1.0,
+					"comment" : "Markers inferred from Travaglini et al. Nature 2020 and confirmed by Schupp et al. Circulation 2021"
 				}
 			],
 			"subtypes" : {
 				"title" : "SMC subtype markers",
 				"cell_types" : [
 					{
-						"name" : "DES+",
+						"name" : "Airway smooth muscle cell",
 						"markers" : [
 							{
-								"genes" : ["DES+"],
+								"genes" : ["DES+", "TNNT2+", "RERGL+"],
 								"weight" : 1.0,
-								"comment" : "DES+ SMC"
+								"comment" : "Markers inferred from Travaglini et al. Nature 2020"
 							}
 						]
 					}
@@ -229,21 +215,10 @@
 			"name" : "Pericyte",
 			"markers" : [
 				{
-					"genes" : ["TRPC6+", "CSPG4+", "FAM162B+", "GJA4+", "GJC1+", "HIGD1B+", "CDH6+", "LAMC3+", "FHL5+"],
-					"weight" : 0.8,
-					"comment" : "Markers from Schupp et al. Circulation 2021 and Travaglini et al. Nature 2020"
-				},
-				{
-					"genes" : ["PDGFRB+", "TBX2+", "EBF1+"],
-					"weight" : 0.1,
-					"comment" : "Markers that are highly expressed in Pericytes but also expressed in fibroblast"
-				},
-				{
-					"genes" : [ "LGI4+", "KCNK17+", "CACNA1H+", "PTN+", "TESC+"],
-					"weight" : 0.1,
-					"comment" : "Markers that are lowly expressed"
+					"genes" : ["COX4I2+", "HIGD1B+", "NDUFA4L2+", "FAM162B+", "LAMC3+", "KCNK3+", "GJA4+", "GJC1+", "CSPG4+"],
+					"weight" : 1.0,
+					"comment" : "Markers inferred from Travaglini et al. Nature 2020 and confirmed by Schupp et al. Circulation 2021"
 				}
-
 			]
 		},
 
@@ -251,15 +226,15 @@
 			"name" : "Mesothelial cell",
 			"markers" : [
 				{
-					"genes" : ["WT1+", "VIPR2+", "ITLN1+", "LINC02360+", "BNC1+",  "AP000561.1+", "CALB2+", "HAS1+", "LINC01133+", "GALNT9+"],
+					"genes" : ["CPA4+", "ITLN1+", "GALNT9+", "BNC1+", "CALB2+", "WT1+", "UPK3B+"],
 					"weight" : 1.0,
-					"comment" : "Markers from Schupp et al. and Travaglini et al."
+					"comment" : "Markers inferred from Travaglini et al. Nature 2020 and confirmed by Schupp et al. Circulation 2021"
 				}
 			]
 		},
 
 		{
-			"name" : "Fibroblast/Myofibroblast",
+			"name" : "Fibroblast",
 			"markers" : [
 				{
 					"genes" : ["COL1A1+", "COL1A2+", "PDGFRA+", "ELN+", "BGN+"],
@@ -268,15 +243,15 @@
 				}
 			],
 			"subtypes" : {
-				"title" : "Fibro/Myofib subtype markers",
+				"title" : "Fibroblast subtype markers",
 				"cell_types" : [
 					{
 						"name" : "Adventitial fibroblast",
 						"markers" : [
 							{
-								"genes" : ["PTGIS+", "SFRP2+", "PDGFRL+", "SCARA5+", "MFAP5+", "PI16+", "AOX1+", "GAS1+", "IGFBP6+", "CXCL14+"],
+								"genes" : ["SFRP2+", "SFRP4+", "PDGFRL+", "PI16+",  "MFAP5+", "SCARA5+"],
 								"weight" : 1.0,
-								"comment" : "Markers from Schupp et al. and Travaglini et al."
+								"comment" : "Markers inferred from Travaglini et al. Nature 2020 and confirmed by Schupp et al. Circulation 2021"
 							}
 						]
 					},
@@ -284,24 +259,46 @@
 						"name" : "Alveolar fibroblast",
 						"markers" : [
 							{
-								"genes" : ["NKD1+", "FGFR4+", "GPM6B+", "SPINT2+", "SCN7A+", "TCF21+", "CAMK2N1+", "ADAMTS8+"],
+								"genes" : ["GPC3+", "FMO2+", "SCN7A+", "FGFR4+", "NKD2+", "ADAMTS8+"],
 								"weight" : 1.0,
-								"comment" : "Markers from Schupp et al. and Travaglini et al."
+								"comment" : "Markers inferred from Travaglini et al. Nature 2020 and confirmed by Schupp et al. Circulation 2021"
 							}
 						]
 					},
 					{
-						"name" : "Myofibroblast",
+						"name" : "Lipofibroblast",
 						"markers" : [
 							{
-								"genes" : ["ACTA2+", "MYL9+", "MT2A+", "EEF1A1+", "TMSB10+", "FAU+", "UBA52+", "SERF2+", "PTMA+", "S100A6+"],
+								"genes" : ["MLLT11+", "HAS2+", "SEMA6A+", "LONRF2+", "HOMER1+", "PWWP3B+"],
 								"weight" : 1.0,
-								"comment" : "Markers from Schupp et al. and Travaglini et al."
+								"comment" : "Markers inferred from Travaglini et al. Nature 2020"
 							}
 						]
 					}
 				]
 			}
+		},
+
+		{
+			"name" : "Myofibroblast",
+			"markers" : [
+				{
+					"genes" : ["ASPN+", "SCARA3+", "WIF1+", "ANGPTL2+", "ITGBL1+"],
+					"weight" : 1.0,
+					"comment" : "Markers inferred from Travaglini et al. Nature 2020"
+				}
+			]
+		},
+
+		{
+			"name" : "Fibromyocyte",
+			"markers" : [
+				{
+					"genes" : ["SBSPON+", "SCX+", "GREM2+", "KCNMB1+", "LGR6+"],
+					"weight" : 1.0,
+					"comment" : "Markers inferred from Travaglini et al. Nature 2020"
+				}
+			]
 		}
 	]
 }
diff --git a/pegasus/tools/utils.py b/pegasus/tools/utils.py
index 09aa0f69..cd3bbf69 100644
--- a/pegasus/tools/utils.py
+++ b/pegasus/tools/utils.py
@@ -188,6 +188,7 @@ def check_batch_key(data: Union[MultimodalData, UnimodalData], batch: str, warni
     apoptosis_human=pkg_resources.resource_filename("pegasus", "data_files/apoptosis_human.gmt"),
     apoptosis_mouse=pkg_resources.resource_filename("pegasus", "data_files/apoptosis_mouse.gmt"),
     human_lung=pkg_resources.resource_filename("pegasus", "data_files/human_lung.gmt"),
+    mouse_lung=pkg_resources.resource_filename("pegasus", "data_files/mouse_lung.gmt"),
     mouse_brain=pkg_resources.resource_filename("pegasus", "data_files/mouse_brain.gmt"),
     mouse_liver=pkg_resources.resource_filename("pegasus", "data_files/mouse_liver.gmt"),
     emt_human=pkg_resources.resource_filename("pegasus", "data_files/emt_human.gmt"),

From ea404155c461170f43bb4af21554564321c54c96 Mon Sep 17 00:00:00 2001
From: Yiming Yang <yang.yihming@gmail.com>
Date: Tue, 11 Apr 2023 14:37:51 -0700
Subject: [PATCH 14/57] Make compatible with Pandas 2.0

---
 pegasus/tools/preprocessing.py   | 7 +++----
 pegasus/tools/signature_score.py | 2 +-
 2 files changed, 4 insertions(+), 5 deletions(-)

diff --git a/pegasus/tools/preprocessing.py b/pegasus/tools/preprocessing.py
index 71c78b3b..d3c1d6c8 100644
--- a/pegasus/tools/preprocessing.py
+++ b/pegasus/tools/preprocessing.py
@@ -276,10 +276,9 @@ def _run_filter_data(
 
         if output_filt is not None:
             group_key = unidata.get_uid()
-            writer = pd.ExcelWriter(f"{output_filt}.{group_key}.filt.xlsx", engine="xlsxwriter")
-            df_cells = get_filter_stats(unidata, min_genes_before_filt = min_genes_before_filt)
-            df_cells.to_excel(writer, sheet_name="Cell filtration stats")
-            writer.save()
+            with pd.ExcelWriter(f"{output_filt}.{group_key}.filt.xlsx", engine="xlsxwriter") as writer:
+                df_cells = get_filter_stats(unidata, min_genes_before_filt = min_genes_before_filt)
+                df_cells.to_excel(writer, sheet_name="Cell filtration stats")
             logger.info(f"Filtration results for {group_key} are written.")
 
         if plot_filt is not None:
diff --git a/pegasus/tools/signature_score.py b/pegasus/tools/signature_score.py
index b41071c0..5030162f 100644
--- a/pegasus/tools/signature_score.py
+++ b/pegasus/tools/signature_score.py
@@ -30,7 +30,7 @@ def _check_and_calc_sig_background(data: UnimodalData, n_bins: int) -> bool:
             bins = pd.qcut(mean_vec, n_bins, duplicates = "drop")
         if bins.value_counts().min() == 1:
             logger.warning("Detected bins with only 1 gene!")
-        bins.categories = bins.categories.astype(str)
+        bins = bins.rename_categories(dict(zip(bins.categories, bins.categories.astype(str))))
         data.var["bins"] = bins
 
         # calculate background expectations

From d875435004bba29e68aeb77f2bb955fa09434321 Mon Sep 17 00:00:00 2001
From: Yiming Yang <yang.yihming@gmail.com>
Date: Tue, 18 Apr 2023 18:08:08 -0700
Subject: [PATCH 15/57] Change string type from fixed-length to var-length

---
 pegasus/tools/clustering.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pegasus/tools/clustering.py b/pegasus/tools/clustering.py
index aa16291b..432129e5 100644
--- a/pegasus/tools/clustering.py
+++ b/pegasus/tools/clustering.py
@@ -712,7 +712,7 @@ def split_one_cluster(
     neighbors(tmpdat, rep=rep, n_comps=n_comps, use_cache=False)
     leiden(tmpdat, rep=rep, resolution=None, n_clust=n_clust, random_state=random_state)
 
-    new_clust = data.obs[clust_label].values.astype(str)
+    new_clust = data.obs[clust_label].values.astype(object)
     cats_sub = []
     for i, label in enumerate(tmpdat.obs['leiden_labels'].value_counts().index):
         sub_id = f"{clust_id}-{i+1}"

From 3b252a34f5ded402231c471a415de0df04e66728 Mon Sep 17 00:00:00 2001
From: Bo Li <li.bo@gene.com>
Date: Tue, 18 Apr 2023 23:50:24 -0700
Subject: [PATCH 16/57] Updated neutrophil markers

---
 pegasus/annotate_cluster/human_immune_cell_markers.json | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/pegasus/annotate_cluster/human_immune_cell_markers.json b/pegasus/annotate_cluster/human_immune_cell_markers.json
index 94e21464..eb601134 100644
--- a/pegasus/annotate_cluster/human_immune_cell_markers.json
+++ b/pegasus/annotate_cluster/human_immune_cell_markers.json
@@ -369,9 +369,9 @@
 			"name" : "Neutrophil",
 			"markers" : [
 				{
-					"genes" : ["FUT4+", "MPO+", "CEACAM8+", "ELANE+", "CXCR1+", "CXCR2+", "LY6G6D+"],
+					"genes" : ["KCNJ15+", "IL1R2+", "LUCAT1+", "G0S2+", "TREM1+", "CSF3R+", "FCGR3B+", "CXCR1+", "CXCR2+"],
 					"weight" : 1.0,
-					"comment" : "key markers"
+					"comment" : "Neutrophil markers validated using 10x public whole blood dataset."
 				}
 			]
 		},

From e2167d0c1c249a2ebb5c787f86aa83b01c3829e7 Mon Sep 17 00:00:00 2001
From: Bo Li <li.bo@gene.com>
Date: Sun, 30 Apr 2023 23:16:45 -0700
Subject: [PATCH 17/57] Updated neutrophil markers; added Pro-/Pre-Neutrophil, Basophil and lung markers

---
 .../human_immune_cell_markers.json            | 37 +++++++++++++++-
 .../human_lung_cell_markers.json              | 43 ++++++++++++-------
 pegasus/data_files/human_lung.gmt             | 21 +++++----
 pegasus/tools/scvitools.py                    |  5 +++
 4 files changed, 79 insertions(+), 27 deletions(-)

diff --git a/pegasus/annotate_cluster/human_immune_cell_markers.json b/pegasus/annotate_cluster/human_immune_cell_markers.json
index eb601134..b849947f 100644
--- a/pegasus/annotate_cluster/human_immune_cell_markers.json
+++ b/pegasus/annotate_cluster/human_immune_cell_markers.json
@@ -365,13 +365,46 @@
 			]
 		},
 
+		{
+			"name" : "Pro-Neutrophil",
+			"markers" : [
+				{
+					"genes" : ["DEFA3+", "DEFA4+", "AZU1+", "MS4A3+", "ELANE+", "SLPI+", "CEACAM6+", "RNASE3+", "PRTN3+", "MPO+", "AC104232.1+", "CTSG+"],
+					"weight" : 1.0,
+					"comment" : "Pro-Neutrophil markers validated using 10x public whole blood dataset"
+				}
+			]
+		},
+
+		{
+			"name" : "Pre-Neutrophil",
+			"markers" : [
+				{
+					"genes" : ["LTF+", "LCN2+", "MMP8+", "CRISP3+", "CAMP+", "PGLYRP1+", "CD177+", "HP+"],
+					"weight" : 1.0,
+					"comment" : "Pre-Neutrophil markers validated using 10x public whole blood dataset"
+				}
+			]
+		},
+
 		{
 			"name" : "Neutrophil",
 			"markers" : [
 				{
-					"genes" : ["KCNJ15+", "IL1R2+", "LUCAT1+", "G0S2+", "TREM1+", "CSF3R+", "FCGR3B+", "CXCR1+", "CXCR2+"],
+					"genes" : ["CSF3R+", "G0S2+", "LUCAT1+", "EPHB1+", "TNFRSF10C+", "IL1R2+", "KCNJ15+", "FCGR3B+", "AC007032.1+", "HSD11B1-AS1+"],
+					"weight" : 1.0,
+					"comment" : "Neutrophil markers validated using 10x public whole blood dataset"
+				}
+			]
+		},
+
+		{
+			"name" : "Basophil",
+			"markers" : [
+				{
+					"genes" : ["AKAP12+", "HDC+", "GATA2+", "ENPP3+", "CA8+", "ITGB8+", "GCSAML+", "CRPPA+", "AC111000.4+", "LINC02223+"],
 					"weight" : 1.0,
-					"comment" : "Neutrophil markers validated using 10x public whole blood dataset."
+					"comment" : "Basophil markers validated using 10x public whole blood dataset"
 				}
 			]
 		},
diff --git a/pegasus/annotate_cluster/human_lung_cell_markers.json b/pegasus/annotate_cluster/human_lung_cell_markers.json
index c2e9f6c4..9166aa1a 100644
--- a/pegasus/annotate_cluster/human_lung_cell_markers.json
+++ b/pegasus/annotate_cluster/human_lung_cell_markers.json
@@ -5,9 +5,9 @@
 			"name" : "Alveolar type I cell",
 			"markers" : [
 				{
-					"genes" : ["AGER+", "CAV1+", "RTKN2+", "MYL9+", "SPOCK2+", "ANXA3+", "TIMP3+", "CAV2+", "ST6GALNAC5+", "MYRF+"],
+					"genes" : ["AGER+", "SPOCK2+", "RTKN2+", "TNNC1+", "SCEL+", "CLIC5+", "NCKAP5+", "ARHGEF26+", "GGTLC1+", "ITLN2+", "MS4A15+"],
 					"weight" : 1.0,
-					"comment" : "AT1 markers from Schupp et al., Travaglini et al. and Tony et al."
+					"comment" : "Markers inferred from Travaglini et al. Nature 2020 and confirmed by Schupp et al. Circulation 2021"
 				}
 			]
 		},
@@ -16,9 +16,9 @@
 			"name" : "Alveolar type II cell",
 			"markers" : [
 				{
-					"genes" : ["SFTPA1+", "SFTPA2+", "SFTPC+", "ETV5+", "TTN+", "PLA2G4F+", "CCDC141+", "LAMP3+", "ABCA3+", "HHIP+"],
+					"genes" : ["SFTPA1+", "SFTPA2+", "SFTPC+", "PGC+", "LAMP3+", "FASN+", "HHIP+", "ETV5+", "RASGRF1+", "ABCA3+"],
 					"weight" : 1.0,
-					"comment" : "AT2 markers from Schupp et al., Travaglini et al. and Tony et al."
+					"comment" : "Markers inferred from Travaglini et al. Nature 2020 and confirmed by Schupp et al. Circulation 2021"
 				}
 			]
 		},
@@ -27,9 +27,9 @@
 			"name" : "Basal cell",
 			"markers" : [
 				{
-					"genes" : ["KRT5+", "KRT15+", "KRT17+", "TP63+", "S100A2+", "TNS4+"],
+					"genes" : ["KRT17+", "S100A2+", "MIR205HG+", "KRT15+", "KRT5+", "DLK2+", "CDH3+", "TP63+", "TNS4+"],
 					"weight" : 1.0,
-					"comment" : "Basal cell markers from Schupp et al., Travaglini et al. and Tony et al."
+					"comment" : "Markers inferred from Travaglini et al. Nature 2020 and confirmed by Schupp et al. Circulation 2021"
 				}
 			]
 		},
@@ -38,9 +38,9 @@
 			"name" : "Club cell",
 			"markers" : [
 				{
-					"genes" : ["SCGB3A2+", "MGP+", "VIM+", "CST3+"],
+					"genes" : ["SCGB3A2+", "MGP+", "CTSE+"],
 					"weight" : 1.0,
-					"comment" : "Club cell markers from Schupp et al., Travaglini et al. and Tony et al."
+					"comment" : "Markers inferred from Travaglini et al. Nature 2020 and confirmed by Schupp et al. Circulation 2021"
 				}
 			]
 		},
@@ -49,9 +49,9 @@
 			"name" : "Ciliated cell",
 			"markers" : [
 				{
-					"genes" : ["ERICH3+", "SNTN+", "CCDC78+", "SNTN+", "ZBBX+", "DNAI1+", "ARMC3+", "CFAP157+", "TTC29+", "CFAP73+"],
+					"genes" : ["ERICH3+", "ARMC3+", "DNAI2+", "ZBBX+", "VWA3B+", "RGS22+", "TTC29+", "CDHR4+", "PPP1R42+", "CFAP46+", "CFAP52+", "CFAP73+", "CFAP77+", "CFAP157+", "DNAH3+", "DNAH9+", "ADGB+", "SNTN+", "CCDC170+", "C6orf118+"],
 					"weight" : 1.0,
-					"comment" : "Ciliated cell markers from Schupp et al., Travaglini et al. and Tony et al."
+					"comment" : "Markers inferred from Travaglini et al. Nature 2020 and confirmed by Schupp et al. Circulation 2021"
 				}
 			]
 		},
@@ -60,9 +60,9 @@
 			"name" : "Goblet cell",
 			"markers" : [
 				{
-					"genes" : ["MUC5AC+", "MUC5B+", "BPIFB1+", "MSMB+", "FAM3D+", "SERPINB11+", "CXCL6+", "SCGB1A1+", "FAM3D+", "SERPINB3+"],
+					"genes" : ["MUC5AC+", "MUC5B+", "BPIFB1+", "MSMB+", "SERPINB11+", "CYP2F1+"],
 					"weight" : 1.0,
-					"comment" : "Goblet cell markers from Schupp et al., Travaglini et al. and Tony et al."
+					"comment" : "Markers inferred from Travaglini et al. Nature 2020 and confirmed by Schupp et al. Circulation 2021"
 				}
 			]
 		},
@@ -71,9 +71,9 @@
 			"name" : "Ionocyte",
 			"markers" : [
 				{
-					"genes" : ["FOXI1+", "ASCL3+", "CLDN25+", "ATP6V1G3+", "LINC01187+"],
+					"genes" : ["ASCL3+", "CLCNKB+", "FOXI1+", "ATP6V1G3+", "TMPRSS11E+", "BSND+", "LINC01187+", "CLDN25+"],
 					"weight" : 1.0,
-					"comment" : "Ionocyte markers from Travaglini et al."
+					"comment" : "Markers inferred from Travaglini et al. Nature 2020 and confirmed by Schupp et al. Circulation 2021"
 				}
 			]
 		},
@@ -82,9 +82,20 @@
 			"name" : "Plumonary neuroendocrine cell",
 			"markers" : [
 				{
-					"genes" : ["CALCA+", "CHGA+", "ASCL1+", "SLC35D3+", "KIF1A+"],
+					"genes" : ["CHGA+", "CHGB+", "SCGN+", "SCG5+", "CPLX2+", "GRP+", "ASCL1+", "INSM1+"],
 					"weight" : 1.0,
-					"comment" : "Plumonary neuroendocrine cell markers from Travaglini et al."
+					"comment" : "Markers inferred from Travaglini et al. Nature 2020 and confirmed by Schupp et al. Circulation 2021"
+				}
+			]
+		},
+
+		{
+			"name" : "Submucosal gland serous cel",
+			"markers" : [
+				{
+					"genes" : ["PRR4+", "TCN1+", "C6orf58+", "PRB3+", "LPO+", "PRB1+", "PRH2+", "PRH1+", "ODAM+"],
+					"weight" : 1.0,
+					"comment" : "Markers inferred from Travaglini et al. Nature 2020"
 				}
 			]
 		},
diff --git a/pegasus/data_files/human_lung.gmt b/pegasus/data_files/human_lung.gmt
index 871c9dec..3559ccb9 100644
--- a/pegasus/data_files/human_lung.gmt
+++ b/pegasus/data_files/human_lung.gmt
@@ -8,12 +8,15 @@ T cell	T cell markers	CD3D	CD3E	CD3G	TRAC
 B cell	B cell markers	CD19	MS4A1	CD79A	CD79B
 Plasma cell	Plasma cell markers	CD38	XBP1	CD27	SLAMF7	TNFRSF17	TNFRSF13B
 Mast cell	Mast cell markers	KIT	CPA3	TPSB2	TPSAB1	AREG	RGS1	RGS2
-Neutrophil	Neutrophil markers	FUT4	MPO	CEACAM8	ELANE	CXCR1	CXCR2	LY6G6D
-AT1	AT1 markers from Schupp et al., Travaglini et al. and Tony et al.	AGER	CAV1	RTKN2	MYL9	SPOCK2	ANXA3	TIMP3	CAV2	ST6GALNAC5	MYRF
-AT2	AT2 markers from Schupp et al., Travaglini et al. and Tony et al.	SFTPA1	SFTPA2	SFTPC	ETV5	TTN	PLA2G4F	CCDC141	LAMP3	ABCA3	HHIP
-Basal	Basal cell markers from Schupp et al., Travaglini et al. and Tony et al.	KRT5	KRT15	KRT17	TP63	S100A2	TNS4
-Ciliated	Ciliated cell markers from Schupp et al., Travaglini et al. and Tony et al.	ERICH3	SNTN	CCDC78	SNTN	ZBBX	DNAI1	ARMC3	CFAP157	TTC29	CFAP73
-Club	Club cell markers from Schupp et al., Travaglini et al. and Tony et al.	SCGB3A2	MGP	VIM	CST3
-Goblet	Goblet cell markers from Schupp et al., Travaglini et al. and Tony et al.	MUC5AC	MUC5B	BPIFB1	MSMB	FAM3D	SERPINB11	CXCL6	SCGB1A1	FAM3D	SERPINB3
-Ionocyte	Ionocyte markers from Travaglini et al.	FOXI1	ASCL3	CLDN25	ATP6V1G3	LINC01187
-PNEC	Plumonary neuroendocrine cell markers from Travaglini et al.	CALCA	CHGA	ASCL1	SLC35D3	KIF1A
+ProNeu	Pro-Neutrophil markers validated using 10x public whole blood dataset	DEFA3	DEFA4	AZU1	MS4A3	ELANE	SLPI	CEACAM6	RNASE3	PRTN3	MPO	AC104232.1	CTSG
+PreNeu	Pre-Neutrophil markers validated using 10x public whole blood dataset	LTF	LCN2	MMP8	CRISP3	CAMP	PGLYRP1	CD177	HP
+Neutrophil	Neutrophil markers	CSF3R	G0S2	LUCAT1	EPHB1	TNFRSF10C	IL1R2	KCNJ15	FCGR3B	AC007032.1	HSD11B1-AS1
+AT1	AT1 markers inferred from Travaglini et al. Nature 2020 and confirmed by Schupp et al. Circulation 2021	AGER	SPOCK2	RTKN2	TNNC1	SCEL	CLIC5	NCKAP5	ARHGEF26	GGTLC1	ITLN2	MS4A15
+AT2	AT2 markers inferred from Travaglini et al. Nature 2020 and confirmed by Schupp et al. Circulation 2021	SFTPA1	SFTPA2	SFTPC	PGC	LAMP3	FASN	HHIP	ETV5	RASGRF1	ABCA3
+Basal	Basal cell markers inferred from Travaglini et al. Nature 2020 and confirmed by Schupp et al. Circulation 2021	KRT17	S100A2	MIR205HG	KRT15	KRT5	DLK2	CDH3	TP63	TNS4
+Club	Club cell markers inferred from Travaglini et al. Nature 2020 and confirmed by Schupp et al. Circulation 2021	SCGB3A2	MGP	CTSE
+Ciliated	Ciliated cell markers inferred from Travaglini et al. Nature 2020 and confirmed by Schupp et al. Circulation 2021	ERICH3	ARMC3	DNAI2	ZBBX	VWA3B	RGS22	TTC29	CDHR4	PPP1R42	CFAP46	CFAP52	CFAP73	CFAP77	CFAP157	DNAH3	DNAH9	ADGB	SNTN	CCDC170	C6orf118
+Goblet	Goblet cell markers inferred from Travaglini et al. Nature 2020 and confirmed by Schupp et al. Circulation 2021	MUC5AC	MUC5B	BPIFB1	MSMB	SERPINB11	CYP2F1
+Ionocyte	Ionocyte markers inferred from Travaglini et al. Nature 2020 and confirmed by Schupp et al. Circulation 2021	ASCL3	CLCNKB	FOXI1	ATP6V1G3	TMPRSS11E	BSND	LINC01187	CLDN25
+PNEC	Plumonary neuroendocrine cell markers inferred from Travaglini et al. Nature 2020 and confirmed by Schupp et al. Circulation 2021	CHGA	CHGB	SCGN	SCG5	CPLX2	GRP	ASCL1	INSM1
+SMG	SMG serous cell markers inferred from Travaglini et al. Nature 2020	PRR4	TCN1	C6orf58	PRB3	LPO	PRB1	PRH2	PRH1	ODAM
diff --git a/pegasus/tools/scvitools.py b/pegasus/tools/scvitools.py
index a01c0a8e..20dfd1c3 100644
--- a/pegasus/tools/scvitools.py
+++ b/pegasus/tools/scvitools.py
@@ -190,9 +190,14 @@ def run_scvi(
     scvi.settings.num_threads = eff_n_jobs(n_jobs) # set n_jobs
     scvi.settings.seed = random_state # set random_state, see [here](https://docs.scvi-tools.org/en/stable/_modules/scvi/_settings.html) for more details.
 
+    print(max_epochs)
+    
     if max_epochs is None:
         max_epochs = np.min([round((20000 / len(adata.obs)) * 400), 400])
 
+    print(type(max_epochs))
+    print(max_epochs)
+
     scvi.model.SCVI.setup_anndata(adata,
         batch_key=batch,
         categorical_covariate_keys=categorical_covariate_keys,

From 47fde0836693386940f230d8219750a3c187c18b Mon Sep 17 00:00:00 2001
From: Bo Li <li.bo@gene.com>
Date: Tue, 23 May 2023 09:08:53 +0800
Subject: [PATCH 18/57] Updated NK cell and NK subtype markers

---
 .../human_immune_cell_markers.json            | 57 +++++++++++--------
 1 file changed, 34 insertions(+), 23 deletions(-)

diff --git a/pegasus/annotate_cluster/human_immune_cell_markers.json b/pegasus/annotate_cluster/human_immune_cell_markers.json
index b849947f..f4ed25b4 100644
--- a/pegasus/annotate_cluster/human_immune_cell_markers.json
+++ b/pegasus/annotate_cluster/human_immune_cell_markers.json
@@ -76,36 +76,47 @@
 			"name" : "Natural killer cell",
 			"markers" : [
 				{
-					"genes" : ["NCAM1+"],
-					"weight" : 0.2,
-					"comment" : "CD56"
+					"genes" : ["GNLY+", "KLRF1+", "KLRD1+", "TRDC+", "IL2RB+", "KLRC1+"],
+					"weight" : 0.6,
+					"comment" : "General NK cell markers also cover some T cells; derived from Immune Cell Atlas PBMC, BMMC and CB-MNC data"
 				},
 				{
-					"genes" : ["NKG7+"],
+					"genes" : ["NCAM1+", "FCGR3A+"],
 					"weight" : 0.2,
-					"comment" : "natural killer cell granule protein 7"
-				},
-				{
-					"genes" : ["KLRB1+", "KLRD1+", "KLRF1+", "KLRC1+", "KLRC2+", "KLRC3+", "KLRC4+"],
-					"weight" : 0.25,
-					"comment" : "killer cell lectin like receptors"
+					"comment" : "NK subtype markers"
 				},
 				{
 					"genes" : ["CD3D-", "CD3E-", "CD3G-"],
-					"weight" : 0.15,
-					"comment" : "not T cell"
-				},
-				{
-					"genes" : ["FCGR3A+"],
-					"weight" : 0.1,
-					"comment" : "CD16a"
-				},
-				{
-					"genes" : ["ITGAL+", "ITGAM+"],
-					"weight" : 0.1,
-					"comment" : "CD11a,CD11b"
+					"weight" : 0.2,
+					"comment" : "No T cell markers"
 				}
-			]
+			],
+			"subtypes" : {
+				"title" : "NK cell subtype markers",
+				"cell_types" : [
+					{
+						"name" : "CD56-dim NK cell",
+						"markers" : [
+							{
+								"genes" : ["FCGR3A+", "FGFBP2+", "SPON2+", "MYOM2+", "S1PR5+", "CX3CR1+", "AKR1C3+", "FCRL6+", "LAIR2+", "PRSS23+"],
+								"weight" : 1.0,
+								"comment" : "Cytotoxic NK cell markers derived from Immune Cell Atlas PBMC, BMMC and CB-MNC data"
+							}
+						]
+					},
+					{
+						"name" : "CD56-bright NK cell",
+						"markers" : [
+							{
+								"genes" : ["NCAM1+", "GZMK+", "XCL1+", "SPTSSB+", "CAPG+", "IL7R+", "GPR183+", "IGFBP4+", "SPINK2+", "FUT7+"],
+								"weight" : 1.0,
+								"comment" : "Regulatory NK cell markers derived from Immune Cell Atlas PBMC, BMMC and CB-MNC data; CD56bright develops into CD56dim"
+							}
+						]
+					}
+				],
+				"comment": "There is also a CD56_dim CD16_dim population in between of the CD56-dim and CD56-bright subtypes."
+			}
 		},
 
 		{

From a587dcbd6198855a74e491dad079f6b3d5b21217 Mon Sep 17 00:00:00 2001
From: Donghoon Lee <hoondy@users.noreply.github.com>
Date: Fri, 26 May 2023 10:31:29 -0400
Subject: [PATCH 19/57] Update doublet_detection.py

Fixes a bug that occurs when `raw_mat_key` is set to a value other than its default, `counts`.
---
 pegasus/tools/doublet_detection.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/pegasus/tools/doublet_detection.py b/pegasus/tools/doublet_detection.py
index 26aadd4a..a1607cbb 100644
--- a/pegasus/tools/doublet_detection.py
+++ b/pegasus/tools/doublet_detection.py
@@ -586,9 +586,9 @@ def infer_doublets(
             if idx.size >= min_cell:
                 unidata = UnimodalData({"barcodekey": data.obs_names[idx]}, 
                                        {"featurekey": data.var_names},
-                                       {"counts": rawX[idx]},
+                                       {raw_mat_key: rawX[idx]},
                                        {"genome": genome, "modality": modality},
-                                       cur_matrix = "counts")
+                                       cur_matrix = raw_mat_key)
                 # Identify robust genes, count and log normalized and select top 2,000 highly variable features
                 identify_robust_genes(unidata)
                 log_norm(unidata)

From 5d29f9fa73dcfb1de47d8a265950b28d7954efb2 Mon Sep 17 00:00:00 2001
From: Bo Li <li.bo@gene.com>
Date: Sat, 27 May 2023 14:14:02 +0800
Subject: [PATCH 20/57] Updated T cell subtype markers

---
 .../human_immune_cell_markers.json            | 87 ++++++++++++++-----
 pegasus/data_files/human_t_cell_markers.gmt   |  9 ++
 2 files changed, 73 insertions(+), 23 deletions(-)
 create mode 100644 pegasus/data_files/human_t_cell_markers.gmt

diff --git a/pegasus/annotate_cluster/human_immune_cell_markers.json b/pegasus/annotate_cluster/human_immune_cell_markers.json
index f4ed25b4..73a424cd 100644
--- a/pegasus/annotate_cluster/human_immune_cell_markers.json
+++ b/pegasus/annotate_cluster/human_immune_cell_markers.json
@@ -19,55 +19,96 @@
 				"title" : "T cell subtype markers",
 				"cell_types" : [
 					{
-						"name" : "T helper cell",
+						"name" : "CD4 Naive T cell",
 						"markers" : [
 							{
-								"genes" : ["CD4+"],
+								"genes" : ["CD4+", "CCR7+", "SELL+", "LEF1+", "FHIT+", "ACTN1+", "LDLRAP1+", "TMIGD2+", "TRABD2A+", "LRRN3+"],
 								"weight" : 1.0,
-								"comment" : "CD4+ T cell"
+								"comment" : "Markers derived from Immune Cell Atlas PBMC data"
+							}
+ 						]
+					},
+					{
+						"name" : "CD4 TCM",
+						"markers" : [
+							{
+								"genes" : ["CD4+", "GPR183+", "CD69+", "PASK+", "LIMS1+", "LPAR6+", "SLC2A3+", "SOCS3+"],
+								"weight" : 1.0,
+								"comment" : "Markers derived from Immune Cell Atlas PBMC data"
+							}
+						] 
+					},
+					{
+						"name" : "CD4 TEM",
+						"markers" : [
+							{
+								"genes" : ["CD4+", "KLRB1+", "ANXA2+", "LGALS1+", "TIMP1+", "PTGER2+", "AHNAK+", "TNFRSF4+", "YWHAH+", "CD63+"],
+								"weight" : 1.0,
+								"comment" : "Markers derived from Immune Cell Atlas PBMC data"
+							}
+						] 
+					},
+					{
+						"name" : "T regulatory cell",
+						"markers" : [
+							{
+								"genes" : ["RTKN2+", "FOXP3+", "IL2RA+", "HACD1+", "AC133644.2+", "FANK1+", "DUSP4+", "STAM+", "CCR10+", "CTLA4+"],
+								"weight" : 1.0,
+								"comments" : "Markers derived from Immune Cell Atlas PBMC data"
 							}
 						]
 					},
 					{
-						"name" : "Cytotoxic T cell",
+						"name" : "CD8 Naive T cell",
 						"markers" : [
 							{
-								"genes" : ["CD8A+", "CD8B+"],
+								"genes" : ["CD8A+", "CD8B+", "CCR7+", "SELL+", "LEF1+", "ACTN1+", "TRABD2A+", "LRRN3+", "LINC02446+", "S100B+", "CLEC11A+", "NELL2+", "PASK+", "APBA2+"],
 								"weight" : 1.0,
-								"comment" : "CD8+ T cell"
+								"comment" : "Markers derived from Immune Cell Atlas PBMC data"
 							}
 						]
 					},
 					{
-						"name" : "T regulatory cell",
+						"name" : "CD8 TCM",
 						"markers" : [
 							{
-								"genes" : ["FOXP3+", "IL2RA+"],
-								"weight" : 0.7,
-								"comments" : "key T reg markers"
-							},
+								"genes" : ["CD8A+", "CD8B+", "GZMK+", "DUSP2+", "RGS1+", "CXCR3+", "CMC1+", "TIGIT+", "CST7+", "NKG7+"],
+								"weight" : 1.0,
+								"comment" : "Markers derived from Immune Cell Atlas PBMC data; CD8A & CD8B are CD8 markers; GZMK, DUSP2, RGS1 & CXCR3 are specific to TCM; CMC1 & TIGIT are biased towards TCM; CST7 & NKG7 are shared by TCM & TEM"
+							}
+						]
+					},
+					{
+						"name" : "CD8 TEM",
+						"markers" : [
 							{
-								"genes" : ["CD4+"],
-								"weight" : 0.3,
-								"comment" : "key markers that do not express heavily in droplet-based RNA-Seq"
+								"genes" : ["CD8A+", "CD8B+", "FGFBP2+", "GZMB+", "FCGR3A+", "SPON2+", "ADGRG1+", "CX3CR1+", "ASCL2+", "PRSS23+"],
+								"weight" : 1.0,
+								"comment" : "Markers derived from Immune Cell Atlas PBMC data"
 							}
 						]
 					},
 					{
-						"name" : "Naive T cell",
+						"name" : "MAIT",
 						"markers" : [
 							{
-								"genes" : ["CCR7+", "SELL+", "IL7R+", "TCF7+", "CD27+"],
-								"weight" : 0.7,
-								"comment" : "positive markers"
-							},
+								"genes" : ["SLC4A10+", "KLRB1+", "NCR3+", "CEBPD+", "GPR65+", "LST1+", "CXCR6+", "TRAV1-2+"],
+								"weight" : 1.0,
+								"comment" : "Markers derived from Immune Cell Atlas PBMC data"
+							}
+ 						]
+					},
+					{
+						"name" : "Gamma-delta T cell",
+						"markers" : [
 							{
-								"genes" : ["IL2RA-", "CD44-", "CD69-"],
-								"weight" : 0.3,
-								"comment" : "negative markers"
+								"genes" : ["TRDC+", "TRGC1+", "TRGC2+", "KLRC1+", "KLRD1+", "GNLY+"],
+								"weight" : 1.0,
+								"comment" : "Markers derived from Immune Cell Atlas PBMC data"
 							}
  						]
-					}
+					},
+
 				]
 			}
 		},
diff --git a/pegasus/data_files/human_t_cell_markers.gmt b/pegasus/data_files/human_t_cell_markers.gmt
new file mode 100644
index 00000000..33bd98af
--- /dev/null
+++ b/pegasus/data_files/human_t_cell_markers.gmt
@@ -0,0 +1,9 @@
+CD4_Naive	CD4 Naive T	CD4	CCR7	SELL	LEF1	FHIT	ACTN1	LDLRAP1	TMIGD2	TRABD2A	LRRN3
+CD4_TCM	CD4 TCM	CD4	GPR183	CD69	PASK	LIMS1	LPAR6	SLC2A3	SOCS3
+CD4_TEM	CD4 TEM	CD4	KLRB1	ANXA2	LGALS1	TIMP1	PTGER2	AHNAK	TNFRSF4	YWHAH	CD63
+Treg	Treg	RTKN2	FOXP3	IL2RA	HACD1	AC133644.2	FANK1	DUSP4	STAM	CCR10	CTLA4
+CD8_Naive	CD8 Naive T	CD8A	CD8B	CCR7	SELL	LEF1	ACTN1	TRABD2A	LRRN3	LINC02446	S100B	CLEC11A	NELL2	PASK	APBA2
+CD8_TCM	CD8 TCM	CD8A	CD8B	GZMK	DUSP2	RGS1	CXCR3	CMC1	TIGIT	CST7	NKG7
+CD8_TEM	CD8 TEM	CD8A	CD8B	FGFBP2	GZMB	FCGR3A	SPON2	ADGRG1	CX3CR1	ASCL2	PRSS23
+MAIT	MAIT	SLC4A10	KLRB1	NCR3	CEBPD	GPR65	LST1	CXCR6	TRAV1-2
+gdT	gdT	TRDC	TRGC1	TRGC2	KLRC1	KLRD1	GNLY

From 469693d185a11dc5c28f10eaa6b5beac78b743db Mon Sep 17 00:00:00 2001
From: Bo Li <li.bo@gene.com>
Date: Sat, 27 May 2023 16:12:24 +0800
Subject: [PATCH 21/57] Added CD4 CTL markers

---
 .../human_immune_cell_markers.json                | 15 +++++++++++++++
 pegasus/data_files/human_t_cell_markers.gmt       |  1 +
 2 files changed, 16 insertions(+)

diff --git a/pegasus/annotate_cluster/human_immune_cell_markers.json b/pegasus/annotate_cluster/human_immune_cell_markers.json
index 73a424cd..021a07ac 100644
--- a/pegasus/annotate_cluster/human_immune_cell_markers.json
+++ b/pegasus/annotate_cluster/human_immune_cell_markers.json
@@ -58,6 +58,21 @@
 							}
 						]
 					},
+					{
+						"name" : "CD4 CTL",
+						"markers" : [
+							{
+								"genes" : ["CD4+", "CD8A-", "CD8B-"],
+								"weight" : 0.3,
+								"comments" : "Must be CD4 T"
+							},
+							{
+								"genes" : ["GNLY+", "AGAP1+", "ZNF683+", "RGS9+", "IL5RA+", "LAIR2+", "MTERF2+", "SH3RF2+", "RGS17+"],
+								"weight" : 0.7,
+								"comments" : "CD4 CTL markers that might also be expressed by CD8 TEM"
+							}
+						]
+					},					
 					{
 						"name" : "CD8 Naive T cell",
 						"markers" : [
diff --git a/pegasus/data_files/human_t_cell_markers.gmt b/pegasus/data_files/human_t_cell_markers.gmt
index 33bd98af..aa062fe5 100644
--- a/pegasus/data_files/human_t_cell_markers.gmt
+++ b/pegasus/data_files/human_t_cell_markers.gmt
@@ -2,6 +2,7 @@ CD4_Naive	CD4 Naive T	CD4	CCR7	SELL	LEF1	FHIT	ACTN1	LDLRAP1	TMIGD2	TRABD2A	LRRN3
 CD4_TCM	CD4 TCM	CD4	GPR183	CD69	PASK	LIMS1	LPAR6	SLC2A3	SOCS3
 CD4_TEM	CD4 TEM	CD4	KLRB1	ANXA2	LGALS1	TIMP1	PTGER2	AHNAK	TNFRSF4	YWHAH	CD63
 Treg	Treg	RTKN2	FOXP3	IL2RA	HACD1	AC133644.2	FANK1	DUSP4	STAM	CCR10	CTLA4
+CD4_CTL	CD4 Cytotoxic Lymphocyte	CD4	GNLY	AGAP1	ZNF683	RGS9	IL5RA	LAIR2	MTERF2	SH3RF2	RGS17
 CD8_Naive	CD8 Naive T	CD8A	CD8B	CCR7	SELL	LEF1	ACTN1	TRABD2A	LRRN3	LINC02446	S100B	CLEC11A	NELL2	PASK	APBA2
 CD8_TCM	CD8 TCM	CD8A	CD8B	GZMK	DUSP2	RGS1	CXCR3	CMC1	TIGIT	CST7	NKG7
 CD8_TEM	CD8 TEM	CD8A	CD8B	FGFBP2	GZMB	FCGR3A	SPON2	ADGRG1	CX3CR1	ASCL2	PRSS23

From f965c2638d7cc88d5385909aeaf2b933ef2625fc Mon Sep 17 00:00:00 2001
From: Bo Li <li.bo@gene.com>
Date: Thu, 8 Jun 2023 07:58:00 +0800
Subject: [PATCH 22/57] Updated Tfh, GC B cell and fDC markers; added tonsil markers

---
 .../human_immune_cell_markers.json            | 40 ++++++++++++++-----
 pegasus/data_files/human_t_cell_markers.gmt   |  1 +
 pegasus/data_files/tonsil_markers.gmt         | 18 +++++++++
 3 files changed, 48 insertions(+), 11 deletions(-)
 create mode 100644 pegasus/data_files/tonsil_markers.gmt

diff --git a/pegasus/annotate_cluster/human_immune_cell_markers.json b/pegasus/annotate_cluster/human_immune_cell_markers.json
index 021a07ac..fd2afc23 100644
--- a/pegasus/annotate_cluster/human_immune_cell_markers.json
+++ b/pegasus/annotate_cluster/human_immune_cell_markers.json
@@ -72,7 +72,17 @@
 								"comments" : "CD4 CTL markers that might also be expressed by CD8 TEM"
 							}
 						]
-					},					
+					},
+					{
+						"name" : "T follicular helper cell",
+						"markers" : [
+							{
+								"genes" : ["CD4+", "ST8SIA1+", "PDCD1+", "TIGIT+", "TOX2+", "ICOS+", "SH2D1A+", "IL21+"],
+								"weight" : 1.0,
+								"comments" : "Tfh markers"
+							}
+						]
+					},										
 					{
 						"name" : "CD8 Naive T cell",
 						"markers" : [
@@ -250,14 +260,9 @@
 			"name" : "Germinal Center B cell",
 			"markers" : [
 				{
-					"genes" : ["CD19+", "MS4A1+", "CD79A+", "CD79B+"],
-					"weight" : 0.3,
-					"comment" : "CD19, CD20 and CD79"
-				},
-				{
-					"genes" : ["CD38+", "BCL6+", "BCL2-", "RGS13+", "MEF2B"],
-					"weight" : 0.7,
-					"comment" : "First 3 markers are from Klein et al. PNAS 2003 https://doi.org/10.1073/pnas.0437996100 (Fig. 1 & 2). The last 2 markers are from Fig. 4B of Massoni-Badosa et al. Tonsil Atlas paper"
+					"genes" : ["MEF2B+", "NEIL1+", "RGS13+", "ELL3+", "BCL7A+", "BCL6+", "NUGGC+", "MYBL1+", "EML6+", "FANCA+"],
+					"weight" : 1.0,
+					"comment" : "GC B cell markers"
 				}
 			],
 			"subtypes" : {
@@ -267,9 +272,9 @@
 						"name" : "Dark zone B cell",
 						"markers" : [
 							{
-								"genes" : ["CXCR4+", "AICDA+", "FOXP1+", "MME+"],
+								"genes" : ["NUSAP1+", "NCAPG+", "AURKB+", "HMGB2+", "HJURP+", "TOP2A+"],
 								"weight" : 1.0,
-								"comment" : "Fig. 4B of Massoni-Badosa et al. Tonsil Atlas paper"
+								"comment" : "DZ B cell markers"
 							}
 						]
 					},
@@ -379,6 +384,19 @@
 			]
 		},
 
+
+		{
+			"name" : "Follicular dendritic cell",
+			"markers" : [
+				{
+					"genes" : ["CXCL13+", "FCAMR+", "FDCSP+", "SERPINE2+", "PAPPA+", "NPHS1+", "PKDCC+", "SYNM+", "NRG2+", "CDC42EP4+", "MUC3A+", "PRUNE2+", "B4GALNT4+", "NPPC+", "SLC1A2+", "TMEM150C+"],
+					"weight" : 1.0,
+					"comment" : "fDC markers"
+				}
+			]
+		},
+
+
 		{
 			"name" : "Hematopoietic stem cell",
 			"markers" : [
diff --git a/pegasus/data_files/human_t_cell_markers.gmt b/pegasus/data_files/human_t_cell_markers.gmt
index aa062fe5..aac1cda8 100644
--- a/pegasus/data_files/human_t_cell_markers.gmt
+++ b/pegasus/data_files/human_t_cell_markers.gmt
@@ -3,6 +3,7 @@ CD4_TCM	CD4 TCM	CD4	GPR183	CD69	PASK	LIMS1	LPAR6	SLC2A3	SOCS3
 CD4_TEM	CD4 TEM	CD4	KLRB1	ANXA2	LGALS1	TIMP1	PTGER2	AHNAK	TNFRSF4	YWHAH	CD63
 Treg	Treg	RTKN2	FOXP3	IL2RA	HACD1	AC133644.2	FANK1	DUSP4	STAM	CCR10	CTLA4
 CD4_CTL	CD4 Cytotoxic Lymphocyte	CD4	GNLY	AGAP1	ZNF683	RGS9	IL5RA	LAIR2	MTERF2	SH3RF2	RGS17
+Tfh	T follicular helper	CD4	ST8SIA1	PDCD1	TIGIT	TOX2	ICOS	SH2D1A	IL21
 CD8_Naive	CD8 Naive T	CD8A	CD8B	CCR7	SELL	LEF1	ACTN1	TRABD2A	LRRN3	LINC02446	S100B	CLEC11A	NELL2	PASK	APBA2
 CD8_TCM	CD8 TCM	CD8A	CD8B	GZMK	DUSP2	RGS1	CXCR3	CMC1	TIGIT	CST7	NKG7
 CD8_TEM	CD8 TEM	CD8A	CD8B	FGFBP2	GZMB	FCGR3A	SPON2	ADGRG1	CX3CR1	ASCL2	PRSS23
diff --git a/pegasus/data_files/tonsil_markers.gmt b/pegasus/data_files/tonsil_markers.gmt
new file mode 100644
index 00000000..bfefe13c
--- /dev/null
+++ b/pegasus/data_files/tonsil_markers.gmt
@@ -0,0 +1,18 @@
+Skeletal muscle cells	Skeletal muscle cells	MYBPC1	TNNT1	TNNC1	MYL1	MYBPH	TNNC2	TNNI1	MYH7	MYL2
+Tfh	T Follicular helper markers (one reference point is https://www.thermofisher.com/us/en/home/life-science/cell-analysis/cell-analysis-learning-center/immunology-at-work/t-follicular-helper-cell-overview.html)	CD4	ST8SIA1	PDCD1	TIGIT	TOX2	ICOS	SH2D1A	IL21
+Tregs	Tregs	CTLA4	TIGIT	IL2RA	FOXP3	CCR8	BATF
+T_Naive	Naive T cell	CCR7	SELL	IL7R	TCF7	CD27
+DC_Migratory	Migratory Conventional Dendritic cell	FSCN1	CCR7	LAMP3	CCL19	CCL22	CD40	BIRC3
+MAIT	MAIT	SLC4A10
+EC lymphatic	Schupp et al. Circulation 2021	PECAM1	CLDN5	CDH5	ERG	CCL21	SEMA3D	PROX1	PDPN	MMRN1	RELN	PKHD1L1	TFF3	LYVE1	FLT4	TBX1
+fDC	Follicular dendritic cell	CXCL13	FCAMR	FDCSP	SERPINE2	PAPPA	NPHS1	PKDCC	SYNM	NRG2	CDC42EP4	MUC3A	PRUNE2	B4GALNT4	NPPC	SLC1A2	TMEM150C
+DCs_CLEC9A	Conventional Dendritic cell type 1	CLEC9A	BATF3	IRF8	CPVL	CADM1
+DCs_CD1C	Conventional Dendritic cell type 2	CD1C	FCER1A	FCGBP	CD1A	CD207	HLA-DQB2
+pDCs	Plasmacytoid Dendritic cell	IRF4	LILRA4	TCF4	MZB1
+B_Naive	Naïve B cell	MS4A1	IGHD	TCL1A	FCER2
+B_Memory	Memory B cell	MS4A1	CD27	TNFRSF13B
+B_Germinal_Center	Germinal center B cell	MEF2B	NEIL1	RGS13	ELL3	BCL7A	BCL6	NUGGC	MYBL1	EML6	FANCA
+B_light_zone	Light Zone	CD83	LMO2
+B_dark_zone	Dark Zone	CXCR4	AICDA	FOXP1	MME
+Mono_DCs	Monocytes Derived DC	CD14	FCGR2B	CCL17	CLEC10A
+MyoF	Myofibroblast from Travaglini et al. and Tony et al.	ACTA2	MYL9	MT2A	EEF1A1	TMSB10	FAU	UBA52	SERF2	PTMA	S100A6

From 3a9cca8a01a09e400eb81911e010329b0303c2b5 Mon Sep 17 00:00:00 2001
From: Bo Li <li.bo@gene.com>
Date: Wed, 14 Jun 2023 20:11:38 +0800
Subject: [PATCH 23/57] Added method option for nearest neighbor search to
 choose between hnsw and sklearn

---
 .../annotate_cluster/human_immune_cell_markers.json |  3 +--
 pegasus/tools/nearest_neighbors.py                  | 13 +++++++++++--
 2 files changed, 12 insertions(+), 4 deletions(-)

diff --git a/pegasus/annotate_cluster/human_immune_cell_markers.json b/pegasus/annotate_cluster/human_immune_cell_markers.json
index fd2afc23..feb55f0a 100644
--- a/pegasus/annotate_cluster/human_immune_cell_markers.json
+++ b/pegasus/annotate_cluster/human_immune_cell_markers.json
@@ -132,8 +132,7 @@
 								"comment" : "Markers derived from Immune Cell Atlas PBMC data"
 							}
  						]
-					},
-
+					}
 				]
 			}
 		},
diff --git a/pegasus/tools/nearest_neighbors.py b/pegasus/tools/nearest_neighbors.py
index fdc16109..37965183 100644
--- a/pegasus/tools/nearest_neighbors.py
+++ b/pegasus/tools/nearest_neighbors.py
@@ -44,9 +44,8 @@ def calculate_nearest_neighbors(
     """Calculate nearest neighbors
     X is the sample by feature matrix
     Return K -1 neighbors, the first one is the point itself and thus omitted.
-    TODO: Documentation
+    If nsample <= 1000, method is set to "sklearn" for exact KNN search
     """
-
     nsample = X.shape[0]
 
     if nsample <= 1000:
@@ -85,6 +84,7 @@ def calculate_nearest_neighbors(
         distances = np.sqrt(distances, out=distances)
     else:
         assert method == "sklearn"
+        print("haha, exact!")
         knn = NearestNeighbors(
             n_neighbors=K - 1, n_jobs=n_jobs
         )  # eliminate the first neighbor, which is the node itself
@@ -116,6 +116,7 @@ def get_neighbors(
     full_speed: bool = False,
     use_cache: bool = True,
     dist: str = "l2",
+    method: str = "hnsw",
 ) -> Tuple[List[int], List[float]]:
     """Find K nearest neighbors for each data point and return the indices and distances arrays.
 
@@ -140,6 +141,8 @@ def get_neighbors(
         If use_cache and found cached knn results, will not recompute.
     dist: `str`, optional (default: 'l2')
         Distance metric to use. By default, use squared L2 distance. Available options, inner product 'ip' or cosine similarity 'cosine'.
+    method: `str`, optional (default: 'hnsw')
+        Choosing from 'hnsw' for approximate nearest neighbor search or 'sklearn' for exact nearest neighbor search.    
 
     Returns
     -------
@@ -164,6 +167,7 @@ def get_neighbors(
             X_from_rep(data, rep, n_comps),
             K=K,
             n_jobs=eff_n_jobs(n_jobs),
+            method=method,
             random_state=random_state,
             full_speed=full_speed,
             dist=dist,
@@ -237,6 +241,7 @@ def neighbors(
     full_speed: bool = False,
     use_cache: bool = True,
     dist: str = "l2",
+    method: str = "hnsw",
 ) -> None:
     """Compute k nearest neighbors and affinity matrix, which will be used for diffmap and graph-based community detection algorithms.
 
@@ -274,6 +279,9 @@ def neighbors(
     dist: ``str``, optional (default: ``"l2"``)
         Distance metric to use. By default, use squared L2 distance. Available options, inner product ``"ip"`` or cosine similarity ``"cosine"``.
 
+    method: ``str``, optional (default: ``"hnsw"``)
+        Choose from "hnsw" or "sklearn". "hnsw" uses HNSW algorithm for approximate nearest neighbor search and "sklearn" uses sklearn package for exact nearest neighbor search.
+
     Returns
     -------
     ``None``
@@ -302,6 +310,7 @@ def neighbors(
         full_speed=full_speed,
         use_cache=use_cache,
         dist=dist,
+        method=method,
     )
 
     # calculate affinity matrix

From b84a87ce850bebb09421d8773616952689c97610 Mon Sep 17 00:00:00 2001
From: Bo Li <li.bo@gene.com>
Date: Sat, 17 Jun 2023 10:03:48 -0700
Subject: [PATCH 24/57] Updated nearest neighbor search function to a) set
 use_cache to False by default and b) adjust K to min(K, int(sqrt(nsample)))

---
 pegasus/tools/doublet_detection.py |   2 +-
 pegasus/tools/nearest_neighbors.py | 103 +++++++++++++++++++++--------
 pegasus/tools/nmf.py               |   2 +-
 pegasus/tools/visualization.py     |  28 +++++---
 4 files changed, 97 insertions(+), 38 deletions(-)

diff --git a/pegasus/tools/doublet_detection.py b/pegasus/tools/doublet_detection.py
index a1607cbb..5737af2d 100644
--- a/pegasus/tools/doublet_detection.py
+++ b/pegasus/tools/doublet_detection.py
@@ -349,7 +349,7 @@ def _run_scrublet(
     if k is None:
         k = int(round(0.5 * np.sqrt(obsX.shape[0])))
     k_adj = int(round(k * (1.0 + r)))
-    indices, _ = calculate_nearest_neighbors(pc_coords, K = k_adj + 1, n_jobs = n_jobs)
+    indices, _, _ = calculate_nearest_neighbors(pc_coords, K=k_adj + 1, n_jobs=n_jobs, exact_k=True)
 
     # Calculate scrublet-like doublet score
     k_d = is_doublet[indices].sum(axis = 1)
diff --git a/pegasus/tools/nearest_neighbors.py b/pegasus/tools/nearest_neighbors.py
index 37965183..4ecc3169 100644
--- a/pegasus/tools/nearest_neighbors.py
+++ b/pegasus/tools/nearest_neighbors.py
@@ -34,26 +34,61 @@ def calculate_nearest_neighbors(
     K: int = 100,
     n_jobs: int = -1,
     method: str = "hnsw",
+    exact_k: bool = False,
     M: int = 20,
     efC: int = 200,
     efS: int = 200,
     random_state: int = 0,
     full_speed: int = False,
     dist: str = 'l2',
-):
-    """Calculate nearest neighbors
-    X is the sample by feature matrix
-    Return K -1 neighbors, the first one is the point itself and thus omitted.
-    If nsample <= 1000, method is set to "sklearn" for exact KNN search
+) -> Tuple[List[int], List[float], int]:
+    """Find K nearest neighbors for each data point in the matrix and return the indices and distances arrays.
+
+    K is determined by min(K, int(sqrt(X.shape[0]))) if exact_k == False.
+
+    Parameters
+    ----------
+
+    X : `np.array`
+        An array of n_samples by n_features.
+    K : `int`, optional (default: 100)
+        Number of neighbors, including the data point itself. If K is None, determine K by sqrt(X.shape[0]).
+    n_jobs : `int`, optional (default: -1)
+        Number of threads to use. -1 refers to using all physical CPU cores.
+    method: `str`, optional (default: 'hnsw')
+        Choosing from 'hnsw' for approximate nearest neighbor search or 'sklearn' for exact nearest neighbor search. If X.shape[0] <= 1000, method will be automatically set to "sklearn" for exact KNN search
+    exact_k: `bool`, optional (default: False)
+        If True, use exactly the K passed to the function; otherwise K is determined as min(K, int(sqrt(X.shape[0]))).
+    M, efC, efS: `int`, optional (20, 200, 200)
+        HNSW algorithm parameters.
+    random_state: `int`, optional (default: 0)
+        Random seed for random number generator.
+    full_speed: `bool`, optional (default: False)
+        If full_speed, use multiple threads in constructing hnsw index. However, the kNN results are not reproducible. If not full_speed, use only one thread to make sure results are reproducible.
+    dist: `str`, optional (default: 'l2')
+        Distance metric to use. By default, use squared L2 distance. Available options, 'l2', inner product 'ip' or cosine similarity 'cosine'.
+
+    Returns
+    -------
+
+    kNN indices array, distances array and adjusted K.
+
+    Examples
+    --------
+    >>> indices, distances, K = calculate_nearest_neighbors(X)
     """
     nsample = X.shape[0]
 
     if nsample <= 1000:
         method = "sklearn"
 
-    if nsample < K:
-        logger.warning(f"Warning: in calculate_nearest_neighbors, number of samples = {nsample} < K = {K}!\n Set K to {nsample}.")
-        K = nsample
+    k_rot = int(nsample ** 0.5) # rot, rule of thumb
+    if (K is None) or (K > k_rot and (not exact_k)):
+        K = k_rot
+        logger.info(f"in calculate_nearest_neighbors, K is adjusted to {K}.")
+
+    if K == 1:
+        return np.zeros(0, dtype=int), np.zeros(0, dtype=np.float32), K
 
     n_jobs = eff_n_jobs(n_jobs)
 
@@ -84,18 +119,17 @@ def calculate_nearest_neighbors(
         distances = np.sqrt(distances, out=distances)
     else:
         assert method == "sklearn"
-        print("haha, exact!")
         knn = NearestNeighbors(
             n_neighbors=K - 1, n_jobs=n_jobs
         )  # eliminate the first neighbor, which is the node itself
         knn.fit(X)
         distances, indices = knn.kneighbors()
 
-    return indices, distances
+    return indices, distances, K
 
 
 def knn_is_cached(
-    data: MultimodalData, indices_key: str, distances_key: str, K: int
+    data: MultimodalData, indices_key: str, distances_key: str, K: int, exact_k: bool
 ) -> bool:
     return (
         (indices_key in data.obsm)
@@ -114,12 +148,15 @@ def get_neighbors(
     n_jobs: int = -1,
     random_state: int = 0,
     full_speed: bool = False,
-    use_cache: bool = True,
+    use_cache: bool = False,
     dist: str = "l2",
     method: str = "hnsw",
-) -> Tuple[List[int], List[float]]:
+    exact_k: bool = False,
+) -> Tuple[List[int], List[float], int]:
     """Find K nearest neighbors for each data point and return the indices and distances arrays.
 
+    K is determined by min(K, int(sqrt(data.shape[0]))) if exact_k == False.
+
     Parameters
     ----------
 
@@ -137,37 +174,44 @@ def get_neighbors(
         Random seed for random number generator.
     full_speed: `bool`, optional (default: False)
         If full_speed, use multiple threads in constructing hnsw index. However, the kNN results are not reproducible. If not full_speed, use only one thread to make sure results are reproducible.
-    use_cache: `bool`, optional (default: True)
+    use_cache: `bool`, optional (default: False)
         If use_cache and found cached knn results, will not recompute.
     dist: `str`, optional (default: 'l2')
-        Distance metric to use. By default, use squared L2 distance. Available options, inner product 'ip' or cosine similarity 'cosine'.
+        Distance metric to use. By default, use squared L2 distance. Available options, 'l2' or inner product 'ip' or cosine similarity 'cosine'.
     method: `str`, optional (default: 'hnsw')
-        Choosing from 'hnsw' for approximate nearest neighbor search or 'sklearn' for exact nearest neighbor search.    
+        Choosing from 'hnsw' for approximate nearest neighbor search or 'sklearn' for exact nearest neighbor search.
+    exact_k: `bool`, optional (default: False)
+        If True, use exactly the K passed to the function; otherwise K is determined as min(K, int(sqrt(data.shape[0]))).
 
     Returns
     -------
 
-    kNN indices and distances arrays.
+    kNN indices array, distances array, and adjusted K.
 
     Examples
     --------
-    >>> indices, distances = tools.get_neighbors(data)
+    >>> indices, distances, K = tools.get_neighbors(data)
     """
-
     rep = update_rep(rep)
     indices_key = rep + "_knn_indices"
     distances_key = rep + "_knn_distances"
 
+    k_rot = int(data.shape[0] ** 0.5) # rot, rule of thumb
+    if (K is None) or (K > k_rot and (not exact_k)):
+        K = k_rot
+        logger.info(f"in get_neighbors, K is adjusted to {K}.")
+
     if use_cache and knn_is_cached(data, indices_key, distances_key, K):
         indices = data.obsm[indices_key]
         distances = data.obsm[distances_key]
         logger.info("Found cached kNN results, no calculation is required.")
     else:
-        indices, distances = calculate_nearest_neighbors(
+        indices, distances, _ = calculate_nearest_neighbors(
             X_from_rep(data, rep, n_comps),
             K=K,
             n_jobs=eff_n_jobs(n_jobs),
             method=method,
+            exact_k=exact_k,
             random_state=random_state,
             full_speed=full_speed,
             dist=dist,
@@ -177,7 +221,7 @@ def get_neighbors(
         data.obsm[distances_key] = distances
         data.register_attr(distances_key, "knn")
 
-    return indices, distances
+    return indices, distances, K
 
 
 def get_symmetric_matrix(csr_mat: "csr_matrix") -> "csr_matrix":
@@ -239,14 +283,17 @@ def neighbors(
     n_jobs: int = -1,
     random_state: int = 0,
     full_speed: bool = False,
-    use_cache: bool = True,
+    use_cache: bool = False,
     dist: str = "l2",
     method: str = "hnsw",
+    exact_k: bool = False,
 ) -> None:
     """Compute k nearest neighbors and affinity matrix, which will be used for diffmap and graph-based community detection algorithms.
 
     The kNN calculation uses `hnswlib <https://github.com/nmslib/hnswlib>`_ introduced by [Malkov16]_.
 
+    K is determined by min(K, int(sqrt(data.shape[0]))) if exact_k == False.
+
     Parameters
     ----------
 
@@ -272,16 +319,19 @@ def neighbors(
         * If ``True``, use multiple threads in constructing ``hnsw`` index. However, the kNN results are not reproducible.
         * Otherwise, use only one thread to make sure results are reproducible.
 
-    use_cache: ``bool``, optional, default: ``True``
+    use_cache: ``bool``, optional, default: ``False``
         * If ``True`` and found cached knn results, Pegasus will use cached results and do not recompute.
         * Otherwise, compute kNN irrespective of caching status.
 
     dist: ``str``, optional (default: ``"l2"``)
-        Distance metric to use. By default, use squared L2 distance. Available options, inner product ``"ip"`` or cosine similarity ``"cosine"``.
+        Distance metric to use. By default, use squared L2 distance. Available options, ``"l2"`` or inner product ``"ip"`` or cosine similarity ``"cosine"``.
 
     method: ``str``, optional (default: ``"hnsw"``)
         Choose from "hnsw" or "sklearn". "hnsw" uses HNSW algorithm for approximate nearest neighbor search and "sklearn" uses sklearn package for exact nearest neighbor search.
 
+    exact_k: ``bool``, optional (default: ``False``)
+        If True, use exactly the K passed to the function; otherwise K is determined as min(K, int(sqrt(data.shape[0]))).
+
     Returns
     -------
     ``None``
@@ -311,6 +361,7 @@ def neighbors(
         use_cache=use_cache,
         dist=dist,
         method=method,
+        exact_k=exact_k,
     )
 
     # calculate affinity matrix
@@ -417,7 +468,7 @@ def calc_kBET(
     attr_values = data.obs[attr].values.copy()
     attr_values.categories = range(nbatch)
 
-    indices, distances = get_neighbors(
+    indices, distances, K = get_neighbors(
         data, K=K, rep=rep, n_jobs=n_jobs, random_state=random_state, use_cache=use_cache,
     )
     knn_indices = np.concatenate(
@@ -508,7 +559,7 @@ def calc_kSIM(
     assert attr in data.obs
     nsample = data.shape[0]
 
-    indices, distances = get_neighbors(
+    indices, distances, K = get_neighbors(
         data, K=K, rep=rep, n_jobs=n_jobs, random_state=random_state, use_cache=use_cache,
     )
     knn_indices = np.concatenate(
diff --git a/pegasus/tools/nmf.py b/pegasus/tools/nmf.py
index 4cc6f270..7ce54e65 100644
--- a/pegasus/tools/nmf.py
+++ b/pegasus/tools/nmf.py
@@ -418,7 +418,7 @@ def integrative_nmf(
             continue
 
         clusters = np.argmax(H_new, axis=1) # Assign cluster
-        indices, _ = calculate_nearest_neighbors(H_new, K=20, n_jobs=n_jobs, random_state=seeds[i]) # KNN with K=20
+        indices, _, _ = calculate_nearest_neighbors(H_new, K=20, n_jobs=n_jobs, random_state=seeds[i]) # KNN with K=20
         clusters, csum = _refine_cluster(clusters, indices, n_components) # Refine cluster
         csums.append(csum)
         ids_by_clusts.append(np.argsort(clusters, kind='stable'))
diff --git a/pegasus/tools/visualization.py b/pegasus/tools/visualization.py
index c1dab252..1d660ada 100644
--- a/pegasus/tools/visualization.py
+++ b/pegasus/tools/visualization.py
@@ -276,6 +276,7 @@ def umap(
     dens_var_shift: float = 0.1,
     n_jobs: int = -1,
     full_speed: bool = False,
+    use_cache: bool = True,
     random_state: int = 0,
     out_basis: str = "umap",
 ) -> None:
@@ -334,6 +335,9 @@ def umap(
         * If ``True``, use multiple threads in constructing ``hnsw`` index. However, the kNN results are not reproducible.
         * Otherwise, use only one thread to make sure results are reproducible.
 
+    use_cache: ``bool``, optional, default: ``True``
+        If use_cache and found cached knn results, will not recompute.
+
     random_state: ``int``, optional, default: ``0``
         Random seed set for reproducing results.
 
@@ -354,11 +358,7 @@ def umap(
     rep = update_rep(rep)
     X = X_from_rep(data, rep, rep_ncomps)
 
-    if data.shape[0] < n_neighbors:
-        logger.warning(f"Warning: Number of samples = {data.shape[0]} < K = {n_neighbors}!\n Set K to {data.shape[0]}.")
-        n_neighbors = data.shape[0]
-
-    knn_indices, knn_dists = get_neighbors(data, K = n_neighbors, rep = rep, n_jobs = n_jobs, random_state = random_state, full_speed = full_speed)
+    knn_indices, knn_dists, n_neighbors = get_neighbors(data, K = n_neighbors, rep = rep, n_jobs = n_jobs, random_state = random_state, full_speed = full_speed, use_cache = use_cache)
     knn_indices = np.insert(knn_indices[:, 0 : n_neighbors - 1], 0, range(data.shape[0]), axis=1)
     knn_dists = np.insert(knn_dists[:, 0 : n_neighbors - 1], 0, 0.0, axis=1)
 
@@ -539,6 +539,7 @@ def net_umap(
     select_K: int = 25,
     select_alpha: float = 1.0,
     full_speed: bool = False,
+    use_cache: bool = True,
     net_alpha: float = 0.1,
     polish_learning_rate: float = 10.0,
     polish_n_epochs: int = 30,
@@ -612,6 +613,9 @@ def net_umap(
         * If ``True``, use multiple threads in constructing ``hnsw`` index. However, the kNN results are not reproducible.
         * Otherwise, use only one thread to make sure results are reproducible.
 
+    use_cache: ``bool``, optional, default: ``True``
+        If use_cache and found cached knn results, will not recompute.
+
     net_alpha: ``float``, optional, default: ``0.1``
         L2 penalty (regularization term) parameter of the deep regressor.
 
@@ -641,7 +645,7 @@ def net_umap(
 
     rep = update_rep(rep)
     n_jobs = eff_n_jobs(n_jobs)
-    knn_indices, knn_dists = get_neighbors(data, K = select_K, rep = rep, n_jobs = n_jobs, random_state = random_state, full_speed = full_speed)
+    knn_indices, knn_dists, select_K = get_neighbors(data, K = select_K, rep = rep, n_jobs = n_jobs, random_state = random_state, full_speed = full_speed, use_cache = use_cache)
 
     selected = select_cells(
         knn_dists,
@@ -659,7 +663,7 @@ def net_umap(
 
     ds_indices_key = "ds_" + rep + "_knn_indices"  # ds refers to down-sampling
     ds_distances_key = "ds_" + rep + "_knn_distances"
-    indices, distances = calculate_nearest_neighbors(
+    indices, distances, n_neighbors = calculate_nearest_neighbors(
         X,
         K=n_neighbors,
         n_jobs=n_jobs,
@@ -702,7 +706,7 @@ def net_umap(
 
     data.obsm["X_" + out_basis + "_pred"] = Y_init
 
-    knn_indices, knn_dists = get_neighbors(data, K = n_neighbors, rep = rep, n_jobs = n_jobs, random_state = random_state, full_speed = full_speed)
+    knn_indices, knn_dists, n_neighbors = get_neighbors(data, K = n_neighbors, rep = rep, n_jobs = n_jobs, random_state = random_state, full_speed = full_speed, use_cache = use_cache)
     knn_indices = np.insert(knn_indices[:, 0 : n_neighbors - 1], 0, range(data.shape[0]), axis=1)
     knn_dists = np.insert(knn_dists[:, 0 : n_neighbors - 1], 0, 0.0, axis=1)
 
@@ -735,6 +739,7 @@ def net_fle(
     rep: str = "diffmap",
     K: int = 50,
     full_speed: bool = False,
+    use_cache: bool = True,
     target_change_per_node: float = 2.0,
     target_steps: int = 5000,
     is3d: bool = False,
@@ -778,6 +783,9 @@ def net_fle(
         * If ``True``, use multiple threads in constructing ``hnsw`` index. However, the kNN results are not reproducible.
         * Otherwise, use only one thread to make sure results are reproducible.
 
+    use_cache: ``bool``, optional, default: ``True``
+        If use_cache and found cached knn results, will not recompute.
+
     target_change_per_node: ``float``, optional, default: ``2.0``
         Target change per node to stop ForceAtlas2.
 
@@ -845,7 +853,7 @@ def net_fle(
             full_speed=full_speed,
         )
 
-    knn_indices, knn_dists = get_neighbors(data, K = select_K, rep = rep, n_jobs = n_jobs, random_state = random_state, full_speed = full_speed)
+    knn_indices, knn_dists, select_K = get_neighbors(data, K = select_K, rep = rep, n_jobs = n_jobs, random_state = random_state, full_speed = full_speed, use_cache = use_cache)
 
     selected = select_cells(
         knn_dists,
@@ -860,7 +868,7 @@ def net_fle(
 
     ds_indices_key = "ds_" + rep + "_knn_indices"
     ds_distances_key = "ds_" + rep + "_knn_distances"
-    indices, distances = calculate_nearest_neighbors(
+    indices, distances, K = calculate_nearest_neighbors(
         X, K=K, n_jobs=n_jobs, random_state=random_state, full_speed=full_speed
     )
     data.uns[ds_indices_key] = indices

From af26b006ce8b5e38c649c525afb315b806ed84ff Mon Sep 17 00:00:00 2001
From: Bo Li <li.bo@gene.com>
Date: Sat, 17 Jun 2023 22:20:11 -0700
Subject: [PATCH 25/57] Fixed a bug in nearest_neighbors

---
 pegasus/tools/nearest_neighbors.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pegasus/tools/nearest_neighbors.py b/pegasus/tools/nearest_neighbors.py
index 4ecc3169..2f2170b3 100644
--- a/pegasus/tools/nearest_neighbors.py
+++ b/pegasus/tools/nearest_neighbors.py
@@ -88,7 +88,7 @@ def calculate_nearest_neighbors(
         logger.info(f"in calculate_nearest_neighbors, K is adjusted to {K}.")
 
     if K == 1:
-        return np.zeros(0, dtype=int), np.zeros(0, dtype=np.float32), K
+        return np.zeros((nsample, 0), dtype=int), np.zeros((nsample, 0), dtype=np.float32), K
 
     n_jobs = eff_n_jobs(n_jobs)
 

From ea5f47b25768c5c4ccf861c5a6b11fff5328bd4d Mon Sep 17 00:00:00 2001
From: Bo Li <li.bo@gene.com>
Date: Sun, 18 Jun 2023 23:29:43 -0700
Subject: [PATCH 26/57] Fixed several bugs

---
 pegasus/tools/nearest_neighbors.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/pegasus/tools/nearest_neighbors.py b/pegasus/tools/nearest_neighbors.py
index 2f2170b3..fd95e4ca 100644
--- a/pegasus/tools/nearest_neighbors.py
+++ b/pegasus/tools/nearest_neighbors.py
@@ -129,7 +129,7 @@ def calculate_nearest_neighbors(
 
 
 def knn_is_cached(
-    data: MultimodalData, indices_key: str, distances_key: str, K: int, exact_k: bool
+    data: MultimodalData, indices_key: str, distances_key: str, K: int
 ) -> bool:
     return (
         (indices_key in data.obsm)
@@ -350,7 +350,7 @@ def neighbors(
 
     # calculate kNN
     rep = update_rep(rep)
-    indices, distances = get_neighbors(
+    indices, distances, K = get_neighbors(
         data,
         K=K,
         rep=rep,

From 3bca8bbaa7ce398f3fd21627f016e316ab177d62 Mon Sep 17 00:00:00 2001
From: Yiming Yang <yang.yihming@gmail.com>
Date: Wed, 21 Jun 2023 01:46:28 -0700
Subject: [PATCH 27/57] don't test louvain

---
 tests/run_pipeline.sh  | 6 +++---
 tests/test_pipeline.py | 1 -
 2 files changed, 3 insertions(+), 4 deletions(-)

diff --git a/tests/run_pipeline.sh b/tests/run_pipeline.sh
index c6e0d2eb..f3f2243f 100644
--- a/tests/run_pipeline.sh
+++ b/tests/run_pipeline.sh
@@ -1,14 +1,14 @@
 pegasus aggregate_matrix tests/data/count_matrix.csv tests/aggr
 
 if [ -f "tests/aggr.zarr.zip" ]; then
-    pegasus cluster -p 2 --min-genes 500 --max-genes 6000 --percent-mito 20.0 --output-filtration-results --output-h5ad --output-loom --plot-filtration-results --plot-hvf --correct-batch-effect --nmf --louvain --leiden --tsne --umap --fle --infer-doublets --dbl-cluster-attr louvain_labels --calc-signature-scores cell_cycle_mouse tests/aggr.zarr.zip tests/result
+    pegasus cluster -p 2 --min-genes 500 --max-genes 6000 --percent-mito 20.0 --output-filtration-results --output-h5ad --output-loom --plot-filtration-results --plot-hvf --correct-batch-effect --nmf --leiden --tsne --umap --fle --infer-doublets --dbl-cluster-attr louvain_labels --calc-signature-scores cell_cycle_mouse tests/aggr.zarr.zip tests/result
 fi
 
 if [ -f "tests/result.zarr.zip" ]; then
     pegasus de_analysis -p 2 --labels louvain_labels --t --fisher tests/result.zarr.zip tests/result.de.xlsx
     pegasus annotate_cluster --markers mouse_immune,mouse_brain tests/result.zarr.zip tests/result.anno.txt
     pegasus plot compo --groupby leiden_labels --condition Channel tests/result.zarr.zip tests/result.compo.pdf
-    pegasus plot scatter --basis umap --attributes louvain_labels,Channel tests/result.zarr.zip tests/result.louvain_labels.umap.pdf
+    pegasus plot scatter --basis umap --attributes leiden_labels,Channel tests/result.zarr.zip tests/result.leiden_labels.umap.pdf
     pegasus plot scatter --basis tsne --attributes leiden_labels,Channel tests/result.zarr.zip tests/result.leiden_labels.tsne.pdf
-    pegasus plot scatter --basis fle --attributes louvain_labels,Channel tests/result.zarr.zip tests/result.louvain_labels.fle.pdf
+    pegasus plot scatter --basis fle --attributes leiden_labels,Channel tests/result.zarr.zip tests/result.leiden_labels.fle.pdf
 fi
diff --git a/tests/test_pipeline.py b/tests/test_pipeline.py
index 4b1da926..580bbaf1 100644
--- a/tests/test_pipeline.py
+++ b/tests/test_pipeline.py
@@ -27,7 +27,6 @@ def test_qc(self):
     def test_clustering(self):
         self.assertEqual(self.data.obsm['pca_harmony_knn_indices'].shape, (1043, 99), "KNN graph shape differs!")
         self.assertEqual(self.data.obsm['pca_harmony_knn_distances'].shape, (1043, 99), "KNN distance matrix shape differs!")
-        self.assertIn('louvain_labels', self.data.obs.columns, "Louvain result is lost!")
         self.assertIn('leiden_labels', self.data.obs.columns, "Leiden result is lost!")
 
     def test_doublet_detection(self):

From fa1d0a11040e0d1d165659ab4ec0b6b4be64e03b Mon Sep 17 00:00:00 2001
From: Yiming Yang <yang.yihming@gmail.com>
Date: Fri, 23 Jun 2023 00:17:31 -0700
Subject: [PATCH 28/57] Add --exact-K option to pegasus cluster command

---
 pegasus/commands/Clustering.py | 2 ++
 pegasus/pipeline/pipeline.py   | 1 +
 tests/run_pipeline.sh          | 4 ++--
 tests/test_pipeline.py         | 4 ++--
 4 files changed, 7 insertions(+), 4 deletions(-)

diff --git a/pegasus/commands/Clustering.py b/pegasus/commands/Clustering.py
index 9d8611b2..7b6d7e90 100644
--- a/pegasus/commands/Clustering.py
+++ b/pegasus/commands/Clustering.py
@@ -68,6 +68,7 @@ class Clustering(Base):
   --nmf-n <number>                                 Number of NMF components. IF iNMF is used for batch correction, this parameter also sets iNMF number of components. [default: 20]
 
   --knn-K <number>                                 Number of nearest neighbors for building kNN graph. [default: 100]
+  --exact-K                                        If use exactly the K passed to the function; otherwise K is determined as min(K, sqrt(X.shape[0])).
   --knn-full-speed                                 For the sake of reproducibility, we only run one thread for building kNN indices. Turn on this option will allow multiple threads to be used for index building. However, it will also reduce reproducibility due to the racing between multiple threads.
 
   --kBET                                           Calculate kBET.
@@ -210,6 +211,7 @@ def execute(self):
             "nmf": self.args["--nmf"],
             "nmf_n": int(self.args["--nmf-n"]),
             "K": int(self.args["--knn-K"]),
+            "exact_K": self.args["--exact-K"],
             "full_speed": self.args["--knn-full-speed"],
             "kBET": self.args["--kBET"],
             "kBET_batch": self.args["--kBET-batch"],
diff --git a/pegasus/pipeline/pipeline.py b/pegasus/pipeline/pipeline.py
index 34626967..5ede69b3 100644
--- a/pegasus/pipeline/pipeline.py
+++ b/pegasus/pipeline/pipeline.py
@@ -92,6 +92,7 @@ def analyze_one_modality(unidata: UnimodalData, output_name: str, is_raw: bool,
         tools.neighbors(
             unidata,
             K=kwargs["K"],
+            exact_k=kwargs["exact_K"],
             rep=dim_key,
             n_jobs=kwargs["n_jobs"],
             random_state=kwargs["random_state"],
diff --git a/tests/run_pipeline.sh b/tests/run_pipeline.sh
index f3f2243f..8c516f2f 100644
--- a/tests/run_pipeline.sh
+++ b/tests/run_pipeline.sh
@@ -1,11 +1,11 @@
 pegasus aggregate_matrix tests/data/count_matrix.csv tests/aggr
 
 if [ -f "tests/aggr.zarr.zip" ]; then
-    pegasus cluster -p 2 --min-genes 500 --max-genes 6000 --percent-mito 20.0 --output-filtration-results --output-h5ad --output-loom --plot-filtration-results --plot-hvf --correct-batch-effect --nmf --leiden --tsne --umap --fle --infer-doublets --dbl-cluster-attr louvain_labels --calc-signature-scores cell_cycle_mouse tests/aggr.zarr.zip tests/result
+    pegasus cluster -p 2 --min-genes 500 --max-genes 6000 --percent-mito 20.0 --output-filtration-results --output-h5ad --output-loom --plot-filtration-results --plot-hvf --exact-K --correct-batch-effect --nmf --leiden --tsne --umap --fle --infer-doublets --dbl-cluster-attr leiden_labels --calc-signature-scores cell_cycle_mouse tests/aggr.zarr.zip tests/result
 fi
 
 if [ -f "tests/result.zarr.zip" ]; then
-    pegasus de_analysis -p 2 --labels louvain_labels --t --fisher tests/result.zarr.zip tests/result.de.xlsx
+    pegasus de_analysis -p 2 --labels leiden_labels --t --fisher tests/result.zarr.zip tests/result.de.xlsx
     pegasus annotate_cluster --markers mouse_immune,mouse_brain tests/result.zarr.zip tests/result.anno.txt
     pegasus plot compo --groupby leiden_labels --condition Channel tests/result.zarr.zip tests/result.compo.pdf
     pegasus plot scatter --basis umap --attributes leiden_labels,Channel tests/result.zarr.zip tests/result.leiden_labels.umap.pdf
diff --git a/tests/test_pipeline.py b/tests/test_pipeline.py
index 580bbaf1..3e8b7f3b 100644
--- a/tests/test_pipeline.py
+++ b/tests/test_pipeline.py
@@ -59,9 +59,9 @@ def test_annotation(self):
 
     def test_plot(self):
         self.assertIn('result.compo.pdf', os.listdir('tests'), "Composition plot is lost!")
-        self.assertIn('result.louvain_labels.umap.pdf', os.listdir('tests'), "UMAP plot is lost!")
+        self.assertIn('result.leiden_labels.umap.pdf', os.listdir('tests'), "UMAP plot is lost!")
         self.assertIn('result.leiden_labels.tsne.pdf', os.listdir('tests'), "tSNE plot is lost!")
-        self.assertIn('result.louvain_labels.fle.pdf', os.listdir('tests'), 'FLE plot is lost!')
+        self.assertIn('result.leiden_labels.fle.pdf', os.listdir('tests'), 'FLE plot is lost!')
 
     def test_output(self):
         data_h5ad = pg.read_input("tests/result.mm10-rna.h5ad")

From 7c6aa5b910494962003998e114a982a363b15cb1 Mon Sep 17 00:00:00 2001
From: Bo Li <li.bo@gene.com>
Date: Sat, 8 Jul 2023 23:20:56 -0700
Subject: [PATCH 29/57] fixed a typo

---
 pegasus/annotate_cluster/human_immune_cell_markers.json | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/pegasus/annotate_cluster/human_immune_cell_markers.json b/pegasus/annotate_cluster/human_immune_cell_markers.json
index fd2afc23..feb55f0a 100644
--- a/pegasus/annotate_cluster/human_immune_cell_markers.json
+++ b/pegasus/annotate_cluster/human_immune_cell_markers.json
@@ -132,8 +132,7 @@
 								"comment" : "Markers derived from Immune Cell Atlas PBMC data"
 							}
  						]
-					},
-
+					}
 				]
 			}
 		},

From 7d689b25bf0487e5871749e0fcb1e230a600090b Mon Sep 17 00:00:00 2001
From: Bo Li <li.bo@gene.com>
Date: Wed, 12 Jul 2023 00:47:23 -0400
Subject: [PATCH 30/57] Fixed a typo

---
 pegasus/annotate_cluster/human_lung_cell_markers.json | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pegasus/annotate_cluster/human_lung_cell_markers.json b/pegasus/annotate_cluster/human_lung_cell_markers.json
index 9166aa1a..8149071b 100644
--- a/pegasus/annotate_cluster/human_lung_cell_markers.json
+++ b/pegasus/annotate_cluster/human_lung_cell_markers.json
@@ -90,7 +90,7 @@
 		},
 
 		{
-			"name" : "Submucosal gland serous cel",
+			"name" : "Submucosal gland serous cell",
 			"markers" : [
 				{
 					"genes" : ["PRR4+", "TCN1+", "C6orf58+", "PRB3+", "LPO+", "PRB1+", "PRH2+", "PRH1+", "ODAM+"],

From 606c1653ef7298ad3d4e30b14a1c059abbaa34af Mon Sep 17 00:00:00 2001
From: Yiming Yang <yang.yihming@gmail.com>
Date: Tue, 18 Jul 2023 00:31:42 -0700
Subject: [PATCH 31/57] Raise warning instead of exception for attributes not
 in data

---
 pegasus/plotting/plot_library.py | 21 +++++++++++++++++++--
 1 file changed, 19 insertions(+), 2 deletions(-)

diff --git a/pegasus/plotting/plot_library.py b/pegasus/plotting/plot_library.py
index f5f236e8..fb1083a6 100644
--- a/pegasus/plotting/plot_library.py
+++ b/pegasus/plotting/plot_library.py
@@ -152,6 +152,25 @@ def scatter(
     elif not is_list_like(attrs):
         attrs = [attrs]
 
+    # Select only valid attributes
+    attrs_filt = []
+    attrs_drop = []
+    for attr in attrs:
+        if (attr == '_all') or (attr in data.obs) or (attr in data.var_names) or ('@' in attr):
+            if not '@' in attr:
+                attrs_filt.append(attr)
+            else:
+                obsm_key, sep, component = attr.partition("@")
+                if (sep != "@") or (obsm_key not in data.obsm) or (not component.isdigit()):
+                    attrs_drop.append(attr)
+                else:
+                    attrs_filt.append(attr)
+        else:
+            attrs_drop.append(attr)
+    attrs = attrs_filt
+    if len(attrs_drop) > 0:
+        print(f"Warning: Attributes {attrs_drop} are not in data.obs, data.var_names or data.obsm!")
+
     if isinstance(basis, str):
         basis = [basis]
     if isinstance(components, tuple):
@@ -236,8 +255,6 @@ def scatter(
                     values = slicing(data.X, col = loc)
                 else:
                     obsm_key, sep, component = attr.partition("@")
-                    if (sep != "@") or (obsm_key not in data.obsm) or (not component.isdigit()):
-                        raise KeyError(f"{attr} is not in data.obs, data.var_names or data.obsm!")
                     values = data.obsm[obsm_key][:, int(component)]
 
                 selected = restr_obj.get_satisfied(data, attr)

From 6d7b831addfca69660b852badfbf974af1af6449 Mon Sep 17 00:00:00 2001
From: Bo Li <li.bo@gene.com>
Date: Thu, 20 Jul 2023 01:19:04 -0700
Subject: [PATCH 32/57] Added a third manual correction option (threshold) for
 doublet detection; Separate LEC from VEC for human_lung_cell_markers.json;
 Updated human immune and mouse immune markers for B and plasma cells

---
 .../human_immune_cell_markers.json            | 45 +++++-------
 .../human_lung_cell_markers.json              | 37 +++++-----
 .../mouse_immune_cell_markers.json            | 73 ++++++++++++++++++-
 pegasus/data_files/human_lung.gmt             |  5 +-
 pegasus/tools/doublet_detection.py            | 27 +++++--
 5 files changed, 132 insertions(+), 55 deletions(-)

diff --git a/pegasus/annotate_cluster/human_immune_cell_markers.json b/pegasus/annotate_cluster/human_immune_cell_markers.json
index feb55f0a..f7eff6fa 100644
--- a/pegasus/annotate_cluster/human_immune_cell_markers.json
+++ b/pegasus/annotate_cluster/human_immune_cell_markers.json
@@ -141,9 +141,9 @@
 			"name" : "Natural killer cell",
 			"markers" : [
 				{
-					"genes" : ["GNLY+", "KLRF1+", "KLRD1+", "TRDC+", "IL2RB+", "KLRC1+"],
+					"genes" : ["GNLY+", "KLRF1+", "KLRD1+", "TRDC+", "IL2RB+", "KLRC1+", "NCR1+"],
 					"weight" : 0.6,
-					"comment" : "General NK cell markers also cover some T cells; derived from Immune Cell Atlas PBMC, BMMC and CB-MNC data"
+					"comment" : "General NK cell markers also cover some T cells; derived from Immune Cell Atlas PBMC, BMMC and CB-MNC data; Added NCR1, a pan NK cell marker"
 				},
 				{
 					"genes" : ["NCAM1+", "FCGR3A+"],
@@ -188,20 +188,15 @@
 			"name" : "B cell",
 			"markers" : [
 				{
-					"genes" : ["CD19+", "MS4A1+", "CD79A+", "CD79B+"],
-					"weight" : 0.7,
-					"comment" : "CD19, CD20 and CD79"
-				},
-				{
-					"genes" : ["BANK1+", "BLK+"],
-					"weight" : 0.2,
-					"comment" : "Extra B cell markers"
+					"genes" : ["MS4A1+", "CD79A+", "CD79B+", "CD19+", "BANK1+", "TNFRSF13C+", "CD22+", "BLK+", "FCRLA+", "HLA-DOB+"],
+					"weight" : 0.9,
+					"comment" : "Human and mouse shared B cell markers derived from Immune Cell Atlas PBMC, BMMC and CB-MNC data; TNFRSF13C (BAFF receptor); CD79A, CD79B, CD19, BLK, FCRLA and HLA-DOB are also expressed in Plasma cells; CD79B in addition is expressed in CD16+ monocytes & HSCs; BANK1 & BLK are expressed higher in memory B"
 				},
 				{
-					"genes" : ["CD74+", "HLA-DRA+", "HLA-DRB1+", "HLA-DPA1+", "HLA-DPB1+", "HLA-DQA1+", "HLA-DQB1+"],
+					"genes" : ["LINC00926+", "VPREB3+"],
 					"weight" : 0.1,
-					"comment" : "MHC II"
-				}
+					"comment" : "B cell markers derived from Immune Cell Atlas PBMC, BMMC and CB-MNC data"
+				}			
 			],
 			"subtypes" : {
 				"title" : "B cell subtype markers",
@@ -235,9 +230,9 @@
 						"name" : "Naive B cell",
 						"markers" : [
 							{
-								"genes" : ["IGHD+", "TCL1A+", "FCER2+"],
+								"genes" : ["IGHD+", "TCL1A+", "FCER2+", "IL4R+", "PLPP5+"],
 								"weight" : 1.0,
-								"comments" : "markers for naive B cell, collected from Fig. 4B of Massoni-Badosa et al. Tonsil Atlas paper. Validated using ICA pbmc data"
+								"comments" : "Markers for naive B cell derived from Immune Cell Atlas PBMC, BMMC and CB-MNC data; IGHD & FCER2 are shared with mouse"
 							}
 						]
 					},
@@ -245,9 +240,9 @@
 						"name" : "Memory B cell",
 						"markers" : [
 							{
-								"genes" : ["CD27+", "TNFRSF13B+"],
+								"genes" : ["IGHA1+", "IGHG1+", "CD27+", "TNFRSF13B+", "CLECL1P+", "AIM2+", "LGALS1+", "CRIP1+"],
 								"weight" : 1.0,
-								"comments" : "markers for memory B cell, collected from Fig. 4B of Massoni-Badosa et al. Tonsil Atlas paper. Validated using ICA pbmc data"
+								"comments" : "Markers for memory B cell derived from Immune Cell Atlas PBMC, BMMC and CB-MNC data"
 							}
 						]
 					}
@@ -295,23 +290,23 @@
 			"name" : "Plasma cell",
 			"markers" : [
 				{
-					"genes" : ["CD38+", "XBP1+", "CD27+", "SLAMF7+"],
-					"weight" : 0.4,
-					"comment" : "important markers"
+					"genes" : ["TNFRSF17+", "PRDM1+", "SLAMF7+", "IRF4+", "SDC1+"],
+					"weight" : 0.5,
+					"comment" : "Human and mouse shared markers derived from Immune Cell Atlas PBMC, BMMC and CB-MNC data; TNFRSF17 (BCMA), PRDM1 (BLIMP1); SDC1 is highly expressed in BMMC but not PBMC"
 				},
 				{
-					"genes" : ["TNFRSF17+", "TNFRSF13B+"],
+					"genes" : ["IGHA1+", "IGHG1+", "TNFRSF13B+"],
 					"weight" : 0.2,
-					"comment" : "TNF-receptor superfamily"
+					"comment" : "Markers expressed by both plasma and memory B cells, derived from Immune Cell Atlas PBMC, BMMC and CB-MNC data; IGHA1 & IGHG1 indicate class switch"
 				},
 				{
-					"genes" : ["IGHA1+", "IGHG1+"],
+					"genes" : ["CD38+", "ABCB9+", "CHPF+", "PLAAT2+"],
 					"weight" : 0.2,
-					"comment" : "class switching happened"
+					"comment" : "Human-specific plasma markers derived from Immune Cell Atlas PBMC, BMMC and CB-MNC data; PLAAT2 is highly expressed in PBMC but not BMMC"
 				},
 				{
 					"genes" : ["MS4A1-"],
-					"weight" : 0.2,
+					"weight" : 0.1,
 					"comment" : "not B cell, doi: https://doi.org/10.1182/bloodadvances.2017004481, long-live plasma can still express CD19"
 				}				
 			]
diff --git a/pegasus/annotate_cluster/human_lung_cell_markers.json b/pegasus/annotate_cluster/human_lung_cell_markers.json
index 8149071b..d138a8d9 100644
--- a/pegasus/annotate_cluster/human_lung_cell_markers.json
+++ b/pegasus/annotate_cluster/human_lung_cell_markers.json
@@ -117,14 +117,9 @@
 			"name" : "Vascular endothelial cell",
 			"markers" : [
 				{
-					"genes" : ["PECAM1+", "CLDN5+", "CDH5+", "ERG+"],
-					"weight" : 0.2,
-					"comment" : "Markers for endothelial cells, from Schupp et al. Circulation 2021"
-				},
-				{
-					"genes" : [ "ENG+", "PCDH17+", "CLEC14A+", "ESAM+", "ITM2A+", "BMPR2+", "FLT1+", "ADGRL4+", "SLCO2A1+", "AQP1+", "EPAS1+", "ADGRL2+", "IFI27+"],
-					"weight" : 0.8,
-					"comment" : "Common vascular EC markers from Schupp et al. Circulation 2021 and ADGRL2"
+					"genes" : ["PECAM1+", "CLDN5+", "CDH5+", "ERG+", "ICAM2+", "CLEC14A+", "ITM2A+", "ADGRL4+", "SLCO2A1+", "IFI27+"],
+					"weight" : 1.0,
+					"comment" : "Markers for vascular endothelial cells, validated using Travaglini et al. Nature 2020 and Schupp et al. Circulation 2021 data"
 				}
 			],
 			"subtypes" : {
@@ -179,21 +174,27 @@
 								"comment" : "Markers inferred from Travaglini et al. Nature 2020 and confirmed by Schupp et al. Circulation 2021"
 							}
 						]
-					},
-					{
-						"name" : "EC lymphatic",
-						"markers" : [
-							{
-								"genes" : ["CCL21+", "TFF3+", "PDPN+", "PROX1+", "GPM6A+", "SEMA3D+", "TBX1+", "RELN+"],
-								"weight" : 1.0,
-								"comment" : "Markers inferred from Travaglini et al. Nature 2020 and confirmed by Schupp et al. Circulation 2021"
-							}
-						]
 					}
 				]
 			}
 		},
 
+		{
+			"name" : "Lymphatic endothelial cell",
+			"markers" : [
+				{
+					"genes" : ["PECAM1+", "CLDN5+", "ERG+", "CDH5+"],
+					"weight" : 0.2,
+					"comment" : "Pan endothelial cell markers, validated using Travaglini et al. Nature 2020 and Schupp et al. Circulation 2021 data"
+				},
+				{
+					"genes" : ["CCL21+", "TFF3+", "PDPN+", "PROX1+", "LYVE1+", "FLT4+", "GPM6A+", "SEMA3D+", "TBX1+", "RELN+"],
+					"weight" : 0.8,
+					"comment" : "LEC markers inferred from Travaglini et al. Nature 2020 and confirmed by Schupp et al. Circulation 2021"
+				}
+			]
+		}
+
 
 
 		{
diff --git a/pegasus/annotate_cluster/mouse_immune_cell_markers.json b/pegasus/annotate_cluster/mouse_immune_cell_markers.json
index bb3ac649..4fe9acae 100644
--- a/pegasus/annotate_cluster/mouse_immune_cell_markers.json
+++ b/pegasus/annotate_cluster/mouse_immune_cell_markers.json
@@ -82,9 +82,65 @@
 			"name" : "B cell",
 			"markers" : [
 				{
-					"genes" : ["Cd19+", "Ms4a1+", "Cd79a+", "Cd79b+", "Ebf1+", "Pax5+", "Fcmr+", "Bank1+"],
-					"weight" : 1.0,
-					"comment" : "B cell markers from Kaptein et al. Cell 2022 and Hurskainen et al. Nat. Commun. 2021"
+					"genes" : ["Cd79a+", "Cd79b+", "Ms4a1+", "Cd19+", "H2-Ob+", "Tnfrsf13c+", "Bank1+", "Blk+", "Fcrla+", "Cd22+"],
+					"weight" : 0.91,
+					"comment" : "Human and mouse shared B cell markers; validated using Tabula Muris marrow (Tabula Muris Consortium et al. Nature 2020), Kaptein et al. Cell 2022 and Hurskainen et al. Nat. Commun. 2021 data; Ebf1, Pax5 and Fcmr are good markers for mouse lung and liver but not marrow, tissue-specific marker?"
+				},
+				{
+					"genes" : ["Cxcr5+"],
+					"weight" : 0.09,
+					"comment" : "CXCR5 is constantly expressed by mature B cells and helps to guide B cells to follicle; fDC expresses CXCL13, the ligand for CXCR5; this marker expresses lowly in human but higher in mouse "
+				}
+			],
+			"subtypes" : {
+				"title" : "B cell subtype markers",
+				"cell_types" : [
+					{
+						"name" : "Naive B cell",
+						"markers" : [
+							{
+								"genes" : ["Ighd+", "Fcer2a+", "Vpreb3+", "Fcrl1+", "Chchd10+"],
+								"weight" : 1.0,
+								"comments" : "Markers for naive B cell derived from Tabula Muris marrow (Tabula Muris Consortium et al. Nature 2020) & Kaptein et al. Cell 2022; Ighd & Fcer2a are shared with human"
+							}
+						]
+					},
+					{
+						"name" : "Memory B cell",
+						"markers" : [
+							{
+								"genes" : ["Zbtb32+", "C130026I21Rik+", "Pdlim1+", "Hepacam2+", "Igha+"],
+								"weight" : 0.8,
+								"comments" : "Markers for memory B cell derived from Tabula Muris marrow (Tabula Muris Consortium et al. Nature 2020) data; need to check and add Ighg related genes"
+							},
+							{
+								"genes" : ["Nt5e+", "Cd80+", "Fas+", "Pdcd1lg2+"],
+								"weight" : 0.2,
+								"comments" : "Traditional mouse memory B cell validated by Tabula Muris marrow (Tabula Muris Consortium et al. Nature 2020) data; all lowly expressed; Nt5e (5' Nucleotidase/CD73), Fas (CD95), Pdcd1lg2 (PD-L2/CD273)"
+							}
+						]
+					}
+				]
+			}
+		},
+
+		{
+			"name" : "Plasma cell",
+			"markers" : [
+				{
+					"genes" : ["Sdc1+", "Slamf7+", "Tnfrsf17+", "Irf4+", "Prdm1+"],
+					"weight" : 0.5,
+					"comment" : "Plasma cell markers shared with human and validated using Tabula Muris marrow data (Tabula Muris Consortium et al. Nature 2020)"
+				},
+				{
+					"genes" : ["Derl3+", "Chst1+", "Eaf2+", "Oosp1+", "Cacna1s+"],
+					"weight" : 0.4,
+					"comment" : "Mouse-specific plasma cell markers validated using Tabula Muris marrow data (Tabula Muris Consortium et al. Nature 2020)"
+				},
+				{
+					"genes" : ["Xbp1+", "Slc3a2+", "Ly6k+"],
+					"weight" : 0.1,
+					"comment" : "Traditional mouse plasma markers (not ideal) validated using Tabula Muris marrow data (Tabula Muris Consortium et al. Nature 2020); Xbp1 & Slc3a2 (CD98) expressed highest in plasma but also expressed in other cell types"
 				}
 			]
 		},
@@ -127,6 +183,17 @@
 			]
 		},
 
+		{
+			"name" : "Macrophage",
+			"markers" : [
+				{
+					"genes" : ["Cd14+", "Ms4a7+", "Cx3cr1+", "Trem2+", "Hpgds+"],
+					"weight" : 1.0,
+					"comment" : "Macrophage markers from Kaptein et al. Cell 2022"
+				}
+			]
+		},
+		
 		{
 			"name" : "Conventional type 1 dendritic cell",
 			"markers" : [
diff --git a/pegasus/data_files/human_lung.gmt b/pegasus/data_files/human_lung.gmt
index 3559ccb9..6f488d48 100644
--- a/pegasus/data_files/human_lung.gmt
+++ b/pegasus/data_files/human_lung.gmt
@@ -1,12 +1,13 @@
 Epithelial	Epithelial markers from HTAPP paper	KRT8	KRT18	EPCAM	CD24
-Endothelial	Endothelial shared markers from Xing et al. Science Advances 2021, Tony et al. Nature 2021 and Schupp et al. Circulation 2021	PECAM1	CLDN5	CDH5	ERG
+VEC	Vascular endothelial cell markers from Travaglini et al. Nature 2020 and Schupp et al. Circulation 2021	PECAM1	CLDN5	CDH5	ERG	ICAM2	CLEC14A	ITM2A	ADGRL4	SLCO2A1	IFI27
+LEC	Lymphatic endothelial cell markers from Travaglini et al. Nature 2020 and Schupp et al. Circulation 2021	PECAM1	CLDN5	ERG	CDH5	CCL21	TFF3	PDPN	PROX1	LYVE1	FLT4	GPM6A	SEMA3D	TBX1	RELN	
 Fibroblast	Fibroblast/Myofibroblast shared markers from Travaglini et al.	COL1A1	COL1A2	PDGFRA	ELN	BGN
 Macrophage	Macro	CD68	CD163	C1QA	MRC1	MS4A6A	MSR1	MERTK
 SMC	SMC from Muus et al., Braga et al. and Schupp et al.	MYH11	TAGLN	ACTG2	CNN1	PLN
 Pericyte	Pericyte from Schupp et al. and Travaglini et al.	TRPC6	CSPG4	FAM162B	GJA4	GJC1	HIGD1B	CDH6	LAMC3	FHL5
 T cell	T cell markers	CD3D	CD3E	CD3G	TRAC
 B cell	B cell markers	CD19	MS4A1	CD79A	CD79B
-Plasma cell	Plasma cell markers	CD38	XBP1	CD27	SLAMF7	TNFRSF17	TNFRSF13B
+Plasma cell	Plasma cell markers from ICA	TNFRSF17	PRDM1	SLAMF7	IRF4	SDC1	IGHA1	IGHG1	TNFRSF13B	CD38	ABCB9	CHPF	PLAAT2
 Mast cell	Mast cell markers	KIT	CPA3	TPSB2	TPSAB1	AREG	RGS1	RGS2
 ProNeu	Pro-Neutrophil markers validated using 10x public whole blood dataset	DEFA3	DEFA4	AZU1	MS4A3	ELANE	SLPI	CEACAM6	RNASE3	PRTN3	MPO	AC104232.1	CTSG
 PreNeu	Pre-Neutrophil markers validated using 10x public whole blood dataset	LTF	LCN2	MMP8	CRISP3	CAMP	PGLYRP1	CD177	HP
diff --git a/pegasus/tools/doublet_detection.py b/pegasus/tools/doublet_detection.py
index 5737af2d..f9f69393 100644
--- a/pegasus/tools/doublet_detection.py
+++ b/pegasus/tools/doublet_detection.py
@@ -267,7 +267,7 @@ def _run_scrublet(
         If True, plot diagnostic histograms. Each sample would have a figure consisting of 4 panels showing histograms of doublet scores for observed cells (panel 1, density in log scale), simulated doublets (panel 2, density in log scale), KDE plot (panel 3) and signed curvature plot (panel 4) of log doublet scores for simulated doublets.
 
     manual_correction: ``str``, optional, default: ``None``
-        If present, use human guide provided in manual_correction to select threshold. Currently support 'peak' and 'expected'. 'peak' means cutting at the center of the peak and 'expected' means cutting at the expected doublet rate.
+        If present, use the human guide provided in manual_correction to select the threshold. Currently supports 'peak', 'expected' and a numeric threshold. 'peak' means cutting at the center of the peak and 'expected' means cutting at the expected doublet rate. If the guide is neither of the two, it is converted to float and used as the user-specified threshold.
 
     Returns
     --------
@@ -420,6 +420,8 @@ def _run_scrublet(
             threshold = np.exp(x[maxima_by_x[-1]])
         elif manual_correction == "expected":
             threshold = threshold_theory
+        else:
+            threshold = float(manual_correction)
 
     data.obs["doublet_score"] = obs_scores.astype(np.float32)
     data.obs["pred_dbl"] = obs_scores > threshold
@@ -474,7 +476,7 @@ def infer_doublets(
     data: MultimodalData,
     channel_attr: Optional[str] = None,
     clust_attr: Optional[str] = None,
-    raw_mat_key: Optional[str] = 'counts',
+    raw_mat_key: Optional[str] = None,
     min_cell: Optional[int] = 100,
     expected_doublet_rate: Optional[float] = None,
     sim_doublet_ratio: Optional[float] = 2.0,
@@ -501,6 +503,9 @@ def infer_doublets(
     clust_attr: ``str``, optional, default: None
         Attribute indicating cluster labels. If set, estimate proportion of doublets in each cluster and statistical significance.
 
+    raw_mat_key: ``str``, optional, default: None
+        The key for the raw count matrix. By default, Pegasus will first try "counts" and then try "raw.X".
+
     min_cell: ``int``, optional, default: 100
         Minimum number of cells per sample to calculate doublet scores. For samples having less than 'min_cell' cells, doublet score calculation will be skipped.
 
@@ -529,7 +534,7 @@ def infer_doublets(
         If not None, plot diagnostic histograms using ``plot_hist`` as the prefix. If `channel_attr` is None, ``plot_hist.dbl.png`` is generated; Otherwise, ``plot_hist.channel_name.dbl.png`` files are generated. Each figure consists of 4 panels showing histograms of doublet scores for observed cells (panel 1, density in log scale), simulated doublets (panel 2, density in log scale), KDE plot (panel 3) and signed curvature plot (panel 4) of log doublet scores for simulated doublets. Each plot contains two dashed lines. The red dashed line represents the theoretical cutoff (calucalted based on number of cells and 10x doublet table) and the black dashed line represents the cutof inferred from the data.
     
     manual_correction: ``str``, optional, default: ``None``
-        Use human guide to correct doublet threshold for certain channels. This is string representing a comma-separately list. Each item in the list represent one sample and the sample name and correction guide are separated using ':'. The orrection guides supported are 'peak' and 'expected'. 'peak' means cutting at the center of the peak and 'expected' means cutting at the expected doublet rate. If only one sample available, use '' as the sample name.
+        Use human guide to correct doublet threshold for certain channels. This is a string representing a comma-separated list. Each item in the list represents one sample, and the sample name and correction guide are separated using ':'. The correction guides supported are 'peak', 'expected' and threshold. 'peak' means cutting at the center of the peak; 'expected' means cutting at the expected doublet rate; threshold is the user-specified doublet threshold; if the guide is neither 'peak' nor 'expected', pegasus will try to convert the string into float and use it as doublet threshold. If ``channel_attr`` is None (only one sample), pass the guide alone without a sample name.
 
     Returns
     -------
@@ -545,6 +550,11 @@ def infer_doublets(
     >>> pg.infer_doublets(data, channel_attr = 'Channel', clust_attr = 'Annotation')
     """
     assert data.get_modality() == "rna"
+
+    if raw_mat_key is None:
+        raw_mat_key = 'counts'
+        if raw_mat_key not in data.list_keys():
+            raw_mat_key = 'raw.X'
     try:
         rawX = data.get_matrix(raw_mat_key)
     except ValueError:
@@ -554,10 +564,13 @@ def infer_doublets(
 
     mancor = {}
     if manual_correction is not None:
-        for item in manual_correction.split(','):
-            name, action = item.split(':')
-            mancor[name] = action
-
+        if channel_attr is None:
+            mancor[''] = manual_correction
+        else:
+            for item in manual_correction.split(','):
+                name, action = item.split(':')
+                mancor[name] = action
+            
     if channel_attr is None:
         if data.shape[0] >= min_cell:
             fig = _run_scrublet(data, raw_mat_key, expected_doublet_rate = expected_doublet_rate, sim_doublet_ratio = sim_doublet_ratio, \

From 6990c47f099262439f0788dfcc078e2517055e1d Mon Sep 17 00:00:00 2001
From: Yiming Yang <yang.yihming@gmail.com>
Date: Fri, 21 Jul 2023 11:46:04 -0700
Subject: [PATCH 33/57] release notes for v1.8.0

---
 docs/api/index.rst                 |  1 +
 docs/conf.py                       |  8 ++++----
 docs/index.rst                     |  2 +-
 docs/release_notes.rst             |  5 +++++
 docs/release_notes/version_1_8.rst | 16 ++++++++++++++++
 5 files changed, 27 insertions(+), 5 deletions(-)
 create mode 100644 docs/release_notes/version_1_8.rst

diff --git a/docs/api/index.rst b/docs/api/index.rst
index 4311a096..540af78e 100644
--- a/docs/api/index.rst
+++ b/docs/api/index.rst
@@ -85,6 +85,7 @@ Cluster Algorithms
     cluster
     louvain
     leiden
+    split_one_cluster
     spectral_louvain
     spectral_leiden
 
diff --git a/docs/conf.py b/docs/conf.py
index 5ca8a416..8ec108a1 100644
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -23,22 +23,22 @@
 # -- Project information -----------------------------------------------------
 
 project = "Pegasus"
-copyright = "2022 Genentech, Inc. All rights reserved."
+copyright = "2023 Genentech, Inc. All rights reserved."
 author = (
     "Yiming Yang, Joshua Gould and Bo Li"
 )
 
 # The short X.Y version
-version = "1.7"
+version = "1.8"
 # The full version, including alpha/beta/rc tags
-release = "1.7.1"
+release = "1.8.0"
 
 
 # -- General configuration ---------------------------------------------------
 
 # If your documentation needs a minimal Sphinx version, state it here.
 #
-#needs_sphinx = '1.7'
+#needs_sphinx = '1.8'
 
 # Add any Sphinx extension module names here, as strings. They can be
 # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
diff --git a/docs/index.rst b/docs/index.rst
index 9289f3c9..b1893bf2 100644
--- a/docs/index.rst
+++ b/docs/index.rst
@@ -10,7 +10,7 @@
 Release Highlights in Current Stable
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
-.. include:: release_notes/version_1_7.rst
+.. include:: release_notes/version_1_8.rst
 
 .. toctree::
    :maxdepth: 1
diff --git a/docs/release_notes.rst b/docs/release_notes.rst
index 407a6f5c..4adc7f23 100644
--- a/docs/release_notes.rst
+++ b/docs/release_notes.rst
@@ -6,6 +6,11 @@ Release Notes
 .. note::
     Also see the release notes of `PegasusIO <https://pegasusio.readthedocs.io/en/stable/release_notes.html>`__.
 
+Version 1.8
+~~~~~~~~~~~~~
+
+.. include:: release_notes/version_1_8.rst
+
 Version 1.7
 ~~~~~~~~~~~~~
 
diff --git a/docs/release_notes/version_1_8.rst b/docs/release_notes/version_1_8.rst
new file mode 100644
index 00000000..e81da4a4
--- /dev/null
+++ b/docs/release_notes/version_1_8.rst
@@ -0,0 +1,16 @@
+1.8.0 :small:`July 21, 2023`
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+**New Feature and Improvement**
+
+* Update ``human_immune`` and ``human_lung`` marker sets.
+* Add ``mouse_liver`` marker set.
+* Add `split_one_cluster <./api/pegasus.split_one_cluster.html>`_ function to subcluster one cluster into a specified number of subclusters.
+* Update **neighbors** function to set ``use_cache=False`` by default, and adjust K to ``min(K, int(sqrt(n_samples)))``. [PR `272 <https://github.com/lilab-bcb/pegasus/pull/272>`_]
+* In **infer_doublets** function, argument ``manual_correction`` now accepts a float number threshold specified by users for cut-off. [PR `275 <https://github.com/lilab-bcb/pegasus/pull/275>`_]
+
+**Bug Fix**
+
+* Fix divide by zero issue in ``integrative_nmf`` function. [PR `258 <https://github.com/lilab-bcb/pegasus/pull/258>`_]
+* Compatibility with Pandas v2.0. [PR `261 <https://github.com/lilab-bcb/pegasus/pull/261>`_]
+* Allow ``infer_doublets`` to use any count matrix with key name specified by users. [PR `268 <https://github.com/lilab-bcb/pegasus/pull/268>`_ Thanks to `Donghoon Lee <https://github.com/hoondy>`_]

From 8714e025aa80d4f7d83ecee7fe1dcabf649f3a37 Mon Sep 17 00:00:00 2001
From: Yiming Yang <yang.yihming@gmail.com>
Date: Fri, 21 Jul 2023 11:53:19 -0700
Subject: [PATCH 34/57] build wheel for py3.10

---
 setup.py                             | 1 +
 wheel_build/build_wheel_for_linux.sh | 2 +-
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/setup.py b/setup.py
index b353e796..6b679097 100644
--- a/setup.py
+++ b/setup.py
@@ -39,6 +39,7 @@
         "Programming Language :: Python :: 3.7",
         "Programming Language :: Python :: 3.8",
         "Programming Language :: Python :: 3.9",
+        "Programming Language :: Python :: 3.10",
     ],
     keywords="single cell/nucleus genomics analysis",
     packages=find_packages(),
diff --git a/wheel_build/build_wheel_for_linux.sh b/wheel_build/build_wheel_for_linux.sh
index 792a1d48..a5cceadb 100755
--- a/wheel_build/build_wheel_for_linux.sh
+++ b/wheel_build/build_wheel_for_linux.sh
@@ -11,7 +11,7 @@ function repair_wheel {
     fi
 }
 
-declare -a PythonVersions=("cp37-cp37m" "cp38-cp38" "cp39-cp39")
+declare -a PythonVersions=("cp37-cp37m" "cp38-cp38" "cp39-cp39" "cp310-cp310")
 
 for val in ${PythonVersions[@]}; do
     /opt/python/$val/bin/pip install -r /src/requirements.txt

From 0010ca4b20fd2c9e37afa48bc42861e2f2ee8173 Mon Sep 17 00:00:00 2001
From: Bo Li <li.bo@gene.com>
Date: Sat, 29 Jul 2023 16:20:24 -0700
Subject: [PATCH 35/57] Updated human lung and mosue immune markers

---
 pegasus/annotate_cluster/human_lung_cell_markers.json |  2 +-
 .../annotate_cluster/mouse_immune_cell_markers.json   | 11 +++++++++++
 pegasus/tools/preprocessing.py                        |  2 +-
 3 files changed, 13 insertions(+), 2 deletions(-)

diff --git a/pegasus/annotate_cluster/human_lung_cell_markers.json b/pegasus/annotate_cluster/human_lung_cell_markers.json
index d138a8d9..42cc1eb8 100644
--- a/pegasus/annotate_cluster/human_lung_cell_markers.json
+++ b/pegasus/annotate_cluster/human_lung_cell_markers.json
@@ -193,7 +193,7 @@
 					"comment" : "LEC markers inferred from Travaglini et al. Nature 2020 and confirmed by Schupp et al. Circulation 2021"
 				}
 			]
-		}
+		},
 
 
 
diff --git a/pegasus/annotate_cluster/mouse_immune_cell_markers.json b/pegasus/annotate_cluster/mouse_immune_cell_markers.json
index 4fe9acae..fdcced1c 100644
--- a/pegasus/annotate_cluster/mouse_immune_cell_markers.json
+++ b/pegasus/annotate_cluster/mouse_immune_cell_markers.json
@@ -296,6 +296,17 @@
 					"comment" : "Mast cell markers inferred from Matsumara et al. Nat. Commun. 2022"
 				}
 			]
+		}, 
+
+		{
+			"name" : "Red blood cell",
+			"markers" : [
+				{
+					"genes" : ["Hba-a1+", "Hba-a2+", "Hbb-bs+", "Hbb-bt+"],
+					"weight" : 1.0,
+					"comment" : "Hemoglobin genes"
+				}
+			]
 		}
 	]
 }
diff --git a/pegasus/tools/preprocessing.py b/pegasus/tools/preprocessing.py
index d3c1d6c8..dd105f74 100644
--- a/pegasus/tools/preprocessing.py
+++ b/pegasus/tools/preprocessing.py
@@ -346,7 +346,7 @@ def _set_target_mat(data, X, target_matrix, select, base_matrix, suffix):
     if target_matrix in data.matrices:
        logger.warning(f"{target_matrix} is in data's matrices. It will be rewritten.")
 
-    data.add_matrix(target_matrix, X)
+    data.update_matrix(target_matrix, X)
 
     if select:
         data.select_matrix(target_matrix)

From 3a758300e47089fd281b107eaa4deff362a64c56 Mon Sep 17 00:00:00 2001
From: Bo Li <li.bo@gene.com>
Date: Sun, 6 Aug 2023 13:59:15 -0700
Subject: [PATCH 36/57] Renamed Megakaryocyte to Platelet

---
 pegasus/annotate_cluster/human_immune_cell_markers.json | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/pegasus/annotate_cluster/human_immune_cell_markers.json b/pegasus/annotate_cluster/human_immune_cell_markers.json
index f7eff6fa..16459b95 100644
--- a/pegasus/annotate_cluster/human_immune_cell_markers.json
+++ b/pegasus/annotate_cluster/human_immune_cell_markers.json
@@ -403,7 +403,7 @@
 		},
 
 		{
-			"name" : "Erythroid cells",
+			"name" : "Erythroid cell",
 			"markers" : [
 				{
 					"genes" : ["GYPA+"], 
@@ -429,7 +429,7 @@
 		},
 
 		{
-			"name" : "Megakaryocyte",
+			"name" : "Platelet",
 			"markers" : [
 				{
 					"genes" : ["PF4+", "PPBP+", "GP5+"],

From 3cf7e67db5f3d4a5fab16cf033f1b7cf8d7e533a Mon Sep 17 00:00:00 2001
From: Jayaram Kancherla <jayaram.kancherla@gmail.com>
Date: Tue, 15 Aug 2023 16:32:48 -0700
Subject: [PATCH 37/57] only convert sparse matrices to numpy arrays

---
 pegasus/tools/signature_score.py | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/pegasus/tools/signature_score.py b/pegasus/tools/signature_score.py
index 5030162f..1f055d39 100644
--- a/pegasus/tools/signature_score.py
+++ b/pegasus/tools/signature_score.py
@@ -1,4 +1,5 @@
 import numpy as np
+import scipy.sparse as sp
 import pandas as pd
 
 from typing import Dict, List, Union
@@ -89,7 +90,11 @@ def calculate_z_score(
     if not _check_and_calc_sig_background(data, n_bins):
         return None
 
-    z_score_mat = (data.X.toarray().astype(np.float32) - data.var["mean"].values.astype(np.float32) - data.obsm["sig_bkg_mean"][:, data.var["bins"].cat.codes].astype(np.float32)) / data.obsm["sig_bkg_std"][:, data.var["bins"].cat.codes].astype(np.float32)
+    mat = data.X
+    if sp.issparse(mat):
+        mat = mat.toarray()
+
+    z_score_mat = (mat.astype(np.float32) - data.var["mean"].values.astype(np.float32) - data.obsm["sig_bkg_mean"][:, data.var["bins"].cat.codes].astype(np.float32)) / data.obsm["sig_bkg_std"][:, data.var["bins"].cat.codes].astype(np.float32)
 
     return z_score_mat
 

From 56b07b5936673bf0c2164fde8eb0cfffd6d2a8c5 Mon Sep 17 00:00:00 2001
From: Yiming Yang <yang.yihming@gmail.com>
Date: Mon, 21 Aug 2023 14:53:57 +0800
Subject: [PATCH 38/57] Make this operation a function

---
 pegasus/plotting/plot_library.py | 19 ++-----------------
 pegasus/plotting/plot_utils.py   | 21 +++++++++++++++++++++
 2 files changed, 23 insertions(+), 17 deletions(-)

diff --git a/pegasus/plotting/plot_library.py b/pegasus/plotting/plot_library.py
index fb1083a6..0b07ada1 100644
--- a/pegasus/plotting/plot_library.py
+++ b/pegasus/plotting/plot_library.py
@@ -31,6 +31,7 @@
     _generate_categories,
     _plot_corners,
     _plot_spots,
+    _get_valid_attrs,
 )
 
 
@@ -153,23 +154,7 @@ def scatter(
         attrs = [attrs]
 
     # Select only valid attributes
-    attrs_filt = []
-    attrs_drop = []
-    for attr in attrs:
-        if (attr == '_all') or (attr in data.obs) or (attr in data.var_names) or ('@' in attr):
-            if not '@' in attr:
-                attrs_filt.append(attr)
-            else:
-                obsm_key, sep, component = attr.partition("@")
-                if (sep != "@") or (obsm_key not in data.obsm) or (not component.isdigit()):
-                    attrs_drop.append(attr)
-                else:
-                    attrs_filt.append(attr)
-        else:
-            attrs_drop.append(attr)
-    attrs = attrs_filt
-    if len(attrs_drop) > 0:
-        print(f"Warning: Attributes {attrs_drop} are not in data.obs, data.var_names or data.obsm!")
+    attrs = _get_valid_attrs(data, attrs)
 
     if isinstance(basis, str):
         basis = [basis]
diff --git a/pegasus/plotting/plot_utils.py b/pegasus/plotting/plot_utils.py
index 0c61cb58..48766ef9 100644
--- a/pegasus/plotting/plot_utils.py
+++ b/pegasus/plotting/plot_utils.py
@@ -435,3 +435,24 @@ def _plot_spots(x: np.ndarray, y: np.ndarray, c: Union[str, np.ndarray], s: floa
         spots.set_clim(vmin, vmax)
     ax.add_collection(spots)
     return spots
+
+
+def _get_valid_attrs(data:Union[MultimodalData, UnimodalData], attrs: List[str]) -> List[str]:
+    attrs_filt = []
+    attrs_drop = []
+    for attr in attrs:
+        if (attr == '_all') or (attr in data.obs) or (attr in data.var_names) or ('@' in attr):
+            if not '@' in attr:
+                attrs_filt.append(attr)
+            else:
+                obsm_key, sep, component = attr.partition("@")
+                if (sep != "@") or (obsm_key not in data.obsm) or (not component.isdigit()):
+                    attrs_drop.append(attr)
+                else:
+                    attrs_filt.append(attr)
+        else:
+            attrs_drop.append(attr)
+    if len(attrs_drop) > 0:
+        print(f"Warning: Attributes {attrs_drop} are not in data.obs, data.var_names or data.obsm!")
+
+    return attrs_filt

From bc630ec545a288b655112840cf201773f10281bd Mon Sep 17 00:00:00 2001
From: Yiming Yang <yang.yihming@gmail.com>
Date: Thu, 24 Aug 2023 08:07:04 +0800
Subject: [PATCH 39/57] fix a typo in human_lung cell marker JSON file

---
 pegasus/annotate_cluster/human_lung_cell_markers.json | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/pegasus/annotate_cluster/human_lung_cell_markers.json b/pegasus/annotate_cluster/human_lung_cell_markers.json
index d138a8d9..5b54986a 100644
--- a/pegasus/annotate_cluster/human_lung_cell_markers.json
+++ b/pegasus/annotate_cluster/human_lung_cell_markers.json
@@ -193,9 +193,7 @@
 					"comment" : "LEC markers inferred from Travaglini et al. Nature 2020 and confirmed by Schupp et al. Circulation 2021"
 				}
 			]
-		}
-
-
+		},
 
 		{
 			"name" : "Smooth muscle cell",

From eb7b766439474b8d9525cc3970b589239ec0bf7b Mon Sep 17 00:00:00 2001
From: Yiming Yang <yang.yihming@gmail.com>
Date: Thu, 24 Aug 2023 08:23:25 +0800
Subject: [PATCH 40/57] update docs

---
 docs/conf.py                       | 2 +-
 docs/release_notes/version_1_8.rst | 5 +++++
 2 files changed, 6 insertions(+), 1 deletion(-)

diff --git a/docs/conf.py b/docs/conf.py
index 8ec108a1..25a5726a 100644
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -31,7 +31,7 @@
 # The short X.Y version
 version = "1.8"
 # The full version, including alpha/beta/rc tags
-release = "1.8.0"
+release = "1.8.1"
 
 
 # -- General configuration ---------------------------------------------------
diff --git a/docs/release_notes/version_1_8.rst b/docs/release_notes/version_1_8.rst
index e81da4a4..4b0947ae 100644
--- a/docs/release_notes/version_1_8.rst
+++ b/docs/release_notes/version_1_8.rst
@@ -1,3 +1,8 @@
+1.8.1 :small:`August 23, 2023`
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+* Bug fix in cell marker JSON files for ``infer_cell_types`` function.
+
 1.8.0 :small:`July 21, 2023`
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 

From 5c186ef32676fcbdc371e33e2c7d552af3173ffb Mon Sep 17 00:00:00 2001
From: Yiming Yang <yang.yihming@gmail.com>
Date: Thu, 5 Oct 2023 23:07:42 -0700
Subject: [PATCH 41/57] Expose online_chunk_size in nmf and integrative_nmf
 functions

---
 pegasus/tools/nmf.py | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/pegasus/tools/nmf.py b/pegasus/tools/nmf.py
index 7ce54e65..8cf2cb08 100644
--- a/pegasus/tools/nmf.py
+++ b/pegasus/tools/nmf.py
@@ -81,6 +81,7 @@ def nmf(
     alpha_H: float = 0.0,
     l1_ratio_H: float = 0.0,
     fp_precision: str = "float",
+    online_chunk_size: int = 5000,
     n_jobs: int = -1,
     random_state: int = 0,
 ) -> None:
@@ -137,6 +138,9 @@ def nmf(
     fp_precision: ``str``, optional, default: ``float``
         The numeric precision on the results. Choose from ``float`` and ``double``.
 
+    online_chunk_size: ``int``, optional, default: ``5000``
+        The chunk / mini-batch size for online learning. Only works when ``mode='online'``.
+
     n_jobs : `int`, optional (default: -1)
         Number of threads to use. -1 refers to using all physical CPU cores.
 
@@ -189,6 +193,7 @@ def nmf(
         alpha_H=alpha_H,
         l1_ratio_H=l1_ratio_H,
         fp_precision=fp_precision,
+        online_chunk_size=online_chunk_size,
     )
 
     data.uns["nmf_features"] = features # record which feature to use
@@ -285,6 +290,7 @@ def integrative_nmf(
     use_gpu: bool = False,
     lam: float = 5.0,
     fp_precision: str = "float",
+    online_chunk_size: int = 5000,
     n_jobs: int = -1,
     random_state: int = 0,
     quantile_norm: bool = True,
@@ -334,6 +340,9 @@ def integrative_nmf(
     fp_precision: ``str``, optional, default: ``float``
         The numeric precision on the results. Choose from ``float`` and ``double``.
 
+    online_chunk_size: ``int``, optional, default: ``5000``
+        The chunk / mini-batch size for online learning. Only works when ``mode='online'``.
+
     n_jobs : `int`, optional (default: -1)
         Number of threads to use. -1 refers to using all physical CPU cores.
 
@@ -394,6 +403,7 @@ def integrative_nmf(
         use_gpu=use_gpu,
         lam=lam,
         fp_precision=fp_precision,
+        online_chunk_size=online_chunk_size,
     )
 
     # Implementation of algo 3, quantile normalization

From c55d2e4a6b88ccfd231c0aa25e04d935f2256bde Mon Sep 17 00:00:00 2001
From: Yiming Yang <yang.yihming@gmail.com>
Date: Thu, 5 Oct 2023 23:27:39 -0700
Subject: [PATCH 42/57] no longer support Python 3.7

---
 .github/workflows/ci-test.yml | 2 +-
 requirements.txt              | 1 -
 setup.py                      | 1 -
 3 files changed, 1 insertion(+), 3 deletions(-)

diff --git a/.github/workflows/ci-test.yml b/.github/workflows/ci-test.yml
index 41d69118..cbcfaabe 100644
--- a/.github/workflows/ci-test.yml
+++ b/.github/workflows/ci-test.yml
@@ -14,7 +14,7 @@ jobs:
     strategy:
       matrix:
         os: [ubuntu-latest]
-        python-version: ['3.7', '3.8', '3.9']
+        python-version: ['3.8', '3.9']
 
     steps:
     - uses: actions/checkout@v2
diff --git a/requirements.txt b/requirements.txt
index f9a154c7..6341d8da 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -4,7 +4,6 @@ Cython
 docopt
 demuxEM
 hnswlib
-importlib_metadata>=0.7; python_version < '3.8'
 psutil
 threadpoolctl
 joblib>=0.14
diff --git a/setup.py b/setup.py
index 6b679097..a434934e 100644
--- a/setup.py
+++ b/setup.py
@@ -36,7 +36,6 @@
         "Topic :: Software Development :: Build Tools",
         "Topic :: Scientific/Engineering :: Bio-Informatics",
         "Programming Language :: Python :: 3",
-        "Programming Language :: Python :: 3.7",
         "Programming Language :: Python :: 3.8",
         "Programming Language :: Python :: 3.9",
         "Programming Language :: Python :: 3.10",

From 4628fa3764cd9b27ec8ec5297ebc5f633b651926 Mon Sep 17 00:00:00 2001
From: Yiming Yang <yang.yihming@gmail.com>
Date: Thu, 5 Oct 2023 23:29:13 -0700
Subject: [PATCH 43/57] Add Python 3.10 to CI test

---
 .github/workflows/ci-test.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/ci-test.yml b/.github/workflows/ci-test.yml
index cbcfaabe..e66dd54d 100644
--- a/.github/workflows/ci-test.yml
+++ b/.github/workflows/ci-test.yml
@@ -14,7 +14,7 @@ jobs:
     strategy:
       matrix:
         os: [ubuntu-latest]
-        python-version: ['3.8', '3.9']
+        python-version: ['3.8', '3.9', '3.10']
 
     steps:
     - uses: actions/checkout@v2

From 49a45a39dd820ce54832c6d235a0386ba2f31c78 Mon Sep 17 00:00:00 2001
From: Bo Li <li.bo@gene.com>
Date: Sat, 7 Oct 2023 13:31:53 -0700
Subject: [PATCH 44/57] Updated marker list

---
 .../human_immune_cell_markers.json            |  8 +++----
 .../mouse_immune_cell_markers.json            | 24 ++++---------------
 .../mouse_lung_cell_markers.json              |  4 ++--
 3 files changed, 11 insertions(+), 25 deletions(-)

diff --git a/pegasus/annotate_cluster/human_immune_cell_markers.json b/pegasus/annotate_cluster/human_immune_cell_markers.json
index 16459b95..a4592a6f 100644
--- a/pegasus/annotate_cluster/human_immune_cell_markers.json
+++ b/pegasus/annotate_cluster/human_immune_cell_markers.json
@@ -97,9 +97,9 @@
 						"name" : "CD8 TCM",
 						"markers" : [
 							{
-								"genes" : ["CD8A+", "CD8B+", "GZMK+", "DUSP2+", "RGS1+", "CXCR3+", "CMC1+", "TIGIT+", "CST7+", "NKG7+"],
+								"genes" : ["CD8A+", "CD8B+", "GZMK+", "DUSP2+", "LTB+", "CD27+", "IL7R+", "GPR183+", "RGS1+", "CXCR3+"],
 								"weight" : 1.0,
-								"comment" : "Markers derived from Immune Cell Atlas PBMC data; CD8A & CD8B are CD8 markers; GZMK, DUSP2, RGS1 & CXCR3 are specific to TCM; CMC1 & TIGIT are biased towards TCM; CST7 & NKG7 are shared by TCM & TEM"
+								"comment" : "Markers derived from Immune Cell Atlas PBMC data; CD8A & CD8B are CD8 markers; All others are CD8 TCM specific markers"
 							}
 						]
 					},
@@ -107,9 +107,9 @@
 						"name" : "CD8 TEM",
 						"markers" : [
 							{
-								"genes" : ["CD8A+", "CD8B+", "FGFBP2+", "GZMB+", "FCGR3A+", "SPON2+", "ADGRG1+", "CX3CR1+", "ASCL2+", "PRSS23+"],
+								"genes" : ["CD8A+", "CD8B+", "FGFBP2+", "GZMB+", "GZMH+", "GNLY+", "PRF1+", "KLRD1+", "FCGR3A+", "TBX21+", "CX3CR1+", "ASCL2+", "SPON2+", "ADGRG1+", "PRSS23+"],
 								"weight" : 1.0,
-								"comment" : "Markers derived from Immune Cell Atlas PBMC data"
+								"comment" : "Markers derived from Immune Cell Atlas PBMC data; FGFBP2, GZMB, GZMH, GNLY, PRF1, KLRD1, FCGR3A are pan TEM markers; TBX21, CX3CR1 and ASCL2 are Temra markers; the last three are purely data driven markers"
 							}
 						]
 					},
diff --git a/pegasus/annotate_cluster/mouse_immune_cell_markers.json b/pegasus/annotate_cluster/mouse_immune_cell_markers.json
index fdcced1c..9b9095eb 100644
--- a/pegasus/annotate_cluster/mouse_immune_cell_markers.json
+++ b/pegasus/annotate_cluster/mouse_immune_cell_markers.json
@@ -53,20 +53,6 @@
 			}
 		},
 
-		{
-			"name" : "Monocyte",
-			"markers" : [
-				{
-					"genes" : ["Lyz2+", "Lyz1+", "S100a4+", "Itgam+"],
-					"weight" : 0.8
-				},
-				{
-					"genes" : ["C1qb+", "C1qc+", "Mrc1+", "Cd52+"],
-					"weight" : 0.2
-				}
-			]
-		},
-
 		{
 			"name" : "Immature B cell",
 			"markers" : [
@@ -162,12 +148,12 @@
 		},
 
 		{
-			"name" : "Inflammatory monocyte",
+			"name" : "Classical monocyte",
 			"markers" : [
 				{
-					"genes" : ["Ly6c2+", "F13a1+", "Ms4a4c+", "Ccr2+", "Gm9733+", "Mcub+"],
+					"genes" : ["Ly6c2+", "F13a1+", "Ccr2+", "Ms4a4c+", "Gm9733+", "Mcub+", "S100a4+"],
 					"weight" : 1.0,
-					"comment" : "Inflammatory monocyte markers inferred from Kaptein et al. Cell 2022 and Hurskainen et al. Nat. Commun. 2021"
+					"comment" : "Classical monocyte markers (except S100a4) inferred from Kaptein et al. Cell 2022 and Hurskainen et al. Nat. Commun. 2021; Ly6c2, F13a1, Ccr2 and Ms4a4c (in Fig. 1b) are Group III markers from Casanova-Acebes et al. Nature 2021. S100a4 is less specific to classical monocyte."
 				}
 			]
 		},
@@ -176,9 +162,9 @@
 			"name" : "Patrolling monocyte",
 			"markers" : [
 				{
-					"genes" : ["Ace+", "Eno3+", "Ear2+", "Treml4+", "Spn+", "Fcgr4+", "Lair1+", "Cd300e+", "Cd300ld+", "Adgre4+"],
+					"genes" : ["Eno3+", "Cd300e+", "Ace+", "Treml4+", "Spn+", "Adgre4+", "Lair1+", "Fcgr4+", "Ear2+", "Cd300ld+"],
 					"weight" : 1.0,
-					"comment" : "Patrolling monocyte markers inferred from Kaptein et al. Cell 2022 and Hurskainen et al. Nat. Commun. 2021; Related papers: Domingo-Gonzalez et al. Elife 2020, Thomas et al. Arterioscler Thromb Vasc Biol. 2015, and Schyns et al. Nat. Commun. 2019."
+					"comment" : "Patrolling monocyte markers inferred from Kaptein et al. Cell 2022 and Hurskainen et al. Nat. Commun. 2021; First 6 markers are Group IV markers in Fig. 1b of Casanova-Acebes et al. Nature paper; Eno3, Cd300e, Ace and Lair1 are very specific; Related papers: Domingo-Gonzalez et al. Elife 2020, Thomas et al. Arterioscler Thromb Vasc Biol. 2015, and Schyns et al. Nat. Commun. 2019."
 				}
 			]
 		},
diff --git a/pegasus/annotate_cluster/mouse_lung_cell_markers.json b/pegasus/annotate_cluster/mouse_lung_cell_markers.json
index cb091482..543c3cff 100644
--- a/pegasus/annotate_cluster/mouse_lung_cell_markers.json
+++ b/pegasus/annotate_cluster/mouse_lung_cell_markers.json
@@ -292,9 +292,9 @@
 			"name" : "Alveolar macrophage",
 			"markers" : [
 				{
-					"genes" : ["Atp6v0d2+", "Olr1+", "F7+", "Ear1+", "Tfec+", "Gpnmb+", "Lrp12+", "Marco+"],
+					"genes" : ["Ear1+", "Marco+", "Atp6v0d2+", "Olr1+", "F7+", "Tfec+", "Gpnmb+", "Lrp12+", "Pparg+", "Car4+", "Krt19+", "Plet1+"],
 					"weight" : 1.0,
-					"comment" : "Alveolar macrophage markers inferred from Hurskainen et al. Nat. Commun. 2021 data"
+					"comment" : "First 8 markers are Alveolar macrophage markers inferred from Hurskainen et al. Nat. Commun. 2021 data; Ear1 and Marco also show in Casanova-Acebes et al. Nature 2021; Last 4 are markers from Casanova-Acebes et al. Nature 2021 that are validated using Hurskainen et al. Nat. Commun. 2021 data"
 				}
 			]
 		},

From d09113a6f1047e2617d2164efeb42624b27fd64b Mon Sep 17 00:00:00 2001
From: Yiming Yang <yang.yihming@gmail.com>
Date: Mon, 16 Oct 2023 15:30:06 -0700
Subject: [PATCH 45/57] update readthedocs conf

---
 .readthedocs.yml                     | 33 ++++++++++++++++++++++++++--
 wheel_build/build_wheel_for_linux.sh |  2 +-
 2 files changed, 32 insertions(+), 3 deletions(-)

diff --git a/.readthedocs.yml b/.readthedocs.yml
index 7e053cac..ae4157db 100644
--- a/.readthedocs.yml
+++ b/.readthedocs.yml
@@ -1,6 +1,35 @@
+# Read the Docs configuration file for Sphinx projects
+# See https://docs.readthedocs.io/en/stable/config-file/v2.html for details
+
+# Required
+version: 2
+
+# Set the OS, Python version and other tools you might need
 build:
-  image: latest
+  os: ubuntu-22.04
+  tools:
+    python: "3.9"
+    # You can also specify other tool versions:
+    # nodejs: "20"
+    # rust: "1.70"
+    # golang: "1.20"
+
+# Build documentation in the "docs/" directory with Sphinx
 sphinx:
   configuration: docs/conf.py
+  # You can configure Sphinx to use a different builder, for instance use the dirhtml builder for simpler URLs
+  # builder: "dirhtml"
+  # Fail on all warnings to avoid broken references
+  # fail_on_warning: true
+
+# Optionally build your docs in additional formats such as PDF and ePub
+# formats:
+#    - pdf
+#    - epub
+
+# Optional but recommended, declare the Python requirements required
+# to build your documentation
+# See https://docs.readthedocs.io/en/stable/guides/reproducible-builds.html
 python:
-  version: 3.8
+  install:
+    - requirements: docs/requirements.txt
\ No newline at end of file
diff --git a/wheel_build/build_wheel_for_linux.sh b/wheel_build/build_wheel_for_linux.sh
index a5cceadb..98d6dc96 100755
--- a/wheel_build/build_wheel_for_linux.sh
+++ b/wheel_build/build_wheel_for_linux.sh
@@ -11,7 +11,7 @@ function repair_wheel {
     fi
 }
 
-declare -a PythonVersions=("cp37-cp37m" "cp38-cp38" "cp39-cp39" "cp310-cp310")
+declare -a PythonVersions=("cp38-cp38" "cp39-cp39" "cp310-cp310" "cp311-cp311")
 
 for val in ${PythonVersions[@]}; do
     /opt/python/$val/bin/pip install -r /src/requirements.txt

From 7d56c4937fe3e8c69485ca83d5f4c0ea150d1f22 Mon Sep 17 00:00:00 2001
From: Yiming Yang <yang.yihming@gmail.com>
Date: Mon, 16 Oct 2023 15:34:51 -0700
Subject: [PATCH 46/57] add support for py3.11

---
 setup.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/setup.py b/setup.py
index a434934e..10dac019 100644
--- a/setup.py
+++ b/setup.py
@@ -39,6 +39,7 @@
         "Programming Language :: Python :: 3.8",
         "Programming Language :: Python :: 3.9",
         "Programming Language :: Python :: 3.10",
+        "Programming Language :: Python :: 3.11",
     ],
     keywords="single cell/nucleus genomics analysis",
     packages=find_packages(),

From d279339d2fc6071fa0d636c19fcf9b773c7831d7 Mon Sep 17 00:00:00 2001
From: Bo Li <li.bo@gene.com>
Date: Mon, 30 Oct 2023 00:09:44 -0700
Subject: [PATCH 47/57] Updated heatmap function

---
 pegasus/plotting/plot_library.py | 149 ++++++++++++++++++++-----------
 1 file changed, 99 insertions(+), 50 deletions(-)

diff --git a/pegasus/plotting/plot_library.py b/pegasus/plotting/plot_library.py
index f5f236e8..2048aa6c 100644
--- a/pegasus/plotting/plot_library.py
+++ b/pegasus/plotting/plot_library.py
@@ -1006,16 +1006,24 @@ def violin(
 def heatmap(
     data: Union[MultimodalData, UnimodalData, anndata.AnnData],
     attrs: Union[str, List[str]],
-    groupby: str,
+    groupby: Optional[str] = None,
     matkey: Optional[str] = None,
-    on_average: bool = True,
-    switch_axes: bool = False,
+    gene_zscore: Optional[bool] = True,
+    on_average: Optional[bool] = True,
+    switch_axes: Optional[bool] = False,
     attrs_cluster: Optional[bool] = False,
     attrs_dendrogram: Optional[bool] = True,
+    attrs_method: Optional[bool] = 'ward',
+    attrs_optimal_ordering: Optional[bool] = True,
+    attrs_labelsize: Optional[float] = 10.0,
+    attrs_labelrotation: Optional[float] = 0.0,
     groupby_cluster: Optional[bool] = True,
     groupby_dendrogram: Optional[bool] = True,
-    attrs_labelsize: Optional[float] = 10.0,
+    groupby_method: Optional[bool] = 'ward',
+    groupby_optimal_ordering: Optional[bool] = True,
+    groupby_precomputed_linkage: Optional[np.array] = None,
     groupby_labelsize: Optional[float] = 10.0,
+    groupby_labelrotation: Optional[float] = 0.0,
     cbar_labelsize: Optional[float] = 10.0,
     panel_size: Tuple[float, float] = (10, 10),
     return_fig: Optional[bool] = False,
@@ -1027,7 +1035,6 @@ def heatmap(
 
     Parameters
     -----------
-
     data: ``AnnData`` or ``MultimodalData`` or ``UnimodalData`` object
         Single-cell expression data.
     attrs: ``str`` or ``List[str]``
@@ -1035,13 +1042,16 @@ def heatmap(
         Cell attributes must exist in ``data.obs`` and must be numeric.
         Features must exist in ``data.var``.
         By default, attrs are plotted as columns.
-    groupby: ``str``
+    groupby: ``str``, optional, default: ``None``
         A categorical variable in data.obs that is used to categorize the cells, e.g. Clusters.
         By default, data.obs['groupby'] is plotted as rows.
+        If ``None``, use data.obs_names instead.
     matkey: ``str``, optional, default: ``None``
         If matkey is set, select matrix with matkey as keyword in the current modality. Only works for MultimodalData or UnimodalData objects.
+    gene_zscore: ``bool``, optional, default: ``True``
+        If ``True``, compute and then plot z scores for gene expression.
     on_average: ``bool``, optional, default: ``True``
-        If ``True``, plot cluster average gene expression (i.e. show a Matrixplot); otherwise, plot a general heatmap.
+        If ``True``, plot cluster average gene expression or z score (i.e. show a Matrixplot); otherwise, plot a general heatmap.
     switch_axes: ``bool``, optional, default: ``False``
         By default, X axis is for attributes, and Y axis for clusters. If this parameter is ``True``, switch the axes.
         Moreover, with ``on_average`` being ``False``, if ``switch_axes`` is ``False``, ``row_cluster`` is enforced to be ``False``; if ``switch_axes`` is ``True``, ``col_cluster`` is enforced to be ``False``.
@@ -1049,14 +1059,28 @@ def heatmap(
         Cluster attributes and generate a attribute-wise dendrogram.
     attrs_dendrogram: ``bool``, optional, default: ``True``
         Only matters if attrs_cluster is True. Show the dendrogram if this option is True.
+    attrs_method: ``str``, optional, default: ``ward``
+        Linkage method for attrs, choosing from ``single``, ``complete``, ``average``, ``weighted``, ``centroid``, ``median`` and ``ward``.
+    attrs_optimal_ordering: ``bool``, optional, default: ``True``
+        Parameter for scipy.cluster.hierarchy.linkage. If ``True``, the attrs linkage matrix will be reordered so that the distance between successive leaves is minimal.
+    attrs_labelsize: ``float``, optional, default: 10.0
+        Fontsize for labels of attrs.
+    attrs_labelrotation: ``float``, optional, default: 0.0
+        Rotation of labels for attrs.
     groupby_cluster: ``bool``, optional, default: ``True``
         Cluster data.obs['groupby'] and generate a cluster-wise dendrogram.
     groupby_dendrogram: ``bool``, optional, default: ``True``
         Only matters if groupby_cluster is True. Show the dendrogram if this option is True.
-    attrs_labelsize: ``float``, optional, default: 10.0
-        Fontsize for labels of attrs.
+    groupby_method: ``str``, optional, default: ``ward``
+        Linkage method for groupby, choosing from ``single``, ``complete``, ``average``, ``weighted``, ``centroid``, ``median`` and ``ward``.
+    groupby_optimal_ordering: ``bool``, optional, default: ``True``
+        Parameter for scipy.cluster.hierarchy.linkage. If ``True``, the groupby linkage matrix will be reordered so that the distance between successive leaves is minimal.
+    groupby_precomputed_linkage: ``np.array``, optional, default: ``None``
+        Pass a precomputed linkage.
     groupby_labelsize: ``float``, optional, default: 10.0
         Fontsize for labels of data.obs['groupby'].
+    groupby_labelrotation: ``float``, optional, default: 0.0
+        Rotation of labels for groupby.
     cbar_labelsize: ``float``, optional, default: 10.0
         Fontsize of the color bar.
     panel_size: ``Tuple[float, float]``, optional, default: ``(10, 10)``
@@ -1073,7 +1097,7 @@ def heatmap(
     -------
 
     ``Figure`` object
-        A ``matplotlib.figure.Figure`` object containing the dot plot if ``return_fig == True``
+        A ``matplotlib.figure.Figure`` object containing the heatmap if ``return_fig == True``; otherwise, a ``seaborn.matrix.ClusterGrid`` object is returned.
 
     Examples
     --------
@@ -1101,71 +1125,100 @@ def heatmap(
                 return None
             genes.append(key)
 
-    clusters = data.obs[groupby].values
-    if not is_categorical_dtype(clusters):
-        clusters = pd.Categorical(clusters)
-    else:
-        clusters = clusters.remove_unused_categories()
-    df_list = [pd.DataFrame({'cluster_name': clusters})]
-
+    df_list = []
     if len(obs_keys) > 0:
         df_list.append(data.obs[obs_keys].reset_index(drop=True))
     if len(genes) > 0:
         expr_mat = slicing(data[:, genes].X)
+        if gene_zscore:
+            from scipy.stats import zscore
+            expr_mat = zscore(expr_mat, ddof=1)
         df_list.append(pd.DataFrame(data=expr_mat, columns=genes))
     df = pd.concat(df_list, axis = 1)
-    attr_names = df.columns[1:].values
+    df.index = data.obs_names
+    attr_names = df.columns.values
+
+    cluster_ids = df.index
+    cell_colors = None
+    if groupby is not None:
+        cluster_ids = data.obs[groupby].values
+        if not is_categorical_dtype(cluster_ids):
+            cluster_ids = pd.Categorical(cluster_ids)
+        else:
+            cluster_ids = cluster_ids.remove_unused_categories()
+
+        if on_average:
+            if not 'cmap' in kwargs.keys():
+                kwargs['cmap'] = 'Reds'
+            df['cluster_name'] = cluster_ids
+            df = df.groupby('cluster_name').mean()
+            cluster_ids = df.index
+        else:
+            if not groupby_cluster:
+                idx = cluster_ids.argsort(kind = 'mergesort')
+                df = df.iloc[idx, :]  # organize df by category order
+                cluster_ids = cluster_ids[idx]
 
-    if on_average:
-        if not 'cmap' in kwargs.keys():
-            kwargs['cmap'] = 'Reds'
-        df = df.groupby('cluster_name').mean()
-        cluster_ids = df.index
-    else:
-        cluster_ids = df.pop('cluster_name').values
-        if not groupby_cluster:
-            idx = cluster_ids.argsort(kind = 'mergesort')
-            df = df.iloc[idx, :]  # organize df by category order
-            cluster_ids = cluster_ids[idx]
+            cell_colors = np.zeros(df.shape[0], dtype=object)
+            palette = _get_palette(cluster_ids.categories.size)
+
+            for k, cat in enumerate(cluster_ids.categories):
+                cell_colors[cluster_ids == cat] = palette[k]
 
-        cell_colors = np.zeros(df.shape[0], dtype=object)
-        palette = _get_palette(cluster_ids.categories.size)
+            cluster_ids = []
+
+
+    from scipy.cluster.hierarchy import linkage
+
+    groupby_linkage = None
+    if groupby_cluster:
+        if groupby_precomputed_linkage is not None:
+            groupby_linkage = groupby_precomputed_linkage
+        else:
+            groupby_linkage = linkage(df, groupby_method, optimal_ordering = groupby_optimal_ordering)
+    attrs_linkage = None
+    if attrs_cluster:
+        attrs_linkage = linkage(df.T, attrs_method, optimal_ordering = attrs_optimal_ordering)
 
-        for k, cat in enumerate(cluster_ids.categories):
-            cell_colors[cluster_ids == cat] = palette[k]
 
     if not switch_axes:
         cg = sns.clustermap(
             data=df,
-            row_colors=cell_colors if not on_average else None,
+            row_colors=cell_colors,
             col_colors=None,
             row_cluster=groupby_cluster,
             col_cluster=attrs_cluster,
+            row_linkage=groupby_linkage,
+            col_linkage=attrs_linkage,
             linewidths=0,
-            yticklabels=cluster_ids if on_average else [],
+            yticklabels=cluster_ids,
             xticklabels=attr_names,
             figsize=panel_size,
             **kwargs,
         )
         cg.ax_heatmap.set_ylabel("")
-        if attrs_labelsize is not None:
-            cg.ax_heatmap.tick_params(axis='x', labelsize=attrs_labelsize, labelrotation=75)
+        cg.ax_heatmap.tick_params(axis='x', labelsize=attrs_labelsize, labelrotation=attrs_labelrotation)
+        if groupby is None:
+            cg.ax_heatmap.tick_params(axis='y', labelsize=groupby_labelsize, labelrotation=groupby_labelrotation)
     else:
         cg = sns.clustermap(
             data=df.T,
             row_colors=None,
-            col_colors=cell_colors if not on_average else None,
+            col_colors=cell_colors,
             row_cluster=attrs_cluster,
             col_cluster=groupby_cluster,
+            row_linkage=attrs_linkage,
+            col_linkage=groupby_linkage,
             linewidths=0,
             yticklabels=attr_names,
-            xticklabels=cluster_ids if on_average else [],
+            xticklabels=cluster_ids,
             figsize=panel_size,
             **kwargs,
         )
         cg.ax_heatmap.set_xlabel("")
-        if attrs_labelsize is not None:
-            cg.ax_heatmap.tick_params(axis='y', labelsize=attrs_labelsize)
+        cg.ax_heatmap.tick_params(axis='y', labelsize=attrs_labelsize, labelrotation=attrs_labelrotation)
+        if groupby is None:
+            cg.ax_heatmap.tick_params(axis='x', labelsize=groupby_labelsize, labelrotation=groupby_labelrotation)
 
     show_row_dendrogram = (attrs_cluster and attrs_dendrogram) if switch_axes else (groupby_cluster and groupby_dendrogram)
     show_col_dendrogram = (groupby_cluster and groupby_dendrogram) if switch_axes else (attrs_cluster and attrs_dendrogram)
@@ -1194,17 +1247,13 @@ def heatmap(
         cg.ax_cbar.yaxis.set_ticks_position("right")
 
 
-    if show_col_dendrogram:
-        cg.ax_heatmap.xaxis.tick_bottom()
-        cg.ax_col_dendrogram.set_visible(True)
-    else:
-        cg.ax_heatmap.xaxis.tick_top()
-        cg.ax_col_dendrogram.set_visible(False)
+    cg.ax_heatmap.xaxis.tick_bottom()
+    cg.ax_col_dendrogram.set_visible(show_col_dendrogram)
 
     cg.ax_cbar.tick_params(labelsize=cbar_labelsize)
     cg.fig.dpi = dpi
 
-    if not on_average:
+    if (groupby is not None) and (not on_average):
         if groupby_cluster:
             from matplotlib.patches import Patch
             legend_elements = [Patch(color = color, label = label) for color, label in zip(palette, cluster_ids.categories)]
@@ -1228,7 +1277,7 @@ def heatmap(
         if cur_matkey != data.current_matrix():
             data.select_matrix(cur_matkey)
 
-    return cg.fig if return_fig else None
+    return cg.fig if return_fig else cg
 
 
 def dotplot(
@@ -1498,7 +1547,7 @@ def dendrogram(
     linkage: ``str``, optional, default: ``complete``
         Which linkage criterion to use, used by hierarchical clustering. Below are available options:
             - ``ward`` minimizes the variance of the clusters being merged.
-            - ``avarage`` uses the average of the distances of each observation of the two sets.
+            - ``average`` uses the average of the distances of each observation of the two sets.
             - ``complete`` uses the maximum distances between all observations of the two sets. (Default)
             - ``single`` uses the minimum of the distances between all observations of the two sets.
 

From a3ea1f85b7c206728d6dd05daa3073533b08d6e6 Mon Sep 17 00:00:00 2001
From: Yiming Yang <yangy197@10-178-21-254.aws.cloud.roche.com>
Date: Mon, 18 Dec 2023 22:29:54 +0000
Subject: [PATCH 48/57] Fix issue in dotplot

---
 pegasus/plotting/plot_library.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/pegasus/plotting/plot_library.py b/pegasus/plotting/plot_library.py
index 0b07ada1..deb2aa05 100644
--- a/pegasus/plotting/plot_library.py
+++ b/pegasus/plotting/plot_library.py
@@ -1,6 +1,7 @@
 import numpy as np
 import pandas as pd
 import seaborn as sns
+import matplotlib
 import matplotlib.pyplot as plt
 
 from scipy.sparse import issparse
@@ -1443,7 +1444,7 @@ def non_zero(g):
     size_legend.grid(False)
 
     # Reset global settings.
-    sns.reset_orig()
+    matplotlib.rc_file_defaults()
 
     return fig if return_fig else None
 

From 5483ba0f2d08f5668f08d580b4af2022374dbce4 Mon Sep 17 00:00:00 2001
From: Bo Li <li.bo@gene.com>
Date: Fri, 22 Dec 2023 20:47:39 -0800
Subject: [PATCH 49/57] Updated heatmap function

---
 pegasus/plotting/plot_library.py | 18 +++++++++++-------
 1 file changed, 11 insertions(+), 7 deletions(-)

diff --git a/pegasus/plotting/plot_library.py b/pegasus/plotting/plot_library.py
index 2048aa6c..749b6947 100644
--- a/pegasus/plotting/plot_library.py
+++ b/pegasus/plotting/plot_library.py
@@ -1024,6 +1024,7 @@ def heatmap(
     groupby_precomputed_linkage: Optional[np.array] = None,
     groupby_labelsize: Optional[float] = 10.0,
     groupby_labelrotation: Optional[float] = 0.0,
+    show_sample_name: Optional[bool] = None,
     cbar_labelsize: Optional[float] = 10.0,
     panel_size: Tuple[float, float] = (10, 10),
     return_fig: Optional[bool] = False,
@@ -1081,6 +1082,8 @@ def heatmap(
         Fontsize for labels of data.obs['groupby'].
     groupby_labelrotation: ``float``, optional, default: 0.0
         Rotation of labels for groupby.
+    show_sample_name: ``bool``, optional, default: ``None``
+        If show sample names as tick labels. If ``None``, show_sample_name == ``True`` if groupby == ``None`` and otherwise show_sample_name == ``False``.
     cbar_labelsize: ``float``, optional, default: 10.0
         Fontsize of the color bar.
     panel_size: ``Tuple[float, float]``, optional, default: ``(10, 10)``
@@ -1138,7 +1141,11 @@ def heatmap(
     df.index = data.obs_names
     attr_names = df.columns.values
 
-    cluster_ids = df.index
+    if show_sample_name is None:
+        show_sample_name = True if groupby is None else False
+    sample_tick_labels = df.index if show_sample_name else []
+
+    cluster_ids = None
     cell_colors = None
     if groupby is not None:
         cluster_ids = data.obs[groupby].values
@@ -1165,9 +1172,6 @@ def heatmap(
             for k, cat in enumerate(cluster_ids.categories):
                 cell_colors[cluster_ids == cat] = palette[k]
 
-            cluster_ids = []
-
-
     from scipy.cluster.hierarchy import linkage
 
     groupby_linkage = None
@@ -1178,7 +1182,7 @@ def heatmap(
             groupby_linkage = linkage(df, groupby_method, optimal_ordering = groupby_optimal_ordering)
     attrs_linkage = None
     if attrs_cluster:
-        attrs_linage = linkage(df.T, attrs_method, optimal_ordering = attrs_optimal_ordering)
+        attrs_linkage = linkage(df.T, attrs_method, optimal_ordering = attrs_optimal_ordering)
 
 
     if not switch_axes:
@@ -1191,7 +1195,7 @@ def heatmap(
             row_linkage=groupby_linkage,
             col_linkage=attrs_linkage,
             linewidths=0,
-            yticklabels=cluster_ids,
+            yticklabels=sample_tick_labels,
             xticklabels=attr_names,
             figsize=panel_size,
             **kwargs,
@@ -1211,7 +1215,7 @@ def heatmap(
             col_linkage=groupby_linkage,
             linewidths=0,
             yticklabels=attr_names,
-            xticklabels=cluster_ids,
+            xticklabels=sample_tick_labels,
             figsize=panel_size,
             **kwargs,
         )

From 4ede98533a7948887dd2029a0ae0b1b5b84581db Mon Sep 17 00:00:00 2001
From: Yiming Yang <yiming@Yimings-MacBook-Pro.local>
Date: Wed, 27 Dec 2023 12:14:38 -0800
Subject: [PATCH 50/57] remove restriction on igraph

---
 requirements.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/requirements.txt b/requirements.txt
index 6341d8da..1a8193d6 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -26,4 +26,4 @@ statsmodels
 umap-learn>=0.5.2
 wordcloud
 xlsxwriter
-igraph<=0.9.10
+igraph

From 9daf25e6c752355cc946866c336b3c0be31fcd99 Mon Sep 17 00:00:00 2001
From: Yiming Yang <yang.yihming@gmail.com>
Date: Thu, 4 Jan 2024 14:22:45 -0800
Subject: [PATCH 51/57] Fix violin plot for Seaborn v0.13+

---
 pegasus/plotting/plot_library.py | 15 ++++++++++++++-
 requirements.txt                 |  2 +-
 2 files changed, 15 insertions(+), 2 deletions(-)

diff --git a/pegasus/plotting/plot_library.py b/pegasus/plotting/plot_library.py
index deb2aa05..934f479c 100644
--- a/pegasus/plotting/plot_library.py
+++ b/pegasus/plotting/plot_library.py
@@ -964,9 +964,17 @@ def violin(
             genes.append(key)
 
     df_list = [pd.DataFrame({"label": data.obs[groupby].values})]
+
     if hue is not None:
         df_list.append(pd.DataFrame({hue: data.obs[hue].values}))
         stripplot = False
+        kwargs['hue'] = hue
+        kwargs['split'] = True
+    else:
+        kwargs['hue'] = "label"
+        kwargs['legend'] = False
+        kwargs['split'] = False
+
     if len(obs_keys) > 0:
         df_list.append(data.obs[obs_keys].reset_index(drop=True))
     if len(genes) > 0:
@@ -978,7 +986,11 @@ def violin(
         ax = axes[i, 0]
         if stripplot:
             sns.stripplot(x="label", y=attrs[i], hue = hue, data=df, ax=ax, size=stripsize, color="k", jitter=True)
-        sns.violinplot(x="label", y=attrs[i], hue = hue, data=df, inner=inner, linewidth=1, ax=ax, cut=0, scale=scale, split=True, palette=palette, **kwargs)
+        sns.violinplot(x="label", y=attrs[i], data=df, inner=inner, linewidth=1, ax=ax, cut=0, density_norm=scale, palette=palette, **kwargs)
+        #if hue is None:
+        #    sns.violinplot(x="label", y=attrs[i], hue = 'label', legend=False, data=df, inner=inner, linewidth=1, ax=ax, cut=0, density_norm=scale, split=False, palette=palette, **kwargs)
+        #else:
+        #    sns.violinplot(x="label", y=attrs[i], hue = hue, data=df, inner=inner, linewidth=1, ax=ax, cut=0, density_norm=scale, split=True, palette=palette, **kwargs)
         ax.grid(False)
 
         if hue is not None:
@@ -991,6 +1003,7 @@ def violin(
             ax.set_xlabel("")
         else:
             ax.set_xlabel(groupby)
+            ax.set_xticks(ax.get_xticks())  # Get rid of the UserWarning: set_ticklabels() should only be used with a fixed number of ticks
             ax.set_xticklabels(ax.get_xticklabels(), rotation=90)
         ax.set_ylabel(attrs[i], labelpad=8, rotation=0, horizontalalignment='right', fontsize='medium')
         ax.tick_params(axis='y', right=True, left=False, labelright=True, labelleft=False, labelsize='small')
diff --git a/requirements.txt b/requirements.txt
index 1a8193d6..6e948083 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -20,7 +20,7 @@ pybind11
 scikit-learn>=0.23.2
 scikit-misc
 scipy
-seaborn
+seaborn>=0.13.0
 setuptools
 statsmodels
 umap-learn>=0.5.2

From 10e1269884e45ad11d6139dd1472f20eff609646 Mon Sep 17 00:00:00 2001
From: Yiming Yang <yang.yihming@gmail.com>
Date: Thu, 4 Jan 2024 14:26:13 -0800
Subject: [PATCH 52/57] remove comments

---
 pegasus/plotting/plot_library.py | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/pegasus/plotting/plot_library.py b/pegasus/plotting/plot_library.py
index 934f479c..6d3eaa14 100644
--- a/pegasus/plotting/plot_library.py
+++ b/pegasus/plotting/plot_library.py
@@ -987,10 +987,6 @@ def violin(
         if stripplot:
             sns.stripplot(x="label", y=attrs[i], hue = hue, data=df, ax=ax, size=stripsize, color="k", jitter=True)
         sns.violinplot(x="label", y=attrs[i], data=df, inner=inner, linewidth=1, ax=ax, cut=0, density_norm=scale, palette=palette, **kwargs)
-        #if hue is None:
-        #    sns.violinplot(x="label", y=attrs[i], hue = 'label', legend=False, data=df, inner=inner, linewidth=1, ax=ax, cut=0, density_norm=scale, split=False, palette=palette, **kwargs)
-        #else:
-        #    sns.violinplot(x="label", y=attrs[i], hue = hue, data=df, inner=inner, linewidth=1, ax=ax, cut=0, density_norm=scale, split=True, palette=palette, **kwargs)
         ax.grid(False)
 
         if hue is not None:

From 4b160e9c3909b2f3995eca57038f718139f42fd8 Mon Sep 17 00:00:00 2001
From: Yiming Yang <yang.yihming@gmail.com>
Date: Sat, 6 Jan 2024 23:13:21 -0800
Subject: [PATCH 53/57] dotplot and violin skip genes not in the data

---
 pegasus/plotting/plot_library.py | 21 ++++++++++++---------
 1 file changed, 12 insertions(+), 9 deletions(-)

diff --git a/pegasus/plotting/plot_library.py b/pegasus/plotting/plot_library.py
index 19d27fbe..b5123381 100644
--- a/pegasus/plotting/plot_library.py
+++ b/pegasus/plotting/plot_library.py
@@ -947,6 +947,9 @@ def violin(
         assert not isinstance(data, anndata.AnnData)
         data.select_matrix(matkey)
 
+    # Filter out attributes not existing in the data
+    attrs = _get_valid_attrs(data, attrs)
+
     nrows = len(attrs)
     fig, axes = _get_subplot_layouts(nrows=nrows, ncols=1, panel_size=panel_size, dpi=dpi, left=left, bottom=bottom, wspace=wspace, hspace=0, squeeze=False, sharey=False)
 
@@ -958,9 +961,6 @@ def violin(
             assert is_numeric_dtype(data.obs[key])
             obs_keys.append(key)
         else:
-            if key not in data.var_names:
-                logger.warning(f"Cannot find gene {key}. Please make sure all genes are included in data.var_names before running this function!")
-                return None
             genes.append(key)
 
     df_list = [pd.DataFrame({"label": data.obs[groupby].values})]
@@ -1170,7 +1170,7 @@ def heatmap(
             if not 'cmap' in kwargs.keys():
                 kwargs['cmap'] = 'Reds'
             df['cluster_name'] = cluster_ids
-            df = df.groupby('cluster_name').mean()
+            df = df.groupby(by='cluster_name', observed=True).mean()
             cluster_ids = df.index
         else:
             if not groupby_cluster:
@@ -1300,7 +1300,7 @@ def dotplot(
     data: Union[MultimodalData, UnimodalData, anndata.AnnData],
     genes: Union[str, List[str]],
     groupby: str,
-    reduce_function: Callable[[np.ndarray], float] = np.mean,
+    reduce_function: Union[str, Callable[[np.ndarray], float]] = "mean",
     fraction_min: float = 0,
     fraction_max: float = None,
     dot_min: int = 0,
@@ -1325,7 +1325,7 @@ def dotplot(
         Features to plot.
     groupby: ``str``
         A categorical variable in data.obs that is used to categorize the cells, e.g. Clusters.
-    reduce_function: ``Callable[[np.ndarray], float]``, optional, default: ``np.mean``
+    reduce_function: ``Union[str, Callable[[np.ndarray], float]]``, optional, default: ``"mean"``
         Function to calculate statistic on expression data. Default is mean.
     fraction_min: ``float``, optional, default: ``0``.
         Minimum fraction of expressing cells to consider.
@@ -1364,12 +1364,14 @@ def dotplot(
     sns.set(font_scale=0.7, style='whitegrid')
 
     if not is_list_like(genes):
-        geness = [genes]
+        genes = [genes]
+
+    # Select only genes existing in the data
+    genes = _get_valid_attrs(data, genes)
 
     keywords = dict(cmap=cmap)
     keywords.update(kwds)
 
-    from scipy.sparse import issparse
     X = slicing(data[:, genes].X)
     df = pd.DataFrame(data=X, columns=genes)
     df[groupby] = data.obs[groupby].values
@@ -1387,7 +1389,8 @@ def dotplot(
     def non_zero(g):
         return np.count_nonzero(g) / g.shape[0]
 
-    summarized_df = df.groupby(groupby).aggregate([reduce_function, non_zero])
+    # observed=True aggregates only categories actually present in the data and silences the pandas warning about the changing default.
+    summarized_df = df.groupby(by=groupby, observed=True).aggregate([reduce_function, non_zero])
 
     row_indices = summarized_df.index.tolist()
     if sort_function == "natsorted":

From b61f9ce306db82eeef7e6fcf2efddcbe9283e4ab Mon Sep 17 00:00:00 2001
From: Yiming Yang <yang.yihming@gmail.com>
Date: Sat, 6 Jan 2024 23:17:35 -0800
Subject: [PATCH 54/57] heatmap skip attributes not in the data

---
 pegasus/plotting/plot_library.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/pegasus/plotting/plot_library.py b/pegasus/plotting/plot_library.py
index b5123381..7ffd1581 100644
--- a/pegasus/plotting/plot_library.py
+++ b/pegasus/plotting/plot_library.py
@@ -1128,6 +1128,9 @@ def heatmap(
     if isinstance(attrs, str):
         attrs = [attrs]
 
+    # Filter out attributes not existing in the data
+    attrs = _get_valid_attrs(data, attrs)
+
     obs_keys = []
     genes = []
     for key in attrs:

From c8600cc91ab8577ea100564f867fd3ffd74bc6ce Mon Sep 17 00:00:00 2001
From: Yiming Yang <yang.yihming@gmail.com>
Date: Sun, 7 Jan 2024 13:08:02 -0800
Subject: [PATCH 55/57] update CI test

---
 .github/workflows/ci-test.yml |  6 +----
 tests/run_hashing_citeseq.sh  | 13 ---------
 tests/test_hashing_citeseq.py | 50 -----------------------------------
 3 files changed, 1 insertion(+), 68 deletions(-)
 delete mode 100644 tests/run_hashing_citeseq.sh
 delete mode 100644 tests/test_hashing_citeseq.py

diff --git a/.github/workflows/ci-test.yml b/.github/workflows/ci-test.yml
index e66dd54d..a501f27c 100644
--- a/.github/workflows/ci-test.yml
+++ b/.github/workflows/ci-test.yml
@@ -14,7 +14,7 @@ jobs:
     strategy:
       matrix:
         os: [ubuntu-latest]
-        python-version: ['3.8', '3.9', '3.10']
+        python-version: ['3.8', '3.9', '3.10', '3.11']
 
     steps:
     - uses: actions/checkout@v2
@@ -48,10 +48,6 @@ jobs:
     - name: One sample input test
       run: |
         bash tests/run_one_sample.sh
-    - name: Hashing CITE-Seq pipeline test
-      run: |
-        bash tests/run_hashing_citeseq.sh
-        pytest tests/test_hashing_citeseq.py
     - name: iNMF test
       run: |
         bash tests/run_inmf.sh
diff --git a/tests/run_hashing_citeseq.sh b/tests/run_hashing_citeseq.sh
deleted file mode 100644
index e546244e..00000000
--- a/tests/run_hashing_citeseq.sh
+++ /dev/null
@@ -1,13 +0,0 @@
-pegasus demuxEM -p 2 --generate-diagnostic-plots tests/data/hashing_citeseq/cb_cc_raw_gene_bc_matrices_h5.h5 tests/data/hashing_citeseq/cb_cell_hashing.csv tests/cb_cc
-if [ -f "tests/cb_cc_demux.zarr.zip" ]; then
-    pegasus aggregate_matrix --select-only-singlets --min-genes 100 tests/data/sample_hashing_citeseq.csv tests/cb_cc_citeseq
-
-    if [ -f "tests/cb_cc_citeseq.zarr.zip" ]; then
-        pegasus cluster -p 2 --min-genes 500 --max-genes 6000 --mito-prefix MT- --percent-mito 20 --louvain --umap --citeseq --citeseq-umap --citeseq-umap-exclude Mouse_IgG1,Mouse_IgG2a,Mouse_IgG2b,Rat_IgG2b tests/cb_cc_citeseq.zarr.zip tests/citeseq_result
-
-        if [ -f "tests/citeseq_result.zarr.zip" ]; then
-            pegasus plot scatter --basis umap --attributes louvain_labels,assignment tests/citeseq_result.zarr.zip tests/citeseq_result.umap.pdf
-            pegasus plot scatter --basis citeseq_umap --attributes louvain_labels,assignment tests/citeseq_result.zarr.zip tests/citeseq_result.citeseq_umap.pdf
-        fi
-    fi
-fi
diff --git a/tests/test_hashing_citeseq.py b/tests/test_hashing_citeseq.py
deleted file mode 100644
index e30cdff7..00000000
--- a/tests/test_hashing_citeseq.py
+++ /dev/null
@@ -1,50 +0,0 @@
-"""
-Unittest module for hashing_citeseq
-"""
-
-import os
-import glob
-import unittest
-
-import numpy as np
-import pandas as pd
-import pegasus as pg
-
-
-class TestPipeline(unittest.TestCase):
-    def test_demux(self):
-        data = pg.read_input("tests/cb_cc_demux.zarr.zip")
-        self.assertEqual(data.shape, (737280, 33694), "Demux data shape differs!")
-        self.assertIn('demux_type', data.obs.columns, "Demux type is lost!")
-        self.assertIn('assignment', data.obs.columns, "Cell assignment is lost!")
-        f_list = glob.glob("tests/cb_cc.*.pdf")
-        self.assertEqual(len(f_list), 4, "Demux diagnosis plots are missing!")
-        self.assertIn('cb_cc.out.demuxEM.zarr.zip', os.listdir('tests'), "Demultiplexed RNA matrix is lost!")
-
-    def test_citeseq(self):
-        data = pg.read_input("tests/cb_cc_citeseq.zarr.zip")
-        self.assertSetEqual(set(data.list_data()), set(['GRCh38-citeseq', 'GRCh38-rna']), "Some modality is missing!")
-        self.assertNotIn('demux_type', data.obs.columns, "Demux type is not removed!")
-        self.assertIn('assignment', data.obs.columns, "Cell assignment is lost!")
-        self.assertEqual(data.shape, (14363, 33694), "RNA data shape differs!")
-        data.select_data('GRCh38-citeseq')
-        self.assertEqual(data.shape, (14363, 31), "CITE-Seq data shape differs!")
-
-    def test_clustering(self):
-        data = pg.read_input("tests/citeseq_result.zarr.zip")
-        self.assertSetEqual(set(data.list_data()), set(['GRCh38-citeseq', 'GRCh38-rna']), "Some modality is missing!")
-        n_rna_cells = data.shape[0]
-        self.assertNotIn('demux_type', data.obs.columns, "Demux type is not removed!")
-        self.assertEqual(data.obs['assignment'].cat.categories.size, 7, "Not all cells are demultiplexed singlets!")
-        self.assertIn('X_citeseq', data.obsm.keys(), "CITE-Seq coordinates are lost!")
-        self.assertEqual(data.obsm['X_citeseq_umap'].shape[1], data.obsm['X_umap'].shape[1], "Some of UMAP embeddings is lost!")
-        data.select_data('GRCh38-citeseq')
-        n_citeseq_cells = data.shape[0]
-        self.assertEqual(n_rna_cells, n_citeseq_cells, "Two modalities have inconsistent number of cells!")
-
-    def test_plot(self):
-        self.assertIn('citeseq_result.citeseq_umap.pdf', os.listdir('tests'), "CITE-Seq UMAP plot is lost!")
-        self.assertIn('citeseq_result.umap.pdf', os.listdir('tests'), "RNA UMAP plot is lost!")
-
-if __name__ == "__main__":
-    unittest.main()

From 89ca768e25cc316de3f582c9ad503a17d4d94f0c Mon Sep 17 00:00:00 2001
From: Yiming Yang <yang.yihming@gmail.com>
Date: Fri, 19 Jan 2024 17:00:52 -0800
Subject: [PATCH 56/57] Fix heatmap (#286)

Fix issues in heatmap
---
 pegasus/plotting/plot_library.py | 52 ++++++++++++++++----------------
 pegasus/plotting/plot_utils.py   |  5 ++-
 2 files changed, 30 insertions(+), 27 deletions(-)

diff --git a/pegasus/plotting/plot_library.py b/pegasus/plotting/plot_library.py
index 7ffd1581..edce18d9 100644
--- a/pegasus/plotting/plot_library.py
+++ b/pegasus/plotting/plot_library.py
@@ -1027,15 +1027,16 @@ def heatmap(
     attrs_dendrogram: Optional[bool] = True,
     attrs_method: Optional[bool] = 'ward',
     attrs_optimal_ordering: Optional[bool] = True,
-    attrs_labelsize: Optional[float] = 10.0,
-    attrs_labelrotation: Optional[float] = 0.0,
+    xlabel_size: Optional[float] = 10.0,
+    ylabel_size: Optional[float] = 10.0,
+    legend_fontsize: Optional[float] = 10.0,
+    xlabel_rotation: Optional[float] = 90.0,
+    ylabel_rotation: Optional[float] = 0.0,
     groupby_cluster: Optional[bool] = True,
     groupby_dendrogram: Optional[bool] = True,
     groupby_method: Optional[bool] = 'ward',
     groupby_optimal_ordering: Optional[bool] = True,
     groupby_precomputed_linkage: Optional[np.array] = None,
-    groupby_labelsize: Optional[float] = 10.0,
-    groupby_labelrotation: Optional[float] = 0.0,
     show_sample_name: Optional[bool] = None,
     cbar_labelsize: Optional[float] = 10.0,
     panel_size: Tuple[float, float] = (10, 10),
@@ -1076,10 +1077,16 @@ def heatmap(
         Linkage method for attrs, choosing from ``single``, ``complete``, ``average``, ``weighted``, ``centroid``, ``median`` and ``ward``.
     attrs_optimal_ordering: ``bool``, optional, default: ``True``
         Parameter for scipy.cluster.hierarchy.linkage. If ``True``, the attrs linkage matrix will be reordered so that the distance between successive leaves is minima.
-    attrs_labelsize: ``float``, optional, default: 10.0
-        Fontsize for labels of attrs.
-    attrs_labelrotation: ``float``, optional, default: 0.0
-        Rotation of labels for attrs.
+    xlabel_size: ``float``, optional, default: 10.0
+        Fontsize for x-axis labels.
+    ylabel_size: ``float``, optional, default: 10.0
+        Fontsize for y-axis labels.
+    legend_fontsize: ``float``, optional, default: 10.0
+        Fontsize for legend labels.
+    xlabel_rotation: ``float``, optional, default: 90.0
+        Rotation of x-axis labels.
+    ylabel_rotation: ``float``, optional, default: 0.0
+        Rotation of y-axis labels.
     groupby_cluster: ``bool``, optional, default: ``True``
         Cluster data.obs['groupby'] and generate a cluster-wise dendrogram.
     groupby_dendrogram: ``bool``, optional, default: ``True``
@@ -1090,10 +1097,6 @@ def heatmap(
         Parameter for scipy.cluster.hierarchy.linkage. If ``True``, the groupby linkage matrix will be reordered so that the distance between successive leaves is minima.
     groupby_precomputed_linkage: ``np.array``, optional, default: ``None``
         Pass a precomputed linkage.
-    groupby_labelsize: ``float``, optional, default: 10.0
-        Fontsize for labels of data.obs['groupby'].
-    groupby_labelrotation: ``float``, optional, default: 0.0
-        Rotation of labels for groupby.
     show_sample_name: ``bool``, optional, default: ``None``
         If show sample names as tick labels. If ``None``, show_sample_name == ``True`` if groupby == ``None`` and otherwise show_sample_name == ``False``.
     cbar_labelsize: ``float``, optional, default: 10.0
@@ -1116,7 +1119,7 @@ def heatmap(
 
     Examples
     --------
-    >>> pg.heatmap(data, attrs=['CD14', 'TRAC', 'CD34'], groupby='louvain_labels')
+    >>> pg.heatmap(data, attrs=['CD14', 'TRAC', 'CD34'], groupby='leiden_labels')
 
     """
     if not isinstance(data, anndata.AnnData):
@@ -1158,7 +1161,7 @@ def heatmap(
 
     if show_sample_name is None:
         show_sample_name = True if groupby is None else False
-    sample_tick_labels = df.index if show_sample_name else []
+    groupby_tick_labels = df.index if show_sample_name else []
 
     cluster_ids = None
     cell_colors = None
@@ -1175,6 +1178,7 @@ def heatmap(
             df['cluster_name'] = cluster_ids
             df = df.groupby(by='cluster_name', observed=True).mean()
             cluster_ids = df.index
+            groupby_tick_labels = cluster_ids
         else:
             if not groupby_cluster:
                 idx = cluster_ids.argsort(kind = 'mergesort')
@@ -1199,7 +1203,6 @@ def heatmap(
     if attrs_cluster:
         attrs_linkage = linkage(df.T, attrs_method, optimal_ordering = attrs_optimal_ordering)
 
-
     if not switch_axes:
         cg = sns.clustermap(
             data=df,
@@ -1210,15 +1213,14 @@ def heatmap(
             row_linkage=groupby_linkage,
             col_linkage=attrs_linkage,
             linewidths=0,
-            yticklabels=sample_tick_labels,
+            yticklabels=groupby_tick_labels,
             xticklabels=attr_names,
             figsize=panel_size,
             **kwargs,
         )
         cg.ax_heatmap.set_ylabel("")
-        cg.ax_heatmap.tick_params(axis='x', labelsize=attrs_labelsize, labelrotation=attrs_labelrotation)
-        if groupby is None:
-            cg.ax_heatmap.tick_params(axis='y', labelsize=groupby_labelsize, labelrotation=groupby_labelrotation)
+        cg.ax_heatmap.tick_params(axis='x', labelsize=xlabel_size, labelrotation=xlabel_rotation)
+        cg.ax_heatmap.tick_params(axis='y', labelsize=ylabel_size, labelrotation=ylabel_rotation)
     else:
         cg = sns.clustermap(
             data=df.T,
@@ -1230,14 +1232,13 @@ def heatmap(
             col_linkage=groupby_linkage,
             linewidths=0,
             yticklabels=attr_names,
-            xticklabels=sample_tick_labels,
+            xticklabels=groupby_tick_labels,
             figsize=panel_size,
             **kwargs,
         )
         cg.ax_heatmap.set_xlabel("")
-        cg.ax_heatmap.tick_params(axis='y', labelsize=attrs_labelsize, labelrotation=attrs_labelrotation)
-        if groupby is None:
-            cg.ax_heatmap.tick_params(axis='x', labelsize=groupby_labelsize, labelrotation=groupby_labelrotation)
+        cg.ax_heatmap.tick_params(axis='y', labelsize=ylabel_size, labelrotation=ylabel_rotation)
+        cg.ax_heatmap.tick_params(axis='x', labelsize=xlabel_size, labelrotation=xlabel_rotation)
 
     show_row_dendrogram = (attrs_cluster and attrs_dendrogram) if switch_axes else (groupby_cluster and groupby_dendrogram)
     show_col_dendrogram = (groupby_cluster and groupby_dendrogram) if switch_axes else (attrs_cluster and attrs_dendrogram)
@@ -1276,7 +1277,7 @@ def heatmap(
         if groupby_cluster:
             from matplotlib.patches import Patch
             legend_elements = [Patch(color = color, label = label) for color, label in zip(palette, cluster_ids.categories)]
-            cg.ax_heatmap.legend(handles=legend_elements, loc='lower left', bbox_to_anchor = (1.02, 1.02), fontsize = groupby_labelsize)
+            cg.ax_heatmap.legend(handles=legend_elements, loc='lower left', bbox_to_anchor = (1.02, 1.02), fontsize = legend_fontsize)
         else:
             values = cluster_ids.value_counts().values
             ticks = np.cumsum(values) - values / 2
@@ -1290,7 +1291,7 @@ def heatmap(
                 cg.ax_col_colors.xaxis.tick_top()
                 cg.ax_col_colors.set_xticks(ticks)
                 cg.ax_col_colors.set_xticklabels(labels, rotation=45)
-                cg.ax_col_colors.tick_params(axis='x', top = False, labelsize = groupby_labelsize, length=10)
+                cg.ax_col_colors.tick_params(axis='x', top = False, labelsize = xlabel_size, length=10)
 
     if not isinstance(data, anndata.AnnData):
         if cur_matkey != data.current_matrix():
@@ -1387,7 +1388,6 @@ def dotplot(
     idx = series == 0
     if idx.sum() > 0:
         logger.warning(f"The following categories contain no cells and are removed: {','.join(list(series.index[idx]))}.")
-        df[groupby] = df[groupby].cat.remove_unused_categories()
 
     def non_zero(g):
         return np.count_nonzero(g) / g.shape[0]
diff --git a/pegasus/plotting/plot_utils.py b/pegasus/plotting/plot_utils.py
index 48766ef9..e0b6fde8 100644
--- a/pegasus/plotting/plot_utils.py
+++ b/pegasus/plotting/plot_utils.py
@@ -9,6 +9,9 @@
 from matplotlib.patches import Circle
 from matplotlib.collections import PatchCollection
 
+import logging
+logger = logging.getLogger(__name__)
+
 
 def _transform_basis(basis: str) -> str:
     if basis == "tsne":
@@ -453,6 +456,6 @@ def _get_valid_attrs(data:Union[MultimodalData, UnimodalData], attrs: List[str])
         else:
             attrs_drop.append(attr)
     if len(attrs_drop) > 0:
-        print(f"Warning: Attributes {attrs_drop} are not in data.obs, data.var_names or data.obsm!")
+        logger.warning(f"Attributes {attrs_drop} are not in data.obs, data.var_names or data.obsm!")
 
     return attrs_filt

From 40e6ef5b7d7009600a00b5238f1d01a1cdaddc87 Mon Sep 17 00:00:00 2001
From: Yiming Yang <yang.yihming@gmail.com>
Date: Fri, 19 Jan 2024 20:49:32 -0800
Subject: [PATCH 57/57] update docs

---
 docs/conf.py                       |  6 +++---
 docs/index.rst                     |  2 +-
 docs/release_notes.rst             |  5 +++++
 docs/release_notes/version_1_9.rst | 14 ++++++++++++++
 docs/requirements.txt              |  1 -
 requirements.txt                   |  2 +-
 setup.py                           |  2 +-
 7 files changed, 25 insertions(+), 7 deletions(-)
 create mode 100644 docs/release_notes/version_1_9.rst

diff --git a/docs/conf.py b/docs/conf.py
index 25a5726a..fc3d4cdf 100644
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -23,15 +23,15 @@
 # -- Project information -----------------------------------------------------
 
 project = "Pegasus"
-copyright = "2023 Genentech, Inc. All rights reserved."
+copyright = "2024 Genentech, Inc. All rights reserved."
 author = (
     "Yiming Yang, Joshua Gould and Bo Li"
 )
 
 # The short X.Y version
-version = "1.8"
+version = "1.9"
 # The full version, including alpha/beta/rc tags
-release = "1.8.1"
+release = "1.9.0"
 
 
 # -- General configuration ---------------------------------------------------
diff --git a/docs/index.rst b/docs/index.rst
index b1893bf2..c8d37d11 100644
--- a/docs/index.rst
+++ b/docs/index.rst
@@ -10,7 +10,7 @@
 Release Highlights in Current Stable
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
-.. include:: release_notes/version_1_8.rst
+.. include:: release_notes/version_1_9.rst
 
 .. toctree::
    :maxdepth: 1
diff --git a/docs/release_notes.rst b/docs/release_notes.rst
index 4adc7f23..7a2690a8 100644
--- a/docs/release_notes.rst
+++ b/docs/release_notes.rst
@@ -6,6 +6,11 @@ Release Notes
 .. note::
     Also see the release notes of `PegasusIO <https://pegasusio.readthedocs.io/en/stable/release_notes.html>`__.
 
+Version 1.9
+~~~~~~~~~~~~~
+
+.. include:: release_notes/version_1_9.rst
+
 Version 1.8
 ~~~~~~~~~~~~~
 
diff --git a/docs/release_notes/version_1_9.rst b/docs/release_notes/version_1_9.rst
new file mode 100644
index 00000000..fa61e2f3
--- /dev/null
+++ b/docs/release_notes/version_1_9.rst
@@ -0,0 +1,14 @@
+1.9.0 :small:`January 19, 2024`
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+**New Feature and Improvement**
+
+* ``calculate_z_score`` works with sparse count matrix. [PR `276 <https://github.com/lilab-bcb/pegasus/pull/276>`_ Thanks to `Jayaram Kancherla <https://github.com/jkanche>`_]
+* Plotting functions (``scatter``, ``dotplot``, ``violin``, ``heatmap``) now warn about genes/attributes that do not exist in the data and skip them in the plots.
+* Improve ``heatmap``:
+
+  * Add ``show_sample_name`` parameter for cases of pseudo-bulk data, nanoString DSP data, etc.
+  * Use SciPy's linkage (``scipy.cluster.hierarchy.linkage``) for dendrograms to take advantage of its optimal leaf ordering (see the ``groupby_optimal_ordering`` and ``attrs_optimal_ordering`` parameters).
+
+* Update human lung and mouse immune markers used by ``infer_cell_types`` function.
+* Expose ``online_batch_size`` parameter in ``nmf`` and ``integrative_nmf`` functions.
diff --git a/docs/requirements.txt b/docs/requirements.txt
index f6857eea..e714db1c 100644
--- a/docs/requirements.txt
+++ b/docs/requirements.txt
@@ -14,7 +14,6 @@ natsort
 joblib
 psutil
 numba
-importlib_metadata; python_version < '3.8'
 umap-learn
 forceatlas2-python
 pyarrow
diff --git a/requirements.txt b/requirements.txt
index 6e948083..5458ce4d 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -15,7 +15,7 @@ natsort
 numba
 numpy
 pandas>=1.2.0
-pegasusio>=0.5.1
+pegasusio>=0.9.0
 pybind11
 scikit-learn>=0.23.2
 scikit-misc
diff --git a/setup.py b/setup.py
index 10dac019..087a27bc 100644
--- a/setup.py
+++ b/setup.py
@@ -59,7 +59,7 @@
         scvi=["scvi-tools"],
         all=["fitsne", "louvain", "scanorama", "torch", "harmony-pytorch", "nmf-torch", "rpy2", "forceatlas2-python", "scvi-tools"]
     ),
-    python_requires="~=3.7",
+    python_requires="~=3.8",
     package_data={
         "pegasus.annotate_cluster": [
             "human_immune_cell_markers.json",