From 1234a12ad943f6f1944902c78fb5828ded42db51 Mon Sep 17 00:00:00 2001
From: Bo Li
Date: Sun, 4 Dec 2022 13:15:49 -0800
Subject: [PATCH 01/57] Added n_comps to split_one_cluster function
---
pegasus/tools/clustering.py | 6 +++++-
1 file changed, 5 insertions(+), 1 deletion(-)
diff --git a/pegasus/tools/clustering.py b/pegasus/tools/clustering.py
index 29c50d5b..89d41895 100644
--- a/pegasus/tools/clustering.py
+++ b/pegasus/tools/clustering.py
@@ -643,6 +643,7 @@ def split_one_cluster(
n_clust: int,
res_label: str,
rep: str = "pca",
+ n_comps: int = None,
random_state: int = 0,
) -> None:
"""
@@ -668,6 +669,9 @@ def split_one_cluster(
rep: ``str``, optional, default: ``"pca"``
The embedding representation used for Kmeans clustering. Keyword ``'X_' + rep`` must exist in ``data.obsm``. By default, use PCA coordinates.
+ n_comps: `int`, optional (default: None)
+ Number of components of ``rep`` to use. If ``n_comps`` is ``None``, use all components; otherwise, use the minimum of ``n_comps`` and the number of dimensions in ``rep``.
+
n_jobs : `int`, optional (default: -1)
Number of threads to use for the KMeans step in 'spectral_louvain' and 'spectral_leiden'. -1 refers to using all physical CPU cores.
@@ -688,7 +692,7 @@ def split_one_cluster(
idx = np.where(data.obs[clust_label] == clust_id)[0]
tmpdat = data[idx].copy()
from pegasus.tools import neighbors
- neighbors(tmpdat, rep=rep, use_cache=False)
+ neighbors(tmpdat, rep=rep, n_comps=n_comps, use_cache=False)
leiden(tmpdat, rep=rep, resolution=None, n_clust=n_clust, random_state=random_state)
new_clust = data.obs[clust_label].values.astype(int)
new_label = new_clust.max() + 1
From 92c9ae51972d5082734217a3d041e93ba122c14c Mon Sep 17 00:00:00 2001
From: Bo Li
Date: Sun, 4 Dec 2022 15:04:47 -0800
Subject: [PATCH 02/57] Added Migratory DC markers
---
.../human_immune_cell_markers.json | 105 ++++++++++--------
1 file changed, 58 insertions(+), 47 deletions(-)
diff --git a/pegasus/annotate_cluster/human_immune_cell_markers.json b/pegasus/annotate_cluster/human_immune_cell_markers.json
index 60eaa54c..9e307620 100644
--- a/pegasus/annotate_cluster/human_immune_cell_markers.json
+++ b/pegasus/annotate_cluster/human_immune_cell_markers.json
@@ -72,6 +72,42 @@
}
},
+ {
+ "name" : "Natural killer cell",
+ "markers" : [
+ {
+ "genes" : ["NCAM1+"],
+ "weight" : 0.2,
+ "comment" : "CD56"
+ },
+ {
+ "genes" : ["NKG7+"],
+ "weight" : 0.2,
+ "comment" : "natural killer cell granule protein 7"
+ },
+ {
+ "genes" : ["KLRB1+", "KLRD1+", "KLRF1+", "KLRC1+", "KLRC2+", "KLRC3+", "KLRC4+"],
+ "weight" : 0.25,
+ "comment" : "killer cell lectin like receptors"
+ },
+ {
+ "genes" : ["CD3D-", "CD3E-", "CD3G-"],
+ "weight" : 0.15,
+ "comment" : "not T cell"
+ },
+ {
+ "genes" : ["FCGR3A+"],
+ "weight" : 0.1,
+ "comment" : "CD16a"
+ },
+ {
+ "genes" : ["ITGAL+", "ITGAM+"],
+ "weight" : 0.1,
+ "comment" : "CD11a,CD11b"
+ }
+ ]
+ },
+
{
"name" : "B cell",
"markers" : [
@@ -154,7 +190,7 @@
{
"genes" : ["CD38+", "BCL6+", "BCL2-", "RGS13+"],
"weight" : 0.6,
- "comment" : "First 3 markers are from Klein et al. PNAS 2003 https://doi.org/10.1073/pnas.0437996100 (Fig. 1 & 2). The last marker is from XXX"
+ "comment" : "First 3 markers are from Klein et al. PNAS 2003 https://doi.org/10.1073/pnas.0437996100 (Fig. 1 & 2). The last marker is from Xing et al. Science Advances 2021 Table S2 (Germinal center B)"
},
{
"genes" : ["PCNA+", "MKI67+"],
@@ -190,38 +226,28 @@
},
{
- "name" : "Natural killer cell",
+ "name" : "Plasma cell",
"markers" : [
{
- "genes" : ["NCAM1+"],
- "weight" : 0.2,
- "comment" : "CD56"
+ "genes" : ["CD38+", "XBP1+", "CD27+", "SLAMF7+"],
+ "weight" : 0.4,
+ "comment" : "important markers"
},
{
- "genes" : ["NKG7+"],
+ "genes" : ["TNFRSF17+", "TNFRSF13B+"],
"weight" : 0.2,
- "comment" : "natural killer cell granule protein 7"
- },
- {
- "genes" : ["KLRB1+", "KLRD1+", "KLRF1+", "KLRC1+", "KLRC2+", "KLRC3+", "KLRC4+"],
- "weight" : 0.25,
- "comment" : "killer cell lectin like receptors"
- },
- {
- "genes" : ["CD3D-", "CD3E-", "CD3G-"],
- "weight" : 0.15,
- "comment" : "not T cell"
+ "comment" : "TNF-receptor superfamily"
},
{
- "genes" : ["FCGR3A+"],
- "weight" : 0.1,
- "comment" : "CD16a"
+ "genes" : ["IGHA1+", "IGHG1+"],
+ "weight" : 0.2,
+ "comment" : "class switching happened"
},
{
- "genes" : ["ITGAL+", "ITGAM+"],
- "weight" : 0.1,
- "comment" : "CD11a,CD11b"
- }
+ "genes" : ["MS4A1-"],
+ "weight" : 0.2,
+ "comment" : "not B cell, doi: https://doi.org/10.1182/bloodadvances.2017004481, long-live plasma can still express CD19"
+ }
]
},
@@ -270,39 +296,24 @@
},
{
- "name" : "Plasmacytoid dendritic cell",
+ "name" : "Migratory dendritic cell",
"markers" : [
{
- "genes" : ["JCHAIN+", "LILRA4+", "GZMB+", "MZB1+", "IL3RA+", "SERPINF1+", "ITM2C+", "IRF7+"],
+ "genes" : ["FSCN1+", "CCR7+", "LAMP3+", "CCL19+", "CCL22+", "CD40+", "BIRC3+"],
"weight" : 1.0,
- "comment" : "important pDC markers"
+ "comment" : "Xing et al. Science Advances 2021 Table S2 (DCs-C3)"
}
]
},
{
- "name" : "Plasma cell",
+ "name" : "Plasmacytoid dendritic cell",
"markers" : [
{
- "genes" : ["CD38+", "XBP1+", "CD27+", "SLAMF7+"],
- "weight" : 0.4,
- "comment" : "important markers"
- },
- {
- "genes" : ["TNFRSF17+", "TNFRSF13B+"],
- "weight" : 0.2,
- "comment" : "TNF-receptor superfamily"
- },
- {
- "genes" : ["IGHA1+", "IGHG1+"],
- "weight" : 0.2,
- "comment" : "class switching happened"
- },
- {
- "genes" : ["MS4A1-"],
- "weight" : 0.2,
- "comment" : "not B cell, doi: https://doi.org/10.1182/bloodadvances.2017004481, long-live plasma can still express CD19"
- }
+ "genes" : ["JCHAIN+", "LILRA4+", "GZMB+", "MZB1+", "IL3RA+", "SERPINF1+", "ITM2C+", "IRF7+"],
+ "weight" : 1.0,
+ "comment" : "important pDC markers"
+ }
]
},
From 5d72a0ee5ad9547535d3a42c008d17ccce4898e6 Mon Sep 17 00:00:00 2001
From: Bo Li
Date: Fri, 23 Dec 2022 14:49:20 -0500
Subject: [PATCH 03/57] Updated markers
---
.../human_immune_cell_markers.json | 11 +-
.../mouse_brain_cell_markers.json | 142 ++++++++++++------
pegasus/data_files/human_lung.gmt | 19 +++
pegasus/data_files/mouse_brain.gmt | 11 ++
pegasus/tools/signature_score.py | 4 +-
pegasus/tools/utils.py | 2 +
6 files changed, 137 insertions(+), 52 deletions(-)
create mode 100644 pegasus/data_files/human_lung.gmt
create mode 100644 pegasus/data_files/mouse_brain.gmt
diff --git a/pegasus/annotate_cluster/human_immune_cell_markers.json b/pegasus/annotate_cluster/human_immune_cell_markers.json
index 9e307620..cc153e55 100644
--- a/pegasus/annotate_cluster/human_immune_cell_markers.json
+++ b/pegasus/annotate_cluster/human_immune_cell_markers.json
@@ -188,14 +188,9 @@
"comment" : "CD19, CD20 and CD79"
},
{
- "genes" : ["CD38+", "BCL6+", "BCL2-", "RGS13+"],
- "weight" : 0.6,
- "comment" : "First 3 markers are from Klein et al. PNAS 2003 https://doi.org/10.1073/pnas.0437996100 (Fig. 1 & 2). The last marker is from Xing et al. Science Advances 2021 Table S2 (Germinal center B)"
- },
- {
- "genes" : ["PCNA+", "MKI67+"],
- "weight" : 0.1,
- "comment" : "From Klein et al. PNAS 2003 https://doi.org/10.1073/pnas.0437996100 (Fig. 2)"
+ "genes" : ["CD38+", "BCL6+", "BCL2-", "RGS13+", "MEF2B+"],
+ "weight" : 0.7,
+ "comment" : "First 3 markers are from Klein et al. PNAS 2003 https://doi.org/10.1073/pnas.0437996100 (Fig. 1 & 2). The last 2 markers are from Fig. 4B of Massoni-Badosa et al. Tonsil Atlas paper"
}
],
"subtypes" : {
diff --git a/pegasus/annotate_cluster/mouse_brain_cell_markers.json b/pegasus/annotate_cluster/mouse_brain_cell_markers.json
index ebfc0eac..f7895afe 100644
--- a/pegasus/annotate_cluster/mouse_brain_cell_markers.json
+++ b/pegasus/annotate_cluster/mouse_brain_cell_markers.json
@@ -1,5 +1,6 @@
{
"title" : "Mouse brain cell markers",
+ "comments": "Yao et al. Nature 2021 Allen Mouse Brain Map is a great resource for markers; Map: https://celltypes.brain-map.org/rnaseq/mouse_ctx-hpf_10x?selectedVisualization=Heatmap&colorByFeature=Cell+Type&colorByFeatureValue=Gad1; Cell type metadata: https://brainpalmseq.med.ubc.ca/brain-regions/neocortex-allen-brain-atlas-rnaseq/search-allen-brain-map-by-all-cell-types/; Extended Data Fig 2 & Supp Table 1 of Zhang et al. Nature 2021 is also used in marker selection",
"cell_types" : [
{
"name" : "Glutamatergic neuron",
@@ -168,28 +169,60 @@
"name" : "Oligodendrocyte",
"markers" : [
{
- "genes" : ["Mbp+", "Plp1+"],
- "weight" : 0.6,
- "comment" : "Oligo specific markers (Allen Brain Map)"
- },
- {
- "genes" : ["Mog+"],
- "weight" : 0.15,
- "comment" : "Oligo specific markers, but not expressed in all Oligo cells (Allen Brain Map)"
+ "genes" : ["Plp1+", "Cnp+", "Fa2h+", "St18+", "Mbp+"],
+ "weight" : 0.8,
+ "comment" : "Oligo specific markers from Yao et al. Nature 2021 (Allen Brain Map)"
},
{
"genes" : ["Olig1+", "Olig2+", "Sox10+"],
- "weight" : 0.25,
+ "weight" : 0.2,
"comment" : "Expressed in both Oligo and OPC (Allen Brain Map)"
}
- ]
+ ],
+ "subtypes" : {
+ "title" : "Oligodendrocyte subtype markers",
+ "cell_types" : [
+ {
+ "name" : "Opalin+ Oligodendrocyte",
+ "markers" : [
+ {
+ "genes" : ["Opalin+", "Mog+", "Plekhh1+", "Ermn+"],
+ "weight" : 1.0,
+ "comment": "Opalin+ markers from Yao et al. Nature 2021"
+ }
+ ]
+ },
+ {
+ "name" : "Enpp6+ Oligodendrocyte",
+ "markers" : [
+ {
+ "genes" : ["Enpp6+", "Pik3r3+", "Cnksr3+", "Parvb+", "Dusp15+"],
+ "weight" : 1.0,
+ "comment": "Enpp6+ markers from Yao et al. Nature 2021"
+ }
+ ]
+ },
+ {
+ "name" : "Neu4+ Oligodendrocyte",
+ "markers" : [
+ {
+ "genes" : ["Neu4+"],
+ "weight" : 1.0,
+ "comment": "Neu4+ markers from Yao et al. Nature 2021"
+ }
+ ]
+ }
+
+ ]
+ }
},
{
"name" : "OPC",
"markers" : [
{
- "genes" : ["Pdgfra+", "Cspg4+"],
- "weight" : 1.0
+ "genes" : ["Pdgfra+", "Cspg4+", "Emid1+", "Fabp7+"],
+ "weight" : 1.0,
+ "comment": "Oligodendrocyte progenitor cell markers from Yao et al. Nature 2021"
}
]
},
@@ -197,71 +230,94 @@
"name" : "Astrocyte",
"markers" : [
{
- "genes" : ["Aqp4+", "Gja1+", "F3+", "Prex2+"],
- "weight" : 1.0
+ "genes" : ["Mt2+", "Gja1+", "Prdx6+", "Htra1+", "Ntsr2+", "Aldoc+", "Apoe+", "Prex2+", "Aqp4+", "Gpr37l1+"],
+ "weight" : 1.0,
+ "comment": "Astrocyte markers from Yao et al. Nature 2021"
}
- ]
+ ],
+ "subtypes" : {
+ "title" : "Astrocyte subtype markers",
+ "cell_types" : [
+ {
+ "name" : "Gfap+ Astrocyte",
+ "markers" : [
+ {
+ "genes" : ["Gfap+", "Aqp4+", "Tmem47+", "Id4+", "Mlc1+", "Sdc4+", "Gstm1+"],
+ "weight" : 1.0,
+ "comment": "Gfap+ markers from Yao et al. Nature 2021"
+ }
+ ]
+ },
+ {
+ "name" : "Slc7a10+ Astrocyte",
+ "markers" : [
+ {
+ "genes" : ["Slc7a10+", "Grm3+", "Trpm3+", "Phkg1+", "Cdh10+", "Luzp2+", "Gria2+", "Slc6a1+"],
+ "weight" : 1.0,
+ "comment": "Slc7a10+ markers from Yao et al. Nature 2021"
+ }
+ ]
+ }
+ ]
+ }
},
{
"name" : "Microglia",
"markers" : [
{
- "genes" : ["C1qb+", "P2ry12+", "Ctss+", "Csf1r+", "Hmha1+"],
- "weight" : 1.0
+ "genes" : ["Hexb+", "Siglech+", "Selplg+", "Tmem119+", "Ctss+", "P2ry12+", "Cx3cr1+", "Trem2+", "Fcrls+", "Csf1r+"],
+ "weight" : 1.0,
+ "comment": "Microglia specific markers from Yao et al. Nature 2021"
}
]
},
{
- "name" : "Endothelial",
- "markers" : [
- {
- "genes" : ["Flt1+", "Dcn+", "Xdh+", "Id1+"],
- "weight" : 1.0
- }
- ]
- },
- {
- "name" : "Fibroblast",
+ "name" : "Perivascular macrophage",
"markers" : [
{
- "genes" : ["Igfbp1+", "Dcn+"],
- "weight" : 1.0
+ "genes" : ["Mrc1+", "Stab1+", "Lyz2+", "Ms4a6c+", "F13a1+", "Pf4+"],
+ "weight" : 1.0,
+ "comment": "PVM specific markers from Yao et al. Nature 2021"
}
]
},
{
- "name" : "Mural",
+ "name" : "Endothelial cell",
"markers" : [
{
- "genes" : ["Rgs5+", "Acta2+"],
- "weight" : 1.0
+ "genes" : ["Flt1+", "Pecam1+", "Ly6a+", "Slco1a4+", "Mecom+", "Ptprb+", "Id1+"],
+ "weight" : 1.0,
+ "comment" : "Endo specific markers from Yao et al. Nature 2021"
}
- ]
+ ]
},
{
- "name" : "Choroid Coch",
+ "name" : "Vascular leptomeningeal cell",
"markers" : [
{
- "genes" : ["Tgfbi+"],
- "weight" : 1.0
+ "genes" : ["Slc7a11+", "Slc6a13+", "Bmp6+", "Igfbp2+", "Fmod+", "Ranbp3l+"],
+ "weight" : 1.0,
+ "comment" : "VLMC specific markers from Yao et al. Nature 2021"
}
- ]
+ ]
},
{
- "name" : "Ependyma",
+ "name" : "Smooth muscle cell",
"markers" : [
{
- "genes" : ["Ccdc153+"],
- "weight" : 1.0
+ "genes" : ["Acta2+", "Myh11+", "Tagln+", "Pln+", "Mylk+"],
+ "weight" : 1.0,
+ "comment" : "SMC specific markers from Yao et al. Nature 2021"
}
]
},
{
- "name" : "Smooth muscle cell",
+ "name" : "Pericyte",
"markers" : [
{
- "genes" : ["Vtn+", "Colec12+"],
- "weight" : 1.0
+ "genes" : ["Vtn+", "Atp13a5+", "Abcc9+", "Kcnj8+", "Art3+"],
+ "weight" : 1.0,
+ "comment" : "Pericyte specific markers from Yao et al. Nature 2021"
}
]
}
diff --git a/pegasus/data_files/human_lung.gmt b/pegasus/data_files/human_lung.gmt
new file mode 100644
index 00000000..726c485b
--- /dev/null
+++ b/pegasus/data_files/human_lung.gmt
@@ -0,0 +1,19 @@
+Epithelial Epithelial markers from HTAPP paper KRT8 KRT18 EPCAM CD24
+Endothelial Endothelial shared markers from Xing et al. Science Advances 2021, Tony et al. Nature 2021 and Schupp et al. Circulation 2021 PECAM1 CLDN5 CDH5 ERG
+Fibroblast Fibroblast/Myofibroblast shared markers from Travaglini et al. COL1A1 COL1A2 PDGFRA ELN BGN
+Macrophage Macro CD68 CD163 C1QA MRC1 MS4A6A MSR1 MERTK
+SMC SMC from Muus et al., Braga et al. and Schupp et al. MYH11 TAGLN ACTG2 CNN1 PLN
+Pericyte Pericyte from Schupp et al. and Travaglini et al. TRPC6 CSPG4 FAM162B GJA4 GJC1 HIGD1B CDH6 LAMC3 FHL5
+T cell T cell markers CD3D CD3E CD3G TRAC
+B cell B cell markers CD19 MS4A1 CD79A CD79B
+Plasma cell Plasma cell markers CD38 XBP1 CD27 SLAMF7 TNFRSF17 TNFRSF13B
+Mast cell Mast cell markers KIT CPA3 TPSB2 TPSAB1 AREG RGS1 RGS2
+Neutrophil Neutrophil markers FUT4 MPO CEACAM8 ELANE CXCR1 CXCR2 LY6G6D
+AT1 AT1 markers from Schupp et al., Travaglini et al. and Tony et al. AGER CAV1 RTKN2 MYL9 SPOCK2 ANXA3 TIMP3 CAV2 ST6GALNAC5 MYRF
+AT2 AT2 markers from Schupp et al., Travaglini et al. and Tony et al. SFTPA1 SFTPA2 SFTPC ETV5 TTN PLA2G4F CCDC141 LAMP3 ABCA3 HHIP
+Basal Basal cell markers from Schupp et al., Travaglini et al. and Tony et al. KRT5 KRT15 KRT17 TP63 S100A2 TNS4
+Ciliated Ciliated cell markers from Schupp et al., Travaglini et al. and Tony et al. ERICH3 SNTN CCDC78 SNTN ZBBX DNAI1 ARMC3 CFAP157 TTC29 CFAP73
+Club Club cell markers from Schupp et al., Travaglini et al. and Tony et al. SCGB3A2 MGP VIM CST3
+Goblet Goblet cell markers from Schupp et al., Travaglini et al. and Tony et al. MUC5AC MUC5B BPIFB1 MSMB FAM3D SERPINB11 CXCL6 SCGB1A1 FAM3D SERPINB3
+Ionocyte Ionocyte markers from Travaglini et al. FOXI1 ASCL3 CLDN25 ATP6V1G3 LINC01187
+PNEC Pulmonary neuroendocrine cell markers from Travaglini et al. CALCA CHGA ASCL1 SLC35D3 KIF1A
diff --git a/pegasus/data_files/mouse_brain.gmt b/pegasus/data_files/mouse_brain.gmt
new file mode 100644
index 00000000..fa32ef25
--- /dev/null
+++ b/pegasus/data_files/mouse_brain.gmt
@@ -0,0 +1,11 @@
+GlutamatergicNeuron Glutamatergic neuron Slc17a7 Slc17a6 Neurod6 Neurod2
+GABAergicNeuron GABAergic neuron Gad1 Gad2 Slc32a1
+Oligodendrocyte Oligodendrocyte Plp1 Cnp Fa2h St18 Mbp
+OPC Oligodendrocyte progenitor cell Pdgfra Cspg4 Emid1 Fabp7
+SMC Smooth muscle cell Acta2 Myh11 Tagln Pln Mylk
+Pericyte Pericyte Vtn Atp13a5 Abcc9 Kcnj8 Art3
+Endo Endothelial cell Flt1 Pecam1 Ly6a Slco1a4 Mecom Ptprb Id1
+Microglia Microglia cell Hexb Siglech Selplg Tmem119 Ctss P2ry12 Cx3cr1 Trem2 Fcrls Csf1r
+Astrocyte Astrocyte Mt2 Gja1 Prdx6 Htra1 Ntsr2 Aldoc Apoe Prex2 Aqp4 Gpr37l1
+PVM Perivascular macrophages Mrc1 Stab1 Lyz2 Ms4a6c F13a1 Pf4
+VLMC Vascular leptomeningeal cells Slc7a11 Slc6a13 Bmp6 Igfbp2 Fmod Ranbp3l
diff --git a/pegasus/tools/signature_score.py b/pegasus/tools/signature_score.py
index ee14c446..965e4129 100644
--- a/pegasus/tools/signature_score.py
+++ b/pegasus/tools/signature_score.py
@@ -210,7 +210,9 @@ def calc_signature_score(
elif sig_string.startswith("apoptosis"):
_calc_sig_scores(data, signatures, show_omitted_genes = show_omitted_genes, skip_threshold = 1)
else:
- assert False
+ assert sig_string in predefined_signatures
+ signatures = load_signatures_from_file(predefined_signatures[sig_string])
+ _calc_sig_scores(data, signatures, show_omitted_genes = show_omitted_genes)
else:
signatures = load_signatures_from_file(sig_string)
_calc_sig_scores(data, signatures, show_omitted_genes = show_omitted_genes)
diff --git a/pegasus/tools/utils.py b/pegasus/tools/utils.py
index ebaf11bc..2b80d819 100644
--- a/pegasus/tools/utils.py
+++ b/pegasus/tools/utils.py
@@ -187,6 +187,8 @@ def check_batch_key(data: Union[MultimodalData, UnimodalData], batch: str, warni
ribosomal_genes_mouse=pkg_resources.resource_filename("pegasus", "data_files/ribosomal_genes_mouse.gmt"),
apoptosis_human=pkg_resources.resource_filename("pegasus", "data_files/apoptosis_human.gmt"),
apoptosis_mouse=pkg_resources.resource_filename("pegasus", "data_files/apoptosis_mouse.gmt"),
+ human_lung=pkg_resources.resource_filename("pegasus", "data_files/human_lung.gmt"),
+ mouse_brain=pkg_resources.resource_filename("pegasus", "data_files/mouse_brain.gmt"),
)
predefined_pathways = dict(
From 6f35adf7178a4a8429ae16a7066fd2fc02af485c Mon Sep 17 00:00:00 2001
From: Bo Li
Date: Thu, 29 Dec 2022 01:33:05 -0500
Subject: [PATCH 04/57] Added mouse liver markers
---
.../mouse_brain_cell_markers.json | 2 +-
.../mouse_immune_cell_markers.json | 145 ++++++++++--
.../mouse_liver_cell_markers.json | 209 ++++++++++++++++++
pegasus/data_files/mouse_liver.gmt | 23 ++
setup.py | 5 +-
5 files changed, 367 insertions(+), 17 deletions(-)
create mode 100644 pegasus/annotate_cluster/mouse_liver_cell_markers.json
create mode 100644 pegasus/data_files/mouse_liver.gmt
diff --git a/pegasus/annotate_cluster/mouse_brain_cell_markers.json b/pegasus/annotate_cluster/mouse_brain_cell_markers.json
index f7895afe..1bdfc86c 100644
--- a/pegasus/annotate_cluster/mouse_brain_cell_markers.json
+++ b/pegasus/annotate_cluster/mouse_brain_cell_markers.json
@@ -287,7 +287,7 @@
{
"genes" : ["Flt1+", "Pecam1+", "Ly6a+", "Slco1a4+", "Mecom+", "Ptprb+", "Id1+"],
"weight" : 1.0,
- "comment" : "Endo specific markers from Yao et al. Nature 2021"
+ "comment" : "Endo specific markers from Yao et al. Nature 2021; Slco1a4 is specific to mouse brain: see https://journals.plos.org/plosone/article/figures?id=10.1371/journal.pone.0013741"
}
]
},
diff --git a/pegasus/annotate_cluster/mouse_immune_cell_markers.json b/pegasus/annotate_cluster/mouse_immune_cell_markers.json
index 64e0fd8a..4f462f13 100644
--- a/pegasus/annotate_cluster/mouse_immune_cell_markers.json
+++ b/pegasus/annotate_cluster/mouse_immune_cell_markers.json
@@ -5,8 +5,9 @@
"name" : "T cell",
"markers" : [
{
- "genes" : ["Cd28+", "Cd3d+", "Cd3e+", "Cd4+", "Cd8a+"],
- "weight" : 1.0
+ "genes" : ["Cd3d+", "Cd3e+", "Cd3g+", "Trac+", "Cd28+"],
+ "weight" : 1.0,
+ "comment" : "T cell markers from Kaptein et al. Cell 2022"
}
],
"subtypes" : {
@@ -51,6 +52,7 @@
]
}
},
+
{
"name" : "Monocyte",
"markers" : [
@@ -64,36 +66,151 @@
}
]
},
+
{
"name" : "B cell",
"markers" : [
{
- "genes" : ["Cd19+", "Cd79b+", "Cd74+", "Igkc+", "Ighm+", "Iglc2+", "Ms4a1+"],
- "weight" : 1.0
+ "genes" : ["Cd19+", "Ms4a1+", "Cd79a+", "Cd79b+", "Ebf1+"],
+ "weight" : 1.0,
+ "comment" : "B cell markers from Kaptein et al. Cell 2022"
}
]
},
+
{
- "name" : "Neutrophil",
+ "name" : "Natural killer cell",
"markers" : [
{
- "genes" : ["Mmp9+", "S100a8+", "S100a9+", "Il1b+", "Retnlg+", "Lcn2+"],
- "weight" : 1.0
+ "genes" : ["Eomes+", "Cma1+", "Klra4+", "Klra7+", "Klra8+"],
+ "weight" : 1.0,
+ "comment" : "NK cell markers from Kaptein et al. Cell 2022"
}
]
},
+
{
- "name" : "NK cell",
+ "name" : "Inflammatory monocyte",
"markers" : [
{
- "genes" : ["Nkg7+"],
- "weight" : 0.55
- },
+ "genes" : ["Ly6c2+", "F13a1+", "Chil3+", "Ms4a4c+", "Ccr2+"],
+ "weight" : 1.0,
+ "comment" : "Inflammatory monocyte markers from Kaptein et al. Cell 2022"
+ }
+ ]
+ },
+
+ {
+ "name" : "Patrolling monocyte",
+ "markers" : [
{
- "genes" : ["Cd3d-", "Cd3e-"],
- "weight" : 0.45
+ "genes" : ["Ace+", "Eno3+", "Ear2+", "Treml4+", "Fabp4+"],
+ "weight" : 1.0,
+ "comment" : "Patrolling monocyte markers from Kaptein et al. Cell 2022"
}
]
- }
+ },
+
+ {
+ "name" : "Macrophage",
+ "markers" : [
+ {
+ "genes" : ["Cd14+", "Ms4a7+", "Cx3cr1+", "Trem2+", "Hpgds+"],
+ "weight" : 1.0,
+ "comment" : "Macrophage markers from Kaptein et al. Cell 2022"
+ }
+ ],
+ "subtypes" : {
+ "title" : "Macrophage subtype markers",
+ "cell_types" : [
+ {
+ "name" : "Cd207+ macrophage",
+ "markers" : [
+ {
+ "genes" : ["Cd207+", "Tmem119+", "Olfml3+", "Mmp13+"],
+ "weight" : 1.0,
+ "comment" : "Cd207+ macrophage markers from Kaptein et al. Cell 2022"
+ }
+ ]
+ },
+ {
+ "name" : "Bile-duct lipid-associated macrophage",
+ "markers" : [
+ {
+ "genes" : ["Gpnmb+", "Spp1+", "Syngr1+", "Cd93+"],
+ "weight" : 1.0,
+ "comment" : "Bile-duct LAM markers from Kaptein et al. Cell 2022"
+ }
+ ]
+ }
+ ]
+ }
+ },
+
+ {
+ "name" : "Conventional type 1 dendritic cell",
+ "markers" : [
+ {
+ "genes" : ["Xcr1+", "Gcsam+", "Snx22+", "Rab7b+", "Ifi205+"],
+ "weight" : 1.0,
+ "comment" : "cDC1 markers from Kaptein et al. Cell 2022"
+ }
+ ]
+ },
+
+ {
+ "name" : "Conventional type 2 dendritic cell",
+ "markers" : [
+ {
+ "genes" : ["Cd209a+","Ltb4r1+", "Mgl2+", "Tnip3+", "Bex6+"],
+ "weight" : 1.0,
+ "comment" : "cDC2 markers from Kaptein et al. Cell 2022"
+ }
+ ]
+ },
+
+ {
+ "name" : "Migratory dendritic cell",
+ "markers" : [
+ {
+ "genes" : ["Cacnb3+", "Nudt17+", "Ccl22+", "Apol7c+", "Slco5a1+"],
+ "weight" : 1.0,
+ "comment" : "Migratory DC markers from Kaptein et al. Cell 2022"
+ }
+ ]
+ },
+
+ {
+ "name" : "Plasmacytoid dendritic cell",
+ "markers" : [
+ {
+ "genes" : ["Siglech+", "Ccr9+", "Cox6a2+", "Cd300c+", "Klk1+"],
+ "weight" : 1.0,
+ "comment" : "pDC markers from Kaptein et al. Cell 2022"
+ }
+ ]
+ },
+
+ {
+ "name" : "Neutrophil",
+ "markers" : [
+ {
+ "genes" : ["S100a8+", "S100a9+", "Retnlg+", "Mmp9+", "Clec4d+"],
+ "weight" : 1.0,
+ "comment" : "Neutrophil markers from Kaptein et al. Cell 2022"
+ }
+ ]
+ },
+
+ {
+ "name" : "Basophil",
+ "markers" : [
+ {
+ "genes" : ["Fcer1a+", "Cyp11a+", "Cd200r3+", "Il6+", "Ms4a2+"],
+ "weight" : 1.0,
+ "comment" : "Basophil markers from Kaptein et al. Cell 2022"
+ }
+ ]
+ },
]
}
diff --git a/pegasus/annotate_cluster/mouse_liver_cell_markers.json b/pegasus/annotate_cluster/mouse_liver_cell_markers.json
new file mode 100644
index 00000000..b8d40786
--- /dev/null
+++ b/pegasus/annotate_cluster/mouse_liver_cell_markers.json
@@ -0,0 +1,209 @@
+{
+ "title" : "Mouse liver cell type markers",
+ "comment": "Markers are collected from Kaptein et al. Cell 2022",
+ "cell_types" : [
+ {
+ "name" : "Hepatocyte",
+ "markers" : [
+ {
+ "genes" : ["Acaa1b+", "Arg1+", "Sult2a8+", "Hgd+", "Otc+"],
+ "weight" : 1.0,
+ "comment" : "Hepatocyte markers from Kaptein et al. Cell 2022"
+ }
+ ]
+ },
+
+ {
+ "name" : "Cholangiocyte",
+ "markers" : [
+ {
+ "genes" : ["Spp1+", "Ddit4l+", "Sox9+", "Fgfr3+", "Plet1+"],
+ "weight" : 1.0,
+ "comment" : "Cholangiocyte markers from Kaptein et al. Cell 2022"
+ }
+ ]
+ },
+
+ {
+ "name" : "HsPC",
+ "markers" : [
+ {
+ "genes" : ["Chrm3+", "Dmbt1+", "Slc4a4+", "Parm1+", "Pcdh11x+"],
+ "weight" : 1.0,
+ "comment" : "Hepatic stem and progenitor cell markers from Kaptein et al. Cell 2022"
+ }
+ ]
+ },
+
+
+ {
+ "name" : "ILC1",
+ "markers" : [
+ {
+ "genes" : ["Xcl1+", "Cd160+", "Klrc1+", "Cd200r2+", "Gzmc+"],
+ "weight" : 1.0,
+ "comment" : "Innate lymphoid cell type 1 markers from Kaptein et al. Cell 2022"
+ }
+ ]
+ },
+
+ {
+ "name" : "Kupffer cell",
+ "markers" : [
+ {
+ "genes" : ["Cd5l+", "Clec4f+", "Vig4+", "Folr2+", "Timd4+"],
+ "weight" : 1.0,
+ "comment" : "Kupffer cell markers from Kaptein et al. Cell 2022"
+ }
+ ]
+ },
+
+ {
+ "name" : "Peritoneal macrophage",
+ "markers" : [
+ {
+ "genes" : ["Lyz1+", "Saa3+", "Prg4+", "Retnla+", "Cbr2+"],
+ "weight" : 1.0,
+ "comment" : "Peritoneal macrophage markers from Kaptein et al. Cell 2022; Note that Lyve1 is also a good marker but it is also expressed in endothelial cells"
+ }
+ ]
+ },
+
+
+ {
+ "name" : "Endothelial cell",
+ "markers" : [
+ {
+ "genes" : ["Mmrn2+", "Cldn5+", "Adgrl4+", "Tek+", "Myct1+"],
+ "weight" : 1.0,
+ "comment" : "Endothelial cell markers from Kaptein et al. Cell 2022"
+ }
+ ],
+ "subtypes" : {
+ "title" : "Endothelial cell subtype markers",
+ "cell_types" : [
+ {
+ "name" : "Liver sinusoidal endothelial cell",
+ "markers" : [
+ {
+ "genes" : ["Lyve1+", "Clec1b+", "Chst2+", "Wisp1+"],
+ "weight" : 1.0,
+ "comment" : "LSEC markers from Kaptein et al. Cell 2022"
+ }
+ ]
+ },
+ {
+ "name" : "Central vein endothelial cell",
+ "markers" : [
+ {
+ "genes" : ["Rspo3+", "Lhx6+", "Wnt9b+", "Plppr5+"],
+ "weight" : 1.0,
+ "comment" : "CV EC markers from Kaptein et al. Cell 2022"
+ }
+ ]
+ },
+ {
+ "name" : "Portal Vein endothelial cell",
+ "markers" : [
+ {
+ "genes" : ["Adgrg6+", "Nrg1+", "Gja5+","Cmklr1+"],
+ "weight" : 1.0,
+ "comment" : "PV EC markers from Kaptein et al. Cell 2022"
+ }
+ ]
+ },
+ {
+ "name" : "Lymphatic Endothelial cell",
+ "markers" : [
+ {
+ "genes" : ["Mmrn1+", "Pard6g+", "Nts+", "Ccl21a+"],
+ "weight" : 1.0,
+ "comment" : "LEC markers from Kaptein et al. Cell 2022"
+ }
+ ]
+ }
+ ]
+ }
+ },
+
+
+ {
+ "name" : "Stellate cell",
+ "markers" : [
+ {
+ "genes" : ["Colec10+", "Rspo3+", "Mapt+", "Lama1+", "Bmp10+"],
+ "weight" : 1.0,
+ "comment" : "Stellate cell markers from Kaptein et al. Cell 2022"
+ }
+ ]
+ },
+
+ {
+ "name" : "Vascular smooth muscle cell",
+ "markers" : [
+ {
+ "genes" : ["Cacna1c+", "Myh11+", "Notch3+", "Lmod1+", "Tagln+"],
+ "weight" : 1.0,
+ "comment" : "VSMC markers from Kaptein et al. Cell 2022"
+ }
+ ]
+ },
+
+ {
+ "name" : "Mesothelial cell",
+ "markers" : [
+ {
+ "genes" : ["Ephb1+", "Cadm2+", "Prss12+", "Myl7+", "Prph+"],
+ "weight" : 1.0,
+ "comment" : "Mesothelial cell markers from Kaptein et al. Cell 2022"
+ }
+ ]
+ },
+
+ {
+ "name" : "Fibroblast",
+ "markers" : [
+ {
+ "genes" : ["Col1a1+", "Mrc2+", "Plcxd3+", "Fndc1+", "Cpxm1+"],
+ "weight" : 1.0,
+ "comment" : "Fibroblast markers from Kaptein et al. Cell 2022"
+ }
+ ],
+ "subtypes" : {
+ "title" : "Fibro subtype markers",
+ "cell_types" : [
+ {
+ "name" : "Capsule fibroblast",
+ "markers" : [
+ {
+ "genes" : ["Osr1+", "Cldn10+", "Lgals7+", "Spock3+"],
+ "weight" : 1.0,
+ "comment" : "Capsule fibroblast markers from Kaptein et al. Cell 2022"
+ }
+ ]
+ },
+ {
+ "name" : "Central vein fibroblast",
+ "markers" : [
+ {
+ "genes" : ["Dpt+", "Pcolce2+", "Ntrk2+", "Pi16+"],
+ "weight" : 1.0,
+ "comment" : "Central vein fibroblast markers from Kaptein et al. Cell 2022"
+ }
+ ]
+ },
+ {
+ "name" : "Bile-duct fibroblast",
+ "markers" : [
+ {
+ "genes" : ["Itgbl1+", "Plcxd3+", "Nkain3+", "Clic5+"],
+ "weight" : 1.0,
+ "comment" : "Bile-duct fibroblast markers from Kaptein et al. Cell 2022"
+ }
+ ]
+ }
+ ]
+ }
+ }
+ ]
+}
diff --git a/pegasus/data_files/mouse_liver.gmt b/pegasus/data_files/mouse_liver.gmt
new file mode 100644
index 00000000..5229ce86
--- /dev/null
+++ b/pegasus/data_files/mouse_liver.gmt
@@ -0,0 +1,23 @@
+Endo Endothelial cell Mmrn2 Cldn5 Adgrl4 Tek Myct1
+Stellate Stellate cell Colec10 Rspo3 Mapt Lama1 Bmp10
+VSMC Vascular smooth muscle cell Cacna1c Myh11 Notch3 Lmod1 Tagln
+Meso Mesothelial cell Ephb1 Cadm2 Prss12 Myl7 Prph
+Fibro Fibroblast Col1a1 Mrc2 Plcxd3 Fndc1 Cpxm1
+Hepatocyte Hepatocyte Acaa1b Arg1 Sult2a8 Hgd Otc
+Cholangiocyte Cholangiocyte Spp1 Ddit4l Sox9 Fgfr3 Plet1
+HSPC Hepatic stem and progenitor cell Chrm3 Dmbt1 Slc4a4 Parm1 Pcdh11x
+T T cell Cd3d Cd3e Cd3g Trac Cd28
+B B cell Cd19 Ms4a1 Cd79a Cd79b Ebf1
+NK NK cell Eomes Cma1 Klra4 Klra7 Klra8
+ILC1 Innate lymphoid cell type 1 Xcl1 Cd160 Klrc1 Cd200r2 Gzmc
+cDC1 cDC1 Xcr1 Gcsam Snx22 Rab7b Ifi205
+cDC2 cDC2 Cd209a Ltb4r1 Mgl2 Tnip3 Bex6
+Mig_cDC Migratory cDC Cacnb3 Nudt17 Ccl22 Apol7c Slco5a1
+pDC Plasmacytoid dendritic cell Siglech Ccr9 Cox6a2 Cd300c Klk1
+MonoI Inflammatory monocyte Ly6c2 F13a1 Chil3 Ms4a4c Ccr2
+MonoP Patrolling monocyte Ace Eno3 Ear2 Treml4 Fabp4
+PeriMac Peritoneal macrophage Lyz1 Saa3 Prg4 Retnla Cbr2
+Mac Macrophage Cd14 Ms4a7 Cx3cr1 Trem2 Hpgds
+Kupffer Kupffer cell Cd5l Clec4f Vig4 Folr2 Timd4
+Neutrophil Neutrophil S100a8 S100a9 Retnlg Mmp9 Clec4d
+Basophil Basophil Fcer1a Cyp11a Cd200r3 Il6 Ms4a2
diff --git a/setup.py b/setup.py
index 9cd66105..b353e796 100644
--- a/setup.py
+++ b/setup.py
@@ -62,10 +62,11 @@
package_data={
"pegasus.annotate_cluster": [
"human_immune_cell_markers.json",
- "mouse_immune_cell_markers.json",
- "mouse_brain_cell_markers.json",
"human_brain_cell_markers.json",
"human_lung_cell_markers.json",
+ "mouse_immune_cell_markers.json",
+ "mouse_brain_cell_markers.json",
+ "mouse_liver_cell_markers.json",
],
"pegasus.check_sample_indexes": ["chromium-shared-sample-indexes-plate.json", "Chromium-i7-Multiplex-Kit-N-Set-A-sample-indexes-plate.json"],
"pegasus": ["data_files/*.gmt"],
From 94867ebb783d7cb0acdd54b35107c171bb6dbda8 Mon Sep 17 00:00:00 2001
From: Bo Li
Date: Thu, 29 Dec 2022 08:01:08 -0500
Subject: [PATCH 05/57] Fixed typos in mouse liver and immune cell types
---
pegasus/annotate_cluster/annotate_cluster.py | 4 +++-
pegasus/annotate_cluster/mouse_immune_cell_markers.json | 4 ++--
pegasus/annotate_cluster/mouse_liver_cell_markers.json | 2 +-
pegasus/data_files/mouse_liver.gmt | 4 ++--
pegasus/tools/utils.py | 1 +
5 files changed, 9 insertions(+), 6 deletions(-)
diff --git a/pegasus/annotate_cluster/annotate_cluster.py b/pegasus/annotate_cluster/annotate_cluster.py
index 5ce05357..4b0b6fce 100644
--- a/pegasus/annotate_cluster/annotate_cluster.py
+++ b/pegasus/annotate_cluster/annotate_cluster.py
@@ -278,7 +278,8 @@ def infer_cell_types(
* ``'mouse_immune'`` for mouse immune cells;
* ``'human_brain'`` for human brain cells;
* ``'mouse_brain'`` for mouse brain cells;
- * ``'human_lung'`` for human lung cells.
+ * ``'human_lung'`` for human lung cells;
+ * ``'mouse_liver'`` for mouse liver cells.
* If ``Dict``, it refers to a Python dictionary describing the markers.
de_test: ``str``, optional, default: ``"mwu"``
@@ -320,6 +321,7 @@ def infer_cell_types(
human_brain="human_brain_cell_markers.json",
mouse_brain="mouse_brain_cell_markers.json",
human_lung="human_lung_cell_markers.json",
+ mouse_liver="mouse_liver_cell_markers.json",
)
if isinstance(markers, str):
diff --git a/pegasus/annotate_cluster/mouse_immune_cell_markers.json b/pegasus/annotate_cluster/mouse_immune_cell_markers.json
index 4f462f13..f330b19c 100644
--- a/pegasus/annotate_cluster/mouse_immune_cell_markers.json
+++ b/pegasus/annotate_cluster/mouse_immune_cell_markers.json
@@ -206,11 +206,11 @@
"name" : "Basophil",
"markers" : [
{
- "genes" : ["Fcer1a+", "Cyp11a+", "Cd200r3+", "Il6+", "Ms4a2+"],
+ "genes" : ["Fcer1a+", "Cyp11a1+", "Cd200r3+", "Il6+", "Ms4a2+"],
"weight" : 1.0,
"comment" : "Basophil markers from Kaptein et al. Cell 2022"
}
]
- },
+ }
]
}
diff --git a/pegasus/annotate_cluster/mouse_liver_cell_markers.json b/pegasus/annotate_cluster/mouse_liver_cell_markers.json
index b8d40786..e1350ff5 100644
--- a/pegasus/annotate_cluster/mouse_liver_cell_markers.json
+++ b/pegasus/annotate_cluster/mouse_liver_cell_markers.json
@@ -51,7 +51,7 @@
"name" : "Kupffer cell",
"markers" : [
{
- "genes" : ["Cd5l+", "Clec4f+", "Vig4+", "Folr2+", "Timd4+"],
+ "genes" : ["Cd5l+", "Clec4f+", "Vsig4+", "Folr2+", "Timd4+"],
"weight" : 1.0,
"comment" : "Kupffer cell markers from Kaptein et al. Cell 2022"
}
diff --git a/pegasus/data_files/mouse_liver.gmt b/pegasus/data_files/mouse_liver.gmt
index 5229ce86..a4a5f070 100644
--- a/pegasus/data_files/mouse_liver.gmt
+++ b/pegasus/data_files/mouse_liver.gmt
@@ -18,6 +18,6 @@ MonoI Inflammatory monocyte Ly6c2 F13a1 Chil3 Ms4a4c Ccr2
MonoP Patrolling monocyte Ace Eno3 Ear2 Treml4 Fabp4
PeriMac Peritoneal macrophage Lyz1 Saa3 Prg4 Retnla Cbr2
Mac Macrophage Cd14 Ms4a7 Cx3cr1 Trem2 Hpgds
-Kupffer Kupffer cell Cd5l Clec4f Vig4 Folr2 Timd4
+Kupffer Kupffer cell Cd5l Clec4f Vsig4 Folr2 Timd4
Neutrophil Neutrophil S100a8 S100a9 Retnlg Mmp9 Clec4d
-Basophil Basophil Fcer1a Cyp11a Cd200r3 Il6 Ms4a2
+Basophil Basophil Fcer1a Cyp11a1 Cd200r3 Il6 Ms4a2
diff --git a/pegasus/tools/utils.py b/pegasus/tools/utils.py
index 2b80d819..2410d547 100644
--- a/pegasus/tools/utils.py
+++ b/pegasus/tools/utils.py
@@ -189,6 +189,7 @@ def check_batch_key(data: Union[MultimodalData, UnimodalData], batch: str, warni
apoptosis_mouse=pkg_resources.resource_filename("pegasus", "data_files/apoptosis_mouse.gmt"),
human_lung=pkg_resources.resource_filename("pegasus", "data_files/human_lung.gmt"),
mouse_brain=pkg_resources.resource_filename("pegasus", "data_files/mouse_brain.gmt"),
+ mouse_liver=pkg_resources.resource_filename("pegasus", "data_files/mouse_liver.gmt"),
)
predefined_pathways = dict(
From 9a15fcbbdb455b16069243c9e5a200dec6275e67 Mon Sep 17 00:00:00 2001
From: Bo Li
Date: Sun, 15 Jan 2023 13:06:19 -0800
Subject: [PATCH 06/57] Fixed switched_axes for dotplot
---
pegasus/plotting/plot_library.py | 6 +++---
1 file changed, 3 insertions(+), 3 deletions(-)
diff --git a/pegasus/plotting/plot_library.py b/pegasus/plotting/plot_library.py
index f4a95549..f9874be7 100644
--- a/pegasus/plotting/plot_library.py
+++ b/pegasus/plotting/plot_library.py
@@ -1359,9 +1359,9 @@ def non_zero(g):
yticks = summarized_df.index.map(str).values
if switch_axes:
- x, y = y, x
- xlabel, ylabel = ylabel, xlabel
- xticks, yticks = yticks, xticks
+ x, y = y[::-1], x[::-1]
+ xlabel, ylabel = ylabel[::-1], xlabel[::-1]
+ xticks, yticks = yticks[::-1], xticks[::-1]
dotplot_df = pd.DataFrame(data=dict(x=x, y=y, value=summary_values, pixels=pixels, fraction=fraction,
xlabel=np.array(xlabel)[x], ylabel=np.array(ylabel)[y]))
From 766d4f93d50ba69c7dcfdc18b39fb313b3e36aa6 Mon Sep 17 00:00:00 2001
From: Bo Li
Date: Tue, 17 Jan 2023 22:07:48 -0800
Subject: [PATCH 07/57] Fixed a typo in the docstring of plot_heatmap
---
pegasus/plotting/plot_library.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/pegasus/plotting/plot_library.py b/pegasus/plotting/plot_library.py
index f9874be7..824cefaa 100644
--- a/pegasus/plotting/plot_library.py
+++ b/pegasus/plotting/plot_library.py
@@ -1076,7 +1076,7 @@ def heatmap(
Examples
--------
- >>> pg.heatmap(data, genes=['CD14', 'TRAC', 'CD34'], groupby='louvain_labels')
+ >>> pg.heatmap(data, attrs=['CD14', 'TRAC', 'CD34'], groupby='louvain_labels')
"""
if not isinstance(data, anndata.AnnData):
From ce70e86d6dbe0ba897b94230bd72de7cc27b1ebe Mon Sep 17 00:00:00 2001
From: Bo Li
Date: Thu, 19 Jan 2023 23:44:49 -0800
Subject: [PATCH 08/57] Simplified sig score calculation and added emt score
---
pegasus/data_files/emt_human.gmt | 2 ++
pegasus/tools/signature_score.py | 40 ++++++++++++++++++--------------
pegasus/tools/utils.py | 1 +
3 files changed, 26 insertions(+), 17 deletions(-)
create mode 100644 pegasus/data_files/emt_human.gmt
diff --git a/pegasus/data_files/emt_human.gmt b/pegasus/data_files/emt_human.gmt
new file mode 100644
index 00000000..dfec37a7
--- /dev/null
+++ b/pegasus/data_files/emt_human.gmt
@@ -0,0 +1,2 @@
+Epithelial-like Signatures from Gibbons and Creighton Dev. Dyn. 2018 CDH1 DSP OCLN
+Mesenchymal-like Signatures from Gibbons and Creighton Dev. Dyn. 2018 VIM CDH2 FOXC2 SNAI1 SNAI2 TWIST1 FN1 ITGB6 MMP2 MMP3 MMP9 SOX10 GCS
diff --git a/pegasus/tools/signature_score.py b/pegasus/tools/signature_score.py
index 965e4129..b41071c0 100644
--- a/pegasus/tools/signature_score.py
+++ b/pegasus/tools/signature_score.py
@@ -100,6 +100,7 @@ def calc_signature_score(
signatures: Union[Dict[str, List[str]], str],
n_bins: int = 50,
show_omitted_genes: bool = False,
+ skip_threshold: int = 1,
random_state: int = 0
) -> None:
"""Calculate signature / gene module score. [Li20-1]_
@@ -124,12 +125,21 @@ def calc_signature_score(
* ``apoptosis_human`` contains one signature, ``apoptosis``, which includes apoptosis-related genes from the KEGG pathway.
* ``cell_cycle_mouse``, ``gender_mouse``, ``mitochondrial_genes_mouse``, ``ribosomal_genes_mouse`` and ``apoptosis_mouse`` are the corresponding signatures for mouse. Gene symbols are directly translated from human genes.
+ In addition, Pegasus provides the following 4 curated signature panels:
+ * ``emt_human``, the Epithelial-Mesenchymal Transition signature from Gibbons and Creighton Dev. Dyn. 2018.
+ * ``human_lung``, human lung cell type markers.
+ * ``mouse_brain``, mouse brain cell type markers.
+ * ``mouse_liver``, mouse liver cell type markers.
+
n_bins: ``int``, optional, default: 50
Number of bins on expression levels for grouping genes.
show_omitted_genes: ``bool``, optional, default False
Signature genes that are not expressed in the data will be omitted. By default, pegasus does not report which genes are omitted. If this option is turned on, report omitted genes.
+ skip_threshold: ``int``, optional, default 1
+ Skip signature calculation if the number of kept genes is less than skip_threshold.
+
random_state: ``int``, optional, default: 0
Random state used by KMeans if signature == ``gender_human`` or ``gender_mouse``.
@@ -170,16 +180,22 @@ def calc_signature_score(
sig_string = signatures
if sig_string in predefined_signatures:
signatures = load_signatures_from_file(predefined_signatures[sig_string])
- from threadpoolctl import threadpool_limits
+
+ if sig_string.startswith("mitochondrial_genes"):
+ del signatures["mito_noncoding"]
+ elif sig_string.startswith("ribosomal_genes"):
+ del signatures["ribo_like"]
+
+ _calc_sig_scores(data, signatures, show_omitted_genes=show_omitted_genes, skip_threshold=skip_threshold)
if sig_string.startswith("cell_cycle"):
- _calc_sig_scores(data, signatures, show_omitted_genes = show_omitted_genes)
data.obs["cycle_diff"] = data.obs["G2/M"] - data.obs["G1/S"]
values = data.obs[["G1/S", "G2/M"]].values
maxvalues = values.max(axis = 1)
data.obs["cycling"] = maxvalues
+ from threadpoolctl import threadpool_limits
kmeans = KMeans(n_clusters=2, random_state=random_state)
with threadpool_limits(limits = 1):
kmeans.fit(maxvalues.reshape(-1, 1))
@@ -191,9 +207,9 @@ def calc_signature_score(
data.obs["predicted_phase"] = pd.Categorical.from_codes(codes, categories = ["G0", "G1/S", "G2/M"])
elif sig_string.startswith("gender"):
- _calc_sig_scores(data, signatures, show_omitted_genes = show_omitted_genes, skip_threshold = 1)
data.obs["gender_score"] = data.obs["male_score"] - data.obs["female_score"]
+ from threadpoolctl import threadpool_limits
kmeans = KMeans(n_clusters=3, random_state=random_state)
with threadpool_limits(limits = 1):
kmeans.fit(data.obs["gender_score"].values.reshape(-1, 1))
@@ -201,20 +217,10 @@ def calc_signature_score(
codes = list(map(lambda x: reorg_dict[x], kmeans.labels_))
data.obs["predicted_gender"] = pd.Categorical.from_codes(codes, categories = ["female", "uncertain", "male"])
- elif sig_string.startswith("mitochondrial_genes"):
- del signatures["mito_noncoding"]
- _calc_sig_scores(data, signatures, show_omitted_genes = show_omitted_genes, skip_threshold = 1)
- elif sig_string.startswith("ribosomal_genes"):
- del signatures["ribo_like"]
- _calc_sig_scores(data, signatures, show_omitted_genes = show_omitted_genes, skip_threshold = 1)
- elif sig_string.startswith("apoptosis"):
- _calc_sig_scores(data, signatures, show_omitted_genes = show_omitted_genes, skip_threshold = 1)
- else:
- assert sig_string in predefined_signatures
- signatures = load_signatures_from_file(predefined_signatures[sig_string])
- _calc_sig_scores(data, signatures, show_omitted_genes = show_omitted_genes)
+ elif sig_string == "emt_human":
+ data.obs["EMT_score"] = data.obs["Mesenchymal-like"] - data.obs["Epithelial-like"]
else:
signatures = load_signatures_from_file(sig_string)
- _calc_sig_scores(data, signatures, show_omitted_genes = show_omitted_genes)
+ _calc_sig_scores(data, signatures, show_omitted_genes=show_omitted_genes, skip_threshold=skip_threshold)
else:
- _calc_sig_scores(data, signatures, show_omitted_genes = show_omitted_genes)
+ _calc_sig_scores(data, signatures, show_omitted_genes=show_omitted_genes, skip_threshold=skip_threshold)
diff --git a/pegasus/tools/utils.py b/pegasus/tools/utils.py
index 2410d547..09aa0f69 100644
--- a/pegasus/tools/utils.py
+++ b/pegasus/tools/utils.py
@@ -190,6 +190,7 @@ def check_batch_key(data: Union[MultimodalData, UnimodalData], batch: str, warni
human_lung=pkg_resources.resource_filename("pegasus", "data_files/human_lung.gmt"),
mouse_brain=pkg_resources.resource_filename("pegasus", "data_files/mouse_brain.gmt"),
mouse_liver=pkg_resources.resource_filename("pegasus", "data_files/mouse_liver.gmt"),
+ emt_human=pkg_resources.resource_filename("pegasus", "data_files/emt_human.gmt"),
)
predefined_pathways = dict(
From 4793f788f22c1680d571c475e43abba3ea89ef7f Mon Sep 17 00:00:00 2001
From: Yiming Yang
Date: Fri, 10 Feb 2023 13:43:04 -0800
Subject: [PATCH 09/57] Add stripsize to violin function
---
pegasus/plotting/plot_library.py | 7 ++++---
1 file changed, 4 insertions(+), 3 deletions(-)
diff --git a/pegasus/plotting/plot_library.py b/pegasus/plotting/plot_library.py
index 824cefaa..f5f236e8 100644
--- a/pegasus/plotting/plot_library.py
+++ b/pegasus/plotting/plot_library.py
@@ -45,7 +45,7 @@ def scatter(
fix_corners: Optional[bool] = True,
alpha: Optional[Union[float, List[float]]] = 1.0,
legend_loc: Optional[Union[str, List[str]]] = "right margin",
- legend_fontsize: Optional[Union[int, List[int]]] = 10,
+ legend_fontsize: Optional[Union[int, List[int]]] = 10,
legend_ncol: Optional[str] = None,
palettes: Optional[Union[str, List[str]]] = None,
cmaps: Optional[Union[str, List[str]]] = "YlOrRd",
@@ -214,7 +214,7 @@ def scatter(
if global_marker_size == None:
global_marker_size = _get_marker_size(x.size) if marker_size is None else marker_size
-
+
x_label = f"{basis_}{comp_key[0]}"
y_label = f"{basis_}{comp_key[1]}"
@@ -864,6 +864,7 @@ def violin(
hue: Optional[str] = None,
matkey: Optional[str] = None,
stripplot: Optional[bool] = False,
+ stripsize: int = 1,
inner: Optional[str] = None,
scale: Optional[str] = 'width',
panel_size: Optional[Tuple[float, float]] = (8, 0.5),
@@ -973,7 +974,7 @@ def violin(
for i in range(nrows):
ax = axes[i, 0]
if stripplot:
- sns.stripplot(x="label", y=attrs[i], hue = hue, data=df, ax=ax, size=1, color="k", jitter=True)
+ sns.stripplot(x="label", y=attrs[i], hue = hue, data=df, ax=ax, size=stripsize, color="k", jitter=True)
sns.violinplot(x="label", y=attrs[i], hue = hue, data=df, inner=inner, linewidth=1, ax=ax, cut=0, scale=scale, split=True, palette=palette, **kwargs)
ax.grid(False)
From 89c10a8066b4d75a08d53b0ad2f4f51d6859e348 Mon Sep 17 00:00:00 2001
From: Bo Li
Date: Mon, 6 Mar 2023 10:33:31 -0800
Subject: [PATCH 10/57] Added more marker files and cluster_specific_marker
function
---
pegasus/__init__.py | 1 +
.../human_lung_cell_markers.json | 2 +-
.../mouse_immune_cell_markers.json | 113 +++----
.../mouse_liver_cell_markers.json | 41 ++-
.../mouse_lung_cell_markers.json | 291 ++++++++++++++++++
pegasus/data_files/human_lung.gmt | 2 +-
pegasus/data_files/mouse_liver.gmt | 19 +-
pegasus/data_files/mouse_lung.gmt | 38 +++
pegasus/tools/__init__.py | 2 +-
pegasus/tools/clustering.py | 38 ++-
pegasus/tools/diff_expr.py | 59 +++-
11 files changed, 531 insertions(+), 75 deletions(-)
create mode 100644 pegasus/annotate_cluster/mouse_lung_cell_markers.json
create mode 100644 pegasus/data_files/mouse_lung.gmt
diff --git a/pegasus/__init__.py b/pegasus/__init__.py
index ae574a32..3e0d62bc 100644
--- a/pegasus/__init__.py
+++ b/pegasus/__init__.py
@@ -65,6 +65,7 @@
de_analysis,
markers,
write_results_to_excel,
+ cluster_specific_markers,
find_markers,
infer_path,
calc_signature_score,
diff --git a/pegasus/annotate_cluster/human_lung_cell_markers.json b/pegasus/annotate_cluster/human_lung_cell_markers.json
index 4e18b1ad..c787456e 100644
--- a/pegasus/annotate_cluster/human_lung_cell_markers.json
+++ b/pegasus/annotate_cluster/human_lung_cell_markers.json
@@ -84,7 +84,7 @@
{
"genes" : ["CALCA+", "CHGA+", "ASCL1+", "SLC35D3+", "KIF1A+"],
"weight" : 1.0,
- "comment" : "Plumonary neuroendocrien cell markers from Travaglini et al."
+ "comment" : "Pulmonary neuroendocrine cell markers from Travaglini et al."
}
]
},
diff --git a/pegasus/annotate_cluster/mouse_immune_cell_markers.json b/pegasus/annotate_cluster/mouse_immune_cell_markers.json
index f330b19c..240615d3 100644
--- a/pegasus/annotate_cluster/mouse_immune_cell_markers.json
+++ b/pegasus/annotate_cluster/mouse_immune_cell_markers.json
@@ -71,9 +71,9 @@
"name" : "B cell",
"markers" : [
{
- "genes" : ["Cd19+", "Ms4a1+", "Cd79a+", "Cd79b+", "Ebf1+"],
+ "genes" : ["Cd19+", "Ms4a1+", "Cd79a+", "Cd79b+", "Ebf1+", "Pax5+", "Fcmr+", "Bank1+"],
"weight" : 1.0,
- "comment" : "B cell markers from Kaptein et al. Cell 2022"
+ "comment" : "B cell markers from Kaptein et al. Cell 2022 and Hurskainen et al. Nat. Commun. 2021"
}
]
},
@@ -81,10 +81,15 @@
{
"name" : "Natural killer cell",
"markers" : [
+ {
+ "genes" : ["Gzma+", "Klrb1c+", "Ncr1+", "Klre1+", "Klrc2+"],
+ "weight" : 0.6,
+ "comment" : "NK & ILC1 shared markers from Hurskainen et al. Nat. Commun. 2021 data and confirmed on Kaptein et al. Cell 2022 data"
+ },
{
"genes" : ["Eomes+", "Cma1+", "Klra4+", "Klra7+", "Klra8+"],
- "weight" : 1.0,
- "comment" : "NK cell markers from Kaptein et al. Cell 2022"
+ "weight" : 0.4,
+ "comment" : "NK cell specific markers (compared to ILC1) from Kaptein et al. Cell 2022; these markers do not have high expressions in Hurskainen et al. Nat. Commun. 2021 data"
}
]
},
@@ -93,9 +98,9 @@
"name" : "Inflammatory monocyte",
"markers" : [
{
- "genes" : ["Ly6c2+", "F13a1+", "Chil3+", "Ms4a4c+", "Ccr2+"],
+ "genes" : ["Ly6c2+", "F13a1+", "Ms4a4c+", "Ccr2+", "Gm9733+", "Mcub+"],
"weight" : 1.0,
- "comment" : "Inflammatory monocyte markers from Kaptein et al. Cell 2022"
+ "comment" : "Inflammatory monocyte markers inferred from Kaptein et al. Cell 2022 and Hurskainen et al. Nat. Commun. 2021"
}
]
},
@@ -104,56 +109,25 @@
"name" : "Patrolling monocyte",
"markers" : [
{
- "genes" : ["Ace+", "Eno3+", "Ear2+", "Treml4+", "Fabp4+"],
+ "genes" : ["Ace+", "Eno3+", "Ear2+", "Treml4+", "Spn+", "Fcgr4+", "Lair1+", "Cd300e+", "Cd300ld+", "Adgre4+"],
"weight" : 1.0,
- "comment" : "Patrolling monocyte markers from Kaptein et al. Cell 2022"
+ "comment" : "Patrolling monocyte markers inferred from Kaptein et al. Cell 2022 and Hurskainen et al. Nat. Commun. 2021; Related papers: Domingo-Gonzalez et al. Elife 2020, Thomas et al. Arterioscler Thromb Vasc Biol. 2015, and Schyns et al. Nat. Commun. 2019."
}
]
},
- {
- "name" : "Macrophage",
- "markers" : [
- {
- "genes" : ["Cd14+", "Ms4a7+", "Cx3cr1+", "Trem2+", "Hpgds+"],
- "weight" : 1.0,
- "comment" : "Machrophage markers from Kaptein et al. Cell 2022"
- }
- ],
- "subtypes" : {
- "title" : "Macrophage subtype markers",
- "cell_types" : [
- {
- "name" : "Cd207+ macrophage",
- "markers" : [
- {
- "genes" : ["Cd207+", "Tmem119+", "Olfml3+", "Mmp13+"],
- "weight" : 1.0,
- "comments" : "Cd207+ macrophage markers from Kaptein et al. Cell 2022"
- }
- ]
- },
- {
- "name" : "Bile-duct lipid-associated macrophage",
- "markers" : [
- {
- "genes" : ["Gpnmb+", "Spp1+", "Syngr1+", "Cd93+"],
- "weight" : 1.0,
- "comments" : "Bile-duct LAM markers from Kaptein et al. Cell 2022"
- }
- ]
- }
- ]
- }
- },
-
{
"name" : "Conventional type 1 dendritic cell",
"markers" : [
{
- "genes" : ["Xcr1+", "Gcsam+", "Snx22+", "Rab7b+", "Ifi205+"],
- "weight" : 1.0,
- "comment" : "cDC1 markers from Kaptein et al. Cell 2022"
+ "genes" : ["Xcr1+", "Ifi205+", "Rab7b+", "Tlr3+", "Sept3+", "Hepacam2+"],
+ "weight" : 0.7,
+ "comment" : "cDC1 markers shared between Kaptein et al. Cell 2022 and Hurskainen et al. Nat. Commun. 2021"
+ },
+ {
+ "genes" : ["Gcsam+", "Snx22+", "Itgae+", "Xlr+"],
+ "weight" : 0.3,
+ "comment" : "cDC1 markers expressed highly in one of Kaptein et al. Cell 2022 and Hurskainen et al. Nat. Commun. 2021, but not both"
}
]
},
@@ -173,9 +147,9 @@
"name" : "Migratory dendritic cell",
"markers" : [
{
- "genes" : ["Cacnb3+", "Nudt17+", "Ccl22+", "Apol7c+", "Slco5a1+"],
+ "genes" : ["Cacnb3+", "Nudt17+", "Ccl22+", "Apol7c+", "Slco5a1+", "Ccr7+", "Fscn1+", "Il4i1+", "Mreg+", "Bcl2l14+"],
"weight" : 1.0,
- "comment" : "Migratory DC markers from Kaptein et al. Cell 2022"
+ "comment" : "Migratory DC markers shared between Kaptein et al. Cell 2022 and Hurskainen et al. Nat. Commun. 2021"
}
]
},
@@ -195,9 +169,20 @@
"name" : "Neutrophil",
"markers" : [
{
- "genes" : ["S100a8+", "S100a9+", "Retnlg+", "Mmp9+", "Clec4d+"],
+ "genes" : ["S100a8+", "S100a9+", "Retnlg+", "Mmp9+", "Csf3r+", "Wfdc21+", "Il1r2+", "Cxcr2+"],
+ "weight" : 1.0,
+ "comment" : "Neutrophil markers inferred from Kaptein et al. Cell 2022 and Hurskainen et al. Nat. Commun. 2021; related paper: Grieshaber-Bouyer et al. Nat. Commun. 2021"
+ }
+ ]
+ },
+
+ {
+ "name" : "Immature neutrophil",
+ "markers" : [
+ {
+ "genes" : ["Ngp+", "Camp+", "Ltf+", "Ly6g+", "Cebpe+"],
"weight" : 1.0,
- "comment" : "Neutrophil markers from Kaptein et al. Cell 2022"
+ "comment" : "Immature Neutrophil markers inferred from Hurskainen et al. Nat. Commun. 2021 and checked using Evrard et al. Immunity 2018 Fig. 5"
}
]
},
@@ -206,9 +191,31 @@
"name" : "Basophil",
"markers" : [
{
- "genes" : ["Fcer1a+", "Cyp11a1+", "Cd200r3+", "Il6+", "Ms4a2+"],
+ "genes" : ["Cd200r3+", "Aqp9+", "Il6+", "Hgf+", "Adora2b+", "Il4+", "L1cam+", "Grm6+"],
+ "weight" : 1.0,
+ "comment" : "Basophil markers inferred from Matsumara et al. Nat. Commun. 2022 and confirmed using data from Kaptein et al. Cell 2022 and Hurskainen et al. Nat. Commun. 2021"
+ }
+ ]
+ },
+
+ {
+ "name" : "Eosinophil",
+ "markers" : [
+ {
+ "genes" : ["Epx+", "Prg3+", "Eml5+", "Il5ra+", "Qsox2+", "L2hgdh+"],
+ "weight" : 1.0,
+ "comment" : "Eosinophil markers inferred from Matsumara et al. Nat. Commun. 2022"
+ }
+ ]
+ },
+
+ {
+ "name" : "Mast cell",
+ "markers" : [
+ {
+ "genes" : ["Tph1+", "Clnk+", "Hs6st2+", "Plcg1+"],
"weight" : 1.0,
- "comment" : "Basophil markers from Kaptein et al. Cell 2022"
+ "comment" : "Mast cell markers inferred from Matsumara et al. Nat. Commun. 2022"
}
]
}
diff --git a/pegasus/annotate_cluster/mouse_liver_cell_markers.json b/pegasus/annotate_cluster/mouse_liver_cell_markers.json
index e1350ff5..f40427b3 100644
--- a/pegasus/annotate_cluster/mouse_liver_cell_markers.json
+++ b/pegasus/annotate_cluster/mouse_liver_cell_markers.json
@@ -3,12 +3,12 @@
"comment": "Markers are collected from Kaptein et al. Cell 2022",
"cell_types" : [
{
- "name" : "Hepatocye",
+ "name" : "Hepatocyte",
"markers" : [
{
"genes" : ["Acaa1b+", "Arg1+", "Sult2a8+", "Hgd+", "Otc+"],
"weight" : 1.0,
- "comment" : "Hepatocye markers from Kaptein et al. Cell 2022"
+ "comment" : "Hepatocyte markers from Kaptein et al. Cell 2022"
}
]
},
@@ -47,6 +47,7 @@
]
},
+
{
"name" : "Kupffer cell",
"markers" : [
@@ -69,6 +70,42 @@
]
},
+ {
+ "name" : "Macrophage",
+ "markers" : [
+ {
+ "genes" : ["Cd14+", "Ms4a7+", "Cx3cr1+", "Trem2+", "Hpgds+"],
+ "weight" : 1.0,
+ "comment" : "Macrophage markers from Kaptein et al. Cell 2022"
+ }
+ ],
+ "subtypes" : {
+ "title" : "Macrophage subtype markers",
+ "cell_types" : [
+ {
+ "name" : "Cd207+ macrophage",
+ "markers" : [
+ {
+ "genes" : ["Cd207+", "Tmem119+", "Olfml3+", "Mmp13+"],
+ "weight" : 1.0,
+ "comments" : "Cd207+ macrophage markers from Kaptein et al. Cell 2022"
+ }
+ ]
+ },
+ {
+ "name" : "Bile-duct lipid-associated macrophage",
+ "markers" : [
+ {
+ "genes" : ["Gpnmb+", "Spp1+", "Syngr1+", "Cd93+"],
+ "weight" : 1.0,
+ "comments" : "Bile-duct LAM markers from Kaptein et al. Cell 2022"
+ }
+ ]
+ }
+ ]
+ }
+ },
+
{
"name" : "Endothelial cell",
diff --git a/pegasus/annotate_cluster/mouse_lung_cell_markers.json b/pegasus/annotate_cluster/mouse_lung_cell_markers.json
new file mode 100644
index 00000000..2c81e13f
--- /dev/null
+++ b/pegasus/annotate_cluster/mouse_lung_cell_markers.json
@@ -0,0 +1,291 @@
+{
+ "title" : "Mouse lung cell type markers",
+ "cell_types" : [
+ {
+ "name" : "Alveolar type I cell",
+ "markers" : [
+ {
+ "genes" : ["Akap5+", "Rtkn2+", "Ndnf+", "Col4a3+", "Spock2+"],
+ "weight" : 1.0,
+ "comment" : "AT1 markers from Hurskainen et al. Nat. Commun. 2021 and Negretti et al. Development 2021"
+ }
+ ]
+ },
+
+ {
+ "name" : "Alveolar type II cell",
+ "markers" : [
+ {
+ "genes" : ["Sftpc+", "Sftpa1+", "Lamp3+", "Hc+", "Slc34a2+"],
+ "weight" : 1.0,
+ "comment" : "AT2 markers from Hurskainen et al. Nat. Commun. 2021 and Negretti et al. Development 2021"
+ }
+ ]
+ },
+
+ {
+ "name" : "Ciliated cell",
+ "markers" : [
+ {
+ "genes" : ["Dynlrb2+", "Tmem212+", "Foxj1+", "Ccdc153+", "Nme5+"],
+ "weight" : 1.0,
+ "comment" : "Ciliated cell markers from Hurskainen et al. Nat. Commun. 2021 and Negretti et al. Development 2021"
+ }
+ ]
+ },
+
+ {
+ "name" : "Club cell",
+ "markers" : [
+ {
+ "genes" : ["Scgb1a1+", "Scgb3a2+", "Cckar+", "Gabrp+", "Slc16a11+"],
+ "weight" : 1.0,
+ "comment" : "Club cell markers from Hurskainen et al. Nat. Commun. 2021 and Negretti et al. Development 2021"
+ }
+ ]
+ },
+
+ {
+ "name" : "Basal cell",
+ "markers" : [
+ {
+ "genes" : ["Aqp3+", "Krt5+", "Dapl1+", "Hspa1a+", "Trp63+"],
+ "weight" : 1.0,
+ "comment" : "Basal cell markers from Montoro et al. Nature 2018 Extended Data Fig. 1d"
+ }
+ ]
+ },
+
+ {
+ "name" : "Goblet cell",
+ "markers" : [
+ {
+ "genes" : ["Scgb3a1+", "Muc5b+", "Serpinb11+", "Gp2+", "Dmbt1+"],
+ "weight" : 1.0,
+ "comment" : "Goblet cell markers from Montoro et al. Nature 2018 Supp Table 1"
+ }
+ ]
+ },
+
+ {
+ "name" : "Tuft cell",
+ "markers" : [
+ {
+ "genes" : ["Pou2f3+", "Ascl2+", "Dclk1+", "Lrmp+", "Ltc4s+", "Trpm5+", "Gnb3+", "Rgs13+"],
+ "weight" : 1.0,
+ "comment" : "Tuft cell markers from Sun et al. Dev. Cell 2022 and Montoro et al. Nature 2018 Extended Data Fig. 3b; first 3 markers are mainly suggested by Sun et al. the CellCards."
+ }
+ ]
+ },
+
+ {
+ "name" : "Pulmonary neuroendocrine cell",
+ "markers" : [
+ {
+ "genes" : ["Ascl1+", "Chga+", "Calca+", "Scg2+", "Scg5+"],
+ "weight" : 1.0,
+ "comment" : "Pulmonary neuroendocrine cell markers from Montoro et al. Nature 2018 Extended Data Fig. 3b & 3c"
+ }
+ ]
+ },
+
+ {
+ "name" : "Ionocyte",
+ "markers" : [
+ {
+ "genes" : ["Foxi1+", "Ascl3+", "Smbd1+", "Moxd1+", "Atp6v0d2+"],
+ "weight" : 1.0,
+ "comment" : "Ionocyte markers from Montoro et al. Nature 2018 Fig. 5a"
+ }
+ ]
+ },
+
+
+
+ {
+ "name" : "Endothelial cell",
+ "markers" : [
+ {
+ "genes" : ["Egfl7+", "Cldn5+", "Cdh5+", "Pecam1+", "Calcrl+", "Ecscr+", "Icam2+"],
+ "weight" : 1.0,
+ "comment" : "Endothelial cell markers inferred from Hurskainen et al. Nat. Commun. 2021 data"
+ }
+ ],
+ "subtypes" : {
+ "title" : "Endothelial cell subtype markers (Main and Capillary, see https://lungmap.net/cell-cards/)",
+ "cell_types" : [
+ {
+ "name" : "Aerocyte",
+ "markers" : [
+ {
+ "genes" : ["Emp2+", "Car4+", "Tbx2+", "Apln+"],
+ "weight" : 1.0,
+ "comment" : "Markers inferred from Hurskainen et al. Nat. Commun. 2021 data and confirmed on Negretti et al. Development 2021 data"
+ }
+ ]
+ },
+ {
+ "name" : "EC general capillary",
+ "markers" : [
+ {
+ "genes" : ["Gpihbp1+", "Kit+", "Nckap5+", "Aplnr+"],
+ "weight" : 1.0,
+ "comment" : "Markers inferred from Hurskainen et al. Nat. Commun. 2021 data and confirmed on Negretti et al. Development 2021 data"
+ }
+ ]
+ },
+ {
+ "name" : "EC lymphatic",
+ "markers" : [
+ {
+ "genes" : ["Mmrn1+", "Ccl21a+", "Prox1+", "Nts+"],
+ "weight" : 1.0,
+ "comment" : "Markers inferred from Hurskainen et al. Nat. Commun. 2021 data and confirmed on Negretti et al. Development 2021 data"
+ }
+ ]
+ },
+ {
+ "name" : "EC venous",
+ "markers" : [
+ {
+ "genes" : ["Slc6a2+", "Vegfc+", "Ackr3+", "Fabp4+"],
+ "weight" : 1.0,
+ "comments" : "Markers inferred from Hurskainen et al. Nat. Commun. 2021 data and confirmed on Negretti et al. Development 2021 data"
+ }
+ ]
+ },
+ {
+ "name" : "EC arterial",
+ "markers" : [
+ {
+ "genes" : ["Gja5+", "Cxcl12+", "Pcsk5+", "Thsd7a+"],
+ "weight" : 1.0,
+ "comments" : "Markers inferred from Hurskainen et al. Nat. Commun. 2021 data and confirmed on Negretti et al. Development 2021 data"
+ }
+ ]
+ }
+ ]
+ }
+ },
+
+
+
+ {
+ "name" : "Mesothelial cell",
+ "markers" : [
+ {
+ "genes" : ["Wt1+", "Upk3b+", "Rspo1+", "C2+", "Sbsn+", "Aldh1a2+", "Lrrn4+", "Cldn15+"],
+ "weight" : 1.0,
+ "comment" : "Markers inferred from Hurskainen et al. Nat. Commun. 2021 data and confirmed on Negretti et al. Development 2021 data"
+ }
+ ]
+ },
+
+ {
+ "name" : "Pericyte",
+ "markers" : [
+ {
+ "genes" : ["Notch3+", "Heyl+", "Parm1+", "Ndufa4l2+", "Cox4i2+"],
+ "weight" : 1.0,
+ "comment" : "Markers inferred from Hurskainen et al. Nat. Commun. 2021 data and confirmed on Negretti et al. Development 2021 data"
+ }
+ ],
+ "subtypes" : {
+ "title" : "Pericyte subtype markers",
+ "cell_types" : [
+ {
+ "name" : "Pericyte 1",
+ "markers" : [
+ {
+ "genes" : ["Gpc6+", "Cxcl12+", "Wisp2+", "Map3k7cl+"],
+ "weight" : 1.0,
+ "comment" : "Markers inferred from Hurskainen et al. Nat. Commun. 2021 data and confirmed on Negretti et al. Development 2021 data"
+ }
+ ]
+ },
+ {
+ "name" : "Pericyte 2",
+ "markers" : [
+ {
+ "genes" : ["Higd1b+", "Pcdh18+", "Trpc6+", "Fam162b+", "Clstn2+"],
+ "weight" : 1.0,
+ "comment" : "Markers inferred from Hurskainen et al. Nat. Commun. 2021 data and confirmed on Negretti et al. Development 2021 data"
+ }
+ ]
+ }
+ ]
+ }
+ },
+
+ {
+ "name" : "Fibroblast",
+ "markers" : [
+ {
+ "genes" : ["Dpt+", "Clec3b+", "Pcolce2+", "Vegfd+", "Vcam1+"],
+ "weight" : 1.0,
+ "comment" : "Markers inferred from Hurskainen et al. Nat. Commun. 2021 data and confirmed on Negretti et al. Development 2021 data"
+ }
+ ],
+ "subtypes" : {
+ "title" : "Fibro subtype markers",
+ "cell_types" : [
+ {
+ "name" : "Adventitial fibroblast",
+ "markers" : [
+ {
+ "genes" : ["Mfap5+", "Serpinf1+", "Abca8a+", "Twist2+"],
+ "weight" : 1.0,
+ "comment" : "Markers from Schupp et al. and Travaglini et al."
+ }
+ ]
+ },
+ {
+ "name" : "Alveolar fibroblast",
+ "markers" : [
+ {
+ "genes" : ["Slit2+", "Col13a1+", "Wnt2+", "Slc38a5+", "Slc27a6+", "Frem1+"],
+ "weight" : 1.0,
+ "comment" : "Markers inferred from Hurskainen et al. Nat. Commun. 2021 data and confirmed on Negretti et al. Development 2021 data"
+ }
+ ]
+ }
+ ]
+ }
+ },
+
+ {
+ "name" : "Myofibroblast",
+ "markers" : [
+ {
+ "genes" : ["Egfem1+", "Agt+", "Prag1+", "Etv1+", "Trim67+"],
+ "weight" : 1.0,
+ "comment" : "Markers from Schupp et al. and Travaglini et al."
+ }
+ ]
+ },
+
+ {
+ "name" : "Smooth muscle cell",
+ "markers" : [
+ {
+ "genes" : ["Tnnt2+", "Sgcg+", "Sntg2+", "Nrtn+", "Mrvi1+", "Sbspon+"],
+ "weight" : 1.0,
+ "comment" : "Markers inferred from Hurskainen et al. Nat. Commun. 2021 data"
+ }
+ ]
+ },
+
+
+
+ {
+ "name" : "ILC2",
+ "markers" : [
+ {
+ "genes" : ["Gata3+", "Il1rl1+", "Arg1+", "Areg+", "Il2ra+", "Csf2+", "Ccl1+", "Ccdc184+", "Calca+", "Il5+"],
+ "weight" : 1.0,
+ "comment" : "Innate lymphoid cell type 2 markers inferred from Hurskainen et al. Nat. Commun. 2021 data"
+ }
+ ]
+ }
+ ]
+}
diff --git a/pegasus/data_files/human_lung.gmt b/pegasus/data_files/human_lung.gmt
index 726c485b..871c9dec 100644
--- a/pegasus/data_files/human_lung.gmt
+++ b/pegasus/data_files/human_lung.gmt
@@ -16,4 +16,4 @@ Ciliated Ciliated cell markers from Schupp et al., Travaglini et al. and Tony et
Club Club cell markers from Schupp et al., Travaglini et al. and Tony et al. SCGB3A2 MGP VIM CST3
Goblet Goblet cell markers from Schupp et al., Travaglini et al. and Tony et al. MUC5AC MUC5B BPIFB1 MSMB FAM3D SERPINB11 CXCL6 SCGB1A1 FAM3D SERPINB3
Ionocyte Ionocyte markers from Travaglini et al. FOXI1 ASCL3 CLDN25 ATP6V1G3 LINC01187
-PNEC Plumonary neuroendocrien cell markers from Travaglini et al. CALCA CHGA ASCL1 SLC35D3 KIF1A
+PNEC Pulmonary neuroendocrine cell markers from Travaglini et al. CALCA CHGA ASCL1 SLC35D3 KIF1A
diff --git a/pegasus/data_files/mouse_liver.gmt b/pegasus/data_files/mouse_liver.gmt
index a4a5f070..ad62209e 100644
--- a/pegasus/data_files/mouse_liver.gmt
+++ b/pegasus/data_files/mouse_liver.gmt
@@ -7,17 +7,22 @@ Hepatocyte Hepatocye Acaa1b Arg1 Sult2a8 Hgd Otc
Cholangiocyte Cholangiocyte Spp1 Ddit4l Sox9 Fgfr3 Plet1
HSPC Hepatic stem and progenitor cell Chrm3 Dmbt1 Slc4a4 Parm1 Pcdh11x
T T cell Cd3d Cd3e Cd3g Trac Cd28
-B B cell Cd19 Ms4a1 Cd79a Cd79b Ebf1
+B B cell Cd19 Ms4a1 Cd79a Cd79b Ebf1 Pax5 Fcmr Bank1
NK NK cell Eomes Cma1 Klra4 Klra7 Klra8
ILC1 Innate lymphoid cell type 1 Xcl1 Cd160 Klrc1 Cd200r2 Gzmc
-cDC1 cDC1 Xcr1 Gcsam Snx22 Rab7b Ifi205
+cDC1 cDC1 Xcr1 Ifi205 Rab7b Tlr3 Sept3 Hepacam2 Gcsam Snx22 Itgae Xlr
cDC2 cDC2 Cd209a Ltb4r1 Mgl2 Tnip3 Bex6
-Mig_cDC Migoritory cDC Cacnb3 Nudt17 Ccl22 Apol7c Slco5a1
+migDC Migratory DC Cacnb3 Nudt17 Ccl22 Apol7c Slco5a1 Ccr7 Fscn1 Il4i1 Mreg Bcl2l14
pDC Plasmacytoid dendritic cell Siglech Ccr9 Cox6a2 Cd300c Klk1
-MonoI Inflammatory monocyte Ly6c2 F13a1 Chil3 Ms4a4c Ccr2
-MonoP Patrolling monocyte Ace Eno3 Ear2 Treml4 Fabp4
+MonoI Inflammatory monocyte Ly6c2 F13a1 Ms4a4c Ccr2 Gm9733 Mcub
+MonoP Patrolling monocyte Ace Eno3 Ear2 Treml4 Spn Fcgr4 Lair1 Cd300e Cd300ld Adgre4
PeriMac Peritoneal macrophage Lyz1 Saa3 Prg4 Retnla Cbr2
Mac Macrophage Cd14 Ms4a7 Cx3cr1 Trem2 Hpgds
Kupffer Kupffer cell Cd5l Clec4f Vsig4 Folr2 Timd4
-Neutrophil Neutrophil S100a8 S100a9 Retnlg Mmp9 Clec4d
-Basophil Basophil Fcer1a Cyp11a1 Cd200r3 Il6 Ms4a2
+Neutrophil Neutrophil S100a8 S100a9 Retnlg Mmp9 Csf3r Wfdc21 Il1r2 Cxcr2
+Basophil Basophil Cd200r3 Aqp9 Il6 Hgf Adora2b Il4 L1cam Grm6
+Eosinophil Eosinophil Epx Prg3 Eml5 Il5ra Qsox2 L2hgdh
+Mast Mast cell Tph1 Clnk Hs6st2 Plcg1
+Pericentral Pericentral liver zonation markers from Halpern et al. Nature 2017 and Guilliams et al. Cell 2022 Mup11 Oat Rgn Glul Cyp2e1 Axin2 Cyp1a2 Gstm3 Psmd4
+Periportal Periportal liver zonation markers from Halpern et al. Nature 2017 and Guilliams et al. Cell 2022 Cyp2f2 Hal Sds Ass1 Asl Alb Arg1 Pck1 C2 Sdhd
+Midlobular Mid-lobular liver zonation markers picked from Fig. 3 and Extended Data Fig 10a of Halpern et al. Nature 2017 Hamp Igfbp2 Cyp8b1 Mup3 Hamp2 Hsbp8 Ces1d Cebpa Fkbp8 Clpp
diff --git a/pegasus/data_files/mouse_lung.gmt b/pegasus/data_files/mouse_lung.gmt
new file mode 100644
index 00000000..f3ab9d0f
--- /dev/null
+++ b/pegasus/data_files/mouse_lung.gmt
@@ -0,0 +1,38 @@
+AT1 AT1 markers from Hurskainen et al. Nat. Commun. 2021 and Negretti et al. Development 2021 Akap5 Rtkn2 Ndnf Col4a3 Spock2
+AT2 AT2 markers from Hurskainen et al. Nat. Commun. 2021 and Negretti et al. Development 2021 Sftpc Sftpa1 Lamp3 Hc Slc34a2
+Ciliated Ciliated cell markers from Hurskainen et al. Nat. Commun. 2021 and Negretti et al. Development 2021 Dynlrb2 Tmem212 Foxj1 Ccdc153 Nme5
+Club Club cell markers from Hurskainen et al. Nat. Commun. 2021 and Negretti et al. Development 2021 Scgb1a1 Scgb3a2 Cckar Gabrp Slc16a11
+Basal Basal cell markers from Montoro et al. Nature 2018 Extended Data Fig. 1d Aqp3 Krt5 Dapl1 Hspa1a Trp63
+Goblet Goblet cell markers from Montoro et al. Nature 2018 Supp Table 1 Scgb3a1 Muc5b Serpinb11 Gp2 Dmbt1
+Tuft Tuft cell markers from Sun et al. Dev. Cell 2022 and Montoro et al. Nature 2018 Extended Data Fig. 3b Pou2f3 Ascl2 Dclk1 Lrmp Ltc4s Trpm5 Gnb3 Rgs13
+PNEC Plumonary neuroendocrine cell markers from Montoro et al. Nature 2018 Extended Data Fig. 3b & 3c Ascl1 Chga Calca Scg2 Scg5
+Ionocyte Ionocyte markers from Montoro et al. Nature 2018 Fig. 5a Foxi1 Ascl3 Smbd1 Moxd1 Atp6v0d2
+Endothelial Endothelial cell markers inferred from Hurskainen et al. Nat. Commun. 2021 data Egfl7 Cldn5 Cdh5 Pecam1 Calcrl Ecscr Icam2
+Mesothelial Mesothelial cell markers inferred from Hurskainen et al. Nat. Commun. 2021 Wt1 Upk3b Rspo1 C2 Sbsn Aldh1a2 Lrrn4 Cldn15
+Pericyte Pericyte markers inferred from Hurskainen et al. Nat. Commun. 2021 data Notch3 Heyl Parm1 Ndufa4l2 Cox4i2
+Fibroblast Fibroblast markers inferred from Hurskainen et al. Nat. Commun. 2021 data Dpt Clec3b Pcolce2 Vegfd Vcam1
+Myofibroblast Myofibroblast markers inferred from Hurskainen et al. Nat. Commun. 2021 data Egfem1 Agt Prag1 Etv1 Trim67
+SMC Smooth muscle cell markers inferred from Hurskainen et al. Nat. Commun. 2021 data Tnnt2 Sgcg Sntg2 Nrtn Mrvi1 Sbspon
+ILC2 Innate lymphoid cell type 2 markers inferred from Hurskainen et al. Nat. Commun. 2021 data Gata3 Il1rl1 Arg1 Areg Il2ra Csf2 Ccl1 Ccdc184 Calca Il5
+B B cell markers inferred from Kaptein et al. Cell 2022 and Hurskainen et al. Nat. Commun. 2021 data Cd19 Ms4a1 Cd79a Cd79b Ebf1 Pax5 Fcmr Bank1
+NK NK cell markers inferred from Kaptein et al. Cell 2022 and Hurskainen et al. Nat. Commun. 2021 data Gzma Klrb1c Ncr1 Klre1 Klrc2 Eomes Cma1 Klra4 Klra7 Klra8
+cDC1 cDC1 markers inferred from Kaptein et al. Cell 2022 and Hurskainen et al. Nat. Commun. 2021 data Xcr1 Ifi205 Rab7b Tlr3 Sept3 Hepacam2 Gcsam Snx22 Itgae Xlr
+cDC2 cDC2 markers from Kaptein et al. Cell 2022 Cd209a Ltb4r1 Mgl2 Tnip3 Bex6
+migDC Migoritory DC markers inferred from Kaptein et al. Cell 2022 and Hurskainen et al. Nat. Commun. 2021 data Cacnb3 Nudt17 Ccl22 Apol7c Slco5a1 Ccr7 Fscn1 Il4i1 Mreg Bcl2l14
+pDC Plasmacytoid dendritic cell markers from Kaptein et al. Cell 2022 Siglech Ccr9 Cox6a2 Cd300c Klk1
+MonoI Inflammatory monocyte markers inferred from Kaptein et al. Cell 2022 and Hurskainen et al. Nat. Commun. 2021 data Ly6c2 F13a1 Ms4a4c Ccr2 Gm9733 Mcub
+MonoP Patrolling monocyte markers inferred from Kaptein et al. Cell 2022 and Hurskainen et al. Nat. Commun. 2021 data Ace Eno3 Ear2 Treml4 Spn Fcgr4 Lair1 Cd300e Cd300ld Adgre4
+Neutrophil Neutrophil markers inferred from Kaptein et al. Cell 2022 and Hurskainen et al. Nat. Commun. 2021 data S100a8 S100a9 Retnlg Mmp9 Csf3r Wfdc21 Il1r2 Cxcr2
+Basophil Basophil markers inferred from Matsumara et al. Nat. Commun. 2022 data Cd200r3 Aqp9 Il6 Hgf Adora2b Il4 L1cam Grm6
+Eosinophil Eosinophil markers inferred from Matsumara et al. Nat. Commun. 2022 data Epx Prg3 Eml5 Il5ra Qsox2 L2hgdh
+Mast Mast cell markers inferred from Matsumara et al. Nat. Commun. 2022 data Tph1 Clnk Hs6st2 Plcg1
+
+
+
+Macrophage Macro CD68 CD163 C1QA MRC1 MS4A6A MSR1 MERTK
+T cell T cell markers CD3D CD3E CD3G TRAC
+
+Plasma cell Plasma cell markers CD38 XBP1 CD27 SLAMF7 TNFRSF17 TNFRSF13B
+
+
+Neutrophil Neutrophil markers FUT4 MPO CEACAM8 ELANE CXCR1 CXCR2 LY6G6D
diff --git a/pegasus/tools/__init__.py b/pegasus/tools/__init__.py
index ac0f149e..3b5eebd3 100644
--- a/pegasus/tools/__init__.py
+++ b/pegasus/tools/__init__.py
@@ -56,7 +56,7 @@
net_umap,
net_fle,
)
-from .diff_expr import de_analysis, markers, write_results_to_excel, run_de_analysis
+from .diff_expr import de_analysis, markers, write_results_to_excel, cluster_specific_markers, run_de_analysis
from .gradient_boosting import find_markers, run_find_markers
from .subcluster_utils import clone_subset
from .signature_score import calc_signature_score, calculate_z_score
diff --git a/pegasus/tools/clustering.py b/pegasus/tools/clustering.py
index 89d41895..aa16291b 100644
--- a/pegasus/tools/clustering.py
+++ b/pegasus/tools/clustering.py
@@ -1,6 +1,7 @@
import time
import numpy as np
import pandas as pd
+from pandas.api.types import is_categorical_dtype
from pegasusio import MultimodalData
from natsort import natsorted
@@ -647,7 +648,7 @@ def split_one_cluster(
random_state: int = 0,
) -> None:
"""
- Use Leiden algorithm to split 'clust_id' in 'clust_label' into 'n_components' clusters and write the new clusting results to 'res_label'. Assume 'clust_label' named clusters as numbers (in str format).
+ Use Leiden algorithm to split 'clust_id' in 'clust_label' into 'n_clust' sub-clusters and write the new clustering results to 'res_label'. The sub-cluster names are the concatenation of the original cluster name and the sub-cluster id (e.g. 'T' -> 'T-1', 'T-2').
Parameters
----------
@@ -664,7 +665,7 @@ def split_one_cluster(
Split 'clust_id' into `n_clust' subclusters.
res_label: `str`,
- Write new clustering in data.obs['res_label']. The largest subcluster will use 'clust_id' as its cluster ID, while other subclusters will be numbered after existing clusters.
+ Write new clustering in data.obs['res_label']. The sub-cluster names are the concatenation of original cluster name and the subcluster id (e.g. 'T' -> 'T-1', 'T-2').
rep: ``str``, optional, default: ``"pca"``
The embedding representation used for Kmeans clustering. Keyword ``'X_' + rep`` must exist in ``data.obsm``. By default, use PCA coordinates.
@@ -689,16 +690,35 @@ def split_one_cluster(
--------
>>> pg.split_one_cluster(data, 'leiden_labels', '15', 2, 'leiden_labels_split')
"""
- idx = np.where(data.obs[clust_label] == clust_id)[0]
+ cats = None
+ if is_categorical_dtype(data.obs[clust_label]):
+ cats = data.obs[clust_label].cat.categories.values
+ else:
+ cats = pd.Categorical(data.obs[clust_label]).categories.values
+ if cats.dtype.kind not in {'S', 'U'}:
+ cats = cats.astype(str)
+ idx_cat = np.nonzero(cats==clust_id)[0]
+
+ if idx_cat.size == 0:
+ raise ValueError(f"{clust_id} is not in {clust_label}!")
+ elif idx_cat.size > 1:
+ raise ValueError(f"Detected more than one categories in {clust_label} with name {clust_id}!")
+ else:
+ idx_cat = idx_cat[0]
+
+ idx = np.nonzero((data.obs[clust_label] == clust_id).values)[0]
tmpdat = data[idx].copy()
from pegasus.tools import neighbors
neighbors(tmpdat, rep=rep, n_comps=n_comps, use_cache=False)
leiden(tmpdat, rep=rep, resolution=None, n_clust=n_clust, random_state=random_state)
- new_clust = data.obs[clust_label].values.astype(int)
- new_label = new_clust.max() + 1
- for label in tmpdat.obs['leiden_labels'].value_counts().index[1:]:
- new_clust[idx[(tmpdat.obs['leiden_labels'] == label).values]] = new_label
- new_label += 1
- data.obs[res_label] = pd.Categorical(values = new_clust.astype(str), categories = np.array(range(1, new_label)).astype(str))
+
+ new_clust = data.obs[clust_label].values.astype(str)
+ cats_sub = []
+ for i, label in enumerate(tmpdat.obs['leiden_labels'].value_counts().index):
+ sub_id = f"{clust_id}-{i+1}"
+ new_clust[idx[(tmpdat.obs['leiden_labels'] == label).values]] = sub_id
+ cats_sub.append(sub_id)
+
+ data.obs[res_label] = pd.Categorical(values = new_clust, categories = np.concatenate((cats[0:idx_cat], np.array(cats_sub), cats[idx_cat+1:])))
data.register_attr(res_label, "cluster")
del tmpdat
diff --git a/pegasus/tools/diff_expr.py b/pegasus/tools/diff_expr.py
index 45461628..53366c93 100644
--- a/pegasus/tools/diff_expr.py
+++ b/pegasus/tools/diff_expr.py
@@ -419,7 +419,7 @@ def de_analysis(
n_jobs: ``int``, optional, default: ``-1``
Number of threads to use. If ``-1``, use all available threads.
- t: ``bool``, optional, default: ``True``
+ t: ``bool``, optional, default: ``False``
If ``True``, calculate Welch's t test.
fisher: ``bool``, optional, default: ``False``
@@ -756,6 +756,63 @@ def add_worksheet(
logger.info("Excel spreadsheet is written.")
+def cluster_specific_markers(
+ markers: Dict[str, Dict[str, pd.DataFrame]],
+ clust_id: str,
+ min_auroc: float = 0.7,
+ expected_pfc: float = 10.0,
+ n_lo: int = 25,
+ n_up: int = 50,
+) -> pd.DataFrame:
+ """ Extract cluster-specific markers from DE results ``markers``.
+
+ This function extracts cluster-specific markers (e.g. with auroc >= min_auroc and high in percentage fold change). The extracted markers can be screened for signatures representing the cluster.
+
+ The selection procedure is as follows: First, pick genes with AUROC >= min_auroc and pfc (percentage fold change) >= expected_pfc. If the number is between [n_lo, n_up], return the subset of markers containing only these genes. Otherwise, if the number < n_lo, extend the gene set to include up to n_lo genes in descending order of their pfc. If the number > n_up, truncate the set by keeping only n_up genes with highest pfc.
+
+ Parameters
+ ----------
+ markers: ``Dict[str, Dict[str, pd.DataFrame]]``
+ Markers from `de_analysis`.
+
+ clust_id: ``str``
+ Cluster ID to tell which cluster to focus on.
+
+ min_auroc: ``float``, optional, default: ``0.7``
+ Minimum AUROC for a gene.
+
+ expected_pfc: ``float``, optional, default: ``10.0``
+ Expected percentage fold change for a gene.
+
+ n_lo: ``int``, optional, default: ``25``
+ Lower bound (inclusive) on the number of genes to return.
+
+ n_up: ``int``, optional, default: ``50``
+ Upper bound (inclusive) on the number of genes to return.
+
+ Returns
+ -------
+ results: ``pd.DataFrame``
+ A pandas DataFrame containing the selected markers, ranked in descending order with respect to AUROC.
+
+ Examples
+ --------
+ >>> candidates = pg.cluster_specific_markers(markers, 'Mono')
+ """
+ df = markers[clust_id]['up']
+ idx_auc = df['auroc'] >= min_auroc
+ idx_epf = df['percentage_fold_change'] >= expected_pfc
+ idx = idx_auc & idx_epf
+ n = idx.sum()
+ if n >= n_lo and n <= n_up:
+ return df[idx]
+ elif n < n_lo:
+ res = df[idx_auc].sort_values('percentage_fold_change', ascending=False)
+ return res.iloc[0:n_lo]
+ else:
+ return df[idx].iloc[0:n_up]
+
+
@timer(logger=logger)
def run_de_analysis(
input_file: str,
From 8d2b3fb14b54b8e79b3a90f8b90efccc8f780edd Mon Sep 17 00:00:00 2001
From: Bo Li
Date: Tue, 7 Mar 2023 18:27:24 -0800
Subject: [PATCH 11/57] Updated markers
---
.../mouse_immune_cell_markers.json | 15 +++++++++++--
.../mouse_lung_cell_markers.json | 22 +++++++++++++++++++
pegasus/data_files/mouse_liver.gmt | 2 +-
pegasus/data_files/mouse_lung.gmt | 13 +++--------
pegasus/tools/diff_expr.py | 7 +++---
5 files changed, 42 insertions(+), 17 deletions(-)
diff --git a/pegasus/annotate_cluster/mouse_immune_cell_markers.json b/pegasus/annotate_cluster/mouse_immune_cell_markers.json
index 240615d3..bb3ac649 100644
--- a/pegasus/annotate_cluster/mouse_immune_cell_markers.json
+++ b/pegasus/annotate_cluster/mouse_immune_cell_markers.json
@@ -5,9 +5,9 @@
"name" : "T cell",
"markers" : [
{
- "genes" : ["Cd3d+", "Cd3e+", "Cd3g+", "Trac+", "Cd28+"],
+ "genes" : ["Cd3d+", "Cd3e+", "Lat+", "Thy1+", "Lef1+", "Trac+", "Cd28+"],
"weight" : 1.0,
- "comment" : "T cell markers from Kaptein et al. Cell 2022"
+ "comment" : "T cell markers inferred from Kaptein et al. Cell 2022 and Hurskainen et al. Nat. Commun. 2021"
}
],
"subtypes" : {
@@ -67,6 +67,17 @@
]
},
+ {
+ "name" : "Immature B cell",
+ "markers" : [
+ {
+ "genes" : ["Tifa+", "Cecr2+", "Rag1+", "Atp1b1+", "Myb+", "Irf4+", "Fam129c+"],
+ "weight" : 1.0,
+ "comment" : "Immature B cell markers from Hurskainen et al. Nat. Commun. 2021"
+ }
+ ]
+ },
+
{
"name" : "B cell",
"markers" : [
diff --git a/pegasus/annotate_cluster/mouse_lung_cell_markers.json b/pegasus/annotate_cluster/mouse_lung_cell_markers.json
index 2c81e13f..cb091482 100644
--- a/pegasus/annotate_cluster/mouse_lung_cell_markers.json
+++ b/pegasus/annotate_cluster/mouse_lung_cell_markers.json
@@ -287,5 +287,27 @@
}
]
},
+
+ {
+ "name" : "Alveolar macrophage",
+ "markers" : [
+ {
+ "genes" : ["Atp6v0d2+", "Olr1+", "F7+", "Ear1+", "Tfec+", "Gpnmb+", "Lrp12+", "Marco+"],
+ "weight" : 1.0,
+ "comment" : "Alveolar macrophage markers inferred from Hurskainen et al. Nat. Commun. 2021 data"
+ }
+ ]
+ },
+
+ {
+ "name" : "Interstitial macrophage",
+ "markers" : [
+ {
+ "genes" : ["C1qa+", "C1qb+", "C1qc+", "Pf4+", "Ms4a7+", "Fcrls+"],
+ "weight" : 1.0,
+ "comment" : "Interstitial macrophage markers inferred from Hurskainen et al. Nat. Commun. 2021 data"
+ }
+ ]
+ }
]
}
diff --git a/pegasus/data_files/mouse_liver.gmt b/pegasus/data_files/mouse_liver.gmt
index ad62209e..d9c8bb4b 100644
--- a/pegasus/data_files/mouse_liver.gmt
+++ b/pegasus/data_files/mouse_liver.gmt
@@ -6,7 +6,7 @@ Fibro Fibroblast Col1a1 Mrc2 Plcxd3 Fndc1 Cpxm1
Hepatocyte Hepatocye Acaa1b Arg1 Sult2a8 Hgd Otc
Cholangiocyte Cholangiocyte Spp1 Ddit4l Sox9 Fgfr3 Plet1
HSPC Hepatic stem and progenitor cell Chrm3 Dmbt1 Slc4a4 Parm1 Pcdh11x
-T T cell Cd3d Cd3e Cd3g Trac Cd28
+T T cell Cd3d Cd3e Lat Thy1 Lef1 Trac Cd28
B B cell Cd19 Ms4a1 Cd79a Cd79b Ebf1 Pax5 Fcmr Bank1
NK NK cell Eomes Cma1 Klra4 Klra7 Klra8
ILC1 Innate lymphoid cell type 1 Xcl1 Cd160 Klrc1 Cd200r2 Gzmc
diff --git a/pegasus/data_files/mouse_lung.gmt b/pegasus/data_files/mouse_lung.gmt
index f3ab9d0f..0ed0bc5b 100644
--- a/pegasus/data_files/mouse_lung.gmt
+++ b/pegasus/data_files/mouse_lung.gmt
@@ -13,7 +13,10 @@ Pericyte Pericyte markers inferred from Hurskainen et al. Nat. Commun. 2021 data
Fibroblast Fibroblast markers inferred from Hurskainen et al. Nat. Commun. 2021 data Dpt Clec3b Pcolce2 Vegfd Vcam1
Myofibroblast Myofibroblast markers inferred from Hurskainen et al. Nat. Commun. 2021 data Egfem1 Agt Prag1 Etv1 Trim67
SMC Smooth muscle cell markers inferred from Hurskainen et al. Nat. Commun. 2021 data Tnnt2 Sgcg Sntg2 Nrtn Mrvi1 Sbspon
+AlvMf Alveolar macrophage markers inferred from Hurskainen et al. Nat. Commun. 2021 Atp6v0d2 Olr1 F7 Ear1 Tfec Gpnmb Lrp12 Marco
+IntMf Interstitial macrophage markers inferred from Hurskainen et al. Nat. Commun. 2021 C1qa C1qb C1qc Pf4 Ms4a7 Fcrls
ILC2 Innate lymphoid cell type 2 markers inferred from Hurskainen et al. Nat. Commun. 2021 data Gata3 Il1rl1 Arg1 Areg Il2ra Csf2 Ccl1 Ccdc184 Calca Il5
+T T cell markers inferred from Kaptein et al. Cell 2022 and Hurskainen et al. Nat. Commun. 2021 data Cd3d Cd3e Lat Thy1 Lef1 Trac Cd28
B B cell markers inferred from Kaptein et al. Cell 2022 and Hurskainen et al. Nat. Commun. 2021 data Cd19 Ms4a1 Cd79a Cd79b Ebf1 Pax5 Fcmr Bank1
NK NK cell markers inferred from Kaptein et al. Cell 2022 and Hurskainen et al. Nat. Commun. 2021 data Gzma Klrb1c Ncr1 Klre1 Klrc2 Eomes Cma1 Klra4 Klra7 Klra8
cDC1 cDC1 markers inferred from Kaptein et al. Cell 2022 and Hurskainen et al. Nat. Commun. 2021 data Xcr1 Ifi205 Rab7b Tlr3 Sept3 Hepacam2 Gcsam Snx22 Itgae Xlr
@@ -26,13 +29,3 @@ Neutrophil Neutrophil markers inferred from Kaptein et al. Cell 2022 and Hurskai
Basophil Basophil markers inferred from Matsumara et al. Nat. Commun. 2022 data Cd200r3 Aqp9 Il6 Hgf Adora2b Il4 L1cam Grm6
Eosinophil Eosinophil markers inferred from Matsumara et al. Nat. Commun. 2022 data Epx Prg3 Eml5 Il5ra Qsox2 L2hgdh
Mast Mast cell markers inferred from Matsumara et al. Nat. Commun. 2022 data Tph1 Clnk Hs6st2 Plcg1
-
-
-
-Macrophage Macro CD68 CD163 C1QA MRC1 MS4A6A MSR1 MERTK
-T cell T cell markers CD3D CD3E CD3G TRAC
-
-Plasma cell Plasma cell markers CD38 XBP1 CD27 SLAMF7 TNFRSF17 TNFRSF13B
-
-
-Neutrophil Neutrophil markers FUT4 MPO CEACAM8 ELANE CXCR1 CXCR2 LY6G6D
diff --git a/pegasus/tools/diff_expr.py b/pegasus/tools/diff_expr.py
index 53366c93..f83c01e1 100644
--- a/pegasus/tools/diff_expr.py
+++ b/pegasus/tools/diff_expr.py
@@ -806,11 +806,10 @@ def cluster_specific_markers(
n = idx.sum()
if n >= n_lo and n <= n_up:
return df[idx]
- elif n < n_lo:
- res = df[idx_auc].sort_values('percentage_fold_change', ascending=False)
- return res.iloc[0:n_lo]
else:
- return df[idx].iloc[0:n_up]
+ res = df[idx_auc].sort_values('percentage_fold_change', ascending=False)
+ res = res.iloc[0:(n_lo if n < n_lo else n_up)].sort_values('auroc', ascending=False)
+ return res
@timer(logger=logger)
From e312855c42c0489ffa4d679cb6c5f6aa6ac0f138 Mon Sep 17 00:00:00 2001
From: Yiming Yang
Date: Fri, 7 Apr 2023 17:02:56 -0700
Subject: [PATCH 12/57] fix divide by zero issue in quantile normalization step
of integrative_nmf
---
pegasus/tools/nmf.py | 7 ++++++-
1 file changed, 6 insertions(+), 1 deletion(-)
diff --git a/pegasus/tools/nmf.py b/pegasus/tools/nmf.py
index a37399d2..4cc6f270 100644
--- a/pegasus/tools/nmf.py
+++ b/pegasus/tools/nmf.py
@@ -406,7 +406,12 @@ def integrative_nmf(
seeds = rg.integers(4294967295, size=nbatch)
ref_batch = max_size = -1
for i in range(nbatch):
- H_new = np.ascontiguousarray(Hs[i] / np.linalg.norm(Hs[i], axis=0), dtype=np.float32) # Scale H
+ h_norm = np.linalg.norm(Hs[i], axis=0)
+ idx_h_zeros = np.where(h_norm==0)[0]
+ if idx_h_zeros.size > 0:
+ # Set norm 0 to 1 to avoid divide by zero issue
+ h_norm[idx_h_zeros] = 1.0
+ H_new = np.ascontiguousarray(Hs[i] / h_norm, dtype=np.float32) # Scale H
Hs_new.append(H_new) # Append scaled H
if not quantile_norm:
From 8f29e1a9c154b0c08990a1fa17cc626a4f8373c2 Mon Sep 17 00:00:00 2001
From: Bo Li
Date: Mon, 10 Apr 2023 00:02:08 -0700
Subject: [PATCH 13/57] Updated human lung and immune markers
---
pegasus/annotate_cluster/annotate_cluster.py | 2 +
.../human_immune_cell_markers.json | 4 +-
.../human_lung_cell_markers.json | 135 +++++++++---------
pegasus/tools/utils.py | 1 +
4 files changed, 71 insertions(+), 71 deletions(-)
diff --git a/pegasus/annotate_cluster/annotate_cluster.py b/pegasus/annotate_cluster/annotate_cluster.py
index 4b0b6fce..22caea43 100644
--- a/pegasus/annotate_cluster/annotate_cluster.py
+++ b/pegasus/annotate_cluster/annotate_cluster.py
@@ -279,6 +279,7 @@ def infer_cell_types(
* ``'human_brain'`` for human brain cells;
* ``'mouse_brain'`` for mouse brain cells;
* ``'human_lung'`` for human lung cells;
+ * ``'mouse_lung'`` for mouse lung cells;
* ``'mouse_liver'`` for mouse liver cells.
* If ``Dict``, it refers to a Python dictionary describing the markers.
@@ -321,6 +322,7 @@ def infer_cell_types(
human_brain="human_brain_cell_markers.json",
mouse_brain="mouse_brain_cell_markers.json",
human_lung="human_lung_cell_markers.json",
+ mouse_lung="mouse_lung_cell_markers.json",
mouse_liver="mouse_liver_cell_markers.json",
)
diff --git a/pegasus/annotate_cluster/human_immune_cell_markers.json b/pegasus/annotate_cluster/human_immune_cell_markers.json
index cc153e55..94e21464 100644
--- a/pegasus/annotate_cluster/human_immune_cell_markers.json
+++ b/pegasus/annotate_cluster/human_immune_cell_markers.json
@@ -305,9 +305,9 @@
"name" : "Plasmacytoid dendritic cell",
"markers" : [
{
- "genes" : ["JCHAIN+", "LILRA4+", "GZMB+", "MZB1+", "IL3RA+", "SERPINF1+", "ITM2C+", "IRF7+"],
+ "genes" : ["LILRA4+", "SERPINF1+", "IL3RA+", "TPM2+", "SCT+", "UGCG+", "CLEC4C+", "LRRC26+", "SMPD3+", "AC119428.2+"],
"weight" : 1.0,
- "comment" : "important pDC markers"
+ "comment" : "Markers derived from Immune Cell Atlas PBMC, BM and CB data"
}
]
},
diff --git a/pegasus/annotate_cluster/human_lung_cell_markers.json b/pegasus/annotate_cluster/human_lung_cell_markers.json
index c787456e..c2e9f6c4 100644
--- a/pegasus/annotate_cluster/human_lung_cell_markers.json
+++ b/pegasus/annotate_cluster/human_lung_cell_markers.json
@@ -102,8 +102,6 @@
-
-
{
"name" : "Vascular endothelial cell",
"markers" : [
@@ -122,22 +120,22 @@
"title" : "Vascular endothelial cell subtype markers",
"cell_types" : [
{
- "name" : "Aerocyte",
+ "name" : "EC artery",
"markers" : [
{
- "genes" : ["EDNRB+", "TBX2+", "EDA+", "HPGD+", "PRKG1+", "RCSD1+", "CYP3A5+", "VWF-"],
+ "genes" : ["CXCL12+", "GJA5+", "DKK2+", "HEY1+", "IGFBP3+", "SERPINE2+", "EFNB2+", "BMX+"],
"weight" : 1.0,
- "comment" : "Markers inferred from Xing et al. Science Advances 2021, Tony et al. Nature 2021 and Schupp et al. Circulation 2021"
+ "comment" : "Markers inferred from Travaglini et al. Nature 2020 and confirmed by Schupp et al. Circulation 2021"
}
]
},
{
- "name" : "EC general capillary",
+ "name" : "EC vein",
"markers" : [
{
- "genes" : ["VWF+", "EDN1+", "FCN3+", "CD36+", "GPIHBP1+", "NRXN3+", "BTNL8+"],
+ "genes" : ["CPE+", "C7+", "IL1R1+", "PLA1A+", "PTGIS+", "ABI3BP+", "CYP1B1+", "ADGRG6+"],
"weight" : 1.0,
- "comment" : "Markers inferred from Xing et al. Science Advances 2021, Tony et al. Nature 2021 and Schupp et al. Circulation 2021"
+ "comments" : "Markers inferred from Travaglini et al. Nature 2020 and confirmed by Schupp et al. Circulation 2021"
}
]
},
@@ -145,52 +143,45 @@
"name" : "EC bronchial vessel",
"markers" : [
{
- "genes" : ["SPRY1+", "PLVAP+", "VWA1+", "MPZL2+", "ESM1+"],
+ "genes" : ["SPRY1+", "PLVAP+", "VWA1+", "ABCB1+", "COL15A1+", "RUNDC3B+"],
+ "weight" : 1.0,
+ "comment" : "Markers inferred from Travaglini et al. Nature 2020 and confirmed by Schupp et al. Circulation 2021"
+ }
+ ]
+ },
+ {
+ "name" : "Aerocyte",
+ "markers" : [
+ {
+ "genes" : ["HPGD+", "EDNRB+", "SOSTDC1+", "B3GALNT1+", "CYP3A5+", "TBX2+", "S100A3+", "IL1RL1+", "PRKG1+", "EXPH5+"],
"weight" : 1.0,
- "comment" : "Markers from Travaglini et al. Nature 2020"
+ "comment" : "Markers inferred from Travaglini et al. Nature 2020 and confirmed by Schupp et al. Circulation 2021"
}
]
},
{
- "name" : "EC pulmonary-venous",
+ "name" : "EC general capillary",
"markers" : [
{
- "genes" : ["COL15A1+", "ZNF385D+", "EBF1+", "CPXM2+", "PLVAP+", "VWA1+", "SPRY1+"],
+ "genes" : ["FCN3+", "IL7R+", "EDN1+", "GPIHBP1+", "SLC6A4+", "NTRK2+", "IL18R1+", "NRXN3+"],
"weight" : 1.0,
- "comments" : "Markers inferred from Xing et al. Science Advances 2021, Tony et al. Nature 2021 and Schupp et al. Circulation 2021"
+ "comment" : "Markers inferred from Travaglini et al. Nature 2020 and confirmed by Schupp et al. Circulation 2021"
}
]
},
{
- "name" : "EC systemic-venous",
+ "name" : "EC lymphatic",
"markers" : [
{
- "genes" : ["COL15A1-", "CPE+", "DKK3+", "EFEMP1+", "CDH11+", "PLAT+"],
+ "genes" : ["CCL21+", "TFF3+", "PDPN+", "PROX1+", "GPM6A+", "SEMA3D+", "TBX1+", "RELN+"],
"weight" : 1.0,
- "comments" : "Markers inferred from Xing et al. Science Advances 2021, Tony et al. Nature 2021 and Schupp et al. Circulation 2021"
+ "comment" : "Markers inferred from Travaglini et al. Nature 2020 and confirmed by Schupp et al. Circulation 2021"
}
]
}
]
}
},
- {
- "name" : "Lymphatic endothelial cell",
- "markers" : [
- {
- "genes" : ["PECAM1+", "CLDN5+", "CDH5+", "ERG+"],
- "weight" : 0.2,
- "comment" : "Markers for endothelial cells, from Schupp et al. Circulation 2021"
- },
- {
- "genes" : ["CCL21+", "SEMA3D+", "PROX1+", "PDPN+", "MMRN1+", "RELN+", "PKHD1L1+", "TFF3+", "LYVE1+", "FLT4+", "TBX1+"],
- "weight" : 0.8,
- "comment" : "Lymphatic-specific markers, from Schupp et al. Circulation 2021"
- }
- ]
- },
-
-
@@ -198,26 +189,21 @@
"name" : "Smooth muscle cell",
"markers" : [
{
- "genes" : ["MYH11+", "TAGLN+", "ACTG2+", "CNN1+", "PLN+"],
- "weight" : 0.8,
- "comment" : "Markers from Muus et al., Braga et al. and Schupp et al."
- },
- {
- "genes" : ["MYL9+", "TPM2+", "ACTA2+"],
- "weight" : 0.2,
- "comment" : "Markers that might also expressed in other stromal cell types"
+ "genes" : ["MYH11+", "ACTG2+", "CNN1+", "PLN+"],
+ "weight" : 1.0,
+ "comment" : "Markers inferred from Travaglini et al. Nature 2020 and confirmed by Schupp et al. Circulation 2021"
}
],
"subtypes" : {
"title" : "SMC subtype markers",
"cell_types" : [
{
- "name" : "DES+",
+ "name" : "Airway smooth muscle cell",
"markers" : [
{
- "genes" : ["DES+"],
+ "genes" : ["DES+", "TNNT2+", "RERGL+"],
"weight" : 1.0,
- "comment" : "DES+ SMC"
+ "comment" : "Markers inferred from Travaglini et al. Nature 2020"
}
]
}
@@ -229,21 +215,10 @@
"name" : "Pericyte",
"markers" : [
{
- "genes" : ["TRPC6+", "CSPG4+", "FAM162B+", "GJA4+", "GJC1+", "HIGD1B+", "CDH6+", "LAMC3+", "FHL5+"],
- "weight" : 0.8,
- "comment" : "Markers from Schupp et al. Circulation 2021 and Travaglini et al. Nature 2020"
- },
- {
- "genes" : ["PDGFRB+", "TBX2+", "EBF1+"],
- "weight" : 0.1,
- "comment" : "Markers that are highly expressed in Pericytes but also expressed in fibroblast"
- },
- {
- "genes" : [ "LGI4+", "KCNK17+", "CACNA1H+", "PTN+", "TESC+"],
- "weight" : 0.1,
- "comment" : "Markers that are lowly expressed"
+ "genes" : ["COX4I2+", "HIGD1B+", "NDUFA4L2+", "FAM162B+", "LAMC3+", "KCNK3+", "GJA4+", "GJC1+", "CSPG4+"],
+ "weight" : 1.0,
+ "comment" : "Markers inferred from Travaglini et al. Nature 2020 and confirmed by Schupp et al. Circulation 2021"
}
-
]
},
@@ -251,15 +226,15 @@
"name" : "Mesothelial cell",
"markers" : [
{
- "genes" : ["WT1+", "VIPR2+", "ITLN1+", "LINC02360+", "BNC1+", "AP000561.1+", "CALB2+", "HAS1+", "LINC01133+", "GALNT9+"],
+ "genes" : ["CPA4+", "ITLN1+", "GALNT9+", "BNC1+", "CALB2+", "WT1+", "UPK3B+"],
"weight" : 1.0,
- "comment" : "Markers from Schupp et al. and Travaglini et al."
+ "comment" : "Markers inferred from Travaglini et al. Nature 2020 and confirmed by Schupp et al. Circulation 2021"
}
]
},
{
- "name" : "Fibroblast/Myofibroblast",
+ "name" : "Fibroblast",
"markers" : [
{
"genes" : ["COL1A1+", "COL1A2+", "PDGFRA+", "ELN+", "BGN+"],
@@ -268,15 +243,15 @@
}
],
"subtypes" : {
- "title" : "Fibro/Myofib subtype markers",
+ "title" : "Fibroblast subtype markers",
"cell_types" : [
{
"name" : "Adventitial fibroblast",
"markers" : [
{
- "genes" : ["PTGIS+", "SFRP2+", "PDGFRL+", "SCARA5+", "MFAP5+", "PI16+", "AOX1+", "GAS1+", "IGFBP6+", "CXCL14+"],
+ "genes" : ["SFRP2+", "SFRP4+", "PDGFRL+", "PI16+", "MFAP5+", "SCARA5+"],
"weight" : 1.0,
- "comment" : "Markers from Schupp et al. and Travaglini et al."
+ "comment" : "Markers inferred from Travaglini et al. Nature 2020 and confirmed by Schupp et al. Circulation 2021"
}
]
},
@@ -284,24 +259,46 @@
"name" : "Alveolar fibroblast",
"markers" : [
{
- "genes" : ["NKD1+", "FGFR4+", "GPM6B+", "SPINT2+", "SCN7A+", "TCF21+", "CAMK2N1+", "ADAMTS8+"],
+ "genes" : ["GPC3+", "FMO2+", "SCN7A+", "FGFR4+", "NKD2+", "ADAMTS8+"],
"weight" : 1.0,
- "comment" : "Markers from Schupp et al. and Travaglini et al."
+ "comment" : "Markers inferred from Travaglini et al. Nature 2020 and confirmed by Schupp et al. Circulation 2021"
}
]
},
{
- "name" : "Myofibroblast",
+ "name" : "Lipofibroblast",
"markers" : [
{
- "genes" : ["ACTA2+", "MYL9+", "MT2A+", "EEF1A1+", "TMSB10+", "FAU+", "UBA52+", "SERF2+", "PTMA+", "S100A6+"],
+ "genes" : ["MLLT11+", "HAS2+", "SEMA6A+", "LONRF2+", "HOMER1+", "PWWP3B+"],
"weight" : 1.0,
- "comment" : "Markers from Schupp et al. and Travaglini et al."
+ "comment" : "Markers inferred from Travaglini et al. Nature 2020"
}
]
}
]
}
+ },
+
+ {
+ "name" : "Myofibroblast",
+ "markers" : [
+ {
+ "genes" : ["ASPN+", "SCARA3+", "WIF1+", "ANGPTL2+", "ITGBL1+"],
+ "weight" : 1.0,
+ "comment" : "Markers inferred from Travaglini et al. Nature 2020"
+ }
+ ]
+ },
+
+ {
+ "name" : "Fibromyocyte",
+ "markers" : [
+ {
+ "genes" : ["SBSPON+", "SCX+", "GREM2+", "KCNMB1+", "LGR6+"],
+ "weight" : 1.0,
+ "comment" : "Markers inferred from Travaglini et al. Nature 2020"
+ }
+ ]
}
]
}
diff --git a/pegasus/tools/utils.py b/pegasus/tools/utils.py
index 09aa0f69..cd3bbf69 100644
--- a/pegasus/tools/utils.py
+++ b/pegasus/tools/utils.py
@@ -188,6 +188,7 @@ def check_batch_key(data: Union[MultimodalData, UnimodalData], batch: str, warni
apoptosis_human=pkg_resources.resource_filename("pegasus", "data_files/apoptosis_human.gmt"),
apoptosis_mouse=pkg_resources.resource_filename("pegasus", "data_files/apoptosis_mouse.gmt"),
human_lung=pkg_resources.resource_filename("pegasus", "data_files/human_lung.gmt"),
+ mouse_lung=pkg_resources.resource_filename("pegasus", "data_files/mouse_lung.gmt"),
mouse_brain=pkg_resources.resource_filename("pegasus", "data_files/mouse_brain.gmt"),
mouse_liver=pkg_resources.resource_filename("pegasus", "data_files/mouse_liver.gmt"),
emt_human=pkg_resources.resource_filename("pegasus", "data_files/emt_human.gmt"),
From ea404155c461170f43bb4af21554564321c54c96 Mon Sep 17 00:00:00 2001
From: Yiming Yang
Date: Tue, 11 Apr 2023 14:37:51 -0700
Subject: [PATCH 14/57] Make compatible with Pandas 2.0
---
pegasus/tools/preprocessing.py | 7 +++----
pegasus/tools/signature_score.py | 2 +-
2 files changed, 4 insertions(+), 5 deletions(-)
diff --git a/pegasus/tools/preprocessing.py b/pegasus/tools/preprocessing.py
index 71c78b3b..d3c1d6c8 100644
--- a/pegasus/tools/preprocessing.py
+++ b/pegasus/tools/preprocessing.py
@@ -276,10 +276,9 @@ def _run_filter_data(
if output_filt is not None:
group_key = unidata.get_uid()
- writer = pd.ExcelWriter(f"{output_filt}.{group_key}.filt.xlsx", engine="xlsxwriter")
- df_cells = get_filter_stats(unidata, min_genes_before_filt = min_genes_before_filt)
- df_cells.to_excel(writer, sheet_name="Cell filtration stats")
- writer.save()
+ with pd.ExcelWriter(f"{output_filt}.{group_key}.filt.xlsx", engine="xlsxwriter") as writer:
+ df_cells = get_filter_stats(unidata, min_genes_before_filt = min_genes_before_filt)
+ df_cells.to_excel(writer, sheet_name="Cell filtration stats")
logger.info(f"Filtration results for {group_key} are written.")
if plot_filt is not None:
diff --git a/pegasus/tools/signature_score.py b/pegasus/tools/signature_score.py
index b41071c0..5030162f 100644
--- a/pegasus/tools/signature_score.py
+++ b/pegasus/tools/signature_score.py
@@ -30,7 +30,7 @@ def _check_and_calc_sig_background(data: UnimodalData, n_bins: int) -> bool:
bins = pd.qcut(mean_vec, n_bins, duplicates = "drop")
if bins.value_counts().min() == 1:
logger.warning("Detected bins with only 1 gene!")
- bins.categories = bins.categories.astype(str)
+ bins = bins.rename_categories(dict(zip(bins.categories, bins.categories.astype(str))))
data.var["bins"] = bins
# calculate background expectations
From d875435004bba29e68aeb77f2bb955fa09434321 Mon Sep 17 00:00:00 2001
From: Yiming Yang
Date: Tue, 18 Apr 2023 18:08:08 -0700
Subject: [PATCH 15/57] Change string type from fixed-length to var-length
---
pegasus/tools/clustering.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/pegasus/tools/clustering.py b/pegasus/tools/clustering.py
index aa16291b..432129e5 100644
--- a/pegasus/tools/clustering.py
+++ b/pegasus/tools/clustering.py
@@ -712,7 +712,7 @@ def split_one_cluster(
neighbors(tmpdat, rep=rep, n_comps=n_comps, use_cache=False)
leiden(tmpdat, rep=rep, resolution=None, n_clust=n_clust, random_state=random_state)
- new_clust = data.obs[clust_label].values.astype(str)
+ new_clust = data.obs[clust_label].values.astype(object)
cats_sub = []
for i, label in enumerate(tmpdat.obs['leiden_labels'].value_counts().index):
sub_id = f"{clust_id}-{i+1}"
From 3b252a34f5ded402231c471a415de0df04e66728 Mon Sep 17 00:00:00 2001
From: Bo Li
Date: Tue, 18 Apr 2023 23:50:24 -0700
Subject: [PATCH 16/57] Updated neutrophil markers
---
pegasus/annotate_cluster/human_immune_cell_markers.json | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/pegasus/annotate_cluster/human_immune_cell_markers.json b/pegasus/annotate_cluster/human_immune_cell_markers.json
index 94e21464..eb601134 100644
--- a/pegasus/annotate_cluster/human_immune_cell_markers.json
+++ b/pegasus/annotate_cluster/human_immune_cell_markers.json
@@ -369,9 +369,9 @@
"name" : "Neutrophil",
"markers" : [
{
- "genes" : ["FUT4+", "MPO+", "CEACAM8+", "ELANE+", "CXCR1+", "CXCR2+", "LY6G6D+"],
+ "genes" : ["KCNJ15+", "IL1R2+", "LUCAT1+", "G0S2+", "TREM1+", "CSF3R+", "FCGR3B+", "CXCR1+", "CXCR2+"],
"weight" : 1.0,
- "comment" : "key markers"
+ "comment" : "Neutrophil markers validated using 10x public whole blood dataset."
}
]
},
From e2167d0c1c249a2ebb5c787f86aa83b01c3829e7 Mon Sep 17 00:00:00 2001
From: Bo Li
Date: Sun, 30 Apr 2023 23:16:45 -0700
Subject: [PATCH 17/57] Updated neutrophil markers
---
.../human_immune_cell_markers.json | 37 +++++++++++++++-
.../human_lung_cell_markers.json | 43 ++++++++++++-------
pegasus/data_files/human_lung.gmt | 21 +++++----
pegasus/tools/scvitools.py | 5 +++
4 files changed, 79 insertions(+), 27 deletions(-)
diff --git a/pegasus/annotate_cluster/human_immune_cell_markers.json b/pegasus/annotate_cluster/human_immune_cell_markers.json
index eb601134..b849947f 100644
--- a/pegasus/annotate_cluster/human_immune_cell_markers.json
+++ b/pegasus/annotate_cluster/human_immune_cell_markers.json
@@ -365,13 +365,46 @@
]
},
+ {
+ "name" : "Pro-Neutrophil",
+ "markers" : [
+ {
+ "genes" : ["DEFA3+", "DEFA4+", "AZU1+", "MS4A3+", "ELANE+", "SLPI+", "CEACAM6+", "RNASE3+", "PRTN3+", "MPO+", "AC104232.1+", "CTSG+"],
+ "weight" : 1.0,
+ "comment" : "Pro-Neutrophil markers validated using 10x public whole blood dataset"
+ }
+ ]
+ },
+
+ {
+ "name" : "Pre-Neutrophil",
+ "markers" : [
+ {
+ "genes" : ["LTF+", "LCN2+", "MMP8+", "CRISP3+", "CAMP+", "PGLYRP1+", "CD177+", "HP+"],
+ "weight" : 1.0,
+ "comment" : "Pre-Neutrophil markers validated using 10x public whole blood dataset"
+ }
+ ]
+ },
+
{
"name" : "Neutrophil",
"markers" : [
{
- "genes" : ["KCNJ15+", "IL1R2+", "LUCAT1+", "G0S2+", "TREM1+", "CSF3R+", "FCGR3B+", "CXCR1+", "CXCR2+"],
+ "genes" : ["CSF3R+", "G0S2+", "LUCAT1+", "EPHB1+", "TNFRSF10C+", "IL1R2+", "KCNJ15+", "FCGR3B+", "AC007032.1+", "HSD11B1-AS1+"],
+ "weight" : 1.0,
+ "comment" : "Neutrophil markers validated using 10x public whole blood dataset"
+ }
+ ]
+ },
+
+ {
+ "name" : "Basophil",
+ "markers" : [
+ {
+ "genes" : ["AKAP12+", "HDC+", "GATA2+", "ENPP3+", "CA8+", "ITGB8+", "GCSAML+", "CRPPA+", "AC111000.4+", "LINC02223+"],
"weight" : 1.0,
- "comment" : "Neutrophil markers validated using 10x public whole blood dataset."
+ "comment" : "Basophil markers validated using 10x public whole blood dataset"
}
]
},
diff --git a/pegasus/annotate_cluster/human_lung_cell_markers.json b/pegasus/annotate_cluster/human_lung_cell_markers.json
index c2e9f6c4..9166aa1a 100644
--- a/pegasus/annotate_cluster/human_lung_cell_markers.json
+++ b/pegasus/annotate_cluster/human_lung_cell_markers.json
@@ -5,9 +5,9 @@
"name" : "Alveolar type I cell",
"markers" : [
{
- "genes" : ["AGER+", "CAV1+", "RTKN2+", "MYL9+", "SPOCK2+", "ANXA3+", "TIMP3+", "CAV2+", "ST6GALNAC5+", "MYRF+"],
+ "genes" : ["AGER+", "SPOCK2+", "RTKN2+", "TNNC1+", "SCEL+", "CLIC5+", "NCKAP5+", "ARHGEF26+", "GGTLC1+", "ITLN2+", "MS4A15+"],
"weight" : 1.0,
- "comment" : "AT1 markers from Schupp et al., Travaglini et al. and Tony et al."
+ "comment" : "Markers inferred from Travaglini et al. Nature 2020 and confirmed by Schupp et al. Circulation 2021"
}
]
},
@@ -16,9 +16,9 @@
"name" : "Alveolar type II cell",
"markers" : [
{
- "genes" : ["SFTPA1+", "SFTPA2+", "SFTPC+", "ETV5+", "TTN+", "PLA2G4F+", "CCDC141+", "LAMP3+", "ABCA3+", "HHIP+"],
+ "genes" : ["SFTPA1+", "SFTPA2+", "SFTPC+", "PGC+", "LAMP3+", "FASN+", "HHIP+", "ETV5+", "RASGRF1+", "ABCA3+"],
"weight" : 1.0,
- "comment" : "AT2 markers from Schupp et al., Travaglini et al. and Tony et al."
+ "comment" : "Markers inferred from Travaglini et al. Nature 2020 and confirmed by Schupp et al. Circulation 2021"
}
]
},
@@ -27,9 +27,9 @@
"name" : "Basal cell",
"markers" : [
{
- "genes" : ["KRT5+", "KRT15+", "KRT17+", "TP63+", "S100A2+", "TNS4+"],
+ "genes" : ["KRT17+", "S100A2+", "MIR205HG+", "KRT15+", "KRT5+", "DLK2+", "CDH3+", "TP63+", "TNS4+"],
"weight" : 1.0,
- "comment" : "Basal cell markers from Schupp et al., Travaglini et al. and Tony et al."
+ "comment" : "Markers inferred from Travaglini et al. Nature 2020 and confirmed by Schupp et al. Circulation 2021"
}
]
},
@@ -38,9 +38,9 @@
"name" : "Club cell",
"markers" : [
{
- "genes" : ["SCGB3A2+", "MGP+", "VIM+", "CST3+"],
+ "genes" : ["SCGB3A2+", "MGP+", "CTSE+"],
"weight" : 1.0,
- "comment" : "Club cell markers from Schupp et al., Travaglini et al. and Tony et al."
+ "comment" : "Markers inferred from Travaglini et al. Nature 2020 and confirmed by Schupp et al. Circulation 2021"
}
]
},
@@ -49,9 +49,9 @@
"name" : "Ciliated cell",
"markers" : [
{
- "genes" : ["ERICH3+", "SNTN+", "CCDC78+", "SNTN+", "ZBBX+", "DNAI1+", "ARMC3+", "CFAP157+", "TTC29+", "CFAP73+"],
+ "genes" : ["ERICH3+", "ARMC3+", "DNAI2+", "ZBBX+", "VWA3B+", "RGS22+", "TTC29+", "CDHR4+", "PPP1R42+", "CFAP46+", "CFAP52+", "CFAP73+", "CFAP77+", "CFAP157+", "DNAH3+", "DNAH9+", "ADGB+", "SNTN+", "CCDC170+", "C6orf118+"],
"weight" : 1.0,
- "comment" : "Ciliated cell markers from Schupp et al., Travaglini et al. and Tony et al."
+ "comment" : "Markers inferred from Travaglini et al. Nature 2020 and confirmed by Schupp et al. Circulation 2021"
}
]
},
@@ -60,9 +60,9 @@
"name" : "Goblet cell",
"markers" : [
{
- "genes" : ["MUC5AC+", "MUC5B+", "BPIFB1+", "MSMB+", "FAM3D+", "SERPINB11+", "CXCL6+", "SCGB1A1+", "FAM3D+", "SERPINB3+"],
+ "genes" : ["MUC5AC+", "MUC5B+", "BPIFB1+", "MSMB+", "SERPINB11+", "CYP2F1+"],
"weight" : 1.0,
- "comment" : "Goblet cell markers from Schupp et al., Travaglini et al. and Tony et al."
+ "comment" : "Markers inferred from Travaglini et al. Nature 2020 and confirmed by Schupp et al. Circulation 2021"
}
]
},
@@ -71,9 +71,9 @@
"name" : "Ionocyte",
"markers" : [
{
- "genes" : ["FOXI1+", "ASCL3+", "CLDN25+", "ATP6V1G3+", "LINC01187+"],
+ "genes" : ["ASCL3+", "CLCNKB+", "FOXI1+", "ATP6V1G3+", "TMPRSS11E+", "BSND+", "LINC01187+", "CLDN25+"],
"weight" : 1.0,
- "comment" : "Ionocyte markers from Travaglini et al."
+ "comment" : "Markers inferred from Travaglini et al. Nature 2020 and confirmed by Schupp et al. Circulation 2021"
}
]
},
@@ -82,9 +82,20 @@
"name" : "Plumonary neuroendocrine cell",
"markers" : [
{
- "genes" : ["CALCA+", "CHGA+", "ASCL1+", "SLC35D3+", "KIF1A+"],
+ "genes" : ["CHGA+", "CHGB+", "SCGN+", "SCG5+", "CPLX2+", "GRP+", "ASCL1+", "INSM1+"],
"weight" : 1.0,
- "comment" : "Plumonary neuroendocrine cell markers from Travaglini et al."
+ "comment" : "Markers inferred from Travaglini et al. Nature 2020 and confirmed by Schupp et al. Circulation 2021"
+ }
+ ]
+ },
+
+ {
+ "name" : "Submucosal gland serous cell",
+ "markers" : [
+ {
+ "genes" : ["PRR4+", "TCN1+", "C6orf58+", "PRB3+", "LPO+", "PRB1+", "PRH2+", "PRH1+", "ODAM+"],
+ "weight" : 1.0,
+ "comment" : "Markers inferred from Travaglini et al. Nature 2020"
}
]
},
diff --git a/pegasus/data_files/human_lung.gmt b/pegasus/data_files/human_lung.gmt
index 871c9dec..3559ccb9 100644
--- a/pegasus/data_files/human_lung.gmt
+++ b/pegasus/data_files/human_lung.gmt
@@ -8,12 +8,15 @@ T cell T cell markers CD3D CD3E CD3G TRAC
B cell B cell markers CD19 MS4A1 CD79A CD79B
Plasma cell Plasma cell markers CD38 XBP1 CD27 SLAMF7 TNFRSF17 TNFRSF13B
Mast cell Mast cell markers KIT CPA3 TPSB2 TPSAB1 AREG RGS1 RGS2
-Neutrophil Neutrophil markers FUT4 MPO CEACAM8 ELANE CXCR1 CXCR2 LY6G6D
-AT1 AT1 markers from Schupp et al., Travaglini et al. and Tony et al. AGER CAV1 RTKN2 MYL9 SPOCK2 ANXA3 TIMP3 CAV2 ST6GALNAC5 MYRF
-AT2 AT2 markers from Schupp et al., Travaglini et al. and Tony et al. SFTPA1 SFTPA2 SFTPC ETV5 TTN PLA2G4F CCDC141 LAMP3 ABCA3 HHIP
-Basal Basal cell markers from Schupp et al., Travaglini et al. and Tony et al. KRT5 KRT15 KRT17 TP63 S100A2 TNS4
-Ciliated Ciliated cell markers from Schupp et al., Travaglini et al. and Tony et al. ERICH3 SNTN CCDC78 SNTN ZBBX DNAI1 ARMC3 CFAP157 TTC29 CFAP73
-Club Club cell markers from Schupp et al., Travaglini et al. and Tony et al. SCGB3A2 MGP VIM CST3
-Goblet Goblet cell markers from Schupp et al., Travaglini et al. and Tony et al. MUC5AC MUC5B BPIFB1 MSMB FAM3D SERPINB11 CXCL6 SCGB1A1 FAM3D SERPINB3
-Ionocyte Ionocyte markers from Travaglini et al. FOXI1 ASCL3 CLDN25 ATP6V1G3 LINC01187
-PNEC Plumonary neuroendocrine cell markers from Travaglini et al. CALCA CHGA ASCL1 SLC35D3 KIF1A
+ProNeu Pro-Neutrophil markers validated using 10x public whole blood dataset DEFA3 DEFA4 AZU1 MS4A3 ELANE SLPI CEACAM6 RNASE3 PRTN3 MPO AC104232.1 CTSG
+PreNeu Pre-Neutrophil markers validated using 10x public whole blood dataset LTF LCN2 MMP8 CRISP3 CAMP PGLYRP1 CD177 HP
+Neutrophil Neutrophil markers CSF3R G0S2 LUCAT1 EPHB1 TNFRSF10C IL1R2 KCNJ15 FCGR3B AC007032.1 HSD11B1-AS1
+AT1 AT1 markers inferred from Travaglini et al. Nature 2020 and confirmed by Schupp et al. Circulation 2021 AGER SPOCK2 RTKN2 TNNC1 SCEL CLIC5 NCKAP5 ARHGEF26 GGTLC1 ITLN2 MS4A15
+AT2 AT2 markers inferred from Travaglini et al. Nature 2020 and confirmed by Schupp et al. Circulation 2021 SFTPA1 SFTPA2 SFTPC PGC LAMP3 FASN HHIP ETV5 RASGRF1 ABCA3
+Basal Basal cell markers inferred from Travaglini et al. Nature 2020 and confirmed by Schupp et al. Circulation 2021 KRT17 S100A2 MIR205HG KRT15 KRT5 DLK2 CDH3 TP63 TNS4
+Club Club cell markers inferred from Travaglini et al. Nature 2020 and confirmed by Schupp et al. Circulation 2021 SCGB3A2 MGP CTSE
+Ciliated Ciliated cell markers inferred from Travaglini et al. Nature 2020 and confirmed by Schupp et al. Circulation 2021 ERICH3 ARMC3 DNAI2 ZBBX VWA3B RGS22 TTC29 CDHR4 PPP1R42 CFAP46 CFAP52 CFAP73 CFAP77 CFAP157 DNAH3 DNAH9 ADGB SNTN CCDC170 C6orf118
+Goblet Goblet cell markers inferred from Travaglini et al. Nature 2020 and confirmed by Schupp et al. Circulation 2021 MUC5AC MUC5B BPIFB1 MSMB SERPINB11 CYP2F1
+Ionocyte Ionocyte markers inferred from Travaglini et al. Nature 2020 and confirmed by Schupp et al. Circulation 2021 ASCL3 CLCNKB FOXI1 ATP6V1G3 TMPRSS11E BSND LINC01187 CLDN25
+PNEC Plumonary neuroendocrine cell markers inferred from Travaglini et al. Nature 2020 and confirmed by Schupp et al. Circulation 2021 CHGA CHGB SCGN SCG5 CPLX2 GRP ASCL1 INSM1
+SMG SMG serous cell markers inferred from Travaglini et al. Nature 2020 PRR4 TCN1 C6orf58 PRB3 LPO PRB1 PRH2 PRH1 ODAM
diff --git a/pegasus/tools/scvitools.py b/pegasus/tools/scvitools.py
index a01c0a8e..20dfd1c3 100644
--- a/pegasus/tools/scvitools.py
+++ b/pegasus/tools/scvitools.py
@@ -190,9 +190,14 @@ def run_scvi(
scvi.settings.num_threads = eff_n_jobs(n_jobs) # set n_jobs
scvi.settings.seed = random_state # set random_state, see [here](https://docs.scvi-tools.org/en/stable/_modules/scvi/_settings.html) for more details.
+ print(max_epochs)
+
if max_epochs is None:
max_epochs = np.min([round((20000 / len(adata.obs)) * 400), 400])
+ print(type(max_epochs))
+ print(max_epochs)
+
scvi.model.SCVI.setup_anndata(adata,
batch_key=batch,
categorical_covariate_keys=categorical_covariate_keys,
From 47fde0836693386940f230d8219750a3c187c18b Mon Sep 17 00:00:00 2001
From: Bo Li
Date: Tue, 23 May 2023 09:08:53 +0800
Subject: [PATCH 18/57] Updated NK cell and NK subtype markers
---
.../human_immune_cell_markers.json | 57 +++++++++++--------
1 file changed, 34 insertions(+), 23 deletions(-)
diff --git a/pegasus/annotate_cluster/human_immune_cell_markers.json b/pegasus/annotate_cluster/human_immune_cell_markers.json
index b849947f..f4ed25b4 100644
--- a/pegasus/annotate_cluster/human_immune_cell_markers.json
+++ b/pegasus/annotate_cluster/human_immune_cell_markers.json
@@ -76,36 +76,47 @@
"name" : "Natural killer cell",
"markers" : [
{
- "genes" : ["NCAM1+"],
- "weight" : 0.2,
- "comment" : "CD56"
+ "genes" : ["GNLY+", "KLRF1+", "KLRD1+", "TRDC+", "IL2RB+", "KLRC1+"],
+ "weight" : 0.6,
+ "comment" : "General NK cell markers also cover some T cells; derived from Immune Cell Atlas PBMC, BMMC and CB-MNC data"
},
{
- "genes" : ["NKG7+"],
+ "genes" : ["NCAM1+", "FCGR3A+"],
"weight" : 0.2,
- "comment" : "natural killer cell granule protein 7"
- },
- {
- "genes" : ["KLRB1+", "KLRD1+", "KLRF1+", "KLRC1+", "KLRC2+", "KLRC3+", "KLRC4+"],
- "weight" : 0.25,
- "comment" : "killer cell lectin like receptors"
+ "comment" : "NK subtype markers"
},
{
"genes" : ["CD3D-", "CD3E-", "CD3G-"],
- "weight" : 0.15,
- "comment" : "not T cell"
- },
- {
- "genes" : ["FCGR3A+"],
- "weight" : 0.1,
- "comment" : "CD16a"
- },
- {
- "genes" : ["ITGAL+", "ITGAM+"],
- "weight" : 0.1,
- "comment" : "CD11a,CD11b"
+ "weight" : 0.2,
+ "comment" : "No T cell markers"
}
- ]
+ ],
+ "subtypes" : {
+ "title" : "NK cell subtype markers",
+ "cell_types" : [
+ {
+ "name" : "CD56-dim NK cell",
+ "markers" : [
+ {
+ "genes" : ["FCGR3A+", "FGFBP2+", "SPON2+", "MYOM2+", "S1PR5+", "CX3CR1+", "AKR1C3+", "FCRL6+", "LAIR2+", "PRSS23+"],
+ "weight" : 1.0,
+ "comment" : "Cytotoxic NK cell markers derived from Immune Cell Atlas PBMC, BMMC and CB-MNC data"
+ }
+ ]
+ },
+ {
+ "name" : "CD56-bright NK cell",
+ "markers" : [
+ {
+ "genes" : ["NCAM1+", "GZMK+", "XCL1+", "SPTSSB+", "CAPG+", "IL7R+", "GPR183+", "IGFBP4+", "SPINK2+", "FUT7+"],
+ "weight" : 1.0,
+ "comment" : "Regulatory NK cell markers derived from Immune Cell Atlas PBMC, BMMC and CB-MNC data; CD56bright develops into CD56dim"
+ }
+ ]
+ }
+ ],
+ "comment": "There is also a CD56_dim CD16_dim population in between the CD56-dim and CD56-bright subtypes."
+ }
},
{
From a587dcbd6198855a74e491dad079f6b3d5b21217 Mon Sep 17 00:00:00 2001
From: Donghoon Lee
Date: Fri, 26 May 2023 10:31:29 -0400
Subject: [PATCH 19/57] Update doublet_detection.py
Fixes a bug where you have `raw_mat_key` other than default value, which is `counts`.
---
pegasus/tools/doublet_detection.py | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/pegasus/tools/doublet_detection.py b/pegasus/tools/doublet_detection.py
index 26aadd4a..a1607cbb 100644
--- a/pegasus/tools/doublet_detection.py
+++ b/pegasus/tools/doublet_detection.py
@@ -586,9 +586,9 @@ def infer_doublets(
if idx.size >= min_cell:
unidata = UnimodalData({"barcodekey": data.obs_names[idx]},
{"featurekey": data.var_names},
- {"counts": rawX[idx]},
+ {raw_mat_key: rawX[idx]},
{"genome": genome, "modality": modality},
- cur_matrix = "counts")
+ cur_matrix = raw_mat_key)
# Identify robust genes, count and log normalized and select top 2,000 highly variable features
identify_robust_genes(unidata)
log_norm(unidata)
From 5d29f9fa73dcfb1de47d8a265950b28d7954efb2 Mon Sep 17 00:00:00 2001
From: Bo Li
Date: Sat, 27 May 2023 14:14:02 +0800
Subject: [PATCH 20/57] Updated T cell subtype markers
---
.../human_immune_cell_markers.json | 87 ++++++++++++++-----
pegasus/data_files/human_t_cell_markers.gmt | 9 ++
2 files changed, 73 insertions(+), 23 deletions(-)
create mode 100644 pegasus/data_files/human_t_cell_markers.gmt
diff --git a/pegasus/annotate_cluster/human_immune_cell_markers.json b/pegasus/annotate_cluster/human_immune_cell_markers.json
index f4ed25b4..73a424cd 100644
--- a/pegasus/annotate_cluster/human_immune_cell_markers.json
+++ b/pegasus/annotate_cluster/human_immune_cell_markers.json
@@ -19,55 +19,96 @@
"title" : "T cell subtype markers",
"cell_types" : [
{
- "name" : "T helper cell",
+ "name" : "CD4 Naive T cell",
"markers" : [
{
- "genes" : ["CD4+"],
+ "genes" : ["CD4+", "CCR7+", "SELL+", "LEF1+", "FHIT+", "ACTN1+", "LDLRAP1+", "TMIGD2+", "TRABD2A+", "LRRN3+"],
"weight" : 1.0,
- "comment" : "CD4+ T cell"
+ "comment" : "Markers derived from Immune Cell Atlas PBMC data"
+ }
+ ]
+ },
+ {
+ "name" : "CD4 TCM",
+ "markers" : [
+ {
+ "genes" : ["CD4+", "GPR183+", "CD69+", "PASK+", "LIMS1+", "LPAR6+", "SLC2A3+", "SOCS3+"],
+ "weight" : 1.0,
+ "comment" : "Markers derived from Immune Cell Atlas PBMC data"
+ }
+ ]
+ },
+ {
+ "name" : "CD4 TEM",
+ "markers" : [
+ {
+ "genes" : ["CD4+", "KLRB1+", "ANXA2+", "LGALS1+", "TIMP1+", "PTGER2+", "AHNAK+", "TNFRSF4+", "YWHAH+", "CD63+"],
+ "weight" : 1.0,
+ "comment" : "Markers derived from Immune Cell Atlas PBMC data"
+ }
+ ]
+ },
+ {
+ "name" : "T regulatory cell",
+ "markers" : [
+ {
+ "genes" : ["RTKN2+", "FOXP3+", "IL2RA+", "HACD1+", "AC133644.2+", "FANK1+", "DUSP4+", "STAM+", "CCR10+", "CTLA4+"],
+ "weight" : 1.0,
+ "comments" : "Markers derived from Immune Cell Atlas PBMC data"
}
]
},
{
- "name" : "Cytotoxic T cell",
+ "name" : "CD8 Naive T cell",
"markers" : [
{
- "genes" : ["CD8A+", "CD8B+"],
+ "genes" : ["CD8A+", "CD8B+", "CCR7+", "SELL+", "LEF1+", "ACTN1+", "TRABD2A+", "LRRN3+", "LINC02446+", "S100B+", "CLEC11A+", "NELL2+", "PASK+", "APBA2+"],
"weight" : 1.0,
- "comment" : "CD8+ T cell"
+ "comment" : "Markers derived from Immune Cell Atlas PBMC data"
}
]
},
{
- "name" : "T regulatory cell",
+ "name" : "CD8 TCM",
"markers" : [
{
- "genes" : ["FOXP3+", "IL2RA+"],
- "weight" : 0.7,
- "comments" : "key T reg markers"
- },
+ "genes" : ["CD8A+", "CD8B+", "GZMK+", "DUSP2+", "RGS1+", "CXCR3+", "CMC1+", "TIGIT+", "CST7+", "NKG7+"],
+ "weight" : 1.0,
+ "comment" : "Markers derived from Immune Cell Atlas PBMC data; CD8A & CD8B are CD8 markers; GZMK, DUSP2, RGS1 & CXCR3 are specific to TCM; CMC1 & TIGIT are biased towards TCM; CST7 & NKG7 are shared by TCM & TEM"
+ }
+ ]
+ },
+ {
+ "name" : "CD8 TEM",
+ "markers" : [
{
- "genes" : ["CD4+"],
- "weight" : 0.3,
- "comment" : "key markers that do not express heavily in droplet-based RNA-Seq"
+ "genes" : ["CD8A+", "CD8B+", "FGFBP2+", "GZMB+", "FCGR3A+", "SPON2+", "ADGRG1+", "CX3CR1+", "ASCL2+", "PRSS23+"],
+ "weight" : 1.0,
+ "comment" : "Markers derived from Immune Cell Atlas PBMC data"
}
]
},
{
- "name" : "Naive T cell",
+ "name" : "MAIT",
"markers" : [
{
- "genes" : ["CCR7+", "SELL+", "IL7R+", "TCF7+", "CD27+"],
- "weight" : 0.7,
- "comment" : "positive markers"
- },
+ "genes" : ["SLC4A10+", "KLRB1+", "NCR3+", "CEBPD+", "GPR65+", "LST1+", "CXCR6+", "TRAV1-2+"],
+ "weight" : 1.0,
+ "comment" : "Markers derived from Immune Cell Atlas PBMC data"
+ }
+ ]
+ },
+ {
+ "name" : "Gamma-delta T cell",
+ "markers" : [
{
- "genes" : ["IL2RA-", "CD44-", "CD69-"],
- "weight" : 0.3,
- "comment" : "negative markers"
+ "genes" : ["TRDC+", "TRGC1+", "TRGC2+", "KLRC1+", "KLRD1+", "GNLY+"],
+ "weight" : 1.0,
+ "comment" : "Markers derived from Immune Cell Atlas PBMC data"
}
]
- }
+ },
+
]
}
},
diff --git a/pegasus/data_files/human_t_cell_markers.gmt b/pegasus/data_files/human_t_cell_markers.gmt
new file mode 100644
index 00000000..33bd98af
--- /dev/null
+++ b/pegasus/data_files/human_t_cell_markers.gmt
@@ -0,0 +1,9 @@
+CD4_Naive CD4 Naive T CD4 CCR7 SELL LEF1 FHIT ACTN1 LDLRAP1 TMIGD2 TRABD2A LRRN3
+CD4_TCM CD4 TCM CD4 GPR183 CD69 PASK LIMS1 LPAR6 SLC2A3 SOCS3
+CD4_TEM CD4 TEM CD4 KLRB1 ANXA2 LGALS1 TIMP1 PTGER2 AHNAK TNFRSF4 YWHAH CD63
+Treg Treg RTKN2 FOXP3 IL2RA HACD1 AC133644.2 FANK1 DUSP4 STAM CCR10 CTLA4
+CD8_Naive CD8 Naive T CD8A CD8B CCR7 SELL LEF1 ACTN1 TRABD2A LRRN3 LINC02446 S100B CLEC11A NELL2 PASK APBA2
+CD8_TCM CD8 TCM CD8A CD8B GZMK DUSP2 RGS1 CXCR3 CMC1 TIGIT CST7 NKG7
+CD8_TEM CD8 TEM CD8A CD8B FGFBP2 GZMB FCGR3A SPON2 ADGRG1 CX3CR1 ASCL2 PRSS23
+MAIT MAIT SLC4A10 KLRB1 NCR3 CEBPD GPR65 LST1 CXCR6 TRAV1-2
+gdT gdT TRDC TRGC1 TRGC2 KLRC1 KLRD1 GNLY
From 469693d185a11dc5c28f10eaa6b5beac78b743db Mon Sep 17 00:00:00 2001
From: Bo Li
Date: Sat, 27 May 2023 16:12:24 +0800
Subject: [PATCH 21/57] Added CD4 CTL markers
---
.../human_immune_cell_markers.json | 15 +++++++++++++++
pegasus/data_files/human_t_cell_markers.gmt | 1 +
2 files changed, 16 insertions(+)
diff --git a/pegasus/annotate_cluster/human_immune_cell_markers.json b/pegasus/annotate_cluster/human_immune_cell_markers.json
index 73a424cd..021a07ac 100644
--- a/pegasus/annotate_cluster/human_immune_cell_markers.json
+++ b/pegasus/annotate_cluster/human_immune_cell_markers.json
@@ -58,6 +58,21 @@
}
]
},
+ {
+ "name" : "CD4 CTL",
+ "markers" : [
+ {
+ "genes" : ["CD4+", "CD8A-", "CD8B-"],
+ "weight" : 0.3,
+ "comments" : "Must be CD4 T"
+ },
+ {
+ "genes" : ["GNLY+", "AGAP1+", "ZNF683+", "RGS9+", "IL5RA+", "LAIR2+", "MTERF2+", "SH3RF2+", "RGS17+"],
+ "weight" : 0.7,
+ "comments" : "CD4 CTL markers that might also be expressed by CD8 TEM"
+ }
+ ]
+ },
{
"name" : "CD8 Naive T cell",
"markers" : [
diff --git a/pegasus/data_files/human_t_cell_markers.gmt b/pegasus/data_files/human_t_cell_markers.gmt
index 33bd98af..aa062fe5 100644
--- a/pegasus/data_files/human_t_cell_markers.gmt
+++ b/pegasus/data_files/human_t_cell_markers.gmt
@@ -2,6 +2,7 @@ CD4_Naive CD4 Naive T CD4 CCR7 SELL LEF1 FHIT ACTN1 LDLRAP1 TMIGD2 TRABD2A LRRN3
CD4_TCM CD4 TCM CD4 GPR183 CD69 PASK LIMS1 LPAR6 SLC2A3 SOCS3
CD4_TEM CD4 TEM CD4 KLRB1 ANXA2 LGALS1 TIMP1 PTGER2 AHNAK TNFRSF4 YWHAH CD63
Treg Treg RTKN2 FOXP3 IL2RA HACD1 AC133644.2 FANK1 DUSP4 STAM CCR10 CTLA4
+CD4_CTL CD4 Cytotoxic Lymphocyte CD4 GNLY AGAP1 ZNF683 RGS9 IL5RA LAIR2 MTERF2 SH3RF2 RGS17
CD8_Naive CD8 Naive T CD8A CD8B CCR7 SELL LEF1 ACTN1 TRABD2A LRRN3 LINC02446 S100B CLEC11A NELL2 PASK APBA2
CD8_TCM CD8 TCM CD8A CD8B GZMK DUSP2 RGS1 CXCR3 CMC1 TIGIT CST7 NKG7
CD8_TEM CD8 TEM CD8A CD8B FGFBP2 GZMB FCGR3A SPON2 ADGRG1 CX3CR1 ASCL2 PRSS23
From f965c2638d7cc88d5385909aeaf2b933ef2625fc Mon Sep 17 00:00:00 2001
From: Bo Li
Date: Thu, 8 Jun 2023 07:58:00 +0800
Subject: [PATCH 22/57] Updated markers
---
.../human_immune_cell_markers.json | 40 ++++++++++++++-----
pegasus/data_files/human_t_cell_markers.gmt | 1 +
pegasus/data_files/tonsil_markers.gmt | 18 +++++++++
3 files changed, 48 insertions(+), 11 deletions(-)
create mode 100644 pegasus/data_files/tonsil_markers.gmt
diff --git a/pegasus/annotate_cluster/human_immune_cell_markers.json b/pegasus/annotate_cluster/human_immune_cell_markers.json
index 021a07ac..fd2afc23 100644
--- a/pegasus/annotate_cluster/human_immune_cell_markers.json
+++ b/pegasus/annotate_cluster/human_immune_cell_markers.json
@@ -72,7 +72,17 @@
"comments" : "CD4 CTL markers that might also be expressed by CD8 TEM"
}
]
- },
+ },
+ {
+ "name" : "T follicular helper cell",
+ "markers" : [
+ {
+ "genes" : ["CD4+", "ST8SIA1+", "PDCD1+", "TIGIT+", "TOX2+", "ICOS+", "SH2D1A+", "IL21+"],
+ "weight" : 1.0,
+ "comments" : "Tfh markers"
+ }
+ ]
+ },
{
"name" : "CD8 Naive T cell",
"markers" : [
@@ -250,14 +260,9 @@
"name" : "Germinal Center B cell",
"markers" : [
{
- "genes" : ["CD19+", "MS4A1+", "CD79A+", "CD79B+"],
- "weight" : 0.3,
- "comment" : "CD19, CD20 and CD79"
- },
- {
- "genes" : ["CD38+", "BCL6+", "BCL2-", "RGS13+", "MEF2B"],
- "weight" : 0.7,
- "comment" : "First 3 markers are from Klein et al. PNAS 2003 https://doi.org/10.1073/pnas.0437996100 (Fig. 1 & 2). The last 2 markers are from Fig. 4B of Massoni-Badosa et al. Tonsil Atlas paper"
+ "genes" : ["MEF2B+", "NEIL1+", "RGS13+", "ELL3+", "BCL7A+", "BCL6+", "NUGGC+", "MYBL1+", "EML6+", "FANCA+"],
+ "weight" : 1.0,
+ "comment" : "GC B cell markers"
}
],
"subtypes" : {
@@ -267,9 +272,9 @@
"name" : "Dark zone B cell",
"markers" : [
{
- "genes" : ["CXCR4+", "AICDA+", "FOXP1+", "MME+"],
+ "genes" : ["NUSAP1+", "NCAPG+", "AURKB+", "HMGB2+", "HJURP+", "TOP2A+"],
"weight" : 1.0,
- "comment" : "Fig. 4B of Massoni-Badosa et al. Tonsil Atlas paper"
+ "comment" : "DZ B cell markers"
}
]
},
@@ -379,6 +384,19 @@
]
},
+
+ {
+ "name" : "Follicular dendritic cell",
+ "markers" : [
+ {
+ "genes" : ["CXCL13+", "FCAMR+", "FDCSP+", "SERPINE2+", "PAPPA+", "NPHS1+", "PKDCC+", "SYNM+", "NRG2+", "CDC42EP4+", "MUC3A+", "PRUNE2+", "B4GALNT4+", "NPPC+", "SLC1A2+", "TMEM150C+"],
+ "weight" : 1.0,
+ "comment" : "fDC markers"
+ }
+ ]
+ },
+
+
{
"name" : "Hematopoietic stem cell",
"markers" : [
diff --git a/pegasus/data_files/human_t_cell_markers.gmt b/pegasus/data_files/human_t_cell_markers.gmt
index aa062fe5..aac1cda8 100644
--- a/pegasus/data_files/human_t_cell_markers.gmt
+++ b/pegasus/data_files/human_t_cell_markers.gmt
@@ -3,6 +3,7 @@ CD4_TCM CD4 TCM CD4 GPR183 CD69 PASK LIMS1 LPAR6 SLC2A3 SOCS3
CD4_TEM CD4 TEM CD4 KLRB1 ANXA2 LGALS1 TIMP1 PTGER2 AHNAK TNFRSF4 YWHAH CD63
Treg Treg RTKN2 FOXP3 IL2RA HACD1 AC133644.2 FANK1 DUSP4 STAM CCR10 CTLA4
CD4_CTL CD4 Cytotoxic Lymphocyte CD4 GNLY AGAP1 ZNF683 RGS9 IL5RA LAIR2 MTERF2 SH3RF2 RGS17
+Tfh T follicular helper CD4 ST8SIA1 PDCD1 TIGIT TOX2 ICOS SH2D1A IL21
CD8_Naive CD8 Naive T CD8A CD8B CCR7 SELL LEF1 ACTN1 TRABD2A LRRN3 LINC02446 S100B CLEC11A NELL2 PASK APBA2
CD8_TCM CD8 TCM CD8A CD8B GZMK DUSP2 RGS1 CXCR3 CMC1 TIGIT CST7 NKG7
CD8_TEM CD8 TEM CD8A CD8B FGFBP2 GZMB FCGR3A SPON2 ADGRG1 CX3CR1 ASCL2 PRSS23
diff --git a/pegasus/data_files/tonsil_markers.gmt b/pegasus/data_files/tonsil_markers.gmt
new file mode 100644
index 00000000..bfefe13c
--- /dev/null
+++ b/pegasus/data_files/tonsil_markers.gmt
@@ -0,0 +1,18 @@
+Skeletal muscle cells Skeletal muscle cells MYBPC1 TNNT1 TNNC1 MYL1 MYBPH TNNC2 TNNI1 MYH7 MYL2
+Tfh T Follicular helper markers (one reference point is https://www.thermofisher.com/us/en/home/life-science/cell-analysis/cell-analysis-learning-center/immunology-at-work/t-follicular-helper-cell-overview.html) CD4 ST8SIA1 PDCD1 TIGIT TOX2 ICOS SH2D1A IL21
+Tregs Tregs CTLA4 TIGIT IL2RA FOXP3 CCR8 BATF
+T_Naive Naive T cell CCR7 SELL IL7R TCF7 CD27
+DC_Migratory Migratory Conventional Dendritic cell FSCN1 CCR7 LAMP3 CCL19 CCL22 CD40 BIRC3
+MAIT MAIT SLC4A10
+EC lymphatic Schupp et al. Circulation 2021 PECAM1 CLDN5 CDH5 ERG CCL21 SEMA3D PROX1 PDPN MMRN1 RELN PKHD1L1 TFF3 LYVE1 FLT4 TBX1
+fDC Follicular dendritic cell CXCL13 FCAMR FDCSP SERPINE2 PAPPA NPHS1 PKDCC SYNM NRG2 CDC42EP4 MUC3A PRUNE2 B4GALNT4 NPPC SLC1A2 TMEM150C
+DCs_CLEC9A Conventional Dendritic cell type 1 CLEC9A BATF3 IRF8 CPVL CADM1
+DCs_CD1C Conventional Dendritic cell type 2 CD1C FCER1A FCGBP CD1A CD207 HLA-DQB2
+pDCs Plasmacytoid Dendritic cell IRF4 LILRA4 TCF4 MZB1
+B_Naive Naïve B cell MS4A1 IGHD TCL1A FCER2
+B_Memory Memory B cell MS4A1 CD27 TNFRSF13B
+B_Germinal_Center Germinal center B cell MEF2B NEIL1 RGS13 ELL3 BCL7A BCL6 NUGGC MYBL1 EML6 FANCA
+B_light_zone Light Zone CD83 LMO2
+B_dark_zone Dark Zone CXCR4 AICDA FOXP1 MME
+Mono_DCs Monocytes Derived DC CD14 FCGR2B CCL17 CLEC10A
+MyoF Myofibroblast from Travaglini et al. and Tony et al. ACTA2 MYL9 MT2A EEF1A1 TMSB10 FAU UBA52 SERF2 PTMA S100A6
From 3a9cca8a01a09e400eb81911e010329b0303c2b5 Mon Sep 17 00:00:00 2001
From: Bo Li
Date: Wed, 14 Jun 2023 20:11:38 +0800
Subject: [PATCH 23/57] Added method option for nearest neighbor search to
choose between hnsw and sklearn
---
.../annotate_cluster/human_immune_cell_markers.json | 3 +--
pegasus/tools/nearest_neighbors.py | 13 +++++++++++--
2 files changed, 12 insertions(+), 4 deletions(-)
diff --git a/pegasus/annotate_cluster/human_immune_cell_markers.json b/pegasus/annotate_cluster/human_immune_cell_markers.json
index fd2afc23..feb55f0a 100644
--- a/pegasus/annotate_cluster/human_immune_cell_markers.json
+++ b/pegasus/annotate_cluster/human_immune_cell_markers.json
@@ -132,8 +132,7 @@
"comment" : "Markers derived from Immune Cell Atlas PBMC data"
}
]
- },
-
+ }
]
}
},
diff --git a/pegasus/tools/nearest_neighbors.py b/pegasus/tools/nearest_neighbors.py
index fdc16109..37965183 100644
--- a/pegasus/tools/nearest_neighbors.py
+++ b/pegasus/tools/nearest_neighbors.py
@@ -44,9 +44,8 @@ def calculate_nearest_neighbors(
"""Calculate nearest neighbors
X is the sample by feature matrix
Return K -1 neighbors, the first one is the point itself and thus omitted.
- TODO: Documentation
+ If nsample <= 1000, method is set to "sklearn" for exact KNN search
"""
-
nsample = X.shape[0]
if nsample <= 1000:
@@ -85,6 +84,7 @@ def calculate_nearest_neighbors(
distances = np.sqrt(distances, out=distances)
else:
assert method == "sklearn"
+ print("haha, exact!")
knn = NearestNeighbors(
n_neighbors=K - 1, n_jobs=n_jobs
) # eliminate the first neighbor, which is the node itself
@@ -116,6 +116,7 @@ def get_neighbors(
full_speed: bool = False,
use_cache: bool = True,
dist: str = "l2",
+ method: str = "hnsw",
) -> Tuple[List[int], List[float]]:
"""Find K nearest neighbors for each data point and return the indices and distances arrays.
@@ -140,6 +141,8 @@ def get_neighbors(
If use_cache and found cached knn results, will not recompute.
dist: `str`, optional (default: 'l2')
Distance metric to use. By default, use squared L2 distance. Available options, inner product 'ip' or cosine similarity 'cosine'.
+ method: `str`, optional (default: 'hnsw')
+ Choosing from 'hnsw' for approximate nearest neighbor search or 'sklearn' for exact nearest neighbor search.
Returns
-------
@@ -164,6 +167,7 @@ def get_neighbors(
X_from_rep(data, rep, n_comps),
K=K,
n_jobs=eff_n_jobs(n_jobs),
+ method=method,
random_state=random_state,
full_speed=full_speed,
dist=dist,
@@ -237,6 +241,7 @@ def neighbors(
full_speed: bool = False,
use_cache: bool = True,
dist: str = "l2",
+ method: str = "hnsw",
) -> None:
"""Compute k nearest neighbors and affinity matrix, which will be used for diffmap and graph-based community detection algorithms.
@@ -274,6 +279,9 @@ def neighbors(
dist: ``str``, optional (default: ``"l2"``)
Distance metric to use. By default, use squared L2 distance. Available options, inner product ``"ip"`` or cosine similarity ``"cosine"``.
+ method: ``str``, optional (default: ``"hnsw"``)
+ Choose from "hnsw" or "sklearn". "hnsw" uses HNSW algorithm for approximate nearest neighbor search and "sklearn" uses sklearn package for exact nearest neighbor search.
+
Returns
-------
``None``
@@ -302,6 +310,7 @@ def neighbors(
full_speed=full_speed,
use_cache=use_cache,
dist=dist,
+ method=method,
)
# calculate affinity matrix
From b84a87ce850bebb09421d8773616952689c97610 Mon Sep 17 00:00:00 2001
From: Bo Li
Date: Sat, 17 Jun 2023 10:03:48 -0700
Subject: [PATCH 24/57] Updated nearest neighbor search function to a) set
use_cache to False by default and b) adjust K to min(K, int(sqrt(nsample)))
---
pegasus/tools/doublet_detection.py | 2 +-
pegasus/tools/nearest_neighbors.py | 103 +++++++++++++++++++++--------
pegasus/tools/nmf.py | 2 +-
pegasus/tools/visualization.py | 28 +++++---
4 files changed, 97 insertions(+), 38 deletions(-)
diff --git a/pegasus/tools/doublet_detection.py b/pegasus/tools/doublet_detection.py
index a1607cbb..5737af2d 100644
--- a/pegasus/tools/doublet_detection.py
+++ b/pegasus/tools/doublet_detection.py
@@ -349,7 +349,7 @@ def _run_scrublet(
if k is None:
k = int(round(0.5 * np.sqrt(obsX.shape[0])))
k_adj = int(round(k * (1.0 + r)))
- indices, _ = calculate_nearest_neighbors(pc_coords, K = k_adj + 1, n_jobs = n_jobs)
+ indices, _, _ = calculate_nearest_neighbors(pc_coords, K=k_adj + 1, n_jobs=n_jobs, exact_k=True)
# Calculate scrublet-like doublet score
k_d = is_doublet[indices].sum(axis = 1)
diff --git a/pegasus/tools/nearest_neighbors.py b/pegasus/tools/nearest_neighbors.py
index 37965183..4ecc3169 100644
--- a/pegasus/tools/nearest_neighbors.py
+++ b/pegasus/tools/nearest_neighbors.py
@@ -34,26 +34,61 @@ def calculate_nearest_neighbors(
K: int = 100,
n_jobs: int = -1,
method: str = "hnsw",
+ exact_k: bool = False,
M: int = 20,
efC: int = 200,
efS: int = 200,
random_state: int = 0,
full_speed: int = False,
dist: str = 'l2',
-):
- """Calculate nearest neighbors
- X is the sample by feature matrix
- Return K -1 neighbors, the first one is the point itself and thus omitted.
- If nsample <= 1000, method is set to "sklearn" for exact KNN search
+) -> Tuple[List[int], List[float], int]:
+ """Find K nearest neighbors for each data point in the matrix and return the indices and distances arrays.
+
+ K is determined by min(K, int(sqrt(X.shape[0]))) if exact_k == False.
+
+ Parameters
+ ----------
+
+ X : `np.array`
+ An array of n_samples by n_features.
+ K : `int`, optional (default: 100)
+ Number of neighbors, including the data point itself. If K is None, determine K by sqrt(X.shape[0]).
+ n_jobs : `int`, optional (default: -1)
+ Number of threads to use. -1 refers to using all physical CPU cores.
+ method: `str`, optional (default: 'hnsw')
+ Choosing from 'hnsw' for approximate nearest neighbor search or 'sklearn' for exact nearest neighbor search. If X.shape[0] <= 1000, method will be automatically set to "sklearn" for exact KNN search
+ exact_k: `bool`, optional (default: False)
+ If True, use exactly the K passed to the function; otherwise K is determined as min(K, sqrt(X.shape[0])).
+ M, efC, efS: `int`, optional (20, 200, 200)
+ HNSW algorithm parameters.
+ random_state: `int`, optional (default: 0)
+ Random seed for random number generator.
+ full_speed: `bool`, optional (default: False)
+ If full_speed, use multiple threads in constructing hnsw index. However, the kNN results are not reproducible. If not full_speed, use only one thread to make sure results are reproducible.
+ dist: `str`, optional (default: 'l2')
+ Distance metric to use. By default, use squared L2 distance. Available options, 'l2', inner product 'ip' or cosine similarity 'cosine'.
+
+ Returns
+ -------
+
+ kNN indices array, distances array and adjusted K.
+
+ Examples
+ --------
+ >>> indices, distances, K = calculate_nearest_neighbors(X)
"""
nsample = X.shape[0]
if nsample <= 1000:
method = "sklearn"
- if nsample < K:
- logger.warning(f"Warning: in calculate_nearest_neighbors, number of samples = {nsample} < K = {K}!\n Set K to {nsample}.")
- K = nsample
+ k_rot = int(nsample ** 0.5) # rot, rule of thumb
+ if (K is None) or (K > k_rot and (not exact_k)):
+ K = k_rot
+ logger.info(f"in calculate_nearest_neighbors, K is adjusted to {K}.")
+
+ if K == 1:
+ return np.zeros(0, dtype=int), np.zeros(0, dtype=np.float32), K
n_jobs = eff_n_jobs(n_jobs)
@@ -84,18 +119,17 @@ def calculate_nearest_neighbors(
distances = np.sqrt(distances, out=distances)
else:
assert method == "sklearn"
- print("haha, exact!")
knn = NearestNeighbors(
n_neighbors=K - 1, n_jobs=n_jobs
) # eliminate the first neighbor, which is the node itself
knn.fit(X)
distances, indices = knn.kneighbors()
- return indices, distances
+ return indices, distances, K
def knn_is_cached(
- data: MultimodalData, indices_key: str, distances_key: str, K: int
+ data: MultimodalData, indices_key: str, distances_key: str, K: int, exact_k: bool
) -> bool:
return (
(indices_key in data.obsm)
@@ -114,12 +148,15 @@ def get_neighbors(
n_jobs: int = -1,
random_state: int = 0,
full_speed: bool = False,
- use_cache: bool = True,
+ use_cache: bool = False,
dist: str = "l2",
method: str = "hnsw",
-) -> Tuple[List[int], List[float]]:
+ exact_k: bool = False,
+) -> Tuple[List[int], List[float], int]:
"""Find K nearest neighbors for each data point and return the indices and distances arrays.
+ K is determined by min(K, int(sqrt(data.shape[0]))) if exact_k == False.
+
Parameters
----------
@@ -137,37 +174,44 @@ def get_neighbors(
Random seed for random number generator.
full_speed: `bool`, optional (default: False)
If full_speed, use multiple threads in constructing hnsw index. However, the kNN results are not reproducible. If not full_speed, use only one thread to make sure results are reproducible.
- use_cache: `bool`, optional (default: True)
+ use_cache: `bool`, optional (default: False)
If use_cache and found cached knn results, will not recompute.
dist: `str`, optional (default: 'l2')
- Distance metric to use. By default, use squared L2 distance. Available options, inner product 'ip' or cosine similarity 'cosine'.
+ Distance metric to use. By default, use squared L2 distance. Available options: 'l2', inner product 'ip', or cosine similarity 'cosine'.
method: `str`, optional (default: 'hnsw')
- Choosing from 'hnsw' for approximate nearest neighbor search or 'sklearn' for exact nearest neighbor search.
+ Choosing from 'hnsw' for approximate nearest neighbor search or 'sklearn' for exact nearest neighbor search.
+ exact_k: `bool`, optional (default: False)
+ If True, use exactly the K passed to the function; otherwise K is determined as min(K, int(sqrt(data.shape[0]))).
Returns
-------
- kNN indices and distances arrays.
+ kNN indices array, distances array, and adjusted K.
Examples
--------
- >>> indices, distances = tools.get_neighbors(data)
+ >>> indices, distances, K = tools.get_neighbors(data)
"""
-
rep = update_rep(rep)
indices_key = rep + "_knn_indices"
distances_key = rep + "_knn_distances"
+ k_rot = int(data.shape[0] ** 0.5) # rot, rule of thumb
+ if (K is None) or (K > k_rot and (not exact_k)):
+ K = k_rot
+ logger.info(f"in get_neighbors, K is adjusted to {K}.")
+
if use_cache and knn_is_cached(data, indices_key, distances_key, K):
indices = data.obsm[indices_key]
distances = data.obsm[distances_key]
logger.info("Found cached kNN results, no calculation is required.")
else:
- indices, distances = calculate_nearest_neighbors(
+ indices, distances, _ = calculate_nearest_neighbors(
X_from_rep(data, rep, n_comps),
K=K,
n_jobs=eff_n_jobs(n_jobs),
method=method,
+ exact_k=exact_k,
random_state=random_state,
full_speed=full_speed,
dist=dist,
@@ -177,7 +221,7 @@ def get_neighbors(
data.obsm[distances_key] = distances
data.register_attr(distances_key, "knn")
- return indices, distances
+ return indices, distances, K
def get_symmetric_matrix(csr_mat: "csr_matrix") -> "csr_matrix":
@@ -239,14 +283,17 @@ def neighbors(
n_jobs: int = -1,
random_state: int = 0,
full_speed: bool = False,
- use_cache: bool = True,
+ use_cache: bool = False,
dist: str = "l2",
method: str = "hnsw",
+ exact_k: bool = False,
) -> None:
"""Compute k nearest neighbors and affinity matrix, which will be used for diffmap and graph-based community detection algorithms.
The kNN calculation uses `hnswlib `_ introduced by [Malkov16]_.
+ K is determined by min(K, int(sqrt(data.shape[0]))) if exact_k == False.
+
Parameters
----------
@@ -272,16 +319,19 @@ def neighbors(
* If ``True``, use multiple threads in constructing ``hnsw`` index. However, the kNN results are not reproducible.
* Otherwise, use only one thread to make sure results are reproducible.
- use_cache: ``bool``, optional, default: ``True``
+ use_cache: ``bool``, optional, default: ``False``
* If ``True`` and found cached knn results, Pegasus will use cached results and do not recompute.
* Otherwise, compute kNN irrespective of caching status.
dist: ``str``, optional (default: ``"l2"``)
- Distance metric to use. By default, use squared L2 distance. Available options, inner product ``"ip"`` or cosine similarity ``"cosine"``.
+ Distance metric to use. By default, use squared L2 distance. Available options: ``"l2"``, inner product ``"ip"``, or cosine similarity ``"cosine"``.
method: ``str``, optional (default: ``"hnsw"``)
Choose from "hnsw" or "sklearn". "hnsw" uses HNSW algorithm for approximate nearest neighbor search and "sklearn" uses sklearn package for exact nearest neighbor search.
+ exact_k: ``bool``, optional (default: ``False``)
+ If True, use exactly the K passed to the function; otherwise K is determined as min(K, int(sqrt(data.shape[0]))).
+
Returns
-------
``None``
@@ -311,6 +361,7 @@ def neighbors(
use_cache=use_cache,
dist=dist,
method=method,
+ exact_k=exact_k,
)
# calculate affinity matrix
@@ -417,7 +468,7 @@ def calc_kBET(
attr_values = data.obs[attr].values.copy()
attr_values.categories = range(nbatch)
- indices, distances = get_neighbors(
+ indices, distances, K = get_neighbors(
data, K=K, rep=rep, n_jobs=n_jobs, random_state=random_state, use_cache=use_cache,
)
knn_indices = np.concatenate(
@@ -508,7 +559,7 @@ def calc_kSIM(
assert attr in data.obs
nsample = data.shape[0]
- indices, distances = get_neighbors(
+ indices, distances, K = get_neighbors(
data, K=K, rep=rep, n_jobs=n_jobs, random_state=random_state, use_cache=use_cache,
)
knn_indices = np.concatenate(
diff --git a/pegasus/tools/nmf.py b/pegasus/tools/nmf.py
index 4cc6f270..7ce54e65 100644
--- a/pegasus/tools/nmf.py
+++ b/pegasus/tools/nmf.py
@@ -418,7 +418,7 @@ def integrative_nmf(
continue
clusters = np.argmax(H_new, axis=1) # Assign cluster
- indices, _ = calculate_nearest_neighbors(H_new, K=20, n_jobs=n_jobs, random_state=seeds[i]) # KNN with K=20
+ indices, _, _ = calculate_nearest_neighbors(H_new, K=20, n_jobs=n_jobs, random_state=seeds[i]) # KNN with K=20
clusters, csum = _refine_cluster(clusters, indices, n_components) # Refine cluster
csums.append(csum)
ids_by_clusts.append(np.argsort(clusters, kind='stable'))
diff --git a/pegasus/tools/visualization.py b/pegasus/tools/visualization.py
index c1dab252..1d660ada 100644
--- a/pegasus/tools/visualization.py
+++ b/pegasus/tools/visualization.py
@@ -276,6 +276,7 @@ def umap(
dens_var_shift: float = 0.1,
n_jobs: int = -1,
full_speed: bool = False,
+ use_cache: bool = True,
random_state: int = 0,
out_basis: str = "umap",
) -> None:
@@ -334,6 +335,9 @@ def umap(
* If ``True``, use multiple threads in constructing ``hnsw`` index. However, the kNN results are not reproducible.
* Otherwise, use only one thread to make sure results are reproducible.
+ use_cache: ``bool``, optional, default: ``True``
+ If ``True`` and cached kNN results are found, use them instead of recomputing.
+
random_state: ``int``, optional, default: ``0``
Random seed set for reproducing results.
@@ -354,11 +358,7 @@ def umap(
rep = update_rep(rep)
X = X_from_rep(data, rep, rep_ncomps)
- if data.shape[0] < n_neighbors:
- logger.warning(f"Warning: Number of samples = {data.shape[0]} < K = {n_neighbors}!\n Set K to {data.shape[0]}.")
- n_neighbors = data.shape[0]
-
- knn_indices, knn_dists = get_neighbors(data, K = n_neighbors, rep = rep, n_jobs = n_jobs, random_state = random_state, full_speed = full_speed)
+ knn_indices, knn_dists, n_neighbors = get_neighbors(data, K = n_neighbors, rep = rep, n_jobs = n_jobs, random_state = random_state, full_speed = full_speed, use_cache = use_cache)
knn_indices = np.insert(knn_indices[:, 0 : n_neighbors - 1], 0, range(data.shape[0]), axis=1)
knn_dists = np.insert(knn_dists[:, 0 : n_neighbors - 1], 0, 0.0, axis=1)
@@ -539,6 +539,7 @@ def net_umap(
select_K: int = 25,
select_alpha: float = 1.0,
full_speed: bool = False,
+ use_cache: bool = True,
net_alpha: float = 0.1,
polish_learning_rate: float = 10.0,
polish_n_epochs: int = 30,
@@ -612,6 +613,9 @@ def net_umap(
* If ``True``, use multiple threads in constructing ``hnsw`` index. However, the kNN results are not reproducible.
* Otherwise, use only one thread to make sure results are reproducible.
+ use_cache: ``bool``, optional, default: ``True``
+ If ``True`` and cached kNN results are found, use them instead of recomputing.
+
net_alpha: ``float``, optional, default: ``0.1``
L2 penalty (regularization term) parameter of the deep regressor.
@@ -641,7 +645,7 @@ def net_umap(
rep = update_rep(rep)
n_jobs = eff_n_jobs(n_jobs)
- knn_indices, knn_dists = get_neighbors(data, K = select_K, rep = rep, n_jobs = n_jobs, random_state = random_state, full_speed = full_speed)
+ knn_indices, knn_dists, select_K = get_neighbors(data, K = select_K, rep = rep, n_jobs = n_jobs, random_state = random_state, full_speed = full_speed, use_cache = use_cache)
selected = select_cells(
knn_dists,
@@ -659,7 +663,7 @@ def net_umap(
ds_indices_key = "ds_" + rep + "_knn_indices" # ds refers to down-sampling
ds_distances_key = "ds_" + rep + "_knn_distances"
- indices, distances = calculate_nearest_neighbors(
+ indices, distances, n_neighbors = calculate_nearest_neighbors(
X,
K=n_neighbors,
n_jobs=n_jobs,
@@ -702,7 +706,7 @@ def net_umap(
data.obsm["X_" + out_basis + "_pred"] = Y_init
- knn_indices, knn_dists = get_neighbors(data, K = n_neighbors, rep = rep, n_jobs = n_jobs, random_state = random_state, full_speed = full_speed)
+ knn_indices, knn_dists, n_neighbors = get_neighbors(data, K = n_neighbors, rep = rep, n_jobs = n_jobs, random_state = random_state, full_speed = full_speed, use_cache = use_cache)
knn_indices = np.insert(knn_indices[:, 0 : n_neighbors - 1], 0, range(data.shape[0]), axis=1)
knn_dists = np.insert(knn_dists[:, 0 : n_neighbors - 1], 0, 0.0, axis=1)
@@ -735,6 +739,7 @@ def net_fle(
rep: str = "diffmap",
K: int = 50,
full_speed: bool = False,
+ use_cache: bool = True,
target_change_per_node: float = 2.0,
target_steps: int = 5000,
is3d: bool = False,
@@ -778,6 +783,9 @@ def net_fle(
* If ``True``, use multiple threads in constructing ``hnsw`` index. However, the kNN results are not reproducible.
* Otherwise, use only one thread to make sure results are reproducible.
+ use_cache: ``bool``, optional, default: ``True``
+ If ``True`` and cached kNN results are found, use them instead of recomputing.
+
target_change_per_node: ``float``, optional, default: ``2.0``
Target change per node to stop ForceAtlas2.
@@ -845,7 +853,7 @@ def net_fle(
full_speed=full_speed,
)
- knn_indices, knn_dists = get_neighbors(data, K = select_K, rep = rep, n_jobs = n_jobs, random_state = random_state, full_speed = full_speed)
+ knn_indices, knn_dists, select_K = get_neighbors(data, K = select_K, rep = rep, n_jobs = n_jobs, random_state = random_state, full_speed = full_speed, use_cache = use_cache)
selected = select_cells(
knn_dists,
@@ -860,7 +868,7 @@ def net_fle(
ds_indices_key = "ds_" + rep + "_knn_indices"
ds_distances_key = "ds_" + rep + "_knn_distances"
- indices, distances = calculate_nearest_neighbors(
+ indices, distances, K = calculate_nearest_neighbors(
X, K=K, n_jobs=n_jobs, random_state=random_state, full_speed=full_speed
)
data.uns[ds_indices_key] = indices
From af26b006ce8b5e38c649c525afb315b806ed84ff Mon Sep 17 00:00:00 2001
From: Bo Li
Date: Sat, 17 Jun 2023 22:20:11 -0700
Subject: [PATCH 25/57] Fixed a bug in nearest_neighbors
---
pegasus/tools/nearest_neighbors.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/pegasus/tools/nearest_neighbors.py b/pegasus/tools/nearest_neighbors.py
index 4ecc3169..2f2170b3 100644
--- a/pegasus/tools/nearest_neighbors.py
+++ b/pegasus/tools/nearest_neighbors.py
@@ -88,7 +88,7 @@ def calculate_nearest_neighbors(
logger.info(f"in calculate_nearest_neighbors, K is adjusted to {K}.")
if K == 1:
- return np.zeros(0, dtype=int), np.zeros(0, dtype=np.float32), K
+ return np.zeros((nsample, 0), dtype=int), np.zeros((nsample, 0), dtype=np.float32), K
n_jobs = eff_n_jobs(n_jobs)
From ea5f47b25768c5c4ccf861c5a6b11fff5328bd4d Mon Sep 17 00:00:00 2001
From: Bo Li
Date: Sun, 18 Jun 2023 23:29:43 -0700
Subject: [PATCH 26/57] Fixed several bugs
---
pegasus/tools/nearest_neighbors.py | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/pegasus/tools/nearest_neighbors.py b/pegasus/tools/nearest_neighbors.py
index 2f2170b3..fd95e4ca 100644
--- a/pegasus/tools/nearest_neighbors.py
+++ b/pegasus/tools/nearest_neighbors.py
@@ -129,7 +129,7 @@ def calculate_nearest_neighbors(
def knn_is_cached(
- data: MultimodalData, indices_key: str, distances_key: str, K: int, exact_k: bool
+ data: MultimodalData, indices_key: str, distances_key: str, K: int
) -> bool:
return (
(indices_key in data.obsm)
@@ -350,7 +350,7 @@ def neighbors(
# calculate kNN
rep = update_rep(rep)
- indices, distances = get_neighbors(
+ indices, distances, K = get_neighbors(
data,
K=K,
rep=rep,
From 3bca8bbaa7ce398f3fd21627f016e316ab177d62 Mon Sep 17 00:00:00 2001
From: Yiming Yang
Date: Wed, 21 Jun 2023 01:46:28 -0700
Subject: [PATCH 27/57] don't test louvain
---
tests/run_pipeline.sh | 6 +++---
tests/test_pipeline.py | 1 -
2 files changed, 3 insertions(+), 4 deletions(-)
diff --git a/tests/run_pipeline.sh b/tests/run_pipeline.sh
index c6e0d2eb..f3f2243f 100644
--- a/tests/run_pipeline.sh
+++ b/tests/run_pipeline.sh
@@ -1,14 +1,14 @@
pegasus aggregate_matrix tests/data/count_matrix.csv tests/aggr
if [ -f "tests/aggr.zarr.zip" ]; then
- pegasus cluster -p 2 --min-genes 500 --max-genes 6000 --percent-mito 20.0 --output-filtration-results --output-h5ad --output-loom --plot-filtration-results --plot-hvf --correct-batch-effect --nmf --louvain --leiden --tsne --umap --fle --infer-doublets --dbl-cluster-attr louvain_labels --calc-signature-scores cell_cycle_mouse tests/aggr.zarr.zip tests/result
+ pegasus cluster -p 2 --min-genes 500 --max-genes 6000 --percent-mito 20.0 --output-filtration-results --output-h5ad --output-loom --plot-filtration-results --plot-hvf --correct-batch-effect --nmf --leiden --tsne --umap --fle --infer-doublets --dbl-cluster-attr louvain_labels --calc-signature-scores cell_cycle_mouse tests/aggr.zarr.zip tests/result
fi
if [ -f "tests/result.zarr.zip" ]; then
pegasus de_analysis -p 2 --labels louvain_labels --t --fisher tests/result.zarr.zip tests/result.de.xlsx
pegasus annotate_cluster --markers mouse_immune,mouse_brain tests/result.zarr.zip tests/result.anno.txt
pegasus plot compo --groupby leiden_labels --condition Channel tests/result.zarr.zip tests/result.compo.pdf
- pegasus plot scatter --basis umap --attributes louvain_labels,Channel tests/result.zarr.zip tests/result.louvain_labels.umap.pdf
+ pegasus plot scatter --basis umap --attributes leiden_labels,Channel tests/result.zarr.zip tests/result.leiden_labels.umap.pdf
pegasus plot scatter --basis tsne --attributes leiden_labels,Channel tests/result.zarr.zip tests/result.leiden_labels.tsne.pdf
- pegasus plot scatter --basis fle --attributes louvain_labels,Channel tests/result.zarr.zip tests/result.louvain_labels.fle.pdf
+ pegasus plot scatter --basis fle --attributes leiden_labels,Channel tests/result.zarr.zip tests/result.leiden_labels.fle.pdf
fi
diff --git a/tests/test_pipeline.py b/tests/test_pipeline.py
index 4b1da926..580bbaf1 100644
--- a/tests/test_pipeline.py
+++ b/tests/test_pipeline.py
@@ -27,7 +27,6 @@ def test_qc(self):
def test_clustering(self):
self.assertEqual(self.data.obsm['pca_harmony_knn_indices'].shape, (1043, 99), "KNN graph shape differs!")
self.assertEqual(self.data.obsm['pca_harmony_knn_distances'].shape, (1043, 99), "KNN distance matrix shape differs!")
- self.assertIn('louvain_labels', self.data.obs.columns, "Louvain result is lost!")
self.assertIn('leiden_labels', self.data.obs.columns, "Leiden result is lost!")
def test_doublet_detection(self):
From fa1d0a11040e0d1d165659ab4ec0b6b4be64e03b Mon Sep 17 00:00:00 2001
From: Yiming Yang
Date: Fri, 23 Jun 2023 00:17:31 -0700
Subject: [PATCH 28/57] Add --exact-K option to pegasus cluster command
---
pegasus/commands/Clustering.py | 2 ++
pegasus/pipeline/pipeline.py | 1 +
tests/run_pipeline.sh | 4 ++--
tests/test_pipeline.py | 4 ++--
4 files changed, 7 insertions(+), 4 deletions(-)
diff --git a/pegasus/commands/Clustering.py b/pegasus/commands/Clustering.py
index 9d8611b2..7b6d7e90 100644
--- a/pegasus/commands/Clustering.py
+++ b/pegasus/commands/Clustering.py
@@ -68,6 +68,7 @@ class Clustering(Base):
--nmf-n Number of NMF components. IF iNMF is used for batch correction, this parameter also sets iNMF number of components. [default: 20]
--knn-K Number of nearest neighbors for building kNN graph. [default: 100]
+ --exact-K If use exactly the K passed to the function; otherwise K is determined as min(K, sqrt(X.shape[0])).
--knn-full-speed For the sake of reproducibility, we only run one thread for building kNN indices. Turn on this option will allow multiple threads to be used for index building. However, it will also reduce reproducibility due to the racing between multiple threads.
--kBET Calculate kBET.
@@ -210,6 +211,7 @@ def execute(self):
"nmf": self.args["--nmf"],
"nmf_n": int(self.args["--nmf-n"]),
"K": int(self.args["--knn-K"]),
+ "exact_K": self.args["--exact-K"],
"full_speed": self.args["--knn-full-speed"],
"kBET": self.args["--kBET"],
"kBET_batch": self.args["--kBET-batch"],
diff --git a/pegasus/pipeline/pipeline.py b/pegasus/pipeline/pipeline.py
index 34626967..5ede69b3 100644
--- a/pegasus/pipeline/pipeline.py
+++ b/pegasus/pipeline/pipeline.py
@@ -92,6 +92,7 @@ def analyze_one_modality(unidata: UnimodalData, output_name: str, is_raw: bool,
tools.neighbors(
unidata,
K=kwargs["K"],
+ exact_k=kwargs["exact_K"],
rep=dim_key,
n_jobs=kwargs["n_jobs"],
random_state=kwargs["random_state"],
diff --git a/tests/run_pipeline.sh b/tests/run_pipeline.sh
index f3f2243f..8c516f2f 100644
--- a/tests/run_pipeline.sh
+++ b/tests/run_pipeline.sh
@@ -1,11 +1,11 @@
pegasus aggregate_matrix tests/data/count_matrix.csv tests/aggr
if [ -f "tests/aggr.zarr.zip" ]; then
- pegasus cluster -p 2 --min-genes 500 --max-genes 6000 --percent-mito 20.0 --output-filtration-results --output-h5ad --output-loom --plot-filtration-results --plot-hvf --correct-batch-effect --nmf --leiden --tsne --umap --fle --infer-doublets --dbl-cluster-attr louvain_labels --calc-signature-scores cell_cycle_mouse tests/aggr.zarr.zip tests/result
+ pegasus cluster -p 2 --min-genes 500 --max-genes 6000 --percent-mito 20.0 --output-filtration-results --output-h5ad --output-loom --plot-filtration-results --plot-hvf --exact-K --correct-batch-effect --nmf --leiden --tsne --umap --fle --infer-doublets --dbl-cluster-attr leiden_labels --calc-signature-scores cell_cycle_mouse tests/aggr.zarr.zip tests/result
fi
if [ -f "tests/result.zarr.zip" ]; then
- pegasus de_analysis -p 2 --labels louvain_labels --t --fisher tests/result.zarr.zip tests/result.de.xlsx
+ pegasus de_analysis -p 2 --labels leiden_labels --t --fisher tests/result.zarr.zip tests/result.de.xlsx
pegasus annotate_cluster --markers mouse_immune,mouse_brain tests/result.zarr.zip tests/result.anno.txt
pegasus plot compo --groupby leiden_labels --condition Channel tests/result.zarr.zip tests/result.compo.pdf
pegasus plot scatter --basis umap --attributes leiden_labels,Channel tests/result.zarr.zip tests/result.leiden_labels.umap.pdf
diff --git a/tests/test_pipeline.py b/tests/test_pipeline.py
index 580bbaf1..3e8b7f3b 100644
--- a/tests/test_pipeline.py
+++ b/tests/test_pipeline.py
@@ -59,9 +59,9 @@ def test_annotation(self):
def test_plot(self):
self.assertIn('result.compo.pdf', os.listdir('tests'), "Composition plot is lost!")
- self.assertIn('result.louvain_labels.umap.pdf', os.listdir('tests'), "UMAP plot is lost!")
+ self.assertIn('result.leiden_labels.umap.pdf', os.listdir('tests'), "UMAP plot is lost!")
self.assertIn('result.leiden_labels.tsne.pdf', os.listdir('tests'), "tSNE plot is lost!")
- self.assertIn('result.louvain_labels.fle.pdf', os.listdir('tests'), 'FLE plot is lost!')
+ self.assertIn('result.leiden_labels.fle.pdf', os.listdir('tests'), 'FLE plot is lost!')
def test_output(self):
data_h5ad = pg.read_input("tests/result.mm10-rna.h5ad")
From 7c6aa5b910494962003998e114a982a363b15cb1 Mon Sep 17 00:00:00 2001
From: Bo Li
Date: Sat, 8 Jul 2023 23:20:56 -0700
Subject: [PATCH 29/57] fixed a typo
---
pegasus/annotate_cluster/human_immune_cell_markers.json | 3 +--
1 file changed, 1 insertion(+), 2 deletions(-)
diff --git a/pegasus/annotate_cluster/human_immune_cell_markers.json b/pegasus/annotate_cluster/human_immune_cell_markers.json
index fd2afc23..feb55f0a 100644
--- a/pegasus/annotate_cluster/human_immune_cell_markers.json
+++ b/pegasus/annotate_cluster/human_immune_cell_markers.json
@@ -132,8 +132,7 @@
"comment" : "Markers derived from Immune Cell Atlas PBMC data"
}
]
- },
-
+ }
]
}
},
From 7d689b25bf0487e5871749e0fcb1e230a600090b Mon Sep 17 00:00:00 2001
From: Bo Li
Date: Wed, 12 Jul 2023 00:47:23 -0400
Subject: [PATCH 30/57] Fixed a typo
---
pegasus/annotate_cluster/human_lung_cell_markers.json | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/pegasus/annotate_cluster/human_lung_cell_markers.json b/pegasus/annotate_cluster/human_lung_cell_markers.json
index 9166aa1a..8149071b 100644
--- a/pegasus/annotate_cluster/human_lung_cell_markers.json
+++ b/pegasus/annotate_cluster/human_lung_cell_markers.json
@@ -90,7 +90,7 @@
},
{
- "name" : "Submucosal gland serous cel",
+ "name" : "Submucosal gland serous cell",
"markers" : [
{
"genes" : ["PRR4+", "TCN1+", "C6orf58+", "PRB3+", "LPO+", "PRB1+", "PRH2+", "PRH1+", "ODAM+"],
From 606c1653ef7298ad3d4e30b14a1c059abbaa34af Mon Sep 17 00:00:00 2001
From: Yiming Yang
Date: Tue, 18 Jul 2023 00:31:42 -0700
Subject: [PATCH 31/57] Raise warning instead of exception for attributes not
in data
---
pegasus/plotting/plot_library.py | 21 +++++++++++++++++++--
1 file changed, 19 insertions(+), 2 deletions(-)
diff --git a/pegasus/plotting/plot_library.py b/pegasus/plotting/plot_library.py
index f5f236e8..fb1083a6 100644
--- a/pegasus/plotting/plot_library.py
+++ b/pegasus/plotting/plot_library.py
@@ -152,6 +152,25 @@ def scatter(
elif not is_list_like(attrs):
attrs = [attrs]
+ # Select only valid attributes
+ attrs_filt = []
+ attrs_drop = []
+ for attr in attrs:
+ if (attr == '_all') or (attr in data.obs) or (attr in data.var_names) or ('@' in attr):
+ if not '@' in attr:
+ attrs_filt.append(attr)
+ else:
+ obsm_key, sep, component = attr.partition("@")
+ if (sep != "@") or (obsm_key not in data.obsm) or (not component.isdigit()):
+ attrs_drop.append(attr)
+ else:
+ attrs_filt.append(attr)
+ else:
+ attrs_drop.append(attr)
+ attrs = attrs_filt
+ if len(attrs_drop) > 0:
+ print(f"Warning: Attributes {attrs_drop} are not in data.obs, data.var_names or data.obsm!")
+
if isinstance(basis, str):
basis = [basis]
if isinstance(components, tuple):
@@ -236,8 +255,6 @@ def scatter(
values = slicing(data.X, col = loc)
else:
obsm_key, sep, component = attr.partition("@")
- if (sep != "@") or (obsm_key not in data.obsm) or (not component.isdigit()):
- raise KeyError(f"{attr} is not in data.obs, data.var_names or data.obsm!")
values = data.obsm[obsm_key][:, int(component)]
selected = restr_obj.get_satisfied(data, attr)
From 6d7b831addfca69660b852badfbf974af1af6449 Mon Sep 17 00:00:00 2001
From: Bo Li
Date: Thu, 20 Jul 2023 01:19:04 -0700
Subject: [PATCH 32/57] Added a third manual correction option (threshold) for
doublet detection; Separate LEC from VEC for human_lung_cell_markers.json;
Updated human immune and mouse immune markers for B and plasma cells
---
.../human_immune_cell_markers.json | 45 +++++-------
.../human_lung_cell_markers.json | 37 +++++-----
.../mouse_immune_cell_markers.json | 73 ++++++++++++++++++-
pegasus/data_files/human_lung.gmt | 5 +-
pegasus/tools/doublet_detection.py | 27 +++++--
5 files changed, 132 insertions(+), 55 deletions(-)
diff --git a/pegasus/annotate_cluster/human_immune_cell_markers.json b/pegasus/annotate_cluster/human_immune_cell_markers.json
index feb55f0a..f7eff6fa 100644
--- a/pegasus/annotate_cluster/human_immune_cell_markers.json
+++ b/pegasus/annotate_cluster/human_immune_cell_markers.json
@@ -141,9 +141,9 @@
"name" : "Natural killer cell",
"markers" : [
{
- "genes" : ["GNLY+", "KLRF1+", "KLRD1+", "TRDC+", "IL2RB+", "KLRC1+"],
+ "genes" : ["GNLY+", "KLRF1+", "KLRD1+", "TRDC+", "IL2RB+", "KLRC1+", "NCR1+"],
"weight" : 0.6,
- "comment" : "General NK cell markers also cover some T cells; derived from Immune Cell Atlas PBMC, BMMC and CB-MNC data"
+ "comment" : "General NK cell markers also cover some T cells; derived from Immune Cell Atlas PBMC, BMMC and CB-MNC data; Added NCR1, a pan NK cell marker"
},
{
"genes" : ["NCAM1+", "FCGR3A+"],
@@ -188,20 +188,15 @@
"name" : "B cell",
"markers" : [
{
- "genes" : ["CD19+", "MS4A1+", "CD79A+", "CD79B+"],
- "weight" : 0.7,
- "comment" : "CD19, CD20 and CD79"
- },
- {
- "genes" : ["BANK1+", "BLK+"],
- "weight" : 0.2,
- "comment" : "Extra B cell markers"
+ "genes" : ["MS4A1+", "CD79A+", "CD79B+", "CD19+", "BANK1+", "TNFRSF13C+", "CD22+", "BLK+", "FCRLA+", "HLA-DOB+"],
+ "weight" : 0.9,
+ "comment" : "Human and mouse shared B cell markers derived from Immune Cell Atlas PBMC, BMMC and CB-MNC data; TNFRSF13C (BAFF receptor); CD79A, CD79B, CD19, BLK, FCRLA and HLA-DOB are also expressed in Plasma cells; CD79B in addition is expressed in CD16+ monocytes & HSCs; BANK1 & BLK are expressed higher in memory B"
},
{
- "genes" : ["CD74+", "HLA-DRA+", "HLA-DRB1+", "HLA-DPA1+", "HLA-DPB1+", "HLA-DQA1+", "HLA-DQB1+"],
+ "genes" : ["LINC00926+", "VPREB3+"],
"weight" : 0.1,
- "comment" : "MHC II"
- }
+ "comment" : "B cell markers derived from Immune Cell Atlas PBMC, BMMC and CB-MNC data"
+ }
],
"subtypes" : {
"title" : "B cell subtype markers",
@@ -235,9 +230,9 @@
"name" : "Naive B cell",
"markers" : [
{
- "genes" : ["IGHD+", "TCL1A+", "FCER2+"],
+ "genes" : ["IGHD+", "TCL1A+", "FCER2+", "IL4R+", "PLPP5+"],
"weight" : 1.0,
- "comments" : "markers for naive B cell, collected from Fig. 4B of Massoni-Badosa et al. Tonsil Atlas paper. Validated using ICA pbmc data"
+ "comments" : "Markers for naive B cell derived from Immune Cell Atlas PBMC, BMMC and CB-MNC data; IGHD & FCER2 are shared with mouse"
}
]
},
@@ -245,9 +240,9 @@
"name" : "Memory B cell",
"markers" : [
{
- "genes" : ["CD27+", "TNFRSF13B+"],
+ "genes" : ["IGHA1+", "IGHG1+", "CD27+", "TNFRSF13B+", "CLECL1P+", "AIM2+", "LGALS1+", "CRIP1+"],
"weight" : 1.0,
- "comments" : "markers for memory B cell, collected from Fig. 4B of Massoni-Badosa et al. Tonsil Atlas paper. Validated using ICA pbmc data"
+ "comments" : "Markers for memory B cell derived from Immune Cell Atlas PBMC, BMMC and CB-MNC data"
}
]
}
@@ -295,23 +290,23 @@
"name" : "Plasma cell",
"markers" : [
{
- "genes" : ["CD38+", "XBP1+", "CD27+", "SLAMF7+"],
- "weight" : 0.4,
- "comment" : "important markers"
+ "genes" : ["TNFRSF17+", "PRDM1+", "SLAMF7+", "IRF4+", "SDC1+"],
+ "weight" : 0.5,
+ "comment" : "Human and mouse shared markers derived from Immune Cell Atlas PBMC, BMMC and CB-MNC data; TNFRSF17 (BCMA), PRDM1 (BLIMP1); SDC1 is highly expressed in BMMC but not PBMC"
},
{
- "genes" : ["TNFRSF17+", "TNFRSF13B+"],
+ "genes" : ["IGHA1+", "IGHG1+", "TNFRSF13B+"],
"weight" : 0.2,
- "comment" : "TNF-receptor superfamily"
+ "comment" : "Markers expressed by both plasma and memory B cells, derived from Immune Cell Atlas PBMC, BMMC and CB-MNC data; IGHA1 & IGHG1 indicate class switch"
},
{
- "genes" : ["IGHA1+", "IGHG1+"],
+ "genes" : ["CD38+", "ABCB9+", "CHPF+", "PLAAT2+"],
"weight" : 0.2,
- "comment" : "class switching happened"
+ "comment" : "Human-specific plasma markers derived from Immune Cell Atlas PBMC, BMMC and CB-MNC data; PLAAT2 is highly expressed in PBMC but not BMMC"
},
{
"genes" : ["MS4A1-"],
- "weight" : 0.2,
+ "weight" : 0.1,
"comment" : "not B cell, doi: https://doi.org/10.1182/bloodadvances.2017004481, long-live plasma can still express CD19"
}
]
diff --git a/pegasus/annotate_cluster/human_lung_cell_markers.json b/pegasus/annotate_cluster/human_lung_cell_markers.json
index 8149071b..d138a8d9 100644
--- a/pegasus/annotate_cluster/human_lung_cell_markers.json
+++ b/pegasus/annotate_cluster/human_lung_cell_markers.json
@@ -117,14 +117,9 @@
"name" : "Vascular endothelial cell",
"markers" : [
{
- "genes" : ["PECAM1+", "CLDN5+", "CDH5+", "ERG+"],
- "weight" : 0.2,
- "comment" : "Markers for endothelial cells, from Schupp et al. Circulation 2021"
- },
- {
- "genes" : [ "ENG+", "PCDH17+", "CLEC14A+", "ESAM+", "ITM2A+", "BMPR2+", "FLT1+", "ADGRL4+", "SLCO2A1+", "AQP1+", "EPAS1+", "ADGRL2+", "IFI27+"],
- "weight" : 0.8,
- "comment" : "Common vascular EC markers from Schupp et al. Circulation 2021 and ADGRL2"
+ "genes" : ["PECAM1+", "CLDN5+", "CDH5+", "ERG+", "ICAM2+", "CLEC14A+", "ITM2A+", "ADGRL4+", "SLCO2A1+", "IFI27+"],
+ "weight" : 1.0,
+ "comment" : "Markers for vascular endothelial cells, validated using Travaglini et al. Nature 2020 and Schupp et al. Circulation 2021 data"
}
],
"subtypes" : {
@@ -179,21 +174,27 @@
"comment" : "Markers inferred from Travaglini et al. Nature 2020 and confirmed by Schupp et al. Circulation 2021"
}
]
- },
- {
- "name" : "EC lymphatic",
- "markers" : [
- {
- "genes" : ["CCL21+", "TFF3+", "PDPN+", "PROX1+", "GPM6A+", "SEMA3D+", "TBX1+", "RELN+"],
- "weight" : 1.0,
- "comment" : "Markers inferred from Travaglini et al. Nature 2020 and confirmed by Schupp et al. Circulation 2021"
- }
- ]
}
]
}
},
+ {
+ "name" : "Lymphatic endothelial cell",
+ "markers" : [
+ {
+ "genes" : ["PECAM1+", "CLDN5+", "ERG+", "CDH5+"],
+ "weight" : 0.2,
+ "comment" : "Pan endothelial cell markers, validated using Travaglini et al. Nature 2020 and Schupp et al. Circulation 2021 data"
+ },
+ {
+ "genes" : ["CCL21+", "TFF3+", "PDPN+", "PROX1+", "LYVE1+", "FLT4+", "GPM6A+", "SEMA3D+", "TBX1+", "RELN+"],
+ "weight" : 0.8,
+ "comment" : "LEC markers inferred from Travaglini et al. Nature 2020 and confirmed by Schupp et al. Circulation 2021"
+ }
+ ]
+ },
+
{
diff --git a/pegasus/annotate_cluster/mouse_immune_cell_markers.json b/pegasus/annotate_cluster/mouse_immune_cell_markers.json
index bb3ac649..4fe9acae 100644
--- a/pegasus/annotate_cluster/mouse_immune_cell_markers.json
+++ b/pegasus/annotate_cluster/mouse_immune_cell_markers.json
@@ -82,9 +82,65 @@
"name" : "B cell",
"markers" : [
{
- "genes" : ["Cd19+", "Ms4a1+", "Cd79a+", "Cd79b+", "Ebf1+", "Pax5+", "Fcmr+", "Bank1+"],
- "weight" : 1.0,
- "comment" : "B cell markers from Kaptein et al. Cell 2022 and Hurskainen et al. Nat. Commun. 2021"
+ "genes" : ["Cd79a+", "Cd79b+", "Ms4a1+", "Cd19+", "H2-Ob+", "Tnfrsf13c+", "Bank1+", "Blk+", "Fcrla+", "Cd22+"],
+ "weight" : 0.91,
+ "comment" : "Human and mouse shared B cell markers; validated using Tabula Muris marrow (Tabula Muris Consortium et al. Nature 2020), Kaptein et al. Cell 2022 and Hurskainen et al. Nat. Commun. 2021 data; Ebf1, Pax5 and Fcmr are good markers for mouse lung and liver but not marrow, tissue-specific marker?"
+ },
+ {
+ "genes" : ["Cxcr5+"],
+ "weight" : 0.09,
+ "comment" : "CXCR5 is constantly expressed by mature B cells and helps to guide B cells to follicle; fDC expresses CXCL13, the ligand for CXCR5; this marker expresses lowly in human but higher in mouse "
+ }
+ ],
+ "subtypes" : {
+ "title" : "B cell subtype markers",
+ "cell_types" : [
+ {
+ "name" : "Naive B cell",
+ "markers" : [
+ {
+ "genes" : ["Ighd+", "Fcer2a+", "Vpreb3+", "Fcrl1+", "Chchd10+"],
+ "weight" : 1.0,
+ "comments" : "Markers for naive B cell derived from Tabula Muris marrow (Tabula Muris Consortium et al. Nature 2020) & Kaptein et al. Cell 2022; Ighd & Fcer2a are shared with human"
+ }
+ ]
+ },
+ {
+ "name" : "Memory B cell",
+ "markers" : [
+ {
+ "genes" : ["Zbtb32+", "C130026I21Rik+", "Pdlim1+", "Hepacam2+", "Igha+"],
+ "weight" : 0.8,
+ "comments" : "Markers for memory B cell derived from Tabula Muris marrow (Tabula Muris Consortium et al. Nature 2020) data; need to check and add Ighg related genes"
+ },
+ {
+ "genes" : ["Nt5e+", "Cd80+", "Fas+", "Pdcd1lg2+"],
+ "weight" : 0.2,
+ "comments" : "Traditional mouse memory B cell validated by Tabula Muris marrow (Tabula Muris Consortium et al. Nature 2020) data; all lowly expressed; Nt5e (5' Nucleotidase/CD73), Fas (CD95), Pdcd1lg2 (PD-L2/CD273)"
+ }
+ ]
+ }
+ ]
+ }
+ },
+
+ {
+ "name" : "Plasma cell",
+ "markers" : [
+ {
+ "genes" : ["Sdc1+", "Slamf7+", "Tnfrsf17+", "Irf4+", "Prdm1+"],
+ "weight" : 0.5,
+ "comment" : "Plasma cell markers shared with human and validated using Tabula Muris marrow data (Tabula Muris Consortium et al. Nature 2020)"
+ },
+ {
+ "genes" : ["Derl3+", "Chst1+", "Eaf2+", "Oosp1+", "Cacna1s+"],
+ "weight" : 0.4,
+ "comment" : "Mouse-specific plasma cell markers validated using Tabula Muris marrow data (Tabula Muris Consortium et al. Nature 2020)"
+ },
+ {
+ "genes" : ["Xbp1+", "Slc3a2+", "Ly6k+"],
+ "weight" : 0.1,
+ "comment" : "Traditional mouse plasma markers (not ideal) validated using Tabula Muris marrow data (Tabula Muris Consortium et al. Nature 2020); Xbp1 & Slc3a2 (CD98) expressed highest in plasma but also expressed in other cell types"
}
]
},
@@ -127,6 +183,17 @@
]
},
+ {
+ "name" : "Macrophage",
+ "markers" : [
+ {
+ "genes" : ["Cd14+", "Ms4a7+", "Cx3cr1+", "Trem2+", "Hpgds+"],
+ "weight" : 1.0,
+ "comment" : "Macrophage markers from Kaptein et al. Cell 2022"
+ }
+ ]
+ },
+
{
"name" : "Conventional type 1 dendritic cell",
"markers" : [
diff --git a/pegasus/data_files/human_lung.gmt b/pegasus/data_files/human_lung.gmt
index 3559ccb9..6f488d48 100644
--- a/pegasus/data_files/human_lung.gmt
+++ b/pegasus/data_files/human_lung.gmt
@@ -1,12 +1,13 @@
Epithelial Epithelial markers from HTAPP paper KRT8 KRT18 EPCAM CD24
-Endothelial Endothelial shared markers from Xing et al. Science Advances 2021, Tony et al. Nature 2021 and Schupp et al. Circulation 2021 PECAM1 CLDN5 CDH5 ERG
+VEC Vascular endothelial cell markers from Travaglini et al. Nature 2020 and and Schupp et al. Circulation 2021 PECAM1 CLDN5 CDH5 ERG ICAM2 CLEC14A ITM2A ADGRL4 SLCO2A1 IFI27
+LEC Lymphatic endothelial cell markers from Travaglini et al. Nature 2020 and and Schupp et al. Circulation 2021 PECAM1 CLDN5 ERG CDH5 CCL21 TFF3 PDPN PROX1 LYVE1 FLT4 GPM6A SEMA3D TBX1 RELN
Fibroblast Fibroblast/Myofibroblast shared markers from Travaglini et al. COL1A1 COL1A2 PDGFRA ELN BGN
Macrophage Macro CD68 CD163 C1QA MRC1 MS4A6A MSR1 MERTK
SMC SMC from Muus et al., Braga et al. and Schupp et al. MYH11 TAGLN ACTG2 CNN1 PLN
Pericyte Pericyte from Schupp et al. and Travaglini et al. TRPC6 CSPG4 FAM162B GJA4 GJC1 HIGD1B CDH6 LAMC3 FHL5
T cell T cell markers CD3D CD3E CD3G TRAC
B cell B cell markers CD19 MS4A1 CD79A CD79B
-Plasma cell Plasma cell markers CD38 XBP1 CD27 SLAMF7 TNFRSF17 TNFRSF13B
+Plasma cell Plasma cell markers from ICA TNFRSF17 PRDM1 SLAMF7 IRF4 SDC1 IGHA1 IGHG1 TNFRSF13B CD38 ABCB9 CHPF PLAAT2
Mast cell Mast cell markers KIT CPA3 TPSB2 TPSAB1 AREG RGS1 RGS2
ProNeu Pro-Neutrophil markers validated using 10x public whole blood dataset DEFA3 DEFA4 AZU1 MS4A3 ELANE SLPI CEACAM6 RNASE3 PRTN3 MPO AC104232.1 CTSG
PreNeu Pre-Neutrophil markers validated using 10x public whole blood dataset LTF LCN2 MMP8 CRISP3 CAMP PGLYRP1 CD177 HP
diff --git a/pegasus/tools/doublet_detection.py b/pegasus/tools/doublet_detection.py
index 5737af2d..f9f69393 100644
--- a/pegasus/tools/doublet_detection.py
+++ b/pegasus/tools/doublet_detection.py
@@ -267,7 +267,7 @@ def _run_scrublet(
If True, plot diagnostic histograms. Each sample would have a figure consisting of 4 panels showing histograms of doublet scores for observed cells (panel 1, density in log scale), simulated doublets (panel 2, density in log scale), KDE plot (panel 3) and signed curvature plot (panel 4) of log doublet scores for simulated doublets.
manual_correction: ``str``, optional, default: ``None``
- If present, use human guide provided in manual_correction to select threshold. Currently support 'peak' and 'expected'. 'peak' means cutting at the center of the peak and 'expected' means cutting at the expected doublet rate.
+ If present, use human guide provided in manual_correction to select threshold. Currently supports 'peak', 'expected' and a numeric threshold. 'peak' means cutting at the center of the peak and 'expected' means cutting at the expected doublet rate. If the guide is neither 'peak' nor 'expected', it is converted to float and used as the user-specified threshold.
Returns
--------
@@ -420,6 +420,8 @@ def _run_scrublet(
threshold = np.exp(x[maxima_by_x[-1]])
elif manual_correction == "expected":
threshold = threshold_theory
+ else:
+ threshold = float(manual_correction)
data.obs["doublet_score"] = obs_scores.astype(np.float32)
data.obs["pred_dbl"] = obs_scores > threshold
@@ -474,7 +476,7 @@ def infer_doublets(
data: MultimodalData,
channel_attr: Optional[str] = None,
clust_attr: Optional[str] = None,
- raw_mat_key: Optional[str] = 'counts',
+ raw_mat_key: Optional[str] = None,
min_cell: Optional[int] = 100,
expected_doublet_rate: Optional[float] = None,
sim_doublet_ratio: Optional[float] = 2.0,
@@ -501,6 +503,9 @@ def infer_doublets(
clust_attr: ``str``, optional, default: None
Attribute indicating cluster labels. If set, estimate proportion of doublets in each cluster and statistical significance.
+ raw_mat_key: ``str``, optional, default: None
+ The key for raw count matrix. By default, Pegasus will first try "counts" and then try "raw.X"
+
min_cell: ``int``, optional, default: 100
Minimum number of cells per sample to calculate doublet scores. For samples having less than 'min_cell' cells, doublet score calculation will be skipped.
@@ -529,7 +534,7 @@ def infer_doublets(
If not None, plot diagnostic histograms using ``plot_hist`` as the prefix. If `channel_attr` is None, ``plot_hist.dbl.png`` is generated; Otherwise, ``plot_hist.channel_name.dbl.png`` files are generated. Each figure consists of 4 panels showing histograms of doublet scores for observed cells (panel 1, density in log scale), simulated doublets (panel 2, density in log scale), KDE plot (panel 3) and signed curvature plot (panel 4) of log doublet scores for simulated doublets. Each plot contains two dashed lines. The red dashed line represents the theoretical cutoff (calucalted based on number of cells and 10x doublet table) and the black dashed line represents the cutof inferred from the data.
manual_correction: ``str``, optional, default: ``None``
- Use human guide to correct doublet threshold for certain channels. This is string representing a comma-separately list. Each item in the list represent one sample and the sample name and correction guide are separated using ':'. The orrection guides supported are 'peak' and 'expected'. 'peak' means cutting at the center of the peak and 'expected' means cutting at the expected doublet rate. If only one sample available, use '' as the sample name.
+ Use human guide to correct doublet threshold for certain channels. This is a string representing a comma-separated list. Each item in the list represents one sample, and the sample name and correction guide are separated using ':'. The correction guides supported are 'peak', 'expected' and threshold. 'peak' means cutting at the center of the peak; 'expected' means cutting at the expected doublet rate; threshold is the user-specified doublet threshold; if the guide is neither 'peak' nor 'expected', pegasus will try to convert the string into float and use it as doublet threshold. If only one sample is available, there is no need to specify the sample name.
Returns
-------
@@ -545,6 +550,11 @@ def infer_doublets(
>>> pg.infer_doublets(data, channel_attr = 'Channel', clust_attr = 'Annotation')
"""
assert data.get_modality() == "rna"
+
+ if raw_mat_key is None:
+ raw_mat_key = 'counts'
+ if raw_mat_key not in data.list_keys():
+ raw_mat_key = 'raw.X'
try:
rawX = data.get_matrix(raw_mat_key)
except ValueError:
@@ -554,10 +564,13 @@ def infer_doublets(
mancor = {}
if manual_correction is not None:
- for item in manual_correction.split(','):
- name, action = item.split(':')
- mancor[name] = action
-
+ if channel_attr is None:
+ mancor[''] = manual_correction
+ else:
+ for item in manual_correction.split(','):
+ name, action = item.split(':')
+ mancor[name] = action
+
if channel_attr is None:
if data.shape[0] >= min_cell:
fig = _run_scrublet(data, raw_mat_key, expected_doublet_rate = expected_doublet_rate, sim_doublet_ratio = sim_doublet_ratio, \
From 6990c47f099262439f0788dfcc078e2517055e1d Mon Sep 17 00:00:00 2001
From: Yiming Yang
Date: Fri, 21 Jul 2023 11:46:04 -0700
Subject: [PATCH 33/57] release notes for v1.8.0
---
docs/api/index.rst | 1 +
docs/conf.py | 8 ++++----
docs/index.rst | 2 +-
docs/release_notes.rst | 5 +++++
docs/release_notes/version_1_8.rst | 16 ++++++++++++++++
5 files changed, 27 insertions(+), 5 deletions(-)
create mode 100644 docs/release_notes/version_1_8.rst
diff --git a/docs/api/index.rst b/docs/api/index.rst
index 4311a096..540af78e 100644
--- a/docs/api/index.rst
+++ b/docs/api/index.rst
@@ -85,6 +85,7 @@ Cluster Algorithms
cluster
louvain
leiden
+ split_one_cluster
spectral_louvain
spectral_leiden
diff --git a/docs/conf.py b/docs/conf.py
index 5ca8a416..8ec108a1 100644
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -23,22 +23,22 @@
# -- Project information -----------------------------------------------------
project = "Pegasus"
-copyright = "2022 Genentech, Inc. All rights reserved."
+copyright = "2023 Genentech, Inc. All rights reserved."
author = (
"Yiming Yang, Joshua Gould and Bo Li"
)
# The short X.Y version
-version = "1.7"
+version = "1.8"
# The full version, including alpha/beta/rc tags
-release = "1.7.1"
+release = "1.8.0"
# -- General configuration ---------------------------------------------------
# If your documentation needs a minimal Sphinx version, state it here.
#
-#needs_sphinx = '1.7'
+#needs_sphinx = '1.8'
# Add any Sphinx extension module names here, as strings. They can be
# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
diff --git a/docs/index.rst b/docs/index.rst
index 9289f3c9..b1893bf2 100644
--- a/docs/index.rst
+++ b/docs/index.rst
@@ -10,7 +10,7 @@
Release Highlights in Current Stable
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-.. include:: release_notes/version_1_7.rst
+.. include:: release_notes/version_1_8.rst
.. toctree::
:maxdepth: 1
diff --git a/docs/release_notes.rst b/docs/release_notes.rst
index 407a6f5c..4adc7f23 100644
--- a/docs/release_notes.rst
+++ b/docs/release_notes.rst
@@ -6,6 +6,11 @@ Release Notes
.. note::
Also see the release notes of `PegasusIO `__.
+Version 1.8
+~~~~~~~~~~~~~
+
+.. include:: release_notes/version_1_8.rst
+
Version 1.7
~~~~~~~~~~~~~
diff --git a/docs/release_notes/version_1_8.rst b/docs/release_notes/version_1_8.rst
new file mode 100644
index 00000000..e81da4a4
--- /dev/null
+++ b/docs/release_notes/version_1_8.rst
@@ -0,0 +1,16 @@
+1.8.0 :small:`July 21, 2023`
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+**New Feature and Improvement**
+
+* Update ``human_immune`` and ``human_lung`` marker sets.
+* Add ``mouse_liver`` marker set.
+* Add `split_one_cluster <./api/pegasus.split_one_cluster.html>`_ function to subcluster one cluster into a specified number of subclusters.
+* Update **neighbors** function to set ``use_cache=False`` by default, and adjust K to ``min(K, int(sqrt(n_samples)))``. [PR `272 `_]
+* In **infer_doublets** function, argument ``manual_correction`` now accepts a float number threshold specified by users for cut-off. [PR `275 `_]
+
+**Bug Fix**
+
+* Fix divide by zero issue in ``integrative_nmf`` function. [PR `258 `_]
+* Compatibility with Pandas v2.0. [PR `261 `_]
+* Allow ``infer_doublets`` to use any count matrix with key name specified by users. [PR `268 `_ Thanks to `Donghoon Lee `_]
From 8714e025aa80d4f7d83ecee7fe1dcabf649f3a37 Mon Sep 17 00:00:00 2001
From: Yiming Yang
Date: Fri, 21 Jul 2023 11:53:19 -0700
Subject: [PATCH 34/57] build wheel for py3.10
---
setup.py | 1 +
wheel_build/build_wheel_for_linux.sh | 2 +-
2 files changed, 2 insertions(+), 1 deletion(-)
diff --git a/setup.py b/setup.py
index b353e796..6b679097 100644
--- a/setup.py
+++ b/setup.py
@@ -39,6 +39,7 @@
"Programming Language :: Python :: 3.7",
"Programming Language :: Python :: 3.8",
"Programming Language :: Python :: 3.9",
+ "Programming Language :: Python :: 3.10",
],
keywords="single cell/nucleus genomics analysis",
packages=find_packages(),
diff --git a/wheel_build/build_wheel_for_linux.sh b/wheel_build/build_wheel_for_linux.sh
index 792a1d48..a5cceadb 100755
--- a/wheel_build/build_wheel_for_linux.sh
+++ b/wheel_build/build_wheel_for_linux.sh
@@ -11,7 +11,7 @@ function repair_wheel {
fi
}
-declare -a PythonVersions=("cp37-cp37m" "cp38-cp38" "cp39-cp39")
+declare -a PythonVersions=("cp37-cp37m" "cp38-cp38" "cp39-cp39" "cp310-cp310")
for val in ${PythonVersions[@]}; do
/opt/python/$val/bin/pip install -r /src/requirements.txt
From 0010ca4b20fd2c9e37afa48bc42861e2f2ee8173 Mon Sep 17 00:00:00 2001
From: Bo Li
Date: Sat, 29 Jul 2023 16:20:24 -0700
Subject: [PATCH 35/57] Updated human lung and mouse immune markers
---
pegasus/annotate_cluster/human_lung_cell_markers.json | 2 +-
.../annotate_cluster/mouse_immune_cell_markers.json | 11 +++++++++++
pegasus/tools/preprocessing.py | 2 +-
3 files changed, 13 insertions(+), 2 deletions(-)
diff --git a/pegasus/annotate_cluster/human_lung_cell_markers.json b/pegasus/annotate_cluster/human_lung_cell_markers.json
index d138a8d9..42cc1eb8 100644
--- a/pegasus/annotate_cluster/human_lung_cell_markers.json
+++ b/pegasus/annotate_cluster/human_lung_cell_markers.json
@@ -193,7 +193,7 @@
"comment" : "LEC markers inferred from Travaglini et al. Nature 2020 and confirmed by Schupp et al. Circulation 2021"
}
]
- }
+ },
diff --git a/pegasus/annotate_cluster/mouse_immune_cell_markers.json b/pegasus/annotate_cluster/mouse_immune_cell_markers.json
index 4fe9acae..fdcced1c 100644
--- a/pegasus/annotate_cluster/mouse_immune_cell_markers.json
+++ b/pegasus/annotate_cluster/mouse_immune_cell_markers.json
@@ -296,6 +296,17 @@
"comment" : "Mast cell markers inferred from Matsumara et al. Nat. Commun. 2022"
}
]
+ },
+
+ {
+ "name" : "Red blood cell",
+ "markers" : [
+ {
+ "genes" : ["Hba-a1+", "Hba-a2+", "Hbb-bs+", "Hbb-bt+"],
+ "weight" : 1.0,
+ "comment" : "Hemoglobin genes"
+ }
+ ]
}
]
}
diff --git a/pegasus/tools/preprocessing.py b/pegasus/tools/preprocessing.py
index d3c1d6c8..dd105f74 100644
--- a/pegasus/tools/preprocessing.py
+++ b/pegasus/tools/preprocessing.py
@@ -346,7 +346,7 @@ def _set_target_mat(data, X, target_matrix, select, base_matrix, suffix):
if target_matrix in data.matrices:
logger.warning(f"{target_matrix} is in data's matrices. It will be rewritten.")
- data.add_matrix(target_matrix, X)
+ data.update_matrix(target_matrix, X)
if select:
data.select_matrix(target_matrix)
From 3a758300e47089fd281b107eaa4deff362a64c56 Mon Sep 17 00:00:00 2001
From: Bo Li
Date: Sun, 6 Aug 2023 13:59:15 -0700
Subject: [PATCH 36/57] Renamed Megakaryocyte to Platelet
---
pegasus/annotate_cluster/human_immune_cell_markers.json | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/pegasus/annotate_cluster/human_immune_cell_markers.json b/pegasus/annotate_cluster/human_immune_cell_markers.json
index f7eff6fa..16459b95 100644
--- a/pegasus/annotate_cluster/human_immune_cell_markers.json
+++ b/pegasus/annotate_cluster/human_immune_cell_markers.json
@@ -403,7 +403,7 @@
},
{
- "name" : "Erythroid cells",
+ "name" : "Erythroid cell",
"markers" : [
{
"genes" : ["GYPA+"],
@@ -429,7 +429,7 @@
},
{
- "name" : "Megakaryocyte",
+ "name" : "Platelet",
"markers" : [
{
"genes" : ["PF4+", "PPBP+", "GP5+"],
From 3cf7e67db5f3d4a5fab16cf033f1b7cf8d7e533a Mon Sep 17 00:00:00 2001
From: Jayaram Kancherla
Date: Tue, 15 Aug 2023 16:32:48 -0700
Subject: [PATCH 37/57] only convert sparse matrices to numpy arrays
---
pegasus/tools/signature_score.py | 7 ++++++-
1 file changed, 6 insertions(+), 1 deletion(-)
diff --git a/pegasus/tools/signature_score.py b/pegasus/tools/signature_score.py
index 5030162f..1f055d39 100644
--- a/pegasus/tools/signature_score.py
+++ b/pegasus/tools/signature_score.py
@@ -1,4 +1,5 @@
import numpy as np
+import scipy.sparse as sp
import pandas as pd
from typing import Dict, List, Union
@@ -89,7 +90,11 @@ def calculate_z_score(
if not _check_and_calc_sig_background(data, n_bins):
return None
- z_score_mat = (data.X.toarray().astype(np.float32) - data.var["mean"].values.astype(np.float32) - data.obsm["sig_bkg_mean"][:, data.var["bins"].cat.codes].astype(np.float32)) / data.obsm["sig_bkg_std"][:, data.var["bins"].cat.codes].astype(np.float32)
+ mat = data.X
+ if sp.issparse(mat):
+ mat = mat.toarray()
+
+ z_score_mat = (mat.astype(np.float32) - data.var["mean"].values.astype(np.float32) - data.obsm["sig_bkg_mean"][:, data.var["bins"].cat.codes].astype(np.float32)) / data.obsm["sig_bkg_std"][:, data.var["bins"].cat.codes].astype(np.float32)
return z_score_mat
From 56b07b5936673bf0c2164fde8eb0cfffd6d2a8c5 Mon Sep 17 00:00:00 2001
From: Yiming Yang
Date: Mon, 21 Aug 2023 14:53:57 +0800
Subject: [PATCH 38/57] Make this operation a function
---
pegasus/plotting/plot_library.py | 19 ++-----------------
pegasus/plotting/plot_utils.py | 21 +++++++++++++++++++++
2 files changed, 23 insertions(+), 17 deletions(-)
diff --git a/pegasus/plotting/plot_library.py b/pegasus/plotting/plot_library.py
index fb1083a6..0b07ada1 100644
--- a/pegasus/plotting/plot_library.py
+++ b/pegasus/plotting/plot_library.py
@@ -31,6 +31,7 @@
_generate_categories,
_plot_corners,
_plot_spots,
+ _get_valid_attrs,
)
@@ -153,23 +154,7 @@ def scatter(
attrs = [attrs]
# Select only valid attributes
- attrs_filt = []
- attrs_drop = []
- for attr in attrs:
- if (attr == '_all') or (attr in data.obs) or (attr in data.var_names) or ('@' in attr):
- if not '@' in attr:
- attrs_filt.append(attr)
- else:
- obsm_key, sep, component = attr.partition("@")
- if (sep != "@") or (obsm_key not in data.obsm) or (not component.isdigit()):
- attrs_drop.append(attr)
- else:
- attrs_filt.append(attr)
- else:
- attrs_drop.append(attr)
- attrs = attrs_filt
- if len(attrs_drop) > 0:
- print(f"Warning: Attributes {attrs_drop} are not in data.obs, data.var_names or data.obsm!")
+ attrs = _get_valid_attrs(data, attrs)
if isinstance(basis, str):
basis = [basis]
diff --git a/pegasus/plotting/plot_utils.py b/pegasus/plotting/plot_utils.py
index 0c61cb58..48766ef9 100644
--- a/pegasus/plotting/plot_utils.py
+++ b/pegasus/plotting/plot_utils.py
@@ -435,3 +435,24 @@ def _plot_spots(x: np.ndarray, y: np.ndarray, c: Union[str, np.ndarray], s: floa
spots.set_clim(vmin, vmax)
ax.add_collection(spots)
return spots
+
+
+def _get_valid_attrs(data:Union[MultimodalData, UnimodalData], attrs: List[str]) -> List[str]:
+ attrs_filt = []
+ attrs_drop = []
+ for attr in attrs:
+ if (attr == '_all') or (attr in data.obs) or (attr in data.var_names) or ('@' in attr):
+ if not '@' in attr:
+ attrs_filt.append(attr)
+ else:
+ obsm_key, sep, component = attr.partition("@")
+ if (sep != "@") or (obsm_key not in data.obsm) or (not component.isdigit()):
+ attrs_drop.append(attr)
+ else:
+ attrs_filt.append(attr)
+ else:
+ attrs_drop.append(attr)
+ if len(attrs_drop) > 0:
+ print(f"Warning: Attributes {attrs_drop} are not in data.obs, data.var_names or data.obsm!")
+
+ return attrs_filt
From bc630ec545a288b655112840cf201773f10281bd Mon Sep 17 00:00:00 2001
From: Yiming Yang
Date: Thu, 24 Aug 2023 08:07:04 +0800
Subject: [PATCH 39/57] fix a typo in human_lung cell marker JSON file
---
pegasus/annotate_cluster/human_lung_cell_markers.json | 4 +---
1 file changed, 1 insertion(+), 3 deletions(-)
diff --git a/pegasus/annotate_cluster/human_lung_cell_markers.json b/pegasus/annotate_cluster/human_lung_cell_markers.json
index d138a8d9..5b54986a 100644
--- a/pegasus/annotate_cluster/human_lung_cell_markers.json
+++ b/pegasus/annotate_cluster/human_lung_cell_markers.json
@@ -193,9 +193,7 @@
"comment" : "LEC markers inferred from Travaglini et al. Nature 2020 and confirmed by Schupp et al. Circulation 2021"
}
]
- }
-
-
+ },
{
"name" : "Smooth muscle cell",
From eb7b766439474b8d9525cc3970b589239ec0bf7b Mon Sep 17 00:00:00 2001
From: Yiming Yang
Date: Thu, 24 Aug 2023 08:23:25 +0800
Subject: [PATCH 40/57] update docs
---
docs/conf.py | 2 +-
docs/release_notes/version_1_8.rst | 5 +++++
2 files changed, 6 insertions(+), 1 deletion(-)
diff --git a/docs/conf.py b/docs/conf.py
index 8ec108a1..25a5726a 100644
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -31,7 +31,7 @@
# The short X.Y version
version = "1.8"
# The full version, including alpha/beta/rc tags
-release = "1.8.0"
+release = "1.8.1"
# -- General configuration ---------------------------------------------------
diff --git a/docs/release_notes/version_1_8.rst b/docs/release_notes/version_1_8.rst
index e81da4a4..4b0947ae 100644
--- a/docs/release_notes/version_1_8.rst
+++ b/docs/release_notes/version_1_8.rst
@@ -1,3 +1,8 @@
+1.8.1 :small:`August 23, 2023`
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+* Bug fix in cell marker JSON files for ``infer_cell_types`` function.
+
1.8.0 :small:`July 21, 2023`
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
From 5c186ef32676fcbdc371e33e2c7d552af3173ffb Mon Sep 17 00:00:00 2001
From: Yiming Yang
Date: Thu, 5 Oct 2023 23:07:42 -0700
Subject: [PATCH 41/57] Expose online_batch_size in nmf and integrative_nmf
functions
---
pegasus/tools/nmf.py | 10 ++++++++++
1 file changed, 10 insertions(+)
diff --git a/pegasus/tools/nmf.py b/pegasus/tools/nmf.py
index 7ce54e65..8cf2cb08 100644
--- a/pegasus/tools/nmf.py
+++ b/pegasus/tools/nmf.py
@@ -81,6 +81,7 @@ def nmf(
alpha_H: float = 0.0,
l1_ratio_H: float = 0.0,
fp_precision: str = "float",
+ online_chunk_size: int = 5000,
n_jobs: int = -1,
random_state: int = 0,
) -> None:
@@ -137,6 +138,9 @@ def nmf(
fp_precision: ``str``, optional, default: ``float``
The numeric precision on the results. Choose from ``float`` and ``double``.
+ online_chunk_size: ``int``, optional, default: ``5000``
+ The chunk / mini-batch size for online learning. Only works when ``mode='online'``.
+
n_jobs : `int`, optional (default: -1)
Number of threads to use. -1 refers to using all physical CPU cores.
@@ -189,6 +193,7 @@ def nmf(
alpha_H=alpha_H,
l1_ratio_H=l1_ratio_H,
fp_precision=fp_precision,
+ online_chunk_size=online_chunk_size,
)
data.uns["nmf_features"] = features # record which feature to use
@@ -285,6 +290,7 @@ def integrative_nmf(
use_gpu: bool = False,
lam: float = 5.0,
fp_precision: str = "float",
+ online_chunk_size: int = 5000,
n_jobs: int = -1,
random_state: int = 0,
quantile_norm: bool = True,
@@ -334,6 +340,9 @@ def integrative_nmf(
fp_precision: ``str``, optional, default: ``float``
The numeric precision on the results. Choose from ``float`` and ``double``.
+ online_chunk_size: ``int``, optional, default: ``5000``
+ The chunk / mini-batch size for online learning. Only works when ``mode='online'``.
+
n_jobs : `int`, optional (default: -1)
Number of threads to use. -1 refers to using all physical CPU cores.
@@ -394,6 +403,7 @@ def integrative_nmf(
use_gpu=use_gpu,
lam=lam,
fp_precision=fp_precision,
+ online_chunk_size=online_chunk_size,
)
# Implementation of algo 3, quantile normalization
From c55d2e4a6b88ccfd231c0aa25e04d935f2256bde Mon Sep 17 00:00:00 2001
From: Yiming Yang
Date: Thu, 5 Oct 2023 23:27:39 -0700
Subject: [PATCH 42/57] no longer support Python 3.7
---
.github/workflows/ci-test.yml | 2 +-
requirements.txt | 1 -
setup.py | 1 -
3 files changed, 1 insertion(+), 3 deletions(-)
diff --git a/.github/workflows/ci-test.yml b/.github/workflows/ci-test.yml
index 41d69118..cbcfaabe 100644
--- a/.github/workflows/ci-test.yml
+++ b/.github/workflows/ci-test.yml
@@ -14,7 +14,7 @@ jobs:
strategy:
matrix:
os: [ubuntu-latest]
- python-version: ['3.7', '3.8', '3.9']
+ python-version: ['3.8', '3.9']
steps:
- uses: actions/checkout@v2
diff --git a/requirements.txt b/requirements.txt
index f9a154c7..6341d8da 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -4,7 +4,6 @@ Cython
docopt
demuxEM
hnswlib
-importlib_metadata>=0.7; python_version < '3.8'
psutil
threadpoolctl
joblib>=0.14
diff --git a/setup.py b/setup.py
index 6b679097..a434934e 100644
--- a/setup.py
+++ b/setup.py
@@ -36,7 +36,6 @@
"Topic :: Software Development :: Build Tools",
"Topic :: Scientific/Engineering :: Bio-Informatics",
"Programming Language :: Python :: 3",
- "Programming Language :: Python :: 3.7",
"Programming Language :: Python :: 3.8",
"Programming Language :: Python :: 3.9",
"Programming Language :: Python :: 3.10",
From 4628fa3764cd9b27ec8ec5297ebc5f633b651926 Mon Sep 17 00:00:00 2001
From: Yiming Yang
Date: Thu, 5 Oct 2023 23:29:13 -0700
Subject: [PATCH 43/57] Add Python 3.10 to CI test
---
.github/workflows/ci-test.yml | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/.github/workflows/ci-test.yml b/.github/workflows/ci-test.yml
index cbcfaabe..e66dd54d 100644
--- a/.github/workflows/ci-test.yml
+++ b/.github/workflows/ci-test.yml
@@ -14,7 +14,7 @@ jobs:
strategy:
matrix:
os: [ubuntu-latest]
- python-version: ['3.8', '3.9']
+ python-version: ['3.8', '3.9', '3.10']
steps:
- uses: actions/checkout@v2
From 49a45a39dd820ce54832c6d235a0386ba2f31c78 Mon Sep 17 00:00:00 2001
From: Bo Li
Date: Sat, 7 Oct 2023 13:31:53 -0700
Subject: [PATCH 44/57] Updated marker list
---
.../human_immune_cell_markers.json | 8 +++----
.../mouse_immune_cell_markers.json | 24 ++++---------------
.../mouse_lung_cell_markers.json | 4 ++--
3 files changed, 11 insertions(+), 25 deletions(-)
diff --git a/pegasus/annotate_cluster/human_immune_cell_markers.json b/pegasus/annotate_cluster/human_immune_cell_markers.json
index 16459b95..a4592a6f 100644
--- a/pegasus/annotate_cluster/human_immune_cell_markers.json
+++ b/pegasus/annotate_cluster/human_immune_cell_markers.json
@@ -97,9 +97,9 @@
"name" : "CD8 TCM",
"markers" : [
{
- "genes" : ["CD8A+", "CD8B+", "GZMK+", "DUSP2+", "RGS1+", "CXCR3+", "CMC1+", "TIGIT+", "CST7+", "NKG7+"],
+ "genes" : ["CD8A+", "CD8B+", "GZMK+", "DUSP2+", "LTB+", "CD27+", "IL7R+", "GPR183+", "RGS1+", "CXCR3+"],
"weight" : 1.0,
- "comment" : "Markers derived from Immune Cell Atlas PBMC data; CD8A & CD8B are CD8 markers; GZMK, DUSP2, RGS1 & CXCR3 are specific to TCM; CMC1 & TIGIT are biased towards TCM; CST7 & NKG7 are shared by TCM & TEM"
+ "comment" : "Markers derived from Immune Cell Atlas PBMC data; CD8A & CD8B are CD8 markers; All others are CD8 TCM specific markers"
}
]
},
@@ -107,9 +107,9 @@
"name" : "CD8 TEM",
"markers" : [
{
- "genes" : ["CD8A+", "CD8B+", "FGFBP2+", "GZMB+", "FCGR3A+", "SPON2+", "ADGRG1+", "CX3CR1+", "ASCL2+", "PRSS23+"],
+ "genes" : ["CD8A+", "CD8B+", "FGFBP2+", "GZMB+", "GZMH+", "GNLY+", "PRF1+", "KLRD1+", "FCGR3A+", "TBX21+", "CX3CR1+", "ASCL2+", "SPON2+", "ADGRG1+", "PRSS23+"],
"weight" : 1.0,
- "comment" : "Markers derived from Immune Cell Atlas PBMC data"
+ "comment" : "Markers derived from Immune Cell Atlas PBMC data; FGFBP2, GZMB, GZMH, GNLY, PRF1, KLRD1, FCGR3A are pan TEM markers; TBX21, CX3CR1 and ASCL2 are Temra markers; the last three are purely data driven markers"
}
]
},
diff --git a/pegasus/annotate_cluster/mouse_immune_cell_markers.json b/pegasus/annotate_cluster/mouse_immune_cell_markers.json
index fdcced1c..9b9095eb 100644
--- a/pegasus/annotate_cluster/mouse_immune_cell_markers.json
+++ b/pegasus/annotate_cluster/mouse_immune_cell_markers.json
@@ -53,20 +53,6 @@
}
},
- {
- "name" : "Monocyte",
- "markers" : [
- {
- "genes" : ["Lyz2+", "Lyz1+", "S100a4+", "Itgam+"],
- "weight" : 0.8
- },
- {
- "genes" : ["C1qb+", "C1qc+", "Mrc1+", "Cd52+"],
- "weight" : 0.2
- }
- ]
- },
-
{
"name" : "Immature B cell",
"markers" : [
@@ -162,12 +148,12 @@
},
{
- "name" : "Inflammatory monocyte",
+ "name" : "Classical monocyte",
"markers" : [
{
- "genes" : ["Ly6c2+", "F13a1+", "Ms4a4c+", "Ccr2+", "Gm9733+", "Mcub+"],
+ "genes" : ["Ly6c2+", "F13a1+", "Ccr2+", "Ms4a4c+", "Gm9733+", "Mcub+", "S100a4+"],
"weight" : 1.0,
- "comment" : "Inflammatory monocyte markers inferred from Kaptein et al. Cell 2022 and Hurskainen et al. Nat. Commun. 2021"
+ "comment" : "Classical monocyte markers (except S100a4) inferred from Kaptein et al. Cell 2022 and Hurskainen et al. Nat. Commun. 2021; Ly6c2, F13a1, Ccr2 and Ms4a4c (in Fig. 1b) are Group III markers from Casanova-Acebes et al. Nature 2021. S100a4 is less specific to classical monocyte."
}
]
},
@@ -176,9 +162,9 @@
"name" : "Patrolling monocyte",
"markers" : [
{
- "genes" : ["Ace+", "Eno3+", "Ear2+", "Treml4+", "Spn+", "Fcgr4+", "Lair1+", "Cd300e+", "Cd300ld+", "Adgre4+"],
+ "genes" : ["Eno3+", "Cd300e+", "Ace+", "Treml4+", "Spn+", "Adgre4+", "Lair1+", "Fcgr4+", "Ear2+", "Cd300ld+"],
"weight" : 1.0,
- "comment" : "Patrolling monocyte markers inferred from Kaptein et al. Cell 2022 and Hurskainen et al. Nat. Commun. 2021; Related papers: Domingo-Gonzalez et al. Elife 2020, Thomas et al. Arterioscler Thromb Vasc Biol. 2015, and Schyns et al. Nat. Commun. 2019."
+ "comment" : "Patrolling monocyte markers inferred from Kaptein et al. Cell 2022 and Hurskainen et al. Nat. Commun. 2021; First 6 markers are Group IV markers in Fig. 1b of Casanova-Acebes et al. Nature paper; Eno3, Cd300e, Ace and Lair1 are very specific; Related papers: Domingo-Gonzalez et al. Elife 2020, Thomas et al. Arterioscler Thromb Vasc Biol. 2015, and Schyns et al. Nat. Commun. 2019."
}
]
},
diff --git a/pegasus/annotate_cluster/mouse_lung_cell_markers.json b/pegasus/annotate_cluster/mouse_lung_cell_markers.json
index cb091482..543c3cff 100644
--- a/pegasus/annotate_cluster/mouse_lung_cell_markers.json
+++ b/pegasus/annotate_cluster/mouse_lung_cell_markers.json
@@ -292,9 +292,9 @@
"name" : "Alveolar macrophage",
"markers" : [
{
- "genes" : ["Atp6v0d2+", "Olr1+", "F7+", "Ear1+", "Tfec+", "Gpnmb+", "Lrp12+", "Marco+"],
+ "genes" : ["Ear1+", "Marco+", "Atp6v0d2+", "Olr1+", "F7+", "Tfec+", "Gpnmb+", "Lrp12+", "Pparg+", "Car4+", "Krt19+", "Plet1+"],
"weight" : 1.0,
- "comment" : "Alveolar macrophage markers inferred from Hurskainen et al. Nat. Commun. 2021 data"
+ "comment" : "First 8 markers are Alveolar macrophage markers inferred from Hurskainen et al. Nat. Commun. 2021 data; Ear1 and Marco also show in Casanova-Acebes et al. Nature 2021; Last 4 are markers from Casanova-Acebes et al. Nature 2021 that are validated using Hurskainen et al. Nat. Commun. 2021 data"
}
]
},
From d09113a6f1047e2617d2164efeb42624b27fd64b Mon Sep 17 00:00:00 2001
From: Yiming Yang
Date: Mon, 16 Oct 2023 15:30:06 -0700
Subject: [PATCH 45/57] update readthedocs conf
---
.readthedocs.yml | 33 ++++++++++++++++++++++++++--
wheel_build/build_wheel_for_linux.sh | 2 +-
2 files changed, 32 insertions(+), 3 deletions(-)
diff --git a/.readthedocs.yml b/.readthedocs.yml
index 7e053cac..ae4157db 100644
--- a/.readthedocs.yml
+++ b/.readthedocs.yml
@@ -1,6 +1,35 @@
+# Read the Docs configuration file for Sphinx projects
+# See https://docs.readthedocs.io/en/stable/config-file/v2.html for details
+
+# Required
+version: 2
+
+# Set the OS, Python version and other tools you might need
build:
- image: latest
+ os: ubuntu-22.04
+ tools:
+ python: "3.9"
+ # You can also specify other tool versions:
+ # nodejs: "20"
+ # rust: "1.70"
+ # golang: "1.20"
+
+# Build documentation in the "docs/" directory with Sphinx
sphinx:
configuration: docs/conf.py
+ # You can configure Sphinx to use a different builder, for instance use the dirhtml builder for simpler URLs
+ # builder: "dirhtml"
+ # Fail on all warnings to avoid broken references
+ # fail_on_warning: true
+
+# Optionally build your docs in additional formats such as PDF and ePub
+# formats:
+# - pdf
+# - epub
+
+# Optional but recommended, declare the Python requirements required
+# to build your documentation
+# See https://docs.readthedocs.io/en/stable/guides/reproducible-builds.html
python:
- version: 3.8
+ install:
+ - requirements: docs/requirements.txt
\ No newline at end of file
diff --git a/wheel_build/build_wheel_for_linux.sh b/wheel_build/build_wheel_for_linux.sh
index a5cceadb..98d6dc96 100755
--- a/wheel_build/build_wheel_for_linux.sh
+++ b/wheel_build/build_wheel_for_linux.sh
@@ -11,7 +11,7 @@ function repair_wheel {
fi
}
-declare -a PythonVersions=("cp37-cp37m" "cp38-cp38" "cp39-cp39" "cp310-cp310")
+declare -a PythonVersions=("cp38-cp38" "cp39-cp39" "cp310-cp310" "cp311-cp311")
for val in ${PythonVersions[@]}; do
/opt/python/$val/bin/pip install -r /src/requirements.txt
From 7d56c4937fe3e8c69485ca83d5f4c0ea150d1f22 Mon Sep 17 00:00:00 2001
From: Yiming Yang
Date: Mon, 16 Oct 2023 15:34:51 -0700
Subject: [PATCH 46/57] add support for py3.11
---
setup.py | 1 +
1 file changed, 1 insertion(+)
diff --git a/setup.py b/setup.py
index a434934e..10dac019 100644
--- a/setup.py
+++ b/setup.py
@@ -39,6 +39,7 @@
"Programming Language :: Python :: 3.8",
"Programming Language :: Python :: 3.9",
"Programming Language :: Python :: 3.10",
+ "Programming Language :: Python :: 3.11",
],
keywords="single cell/nucleus genomics analysis",
packages=find_packages(),
From d279339d2fc6071fa0d636c19fcf9b773c7831d7 Mon Sep 17 00:00:00 2001
From: Bo Li
Date: Mon, 30 Oct 2023 00:09:44 -0700
Subject: [PATCH 47/57] Updated heatmap function
---
pegasus/plotting/plot_library.py | 149 ++++++++++++++++++++-----------
1 file changed, 99 insertions(+), 50 deletions(-)
diff --git a/pegasus/plotting/plot_library.py b/pegasus/plotting/plot_library.py
index f5f236e8..2048aa6c 100644
--- a/pegasus/plotting/plot_library.py
+++ b/pegasus/plotting/plot_library.py
@@ -1006,16 +1006,24 @@ def violin(
def heatmap(
data: Union[MultimodalData, UnimodalData, anndata.AnnData],
attrs: Union[str, List[str]],
- groupby: str,
+ groupby: Optional[str] = None,
matkey: Optional[str] = None,
- on_average: bool = True,
- switch_axes: bool = False,
+ gene_zscore: Optional[bool] = True,
+ on_average: Optional[bool] = True,
+ switch_axes: Optional[bool] = False,
attrs_cluster: Optional[bool] = False,
attrs_dendrogram: Optional[bool] = True,
+ attrs_method: Optional[bool] = 'ward',
+ attrs_optimal_ordering: Optional[bool] = True,
+ attrs_labelsize: Optional[float] = 10.0,
+ attrs_labelrotation: Optional[float] = 0.0,
groupby_cluster: Optional[bool] = True,
groupby_dendrogram: Optional[bool] = True,
- attrs_labelsize: Optional[float] = 10.0,
+ groupby_method: Optional[bool] = 'ward',
+ groupby_optimal_ordering: Optional[bool] = True,
+ groupby_precomputed_linkage: Optional[np.array] = None,
groupby_labelsize: Optional[float] = 10.0,
+ groupby_labelrotation: Optional[float] = 0.0,
cbar_labelsize: Optional[float] = 10.0,
panel_size: Tuple[float, float] = (10, 10),
return_fig: Optional[bool] = False,
@@ -1027,7 +1035,6 @@ def heatmap(
Parameters
-----------
-
data: ``AnnData`` or ``MultimodalData`` or ``UnimodalData`` object
Single-cell expression data.
attrs: ``str`` or ``List[str]``
@@ -1035,13 +1042,16 @@ def heatmap(
Cell attributes must exist in ``data.obs`` and must be numeric.
Features must exist in ``data.var``.
By default, attrs are plotted as columns.
- groupby: ``str``
+ groupby: ``str``, optional, default: ``None``
A categorical variable in data.obs that is used to categorize the cells, e.g. Clusters.
By default, data.obs['groupby'] is plotted as rows.
+ If ``None``, use data.obs_names instead.
matkey: ``str``, optional, default: ``None``
If matkey is set, select matrix with matkey as keyword in the current modality. Only works for MultimodalData or UnimodalData objects.
+ gene_zscore: ``bool``, optional, default: ``True``
+ If ``True``, compute and then plot z scores for gene expression.
on_average: ``bool``, optional, default: ``True``
- If ``True``, plot cluster average gene expression (i.e. show a Matrixplot); otherwise, plot a general heatmap.
+ If ``True``, plot cluster average gene expression or z score (i.e. show a Matrixplot); otherwise, plot a general heatmap.
switch_axes: ``bool``, optional, default: ``False``
By default, X axis is for attributes, and Y axis for clusters. If this parameter is ``True``, switch the axes.
Moreover, with ``on_average`` being ``False``, if ``switch_axes`` is ``False``, ``row_cluster`` is enforced to be ``False``; if ``switch_axes`` is ``True``, ``col_cluster`` is enforced to be ``False``.
@@ -1049,14 +1059,28 @@ def heatmap(
Cluster attributes and generate a attribute-wise dendrogram.
attrs_dendrogram: ``bool``, optional, default: ``True``
Only matters if attrs_cluster is True. Show the dendrogram if this option is True.
+ attrs_method: ``str``, optional, default: ``ward``
+ Linkage method for attrs, choosing from ``single``, ``complete``, ``average``, ``weighted``, ``centroid``, ``median`` and ``ward``.
+ attrs_optimal_ordering: ``bool``, optional, default: ``True``
+ Parameter for scipy.cluster.hierarchy.linkage. If ``True``, the attrs linkage matrix will be reordered so that the distance between successive leaves is minima.
+ attrs_labelsize: ``float``, optional, default: 10.0
+ Fontsize for labels of attrs.
+ attrs_labelrotation: ``float``, optional, default: 0.0
+ Rotation of labels for attrs.
groupby_cluster: ``bool``, optional, default: ``True``
Cluster data.obs['groupby'] and generate a cluster-wise dendrogram.
groupby_dendrogram: ``bool``, optional, default: ``True``
Only matters if groupby_cluster is True. Show the dendrogram if this option is True.
- attrs_labelsize: ``float``, optional, default: 10.0
- Fontsize for labels of attrs.
+ groupby_method: ``str``, optional, default: ``ward``
+ Linkage method for groupby, choosing from ``single``, ``complete``, ``average``, ``weighted``, ``centroid``, ``median`` and ``ward``.
+ groupby_optimal_ordering: ``bool``, optional, default: ``True``
+ Parameter for scipy.cluster.hierarchy.linkage. If ``True``, the groupby linkage matrix will be reordered so that the distance between successive leaves is minima.
+ groupby_precomputed_linkage: ``np.array``, optional, default: ``None``
+ Pass a precomputed linkage.
groupby_labelsize: ``float``, optional, default: 10.0
Fontsize for labels of data.obs['groupby'].
+ groupby_labelrotation: ``float``, optional, default: 0.0
+ Rotation of labels for groupby.
cbar_labelsize: ``float``, optional, default: 10.0
Fontsize of the color bar.
panel_size: ``Tuple[float, float]``, optional, default: ``(10, 10)``
@@ -1073,7 +1097,7 @@ def heatmap(
-------
``Figure`` object
- A ``matplotlib.figure.Figure`` object containing the dot plot if ``return_fig == True``
+ A ``matplotlib.figure.Figure`` object containing the heatmap if ``return_fig == True``; otherwise, a ``seaborn.matrix.ClusterGrid`` object is returned.
Examples
--------
@@ -1101,71 +1125,100 @@ def heatmap(
return None
genes.append(key)
- clusters = data.obs[groupby].values
- if not is_categorical_dtype(clusters):
- clusters = pd.Categorical(clusters)
- else:
- clusters = clusters.remove_unused_categories()
- df_list = [pd.DataFrame({'cluster_name': clusters})]
-
+ df_list = []
if len(obs_keys) > 0:
df_list.append(data.obs[obs_keys].reset_index(drop=True))
if len(genes) > 0:
expr_mat = slicing(data[:, genes].X)
+ if gene_zscore:
+ from scipy.stats import zscore
+ expr_mat = zscore(expr_mat, ddof=1)
df_list.append(pd.DataFrame(data=expr_mat, columns=genes))
df = pd.concat(df_list, axis = 1)
- attr_names = df.columns[1:].values
+ df.index = data.obs_names
+ attr_names = df.columns.values
+
+ cluster_ids = df.index
+ cell_colors = None
+ if groupby is not None:
+ cluster_ids = data.obs[groupby].values
+ if not is_categorical_dtype(cluster_ids):
+ cluster_ids = pd.Categorical(cluster_ids)
+ else:
+ cluster_ids = cluster_ids.remove_unused_categories()
+
+ if on_average:
+ if not 'cmap' in kwargs.keys():
+ kwargs['cmap'] = 'Reds'
+ df['cluster_name'] = cluster_ids
+ df = df.groupby('cluster_name').mean()
+ cluster_ids = df.index
+ else:
+ if not groupby_cluster:
+ idx = cluster_ids.argsort(kind = 'mergesort')
+ df = df.iloc[idx, :] # organize df by category order
+ cluster_ids = cluster_ids[idx]
- if on_average:
- if not 'cmap' in kwargs.keys():
- kwargs['cmap'] = 'Reds'
- df = df.groupby('cluster_name').mean()
- cluster_ids = df.index
- else:
- cluster_ids = df.pop('cluster_name').values
- if not groupby_cluster:
- idx = cluster_ids.argsort(kind = 'mergesort')
- df = df.iloc[idx, :] # organize df by category order
- cluster_ids = cluster_ids[idx]
+ cell_colors = np.zeros(df.shape[0], dtype=object)
+ palette = _get_palette(cluster_ids.categories.size)
+
+ for k, cat in enumerate(cluster_ids.categories):
+ cell_colors[cluster_ids == cat] = palette[k]
- cell_colors = np.zeros(df.shape[0], dtype=object)
- palette = _get_palette(cluster_ids.categories.size)
+ cluster_ids = []
+
+
+ from scipy.cluster.hierarchy import linkage
+
+ groupby_linkage = None
+ if groupby_cluster:
+ if groupby_precomputed_linkage is not None:
+ groupby_linkage = groupby_precomputed_linkage
+ else:
+ groupby_linkage = linkage(df, groupby_method, optimal_ordering = groupby_optimal_ordering)
+ attrs_linkage = None
+ if attrs_cluster:
+ attrs_linage = linkage(df.T, attrs_method, optimal_ordering = attrs_optimal_ordering)
- for k, cat in enumerate(cluster_ids.categories):
- cell_colors[cluster_ids == cat] = palette[k]
if not switch_axes:
cg = sns.clustermap(
data=df,
- row_colors=cell_colors if not on_average else None,
+ row_colors=cell_colors,
col_colors=None,
row_cluster=groupby_cluster,
col_cluster=attrs_cluster,
+ row_linkage=groupby_linkage,
+ col_linkage=attrs_linkage,
linewidths=0,
- yticklabels=cluster_ids if on_average else [],
+ yticklabels=cluster_ids,
xticklabels=attr_names,
figsize=panel_size,
**kwargs,
)
cg.ax_heatmap.set_ylabel("")
- if attrs_labelsize is not None:
- cg.ax_heatmap.tick_params(axis='x', labelsize=attrs_labelsize, labelrotation=75)
+ cg.ax_heatmap.tick_params(axis='x', labelsize=attrs_labelsize, labelrotation=attrs_labelrotation)
+ if groupby is None:
+ cg.ax_heatmap.tick_params(axis='y', labelsize=groupby_labelsize, labelrotation=groupby_labelrotation)
else:
cg = sns.clustermap(
data=df.T,
row_colors=None,
- col_colors=cell_colors if not on_average else None,
+ col_colors=cell_colors,
row_cluster=attrs_cluster,
col_cluster=groupby_cluster,
+ row_linkage=attrs_linkage,
+ col_linkage=groupby_linkage,
linewidths=0,
yticklabels=attr_names,
- xticklabels=cluster_ids if on_average else [],
+ xticklabels=cluster_ids,
figsize=panel_size,
**kwargs,
)
cg.ax_heatmap.set_xlabel("")
- if attrs_labelsize is not None:
- cg.ax_heatmap.tick_params(axis='y', labelsize=attrs_labelsize)
+ cg.ax_heatmap.tick_params(axis='y', labelsize=attrs_labelsize, labelrotation=attrs_labelrotation)
+ if groupby is None:
+ cg.ax_heatmap.tick_params(axis='x', labelsize=groupby_labelsize, labelrotation=groupby_labelrotation)
show_row_dendrogram = (attrs_cluster and attrs_dendrogram) if switch_axes else (groupby_cluster and groupby_dendrogram)
show_col_dendrogram = (groupby_cluster and groupby_dendrogram) if switch_axes else (attrs_cluster and attrs_dendrogram)
@@ -1194,17 +1247,13 @@ def heatmap(
cg.ax_cbar.yaxis.set_ticks_position("right")
- if show_col_dendrogram:
- cg.ax_heatmap.xaxis.tick_bottom()
- cg.ax_col_dendrogram.set_visible(True)
- else:
- cg.ax_heatmap.xaxis.tick_top()
- cg.ax_col_dendrogram.set_visible(False)
+ cg.ax_heatmap.xaxis.tick_bottom()
+ cg.ax_col_dendrogram.set_visible(show_col_dendrogram)
cg.ax_cbar.tick_params(labelsize=cbar_labelsize)
cg.fig.dpi = dpi
- if not on_average:
+ if (groupby is not None) and (not on_average):
if groupby_cluster:
from matplotlib.patches import Patch
legend_elements = [Patch(color = color, label = label) for color, label in zip(palette, cluster_ids.categories)]
@@ -1228,7 +1277,7 @@ def heatmap(
if cur_matkey != data.current_matrix():
data.select_matrix(cur_matkey)
- return cg.fig if return_fig else None
+ return cg.fig if return_fig else cg
def dotplot(
@@ -1498,7 +1547,7 @@ def dendrogram(
linkage: ``str``, optional, default: ``complete``
Which linkage criterion to use, used by hierarchical clustering. Below are available options:
- ``ward`` minimizes the variance of the clusters being merged.
- - ``avarage`` uses the average of the distances of each observation of the two sets.
+ - ``average`` uses the average of the distances of each observation of the two sets.
- ``complete`` uses the maximum distances between all observations of the two sets. (Default)
- ``single`` uses the minimum of the distances between all observations of the two sets.
From a3ea1f85b7c206728d6dd05daa3073533b08d6e6 Mon Sep 17 00:00:00 2001
From: Yiming Yang
Date: Mon, 18 Dec 2023 22:29:54 +0000
Subject: [PATCH 48/57] Fix issue in dotplot
---
pegasus/plotting/plot_library.py | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/pegasus/plotting/plot_library.py b/pegasus/plotting/plot_library.py
index 0b07ada1..deb2aa05 100644
--- a/pegasus/plotting/plot_library.py
+++ b/pegasus/plotting/plot_library.py
@@ -1,6 +1,7 @@
import numpy as np
import pandas as pd
import seaborn as sns
+import matplotlib
import matplotlib.pyplot as plt
from scipy.sparse import issparse
@@ -1443,7 +1444,7 @@ def non_zero(g):
size_legend.grid(False)
# Reset global settings.
- sns.reset_orig()
+ matplotlib.rc_file_defaults()
return fig if return_fig else None
From 5483ba0f2d08f5668f08d580b4af2022374dbce4 Mon Sep 17 00:00:00 2001
From: Bo Li
Date: Fri, 22 Dec 2023 20:47:39 -0800
Subject: [PATCH 49/57] Updated heatmap function
---
pegasus/plotting/plot_library.py | 18 +++++++++++-------
1 file changed, 11 insertions(+), 7 deletions(-)
diff --git a/pegasus/plotting/plot_library.py b/pegasus/plotting/plot_library.py
index 2048aa6c..749b6947 100644
--- a/pegasus/plotting/plot_library.py
+++ b/pegasus/plotting/plot_library.py
@@ -1024,6 +1024,7 @@ def heatmap(
groupby_precomputed_linkage: Optional[np.array] = None,
groupby_labelsize: Optional[float] = 10.0,
groupby_labelrotation: Optional[float] = 0.0,
+ show_sample_name: Optional[bool] = None,
cbar_labelsize: Optional[float] = 10.0,
panel_size: Tuple[float, float] = (10, 10),
return_fig: Optional[bool] = False,
@@ -1081,6 +1082,8 @@ def heatmap(
Fontsize for labels of data.obs['groupby'].
groupby_labelrotation: ``float``, optional, default: 0.0
Rotation of labels for groupby.
+ show_sample_name: ``bool``, optional, default: ``None``
+ If show sample names as tick labels. If ``None``, show_sample_name == ``True`` if groupby == ``None`` and otherwise show_sample_name == ``False``.
cbar_labelsize: ``float``, optional, default: 10.0
Fontsize of the color bar.
panel_size: ``Tuple[float, float]``, optional, default: ``(10, 10)``
@@ -1138,7 +1141,11 @@ def heatmap(
df.index = data.obs_names
attr_names = df.columns.values
- cluster_ids = df.index
+ if show_sample_name is None:
+ show_sample_name = True if groupby is None else False
+ sample_tick_labels = df.index if show_sample_name else []
+
+ cluster_ids = None
cell_colors = None
if groupby is not None:
cluster_ids = data.obs[groupby].values
@@ -1165,9 +1172,6 @@ def heatmap(
for k, cat in enumerate(cluster_ids.categories):
cell_colors[cluster_ids == cat] = palette[k]
- cluster_ids = []
-
-
from scipy.cluster.hierarchy import linkage
groupby_linkage = None
@@ -1178,7 +1182,7 @@ def heatmap(
groupby_linkage = linkage(df, groupby_method, optimal_ordering = groupby_optimal_ordering)
attrs_linkage = None
if attrs_cluster:
- attrs_linage = linkage(df.T, attrs_method, optimal_ordering = attrs_optimal_ordering)
+ attrs_linkage = linkage(df.T, attrs_method, optimal_ordering = attrs_optimal_ordering)
if not switch_axes:
@@ -1191,7 +1195,7 @@ def heatmap(
row_linkage=groupby_linkage,
col_linkage=attrs_linkage,
linewidths=0,
- yticklabels=cluster_ids,
+ yticklabels=sample_tick_labels,
xticklabels=attr_names,
figsize=panel_size,
**kwargs,
@@ -1211,7 +1215,7 @@ def heatmap(
col_linkage=groupby_linkage,
linewidths=0,
yticklabels=attr_names,
- xticklabels=cluster_ids,
+ xticklabels=sample_tick_labels,
figsize=panel_size,
**kwargs,
)
From 4ede98533a7948887dd2029a0ae0b1b5b84581db Mon Sep 17 00:00:00 2001
From: Yiming Yang
Date: Wed, 27 Dec 2023 12:14:38 -0800
Subject: [PATCH 50/57] remove restriction on igraph
---
requirements.txt | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/requirements.txt b/requirements.txt
index 6341d8da..1a8193d6 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -26,4 +26,4 @@ statsmodels
umap-learn>=0.5.2
wordcloud
xlsxwriter
-igraph<=0.9.10
+igraph
From 9daf25e6c752355cc946866c336b3c0be31fcd99 Mon Sep 17 00:00:00 2001
From: Yiming Yang
Date: Thu, 4 Jan 2024 14:22:45 -0800
Subject: [PATCH 51/57] Fix violin plot for Seaborn v0.13+
---
pegasus/plotting/plot_library.py | 15 ++++++++++++++-
requirements.txt | 2 +-
2 files changed, 15 insertions(+), 2 deletions(-)
diff --git a/pegasus/plotting/plot_library.py b/pegasus/plotting/plot_library.py
index deb2aa05..934f479c 100644
--- a/pegasus/plotting/plot_library.py
+++ b/pegasus/plotting/plot_library.py
@@ -964,9 +964,17 @@ def violin(
genes.append(key)
df_list = [pd.DataFrame({"label": data.obs[groupby].values})]
+
if hue is not None:
df_list.append(pd.DataFrame({hue: data.obs[hue].values}))
stripplot = False
+ kwargs['hue'] = hue
+ kwargs['split'] = True
+ else:
+ kwargs['hue'] = "label"
+ kwargs['legend'] = False
+ kwargs['split'] = False
+
if len(obs_keys) > 0:
df_list.append(data.obs[obs_keys].reset_index(drop=True))
if len(genes) > 0:
@@ -978,7 +986,11 @@ def violin(
ax = axes[i, 0]
if stripplot:
sns.stripplot(x="label", y=attrs[i], hue = hue, data=df, ax=ax, size=stripsize, color="k", jitter=True)
- sns.violinplot(x="label", y=attrs[i], hue = hue, data=df, inner=inner, linewidth=1, ax=ax, cut=0, scale=scale, split=True, palette=palette, **kwargs)
+ sns.violinplot(x="label", y=attrs[i], data=df, inner=inner, linewidth=1, ax=ax, cut=0, density_norm=scale, palette=palette, **kwargs)
+ #if hue is None:
+ # sns.violinplot(x="label", y=attrs[i], hue = 'label', legend=False, data=df, inner=inner, linewidth=1, ax=ax, cut=0, density_norm=scale, split=False, palette=palette, **kwargs)
+ #else:
+ # sns.violinplot(x="label", y=attrs[i], hue = hue, data=df, inner=inner, linewidth=1, ax=ax, cut=0, density_norm=scale, split=True, palette=palette, **kwargs)
ax.grid(False)
if hue is not None:
@@ -991,6 +1003,7 @@ def violin(
ax.set_xlabel("")
else:
ax.set_xlabel(groupby)
+ ax.set_xticks(ax.get_xticks()) # Get rid of the UserWarning: set_ticklabels() should only be used with a fixed number of ticks
ax.set_xticklabels(ax.get_xticklabels(), rotation=90)
ax.set_ylabel(attrs[i], labelpad=8, rotation=0, horizontalalignment='right', fontsize='medium')
ax.tick_params(axis='y', right=True, left=False, labelright=True, labelleft=False, labelsize='small')
diff --git a/requirements.txt b/requirements.txt
index 1a8193d6..6e948083 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -20,7 +20,7 @@ pybind11
scikit-learn>=0.23.2
scikit-misc
scipy
-seaborn
+seaborn>=0.13.0
setuptools
statsmodels
umap-learn>=0.5.2
From 10e1269884e45ad11d6139dd1472f20eff609646 Mon Sep 17 00:00:00 2001
From: Yiming Yang
Date: Thu, 4 Jan 2024 14:26:13 -0800
Subject: [PATCH 52/57] remove comments
---
pegasus/plotting/plot_library.py | 4 ----
1 file changed, 4 deletions(-)
diff --git a/pegasus/plotting/plot_library.py b/pegasus/plotting/plot_library.py
index 934f479c..6d3eaa14 100644
--- a/pegasus/plotting/plot_library.py
+++ b/pegasus/plotting/plot_library.py
@@ -987,10 +987,6 @@ def violin(
if stripplot:
sns.stripplot(x="label", y=attrs[i], hue = hue, data=df, ax=ax, size=stripsize, color="k", jitter=True)
sns.violinplot(x="label", y=attrs[i], data=df, inner=inner, linewidth=1, ax=ax, cut=0, density_norm=scale, palette=palette, **kwargs)
- #if hue is None:
- # sns.violinplot(x="label", y=attrs[i], hue = 'label', legend=False, data=df, inner=inner, linewidth=1, ax=ax, cut=0, density_norm=scale, split=False, palette=palette, **kwargs)
- #else:
- # sns.violinplot(x="label", y=attrs[i], hue = hue, data=df, inner=inner, linewidth=1, ax=ax, cut=0, density_norm=scale, split=True, palette=palette, **kwargs)
ax.grid(False)
if hue is not None:
From 4b160e9c3909b2f3995eca57038f718139f42fd8 Mon Sep 17 00:00:00 2001
From: Yiming Yang
Date: Sat, 6 Jan 2024 23:13:21 -0800
Subject: [PATCH 53/57] dotplot and violin skip genes not in the data
---
pegasus/plotting/plot_library.py | 21 ++++++++++++---------
1 file changed, 12 insertions(+), 9 deletions(-)
diff --git a/pegasus/plotting/plot_library.py b/pegasus/plotting/plot_library.py
index 19d27fbe..b5123381 100644
--- a/pegasus/plotting/plot_library.py
+++ b/pegasus/plotting/plot_library.py
@@ -947,6 +947,9 @@ def violin(
assert not isinstance(data, anndata.AnnData)
data.select_matrix(matkey)
+ # Filter out attributes not existing in the data
+ attrs = _get_valid_attrs(data, attrs)
+
nrows = len(attrs)
fig, axes = _get_subplot_layouts(nrows=nrows, ncols=1, panel_size=panel_size, dpi=dpi, left=left, bottom=bottom, wspace=wspace, hspace=0, squeeze=False, sharey=False)
@@ -958,9 +961,6 @@ def violin(
assert is_numeric_dtype(data.obs[key])
obs_keys.append(key)
else:
- if key not in data.var_names:
- logger.warning(f"Cannot find gene {key}. Please make sure all genes are included in data.var_names before running this function!")
- return None
genes.append(key)
df_list = [pd.DataFrame({"label": data.obs[groupby].values})]
@@ -1170,7 +1170,7 @@ def heatmap(
if not 'cmap' in kwargs.keys():
kwargs['cmap'] = 'Reds'
df['cluster_name'] = cluster_ids
- df = df.groupby('cluster_name').mean()
+ df = df.groupby(by='cluster_name', observed=True).mean()
cluster_ids = df.index
else:
if not groupby_cluster:
@@ -1300,7 +1300,7 @@ def dotplot(
data: Union[MultimodalData, UnimodalData, anndata.AnnData],
genes: Union[str, List[str]],
groupby: str,
- reduce_function: Callable[[np.ndarray], float] = np.mean,
+ reduce_function: Union[str, Callable[[np.ndarray], float]] = "mean",
fraction_min: float = 0,
fraction_max: float = None,
dot_min: int = 0,
@@ -1325,7 +1325,7 @@ def dotplot(
Features to plot.
groupby: ``str``
A categorical variable in data.obs that is used to categorize the cells, e.g. Clusters.
- reduce_function: ``Callable[[np.ndarray], float]``, optional, default: ``np.mean``
+ reduce_function: ``Union[str, Callable[[np.ndarray], float]]``, optional, default: ``"mean"``
Function to calculate statistic on expression data. Default is mean.
fraction_min: ``float``, optional, default: ``0``.
Minimum fraction of expressing cells to consider.
@@ -1364,12 +1364,14 @@ def dotplot(
sns.set(font_scale=0.7, style='whitegrid')
if not is_list_like(genes):
- geness = [genes]
+ genes = [genes]
+
+ # Select only genes existing in the data
+ genes = _get_valid_attrs(data, genes)
keywords = dict(cmap=cmap)
keywords.update(kwds)
- from scipy.sparse import issparse
X = slicing(data[:, genes].X)
df = pd.DataFrame(data=X, columns=genes)
df[groupby] = data.obs[groupby].values
@@ -1387,7 +1389,8 @@ def dotplot(
def non_zero(g):
return np.count_nonzero(g) / g.shape[0]
- summarized_df = df.groupby(groupby).aggregate([reduce_function, non_zero])
+ # Set observed=True to suppress warnings.
+ summarized_df = df.groupby(by=groupby, observed=True).aggregate([reduce_function, non_zero])
row_indices = summarized_df.index.tolist()
if sort_function == "natsorted":
From b61f9ce306db82eeef7e6fcf2efddcbe9283e4ab Mon Sep 17 00:00:00 2001
From: Yiming Yang
Date: Sat, 6 Jan 2024 23:17:35 -0800
Subject: [PATCH 54/57] heatmap skip attributes not in the data
---
pegasus/plotting/plot_library.py | 3 +++
1 file changed, 3 insertions(+)
diff --git a/pegasus/plotting/plot_library.py b/pegasus/plotting/plot_library.py
index b5123381..7ffd1581 100644
--- a/pegasus/plotting/plot_library.py
+++ b/pegasus/plotting/plot_library.py
@@ -1128,6 +1128,9 @@ def heatmap(
if isinstance(attrs, str):
attrs = [attrs]
+ # Filter out attributes not existing in the data
+ attrs = _get_valid_attrs(data, attrs)
+
obs_keys = []
genes = []
for key in attrs:
From c8600cc91ab8577ea100564f867fd3ffd74bc6ce Mon Sep 17 00:00:00 2001
From: Yiming Yang
Date: Sun, 7 Jan 2024 13:08:02 -0800
Subject: [PATCH 55/57] update CI test
---
.github/workflows/ci-test.yml | 6 +----
tests/run_hashing_citeseq.sh | 13 ---------
tests/test_hashing_citeseq.py | 50 -----------------------------------
3 files changed, 1 insertion(+), 68 deletions(-)
delete mode 100644 tests/run_hashing_citeseq.sh
delete mode 100644 tests/test_hashing_citeseq.py
diff --git a/.github/workflows/ci-test.yml b/.github/workflows/ci-test.yml
index e66dd54d..a501f27c 100644
--- a/.github/workflows/ci-test.yml
+++ b/.github/workflows/ci-test.yml
@@ -14,7 +14,7 @@ jobs:
strategy:
matrix:
os: [ubuntu-latest]
- python-version: ['3.8', '3.9', '3.10']
+ python-version: ['3.8', '3.9', '3.10', '3.11']
steps:
- uses: actions/checkout@v2
@@ -48,10 +48,6 @@ jobs:
- name: One sample input test
run: |
bash tests/run_one_sample.sh
- - name: Hashing CITE-Seq pipeline test
- run: |
- bash tests/run_hashing_citeseq.sh
- pytest tests/test_hashing_citeseq.py
- name: iNMF test
run: |
bash tests/run_inmf.sh
diff --git a/tests/run_hashing_citeseq.sh b/tests/run_hashing_citeseq.sh
deleted file mode 100644
index e546244e..00000000
--- a/tests/run_hashing_citeseq.sh
+++ /dev/null
@@ -1,13 +0,0 @@
-pegasus demuxEM -p 2 --generate-diagnostic-plots tests/data/hashing_citeseq/cb_cc_raw_gene_bc_matrices_h5.h5 tests/data/hashing_citeseq/cb_cell_hashing.csv tests/cb_cc
-if [ -f "tests/cb_cc_demux.zarr.zip" ]; then
- pegasus aggregate_matrix --select-only-singlets --min-genes 100 tests/data/sample_hashing_citeseq.csv tests/cb_cc_citeseq
-
- if [ -f "tests/cb_cc_citeseq.zarr.zip" ]; then
- pegasus cluster -p 2 --min-genes 500 --max-genes 6000 --mito-prefix MT- --percent-mito 20 --louvain --umap --citeseq --citeseq-umap --citeseq-umap-exclude Mouse_IgG1,Mouse_IgG2a,Mouse_IgG2b,Rat_IgG2b tests/cb_cc_citeseq.zarr.zip tests/citeseq_result
-
- if [ -f "tests/citeseq_result.zarr.zip" ]; then
- pegasus plot scatter --basis umap --attributes louvain_labels,assignment tests/citeseq_result.zarr.zip tests/citeseq_result.umap.pdf
- pegasus plot scatter --basis citeseq_umap --attributes louvain_labels,assignment tests/citeseq_result.zarr.zip tests/citeseq_result.citeseq_umap.pdf
- fi
- fi
-fi
diff --git a/tests/test_hashing_citeseq.py b/tests/test_hashing_citeseq.py
deleted file mode 100644
index e30cdff7..00000000
--- a/tests/test_hashing_citeseq.py
+++ /dev/null
@@ -1,50 +0,0 @@
-"""
-Unittest module for hashing_citeseq
-"""
-
-import os
-import glob
-import unittest
-
-import numpy as np
-import pandas as pd
-import pegasus as pg
-
-
-class TestPipeline(unittest.TestCase):
- def test_demux(self):
- data = pg.read_input("tests/cb_cc_demux.zarr.zip")
- self.assertEqual(data.shape, (737280, 33694), "Demux data shape differs!")
- self.assertIn('demux_type', data.obs.columns, "Demux type is lost!")
- self.assertIn('assignment', data.obs.columns, "Cell assignment is lost!")
- f_list = glob.glob("tests/cb_cc.*.pdf")
- self.assertEqual(len(f_list), 4, "Demux diagnosis plots are missing!")
- self.assertIn('cb_cc.out.demuxEM.zarr.zip', os.listdir('tests'), "Demultiplexed RNA matrix is lost!")
-
- def test_citeseq(self):
- data = pg.read_input("tests/cb_cc_citeseq.zarr.zip")
- self.assertSetEqual(set(data.list_data()), set(['GRCh38-citeseq', 'GRCh38-rna']), "Some modality is missing!")
- self.assertNotIn('demux_type', data.obs.columns, "Demux type is not removed!")
- self.assertIn('assignment', data.obs.columns, "Cell assignment is lost!")
- self.assertEqual(data.shape, (14363, 33694), "RNA data shape differs!")
- data.select_data('GRCh38-citeseq')
- self.assertEqual(data.shape, (14363, 31), "CITE-Seq data shape differs!")
-
- def test_clustering(self):
- data = pg.read_input("tests/citeseq_result.zarr.zip")
- self.assertSetEqual(set(data.list_data()), set(['GRCh38-citeseq', 'GRCh38-rna']), "Some modality is missing!")
- n_rna_cells = data.shape[0]
- self.assertNotIn('demux_type', data.obs.columns, "Demux type is not removed!")
- self.assertEqual(data.obs['assignment'].cat.categories.size, 7, "Not all cells are demultiplexed singlets!")
- self.assertIn('X_citeseq', data.obsm.keys(), "CITE-Seq coordinates are lost!")
- self.assertEqual(data.obsm['X_citeseq_umap'].shape[1], data.obsm['X_umap'].shape[1], "Some of UMAP embeddings is lost!")
- data.select_data('GRCh38-citeseq')
- n_citeseq_cells = data.shape[0]
- self.assertEqual(n_rna_cells, n_citeseq_cells, "Two modalities have inconsistent number of cells!")
-
- def test_plot(self):
- self.assertIn('citeseq_result.citeseq_umap.pdf', os.listdir('tests'), "CITE-Seq UMAP plot is lost!")
- self.assertIn('citeseq_result.umap.pdf', os.listdir('tests'), "RNA UMAP plot is lost!")
-
-if __name__ == "__main__":
- unittest.main()
From 89ca768e25cc316de3f582c9ad503a17d4d94f0c Mon Sep 17 00:00:00 2001
From: Yiming Yang
Date: Fri, 19 Jan 2024 17:00:52 -0800
Subject: [PATCH 56/57] Fix heatmap (#286)
Fix issues in heatmap
---
pegasus/plotting/plot_library.py | 52 ++++++++++++++++----------------
pegasus/plotting/plot_utils.py | 5 ++-
2 files changed, 30 insertions(+), 27 deletions(-)
diff --git a/pegasus/plotting/plot_library.py b/pegasus/plotting/plot_library.py
index 7ffd1581..edce18d9 100644
--- a/pegasus/plotting/plot_library.py
+++ b/pegasus/plotting/plot_library.py
@@ -1027,15 +1027,16 @@ def heatmap(
attrs_dendrogram: Optional[bool] = True,
attrs_method: Optional[bool] = 'ward',
attrs_optimal_ordering: Optional[bool] = True,
- attrs_labelsize: Optional[float] = 10.0,
- attrs_labelrotation: Optional[float] = 0.0,
+ xlabel_size: Optional[float] = 10.0,
+ ylabel_size: Optional[float] = 10.0,
+ legend_fontsize: Optional[float] = 10.0,
+ xlabel_rotation: Optional[float] = 90.0,
+ ylabel_rotation: Optional[float] = 0.0,
groupby_cluster: Optional[bool] = True,
groupby_dendrogram: Optional[bool] = True,
groupby_method: Optional[bool] = 'ward',
groupby_optimal_ordering: Optional[bool] = True,
groupby_precomputed_linkage: Optional[np.array] = None,
- groupby_labelsize: Optional[float] = 10.0,
- groupby_labelrotation: Optional[float] = 0.0,
show_sample_name: Optional[bool] = None,
cbar_labelsize: Optional[float] = 10.0,
panel_size: Tuple[float, float] = (10, 10),
@@ -1076,10 +1077,16 @@ def heatmap(
Linkage method for attrs, choosing from ``single``, ``complete``, ``average``, ``weighted``, ``centroid``, ``median`` and ``ward``.
attrs_optimal_ordering: ``bool``, optional, default: ``True``
Parameter for scipy.cluster.hierarchy.linkage. If ``True``, the attrs linkage matrix will be reordered so that the distance between successive leaves is minima.
- attrs_labelsize: ``float``, optional, default: 10.0
- Fontsize for labels of attrs.
- attrs_labelrotation: ``float``, optional, default: 0.0
- Rotation of labels for attrs.
+ xlabel_size: ``float``, optional, default: 10.0
+ Fontsize for x-axis labels.
+ ylabel_size: ``float``, optional, default: 10.0
+ Fontsize for y-axis labels.
+ legend_fontsize: ``float``, optional, default: 10.0
+ Fontsize for legend labels.
+ xlabel_rotation: ``float``, optional, default: 90.0
+ Rotation of x-axis labels.
+ ylabel_rotation: ``float``, optional, default: 0.0
+ Rotation of y-axis labels.
groupby_cluster: ``bool``, optional, default: ``True``
Cluster data.obs['groupby'] and generate a cluster-wise dendrogram.
groupby_dendrogram: ``bool``, optional, default: ``True``
@@ -1090,10 +1097,6 @@ def heatmap(
Parameter for scipy.cluster.hierarchy.linkage. If ``True``, the groupby linkage matrix will be reordered so that the distance between successive leaves is minima.
groupby_precomputed_linkage: ``np.array``, optional, default: ``None``
Pass a precomputed linkage.
- groupby_labelsize: ``float``, optional, default: 10.0
- Fontsize for labels of data.obs['groupby'].
- groupby_labelrotation: ``float``, optional, default: 0.0
- Rotation of labels for groupby.
show_sample_name: ``bool``, optional, default: ``None``
If show sample names as tick labels. If ``None``, show_sample_name == ``True`` if groupby == ``None`` and otherwise show_sample_name == ``False``.
cbar_labelsize: ``float``, optional, default: 10.0
@@ -1116,7 +1119,7 @@ def heatmap(
Examples
--------
- >>> pg.heatmap(data, attrs=['CD14', 'TRAC', 'CD34'], groupby='louvain_labels')
+ >>> pg.heatmap(data, attrs=['CD14', 'TRAC', 'CD34'], groupby='leiden_labels')
"""
if not isinstance(data, anndata.AnnData):
@@ -1158,7 +1161,7 @@ def heatmap(
if show_sample_name is None:
show_sample_name = True if groupby is None else False
- sample_tick_labels = df.index if show_sample_name else []
+ groupby_tick_labels = df.index if show_sample_name else []
cluster_ids = None
cell_colors = None
@@ -1175,6 +1178,7 @@ def heatmap(
df['cluster_name'] = cluster_ids
df = df.groupby(by='cluster_name', observed=True).mean()
cluster_ids = df.index
+ groupby_tick_labels = cluster_ids
else:
if not groupby_cluster:
idx = cluster_ids.argsort(kind = 'mergesort')
@@ -1199,7 +1203,6 @@ def heatmap(
if attrs_cluster:
attrs_linkage = linkage(df.T, attrs_method, optimal_ordering = attrs_optimal_ordering)
-
if not switch_axes:
cg = sns.clustermap(
data=df,
@@ -1210,15 +1213,14 @@ def heatmap(
row_linkage=groupby_linkage,
col_linkage=attrs_linkage,
linewidths=0,
- yticklabels=sample_tick_labels,
+ yticklabels=groupby_tick_labels,
xticklabels=attr_names,
figsize=panel_size,
**kwargs,
)
cg.ax_heatmap.set_ylabel("")
- cg.ax_heatmap.tick_params(axis='x', labelsize=attrs_labelsize, labelrotation=attrs_labelrotation)
- if groupby is None:
- cg.ax_heatmap.tick_params(axis='y', labelsize=groupby_labelsize, labelrotation=groupby_labelrotation)
+ cg.ax_heatmap.tick_params(axis='x', labelsize=xlabel_size, labelrotation=xlabel_rotation)
+ cg.ax_heatmap.tick_params(axis='y', labelsize=ylabel_size, labelrotation=ylabel_rotation)
else:
cg = sns.clustermap(
data=df.T,
@@ -1230,14 +1232,13 @@ def heatmap(
col_linkage=groupby_linkage,
linewidths=0,
yticklabels=attr_names,
- xticklabels=sample_tick_labels,
+ xticklabels=groupby_tick_labels,
figsize=panel_size,
**kwargs,
)
cg.ax_heatmap.set_xlabel("")
- cg.ax_heatmap.tick_params(axis='y', labelsize=attrs_labelsize, labelrotation=attrs_labelrotation)
- if groupby is None:
- cg.ax_heatmap.tick_params(axis='x', labelsize=groupby_labelsize, labelrotation=groupby_labelrotation)
+ cg.ax_heatmap.tick_params(axis='y', labelsize=ylabel_size, labelrotation=ylabel_rotation)
+ cg.ax_heatmap.tick_params(axis='x', labelsize=xlabel_size, labelrotation=xlabel_rotation)
show_row_dendrogram = (attrs_cluster and attrs_dendrogram) if switch_axes else (groupby_cluster and groupby_dendrogram)
show_col_dendrogram = (groupby_cluster and groupby_dendrogram) if switch_axes else (attrs_cluster and attrs_dendrogram)
@@ -1276,7 +1277,7 @@ def heatmap(
if groupby_cluster:
from matplotlib.patches import Patch
legend_elements = [Patch(color = color, label = label) for color, label in zip(palette, cluster_ids.categories)]
- cg.ax_heatmap.legend(handles=legend_elements, loc='lower left', bbox_to_anchor = (1.02, 1.02), fontsize = groupby_labelsize)
+ cg.ax_heatmap.legend(handles=legend_elements, loc='lower left', bbox_to_anchor = (1.02, 1.02), fontsize = legend_fontsize)
else:
values = cluster_ids.value_counts().values
ticks = np.cumsum(values) - values / 2
@@ -1290,7 +1291,7 @@ def heatmap(
cg.ax_col_colors.xaxis.tick_top()
cg.ax_col_colors.set_xticks(ticks)
cg.ax_col_colors.set_xticklabels(labels, rotation=45)
- cg.ax_col_colors.tick_params(axis='x', top = False, labelsize = groupby_labelsize, length=10)
+ cg.ax_col_colors.tick_params(axis='x', top = False, labelsize = xlabel_size, length=10)
if not isinstance(data, anndata.AnnData):
if cur_matkey != data.current_matrix():
@@ -1387,7 +1388,6 @@ def dotplot(
idx = series == 0
if idx.sum() > 0:
logger.warning(f"The following categories contain no cells and are removed: {','.join(list(series.index[idx]))}.")
- df[groupby] = df[groupby].cat.remove_unused_categories()
def non_zero(g):
return np.count_nonzero(g) / g.shape[0]
diff --git a/pegasus/plotting/plot_utils.py b/pegasus/plotting/plot_utils.py
index 48766ef9..e0b6fde8 100644
--- a/pegasus/plotting/plot_utils.py
+++ b/pegasus/plotting/plot_utils.py
@@ -9,6 +9,9 @@
from matplotlib.patches import Circle
from matplotlib.collections import PatchCollection
+import logging
+logger = logging.getLogger(__name__)
+
def _transform_basis(basis: str) -> str:
if basis == "tsne":
@@ -453,6 +456,6 @@ def _get_valid_attrs(data:Union[MultimodalData, UnimodalData], attrs: List[str])
else:
attrs_drop.append(attr)
if len(attrs_drop) > 0:
- print(f"Warning: Attributes {attrs_drop} are not in data.obs, data.var_names or data.obsm!")
+ logger.warning(f"Attributes {attrs_drop} are not in data.obs, data.var_names or data.obsm!")
return attrs_filt
From 40e6ef5b7d7009600a00b5238f1d01a1cdaddc87 Mon Sep 17 00:00:00 2001
From: Yiming Yang
Date: Fri, 19 Jan 2024 20:49:32 -0800
Subject: [PATCH 57/57] update docs
---
docs/conf.py | 6 +++---
docs/index.rst | 2 +-
docs/release_notes.rst | 5 +++++
docs/release_notes/version_1_9.rst | 14 ++++++++++++++
docs/requirements.txt | 1 -
requirements.txt | 2 +-
setup.py | 2 +-
7 files changed, 25 insertions(+), 7 deletions(-)
create mode 100644 docs/release_notes/version_1_9.rst
diff --git a/docs/conf.py b/docs/conf.py
index 25a5726a..fc3d4cdf 100644
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -23,15 +23,15 @@
# -- Project information -----------------------------------------------------
project = "Pegasus"
-copyright = "2023 Genentech, Inc. All rights reserved."
+copyright = "2024 Genentech, Inc. All rights reserved."
author = (
"Yiming Yang, Joshua Gould and Bo Li"
)
# The short X.Y version
-version = "1.8"
+version = "1.9"
# The full version, including alpha/beta/rc tags
-release = "1.8.1"
+release = "1.9.0"
# -- General configuration ---------------------------------------------------
diff --git a/docs/index.rst b/docs/index.rst
index b1893bf2..c8d37d11 100644
--- a/docs/index.rst
+++ b/docs/index.rst
@@ -10,7 +10,7 @@
Release Highlights in Current Stable
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-.. include:: release_notes/version_1_8.rst
+.. include:: release_notes/version_1_9.rst
.. toctree::
:maxdepth: 1
diff --git a/docs/release_notes.rst b/docs/release_notes.rst
index 4adc7f23..7a2690a8 100644
--- a/docs/release_notes.rst
+++ b/docs/release_notes.rst
@@ -6,6 +6,11 @@ Release Notes
.. note::
Also see the release notes of `PegasusIO `__.
+Version 1.9
+~~~~~~~~~~~~~
+
+.. include:: release_notes/version_1_9.rst
+
Version 1.8
~~~~~~~~~~~~~
diff --git a/docs/release_notes/version_1_9.rst b/docs/release_notes/version_1_9.rst
new file mode 100644
index 00000000..fa61e2f3
--- /dev/null
+++ b/docs/release_notes/version_1_9.rst
@@ -0,0 +1,14 @@
+1.9.0 :small:`January 19, 2024`
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+**New Feature and Improvement**
+
+* ``calculate_z_score`` now works with sparse count matrices. [PR `276 <https://github.com/lilab-bcb/pegasus/pull/276>`_ Thanks to `Jayaram Kancherla <https://github.com/jkanche>`_]
+* Plotting functions (``scatter``, ``dotplot``, ``violin``, ``heatmap``) now warn about genes/attributes that do not exist in the data and skip them in the plots.
+* Improve ``heatmap``:
+
+ * Add ``show_sample_name`` parameter for cases such as pseudo-bulk data and NanoString DSP data.
+ * Use SciPy's linkage (``scipy.cluster.hierarchy.linkage``) for dendrograms to take advantage of its optimal leaf ordering feature (see ``groupby_optimal_ordering`` parameter).
+
+* Update human lung and mouse immune markers used by ``infer_cell_types`` function.
+* Expose ``online_batch_size`` parameter in ``nmf`` and ``integrative_nmf`` functions.
diff --git a/docs/requirements.txt b/docs/requirements.txt
index f6857eea..e714db1c 100644
--- a/docs/requirements.txt
+++ b/docs/requirements.txt
@@ -14,7 +14,6 @@ natsort
joblib
psutil
numba
-importlib_metadata; python_version < '3.8'
umap-learn
forceatlas2-python
pyarrow
diff --git a/requirements.txt b/requirements.txt
index 6e948083..5458ce4d 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -15,7 +15,7 @@ natsort
numba
numpy
pandas>=1.2.0
-pegasusio>=0.5.1
+pegasusio>=0.9.0
pybind11
scikit-learn>=0.23.2
scikit-misc
diff --git a/setup.py b/setup.py
index 10dac019..087a27bc 100644
--- a/setup.py
+++ b/setup.py
@@ -59,7 +59,7 @@
scvi=["scvi-tools"],
all=["fitsne", "louvain", "scanorama", "torch", "harmony-pytorch", "nmf-torch", "rpy2", "forceatlas2-python", "scvi-tools"]
),
- python_requires="~=3.7",
+ python_requires="~=3.8",
package_data={
"pegasus.annotate_cluster": [
"human_immune_cell_markers.json",