diff --git a/.github/workflows/ci-test.yml b/.github/workflows/ci-test.yml index 41d69118..a501f27c 100644 --- a/.github/workflows/ci-test.yml +++ b/.github/workflows/ci-test.yml @@ -14,7 +14,7 @@ jobs: strategy: matrix: os: [ubuntu-latest] - python-version: ['3.7', '3.8', '3.9'] + python-version: ['3.8', '3.9', '3.10', '3.11'] steps: - uses: actions/checkout@v2 @@ -48,10 +48,6 @@ jobs: - name: One sample input test run: | bash tests/run_one_sample.sh - - name: Hashing CITE-Seq pipeline test - run: | - bash tests/run_hashing_citeseq.sh - pytest tests/test_hashing_citeseq.py - name: iNMF test run: | bash tests/run_inmf.sh diff --git a/.readthedocs.yml b/.readthedocs.yml index 7e053cac..ae4157db 100644 --- a/.readthedocs.yml +++ b/.readthedocs.yml @@ -1,6 +1,35 @@ +# Read the Docs configuration file for Sphinx projects +# See https://docs.readthedocs.io/en/stable/config-file/v2.html for details + +# Required +version: 2 + +# Set the OS, Python version and other tools you might need build: - image: latest + os: ubuntu-22.04 + tools: + python: "3.9" + # You can also specify other tool versions: + # nodejs: "20" + # rust: "1.70" + # golang: "1.20" + +# Build documentation in the "docs/" directory with Sphinx sphinx: configuration: docs/conf.py + # You can configure Sphinx to use a different builder, for instance use the dirhtml builder for simpler URLs + # builder: "dirhtml" + # Fail on all warnings to avoid broken references + # fail_on_warning: true + +# Optionally build your docs in additional formats such as PDF and ePub +# formats: +# - pdf +# - epub + +# Optional but recommended, declare the Python requirements required +# to build your documentation +# See https://docs.readthedocs.io/en/stable/guides/reproducible-builds.html python: - version: 3.8 + install: + - requirements: docs/requirements.txt \ No newline at end of file diff --git a/docs/api/index.rst b/docs/api/index.rst index 4311a096..540af78e 100644 --- a/docs/api/index.rst +++ b/docs/api/index.rst @@ 
-85,6 +85,7 @@ Cluster Algorithms cluster louvain leiden + split_one_cluster spectral_louvain spectral_leiden diff --git a/docs/conf.py b/docs/conf.py index 5ca8a416..fc3d4cdf 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -23,22 +23,22 @@ # -- Project information ----------------------------------------------------- project = "Pegasus" -copyright = "2022 Genentech, Inc. All rights reserved." +copyright = "2024 Genentech, Inc. All rights reserved." author = ( "Yiming Yang, Joshua Gould and Bo Li" ) # The short X.Y version -version = "1.7" +version = "1.9" # The full version, including alpha/beta/rc tags -release = "1.7.1" +release = "1.9.0" # -- General configuration --------------------------------------------------- # If your documentation needs a minimal Sphinx version, state it here. # -#needs_sphinx = '1.7' +#needs_sphinx = '1.8' # Add any Sphinx extension module names here, as strings. They can be # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom diff --git a/docs/index.rst b/docs/index.rst index 9289f3c9..c8d37d11 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -10,7 +10,7 @@ Release Highlights in Current Stable ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -.. include:: release_notes/version_1_7.rst +.. include:: release_notes/version_1_9.rst .. toctree:: :maxdepth: 1 diff --git a/docs/release_notes.rst b/docs/release_notes.rst index 407a6f5c..7a2690a8 100644 --- a/docs/release_notes.rst +++ b/docs/release_notes.rst @@ -6,6 +6,16 @@ Release Notes .. note:: Also see the release notes of `PegasusIO `__. +Version 1.9 +~~~~~~~~~~~~~ + +.. include:: release_notes/version_1_9.rst + +Version 1.8 +~~~~~~~~~~~~~ + +.. 
include:: release_notes/version_1_8.rst + Version 1.7 ~~~~~~~~~~~~~ diff --git a/docs/release_notes/version_1_8.rst b/docs/release_notes/version_1_8.rst new file mode 100644 index 00000000..4b0947ae --- /dev/null +++ b/docs/release_notes/version_1_8.rst @@ -0,0 +1,21 @@ +1.8.1 :small:`August 23, 2023` +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +* Bug fix in cell marker JSON files for ``infer_cell_types`` function. + +1.8.0 :small:`July 21, 2023` +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +**New Feature and Improvement** + +* Update ``human_immune`` and ``human_lung`` marker sets. +* Add ``mouse_liver`` marker set. +* Add `split_one_cluster <./api/pegasus.split_one_cluster.html>`_ function to subcluster one cluster into a specified number of subclusters. +* Update **neighbors** function to set ``use_cache=False`` by default, and adjust K to ``min(K, int(sqrt(n_samples)))``. [PR `272 `_] +* In **infer_doublets** function, argument ``manual_correction`` now accepts a float number threshold specified by users for cut-off. [PR `275 `_] + +**Bug Fix** + +* Fix divide by zero issue in ``integrative_nmf`` function. [PR `258 `_] +* Compatibility with Pandas v2.0. [PR `261 `_] +* Allow ``infer_doublets`` to use any count matrix with key name specified by users. [PR `268 `_ Thanks to `Donghoon Lee `_] diff --git a/docs/release_notes/version_1_9.rst b/docs/release_notes/version_1_9.rst new file mode 100644 index 00000000..fa61e2f3 --- /dev/null +++ b/docs/release_notes/version_1_9.rst @@ -0,0 +1,14 @@ +1.9.0 :small:`January 19, 2024` +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +**New Feature and Improvement** + +* ``calculate_z_score`` works with sparse count matrix. [PR `276 `_ Thanks to `Jayaram Kancherla `_] +* Plotting functions (``scatter``, ``dotplot``, ``violin``, ``heatmap``) now give warnings on genes/attributes not existing in the data, and skip them in the plots. +* Improve ``heatmap``: + + * Add ``show_sample_name`` parameter for cases of pseudo-bulk data, nanoString DSP data, etc. 
+ * Use Scipy's linkage (``scipy.cluster.hierarchy.linkage``) for dendrograms to use its optimal ordering feature for better results (see ``groupby_optimal_ordering`` parameter). + +* Update human lung and mouse immune markers used by ``infer_cell_types`` function. +* Expose ``online_batch_size`` parameter in ``nmf`` and ``integrative_nmf`` functions. diff --git a/docs/requirements.txt b/docs/requirements.txt index f6857eea..e714db1c 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -14,7 +14,6 @@ natsort joblib psutil numba -importlib_metadata; python_version < '3.8' umap-learn forceatlas2-python pyarrow diff --git a/pegasus/__init__.py b/pegasus/__init__.py index ae574a32..3e0d62bc 100644 --- a/pegasus/__init__.py +++ b/pegasus/__init__.py @@ -65,6 +65,7 @@ de_analysis, markers, write_results_to_excel, + cluster_specific_markers, find_markers, infer_path, calc_signature_score, diff --git a/pegasus/annotate_cluster/annotate_cluster.py b/pegasus/annotate_cluster/annotate_cluster.py index 5ce05357..22caea43 100644 --- a/pegasus/annotate_cluster/annotate_cluster.py +++ b/pegasus/annotate_cluster/annotate_cluster.py @@ -278,7 +278,9 @@ def infer_cell_types( * ``'mouse_immune'`` for mouse immune cells; * ``'human_brain'`` for human brain cells; * ``'mouse_brain'`` for mouse brain cells; - * ``'human_lung'`` for human lung cells. + * ``'human_lung'`` for human lung cells; + * ``'mouse_lung'`` for mouse lung cells; + * ``'mouse_liver'`` for mouse liver cells. * If ``Dict``, it refers to a Python dictionary describing the markers. 
de_test: ``str``, optional, default: ``"mwu"`` @@ -320,6 +322,8 @@ def infer_cell_types( human_brain="human_brain_cell_markers.json", mouse_brain="mouse_brain_cell_markers.json", human_lung="human_lung_cell_markers.json", + mouse_lung="mouse_lung_cell_markers.json", + mouse_liver="mouse_liver_cell_markers.json", ) if isinstance(markers, str): diff --git a/pegasus/annotate_cluster/human_immune_cell_markers.json b/pegasus/annotate_cluster/human_immune_cell_markers.json index 60eaa54c..a4592a6f 100644 --- a/pegasus/annotate_cluster/human_immune_cell_markers.json +++ b/pegasus/annotate_cluster/human_immune_cell_markers.json @@ -19,52 +19,117 @@ "title" : "T cell subtype markers", "cell_types" : [ { - "name" : "T helper cell", + "name" : "CD4 Naive T cell", "markers" : [ { - "genes" : ["CD4+"], + "genes" : ["CD4+", "CCR7+", "SELL+", "LEF1+", "FHIT+", "ACTN1+", "LDLRAP1+", "TMIGD2+", "TRABD2A+", "LRRN3+"], "weight" : 1.0, - "comment" : "CD4+ T cell" + "comment" : "Markers derived from Immune Cell Atlas PBMC data" } - ] + ] }, { - "name" : "Cytotoxic T cell", + "name" : "CD4 TCM", "markers" : [ { - "genes" : ["CD8A+", "CD8B+"], + "genes" : ["CD4+", "GPR183+", "CD69+", "PASK+", "LIMS1+", "LPAR6+", "SLC2A3+", "SOCS3+"], "weight" : 1.0, - "comment" : "CD8+ T cell" + "comment" : "Markers derived from Immune Cell Atlas PBMC data" } - ] + ] + }, + { + "name" : "CD4 TEM", + "markers" : [ + { + "genes" : ["CD4+", "KLRB1+", "ANXA2+", "LGALS1+", "TIMP1+", "PTGER2+", "AHNAK+", "TNFRSF4+", "YWHAH+", "CD63+"], + "weight" : 1.0, + "comment" : "Markers derived from Immune Cell Atlas PBMC data" + } + ] }, { "name" : "T regulatory cell", "markers" : [ { - "genes" : ["FOXP3+", "IL2RA+"], - "weight" : 0.7, - "comments" : "key T reg markers" - }, + "genes" : ["RTKN2+", "FOXP3+", "IL2RA+", "HACD1+", "AC133644.2+", "FANK1+", "DUSP4+", "STAM+", "CCR10+", "CTLA4+"], + "weight" : 1.0, + "comments" : "Markers derived from Immune Cell Atlas PBMC data" + } + ] + }, + { + "name" : "CD4 CTL", + 
"markers" : [ { - "genes" : ["CD4+"], + "genes" : ["CD4+", "CD8A-", "CD8B-"], "weight" : 0.3, - "comment" : "key markers that do not express heavily in droplet-based RNA-Seq" + "comments" : "Must be CD4 T" + }, + { + "genes" : ["GNLY+", "AGAP1+", "ZNF683+", "RGS9+", "IL5RA+", "LAIR2+", "MTERF2+", "SH3RF2+", "RGS17+"], + "weight" : 0.7, + "comments" : "CD4 CTL markers that might also be expressed by CD8 TEM" } ] }, { - "name" : "Naive T cell", + "name" : "T follicular helper cell", "markers" : [ { - "genes" : ["CCR7+", "SELL+", "IL7R+", "TCF7+", "CD27+"], - "weight" : 0.7, - "comment" : "positive markers" - }, + "genes" : ["CD4+", "ST8SIA1+", "PDCD1+", "TIGIT+", "TOX2+", "ICOS+", "SH2D1A+", "IL21+"], + "weight" : 1.0, + "comments" : "Tfh markers" + } + ] + }, + { + "name" : "CD8 Naive T cell", + "markers" : [ { - "genes" : ["IL2RA-", "CD44-", "CD69-"], - "weight" : 0.3, - "comment" : "negative markers" + "genes" : ["CD8A+", "CD8B+", "CCR7+", "SELL+", "LEF1+", "ACTN1+", "TRABD2A+", "LRRN3+", "LINC02446+", "S100B+", "CLEC11A+", "NELL2+", "PASK+", "APBA2+"], + "weight" : 1.0, + "comment" : "Markers derived from Immune Cell Atlas PBMC data" + } + ] + }, + { + "name" : "CD8 TCM", + "markers" : [ + { + "genes" : ["CD8A+", "CD8B+", "GZMK+", "DUSP2+", "LTB+", "CD27+", "IL7R+", "GPR183+", "RGS1+", "CXCR3+"], + "weight" : 1.0, + "comment" : "Markers derived from Immune Cell Atlas PBMC data; CD8A & CD8B are CD8 markers; All others are CD8 TCM specific markers" + } + ] + }, + { + "name" : "CD8 TEM", + "markers" : [ + { + "genes" : ["CD8A+", "CD8B+", "FGFBP2+", "GZMB+", "GZMH+", "GNLY+", "PRF1+", "KLRD1+", "FCGR3A+", "TBX21+", "CX3CR1+", "ASCL2+", "SPON2+", "ADGRG1+", "PRSS23+"], + "weight" : 1.0, + "comment" : "Markers derived from Immune Cell Atlas PBMC data; FGFBP2, GZMB, GZMH, GNLY, PRF1, KLRD1, FCGR3A are pan TEM markers; TBX21, CX3CR1 and ASCL2 are Temra markers; the last three are purely data driven markers" + } + ] + }, + { + "name" : "MAIT", + "markers" : [ + { + 
"genes" : ["SLC4A10+", "KLRB1+", "NCR3+", "CEBPD+", "GPR65+", "LST1+", "CXCR6+", "TRAV1-2+"], + "weight" : 1.0, + "comment" : "Markers derived from Immune Cell Atlas PBMC data" + } + ] + }, + { + "name" : "Gamma-delta T cell", + "markers" : [ + { + "genes" : ["TRDC+", "TRGC1+", "TRGC2+", "KLRC1+", "KLRD1+", "GNLY+"], + "weight" : 1.0, + "comment" : "Markers derived from Immune Cell Atlas PBMC data" } ] } @@ -73,24 +138,66 @@ }, { - "name" : "B cell", + "name" : "Natural killer cell", "markers" : [ { - "genes" : ["CD19+", "MS4A1+", "CD79A+", "CD79B+"], - "weight" : 0.7, - "comment" : "CD19, CD20 and CD79" + "genes" : ["GNLY+", "KLRF1+", "KLRD1+", "TRDC+", "IL2RB+", "KLRC1+", "NCR1+"], + "weight" : 0.6, + "comment" : "General NK cell markers also cover some T cells; derived from Immune Cell Atlas PBMC, BMMC and CB-MNC data; Added NCR1, a pan NK cell marker" }, { - "genes" : ["BANK1+", "BLK+"], + "genes" : ["NCAM1+", "FCGR3A+"], "weight" : 0.2, - "comment" : "Extra B cell markers" + "comment" : "NK subtype markers" }, { - "genes" : ["CD74+", "HLA-DRA+", "HLA-DRB1+", "HLA-DPA1+", "HLA-DPB1+", "HLA-DQA1+", "HLA-DQB1+"], - "weight" : 0.1, - "comment" : "MHC II" + "genes" : ["CD3D-", "CD3E-", "CD3G-"], + "weight" : 0.2, + "comment" : "No T cell markers" } ], + "subtypes" : { + "title" : "NK cell subtype markers", + "cell_types" : [ + { + "name" : "CD56-dim NK cell", + "markers" : [ + { + "genes" : ["FCGR3A+", "FGFBP2+", "SPON2+", "MYOM2+", "S1PR5+", "CX3CR1+", "AKR1C3+", "FCRL6+", "LAIR2+", "PRSS23+"], + "weight" : 1.0, + "comment" : "Cytotoxic NK cell markers derived from Immune Cell Atlas PBMC, BMMC and CB-MNC data" + } + ] + }, + { + "name" : "CD56-bright NK cell", + "markers" : [ + { + "genes" : ["NCAM1+", "GZMK+", "XCL1+", "SPTSSB+", "CAPG+", "IL7R+", "GPR183+", "IGFBP4+", "SPINK2+", "FUT7+"], + "weight" : 1.0, + "comment" : "Regulatory NK cell markers derived from Immune Cell Atlas PBMC, BMMC and CB-MNC data; CD56bright develops into CD56dim" + } + ] + } + ], + 
"comment": "There is also a CD56_dim CD16_dim population in between the CD56-dim and CD56-bright subtypes." + } + }, + + { + "name" : "B cell", + "markers" : [ + { + "genes" : ["MS4A1+", "CD79A+", "CD79B+", "CD19+", "BANK1+", "TNFRSF13C+", "CD22+", "BLK+", "FCRLA+", "HLA-DOB+"], + "weight" : 0.9, + "comment" : "Human and mouse shared B cell markers derived from Immune Cell Atlas PBMC, BMMC and CB-MNC data; TNFRSF13C (BAFF receptor); CD79A, CD79B, CD19, BLK, FCRLA and HLA-DOB are also expressed in Plasma cells; CD79B in addition is expressed in CD16+ monocytes & HSCs; BANK1 & BLK are expressed higher in memory B" + }, + { + "genes" : ["LINC00926+", "VPREB3+"], + "weight" : 0.1, + "comment" : "B cell markers derived from Immune Cell Atlas PBMC, BMMC and CB-MNC data" + } + ], "subtypes" : { "title" : "B cell subtype markers", "cell_types" : [ @@ -123,9 +230,9 @@ "name" : "Naive B cell", "markers" : [ { - "genes" : ["IGHD+", "TCL1A+", "FCER2+"], + "genes" : ["IGHD+", "TCL1A+", "FCER2+", "IL4R+", "PLPP5+"], "weight" : 1.0, - "comments" : "markers for naive B cell, collected from Fig. 4B of Massoni-Badosa et al. Tonsil Atlas paper. Validated using ICA pbmc data" + "comments" : "Markers for naive B cell derived from Immune Cell Atlas PBMC, BMMC and CB-MNC data; IGHD & FCER2 are shared with mouse" } ] }, @@ -133,9 +240,9 @@ "name" : "Memory B cell", "markers" : [ { - "genes" : ["CD27+", "TNFRSF13B+"], + "genes" : ["IGHA1+", "IGHG1+", "CD27+", "TNFRSF13B+", "CLECL1P+", "AIM2+", "LGALS1+", "CRIP1+"], "weight" : 1.0, - "comments" : "markers for memory B cell, collected from Fig. 4B of Massoni-Badosa et al. Tonsil Atlas paper. 
Validated using ICA pbmc data" + "comments" : "Markers for memory B cell derived from Immune Cell Atlas PBMC, BMMC and CB-MNC data" } ] } @@ -147,19 +254,9 @@ "name" : "Germinal Center B cell", "markers" : [ { - "genes" : ["CD19+", "MS4A1+", "CD79A+", "CD79B+"], - "weight" : 0.3, - "comment" : "CD19, CD20 and CD79" - }, - { - "genes" : ["CD38+", "BCL6+", "BCL2-", "RGS13+"], - "weight" : 0.6, - "comment" : "First 3 markers are from Klein et al. PNAS 2003 https://doi.org/10.1073/pnas.0437996100 (Fig. 1 & 2). The last marker is from XXX" - }, - { - "genes" : ["PCNA+", "MKI67+"], - "weight" : 0.1, - "comment" : "From Klein et al. PNAS 2003 https://doi.org/10.1073/pnas.0437996100 (Fig. 2)" + "genes" : ["MEF2B+", "NEIL1+", "RGS13+", "ELL3+", "BCL7A+", "BCL6+", "NUGGC+", "MYBL1+", "EML6+", "FANCA+"], + "weight" : 1.0, + "comment" : "GC B cell markers" } ], "subtypes" : { @@ -169,9 +266,9 @@ "name" : "Dark zone B cell", "markers" : [ { - "genes" : ["CXCR4+", "AICDA+", "FOXP1+", "MME+"], + "genes" : ["NUSAP1+", "NCAPG+", "AURKB+", "HMGB2+", "HJURP+", "TOP2A+"], "weight" : 1.0, - "comment" : "Fig. 4B of Massoni-Badosa et al. 
Tonsil Atlas paper" + "comment" : "DZ B cell markers" } ] }, @@ -190,38 +287,28 @@ }, { - "name" : "Natural killer cell", + "name" : "Plasma cell", "markers" : [ { - "genes" : ["NCAM1+"], - "weight" : 0.2, - "comment" : "CD56" + "genes" : ["TNFRSF17+", "PRDM1+", "SLAMF7+", "IRF4+", "SDC1+"], + "weight" : 0.5, + "comment" : "Human and mouse shared markers derived from Immune Cell Atlas PBMC, BMMC and CB-MNC data; TNFRSF17 (BCMA), PRDM1 (BLIMP1); SDC1 is highly expressed in BMMC but not PBMC" }, { - "genes" : ["NKG7+"], + "genes" : ["IGHA1+", "IGHG1+", "TNFRSF13B+"], "weight" : 0.2, - "comment" : "natural killer cell granule protein 7" - }, - { - "genes" : ["KLRB1+", "KLRD1+", "KLRF1+", "KLRC1+", "KLRC2+", "KLRC3+", "KLRC4+"], - "weight" : 0.25, - "comment" : "killer cell lectin like receptors" - }, - { - "genes" : ["CD3D-", "CD3E-", "CD3G-"], - "weight" : 0.15, - "comment" : "not T cell" + "comment" : "Markers expressed by both plasma and memory B cells, derived from Immune Cell Atlas PBMC, BMMC and CB-MNC data; IGHA1 & IGHG1 indicate class switch" }, { - "genes" : ["FCGR3A+"], - "weight" : 0.1, - "comment" : "CD16a" + "genes" : ["CD38+", "ABCB9+", "CHPF+", "PLAAT2+"], + "weight" : 0.2, + "comment" : "Human-specific plasma markers derived from Immune Cell Atlas PBMC, BMMC and CB-MNC data; PLAAT2 is highly expressed in PBMC but not BMMC" }, { - "genes" : ["ITGAL+", "ITGAM+"], + "genes" : ["MS4A1-"], "weight" : 0.1, - "comment" : "CD11a,CD11b" - } + "comment" : "not B cell, doi: https://doi.org/10.1182/bloodadvances.2017004481, long-live plasma can still express CD19" + } ] }, @@ -270,42 +357,40 @@ }, { - "name" : "Plasmacytoid dendritic cell", + "name" : "Migratory dendritic cell", "markers" : [ { - "genes" : ["JCHAIN+", "LILRA4+", "GZMB+", "MZB1+", "IL3RA+", "SERPINF1+", "ITM2C+", "IRF7+"], + "genes" : ["FSCN1+", "CCR7+", "LAMP3+", "CCL19+", "CCL22+", "CD40+", "BIRC3+"], "weight" : 1.0, - "comment" : "important pDC markers" + "comment" : "Xing et al. 
Science Advances 2021 Table S2 (DCs-C3)" } ] }, { - "name" : "Plasma cell", + "name" : "Plasmacytoid dendritic cell", "markers" : [ { - "genes" : ["CD38+", "XBP1+", "CD27+", "SLAMF7+"], - "weight" : 0.4, - "comment" : "important markers" - }, - { - "genes" : ["TNFRSF17+", "TNFRSF13B+"], - "weight" : 0.2, - "comment" : "TNF-receptor superfamily" - }, - { - "genes" : ["IGHA1+", "IGHG1+"], - "weight" : 0.2, - "comment" : "class switching happened" - }, + "genes" : ["LILRA4+", "SERPINF1+", "IL3RA+", "TPM2+", "SCT+", "UGCG+", "CLEC4C+", "LRRC26+", "SMPD3+", "AC119428.2+"], + "weight" : 1.0, + "comment" : "Markers derived from Immune Cell Atlas PBMC, BM and CB data" + } + ] + }, + + + { + "name" : "Follicular dendritic cell", + "markers" : [ { - "genes" : ["MS4A1-"], - "weight" : 0.2, - "comment" : "not B cell, doi: https://doi.org/10.1182/bloodadvances.2017004481, long-live plasma can still express CD19" - } + "genes" : ["CXCL13+", "FCAMR+", "FDCSP+", "SERPINE2+", "PAPPA+", "NPHS1+", "PKDCC+", "SYNM+", "NRG2+", "CDC42EP4+", "MUC3A+", "PRUNE2+", "B4GALNT4+", "NPPC+", "SLC1A2+", "TMEM150C+"], + "weight" : 1.0, + "comment" : "fDC markers" + } ] }, + { "name" : "Hematopoietic stem cell", "markers" : [ @@ -318,7 +403,7 @@ }, { - "name" : "Erythroid cells", + "name" : "Erythroid cell", "markers" : [ { "genes" : ["GYPA+"], @@ -344,7 +429,7 @@ }, { - "name" : "Megakaryocyte", + "name" : "Platelet", "markers" : [ { "genes" : ["PF4+", "PPBP+", "GP5+"], @@ -359,13 +444,46 @@ ] }, + { + "name" : "Pro-Neutrophil", + "markers" : [ + { + "genes" : ["DEFA3+", "DEFA4+", "AZU1+", "MS4A3+", "ELANE+", "SLPI+", "CEACAM6+", "RNASE3+", "PRTN3+", "MPO+", "AC104232.1+", "CTSG+"], + "weight" : 1.0, + "comment" : "Pro-Neutrophil markers validated using 10x public whole blood dataset" + } + ] + }, + + { + "name" : "Pre-Neutrophil", + "markers" : [ + { + "genes" : ["LTF+", "LCN2+", "MMP8+", "CRISP3+", "CAMP+", "PGLYRP1+", "CD177+", "HP+"], + "weight" : 1.0, + "comment" : "Pre-Neutrophil markers 
validated using 10x public whole blood dataset" + } + ] + }, + { "name" : "Neutrophil", "markers" : [ { - "genes" : ["FUT4+", "MPO+", "CEACAM8+", "ELANE+", "CXCR1+", "CXCR2+", "LY6G6D+"], + "genes" : ["CSF3R+", "G0S2+", "LUCAT1+", "EPHB1+", "TNFRSF10C+", "IL1R2+", "KCNJ15+", "FCGR3B+", "AC007032.1+", "HSD11B1-AS1+"], "weight" : 1.0, - "comment" : "key markers" + "comment" : "Neutrophil markers validated using 10x public whole blood dataset" + } + ] + }, + + { + "name" : "Basophil", + "markers" : [ + { + "genes" : ["AKAP12+", "HDC+", "GATA2+", "ENPP3+", "CA8+", "ITGB8+", "GCSAML+", "CRPPA+", "AC111000.4+", "LINC02223+"], + "weight" : 1.0, + "comment" : "Basophil markers validated using 10x public whole blood dataset" } ] }, diff --git a/pegasus/annotate_cluster/human_lung_cell_markers.json b/pegasus/annotate_cluster/human_lung_cell_markers.json index 4e18b1ad..42cc1eb8 100644 --- a/pegasus/annotate_cluster/human_lung_cell_markers.json +++ b/pegasus/annotate_cluster/human_lung_cell_markers.json @@ -5,9 +5,9 @@ "name" : "Alveolar type I cell", "markers" : [ { - "genes" : ["AGER+", "CAV1+", "RTKN2+", "MYL9+", "SPOCK2+", "ANXA3+", "TIMP3+", "CAV2+", "ST6GALNAC5+", "MYRF+"], + "genes" : ["AGER+", "SPOCK2+", "RTKN2+", "TNNC1+", "SCEL+", "CLIC5+", "NCKAP5+", "ARHGEF26+", "GGTLC1+", "ITLN2+", "MS4A15+"], "weight" : 1.0, - "comment" : "AT1 markers from Schupp et al., Travaglini et al. and Tony et al." + "comment" : "Markers inferred from Travaglini et al. Nature 2020 and confirmed by Schupp et al. Circulation 2021" } ] }, @@ -16,9 +16,9 @@ "name" : "Alveolar type II cell", "markers" : [ { - "genes" : ["SFTPA1+", "SFTPA2+", "SFTPC+", "ETV5+", "TTN+", "PLA2G4F+", "CCDC141+", "LAMP3+", "ABCA3+", "HHIP+"], + "genes" : ["SFTPA1+", "SFTPA2+", "SFTPC+", "PGC+", "LAMP3+", "FASN+", "HHIP+", "ETV5+", "RASGRF1+", "ABCA3+"], "weight" : 1.0, - "comment" : "AT2 markers from Schupp et al., Travaglini et al. and Tony et al." + "comment" : "Markers inferred from Travaglini et al. 
Nature 2020 and confirmed by Schupp et al. Circulation 2021" } ] }, @@ -27,9 +27,9 @@ "name" : "Basal cell", "markers" : [ { - "genes" : ["KRT5+", "KRT15+", "KRT17+", "TP63+", "S100A2+", "TNS4+"], + "genes" : ["KRT17+", "S100A2+", "MIR205HG+", "KRT15+", "KRT5+", "DLK2+", "CDH3+", "TP63+", "TNS4+"], "weight" : 1.0, - "comment" : "Basal cell markers from Schupp et al., Travaglini et al. and Tony et al." + "comment" : "Markers inferred from Travaglini et al. Nature 2020 and confirmed by Schupp et al. Circulation 2021" } ] }, @@ -38,9 +38,9 @@ "name" : "Club cell", "markers" : [ { - "genes" : ["SCGB3A2+", "MGP+", "VIM+", "CST3+"], + "genes" : ["SCGB3A2+", "MGP+", "CTSE+"], "weight" : 1.0, - "comment" : "Club cell markers from Schupp et al., Travaglini et al. and Tony et al." + "comment" : "Markers inferred from Travaglini et al. Nature 2020 and confirmed by Schupp et al. Circulation 2021" } ] }, @@ -49,9 +49,9 @@ "name" : "Ciliated cell", "markers" : [ { - "genes" : ["ERICH3+", "SNTN+", "CCDC78+", "SNTN+", "ZBBX+", "DNAI1+", "ARMC3+", "CFAP157+", "TTC29+", "CFAP73+"], + "genes" : ["ERICH3+", "ARMC3+", "DNAI2+", "ZBBX+", "VWA3B+", "RGS22+", "TTC29+", "CDHR4+", "PPP1R42+", "CFAP46+", "CFAP52+", "CFAP73+", "CFAP77+", "CFAP157+", "DNAH3+", "DNAH9+", "ADGB+", "SNTN+", "CCDC170+", "C6orf118+"], "weight" : 1.0, - "comment" : "Ciliated cell markers from Schupp et al., Travaglini et al. and Tony et al." + "comment" : "Markers inferred from Travaglini et al. Nature 2020 and confirmed by Schupp et al. Circulation 2021" } ] }, @@ -60,9 +60,9 @@ "name" : "Goblet cell", "markers" : [ { - "genes" : ["MUC5AC+", "MUC5B+", "BPIFB1+", "MSMB+", "FAM3D+", "SERPINB11+", "CXCL6+", "SCGB1A1+", "FAM3D+", "SERPINB3+"], + "genes" : ["MUC5AC+", "MUC5B+", "BPIFB1+", "MSMB+", "SERPINB11+", "CYP2F1+"], "weight" : 1.0, - "comment" : "Goblet cell markers from Schupp et al., Travaglini et al. and Tony et al." + "comment" : "Markers inferred from Travaglini et al. 
Nature 2020 and confirmed by Schupp et al. Circulation 2021" } ] }, @@ -71,9 +71,9 @@ "name" : "Ionocyte", "markers" : [ { - "genes" : ["FOXI1+", "ASCL3+", "CLDN25+", "ATP6V1G3+", "LINC01187+"], + "genes" : ["ASCL3+", "CLCNKB+", "FOXI1+", "ATP6V1G3+", "TMPRSS11E+", "BSND+", "LINC01187+", "CLDN25+"], "weight" : 1.0, - "comment" : "Ionocyte markers from Travaglini et al." + "comment" : "Markers inferred from Travaglini et al. Nature 2020 and confirmed by Schupp et al. Circulation 2021" } ] }, @@ -82,9 +82,20 @@ "name" : "Plumonary neuroendocrine cell", "markers" : [ { - "genes" : ["CALCA+", "CHGA+", "ASCL1+", "SLC35D3+", "KIF1A+"], + "genes" : ["CHGA+", "CHGB+", "SCGN+", "SCG5+", "CPLX2+", "GRP+", "ASCL1+", "INSM1+"], "weight" : 1.0, - "comment" : "Plumonary neuroendocrien cell markers from Travaglini et al." + "comment" : "Markers inferred from Travaglini et al. Nature 2020 and confirmed by Schupp et al. Circulation 2021" + } + ] + }, + + { + "name" : "Submucosal gland serous cell", + "markers" : [ + { + "genes" : ["PRR4+", "TCN1+", "C6orf58+", "PRB3+", "LPO+", "PRB1+", "PRH2+", "PRH1+", "ODAM+"], + "weight" : 1.0, + "comment" : "Markers inferred from Travaglini et al. Nature 2020" } ] }, @@ -102,42 +113,35 @@ - - { "name" : "Vascular endothelial cell", "markers" : [ { - "genes" : ["PECAM1+", "CLDN5+", "CDH5+", "ERG+"], - "weight" : 0.2, - "comment" : "Markers for endothelial cells, from Schupp et al. Circulation 2021" - }, - { - "genes" : [ "ENG+", "PCDH17+", "CLEC14A+", "ESAM+", "ITM2A+", "BMPR2+", "FLT1+", "ADGRL4+", "SLCO2A1+", "AQP1+", "EPAS1+", "ADGRL2+", "IFI27+"], - "weight" : 0.8, - "comment" : "Common vascular EC markers from Schupp et al. Circulation 2021 and ADGRL2" + "genes" : ["PECAM1+", "CLDN5+", "CDH5+", "ERG+", "ICAM2+", "CLEC14A+", "ITM2A+", "ADGRL4+", "SLCO2A1+", "IFI27+"], + "weight" : 1.0, + "comment" : "Markers for vascular endothelial cells, validated using Travaglini et al. Nature 2020 and Schupp et al. 
Circulation 2021 data" } ], "subtypes" : { "title" : "Vascular endothelial cell subtype markers", "cell_types" : [ { - "name" : "Aerocyte", + "name" : "EC artery", "markers" : [ { - "genes" : ["EDNRB+", "TBX2+", "EDA+", "HPGD+", "PRKG1+", "RCSD1+", "CYP3A5+", "VWF-"], + "genes" : ["CXCL12+", "GJA5+", "DKK2+", "HEY1+", "IGFBP3+", "SERPINE2+", "EFNB2+", "BMX+"], "weight" : 1.0, - "comment" : "Markers inferred from Xing et al. Science Advances 2021, Tony et al. Nature 2021 and Schupp et al. Circulation 2021" + "comment" : "Markers inferred from Travaglini et al. Nature 2020 and confirmed by Schupp et al. Circulation 2021" } ] }, { - "name" : "EC general capillary", + "name" : "EC vein", "markers" : [ { - "genes" : ["VWF+", "EDN1+", "FCN3+", "CD36+", "GPIHBP1+", "NRXN3+", "BTNL8+"], + "genes" : ["CPE+", "C7+", "IL1R1+", "PLA1A+", "PTGIS+", "ABI3BP+", "CYP1B1+", "ADGRG6+"], "weight" : 1.0, - "comment" : "Markers inferred from Xing et al. Science Advances 2021, Tony et al. Nature 2021 and Schupp et al. Circulation 2021" + "comments" : "Markers inferred from Travaglini et al. Nature 2020 and confirmed by Schupp et al. Circulation 2021" } ] }, @@ -145,79 +149,73 @@ "name" : "EC bronchial vessel", "markers" : [ { - "genes" : ["SPRY1+", "PLVAP+", "VWA1+", "MPZL2+", "ESM1+"], + "genes" : ["SPRY1+", "PLVAP+", "VWA1+", "ABCB1+", "COL15A1+", "RUNDC3B+"], "weight" : 1.0, - "comment" : "Markers from Travaglini et al. Nature 2020" + "comment" : "Markers inferred from Travaglini et al. Nature 2020 and confirmed by Schupp et al. Circulation 2021" } ] }, { - "name" : "EC pulmonary-venous", + "name" : "Aerocyte", "markers" : [ { - "genes" : ["COL15A1+", "ZNF385D+", "EBF1+", "CPXM2+", "PLVAP+", "VWA1+", "SPRY1+"], + "genes" : ["HPGD+", "EDNRB+", "SOSTDC1+", "B3GALNT1+", "CYP3A5+", "TBX2+", "S100A3+", "IL1RL1+", "PRKG1+", "EXPH5+"], "weight" : 1.0, - "comments" : "Markers inferred from Xing et al. Science Advances 2021, Tony et al. Nature 2021 and Schupp et al. 
Circulation 2021" + "comment" : "Markers inferred from Travaglini et al. Nature 2020 and confirmed by Schupp et al. Circulation 2021" } ] }, { - "name" : "EC systemic-venous", + "name" : "EC general capillary", "markers" : [ { - "genes" : ["COL15A1-", "CPE+", "DKK3+", "EFEMP1+", "CDH11+", "PLAT+"], + "genes" : ["FCN3+", "IL7R+", "EDN1+", "GPIHBP1+", "SLC6A4+", "NTRK2+", "IL18R1+", "NRXN3+"], "weight" : 1.0, - "comments" : "Markers inferred from Xing et al. Science Advances 2021, Tony et al. Nature 2021 and Schupp et al. Circulation 2021" + "comment" : "Markers inferred from Travaglini et al. Nature 2020 and confirmed by Schupp et al. Circulation 2021" } ] } ] } }, + { "name" : "Lymphatic endothelial cell", "markers" : [ { - "genes" : ["PECAM1+", "CLDN5+", "CDH5+", "ERG+"], + "genes" : ["PECAM1+", "CLDN5+", "ERG+", "CDH5+"], "weight" : 0.2, - "comment" : "Markers for endothelial cells, from Schupp et al. Circulation 2021" + "comment" : "Pan endothelial cell markers, validated using Travaglini et al. Nature 2020 and Schupp et al. Circulation 2021 data" }, { - "genes" : ["CCL21+", "SEMA3D+", "PROX1+", "PDPN+", "MMRN1+", "RELN+", "PKHD1L1+", "TFF3+", "LYVE1+", "FLT4+", "TBX1+"], + "genes" : ["CCL21+", "TFF3+", "PDPN+", "PROX1+", "LYVE1+", "FLT4+", "GPM6A+", "SEMA3D+", "TBX1+", "RELN+"], "weight" : 0.8, - "comment" : "Lymphatic-specific markers, from Schupp et al. Circulation 2021" + "comment" : "LEC markers inferred from Travaglini et al. Nature 2020 and confirmed by Schupp et al. Circulation 2021" } ] }, - - { "name" : "Smooth muscle cell", "markers" : [ { - "genes" : ["MYH11+", "TAGLN+", "ACTG2+", "CNN1+", "PLN+"], - "weight" : 0.8, - "comment" : "Markers from Muus et al., Braga et al. and Schupp et al." 
- }, - { - "genes" : ["MYL9+", "TPM2+", "ACTA2+"], - "weight" : 0.2, - "comment" : "Markers that might also expressed in other stromal cell types" + "genes" : ["MYH11+", "ACTG2+", "CNN1+", "PLN+"], + "weight" : 1.0, + "comment" : "Markers inferred from Travaglini et al. Nature 2020 and confirmed by Schupp et al. Circulation 2021" } ], "subtypes" : { "title" : "SMC subtype markers", "cell_types" : [ { - "name" : "DES+", + "name" : "Airway smooth muscle cell", "markers" : [ { - "genes" : ["DES+"], + "genes" : ["DES+", "TNNT2+", "RERGL+"], "weight" : 1.0, - "comment" : "DES+ SMC" + "comment" : "Markers inferred from Travaglini et al. Nature 2020" } ] } @@ -229,21 +227,10 @@ "name" : "Pericyte", "markers" : [ { - "genes" : ["TRPC6+", "CSPG4+", "FAM162B+", "GJA4+", "GJC1+", "HIGD1B+", "CDH6+", "LAMC3+", "FHL5+"], - "weight" : 0.8, - "comment" : "Markers from Schupp et al. Circulation 2021 and Travaglini et al. Nature 2020" - }, - { - "genes" : ["PDGFRB+", "TBX2+", "EBF1+"], - "weight" : 0.1, - "comment" : "Markers that are highly expressed in Pericytes but also expressed in fibroblast" - }, - { - "genes" : [ "LGI4+", "KCNK17+", "CACNA1H+", "PTN+", "TESC+"], - "weight" : 0.1, - "comment" : "Markers that are lowly expressed" + "genes" : ["COX4I2+", "HIGD1B+", "NDUFA4L2+", "FAM162B+", "LAMC3+", "KCNK3+", "GJA4+", "GJC1+", "CSPG4+"], + "weight" : 1.0, + "comment" : "Markers inferred from Travaglini et al. Nature 2020 and confirmed by Schupp et al. Circulation 2021" } - ] }, @@ -251,15 +238,15 @@ "name" : "Mesothelial cell", "markers" : [ { - "genes" : ["WT1+", "VIPR2+", "ITLN1+", "LINC02360+", "BNC1+", "AP000561.1+", "CALB2+", "HAS1+", "LINC01133+", "GALNT9+"], + "genes" : ["CPA4+", "ITLN1+", "GALNT9+", "BNC1+", "CALB2+", "WT1+", "UPK3B+"], "weight" : 1.0, - "comment" : "Markers from Schupp et al. and Travaglini et al." + "comment" : "Markers inferred from Travaglini et al. Nature 2020 and confirmed by Schupp et al. 
Circulation 2021" } ] }, { - "name" : "Fibroblast/Myofibroblast", + "name" : "Fibroblast", "markers" : [ { "genes" : ["COL1A1+", "COL1A2+", "PDGFRA+", "ELN+", "BGN+"], @@ -268,15 +255,15 @@ } ], "subtypes" : { - "title" : "Fibro/Myofib subtype markers", + "title" : "Fibroblast subtype markers", "cell_types" : [ { "name" : "Adventitial fibroblast", "markers" : [ { - "genes" : ["PTGIS+", "SFRP2+", "PDGFRL+", "SCARA5+", "MFAP5+", "PI16+", "AOX1+", "GAS1+", "IGFBP6+", "CXCL14+"], + "genes" : ["SFRP2+", "SFRP4+", "PDGFRL+", "PI16+", "MFAP5+", "SCARA5+"], "weight" : 1.0, - "comment" : "Markers from Schupp et al. and Travaglini et al." + "comment" : "Markers inferred from Travaglini et al. Nature 2020 and confirmed by Schupp et al. Circulation 2021" } ] }, @@ -284,24 +271,46 @@ "name" : "Alveolar fibroblast", "markers" : [ { - "genes" : ["NKD1+", "FGFR4+", "GPM6B+", "SPINT2+", "SCN7A+", "TCF21+", "CAMK2N1+", "ADAMTS8+"], + "genes" : ["GPC3+", "FMO2+", "SCN7A+", "FGFR4+", "NKD2+", "ADAMTS8+"], "weight" : 1.0, - "comment" : "Markers from Schupp et al. and Travaglini et al." + "comment" : "Markers inferred from Travaglini et al. Nature 2020 and confirmed by Schupp et al. Circulation 2021" } ] }, { - "name" : "Myofibroblast", + "name" : "Lipofibroblast", "markers" : [ { - "genes" : ["ACTA2+", "MYL9+", "MT2A+", "EEF1A1+", "TMSB10+", "FAU+", "UBA52+", "SERF2+", "PTMA+", "S100A6+"], + "genes" : ["MLLT11+", "HAS2+", "SEMA6A+", "LONRF2+", "HOMER1+", "PWWP3B+"], "weight" : 1.0, - "comment" : "Markers from Schupp et al. and Travaglini et al." + "comment" : "Markers inferred from Travaglini et al. Nature 2020" } ] } ] } + }, + + { + "name" : "Myofibroblast", + "markers" : [ + { + "genes" : ["ASPN+", "SCARA3+", "WIF1+", "ANGPTL2+", "ITGBL1+"], + "weight" : 1.0, + "comment" : "Markers inferred from Travaglini et al. 
Nature 2020" + } + ] + }, + + { + "name" : "Fibromyocyte", + "markers" : [ + { + "genes" : ["SBSPON+", "SCX+", "GREM2+", "KCNMB1+", "LGR6+"], + "weight" : 1.0, + "comment" : "Markers inferred from Travaglini et al. Nature 2020" + } + ] } ] } diff --git a/pegasus/annotate_cluster/mouse_brain_cell_markers.json b/pegasus/annotate_cluster/mouse_brain_cell_markers.json index ebfc0eac..1bdfc86c 100644 --- a/pegasus/annotate_cluster/mouse_brain_cell_markers.json +++ b/pegasus/annotate_cluster/mouse_brain_cell_markers.json @@ -1,5 +1,6 @@ { "title" : "Mouse brain cell markers", + "comments": "Yao et al. Nature 2021 Allen Mouse Brain Map is a great resource for markers; Map: https://celltypes.brain-map.org/rnaseq/mouse_ctx-hpf_10x?selectedVisualization=Heatmap&colorByFeature=Cell+Type&colorByFeatureValue=Gad1; Cell type metadata: https://brainpalmseq.med.ubc.ca/brain-regions/neocortex-allen-brain-atlas-rnaseq/search-allen-brain-map-by-all-cell-types/; Extended Data Fig 2 & Supp Table 1 of Zhang et al. Nature 2021 is also used in marker selection", "cell_types" : [ { "name" : "Glutamatergic neuron", @@ -168,28 +169,60 @@ "name" : "Oligodendrocyte", "markers" : [ { - "genes" : ["Mbp+", "Plp1+"], - "weight" : 0.6, - "comment" : "Oligo specific markers (Allen Brain Map)" - }, - { - "genes" : ["Mog+"], - "weight" : 0.15, - "comment" : "Oligo specific markers, but not expressed in all Oligo cells (Allen Brain Map)" + "genes" : ["Plp1+", "Cnp+", "Fa2h+", "St18+", "Mbp+"], + "weight" : 0.8, + "comment" : "Oligo specific markers from Yao et al. 
Nature 2021 (Allen Brain Map)" }, { "genes" : ["Olig1+", "Olig2+", "Sox10+"], - "weight" : 0.25, + "weight" : 0.2, "comment" : "Expressed in both Oligo and OPC (Allen Brain Map)" } - ] + ], + "subtypes" : { + "title" : "Oligodendrocyte subtype markers", + "cell_types" : [ + { + "name" : "Opalin+ Oligodendrocyte", + "markers" : [ + { + "genes" : ["Opalin+", "Mog+", "Plekhh1+", "Ermn+"], + "weight" : 1.0, + "comment": "Opalin+ markers from Yao et al. Nature 2021" + } + ] + }, + { + "name" : "Enpp6+ Oligodendrocyte", + "markers" : [ + { + "genes" : ["Enpp6+", "Pik3r3+", "Cnksr3+", "Parvb+", "Dusp15+"], + "weight" : 1.0, + "comment": "Enpp6+ markers from Yao et al. Nature 2021" + } + ] + }, + { + "name" : "Neu4+ Oligodendrocyte", + "markers" : [ + { + "genes" : ["Neu4+"], + "weight" : 1.0, + "comment": "Neu4+ markers from Yao et al. Nature 2021" + } + ] + } + + ] + } }, { "name" : "OPC", "markers" : [ { - "genes" : ["Pdgfra+", "Cspg4+"], - "weight" : 1.0 + "genes" : ["Pdgfra+", "Cspg4+", "Emid1+", "Fabp7+"], + "weight" : 1.0, + "comment": "Oligodendrocyte progenitor cell markers from Yao et al. Nature 2021" } ] }, @@ -197,71 +230,94 @@ "name" : "Astrocyte", "markers" : [ { - "genes" : ["Aqp4+", "Gja1+", "F3+", "Prex2+"], - "weight" : 1.0 + "genes" : ["Mt2+", "Gja1+", "Prdx6+", "Htra1+", "Ntsr2+", "Aldoc+", "Apoe+", "Prex2+", "Aqp4+", "Gpr37l1+"], + "weight" : 1.0, + "comment": "Astrocyte markers from Yao et al. Nature 2021" } - ] + ], + "subtypes" : { + "title" : "Astrocyte subtype markers", + "cell_types" : [ + { + "name" : "Gfap+ Astrocyte", + "markers" : [ + { + "genes" : ["Gfap+", "Aqp4+", "Tmem47+", "Id4+", "Mlc1+", "Sdc4+", "Gstm1+"], + "weight" : 1.0, + "comment": "Gfap+ markers from Yao et al. Nature 2021" + } + ] + }, + { + "name" : "Slc7a10+ Astrocyte", + "markers" : [ + { + "genes" : ["Slc7a10+", "Grm3+", "Trpm3+", "Phkg1+", "Cdh10+", "Luzp2+", "Gria2+", "Slc6a1+"], + "weight" : 1.0, + "comment": "Slc7a10+ markers from Yao et al. 
Nature 2021" + } + ] + } + ] + } }, { "name" : "Microglia", "markers" : [ { - "genes" : ["C1qb+", "P2ry12+", "Ctss+", "Csf1r+", "Hmha1+"], - "weight" : 1.0 + "genes" : ["Hexb+", "Siglech+", "Selplg+", "Tmem119+", "Ctss+", "P2ry12+", "Cx3cr1+", "Trem2+", "Fcrls+", "Csf1r+"], + "weight" : 1.0, + "comment": "Microglia specific markers from Yao et al. Nature 2021" } ] }, { - "name" : "Endothelial", - "markers" : [ - { - "genes" : ["Flt1+", "Dcn+", "Xdh+", "Id1+"], - "weight" : 1.0 - } - ] - }, - { - "name" : "Fibroblast", + "name" : "Perivascular macrophage", "markers" : [ { - "genes" : ["Igfbp1+", "Dcn+"], - "weight" : 1.0 + "genes" : ["Mrc1+", "Stab1+", "Lyz2+", "Ms4a6c+", "F13a1+", "Pf4+"], + "weight" : 1.0, + "comment": "PVM specific markers from Yao et al. Nature 2021" } ] }, { - "name" : "Mural", + "name" : "Endothelial cell", "markers" : [ { - "genes" : ["Rgs5+", "Acta2+"], - "weight" : 1.0 + "genes" : ["Flt1+", "Pecam1+", "Ly6a+", "Slco1a4+", "Mecom+", "Ptprb+", "Id1+"], + "weight" : 1.0, + "comment" : "Endo specific markers from Yao et al. Nature 2021; Slco1a4 is specific to mouse brain: see https://journals.plos.org/plosone/article/figures?id=10.1371/journal.pone.0013741" } - ] + ] }, { - "name" : "Choroid Coch", + "name" : "Vascular leptomeningeal cell", "markers" : [ { - "genes" : ["Tgfbi+"], - "weight" : 1.0 + "genes" : ["Slc7a11+", "Slc6a13+", "Bmp6+", "Igfbp2+", "Fmod+", "Ranbp3l+"], + "weight" : 1.0, + "comment" : "VLMC specific markers from Yao et al. Nature 2021" } - ] + ] }, { - "name" : "Ependyma", + "name" : "Smooth muscle cell", "markers" : [ { - "genes" : ["Ccdc153+"], - "weight" : 1.0 + "genes" : ["Acta2+", "Myh11+", "Tagln+", "Pln+", "Mylk+"], + "weight" : 1.0, + "comment" : "SMC specific markers from Yao et al. 
Nature 2021" } ] }, { - "name" : "Smooth muscle cell", + "name" : "Pericyte", "markers" : [ { - "genes" : ["Vtn+", "Colec12+"], - "weight" : 1.0 + "genes" : ["Vtn+", "Atp13a5+", "Abcc9+", "Kcnj8+", "Art3+"], + "weight" : 1.0, + "comment" : "Pericyte specific markers from Yao et al. Nature 2021" } ] } diff --git a/pegasus/annotate_cluster/mouse_immune_cell_markers.json b/pegasus/annotate_cluster/mouse_immune_cell_markers.json index 64e0fd8a..9b9095eb 100644 --- a/pegasus/annotate_cluster/mouse_immune_cell_markers.json +++ b/pegasus/annotate_cluster/mouse_immune_cell_markers.json @@ -5,8 +5,9 @@ "name" : "T cell", "markers" : [ { - "genes" : ["Cd28+", "Cd3d+", "Cd3e+", "Cd4+", "Cd8a+"], - "weight" : 1.0 + "genes" : ["Cd3d+", "Cd3e+", "Lat+", "Thy1+", "Lef1+", "Trac+", "Cd28+"], + "weight" : 1.0, + "comment" : "T cell markers inferred from Kaptein et al. Cell 2022 and Hurskainen et al. Nat. Commun. 2021" } ], "subtypes" : { @@ -51,47 +52,245 @@ ] } }, + { - "name" : "Monocyte", + "name" : "Immature B cell", "markers" : [ { - "genes" : ["Lyz2+", "Lyz1+", "S100a4+", "Itgam+"], - "weight" : 0.8 + "genes" : ["Tifa+", "Cecr2+", "Rag1+", "Atp1b1+", "Myb+", "Irf4+", "Fam129c+"], + "weight" : 1.0, + "comment" : "Immature B cell markers from Hurskainen et al. Nat. Commun. 2021" + } + ] + }, + + { + "name" : "B cell", + "markers" : [ + { + "genes" : ["Cd79a+", "Cd79b+", "Ms4a1+", "Cd19+", "H2-Ob+", "Tnfrsf13c+", "Bank1+", "Blk+", "Fcrla+", "Cd22+"], + "weight" : 0.91, + "comment" : "Human and mouse shared B cell markers; validated using Tabula Muris marrow (Tabula Muris Consortium et al. Nature 2020), Kaptein et al. Cell 2022 and Hurskainen et al. Nat. Commun. 2021 data; Ebf1, Pax5 and Fcmr are good markers for mouse lung and liver but not marrow, tissue-specific marker?" 
+ }, + { + "genes" : ["Cxcr5+"], + "weight" : 0.09, + "comment" : "CXCR5 is constantly expressed by mature B cells and helps to guide B cells to follicle; fDC expresses CXCL13, the ligand for CXCR5; this marker expresses lowly in human but higher in mouse " + } + ], + "subtypes" : { + "title" : "B cell subtype markers", + "cell_types" : [ + { + "name" : "Naive B cell", + "markers" : [ + { + "genes" : ["Ighd+", "Fcer2a+", "Vpreb3+", "Fcrl1+", "Chchd10+"], + "weight" : 1.0, + "comments" : "Markers for naive B cell derived from Tabula Muris marrow (Tabula Muris Consortium et al. Nature 2020) & Kaptein et al. Cell 2022; Ighd & Fcer2a are shared with human" + } + ] + }, + { + "name" : "Memory B cell", + "markers" : [ + { + "genes" : ["Zbtb32+", "C130026I21Rik+", "Pdlim1+", "Hepacam2+", "Igha+"], + "weight" : 0.8, + "comments" : "Markers for memory B cell derived from Tabula Muris marrow (Tabula Muris Consortium et al. Nature 2020) data; need to check and add Ighg related genes" + }, + { + "genes" : ["Nt5e+", "Cd80+", "Fas+", "Pdcd1lg2+"], + "weight" : 0.2, + "comments" : "Traditional mouse memory B cell validated by Tabula Muris marrow (Tabula Muris Consortium et al. Nature 2020) data; all lowly expressed; Nt5e (5' Nucleotidase/CD73), Fas (CD95), Pdcd1lg2 (PD-L2/CD273)" + } + ] + } + ] + } + }, + + { + "name" : "Plasma cell", + "markers" : [ + { + "genes" : ["Sdc1+", "Slamf7+", "Tnfrsf17+", "Irf4+", "Prdm1+"], + "weight" : 0.5, + "comment" : "Plasma cell markers shared with human and validated using Tabula Muris marrow data (Tabula Muris Consortium et al. Nature 2020)" }, { - "genes" : ["C1qb+", "C1qc+", "Mrc1+", "Cd52+"], - "weight" : 0.2 + "genes" : ["Derl3+", "Chst1+", "Eaf2+", "Oosp1+", "Cacna1s+"], + "weight" : 0.4, + "comment" : "Mouse-specific plasma cell markers validated using Tabula Muris marrow data (Tabula Muris Consortium et al. 
Nature 2020)" + }, + { + "genes" : ["Xbp1+", "Slc3a2+", "Ly6k+"], + "weight" : 0.1, + "comment" : "Traditional mouse plasma markers (not ideal) validated using Tabula Muris marrow data (Tabula Muris Consortium et al. Nature 2020); Xbp1 & Slc3a2 (CD98) expressed highest in plasma but also expressed in other cell types" } ] }, + { - "name" : "B cell", + "name" : "Natural killer cell", "markers" : [ { - "genes" : ["Cd19+", "Cd79b+", "Cd74+", "Igkc+", "Ighm+", "Iglc2+", "Ms4a1+"], - "weight" : 1.0 + "genes" : ["Gzma+", "Klrb1c+", "Ncr1+", "Klre1+", "Klrc2+"], + "weight" : 0.6, + "comment" : "NK & ILC1 shared markers from Hurskainen et al. Nat. Commun. 2021 data and confirmed on Kaptein et al. Cell 2022 data" + }, + { + "genes" : ["Eomes+", "Cma1+", "Klra4+", "Klra7+", "Klra8+"], + "weight" : 0.4, + "comment" : "NK cell specific markers (compared to ILC1) from Kaptein et al. Cell 2022; these markers do not have high expressions in Hurskainen et al. Nat. Commun. 2021 data" } ] }, + { - "name" : "Neutrophil", + "name" : "Classical monocyte", + "markers" : [ + { + "genes" : ["Ly6c2+", "F13a1+", "Ccr2+", "Ms4a4c+", "Gm9733+", "Mcub+", "S100a4+"], + "weight" : 1.0, + "comment" : "Classical monocyte markers (except S100a4) inferred from Kaptein et al. Cell 2022 and Hurskainen et al. Nat. Commun. 2021; Ly6c2, F13a1, Ccr2 and Ms4a4c (in Fig. 1b) are Group III markers from Casanova-Acebes et al. Nature 2021. S100a4 is less specific to classical monocyte." + } + ] + }, + + { + "name" : "Patrolling monocyte", "markers" : [ { - "genes" : ["Mmp9+", "S100a8+", "S100a9+", "Il1b+", "Retnlg+", "Lcn2+"], - "weight" : 1.0 + "genes" : ["Eno3+", "Cd300e+", "Ace+", "Treml4+", "Spn+", "Adgre4+", "Lair1+", "Fcgr4+", "Ear2+", "Cd300ld+"], + "weight" : 1.0, + "comment" : "Patrolling monocyte markers inferred from Kaptein et al. Cell 2022 and Hurskainen et al. Nat. Commun. 2021; First 6 markers are Group IV markers in Fig. 1b of Casanova-Acebes et al. 
Nature paper; Eno3, Cd300e, Ace and Lair1 are very specific; Related papers: Domingo-Gonzalez et al. Elife 2020, Thomas et al. Arterioscler Thromb Vasc Biol. 2015, and Schyns et al. Nat. Commun. 2019." } ] }, + { - "name" : "NK cell", + "name" : "Macrophage", "markers" : [ { - "genes" : ["Nkg7+"], - "weight" : 0.55 + "genes" : ["Cd14+", "Ms4a7+", "Cx3cr1+", "Trem2+", "Hpgds+"], + "weight" : 1.0, + "comment" : "Macrophage markers from Kaptein et al. Cell 2022" + } + ] + }, + + { + "name" : "Conventional type 1 dendritic cell", + "markers" : [ + { + "genes" : ["Xcr1+", "Ifi205+", "Rab7b+", "Tlr3+", "Sept3+", "Hepacam2+"], + "weight" : 0.7, + "comment" : "cDC1 markers shared between Kaptein et al. Cell 2022 and Hurskainen et al. Nat. Commun. 2021" }, { - "genes" : ["Cd3d-", "Cd3e-"], - "weight" : 0.45 + "genes" : ["Gcsam+", "Snx22+", "Itgae+", "Xlr+"], + "weight" : 0.3, + "comment" : "cDC1 markers expressed highly in one of Kaptein et al. Cell 2022 and Hurskainen et al. Nat. Commun. 2021, but not both" + } + ] + }, + + { + "name" : "Conventional type 2 dendritic cell", + "markers" : [ + { + "genes" : ["Cd209a+","Ltb4r1+", "Mgl2+", "Tnip3+", "Bex6+"], + "weight" : 1.0, + "comment" : "cDC2 markers from Kaptein et al. Cell 2022" + } + ] + }, + + { + "name" : "Migratory dendritic cell", + "markers" : [ + { + "genes" : ["Cacnb3+", "Nudt17+", "Ccl22+", "Apol7c+", "Slco5a1+", "Ccr7+", "Fscn1+", "Il4i1+", "Mreg+", "Bcl2l14+"], + "weight" : 1.0, + "comment" : "Migratory DC markers shared between Kaptein et al. Cell 2022 and Hurskainen et al. Nat. Commun. 2021" + } + ] + }, + + { + "name" : "Plasmacytoid dendritic cell", + "markers" : [ + { + "genes" : ["Siglech+", "Ccr9+", "Cox6a2+", "Cd300c+", "Klk1+"], + "weight" : 1.0, + "comment" : "pDC markers from Kaptein et al. 
Cell 2022" + } + ] + }, + + { + "name" : "Neutrophil", + "markers" : [ + { + "genes" : ["S100a8+", "S100a9+", "Retnlg+", "Mmp9+", "Csf3r+", "Wfdc21+", "Il1r2+", "Cxcr2+"], + "weight" : 1.0, + "comment" : "Neutrophil markers inferred from Kaptein et al. Cell 2022 and Hurskainen et al. Nat. Commun. 2021; related paper: Grieshaber-Bouyer et al. Nat. Commun. 2021" + } + ] + }, + + { + "name" : "Immature neutrophil", + "markers" : [ + { + "genes" : ["Ngp+", "Camp+", "Ltf+", "Ly6g+", "Cebpe+"], + "weight" : 1.0, + "comment" : "Immature Neutrophil markers inferred from Hurskainen et al. Nat. Commun. 2021 and checked using Evrard et al. Immunity 2018 Fig. 5" + } + ] + }, + + { + "name" : "Basophil", + "markers" : [ + { + "genes" : ["Cd200r3+", "Aqp9+", "Il6+", "Hgf+", "Adora2b+", "Il4+", "L1cam+", "Grm6+"], + "weight" : 1.0, + "comment" : "Basophil markers inferred from Matsumara et al. Nat. Commun. 2022 and confirmed using data from Kaptein et al. Cell 2022 and Hurskainen et al. Nat. Commun. 2021" + } + ] + }, + + { + "name" : "Eosinophil", + "markers" : [ + { + "genes" : ["Epx+", "Prg3+", "Eml5+", "Il5ra+", "Qsox2+", "L2hgdh+"], + "weight" : 1.0, + "comment" : "Eosinophil markers inferred from Matsumara et al. Nat. Commun. 2022" + } + ] + }, + + { + "name" : "Mast cell", + "markers" : [ + { + "genes" : ["Tph1+", "Clnk+", "Hs6st2+", "Plcg1+"], + "weight" : 1.0, + "comment" : "Mast cell markers inferred from Matsumara et al. Nat. Commun. 
2022" + } + ] + }, + + { + "name" : "Red blood cell", + "markers" : [ + { + "genes" : ["Hba-a1+", "Hba-a2+", "Hbb-bs+", "Hbb-bt+"], + "weight" : 1.0, + "comment" : "Hemoglobin genes" } ] } diff --git a/pegasus/annotate_cluster/mouse_liver_cell_markers.json b/pegasus/annotate_cluster/mouse_liver_cell_markers.json new file mode 100644 index 00000000..f40427b3 --- /dev/null +++ b/pegasus/annotate_cluster/mouse_liver_cell_markers.json @@ -0,0 +1,246 @@ +{ + "title" : "Mouse liver cell type markers", + "comment": "Markers are collected from Kaptein et al. Cell 2022", + "cell_types" : [ + { + "name" : "Hepatocyte", + "markers" : [ + { + "genes" : ["Acaa1b+", "Arg1+", "Sult2a8+", "Hgd+", "Otc+"], + "weight" : 1.0, + "comment" : "Hepatocyte markers from Kaptein et al. Cell 2022" + } + ] + }, + + { + "name" : "Cholangiocyte", + "markers" : [ + { + "genes" : ["Spp1+", "Ddit4l+", "Sox9+", "Fgfr3+", "Plet1+"], + "weight" : 1.0, + "comment" : "Cholangiocyte markers from Kaptein et al. Cell 2022" + } + ] + }, + + { + "name" : "HsPC", + "markers" : [ + { + "genes" : ["Chrm3+", "Dmbt1+", "Slc4a4+", "Parm1+", "Pcdh11x+"], + "weight" : 1.0, + "comment" : "Hepatic stem and progenitor cell markers from Kaptein et al. Cell 2022" + } + ] + }, + + + { + "name" : "ILC1", + "markers" : [ + { + "genes" : ["Xcl1+", "Cd160+", "Klrc1+", "Cd200r2+", "Gzmc+"], + "weight" : 1.0, + "comment" : "Innate lymphoid cell type 1 markers from Kaptein et al. Cell 2022" + } + ] + }, + + + { + "name" : "Kupffer cell", + "markers" : [ + { + "genes" : ["Cd5l+", "Clec4f+", "Vsig4+", "Folr2+", "Timd4+"], + "weight" : 1.0, + "comment" : "Kupffer cell markers from Kaptein et al. Cell 2022" + } + ] + }, + + { + "name" : "Peritoneal macrophage", + "markers" : [ + { + "genes" : ["Lyz1+", "Saa3+", "Prg4+", "Retnla+", "Cbr2+"], + "weight" : 1.0, + "comment" : "Peritoneal macrophage markers from Kaptein et al. 
Cell 2022; Note that Lyve1 is also a good marker but it is also expressed in endothelial cells" + } + ] + }, + + { + "name" : "Macrophage", + "markers" : [ + { + "genes" : ["Cd14+", "Ms4a7+", "Cx3cr1+", "Trem2+", "Hpgds+"], + "weight" : 1.0, + "comment" : "Macrophage markers from Kaptein et al. Cell 2022" + } + ], + "subtypes" : { + "title" : "Macrophage subtype markers", + "cell_types" : [ + { + "name" : "Cd207+ macrophage", + "markers" : [ + { + "genes" : ["Cd207+", "Tmem119+", "Olfml3+", "Mmp13+"], + "weight" : 1.0, + "comments" : "Cd207+ macrophage markers from Kaptein et al. Cell 2022" + } + ] + }, + { + "name" : "Bile-duct lipid-associated macrophage", + "markers" : [ + { + "genes" : ["Gpnmb+", "Spp1+", "Syngr1+", "Cd93+"], + "weight" : 1.0, + "comments" : "Bile-duct LAM markers from Kaptein et al. Cell 2022" + } + ] + } + ] + } + }, + + + { + "name" : "Endothelial cell", + "markers" : [ + { + "genes" : ["Mmrn2+", "Cldn5+", "Adgrl4+", "Tek+", "Myct1+"], + "weight" : 1.0, + "comment" : "Endothelial cell markers from Kaptein et al. Cell 2022" + } + ], + "subtypes" : { + "title" : "Endothelial cell subtype markers", + "cell_types" : [ + { + "name" : "Liver sinusoidal endothelial cell", + "markers" : [ + { + "genes" : ["Lyve1+", "Clec1b+", "Chst2+", "Wisp1+"], + "weight" : 1.0, + "comment" : "LSEC markers from Kaptein et al. Cell 2022" + } + ] + }, + { + "name" : "Central vein endothelial cell", + "markers" : [ + { + "genes" : ["Rspo3+", "Lhx6+", "Wnt9b+", "Plppr5+"], + "weight" : 1.0, + "comment" : "CV EC markers from Kaptein et al. Cell 2022" + } + ] + }, + { + "name" : "Portal Vein endothelial cell", + "markers" : [ + { + "genes" : ["Adgrg6+", "Nrg1+", "Gja5+","Cmklr1+"], + "weight" : 1.0, + "comment" : "PV EC markers from Kaptein et al. Cell 2022" + } + ] + }, + { + "name" : "Lymphatic Endothelial cell", + "markers" : [ + { + "genes" : ["Mmrn1+", "Pard6g+", "Nts+", "Ccl21a+"], + "weight" : 1.0, + "comments" : "LEC markers from Kaptein et al. 
Cell 2022" + } + ] + } + ] + } + }, + + + { + "name" : "Stellate cell", + "markers" : [ + { + "genes" : ["Colec10+", "Rspo3+", "Mapt+", "Lama1+", "Bmp10+"], + "weight" : 1.0, + "comment" : "Stellate cell markers from Kaptein et al. Cell 2022" + } + ] + }, + + { + "name" : "Vascular smooth muscle cell", + "markers" : [ + { + "genes" : ["Cacna1c+", "Myh11+", "Notch3+", "Lmod1+", "Tagln+"], + "weight" : 1.0, + "comment" : "VSMC markers from Kaptein et al. Cell 2022" + } + ] + }, + + { + "name" : "Mesothelial cell", + "markers" : [ + { + "genes" : ["Ephb1+", "Cadm2+", "Prss12+", "Myl7+", "Prph+"], + "weight" : 1.0, + "comment" : "Mesothelial cell markers from Kaptein et al. Cell 2022" + } + ] + }, + + { + "name" : "Fibroblast", + "markers" : [ + { + "genes" : ["Col1a1+", "Mrc2+", "Plcxd3+", "Fndc1+", "Cpxm1+"], + "weight" : 1.0, + "comment" : "Fibroblast markers from Kaptein et al. Cell 2022" + } + ], + "subtypes" : { + "title" : "Fibro subtype markers", + "cell_types" : [ + { + "name" : "Capsule fibroblast", + "markers" : [ + { + "genes" : ["Osr1+", "Cldn10+", "Lgals7+", "Spock3+"], + "weight" : 1.0, + "comment" : "Capsule fibroblast markers from Kaptein et al. Cell 2022" + } + ] + }, + { + "name" : "Central vein fibroblast", + "markers" : [ + { + "genes" : ["Dpt+", "Pcolce2+", "Ntrk2+", "Pi16+"], + "weight" : 1.0, + "comment" : "Central vein fibroblast markers from Kaptein et al. Cell 2022" + } + ] + }, + { + "name" : "Bile-duct fibroblast", + "markers" : [ + { + "genes" : ["Itgbl1+", "Plcxd3+", "Nkain3+", "Clic5+"], + "weight" : 1.0, + "comment" : "Bile-duct fibroblast markers from Kaptein et al. 
Cell 2022" + } + ] + } + ] + } + } + ] +} diff --git a/pegasus/annotate_cluster/mouse_lung_cell_markers.json b/pegasus/annotate_cluster/mouse_lung_cell_markers.json new file mode 100644 index 00000000..543c3cff --- /dev/null +++ b/pegasus/annotate_cluster/mouse_lung_cell_markers.json @@ -0,0 +1,313 @@ +{ + "title" : "Mouse lung cell type markers", + "cell_types" : [ + { + "name" : "Alveolar type I cell", + "markers" : [ + { + "genes" : ["Akap5+", "Rtkn2+", "Ndnf+", "Col4a3+", "Spock2+"], + "weight" : 1.0, + "comment" : "AT1 markers from Hurskainen et al. Nat. Commun. 2021 and Negretti et al. Development 2021" + } + ] + }, + + { + "name" : "Alveolar type II cell", + "markers" : [ + { + "genes" : ["Sftpc+", "Sftpa1+", "Lamp3+", "Hc+", "Slc34a2+"], + "weight" : 1.0, + "comment" : "AT2 markers from Hurskainen et al. Nat. Commun. 2021 and Negretti et al. Development 2021" + } + ] + }, + + { + "name" : "Ciliated cell", + "markers" : [ + { + "genes" : ["Dynlrb2+", "Tmem212+", "Foxj1+", "Ccdc153+", "Nme5+"], + "weight" : 1.0, + "comment" : "Ciliated cell markers from Hurskainen et al. Nat. Commun. 2021 and Negretti et al. Development 2021" + } + ] + }, + + { + "name" : "Club cell", + "markers" : [ + { + "genes" : ["Scgb1a1+", "Scgb3a2+", "Cckar+", "Gabrp+", "Slc16a11+"], + "weight" : 1.0, + "comment" : "Club cell markers from Hurskainen et al. Nat. Commun. 2021 and Negretti et al. Development 2021" + } + ] + }, + + { + "name" : "Basal cell", + "markers" : [ + { + "genes" : ["Aqp3+", "Krt5+", "Dapl1+", "Hspa1a+", "Trp63+"], + "weight" : 1.0, + "comment" : "Basal cell markers from Montoro et al. Nature 2018 Extended Data Fig. 1d" + } + ] + }, + + { + "name" : "Goblet cell", + "markers" : [ + { + "genes" : ["Scgb3a1+", "Muc5b+", "Serpinb11+", "Gp2+", "Dmbt1+"], + "weight" : 1.0, + "comment" : "Goblet cell markers from Montoro et al. 
Nature 2018 Supp Table 1" + } + ] + }, + + { + "name" : "Tuft cell", + "markers" : [ + { + "genes" : ["Pou2f3+", "Ascl2+", "Dclk1+", "Lrmp+", "Ltc4s+", "Trpm5+", "Gnb3+", "Rgs13+"], + "weight" : 1.0, + "comment" : "Tuft cell markers from Sun et al. Dev. Cell 2022 and Montoro et al. Nature 2018 Extended Data Fig. 3b; first 3 markers are mainly suggested by Sun et al. the CellCards." + } + ] + }, + + { + "name" : "Pulmonary neuroendocrine cell", + "markers" : [ + { + "genes" : ["Ascl1+", "Chga+", "Calca+", "Scg2+", "Scg5+"], + "weight" : 1.0, + "comment" : "Pulmonary neuroendocrine cell markers from Montoro et al. Nature 2018 Extended Data Fig. 3b & 3c" + } + ] + }, + + { + "name" : "Ionocyte", + "markers" : [ + { + "genes" : ["Foxi1+", "Ascl3+", "Smbd1+", "Moxd1+", "Atp6v0d2+"], + "weight" : 1.0, + "comment" : "Ionocyte markers from Montoro et al. Nature 2018 Fig. 5a" + } + ] + }, + + + + { + "name" : "Endothelial cell", + "markers" : [ + { + "genes" : ["Egfl7+", "Cldn5+", "Cdh5+", "Pecam1+", "Calcrl+", "Ecscr+", "Icam2+"], + "weight" : 1.0, + "comment" : "Endothelial cell markers inferred from Hurskainen et al. Nat. Commun. 2021 data" + } + ], + "subtypes" : { + "title" : "Endothelial cell subtype markers (Main and Capillary, see https://lungmap.net/cell-cards/)", + "cell_types" : [ + { + "name" : "Aerocyte", + "markers" : [ + { + "genes" : ["Emp2+", "Car4+", "Tbx2+", "Apln+"], + "weight" : 1.0, + "comment" : "Markers inferred from Hurskainen et al. Nat. Commun. 2021 data and confirmed on Negretti et al. Development 2021 data" + } + ] + }, + { + "name" : "EC general capillary", + "markers" : [ + { + "genes" : ["Gpihbp1+", "Kit+", "Nckap5+", "Aplnr+"], + "weight" : 1.0, + "comment" : "Markers inferred from Hurskainen et al. Nat. Commun. 2021 data and confirmed on Negretti et al. 
Development 2021 data" + } + ] + }, + { + "name" : "EC lymphatic", + "markers" : [ + { + "genes" : ["Mmrn1+", "Ccl21a+", "Prox1+", "Nts+"], + "weight" : 1.0, + "comment" : "Markers inferred from Hurskainen et al. Nat. Commun. 2021 data and confirmed on Negretti et al. Development 2021 data" + } + ] + }, + { + "name" : "EC venous", + "markers" : [ + { + "genes" : ["Slc6a2+", "Vegfc+", "Ackr3+", "Fabp4+"], + "weight" : 1.0, + "comments" : "Markers inferred from Hurskainen et al. Nat. Commun. 2021 data and confirmed on Negretti et al. Development 2021 data" + } + ] + }, + { + "name" : "EC arterial", + "markers" : [ + { + "genes" : ["Gja5+", "Cxcl12+", "Pcsk5+", "Thsd7a+"], + "weight" : 1.0, + "comments" : "Markers inferred from Hurskainen et al. Nat. Commun. 2021 data and confirmed on Negretti et al. Development 2021 data" + } + ] + } + ] + } + }, + + + + { + "name" : "Mesothelial cell", + "markers" : [ + { + "genes" : ["Wt1+", "Upk3b+", "Rspo1+", "C2+", "Sbsn+", "Aldh1a2+", "Lrrn4+", "Cldn15+"], + "weight" : 1.0, + "comment" : "Markers inferred from Hurskainen et al. Nat. Commun. 2021 data and confirmed on Negretti et al. Development 2021 data" + } + ] + }, + + { + "name" : "Pericyte", + "markers" : [ + { + "genes" : ["Notch3+", "Heyl+", "Parm1+", "Ndufa4l2+", "Cox4i2+"], + "weight" : 1.0, + "comment" : "Markers inferred from Hurskainen et al. Nat. Commun. 2021 data and confirmed on Negretti et al. Development 2021 data" + } + ], + "subtypes" : { + "title" : "Pericyte subtype markers", + "cell_types" : [ + { + "name" : "Pericyte 1", + "markers" : [ + { + "genes" : ["Gpc6+", "Cxcl12+", "Wisp2+", "Map3k7cl+"], + "weight" : 1.0, + "comment" : "Markers inferred from Hurskainen et al. Nat. Commun. 2021 data and confirmed on Negretti et al. Development 2021 data" + } + ] + }, + { + "name" : "Pericyte 2", + "markers" : [ + { + "genes" : ["Higd1b+", "Pcdh18+", "Trpc6+", "Fam162b+", "Clstn2+"], + "weight" : 1.0, + "comment" : "Markers inferred from Hurskainen et al. Nat. 
Commun. 2021 data and confirmed on Negretti et al. Development 2021 data" + } + ] + } + ] + } + }, + + { + "name" : "Fibroblast", + "markers" : [ + { + "genes" : ["Dpt+", "Clec3b+", "Pcolce2+", "Vegfd+", "Vcam1+"], + "weight" : 1.0, + "comment" : "Markers inferred from Hurskainen et al. Nat. Commun. 2021 data and confirmed on Negretti et al. Development 2021 data" + } + ], + "subtypes" : { + "title" : "Fibro subtype markers", + "cell_types" : [ + { + "name" : "Adventitial fibroblast", + "markers" : [ + { + "genes" : ["Mfap5+", "Serpinf1+", "Abca8a+", "Twist2+"], + "weight" : 1.0, + "comment" : "Markers from Schupp et al. and Travaglini et al." + } + ] + }, + { + "name" : "Alveolar fibroblast", + "markers" : [ + { + "genes" : ["Slit2+", "Col13a1+", "Wnt2+", "Slc38a5+", "Slc27a6+", "Frem1+"], + "weight" : 1.0, + "comment" : "Markers inferred from Hurskainen et al. Nat. Commun. 2021 data and confirmed on Negretti et al. Development 2021 data" + } + ] + } + ] + } + }, + + { + "name" : "Myofibroblast", + "markers" : [ + { + "genes" : ["Egfem1+", "Agt+", "Prag1+", "Etv1+", "Trim67+"], + "weight" : 1.0, + "comment" : "Markers from Schupp et al. and Travaglini et al." + } + ] + }, + + { + "name" : "Smooth muscle cell", + "markers" : [ + { + "genes" : ["Tnnt2+", "Sgcg+", "Sntg2+", "Nrtn+", "Mrvi1+", "Sbspon+"], + "weight" : 1.0, + "comment" : "Markers inferred from Hurskainen et al. Nat. Commun. 2021 data" + } + ] + }, + + + + { + "name" : "ILC2", + "markers" : [ + { + "genes" : ["Gata3+", "Il1rl1+", "Arg1+", "Areg+", "Il2ra+", "Csf2+", "Ccl1+", "Ccdc184+", "Calca+", "Il5+"], + "weight" : 1.0, + "comment" : "Innate lymphoid cell type 2 markers inferred from Hurskainen et al. Nat. Commun. 
2021 data" + } + ] + }, + + { + "name" : "Alveolar macrophage", + "markers" : [ + { + "genes" : ["Ear1+", "Marco+", "Atp6v0d2+", "Olr1+", "F7+", "Tfec+", "Gpnmb+", "Lrp12+", "Pparg+", "Car4+", "Krt19+", "Plet1+"], + "weight" : 1.0, + "comment" : "First 8 markers are Alveolar macrophage markers inferred from Hurskainen et al. Nat. Commun. 2021 data; Ear1 and Marco also show in Casanova-Acebes et al. Nature 2021; Last 4 are markers from Casanova-Acebes et al. Nature 2021 that are validated using Hurskainen et al. Nat. Commun. 2021 data" + } + ] + }, + + { + "name" : "Interstitial macrophage", + "markers" : [ + { + "genes" : ["C1qa+", "C1qb+", "C1qc+", "Pf4+", "Ms4a7+", "Fcrls+"], + "weight" : 1.0, + "comment" : "Interstitial macrophage markers inferred from Hurskainen et al. Nat. Commun. 2021 data" + } + ] + } + ] +} diff --git a/pegasus/commands/Clustering.py b/pegasus/commands/Clustering.py index 9d8611b2..7b6d7e90 100644 --- a/pegasus/commands/Clustering.py +++ b/pegasus/commands/Clustering.py @@ -68,6 +68,7 @@ class Clustering(Base): --nmf-n Number of NMF components. IF iNMF is used for batch correction, this parameter also sets iNMF number of components. [default: 20] --knn-K Number of nearest neighbors for building kNN graph. [default: 100] + --exact-K If use exactly the K passed to the function; otherwise K is determined as min(K, sqrt(X.shape[0])). --knn-full-speed For the sake of reproducibility, we only run one thread for building kNN indices. Turn on this option will allow multiple threads to be used for index building. However, it will also reduce reproducibility due to the racing between multiple threads. --kBET Calculate kBET. 
@@ -210,6 +211,7 @@ def execute(self): "nmf": self.args["--nmf"], "nmf_n": int(self.args["--nmf-n"]), "K": int(self.args["--knn-K"]), + "exact_K": self.args["--exact-K"], "full_speed": self.args["--knn-full-speed"], "kBET": self.args["--kBET"], "kBET_batch": self.args["--kBET-batch"], diff --git a/pegasus/data_files/emt_human.gmt b/pegasus/data_files/emt_human.gmt new file mode 100644 index 00000000..dfec37a7 --- /dev/null +++ b/pegasus/data_files/emt_human.gmt @@ -0,0 +1,2 @@ +Epithelial-like Signatures from Gibbons and Creighton Dev. Dyn. 2018 CDH1 DSP OCLN +Mesenchymal-like Signatures from Gibbons and Creighton Dev. Dyn. 2018 VIM CDH2 FOXC2 SNAI1 SNAI2 TWIST1 FN1 ITGB6 MMP2 MMP3 MMP9 SOX10 GCS diff --git a/pegasus/data_files/human_lung.gmt b/pegasus/data_files/human_lung.gmt new file mode 100644 index 00000000..6f488d48 --- /dev/null +++ b/pegasus/data_files/human_lung.gmt @@ -0,0 +1,23 @@ +Epithelial Epithelial markers from HTAPP paper KRT8 KRT18 EPCAM CD24 +VEC Vascular endothelial cell markers from Travaglini et al. Nature 2020 and Schupp et al. Circulation 2021 PECAM1 CLDN5 CDH5 ERG ICAM2 CLEC14A ITM2A ADGRL4 SLCO2A1 IFI27 +LEC Lymphatic endothelial cell markers from Travaglini et al. Nature 2020 and Schupp et al. Circulation 2021 PECAM1 CLDN5 ERG CDH5 CCL21 TFF3 PDPN PROX1 LYVE1 FLT4 GPM6A SEMA3D TBX1 RELN +Fibroblast Fibroblast/Myofibroblast shared markers from Travaglini et al. COL1A1 COL1A2 PDGFRA ELN BGN +Macrophage Macro CD68 CD163 C1QA MRC1 MS4A6A MSR1 MERTK +SMC SMC from Muus et al., Braga et al. and Schupp et al. MYH11 TAGLN ACTG2 CNN1 PLN +Pericyte Pericyte from Schupp et al. and Travaglini et al.
TRPC6 CSPG4 FAM162B GJA4 GJC1 HIGD1B CDH6 LAMC3 FHL5 +T cell T cell markers CD3D CD3E CD3G TRAC +B cell B cell markers CD19 MS4A1 CD79A CD79B +Plasma cell Plasma cell markers from ICA TNFRSF17 PRDM1 SLAMF7 IRF4 SDC1 IGHA1 IGHG1 TNFRSF13B CD38 ABCB9 CHPF PLAAT2 +Mast cell Mast cell markers KIT CPA3 TPSB2 TPSAB1 AREG RGS1 RGS2 +ProNeu Pro-Neutrophil markers validated using 10x public whole blood dataset DEFA3 DEFA4 AZU1 MS4A3 ELANE SLPI CEACAM6 RNASE3 PRTN3 MPO AC104232.1 CTSG +PreNeu Pre-Neutrophil markers validated using 10x public whole blood dataset LTF LCN2 MMP8 CRISP3 CAMP PGLYRP1 CD177 HP +Neutrophil Neutrophil markers CSF3R G0S2 LUCAT1 EPHB1 TNFRSF10C IL1R2 KCNJ15 FCGR3B AC007032.1 HSD11B1-AS1 +AT1 AT1 markers inferred from Travaglini et al. Nature 2020 and confirmed by Schupp et al. Circulation 2021 AGER SPOCK2 RTKN2 TNNC1 SCEL CLIC5 NCKAP5 ARHGEF26 GGTLC1 ITLN2 MS4A15 +AT2 AT2 markers inferred from Travaglini et al. Nature 2020 and confirmed by Schupp et al. Circulation 2021 SFTPA1 SFTPA2 SFTPC PGC LAMP3 FASN HHIP ETV5 RASGRF1 ABCA3 +Basal Basal cell markers inferred from Travaglini et al. Nature 2020 and confirmed by Schupp et al. Circulation 2021 KRT17 S100A2 MIR205HG KRT15 KRT5 DLK2 CDH3 TP63 TNS4 +Club Club cell markers inferred from Travaglini et al. Nature 2020 and confirmed by Schupp et al. Circulation 2021 SCGB3A2 MGP CTSE +Ciliated Ciliated cell markers inferred from Travaglini et al. Nature 2020 and confirmed by Schupp et al. Circulation 2021 ERICH3 ARMC3 DNAI2 ZBBX VWA3B RGS22 TTC29 CDHR4 PPP1R42 CFAP46 CFAP52 CFAP73 CFAP77 CFAP157 DNAH3 DNAH9 ADGB SNTN CCDC170 C6orf118 +Goblet Goblet cell markers inferred from Travaglini et al. Nature 2020 and confirmed by Schupp et al. Circulation 2021 MUC5AC MUC5B BPIFB1 MSMB SERPINB11 CYP2F1 +Ionocyte Ionocyte markers inferred from Travaglini et al. Nature 2020 and confirmed by Schupp et al. 
Circulation 2021 ASCL3 CLCNKB FOXI1 ATP6V1G3 TMPRSS11E BSND LINC01187 CLDN25 +PNEC Pulmonary neuroendocrine cell markers inferred from Travaglini et al. Nature 2020 and confirmed by Schupp et al. Circulation 2021 CHGA CHGB SCGN SCG5 CPLX2 GRP ASCL1 INSM1 +SMG SMG serous cell markers inferred from Travaglini et al. Nature 2020 PRR4 TCN1 C6orf58 PRB3 LPO PRB1 PRH2 PRH1 ODAM diff --git a/pegasus/data_files/human_t_cell_markers.gmt b/pegasus/data_files/human_t_cell_markers.gmt new file mode 100644 index 00000000..aac1cda8 --- /dev/null +++ b/pegasus/data_files/human_t_cell_markers.gmt @@ -0,0 +1,11 @@ +CD4_Naive CD4 Naive T CD4 CCR7 SELL LEF1 FHIT ACTN1 LDLRAP1 TMIGD2 TRABD2A LRRN3 +CD4_TCM CD4 TCM CD4 GPR183 CD69 PASK LIMS1 LPAR6 SLC2A3 SOCS3 +CD4_TEM CD4 TEM CD4 KLRB1 ANXA2 LGALS1 TIMP1 PTGER2 AHNAK TNFRSF4 YWHAH CD63 +Treg Treg RTKN2 FOXP3 IL2RA HACD1 AC133644.2 FANK1 DUSP4 STAM CCR10 CTLA4 +CD4_CTL CD4 Cytotoxic Lymphocyte CD4 GNLY AGAP1 ZNF683 RGS9 IL5RA LAIR2 MTERF2 SH3RF2 RGS17 +Tfh T follicular helper CD4 ST8SIA1 PDCD1 TIGIT TOX2 ICOS SH2D1A IL21 +CD8_Naive CD8 Naive T CD8A CD8B CCR7 SELL LEF1 ACTN1 TRABD2A LRRN3 LINC02446 S100B CLEC11A NELL2 PASK APBA2 +CD8_TCM CD8 TCM CD8A CD8B GZMK DUSP2 RGS1 CXCR3 CMC1 TIGIT CST7 NKG7 +CD8_TEM CD8 TEM CD8A CD8B FGFBP2 GZMB FCGR3A SPON2 ADGRG1 CX3CR1 ASCL2 PRSS23 +MAIT MAIT SLC4A10 KLRB1 NCR3 CEBPD GPR65 LST1 CXCR6 TRAV1-2 +gdT gdT TRDC TRGC1 TRGC2 KLRC1 KLRD1 GNLY diff --git a/pegasus/data_files/mouse_brain.gmt b/pegasus/data_files/mouse_brain.gmt new file mode 100644 index 00000000..fa32ef25 --- /dev/null +++ b/pegasus/data_files/mouse_brain.gmt @@ -0,0 +1,11 @@ +GlutamatergicNeuron Glutamatergic neuron Slc17a7 Slc17a6 Neurod6 Neurod2 +GABAergicNeuron GABAergic neuron Gad1 Gad2 Slc32a1 +Oligodendrocyte Oligodendrocyte Plp1 Cnp Fa2h St18 Mbp +OPC Oligodendrocyte progenitor cell Pdgfra Cspg4 Emid1 Fabp7 +SMC Smooth muscle cell Atca2 Myh11 Tagln Pln Mylk +Pericyte Pericyte Vtn Atp13a5 Abcc9 Kcnj8 Art3 +Endo Endothelial cell
Flt1 Pecam1 Ly6a Slco1a4 Mecom Ptprb Id1 +Microglia Microglia cell Hexb Siglech Selplg Tmem119 Ctss P2ry12 Cx3cr1 Trem2 Fcrls Csf1r +Astrocyte Astrocyte Mt2 Gja1 Prdx6 Htra1 Ntsr2 Aldoc Apoe Prex2 Aqp4 Gpr37l1 +PVM Perivascular macrophages Mrc1 Stab1 Lyz2 Ms4a6c F13a1 Pf4 +VLMC Vascular leptomeningeal cells Slc7a11 Slc6a13 Bmp6 Igfbp2 Fmod Ranbp3l diff --git a/pegasus/data_files/mouse_liver.gmt b/pegasus/data_files/mouse_liver.gmt new file mode 100644 index 00000000..d9c8bb4b --- /dev/null +++ b/pegasus/data_files/mouse_liver.gmt @@ -0,0 +1,28 @@ +Endo Endothelial cell Mmrn2 Cldn5 Adgrl4 Tek Myct1 +Stellate Stellate cell Colec10 Rspo3 Mapt Lama1 Bmp10 +VSMC Vascular smooth muscle cell Cacna1c Myh11 Notch3 Lmod1 Tagln +Meso Mesothelial cell Ephb1 Cadm2 Prss12 Myl7 Prph +Fibro Fibroblast Col1a1 Mrc2 Plcxd3 Fndc1 Cpxm1 +Hepatocyte Hepatocyte Acaa1b Arg1 Sult2a8 Hgd Otc +Cholangiocyte Cholangiocyte Spp1 Ddit4l Sox9 Fgfr3 Plet1 +HSPC Hepatic stem and progenitor cell Chrm3 Dmbt1 Slc4a4 Parm1 Pcdh11x +T T cell Cd3d Cd3e Lat Thy1 Lef1 Trac Cd28 +B B cell Cd19 Ms4a1 Cd79a Cd79b Ebf1 Pax5 Fcmr Bank1 +NK NK cell Eomes Cma1 Klra4 Klra7 Klra8 +ILC1 Innate lymphoid cell type 1 Xcl1 Cd160 Klrc1 Cd200r2 Gzmc +cDC1 cDC1 Xcr1 Ifi205 Rab7b Tlr3 Sept3 Hepacam2 Gcsam Snx22 Itgae Xlr +cDC2 cDC2 Cd209a Ltb4r1 Mgl2 Tnip3 Bex6 +migDC Migratory DC Cacnb3 Nudt17 Ccl22 Apol7c Slco5a1 Ccr7 Fscn1 Il4i1 Mreg Bcl2l14 +pDC Plasmacytoid dendritic cell Siglech Ccr9 Cox6a2 Cd300c Klk1 +MonoI Inflammatory monocyte Ly6c2 F13a1 Ms4a4c Ccr2 Gm9733 Mcub +MonoP Patrolling monocyte Ace Eno3 Ear2 Treml4 Spn Fcgr4 Lair1 Cd300e Cd300ld Adgre4 +PeriMac Peritoneal macrophage Lyz1 Saa3 Prg4 Retnla Cbr2 +Mac Macrophage Cd14 Ms4a7 Cx3cr1 Trem2 Hpgds +Kupffer Kupffer cell Cd5l Clec4f Vsig4 Folr2 Timd4 +Neutrophil Neutrophil S100a8 S100a9 Retnlg Mmp9 Csf3r Wfdc21 Il1r2 Cxcr2 +Basophil Basophil Cd200r3 Aqp9 Il6 Hgf Adora2b Il4 L1cam Grm6 +Eosinophil Eosinophil Epx Prg3 Eml5 Il5ra Qsox2 L2hgdh +Mast Mast cell Tph1 Clnk
Hs6st2 Plcg1 +Pericentral Pericentral liver zonation markers from Halpern et al. Nature 2017 and Guilliams et al. Cell 2022 Mup11 Oat Rgn Glul Cyp2e1 Axin2 Cyp1a2 Gstm3 Psmd4 +Periportal Periportal liver zonation markers from Halpern et al. Nature 2017 and Guilliams et al. Cell 2022 Cyp2f2 Hal Sds Ass1 Asl Alb Arg1 Pck1 C2 Sdhd +Midlobular Mid-lobular liver zonation markers picked from Fig. 3 and Extended Data Fig 10a of Halpern et al. Nature 2017 Hamp Igfbp2 Cyp8b1 Mup3 Hamp2 Hsbp8 Ces1d Cebpa Fkbp8 Clpp diff --git a/pegasus/data_files/mouse_lung.gmt b/pegasus/data_files/mouse_lung.gmt new file mode 100644 index 00000000..0ed0bc5b --- /dev/null +++ b/pegasus/data_files/mouse_lung.gmt @@ -0,0 +1,31 @@ +AT1 AT1 markers from Hurskainen et al. Nat. Commun. 2021 and Negretti et al. Development 2021 Akap5 Rtkn2 Ndnf Col4a3 Spock2 +AT2 AT2 markers from Hurskainen et al. Nat. Commun. 2021 and Negretti et al. Development 2021 Sftpc Sftpa1 Lamp3 Hc Slc34a2 +Ciliated Ciliated cell markers from Hurskainen et al. Nat. Commun. 2021 and Negretti et al. Development 2021 Dynlrb2 Tmem212 Foxj1 Ccdc153 Nme5 +Club Club cell markers from Hurskainen et al. Nat. Commun. 2021 and Negretti et al. Development 2021 Scgb1a1 Scgb3a2 Cckar Gabrp Slc16a11 +Basal Basal cell markers from Montoro et al. Nature 2018 Extended Data Fig. 1d Aqp3 Krt5 Dapl1 Hspa1a Trp63 +Goblet Goblet cell markers from Montoro et al. Nature 2018 Supp Table 1 Scgb3a1 Muc5b Serpinb11 Gp2 Dmbt1 +Tuft Tuft cell markers from Sun et al. Dev. Cell 2022 and Montoro et al. Nature 2018 Extended Data Fig. 3b Pou2f3 Ascl2 Dclk1 Lrmp Ltc4s Trpm5 Gnb3 Rgs13 +PNEC Pulmonary neuroendocrine cell markers from Montoro et al. Nature 2018 Extended Data Fig. 3b & 3c Ascl1 Chga Calca Scg2 Scg5 +Ionocyte Ionocyte markers from Montoro et al. Nature 2018 Fig. 5a Foxi1 Ascl3 Smbd1 Moxd1 Atp6v0d2 +Endothelial Endothelial cell markers inferred from Hurskainen et al. Nat. Commun.
2021 data Egfl7 Cldn5 Cdh5 Pecam1 Calcrl Ecscr Icam2 +Mesothelial Mesothelial cell markers inferred from Hurskainen et al. Nat. Commun. 2021 Wt1 Upk3b Rspo1 C2 Sbsn Aldh1a2 Lrrn4 Cldn15 +Pericyte Pericyte markers inferred from Hurskainen et al. Nat. Commun. 2021 data Notch3 Heyl Parm1 Ndufa4l2 Cox4i2 +Fibroblast Fibroblast markers inferred from Hurskainen et al. Nat. Commun. 2021 data Dpt Clec3b Pcolce2 Vegfd Vcam1 +Myofibroblast Myofibroblast markers inferred from Hurskainen et al. Nat. Commun. 2021 data Egfem1 Agt Prag1 Etv1 Trim67 +SMC Smooth muscle cell markers inferred from Hurskainen et al. Nat. Commun. 2021 data Tnnt2 Sgcg Sntg2 Nrtn Mrvi1 Sbspon +AlvMf Alveolar macrophage markers inferred from Hurskainen et al. Nat. Commun. 2021 Atp6v0d2 Olr1 F7 Ear1 Tfec Gpnmb Lrp12 Marco +IntMf Interstitial macrophage markers inferred from Hurskainen et al. Nat. Commun. 2021 C1qa C1qb C1qc Pf4 Ms4a7 Fcrls +ILC2 Innate lymphoid cell type 2 markers inferred from Hurskainen et al. Nat. Commun. 2021 data Gata3 Il1rl1 Arg1 Areg Il2ra Csf2 Ccl1 Ccdc184 Calca Il5 +T T cell markers inferred from Kaptein et al. Cell 2022 and Hurskainen et al. Nat. Commun. 2021 data Cd3d Cd3e Lat Thy1 Lef1 Trac Cd28 +B B cell markers inferred from Kaptein et al. Cell 2022 and Hurskainen et al. Nat. Commun. 2021 data Cd19 Ms4a1 Cd79a Cd79b Ebf1 Pax5 Fcmr Bank1 +NK NK cell markers inferred from Kaptein et al. Cell 2022 and Hurskainen et al. Nat. Commun. 2021 data Gzma Klrb1c Ncr1 Klre1 Klrc2 Eomes Cma1 Klra4 Klra7 Klra8 +cDC1 cDC1 markers inferred from Kaptein et al. Cell 2022 and Hurskainen et al. Nat. Commun. 2021 data Xcr1 Ifi205 Rab7b Tlr3 Sept3 Hepacam2 Gcsam Snx22 Itgae Xlr +cDC2 cDC2 markers from Kaptein et al. Cell 2022 Cd209a Ltb4r1 Mgl2 Tnip3 Bex6 +migDC Migratory DC markers inferred from Kaptein et al. Cell 2022 and Hurskainen et al. Nat. Commun. 2021 data Cacnb3 Nudt17 Ccl22 Apol7c Slco5a1 Ccr7 Fscn1 Il4i1 Mreg Bcl2l14 +pDC Plasmacytoid dendritic cell markers from Kaptein et al.
Cell 2022 Siglech Ccr9 Cox6a2 Cd300c Klk1 +MonoI Inflammatory monocyte markers inferred from Kaptein et al. Cell 2022 and Hurskainen et al. Nat. Commun. 2021 data Ly6c2 F13a1 Ms4a4c Ccr2 Gm9733 Mcub +MonoP Patrolling monocyte markers inferred from Kaptein et al. Cell 2022 and Hurskainen et al. Nat. Commun. 2021 data Ace Eno3 Ear2 Treml4 Spn Fcgr4 Lair1 Cd300e Cd300ld Adgre4 +Neutrophil Neutrophil markers inferred from Kaptein et al. Cell 2022 and Hurskainen et al. Nat. Commun. 2021 data S100a8 S100a9 Retnlg Mmp9 Csf3r Wfdc21 Il1r2 Cxcr2 +Basophil Basophil markers inferred from Matsumara et al. Nat. Commun. 2022 data Cd200r3 Aqp9 Il6 Hgf Adora2b Il4 L1cam Grm6 +Eosinophil Eosinophil markers inferred from Matsumara et al. Nat. Commun. 2022 data Epx Prg3 Eml5 Il5ra Qsox2 L2hgdh +Mast Mast cell markers inferred from Matsumara et al. Nat. Commun. 2022 data Tph1 Clnk Hs6st2 Plcg1 diff --git a/pegasus/data_files/tonsil_markers.gmt b/pegasus/data_files/tonsil_markers.gmt new file mode 100644 index 00000000..bfefe13c --- /dev/null +++ b/pegasus/data_files/tonsil_markers.gmt @@ -0,0 +1,18 @@ +Skeletal muscle cells Skeletal muscle cells MYBPC1 TNNT1 TNNC1 MYL1 MYBPH TNNC2 TNNI1 MYH7 MYL2 +Tfh T Follicular helper markers (one reference point is https://www.thermofisher.com/us/en/home/life-science/cell-analysis/cell-analysis-learning-center/immunology-at-work/t-follicular-helper-cell-overview.html) CD4 ST8SIA1 PDCD1 TIGIT TOX2 ICOS SH2D1A IL21 +Tregs Tregs CTLA4 TIGIT IL2RA FOXP3 CCR8 BATF +T_Naive Naive T cell CCR7 SELL IL7R TCF7 CD27 +DC_Migratory Migratory Conventional Dendritic cell FSCN1 CCR7 LAMP3 CCL19 CCL22 CD40 BIRC3 +MAIT MAIT SLC4A10 +EC lymphatic Schupp et al. 
Circulation 2021 PECAM1 CLDN5 CDH5 ERG CCL21 SEMA3D PROX1 PDPN MMRN1 RELN PKHD1L1 TFF3 LYVE1 FLT4 TBX1 +fDC Follicular dendritic cell CXCL13 FCAMR FDCSP SERPINE2 PAPPA NPHS1 PKDCC SYNM NRG2 CDC42EP4 MUC3A PRUNE2 B4GALNT4 NPPC SLC1A2 TMEM150C +DCs_CLEC9A Conventional Dendritic cell type 1 CLEC9A BATF3 IRF8 CPVL CADM1 +DCs_CD1C Conventional Dendritic cell type 2 CD1C FCER1A FCGBP CD1A CD207 HLA-DQB2 +pDCs Plasmacytoid Dendritic cell IRF4 LILRA4 TCF4 MZB1 +B_Naive Naïve B cell MS4A1 IGHD TCL1A FCER2 +B_Memory Memory B cell MS4A1 CD27 TNFRSF13B +B_Germinal_Center Germinal center B cell MEF2B NEIL1 RGS13 ELL3 BCL7A BCL6 NUGGC MYBL1 EML6 FANCA +B_light_zone Light Zone CD83 LMO2 +B_dark_zone Dark Zone CXCR4 AICDA FOXP1 MME +Mono_DCs Monocytes Derived DC CD14 FCGR2B CCL17 CLEC10A +MyoF Myofibroblast from Travaglini et al. and Tony et al. ACTA2 MYL9 MT2A EEF1A1 TMSB10 FAU UBA52 SERF2 PTMA S100A6 diff --git a/pegasus/pipeline/pipeline.py b/pegasus/pipeline/pipeline.py index 34626967..5ede69b3 100644 --- a/pegasus/pipeline/pipeline.py +++ b/pegasus/pipeline/pipeline.py @@ -92,6 +92,7 @@ def analyze_one_modality(unidata: UnimodalData, output_name: str, is_raw: bool, tools.neighbors( unidata, K=kwargs["K"], + exact_k=kwargs["exact_K"], rep=dim_key, n_jobs=kwargs["n_jobs"], random_state=kwargs["random_state"], diff --git a/pegasus/plotting/plot_library.py b/pegasus/plotting/plot_library.py index f4a95549..edce18d9 100644 --- a/pegasus/plotting/plot_library.py +++ b/pegasus/plotting/plot_library.py @@ -1,6 +1,7 @@ import numpy as np import pandas as pd import seaborn as sns +import matplotlib import matplotlib.pyplot as plt from scipy.sparse import issparse @@ -31,6 +32,7 @@ _generate_categories, _plot_corners, _plot_spots, + _get_valid_attrs, ) @@ -45,7 +47,7 @@ def scatter( fix_corners: Optional[bool] = True, alpha: Optional[Union[float, List[float]]] = 1.0, legend_loc: Optional[Union[str, List[str]]] = "right margin", - legend_fontsize: Optional[Union[int, List[int]]] = 10, + 
legend_fontsize: Optional[Union[int, List[int]]] = 10, legend_ncol: Optional[str] = None, palettes: Optional[Union[str, List[str]]] = None, cmaps: Optional[Union[str, List[str]]] = "YlOrRd", @@ -152,6 +154,9 @@ def scatter( elif not is_list_like(attrs): attrs = [attrs] + # Select only valid attributes + attrs = _get_valid_attrs(data, attrs) + if isinstance(basis, str): basis = [basis] if isinstance(components, tuple): @@ -214,7 +219,7 @@ def scatter( if global_marker_size == None: global_marker_size = _get_marker_size(x.size) if marker_size is None else marker_size - + x_label = f"{basis_}{comp_key[0]}" y_label = f"{basis_}{comp_key[1]}" @@ -236,8 +241,6 @@ def scatter( values = slicing(data.X, col = loc) else: obsm_key, sep, component = attr.partition("@") - if (sep != "@") or (obsm_key not in data.obsm) or (not component.isdigit()): - raise KeyError(f"{attr} is not in data.obs, data.var_names or data.obsm!") values = data.obsm[obsm_key][:, int(component)] selected = restr_obj.get_satisfied(data, attr) @@ -864,6 +867,7 @@ def violin( hue: Optional[str] = None, matkey: Optional[str] = None, stripplot: Optional[bool] = False, + stripsize: int = 1, inner: Optional[str] = None, scale: Optional[str] = 'width', panel_size: Optional[Tuple[float, float]] = (8, 0.5), @@ -943,6 +947,9 @@ def violin( assert not isinstance(data, anndata.AnnData) data.select_matrix(matkey) + # Filter out attributes not existing in the data + attrs = _get_valid_attrs(data, attrs) + nrows = len(attrs) fig, axes = _get_subplot_layouts(nrows=nrows, ncols=1, panel_size=panel_size, dpi=dpi, left=left, bottom=bottom, wspace=wspace, hspace=0, squeeze=False, sharey=False) @@ -954,15 +961,20 @@ def violin( assert is_numeric_dtype(data.obs[key]) obs_keys.append(key) else: - if key not in data.var_names: - logger.warning(f"Cannot find gene {key}. 
Please make sure all genes are included in data.var_names before running this function!") - return None genes.append(key) df_list = [pd.DataFrame({"label": data.obs[groupby].values})] + if hue is not None: df_list.append(pd.DataFrame({hue: data.obs[hue].values})) stripplot = False + kwargs['hue'] = hue + kwargs['split'] = True + else: + kwargs['hue'] = "label" + kwargs['legend'] = False + kwargs['split'] = False + if len(obs_keys) > 0: df_list.append(data.obs[obs_keys].reset_index(drop=True)) if len(genes) > 0: @@ -973,8 +985,8 @@ def violin( for i in range(nrows): ax = axes[i, 0] if stripplot: - sns.stripplot(x="label", y=attrs[i], hue = hue, data=df, ax=ax, size=1, color="k", jitter=True) - sns.violinplot(x="label", y=attrs[i], hue = hue, data=df, inner=inner, linewidth=1, ax=ax, cut=0, scale=scale, split=True, palette=palette, **kwargs) + sns.stripplot(x="label", y=attrs[i], hue = hue, data=df, ax=ax, size=stripsize, color="k", jitter=True) + sns.violinplot(x="label", y=attrs[i], data=df, inner=inner, linewidth=1, ax=ax, cut=0, density_norm=scale, palette=palette, **kwargs) ax.grid(False) if hue is not None: @@ -987,6 +999,7 @@ def violin( ax.set_xlabel("") else: ax.set_xlabel(groupby) + ax.set_xticks(ax.get_xticks()) # Get rid of the UserWarning: set_ticklabels() should only be used with a fixed number of ticks ax.set_xticklabels(ax.get_xticklabels(), rotation=90) ax.set_ylabel(attrs[i], labelpad=8, rotation=0, horizontalalignment='right', fontsize='medium') ax.tick_params(axis='y', right=True, left=False, labelright=True, labelleft=False, labelsize='small') @@ -1005,16 +1018,26 @@ def violin( def heatmap( data: Union[MultimodalData, UnimodalData, anndata.AnnData], attrs: Union[str, List[str]], - groupby: str, + groupby: Optional[str] = None, matkey: Optional[str] = None, - on_average: bool = True, - switch_axes: bool = False, + gene_zscore: Optional[bool] = True, + on_average: Optional[bool] = True, + switch_axes: Optional[bool] = False, attrs_cluster: 
Optional[bool] = False, attrs_dendrogram: Optional[bool] = True, + attrs_method: Optional[bool] = 'ward', + attrs_optimal_ordering: Optional[bool] = True, + xlabel_size: Optional[float] = 10.0, + ylabel_size: Optional[float] = 10.0, + legend_fontsize: Optional[float] = 10.0, + xlabel_rotation: Optional[float] = 90.0, + ylabel_rotation: Optional[float] = 0.0, groupby_cluster: Optional[bool] = True, groupby_dendrogram: Optional[bool] = True, - attrs_labelsize: Optional[float] = 10.0, - groupby_labelsize: Optional[float] = 10.0, + groupby_method: Optional[bool] = 'ward', + groupby_optimal_ordering: Optional[bool] = True, + groupby_precomputed_linkage: Optional[np.array] = None, + show_sample_name: Optional[bool] = None, cbar_labelsize: Optional[float] = 10.0, panel_size: Tuple[float, float] = (10, 10), return_fig: Optional[bool] = False, @@ -1026,7 +1049,6 @@ def heatmap( Parameters ----------- - data: ``AnnData`` or ``MultimodalData`` or ``UnimodalData`` object Single-cell expression data. attrs: ``str`` or ``List[str]`` @@ -1034,13 +1056,16 @@ def heatmap( Cell attributes must exist in ``data.obs`` and must be numeric. Features must exist in ``data.var``. By default, attrs are plotted as columns. - groupby: ``str`` + groupby: ``str``, optional, default: ``None`` A categorical variable in data.obs that is used to categorize the cells, e.g. Clusters. By default, data.obs['groupby'] is plotted as rows. + If ``None``, use data.obs_names instead. matkey: ``str``, optional, default: ``None`` If matkey is set, select matrix with matkey as keyword in the current modality. Only works for MultimodalData or UnimodalData objects. + gene_zscore: ``bool``, optional, default: ``True`` + If ``True``, compute and then plot z scores for gene expression. on_average: ``bool``, optional, default: ``True`` - If ``True``, plot cluster average gene expression (i.e. show a Matrixplot); otherwise, plot a general heatmap. + If ``True``, plot cluster average gene expression or z score (i.e. 
show a Matrixplot); otherwise, plot a general heatmap. switch_axes: ``bool``, optional, default: ``False`` By default, X axis is for attributes, and Y axis for clusters. If this parameter is ``True``, switch the axes. Moreover, with ``on_average`` being ``False``, if ``switch_axes`` is ``False``, ``row_cluster`` is enforced to be ``False``; if ``switch_axes`` is ``True``, ``col_cluster`` is enforced to be ``False``. @@ -1048,14 +1073,32 @@ def heatmap( Cluster attributes and generate a attribute-wise dendrogram. attrs_dendrogram: ``bool``, optional, default: ``True`` Only matters if attrs_cluster is True. Show the dendrogram if this option is True. + attrs_method: ``str``, optional, default: ``ward`` + Linkage method for attrs, choosing from ``single``, ``complete``, ``average``, ``weighted``, ``centroid``, ``median`` and ``ward``. + attrs_optimal_ordering: ``bool``, optional, default: ``True`` + Parameter for scipy.cluster.hierarchy.linkage. If ``True``, the attrs linkage matrix will be reordered so that the distance between successive leaves is minimal. + xlabel_size: ``float``, optional, default: 10.0 + Fontsize for x-axis labels. + ylabel_size: ``float``, optional, default: 10.0 + Fontsize for y-axis labels. + legend_fontsize: ``float``, optional, default: 10.0 + Fontsize for legend labels. + xlabel_rotation: ``float``, optional, default: 90.0 + Rotation of x-axis labels. + ylabel_rotation: ``float``, optional, default: 0.0 + Rotation of y-axis labels. groupby_cluster: ``bool``, optional, default: ``True`` Cluster data.obs['groupby'] and generate a cluster-wise dendrogram. groupby_dendrogram: ``bool``, optional, default: ``True`` Only matters if groupby_cluster is True. Show the dendrogram if this option is True. - attrs_labelsize: ``float``, optional, default: 10.0 - Fontsize for labels of attrs. - groupby_labelsize: ``float``, optional, default: 10.0 - Fontsize for labels of data.obs['groupby'].
+ groupby_method: ``str``, optional, default: ``ward`` + Linkage method for groupby, choosing from ``single``, ``complete``, ``average``, ``weighted``, ``centroid``, ``median`` and ``ward``. + groupby_optimal_ordering: ``bool``, optional, default: ``True`` + Parameter for scipy.cluster.hierarchy.linkage. If ``True``, the groupby linkage matrix will be reordered so that the distance between successive leaves is minimal. + groupby_precomputed_linkage: ``np.array``, optional, default: ``None`` + Pass a precomputed linkage. + show_sample_name: ``bool``, optional, default: ``None`` + Whether to show sample names as tick labels. If ``None``, show_sample_name == ``True`` if groupby == ``None`` and otherwise show_sample_name == ``False``. cbar_labelsize: ``float``, optional, default: 10.0 Fontsize of the color bar. panel_size: ``Tuple[float, float]``, optional, default: ``(10, 10)`` @@ -1072,11 +1115,11 @@ def heatmap( ------- ``Figure`` object - A ``matplotlib.figure.Figure`` object containing the dot plot if ``return_fig == True`` + A ``matplotlib.figure.Figure`` object containing the heatmap if ``return_fig == True``; otherwise, a ``seaborn.matrix.ClusterGrid`` object is returned.
Examples -------- - >>> pg.heatmap(data, genes=['CD14', 'TRAC', 'CD34'], groupby='louvain_labels') + >>> pg.heatmap(data, attrs=['CD14', 'TRAC', 'CD34'], groupby='leiden_labels') """ if not isinstance(data, anndata.AnnData): @@ -1088,6 +1131,9 @@ def heatmap( if isinstance(attrs, str): attrs = [attrs] + # Filter out attributes not existing in the data + attrs = _get_valid_attrs(data, attrs) + obs_keys = [] genes = [] for key in attrs: @@ -1100,71 +1146,99 @@ def heatmap( return None genes.append(key) - clusters = data.obs[groupby].values - if not is_categorical_dtype(clusters): - clusters = pd.Categorical(clusters) - else: - clusters = clusters.remove_unused_categories() - df_list = [pd.DataFrame({'cluster_name': clusters})] - + df_list = [] if len(obs_keys) > 0: df_list.append(data.obs[obs_keys].reset_index(drop=True)) if len(genes) > 0: expr_mat = slicing(data[:, genes].X) + if gene_zscore: + from scipy.stats import zscore + expr_mat = zscore(expr_mat, ddof=1) df_list.append(pd.DataFrame(data=expr_mat, columns=genes)) df = pd.concat(df_list, axis = 1) - attr_names = df.columns[1:].values + df.index = data.obs_names + attr_names = df.columns.values + + if show_sample_name is None: + show_sample_name = True if groupby is None else False + groupby_tick_labels = df.index if show_sample_name else [] + + cluster_ids = None + cell_colors = None + if groupby is not None: + cluster_ids = data.obs[groupby].values + if not is_categorical_dtype(cluster_ids): + cluster_ids = pd.Categorical(cluster_ids) + else: + cluster_ids = cluster_ids.remove_unused_categories() + + if on_average: + if not 'cmap' in kwargs.keys(): + kwargs['cmap'] = 'Reds' + df['cluster_name'] = cluster_ids + df = df.groupby(by='cluster_name', observed=True).mean() + cluster_ids = df.index + groupby_tick_labels = cluster_ids + else: + if not groupby_cluster: + idx = cluster_ids.argsort(kind = 'mergesort') + df = df.iloc[idx, :] # organize df by category order + cluster_ids = cluster_ids[idx] - if 
on_average: - if not 'cmap' in kwargs.keys(): - kwargs['cmap'] = 'Reds' - df = df.groupby('cluster_name').mean() - cluster_ids = df.index - else: - cluster_ids = df.pop('cluster_name').values - if not groupby_cluster: - idx = cluster_ids.argsort(kind = 'mergesort') - df = df.iloc[idx, :] # organize df by category order - cluster_ids = cluster_ids[idx] + cell_colors = np.zeros(df.shape[0], dtype=object) + palette = _get_palette(cluster_ids.categories.size) + + for k, cat in enumerate(cluster_ids.categories): + cell_colors[cluster_ids == cat] = palette[k] - cell_colors = np.zeros(df.shape[0], dtype=object) - palette = _get_palette(cluster_ids.categories.size) + from scipy.cluster.hierarchy import linkage - for k, cat in enumerate(cluster_ids.categories): - cell_colors[cluster_ids == cat] = palette[k] + groupby_linkage = None + if groupby_cluster: + if groupby_precomputed_linkage is not None: + groupby_linkage = groupby_precomputed_linkage + else: + groupby_linkage = linkage(df, groupby_method, optimal_ordering = groupby_optimal_ordering) + attrs_linkage = None + if attrs_cluster: + attrs_linkage = linkage(df.T, attrs_method, optimal_ordering = attrs_optimal_ordering) if not switch_axes: cg = sns.clustermap( data=df, - row_colors=cell_colors if not on_average else None, + row_colors=cell_colors, col_colors=None, row_cluster=groupby_cluster, col_cluster=attrs_cluster, + row_linkage=groupby_linkage, + col_linkage=attrs_linkage, linewidths=0, - yticklabels=cluster_ids if on_average else [], + yticklabels=groupby_tick_labels, xticklabels=attr_names, figsize=panel_size, **kwargs, ) cg.ax_heatmap.set_ylabel("") - if attrs_labelsize is not None: - cg.ax_heatmap.tick_params(axis='x', labelsize=attrs_labelsize, labelrotation=75) + cg.ax_heatmap.tick_params(axis='x', labelsize=xlabel_size, labelrotation=xlabel_rotation) + cg.ax_heatmap.tick_params(axis='y', labelsize=ylabel_size, labelrotation=ylabel_rotation) else: cg = sns.clustermap( data=df.T, row_colors=None, - 
col_colors=cell_colors if not on_average else None, + col_colors=cell_colors, row_cluster=attrs_cluster, col_cluster=groupby_cluster, + row_linkage=attrs_linkage, + col_linkage=groupby_linkage, linewidths=0, yticklabels=attr_names, - xticklabels=cluster_ids if on_average else [], + xticklabels=groupby_tick_labels, figsize=panel_size, **kwargs, ) cg.ax_heatmap.set_xlabel("") - if attrs_labelsize is not None: - cg.ax_heatmap.tick_params(axis='y', labelsize=attrs_labelsize) + cg.ax_heatmap.tick_params(axis='y', labelsize=ylabel_size, labelrotation=ylabel_rotation) + cg.ax_heatmap.tick_params(axis='x', labelsize=xlabel_size, labelrotation=xlabel_rotation) show_row_dendrogram = (attrs_cluster and attrs_dendrogram) if switch_axes else (groupby_cluster and groupby_dendrogram) show_col_dendrogram = (groupby_cluster and groupby_dendrogram) if switch_axes else (attrs_cluster and attrs_dendrogram) @@ -1193,21 +1267,17 @@ def heatmap( cg.ax_cbar.yaxis.set_ticks_position("right") - if show_col_dendrogram: - cg.ax_heatmap.xaxis.tick_bottom() - cg.ax_col_dendrogram.set_visible(True) - else: - cg.ax_heatmap.xaxis.tick_top() - cg.ax_col_dendrogram.set_visible(False) + cg.ax_heatmap.xaxis.tick_bottom() + cg.ax_col_dendrogram.set_visible(show_col_dendrogram) cg.ax_cbar.tick_params(labelsize=cbar_labelsize) cg.fig.dpi = dpi - if not on_average: + if (groupby is not None) and (not on_average): if groupby_cluster: from matplotlib.patches import Patch legend_elements = [Patch(color = color, label = label) for color, label in zip(palette, cluster_ids.categories)] - cg.ax_heatmap.legend(handles=legend_elements, loc='lower left', bbox_to_anchor = (1.02, 1.02), fontsize = groupby_labelsize) + cg.ax_heatmap.legend(handles=legend_elements, loc='lower left', bbox_to_anchor = (1.02, 1.02), fontsize = legend_fontsize) else: values = cluster_ids.value_counts().values ticks = np.cumsum(values) - values / 2 @@ -1221,20 +1291,20 @@ def heatmap( cg.ax_col_colors.xaxis.tick_top() 
cg.ax_col_colors.set_xticks(ticks) cg.ax_col_colors.set_xticklabels(labels, rotation=45) - cg.ax_col_colors.tick_params(axis='x', top = False, labelsize = groupby_labelsize, length=10) + cg.ax_col_colors.tick_params(axis='x', top = False, labelsize = xlabel_size, length=10) if not isinstance(data, anndata.AnnData): if cur_matkey != data.current_matrix(): data.select_matrix(cur_matkey) - return cg.fig if return_fig else None + return cg.fig if return_fig else cg def dotplot( data: Union[MultimodalData, UnimodalData, anndata.AnnData], genes: Union[str, List[str]], groupby: str, - reduce_function: Callable[[np.ndarray], float] = np.mean, + reduce_function: Union[str, Callable[[np.ndarray], float]] = "mean", fraction_min: float = 0, fraction_max: float = None, dot_min: int = 0, @@ -1259,7 +1329,7 @@ def dotplot( Features to plot. groupby: ``str`` A categorical variable in data.obs that is used to categorize the cells, e.g. Clusters. - reduce_function: ``Callable[[np.ndarray], float]``, optional, default: ``np.mean`` + reduce_function: ``Union[str, Callable[[np.ndarray], float]]``, optional, default: ``"mean"`` Function to calculate statistic on expression data. Default is mean. fraction_min: ``float``, optional, default: ``0``. Minimum fraction of expressing cells to consider. 
@@ -1298,12 +1368,14 @@ def dotplot( sns.set(font_scale=0.7, style='whitegrid') if not is_list_like(genes): - geness = [genes] + genes = [genes] + + # Select only genes existing in the data + genes = _get_valid_attrs(data, genes) keywords = dict(cmap=cmap) keywords.update(kwds) - from scipy.sparse import issparse X = slicing(data[:, genes].X) df = pd.DataFrame(data=X, columns=genes) df[groupby] = data.obs[groupby].values @@ -1316,12 +1388,12 @@ def dotplot( idx = series == 0 if idx.sum() > 0: logger.warning(f"The following categories contain no cells and are removed: {','.join(list(series.index[idx]))}.") - df[groupby] = df[groupby].cat.remove_unused_categories() def non_zero(g): return np.count_nonzero(g) / g.shape[0] - summarized_df = df.groupby(groupby).aggregate([reduce_function, non_zero]) + # Set observed=True to suppress warnings. + summarized_df = df.groupby(by=groupby, observed=True).aggregate([reduce_function, non_zero]) row_indices = summarized_df.index.tolist() if sort_function == "natsorted": @@ -1359,9 +1431,9 @@ def non_zero(g): yticks = summarized_df.index.map(str).values if switch_axes: - x, y = y, x - xlabel, ylabel = ylabel, xlabel - xticks, yticks = yticks, xticks + x, y = y[::-1], x[::-1] + xlabel, ylabel = ylabel[::-1], xlabel[::-1] + xticks, yticks = yticks[::-1], xticks[::-1] dotplot_df = pd.DataFrame(data=dict(x=x, y=y, value=summary_values, pixels=pixels, fraction=fraction, xlabel=np.array(xlabel)[x], ylabel=np.array(ylabel)[y])) @@ -1440,7 +1512,7 @@ def non_zero(g): size_legend.grid(False) # Reset global settings. - sns.reset_orig() + matplotlib.rc_file_defaults() return fig if return_fig else None @@ -1497,7 +1569,7 @@ def dendrogram( linkage: ``str``, optional, default: ``complete`` Which linkage criterion to use, used by hierarchical clustering. Below are available options: - ``ward`` minimizes the variance of the clusters being merged. - - ``avarage`` uses the average of the distances of each observation of the two sets. 
+ - ``average`` uses the average of the distances of each observation of the two sets. - ``complete`` uses the maximum distances between all observations of the two sets. (Default) - ``single`` uses the minimum of the distances between all observations of the two sets. diff --git a/pegasus/plotting/plot_utils.py b/pegasus/plotting/plot_utils.py index 0c61cb58..e0b6fde8 100644 --- a/pegasus/plotting/plot_utils.py +++ b/pegasus/plotting/plot_utils.py @@ -9,6 +9,9 @@ from matplotlib.patches import Circle from matplotlib.collections import PatchCollection +import logging +logger = logging.getLogger(__name__) + def _transform_basis(basis: str) -> str: if basis == "tsne": @@ -435,3 +438,24 @@ def _plot_spots(x: np.ndarray, y: np.ndarray, c: Union[str, np.ndarray], s: floa spots.set_clim(vmin, vmax) ax.add_collection(spots) return spots + + +def _get_valid_attrs(data:Union[MultimodalData, UnimodalData], attrs: List[str]) -> List[str]: + attrs_filt = [] + attrs_drop = [] + for attr in attrs: + if (attr == '_all') or (attr in data.obs) or (attr in data.var_names) or ('@' in attr): + if not '@' in attr: + attrs_filt.append(attr) + else: + obsm_key, sep, component = attr.partition("@") + if (sep != "@") or (obsm_key not in data.obsm) or (not component.isdigit()): + attrs_drop.append(attr) + else: + attrs_filt.append(attr) + else: + attrs_drop.append(attr) + if len(attrs_drop) > 0: + logger.warning(f"Attributes {attrs_drop} are not in data.obs, data.var_names or data.obsm!") + + return attrs_filt diff --git a/pegasus/tools/__init__.py b/pegasus/tools/__init__.py index ac0f149e..3b5eebd3 100644 --- a/pegasus/tools/__init__.py +++ b/pegasus/tools/__init__.py @@ -56,7 +56,7 @@ net_umap, net_fle, ) -from .diff_expr import de_analysis, markers, write_results_to_excel, run_de_analysis +from .diff_expr import de_analysis, markers, write_results_to_excel, cluster_specific_markers, run_de_analysis from .gradient_boosting import find_markers, run_find_markers from .subcluster_utils 
import clone_subset from .signature_score import calc_signature_score, calculate_z_score diff --git a/pegasus/tools/clustering.py b/pegasus/tools/clustering.py index 29c50d5b..432129e5 100644 --- a/pegasus/tools/clustering.py +++ b/pegasus/tools/clustering.py @@ -1,6 +1,7 @@ import time import numpy as np import pandas as pd +from pandas.api.types import is_categorical_dtype from pegasusio import MultimodalData from natsort import natsorted @@ -643,10 +644,11 @@ def split_one_cluster( n_clust: int, res_label: str, rep: str = "pca", + n_comps: int = None, random_state: int = 0, ) -> None: """ - Use Leiden algorithm to split 'clust_id' in 'clust_label' into 'n_components' clusters and write the new clusting results to 'res_label'. Assume 'clust_label' named clusters as numbers (in str format). + Use Leiden algorithm to split 'clust_id' in 'clust_label' into 'n_clust' sub-clusters and write the new clustering results to 'res_label'. The sub-cluster names are the concatenation of original cluster name and the subcluster id (e.g. 'T' -> 'T-1', 'T-2'). Parameters ---------- @@ -663,11 +665,14 @@ def split_one_cluster( Split 'clust_id' into `n_clust' subclusters. res_label: `str`, - Write new clustering in data.obs['res_label']. The largest subcluster will use 'clust_id' as its cluster ID, while other subclusters will be numbered after existing clusters. + Write new clustering in data.obs['res_label']. The sub-cluster names are the concatenation of original cluster name and the subcluster id (e.g. 'T' -> 'T-1', 'T-2'). rep: ``str``, optional, default: ``"pca"`` The embedding representation used for Kmeans clustering. Keyword ``'X_' + rep`` must exist in ``data.obsm``. By default, use PCA coordinates. + n_comps: `int`, optional (default: None) + Number of components to be used in the `rep`. If n_comps == None, use all components; otherwise, use the minimum of n_comps and rep's dimensions.
+ n_jobs : `int`, optional (default: -1) Number of threads to use for the KMeans step in 'spectral_louvain' and 'spectral_leiden'. -1 refers to using all physical CPU cores. @@ -685,16 +690,35 @@ def split_one_cluster( -------- >>> pg.split_one_cluster(data, 'leiden_labels', '15', 2, 'leiden_labels_split') """ - idx = np.where(data.obs[clust_label] == clust_id)[0] + cats = None + if is_categorical_dtype(data.obs[clust_label]): + cats = data.obs[clust_label].cat.categories.values + else: + cats = pd.Categorical(data.obs[clust_label]).categories.values + if cats.dtype.kind not in {'S', 'U'}: + cats = cats.astype(str) + idx_cat = np.nonzero(cats==clust_id)[0] + + if idx_cat.size == 0: + raise ValueError(f"{clust_id} is not in {clust_label}!") + elif idx_cat.size > 1: + raise ValueError(f"Detected more than one categories in {clust_label} with name {clust_id}!") + else: + idx_cat = idx_cat[0] + + idx = np.nonzero((data.obs[clust_label] == clust_id).values)[0] tmpdat = data[idx].copy() from pegasus.tools import neighbors - neighbors(tmpdat, rep=rep, use_cache=False) + neighbors(tmpdat, rep=rep, n_comps=n_comps, use_cache=False) leiden(tmpdat, rep=rep, resolution=None, n_clust=n_clust, random_state=random_state) - new_clust = data.obs[clust_label].values.astype(int) - new_label = new_clust.max() + 1 - for label in tmpdat.obs['leiden_labels'].value_counts().index[1:]: - new_clust[idx[(tmpdat.obs['leiden_labels'] == label).values]] = new_label - new_label += 1 - data.obs[res_label] = pd.Categorical(values = new_clust.astype(str), categories = np.array(range(1, new_label)).astype(str)) + + new_clust = data.obs[clust_label].values.astype(object) + cats_sub = [] + for i, label in enumerate(tmpdat.obs['leiden_labels'].value_counts().index): + sub_id = f"{clust_id}-{i+1}" + new_clust[idx[(tmpdat.obs['leiden_labels'] == label).values]] = sub_id + cats_sub.append(sub_id) + + data.obs[res_label] = pd.Categorical(values = new_clust, categories = np.concatenate((cats[0:idx_cat], 
np.array(cats_sub), cats[idx_cat+1:]))) data.register_attr(res_label, "cluster") del tmpdat diff --git a/pegasus/tools/diff_expr.py b/pegasus/tools/diff_expr.py index 45461628..f83c01e1 100644 --- a/pegasus/tools/diff_expr.py +++ b/pegasus/tools/diff_expr.py @@ -419,7 +419,7 @@ def de_analysis( n_jobs: ``int``, optional, default: ``-1`` Number of threads to use. If ``-1``, use all available threads. - t: ``bool``, optional, default: ``True`` + t: ``bool``, optional, default: ``False`` If ``True``, calculate Welch's t test. fisher: ``bool``, optional, default: ``False`` @@ -756,6 +756,62 @@ def add_worksheet( logger.info("Excel spreadsheet is written.") +def cluster_specific_markers( + markers: Dict[str, Dict[str, pd.DataFrame]], + clust_id: str, + min_auroc: float = 0.7, + expected_pfc: float = 10.0, + n_lo: int = 25, + n_up: int = 50, +) -> pd.DataFrame: + """ Extract cluster-specific markers from DE results ``markers``. + + This function extracts cluster-specific markers (e.g. with auroc >= min_auroc and high in percentage fold change). The extracted markers can be screened for signatures representing the cluster. + + The selection procedure is as follows: First, pick genes with AUROC >= min_auroc and pfc (percentage fold change) >= expected_pfc. If the number is between [n_lo, n_up], return the subset of markers containing only these genes. Otherwise, if the number < n_lo, extend the gene set to include up to n_lo genes in descending order of their pfc. If the number > n_up, truncate the set by keeping only n_up genes with highest pfc. + + Parameters + ---------- + markers: ``Dict[str, Dict[str, pd.DataFrame]]`` + Markers from `de_analysis`. + + clust_id: ``str`` + Cluster ID to tell which cluster to focus on. + + min_auroc: ``float``, default, ``0.7`` + Minimum AUROC for a gene. + + expected_pfc: ``float``, optional, default: ``10.0`` + Expected percentage fold change for a gene. 
+ + n_lo: ``int``, optional, default: ``25`` + Lower bound (inclusive) on the number of genes to return. + + n_up: ``int``, optional, default: ``50`` + Upper bound (inclusive) on the number of genes to return. + + Returns + ------- + results: ``pd.DataFrame`` + A Python dataframe containing selected markers, ranking in descending order with respect to AUROC. + + Examples + -------- + >>> candidates = pg.cluster_specific_markers(markers, 'Mono') + """ + df = markers[clust_id]['up'] + idx_auc = df['auroc'] >= min_auroc + idx_epf = df['percentage_fold_change'] >= expected_pfc + idx = idx_auc & idx_epf + n = idx.sum() + if n >= n_lo and n <= n_up: + return df[idx] + else: + res = df[idx_auc].sort_values('percentage_fold_change', ascending=False) + res = res.iloc[0:(n_lo if n < n_lo else n_up)].sort_values('auroc', ascending=False) + return res + + @timer(logger=logger) def run_de_analysis( input_file: str, diff --git a/pegasus/tools/doublet_detection.py b/pegasus/tools/doublet_detection.py index 26aadd4a..f9f69393 100644 --- a/pegasus/tools/doublet_detection.py +++ b/pegasus/tools/doublet_detection.py @@ -267,7 +267,7 @@ def _run_scrublet( If True, plot diagnostic histograms. Each sample would have a figure consisting of 4 panels showing histograms of doublet scores for observed cells (panel 1, density in log scale), simulated doublets (panel 2, density in log scale), KDE plot (panel 3) and signed curvature plot (panel 4) of log doublet scores for simulated doublets. manual_correction: ``str``, optional, default: ``None`` - If present, use human guide provided in manual_correction to select threshold. Currently support 'peak' and 'expected'. 'peak' means cutting at the center of the peak and 'expected' means cutting at the expected doublet rate. + If present, use human guide provided in manual_correction to select threshold. Currently support 'peak', 'expected' and threshold. 
'peak' means cutting at the center of the peak and 'expected' means cutting at the expected doublet rate. If not both, convert guide to float and use as user-specified threshold. Returns -------- @@ -349,7 +349,7 @@ def _run_scrublet( if k is None: k = int(round(0.5 * np.sqrt(obsX.shape[0]))) k_adj = int(round(k * (1.0 + r))) - indices, _ = calculate_nearest_neighbors(pc_coords, K = k_adj + 1, n_jobs = n_jobs) + indices, _, _ = calculate_nearest_neighbors(pc_coords, K=k_adj + 1, n_jobs=n_jobs, exact_k=True) # Calculate scrublet-like doublet score k_d = is_doublet[indices].sum(axis = 1) @@ -420,6 +420,8 @@ def _run_scrublet( threshold = np.exp(x[maxima_by_x[-1]]) elif manual_correction == "expected": threshold = threshold_theory + else: + threshold = float(manual_correction) data.obs["doublet_score"] = obs_scores.astype(np.float32) data.obs["pred_dbl"] = obs_scores > threshold @@ -474,7 +476,7 @@ def infer_doublets( data: MultimodalData, channel_attr: Optional[str] = None, clust_attr: Optional[str] = None, - raw_mat_key: Optional[str] = 'counts', + raw_mat_key: Optional[str] = None, min_cell: Optional[int] = 100, expected_doublet_rate: Optional[float] = None, sim_doublet_ratio: Optional[float] = 2.0, @@ -501,6 +503,9 @@ def infer_doublets( clust_attr: ``str``, optional, default: None Attribute indicating cluster labels. If set, estimate proportion of doublets in each cluster and statistical significance. + raw_mat_key: ``str``, optional, default: None + The key for raw count matrix. By default, Pegasus will first try "counts" and then try "raw.X" + min_cell: ``int``, optional, default: 100 Minimum number of cells per sample to calculate doublet scores. For samples having less than 'min_cell' cells, doublet score calculation will be skipped. @@ -529,7 +534,7 @@ def infer_doublets( If not None, plot diagnostic histograms using ``plot_hist`` as the prefix. 
If `channel_attr` is None, ``plot_hist.dbl.png`` is generated; Otherwise, ``plot_hist.channel_name.dbl.png`` files are generated. Each figure consists of 4 panels showing histograms of doublet scores for observed cells (panel 1, density in log scale), simulated doublets (panel 2, density in log scale), KDE plot (panel 3) and signed curvature plot (panel 4) of log doublet scores for simulated doublets. Each plot contains two dashed lines. The red dashed line represents the theoretical cutoff (calucalted based on number of cells and 10x doublet table) and the black dashed line represents the cutof inferred from the data. manual_correction: ``str``, optional, default: ``None`` - Use human guide to correct doublet threshold for certain channels. This is string representing a comma-separately list. Each item in the list represent one sample and the sample name and correction guide are separated using ':'. The orrection guides supported are 'peak' and 'expected'. 'peak' means cutting at the center of the peak and 'expected' means cutting at the expected doublet rate. If only one sample available, use '' as the sample name. + Use human guide to correct doublet threshold for certain channels. This is a string representing a comma-separated list. Each item in the list represents one sample, and the sample name and correction guide are separated using ':'. The correction guides supported are 'peak', 'expected' and threshold. 'peak' means cutting at the center of the peak; 'expected' means cutting at the expected doublet rate; threshold is the user-specified doublet threshold; if the guide is neither 'peak' nor 'expected', pegasus will try to convert the string into float and use it as doublet threshold. If only one sample is available, there is no need to specify the sample name.
Returns ------- @@ -545,6 +550,11 @@ def infer_doublets( >>> pg.infer_doublets(data, channel_attr = 'Channel', clust_attr = 'Annotation') """ assert data.get_modality() == "rna" + + if raw_mat_key is None: + raw_mat_key = 'counts' + if raw_mat_key not in data.list_keys(): + raw_mat_key = 'raw.X' try: rawX = data.get_matrix(raw_mat_key) except ValueError: @@ -554,10 +564,13 @@ def infer_doublets( mancor = {} if manual_correction is not None: - for item in manual_correction.split(','): - name, action = item.split(':') - mancor[name] = action - + if channel_attr is None: + mancor[''] = manual_correction + else: + for item in manual_correction.split(','): + name, action = item.split(':') + mancor[name] = action + if channel_attr is None: if data.shape[0] >= min_cell: fig = _run_scrublet(data, raw_mat_key, expected_doublet_rate = expected_doublet_rate, sim_doublet_ratio = sim_doublet_ratio, \ @@ -586,9 +599,9 @@ def infer_doublets( if idx.size >= min_cell: unidata = UnimodalData({"barcodekey": data.obs_names[idx]}, {"featurekey": data.var_names}, - {"counts": rawX[idx]}, + {raw_mat_key: rawX[idx]}, {"genome": genome, "modality": modality}, - cur_matrix = "counts") + cur_matrix = raw_mat_key) # Identify robust genes, count and log normalized and select top 2,000 highly variable features identify_robust_genes(unidata) log_norm(unidata) diff --git a/pegasus/tools/nearest_neighbors.py b/pegasus/tools/nearest_neighbors.py index fdc16109..fd95e4ca 100644 --- a/pegasus/tools/nearest_neighbors.py +++ b/pegasus/tools/nearest_neighbors.py @@ -34,27 +34,61 @@ def calculate_nearest_neighbors( K: int = 100, n_jobs: int = -1, method: str = "hnsw", + exact_k: bool = False, M: int = 20, efC: int = 200, efS: int = 200, random_state: int = 0, full_speed: int = False, dist: str = 'l2', -): - """Calculate nearest neighbors - X is the sample by feature matrix - Return K -1 neighbors, the first one is the point itself and thus omitted. 
- TODO: Documentation - """ +) -> Tuple[List[int], List[float], int]: + """Find K nearest neighbors for each data point in the matrix and return the indices and distances arrays. + + K is determined by min(K, int(sqrt(X.shape[0]))) if exact_k == False. + + Parameters + ---------- + + X : `np.array` + An array of n_samples by n_features. + K : `int`, optional (default: 100) + Number of neighbors, including the data point itself. If K is None, determine K by sqrt(X.shape[0]). + n_jobs : `int`, optional (default: -1) + Number of threads to use. -1 refers to using all physical CPU cores. + method: `str`, optional (default: 'hnsw') + Choosing from 'hnsw' for approximate nearest neighbor search or 'sklearn' for exact nearest neighbor search. If X.shape[0] <= 1000, method will be automatically set to "sklearn" for exact KNN search + exact_k: `bool`, optional (default: 'False') + If True, use exactly the K passed to the function; otherwise K is determined as min(K, sqrt(X.shape[0])). + M, efC, efS: `int`, optional (20, 200, 200) + HNSW algorithm parameters. + random_state: `int`, optional (default: 0) + Random seed for random number generator. + full_speed: `bool`, optional (default: False) + If full_speed, use multiple threads in constructing hnsw index. However, the kNN results are not reproducible. If not full_speed, use only one thread to make sure results are reproducible. + dist: `str`, optional (default: 'l2') + Distance metric to use. By default, use squared L2 distance. Available options, 'l2', inner product 'ip' or cosine similarity 'cosine'. + + Returns + ------- + kNN indices array, distances array and adjusted K. 
+ + Examples + -------- + >>> indices, distances, K = calculate_nearest_neighbors(X) + """ nsample = X.shape[0] if nsample <= 1000: method = "sklearn" - if nsample < K: - logger.warning(f"Warning: in calculate_nearest_neighbors, number of samples = {nsample} < K = {K}!\n Set K to {nsample}.") - K = nsample + k_rot = int(nsample ** 0.5) # rot, rule of thumb + if (K is None) or (K > k_rot and (not exact_k)): + K = k_rot + logger.info(f"in calculate_nearest_neighbors, K is adjusted to {K}.") + + if K == 1: + return np.zeros((nsample, 0), dtype=int), np.zeros((nsample, 0), dtype=np.float32), K n_jobs = eff_n_jobs(n_jobs) @@ -91,7 +125,7 @@ def calculate_nearest_neighbors( knn.fit(X) distances, indices = knn.kneighbors() - return indices, distances + return indices, distances, K def knn_is_cached( @@ -114,11 +148,15 @@ def get_neighbors( n_jobs: int = -1, random_state: int = 0, full_speed: bool = False, - use_cache: bool = True, + use_cache: bool = False, dist: str = "l2", -) -> Tuple[List[int], List[float]]: + method: str = "hnsw", + exact_k: bool = False, +) -> Tuple[List[int], List[float], int]: """Find K nearest neighbors for each data point and return the indices and distances arrays. + K is determined by min(K, int(sqrt(data.shape[0]))) if exact_k == False. + Parameters ---------- @@ -136,34 +174,44 @@ def get_neighbors( Random seed for random number generator. full_speed: `bool`, optional (default: False) If full_speed, use multiple threads in constructing hnsw index. However, the kNN results are not reproducible. If not full_speed, use only one thread to make sure results are reproducible. - use_cache: `bool`, optional (default: True) + use_cache: `bool`, optional (default: False) If use_cache and found cached knn results, will not recompute. dist: `str`, optional (default: 'l2') - Distance metric to use. By default, use squared L2 distance. Available options, inner product 'ip' or cosine similarity 'cosine'. + Distance metric to use.
By default, use squared L2 distance. Available options, 'l2' or inner product 'ip' or cosine similarity 'cosine'. + method: `str`, optional (default: 'hnsw') + Choosing from 'hnsw' for approximate nearest neighbor search or 'sklearn' for exact nearest neighbor search. + exact_k: `bool`, optional (default: 'False') + If True, use exactly the K passed to the function; otherwise K is determined as min(K, sqrt(X.shape[0])). Returns ------- - kNN indices and distances arrays. + kNN indices array, distances array, and adjusted K. Examples -------- - >>> indices, distances = tools.get_neighbors(data) + >>> indices, distances, K = tools.get_neighbors(data) """ - rep = update_rep(rep) indices_key = rep + "_knn_indices" distances_key = rep + "_knn_distances" + k_rot = int(data.shape[0] ** 0.5) # rot, rule of thumb + if (K is None) or (K > k_rot and (not exact_k)): + K = k_rot + logger.info(f"in get_neighbors, K is adjusted to {K}.") + if use_cache and knn_is_cached(data, indices_key, distances_key, K): indices = data.obsm[indices_key] distances = data.obsm[distances_key] logger.info("Found cached kNN results, no calculation is required.") else: - indices, distances = calculate_nearest_neighbors( + indices, distances, _ = calculate_nearest_neighbors( X_from_rep(data, rep, n_comps), K=K, n_jobs=eff_n_jobs(n_jobs), + method=method, + exact_k=exact_k, random_state=random_state, full_speed=full_speed, dist=dist, @@ -173,7 +221,7 @@ def get_neighbors( data.obsm[distances_key] = distances data.register_attr(distances_key, "knn") - return indices, distances + return indices, distances, K def get_symmetric_matrix(csr_mat: "csr_matrix") -> "csr_matrix": @@ -235,13 +283,17 @@ def neighbors( n_jobs: int = -1, random_state: int = 0, full_speed: bool = False, - use_cache: bool = True, + use_cache: bool = False, dist: str = "l2", + method: str = "hnsw", + exact_k: bool = False, ) -> None: """Compute k nearest neighbors and affinity matrix, which will be used for diffmap and graph-based 
community detection algorithms. The kNN calculation uses `hnswlib `_ introduced by [Malkov16]_. + K is determined by min(K, sqrt(data.shape[0])). + Parameters ---------- @@ -267,12 +319,18 @@ def neighbors( * If ``True``, use multiple threads in constructing ``hnsw`` index. However, the kNN results are not reproducible. * Otherwise, use only one thread to make sure results are reproducible. - use_cache: ``bool``, optional, default: ``True`` + use_cache: ``bool``, optional, default: ``False`` * If ``True`` and found cached knn results, Pegasus will use cached results and do not recompute. * Otherwise, compute kNN irrespective of caching status. dist: ``str``, optional (default: ``"l2"``) - Distance metric to use. By default, use squared L2 distance. Available options, inner product ``"ip"`` or cosine similarity ``"cosine"``. + Distance metric to use. By default, use squared L2 distance. Available options, ``"l2"`` or inner product ``"ip"`` or cosine similarity ``"cosine"``. + + method: ``str``, optional (default: ``"hnsw"``) + Choose from "hnsw" or "sklearn". "hnsw" uses HNSW algorithm for approximate nearest neighbor search and "sklearn" uses sklearn package for exact nearest neighbor search. + + exact_k: ``bool``, optional (default: ``False``) + If True, use exactly the K passed to the function; otherwise K is determined as min(K, sqrt(X.shape[0])). 
Returns ------- @@ -292,7 +350,7 @@ def neighbors( # calculate kNN rep = update_rep(rep) - indices, distances = get_neighbors( + indices, distances, K = get_neighbors( data, K=K, rep=rep, @@ -302,6 +360,8 @@ def neighbors( full_speed=full_speed, use_cache=use_cache, dist=dist, + method=method, + exact_k=exact_k, ) # calculate affinity matrix @@ -408,7 +468,7 @@ def calc_kBET( attr_values = data.obs[attr].values.copy() attr_values.categories = range(nbatch) - indices, distances = get_neighbors( + indices, distances, K = get_neighbors( data, K=K, rep=rep, n_jobs=n_jobs, random_state=random_state, use_cache=use_cache, ) knn_indices = np.concatenate( @@ -499,7 +559,7 @@ def calc_kSIM( assert attr in data.obs nsample = data.shape[0] - indices, distances = get_neighbors( + indices, distances, K = get_neighbors( data, K=K, rep=rep, n_jobs=n_jobs, random_state=random_state, use_cache=use_cache, ) knn_indices = np.concatenate( diff --git a/pegasus/tools/nmf.py b/pegasus/tools/nmf.py index a37399d2..8cf2cb08 100644 --- a/pegasus/tools/nmf.py +++ b/pegasus/tools/nmf.py @@ -81,6 +81,7 @@ def nmf( alpha_H: float = 0.0, l1_ratio_H: float = 0.0, fp_precision: str = "float", + online_chunk_size: int = 5000, n_jobs: int = -1, random_state: int = 0, ) -> None: @@ -137,6 +138,9 @@ def nmf( fp_precision: ``str``, optional, default: ``float`` The numeric precision on the results. Choose from ``float`` and ``double``. + online_chunk_size: ``int``, optional, default: ``5000`` + The chunk / mini-batch size for online learning. Only works when ``mode='online'``. + n_jobs : `int`, optional (default: -1) Number of threads to use. -1 refers to using all physical CPU cores.
@@ -189,6 +193,7 @@ def nmf( alpha_H=alpha_H, l1_ratio_H=l1_ratio_H, fp_precision=fp_precision, + online_chunk_size=online_chunk_size, ) data.uns["nmf_features"] = features # record which feature to use @@ -285,6 +290,7 @@ def integrative_nmf( use_gpu: bool = False, lam: float = 5.0, fp_precision: str = "float", + online_chunk_size: int = 5000, n_jobs: int = -1, random_state: int = 0, quantile_norm: bool = True, @@ -334,6 +340,9 @@ def integrative_nmf( fp_precision: ``str``, optional, default: ``float`` The numeric precision on the results. Choose from ``float`` and ``double``. + online_chunk_size: ``int``, optional, default: ``5000`` + The chunk / mini-batch size for online learning. Only works when ``mode='online'``. + n_jobs : `int`, optional (default: -1) Number of threads to use. -1 refers to using all physical CPU cores. @@ -394,6 +403,7 @@ def integrative_nmf( use_gpu=use_gpu, lam=lam, fp_precision=fp_precision, + online_chunk_size=online_chunk_size, ) # Implementation of algo 3, quantile normalization @@ -406,14 +416,19 @@ def integrative_nmf( seeds = rg.integers(4294967295, size=nbatch) ref_batch = max_size = -1 for i in range(nbatch): - H_new = np.ascontiguousarray(Hs[i] / np.linalg.norm(Hs[i], axis=0), dtype=np.float32) # Scale H + h_norm = np.linalg.norm(Hs[i], axis=0) + idx_h_zeros = np.where(h_norm==0)[0] + if idx_h_zeros.size > 0: + # Set norm 0 to 1 to avoid divide by zero issue + h_norm[idx_h_zeros] = 1.0 + H_new = np.ascontiguousarray(Hs[i] / h_norm, dtype=np.float32) # Scale H Hs_new.append(H_new) # Append scaled H if not quantile_norm: continue clusters = np.argmax(H_new, axis=1) # Assign cluster - indices, _ = calculate_nearest_neighbors(H_new, K=20, n_jobs=n_jobs, random_state=seeds[i]) # KNN with K=20 + indices, _, _ = calculate_nearest_neighbors(H_new, K=20, n_jobs=n_jobs, random_state=seeds[i]) # KNN with K=20 clusters, csum = _refine_cluster(clusters, indices, n_components) # Refine cluster csums.append(csum) 
ids_by_clusts.append(np.argsort(clusters, kind='stable')) diff --git a/pegasus/tools/preprocessing.py b/pegasus/tools/preprocessing.py index 71c78b3b..dd105f74 100644 --- a/pegasus/tools/preprocessing.py +++ b/pegasus/tools/preprocessing.py @@ -276,10 +276,9 @@ def _run_filter_data( if output_filt is not None: group_key = unidata.get_uid() - writer = pd.ExcelWriter(f"{output_filt}.{group_key}.filt.xlsx", engine="xlsxwriter") - df_cells = get_filter_stats(unidata, min_genes_before_filt = min_genes_before_filt) - df_cells.to_excel(writer, sheet_name="Cell filtration stats") - writer.save() + with pd.ExcelWriter(f"{output_filt}.{group_key}.filt.xlsx", engine="xlsxwriter") as writer: + df_cells = get_filter_stats(unidata, min_genes_before_filt = min_genes_before_filt) + df_cells.to_excel(writer, sheet_name="Cell filtration stats") logger.info(f"Filtration results for {group_key} are written.") if plot_filt is not None: @@ -347,7 +346,7 @@ def _set_target_mat(data, X, target_matrix, select, base_matrix, suffix): if target_matrix in data.matrices: logger.warning(f"{target_matrix} is in data's matrices. It will be rewritten.") - data.add_matrix(target_matrix, X) + data.update_matrix(target_matrix, X) if select: data.select_matrix(target_matrix) diff --git a/pegasus/tools/scvitools.py b/pegasus/tools/scvitools.py index a01c0a8e..20dfd1c3 100644 --- a/pegasus/tools/scvitools.py +++ b/pegasus/tools/scvitools.py @@ -190,9 +190,14 @@ def run_scvi( scvi.settings.num_threads = eff_n_jobs(n_jobs) # set n_jobs scvi.settings.seed = random_state # set random_state, see [here](https://docs.scvi-tools.org/en/stable/_modules/scvi/_settings.html) for more details. 
+ print(max_epochs) + if max_epochs is None: max_epochs = np.min([round((20000 / len(adata.obs)) * 400), 400]) + print(type(max_epochs)) + print(max_epochs) + scvi.model.SCVI.setup_anndata(adata, batch_key=batch, categorical_covariate_keys=categorical_covariate_keys, diff --git a/pegasus/tools/signature_score.py b/pegasus/tools/signature_score.py index ee14c446..1f055d39 100644 --- a/pegasus/tools/signature_score.py +++ b/pegasus/tools/signature_score.py @@ -1,4 +1,5 @@ import numpy as np +import scipy.sparse as sp import pandas as pd from typing import Dict, List, Union @@ -30,7 +31,7 @@ def _check_and_calc_sig_background(data: UnimodalData, n_bins: int) -> bool: bins = pd.qcut(mean_vec, n_bins, duplicates = "drop") if bins.value_counts().min() == 1: logger.warning("Detected bins with only 1 gene!") - bins.categories = bins.categories.astype(str) + bins = bins.rename_categories(dict(zip(bins.categories, bins.categories.astype(str)))) data.var["bins"] = bins # calculate background expectations @@ -89,7 +90,11 @@ def calculate_z_score( if not _check_and_calc_sig_background(data, n_bins): return None - z_score_mat = (data.X.toarray().astype(np.float32) - data.var["mean"].values.astype(np.float32) - data.obsm["sig_bkg_mean"][:, data.var["bins"].cat.codes].astype(np.float32)) / data.obsm["sig_bkg_std"][:, data.var["bins"].cat.codes].astype(np.float32) + mat = data.X + if sp.issparse(mat): + mat = mat.toarray() + + z_score_mat = (mat.astype(np.float32) - data.var["mean"].values.astype(np.float32) - data.obsm["sig_bkg_mean"][:, data.var["bins"].cat.codes].astype(np.float32)) / data.obsm["sig_bkg_std"][:, data.var["bins"].cat.codes].astype(np.float32) return z_score_mat @@ -100,6 +105,7 @@ def calc_signature_score( signatures: Union[Dict[str, List[str]], str], n_bins: int = 50, show_omitted_genes: bool = False, + skip_threshold: int = 1, random_state: int = 0 ) -> None: """Calculate signature / gene module score. 
[Li20-1]_ @@ -124,12 +130,21 @@ def calc_signature_score( * ``apoptosis_human`` contains one signature, ``apoptosis``, which includes apoptosis-related genes from the KEGG pathway. * ``cell_cycle_mouse``, ``gender_mouse``, ``mitochondrial_genes_mouse``, ``ribosomal_genes_mouse`` and ``apoptosis_mouse`` are the corresponding signatures for mouse. Gene symbols are directly translated from human genes. + In addition, Pegasus provides the following 4 curated signature panels: + * ``emt_human``, the Epithelial-Mesenchymal Transition signature from Gibbons and Creighton Dev. Dyn. 2018. + * ``human_lung``, human lung cell type markers. + * ``mouse_brain``, mouse brain cell type markers. + * ``mouse_liver``, mouse liver cell type markers. + n_bins: ``int``, optional, default: 50 Number of bins on expression levels for grouping genes. show_omitted_genes: ``bool``, optional, default False Signature genes that are not expressed in the data will be omitted. By default, pegasus does not report which genes are omitted. If this option is turned on, report omitted genes. + skip_threshold: ``int``, optional, default 1 + Skip signature calculation if the number of kept genes is less than skip_threshold. + random_state: ``int``, optional, default: 0 Random state used by KMeans if signature == ``gender_human`` or ``gender_mouse``.
@@ -170,16 +185,22 @@ def calc_signature_score( sig_string = signatures if sig_string in predefined_signatures: signatures = load_signatures_from_file(predefined_signatures[sig_string]) - from threadpoolctl import threadpool_limits + + if sig_string.startswith("mitochondrial_genes"): + del signatures["mito_noncoding"] + elif sig_string.startswith("ribosomal_genes"): + del signatures["ribo_like"] + + _calc_sig_scores(data, signatures, show_omitted_genes=show_omitted_genes, skip_threshold=skip_threshold) if sig_string.startswith("cell_cycle"): - _calc_sig_scores(data, signatures, show_omitted_genes = show_omitted_genes) data.obs["cycle_diff"] = data.obs["G2/M"] - data.obs["G1/S"] values = data.obs[["G1/S", "G2/M"]].values maxvalues = values.max(axis = 1) data.obs["cycling"] = maxvalues + from threadpoolctl import threadpool_limits kmeans = KMeans(n_clusters=2, random_state=random_state) with threadpool_limits(limits = 1): kmeans.fit(maxvalues.reshape(-1, 1)) @@ -191,9 +212,9 @@ def calc_signature_score( data.obs["predicted_phase"] = pd.Categorical.from_codes(codes, categories = ["G0", "G1/S", "G2/M"]) elif sig_string.startswith("gender"): - _calc_sig_scores(data, signatures, show_omitted_genes = show_omitted_genes, skip_threshold = 1) data.obs["gender_score"] = data.obs["male_score"] - data.obs["female_score"] + from threadpoolctl import threadpool_limits kmeans = KMeans(n_clusters=3, random_state=random_state) with threadpool_limits(limits = 1): kmeans.fit(data.obs["gender_score"].values.reshape(-1, 1)) @@ -201,18 +222,10 @@ def calc_signature_score( codes = list(map(lambda x: reorg_dict[x], kmeans.labels_)) data.obs["predicted_gender"] = pd.Categorical.from_codes(codes, categories = ["female", "uncertain", "male"]) - elif sig_string.startswith("mitochondrial_genes"): - del signatures["mito_noncoding"] - _calc_sig_scores(data, signatures, show_omitted_genes = show_omitted_genes, skip_threshold = 1) - elif sig_string.startswith("ribosomal_genes"): - del 
signatures["ribo_like"] - _calc_sig_scores(data, signatures, show_omitted_genes = show_omitted_genes, skip_threshold = 1) - elif sig_string.startswith("apoptosis"): - _calc_sig_scores(data, signatures, show_omitted_genes = show_omitted_genes, skip_threshold = 1) - else: - assert False + elif sig_string == "emt_human": + data.obs["EMT_score"] = data.obs["Mesenchymal-like"] - data.obs["Epithelial-like"] else: signatures = load_signatures_from_file(sig_string) - _calc_sig_scores(data, signatures, show_omitted_genes = show_omitted_genes) + _calc_sig_scores(data, signatures, show_omitted_genes=show_omitted_genes, skip_threshold=skip_threshold) else: - _calc_sig_scores(data, signatures, show_omitted_genes = show_omitted_genes) + _calc_sig_scores(data, signatures, show_omitted_genes=show_omitted_genes, skip_threshold=skip_threshold) diff --git a/pegasus/tools/utils.py b/pegasus/tools/utils.py index d3970847..fb739456 100644 --- a/pegasus/tools/utils.py +++ b/pegasus/tools/utils.py @@ -192,6 +192,11 @@ def check_batch_key(data: Union[MultimodalData, UnimodalData], batch: Union[str, ribosomal_genes_mouse=pkg_resources.resource_filename("pegasus", "data_files/ribosomal_genes_mouse.gmt"), apoptosis_human=pkg_resources.resource_filename("pegasus", "data_files/apoptosis_human.gmt"), apoptosis_mouse=pkg_resources.resource_filename("pegasus", "data_files/apoptosis_mouse.gmt"), + human_lung=pkg_resources.resource_filename("pegasus", "data_files/human_lung.gmt"), + mouse_lung=pkg_resources.resource_filename("pegasus", "data_files/mouse_lung.gmt"), + mouse_brain=pkg_resources.resource_filename("pegasus", "data_files/mouse_brain.gmt"), + mouse_liver=pkg_resources.resource_filename("pegasus", "data_files/mouse_liver.gmt"), + emt_human=pkg_resources.resource_filename("pegasus", "data_files/emt_human.gmt"), ) predefined_pathways = dict( diff --git a/pegasus/tools/visualization.py b/pegasus/tools/visualization.py index c1dab252..1d660ada 100644 --- a/pegasus/tools/visualization.py +++ 
b/pegasus/tools/visualization.py @@ -276,6 +276,7 @@ def umap( dens_var_shift: float = 0.1, n_jobs: int = -1, full_speed: bool = False, + use_cache: bool = True, random_state: int = 0, out_basis: str = "umap", ) -> None: @@ -334,6 +335,9 @@ def umap( * If ``True``, use multiple threads in constructing ``hnsw`` index. However, the kNN results are not reproducible. * Otherwise, use only one thread to make sure results are reproducible. + use_cache: ``bool``, optional, default: ``True`` + If use_cache and found cached knn results, will not recompute. + random_state: ``int``, optional, default: ``0`` Random seed set for reproducing results. @@ -354,11 +358,7 @@ def umap( rep = update_rep(rep) X = X_from_rep(data, rep, rep_ncomps) - if data.shape[0] < n_neighbors: - logger.warning(f"Warning: Number of samples = {data.shape[0]} < K = {n_neighbors}!\n Set K to {data.shape[0]}.") - n_neighbors = data.shape[0] - - knn_indices, knn_dists = get_neighbors(data, K = n_neighbors, rep = rep, n_jobs = n_jobs, random_state = random_state, full_speed = full_speed) + knn_indices, knn_dists, n_neighbors = get_neighbors(data, K = n_neighbors, rep = rep, n_jobs = n_jobs, random_state = random_state, full_speed = full_speed, use_cache = use_cache) knn_indices = np.insert(knn_indices[:, 0 : n_neighbors - 1], 0, range(data.shape[0]), axis=1) knn_dists = np.insert(knn_dists[:, 0 : n_neighbors - 1], 0, 0.0, axis=1) @@ -539,6 +539,7 @@ def net_umap( select_K: int = 25, select_alpha: float = 1.0, full_speed: bool = False, + use_cache: bool = True, net_alpha: float = 0.1, polish_learning_rate: float = 10.0, polish_n_epochs: int = 30, @@ -612,6 +613,9 @@ def net_umap( * If ``True``, use multiple threads in constructing ``hnsw`` index. However, the kNN results are not reproducible. * Otherwise, use only one thread to make sure results are reproducible. + use_cache: ``bool``, optional, default: ``True`` + If use_cache and found cached knn results, will not recompute. 
+ net_alpha: ``float``, optional, default: ``0.1`` L2 penalty (regularization term) parameter of the deep regressor. @@ -641,7 +645,7 @@ def net_umap( rep = update_rep(rep) n_jobs = eff_n_jobs(n_jobs) - knn_indices, knn_dists = get_neighbors(data, K = select_K, rep = rep, n_jobs = n_jobs, random_state = random_state, full_speed = full_speed) + knn_indices, knn_dists, select_K = get_neighbors(data, K = select_K, rep = rep, n_jobs = n_jobs, random_state = random_state, full_speed = full_speed, use_cache = use_cache) selected = select_cells( knn_dists, @@ -659,7 +663,7 @@ def net_umap( ds_indices_key = "ds_" + rep + "_knn_indices" # ds refers to down-sampling ds_distances_key = "ds_" + rep + "_knn_distances" - indices, distances = calculate_nearest_neighbors( + indices, distances, n_neighbors = calculate_nearest_neighbors( X, K=n_neighbors, n_jobs=n_jobs, @@ -702,7 +706,7 @@ def net_umap( data.obsm["X_" + out_basis + "_pred"] = Y_init - knn_indices, knn_dists = get_neighbors(data, K = n_neighbors, rep = rep, n_jobs = n_jobs, random_state = random_state, full_speed = full_speed) + knn_indices, knn_dists, n_neighbors = get_neighbors(data, K = n_neighbors, rep = rep, n_jobs = n_jobs, random_state = random_state, full_speed = full_speed, use_cache = use_cache) knn_indices = np.insert(knn_indices[:, 0 : n_neighbors - 1], 0, range(data.shape[0]), axis=1) knn_dists = np.insert(knn_dists[:, 0 : n_neighbors - 1], 0, 0.0, axis=1) @@ -735,6 +739,7 @@ def net_fle( rep: str = "diffmap", K: int = 50, full_speed: bool = False, + use_cache: bool = True, target_change_per_node: float = 2.0, target_steps: int = 5000, is3d: bool = False, @@ -778,6 +783,9 @@ def net_fle( * If ``True``, use multiple threads in constructing ``hnsw`` index. However, the kNN results are not reproducible. * Otherwise, use only one thread to make sure results are reproducible. + use_cache: ``bool``, optional, default: ``True`` + If use_cache and found cached knn results, will not recompute. 
+ target_change_per_node: ``float``, optional, default: ``2.0`` Target change per node to stop ForceAtlas2. @@ -845,7 +853,7 @@ def net_fle( full_speed=full_speed, ) - knn_indices, knn_dists = get_neighbors(data, K = select_K, rep = rep, n_jobs = n_jobs, random_state = random_state, full_speed = full_speed) + knn_indices, knn_dists, select_K = get_neighbors(data, K = select_K, rep = rep, n_jobs = n_jobs, random_state = random_state, full_speed = full_speed, use_cache = use_cache) selected = select_cells( knn_dists, @@ -860,7 +868,7 @@ def net_fle( ds_indices_key = "ds_" + rep + "_knn_indices" ds_distances_key = "ds_" + rep + "_knn_distances" - indices, distances = calculate_nearest_neighbors( + indices, distances, K = calculate_nearest_neighbors( X, K=K, n_jobs=n_jobs, random_state=random_state, full_speed=full_speed ) data.uns[ds_indices_key] = indices diff --git a/requirements.txt b/requirements.txt index f9a154c7..5458ce4d 100644 --- a/requirements.txt +++ b/requirements.txt @@ -4,7 +4,6 @@ Cython docopt demuxEM hnswlib -importlib_metadata>=0.7; python_version < '3.8' psutil threadpoolctl joblib>=0.14 @@ -16,15 +15,15 @@ natsort numba numpy pandas>=1.2.0 -pegasusio>=0.5.1 +pegasusio>=0.9.0 pybind11 scikit-learn>=0.23.2 scikit-misc scipy -seaborn +seaborn>=0.13.0 setuptools statsmodels umap-learn>=0.5.2 wordcloud xlsxwriter -igraph<=0.9.10 +igraph diff --git a/setup.py b/setup.py index 9cd66105..087a27bc 100644 --- a/setup.py +++ b/setup.py @@ -36,9 +36,10 @@ "Topic :: Software Development :: Build Tools", "Topic :: Scientific/Engineering :: Bio-Informatics", "Programming Language :: Python :: 3", - "Programming Language :: Python :: 3.7", "Programming Language :: Python :: 3.8", "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", ], keywords="single cell/nucleus genomics analysis", packages=find_packages(), @@ -58,14 +59,15 @@ scvi=["scvi-tools"], all=["fitsne", "louvain", "scanorama", 
"torch", "harmony-pytorch", "nmf-torch", "rpy2", "forceatlas2-python", "scvi-tools"] ), - python_requires="~=3.7", + python_requires="~=3.8", package_data={ "pegasus.annotate_cluster": [ "human_immune_cell_markers.json", - "mouse_immune_cell_markers.json", - "mouse_brain_cell_markers.json", "human_brain_cell_markers.json", "human_lung_cell_markers.json", + "mouse_immune_cell_markers.json", + "mouse_brain_cell_markers.json", + "mouse_liver_cell_markers.json", ], "pegasus.check_sample_indexes": ["chromium-shared-sample-indexes-plate.json", "Chromium-i7-Multiplex-Kit-N-Set-A-sample-indexes-plate.json"], "pegasus": ["data_files/*.gmt"], diff --git a/tests/run_hashing_citeseq.sh b/tests/run_hashing_citeseq.sh deleted file mode 100644 index e546244e..00000000 --- a/tests/run_hashing_citeseq.sh +++ /dev/null @@ -1,13 +0,0 @@ -pegasus demuxEM -p 2 --generate-diagnostic-plots tests/data/hashing_citeseq/cb_cc_raw_gene_bc_matrices_h5.h5 tests/data/hashing_citeseq/cb_cell_hashing.csv tests/cb_cc -if [ -f "tests/cb_cc_demux.zarr.zip" ]; then - pegasus aggregate_matrix --select-only-singlets --min-genes 100 tests/data/sample_hashing_citeseq.csv tests/cb_cc_citeseq - - if [ -f "tests/cb_cc_citeseq.zarr.zip" ]; then - pegasus cluster -p 2 --min-genes 500 --max-genes 6000 --mito-prefix MT- --percent-mito 20 --louvain --umap --citeseq --citeseq-umap --citeseq-umap-exclude Mouse_IgG1,Mouse_IgG2a,Mouse_IgG2b,Rat_IgG2b tests/cb_cc_citeseq.zarr.zip tests/citeseq_result - - if [ -f "tests/citeseq_result.zarr.zip" ]; then - pegasus plot scatter --basis umap --attributes louvain_labels,assignment tests/citeseq_result.zarr.zip tests/citeseq_result.umap.pdf - pegasus plot scatter --basis citeseq_umap --attributes louvain_labels,assignment tests/citeseq_result.zarr.zip tests/citeseq_result.citeseq_umap.pdf - fi - fi -fi diff --git a/tests/run_pipeline.sh b/tests/run_pipeline.sh index c6e0d2eb..8c516f2f 100644 --- a/tests/run_pipeline.sh +++ b/tests/run_pipeline.sh @@ -1,14 +1,14 @@ pegasus 
aggregate_matrix tests/data/count_matrix.csv tests/aggr if [ -f "tests/aggr.zarr.zip" ]; then - pegasus cluster -p 2 --min-genes 500 --max-genes 6000 --percent-mito 20.0 --output-filtration-results --output-h5ad --output-loom --plot-filtration-results --plot-hvf --correct-batch-effect --nmf --louvain --leiden --tsne --umap --fle --infer-doublets --dbl-cluster-attr louvain_labels --calc-signature-scores cell_cycle_mouse tests/aggr.zarr.zip tests/result + pegasus cluster -p 2 --min-genes 500 --max-genes 6000 --percent-mito 20.0 --output-filtration-results --output-h5ad --output-loom --plot-filtration-results --plot-hvf --exact-K --correct-batch-effect --nmf --leiden --tsne --umap --fle --infer-doublets --dbl-cluster-attr leiden_labels --calc-signature-scores cell_cycle_mouse tests/aggr.zarr.zip tests/result fi if [ -f "tests/result.zarr.zip" ]; then - pegasus de_analysis -p 2 --labels louvain_labels --t --fisher tests/result.zarr.zip tests/result.de.xlsx + pegasus de_analysis -p 2 --labels leiden_labels --t --fisher tests/result.zarr.zip tests/result.de.xlsx pegasus annotate_cluster --markers mouse_immune,mouse_brain tests/result.zarr.zip tests/result.anno.txt pegasus plot compo --groupby leiden_labels --condition Channel tests/result.zarr.zip tests/result.compo.pdf - pegasus plot scatter --basis umap --attributes louvain_labels,Channel tests/result.zarr.zip tests/result.louvain_labels.umap.pdf + pegasus plot scatter --basis umap --attributes leiden_labels,Channel tests/result.zarr.zip tests/result.leiden_labels.umap.pdf pegasus plot scatter --basis tsne --attributes leiden_labels,Channel tests/result.zarr.zip tests/result.leiden_labels.tsne.pdf - pegasus plot scatter --basis fle --attributes louvain_labels,Channel tests/result.zarr.zip tests/result.louvain_labels.fle.pdf + pegasus plot scatter --basis fle --attributes leiden_labels,Channel tests/result.zarr.zip tests/result.leiden_labels.fle.pdf fi diff --git a/tests/test_hashing_citeseq.py 
b/tests/test_hashing_citeseq.py deleted file mode 100644 index e30cdff7..00000000 --- a/tests/test_hashing_citeseq.py +++ /dev/null @@ -1,50 +0,0 @@ -""" -Unittest module for hashing_citeseq -""" - -import os -import glob -import unittest - -import numpy as np -import pandas as pd -import pegasus as pg - - -class TestPipeline(unittest.TestCase): - def test_demux(self): - data = pg.read_input("tests/cb_cc_demux.zarr.zip") - self.assertEqual(data.shape, (737280, 33694), "Demux data shape differs!") - self.assertIn('demux_type', data.obs.columns, "Demux type is lost!") - self.assertIn('assignment', data.obs.columns, "Cell assignment is lost!") - f_list = glob.glob("tests/cb_cc.*.pdf") - self.assertEqual(len(f_list), 4, "Demux diagnosis plots are missing!") - self.assertIn('cb_cc.out.demuxEM.zarr.zip', os.listdir('tests'), "Demultiplexed RNA matrix is lost!") - - def test_citeseq(self): - data = pg.read_input("tests/cb_cc_citeseq.zarr.zip") - self.assertSetEqual(set(data.list_data()), set(['GRCh38-citeseq', 'GRCh38-rna']), "Some modality is missing!") - self.assertNotIn('demux_type', data.obs.columns, "Demux type is not removed!") - self.assertIn('assignment', data.obs.columns, "Cell assignment is lost!") - self.assertEqual(data.shape, (14363, 33694), "RNA data shape differs!") - data.select_data('GRCh38-citeseq') - self.assertEqual(data.shape, (14363, 31), "CITE-Seq data shape differs!") - - def test_clustering(self): - data = pg.read_input("tests/citeseq_result.zarr.zip") - self.assertSetEqual(set(data.list_data()), set(['GRCh38-citeseq', 'GRCh38-rna']), "Some modality is missing!") - n_rna_cells = data.shape[0] - self.assertNotIn('demux_type', data.obs.columns, "Demux type is not removed!") - self.assertEqual(data.obs['assignment'].cat.categories.size, 7, "Not all cells are demultiplexed singlets!") - self.assertIn('X_citeseq', data.obsm.keys(), "CITE-Seq coordinates are lost!") - self.assertEqual(data.obsm['X_citeseq_umap'].shape[1], data.obsm['X_umap'].shape[1], 
"Some of UMAP embeddings is lost!") - data.select_data('GRCh38-citeseq') - n_citeseq_cells = data.shape[0] - self.assertEqual(n_rna_cells, n_citeseq_cells, "Two modalities have inconsistent number of cells!") - - def test_plot(self): - self.assertIn('citeseq_result.citeseq_umap.pdf', os.listdir('tests'), "CITE-Seq UMAP plot is lost!") - self.assertIn('citeseq_result.umap.pdf', os.listdir('tests'), "RNA UMAP plot is lost!") - -if __name__ == "__main__": - unittest.main() diff --git a/tests/test_pipeline.py b/tests/test_pipeline.py index 4b1da926..3e8b7f3b 100644 --- a/tests/test_pipeline.py +++ b/tests/test_pipeline.py @@ -27,7 +27,6 @@ def test_qc(self): def test_clustering(self): self.assertEqual(self.data.obsm['pca_harmony_knn_indices'].shape, (1043, 99), "KNN graph shape differs!") self.assertEqual(self.data.obsm['pca_harmony_knn_distances'].shape, (1043, 99), "KNN distance matrix shape differs!") - self.assertIn('louvain_labels', self.data.obs.columns, "Louvain result is lost!") self.assertIn('leiden_labels', self.data.obs.columns, "Leiden result is lost!") def test_doublet_detection(self): @@ -60,9 +59,9 @@ def test_annotation(self): def test_plot(self): self.assertIn('result.compo.pdf', os.listdir('tests'), "Composition plot is lost!") - self.assertIn('result.louvain_labels.umap.pdf', os.listdir('tests'), "UMAP plot is lost!") + self.assertIn('result.leiden_labels.umap.pdf', os.listdir('tests'), "UMAP plot is lost!") self.assertIn('result.leiden_labels.tsne.pdf', os.listdir('tests'), "tSNE plot is lost!") - self.assertIn('result.louvain_labels.fle.pdf', os.listdir('tests'), 'FLE plot is lost!') + self.assertIn('result.leiden_labels.fle.pdf', os.listdir('tests'), 'FLE plot is lost!') def test_output(self): data_h5ad = pg.read_input("tests/result.mm10-rna.h5ad") diff --git a/wheel_build/build_wheel_for_linux.sh b/wheel_build/build_wheel_for_linux.sh index 792a1d48..98d6dc96 100755 --- a/wheel_build/build_wheel_for_linux.sh +++ 
b/wheel_build/build_wheel_for_linux.sh @@ -11,7 +11,7 @@ function repair_wheel { fi } -declare -a PythonVersions=("cp37-cp37m" "cp38-cp38" "cp39-cp39") +declare -a PythonVersions=("cp38-cp38" "cp39-cp39" "cp310-cp310" "cp311-cp311") for val in ${PythonVersions[@]}; do /opt/python/$val/bin/pip install -r /src/requirements.txt