From 7e61b3bd62fc544b245d4978fc352bf45d01ed85 Mon Sep 17 00:00:00 2001
From: Yiming Yang
Date: Fri, 31 May 2024 23:15:01 -0700
Subject: [PATCH 1/2] Add pydeseq2 backend for deseq2; fix issue with
 matplotlib v3.9+; improve calc_dendrogram

---
 pegasus/plotting/plot_library.py |   8 +-
 pegasus/pseudo/convenient.py     |  23 ++--
 pegasus/tools/clustering.py      |  10 +-
 pegasus/tools/pseudobulk.py      | 209 ++++++++++++++++++++++---------
 requirements.txt                 |   2 +-
 setup.py                         |   3 +-
 6 files changed, 174 insertions(+), 81 deletions(-)

diff --git a/pegasus/plotting/plot_library.py b/pegasus/plotting/plot_library.py
index e5eed198..5b23a8b1 100644
--- a/pegasus/plotting/plot_library.py
+++ b/pegasus/plotting/plot_library.py
@@ -353,7 +353,7 @@ def scatter(
                     fontsize=legend_fontsize[attr_id],
                     ncol=_get_legend_ncol(label_size, legend_ncol),
                 )
-                for handle in legend.legendHandles:
+                for handle in legend.legend_handles:
                     handle.set_sizes([300.0 if scale_factor is None else 100.0])
             elif legend_loc[attr_id] == "on data":
                 texts = []
@@ -588,7 +588,7 @@ def scatter_groups(
             fontsize=legend_fontsize,
             ncol=legend_ncol,
         )
-        for handle in legend.legendHandles:
+        for handle in legend.legend_handles:
             handle.set_sizes([300.0])
     elif legend_loc == "on data":
         texts = []
@@ -2092,7 +2092,7 @@ def volcano(
         fontsize=8,
         ncol=4,
     )
-    for handle in legend.legendHandles: # adjust legend size
+    for handle in legend.legend_handles: # adjust legend size
         handle.set_sizes([50.0])
 
     ax.axhline(y = yconst, c = 'k', lw = 0.5, ls = '--')
@@ -2355,7 +2355,7 @@ def wordcloud(
     >>> fig = pg.wordcloud(data, factor=0)
     """
    fig, ax = _get_subplot_layouts(panel_size=panel_size, dpi=dpi) # default nrows = 1 & ncols = 1
-
+
     assert 'W' in data.uns
     hvg = data.var_names[data.var[features]]
     word_dict = {}
diff --git a/pegasus/pseudo/convenient.py b/pegasus/pseudo/convenient.py
index 3e8293a3..f80cb5fe 100644
--- a/pegasus/pseudo/convenient.py
+++ b/pegasus/pseudo/convenient.py
@@ -49,7 +49,7 @@ def markers(
     if de_key not in pseudobulk.varm.keys():
         raise ValueError("Please run DE analysis first")
 
-    res_dict = {}
+    res_dict = {}
 
     df = pd.DataFrame(data=pseudobulk.varm[de_key], index=pseudobulk.var_names)
     idx = df["padj"] <= alpha
@@ -206,11 +206,20 @@ def volcano(
         logger.warning("Please conduct DE test first!")
         return None
 
-    log2fc = de_res[fcstr]
-    pvals = de_res[pstr]
+    qvals = de_res[qstr]
+    # Ignore genes with q-value NaN
+    idx_select = np.where(~np.isnan(qvals))[0]
+    if idx_select.size == 0:
+        logger.warning("All genes have NaN adjusted p-values!")
+        return None
+    qvals = qvals[idx_select]
+    log2fc = de_res[fcstr][idx_select]
+    pvals = de_res[pstr][idx_select]
+    gene_names = pseudobulk.var_names[idx_select]
+
     pvals[pvals == 0.0] = 1e-45 # very small pvalue to avoid log10 0
     neglog10p = -np.log10(pvals)
-    yconst = min(neglog10p[de_res[qstr] <= qval_threshold])
+    yconst = min(neglog10p[qvals <= qval_threshold])
 
     from pegasus.plotting.plot_utils import _get_subplot_layouts
     fig, ax = _get_subplot_layouts(panel_size=panel_size, dpi=dpi)
@@ -239,7 +248,7 @@ def volcano(
         fontsize=8,
         ncol=4,
     )
-    for handle in legend.legendHandles: # adjust legend size
+    for handle in legend.legend_handles: # adjust legend size
         handle.set_sizes([50.0])
 
     ax.axhline(y = yconst, c = 'k', lw = 0.5, ls = '--')
@@ -252,13 +261,13 @@ def volcano(
         posvec = np.argsort(log2fc[idx])[::-1][0:top_n]
         for pos in posvec:
             gid = idx[pos]
-            texts.append(ax.text(log2fc[gid], neglog10p[gid], pseudobulk.var_names[gid], fontsize=5))
+            texts.append(ax.text(log2fc[gid], neglog10p[gid], gene_names[gid], fontsize=5))
 
         idx = np.where(idxsig & (log2fc <= -log2fc_threshold))[0]
        posvec = np.argsort(log2fc[idx])[0:top_n]
        for pos in posvec:
            gid = idx[pos]
-            texts.append(ax.text(log2fc[gid], neglog10p[gid], pseudobulk.var_names[gid], fontsize=5))
+            texts.append(ax.text(log2fc[gid], neglog10p[gid], gene_names[gid], fontsize=5))
 
     from adjustText import adjust_text
     adjust_text(texts, arrowprops=dict(arrowstyle='-', color='k', lw=0.5))
diff --git a/pegasus/tools/clustering.py b/pegasus/tools/clustering.py
index 9be2f447..a6a93f30 100644
--- a/pegasus/tools/clustering.py
+++ b/pegasus/tools/clustering.py
@@ -11,7 +11,7 @@ from sklearn.cluster import KMeans
 from typing import List, Optional, Union
 
-from pegasus.tools import eff_n_jobs, construct_graph, calc_stat_per_batch, X_from_rep, slicing
+from pegasus.tools import eff_n_jobs, construct_graph, calc_stat_per_batch, update_rep, X_from_rep, slicing
 
 import logging
 logger = logging.getLogger(__name__)
@@ -780,10 +780,8 @@ def calc_dendrogram(
     """
     # Set up embedding or count matrix to use
     if genes is None:
-        if rep:
-            embed_df = pd.DataFrame(X_from_rep(data, rep))
-        else:
-            embed_df = pd.DataFrame(data.X.toarray() if issparse(data.X) else data.X)
+        rep = update_rep(rep)
+        embed_df = pd.DataFrame(X_from_rep(data, rep))
     else:
         embed_df = pd.DataFrame(slicing(data[:, genes].X))
 
@@ -825,4 +823,4 @@ def calc_dendrogram(
     np.fill_diagonal(dissim_df.to_numpy(), 0) # Enforce main diagonal to be 0 to pass squareform requirement
     Z = linkage(squareform(dissim_df), method=linkage_method, optimal_ordering=True)
 
-    data.uns[res_key] = (Z, csi_df)
\ No newline at end of file
+    data.uns[res_key] = (Z, csi_df)
diff --git a/pegasus/tools/pseudobulk.py b/pegasus/tools/pseudobulk.py
index 7e4b39a6..62bff03e 100644
--- a/pegasus/tools/pseudobulk.py
+++ b/pegasus/tools/pseudobulk.py
@@ -4,7 +4,8 @@ logger = logging.getLogger(__name__)
 
 from pegasusio import MultimodalData, UnimodalData, timer
-from pandas.api.types import is_categorical_dtype, is_numeric_dtype
+from pegasus.tools import eff_n_jobs
+from pandas.api.types import is_numeric_dtype
 
 from typing import Union, Optional, List, Tuple
@@ -30,10 +31,10 @@ def get_pseudobulk_count(X, df, attr, bulk_list):
 @timer(logger=logger)
 def pseudobulk(
     data: MultimodalData,
-    sample: str,
+    groupby: str,
     attrs: Optional[Union[List[str], str]] = None,
-    mat_key: Optional[str] = "counts",
-    cluster: Optional[str] = None,
-) -> UnimodalData:
+    mat_key: str = "counts",
+    condition: Optional[str] = None,
+) -> MultimodalData:
     """Generate Pseudo-bulk count matrices.
 
@@ -42,7 +43,7 @@ def pseudobulk(
     data: ``MultimodalData`` or ``UnimodalData`` object
         Annotated data matrix with rows for cells and columns for genes.
 
-    sample: ``str``
+    groupby: ``str``
         Specify the cell attribute used for aggregating pseudo-bulk data.
         Key must exist in ``data.obs``.
 
@@ -56,40 +57,37 @@ def pseudobulk(
         Specify the single-cell count matrix used for aggregating pseudo-bulk counts:
         If specified, use the count matrix with key ``mat_key`` from matrices of ``data``; otherwise, default is ``counts``.
 
-    cluster: ``str``, optional, default: ``None``
-        If set, additionally generate pseudo-bulk matrices per cluster specified in ``data.obs[cluster]``.
+    condition: ``str``, optional, default: ``None``
+        If set, additionally generate pseudo-bulk matrices per condition specified in ``data.obs[condition]``.
 
     Returns
     -------
-    A UnimodalData object ``udata`` containing pseudo-bulk information:
+    A MultimodalData object ``mdata`` containing pseudo-bulk information:
         * It has the following count matrices:
           * ``X``: The pseudo-bulk count matrix over all cells.
-          * If ``cluster`` is set, a number of pseudo-bulk count matrices of cells belonging to the clusters, respectively.
-        * ``udata.obs``: It contains pseudo-bulk attributes aggregated from the corresponding single-cell attributes.
-        * ``udata.var``: Gene names and Ensembl IDs are maintained.
-
-    Update ``data``:
-        * Add the returned UnimodalData object above to ``data`` with key ``<sample>-pseudobulk``, where ``<sample>`` is replaced by the actual value of the ``sample`` argument.
+          * If ``condition`` is set, additional pseudo-bulk count matrices restricted to cells of each condition level, respectively.
+        * ``mdata.obs``: It contains pseudo-bulk attributes aggregated from the corresponding single-cell attributes.
+        * ``mdata.var``: Gene names and Ensembl IDs are maintained.
 
     Examples
     --------
-    >>> pg.pseudobulk(data, sample="Channel")
+    >>> pg.pseudobulk(data, groupby="Channel")
     """
     X = data.get_matrix(mat_key)
 
-    assert sample in data.obs.columns, f"Sample key '{sample}' must exist in data.obs!"
+    assert groupby in data.obs.columns, f"Sample key '{groupby}' must exist in data.obs!"
 
     sample_vec = (
-        data.obs[sample]
-        if is_categorical_dtype(data.obs[sample])
-        else data.obs[sample].astype("category")
+        data.obs[groupby]
+        if isinstance(data.obs[groupby].dtype, pd.CategoricalDtype)
+        else data.obs[groupby].astype("category")
     )
     bulk_list = sample_vec.cat.categories
 
     df_barcode = data.obs.reset_index()
 
-    mat_dict = {"counts": get_pseudobulk_count(X, df_barcode, sample, bulk_list)}
+    mat_dict = {"counts": get_pseudobulk_count(X, df_barcode, groupby, bulk_list)}
 
     # Generate pseudo-bulk attributes if specified
     bulk_attr_list = []
@@ -103,7 +101,7 @@ def pseudobulk(
             ), f"Cell attribute key '{attr}' must exist in data.obs!"
 
     for bulk in bulk_list:
-        df_bulk = df_barcode.loc[df_barcode[sample] == bulk]
+        df_bulk = df_barcode.loc[df_barcode[groupby] == bulk]
         if attrs is not None:
             bulk_attr = df_bulk[attrs].apply(set_bulk_value, axis=0)
             bulk_attr["barcodekey"] = bulk
@@ -112,43 +110,52 @@ def pseudobulk(
         bulk_attr_list.append(bulk_attr)
 
     df_pseudobulk = pd.DataFrame(bulk_attr_list)
+    for col in df_pseudobulk.columns:
+        if col == 'barcodekey':
+            continue
+        if isinstance(df_barcode[col].dtype, pd.CategoricalDtype):
+            c_old = df_barcode[col].cat
+            cat_dtype = pd.CategoricalDtype(categories=c_old.categories, ordered=c_old.ordered)
+            c_new = pd.Series(df_pseudobulk[col], dtype=cat_dtype).cat
+            df_pseudobulk[col] = c_new.remove_unused_categories()
 
     df_feature = pd.DataFrame(index=data.var_names)
     if "featureid" in data.var.columns:
         df_feature["featureid"] = data.var["featureid"]
 
-    if cluster is not None:
+    if condition is not None:
         assert (
-            cluster in data.obs.columns
-        ), f"Cluster key '{attr}' must exist in data.obs!"
+            condition in data.obs.columns
+        ), f"Condition key '{condition}' must exist in data.obs!"
-        cluster_list = data.obs[cluster].astype("category").cat.categories
+        cluster_list = data.obs[condition].astype("category").cat.categories
         for cls in cluster_list:
-            mat_dict[f"{cluster}_{cls}.X"] = get_pseudobulk_count(
-                X, df_barcode.loc[df_barcode[cluster] == cls], sample, bulk_list
+            mat_dict[f"{condition}_{cls}.X"] = get_pseudobulk_count(
+                X, df_barcode.loc[df_barcode[condition] == cls], groupby, bulk_list
             )
 
     udata = UnimodalData(
         barcode_metadata=df_pseudobulk,
         feature_metadata=df_feature,
         matrices=mat_dict,
-        genome=sample,
+        genome=groupby,
         modality="pseudobulk",
         cur_matrix="counts",
     )
-    data.add_data(udata)
-
-    return udata
+    return MultimodalData(udata)
 
 
 @timer(logger=logger)
 def deseq2(
-    pseudobulk: UnimodalData,
-    design: str,
+    pseudobulk: Union[MultimodalData, UnimodalData],
+    design: Union[str, List[str]],
     contrast: Tuple[str, str, str],
+    backend: str = "pydeseq2",
     de_key: str = "deseq2",
-    replaceOutliers: bool = True,
+    alpha: float = 0.05,
+    compute_all: bool = False,
+    n_jobs: int = -1,
 ) -> None:
-    """Perform Differential Expression (DE) Analysis using DESeq2 on pseduobulk data. This function calls R package DESeq2, requiring DESeq2 in R installed.
+    """Perform Differential Expression (DE) Analysis using DESeq2 on pseudobulk data. By default, use the pure-Python PyDESeq2 implementation; with ``backend="deseq2"``, call the R package DESeq2 instead, which requires an R installation.
 
     Parameters
     ----------
-    pseudobulk: ``UnimodalData``
-        Pseudobulk data with rows for samples and columns for genes. If pseudobulk contains multiple matrices, DESeq2 will apply to all matrices.
+    pseudobulk: ``MultimodalData`` or ``UnimodalData``
+        Pseudobulk data with rows for samples/pseudobulks and columns for genes. It may contain multiple count matrices of the same shape under different keys.
 
-    design: ``str``
-        Design formula that will be passed to DESeq2
+    design: ``str`` or ``List[str]``
+        For the ``pydeseq2`` backend, specify either a factor or a list of factors to be used as design variables. They must all be in ``pseudobulk.obs``.
+        For the ``deseq2`` backend, specify the design formula that will be passed to DESeq2, e.g. ``~group+condition`` or ``~genotype+treatment+genotype:treatment``.
 
     contrast: ``Tuple[str, str, str]``
-        A tuple of three elements passing to DESeq2: a factor in design formula, a level in the factor as numeritor of fold change, and a level as denominator of fold change.
-
+        A tuple of three elements passed to DESeq2: a factor in the design, a level of that factor used as the test level (numerator of fold change), and a level used as the reference level (denominator of fold change).
+
+    backend: ``str``, optional, default: ``pydeseq2``
+        Specify which package to use as the backend for pseudobulk DE analysis.
+        By default, use ``PyDESeq2``, a pure Python implementation of the DESeq2 method.
+        Alternatively, if ``deseq2`` is specified, use the R package DESeq2, which requires the ``rpy2`` package and an R installation.
+
     de_key: ``str``, optional, default: ``"deseq2"``
-        Key name of DE analysis results stored. For cluster.X, stored key will be cluster.de_key
+        Key name under which the DE analysis results are stored. For a count matrix named ``condition.X``, the stored key will be ``condition.de_key``.
+
+    alpha: ``float``, optional, default: ``0.05``
+        The significance cutoff (between 0 and 1) used for optimizing the independent filtering when calculating the adjusted p-values (FDR).
 
-    replaceOutliers: ``bool``, optional, default: ``True``
-        If execute DESeq2's replaceOutliers step. If set to ``False``, we will set minReplicatesForReplace=Inf in ``DESeq`` function and set cooksCutoff=False in ``results`` function.
+    compute_all: ``bool``, optional, default: ``False``
+        Whether to perform DE analysis on all count matrices. By default (``compute_all=False``), only apply DE analysis to the default count matrix ``counts``.
+
+    n_jobs: ``int``, optional, default: ``-1``
+        Number of threads to use. If ``-1``, use all physical CPU cores. This only works when ``backend="pydeseq2"``.
 
     Returns
     -------
     ``None``
 
     Update ``pseudobulk.varm``:
         ``pseudobulk.varm[de_key]``: DE analysis result for pseudo-bulk count matrix.
-        ``pseudobulk.varm[cluster.de_key]``: DE results for cluster-specific pseudo-bulk count matrices.
+        (Optional) ``pseudobulk.varm[condition.de_key]``: If ``compute_all=True``, DE results for each condition-specific pseudo-bulk count matrix.
 
     Examples
     --------
-    >>> pg.deseq2(pseudobulk, '~gender', ('gender', 'female', 'male'))
+    >>> pg.deseq2(pseudobulk, 'gender', ('gender', 'female', 'male'))
+    >>> pg.deseq2(pseudobulk, '~gender', ('gender', 'female', 'male'), backend="deseq2")
     """
+    mat_keys = ['counts'] if not compute_all else pseudobulk.list_keys()
+    for mat_key in mat_keys:
+        if backend == "pydeseq2":
+            _run_pydeseq2(pseudobulk=pseudobulk, mat_key=mat_key, design_factors=design, contrast=contrast, de_key=de_key, alpha=alpha, n_jobs=n_jobs)
+        else:
+            _run_rdeseq2(pseudobulk=pseudobulk, mat_key=mat_key, design=design, contrast=contrast, de_key=de_key, alpha=alpha)
+
+
+def _run_pydeseq2(
+    pseudobulk: Union[MultimodalData, UnimodalData],
+    mat_key: str,
+    design_factors: Union[str, List[str]],
+    contrast: Tuple[str, str, str],
+    de_key: str,
+    alpha: float,
+    n_jobs: int,
+) -> None:
+    try:
+        from pydeseq2.dds import DeseqDataSet
+        from pydeseq2.default_inference import DefaultInference
+        from pydeseq2.ds import DeseqStats
+    except ModuleNotFoundError as e:
+        import sys
+        logger.error(f"{e}\nNeed pydeseq2! Try 'pip install pydeseq2'.")
+        sys.exit(-1)
+
+    if isinstance(design_factors, str):
+        assert design_factors in pseudobulk.obs.columns, f"The design factor {design_factors} does not exist in pseudobulk.obs!"
+    else:
+        for factor in design_factors:
+            assert factor in pseudobulk.obs.columns, f"The design factor {factor} does not exist in pseudobulk.obs!"
+
+    counts_df = pd.DataFrame(pseudobulk.get_matrix(mat_key), index=pseudobulk.obs_names, columns=pseudobulk.var_names)
+    metadata = pseudobulk.obs
+
+    n_cpus = eff_n_jobs(n_jobs)
+    inference = DefaultInference(n_cpus=n_cpus)
+    dds = DeseqDataSet(
+        counts=counts_df,
+        metadata=metadata,
+        design_factors=design_factors,
+        refit_cooks=True,
+        inference=inference,
+        quiet=True,
+    )
+    dds.deseq2()
+
+    stat_res = DeseqStats(
+        dds,
+        contrast=contrast,
+        cooks_filter=True,
+        alpha=alpha,
+        inference=inference,
+        quiet=True,
+    )
+    stat_res.summary()
+    res_key = de_key if mat_key == "counts" else mat_key.removesuffix(".X") + "." + de_key
+    res_df = stat_res.results_df
+    pseudobulk.varm[res_key] = res_df.to_records(index=False)
+
+
+def _run_rdeseq2(
+    pseudobulk: UnimodalData,
+    mat_key: str,
+    design: str,
+    contrast: Tuple[str, str, str],
+    de_key: str,
+    alpha: float,
+) -> None:
     try:
         import rpy2.robjects as ro
         from rpy2.robjects import pandas2ri, numpy2ri, Formula
@@ -202,26 +291,22 @@ def deseq2(
               if (!require("BiocManager", quietly = TRUE))
                   install.packages("BiocManager")
               BiocManager::install("DESeq2")"""
-
+
         logger.error(text)
         sys.exit(-1)
 
-    import math
+    assert design.strip().startswith("~"), f"Design '{design}' is not a valid R formula! Valid examples: '~var', '~var1+var2', '~var1+var2+var1:var2'."
+
     to_dataframe = ro.r('function(x) data.frame(x)')
 
-    for mat_key in pseudobulk.list_keys():
-        with localconverter(ro.default_converter + numpy2ri.converter + pandas2ri.converter):
-            dds = deseq2.DESeqDataSetFromMatrix(countData = pseudobulk.get_matrix(mat_key).T, colData = pseudobulk.obs, design = Formula(design))
-
-        if replaceOutliers:
-            dds = deseq2.DESeq(dds)
-            res= deseq2.results(dds, contrast=ro.StrVector(contrast))
-        else:
-            dds = deseq2.DESeq(dds, minReplicatesForReplace=math.inf)
-            res= deseq2.results(dds, contrast=ro.StrVector(contrast), cooksCutoff=False)
-        with localconverter(ro.default_converter + pandas2ri.converter):
-            res_df = ro.conversion.rpy2py(to_dataframe(res))
-        res_df.fillna({'log2FoldChange': 0.0, 'lfcSE': 0.0, 'stat': 0.0, 'pvalue': 1.0, 'padj': 1.0}, inplace=True)
-
-        de_res_key = de_key if mat_key.find('.') < 0 else f"{mat_key.partition('.')[0]}.{de_key}"
-        pseudobulk.varm[de_res_key] = res_df.to_records(index=False)
+    with localconverter(ro.default_converter + numpy2ri.converter + pandas2ri.converter):
+        dds = deseq2.DESeqDataSetFromMatrix(countData = pseudobulk.get_matrix(mat_key).T, colData = pseudobulk.obs, design = Formula(design))
+
+    dds = deseq2.DESeq(dds)
+    res = deseq2.results(dds, contrast=ro.StrVector(contrast), alpha=alpha)
+    with localconverter(ro.default_converter + pandas2ri.converter):
+        res_df = ro.conversion.rpy2py(to_dataframe(res))
+
+    res_key = de_key if mat_key == "counts" else mat_key.removesuffix(".X") + "." + de_key
+    pseudobulk.varm[res_key] = res_df.to_records(index=False)
diff --git a/requirements.txt b/requirements.txt
index 5458ce4d..1788c336 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -10,7 +10,7 @@ joblib>=0.14
 lightgbm>=2.2.1
 loompy
 leidenalg
-matplotlib>=2.0.0
+matplotlib>=3.7.0
 natsort
 numba
 numpy
diff --git a/setup.py b/setup.py
index 22fb763f..43dadaf9 100644
--- a/setup.py
+++ b/setup.py
@@ -58,7 +58,8 @@
         mkl=["mkl"],
         rpy2=["rpy2"],
         scvi=["scvi-tools"],
-        all=["fitsne", "louvain", "scanorama", "torch", "harmony-pytorch", "nmf-torch", "rpy2", "forceatlas2-python", "scvi-tools"]
+        pseudobulk=["pydeseq2"]
+        all=["fitsne", "louvain", "scanorama", "torch", "harmony-pytorch", "nmf-torch", "rpy2", "forceatlas2-python", "scvi-tools", "pydeseq2"]
     ),
     python_requires="~=3.8",
     package_data={

From e368e45b900e739bc93945ab228847e030a75d9d Mon Sep 17 00:00:00 2001
From: Yiming Yang
Date: Sat, 1 Jun 2024 06:27:58 +0000
Subject: [PATCH 2/2] fix bug

---
 pegasus/tools/pseudobulk.py | 2 +-
 setup.py                    | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/pegasus/tools/pseudobulk.py b/pegasus/tools/pseudobulk.py
index 62bff03e..75e8493b 100644
--- a/pegasus/tools/pseudobulk.py
+++ b/pegasus/tools/pseudobulk.py
@@ -265,7 +265,7 @@ def _run_pydeseq2(
 
 
 def _run_rdeseq2(
-    pseudobulk: UnimodalData,
+    pseudobulk: Union[MultimodalData, UnimodalData],
     mat_key: str,
     design: str,
     contrast: Tuple[str, str, str],
diff --git a/setup.py b/setup.py
index 43dadaf9..7f912a63 100644
--- a/setup.py
+++ b/setup.py
@@ -58,7 +58,7 @@
         mkl=["mkl"],
         rpy2=["rpy2"],
         scvi=["scvi-tools"],
-        pseudobulk=["pydeseq2"]
+        pseudobulk=["pydeseq2"],
         all=["fitsne", "louvain", "scanorama", "torch", "harmony-pytorch", "nmf-torch", "rpy2", "forceatlas2-python", "scvi-tools", "pydeseq2"]
     ),
     python_requires="~=3.8",
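
Reviewer note: below is a minimal end-to-end sketch of the workflow this series introduces, for anyone trying the branch; it is not part of the patch. It assumes a ``MultimodalData`` object ``data`` whose ``data.obs`` carries a sample attribute "Channel", a per-sample treatment label "treatment" with levels "stim"/"ctrl", and a cell-type label "anno" — these attribute names and levels are illustrative only.

    import pandas as pd
    import pegasus as pg

    # Aggregate raw counts into one pseudobulk per channel, carrying the
    # per-sample "treatment" attribute over. With condition="anno", an extra
    # count matrix "anno_<cell type>.X" is also built per cell type.
    pseudo = pg.pseudobulk(data, groupby="Channel", attrs="treatment", condition="anno")

    # DE with the new default PyDESeq2 backend; design is a factor (or list of
    # factors) in pseudo.obs. With compute_all=True, the test also runs on every
    # "anno_<cell type>.X" matrix, stored under "anno_<cell type>.deseq2".
    pg.deseq2(pseudo, "treatment", ("treatment", "stim", "ctrl"), compute_all=True)

    # The same test through the R backend; note the design must be an R formula
    # here, and rpy2 plus DESeq2 in R are required.
    pg.deseq2(pseudo, "~treatment", ("treatment", "stim", "ctrl"), backend="deseq2")

    # Results are stored as record arrays in varm, row-aligned with var_names.
    res = pd.DataFrame(pseudo.varm["deseq2"], index=pseudo.var_names)
    print(res.sort_values("padj").head())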