From 7e61b3bd62fc544b245d4978fc352bf45d01ed85 Mon Sep 17 00:00:00 2001
From: Yiming Yang
Date: Fri, 31 May 2024 23:15:01 -0700
Subject: [PATCH 1/2] Add pydeseq2 backend for deseq2; fix issue with
 matplotlib v3.9+; improve calc_dendrogram

---
 pegasus/plotting/plot_library.py |   8 +-
 pegasus/pseudo/convenient.py     |  23 ++--
 pegasus/tools/clustering.py      |  10 +-
 pegasus/tools/pseudobulk.py      | 209 ++++++++++++++++++++++---------
 requirements.txt                 |   2 +-
 setup.py                         |   3 +-
 6 files changed, 174 insertions(+), 81 deletions(-)

diff --git a/pegasus/plotting/plot_library.py b/pegasus/plotting/plot_library.py
index e5eed198..5b23a8b1 100644
--- a/pegasus/plotting/plot_library.py
+++ b/pegasus/plotting/plot_library.py
@@ -353,7 +353,7 @@ def scatter(
                     fontsize=legend_fontsize[attr_id],
                     ncol=_get_legend_ncol(label_size, legend_ncol),
                 )
-                for handle in legend.legendHandles:
+                for handle in legend.legend_handles:
                     handle.set_sizes([300.0 if scale_factor is None else 100.0])
             elif legend_loc[attr_id] == "on data":
                 texts = []
@@ -588,7 +588,7 @@ def scatter_groups(
             fontsize=legend_fontsize,
             ncol=legend_ncol,
         )
-        for handle in legend.legendHandles:
+        for handle in legend.legend_handles:
             handle.set_sizes([300.0])
     elif legend_loc == "on data":
         texts = []
@@ -2092,7 +2092,7 @@ def volcano(
         fontsize=8,
         ncol=4,
     )
-    for handle in legend.legendHandles: # adjust legend size
+    for handle in legend.legend_handles: # adjust legend size
         handle.set_sizes([50.0])
 
     ax.axhline(y = yconst, c = 'k', lw = 0.5, ls = '--')
@@ -2355,7 +2355,7 @@ def wordcloud(
     >>> fig = pg.wordcloud(data, factor=0)
     """
    fig, ax = _get_subplot_layouts(panel_size=panel_size, dpi=dpi) # default nrows = 1 & ncols = 1
-
+
     assert 'W' in data.uns
     hvg = data.var_names[data.var[features]]
     word_dict = {}
diff --git a/pegasus/pseudo/convenient.py b/pegasus/pseudo/convenient.py
index 3e8293a3..f80cb5fe 100644
--- a/pegasus/pseudo/convenient.py
+++ b/pegasus/pseudo/convenient.py
@@ -49,7 +49,7 @@ def markers(
     if de_key not in pseudobulk.varm.keys():
         raise ValueError("Please run DE analysis first")
 
-    res_dict = {}
+    res_dict = {}
 
     df = pd.DataFrame(data=pseudobulk.varm[de_key], index=pseudobulk.var_names)
     idx = df["padj"] <= alpha
@@ -206,11 +206,20 @@ def volcano(
         logger.warning("Please conduct DE test first!")
         return None
 
-    log2fc = de_res[fcstr]
-    pvals = de_res[pstr]
+    qvals = de_res[qstr]
+    # Ignore genes with q-value NaN
+    idx_select = np.where(~np.isnan(qvals))[0]
+    if idx_select.size == 0:
+        logger.warning("All genes have NaN adjusted p-values!")
+        return None
+    qvals = qvals[idx_select]
+    log2fc = de_res[fcstr][idx_select]
+    pvals = de_res[pstr][idx_select]
+    gene_names = pseudobulk.var_names[idx_select]
+
     pvals[pvals == 0.0] = 1e-45 # very small pvalue to avoid log10 0
     neglog10p = -np.log10(pvals)
-    yconst = min(neglog10p[de_res[qstr] <= qval_threshold])
+    yconst = min(neglog10p[qvals <= qval_threshold])
 
     from pegasus.plotting.plot_utils import _get_subplot_layouts
     fig, ax = _get_subplot_layouts(panel_size=panel_size, dpi=dpi)
@@ -239,7 +248,7 @@ def volcano(
         fontsize=8,
         ncol=4,
     )
-    for handle in legend.legendHandles: # adjust legend size
+    for handle in legend.legend_handles: # adjust legend size
         handle.set_sizes([50.0])
 
     ax.axhline(y = yconst, c = 'k', lw = 0.5, ls = '--')
@@ -252,13 +261,13 @@ def volcano(
         posvec = np.argsort(log2fc[idx])[::-1][0:top_n]
         for pos in posvec:
             gid = idx[pos]
-            texts.append(ax.text(log2fc[gid], neglog10p[gid], pseudobulk.var_names[gid], fontsize=5))
+            texts.append(ax.text(log2fc[gid], neglog10p[gid], gene_names[gid], fontsize=5))
 
         idx = np.where(idxsig & (log2fc <= -log2fc_threshold))[0]
        posvec = np.argsort(log2fc[idx])[0:top_n]
        for pos in posvec:
            gid = idx[pos]
-            texts.append(ax.text(log2fc[gid], neglog10p[gid], pseudobulk.var_names[gid], fontsize=5))
+            texts.append(ax.text(log2fc[gid], neglog10p[gid], gene_names[gid], fontsize=5))
 
     from adjustText import adjust_text
     adjust_text(texts, arrowprops=dict(arrowstyle='-', color='k', lw=0.5))
diff --git a/pegasus/tools/clustering.py b/pegasus/tools/clustering.py
index 9be2f447..a6a93f30 100644
--- a/pegasus/tools/clustering.py
+++ b/pegasus/tools/clustering.py
@@ -11,7 +11,7 @@ from sklearn.cluster import KMeans
 from typing import List, Optional, Union
 
-from pegasus.tools import eff_n_jobs, construct_graph, calc_stat_per_batch, X_from_rep, slicing
+from pegasus.tools import eff_n_jobs, construct_graph, calc_stat_per_batch, update_rep, X_from_rep, slicing
 
 import logging
 logger = logging.getLogger(__name__)
@@ -780,10 +780,8 @@ def calc_dendrogram(
     """
     # Set up embedding or count matrix to use
     if genes is None:
-        if rep:
-            embed_df = pd.DataFrame(X_from_rep(data, rep))
-        else:
-            embed_df = pd.DataFrame(data.X.toarray() if issparse(data.X) else data.X)
+        rep = update_rep(rep)
+        embed_df = pd.DataFrame(X_from_rep(data, rep))
     else:
         embed_df = pd.DataFrame(slicing(data[:, genes].X))
 
@@ -825,4 +823,4 @@ def calc_dendrogram(
     np.fill_diagonal(dissim_df.to_numpy(), 0) # Enforce main diagonal to be 0 to pass squareform requirement
     Z = linkage(squareform(dissim_df), method=linkage_method, optimal_ordering=True)
 
-    data.uns[res_key] = (Z, csi_df)
\ No newline at end of file
+    data.uns[res_key] = (Z, csi_df)
diff --git a/pegasus/tools/pseudobulk.py b/pegasus/tools/pseudobulk.py
index 7e4b39a6..62bff03e 100644
--- a/pegasus/tools/pseudobulk.py
+++ b/pegasus/tools/pseudobulk.py
@@ -4,7 +4,8 @@ logger = logging.getLogger(__name__)
 
 from pegasusio import MultimodalData, UnimodalData, timer
-from pandas.api.types import is_categorical_dtype, is_numeric_dtype
+from pegasus.tools import eff_n_jobs
+from pandas.api.types import is_numeric_dtype
 
 from typing import Union, Optional, List, Tuple
@@ -30,10 +31,10 @@ def get_pseudobulk_count(X, df, attr, bulk_list):
 @timer(logger=logger)
 def pseudobulk(
     data: MultimodalData,
-    sample: str,
+    groupby: str,
     attrs: Optional[Union[List[str], str]] = None,
-    mat_key: Optional[str] = "counts",
-    cluster: Optional[str] = None,
-) -> UnimodalData:
+    mat_key: str = "counts",
+    condition: Optional[str] = None,
+) -> MultimodalData:
     """Generate Pseudo-bulk count matrices.
 
@@ -42,7 +43,7 @@ def pseudobulk(
     data: ``MultimodalData`` or ``UnimodalData`` object
         Annotated data matrix with rows for cells and columns for genes.
 
-    sample: ``str``
+    groupby: ``str``
         Specify the cell attribute used for aggregating pseudo-bulk data.
         Key must exist in ``data.obs``.
 
@@ -56,40 +57,37 @@ def pseudobulk(
         Specify the single-cell count matrix used for aggregating pseudo-bulk counts:
         If specified, use the count matrix with key ``mat_key`` from matrices of ``data``; otherwise, default is ``counts``.
 
-    cluster: ``str``, optional, default: ``None``
-        If set, additionally generate pseudo-bulk matrices per cluster specified in ``data.obs[cluster]``.
+    condition: ``str``, optional, default: ``None``
+        If set, additionally generate pseudo-bulk matrices per condition specified in ``data.obs[condition]``.
 
     Returns
     -------
-    A UnimodalData object ``udata`` containing pseudo-bulk information:
+    A MultimodalData object ``mdata`` containing pseudo-bulk information:
         * It has the following count matrices:
           * ``X``: The pseudo-bulk count matrix over all cells.
-          * If ``cluster`` is set, a number of pseudo-bulk count matrices of cells belonging to the clusters, respectively.
-        * ``udata.obs``: It contains pseudo-bulk attributes aggregated from the corresponding single-cell attributes.
-        * ``udata.var``: Gene names and Ensembl IDs are maintained.
-
-    Update ``data``:
-        * Add the returned UnimodalData object above to ``data`` with key ``<sample>-pseudobulk``, where ``<sample>`` is replaced by the actual value of the ``sample`` argument.
+          * If ``condition`` is set, additional pseudo-bulk count matrices restricted to cells of each condition level, respectively.
+        * ``mdata.obs``: It contains pseudo-bulk attributes aggregated from the corresponding single-cell attributes.
+        * ``mdata.var``: Gene names and Ensembl IDs are maintained.
 
     Examples
     --------
-    >>> pg.pseudobulk(data, sample="Channel")
+    >>> pg.pseudobulk(data, groupby="Channel")
     """
     X = data.get_matrix(mat_key)
 
-    assert sample in data.obs.columns, f"Sample key '{sample}' must exist in data.obs!"
+    assert groupby in data.obs.columns, f"Sample key '{groupby}' must exist in data.obs!"
 
     sample_vec = (
-        data.obs[sample]
-        if is_categorical_dtype(data.obs[sample])
-        else data.obs[sample].astype("category")
+        data.obs[groupby]
+        if isinstance(data.obs[groupby].dtype, pd.CategoricalDtype)
+        else data.obs[groupby].astype("category")
     )
     bulk_list = sample_vec.cat.categories
 
     df_barcode = data.obs.reset_index()
 
-    mat_dict = {"counts": get_pseudobulk_count(X, df_barcode, sample, bulk_list)}
+    mat_dict = {"counts": get_pseudobulk_count(X, df_barcode, groupby, bulk_list)}
 
     # Generate pseudo-bulk attributes if specified
     bulk_attr_list = []
@@ -103,7 +101,7 @@ def pseudobulk(
             ), f"Cell attribute key '{attr}' must exist in data.obs!"
 
     for bulk in bulk_list:
-        df_bulk = df_barcode.loc[df_barcode[sample] == bulk]
+        df_bulk = df_barcode.loc[df_barcode[groupby] == bulk]
         if attrs is not None:
             bulk_attr = df_bulk[attrs].apply(set_bulk_value, axis=0)
             bulk_attr["barcodekey"] = bulk
@@ -112,43 +110,52 @@ def pseudobulk(
         bulk_attr_list.append(bulk_attr)
 
     df_pseudobulk = pd.DataFrame(bulk_attr_list)
+    for col in df_pseudobulk.columns:
+        if col == 'barcodekey':
+            continue
+        if isinstance(df_barcode[col].dtype, pd.CategoricalDtype):
+            c_old = df_barcode[col].cat
+            cat_dtype = pd.CategoricalDtype(categories=c_old.categories, ordered=c_old.ordered)
+            c_new = pd.Series(df_pseudobulk[col], dtype=cat_dtype).cat
+            df_pseudobulk[col] = c_new.remove_unused_categories()
 
     df_feature = pd.DataFrame(index=data.var_names)
     if "featureid" in data.var.columns:
         df_feature["featureid"] = data.var["featureid"]
 
-    if cluster is not None:
+    if condition is not None:
         assert (
-            cluster in data.obs.columns
-        ), f"Cluster key '{attr}' must exist in data.obs!"
+            condition in data.obs.columns
+        ), f"Condition key '{condition}' must exist in data.obs!"
-        cluster_list = data.obs[cluster].astype("category").cat.categories
+        cluster_list = data.obs[condition].astype("category").cat.categories
         for cls in cluster_list:
-            mat_dict[f"{cluster}_{cls}.X"] = get_pseudobulk_count(
-                X, df_barcode.loc[df_barcode[cluster] == cls], sample, bulk_list
+            mat_dict[f"{condition}_{cls}.X"] = get_pseudobulk_count(
+                X, df_barcode.loc[df_barcode[condition] == cls], groupby, bulk_list
             )
 
     udata = UnimodalData(
         barcode_metadata=df_pseudobulk,
         feature_metadata=df_feature,
         matrices=mat_dict,
-        genome=sample,
+        genome=groupby,
         modality="pseudobulk",
         cur_matrix="counts",
     )
-    data.add_data(udata)
-
-    return udata
+    return MultimodalData(udata)
 
 
 @timer(logger=logger)
 def deseq2(
-    pseudobulk: UnimodalData,
-    design: str,
+    pseudobulk: Union[MultimodalData, UnimodalData],
+    design: Union[str, List[str]],
     contrast: Tuple[str, str, str],
+    backend: str = "pydeseq2",
     de_key: str = "deseq2",
-    replaceOutliers: bool = True,
+    alpha: float = 0.05,
+    compute_all: bool = False,
+    n_jobs: int = -1,
 ) -> None:
-    """Perform Differential Expression (DE) Analysis using DESeq2 on pseduobulk data. This function calls R package DESeq2, requiring DESeq2 in R installed.
+    """Perform Differential Expression (DE) Analysis using DESeq2 on pseudobulk data. By default, use the pure-Python PyDESeq2 implementation; with ``backend="deseq2"``, call the R package DESeq2 instead, which requires an R installation.
 
     Parameters
     ----------
-    pseudobulk: ``UnimodalData``
-        Pseudobulk data with rows for samples and columns for genes. If pseudobulk contains multiple matrices, DESeq2 will apply to all matrices.
+    pseudobulk: ``MultimodalData`` or ``UnimodalData``
+        Pseudobulk data with rows for samples/pseudobulks and columns for genes. It may contain multiple count matrices of the same shape under different keys.
 
-    design: ``str``
-        Design formula that will be passed to DESeq2
+    design: ``str`` or ``List[str]``
+        For the ``pydeseq2`` backend, specify either a factor or a list of factors to be used as design variables. They must all be in ``pseudobulk.obs``.
+        For the ``deseq2`` backend, specify the design formula that will be passed to DESeq2, e.g. ``~group+condition`` or ``~genotype+treatment+genotype:treatment``.
 
     contrast: ``Tuple[str, str, str]``
-        A tuple of three elements passing to DESeq2: a factor in design formula, a level in the factor as numeritor of fold change, and a level as denominator of fold change.
-
+        A tuple of three elements passed to DESeq2: a factor in the design, a level of that factor used as the test level (numerator of fold change), and a level used as the reference level (denominator of fold change).
+
+    backend: ``str``, optional, default: ``pydeseq2``
+        Specify which package to use as the backend for pseudobulk DE analysis.
+        By default, use ``PyDESeq2``, a pure Python implementation of the DESeq2 method.
+        Alternatively, if ``deseq2`` is specified, use the R package DESeq2, which requires the ``rpy2`` package and an R installation.
+
     de_key: ``str``, optional, default: ``"deseq2"``
-        Key name of DE analysis results stored. For cluster.X, stored key will be cluster.de_key
+        Key name under which the DE analysis results are stored. For a count matrix named ``condition.X``, the stored key will be ``condition.de_key``.
+
+    alpha: ``float``, optional, default: ``0.05``
+        The significance cutoff (between 0 and 1) used for optimizing the independent filtering when calculating the adjusted p-values (FDR).
 
-    replaceOutliers: ``bool``, optional, default: ``True``
-        If execute DESeq2's replaceOutliers step. If set to ``False``, we will set minReplicatesForReplace=Inf in ``DESeq`` function and set cooksCutoff=False in ``results`` function.
+    compute_all: ``bool``, optional, default: ``False``
+        Whether to perform DE analysis on all count matrices. By default (``compute_all=False``), only apply DE analysis to the default count matrix ``counts``.
+
+    n_jobs: ``int``, optional, default: ``-1``
+        Number of threads to use. If ``-1``, use all physical CPU cores. This only works when ``backend="pydeseq2"``.
 
     Returns
     -------
     ``None``
 
     Update ``pseudobulk.varm``:
         ``pseudobulk.varm[de_key]``: DE analysis result for pseudo-bulk count matrix.
-        ``pseudobulk.varm[cluster.de_key]``: DE results for cluster-specific pseudo-bulk count matrices.
+        (Optional) ``pseudobulk.varm[condition.de_key]``: If ``compute_all=True``, DE results for each condition-specific pseudo-bulk count matrix.
 
     Examples
     --------
-    >>> pg.deseq2(pseudobulk, '~gender', ('gender', 'female', 'male'))
+    >>> pg.deseq2(pseudobulk, 'gender', ('gender', 'female', 'male'))
+    >>> pg.deseq2(pseudobulk, '~gender', ('gender', 'female', 'male'), backend="deseq2")
     """
+    mat_keys = ['counts'] if not compute_all else pseudobulk.list_keys()
+    for mat_key in mat_keys:
+        if backend == "pydeseq2":
+            _run_pydeseq2(pseudobulk=pseudobulk, mat_key=mat_key, design_factors=design, contrast=contrast, de_key=de_key, alpha=alpha, n_jobs=n_jobs)
+        else:
+            _run_rdeseq2(pseudobulk=pseudobulk, mat_key=mat_key, design=design, contrast=contrast, de_key=de_key, alpha=alpha)
+
+
+def _run_pydeseq2(
+    pseudobulk: Union[MultimodalData, UnimodalData],
+    mat_key: str,
+    design_factors: Union[str, List[str]],
+    contrast: Tuple[str, str, str],
+    de_key: str,
+    alpha: float,
+    n_jobs: int,
+) -> None:
+    try:
+        from pydeseq2.dds import DeseqDataSet
+        from pydeseq2.default_inference import DefaultInference
+        from pydeseq2.ds import DeseqStats
+    except ModuleNotFoundError as e:
+        import sys
+        logger.error(f"{e}\nNeed pydeseq2! Try 'pip install pydeseq2'.")
+        sys.exit(-1)
+
+    if isinstance(design_factors, str):
+        assert design_factors in pseudobulk.obs.columns, f"The design factor {design_factors} does not exist in pseudobulk.obs!"
+    else:
+        for factor in design_factors:
+            assert factor in pseudobulk.obs.columns, f"The design factor {factor} does not exist in pseudobulk.obs!"
+
+    counts_df = pd.DataFrame(pseudobulk.get_matrix(mat_key), index=pseudobulk.obs_names, columns=pseudobulk.var_names)
+    metadata = pseudobulk.obs
+
+    n_cpus = eff_n_jobs(n_jobs)
+    inference = DefaultInference(n_cpus=n_cpus)
+    dds = DeseqDataSet(
+        counts=counts_df,
+        metadata=metadata,
+        design_factors=design_factors,
+        refit_cooks=True,
+        inference=inference,
+        quiet=True,
+    )
+    dds.deseq2()
+
+    stat_res = DeseqStats(
+        dds,
+        contrast=contrast,
+        cooks_filter=True,
+        alpha=alpha,
+        inference=inference,
+        quiet=True,
+    )
+    stat_res.summary()
+    res_key = de_key if mat_key == "counts" else mat_key.removesuffix(".X") + "." + de_key
+    res_df = stat_res.results_df
+    pseudobulk.varm[res_key] = res_df.to_records(index=False)
+
+
+def _run_rdeseq2(
+    pseudobulk: UnimodalData,
+    mat_key: str,
+    design: str,
+    contrast: Tuple[str, str, str],
+    de_key: str,
+    alpha: float,
+) -> None:
     try:
         import rpy2.robjects as ro
         from rpy2.robjects import pandas2ri, numpy2ri, Formula
@@ -202,26 +291,22 @@ def deseq2(
               if (!require("BiocManager", quietly = TRUE))
                   install.packages("BiocManager")
               BiocManager::install("DESeq2")"""
-
+
         logger.error(text)
         sys.exit(-1)
 
-    import math
+    assert design.strip().startswith("~"), f"Design '{design}' is not a valid R formula! Valid examples: '~var', '~var1+var2', '~var1+var2+var1:var2'."
+
     to_dataframe = ro.r('function(x) data.frame(x)')
 
-    for mat_key in pseudobulk.list_keys():
-        with localconverter(ro.default_converter + numpy2ri.converter + pandas2ri.converter):
-            dds = deseq2.DESeqDataSetFromMatrix(countData = pseudobulk.get_matrix(mat_key).T, colData = pseudobulk.obs, design = Formula(design))
-
-        if replaceOutliers:
-            dds = deseq2.DESeq(dds)
-            res= deseq2.results(dds, contrast=ro.StrVector(contrast))
-        else:
-            dds = deseq2.DESeq(dds, minReplicatesForReplace=math.inf)
-            res= deseq2.results(dds, contrast=ro.StrVector(contrast), cooksCutoff=False)
-        with localconverter(ro.default_converter + pandas2ri.converter):
-            res_df = ro.conversion.rpy2py(to_dataframe(res))
-        res_df.fillna({'log2FoldChange': 0.0, 'lfcSE': 0.0, 'stat': 0.0, 'pvalue': 1.0, 'padj': 1.0}, inplace=True)
-
-        de_res_key = de_key if mat_key.find('.') < 0 else f"{mat_key.partition('.')[0]}.{de_key}"
-        pseudobulk.varm[de_res_key] = res_df.to_records(index=False)
+    with localconverter(ro.default_converter + numpy2ri.converter + pandas2ri.converter):
+        dds = deseq2.DESeqDataSetFromMatrix(countData = pseudobulk.get_matrix(mat_key).T, colData = pseudobulk.obs, design = Formula(design))
+
+    dds = deseq2.DESeq(dds)
+    res = deseq2.results(dds, contrast=ro.StrVector(contrast), alpha=alpha)
+    with localconverter(ro.default_converter + pandas2ri.converter):
+        res_df = ro.conversion.rpy2py(to_dataframe(res))
+
+    res_key = de_key if mat_key == "counts" else mat_key.removesuffix(".X") + "." + de_key
+    pseudobulk.varm[res_key] = res_df.to_records(index=False)
diff --git a/requirements.txt b/requirements.txt
index 5458ce4d..1788c336 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -10,7 +10,7 @@ joblib>=0.14
 lightgbm>=2.2.1
 loompy
 leidenalg
-matplotlib>=2.0.0
+matplotlib>=3.7.0
 natsort
 numba
 numpy
diff --git a/setup.py b/setup.py
index 22fb763f..43dadaf9 100644
--- a/setup.py
+++ b/setup.py
@@ -58,7 +58,8 @@
         mkl=["mkl"],
         rpy2=["rpy2"],
         scvi=["scvi-tools"],
-        all=["fitsne", "louvain", "scanorama", "torch", "harmony-pytorch", "nmf-torch", "rpy2", "forceatlas2-python", "scvi-tools"]
+        pseudobulk=["pydeseq2"]
+        all=["fitsne", "louvain", "scanorama", "torch", "harmony-pytorch", "nmf-torch", "rpy2", "forceatlas2-python", "scvi-tools", "pydeseq2"]
     ),
     python_requires="~=3.8",
     package_data={

From e368e45b900e739bc93945ab228847e030a75d9d Mon Sep 17 00:00:00 2001
From: Yiming Yang
Date: Sat, 1 Jun 2024 06:27:58 +0000
Subject: [PATCH 2/2] fix bug

---
 pegasus/tools/pseudobulk.py | 2 +-
 setup.py                    | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/pegasus/tools/pseudobulk.py b/pegasus/tools/pseudobulk.py
index 62bff03e..75e8493b 100644
--- a/pegasus/tools/pseudobulk.py
+++ b/pegasus/tools/pseudobulk.py
@@ -265,7 +265,7 @@ def _run_pydeseq2(
 
 
 def _run_rdeseq2(
-    pseudobulk: UnimodalData,
+    pseudobulk: Union[MultimodalData, UnimodalData],
     mat_key: str,
     design: str,
     contrast: Tuple[str, str, str],
diff --git a/setup.py b/setup.py
index 43dadaf9..7f912a63 100644
--- a/setup.py
+++ b/setup.py
@@ -58,7 +58,7 @@
         mkl=["mkl"],
         rpy2=["rpy2"],
         scvi=["scvi-tools"],
-        pseudobulk=["pydeseq2"]
+        pseudobulk=["pydeseq2"],
         all=["fitsne", "louvain", "scanorama", "torch", "harmony-pytorch", "nmf-torch", "rpy2", "forceatlas2-python", "scvi-tools", "pydeseq2"]
     ),
     python_requires="~=3.8",
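
Reviewer note: below is a minimal end-to-end sketch of the workflow this series introduces, for anyone trying the branch; it is not part of the patch. It assumes a ``MultimodalData`` object ``data`` whose ``data.obs`` carries a sample attribute "Channel", a per-sample treatment label "treatment" with levels "stim"/"ctrl", and a cell-type label "anno" — these attribute names and levels are illustrative only.

    import pandas as pd
    import pegasus as pg

    # Aggregate raw counts into one pseudobulk per channel, carrying the
    # per-sample "treatment" attribute over. With condition="anno", an extra
    # count matrix "anno_<cell type>.X" is also built per cell type.
    pseudo = pg.pseudobulk(data, groupby="Channel", attrs="treatment", condition="anno")

    # DE with the new default PyDESeq2 backend; design is a factor (or list of
    # factors) in pseudo.obs. With compute_all=True, the test also runs on every
    # "anno_<cell type>.X" matrix, stored under "anno_<cell type>.deseq2".
    pg.deseq2(pseudo, "treatment", ("treatment", "stim", "ctrl"), compute_all=True)

    # The same test through the R backend; note the design must be an R formula
    # here, and rpy2 plus DESeq2 in R are required.
    pg.deseq2(pseudo, "~treatment", ("treatment", "stim", "ctrl"), backend="deseq2")

    # Results are stored as record arrays in varm, row-aligned with var_names.
    res = pd.DataFrame(pseudo.varm["deseq2"], index=pseudo.var_names)
    print(res.sort_values("padj").head())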