From 692c4362b9e6956663429e022eb5663b597b7616 Mon Sep 17 00:00:00 2001 From: Bo Li Date: Mon, 13 May 2024 22:41:20 -0700 Subject: [PATCH 1/2] Added find_nmf_programs --- pegasus/__init__.py | 1 + pegasus/plotting/plot_library.py | 7 +- pegasus/plotting/plot_utils.py | 6 +- pegasus/tools/__init__.py | 2 +- pegasus/tools/nmf.py | 164 ++++++++++++++++++++++++++++++- 5 files changed, 174 insertions(+), 6 deletions(-) diff --git a/pegasus/__init__.py b/pegasus/__init__.py index 640f6e06..043d5b77 100644 --- a/pegasus/__init__.py +++ b/pegasus/__init__.py @@ -40,6 +40,7 @@ tsvd_transform, regress_out, nmf, + find_nmf_programs, integrative_nmf, highly_variable_features, run_harmony, diff --git a/pegasus/plotting/plot_library.py b/pegasus/plotting/plot_library.py index b65cd934..91b10b70 100644 --- a/pegasus/plotting/plot_library.py +++ b/pegasus/plotting/plot_library.py @@ -2234,6 +2234,7 @@ def wordcloud( data: Union[MultimodalData, UnimodalData, anndata.AnnData], factor: int, max_words: Optional[int] = 20, + features: Optional[str] = "highly_variable_features", random_state: Optional[int] = 0, colormap: Optional[str] = "hsv", width: Optional[int] = 800, @@ -2254,6 +2255,8 @@ def wordcloud( Which factor to plot. factor starts from 0. max_words: ``int``, optional, default: 20 Maximum number of genes to show in the image. + features: ``str``, optional, default: ``highly_variable_features`` + Features selected for NMF computation. random_state: ``int``, optional, default: 0 Random seed passing to WordCloud function. colormap: ``str``, optional, default: ``hsv`` @@ -2280,9 +2283,9 @@ def wordcloud( >>> fig = pg.wordcloud(data, factor=0) """ fig, ax = _get_subplot_layouts(panel_size=panel_size, dpi=dpi) # default nrows = 1 & ncols = 1 - + assert 'W' in data.uns - hvg = data.var_names[data.var['highly_variable_features']] + hvg = data.var_names[data.var[features]] word_dict = {} for i in range(hvg.size): word_dict[hvg[i]] = data.uns['W'][i, factor] diff --git a/pegasus/plotting/plot_utils.py b/pegasus/plotting/plot_utils.py index e0b6fde8..f797fc42 100644 --- a/pegasus/plotting/plot_utils.py +++ b/pegasus/plotting/plot_utils.py @@ -285,9 +285,11 @@ def _get_palette(n_labels: int, with_background: bool = False, show_background: palette = pegasus_20 elif n_labels <= 26: palette = zeileis_26 - else: - assert n_labels <= 64 + elif n_labels <= 64: palette = godsnot_64 + else: + n_rep = (n_labels - 1) // 20 + 1 # a cyclic color panel + palette = pegasus_20 * n_rep if with_background: palette = np.array( diff --git a/pegasus/tools/__init__.py b/pegasus/tools/__init__.py index c092a8d8..9d24c5a5 100644 --- a/pegasus/tools/__init__.py +++ b/pegasus/tools/__init__.py @@ -62,7 +62,7 @@ from .subcluster_utils import clone_subset from .signature_score import calc_signature_score, calculate_z_score from .doublet_detection import infer_doublets, mark_doublets -from .nmf import nmf, integrative_nmf +from .nmf import nmf, integrative_nmf, find_nmf_programs from .pseudobulk import pseudobulk, deseq2 from .fgsea import fgsea, write_fgsea_results_to_excel from .scvitools import ( diff --git a/pegasus/tools/nmf.py b/pegasus/tools/nmf.py index 8cf2cb08..bb000e1b 100644 --- a/pegasus/tools/nmf.py +++ b/pegasus/tools/nmf.py @@ -5,7 +5,7 @@ from numba import njit from numba.typed import List as numbaList -from typing import List, Union +from typing import List, Union, Tuple from pegasusio import UnimodalData, MultimodalData from pegasus.tools import slicing, eff_n_jobs, calculate_nearest_neighbors, check_batch_key @@ -205,6 +205,168 @@ def nmf( data.obsm["X_nmf"] = H / np.linalg.norm(H, axis=0) +@timer(logger=logger) +def find_nmf_programs( + data: Union[MultimodalData, UnimodalData], + n_range: Tuple[int, int] = (4, 9), + n_rep: int = 10, + features: str = "highly_variable_features", + space: str = "log", + init: str = "random", + algo: str = "halsvar", + mode: str = "batch", + tol: float = 1e-4, + use_gpu: bool = False, + alpha_W: float = 0.0, + l1_ratio_W: float = 0.0, + alpha_H: float = 0.01, + l1_ratio_H: float = 1.0, + fp_precision: str = "float", + online_chunk_size: int = 5000, + n_jobs: int = -1, + random_state: int = 0, +) -> Tuple[list, list, list, list]: + """Perform Nonnegative Matrix Factorization (NMF) to the data using Frobenius norm. Steps include select features and L2 normalization and NMF and L2 normalization of resulting coordinates. + + The calculation uses `nmf-torch `_ package. + + Parameters + ---------- + data: ``pegasusio.MultimodalData`` + Annotated data matrix with rows for cells and columns for genes. + + n_range: ``Tuple[int, int]``, optional, default: ``(4, 9)``. + Number of ranks to iterate over. + + n_rep: ``int``, optional, default: 10 + Number of reruns for each value in n_range. + + features: ``str``, optional, default: ``"highly_variable_features"``. + Keyword in ``data.var`` to specify features used for nmf. + + max_value: ``float``, optional, default: ``None``. + The threshold to truncate data symmetrically after scaling. If ``None``, do not truncate. + + space: ``str``, optional, default: ``log``. + Choose from ``log`` and ``expression``. ``log`` works on log-transformed expression space; ``expression`` works on the original expression space (normalized by total UMIs). + + init: ``str``, optional, default: ``random``. + Method to initialize NMF. Options are 'random', 'nndsvd', 'nndsvda' and 'nndsvdar'. + + algo: ``str``, optional, default: ``halsvar`` + Choose from ``mu`` (Multiplicative Update), ``hals`` (Hierarchical Alternative Least Square), ``halsvar`` (HALS variant, use HALS to mimic ``bpp`` and can get better convergence for sometimes) and ``bpp`` (alternative non-negative least squares with Block Principal Pivoting method). + + mode: ``str``, optional, default: ``batch`` + Learning mode. Choose from ``batch`` and ``online``. Notice that ``online`` only works when ``beta=2.0``. For other beta loss, it switches back to ``batch`` method. + + tol: ``float``, optional, default: ``1e-4`` + The toleration used for convergence check. + + use_gpu: ``bool``, optional, default: ``False`` + If ``True``, use GPU if available. Otherwise, use CPU only. + + alpha_W: ``float``, optional, default: ``0.0`` + A numeric scale factor which multiplies the regularization terms related to W. + If zero or negative, no regularization regarding W is considered. + + l1_ratio_W: ``float``, optional, default: ``0.0`` + The ratio of L1 penalty on W, must be between 0 and 1. And thus the ratio of L2 penalty on W is (1 - l1_ratio_W). + + alpha_H: ``float``, optional, default: ``0.01`` + A numeric scale factor which multiplies the regularization terms related to H. + If zero or negative, no regularization regarding H is considered. + + l1_ratio_H: ``float``, optional, default: ``1.0`` + The ratio of L1 penalty on W, must be between 0 and 1. And thus the ratio of L2 penalty on H is (1 - l1_ratio_H). + + fp_precision: ``str``, optional, default: ``float`` + The numeric precision on the results. Choose from ``float`` and ``double``. + + online_chunk_size: ``int``, optional, default: ``int`` + The chunk / mini-batch size for online learning. Only works when ``mode='online'``. + + n_jobs : `int`, optional (default: -1) + Number of threads to use. -1 refers to using all physical CPU cores. + + random_state: ``int``, optional, default: ``0``. + Random seed to be set for reproducing result. + + Returns + ------- + Hs: best H for each k in n_range + Ws: best W for each k in n_range + errs: best err for each k in n_range + coph_corrs: cophenetic correlation coefficients for each k in n_range + + Examples + -------- + >>> pg.find_nmf_programs(data) + """ + X = _select_and_scale_features(data, features=features, space=space) + + try: + from nmf import run_nmf + from scipy.cluster.hierarchy import linkage, cophenet + from scipy.spatial.distance import squareform + except ImportError as e: + import sys + logger.error(f"{e}\nNeed NMF-Torch! Try 'pip install nmf-torch'.") + sys.exit(-1) + + Hs = [] + Ws = [] + errs = [] + coph_corrs = [] + + rng = np.random.default_rng(random_state) + BIG_NUM = 1000000000 + mats_conn = np.zeros((n_rep, X.shape[0], X.shape[0])) # connectivity matrices + + for k in range(n_range[0], n_range[1] + 1): + print(f"Begin k={k}:") + + H_best = W_best = None + err_best = 1e100 + + for i in range(n_rep): + H, W, err = run_nmf( + X, + n_components=k, + init=init, + algo=algo, + mode=mode, + tol=tol, + n_jobs=eff_n_jobs(n_jobs), + random_state=rng.integers(BIG_NUM), + use_gpu=use_gpu, + alpha_W=alpha_W, + l1_ratio_W=l1_ratio_W, + alpha_H=alpha_H, + l1_ratio_H=l1_ratio_H, + fp_precision=fp_precision, + online_chunk_size=online_chunk_size, + ) + + if err_best > err: + err_best = err + H_best = H + W_best = W + + clusters = H.argmax(axis=1) + mats_conn[i] = clusters.reshape((-1, 1)) == clusters.reshape((1, -1)) + + consensus = mats_conn.mean(axis=0) + Y = squareform(1.0 - consensus) + Z = linkage(Y, method='average') + coph_corr = cophenet(Z, Y)[0] + + Hs.append(H_best) + Ws.append(W_best) + errs.append(err_best) + coph_corrs.append(coph_corr) + + return Hs, Ws, errs, coph_corrs + @njit(fastmath=True, cache=True) def _refine_cluster(clusters, indices, ncluster): From c428574aaabbbd0c95d15db31ed17501202e9aab Mon Sep 17 00:00:00 2001 From: Bo Li Date: Tue, 28 May 2024 17:38:20 -0700 Subject: [PATCH 2/2] Updated several functions --- pegasus/plotting/plot_library.py | 3 ++- pegasus/tools/clustering.py | 2 +- pegasus/tools/hvf_selection.py | 3 ++- pegasus/tools/nmf.py | 2 +- 4 files changed, 6 insertions(+), 4 deletions(-) diff --git a/pegasus/plotting/plot_library.py b/pegasus/plotting/plot_library.py index 09f66494..e5eed198 100644 --- a/pegasus/plotting/plot_library.py +++ b/pegasus/plotting/plot_library.py @@ -1636,7 +1636,8 @@ def plot_dendrogram( from scipy.cluster.hierarchy import dendrogram - Z, labels = data.uns[graph_key] + Z = data.uns[graph_key][0] + labels = data.uns[graph_key][1].index fig, ax = _get_subplot_layouts(panel_size=panel_size, dpi=dpi) dendrogram( Z, diff --git a/pegasus/tools/clustering.py b/pegasus/tools/clustering.py index 05a067d0..9be2f447 100644 --- a/pegasus/tools/clustering.py +++ b/pegasus/tools/clustering.py @@ -825,4 +825,4 @@ def calc_dendrogram( np.fill_diagonal(dissim_df.to_numpy(), 0) # Enforce main diagonal to be 0 to pass squareform requirement Z = linkage(squareform(dissim_df), method=linkage_method, optimal_ordering=True) - data.uns[res_key] = (Z, dissim_df.index.values.astype(str)) + data.uns[res_key] = (Z, csi_df) \ No newline at end of file diff --git a/pegasus/tools/hvf_selection.py b/pegasus/tools/hvf_selection.py index 350104ce..e2a2fe95 100644 --- a/pegasus/tools/hvf_selection.py +++ b/pegasus/tools/hvf_selection.py @@ -49,7 +49,8 @@ def select_hvf_pegasus( """ Select highly variable features using the pegasus method """ if "robust" not in data.var: - raise ValueError("Please run `identify_robust_genes` to identify robust genes") + logger.warning("Robust genes are not identified. Mark all genes as robust.") + data.var["robust"] = True estimate_feature_statistics(data, batch) diff --git a/pegasus/tools/nmf.py b/pegasus/tools/nmf.py index bb000e1b..754023e3 100644 --- a/pegasus/tools/nmf.py +++ b/pegasus/tools/nmf.py @@ -300,7 +300,7 @@ def find_nmf_programs( Examples -------- - >>> pg.find_nmf_programs(data) + >>> Hs, Ws, errs, coph_corrs = pg.find_nmf_programs(data) """ X = _select_and_scale_features(data, features=features, space=space)