From 27da140566ee78e984d431eb4f5928d7e6f055c9 Mon Sep 17 00:00:00 2001 From: niklasmubeoe Date: Wed, 17 Apr 2024 10:18:51 +0200 Subject: [PATCH 01/18] inherit from BaseEstimator and TransformerMixin --- multispaeti/_multispati_pca.py | 123 ++++++++++++++++----------------- 1 file changed, 58 insertions(+), 65 deletions(-) diff --git a/multispaeti/_multispati_pca.py b/multispaeti/_multispati_pca.py index 3f0c97b..23c9520 100644 --- a/multispaeti/_multispati_pca.py +++ b/multispaeti/_multispati_pca.py @@ -7,7 +7,10 @@ from scipy.sparse import csc_array, csc_matrix, csr_array, csr_matrix, issparse from scipy.sparse import linalg as sparse_linalg from scipy.sparse import sparray, spmatrix +from sklearn.base import BaseEstimator, TransformerMixin from sklearn.preprocessing import scale +from sklearn.utils.sparsefuncs_fast import inplace_csr_row_normalize_l1 +from sklearn.utils.validation import check_array, check_is_fitted T = TypeVar("T", bound=np.number) U = TypeVar("U", bound=np.number) @@ -18,7 +21,12 @@ _X: TypeAlias = np.ndarray | _Csr | _Csc -class MultispatiPCA: +# TODO: should pass the following check +# from sklearn.utils.estimator_checks import check_estimator +# check_estimator(MultispatiPCA()) + + +class MultispatiPCA(TransformerMixin, BaseEstimator): """ MULTISPATI-PCA @@ -46,28 +54,21 @@ class MultispatiPCA: A distance matrix should be transformed to connectivities by e.g. calculating :math:`1-d/d_{max}` beforehand. - Raises - ------ - ValueError - If connectivity is not a square matrix. - ZeroDivisionError - If one of the observations has no neighbors. - Attributes ---------- components_ : numpy.ndarray The estimated components: Array of shape `(n_components, n_features)`. + eigenvalues_ : numpy.ndarray + The eigenvalues corresponding to each of the selected components. Array of shape + `(n_components,)`. + variance_ : numpy.ndarray The estimated variance part of the eigenvalues. Array of shape `(n_components,)`. moransI_ : numpy.ndarray The estimated Moran's I part of the eigenvalues. Array of shape `(n_components,)`. - eigenvalues_ : numpy.ndarray - The eigenvalues corresponding to each of the selected components. Array of shape - `(n_components,)`. - n_components_ : int The estimated number of components. @@ -92,17 +93,22 @@ def __init__( self, n_components: int | tuple[int, int] | None = None, *, - connectivity: sparray | spmatrix, + connectivity: sparray | spmatrix | None = None, ): - self._fitted = False - W = csr_array(connectivity) + self.n_components = n_components + self.connectivity = connectivity + + @staticmethod + def _validate_connectivity(W: csr_array, n: int): if W.shape[0] != W.shape[1]: raise ValueError("`connectivity` must be square") - self.W = self._normalize_connectivities(W) - - n = self.W.shape[0] + if W.shape[0] != n: + raise ValueError( + "#rows in `X` must be the same as dimensions of `connectivity`" + ) - self.n_components = n_components + def _validate_n_components(self, n: int): + n_components = self.n_components self._n_neg = 0 if n_components is None: @@ -127,13 +133,7 @@ def __init__( else: raise ValueError("`n_components` must be None, int or (int, int)") - @staticmethod - def _normalize_connectivities(W: csr_array) -> csr_array: - # normalize rowsums to 1 for better interpretability - # TODO can't handle points without neighbors because of division by zero - return W.multiply(1 / W.sum(axis=1)[:, np.newaxis]) - - def fit(self, X: _X): + def fit(self, X: _X, y: None = None): """ Fit MULTISPATI-PCA projection. @@ -141,6 +141,8 @@ def fit(self, X: _X): ---------- X : numpy.ndarray or scipy.sparse.csr_array or scipy.sparse.csc_array Array of observations x features. + y : None + Ignored. scikit-learn compatibility only. Raises ------ @@ -148,15 +150,28 @@ def fit(self, X: _X): If `X` has not the same number of rows like `connectivity`. If `n_components` is None and `X` is sparse. If (sum of) `n_components` is larger than the smaller dimension of `X`. + ValueError + If connectivity is not a square matrix. + ZeroDivisionError + If one of the observations has no neighbors. """ + + X = check_array(X) + self.W = check_array( + self.connectivity, accept_sparse="csr", dtype=float + ) # TODO float32, float64? + + n, d = X.shape + + self._validate_connectivity(self.W, n) + self._validate_n_components(n) + + inplace_csr_row_normalize_l1(self.W) + if issparse(X): X = csc_array(X) assert isinstance(X, (np.ndarray, csc_array)) - if X.shape[0] != self.W.shape[0]: - raise ValueError( - "#rows in `X` must be the same as dimensions of `connectivity`" - ) if self._n_pos is None: if issparse(X): raise ValueError( @@ -164,7 +179,6 @@ def fit(self, X: _X): "supported for dense matrices." ) elif (self._n_pos + self._n_neg) > X.shape[1]: - n, d = X.shape n_comp = self._n_pos + self._n_neg n_comp_max = min(n, d) raise ValueError( @@ -180,8 +194,13 @@ def fit(self, X: _X): self.components_ = eig_vec self.eigenvalues_ = eig_val self.n_components_ = eig_val.size - self.n_features_in_ = X.shape[1] - self._fitted = True + self.n_features_in_ = d + + self.variance_, self.moransI_ = self._variance_moransI_decomposition( + X @ self.components_ + ) + + return self def _multispati_eigendecomposition( self, X: _X, W: _Csr @@ -266,31 +285,10 @@ def transform(self, X: _X) -> np.ndarray: ValueError If instance has not been fitted. """ + check_is_fitted(self) # Data must be scaled, avoid mean-centering for sparse - if not self._fitted: - self._not_fitted() - X = scale(X, with_mean=not issparse(X)) - X_t = X @ self.components_ - self.variance_, self.moransI_ = self._variance_moransI_decomposition(X_t) - - return X_t - - def fit_transform(self, X: _X) -> np.ndarray: - """ - Fit the MULTISPATI-PCA projection and transform the data. - - Parameters - ---------- - X : numpy.ndarray or scipy.sparse.csr_array or scipy.sparse.csc_array - Array of observations x features. - - Returns - ------- - numpy.ndarray - """ - self.fit(X) - return self.transform(X) + return X @ self.components_ def transform_spatial_lag(self, X: _X) -> np.ndarray: """ @@ -311,8 +309,7 @@ def transform_spatial_lag(self, X: _X) -> np.ndarray: ValueError If instance has not been fitted. """ - if not self._fitted: - self._not_fitted() + check_is_fitted(self) return self._spatial_lag(self.transform(X)) def _spatial_lag(self, X: np.ndarray) -> np.ndarray: @@ -351,9 +348,11 @@ def moransI_bounds( # following R package adespatial::moran.bounds # sparse approx is following adegenet sPCA as shown in screeplot/summary - def double_center(W): + def double_center(W: np.ndarray | csr_array) -> np.ndarray: if issparse(W): + assert isinstance(W, csr_array) W = W.toarray() + assert isinstance(W, np.ndarray) row_means = np.mean(W, axis=1, keepdims=True) col_means = np.mean(W, axis=0, keepdims=True) - np.mean(row_means) @@ -381,9 +380,3 @@ def double_center(W): I_max = max(eigen_values) return I_min, I_max, I_0 - - def _not_fitted(self): - raise ValueError( - "This MultispatiPCA instance is not fitted yet. " - "Call 'fit' with appropriate arguments first." - ) From 8b12afe70e4b2b4620a2f382b17ec17a8f943f06 Mon Sep 17 00:00:00 2001 From: niklasmubeoe Date: Fri, 19 Apr 2024 15:13:39 +0200 Subject: [PATCH 02/18] fully compatible with sklearn, test compatibility --- docs/source/api.rst | 10 +++--- docs/source/usage.rst | 15 +++++++-- multispaeti/_multispati_pca.py | 57 +++++++++++++++++++--------------- multispaeti/_plotting.py | 26 +++++++++++----- pyproject.toml | 5 +++ tests/test_multispati_pca.py | 11 +++++++ 6 files changed, 84 insertions(+), 40 deletions(-) create mode 100644 tests/test_multispati_pca.py diff --git a/docs/source/api.rst b/docs/source/api.rst index 0d79833..cc20154 100644 --- a/docs/source/api.rst +++ b/docs/source/api.rst @@ -2,11 +2,11 @@ API === -The API for :py:class:`multispaeti.MultispatiPCA` tries to follow the design in -`scikit-learn `_. The plan is to make it fully compatible in -the future by inheriting from :py:class:`sklearn.base.BaseEstimator` and -:py:class:`sklearn.base.TransformerMixin`. - +The API for :py:class:`multispaeti.MultispatiPCA` follows the design in +`scikit-learn `_. It inherits from +:py:class:`sklearn.base.BaseEstimator`, :py:class:`sklearn.base.TransformerMixin`, and +:py:class:`sklearn.base.ClassNamePrefixFeaturesOutMixin` making it fully compatible +with `scikit-learn` pipelines. .. currentmodule:: multispaeti diff --git a/docs/source/usage.rst b/docs/source/usage.rst index 3a15088..790d4b1 100644 --- a/docs/source/usage.rst +++ b/docs/source/usage.rst @@ -34,15 +34,24 @@ As for e.g. :py:class:`sklearn.decomposition.PCA` we first need to .. code-block:: python - msPCA.fit(X) - X_transformed = msPCA.transform(X) + from sklearn.preprocessing import scale + + X_scaled = scale(X) + msPCA.fit(X_scaled) + X_transformed = msPCA.transform(X_scaled) + +.. note:: + + MultispatiPCA expects its input to be standardized by e.g. using + :py:func:`sklearn.decomposition.scale` before passing it to fit/transform. + Alternatively, this can be achieved in one step by :py:meth:`multispaeti.MultispatiPCA.fit_transform` .. code-block:: python - X_transformed = msPCA.fit_transform(X) + X_transformed = msPCA.fit_transform(X_scaled) .. Additional, functionality is offered through the method .. :py:meth:`multispaeti.MultispatiPCA.moransI_bounds` which calculates the minimum and diff --git a/multispaeti/_multispati_pca.py b/multispaeti/_multispati_pca.py index 23c9520..6c9c737 100644 --- a/multispaeti/_multispati_pca.py +++ b/multispaeti/_multispati_pca.py @@ -4,12 +4,15 @@ import numpy as np from numpy.typing import NDArray from scipy import linalg -from scipy.sparse import csc_array, csc_matrix, csr_array, csr_matrix, issparse +from scipy.sparse import csc_array, csc_matrix, csr_array, csr_matrix, eye, issparse from scipy.sparse import linalg as sparse_linalg from scipy.sparse import sparray, spmatrix -from sklearn.base import BaseEstimator, TransformerMixin -from sklearn.preprocessing import scale -from sklearn.utils.sparsefuncs_fast import inplace_csr_row_normalize_l1 +from sklearn.base import ( + BaseEstimator, + ClassNamePrefixFeaturesOutMixin, + TransformerMixin, +) +from sklearn.preprocessing import normalize from sklearn.utils.validation import check_array, check_is_fitted T = TypeVar("T", bound=np.number) @@ -26,7 +29,7 @@ # check_estimator(MultispatiPCA()) -class MultispatiPCA(TransformerMixin, BaseEstimator): +class MultispatiPCA(ClassNamePrefixFeaturesOutMixin, TransformerMixin, BaseEstimator): """ MULTISPATI-PCA @@ -87,8 +90,6 @@ class MultispatiPCA(TransformerMixin, BaseEstimator): `_ """ - # TODO, should scaling be part of multispati - def __init__( self, n_components: int | tuple[int, int] | None = None, @@ -157,16 +158,24 @@ def fit(self, X: _X, y: None = None): """ X = check_array(X) - self.W = check_array( - self.connectivity, accept_sparse="csr", dtype=float - ) # TODO float32, float64? + if self.connectivity is None: + warnings.warn( + "`connectivity` has not been set. Defaulting to identity matrix " + "which will conceptually compute a standard PCA. " + "It is not recommended to not set `connectivity`." + ) + W = csr_array(eye(X.shape[0])) + else: + W = self.connectivity + W = check_array(W, accept_sparse="csr") n, d = X.shape - self._validate_connectivity(self.W, n) + self._validate_connectivity(W, n) self._validate_n_components(n) - inplace_csr_row_normalize_l1(self.W) + self.W_ = normalize(W, norm="l1") + assert isinstance(self.W_, csr_array) if issparse(X): X = csc_array(X) @@ -186,14 +195,12 @@ def fit(self, X: _X, y: None = None): "can be calculated." ) - # Data must be scaled, avoid mean-centering for sparse - X = scale(X, with_mean=not issparse(X)) - - eig_val, eig_vec = self._multispati_eigendecomposition(X, self.W) + eig_val, eig_vec = self._multispati_eigendecomposition(X, self.W_) self.components_ = eig_vec self.eigenvalues_ = eig_val self.n_components_ = eig_val.size + self._n_features_out = self.n_components_ self.n_features_in_ = d self.variance_, self.moransI_ = self._variance_moransI_decomposition( @@ -286,8 +293,7 @@ def transform(self, X: _X) -> np.ndarray: If instance has not been fitted. """ check_is_fitted(self) - # Data must be scaled, avoid mean-centering for sparse - X = scale(X, with_mean=not issparse(X)) + X = check_array(X) return X @ self.components_ def transform_spatial_lag(self, X: _X) -> np.ndarray: @@ -313,19 +319,19 @@ def transform_spatial_lag(self, X: _X) -> np.ndarray: return self._spatial_lag(self.transform(X)) def _spatial_lag(self, X: np.ndarray) -> np.ndarray: - return self.W @ X + return self.W_ @ X def _variance_moransI_decomposition( - self, X_t: np.ndarray + self, X_tr: np.ndarray ) -> tuple[np.ndarray, np.ndarray]: - lag = self._spatial_lag(X_t) + lag = self._spatial_lag(X_tr) # vector of row_Weights from dudi.PCA # (we only use default row_weights i.e. 1/n) - w = 1 / X_t.shape[0] + w = 1 / X_tr.shape[0] - variance = np.sum(X_t * X_t * w, axis=0) - moran = np.sum(X_t * lag * w, axis=0) / variance + variance = np.sum(X_tr * X_tr * w, axis=0) + moran = np.sum(X_tr * lag * w, axis=0) / variance return variance, moran @@ -340,6 +346,7 @@ def moransI_bounds( ---------- sparse_approx : bool Only applicable if `connectivity` is sparse. + Returns ------- tuple[float, float, float] @@ -360,7 +367,7 @@ def double_center(W: np.ndarray | csr_array) -> np.ndarray: return W - row_means - col_means # ensure symmetry - W = 0.5 * (self.W + self.W.T) + W = 0.5 * (self.W_ + self.W_.T) n_sample = W.shape[0] s = n_sample / np.sum(W) # 1 if original W has rowSums or colSums of 1 diff --git a/multispaeti/_plotting.py b/multispaeti/_plotting.py index a36c37e..e71f520 100644 --- a/multispaeti/_plotting.py +++ b/multispaeti/_plotting.py @@ -2,6 +2,7 @@ import numpy as np from matplotlib.figure import Figure from matplotlib.gridspec import GridSpec +from sklearn.utils.validation import check_is_fitted from ._multispati_pca import MultispatiPCA @@ -13,16 +14,21 @@ def plot_eigenvalues(msPCA: MultispatiPCA, *, n_top: int | None = None) -> Figur Parameters ---------- msPCA : MultispatiPCA - An instance of MultispatiPCA that has already been used for - :py:meth:`multispaeti.MultispatiPCA.fit` so that eigenvalues have already been - calculated. + An instance of MultispatiPCA that has been fitted so that the eigenvalues + have been calculated. n_top : int, optional Plot the `n_top` highest and `n_top` lowest eigenvalues in a zoomed in view. Returns ------- matplotlib.figure.Figure + + Raises + ------ + sklearn.exceptions.NotFittedError + If the MultispatiPCA has not been fitted. """ + check_is_fitted(msPCA) eigenvalues = msPCA.eigenvalues_ x_lbl, y_lbl = "Component", "Eigenvalue" @@ -66,17 +72,23 @@ def plot_variance_moransI_decomposition( Parameters ---------- msPCA : multispaeti.MultispatiPCA - An instance of MultispatiPCA that has already been used for - :py:meth:`multispaeti.MultispatiPCA.transform` so that variance and Moran's I - contributions to the eigenvalues have already been calculated. + An instance of MultispatiPCA that has been fitted so that variance and Moran's I + contributions to the eigenvalues have been calculated. sparse_approx : bool Whether to use a sparse approximation to calculate the decomposition. + kwargs + Other keyword arguments are passed to :py:func:`matplotlib.pyplot.scatter` Returns ------- matplotlib.figure.Figure - """ + Raises + ------ + sklearn.exceptions.NotFittedError + If the MultispatiPCA has not been fitted. + """ + check_is_fitted(msPCA) I_min, I_max, I_0 = msPCA.moransI_bounds(sparse_approx=sparse_approx) fig, ax = plt.subplots(1) diff --git a/pyproject.toml b/pyproject.toml index 8c360e3..7982b21 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -26,6 +26,7 @@ classifiers = [ ] [project.optional-dependencies] +test = ["pytest"] docs = ["sphinx", "sphinx-copybutton", "sphinx-rtd-theme"] dev = ["multispaeti[docs]", "pre-commit"] @@ -56,3 +57,7 @@ target-version = "py310" ignore_missing_imports = true warn_no_return = false packages = "multispaeti" + +[tool.pytest.ini_options] +addopts = ["--import-mode=importlib"] +pythonpath = "multispaeti" diff --git a/tests/test_multispati_pca.py b/tests/test_multispati_pca.py new file mode 100644 index 0000000..095a60d --- /dev/null +++ b/tests/test_multispati_pca.py @@ -0,0 +1,11 @@ +import pytest +from sklearn.utils.estimator_checks import check_estimator + +from multispaeti import MultispatiPCA + + +def test_sklearn_compatibility(): + try: + check_estimator(MultispatiPCA()) + except AssertionError: + pytest.fail("Failed check_estimator() test of sklearn.") From 80c34dabd3e1c2330ca65287cab4ecbc5ebbaaf1 Mon Sep 17 00:00:00 2001 From: niklasmubeoe Date: Fri, 19 Apr 2024 15:18:35 +0200 Subject: [PATCH 03/18] fix reference --- docs/source/usage.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/usage.rst b/docs/source/usage.rst index 790d4b1..5650232 100644 --- a/docs/source/usage.rst +++ b/docs/source/usage.rst @@ -43,7 +43,7 @@ As for e.g. :py:class:`sklearn.decomposition.PCA` we first need to .. note:: MultispatiPCA expects its input to be standardized by e.g. using - :py:func:`sklearn.decomposition.scale` before passing it to fit/transform. + :py:func:`sklearn.preprocessing.scale` before passing it to fit/transform. Alternatively, this can be achieved in one step by From 521fedf0021b4bb01741343880dac8763a36095c Mon Sep 17 00:00:00 2001 From: niklasmubeoe Date: Fri, 19 Apr 2024 15:33:46 +0200 Subject: [PATCH 04/18] add test to dev optional deps --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 7982b21..8409c3d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -28,7 +28,7 @@ classifiers = [ [project.optional-dependencies] test = ["pytest"] docs = ["sphinx", "sphinx-copybutton", "sphinx-rtd-theme"] -dev = ["multispaeti[docs]", "pre-commit"] +dev = ["multispaeti[test,docs]", "pre-commit"] [project.urls] homepage = "https://github.com/HiDiHlabs/multiSPAETI" From f67969b9ac98b752a473a719669ffa3ece4bf783 Mon Sep 17 00:00:00 2001 From: niklasmubeoe Date: Mon, 22 Apr 2024 09:49:26 +0200 Subject: [PATCH 05/18] correct error documentation --- multispaeti/_multispati_pca.py | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/multispaeti/_multispati_pca.py b/multispaeti/_multispati_pca.py index 6c9c737..f38b4b9 100644 --- a/multispaeti/_multispati_pca.py +++ b/multispaeti/_multispati_pca.py @@ -148,13 +148,10 @@ def fit(self, X: _X, y: None = None): Raises ------ ValueError - If `X` has not the same number of rows like `connectivity`. + If `X` does not have the same number of rows as `connectivity`. If `n_components` is None and `X` is sparse. If (sum of) `n_components` is larger than the smaller dimension of `X`. - ValueError - If connectivity is not a square matrix. - ZeroDivisionError - If one of the observations has no neighbors. + If `connectivity` is not a square matrix. """ X = check_array(X) @@ -289,7 +286,7 @@ def transform(self, X: _X) -> np.ndarray: Raises ------ - ValueError + sklearn.exceptions.NotFittedError If instance has not been fitted. """ check_is_fitted(self) @@ -312,7 +309,7 @@ def transform_spatial_lag(self, X: _X) -> np.ndarray: Raises ------ - ValueError + sklearn.exceptions.NotFittedError If instance has not been fitted. """ check_is_fitted(self) From 1cf30be7e73d880964082ea3dc91115411e42bba Mon Sep 17 00:00:00 2001 From: niklasmubeoe Date: Mon, 22 Apr 2024 14:57:14 +0200 Subject: [PATCH 06/18] components in same shape as sklearn --- multispaeti/_multispati_pca.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/multispaeti/_multispati_pca.py b/multispaeti/_multispati_pca.py index f38b4b9..552fa89 100644 --- a/multispaeti/_multispati_pca.py +++ b/multispaeti/_multispati_pca.py @@ -201,7 +201,7 @@ def fit(self, X: _X, y: None = None): self.n_features_in_ = d self.variance_, self.moransI_ = self._variance_moransI_decomposition( - X @ self.components_ + X @ self.components_.T ) return self @@ -262,7 +262,7 @@ def remove_zero_eigenvalues( eig_val = eig_val[component_indices] eig_vec = eig_vec[:, component_indices] - return np.flip(eig_val), np.fliplr(eig_vec) + return np.flip(eig_val), np.flipud(eig_vec.T) @staticmethod def _get_component_indices(n: int, n_pos: int, n_neg: int) -> list[int]: @@ -291,7 +291,7 @@ def transform(self, X: _X) -> np.ndarray: """ check_is_fitted(self) X = check_array(X) - return X @ self.components_ + return X @ self.components_.T def transform_spatial_lag(self, X: _X) -> np.ndarray: """ From 5dd9d074927699e851e9bbd5cd52b0fd13892a2b Mon Sep 17 00:00:00 2001 From: niklasmubeoe Date: Mon, 29 Apr 2024 11:48:11 +0200 Subject: [PATCH 07/18] center X if dense --- multispaeti/_multispati_pca.py | 29 +++++++++++++++++++---------- 1 file changed, 19 insertions(+), 10 deletions(-) diff --git a/multispaeti/_multispati_pca.py b/multispaeti/_multispati_pca.py index 552fa89..ee31829 100644 --- a/multispaeti/_multispati_pca.py +++ b/multispaeti/_multispati_pca.py @@ -24,11 +24,6 @@ _X: TypeAlias = np.ndarray | _Csr | _Csc -# TODO: should pass the following check -# from sklearn.utils.estimator_checks import check_estimator -# check_estimator(MultispatiPCA()) - - class MultispatiPCA(ClassNamePrefixFeaturesOutMixin, TransformerMixin, BaseEstimator): """ MULTISPATI-PCA @@ -72,6 +67,10 @@ class MultispatiPCA(ClassNamePrefixFeaturesOutMixin, TransformerMixin, BaseEstim moransI_ : numpy.ndarray The estimated Moran's I part of the eigenvalues. Array of shape `(n_components,)`. + mean_ : numpy.ndarray or None + Per-feature empirical mean, estimated from the training set if `X` is not sparse. + Array of shape `(n_features,)`. + n_components_ : int The estimated number of components. @@ -192,7 +191,15 @@ def fit(self, X: _X, y: None = None): "can be calculated." ) - eig_val, eig_vec = self._multispati_eigendecomposition(X, self.W_) + if issparse(X): + self.mean_ = None + X_centered = X + else: + self.mean_ = X.mean(axis=0) + X_centered = X - self.mean_ + assert isinstance(X_centered, np.ndarray) + + eig_val, eig_vec = self._multispati_eigendecomposition(X_centered, self.W_) self.components_ = eig_vec self.eigenvalues_ = eig_val @@ -201,7 +208,7 @@ def fit(self, X: _X, y: None = None): self.n_features_in_ = d self.variance_, self.moransI_ = self._variance_moransI_decomposition( - X @ self.components_.T + X_centered @ self.components_.T ) return self @@ -209,7 +216,7 @@ def fit(self, X: _X, y: None = None): def _multispati_eigendecomposition( self, X: _X, W: _Csr ) -> tuple[NDArray[np.float64], NDArray[np.float64]]: - # X: beads/bin x gene, must be standardized + # X: observations x features # W: row-wise definition of neighbors, row-sums should be 1 def remove_zero_eigenvalues( eigen_values: NDArray[T], eigen_vectors: NDArray[U], n: int @@ -291,6 +298,9 @@ def transform(self, X: _X) -> np.ndarray: """ check_is_fitted(self) X = check_array(X) + if self.mean_ is not None: + if not issparse(X): + X = X - self.mean_ return X @ self.components_.T def transform_spatial_lag(self, X: _X) -> np.ndarray: @@ -323,8 +333,7 @@ def _variance_moransI_decomposition( ) -> tuple[np.ndarray, np.ndarray]: lag = self._spatial_lag(X_tr) - # vector of row_Weights from dudi.PCA - # (we only use default row_weights i.e. 1/n) + # vector of row_Weights from dudi.PCA (we only use default row_weights i.e. 1/n) w = 1 / X_tr.shape[0] variance = np.sum(X_tr * X_tr * w, axis=0) From 353185915439bdf2ba7ea6bf42ed03824556269a Mon Sep 17 00:00:00 2001 From: niklasmubeoe Date: Fri, 3 May 2024 11:44:16 +0200 Subject: [PATCH 08/18] enable sparse X --- multispaeti/_multispati_pca.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/multispaeti/_multispati_pca.py b/multispaeti/_multispati_pca.py index ee31829..bc04cc1 100644 --- a/multispaeti/_multispati_pca.py +++ b/multispaeti/_multispati_pca.py @@ -153,7 +153,7 @@ def fit(self, X: _X, y: None = None): If `connectivity` is not a square matrix. """ - X = check_array(X) + X = check_array(X, accept_sparse=["csr", "csc"]) if self.connectivity is None: warnings.warn( "`connectivity` has not been set. Defaulting to identity matrix " @@ -297,7 +297,7 @@ def transform(self, X: _X) -> np.ndarray: If instance has not been fitted. """ check_is_fitted(self) - X = check_array(X) + X = check_array(X, accept_sparse=["csr", "csc"]) if self.mean_ is not None: if not issparse(X): X = X - self.mean_ From 352b126dbcb8f1667b97fc4fc99ecc986723d1d2 Mon Sep 17 00:00:00 2001 From: niklasmubeoe Date: Fri, 3 May 2024 11:45:30 +0200 Subject: [PATCH 09/18] docs --- docs/source/usage.rst | 14 +++----------- 1 file changed, 3 insertions(+), 11 deletions(-) diff --git a/docs/source/usage.rst b/docs/source/usage.rst index 5650232..7b95128 100644 --- a/docs/source/usage.rst +++ b/docs/source/usage.rst @@ -34,16 +34,8 @@ As for e.g. :py:class:`sklearn.decomposition.PCA` we first need to .. code-block:: python - from sklearn.preprocessing import scale - - X_scaled = scale(X) - msPCA.fit(X_scaled) - X_transformed = msPCA.transform(X_scaled) - -.. note:: - - MultispatiPCA expects its input to be standardized by e.g. using - :py:func:`sklearn.preprocessing.scale` before passing it to fit/transform. + msPCA.fit(X) + X_transformed = msPCA.transform(X) Alternatively, this can be achieved in one step by @@ -51,7 +43,7 @@ Alternatively, this can be achieved in one step by .. code-block:: python - X_transformed = msPCA.fit_transform(X_scaled) + X_transformed = msPCA.fit_transform(X) .. Additional, functionality is offered through the method .. :py:meth:`multispaeti.MultispatiPCA.moransI_bounds` which calculates the minimum and From c4b015caa64d859c25163e15c05f6a81e6ee78a0 Mon Sep 17 00:00:00 2001 From: niklasmubeoe Date: Fri, 3 May 2024 11:47:13 +0200 Subject: [PATCH 10/18] version compatibility according to scientfic python SPEC0 --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 8409c3d..26a7798 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -14,7 +14,7 @@ dynamic = ["version"] authors = [ { name = "Niklas Müller-Bötticher", email = "niklas.mueller-boetticher@charite.de" }, ] -dependencies = ["matplotlib", "numpy>=1.21", "scikit-learn", "scipy>=1.9"] +dependencies = ["matplotlib>=3.6", "numpy>=1.23", "scikit-learn>=1.1", "scipy>=1.9"] classifiers = [ "Intended Audience :: Science/Research", "License :: OSI Approved :: MIT License", From a65b0bb6c8e4b1b2204e880f7dcc7d367bfb6b1b Mon Sep 17 00:00:00 2001 From: niklasmubeoe Date: Fri, 3 May 2024 12:44:52 +0200 Subject: [PATCH 11/18] fix n_component validation, enable all components for sparse matrix --- multispaeti/_multispati_pca.py | 132 +++++++++++++++------------------ 1 file changed, 60 insertions(+), 72 deletions(-) diff --git a/multispaeti/_multispati_pca.py b/multispaeti/_multispati_pca.py index bc04cc1..2ad7ecf 100644 --- a/multispaeti/_multispati_pca.py +++ b/multispaeti/_multispati_pca.py @@ -41,9 +41,9 @@ class MultispatiPCA(ClassNamePrefixFeaturesOutMixin, TransformerMixin, BaseEstim ---------- n_components : int or tuple[int, int], optional Number of components to keep. - If None, will keep all components (only supported for non-sparse `X`). + If None, will keep all components. If an int, it will keep the top `n_components`. - If a tuple, it will keep the top and bottom `n_components` respectively. + If a tuple, it will keep the top and bottom `n_components`, respectively. connectivity : scipy.sparse.sparray or scipy.sparse.spmatrix Matrix of row-wise neighbor definitions i.e. c\ :sub:`ij` is the connectivity of i :math:`\\to` j. The matrix does not have to be symmetric. It can be a @@ -107,29 +107,34 @@ def _validate_connectivity(W: csr_array, n: int): "#rows in `X` must be the same as dimensions of `connectivity`" ) - def _validate_n_components(self, n: int): - n_components = self.n_components + def _validate_n_components(self, n: int, d: int): + self._n_components = self.n_components - self._n_neg = 0 - if n_components is None: - self._n_pos = n_components - else: - if isinstance(n_components, int): - if n_components > n: + m = min(n, d) + + if self.n_components is not None: + if isinstance(self.n_components, int): + if self.n_components <= 0: + raise ValueError("`n_components` must be a positive integer.") + elif self.n_components >= m: warnings.warn( - "`n_components` should be less or equal than " - f"#rows of `connectivity`. Using {n} components." + "`n_components` should be less than minimum of " + "#samples and #features. Using all components." + ) + self._n_components = None + elif isinstance(self.n_components, tuple) and len(self.n_components) == 2: + if any( + not isinstance(i, int) or i < 0 for i in self.n_components + ) or self.n_components == (0, 0): + raise ValueError( + "`n_components` must be a tuple of positive integers." ) - self._n_pos = min(n_components, n) - elif isinstance(n_components, tuple) and len(n_components) == 2: - if n < n_components[0] + n_components[1]: + elif sum(self.n_components) >= m: warnings.warn( - "Sum of `n_components` should be less or equal than " - f"#rows of `connectivity`. Using {n} components." + "Sum of `n_components` should be less than minimum of " + "#samples and #features. Using all components." ) - self._n_pos = n - else: - self._n_pos, self._n_neg = n_components + self._n_components = None else: raise ValueError("`n_components` must be None, int or (int, int)") @@ -148,8 +153,7 @@ def fit(self, X: _X, y: None = None): ------ ValueError If `X` does not have the same number of rows as `connectivity`. - If `n_components` is None and `X` is sparse. - If (sum of) `n_components` is larger than the smaller dimension of `X`. + If `n_components` has the wrong type or is negative. If `connectivity` is not a square matrix. """ @@ -168,7 +172,7 @@ def fit(self, X: _X, y: None = None): n, d = X.shape self._validate_connectivity(W, n) - self._validate_n_components(n) + self._validate_n_components(n, d) self.W_ = normalize(W, norm="l1") assert isinstance(self.W_, csr_array) @@ -177,19 +181,6 @@ def fit(self, X: _X, y: None = None): X = csc_array(X) assert isinstance(X, (np.ndarray, csc_array)) - if self._n_pos is None: - if issparse(X): - raise ValueError( - "`n_components` is None, but `X` is a sparse matrix. None is only " - "supported for dense matrices." - ) - elif (self._n_pos + self._n_neg) > X.shape[1]: - n_comp = self._n_pos + self._n_neg - n_comp_max = min(n, d) - raise ValueError( - f"Requested {n_comp} components but given `X` at most {n_comp_max} " - "can be calculated." - ) if issparse(X): self.mean_ = None @@ -230,44 +221,41 @@ def remove_zero_eigenvalues( H = (X.T @ (W + W.T) @ X) / (2 * n) # TODO handle sparse based on density? if issparse(H): - # TODO fix can't return all eigenvalues of sparse matrix - # TODO check that number of eigenvalues does not exceed d - if self._n_pos is None: - raise ValueError( - "`n_components` is None, but `X` is a sparse matrix. None is only " - "supported for dense matrices." - ) - elif self._n_pos == 0: - eig_val, eig_vec = sparse_linalg.eigsh(H, k=self._n_neg, which="SA") - elif self._n_neg == 0: - eig_val, eig_vec = sparse_linalg.eigsh(H, k=self._n_pos, which="LA") - else: - n_comp = 2 * max(self._n_neg, self._n_pos) - eig_val, eig_vec = sparse_linalg.eigsh(H, k=n_comp, which="BE") - component_indices = self._get_component_indices( - n_comp, self._n_pos, self._n_neg - ) - eig_val = eig_val[component_indices] - eig_vec = eig_vec[:, component_indices] + match self._n_components: + case None: + eig_val, eig_vec = sparse_linalg.eigsh( + H, k=min(n, d) - 1, which="LM" + ) + case (n_pos, 0) | int(n_pos): + eig_val, eig_vec = sparse_linalg.eigsh(H, k=n_pos, which="LA") + case (0, n_neg): + eig_val, eig_vec = sparse_linalg.eigsh(H, k=n_neg, which="SA") + case (n_pos, n_neg): + n_comp = max(2 * max(n_neg, n_pos), max(n, d)) + eig_val, eig_vec = sparse_linalg.eigsh(H, k=n_comp, which="BE") + component_indices = self._get_component_indices( + n_comp, n_pos, n_neg + ) + eig_val = eig_val[component_indices] + eig_vec = eig_vec[:, component_indices] else: - if self._n_pos is None: - eig_val, eig_vec = linalg.eigh(H) - if n < d: - eig_val, eig_vec = remove_zero_eigenvalues(eig_val, eig_vec, n) - elif self._n_pos == 0: - eig_val, eig_vec = linalg.eigh(H, subset_by_index=[0, self._n_neg]) - elif self._n_neg == 0: - eig_val, eig_vec = linalg.eigh( - H, subset_by_index=[d - self._n_pos, d - 1] - ) - else: - eig_val, eig_vec = linalg.eigh(H) - component_indices = self._get_component_indices( - d, self._n_pos, self._n_neg - ) - eig_val = eig_val[component_indices] - eig_vec = eig_vec[:, component_indices] + match self._n_components: + case None: + eig_val, eig_vec = linalg.eigh(H) + if n < d: + eig_val, eig_vec = remove_zero_eigenvalues(eig_val, eig_vec, n) + case (n_pos, 0) | int(n_pos): + eig_val, eig_vec = linalg.eigh( + H, subset_by_index=[d - n_pos, d - 1] + ) + case (0, n_neg): + eig_val, eig_vec = linalg.eigh(H, subset_by_index=[0, n_neg]) + case (n_pos, n_neg): + eig_val, eig_vec = linalg.eigh(H) + component_indices = self._get_component_indices(d, n_pos, n_neg) + eig_val = eig_val[component_indices] + eig_vec = eig_vec[:, component_indices] return np.flip(eig_val), np.flipud(eig_vec.T) From 26ae02398a3a59e3befc42a58de5a2955ef50fa0 Mon Sep 17 00:00:00 2001 From: niklasmubeoe Date: Fri, 3 May 2024 12:57:25 +0200 Subject: [PATCH 12/18] remove unnecessary conversion of X --- multispaeti/_multispati_pca.py | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/multispaeti/_multispati_pca.py b/multispaeti/_multispati_pca.py index 2ad7ecf..8d78e4b 100644 --- a/multispaeti/_multispati_pca.py +++ b/multispaeti/_multispati_pca.py @@ -175,12 +175,6 @@ def fit(self, X: _X, y: None = None): self._validate_n_components(n, d) self.W_ = normalize(W, norm="l1") - assert isinstance(self.W_, csr_array) - - if issparse(X): - X = csc_array(X) - - assert isinstance(X, (np.ndarray, csc_array)) if issparse(X): self.mean_ = None @@ -286,9 +280,8 @@ def transform(self, X: _X) -> np.ndarray: """ check_is_fitted(self) X = check_array(X, accept_sparse=["csr", "csc"]) - if self.mean_ is not None: - if not issparse(X): - X = X - self.mean_ + if self.mean_ is not None and not issparse(X): + X = X - self.mean_ return X @ self.components_.T def transform_spatial_lag(self, X: _X) -> np.ndarray: From 10b415c383231ef47bbfa03733ec92dd6fb862c5 Mon Sep 17 00:00:00 2001 From: niklasmubeoe Date: Fri, 3 May 2024 12:58:35 +0200 Subject: [PATCH 13/18] csr or ndarray as connectivity --- multispaeti/_multispati_pca.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/multispaeti/_multispati_pca.py b/multispaeti/_multispati_pca.py index 8d78e4b..a6826fa 100644 --- a/multispaeti/_multispati_pca.py +++ b/multispaeti/_multispati_pca.py @@ -6,7 +6,6 @@ from scipy import linalg from scipy.sparse import csc_array, csc_matrix, csr_array, csr_matrix, eye, issparse from scipy.sparse import linalg as sparse_linalg -from scipy.sparse import sparray, spmatrix from sklearn.base import ( BaseEstimator, ClassNamePrefixFeaturesOutMixin, @@ -22,6 +21,7 @@ _Csr: TypeAlias = csr_array | csr_matrix _Csc: TypeAlias = csc_array | csc_matrix _X: TypeAlias = np.ndarray | _Csr | _Csc +_Connectivity: TypeAlias = np.ndarray | _Csr class MultispatiPCA(ClassNamePrefixFeaturesOutMixin, TransformerMixin, BaseEstimator): @@ -93,13 +93,13 @@ def __init__( self, n_components: int | tuple[int, int] | None = None, *, - connectivity: sparray | spmatrix | None = None, + connectivity: _Connectivity | None = None, ): self.n_components = n_components self.connectivity = connectivity @staticmethod - def _validate_connectivity(W: csr_array, n: int): + def _validate_connectivity(W: _Connectivity, n: int): if W.shape[0] != W.shape[1]: raise ValueError("`connectivity` must be square") if W.shape[0] != n: @@ -199,7 +199,7 @@ def fit(self, X: _X, y: None = None): return self def _multispati_eigendecomposition( - self, X: _X, W: _Csr + self, X: _X, W: _Connectivity ) -> tuple[NDArray[np.float64], NDArray[np.float64]]: # X: observations x features # W: row-wise definition of neighbors, row-sums should be 1 From c1664c3823ce401ac79d548df7e238e2278210c1 Mon Sep 17 00:00:00 2001 From: niklasmubeoe Date: Fri, 3 May 2024 15:22:32 +0200 Subject: [PATCH 14/18] bugfix in n_component limits --- multispaeti/_multispati_pca.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/multispaeti/_multispati_pca.py b/multispaeti/_multispati_pca.py index a6826fa..e7ca2d2 100644 --- a/multispaeti/_multispati_pca.py +++ b/multispaeti/_multispati_pca.py @@ -225,7 +225,7 @@ def remove_zero_eigenvalues( case (0, n_neg): eig_val, eig_vec = sparse_linalg.eigsh(H, k=n_neg, which="SA") case (n_pos, n_neg): - n_comp = max(2 * max(n_neg, n_pos), max(n, d)) + n_comp = min(2 * max(n_neg, n_pos), min(n, d)) eig_val, eig_vec = sparse_linalg.eigsh(H, k=n_comp, which="BE") component_indices = self._get_component_indices( n_comp, n_pos, n_neg From b29d727a8ee2dd896aea96a9b503b258d6e95c0f Mon Sep 17 00:00:00 2001 From: niklasmubeoe Date: Fri, 3 May 2024 15:50:43 +0200 Subject: [PATCH 15/18] use sparse linalg for moran bounds --- multispaeti/_multispati_pca.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/multispaeti/_multispati_pca.py b/multispaeti/_multispati_pca.py index e7ca2d2..65b738c 100644 --- a/multispaeti/_multispati_pca.py +++ b/multispaeti/_multispati_pca.py @@ -362,12 +362,9 @@ def double_center(W: np.ndarray | csr_array) -> np.ndarray: if not issparse(W) or not sparse_approx: W = double_center(W) - if issparse(W): - eigen_values = s * sparse_linalg.eigsh( - W, k=2, which="BE", return_eigenvectors=False - ) - else: - eigen_values = s * linalg.eigvalsh(W, overwrite_a=True) + eigen_values = s * sparse_linalg.eigsh( + W, k=2, which="BE", return_eigenvectors=False + ) I_0 = -1 / (n_sample - 1) I_min = min(eigen_values) From acab6986614c623d4e0d36e2880f1cd22c8799db Mon Sep 17 00:00:00 2001 From: niklasmubeoe Date: Mon, 6 May 2024 10:23:01 +0200 Subject: [PATCH 16/18] implement fit_transform for efficiency --- docs/source/usage.rst | 3 ++- multispaeti/_multispati_pca.py | 33 +++++++++++++++++++++++++++++---- 2 files changed, 31 insertions(+), 5 deletions(-) diff --git a/docs/source/usage.rst b/docs/source/usage.rst index 7b95128..7b33cb2 100644 --- a/docs/source/usage.rst +++ b/docs/source/usage.rst @@ -39,7 +39,8 @@ As for e.g. :py:class:`sklearn.decomposition.PCA` we first need to Alternatively, this can be achieved in one step by -:py:meth:`multispaeti.MultispatiPCA.fit_transform` +:py:meth:`multispaeti.MultispatiPCA.fit_transform` which is also more performant and +avoids redundant computation .. code-block:: python diff --git a/multispaeti/_multispati_pca.py b/multispaeti/_multispati_pca.py index 65b738c..affae08 100644 --- a/multispaeti/_multispati_pca.py +++ b/multispaeti/_multispati_pca.py @@ -156,6 +156,10 @@ def fit(self, X: _X, y: None = None): If `n_components` has the wrong type or is negative. If `connectivity` is not a square matrix. """ + self._fit(X) + return self + + def _fit(self, X: _X, *, return_transform: bool = False) -> np.ndarray | None: X = check_array(X, accept_sparse=["csr", "csc"]) if self.connectivity is None: @@ -192,11 +196,11 @@ def fit(self, X: _X, y: None = None): self._n_features_out = self.n_components_ self.n_features_in_ = d - self.variance_, self.moransI_ = self._variance_moransI_decomposition( - X_centered @ self.components_.T - ) + X_tr = X_centered @ self.components_.T + self.variance_, self.moransI_ = self._variance_moransI_decomposition(X_tr) - return self + if return_transform: + return X_tr def _multispati_eigendecomposition( self, X: _X, W: _Connectivity @@ -284,6 +288,27 @@ def transform(self, X: _X) -> np.ndarray: X = X - self.mean_ return X @ self.components_.T + def fit_transform(self, X: _X, y: None = None) -> np.ndarray: + """ + Fit and transform the data using MULTISPATI-PCA projection. + + See :py:meth:`multispaeti.MultispatiPCA` for more information. + + Parameters + ---------- + X : numpy.ndarray or scipy.sparse.csr_array or scipy.sparse.csc_array + Array of observations x features. + y : None + Ignored. scikit-learn compatibility only. + + Returns + ------- + numpy.ndarray + """ + X_tr = self._fit(X, return_transform=True) + assert isinstance(X_tr, np.ndarray) + return X_tr + def transform_spatial_lag(self, X: _X) -> np.ndarray: """ Transform the data using fitted MULTISPATI-PCA projection and calculate the From df00a02b49d7f7e5b26cda729b60afdaaeb9ac7f Mon Sep 17 00:00:00 2001 From: niklasmueboe Date: Mon, 10 Jun 2024 12:52:05 +0200 Subject: [PATCH 17/18] doc update --- docs/source/usage.rst | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/docs/source/usage.rst b/docs/source/usage.rst index 7b33cb2..8957234 100644 --- a/docs/source/usage.rst +++ b/docs/source/usage.rst @@ -39,8 +39,7 @@ As for e.g. :py:class:`sklearn.decomposition.PCA` we first need to Alternatively, this can be achieved in one step by -:py:meth:`multispaeti.MultispatiPCA.fit_transform` which is also more performant and -avoids redundant computation +:py:meth:`multispaeti.MultispatiPCA.fit_transform` which avoids redundant computation. .. code-block:: python From 354bd981db5bad0888cabe5f817dcb402406c5d4 Mon Sep 17 00:00:00 2001 From: niklasmueboe Date: Mon, 10 Jun 2024 14:57:20 +0200 Subject: [PATCH 18/18] option to center sparse X --- multispaeti/_multispati_pca.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/multispaeti/_multispati_pca.py b/multispaeti/_multispati_pca.py index affae08..34c4dea 100644 --- a/multispaeti/_multispati_pca.py +++ b/multispaeti/_multispati_pca.py @@ -51,6 +51,10 @@ class MultispatiPCA(ClassNamePrefixFeaturesOutMixin, TransformerMixin, BaseEstim c\ :sub:`ij` should be larger if i and j are close. A distance matrix should be transformed to connectivities by e.g. calculating :math:`1-d/d_{max}` beforehand. + center_sparse : bool + Whether to center `X` if it is a sparse array. By default sparse `X` will not be + centered as this requires transforming it to a dense array, potentially raising + out-of-memory errors. Attributes ---------- @@ -94,9 +98,11 @@ def __init__( n_components: int | tuple[int, int] | None = None, *, connectivity: _Connectivity | None = None, + center_sparse: bool = False, ): self.n_components = n_components self.connectivity = connectivity + self.center_sparse = center_sparse @staticmethod def _validate_connectivity(W: _Connectivity, n: int): @@ -180,6 +186,10 @@ def _fit(self, X: _X, *, return_transform: bool = False) -> np.ndarray | None: self.W_ = normalize(W, norm="l1") + if self.center_sparse and issparse(X): + assert isinstance(X, (csr_array, csr_matrix, csc_array, csc_matrix)) + X = X.toarray() + if issparse(X): self.mean_ = None X_centered = X