From 27da140566ee78e984d431eb4f5928d7e6f055c9 Mon Sep 17 00:00:00 2001
From: niklasmubeoe <sputnik.mb@web.de>
Date: Wed, 17 Apr 2024 10:18:51 +0200
Subject: [PATCH 01/18] inherit from BaseEstimator and TransformerMixin

---
 multispaeti/_multispati_pca.py | 123 ++++++++++++++++-----------------
 1 file changed, 58 insertions(+), 65 deletions(-)

diff --git a/multispaeti/_multispati_pca.py b/multispaeti/_multispati_pca.py
index 3f0c97b..23c9520 100644
--- a/multispaeti/_multispati_pca.py
+++ b/multispaeti/_multispati_pca.py
@@ -7,7 +7,10 @@
 from scipy.sparse import csc_array, csc_matrix, csr_array, csr_matrix, issparse
 from scipy.sparse import linalg as sparse_linalg
 from scipy.sparse import sparray, spmatrix
+from sklearn.base import BaseEstimator, TransformerMixin
 from sklearn.preprocessing import scale
+from sklearn.utils.sparsefuncs_fast import inplace_csr_row_normalize_l1
+from sklearn.utils.validation import check_array, check_is_fitted
 
 T = TypeVar("T", bound=np.number)
 U = TypeVar("U", bound=np.number)
@@ -18,7 +21,12 @@
 _X: TypeAlias = np.ndarray | _Csr | _Csc
 
 
-class MultispatiPCA:
+# TODO: should pass the following check
+# from sklearn.utils.estimator_checks import check_estimator
+# check_estimator(MultispatiPCA())
+
+
+class MultispatiPCA(TransformerMixin, BaseEstimator):
     """
     MULTISPATI-PCA
 
@@ -46,28 +54,21 @@ class MultispatiPCA:
         A distance matrix should be transformed to connectivities by e.g.
         calculating :math:`1-d/d_{max}` beforehand.
 
-    Raises
-    ------
-    ValueError
-        If connectivity is not a square matrix.
-    ZeroDivisionError
-        If one of the observations has no neighbors.
-
     Attributes
     ----------
     components_ : numpy.ndarray
         The estimated components: Array of shape `(n_components, n_features)`.
 
+    eigenvalues_ : numpy.ndarray
+        The eigenvalues corresponding to each of the selected components. Array of shape
+        `(n_components,)`.
+
     variance_ : numpy.ndarray
         The estimated variance part of the eigenvalues. Array of shape `(n_components,)`.
 
     moransI_ : numpy.ndarray
         The estimated Moran's I part of the eigenvalues. Array of shape `(n_components,)`.
 
-    eigenvalues_ : numpy.ndarray
-        The eigenvalues corresponding to each of the selected components. Array of shape
-        `(n_components,)`.
-
     n_components_ : int
         The estimated number of components.
 
@@ -92,17 +93,22 @@ def __init__(
         self,
         n_components: int | tuple[int, int] | None = None,
         *,
-        connectivity: sparray | spmatrix,
+        connectivity: sparray | spmatrix | None = None,
     ):
-        self._fitted = False
-        W = csr_array(connectivity)
+        self.n_components = n_components
+        self.connectivity = connectivity
+
+    @staticmethod
+    def _validate_connectivity(W: csr_array, n: int):
         if W.shape[0] != W.shape[1]:
             raise ValueError("`connectivity` must be square")
-        self.W = self._normalize_connectivities(W)
-
-        n = self.W.shape[0]
+        if W.shape[0] != n:
+            raise ValueError(
+                "#rows in `X` must be the same as dimensions of `connectivity`"
+            )
 
-        self.n_components = n_components
+    def _validate_n_components(self, n: int):
+        n_components = self.n_components
 
         self._n_neg = 0
         if n_components is None:
@@ -127,13 +133,7 @@ def __init__(
             else:
                 raise ValueError("`n_components` must be None, int or (int, int)")
 
-    @staticmethod
-    def _normalize_connectivities(W: csr_array) -> csr_array:
-        # normalize rowsums to 1 for better interpretability
-        # TODO can't handle points without neighbors because of division by zero
-        return W.multiply(1 / W.sum(axis=1)[:, np.newaxis])
-
-    def fit(self, X: _X):
+    def fit(self, X: _X, y: None = None):
         """
         Fit MULTISPATI-PCA projection.
 
@@ -141,6 +141,8 @@ def fit(self, X: _X):
         ----------
         X : numpy.ndarray or scipy.sparse.csr_array or scipy.sparse.csc_array
             Array of observations x features.
+        y : None
+            Ignored. scikit-learn compatibility only.
 
         Raises
         ------
@@ -148,15 +150,28 @@ def fit(self, X: _X):
             If `X` has not the same number of rows like `connectivity`.
             If `n_components` is None and `X` is sparse.
             If (sum of) `n_components` is larger than the smaller dimension of `X`.
+        ValueError
+            If connectivity is not a square matrix.
+        ZeroDivisionError
+            If one of the observations has no neighbors.
         """
+
+        X = check_array(X)
+        self.W = check_array(
+            self.connectivity, accept_sparse="csr", dtype=float
+        )  # TODO float32, float64?
+
+        n, d = X.shape
+
+        self._validate_connectivity(self.W, n)
+        self._validate_n_components(n)
+
+        inplace_csr_row_normalize_l1(self.W)
+
         if issparse(X):
             X = csc_array(X)
 
         assert isinstance(X, (np.ndarray, csc_array))
-        if X.shape[0] != self.W.shape[0]:
-            raise ValueError(
-                "#rows in `X` must be the same as dimensions of `connectivity`"
-            )
         if self._n_pos is None:
             if issparse(X):
                 raise ValueError(
@@ -164,7 +179,6 @@ def fit(self, X: _X):
                     "supported for dense matrices."
                 )
         elif (self._n_pos + self._n_neg) > X.shape[1]:
-            n, d = X.shape
             n_comp = self._n_pos + self._n_neg
             n_comp_max = min(n, d)
             raise ValueError(
@@ -180,8 +194,13 @@ def fit(self, X: _X):
         self.components_ = eig_vec
         self.eigenvalues_ = eig_val
         self.n_components_ = eig_val.size
-        self.n_features_in_ = X.shape[1]
-        self._fitted = True
+        self.n_features_in_ = d
+
+        self.variance_, self.moransI_ = self._variance_moransI_decomposition(
+            X @ self.components_
+        )
+
+        return self
 
     def _multispati_eigendecomposition(
         self, X: _X, W: _Csr
@@ -266,31 +285,10 @@ def transform(self, X: _X) -> np.ndarray:
         ValueError
             If instance has not been fitted.
         """
+        check_is_fitted(self)
         # Data must be scaled, avoid mean-centering for sparse
-        if not self._fitted:
-            self._not_fitted()
-
         X = scale(X, with_mean=not issparse(X))
-        X_t = X @ self.components_
-        self.variance_, self.moransI_ = self._variance_moransI_decomposition(X_t)
-
-        return X_t
-
-    def fit_transform(self, X: _X) -> np.ndarray:
-        """
-        Fit the MULTISPATI-PCA projection and transform the data.
-
-        Parameters
-        ----------
-        X : numpy.ndarray or scipy.sparse.csr_array or scipy.sparse.csc_array
-            Array of observations x features.
-
-        Returns
-        -------
-        numpy.ndarray
-        """
-        self.fit(X)
-        return self.transform(X)
+        return X @ self.components_
 
     def transform_spatial_lag(self, X: _X) -> np.ndarray:
         """
@@ -311,8 +309,7 @@ def transform_spatial_lag(self, X: _X) -> np.ndarray:
         ValueError
             If instance has not been fitted.
         """
-        if not self._fitted:
-            self._not_fitted()
+        check_is_fitted(self)
         return self._spatial_lag(self.transform(X))
 
     def _spatial_lag(self, X: np.ndarray) -> np.ndarray:
@@ -351,9 +348,11 @@ def moransI_bounds(
 
         # following R package adespatial::moran.bounds
         # sparse approx is following adegenet sPCA as shown in screeplot/summary
-        def double_center(W):
+        def double_center(W: np.ndarray | csr_array) -> np.ndarray:
             if issparse(W):
+                assert isinstance(W, csr_array)
                 W = W.toarray()
+            assert isinstance(W, np.ndarray)
 
             row_means = np.mean(W, axis=1, keepdims=True)
             col_means = np.mean(W, axis=0, keepdims=True) - np.mean(row_means)
@@ -381,9 +380,3 @@ def double_center(W):
         I_max = max(eigen_values)
 
         return I_min, I_max, I_0
-
-    def _not_fitted(self):
-        raise ValueError(
-            "This MultispatiPCA instance is not fitted yet. "
-            "Call 'fit' with appropriate arguments first."
-        )

From 8b12afe70e4b2b4620a2f382b17ec17a8f943f06 Mon Sep 17 00:00:00 2001
From: niklasmubeoe <sputnik.mb@web.de>
Date: Fri, 19 Apr 2024 15:13:39 +0200
Subject: [PATCH 02/18] fully compatible with sklearn, test compatibility

---
 docs/source/api.rst            | 10 +++---
 docs/source/usage.rst          | 15 +++++++--
 multispaeti/_multispati_pca.py | 57 +++++++++++++++++++---------------
 multispaeti/_plotting.py       | 26 +++++++++++-----
 pyproject.toml                 |  5 +++
 tests/test_multispati_pca.py   | 11 +++++++
 6 files changed, 84 insertions(+), 40 deletions(-)
 create mode 100644 tests/test_multispati_pca.py

diff --git a/docs/source/api.rst b/docs/source/api.rst
index 0d79833..cc20154 100644
--- a/docs/source/api.rst
+++ b/docs/source/api.rst
@@ -2,11 +2,11 @@ API
 ===
 
 
-The API for :py:class:`multispaeti.MultispatiPCA` tries to follow the design in
-`scikit-learn <https://scikit-learn.org/>`_. The plan is to make it fully compatible in
-the future by inheriting from :py:class:`sklearn.base.BaseEstimator` and
-:py:class:`sklearn.base.TransformerMixin`.
-
+The API for :py:class:`multispaeti.MultispatiPCA` follows the design in
+`scikit-learn <https://scikit-learn.org/>`_. It inherits from
+:py:class:`sklearn.base.BaseEstimator`, :py:class:`sklearn.base.TransformerMixin`, and
+:py:class:`sklearn.base.ClassNamePrefixFeaturesOutMixin` making it fully compatible
+with `scikit-learn` pipelines.
 
 
 .. currentmodule:: multispaeti
diff --git a/docs/source/usage.rst b/docs/source/usage.rst
index 3a15088..790d4b1 100644
--- a/docs/source/usage.rst
+++ b/docs/source/usage.rst
@@ -34,15 +34,24 @@ As for e.g. :py:class:`sklearn.decomposition.PCA` we first need to
 
 .. code-block:: python
 
-    msPCA.fit(X)
-    X_transformed = msPCA.transform(X)
+    from sklearn.preprocessing import scale
+
+    X_scaled = scale(X)
+    msPCA.fit(X_scaled)
+    X_transformed = msPCA.transform(X_scaled)
+
+.. note::
+
+    MultispatiPCA expects its input to be standardized by e.g. using
+    :py:func:`sklearn.decomposition.scale` before passing it to fit/transform.
+
 
 Alternatively, this can be achieved in one step by
 :py:meth:`multispaeti.MultispatiPCA.fit_transform`
 
 .. code-block:: python
 
-    X_transformed = msPCA.fit_transform(X)
+    X_transformed = msPCA.fit_transform(X_scaled)
 
 .. Additional, functionality is offered through the method
 .. :py:meth:`multispaeti.MultispatiPCA.moransI_bounds` which calculates the minimum and
diff --git a/multispaeti/_multispati_pca.py b/multispaeti/_multispati_pca.py
index 23c9520..6c9c737 100644
--- a/multispaeti/_multispati_pca.py
+++ b/multispaeti/_multispati_pca.py
@@ -4,12 +4,15 @@
 import numpy as np
 from numpy.typing import NDArray
 from scipy import linalg
-from scipy.sparse import csc_array, csc_matrix, csr_array, csr_matrix, issparse
+from scipy.sparse import csc_array, csc_matrix, csr_array, csr_matrix, eye, issparse
 from scipy.sparse import linalg as sparse_linalg
 from scipy.sparse import sparray, spmatrix
-from sklearn.base import BaseEstimator, TransformerMixin
-from sklearn.preprocessing import scale
-from sklearn.utils.sparsefuncs_fast import inplace_csr_row_normalize_l1
+from sklearn.base import (
+    BaseEstimator,
+    ClassNamePrefixFeaturesOutMixin,
+    TransformerMixin,
+)
+from sklearn.preprocessing import normalize
 from sklearn.utils.validation import check_array, check_is_fitted
 
 T = TypeVar("T", bound=np.number)
@@ -26,7 +29,7 @@
 # check_estimator(MultispatiPCA())
 
 
-class MultispatiPCA(TransformerMixin, BaseEstimator):
+class MultispatiPCA(ClassNamePrefixFeaturesOutMixin, TransformerMixin, BaseEstimator):
     """
     MULTISPATI-PCA
 
@@ -87,8 +90,6 @@ class MultispatiPCA(TransformerMixin, BaseEstimator):
     <https://onlinelibrary.wiley.com/doi/abs/10.3170/2007-8-18312>`_
     """
 
-    # TODO, should scaling be part of multispati
-
     def __init__(
         self,
         n_components: int | tuple[int, int] | None = None,
@@ -157,16 +158,24 @@ def fit(self, X: _X, y: None = None):
         """
 
         X = check_array(X)
-        self.W = check_array(
-            self.connectivity, accept_sparse="csr", dtype=float
-        )  # TODO float32, float64?
+        if self.connectivity is None:
+            warnings.warn(
+                "`connectivity` has not been set. Defaulting to identity matrix "
+                "which will conceptually compute a standard PCA. "
+                "It is not recommended to not set `connectivity`."
+            )
+            W = csr_array(eye(X.shape[0]))
+        else:
+            W = self.connectivity
+        W = check_array(W, accept_sparse="csr")
 
         n, d = X.shape
 
-        self._validate_connectivity(self.W, n)
+        self._validate_connectivity(W, n)
         self._validate_n_components(n)
 
-        inplace_csr_row_normalize_l1(self.W)
+        self.W_ = normalize(W, norm="l1")
+        assert isinstance(self.W_, csr_array)
 
         if issparse(X):
             X = csc_array(X)
@@ -186,14 +195,12 @@ def fit(self, X: _X, y: None = None):
                 "can be calculated."
             )
 
-        # Data must be scaled, avoid mean-centering for sparse
-        X = scale(X, with_mean=not issparse(X))
-
-        eig_val, eig_vec = self._multispati_eigendecomposition(X, self.W)
+        eig_val, eig_vec = self._multispati_eigendecomposition(X, self.W_)
 
         self.components_ = eig_vec
         self.eigenvalues_ = eig_val
         self.n_components_ = eig_val.size
+        self._n_features_out = self.n_components_
         self.n_features_in_ = d
 
         self.variance_, self.moransI_ = self._variance_moransI_decomposition(
@@ -286,8 +293,7 @@ def transform(self, X: _X) -> np.ndarray:
             If instance has not been fitted.
         """
         check_is_fitted(self)
-        # Data must be scaled, avoid mean-centering for sparse
-        X = scale(X, with_mean=not issparse(X))
+        X = check_array(X)
         return X @ self.components_
 
     def transform_spatial_lag(self, X: _X) -> np.ndarray:
@@ -313,19 +319,19 @@ def transform_spatial_lag(self, X: _X) -> np.ndarray:
         return self._spatial_lag(self.transform(X))
 
     def _spatial_lag(self, X: np.ndarray) -> np.ndarray:
-        return self.W @ X
+        return self.W_ @ X
 
     def _variance_moransI_decomposition(
-        self, X_t: np.ndarray
+        self, X_tr: np.ndarray
     ) -> tuple[np.ndarray, np.ndarray]:
-        lag = self._spatial_lag(X_t)
+        lag = self._spatial_lag(X_tr)
 
         # vector of row_Weights from dudi.PCA
         # (we only use default row_weights i.e. 1/n)
-        w = 1 / X_t.shape[0]
+        w = 1 / X_tr.shape[0]
 
-        variance = np.sum(X_t * X_t * w, axis=0)
-        moran = np.sum(X_t * lag * w, axis=0) / variance
+        variance = np.sum(X_tr * X_tr * w, axis=0)
+        moran = np.sum(X_tr * lag * w, axis=0) / variance
 
         return variance, moran
 
@@ -340,6 +346,7 @@ def moransI_bounds(
         ----------
         sparse_approx : bool
             Only applicable if `connectivity` is sparse.
+
         Returns
         -------
         tuple[float, float, float]
@@ -360,7 +367,7 @@ def double_center(W: np.ndarray | csr_array) -> np.ndarray:
             return W - row_means - col_means
 
         # ensure symmetry
-        W = 0.5 * (self.W + self.W.T)
+        W = 0.5 * (self.W_ + self.W_.T)
 
         n_sample = W.shape[0]
         s = n_sample / np.sum(W)  # 1 if original W has rowSums or colSums of 1
diff --git a/multispaeti/_plotting.py b/multispaeti/_plotting.py
index a36c37e..e71f520 100644
--- a/multispaeti/_plotting.py
+++ b/multispaeti/_plotting.py
@@ -2,6 +2,7 @@
 import numpy as np
 from matplotlib.figure import Figure
 from matplotlib.gridspec import GridSpec
+from sklearn.utils.validation import check_is_fitted
 
 from ._multispati_pca import MultispatiPCA
 
@@ -13,16 +14,21 @@ def plot_eigenvalues(msPCA: MultispatiPCA, *, n_top: int | None = None) -> Figur
     Parameters
     ----------
     msPCA : MultispatiPCA
-        An instance of MultispatiPCA that has already been used for
-        :py:meth:`multispaeti.MultispatiPCA.fit` so that eigenvalues have already been
-        calculated.
+        An instance of MultispatiPCA that has been fitted so that the eigenvalues
+        have been calculated.
     n_top : int, optional
         Plot the `n_top` highest and `n_top` lowest eigenvalues in a zoomed in view.
 
     Returns
     -------
     matplotlib.figure.Figure
+
+    Raises
+    ------
+    sklearn.exceptions.NotFittedError
+        If the MultispatiPCA has not been fitted.
     """
+    check_is_fitted(msPCA)
     eigenvalues = msPCA.eigenvalues_
 
     x_lbl, y_lbl = "Component", "Eigenvalue"
@@ -66,17 +72,23 @@ def plot_variance_moransI_decomposition(
     Parameters
     ----------
     msPCA : multispaeti.MultispatiPCA
-        An instance of MultispatiPCA that has already been used for
-        :py:meth:`multispaeti.MultispatiPCA.transform` so that variance and Moran's I
-        contributions to the eigenvalues have already been calculated.
+        An instance of MultispatiPCA that has been fitted so that variance and Moran's I
+        contributions to the eigenvalues have been calculated.
     sparse_approx : bool
         Whether to use a sparse approximation to calculate the decomposition.
+    kwargs
+        Other keyword arguments are passed to :py:func:`matplotlib.pyplot.scatter`
 
     Returns
     -------
     matplotlib.figure.Figure
-    """
 
+    Raises
+    ------
+    sklearn.exceptions.NotFittedError
+        If the MultispatiPCA has not been fitted.
+    """
+    check_is_fitted(msPCA)
     I_min, I_max, I_0 = msPCA.moransI_bounds(sparse_approx=sparse_approx)
 
     fig, ax = plt.subplots(1)
diff --git a/pyproject.toml b/pyproject.toml
index 8c360e3..7982b21 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -26,6 +26,7 @@ classifiers = [
 ]
 
 [project.optional-dependencies]
+test = ["pytest"]
 docs = ["sphinx", "sphinx-copybutton", "sphinx-rtd-theme"]
 dev  = ["multispaeti[docs]", "pre-commit"]
 
@@ -56,3 +57,7 @@ target-version = "py310"
 ignore_missing_imports = true
 warn_no_return         = false
 packages               = "multispaeti"
+
+[tool.pytest.ini_options]
+addopts    = ["--import-mode=importlib"]
+pythonpath = "multispaeti"
diff --git a/tests/test_multispati_pca.py b/tests/test_multispati_pca.py
new file mode 100644
index 0000000..095a60d
--- /dev/null
+++ b/tests/test_multispati_pca.py
@@ -0,0 +1,11 @@
+import pytest
+from sklearn.utils.estimator_checks import check_estimator
+
+from multispaeti import MultispatiPCA
+
+
+def test_sklearn_compatibility():
+    try:
+        check_estimator(MultispatiPCA())
+    except AssertionError:
+        pytest.fail("Failed check_estimator() test of sklearn.")

From 80c34dabd3e1c2330ca65287cab4ecbc5ebbaaf1 Mon Sep 17 00:00:00 2001
From: niklasmubeoe <sputnik.mb@web.de>
Date: Fri, 19 Apr 2024 15:18:35 +0200
Subject: [PATCH 03/18] fix reference

---
 docs/source/usage.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/source/usage.rst b/docs/source/usage.rst
index 790d4b1..5650232 100644
--- a/docs/source/usage.rst
+++ b/docs/source/usage.rst
@@ -43,7 +43,7 @@ As for e.g. :py:class:`sklearn.decomposition.PCA` we first need to
 .. note::
 
     MultispatiPCA expects its input to be standardized by e.g. using
-    :py:func:`sklearn.decomposition.scale` before passing it to fit/transform.
+    :py:func:`sklearn.preprocessing.scale` before passing it to fit/transform.
 
 
 Alternatively, this can be achieved in one step by

From 521fedf0021b4bb01741343880dac8763a36095c Mon Sep 17 00:00:00 2001
From: niklasmubeoe <sputnik.mb@web.de>
Date: Fri, 19 Apr 2024 15:33:46 +0200
Subject: [PATCH 04/18] add test to dev optional deps

---
 pyproject.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pyproject.toml b/pyproject.toml
index 7982b21..8409c3d 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -28,7 +28,7 @@ classifiers = [
 [project.optional-dependencies]
 test = ["pytest"]
 docs = ["sphinx", "sphinx-copybutton", "sphinx-rtd-theme"]
-dev  = ["multispaeti[docs]", "pre-commit"]
+dev  = ["multispaeti[test,docs]", "pre-commit"]
 
 [project.urls]
 homepage      = "https://github.com/HiDiHlabs/multiSPAETI"

From f67969b9ac98b752a473a719669ffa3ece4bf783 Mon Sep 17 00:00:00 2001
From: niklasmubeoe <sputnik.mb@web.de>
Date: Mon, 22 Apr 2024 09:49:26 +0200
Subject: [PATCH 05/18] correct error documentation

---
 multispaeti/_multispati_pca.py | 11 ++++-------
 1 file changed, 4 insertions(+), 7 deletions(-)

diff --git a/multispaeti/_multispati_pca.py b/multispaeti/_multispati_pca.py
index 6c9c737..f38b4b9 100644
--- a/multispaeti/_multispati_pca.py
+++ b/multispaeti/_multispati_pca.py
@@ -148,13 +148,10 @@ def fit(self, X: _X, y: None = None):
         Raises
         ------
         ValueError
-            If `X` has not the same number of rows like `connectivity`.
+            If `X` does not have the same number of rows as `connectivity`.
             If `n_components` is None and `X` is sparse.
             If (sum of) `n_components` is larger than the smaller dimension of `X`.
-        ValueError
-            If connectivity is not a square matrix.
-        ZeroDivisionError
-            If one of the observations has no neighbors.
+            If `connectivity` is not a square matrix.
         """
 
         X = check_array(X)
@@ -289,7 +286,7 @@ def transform(self, X: _X) -> np.ndarray:
 
         Raises
         ------
-        ValueError
+        sklearn.exceptions.NotFittedError
             If instance has not been fitted.
         """
         check_is_fitted(self)
@@ -312,7 +309,7 @@ def transform_spatial_lag(self, X: _X) -> np.ndarray:
 
         Raises
         ------
-        ValueError
+        sklearn.exceptions.NotFittedError
             If instance has not been fitted.
         """
         check_is_fitted(self)

From 1cf30be7e73d880964082ea3dc91115411e42bba Mon Sep 17 00:00:00 2001
From: niklasmubeoe <sputnik.mb@web.de>
Date: Mon, 22 Apr 2024 14:57:14 +0200
Subject: [PATCH 06/18] components in same shape as sklearn

---
 multispaeti/_multispati_pca.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/multispaeti/_multispati_pca.py b/multispaeti/_multispati_pca.py
index f38b4b9..552fa89 100644
--- a/multispaeti/_multispati_pca.py
+++ b/multispaeti/_multispati_pca.py
@@ -201,7 +201,7 @@ def fit(self, X: _X, y: None = None):
         self.n_features_in_ = d
 
         self.variance_, self.moransI_ = self._variance_moransI_decomposition(
-            X @ self.components_
+            X @ self.components_.T
         )
 
         return self
@@ -262,7 +262,7 @@ def remove_zero_eigenvalues(
                 eig_val = eig_val[component_indices]
                 eig_vec = eig_vec[:, component_indices]
 
-        return np.flip(eig_val), np.fliplr(eig_vec)
+        return np.flip(eig_val), np.flipud(eig_vec.T)
 
     @staticmethod
     def _get_component_indices(n: int, n_pos: int, n_neg: int) -> list[int]:
@@ -291,7 +291,7 @@ def transform(self, X: _X) -> np.ndarray:
         """
         check_is_fitted(self)
         X = check_array(X)
-        return X @ self.components_
+        return X @ self.components_.T
 
     def transform_spatial_lag(self, X: _X) -> np.ndarray:
         """

From 5dd9d074927699e851e9bbd5cd52b0fd13892a2b Mon Sep 17 00:00:00 2001
From: niklasmubeoe <sputnik.mb@web.de>
Date: Mon, 29 Apr 2024 11:48:11 +0200
Subject: [PATCH 07/18] center X if dense

---
 multispaeti/_multispati_pca.py | 29 +++++++++++++++++++----------
 1 file changed, 19 insertions(+), 10 deletions(-)

diff --git a/multispaeti/_multispati_pca.py b/multispaeti/_multispati_pca.py
index 552fa89..ee31829 100644
--- a/multispaeti/_multispati_pca.py
+++ b/multispaeti/_multispati_pca.py
@@ -24,11 +24,6 @@
 _X: TypeAlias = np.ndarray | _Csr | _Csc
 
 
-# TODO: should pass the following check
-# from sklearn.utils.estimator_checks import check_estimator
-# check_estimator(MultispatiPCA())
-
-
 class MultispatiPCA(ClassNamePrefixFeaturesOutMixin, TransformerMixin, BaseEstimator):
     """
     MULTISPATI-PCA
@@ -72,6 +67,10 @@ class MultispatiPCA(ClassNamePrefixFeaturesOutMixin, TransformerMixin, BaseEstim
     moransI_ : numpy.ndarray
         The estimated Moran's I part of the eigenvalues. Array of shape `(n_components,)`.
 
+    mean_ : numpy.ndarray or None
+        Per-feature empirical mean, estimated from the training set if `X` is not sparse.
+        Array of shape `(n_features,)`.
+
     n_components_ : int
         The estimated number of components.
 
@@ -192,7 +191,15 @@ def fit(self, X: _X, y: None = None):
                 "can be calculated."
             )
 
-        eig_val, eig_vec = self._multispati_eigendecomposition(X, self.W_)
+        if issparse(X):
+            self.mean_ = None
+            X_centered = X
+        else:
+            self.mean_ = X.mean(axis=0)
+            X_centered = X - self.mean_
+            assert isinstance(X_centered, np.ndarray)
+
+        eig_val, eig_vec = self._multispati_eigendecomposition(X_centered, self.W_)
 
         self.components_ = eig_vec
         self.eigenvalues_ = eig_val
@@ -201,7 +208,7 @@ def fit(self, X: _X, y: None = None):
         self.n_features_in_ = d
 
         self.variance_, self.moransI_ = self._variance_moransI_decomposition(
-            X @ self.components_.T
+            X_centered @ self.components_.T
         )
 
         return self
@@ -209,7 +216,7 @@ def fit(self, X: _X, y: None = None):
     def _multispati_eigendecomposition(
         self, X: _X, W: _Csr
     ) -> tuple[NDArray[np.float64], NDArray[np.float64]]:
-        # X: beads/bin x gene, must be standardized
+        # X: observations x features
         # W: row-wise definition of neighbors, row-sums should be 1
         def remove_zero_eigenvalues(
             eigen_values: NDArray[T], eigen_vectors: NDArray[U], n: int
@@ -291,6 +298,9 @@ def transform(self, X: _X) -> np.ndarray:
         """
         check_is_fitted(self)
         X = check_array(X)
+        if self.mean_ is not None:
+            if not issparse(X):
+                X = X - self.mean_
         return X @ self.components_.T
 
     def transform_spatial_lag(self, X: _X) -> np.ndarray:
@@ -323,8 +333,7 @@ def _variance_moransI_decomposition(
     ) -> tuple[np.ndarray, np.ndarray]:
         lag = self._spatial_lag(X_tr)
 
-        # vector of row_Weights from dudi.PCA
-        # (we only use default row_weights i.e. 1/n)
+        # vector of row_Weights from dudi.PCA (we only use default row_weights i.e. 1/n)
         w = 1 / X_tr.shape[0]
 
         variance = np.sum(X_tr * X_tr * w, axis=0)

From 353185915439bdf2ba7ea6bf42ed03824556269a Mon Sep 17 00:00:00 2001
From: niklasmubeoe <sputnik.mb@web.de>
Date: Fri, 3 May 2024 11:44:16 +0200
Subject: [PATCH 08/18] enable sparse X

---
 multispaeti/_multispati_pca.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/multispaeti/_multispati_pca.py b/multispaeti/_multispati_pca.py
index ee31829..bc04cc1 100644
--- a/multispaeti/_multispati_pca.py
+++ b/multispaeti/_multispati_pca.py
@@ -153,7 +153,7 @@ def fit(self, X: _X, y: None = None):
             If `connectivity` is not a square matrix.
         """
 
-        X = check_array(X)
+        X = check_array(X, accept_sparse=["csr", "csc"])
         if self.connectivity is None:
             warnings.warn(
                 "`connectivity` has not been set. Defaulting to identity matrix "
@@ -297,7 +297,7 @@ def transform(self, X: _X) -> np.ndarray:
             If instance has not been fitted.
         """
         check_is_fitted(self)
-        X = check_array(X)
+        X = check_array(X, accept_sparse=["csr", "csc"])
         if self.mean_ is not None:
             if not issparse(X):
                 X = X - self.mean_

From 352b126dbcb8f1667b97fc4fc99ecc986723d1d2 Mon Sep 17 00:00:00 2001
From: niklasmubeoe <sputnik.mb@web.de>
Date: Fri, 3 May 2024 11:45:30 +0200
Subject: [PATCH 09/18] docs

---
 docs/source/usage.rst | 14 +++-----------
 1 file changed, 3 insertions(+), 11 deletions(-)

diff --git a/docs/source/usage.rst b/docs/source/usage.rst
index 5650232..7b95128 100644
--- a/docs/source/usage.rst
+++ b/docs/source/usage.rst
@@ -34,16 +34,8 @@ As for e.g. :py:class:`sklearn.decomposition.PCA` we first need to
 
 .. code-block:: python
 
-    from sklearn.preprocessing import scale
-
-    X_scaled = scale(X)
-    msPCA.fit(X_scaled)
-    X_transformed = msPCA.transform(X_scaled)
-
-.. note::
-
-    MultispatiPCA expects its input to be standardized by e.g. using
-    :py:func:`sklearn.preprocessing.scale` before passing it to fit/transform.
+    msPCA.fit(X)
+    X_transformed = msPCA.transform(X)
 
 
 Alternatively, this can be achieved in one step by
@@ -51,7 +43,7 @@ Alternatively, this can be achieved in one step by
 
 .. code-block:: python
 
-    X_transformed = msPCA.fit_transform(X_scaled)
+    X_transformed = msPCA.fit_transform(X)
 
 .. Additional, functionality is offered through the method
 .. :py:meth:`multispaeti.MultispatiPCA.moransI_bounds` which calculates the minimum and

From c4b015caa64d859c25163e15c05f6a81e6ee78a0 Mon Sep 17 00:00:00 2001
From: niklasmubeoe <sputnik.mb@web.de>
Date: Fri, 3 May 2024 11:47:13 +0200
Subject: [PATCH 10/18] version compatibility according to scientfic python
 SPEC0

---
 pyproject.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pyproject.toml b/pyproject.toml
index 8409c3d..26a7798 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -14,7 +14,7 @@ dynamic         = ["version"]
 authors = [
     { name = "Niklas Müller-Bötticher", email = "niklas.mueller-boetticher@charite.de" },
 ]
-dependencies = ["matplotlib", "numpy>=1.21", "scikit-learn", "scipy>=1.9"]
+dependencies = ["matplotlib>=3.6", "numpy>=1.23", "scikit-learn>=1.1", "scipy>=1.9"]
 classifiers = [
     "Intended Audience :: Science/Research",
     "License :: OSI Approved :: MIT License",

From a65b0bb6c8e4b1b2204e880f7dcc7d367bfb6b1b Mon Sep 17 00:00:00 2001
From: niklasmubeoe <sputnik.mb@web.de>
Date: Fri, 3 May 2024 12:44:52 +0200
Subject: [PATCH 11/18] fix n_component validation, enable all components for
 sparse matrix

---
 multispaeti/_multispati_pca.py | 132 +++++++++++++++------------------
 1 file changed, 60 insertions(+), 72 deletions(-)

diff --git a/multispaeti/_multispati_pca.py b/multispaeti/_multispati_pca.py
index bc04cc1..2ad7ecf 100644
--- a/multispaeti/_multispati_pca.py
+++ b/multispaeti/_multispati_pca.py
@@ -41,9 +41,9 @@ class MultispatiPCA(ClassNamePrefixFeaturesOutMixin, TransformerMixin, BaseEstim
     ----------
     n_components : int or tuple[int, int], optional
         Number of components to keep.
-        If None, will keep all components (only supported for non-sparse `X`).
+        If None, will keep all components.
         If an int, it will keep the top `n_components`.
-        If a tuple, it will keep the top and bottom `n_components` respectively.
+        If a tuple, it will keep the top and bottom `n_components`, respectively.
     connectivity : scipy.sparse.sparray or scipy.sparse.spmatrix
         Matrix of row-wise neighbor definitions i.e. c\ :sub:`ij` is the connectivity of
         i :math:`\\to` j. The matrix does not have to be symmetric. It can be a
@@ -107,29 +107,34 @@ def _validate_connectivity(W: csr_array, n: int):
                 "#rows in `X` must be the same as dimensions of `connectivity`"
             )
 
-    def _validate_n_components(self, n: int):
-        n_components = self.n_components
+    def _validate_n_components(self, n: int, d: int):
+        self._n_components = self.n_components
 
-        self._n_neg = 0
-        if n_components is None:
-            self._n_pos = n_components
-        else:
-            if isinstance(n_components, int):
-                if n_components > n:
+        m = min(n, d)
+
+        if self.n_components is not None:
+            if isinstance(self.n_components, int):
+                if self.n_components <= 0:
+                    raise ValueError("`n_components` must be a positive integer.")
+                elif self.n_components >= m:
                     warnings.warn(
-                        "`n_components` should be less or equal than "
-                        f"#rows of `connectivity`. Using {n} components."
+                        "`n_components` should be less than minimum of "
+                        "#samples and #features. Using all components."
+                    )
+                self._n_components = None
+            elif isinstance(self.n_components, tuple) and len(self.n_components) == 2:
+                if any(
+                    not isinstance(i, int) or i < 0 for i in self.n_components
+                ) or self.n_components == (0, 0):
+                    raise ValueError(
+                        "`n_components` must be a tuple of positive integers."
                     )
-                self._n_pos = min(n_components, n)
-            elif isinstance(n_components, tuple) and len(n_components) == 2:
-                if n < n_components[0] + n_components[1]:
+                elif sum(self.n_components) >= m:
                     warnings.warn(
-                        "Sum of `n_components` should be less or equal than "
-                        f"#rows of `connectivity`. Using {n} components."
+                        "Sum of `n_components` should be less than minimum of "
+                        "#samples and #features. Using all components."
                     )
-                    self._n_pos = n
-                else:
-                    self._n_pos, self._n_neg = n_components
+                    self._n_components = None
             else:
                 raise ValueError("`n_components` must be None, int or (int, int)")
 
@@ -148,8 +153,7 @@ def fit(self, X: _X, y: None = None):
         ------
         ValueError
             If `X` does not have the same number of rows as `connectivity`.
-            If `n_components` is None and `X` is sparse.
-            If (sum of) `n_components` is larger than the smaller dimension of `X`.
+            If `n_components` has the wrong type or is negative.
             If `connectivity` is not a square matrix.
         """
 
@@ -168,7 +172,7 @@ def fit(self, X: _X, y: None = None):
         n, d = X.shape
 
         self._validate_connectivity(W, n)
-        self._validate_n_components(n)
+        self._validate_n_components(n, d)
 
         self.W_ = normalize(W, norm="l1")
         assert isinstance(self.W_, csr_array)
@@ -177,19 +181,6 @@ def fit(self, X: _X, y: None = None):
             X = csc_array(X)
 
         assert isinstance(X, (np.ndarray, csc_array))
-        if self._n_pos is None:
-            if issparse(X):
-                raise ValueError(
-                    "`n_components` is None, but `X` is a sparse matrix. None is only "
-                    "supported for dense matrices."
-                )
-        elif (self._n_pos + self._n_neg) > X.shape[1]:
-            n_comp = self._n_pos + self._n_neg
-            n_comp_max = min(n, d)
-            raise ValueError(
-                f"Requested {n_comp} components but given `X` at most {n_comp_max} "
-                "can be calculated."
-            )
 
         if issparse(X):
             self.mean_ = None
@@ -230,44 +221,41 @@ def remove_zero_eigenvalues(
         H = (X.T @ (W + W.T) @ X) / (2 * n)
         # TODO handle sparse based on density?
         if issparse(H):
-            # TODO fix can't return all eigenvalues of sparse matrix
-            # TODO check that number of eigenvalues does not exceed d
-            if self._n_pos is None:
-                raise ValueError(
-                    "`n_components` is None, but `X` is a sparse matrix. None is only "
-                    "supported for dense matrices."
-                )
-            elif self._n_pos == 0:
-                eig_val, eig_vec = sparse_linalg.eigsh(H, k=self._n_neg, which="SA")
-            elif self._n_neg == 0:
-                eig_val, eig_vec = sparse_linalg.eigsh(H, k=self._n_pos, which="LA")
-            else:
-                n_comp = 2 * max(self._n_neg, self._n_pos)
-                eig_val, eig_vec = sparse_linalg.eigsh(H, k=n_comp, which="BE")
-                component_indices = self._get_component_indices(
-                    n_comp, self._n_pos, self._n_neg
-                )
-                eig_val = eig_val[component_indices]
-                eig_vec = eig_vec[:, component_indices]
+            match self._n_components:
+                case None:
+                    eig_val, eig_vec = sparse_linalg.eigsh(
+                        H, k=min(n, d) - 1, which="LM"
+                    )
+                case (n_pos, 0) | int(n_pos):
+                    eig_val, eig_vec = sparse_linalg.eigsh(H, k=n_pos, which="LA")
+                case (0, n_neg):
+                    eig_val, eig_vec = sparse_linalg.eigsh(H, k=n_neg, which="SA")
+                case (n_pos, n_neg):
+                    n_comp = max(2 * max(n_neg, n_pos), max(n, d))
+                    eig_val, eig_vec = sparse_linalg.eigsh(H, k=n_comp, which="BE")
+                    component_indices = self._get_component_indices(
+                        n_comp, n_pos, n_neg
+                    )
+                    eig_val = eig_val[component_indices]
+                    eig_vec = eig_vec[:, component_indices]
 
         else:
-            if self._n_pos is None:
-                eig_val, eig_vec = linalg.eigh(H)
-                if n < d:
-                    eig_val, eig_vec = remove_zero_eigenvalues(eig_val, eig_vec, n)
-            elif self._n_pos == 0:
-                eig_val, eig_vec = linalg.eigh(H, subset_by_index=[0, self._n_neg])
-            elif self._n_neg == 0:
-                eig_val, eig_vec = linalg.eigh(
-                    H, subset_by_index=[d - self._n_pos, d - 1]
-                )
-            else:
-                eig_val, eig_vec = linalg.eigh(H)
-                component_indices = self._get_component_indices(
-                    d, self._n_pos, self._n_neg
-                )
-                eig_val = eig_val[component_indices]
-                eig_vec = eig_vec[:, component_indices]
+            match self._n_components:
+                case None:
+                    eig_val, eig_vec = linalg.eigh(H)
+                    if n < d:
+                        eig_val, eig_vec = remove_zero_eigenvalues(eig_val, eig_vec, n)
+                case (n_pos, 0) | int(n_pos):
+                    eig_val, eig_vec = linalg.eigh(
+                        H, subset_by_index=[d - n_pos, d - 1]
+                    )
+                case (0, n_neg):
+                    eig_val, eig_vec = linalg.eigh(H, subset_by_index=[0, n_neg])
+                case (n_pos, n_neg):
+                    eig_val, eig_vec = linalg.eigh(H)
+                    component_indices = self._get_component_indices(d, n_pos, n_neg)
+                    eig_val = eig_val[component_indices]
+                    eig_vec = eig_vec[:, component_indices]
 
         return np.flip(eig_val), np.flipud(eig_vec.T)
 

From 26ae02398a3a59e3befc42a58de5a2955ef50fa0 Mon Sep 17 00:00:00 2001
From: niklasmubeoe <sputnik.mb@web.de>
Date: Fri, 3 May 2024 12:57:25 +0200
Subject: [PATCH 12/18] remove unnecessary conversion of X

---
 multispaeti/_multispati_pca.py | 11 ++---------
 1 file changed, 2 insertions(+), 9 deletions(-)

diff --git a/multispaeti/_multispati_pca.py b/multispaeti/_multispati_pca.py
index 2ad7ecf..8d78e4b 100644
--- a/multispaeti/_multispati_pca.py
+++ b/multispaeti/_multispati_pca.py
@@ -175,12 +175,6 @@ def fit(self, X: _X, y: None = None):
         self._validate_n_components(n, d)
 
         self.W_ = normalize(W, norm="l1")
-        assert isinstance(self.W_, csr_array)
-
-        if issparse(X):
-            X = csc_array(X)
-
-        assert isinstance(X, (np.ndarray, csc_array))
 
         if issparse(X):
             self.mean_ = None
@@ -286,9 +280,8 @@ def transform(self, X: _X) -> np.ndarray:
         """
         check_is_fitted(self)
         X = check_array(X, accept_sparse=["csr", "csc"])
-        if self.mean_ is not None:
-            if not issparse(X):
-                X = X - self.mean_
+        if self.mean_ is not None and not issparse(X):
+            X = X - self.mean_
         return X @ self.components_.T
 
     def transform_spatial_lag(self, X: _X) -> np.ndarray:

From 10b415c383231ef47bbfa03733ec92dd6fb862c5 Mon Sep 17 00:00:00 2001
From: niklasmubeoe <sputnik.mb@web.de>
Date: Fri, 3 May 2024 12:58:35 +0200
Subject: [PATCH 13/18] csr or ndarray as connectivity

---
 multispaeti/_multispati_pca.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/multispaeti/_multispati_pca.py b/multispaeti/_multispati_pca.py
index 8d78e4b..a6826fa 100644
--- a/multispaeti/_multispati_pca.py
+++ b/multispaeti/_multispati_pca.py
@@ -6,7 +6,6 @@
 from scipy import linalg
 from scipy.sparse import csc_array, csc_matrix, csr_array, csr_matrix, eye, issparse
 from scipy.sparse import linalg as sparse_linalg
-from scipy.sparse import sparray, spmatrix
 from sklearn.base import (
     BaseEstimator,
     ClassNamePrefixFeaturesOutMixin,
@@ -22,6 +21,7 @@
 _Csr: TypeAlias = csr_array | csr_matrix
 _Csc: TypeAlias = csc_array | csc_matrix
 _X: TypeAlias = np.ndarray | _Csr | _Csc
+_Connectivity: TypeAlias = np.ndarray | _Csr
 
 
 class MultispatiPCA(ClassNamePrefixFeaturesOutMixin, TransformerMixin, BaseEstimator):
@@ -93,13 +93,13 @@ def __init__(
         self,
         n_components: int | tuple[int, int] | None = None,
         *,
-        connectivity: sparray | spmatrix | None = None,
+        connectivity: _Connectivity | None = None,
     ):
         self.n_components = n_components
         self.connectivity = connectivity
 
     @staticmethod
-    def _validate_connectivity(W: csr_array, n: int):
+    def _validate_connectivity(W: _Connectivity, n: int):
         if W.shape[0] != W.shape[1]:
             raise ValueError("`connectivity` must be square")
         if W.shape[0] != n:
@@ -199,7 +199,7 @@ def fit(self, X: _X, y: None = None):
         return self
 
     def _multispati_eigendecomposition(
-        self, X: _X, W: _Csr
+        self, X: _X, W: _Connectivity
     ) -> tuple[NDArray[np.float64], NDArray[np.float64]]:
         # X: observations x features
         # W: row-wise definition of neighbors, row-sums should be 1

From c1664c3823ce401ac79d548df7e238e2278210c1 Mon Sep 17 00:00:00 2001
From: niklasmubeoe <sputnik.mb@web.de>
Date: Fri, 3 May 2024 15:22:32 +0200
Subject: [PATCH 14/18] bugfix in n_component limits

---
 multispaeti/_multispati_pca.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/multispaeti/_multispati_pca.py b/multispaeti/_multispati_pca.py
index a6826fa..e7ca2d2 100644
--- a/multispaeti/_multispati_pca.py
+++ b/multispaeti/_multispati_pca.py
@@ -225,7 +225,7 @@ def remove_zero_eigenvalues(
                 case (0, n_neg):
                     eig_val, eig_vec = sparse_linalg.eigsh(H, k=n_neg, which="SA")
                 case (n_pos, n_neg):
-                    n_comp = max(2 * max(n_neg, n_pos), max(n, d))
+                    n_comp = min(2 * max(n_neg, n_pos), min(n, d))
                     eig_val, eig_vec = sparse_linalg.eigsh(H, k=n_comp, which="BE")
                     component_indices = self._get_component_indices(
                         n_comp, n_pos, n_neg

From b29d727a8ee2dd896aea96a9b503b258d6e95c0f Mon Sep 17 00:00:00 2001
From: niklasmubeoe <sputnik.mb@web.de>
Date: Fri, 3 May 2024 15:50:43 +0200
Subject: [PATCH 15/18] use sparse linalg for moran bounds

---
 multispaeti/_multispati_pca.py | 9 +++------
 1 file changed, 3 insertions(+), 6 deletions(-)

diff --git a/multispaeti/_multispati_pca.py b/multispaeti/_multispati_pca.py
index e7ca2d2..65b738c 100644
--- a/multispaeti/_multispati_pca.py
+++ b/multispaeti/_multispati_pca.py
@@ -362,12 +362,9 @@ def double_center(W: np.ndarray | csr_array) -> np.ndarray:
         if not issparse(W) or not sparse_approx:
             W = double_center(W)
 
-        if issparse(W):
-            eigen_values = s * sparse_linalg.eigsh(
-                W, k=2, which="BE", return_eigenvectors=False
-            )
-        else:
-            eigen_values = s * linalg.eigvalsh(W, overwrite_a=True)
+        eigen_values = s * sparse_linalg.eigsh(
+            W, k=2, which="BE", return_eigenvectors=False
+        )
 
         I_0 = -1 / (n_sample - 1)
         I_min = min(eigen_values)

From acab6986614c623d4e0d36e2880f1cd22c8799db Mon Sep 17 00:00:00 2001
From: niklasmubeoe <sputnik.mb@web.de>
Date: Mon, 6 May 2024 10:23:01 +0200
Subject: [PATCH 16/18] implement fit_transform for efficiency

---
 docs/source/usage.rst          |  3 ++-
 multispaeti/_multispati_pca.py | 33 +++++++++++++++++++++++++++++----
 2 files changed, 31 insertions(+), 5 deletions(-)

diff --git a/docs/source/usage.rst b/docs/source/usage.rst
index 7b95128..7b33cb2 100644
--- a/docs/source/usage.rst
+++ b/docs/source/usage.rst
@@ -39,7 +39,8 @@ As for e.g. :py:class:`sklearn.decomposition.PCA` we first need to
 
 
 Alternatively, this can be achieved in one step by
-:py:meth:`multispaeti.MultispatiPCA.fit_transform`
+:py:meth:`multispaeti.MultispatiPCA.fit_transform` which is also more performant and
+avoids redundant computation
 
 .. code-block:: python
 
diff --git a/multispaeti/_multispati_pca.py b/multispaeti/_multispati_pca.py
index 65b738c..affae08 100644
--- a/multispaeti/_multispati_pca.py
+++ b/multispaeti/_multispati_pca.py
@@ -156,6 +156,10 @@ def fit(self, X: _X, y: None = None):
             If `n_components` has the wrong type or is negative.
             If `connectivity` is not a square matrix.
         """
+        self._fit(X)
+        return self
+
+    def _fit(self, X: _X, *, return_transform: bool = False) -> np.ndarray | None:
 
         X = check_array(X, accept_sparse=["csr", "csc"])
         if self.connectivity is None:
@@ -192,11 +196,11 @@ def fit(self, X: _X, y: None = None):
         self._n_features_out = self.n_components_
         self.n_features_in_ = d
 
-        self.variance_, self.moransI_ = self._variance_moransI_decomposition(
-            X_centered @ self.components_.T
-        )
+        X_tr = X_centered @ self.components_.T
+        self.variance_, self.moransI_ = self._variance_moransI_decomposition(X_tr)
 
-        return self
+        if return_transform:
+            return X_tr
 
     def _multispati_eigendecomposition(
         self, X: _X, W: _Connectivity
@@ -284,6 +288,27 @@ def transform(self, X: _X) -> np.ndarray:
             X = X - self.mean_
         return X @ self.components_.T
 
+    def fit_transform(self, X: _X, y: None = None) -> np.ndarray:
+        """
+        Fit and transform the data using MULTISPATI-PCA projection.
+
+        See :py:meth:`multispaeti.MultispatiPCA` for more information.
+
+        Parameters
+        ----------
+        X : numpy.ndarray or scipy.sparse.csr_array or scipy.sparse.csc_array
+            Array of observations x features.
+        y : None
+            Ignored. scikit-learn compatibility only.
+
+        Returns
+        -------
+        numpy.ndarray
+        """
+        X_tr = self._fit(X, return_transform=True)
+        assert isinstance(X_tr, np.ndarray)
+        return X_tr
+
     def transform_spatial_lag(self, X: _X) -> np.ndarray:
         """
         Transform the data using fitted MULTISPATI-PCA projection and calculate the

From df00a02b49d7f7e5b26cda729b60afdaaeb9ac7f Mon Sep 17 00:00:00 2001
From: niklasmueboe <sputnik.mb@web.de>
Date: Mon, 10 Jun 2024 12:52:05 +0200
Subject: [PATCH 17/18] doc update

---
 docs/source/usage.rst | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/docs/source/usage.rst b/docs/source/usage.rst
index 7b33cb2..8957234 100644
--- a/docs/source/usage.rst
+++ b/docs/source/usage.rst
@@ -39,8 +39,7 @@ As for e.g. :py:class:`sklearn.decomposition.PCA` we first need to
 
 
 Alternatively, this can be achieved in one step by
-:py:meth:`multispaeti.MultispatiPCA.fit_transform` which is also more performant and
-avoids redundant computation
+:py:meth:`multispaeti.MultispatiPCA.fit_transform` which avoids redundant computation.
 
 .. code-block:: python
 

From 354bd981db5bad0888cabe5f817dcb402406c5d4 Mon Sep 17 00:00:00 2001
From: niklasmueboe <sputnik.mb@web.de>
Date: Mon, 10 Jun 2024 14:57:20 +0200
Subject: [PATCH 18/18] option to center sparse X

---
 multispaeti/_multispati_pca.py | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/multispaeti/_multispati_pca.py b/multispaeti/_multispati_pca.py
index affae08..34c4dea 100644
--- a/multispaeti/_multispati_pca.py
+++ b/multispaeti/_multispati_pca.py
@@ -51,6 +51,10 @@ class MultispatiPCA(ClassNamePrefixFeaturesOutMixin, TransformerMixin, BaseEstim
         c\ :sub:`ij` should be larger if i and j are close.
         A distance matrix should be transformed to connectivities by e.g.
         calculating :math:`1-d/d_{max}` beforehand.
+    center_sparse : bool
+        Whether to center `X` if it is a sparse array. By default sparse `X` will not be
+        centered as this requires transforming it to a dense array, potentially raising
+        out-of-memory errors.
 
     Attributes
     ----------
@@ -94,9 +98,11 @@ def __init__(
         n_components: int | tuple[int, int] | None = None,
         *,
         connectivity: _Connectivity | None = None,
+        center_sparse: bool = False,
     ):
         self.n_components = n_components
         self.connectivity = connectivity
+        self.center_sparse = center_sparse
 
     @staticmethod
     def _validate_connectivity(W: _Connectivity, n: int):
@@ -180,6 +186,10 @@ def _fit(self, X: _X, *, return_transform: bool = False) -> np.ndarray | None:
 
         self.W_ = normalize(W, norm="l1")
 
+        if self.center_sparse and issparse(X):
+            assert isinstance(X, (csr_array, csr_matrix, csc_array, csc_matrix))
+            X = X.toarray()
+
         if issparse(X):
             self.mean_ = None
             X_centered = X