From a200f1e7b683c7f21a9c33a3e7474556a36859d9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Franz=20Kir=C3=A1ly?= Date: Sat, 26 Aug 2023 21:07:50 +0100 Subject: [PATCH 1/3] [ENH] add test for subsetting distributions (#43) Adds a test to the distribution suite tests ensuring that they subset appropriately via `loc` and `iloc`, and satisfy common interface expectations on the resulting object and index. Also fixes a bug in subsetting of `Empirical` that was highlighted by these tests. --- skpro/distributions/empirical.py | 34 +++++++++++++++++++ skpro/distributions/tests/test_all_distrs.py | 35 ++++++++++++++++++++ skpro/utils/index.py | 21 ++++++++++++ 3 files changed, 90 insertions(+) create mode 100644 skpro/utils/index.py diff --git a/skpro/distributions/empirical.py b/skpro/distributions/empirical.py index 20ae58a1..45718541 100644 --- a/skpro/distributions/empirical.py +++ b/skpro/distributions/empirical.py @@ -128,6 +128,40 @@ def _apply_per_ix(self, func, params, x=None): res.loc[ix, col] = func(spl=spl_t, weights=weights_t, x=x_t, **params) return res.convert_dtypes() + def _iloc(self, rowidx=None, colidx=None): + + index = self.index + columns = self.columns + weights = self.weights + + spl_subset = self.spl + + if rowidx is not None: + rowidx_loc = index[rowidx] + # subset multiindex to rowidx by last level + spl_subset = self.spl.loc[(slice(None), rowidx_loc), :] + if weights is not None: + weights_subset = weights.loc[(slice(None), rowidx_loc)] + else: + weights_subset = None + subs_rowidx = index[rowidx] + else: + subs_rowidx = index + + if colidx is not None: + spl_subset = spl_subset.iloc[:, colidx] + subs_colidx = columns[colidx] + else: + subs_colidx = columns + + return Empirical( + spl_subset, + weights=weights_subset, + time_indep=self.time_indep, + index=subs_rowidx, + columns=subs_colidx, + ) + def energy(self, x=None): r"""Energy of self, w.r.t. self or a constant frame x. diff --git a/skpro/distributions/tests/test_all_distrs.py b/skpro/distributions/tests/test_all_distrs.py index 1c19258f..7aa427a4 100644 --- a/skpro/distributions/tests/test_all_distrs.py +++ b/skpro/distributions/tests/test_all_distrs.py @@ -13,6 +13,7 @@ from skpro.datatypes import check_is_mtype from skpro.distributions.base import BaseDistribution from skpro.tests.test_all_estimators import PackageConfig +from skpro.utils.index import random_ss_ix class DistributionFixtureGenerator(BaseFixtureGenerator): @@ -132,6 +133,40 @@ def _check_quantile_output(obj, q): res = d.quantile(q) _check_quantile_output(res, q) + @pytest.mark.parametrize("subset_row", [True, False]) + @pytest.mark.parametrize("subset_col", [True, False]) + def test_subsetting(self, object_instance, subset_row, subset_col): + """Test subsetting of distribution.""" + d = object_instance + + if subset_row: + ix_loc = random_ss_ix(d.index, 3) + ix_iloc = d.index.get_indexer(ix_loc) + else: + ix_loc = d.index + ix_iloc = pd.RangeIndex(len(d.index)) + + if subset_col: + iy_loc = random_ss_ix(d.columns, 1) + iy_iloc = d.columns.get_indexer(iy_loc) + else: + iy_loc = d.columns + iy_iloc = pd.RangeIndex(len(d.columns)) + + res_loc = d.loc[ix_loc, iy_loc] + + assert isinstance(res_loc, type(d)) + assert res_loc.shape == (len(ix_loc), len(iy_loc)) + assert (res_loc.index == ix_loc).all() + assert (res_loc.columns == iy_loc).all() + + res_iloc = d.iloc[ix_iloc, iy_iloc] + + assert isinstance(res_iloc, type(d)) + assert res_iloc.shape == (len(ix_iloc), len(iy_iloc)) + assert (res_iloc.index == ix_loc).all() + assert (res_iloc.columns == iy_loc).all() + def _check_output_format(res, dist, method): """Check output format expectations for BaseDistribution tests.""" diff --git a/skpro/utils/index.py b/skpro/utils/index.py new file mode 100644 index 00000000..5496dfd3 --- /dev/null +++ b/skpro/utils/index.py @@ -0,0 +1,21 @@ +# -*- coding: utf-8 -*- +"""Utility functions for working with indices.""" + +import numpy as np + + +def random_ss_ix(ix, size, replace=True): + """Randomly uniformly sample indices from a list of indices. + + Parameters + ---------- + ix : pd.Index or subsettable iterable via getitem + list of indices to sample from + size : int + number of indices to sample + replace : bool, default=True + whether to sample with replacement + """ + a = range(len(ix)) + ixs = ix[np.random.choice(a, size=size, replace=replace)] + return ixs From 813d90ec315f5ac1c2ae5e73001e30e1380a0c08 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Franz=20Kir=C3=A1ly?= Date: Sat, 26 Aug 2023 21:29:37 +0100 Subject: [PATCH 2/3] [ENH] turn off sorting in `groupby`-s (#41) This PR turns off sorting in `groupby` calls. Towards #40. --- skpro/distributions/base.py | 11 ++++++----- skpro/distributions/empirical.py | 8 ++++---- 2 files changed, 10 insertions(+), 9 deletions(-) diff --git a/skpro/distributions/base.py b/skpro/distributions/base.py index f6c35677..1523b17c 100644 --- a/skpro/distributions/base.py +++ b/skpro/distributions/base.py @@ -252,7 +252,7 @@ def cdf(self, x): spl = self.sample(N) ind = splx <= spl - return ind.groupby(level=1).mean() + return ind.groupby(level=1, sort=False).mean() def ppf(self, p): """Quantile function = percent point function = inverse cdf.""" @@ -332,7 +332,8 @@ def energy(self, x=None): # approx E[abs(X-Y)] via mean of samples of abs(X-Y) obtained from splx, sply spl = splx - sply - energy = spl.apply(np.linalg.norm, axis=1, ord=1).groupby(level=1).mean() + energy = spl.apply(np.linalg.norm, axis=1, ord=1) + energy = energy.groupby(level=1, sort=False).mean() energy = pd.DataFrame(energy, index=self.index, columns=["energy"]) return energy @@ -355,7 +356,7 @@ def mean(self): warn(self._method_error_msg("mean", fill_in=approx_method)) spl = self.sample(approx_spl_size) - return spl.groupby(level=1).mean() + return spl.groupby(level=1, sort=False).mean() def var(self): r"""Return element/entry-wise variance of the distribution. @@ -378,7 +379,7 @@ def var(self): spl1 = self.sample(approx_spl_size) spl2 = self.sample(approx_spl_size) spl = (spl1 - spl2) ** 2 - return spl.groupby(level=1).mean() + return spl.groupby(level=1, sort=False).mean() def pdfnorm(self, a=2): r"""a-norm of pdf, defaults to 2-norm. @@ -410,7 +411,7 @@ def pdfnorm(self, a=2): # uses formula int p(x)^a dx = E[p(X)^{a-1}], and MC approximates the RHS spl = [self.pdf(self.sample()) ** (a - 1) for _ in range(approx_spl_size)] - return pd.concat(spl, axis=0).groupby(level=1).mean() + return pd.concat(spl, axis=0).groupby(level=1, sort=False).mean() def _coerce_to_self_index_df(self, x): x = np.array(x) diff --git a/skpro/distributions/empirical.py b/skpro/distributions/empirical.py index 45718541..7959c808 100644 --- a/skpro/distributions/empirical.py +++ b/skpro/distributions/empirical.py @@ -197,9 +197,9 @@ def mean(self): """ spl = self.spl if self.weights is None: - mean_df = spl.groupby(level=-1).mean() + mean_df = spl.groupby(level=-1, sort=False).mean() else: - mean_df = spl.groupby(level=-1).apply( + mean_df = spl.groupby(level=-1, sort=False).apply( lambda x: np.average(x, weights=self.weights.loc[x.index], axis=0) ) mean_df = pd.DataFrame(mean_df.tolist(), index=mean_df.index) @@ -221,11 +221,11 @@ def var(self): spl = self.spl N = self._N if self.weights is None: - var_df = spl.groupby(level=-1).var(ddof=0) + var_df = spl.groupby(level=-1, sort=False).var(ddof=0) else: mean = self.mean() means = pd.concat([mean] * N, axis=0, keys=self._spl_instances) - var_df = spl.groupby(level=-1).apply( + var_df = spl.groupby(level=-1, sort=False).apply( lambda x: np.average( (x - means.loc[x.index]) ** 2, weights=self.weights.loc[x.index], From bc3cf0ad22aa727822d53500e8bfb8d42f8dd036 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Franz=20Kir=C3=A1ly?= Date: Sat, 26 Aug 2023 21:35:17 +0100 Subject: [PATCH 3/3] [ENH] test that distribution methods do not sort index (#42) This adds tests to ensure that distribution methods do not sort the index in outputs. Towards #40. Consistency of index was already tested, but most test cases already had a sorted index, so the test logic would not catch sorting. This is remedied by adding an optional shuffling before each test that checks index consistency. Depends on https://github.com/sktime/skpro/pull/41 for fixing some bugs detected through this testing. --- skpro/distributions/tests/test_all_distrs.py | 36 +++++++++++++++---- skpro/regression/tests/test_all_regressors.py | 2 +- 2 files changed, 31 insertions(+), 7 deletions(-) diff --git a/skpro/distributions/tests/test_all_distrs.py b/skpro/distributions/tests/test_all_distrs.py index 7aa427a4..04f20af4 100644 --- a/skpro/distributions/tests/test_all_distrs.py +++ b/skpro/distributions/tests/test_all_distrs.py @@ -61,10 +61,14 @@ def _has_capability(distr, method): class TestAllDistributions(PackageConfig, DistributionFixtureGenerator, QuickTester): """Module level tests for all sktime parameter fitters.""" - def test_sample(self, object_instance): + @pytest.mark.parametrize("shuffled", [False, True]) + def test_sample(self, object_instance, shuffled): """Test sample expected return.""" d = object_instance + if shuffled: + d = _shuffle_distr(d) + res = d.sample() assert d.shape == res.shape @@ -77,36 +81,50 @@ def test_sample(self, object_instance): assert (res_panel.index == dummy_panel.index).all() assert (res_panel.columns == dummy_panel.columns).all() + @pytest.mark.parametrize("shuffled", [False, True]) @pytest.mark.parametrize("method", METHODS_SCALAR, ids=METHODS_SCALAR) - def test_methods_scalar(self, object_instance, method): + def test_methods_scalar(self, object_instance, method, shuffled): """Test expected return of scalar methods.""" if not _has_capability(object_instance, method): return None d = object_instance - res = getattr(object_instance, method)() + if shuffled: + d = _shuffle_distr(d) + + res = getattr(d, method)() _check_output_format(res, d, method) + @pytest.mark.parametrize("shuffled", [False, True]) @pytest.mark.parametrize("method", METHODS_X, ids=METHODS_X) - def test_methods_x(self, object_instance, method): + def test_methods_x(self, object_instance, method, shuffled): """Test expected return of methods that take sample-like argument.""" if not _has_capability(object_instance, method): return None d = object_instance + + if shuffled: + d = _shuffle_distr(d) + x = d.sample() - res = getattr(object_instance, method)(x) + res = getattr(d, method)(x) _check_output_format(res, d, method) + @pytest.mark.parametrize("shuffled", [False, True]) @pytest.mark.parametrize("method", METHODS_P, ids=METHODS_P) - def test_methods_p(self, object_instance, method): + def test_methods_p(self, object_instance, method, shuffled): """Test expected return of methods that take percentage-like argument.""" if not _has_capability(object_instance, method): return None d = object_instance + + if shuffled: + d = _shuffle_distr(d) + np_unif = np.random.uniform(size=d.shape) p = pd.DataFrame(np_unif, index=d.index, columns=d.columns) res = getattr(object_instance, method)(p) @@ -181,3 +199,9 @@ def _check_output_format(res, dist, method): if method in METHODS_SCALAR_POS or method in METHODS_X_POS: assert (res >= 0).all().all() + + +def _shuffle_distr(d): + """Shuffle distribution row index.""" + shuffled_index = pd.DataFrame(d.index).sample(frac=1).index + return d.loc[shuffled_index] diff --git a/skpro/regression/tests/test_all_regressors.py b/skpro/regression/tests/test_all_regressors.py index b3c08e5d..8b831d1a 100644 --- a/skpro/regression/tests/test_all_regressors.py +++ b/skpro/regression/tests/test_all_regressors.py @@ -6,7 +6,7 @@ from skpro.datatypes import check_is_mtype, check_raise from skpro.distributions.base import BaseDistribution -from skpro.regression.base import BaseProbaRegressor +from skpro.regression.base._base import BaseProbaRegressor from skpro.tests.test_all_estimators import PackageConfig TEST_ALPHAS = [0.05, [0.1], [0.25, 0.75], [0.3, 0.1, 0.9]]