Skip to content

Commit

Permalink
Merge branch 'main' into bootstrap
Browse files Browse the repository at this point in the history
  • Loading branch information
fkiraly committed Aug 26, 2023
2 parents b900130 + bc3cf0a commit c2c092a
Show file tree
Hide file tree
Showing 5 changed files with 131 additions and 16 deletions.
11 changes: 6 additions & 5 deletions skpro/distributions/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -252,7 +252,7 @@ def cdf(self, x):
spl = self.sample(N)
ind = splx <= spl

return ind.groupby(level=1).mean()
return ind.groupby(level=1, sort=False).mean()

def ppf(self, p):
"""Quantile function = percent point function = inverse cdf."""
Expand Down Expand Up @@ -332,7 +332,8 @@ def energy(self, x=None):

# approx E[abs(X-Y)] via mean of samples of abs(X-Y) obtained from splx, sply
spl = splx - sply
energy = spl.apply(np.linalg.norm, axis=1, ord=1).groupby(level=1).mean()
energy = spl.apply(np.linalg.norm, axis=1, ord=1)
energy = energy.groupby(level=1, sort=False).mean()
energy = pd.DataFrame(energy, index=self.index, columns=["energy"])
return energy

Expand All @@ -355,7 +356,7 @@ def mean(self):
warn(self._method_error_msg("mean", fill_in=approx_method))

spl = self.sample(approx_spl_size)
return spl.groupby(level=1).mean()
return spl.groupby(level=1, sort=False).mean()

def var(self):
r"""Return element/entry-wise variance of the distribution.
Expand All @@ -378,7 +379,7 @@ def var(self):
spl1 = self.sample(approx_spl_size)
spl2 = self.sample(approx_spl_size)
spl = (spl1 - spl2) ** 2
return spl.groupby(level=1).mean()
return spl.groupby(level=1, sort=False).mean()

def pdfnorm(self, a=2):
r"""a-norm of pdf, defaults to 2-norm.
Expand Down Expand Up @@ -410,7 +411,7 @@ def pdfnorm(self, a=2):

# uses formula int p(x)^a dx = E[p(X)^{a-1}], and MC approximates the RHS
spl = [self.pdf(self.sample()) ** (a - 1) for _ in range(approx_spl_size)]
return pd.concat(spl, axis=0).groupby(level=1).mean()
return pd.concat(spl, axis=0).groupby(level=1, sort=False).mean()

def _coerce_to_self_index_df(self, x):
x = np.array(x)
Expand Down
42 changes: 38 additions & 4 deletions skpro/distributions/empirical.py
Original file line number Diff line number Diff line change
Expand Up @@ -128,6 +128,40 @@ def _apply_per_ix(self, func, params, x=None):
res.loc[ix, col] = func(spl=spl_t, weights=weights_t, x=x_t, **params)
return res.convert_dtypes()

def _iloc(self, rowidx=None, colidx=None):
    """Return the sub-distribution at the given integer row/column positions.

    Parameters
    ----------
    rowidx : positional indexer accepted by ``self.index[...]``, or None
        Row positions to keep. If None, all rows are kept.
    colidx : positional indexer accepted by ``self.columns[...]``, or None
        Column positions to keep. If None, all columns are kept.

    Returns
    -------
    Empirical
        New instance constructed from the subsetted samples and weights,
        the subsetted ``index`` and ``columns``, and the same ``time_indep``.
    """
    index = self.index
    columns = self.columns
    weights = self.weights

    # Defaults for "no row subsetting": samples, weights and row index are
    # passed on unchanged. Setting weights_subset here is the fix for the
    # unbound-variable error that occurred whenever rowidx was None.
    spl_subset = self.spl
    weights_subset = weights
    subs_rowidx = index

    if rowidx is not None:
        # translate integer positions into row labels
        subs_rowidx = index[rowidx]
        # keep all entries of the first index level, and only the selected
        # row labels on the second index level
        spl_subset = spl_subset.loc[(slice(None), subs_rowidx), :]
        if weights is not None:
            weights_subset = weights.loc[(slice(None), subs_rowidx)]

    if colidx is not None:
        spl_subset = spl_subset.iloc[:, colidx]
        subs_colidx = columns[colidx]
    else:
        subs_colidx = columns

    return Empirical(
        spl_subset,
        weights=weights_subset,
        time_indep=self.time_indep,
        index=subs_rowidx,
        columns=subs_colidx,
    )

def energy(self, x=None):
r"""Energy of self, w.r.t. self or a constant frame x.
Expand Down Expand Up @@ -163,9 +197,9 @@ def mean(self):
"""
spl = self.spl
if self.weights is None:
mean_df = spl.groupby(level=-1).mean()
mean_df = spl.groupby(level=-1, sort=False).mean()
else:
mean_df = spl.groupby(level=-1).apply(
mean_df = spl.groupby(level=-1, sort=False).apply(
lambda x: np.average(x, weights=self.weights.loc[x.index], axis=0)
)
mean_df = pd.DataFrame(mean_df.tolist(), index=mean_df.index)
Expand All @@ -187,11 +221,11 @@ def var(self):
spl = self.spl
N = self._N
if self.weights is None:
var_df = spl.groupby(level=-1).var(ddof=0)
var_df = spl.groupby(level=-1, sort=False).var(ddof=0)
else:
mean = self.mean()
means = pd.concat([mean] * N, axis=0, keys=self._spl_instances)
var_df = spl.groupby(level=-1).apply(
var_df = spl.groupby(level=-1, sort=False).apply(
lambda x: np.average(
(x - means.loc[x.index]) ** 2,
weights=self.weights.loc[x.index],
Expand Down
71 changes: 65 additions & 6 deletions skpro/distributions/tests/test_all_distrs.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
from skpro.datatypes import check_is_mtype
from skpro.distributions.base import BaseDistribution
from skpro.tests.test_all_estimators import PackageConfig
from skpro.utils.index import random_ss_ix


class DistributionFixtureGenerator(BaseFixtureGenerator):
Expand Down Expand Up @@ -60,10 +61,14 @@ def _has_capability(distr, method):
class TestAllDistributions(PackageConfig, DistributionFixtureGenerator, QuickTester):
"""Module level tests for all sktime parameter fitters."""

def test_sample(self, object_instance):
@pytest.mark.parametrize("shuffled", [False, True])
def test_sample(self, object_instance, shuffled):
"""Test sample expected return."""
d = object_instance

if shuffled:
d = _shuffle_distr(d)

res = d.sample()

assert d.shape == res.shape
Expand All @@ -76,36 +81,50 @@ def test_sample(self, object_instance):
assert (res_panel.index == dummy_panel.index).all()
assert (res_panel.columns == dummy_panel.columns).all()

@pytest.mark.parametrize("shuffled", [False, True])
@pytest.mark.parametrize("method", METHODS_SCALAR, ids=METHODS_SCALAR)
def test_methods_scalar(self, object_instance, method):
def test_methods_scalar(self, object_instance, method, shuffled):
"""Test expected return of scalar methods."""
if not _has_capability(object_instance, method):
return None

d = object_instance
res = getattr(object_instance, method)()
if shuffled:
d = _shuffle_distr(d)

res = getattr(d, method)()

_check_output_format(res, d, method)

@pytest.mark.parametrize("shuffled", [False, True])
@pytest.mark.parametrize("method", METHODS_X, ids=METHODS_X)
def test_methods_x(self, object_instance, method):
def test_methods_x(self, object_instance, method, shuffled):
"""Test expected return of methods that take sample-like argument."""
if not _has_capability(object_instance, method):
return None

d = object_instance

if shuffled:
d = _shuffle_distr(d)

x = d.sample()
res = getattr(object_instance, method)(x)
res = getattr(d, method)(x)

_check_output_format(res, d, method)

@pytest.mark.parametrize("shuffled", [False, True])
@pytest.mark.parametrize("method", METHODS_P, ids=METHODS_P)
def test_methods_p(self, object_instance, method):
def test_methods_p(self, object_instance, method, shuffled):
"""Test expected return of methods that take percentage-like argument."""
if not _has_capability(object_instance, method):
return None

d = object_instance

if shuffled:
d = _shuffle_distr(d)

np_unif = np.random.uniform(size=d.shape)
p = pd.DataFrame(np_unif, index=d.index, columns=d.columns)
res = getattr(object_instance, method)(p)
Expand All @@ -132,6 +151,40 @@ def _check_quantile_output(obj, q):
res = d.quantile(q)
_check_quantile_output(res, q)

@pytest.mark.parametrize("subset_row", [True, False])
@pytest.mark.parametrize("subset_col", [True, False])
def test_subsetting(self, object_instance, subset_row, subset_col):
    """Check that loc and iloc subsetting return a consistent distribution.

    Rows and/or columns are either randomly subsampled or taken in full,
    depending on ``subset_row`` and ``subset_col``. The result of ``loc``
    (label based) and ``iloc`` (position based) must each be of the same
    type as the input, with the expected shape, index and columns.
    """
    distr = object_instance

    def _select(labels, do_subset, n_draw):
        """Return (labels to select, matching integer positions)."""
        if not do_subset:
            return labels, pd.RangeIndex(len(labels))
        picked = random_ss_ix(labels, n_draw)
        return picked, labels.get_indexer(picked)

    # rows are drawn before columns, same as the order of the random calls
    ix_loc, ix_iloc = _select(distr.index, subset_row, 3)
    iy_loc, iy_iloc = _select(distr.columns, subset_col, 1)

    # label-based subsetting
    by_label = distr.loc[ix_loc, iy_loc]

    assert isinstance(by_label, type(distr))
    assert by_label.shape == (len(ix_loc), len(iy_loc))
    assert (by_label.index == ix_loc).all()
    assert (by_label.columns == iy_loc).all()

    # position-based subsetting, must agree with the label-based selection
    by_position = distr.iloc[ix_iloc, iy_iloc]

    assert isinstance(by_position, type(distr))
    assert by_position.shape == (len(ix_iloc), len(iy_iloc))
    assert (by_position.index == ix_loc).all()
    assert (by_position.columns == iy_loc).all()


def _check_output_format(res, dist, method):
"""Check output format expectations for BaseDistribution tests."""
Expand All @@ -146,3 +199,9 @@ def _check_output_format(res, dist, method):

if method in METHODS_SCALAR_POS or method in METHODS_X_POS:
assert (res >= 0).all().all()


def _shuffle_distr(d):
    """Return ``d`` with its rows in a uniformly random order.

    Parameters
    ----------
    d : object with a pandas ``index`` attribute and label-based ``loc``
        The distribution (or frame-like object) to shuffle.

    Returns
    -------
    same kind of object as returned by ``d.loc[...]``
        ``d`` subset to all of its row labels, in shuffled order.
    """
    # Shuffle the actual row labels. to_series() carries the labels in the
    # index of the series, so after sampling, .index is the permuted labels.
    # (Sampling pd.DataFrame(d.index) instead would yield shuffled integer
    # positions, which are only valid loc keys if d.index is 0..n-1.)
    shuffled_index = d.index.to_series().sample(frac=1).index
    return d.loc[shuffled_index]
2 changes: 1 addition & 1 deletion skpro/regression/tests/test_all_regressors.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@

from skpro.datatypes import check_is_mtype, check_raise
from skpro.distributions.base import BaseDistribution
from skpro.regression.base import BaseProbaRegressor
from skpro.regression.base._base import BaseProbaRegressor
from skpro.tests.test_all_estimators import PackageConfig

TEST_ALPHAS = [0.05, [0.1], [0.25, 0.75], [0.3, 0.1, 0.9]]
Expand Down
21 changes: 21 additions & 0 deletions skpro/utils/index.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
# -*- coding: utf-8 -*-
"""Utility functions for working with indices."""

import numpy as np


def random_ss_ix(ix, size, replace=True):
    """Draw a uniformly random selection of entries from ``ix``.

    Parameters
    ----------
    ix : pd.Index or subsettable iterable via getitem
        collection of indices to draw from,
        is subset via ``ix[...]`` with an array of integer positions
    size : int
        number of indices to draw
    replace : bool, default=True
        whether the draw is with replacement

    Returns
    -------
    result of subsetting ``ix`` at the drawn positions
        the randomly selected entries of ``ix``
    """
    # draw integer positions first, then translate them to entries of ix
    positions = range(len(ix))
    drawn = np.random.choice(positions, size=size, replace=replace)
    return ix[drawn]

0 comments on commit c2c092a

Please sign in to comment.