Skip to content

Commit

Permalink
Merge remote-tracking branch 'origin/develop' into develop
Browse files Browse the repository at this point in the history
  • Loading branch information
mdbenito committed Jan 16, 2025
2 parents 02acb0f + 3c54ff5 commit f91c6db
Show file tree
Hide file tree
Showing 16 changed files with 46 additions and 36 deletions.
3 changes: 3 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,9 @@

### Changed

- Changed the way semi-value coefficients are composed with sampler weights in
order to avoid `OverflowError` for very small or large values
[PR #639](https://github.com/aai-institute/pyDVL/pull/639)
- Uniformly distribute test points across processes for KNNShapley. Fail for
`GroupedDataset` [PR #632](https://github.com/aai-institute/pyDVL/pull/632)
- Introduced the concept of logical vs data indices for `Dataset`, and
Expand Down
6 changes: 3 additions & 3 deletions src/pydvl/valuation/methods/beta_shapley.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,8 +29,8 @@ def __init__(
self.beta = beta
self.const = sp.special.beta(alpha, beta)

def coefficient(self, n: int, k: int) -> float:
def coefficient(self, n: int, k: int, other: float) -> float:
j = k + 1
w = sp.special.beta(j + self.beta - 1, n - j + self.alpha) / self.const
# return math.comb(n - 1, j - 1) * w * n
return float(w)
# return math.comb(n - 1, j - 1) * w * n * other
return float(w) * other
4 changes: 2 additions & 2 deletions src/pydvl/valuation/methods/data_banzhaf.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,5 +35,5 @@ class DataBanzhafValuation(SemivalueValuation):

algorithm_name = "Data-Banzhaf"

def coefficient(self, n: int, k: int) -> float:
return float(1 / 2 ** (n - 1))
def coefficient(self, n: int, k: int, other: float) -> float:
return float(other / 2 ** (n - 1))
4 changes: 2 additions & 2 deletions src/pydvl/valuation/methods/data_shapley.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,5 +10,5 @@ class DataShapleyValuation(SemivalueValuation):

algorithm_name = "Data-Shapley"

def coefficient(self, n: int, k: int) -> float:
return float(1 / math.comb(n - 1, k) / n)
def coefficient(self, n: int, k: int, other: float) -> float:
return other / math.comb(n - 1, k) / n
4 changes: 2 additions & 2 deletions src/pydvl/valuation/methods/delta_shapley.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,5 +38,5 @@ def __init__(
)
super().__init__(utility, sampler, is_done, progress=progress)

def coefficient(self, n: int, k: int) -> float:
return float(1 / math.comb(n, k))
def coefficient(self, n: int, k: int, other: float) -> float:
return other / math.comb(n, k)
2 changes: 1 addition & 1 deletion src/pydvl/valuation/methods/gt_shapley.py
Original file line number Diff line number Diff line change
Expand Up @@ -222,7 +222,7 @@ def weight(n: int, subset_len: int) -> float:
def make_strategy(
self,
utility: UtilityBase,
coefficient: Callable[[int, int], float] | None = None,
coefficient: Callable[[int, int, float], float] | None = None,
) -> EvaluationStrategy:
raise NotImplementedError("This is not a semi-value sampler.")

Expand Down
4 changes: 2 additions & 2 deletions src/pydvl/valuation/methods/loo.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,9 +43,9 @@ def __init__(self, utility: UtilityBase, progress: bool = False):
progress=progress,
)

def coefficient(self, n: int, k: int) -> float:
def coefficient(self, n: int, k: int, other: float) -> float:
"""
This is never actually used to filter out sets, because the LOOSampler returns
only complements of {idx}, but it is required by the abstract class.
"""
return 1 if k == n - 1 else 0
return other if k == n - 1 else 0
5 changes: 3 additions & 2 deletions src/pydvl/valuation/methods/msr_banzhaf.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,8 +66,9 @@ def __init__(
progress=progress,
)

def coefficient(self, n: int, k: int) -> float:
return 1.0
def coefficient(self, n: int, k: int, other: float) -> float:
# Coefficient is 1.0 for all n and k
return other

def fit(self, data: Dataset) -> Self:
"""Calculate the MSR Banzhaf valuation on a dataset.
Expand Down
5 changes: 3 additions & 2 deletions src/pydvl/valuation/methods/owen_shapley.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,5 +76,6 @@ def fit(self, dataset: Dataset) -> Self:
self.result._status = Status.Converged
return self

def coefficient(self, n: int, k: int) -> float:
return 1
def coefficient(self, n: int, k: int, other: float) -> float:
# Coefficient is 1.0 for all n and k
return other
13 changes: 11 additions & 2 deletions src/pydvl/valuation/methods/semivalue.py
Original file line number Diff line number Diff line change
Expand Up @@ -84,12 +84,21 @@ def __init__(
self.tqdm_args.update(progress if isinstance(progress, dict) else {})

@abstractmethod
def coefficient(self, n: int, k: int) -> float:
"""Computes the coefficient for a given subset size.
def coefficient(self, n: int, k: int, other: float) -> float:
"""Returns the final coefficient to be used in the semi-value valuation, i.e.
the semi-value coefficient already combined with the other factors.
The semi-value coefficient is a function of the number of elements in the set,
and the size of the subset for which the coefficient is being computed.
Coefficients can be very large or very small, so that simply multiplying them
with the rest of the factors in a semi-value computation can lead to overflow or
underflow. To avoid this, we pass the other factors to this method, and delegate
the choice of whether to multiply or divide to the implementation.
Args:
n: Total number of elements in the set.
k: Size of the subset for which the coefficient is being computed.
other: The other factors in the computation.
"""
...

Expand Down
6 changes: 3 additions & 3 deletions src/pydvl/valuation/samplers/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -185,7 +185,7 @@ def weight(n: int, subset_len: int) -> float:
def make_strategy(
self,
utility: UtilityBase,
coefficient: Callable[[int, int], float] | None = None,
coefficient: Callable[[int, int, float], float] | None = None,
) -> EvaluationStrategy:
"""Returns the strategy for this sampler."""
... # return SomeEvaluationStrategy(self)
Expand Down Expand Up @@ -242,7 +242,7 @@ def __init__(
self,
sampler: SamplerT,
utility: UtilityBase,
coefficient: Callable[[int, int], float] | None = None,
coefficient: Callable[[int, int, float], float] | None = None,
):
self.utility = utility
self.n_indices = (
Expand All @@ -256,7 +256,7 @@ def __init__(
if coefficient is not None:

def coefficient_fun(n: int, subset_len: int) -> float:
return sampler.weight(n, subset_len) * coefficient(n, subset_len)
return coefficient(n, subset_len, sampler.weight(n, subset_len))

self.coefficient = coefficient_fun
else:
Expand Down
2 changes: 1 addition & 1 deletion src/pydvl/valuation/samplers/classwise.py
Original file line number Diff line number Diff line change
Expand Up @@ -179,6 +179,6 @@ def sample_limit(self, indices: IndexSetT) -> int:
def make_strategy(
self,
utility: UtilityBase,
coefficient: Callable[[int, int], float] | None = None,
coefficient: Callable[[int, int, float], float] | None = None,
) -> EvaluationStrategy[IndexSampler, ValueUpdate]:
return self.in_class.make_strategy(utility, coefficient)
7 changes: 3 additions & 4 deletions src/pydvl/valuation/samplers/msr.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@ def weight(n: int, subset_len: int) -> float:
def make_strategy(
self,
utility: UtilityBase,
coefficient: Callable[[int, int], float] | None = None,
coefficient: Callable[[int, int, float], float] | None = None,
) -> MSREvaluationStrategy:
return MSREvaluationStrategy(self, utility, coefficient)

Expand All @@ -64,10 +64,9 @@ class MSREvaluationStrategy(EvaluationStrategy[SamplerT, MSRValueUpdate]):
The MSR evaluation strategy makes one utility evaluation per sample but generates
`n_indices` many updates from it. The updates will be used to update two running
means that will later be combined into on final value. We send the
`ValueUpdate.kind` field to `ValueUpdateKind.POSITVE` or `ValueUpdateKind.NEGATIVE`
means that will later be combined into a final value. We send the
`ValueUpdate.kind` field to `ValueUpdateKind.POSITIVE` or `ValueUpdateKind.NEGATIVE`
to decide which of the two running means is going to be updated.
"""

def process(
Expand Down
9 changes: 3 additions & 6 deletions src/pydvl/valuation/samplers/permutation.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@
import math
from copy import copy
from itertools import permutations
from typing import Callable, cast
from typing import Callable

import numpy as np

Expand All @@ -32,7 +32,6 @@
from pydvl.valuation.samplers.utils import StochasticSamplerMixin
from pydvl.valuation.types import (
IndexSetT,
IndexT,
NullaryPredicate,
Sample,
SampleBatch,
Expand Down Expand Up @@ -98,7 +97,7 @@ def weight(n: int, subset_len: int) -> float:
def make_strategy(
self,
utility: UtilityBase,
coefficient: Callable[[int, int], float] | None = None,
coefficient: Callable[[int, int, float], float] | None = None,
) -> PermutationEvaluationStrategy:
return PermutationEvaluationStrategy(self, utility, coefficient)

Expand Down Expand Up @@ -152,7 +151,7 @@ def __init__(
self,
sampler: PermutationSampler,
utility: UtilityBase,
coefficient: Callable[[int, int], float] | None = None,
coefficient: Callable[[int, int, float], float] | None = None,
):
super().__init__(sampler, utility, coefficient)
self.truncation = copy(sampler.truncation)
Expand All @@ -168,8 +167,6 @@ def process(
curr = prev = self.utility(None)
permutation = sample.subset
for i, idx in enumerate(permutation):
# FIXME: type checker claims this could be Any (?)
idx = cast(IndexT, idx)
if not truncated:
new_sample = sample.with_idx(idx).with_subset(permutation[: i + 1])
curr = self.utility(new_sample)
Expand Down
6 changes: 3 additions & 3 deletions src/pydvl/valuation/samplers/powerset.py
Original file line number Diff line number Diff line change
Expand Up @@ -179,7 +179,7 @@ def index_iterator(
def make_strategy(
self,
utility: UtilityBase,
coefficient: Callable[[int, int], float] | None = None,
coefficient: Callable[[int, int, float], float] | None = None,
) -> PowersetEvaluationStrategy:
return PowersetEvaluationStrategy(self, utility, coefficient)

Expand Down Expand Up @@ -236,7 +236,7 @@ def weight(n: int, subset_len: int) -> float:
def make_strategy(
self,
utility: UtilityBase,
coefficient: Callable[[int, int], float] | None = None,
coefficient: Callable[[int, int, float], float] | None = None,
) -> EvaluationStrategy:
return LOOEvaluationStrategy(self, utility, coefficient)

Expand All @@ -251,7 +251,7 @@ def __init__(
self,
sampler: LOOSampler,
utility: UtilityBase,
coefficient: Callable[[int, int], float] | None = None,
coefficient: Callable[[int, int, float], float] | None = None,
):
super().__init__(sampler, utility, coefficient)
assert utility.training_data is not None
Expand Down
2 changes: 1 addition & 1 deletion tests/valuation/methods/test_semivalues.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,7 +85,7 @@ def test_coefficients(n, valuation_class, kwargs):
)

s = [
math.comb(n - 1, j - 1) * valuation.coefficient(n, j - 1)
valuation.coefficient(n, j - 1, math.comb(n - 1, j - 1))
for j in range(1, n + 1)
]
assert np.isclose(1, np.sum(s))
Expand Down

0 comments on commit f91c6db

Please sign in to comment.