Merge remote-tracking branch 'origin/develop' into develop

aai-institute · Jan 16, 2025 · f91c6db · f91c6db
2 parents 02acb0f + 3c54ff5
commit f91c6db
Show file tree

Hide file tree

Showing 16 changed files with 46 additions and 36 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -59,6 +59,9 @@
 
 ### Changed
 
+- Changed the way semi-value coefficients are composed with sampler weights in
+  order to avoid `OverflowError` for very small or large values
+  [PR #639](https://github.com/aai-institute/pyDVL/pull/639)
 - Uniformly distribute test points across processes for KNNShapley. Fail for
   `GroupedDataset` [PR #632](https://github.com/aai-institute/pyDVL/pull/632)
 - Introduced the concept of logical vs data indices for `Dataset`, and

diff --git a/src/pydvl/valuation/methods/beta_shapley.py b/src/pydvl/valuation/methods/beta_shapley.py
@@ -29,8 +29,8 @@ def __init__(
         self.beta = beta
         self.const = sp.special.beta(alpha, beta)
 
-    def coefficient(self, n: int, k: int) -> float:
+    def coefficient(self, n: int, k: int, other: float) -> float:
         j = k + 1
         w = sp.special.beta(j + self.beta - 1, n - j + self.alpha) / self.const
-        # return math.comb(n - 1, j - 1) * w * n
-        return float(w)
+        # return math.comb(n - 1, j - 1) * w * n * other
+        return float(w) * other
diff --git a/src/pydvl/valuation/methods/data_banzhaf.py b/src/pydvl/valuation/methods/data_banzhaf.py
@@ -35,5 +35,5 @@ class DataBanzhafValuation(SemivalueValuation):
 
     algorithm_name = "Data-Banzhaf"
 
-    def coefficient(self, n: int, k: int) -> float:
-        return float(1 / 2 ** (n - 1))
+    def coefficient(self, n: int, k: int, other: float) -> float:
+        return float(other / 2 ** (n - 1))
diff --git a/src/pydvl/valuation/methods/data_shapley.py b/src/pydvl/valuation/methods/data_shapley.py
@@ -10,5 +10,5 @@ class DataShapleyValuation(SemivalueValuation):
 
     algorithm_name = "Data-Shapley"
 
-    def coefficient(self, n: int, k: int) -> float:
-        return float(1 / math.comb(n - 1, k) / n)
+    def coefficient(self, n: int, k: int, other: float) -> float:
+        return other / math.comb(n - 1, k) / n
diff --git a/src/pydvl/valuation/methods/delta_shapley.py b/src/pydvl/valuation/methods/delta_shapley.py
@@ -38,5 +38,5 @@ def __init__(
         )
         super().__init__(utility, sampler, is_done, progress=progress)
 
-    def coefficient(self, n: int, k: int) -> float:
-        return float(1 / math.comb(n, k))
+    def coefficient(self, n: int, k: int, other: float) -> float:
+        return other / math.comb(n, k)
diff --git a/src/pydvl/valuation/methods/gt_shapley.py b/src/pydvl/valuation/methods/gt_shapley.py
@@ -222,7 +222,7 @@ def weight(n: int, subset_len: int) -> float:
     def make_strategy(
         self,
         utility: UtilityBase,
-        coefficient: Callable[[int, int], float] | None = None,
+        coefficient: Callable[[int, int, float], float] | None = None,
     ) -> EvaluationStrategy:
         raise NotImplementedError("This is not a semi-value sampler.")
 

diff --git a/src/pydvl/valuation/methods/loo.py b/src/pydvl/valuation/methods/loo.py
@@ -43,9 +43,9 @@ def __init__(self, utility: UtilityBase, progress: bool = False):
             progress=progress,
         )
 
-    def coefficient(self, n: int, k: int) -> float:
+    def coefficient(self, n: int, k: int, other: float) -> float:
         """
         This is never actually used to filter out sets, because the LOOSampler returns
         only complements of {idx}, but it is required by the abstract class.
         """
-        return 1 if k == n - 1 else 0
+        return other if k == n - 1 else 0
diff --git a/src/pydvl/valuation/methods/msr_banzhaf.py b/src/pydvl/valuation/methods/msr_banzhaf.py
@@ -66,8 +66,9 @@ def __init__(
             progress=progress,
         )
 
-    def coefficient(self, n: int, k: int) -> float:
-        return 1.0
+    def coefficient(self, n: int, k: int, other: float) -> float:
+        # Coefficient is 1.0 for all n and k
+        return other
 
     def fit(self, data: Dataset) -> Self:
         """Calculate the MSR Banzhaf valuation on a dataset.

diff --git a/src/pydvl/valuation/methods/owen_shapley.py b/src/pydvl/valuation/methods/owen_shapley.py
@@ -76,5 +76,6 @@ def fit(self, dataset: Dataset) -> Self:
             self.result._status = Status.Converged
         return self
 
-    def coefficient(self, n: int, k: int) -> float:
-        return 1
+    def coefficient(self, n: int, k: int, other: float) -> float:
+        # Coefficient is 1.0 for all n and k
+        return other
diff --git a/src/pydvl/valuation/methods/semivalue.py b/src/pydvl/valuation/methods/semivalue.py
@@ -84,12 +84,21 @@ def __init__(
             self.tqdm_args.update(progress if isinstance(progress, dict) else {})
 
     @abstractmethod
-    def coefficient(self, n: int, k: int) -> float:
-        """Computes the coefficient for a given subset size.
+    def coefficient(self, n: int, k: int, other: float) -> float:
+        """Returns the final coefficient to be used in the semi-value valuation,
+        already combined with the other factors of the computation.
+
+        The semi-value coefficient is a function of the number of elements in the set,
+        and the size of the subset for which the coefficient is being computed.
+        Coefficients can be very large or very small, so that simply multiplying them
+        with the rest of the factors in a semi-value computation can lead to overflow or
+        underflow. To avoid this, we pass the other factors to this method, and delegate
+        the choice of whether to multiply or divide to the implementation.
 
         Args:
             n: Total number of elements in the set.
             k: Size of the subset for which the coefficient is being computed
+            other: The other factors in the computation.
         """
         ...
 

diff --git a/src/pydvl/valuation/samplers/base.py b/src/pydvl/valuation/samplers/base.py
@@ -185,7 +185,7 @@ def weight(n: int, subset_len: int) -> float:
     def make_strategy(
         self,
         utility: UtilityBase,
-        coefficient: Callable[[int, int], float] | None = None,
+        coefficient: Callable[[int, int, float], float] | None = None,
     ) -> EvaluationStrategy:
         """Returns the strategy for this sampler."""
         ...  # return SomeEvaluationStrategy(self)
@@ -242,7 +242,7 @@ def __init__(
         self,
         sampler: SamplerT,
         utility: UtilityBase,
-        coefficient: Callable[[int, int], float] | None = None,
+        coefficient: Callable[[int, int, float], float] | None = None,
     ):
         self.utility = utility
         self.n_indices = (
@@ -256,7 +256,7 @@ def __init__(
             if coefficient is not None:
 
                 def coefficient_fun(n: int, subset_len: int) -> float:
-                    return sampler.weight(n, subset_len) * coefficient(n, subset_len)
+                    return coefficient(n, subset_len, sampler.weight(n, subset_len))
 
                 self.coefficient = coefficient_fun
             else:

diff --git a/src/pydvl/valuation/samplers/classwise.py b/src/pydvl/valuation/samplers/classwise.py
@@ -179,6 +179,6 @@ def sample_limit(self, indices: IndexSetT) -> int:
     def make_strategy(
         self,
         utility: UtilityBase,
-        coefficient: Callable[[int, int], float] | None = None,
+        coefficient: Callable[[int, int, float], float] | None = None,
     ) -> EvaluationStrategy[IndexSampler, ValueUpdate]:
         return self.in_class.make_strategy(utility, coefficient)
diff --git a/src/pydvl/valuation/samplers/msr.py b/src/pydvl/valuation/samplers/msr.py
@@ -54,7 +54,7 @@ def weight(n: int, subset_len: int) -> float:
     def make_strategy(
         self,
         utility: UtilityBase,
-        coefficient: Callable[[int, int], float] | None = None,
+        coefficient: Callable[[int, int, float], float] | None = None,
     ) -> MSREvaluationStrategy:
         return MSREvaluationStrategy(self, utility, coefficient)
 
@@ -64,10 +64,9 @@ class MSREvaluationStrategy(EvaluationStrategy[SamplerT, MSRValueUpdate]):
 
     The MSR evaluation strategy makes one utility evaluation per sample but generates
     `n_indices` many updates from it. The updates will be used to update two running
-    means that will later be combined into on final value. We send the
-    `ValueUpdate.kind` field to `ValueUpdateKind.POSITVE` or `ValueUpdateKind.NEGATIVE`
+    means that will later be combined into a final value. We set the
+    `ValueUpdate.kind` field to `ValueUpdateKind.POSITIVE` or `ValueUpdateKind.NEGATIVE`
     to decide which of the two running means is going to be updated.
-
     """
 
     def process(

diff --git a/src/pydvl/valuation/samplers/permutation.py b/src/pydvl/valuation/samplers/permutation.py
@@ -22,7 +22,7 @@
 import math
 from copy import copy
 from itertools import permutations
-from typing import Callable, cast
+from typing import Callable
 
 import numpy as np
 
@@ -32,7 +32,6 @@
 from pydvl.valuation.samplers.utils import StochasticSamplerMixin
 from pydvl.valuation.types import (
     IndexSetT,
-    IndexT,
     NullaryPredicate,
     Sample,
     SampleBatch,
@@ -98,7 +97,7 @@ def weight(n: int, subset_len: int) -> float:
     def make_strategy(
         self,
         utility: UtilityBase,
-        coefficient: Callable[[int, int], float] | None = None,
+        coefficient: Callable[[int, int, float], float] | None = None,
     ) -> PermutationEvaluationStrategy:
         return PermutationEvaluationStrategy(self, utility, coefficient)
 
@@ -152,7 +151,7 @@ def __init__(
         self,
         sampler: PermutationSampler,
         utility: UtilityBase,
-        coefficient: Callable[[int, int], float] | None = None,
+        coefficient: Callable[[int, int, float], float] | None = None,
     ):
         super().__init__(sampler, utility, coefficient)
         self.truncation = copy(sampler.truncation)
@@ -168,8 +167,6 @@ def process(
             curr = prev = self.utility(None)
             permutation = sample.subset
             for i, idx in enumerate(permutation):
-                # FIXME: type checker claims this could be Any (?)
-                idx = cast(IndexT, idx)
                 if not truncated:
                     new_sample = sample.with_idx(idx).with_subset(permutation[: i + 1])
                     curr = self.utility(new_sample)

diff --git a/src/pydvl/valuation/samplers/powerset.py b/src/pydvl/valuation/samplers/powerset.py
@@ -179,7 +179,7 @@ def index_iterator(
     def make_strategy(
         self,
         utility: UtilityBase,
-        coefficient: Callable[[int, int], float] | None = None,
+        coefficient: Callable[[int, int, float], float] | None = None,
     ) -> PowersetEvaluationStrategy:
         return PowersetEvaluationStrategy(self, utility, coefficient)
 
@@ -236,7 +236,7 @@ def weight(n: int, subset_len: int) -> float:
     def make_strategy(
         self,
         utility: UtilityBase,
-        coefficient: Callable[[int, int], float] | None = None,
+        coefficient: Callable[[int, int, float], float] | None = None,
     ) -> EvaluationStrategy:
         return LOOEvaluationStrategy(self, utility, coefficient)
 
@@ -251,7 +251,7 @@ def __init__(
         self,
         sampler: LOOSampler,
         utility: UtilityBase,
-        coefficient: Callable[[int, int], float] | None = None,
+        coefficient: Callable[[int, int, float], float] | None = None,
     ):
         super().__init__(sampler, utility, coefficient)
         assert utility.training_data is not None

diff --git a/tests/valuation/methods/test_semivalues.py b/tests/valuation/methods/test_semivalues.py
@@ -85,7 +85,7 @@ def test_coefficients(n, valuation_class, kwargs):
     )
 
     s = [
-        math.comb(n - 1, j - 1) * valuation.coefficient(n, j - 1)
+        valuation.coefficient(n, j - 1, math.comb(n - 1, j - 1))
         for j in range(1, n + 1)
     ]
     assert np.isclose(1, np.sum(s))