Merge pull request #632 from aai-institute/fix/knn-parallelization

Improvements to knnshap parallelization
aai-institute · Jan 13, 2025 · ace6b38 · ace6b38
2 parents 4c88278 + da14832
commit ace6b38
Show file tree

Hide file tree

Showing 3 changed files with 81 additions and 49 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -4,16 +4,18 @@
 
 ### Added
 
-- Refactor Classwise Shapley valuation with the interfaces and sampler architecture [PR #616](https://github.com/aai-institute/pyDVL/pull/616).
-- Refactoring KNN Shapley values with the new sampler architecture [PR #610](https://github.com/aai-institute/pyDVL/pull/610).
-- Refactoring MSR Banzhaf semivalues with the new sampler architecture.
+- Refactor Classwise Shapley valuation with the interfaces and sampler
+  architecture [PR #616](https://github.com/aai-institute/pyDVL/pull/616)
+- Refactor KNN Shapley values with the new sampler architecture
+  [PR #610](https://github.com/aai-institute/pyDVL/pull/610)
+- Refactor MSR Banzhaf semivalues with the new sampler architecture
   [PR #605](https://github.com/aai-institute/pyDVL/pull/605)
-- Refactoring group-testing shapley values with new sampler architecture
+- Refactor group-testing Shapley values with the new sampler architecture
   [PR #602](https://github.com/aai-institute/pyDVL/pull/602)
-- Refactoring of least-core data valuation methods with more supported sampling methods
-  and consistent interface.
+- Refactor least-core data valuation methods to support more sampling
+  methods and provide a consistent interface
   [PR #580](https://github.com/aai-institute/pyDVL/pull/580)
-- Refactoring of owen shapley valuation with new sampler architecture
+- Refactor Owen-Shapley valuation with new sampler architecture
   [PR #597](https://github.com/aai-institute/pyDVL/pull/597)
 - New method `InverseHarmonicMeanInfluence`, implementation for the paper
   `DataInf: Efficiently Estimating Data Influence in LoRA-tuned LLMs and
@@ -53,6 +55,8 @@
 
 ### Changed
 
+- Distribute test points evenly across parallel jobs in `KNNShapleyValuation`.
+  Raise a `TypeError` for `GroupedDataset` [PR #632](https://github.com/aai-institute/pyDVL/pull/632)
 - Introduced the concept of logical vs data indices for `Dataset`, and
   `GroupedDataset`, fixing inconsistencies in how the latter operates on indices.
   Also, both now return objects of the same type when slicing.

diff --git a/src/pydvl/valuation/methods/knn_shapley.py b/src/pydvl/valuation/methods/knn_shapley.py
@@ -16,16 +16,19 @@
 
 from __future__ import annotations
 
+from typing import cast
+
 import numpy as np
-from joblib import Parallel, delayed
+from joblib.parallel import Parallel, delayed, get_active_backend
+from more_itertools import chunked
 from numpy.typing import NDArray
 from sklearn.neighbors import NearestNeighbors
 from tqdm.auto import tqdm
 from typing_extensions import Self
 
 from pydvl.utils.status import Status
 from pydvl.valuation.base import Valuation
-from pydvl.valuation.dataset import Dataset
+from pydvl.valuation.dataset import Dataset, GroupedDataset
 from pydvl.valuation.result import ValuationResult
 from pydvl.valuation.utility import KNNClassifierUtility
 
@@ -65,7 +68,7 @@ def fit(self, data: Dataset) -> Self:
         calculates the Shapley values directly.
 
         In contrast to other data valuation models, the runtime increases linearly
-        with the size of the test dataset.
+        with the size of the dataset.
 
         Calculating the KNN valuation is a computationally expensive task that
         can be parallelized. To do so, call the `fit()` method inside a
@@ -79,29 +82,33 @@ def fit(self, data: Dataset) -> Self:
         ```
 
         """
-        self.helper_model = self.helper_model.fit(data.data().x)
-        n_obs = len(data.data().x)
-        n_test = len(self.utility.test_data)
 
-        generator = zip(
-            self.utility.test_data.data().x, self.utility.test_data.data().y
-        )
+        if isinstance(data, GroupedDataset):
+            raise TypeError("GroupedDataset is not supported by KNNShapleyValuation")
 
+        x_train, y_train = data.data()
+        self.helper_model = self.helper_model.fit(x_train)
+        n_test = len(self.utility.test_data)
+
+        _, n_jobs = get_active_backend()
+        batch_size = (n_test // n_jobs) + (1 if n_test % n_jobs else 0)
+        x_test, y_test = self.utility.test_data.data()
+        generator = zip(chunked(x_test, batch_size), chunked(y_test, batch_size))
         generator_with_progress = tqdm(
             generator,
             total=n_test,
             disable=not self.progress,
             position=0,
         )
 
-        with Parallel(return_as="generator") as parallel:
+        with Parallel(return_as="generator_unordered") as parallel:
             results = parallel(
-                delayed(self._compute_values_for_one_test_point)(
-                    self.helper_model, x, y, data.data().y
+                delayed(self._compute_values_for_test_points)(
+                    self.helper_model, np.array(x_test), np.array(y_test), y_train
                 )
-                for x, y in generator_with_progress
+                for x_test, y_test in generator_with_progress
             )
-            values = np.zeros(n_obs)
+            values = np.zeros(len(data))
             for res in results:
                 values += res
             values /= n_test
@@ -117,43 +124,50 @@ def fit(self, data: Dataset) -> Self:
         return self
 
     @staticmethod
-    def _compute_values_for_one_test_point(
-        helper_model: NearestNeighbors, x: NDArray, y: int, y_train: NDArray
-    ) -> np.ndarray:
-        """Compute the Shapley value for a single test data point.
+    def _compute_values_for_test_points(
+        helper_model: NearestNeighbors,
+        x_test: NDArray,
+        y_test: NDArray,
+        y_train: NDArray,
+    ) -> NDArray[np.float64]:
+        """Compute the Shapley value using a set of test points.
 
-        The shapley values of the whole test set are the average of the shapley values
-        of the single test data points.
+        The Shapley value for a training point is computed over the whole test set by
+        averaging the Shapley values of the single test data points.
 
         Args:
             helper_model: A fitted NearestNeighbors model.
-            x: A single test data point.
-            y: The correct label of the test data point.
-            y_train: The training labels.
+            x_test: The test data points.
+            y_test: The test labels.
+            y_train: The labels for the training points to be valued.
 
         Returns:
-            The Shapley values for the test data point.
+            The Shapley values of the training points, summed over the given
+            test points. The caller averages them over the whole test set.
 
         """
         n_obs = len(y_train)
         n_neighbors = helper_model.get_params()["n_neighbors"]
 
-        # sorts data indices from close to far
+        # sort data indices from close to far
         sorted_indices = helper_model.kneighbors(
-            x.reshape(1, -1), n_neighbors=n_obs, return_distance=False
-        )[0]
-
-        values = np.zeros(n_obs)
-
-        idx = sorted_indices[-1]
-        values[idx] = float(y_train[idx] == y) / n_obs
-        # reverse range because we want to go from far to close
-        for i in range(n_obs - 1, 0, -1):
-            prev_idx = sorted_indices[i]
-            idx = sorted_indices[i - 1]
-            values[idx] = values[prev_idx]
-            values[idx] += (int(y_train[idx] == y) - int(y_train[prev_idx] == y)) / max(
-                n_neighbors, i
-            )
+            x_test, n_neighbors=n_obs, return_distance=False
+        )
 
-        return values
+        values = np.zeros(shape=(len(x_test), n_obs))
+
+        for query, neighbors in enumerate(sorted_indices):
+            label = y_test[query]
+            # Initialize the farthest neighbor's value
+            idx = neighbors[-1]
+            values[query][idx] = float(y_train[idx] == label) / n_obs
+            # reverse range because we want to go from far to close
+            for i in range(n_obs - 1, 0, -1):
+                prev_idx = neighbors[i]
+                idx = neighbors[i - 1]
+                values[query][idx] = values[query][prev_idx]
+                values[query][idx] += (
+                    int(y_train[idx] == label) - int(y_train[prev_idx] == label)
+                ) / max(n_neighbors, i)
+                # 1/max(K, i) = 1/K * min{K, i}/i as in the paper
+
+        return cast(NDArray[np.float64], values.sum(axis=0))
diff --git a/tests/valuation/methods/test_knn_shapley.py b/tests/valuation/methods/test_knn_shapley.py
@@ -4,7 +4,7 @@
 from sklearn import datasets
 from sklearn.neighbors import KNeighborsClassifier
 
-from pydvl.valuation.dataset import Dataset
+from pydvl.valuation.dataset import Dataset, GroupedDataset
 from pydvl.valuation.methods import DataShapleyValuation, KNNShapleyValuation
 from pydvl.valuation.samplers import PermutationSampler
 from pydvl.valuation.stopping import MinUpdates
@@ -49,3 +49,17 @@ def test_against_montecarlo(n_jobs, data, montecarlo_results):
     np.testing.assert_allclose(
         results.values, montecarlo_results.values, atol=1e-2, rtol=1e-2
     )
+
+
+def test_unsupported_grouped_dataset(data):
+    train, test = data
+    data_groups = np.zeros(len(train))
+    grouped = GroupedDataset.from_dataset(train, data_groups)
+
+    utility = KNNClassifierUtility(
+        model=KNeighborsClassifier(n_neighbors=1), test_data=test
+    )
+    valuation = KNNShapleyValuation(utility, progress=False)
+
+    with pytest.raises(TypeError, match="GroupedDataset is not supported"):
+        valuation.fit(grouped)