diff --git a/src/pydvl/influence/base_influence_function_model.py b/src/pydvl/influence/base_influence_function_model.py
index b6854c250..5077f8d88 100644
--- a/src/pydvl/influence/base_influence_function_model.py
+++ b/src/pydvl/influence/base_influence_function_model.py
@@ -217,14 +217,6 @@ class ComposableInfluence(
 
     block_mapper: BlockMapperType
 
-    @property
-    def n_parameters(self):
-        return super().n_parameters()
-
-    @property
-    def is_thread_safe(self) -> bool:
-        return False
-
     @property
     def is_fitted(self):
         try:
diff --git a/src/pydvl/influence/torch/influence_function_model.py b/src/pydvl/influence/torch/influence_function_model.py
index 1114a641e..4fd89a9e6 100644
--- a/src/pydvl/influence/torch/influence_function_model.py
+++ b/src/pydvl/influence/torch/influence_function_model.py
@@ -1812,6 +1812,19 @@ class TorchOperatorGradientComposition(
         torch.Tensor, TorchBatch, TorchOperator, TorchPerSampleGradientProvider
     ]
 ):
+    """
+    Represents a composable block that integrates a [TorchOperator]
+    [pydvl.influence.torch.operator.base.TorchOperator] and
+    a [TorchPerSampleGradientProvider]
+    [pydvl.influence.torch.operator.gradient_provider.TorchPerSampleGradientProvider].
+
+    This block is designed to be flexible, handling different computational modes via
+    an abstract operator and gradient provider.
+    """
+
+    def __init__(self, op: TorchOperator, gp: TorchPerSampleGradientProvider):
+        super().__init__(op, gp)
+
     def to(self, device: torch.device):
         self.gp = self.gp.to(device)
         self.op = self.op.to(device)
@@ -1821,6 +1834,20 @@ def to(self, device: torch.device):
 class TorchBlockMapper(
     BlockMapper[torch.Tensor, TorchBatch, TorchOperatorGradientComposition]
 ):
+    """
+    Class for mapping operations across multiple compositional blocks represented by
+    instances of [TorchOperatorGradientComposition]
+    [pydvl.influence.torch.influence_function_model.TorchOperatorGradientComposition].
+
+    This class takes a dictionary of compositional blocks, applies their methods to
+    batches or tensors, and aggregates the results.
+    """
+
+    def __init__(
+        self, composable_block_dict: OrderedDict[str, TorchOperatorGradientComposition]
+    ):
+        super().__init__(composable_block_dict)
+
     def _split_to_blocks(
         self, z: torch.Tensor, dim: int = -1
     ) -> OrderedDict[str, torch.Tensor]:
@@ -1844,6 +1871,7 @@ def to(self, device: torch.device):
 class TorchComposableInfluence(
     ComposableInfluence[torch.Tensor, TorchBatch, DataLoader, TorchBlockMapper],
     ModelInfoMixin,
+    ABC,
 ):
     def __init__(
         self,
@@ -1949,6 +1977,14 @@ def __init__(
         self.gradient_provider_factory = TorchPerSampleAutoGrad
         self.loss = loss
 
+    @property
+    def n_parameters(self):
+        return super().n_parameters()
+
+    @property
+    def is_thread_safe(self) -> bool:
+        return False
+
     @staticmethod
     def _validate_regularization(
         block_name: str, value: Optional[float]
diff --git a/src/pydvl/influence/torch/operator/bilinear_form.py b/src/pydvl/influence/torch/operator/bilinear_form.py
index 4e7cf94a2..0fbacd718 100644
--- a/src/pydvl/influence/torch/operator/bilinear_form.py
+++ b/src/pydvl/influence/torch/operator/bilinear_form.py
@@ -1,4 +1,4 @@
-from typing import TYPE_CHECKING, Optional, cast
+from typing import TYPE_CHECKING, Optional
 
 import torch
 
@@ -13,6 +13,15 @@
 class OperatorBilinearForm(
     BilinearForm[torch.Tensor, TorchBatch, TorchPerSampleGradientProvider]
 ):
+    r"""
+    Base class for bilinear forms based on an instance of
+    [TorchOperator][pydvl.influence.torch.operator.base.TorchOperator].
+    This means it computes weighted inner products of the form:
+
+    $$ \langle \operatorname{Op}(x), y \rangle $$
+
+    """
+
     def __init__(
         self,
         operator: "TorchOperator",
@@ -22,6 +31,19 @@ def __init__(
     def inner_product(
         self, left: torch.Tensor, right: Optional[torch.Tensor]
     ) -> torch.Tensor:
+        r"""
+        Computes the weighted inner product of two vectors, i.e.
+
+        $$ \langle x, y \rangle_{B} = \langle \operatorname{Op}(x), y \rangle $$
+
+        Args:
+            left: The first tensor in the inner product computation.
+            right: The second tensor, optional; if not provided, the inner product will
+                use the `left` tensor for both arguments.
+
+        Returns:
+            A tensor representing the inner product.
+        """
         if right is None:
             right = left
         if left.shape[0] <= right.shape[0]:
diff --git a/src/pydvl/influence/torch/operator/gradient_provider.py b/src/pydvl/influence/torch/operator/gradient_provider.py
index 76ca84010..f3cf8e554 100644
--- a/src/pydvl/influence/torch/operator/gradient_provider.py
+++ b/src/pydvl/influence/torch/operator/gradient_provider.py
@@ -22,6 +22,29 @@
 class TorchPerSampleGradientProvider(
     PerSampleGradientProvider[TorchBatch, torch.Tensor], ABC
 ):
+    r"""
+    Abstract base class for calculating per-sample gradients of a function defined by
+    a [torch.nn.Module][torch.nn.Module] and a loss function.
+
+    This class must be subclassed with implementations for its abstract methods tailored
+    to specific gradient computation needs, e.g. using [torch.autograd][torch.autograd]
+    or stochastic finite differences.
+
+    Consider a function
+
+    $$ \ell: \mathbb{R}^{d_1} \times \mathbb{R}^{d_2} \times \mathbb{R}^{n} \times
+    \mathbb{R}^{n} \to \mathbb{R}, \quad \ell(\omega_1, \omega_2, x, y) =
+    \operatorname{loss}(f(\omega_1, \omega_2; x), y) $$
+
+    e.g. a two-layer neural network $f$ with a loss function. This object should then
+    compute the expressions:
+
+    $$ \nabla_{\omega_{i}}\ell(\omega_1, \omega_2, x, y),
+    \nabla_{\omega_{i}}\nabla_{x}\ell(\omega_1, \omega_2, x, y),
+    \nabla_{\omega}\ell(\omega_1, \omega_2, x, y) \cdot v$$
+
+    """
+
     def __init__(
         self,
         model: torch.nn.Module,
@@ -76,12 +99,52 @@ def _detach_dict(tensor_dict: Dict[str, torch.Tensor]) -> Dict[str, torch.Tensor]
         return {k: g.detach() if g.requires_grad else g for k, g in tensor_dict.items()}
 
     def per_sample_gradient_dict(self, batch: TorchBatch) -> Dict[str, torch.Tensor]:
+        r"""
+        Computes and returns a dictionary mapping gradient names to their respective
+        per-sample gradients. Given the example in the class docstring, this means
+
+        $$ \text{result}[\omega_i] = \nabla_{\omega_{i}}\ell(\omega_1, \omega_2,
+        \text{batch.x}, \text{batch.y}), $$
+
+        where the first dimension of the resulting tensors is always considered to be
+        the batch dimension, so the shapes of the resulting tensors are $(N, d_i)$,
+        where $N$ is the number of samples in the batch.
+
+        Args:
+            batch: The batch of data for which to compute gradients.
+
+        Returns:
+            A dictionary where keys are gradient identifiers and values are the
+            gradients computed per sample.
+        """
         gradient_dict = self._per_sample_gradient_dict(batch.to(self.device))
         return self._detach_dict(gradient_dict)
 
     def per_sample_mixed_gradient_dict(
         self, batch: TorchBatch
     ) -> Dict[str, torch.Tensor]:
+        r"""
+        Computes and returns a dictionary mapping gradient names to their respective
+        per-sample mixed gradients. In this context, mixed gradients refer to
+        gradients with respect to the model parameters that are additionally
+        differentiated with respect to the input batch.
+        Given the example in the class docstring, this means
+
+        $$ \text{result}[\omega_i] = \nabla_{\omega_{i}}\nabla_{x}\ell(\omega_1,
+        \omega_2, \text{batch.x}, \text{batch.y}), $$
+
+        where the first dimension of the resulting tensors is always considered to be
+        the batch dimension and the last to be the non-batch, input-related derivatives,
+        so the shapes of the resulting tensors are $(N, n, d_i)$,
+        where $N$ is the number of samples in the batch.
+
+        Args:
+            batch: The batch of data for which to compute mixed gradients.
+
+        Returns:
+            A dictionary where keys are gradient identifiers and values are the
+            mixed gradients computed per sample.
+        """
         gradient_dict = self._per_sample_mixed_gradient_dict(batch.to(self.device))
         return self._detach_dict(gradient_dict)
 
@@ -90,6 +153,26 @@ def matrix_jacobian_product(
         batch: TorchBatch,
         g: torch.Tensor,
     ) -> torch.Tensor:
+        r"""
+        Computes the matrix-Jacobian product for the provided batch and input tensor.
+        Given the example in the class docstring, this means
+
+        $$ (\nabla_{\omega_{1}}\ell(\omega_1, \omega_2,
+        \text{batch.x}, \text{batch.y}),
+        \nabla_{\omega_{2}}\ell(\omega_1, \omega_2,
+        \text{batch.x}, \text{batch.y})) \cdot g^T$$
+
+        where $g$ must be a tensor of shape $(K, d_1+d_2)$, so the resulting tensor
+        is of shape $(N, K)$.
+
+        Args:
+            batch: The batch of data for which to compute the Jacobian.
+            g: The tensor to be used in the matrix-Jacobian product
+                calculation.
+
+        Returns:
+            The resulting tensor from the matrix-Jacobian product computation.
+        """
         result = self._matrix_jacobian_product(batch.to(self.device), g.to(self.device))
         if result.requires_grad:
             result = result.detach()
@@ -108,6 +191,26 @@ def per_sample_flat_mixed_gradient(self, batch: TorchBatch) -> torch.Tensor:
 
 
 class TorchPerSampleAutoGrad(TorchPerSampleGradientProvider):
+    r"""
+    Computes per-sample gradients of a function defined by
+    a [torch.nn.Module][torch.nn.Module] and a loss function using
+    [torch.func][torch.func].
+
+    Consider a function
+
+    $$ \ell: \mathbb{R}^{d_1} \times \mathbb{R}^{d_2} \times \mathbb{R}^{n} \times
+    \mathbb{R}^{n} \to \mathbb{R}, \quad \ell(\omega_1, \omega_2, x, y) =
+    \operatorname{loss}(f(\omega_1, \omega_2; x), y) $$
+
+    e.g. a two-layer neural network $f$ with a loss function. This object should then
+    compute the expressions:
+
+    $$ \nabla_{\omega_{i}}\ell(\omega_1, \omega_2, x, y),
+    \nabla_{\omega_{i}}\nabla_{x}\ell(\omega_1, \omega_2, x, y),
+    \nabla_{\omega}\ell(\omega_1, \omega_2, x, y) \cdot v$$
+
+    """
+
     def __init__(
         self,
         model: torch.nn.Module,
diff --git a/src/pydvl/influence/types.py b/src/pydvl/influence/types.py
index e9adafc1a..b43ddcb4a 100644
--- a/src/pydvl/influence/types.py
+++ b/src/pydvl/influence/types.py
@@ -230,7 +230,6 @@ def inner_product(
         Returns:
             A tensor representing the inner product.
         """
-        pass
 
     def gradient_inner_product(
         self,
@@ -359,7 +358,7 @@ class OperatorGradientComposition(
 ):
     """
     Generic base class representing a composable block that integrates an operator and
-    a gradient provider to compute influences between batches of data.
+    a gradient provider to compute interactions between batches of data.
 
     This block is designed to be flexible, handling different computational modes via
     an abstract operator and gradient provider.
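
Editorial note, not part of the patch: the sketch below illustrates the kind of per-sample gradient computation that the TorchPerSampleAutoGrad docstring above describes, written directly against torch.func. The model, loss, and data are made-up placeholders, and this is not the library's own API.

    import torch
    from torch.func import functional_call, grad, vmap

    # Placeholder model f(omega; x) and loss; any nn.Module with a scalar loss works.
    model = torch.nn.Linear(5, 1)
    params = {k: v.detach() for k, v in model.named_parameters()}

    def loss(p, x, y):
        # ell(omega, x, y) = loss(f(omega; x), y) for a single sample
        return torch.nn.functional.mse_loss(functional_call(model, p, (x,)), y)

    x, y = torch.randn(8, 5), torch.randn(8, 1)  # a batch of N = 8 samples
    # vmap over the batch dimension yields per-sample gradients with a leading N dimension
    per_sample_grads = vmap(grad(loss), in_dims=(None, 0, 0))(params, x, y)
    print({k: v.shape for k, v in per_sample_grads.items()})  # weight: (8, 1, 5), bias: (8, 1)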