diff --git a/src/pydvl/influence/base_influence_function_model.py b/src/pydvl/influence/base_influence_function_model.py
index b6854c250..5077f8d88 100644
--- a/src/pydvl/influence/base_influence_function_model.py
+++ b/src/pydvl/influence/base_influence_function_model.py
@@ -217,14 +217,6 @@ class ComposableInfluence(
 
     block_mapper: BlockMapperType
 
-    @property
-    def n_parameters(self):
-        return super().n_parameters()
-
-    @property
-    def is_thread_safe(self) -> bool:
-        return False
-
     @property
     def is_fitted(self):
         try:
diff --git a/src/pydvl/influence/torch/influence_function_model.py b/src/pydvl/influence/torch/influence_function_model.py
index 1114a641e..4fd89a9e6 100644
--- a/src/pydvl/influence/torch/influence_function_model.py
+++ b/src/pydvl/influence/torch/influence_function_model.py
@@ -1812,6 +1812,19 @@ class TorchOperatorGradientComposition(
         torch.Tensor, TorchBatch, TorchOperator, TorchPerSampleGradientProvider
     ]
 ):
+    """
+    Represents a composable block that integrates a [TorchOperator]
+    [pydvl.influence.torch.operator.base.TorchOperator] and
+    a [TorchPerSampleGradientProvider]
+    [pydvl.influence.torch.operator.gradient_provider.TorchPerSampleGradientProvider].
+
+    This block is designed to be flexible, handling different computational modes via
+    an abstract operator and gradient provider.
+    """
+
+    def __init__(self, op: TorchOperator, gp: TorchPerSampleGradientProvider):
+        super().__init__(op, gp)
+
     def to(self, device: torch.device):
         self.gp = self.gp.to(device)
         self.op = self.op.to(device)
@@ -1821,6 +1834,20 @@ def to(self, device: torch.device):
 class TorchBlockMapper(
     BlockMapper[torch.Tensor, TorchBatch, TorchOperatorGradientComposition]
 ):
+    """
+    Class for mapping operations across multiple compositional blocks represented by
+    instances of [TorchOperatorGradientComposition]
+    [pydvl.influence.torch.influence_function_model.TorchOperatorGradientComposition].
+
+    This class takes a dictionary of compositional blocks, applies their methods to
+    batches or tensors, and aggregates the results.
+    """
+
+    def __init__(
+        self, composable_block_dict: OrderedDict[str, TorchOperatorGradientComposition]
+    ):
+        super().__init__(composable_block_dict)
+
     def _split_to_blocks(
         self, z: torch.Tensor, dim: int = -1
     ) -> OrderedDict[str, torch.Tensor]:
@@ -1844,6 +1871,7 @@ def to(self, device: torch.device):
 class TorchComposableInfluence(
     ComposableInfluence[torch.Tensor, TorchBatch, DataLoader, TorchBlockMapper],
     ModelInfoMixin,
+    ABC,
 ):
     def __init__(
         self,
@@ -1949,6 +1977,14 @@ def __init__(
         self.gradient_provider_factory = TorchPerSampleAutoGrad
         self.loss = loss
 
+    @property
+    def n_parameters(self):
+        return super().n_parameters()
+
+    @property
+    def is_thread_safe(self) -> bool:
+        return False
+
     @staticmethod
     def _validate_regularization(
         block_name: str, value: Optional[float]
diff --git a/src/pydvl/influence/torch/operator/bilinear_form.py b/src/pydvl/influence/torch/operator/bilinear_form.py
index 4e7cf94a2..0fbacd718 100644
--- a/src/pydvl/influence/torch/operator/bilinear_form.py
+++ b/src/pydvl/influence/torch/operator/bilinear_form.py
@@ -1,4 +1,4 @@
-from typing import TYPE_CHECKING, Optional, cast
+from typing import TYPE_CHECKING, Optional
 
 import torch
 
@@ -13,6 +13,15 @@
 class OperatorBilinearForm(
     BilinearForm[torch.Tensor, TorchBatch, TorchPerSampleGradientProvider]
 ):
+    r"""
+    Base class for bilinear forms based on an instance of
+    [TorchOperator][pydvl.influence.torch.operator.base.TorchOperator].
+    This means it computes weighted inner products of the form:
+
+    $$ \langle \operatorname{Op}(x), y \rangle $$
+
+    """
+
     def __init__(
         self,
         operator: "TorchOperator",
@@ -22,6 +31,19 @@ def __init__(
     def inner_product(
         self, left: torch.Tensor, right: Optional[torch.Tensor]
     ) -> torch.Tensor:
+        r"""
+        Computes the weighted inner product of two vectors, i.e.
+
+        $$ \langle x, y \rangle_{B} = \langle \operatorname{Op}(x), y \rangle $$
+
+        Args:
+            left: The first tensor in the inner product computation.
+            right: The second tensor, optional; if not provided, the inner product will
+                use the `left` tensor for both arguments.
+
+        Returns:
+            A tensor representing the inner product.
+        """
         if right is None:
             right = left
         if left.shape[0] <= right.shape[0]:
diff --git a/src/pydvl/influence/torch/operator/gradient_provider.py b/src/pydvl/influence/torch/operator/gradient_provider.py
index 76ca84010..f3cf8e554 100644
--- a/src/pydvl/influence/torch/operator/gradient_provider.py
+++ b/src/pydvl/influence/torch/operator/gradient_provider.py
@@ -22,6 +22,29 @@
 class TorchPerSampleGradientProvider(
     PerSampleGradientProvider[TorchBatch, torch.Tensor], ABC
 ):
+    r"""
+    Abstract base class for calculating per-sample gradients of a function defined by
+    a [torch.nn.Module][torch.nn.Module] and a loss function.
+
+    This class must be subclassed with implementations for its abstract methods tailored
+    to specific gradient computation needs, e.g. using [torch.autograd][torch.autograd]
+    or stochastic finite differences.
+
+    Consider a function
+
+    $$ \ell: \mathbb{R}^{d_1} \times \mathbb{R}^{d_2} \times \mathbb{R}^{n} \times
+    \mathbb{R}^{n} \to \mathbb{R}, \quad \ell(\omega_1, \omega_2, x, y) =
+    \operatorname{loss}(f(\omega_1, \omega_2; x), y) $$
+
+    e.g. a two-layer neural network $f$ with a loss function. This object should then
+    compute the expressions:
+
+    $$ \nabla_{\omega_{i}}\ell(\omega_1, \omega_2, x, y),
+    \nabla_{\omega_{i}}\nabla_{x}\ell(\omega_1, \omega_2, x, y),
+    \nabla_{\omega}\ell(\omega_1, \omega_2, x, y) \cdot v$$
+
+    """
+
     def __init__(
         self,
         model: torch.nn.Module,
@@ -76,12 +99,52 @@ def _detach_dict(tensor_dict: Dict[str, torch.Tensor]) -> Dict[str, torch.Tensor]
         return {k: g.detach() if g.requires_grad else g for k, g in tensor_dict.items()}
 
     def per_sample_gradient_dict(self, batch: TorchBatch) -> Dict[str, torch.Tensor]:
+        r"""
+        Computes and returns a dictionary mapping gradient names to their respective
+        per-sample gradients. Given the example in the class docstring, this means
+
+        $$ \text{result}[\omega_i] = \nabla_{\omega_{i}}\ell(\omega_1, \omega_2,
+        \text{batch.x}, \text{batch.y}), $$
+
+        where the first dimension of the resulting tensors is always considered to be
+        the batch dimension, so the shapes of the resulting tensors are $(N, d_i)$,
+        where $N$ is the number of samples in the batch.
+
+        Args:
+            batch: The batch of data for which to compute gradients.
+
+        Returns:
+            A dictionary where keys are gradient identifiers and values are the
+            gradients computed per sample.
+        """
         gradient_dict = self._per_sample_gradient_dict(batch.to(self.device))
         return self._detach_dict(gradient_dict)
 
     def per_sample_mixed_gradient_dict(
         self, batch: TorchBatch
     ) -> Dict[str, torch.Tensor]:
+        r"""
+        Computes and returns a dictionary mapping gradient names to their respective
+        per-sample mixed gradients. In this context, mixed gradients refer to
+        gradients with respect to the model parameters that are additionally
+        differentiated with respect to the input batch.
+        Given the example in the class docstring, this means
+
+        $$ \text{result}[\omega_i] = \nabla_{\omega_{i}}\nabla_{x}\ell(\omega_1,
+        \omega_2, \text{batch.x}, \text{batch.y}), $$
+
+        where the first dimension of the resulting tensors is always considered to be
+        the batch dimension and the last to be the non-batch, input-related derivatives,
+        so the shapes of the resulting tensors are $(N, n, d_i)$,
+        where $N$ is the number of samples in the batch.
+
+        Args:
+            batch: The batch of data for which to compute mixed gradients.
+
+        Returns:
+            A dictionary where keys are gradient identifiers and values are the
+            mixed gradients computed per sample.
+        """
         gradient_dict = self._per_sample_mixed_gradient_dict(batch.to(self.device))
         return self._detach_dict(gradient_dict)
 
@@ -90,6 +153,26 @@ def matrix_jacobian_product(
         batch: TorchBatch,
         g: torch.Tensor,
     ) -> torch.Tensor:
+        r"""
+        Computes the matrix-Jacobian product for the provided batch and input tensor.
+        Given the example in the class docstring, this means
+
+        $$ (\nabla_{\omega_{1}}\ell(\omega_1, \omega_2,
+        \text{batch.x}, \text{batch.y}),
+        \nabla_{\omega_{2}}\ell(\omega_1, \omega_2,
+        \text{batch.x}, \text{batch.y})) \cdot g^T$$
+
+        where $g$ must be a tensor of shape $(K, d_1+d_2)$, so the resulting tensor
+        is of shape $(N, K)$.
+
+        Args:
+            batch: The batch of data for which to compute the Jacobian.
+            g: The tensor to be used in the matrix-Jacobian product
+                calculation.
+
+        Returns:
+            The resulting tensor from the matrix-Jacobian product computation.
+        """
         result = self._matrix_jacobian_product(batch.to(self.device), g.to(self.device))
         if result.requires_grad:
             result = result.detach()
@@ -108,6 +191,26 @@ def per_sample_flat_mixed_gradient(self, batch: TorchBatch) -> torch.Tensor:
 
 
 class TorchPerSampleAutoGrad(TorchPerSampleGradientProvider):
+    r"""
+    Computes per-sample gradients of a function defined by
+    a [torch.nn.Module][torch.nn.Module] and a loss function using
+    [torch.func][torch.func].
+
+    Consider a function
+
+    $$ \ell: \mathbb{R}^{d_1} \times \mathbb{R}^{d_2} \times \mathbb{R}^{n} \times
+    \mathbb{R}^{n} \to \mathbb{R}, \quad \ell(\omega_1, \omega_2, x, y) =
+    \operatorname{loss}(f(\omega_1, \omega_2; x), y) $$
+
+    e.g. a two-layer neural network $f$ with a loss function. This object should then
+    compute the expressions:
+
+    $$ \nabla_{\omega_{i}}\ell(\omega_1, \omega_2, x, y),
+    \nabla_{\omega_{i}}\nabla_{x}\ell(\omega_1, \omega_2, x, y),
+    \nabla_{\omega}\ell(\omega_1, \omega_2, x, y) \cdot v$$
+
+    """
+
     def __init__(
         self,
         model: torch.nn.Module,
diff --git a/src/pydvl/influence/types.py b/src/pydvl/influence/types.py
index e9adafc1a..b43ddcb4a 100644
--- a/src/pydvl/influence/types.py
+++ b/src/pydvl/influence/types.py
@@ -230,7 +230,6 @@ def inner_product(
         Returns:
             A tensor representing the inner product.
         """
-        pass
 
     def gradient_inner_product(
         self,
@@ -359,7 +358,7 @@ class OperatorGradientComposition(
 ):
     """
     Generic base class representing a composable block that integrates an operator and
-    a gradient provider to compute influences between batches of data.
+    a gradient provider to compute interactions between batches of data.
 
     This block is designed to be flexible, handling different computational modes via
     an abstract operator and gradient provider.
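
Editorial note, not part of the patch: the sketch below illustrates the kind of per-sample gradient computation that the TorchPerSampleAutoGrad docstring above describes, written directly against torch.func. The model, loss, and data are made-up placeholders, and this is not the library's own API.

    import torch
    from torch.func import functional_call, grad, vmap

    # Placeholder model f(omega; x) and loss; any nn.Module with a scalar loss works.
    model = torch.nn.Linear(5, 1)
    params = {k: v.detach() for k, v in model.named_parameters()}

    def loss(p, x, y):
        # ell(omega, x, y) = loss(f(omega; x), y) for a single sample
        return torch.nn.functional.mse_loss(functional_call(model, p, (x,)), y)

    x, y = torch.randn(8, 5), torch.randn(8, 1)  # a batch of N = 8 samples
    # vmap over the batch dimension yields per-sample gradients with a leading N dimension
    per_sample_grads = vmap(grad(loss), in_dims=(None, 0, 0))(params, x, y)
    print({k: v.shape for k, v in per_sample_grads.items()})  # weight: (8, 1, 5), bias: (8, 1)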