diff --git a/docs/assets/pydvl.bib b/docs/assets/pydvl.bib index 9bf523e2f..dacf52e53 100644 --- a/docs/assets/pydvl.bib +++ b/docs/assets/pydvl.bib @@ -25,6 +25,7 @@ @article{benmerzoug_re_2023 url = {https://zenodo.org/record/8173733}, urldate = {2023-08-27}, abstract = {We investigate the results of [1] in the field of data valuation. We repeat their experiments and conclude that the (Monte Carlo) Least Core is sensitive to important characteristics of the ML problem of interest, making it difficult to apply.}, + keywords = {notion} } @article{castro_polynomial_2009, @@ -39,10 +40,39 @@ @article{castro_polynomial_2009 pages = {1726--1730}, issn = {0305-0548}, doi = {10.1016/j.cor.2008.04.004}, - url = {http://www.sciencedirect.com/science/article/pii/S0305054808000804}, + url = {https://www.sciencedirect.com/science/article/pii/S0305054808000804}, urldate = {2020-11-21}, abstract = {In this paper we develop a polynomial method based on sampling theory that can be used to estimate the Shapley value (or any semivalue) for cooperative games. Besides analyzing the complexity problem, we examine some desirable statistical properties of the proposed approach and provide some computational results.}, - langid = {english} + langid = {english}, + keywords = {notion} +} + +@online{frangella_randomized_2021, + title = {Randomized {{Nyström Preconditioning}}}, + author = {Frangella, Zachary and Tropp, Joel A. and Udell, Madeleine}, + date = {2021-12-17}, + eprint = {2110.02820}, + eprinttype = {arxiv}, + eprintclass = {cs, math}, + doi = {10.48550/arXiv.2110.02820}, + url = {https://arxiv.org/abs/2110.02820}, + urldate = {2023-06-04}, + abstract = {This paper introduces the Nystr\textbackslash "om PCG algorithm for solving a symmetric positive-definite linear system. The algorithm applies the randomized Nystr\textbackslash "om method to form a low-rank approximation of the matrix, which leads to an efficient preconditioner that can be deployed with the conjugate gradient algorithm. Theoretical analysis shows that preconditioned system has constant condition number as soon as the rank of the approximation is comparable with the number of effective degrees of freedom in the matrix. The paper also develops adaptive methods that provably achieve similar performance without knowledge of the effective dimension. Numerical tests show that Nystr\textbackslash "om PCG can rapidly solve large linear systems that arise in data analysis problems, and it surpasses several competing methods from the literature.}, + pubstate = {preprint} +} + +@inproceedings{george_fast_2018, + title = {Fast {{Approximate Natural Gradient Descent}} in a {{Kronecker Factored Eigenbasis}}}, + booktitle = {Advances in {{Neural Information Processing Systems}}}, + author = {George, Thomas and Laurent, César and Bouthillier, Xavier and Ballas, Nicolas and Vincent, Pascal}, + date = {2018}, + volume = {31}, + eprint = {1806.03884}, + eprinttype = {arxiv}, + publisher = {Curran Associates, Inc.}, + url = {https://proceedings.neurips.cc/paper/2018/hash/48000647b315f6f00f913caa757a70b3-Abstract.html}, + urldate = {2024-01-12}, + abstract = {Optimization algorithms that leverage gradient covariance information, such as variants of natural gradient descent (Amari, 1998), offer the prospect of yielding more effective descent directions. For models with many parameters, the covari- ance matrix they are based on becomes gigantic, making them inapplicable in their original form. 
This has motivated research into both simple diagonal approxima- tions and more sophisticated factored approximations such as KFAC (Heskes, 2000; Martens \& Grosse, 2015; Grosse \& Martens, 2016). In the present work we draw inspiration from both to propose a novel approximation that is provably better than KFAC and amendable to cheap partial updates. It consists in tracking a diagonal variance, not in parameter coordinates, but in a Kronecker-factored eigenbasis, in which the diagonal approximation is likely to be more effective. Experiments show improvements over KFAC in optimization speed for several deep network architectures.} } @inproceedings{ghorbani_data_2019, @@ -54,9 +84,9 @@ @inproceedings{ghorbani_data_2019 eprint = {1904.02868}, eprinttype = {arxiv}, pages = {2242--2251}, - publisher = {{PMLR}}, + publisher = {PMLR}, issn = {2640-3498}, - url = {http://proceedings.mlr.press/v97/ghorbani19c.html}, + url = {https://proceedings.mlr.press/v97/ghorbani19c.html}, urldate = {2020-11-01}, abstract = {As data becomes the fuel driving technological and economic growth, a fundamental challenge is how to quantify the value of data in algorithmic predictions and decisions. For example, in healthcare and consumer markets, it has been suggested that individuals should be compensated for the data that they generate, but it is not clear what is an equitable valuation for individual data. In this work, we develop a principled framework to address data valuation in the context of supervised machine learning. Given a learning algorithm trained on n data points to produce a predictor, we propose data Shapley as a metric to quantify the value of each training datum to the predictor performance. Data Shapley uniquely satisfies several natural properties of equitable data valuation. We develop Monte Carlo and gradient-based methods to efficiently estimate data Shapley values in practical settings where complex learning algorithms, including neural networks, are trained on large datasets. 
In addition to being equitable, extensive experiments across biomedical, image and synthetic data demonstrate that data Shapley has several other benefits: 1) it is more powerful than the popular leave-one-out or leverage score in providing insight on what data is more valuable for a given learning task; 2) low Shapley value data effectively capture outliers and corruptions; 3) high Shapley value data inform what type of new data to acquire to improve the predictor.}, eventtitle = {International {{Conference}} on {{Machine Learning}} ({{ICML}} 2019)}, @@ -75,7 +105,7 @@ @article{hampel_influence_1974 eprint = {2285666}, eprinttype = {jstor}, pages = {383--393}, - publisher = {{[American Statistical Association, Taylor \& Francis, Ltd.]}}, + publisher = {[American Statistical Association, Taylor \& Francis, Ltd.]}, issn = {0162-1459}, doi = {10.2307/2285666}, url = {https://www.jstor.org/stable/2285666}, @@ -84,16 +114,34 @@ @article{hampel_influence_1974 } @inproceedings{hataya_nystrom_2023, - title = {Nystr{\"o}m {{Method}} for {{Accurate}} and {{Scalable Implicit Differentiation}}}, + title = {Nyström {{Method}} for {{Accurate}} and {{Scalable Implicit Differentiation}}}, booktitle = {Proceedings of {{The}} 26th {{International Conference}} on {{Artificial Intelligence}} and {{Statistics}}}, author = {Hataya, Ryuichiro and Yamada, Makoto}, - year = {2023}, - month = apr, + date = {2023-04-11}, pages = {4643--4654}, - publisher = {{PMLR}}, + publisher = {PMLR}, issn = {2640-3498}, + url = {https://proceedings.mlr.press/v206/hataya23a.html}, urldate = {2024-02-26}, - abstract = {The essential difficulty of gradient-based bilevel optimization using implicit differentiation is to estimate the inverse Hessian vector product with respect to neural network parameters. This paper proposes to tackle this problem by the Nystr{\"o}m method and the Woodbury matrix identity, exploiting the low-rankness of the Hessian. Compared to existing methods using iterative approximation, such as conjugate gradient and the Neumann series approximation, the proposed method avoids numerical instability and can be efficiently computed in matrix operations without iterations. As a result, the proposed method works stably in various tasks and is faster than iterative approximations. Throughout experiments including large-scale hyperparameter optimization and meta learning, we demonstrate that the Nystr{\"o}m method consistently achieves comparable or even superior performance to other approaches. The source code is available from https://github.com/moskomule/hypergrad.}, + abstract = {The essential difficulty of gradient-based bilevel optimization using implicit differentiation is to estimate the inverse Hessian vector product with respect to neural network parameters. This paper proposes to tackle this problem by the Nyström method and the Woodbury matrix identity, exploiting the low-rankness of the Hessian. Compared to existing methods using iterative approximation, such as conjugate gradient and the Neumann series approximation, the proposed method avoids numerical instability and can be efficiently computed in matrix operations without iterations. As a result, the proposed method works stably in various tasks and is faster than iterative approximations. Throughout experiments including large-scale hyperparameter optimization and meta learning, we demonstrate that the Nyström method consistently achieves comparable or even superior performance to other approaches. 
The source code is available from https://github.com/moskomule/hypergrad.}, + eventtitle = {International {{Conference}} on {{Artificial Intelligence}} and {{Statistics}}}, + langid = {english} +} + +@article{ji_breakdownfree_2017, + title = {A Breakdown-Free Block Conjugate Gradient Method}, + author = {Ji, Hao and Li, Yaohang}, + date = {2017-06}, + journaltitle = {BIT Numerical Mathematics}, + shortjournal = {Bit Numer Math}, + volume = {57}, + number = {2}, + pages = {379--403}, + issn = {0006-3835, 1572-9125}, + doi = {10.1007/s10543-016-0631-z}, + url = {https://link.springer.com/10.1007/s10543-016-0631-z}, + urldate = {2024-02-28}, + abstract = {In this paper, we analyze all possible situations of rank deficiency that cause breakdown in block conjugate gradient (BCG) solvers. A simple solution, breakdownfree block conjugate gradient (BFBCG), is designed to address the rank deficiency problem. The rationale of the BFBCG algorithm is to derive new forms of parameter matrices based on the potentially reduced search subspace to handle rank deficiency. Orthogonality properties and convergence of BFBCG in case of rank deficiency are justified accordingly with mathematical rigor. BFBCG yields faster convergence than restarting BCG when breakdown occurs. Numerical examples suffering from rank deficiency are provided to demonstrate the robustness of BFBCG.}, langid = {english} } @@ -103,9 +151,9 @@ @inproceedings{jia_efficient_2019 author = {Jia, Ruoxi and Dao, David and Wang, Boxin and Hubis, Frances Ann and Hynes, Nick and Gürel, Nezihe Merve and Li, Bo and Zhang, Ce and Song, Dawn and Spanos, Costas J.}, date = {2019-04-11}, pages = {1167--1176}, - publisher = {{PMLR}}, + publisher = {PMLR}, issn = {2640-3498}, - url = {http://proceedings.mlr.press/v89/jia19a.html}, + url = {https://proceedings.mlr.press/v89/jia19a.html}, urldate = {2021-02-12}, abstract = {“How much is my data worth?” is an increasingly common question posed by organizations and individuals alike. An answer to this question could allow, for instance, fairly distributing profits...}, eventtitle = {International {{Conference}} on {{Artificial Intelligence}} and {{Statistics}} ({{AISTATS}})}, @@ -127,7 +175,7 @@ @article{jia_efficient_2019a doi = {10.14778/3342263.3342637}, url = {https://doi.org/10.14778/3342263.3342637}, urldate = {2021-02-12}, - abstract = {Given a data set D containing millions of data points and a data consumer who is willing to pay for \$X to train a machine learning (ML) model over D, how should we distribute this \$X to each data point to reflect its "value"? In this paper, we define the "relative value of data" via the Shapley value, as it uniquely possesses properties with appealing real-world interpretations, such as fairness, rationality and decentralizability. For general, bounded utility functions, the Shapley value is known to be challenging to compute: to get Shapley values for all N data points, it requires O(2N) model evaluations for exact computation and O(N log N) for (ϵ, δ)-approximation. In this paper, we focus on one popular family of ML models relying on K-nearest neighbors (KNN). The most surprising result is that for unweighted KNN classifiers and regressors, the Shapley value of all N data points can be computed, exactly, in O(N log N) time - an exponential improvement on computational complexity! 
Moreover, for (ϵ, δ)-approximation, we are able to develop an algorithm based on Locality Sensitive Hashing (LSH) with only sublinear complexity O(Nh(ϵ, K) log N) when ϵ is not too small and K is not too large. We empirically evaluate our algorithms on up to 10 million data points and even our exact algorithm is up to three orders of magnitude faster than the baseline approximation algorithm. The LSH-based approximation algorithm can accelerate the value calculation process even further. We then extend our algorithm to other scenarios such as (1) weighed KNN classifiers, (2) different data points are clustered by different data curators, and (3) there are data analysts providing computation who also requires proper valuation. Some of these extensions, although also being improved exponentially, are less practical for exact computation (e.g., O(NK) complexity for weigthed KNN). We thus propose an Monte Carlo approximation algorithm, which is O(N(log N)2/(log K)2) times more efficient than the baseline approximation algorithm.}, + abstract = {Given a data set D containing millions of data points and a data consumer who is willing to pay \textbackslash\$X to train a machine learning (ML) model over D, how should we distribute this \textbackslash\$X to each data point to reflect its "value"? In this paper, we define the "relative value of data" via the Shapley value, as it uniquely possesses properties with appealing real-world interpretations, such as fairness, rationality and decentralizability. For general, bounded utility functions, the Shapley value is known to be challenging to compute: to get Shapley values for all N data points, it requires O(2N) model evaluations for exact computation and O(N log N) for (ϵ, δ)-approximation. In this paper, we focus on one popular family of ML models relying on K-nearest neighbors (KNN). The most surprising result is that for unweighted KNN classifiers and regressors, the Shapley value of all N data points can be computed, exactly, in O(N log N) time - an exponential improvement on computational complexity! Moreover, for (ϵ, δ)-approximation, we are able to develop an algorithm based on Locality Sensitive Hashing (LSH) with only sublinear complexity O(Nh(ϵ, K) log N) when ϵ is not too small and K is not too large. We empirically evaluate our algorithms on up to 10 million data points and even our exact algorithm is up to three orders of magnitude faster than the baseline approximation algorithm. The LSH-based approximation algorithm can accelerate the value calculation process even further. We then extend our algorithm to other scenarios such as (1) weighed KNN classifiers, (2) different data points are clustered by different data curators, and (3) there are data analysts providing computation who also requires proper valuation. Some of these extensions, although also being improved exponentially, are less practical for exact computation (e.g., O(NK) complexity for weigthed KNN). We thus propose an Monte Carlo approximation algorithm, which is O(N(log N)2/(log K)2) times more efficient than the baseline approximation algorithm.}, langid = {english}, keywords = {notion} } @@ -153,7 +201,7 @@ @inproceedings{koh_understanding_2017 eprint = {1703.04730}, eprinttype = {arxiv}, pages = {1885--1894}, - publisher = {{PMLR}}, + publisher = {PMLR}, url = {https://proceedings.mlr.press/v70/koh17a.html}, urldate = {2022-05-09}, abstract = {How can we explain the predictions of a black-box model? 
In this paper, we use influence functions — a classic technique from robust statistics — to trace a model’s prediction through the learning algorithm and back to its training data, thereby identifying training points most responsible for a given prediction. To scale up influence functions to modern machine learning settings, we develop a simple, efficient implementation that requires only oracle access to gradients and Hessian-vector products. We show that even on non-convex and non-differentiable models where the theory breaks down, approximations to influence functions can still provide valuable information. On linear models and convolutional neural networks, we demonstrate that influence functions are useful for multiple purposes: understanding model behavior, debugging models, detecting dataset errors, and even creating visually-indistinguishable training-set attacks.}, @@ -171,9 +219,9 @@ @inproceedings{kwon_beta_2022 volume = {151}, eprint = {2110.14049}, eprinttype = {arxiv}, - publisher = {{PMLR}}, - location = {{Valencia, Spain}}, - url = {http://arxiv.org/abs/2110.14049}, + publisher = {PMLR}, + location = {Valencia, Spain}, + url = {https://arxiv.org/abs/2110.14049}, urldate = {2022-04-06}, abstract = {Data Shapley has recently been proposed as a principled framework to quantify the contribution of individual datum in machine learning. It can effectively identify helpful or harmful data points for a learning algorithm. In this paper, we propose Beta Shapley, which is a substantial generalization of Data Shapley. Beta Shapley arises naturally by relaxing the efficiency axiom of the Shapley value, which is not critical for machine learning settings. Beta Shapley unifies several popular data valuation methods and includes data Shapley as a special case. Moreover, we prove that Beta Shapley has several desirable statistical properties and propose efficient algorithms to estimate it. We demonstrate that Beta Shapley outperforms state-of-the-art data valuation methods on several downstream ML tasks such as: 1) detecting mislabeled training data; 2) learning with subsamples; and 3) identifying points whose addition or removal have the largest positive or negative impact on the model.}, eventtitle = {{{AISTATS}} 2022}, @@ -181,6 +229,23 @@ @inproceedings{kwon_beta_2022 keywords = {notion} } +@inproceedings{kwon_dataoob_2023, + title = {Data-{{OOB}}: {{Out-of-bag Estimate}} as a {{Simple}} and {{Efficient Data Value}}}, + shorttitle = {Data-{{OOB}}}, + booktitle = {Proceedings of the 40th {{International Conference}} on {{Machine Learning}}}, + author = {Kwon, Yongchan and Zou, James}, + date = {2023-07-03}, + pages = {18135--18152}, + publisher = {PMLR}, + issn = {2640-3498}, + url = {https://proceedings.mlr.press/v202/kwon23e.html}, + urldate = {2023-09-06}, + abstract = {Data valuation is a powerful framework for providing statistical insights into which data are beneficial or detrimental to model training. Many Shapley-based data valuation methods have shown promising results in various downstream tasks, however, they are well known to be computationally challenging as it requires training a large number of models. As a result, it has been recognized as infeasible to apply to large datasets. To address this issue, we propose Data-OOB, a new data valuation method for a bagging model that utilizes the out-of-bag estimate. The proposed method is computationally efficient and can scale to millions of data by reusing trained weak learners. 
Specifically, Data-OOB takes less than 2.25 hours on a single CPU processor when there are \$10\^{}6\$ samples to evaluate and the input dimension is 100. Furthermore, Data-OOB has solid theoretical interpretations in that it identifies the same important data point as the infinitesimal jackknife influence function when two different points are compared. We conduct comprehensive experiments using 12 classification datasets, each with thousands of sample sizes. We demonstrate that the proposed method significantly outperforms existing state-of-the-art data valuation methods in identifying mislabeled data and finding a set of helpful (or harmful) data points, highlighting the potential for applying data values in real-world applications.}, + eventtitle = {International {{Conference}} on {{Machine Learning}}}, + langid = {english}, + keywords = {notion} +} + @inproceedings{kwon_efficient_2021, title = {Efficient {{Computation}} and {{Analysis}} of {{Distributional Shapley Values}}}, booktitle = {Proceedings of the 24th {{International Conference}} on {{Artificial Intelligence}} and {{Statistics}}}, @@ -189,15 +254,30 @@ @inproceedings{kwon_efficient_2021 eprint = {2007.01357}, eprinttype = {arxiv}, pages = {793--801}, - publisher = {{PMLR}}, + publisher = {PMLR}, issn = {2640-3498}, - url = {http://proceedings.mlr.press/v130/kwon21a.html}, + url = {https://proceedings.mlr.press/v130/kwon21a.html}, urldate = {2021-04-23}, abstract = {Distributional data Shapley value (DShapley) has recently been proposed as a principled framework to quantify the contribution of individual datum in machine learning. DShapley develops the founda...}, eventtitle = {International {{Conference}} on {{Artificial Intelligence}} and {{Statistics}}}, langid = {english} } +@inproceedings{martens_optimizing_2015, + title = {Optimizing {{Neural Networks}} with {{Kronecker-factored Approximate Curvature}}}, + booktitle = {Proceedings of the 32nd {{International Conference}} on {{Machine Learning}}}, + author = {Martens, James and Grosse, Roger}, + date = {2015-06-01}, + pages = {2408--2417}, + publisher = {PMLR}, + issn = {1938-7228}, + url = {https://proceedings.mlr.press/v37/martens15.html}, + urldate = {2022-11-26}, + abstract = {We propose an efficient method for approximating natural gradient descent in neural networks which we call Kronecker-factored Approximate Curvature (K-FAC). K-FAC is based on an efficiently invertible approximation of a neural network’s Fisher information matrix which is neither diagonal nor low-rank, and in some cases is completely non-sparse. It is derived by approximating various large blocks of the Fisher (corresponding to entire layers) as being the Kronecker product of two much smaller matrices. While only several times more expensive to compute than the plain stochastic gradient, the updates produced by K-FAC make much more progress optimizing the objective, which results in an algorithm that can be much faster than stochastic gradient descent with momentum in practice. And unlike some previously proposed approximate natural-gradient/Newton methods which use high-quality non-diagonal curvature matrices (such as Hessian-free optimization), K-FAC works very well in highly stochastic optimization regimes. 
This is because the cost of storing and inverting K-FAC’s approximation to the curvature matrix does not depend on the amount of data used to estimate it, which is a feature typically associated only with diagonal or low-rank approximations to the curvature matrix.}, + eventtitle = {International {{Conference}} on {{Machine Learning}}}, + langid = {english} +} + @article{mitchell_sampling_2022, title = {Sampling {{Permutations}} for {{Shapley Value Estimation}}}, author = {Mitchell, Rory and Cooper, Joshua and Frank, Eibe and Holmes, Geoffrey}, @@ -208,7 +288,7 @@ @article{mitchell_sampling_2022 number = {43}, pages = {1--46}, issn = {1533-7928}, - url = {http://jmlr.org/papers/v23/21-0439.html}, + url = {https://jmlr.org/papers/v23/21-0439.html}, urldate = {2022-10-23}, abstract = {Game-theoretic attribution techniques based on Shapley values are used to interpret black-box machine learning models, but their exact calculation is generally NP-hard, requiring approximation methods for non-trivial models. As the computation of Shapley values can be expressed as a summation over a set of permutations, a common approach is to sample a subset of these permutations for approximation. Unfortunately, standard Monte Carlo sampling methods can exhibit slow convergence, and more sophisticated quasi-Monte Carlo methods have not yet been applied to the space of permutations. To address this, we investigate new approaches based on two classes of approximation methods and compare them empirically. First, we demonstrate quadrature techniques in a RKHS containing functions of permutations, using the Mallows kernel in combination with kernel herding and sequential Bayesian quadrature. The RKHS perspective also leads to quasi-Monte Carlo type error bounds, with a tractable discrepancy measure defined on permutations. Second, we exploit connections between the hypersphere S d−2 Sd−2 and permutations to create practical algorithms for generating permutation samples with good properties. Experiments show the above techniques provide significant improvements for Shapley value estimates over existing methods, converging to a smaller RMSE in the same number of model evaluations.} } @@ -221,7 +301,7 @@ @inproceedings{okhrati_multilinear_2021 eprint = {2010.12082}, eprinttype = {arxiv}, pages = {7992--7999}, - publisher = {{IEEE}}, + publisher = {IEEE}, issn = {1051-4651}, doi = {10.1109/ICPR48806.2021.9412511}, url = {https://ieeexplore.ieee.org/abstract/document/9412511}, @@ -238,9 +318,9 @@ @inproceedings{schioppa_scaling_2021 eprint = {2112.03052}, eprinttype = {arxiv}, eprintclass = {cs}, - publisher = {{arXiv}}, + publisher = {arXiv}, doi = {10.48550/arXiv.2112.03052}, - url = {http://arxiv.org/abs/2112.03052}, + url = {https://arxiv.org/abs/2112.03052}, urldate = {2023-03-10}, abstract = {We address efficient calculation of influence functions for tracking predictions back to the training data. We propose and analyze a new approach to speeding up the inverse Hessian calculation based on Arnoldi iteration. With this improvement, we achieve, to the best of our knowledge, the first successful implementation of influence functions that scales to full-size (language and vision) Transformer models with several hundreds of millions of parameters. We evaluate our approach on image classification and sequence-to-sequence tasks with tens to a hundred of millions of training examples. 
Our code will be available at https://github.com/google-research/jax-influence.}, eventtitle = {{{AAAI-22}}}, @@ -253,7 +333,7 @@ @inproceedings{schoch_csshapley_2022 booktitle = {Proc. of the Thirty-Sixth {{Conference}} on {{Neural Information Processing Systems}} ({{NeurIPS}})}, author = {Schoch, Stephanie and Xu, Haifeng and Ji, Yangfeng}, date = {2022-10-31}, - location = {{New Orleans, Louisiana, USA}}, + location = {New Orleans, Louisiana, USA}, url = {https://openreview.net/forum?id=KTOcrOR5mQ9}, urldate = {2022-11-23}, abstract = {Data valuation, or the valuation of individual datum contributions, has seen growing interest in machine learning due to its demonstrable efficacy for tasks such as noisy label detection. In particular, due to the desirable axiomatic properties, several Shapley value approximations have been proposed. In these methods, the value function is usually defined as the predictive accuracy over the entire development set. However, this limits the ability to differentiate between training instances that are helpful or harmful to their own classes. Intuitively, instances that harm their own classes may be noisy or mislabeled and should receive a lower valuation than helpful instances. In this work, we propose CS-Shapley, a Shapley value with a new value function that discriminates between training instances’ in-class and out-of-class contributions. Our theoretical analysis shows the proposed value function is (essentially) the unique function that satisfies two desirable properties for evaluating data values in classification. Further, our experiments on two benchmark evaluation tasks (data removal and noisy label detection) and four classifiers demonstrate the effectiveness of CS-Shapley over existing methods. Lastly, we evaluate the “transferability” of data values estimated from one classifier to others, and our results suggest Shapley-based data valuation is transferable for application across different models.}, @@ -262,31 +342,15 @@ @inproceedings{schoch_csshapley_2022 keywords = {notion} } -@online{wang_data_2022, - title = {Data {{Banzhaf}}: {{A Robust Data Valuation Framework}} for {{Machine Learning}}}, - shorttitle = {Data {{Banzhaf}}}, - author = {Wang, Jiachen T. and Jia, Ruoxi}, - date = {2022-10-22}, - eprint = {2205.15466}, - eprinttype = {arxiv}, - eprintclass = {cs, stat}, - doi = {10.48550/arXiv.2205.15466}, - url = {http://arxiv.org/abs/2205.15466}, - urldate = {2022-10-28}, - abstract = {This paper studies the robustness of data valuation to noisy model performance scores. Particularly, we find that the inherent randomness of the widely used stochastic gradient descent can cause existing data value notions (e.g., the Shapley value and the Leave-one-out error) to produce inconsistent data value rankings across different runs. To address this challenge, we first pose a formal framework within which one can measure the robustness of a data value notion. We show that the Banzhaf value, a value notion originated from cooperative game theory literature, achieves the maximal robustness among all semivalues -- a class of value notions that satisfy crucial properties entailed by ML applications. We propose an algorithm to efficiently estimate the Banzhaf value based on the Maximum Sample Reuse (MSR) principle. We derive the lower bound sample complexity for Banzhaf value estimation, and we show that our MSR algorithm's sample complexity is close to the lower bound. 
Our evaluation demonstrates that the Banzhaf value outperforms the existing semivalue-based data value notions on several downstream ML tasks such as learning with weighted samples and noisy label detection. Overall, our study suggests that when the underlying ML algorithm is stochastic, the Banzhaf value is a promising alternative to the semivalue-based data value schemes given its computational advantage and ability to robustly differentiate data quality.}, - pubstate = {preprint}, - keywords = {notion} -} - @inproceedings{wang_improving_2022, title = {Improving {{Cooperative Game Theory-based Data Valuation}} via {{Data Utility Learning}}}, author = {Wang, Tianhao and Yang, Yu and Jia, Ruoxi}, date = {2022-04-07}, eprint = {2107.06336v2}, eprinttype = {arxiv}, - publisher = {{arXiv}}, + publisher = {arXiv}, doi = {10.48550/arXiv.2107.06336}, - url = {http://arxiv.org/abs/2107.06336v2}, + url = {https://arxiv.org/abs/2107.06336v2}, urldate = {2022-05-19}, abstract = {The Shapley value (SV) and Least core (LC) are classic methods in cooperative game theory for cost/profit sharing problems. Both methods have recently been proposed as a principled solution for data valuation tasks, i.e., quantifying the contribution of individual datum in machine learning. However, both SV and LC suffer computational challenges due to the need for retraining models on combinatorially many data subsets. In this work, we propose to boost the efficiency in computing Shapley value or Least core by learning to estimate the performance of a learning algorithm on unseen data combinations. Theoretically, we derive bounds relating the error in the predicted learning performance to the approximation error in SV and LC. Empirically, we show that the proposed method can significantly improve the accuracy of SV and LC estimation.}, eventtitle = {International {{Conference}} on {{Learning Representations}} ({{ICLR}} 2022). {{Workshop}} on {{Socially Responsible Machine Learning}}}, @@ -294,6 +358,20 @@ @inproceedings{wang_improving_2022 keywords = {notion} } +@online{watson_accelerated_2023, + title = {Accelerated {{Shapley Value Approximation}} for {{Data Evaluation}}}, + author = {Watson, Lauren and Kujawa, Zeno and Andreeva, Rayna and Yang, Hao-Tsung and Elahi, Tariq and Sarkar, Rik}, + date = {2023-11-09}, + eprint = {2311.05346}, + eprinttype = {arxiv}, + eprintclass = {cs}, + doi = {10.48550/arXiv.2311.05346}, + url = {https://arxiv.org/abs/2311.05346}, + urldate = {2023-12-07}, + abstract = {Data valuation has found various applications in machine learning, such as data filtering, efficient learning and incentives for data sharing. The most popular current approach to data valuation is the Shapley value. While popular for its various applications, Shapley value is computationally expensive even to approximate, as it requires repeated iterations of training models on different subsets of data. In this paper we show that the Shapley value of data points can be approximated more efficiently by leveraging the structural properties of machine learning problems. We derive convergence guarantees on the accuracy of the approximate Shapley value for different learning settings including Stochastic Gradient Descent with convex and non-convex loss functions. Our analysis suggests that in fact models trained on small subsets are more important in the context of data valuation. Based on this idea, we describe \$\textbackslash delta\$-Shapley -- a strategy of only using small subsets for the approximation. 
Experiments show that this approach preserves approximate value and rank of data, while achieving speedup of up to 9.9x. In pre-trained networks the approach is found to bring more efficiency in terms of accurate evaluation using small subsets.}, + pubstate = {preprint} +} + @inproceedings{wu_davinz_2022, title = {{{DAVINZ}}: {{Data Valuation}} Using {{Deep Neural Networks}} at {{Initialization}}}, shorttitle = {{{DAVINZ}}}, @@ -301,7 +379,7 @@ @inproceedings{wu_davinz_2022 author = {Wu, Zhaoxuan and Shu, Yao and Low, Bryan Kian Hsiang}, date = {2022-06-28}, pages = {24150--24176}, - publisher = {{PMLR}}, + publisher = {PMLR}, url = {https://proceedings.mlr.press/v162/wu22j.html}, urldate = {2022-10-29}, abstract = {Recent years have witnessed a surge of interest in developing trustworthy methods to evaluate the value of data in many real-world applications (e.g., collaborative machine learning, data marketplaces). Existing data valuation methods typically valuate data using the generalization performance of converged machine learning models after their long-term model training, hence making data valuation on large complex deep neural networks (DNNs) unaffordable. To this end, we theoretically derive a domain-aware generalization bound to estimate the generalization performance of DNNs without model training. We then exploit this theoretically derived generalization bound to develop a novel training-free data valuation method named data valuation at initialization (DAVINZ) on DNNs, which consistently achieves remarkable effectiveness and efficiency in practice. Moreover, our training-free DAVINZ, surprisingly, can even theoretically and empirically enjoy the desirable properties that training-based data valuation methods usually attain, thus making it more trustworthy in practice.}, @@ -317,8 +395,8 @@ @inproceedings{yan_if_2021 date = {2021-05-18}, volume = {6}, pages = {5751--5759}, - publisher = {{Association for the Advancement of Artificial Intelligence}}, - location = {{Virtual conference}}, + publisher = {Association for the Advancement of Artificial Intelligence}, + location = {Virtual conference}, doi = {10.1609/aaai.v35i6.16721}, url = {https://ojs.aaai.org/index.php/AAAI/article/view/16721}, urldate = {2021-04-23}, @@ -327,51 +405,3 @@ @inproceedings{yan_if_2021 langid = {english}, keywords = {notion} } - -@InProceedings{kwon_data_2023, - title = {Data-{OOB}: Out-of-bag Estimate as a Simple and Efficient Data Value}, - author = {Kwon, Yongchan and Zou, James}, - booktitle = {Proceedings of the 40th International Conference on Machine Learning}, - pages = {18135--18152}, - year = {2023}, - editor = {Krause, Andreas and Brunskill, Emma and Cho, Kyunghyun and Engelhardt, Barbara and Sabato, Sivan and Scarlett, Jonathan}, - volume = {202}, - series = {Proceedings of Machine Learning Research}, - month = {23--29 Jul}, - publisher = {PMLR}, - pdf = {https://proceedings.mlr.press/v202/kwon23e/kwon23e.pdf}, - url = {https://proceedings.mlr.press/v202/kwon23e.html}, - abstract = {Data valuation is a powerful framework for providing statistical insights into which data are beneficial or detrimental to model training. Many Shapley-based data valuation methods have shown promising results in various downstream tasks, however, they are well known to be computationally challenging as it requires training a large number of models. As a result, it has been recognized as infeasible to apply to large datasets. 
To address this issue, we propose Data-OOB, a new data valuation method for a bagging model that utilizes the out-of-bag estimate. The proposed method is computationally efficient and can scale to millions of data by reusing trained weak learners. Specifically, Data-OOB takes less than $2.25$ hours on a single CPU processor when there are $10^6$ samples to evaluate and the input dimension is $100$. Furthermore, Data-OOB has solid theoretical interpretations in that it identifies the same important data point as the infinitesimal jackknife influence function when two different points are compared. We conduct comprehensive experiments using 12 classification datasets, each with thousands of sample sizes. We demonstrate that the proposed method significantly outperforms existing state-of-the-art data valuation methods in identifying mislabeled data and finding a set of helpful (or harmful) data points, highlighting the potential for applying data values in real-world applications.} -} - -@article{george2018fast, - title={Fast approximate natural gradient descent in a kronecker factored eigenbasis}, - author={George, Thomas and Laurent, C{\'e}sar and Bouthillier, Xavier and Ballas, Nicolas and Vincent, Pascal}, - journal={Advances in Neural Information Processing Systems}, - volume={31}, - year={2018} -} - -@inproceedings{martens2015optimizing, - title={Optimizing neural networks with kronecker-factored approximate curvature}, - author={Martens, James and Grosse, Roger}, - booktitle={International conference on machine learning}, - pages={2408--2417}, - year={2015}, - organization={PMLR} -} - -@misc{frangella_randomized_2021, - title = {Randomized {{Nystr}}{\textbackslash}"om {{Preconditioning}}}, - author = {Frangella, Zachary and Tropp, Joel A. and Udell, Madeleine}, - year = {2021}, - month = dec, - number = {arXiv:2110.02820}, - eprint = {2110.02820}, - primaryclass = {cs, math}, - publisher = {{arXiv}}, - doi = {10.48550/arXiv.2110.02820}, - urldate = {2023-06-04}, - abstract = {This paper introduces the Nystr{\textbackslash}"om PCG algorithm for solving a symmetric positive-definite linear system. The algorithm applies the randomized Nystr{\textbackslash}"om method to form a low-rank approximation of the matrix, which leads to an efficient preconditioner that can be deployed with the conjugate gradient algorithm. Theoretical analysis shows that preconditioned system has constant condition number as soon as the rank of the approximation is comparable with the number of effective degrees of freedom in the matrix. The paper also develops adaptive methods that provably achieve similar performance without knowledge of the effective dimension. Numerical tests show that Nystr{\textbackslash}"om PCG can rapidly solve large linear systems that arise in data analysis problems, and it surpasses several competing methods from the literature.}, - archiveprefix = {arxiv}, -} \ No newline at end of file diff --git a/docs/influence/influence_function_model.md b/docs/influence/influence_function_model.md index b4c95e0ba..951ea8420 100644 --- a/docs/influence/influence_function_model.md +++ b/docs/influence/influence_function_model.md @@ -99,7 +99,7 @@ if_model = ArnoldiInfluence( ### Eigenvalue Corrected K-FAC K-FAC, short for Kronecker-Factored Approximate Curvature, is a method that approximates the Fisher information matrix [FIM](https://en.wikipedia.org/wiki/Fisher_information) of a model. 
It is possible to show that for classification models with appropriate loss functions the FIM is equal to the Hessian of the model’s loss over the dataset. In this restricted but nonetheless important context K-FAC offers an efficient way to approximate the Hessian and hence the influence scores. -For more info and details refer to the original paper [@martens2015optimizing]. +For more details, refer to the original paper [@martens_optimizing_2015]. The K-FAC method is implemented in the class [EkfacInfluence](pydvl/influence/torch/influence_function_model.py). The following code snippet shows how to use the K-FAC method to calculate the influence function of a model. Note that, in contrast to the other methods for influence function calculation, K-FAC does not require the loss function as an input. This is because the current implementation is only applicable to classification models with a cross entropy loss function. @@ -112,7 +112,7 @@ if_model = EkfacInfluence( ``` Upon initialization, the K-FAC method will parse the model and extract which layers require grad and which do not. Then it will only calculate the influence scores for the layers that require grad. The current implementation of the K-FAC method is only available for linear layers, and therefore if the model contains non-linear layers that require gradient the K-FAC method will raise a NotImplementedLayerRepresentationException. -A further improvement of the K-FAC method is the Eigenvalue Corrected K-FAC (EKFAC) method [@george2018fast], which allows to further re-fit the eigenvalues of the Hessian, thus providing a more accurate approximation. On top of the K-FAC method, the EKFAC method is implemented by setting `update_diagonal=True` when initialising [EkfacInfluence](pydvl/influence/torch/influence_function_model.py). The following code snippet shows how to use the EKFAC method to calculate the influence function of a model. +A further improvement of the K-FAC method is the Eigenvalue Corrected K-FAC (EKFAC) method [@george_fast_2018], which additionally re-fits the eigenvalues of the Hessian, thus providing a more accurate approximation. On top of the K-FAC method, the EKFAC method is implemented by setting `update_diagonal=True` when initialising [EkfacInfluence](pydvl/influence/torch/influence_function_model.py). The following code snippet shows how to use the EKFAC method to calculate the influence function of a model. ```python from pydvl.influence.torch import EkfacInfluence