From 6b9c5bbc400d6d5b6b7652988736b671de41452a Mon Sep 17 00:00:00 2001 From: Kristof Schroeder Date: Mon, 11 Mar 2024 12:29:43 +0100 Subject: [PATCH 1/3] Fix pydvl.bib file by export from zotero, fix references in documentation --- docs/assets/pydvl.bib | 327 +++++++++++---------- docs/influence/influence_function_model.md | 4 +- 2 files changed, 171 insertions(+), 160 deletions(-) diff --git a/docs/assets/pydvl.bib b/docs/assets/pydvl.bib index 9bf523e2f..c0b6df378 100644 --- a/docs/assets/pydvl.bib +++ b/docs/assets/pydvl.bib @@ -1,48 +1,75 @@ @article{agarwal_secondorder_2017, title = {Second-{{Order Stochastic Optimization}} for {{Machine Learning}} in {{Linear Time}}}, author = {Agarwal, Naman and Bullins, Brian and Hazan, Elad}, - date = {2017}, - journaltitle = {Journal of Machine Learning Research}, - shortjournal = {JMLR}, + year = {2017}, + journal = {Journal of Machine Learning Research}, volume = {18}, eprint = {1602.03943}, - eprinttype = {arxiv}, pages = {1--40}, - url = {https://www.jmlr.org/papers/v18/16-491.html}, abstract = {First-order stochastic methods are the state-of-the-art in large-scale machine learning optimization owing to efficient per-iteration complexity. Second-order methods, while able to provide faster convergence, have been much less explored due to the high cost of computing the second-order information. In this paper we develop second-order stochastic methods for optimization problems in machine learning that match the per-iteration cost of gradient based methods, and in certain settings improve upon the overall running time over popular first-order methods. Furthermore, our algorithm has the desirable property of being implementable in time linear in the sparsity of the input data.}, + archiveprefix = {arxiv}, langid = {english} } @article{benmerzoug_re_2023, title = {[{{Re}}] {{If}} You like {{Shapley}}, Then You'll Love the Core}, author = {Benmerzoug, Anes and Delgado, Miguel de Benito}, - date = {2023-07-31}, - journaltitle = {ReScience C}, + year = {2023}, + month = jul, + journal = {ReScience C}, volume = {9}, number = {2}, pages = {\#32}, doi = {10.5281/zenodo.8173733}, - url = {https://zenodo.org/record/8173733}, urldate = {2023-08-27}, abstract = {We investigate the results of [1] in the field of data valuation. We repeat their experiments and conclude that the (Monte Carlo) Least Core is sensitive to important characteristics of the ML problem of interest, making it difficult to apply.}, + keywords = {notion} } @article{castro_polynomial_2009, title = {Polynomial Calculation of the {{Shapley}} Value Based on Sampling}, - author = {Castro, Javier and Gómez, Daniel and Tejada, Juan}, - date = {2009-05-01}, - journaltitle = {Computers \& Operations Research}, - shortjournal = {Computers \& Operations Research}, + author = {Castro, Javier and G{\'o}mez, Daniel and Tejada, Juan}, + year = {2009}, + month = may, + journal = {Computers \& Operations Research}, series = {Selected Papers Presented at the {{Tenth International Symposium}} on {{Locational Decisions}} ({{ISOLDE X}})}, volume = {36}, number = {5}, pages = {1726--1730}, issn = {0305-0548}, doi = {10.1016/j.cor.2008.04.004}, - url = {http://www.sciencedirect.com/science/article/pii/S0305054808000804}, urldate = {2020-11-21}, abstract = {In this paper we develop a polynomial method based on sampling theory that can be used to estimate the Shapley value (or any semivalue) for cooperative games. 
Besides analyzing the complexity problem, we examine some desirable statistical properties of the proposed approach and provide some computational results.}, - langid = {english} + langid = {english}, + keywords = {notion} +} + +@misc{frangella_randomized_2021, + title = {Randomized {{Nystr{\"o}m Preconditioning}}}, + author = {Frangella, Zachary and Tropp, Joel A. and Udell, Madeleine}, + year = {2021}, + month = dec, + number = {arXiv:2110.02820}, + eprint = {2110.02820}, + primaryclass = {cs, math}, + publisher = {arXiv}, + doi = {10.48550/arXiv.2110.02820}, + urldate = {2023-06-04}, + abstract = {This paper introduces the Nystr{\textbackslash}"om PCG algorithm for solving a symmetric positive-definite linear system. The algorithm applies the randomized Nystr{\textbackslash}"om method to form a low-rank approximation of the matrix, which leads to an efficient preconditioner that can be deployed with the conjugate gradient algorithm. Theoretical analysis shows that preconditioned system has constant condition number as soon as the rank of the approximation is comparable with the number of effective degrees of freedom in the matrix. The paper also develops adaptive methods that provably achieve similar performance without knowledge of the effective dimension. Numerical tests show that Nystr{\textbackslash}"om PCG can rapidly solve large linear systems that arise in data analysis problems, and it surpasses several competing methods from the literature.}, + archiveprefix = {arxiv} +} + +@inproceedings{george_fast_2018, + title = {Fast {{Approximate Natural Gradient Descent}} in a {{Kronecker Factored Eigenbasis}}}, + booktitle = {Advances in {{Neural Information Processing Systems}}}, + author = {George, Thomas and Laurent, C{\'e}sar and Bouthillier, Xavier and Ballas, Nicolas and Vincent, Pascal}, + year = {2018}, + volume = {31}, + eprint = {1806.03884}, + publisher = {Curran Associates, Inc.}, + urldate = {2024-01-12}, + abstract = {Optimization algorithms that leverage gradient covariance information, such as variants of natural gradient descent (Amari, 1998), offer the prospect of yielding more effective descent directions. For models with many parameters, the covari- ance matrix they are based on becomes gigantic, making them inapplicable in their original form. This has motivated research into both simple diagonal approxima- tions and more sophisticated factored approximations such as KFAC (Heskes, 2000; Martens \& Grosse, 2015; Grosse \& Martens, 2016). In the present work we draw inspiration from both to propose a novel approximation that is provably better than KFAC and amendable to cheap partial updates. It consists in tracking a diagonal variance, not in parameter coordinates, but in a Kronecker-factored eigenbasis, in which the diagonal approximation is likely to be more effective. 
Experiments show improvements over KFAC in optimization speed for several deep network architectures.}, + archiveprefix = {arxiv} } @inproceedings{ghorbani_data_2019, @@ -50,16 +77,15 @@ @inproceedings{ghorbani_data_2019 shorttitle = {Data {{Shapley}}}, booktitle = {Proceedings of the 36th {{International Conference}} on {{Machine Learning}}, {{PMLR}}}, author = {Ghorbani, Amirata and Zou, James}, - date = {2019-05-24}, + year = {2019}, + month = may, eprint = {1904.02868}, - eprinttype = {arxiv}, pages = {2242--2251}, - publisher = {{PMLR}}, + publisher = {PMLR}, issn = {2640-3498}, - url = {http://proceedings.mlr.press/v97/ghorbani19c.html}, urldate = {2020-11-01}, abstract = {As data becomes the fuel driving technological and economic growth, a fundamental challenge is how to quantify the value of data in algorithmic predictions and decisions. For example, in healthcare and consumer markets, it has been suggested that individuals should be compensated for the data that they generate, but it is not clear what is an equitable valuation for individual data. In this work, we develop a principled framework to address data valuation in the context of supervised machine learning. Given a learning algorithm trained on n data points to produce a predictor, we propose data Shapley as a metric to quantify the value of each training datum to the predictor performance. Data Shapley uniquely satisfies several natural properties of equitable data valuation. We develop Monte Carlo and gradient-based methods to efficiently estimate data Shapley values in practical settings where complex learning algorithms, including neural networks, are trained on large datasets. In addition to being equitable, extensive experiments across biomedical, image and synthetic data demonstrate that data Shapley has several other benefits: 1) it is more powerful than the popular leave-one-out or leverage score in providing insight on what data is more valuable for a given learning task; 2) low Shapley value data effectively capture outliers and corruptions; 3) high Shapley value data inform what type of new data to acquire to improve the predictor.}, - eventtitle = {International {{Conference}} on {{Machine Learning}} ({{ICML}} 2019)}, + archiveprefix = {arxiv}, langid = {english}, keywords = {notion} } @@ -67,18 +93,16 @@ @inproceedings{ghorbani_data_2019 @article{hampel_influence_1974, title = {The {{Influence Curve}} and {{Its Role}} in {{Robust Estimation}}}, author = {Hampel, Frank R.}, - date = {1974}, - journaltitle = {Journal of the American Statistical Association}, - shortjournal = {J. Am. Stat. Assoc.}, + year = {1974}, + journal = {Journal of the American Statistical Association}, volume = {69}, number = {346}, eprint = {2285666}, eprinttype = {jstor}, pages = {383--393}, - publisher = {{[American Statistical Association, Taylor \& Francis, Ltd.]}}, + publisher = {[American Statistical Association, Taylor \& Francis, Ltd.]}, issn = {0162-1459}, doi = {10.2307/2285666}, - url = {https://www.jstor.org/stable/2285666}, urldate = {2022-05-09}, abstract = {This paper treats essentially the first derivative of an estimator viewed as functional and the ways in which it can be used to study local robustness properties. A theory of robust estimation "near" strict parametric models is briefly sketched and applied to some classical situations. Relations between von Mises functionals, the jackknife and U-statistics are indicated. 
A number of classical and new estimators are discussed, including trimmed and Winsorized means, Huber-estimators, and more generally maximum likelihood and M-estimators. Finally, a table with some numerical robustness properties is given.} } @@ -90,25 +114,40 @@ @inproceedings{hataya_nystrom_2023 year = {2023}, month = apr, pages = {4643--4654}, - publisher = {{PMLR}}, + publisher = {PMLR}, issn = {2640-3498}, urldate = {2024-02-26}, abstract = {The essential difficulty of gradient-based bilevel optimization using implicit differentiation is to estimate the inverse Hessian vector product with respect to neural network parameters. This paper proposes to tackle this problem by the Nystr{\"o}m method and the Woodbury matrix identity, exploiting the low-rankness of the Hessian. Compared to existing methods using iterative approximation, such as conjugate gradient and the Neumann series approximation, the proposed method avoids numerical instability and can be efficiently computed in matrix operations without iterations. As a result, the proposed method works stably in various tasks and is faster than iterative approximations. Throughout experiments including large-scale hyperparameter optimization and meta learning, we demonstrate that the Nystr{\"o}m method consistently achieves comparable or even superior performance to other approaches. The source code is available from https://github.com/moskomule/hypergrad.}, langid = {english} } +@article{ji_breakdownfree_2017, + title = {A Breakdown-Free Block Conjugate Gradient Method}, + author = {Ji, Hao and Li, Yaohang}, + year = {2017}, + month = jun, + journal = {BIT Numerical Mathematics}, + volume = {57}, + number = {2}, + pages = {379--403}, + issn = {0006-3835, 1572-9125}, + doi = {10.1007/s10543-016-0631-z}, + urldate = {2024-02-28}, + abstract = {In this paper, we analyze all possible situations of rank deficiency that cause breakdown in block conjugate gradient (BCG) solvers. A simple solution, breakdownfree block conjugate gradient (BFBCG), is designed to address the rank deficiency problem. The rationale of the BFBCG algorithm is to derive new forms of parameter matrices based on the potentially reduced search subspace to handle rank deficiency. Orthogonality properties and convergence of BFBCG in case of rank deficiency are justified accordingly with mathematical rigor. BFBCG yields faster convergence than restarting BCG when breakdown occurs. Numerical examples suffering from rank deficiency are provided to demonstrate the robustness of BFBCG.}, + langid = {english} +} + @inproceedings{jia_efficient_2019, title = {Towards {{Efficient Data Valuation Based}} on the {{Shapley Value}}}, booktitle = {Proceedings of the 22nd {{International Conference}} on {{Artificial Intelligence}} and {{Statistics}}}, - author = {Jia, Ruoxi and Dao, David and Wang, Boxin and Hubis, Frances Ann and Hynes, Nick and Gürel, Nezihe Merve and Li, Bo and Zhang, Ce and Song, Dawn and Spanos, Costas J.}, - date = {2019-04-11}, + author = {Jia, Ruoxi and Dao, David and Wang, Boxin and Hubis, Frances Ann and Hynes, Nick and G{\"u}rel, Nezihe Merve and Li, Bo and Zhang, Ce and Song, Dawn and Spanos, Costas J.}, + year = {2019}, + month = apr, pages = {1167--1176}, - publisher = {{PMLR}}, + publisher = {PMLR}, issn = {2640-3498}, - url = {http://proceedings.mlr.press/v89/jia19a.html}, urldate = {2021-02-12}, - abstract = {“How much is my data worth?” is an increasingly common question posed by organizations and individuals alike. 
An answer to this question could allow, for instance, fairly distributing profits...}, - eventtitle = {International {{Conference}} on {{Artificial Intelligence}} and {{Statistics}} ({{AISTATS}})}, + abstract = {``How much is my data worth?'' is an increasingly common question posed by organizations and individuals alike. An answer to this question could allow, for instance, fairly distributing profits...}, langid = {english}, keywords = {notion} } @@ -117,17 +156,16 @@ @article{jia_efficient_2019a title = {Efficient Task-Specific Data Valuation for Nearest Neighbor Algorithms}, shorttitle = {{{VLDB}} 2019}, author = {Jia, Ruoxi and Dao, David and Wang, Boxin and Hubis, Frances Ann and Gurel, Nezihe Merve and Li, Bo and Zhang, Ce and Spanos, Costas and Song, Dawn}, - date = {2019-07-01}, - journaltitle = {Proceedings of the VLDB Endowment}, - shortjournal = {Proc. VLDB Endow.}, + year = {2019}, + month = jul, + journal = {Proceedings of the VLDB Endowment}, volume = {12}, number = {11}, pages = {1610--1623}, issn = {2150-8097}, doi = {10.14778/3342263.3342637}, - url = {https://doi.org/10.14778/3342263.3342637}, urldate = {2021-02-12}, - abstract = {Given a data set D containing millions of data points and a data consumer who is willing to pay for \$X to train a machine learning (ML) model over D, how should we distribute this \$X to each data point to reflect its "value"? In this paper, we define the "relative value of data" via the Shapley value, as it uniquely possesses properties with appealing real-world interpretations, such as fairness, rationality and decentralizability. For general, bounded utility functions, the Shapley value is known to be challenging to compute: to get Shapley values for all N data points, it requires O(2N) model evaluations for exact computation and O(N log N) for (ϵ, δ)-approximation. In this paper, we focus on one popular family of ML models relying on K-nearest neighbors (KNN). The most surprising result is that for unweighted KNN classifiers and regressors, the Shapley value of all N data points can be computed, exactly, in O(N log N) time - an exponential improvement on computational complexity! Moreover, for (ϵ, δ)-approximation, we are able to develop an algorithm based on Locality Sensitive Hashing (LSH) with only sublinear complexity O(Nh(ϵ, K) log N) when ϵ is not too small and K is not too large. We empirically evaluate our algorithms on up to 10 million data points and even our exact algorithm is up to three orders of magnitude faster than the baseline approximation algorithm. The LSH-based approximation algorithm can accelerate the value calculation process even further. We then extend our algorithm to other scenarios such as (1) weighed KNN classifiers, (2) different data points are clustered by different data curators, and (3) there are data analysts providing computation who also requires proper valuation. Some of these extensions, although also being improved exponentially, are less practical for exact computation (e.g., O(NK) complexity for weigthed KNN). We thus propose an Monte Carlo approximation algorithm, which is O(N(log N)2/(log K)2) times more efficient than the baseline approximation algorithm.}, + abstract = {Given a data set D containing millions of data points and a data consumer who is willing to pay {\textbackslash}\$X to train a machine learning (ML) model over D, how should we distribute this {\textbackslash}\$X to each data point to reflect its "value"? 
In this paper, we define the "relative value of data" via the Shapley value, as it uniquely possesses properties with appealing real-world interpretations, such as fairness, rationality and decentralizability. For general, bounded utility functions, the Shapley value is known to be challenging to compute: to get Shapley values for all N data points, it requires O(2N) model evaluations for exact computation and O(N log N) for ({$\epsilon$}, {$\delta$})-approximation. In this paper, we focus on one popular family of ML models relying on K-nearest neighbors (KNN). The most surprising result is that for unweighted KNN classifiers and regressors, the Shapley value of all N data points can be computed, exactly, in O(N log N) time - an exponential improvement on computational complexity! Moreover, for ({$\epsilon$}, {$\delta$})-approximation, we are able to develop an algorithm based on Locality Sensitive Hashing (LSH) with only sublinear complexity O(Nh({$\epsilon$}, K) log N) when {$\epsilon$} is not too small and K is not too large. We empirically evaluate our algorithms on up to 10 million data points and even our exact algorithm is up to three orders of magnitude faster than the baseline approximation algorithm. The LSH-based approximation algorithm can accelerate the value calculation process even further. We then extend our algorithm to other scenarios such as (1) weighed KNN classifiers, (2) different data points are clustered by different data curators, and (3) there are data analysts providing computation who also requires proper valuation. Some of these extensions, although also being improved exponentially, are less practical for exact computation (e.g., O(NK) complexity for weigthed KNN). We thus propose an Monte Carlo approximation algorithm, which is O(N(log N)2/(log K)2) times more efficient than the baseline approximation algorithm.}, langid = {english}, keywords = {notion} } @@ -135,12 +173,12 @@ @article{jia_efficient_2019a @inproceedings{just_lava_2023, title = {{{LAVA}}: {{Data Valuation}} without {{Pre-Specified Learning Algorithms}}}, shorttitle = {{{LAVA}}}, + booktitle = {The {{Eleventh International Conference}} on {{Learning Representations}} ({{ICLR}} 2023)}, author = {Just, Hoang Anh and Kang, Feiyang and Wang, Tianhao and Zeng, Yi and Ko, Myeongseob and Jin, Ming and Jia, Ruoxi}, - date = {2023-02-01}, - url = {https://openreview.net/forum?id=JJuP86nBl4q}, + year = {2023}, + month = feb, urldate = {2023-04-25}, - abstract = {Traditionally, data valuation is posed as a problem of equitably splitting the validation performance of a learning algorithm among the training data. As a result, the calculated data values depend on many design choices of the underlying learning algorithm. However, this dependence is undesirable for many use cases of data valuation, such as setting priorities over different data sources in a data acquisition process and informing pricing mechanisms in a data marketplace. In these scenarios, data needs to be valued before the actual analysis and the choice of the learning algorithm is still undetermined then. Another side-effect of the dependence is that to assess the value of individual points, one needs to re-run the learning algorithm with and without a point, which incurs a large computation burden. This work leapfrogs over the current limits of data valuation methods by introducing a new framework that can value training data in a way that is oblivious to the downstream learning algorithm. Our main results are as follows. 
\$\textbackslash textbf\{(1)\}\$ We develop a proxy for the validation performance associated with a training set based on a non-conventional \$\textbackslash textit\{class-wise\}\$ \$\textbackslash textit\{Wasserstein distance\}\$ between the training and the validation set. We show that the distance characterizes the upper bound of the validation performance for any given model under certain Lipschitz conditions. \$\textbackslash textbf\{(2)\}\$ We develop a novel method to value individual data based on the sensitivity analysis of the \$\textbackslash textit\{class-wise\}\$ Wasserstein distance. Importantly, these values can be directly obtained \$\textbackslash textit\{for free\}\$ from the output of off-the-shelf optimization solvers once the Wasserstein distance is computed. \$\textbackslash textbf\{(3) \}\$We evaluate our new data valuation framework over various use cases related to detecting low-quality data and show that, surprisingly, the learning-agnostic feature of our framework enables a \$\textbackslash textit\{significant improvement\}\$ over the state-of-the-art performance while being \$\textbackslash textit\{orders of magnitude faster.\}\$}, - eventtitle = {The {{Eleventh International Conference}} on {{Learning Representations}} ({{ICLR}} 2023)}, + abstract = {Traditionally, data valuation is posed as a problem of equitably splitting the validation performance of a learning algorithm among the training data. As a result, the calculated data values depend on many design choices of the underlying learning algorithm. However, this dependence is undesirable for many use cases of data valuation, such as setting priorities over different data sources in a data acquisition process and informing pricing mechanisms in a data marketplace. In these scenarios, data needs to be valued before the actual analysis and the choice of the learning algorithm is still undetermined then. Another side-effect of the dependence is that to assess the value of individual points, one needs to re-run the learning algorithm with and without a point, which incurs a large computation burden. This work leapfrogs over the current limits of data valuation methods by introducing a new framework that can value training data in a way that is oblivious to the downstream learning algorithm. Our main results are as follows. \${\textbackslash}textbf\{(1)\}\$ We develop a proxy for the validation performance associated with a training set based on a non-conventional \${\textbackslash}textit\{class-wise\}\$ \${\textbackslash}textit\{Wasserstein distance\}\$ between the training and the validation set. We show that the distance characterizes the upper bound of the validation performance for any given model under certain Lipschitz conditions. \${\textbackslash}textbf\{(2)\}\$ We develop a novel method to value individual data based on the sensitivity analysis of the \${\textbackslash}textit\{class-wise\}\$ Wasserstein distance. Importantly, these values can be directly obtained \${\textbackslash}textit\{for free\}\$ from the output of off-the-shelf optimization solvers once the Wasserstein distance is computed. 
\${\textbackslash}textbf\{(3) \}\$We evaluate our new data valuation framework over various use cases related to detecting low-quality data and show that, surprisingly, the learning-agnostic feature of our framework enables a \${\textbackslash}textit\{significant improvement\}\$ over the state-of-the-art performance while being \${\textbackslash}textit\{orders of magnitude faster.\}\$}, langid = {english}, keywords = {notion} } @@ -149,15 +187,14 @@ @inproceedings{koh_understanding_2017 title = {Understanding {{Black-box Predictions}} via {{Influence Functions}}}, booktitle = {Proceedings of the 34th {{International Conference}} on {{Machine Learning}}}, author = {Koh, Pang Wei and Liang, Percy}, - date = {2017-07-17}, + year = {2017}, + month = jul, eprint = {1703.04730}, - eprinttype = {arxiv}, pages = {1885--1894}, - publisher = {{PMLR}}, - url = {https://proceedings.mlr.press/v70/koh17a.html}, + publisher = {PMLR}, urldate = {2022-05-09}, - abstract = {How can we explain the predictions of a black-box model? In this paper, we use influence functions — a classic technique from robust statistics — to trace a model’s prediction through the learning algorithm and back to its training data, thereby identifying training points most responsible for a given prediction. To scale up influence functions to modern machine learning settings, we develop a simple, efficient implementation that requires only oracle access to gradients and Hessian-vector products. We show that even on non-convex and non-differentiable models where the theory breaks down, approximations to influence functions can still provide valuable information. On linear models and convolutional neural networks, we demonstrate that influence functions are useful for multiple purposes: understanding model behavior, debugging models, detecting dataset errors, and even creating visually-indistinguishable training-set attacks.}, - eventtitle = {International {{Conference}} on {{Machine Learning}}}, + abstract = {How can we explain the predictions of a black-box model? In this paper, we use influence functions --- a classic technique from robust statistics --- to trace a model's prediction through the learning algorithm and back to its training data, thereby identifying training points most responsible for a given prediction. To scale up influence functions to modern machine learning settings, we develop a simple, efficient implementation that requires only oracle access to gradients and Hessian-vector products. We show that even on non-convex and non-differentiable models where the theory breaks down, approximations to influence functions can still provide valuable information. 
On linear models and convolutional neural networks, we demonstrate that influence functions are useful for multiple purposes: understanding model behavior, debugging models, detecting dataset errors, and even creating visually-indistinguishable training-set attacks.}, + archiveprefix = {arxiv}, langid = {english}, keywords = {notion} } @@ -167,16 +204,31 @@ @inproceedings{kwon_beta_2022 shorttitle = {Beta {{Shapley}}}, booktitle = {Proceedings of the 25th {{International Conference}} on {{Artificial Intelligence}} and {{Statistics}} ({{AISTATS}}) 2022,}, author = {Kwon, Yongchan and Zou, James}, - date = {2022-01-18}, + year = {2022}, + month = jan, volume = {151}, eprint = {2110.14049}, - eprinttype = {arxiv}, - publisher = {{PMLR}}, - location = {{Valencia, Spain}}, - url = {http://arxiv.org/abs/2110.14049}, + publisher = {PMLR}, + address = {Valencia, Spain}, urldate = {2022-04-06}, abstract = {Data Shapley has recently been proposed as a principled framework to quantify the contribution of individual datum in machine learning. It can effectively identify helpful or harmful data points for a learning algorithm. In this paper, we propose Beta Shapley, which is a substantial generalization of Data Shapley. Beta Shapley arises naturally by relaxing the efficiency axiom of the Shapley value, which is not critical for machine learning settings. Beta Shapley unifies several popular data valuation methods and includes data Shapley as a special case. Moreover, we prove that Beta Shapley has several desirable statistical properties and propose efficient algorithms to estimate it. We demonstrate that Beta Shapley outperforms state-of-the-art data valuation methods on several downstream ML tasks such as: 1) detecting mislabeled training data; 2) learning with subsamples; and 3) identifying points whose addition or removal have the largest positive or negative impact on the model.}, - eventtitle = {{{AISTATS}} 2022}, + archiveprefix = {arxiv}, + langid = {english}, + keywords = {notion} +} + +@inproceedings{kwon_dataoob_2023, + title = {Data-{{OOB}}: {{Out-of-bag Estimate}} as a {{Simple}} and {{Efficient Data Value}}}, + shorttitle = {Data-{{OOB}}}, + booktitle = {Proceedings of the 40th {{International Conference}} on {{Machine Learning}}}, + author = {Kwon, Yongchan and Zou, James}, + year = {2023}, + month = jul, + pages = {18135--18152}, + publisher = {PMLR}, + issn = {2640-3498}, + urldate = {2023-09-06}, + abstract = {Data valuation is a powerful framework for providing statistical insights into which data are beneficial or detrimental to model training. Many Shapley-based data valuation methods have shown promising results in various downstream tasks, however, they are well known to be computationally challenging as it requires training a large number of models. As a result, it has been recognized as infeasible to apply to large datasets. To address this issue, we propose Data-OOB, a new data valuation method for a bagging model that utilizes the out-of-bag estimate. The proposed method is computationally efficient and can scale to millions of data by reusing trained weak learners. Specifically, Data-OOB takes less than 2.25 hours on a single CPU processor when there are \$10\^{}6\$ samples to evaluate and the input dimension is 100. Furthermore, Data-OOB has solid theoretical interpretations in that it identifies the same important data point as the infinitesimal jackknife influence function when two different points are compared. 
We conduct comprehensive experiments using 12 classification datasets, each with thousands of sample sizes. We demonstrate that the proposed method significantly outperforms existing state-of-the-art data valuation methods in identifying mislabeled data and finding a set of helpful (or harmful) data points, highlighting the potential for applying data values in real-world applications.}, langid = {english}, keywords = {notion} } @@ -185,65 +237,75 @@ @inproceedings{kwon_efficient_2021 title = {Efficient {{Computation}} and {{Analysis}} of {{Distributional Shapley Values}}}, booktitle = {Proceedings of the 24th {{International Conference}} on {{Artificial Intelligence}} and {{Statistics}}}, author = {Kwon, Yongchan and Rivas, Manuel A. and Zou, James}, - date = {2021-03-18}, + year = {2021}, + month = mar, eprint = {2007.01357}, - eprinttype = {arxiv}, pages = {793--801}, - publisher = {{PMLR}}, + publisher = {PMLR}, issn = {2640-3498}, - url = {http://proceedings.mlr.press/v130/kwon21a.html}, urldate = {2021-04-23}, abstract = {Distributional data Shapley value (DShapley) has recently been proposed as a principled framework to quantify the contribution of individual datum in machine learning. DShapley develops the founda...}, - eventtitle = {International {{Conference}} on {{Artificial Intelligence}} and {{Statistics}}}, + archiveprefix = {arxiv}, + langid = {english} +} + +@inproceedings{martens_optimizing_2015, + title = {Optimizing {{Neural Networks}} with {{Kronecker-factored Approximate Curvature}}}, + booktitle = {Proceedings of the 32nd {{International Conference}} on {{Machine Learning}}}, + author = {Martens, James and Grosse, Roger}, + year = {2015}, + month = jun, + pages = {2408--2417}, + publisher = {PMLR}, + issn = {1938-7228}, + urldate = {2022-11-26}, + abstract = {We propose an efficient method for approximating natural gradient descent in neural networks which we call Kronecker-factored Approximate Curvature (K-FAC). K-FAC is based on an efficiently invertible approximation of a neural network's Fisher information matrix which is neither diagonal nor low-rank, and in some cases is completely non-sparse. It is derived by approximating various large blocks of the Fisher (corresponding to entire layers) as being the Kronecker product of two much smaller matrices. While only several times more expensive to compute than the plain stochastic gradient, the updates produced by K-FAC make much more progress optimizing the objective, which results in an algorithm that can be much faster than stochastic gradient descent with momentum in practice. And unlike some previously proposed approximate natural-gradient/Newton methods which use high-quality non-diagonal curvature matrices (such as Hessian-free optimization), K-FAC works very well in highly stochastic optimization regimes. This is because the cost of storing and inverting K-FAC's approximation to the curvature matrix does not depend on the amount of data used to estimate it, which is a feature typically associated only with diagonal or low-rank approximations to the curvature matrix.}, langid = {english} } @article{mitchell_sampling_2022, title = {Sampling {{Permutations}} for {{Shapley Value Estimation}}}, author = {Mitchell, Rory and Cooper, Joshua and Frank, Eibe and Holmes, Geoffrey}, - date = {2022}, - journaltitle = {Journal of Machine Learning Research}, - shortjournal = {J. Mach. Learn. 
Res.}, + year = {2022}, + journal = {Journal of Machine Learning Research}, volume = {23}, number = {43}, pages = {1--46}, issn = {1533-7928}, - url = {http://jmlr.org/papers/v23/21-0439.html}, urldate = {2022-10-23}, - abstract = {Game-theoretic attribution techniques based on Shapley values are used to interpret black-box machine learning models, but their exact calculation is generally NP-hard, requiring approximation methods for non-trivial models. As the computation of Shapley values can be expressed as a summation over a set of permutations, a common approach is to sample a subset of these permutations for approximation. Unfortunately, standard Monte Carlo sampling methods can exhibit slow convergence, and more sophisticated quasi-Monte Carlo methods have not yet been applied to the space of permutations. To address this, we investigate new approaches based on two classes of approximation methods and compare them empirically. First, we demonstrate quadrature techniques in a RKHS containing functions of permutations, using the Mallows kernel in combination with kernel herding and sequential Bayesian quadrature. The RKHS perspective also leads to quasi-Monte Carlo type error bounds, with a tractable discrepancy measure defined on permutations. Second, we exploit connections between the hypersphere S d−2 Sd−2 and permutations to create practical algorithms for generating permutation samples with good properties. Experiments show the above techniques provide significant improvements for Shapley value estimates over existing methods, converging to a smaller RMSE in the same number of model evaluations.} + abstract = {Game-theoretic attribution techniques based on Shapley values are used to interpret black-box machine learning models, but their exact calculation is generally NP-hard, requiring approximation methods for non-trivial models. As the computation of Shapley values can be expressed as a summation over a set of permutations, a common approach is to sample a subset of these permutations for approximation. Unfortunately, standard Monte Carlo sampling methods can exhibit slow convergence, and more sophisticated quasi-Monte Carlo methods have not yet been applied to the space of permutations. To address this, we investigate new approaches based on two classes of approximation methods and compare them empirically. First, we demonstrate quadrature techniques in a RKHS containing functions of permutations, using the Mallows kernel in combination with kernel herding and sequential Bayesian quadrature. The RKHS perspective also leads to quasi-Monte Carlo type error bounds, with a tractable discrepancy measure defined on permutations. Second, we exploit connections between the hypersphere S d-2 Sd-2 and permutations to create practical algorithms for generating permutation samples with good properties. 
Experiments show the above techniques provide significant improvements for Shapley value estimates over existing methods, converging to a smaller RMSE in the same number of model evaluations.} } @inproceedings{okhrati_multilinear_2021, title = {A {{Multilinear Sampling Algorithm}} to {{Estimate Shapley Values}}}, booktitle = {2020 25th {{International Conference}} on {{Pattern Recognition}} ({{ICPR}})}, author = {Okhrati, Ramin and Lipani, Aldo}, - date = {2021-01}, + year = {2021}, + month = jan, eprint = {2010.12082}, - eprinttype = {arxiv}, pages = {7992--7999}, - publisher = {{IEEE}}, + publisher = {IEEE}, issn = {1051-4651}, doi = {10.1109/ICPR48806.2021.9412511}, - url = {https://ieeexplore.ieee.org/abstract/document/9412511}, abstract = {Shapley values are great analytical tools in game theory to measure the importance of a player in a game. Due to their axiomatic and desirable properties such as efficiency, they have become popular for feature importance analysis in data science and machine learning. However, the time complexity to compute Shapley values based on the original formula is exponential, and as the number of features increases, this becomes infeasible. Castro et al. [1] developed a sampling algorithm, to estimate Shapley values. In this work, we propose a new sampling method based on a multilinear extension technique as applied in game theory. The aim is to provide a more efficient (sampling) method for estimating Shapley values. Our method is applicable to any machine learning model, in particular for either multiclass classifications or regression problems. We apply the method to estimate Shapley values for multilayer perceptrons (MLPs) and through experimentation on two datasets, we demonstrate that our method provides more accurate estimations of the Shapley values by reducing the variance of the sampling statistics.}, - eventtitle = {2020 25th {{International Conference}} on {{Pattern Recognition}} ({{ICPR}})}, + archiveprefix = {arxiv}, langid = {english}, keywords = {notion} } @inproceedings{schioppa_scaling_2021, title = {Scaling {{Up Influence Functions}}}, + booktitle = {{{AAAI-22}}}, author = {Schioppa, Andrea and Zablotskaia, Polina and Vilar, David and Sokolov, Artem}, - date = {2021-12-06}, + year = {2021}, + month = dec, eprint = {2112.03052}, - eprinttype = {arxiv}, - eprintclass = {cs}, - publisher = {{arXiv}}, + primaryclass = {cs}, + publisher = {arXiv}, doi = {10.48550/arXiv.2112.03052}, - url = {http://arxiv.org/abs/2112.03052}, urldate = {2023-03-10}, abstract = {We address efficient calculation of influence functions for tracking predictions back to the training data. We propose and analyze a new approach to speeding up the inverse Hessian calculation based on Arnoldi iteration. With this improvement, we achieve, to the best of our knowledge, the first successful implementation of influence functions that scales to full-size (language and vision) Transformer models with several hundreds of millions of parameters. We evaluate our approach on image classification and sequence-to-sequence tasks with tens to a hundred of millions of training examples. Our code will be available at https://github.com/google-research/jax-influence.}, - eventtitle = {{{AAAI-22}}}, + archiveprefix = {arxiv}, keywords = {notion} } @@ -252,126 +314,75 @@ @inproceedings{schoch_csshapley_2022 shorttitle = {{{CS-Shapley}}}, booktitle = {Proc. 
of the Thirty-Sixth {{Conference}} on {{Neural Information Processing Systems}} ({{NeurIPS}})}, author = {Schoch, Stephanie and Xu, Haifeng and Ji, Yangfeng}, - date = {2022-10-31}, - location = {{New Orleans, Louisiana, USA}}, - url = {https://openreview.net/forum?id=KTOcrOR5mQ9}, + year = {2022}, + month = oct, + address = {New Orleans, Louisiana, USA}, urldate = {2022-11-23}, - abstract = {Data valuation, or the valuation of individual datum contributions, has seen growing interest in machine learning due to its demonstrable efficacy for tasks such as noisy label detection. In particular, due to the desirable axiomatic properties, several Shapley value approximations have been proposed. In these methods, the value function is usually defined as the predictive accuracy over the entire development set. However, this limits the ability to differentiate between training instances that are helpful or harmful to their own classes. Intuitively, instances that harm their own classes may be noisy or mislabeled and should receive a lower valuation than helpful instances. In this work, we propose CS-Shapley, a Shapley value with a new value function that discriminates between training instances’ in-class and out-of-class contributions. Our theoretical analysis shows the proposed value function is (essentially) the unique function that satisfies two desirable properties for evaluating data values in classification. Further, our experiments on two benchmark evaluation tasks (data removal and noisy label detection) and four classifiers demonstrate the effectiveness of CS-Shapley over existing methods. Lastly, we evaluate the “transferability” of data values estimated from one classifier to others, and our results suggest Shapley-based data valuation is transferable for application across different models.}, - eventtitle = {Advances in {{Neural Information Processing Systems}} ({{NeurIPS}} 2022)}, + abstract = {Data valuation, or the valuation of individual datum contributions, has seen growing interest in machine learning due to its demonstrable efficacy for tasks such as noisy label detection. In particular, due to the desirable axiomatic properties, several Shapley value approximations have been proposed. In these methods, the value function is usually defined as the predictive accuracy over the entire development set. However, this limits the ability to differentiate between training instances that are helpful or harmful to their own classes. Intuitively, instances that harm their own classes may be noisy or mislabeled and should receive a lower valuation than helpful instances. In this work, we propose CS-Shapley, a Shapley value with a new value function that discriminates between training instances' in-class and out-of-class contributions. Our theoretical analysis shows the proposed value function is (essentially) the unique function that satisfies two desirable properties for evaluating data values in classification. Further, our experiments on two benchmark evaluation tasks (data removal and noisy label detection) and four classifiers demonstrate the effectiveness of CS-Shapley over existing methods. 
Lastly, we evaluate the ``transferability'' of data values estimated from one classifier to others, and our results suggest Shapley-based data valuation is transferable for application across different models.}, langid = {english}, keywords = {notion} } -@online{wang_data_2022, - title = {Data {{Banzhaf}}: {{A Robust Data Valuation Framework}} for {{Machine Learning}}}, - shorttitle = {Data {{Banzhaf}}}, - author = {Wang, Jiachen T. and Jia, Ruoxi}, - date = {2022-10-22}, - eprint = {2205.15466}, - eprinttype = {arxiv}, - eprintclass = {cs, stat}, - doi = {10.48550/arXiv.2205.15466}, - url = {http://arxiv.org/abs/2205.15466}, - urldate = {2022-10-28}, - abstract = {This paper studies the robustness of data valuation to noisy model performance scores. Particularly, we find that the inherent randomness of the widely used stochastic gradient descent can cause existing data value notions (e.g., the Shapley value and the Leave-one-out error) to produce inconsistent data value rankings across different runs. To address this challenge, we first pose a formal framework within which one can measure the robustness of a data value notion. We show that the Banzhaf value, a value notion originated from cooperative game theory literature, achieves the maximal robustness among all semivalues -- a class of value notions that satisfy crucial properties entailed by ML applications. We propose an algorithm to efficiently estimate the Banzhaf value based on the Maximum Sample Reuse (MSR) principle. We derive the lower bound sample complexity for Banzhaf value estimation, and we show that our MSR algorithm's sample complexity is close to the lower bound. Our evaluation demonstrates that the Banzhaf value outperforms the existing semivalue-based data value notions on several downstream ML tasks such as learning with weighted samples and noisy label detection. Overall, our study suggests that when the underlying ML algorithm is stochastic, the Banzhaf value is a promising alternative to the semivalue-based data value schemes given its computational advantage and ability to robustly differentiate data quality.}, - pubstate = {preprint}, - keywords = {notion} -} - @inproceedings{wang_improving_2022, title = {Improving {{Cooperative Game Theory-based Data Valuation}} via {{Data Utility Learning}}}, + booktitle = {International {{Conference}} on {{Learning Representations}} ({{ICLR}} 2022). {{Workshop}} on {{Socially Responsible Machine Learning}}}, author = {Wang, Tianhao and Yang, Yu and Jia, Ruoxi}, - date = {2022-04-07}, + year = {2022}, + month = apr, eprint = {2107.06336v2}, - eprinttype = {arxiv}, - publisher = {{arXiv}}, + publisher = {arXiv}, doi = {10.48550/arXiv.2107.06336}, - url = {http://arxiv.org/abs/2107.06336v2}, urldate = {2022-05-19}, abstract = {The Shapley value (SV) and Least core (LC) are classic methods in cooperative game theory for cost/profit sharing problems. Both methods have recently been proposed as a principled solution for data valuation tasks, i.e., quantifying the contribution of individual datum in machine learning. However, both SV and LC suffer computational challenges due to the need for retraining models on combinatorially many data subsets. In this work, we propose to boost the efficiency in computing Shapley value or Least core by learning to estimate the performance of a learning algorithm on unseen data combinations. Theoretically, we derive bounds relating the error in the predicted learning performance to the approximation error in SV and LC. 
Empirically, we show that the proposed method can significantly improve the accuracy of SV and LC estimation.}, - eventtitle = {International {{Conference}} on {{Learning Representations}} ({{ICLR}} 2022). {{Workshop}} on {{Socially Responsible Machine Learning}}}, + archiveprefix = {arxiv}, langid = {english}, keywords = {notion} } +@misc{watson_accelerated_2023, + title = {Accelerated {{Shapley Value Approximation}} for {{Data Evaluation}}}, + author = {Watson, Lauren and Kujawa, Zeno and Andreeva, Rayna and Yang, Hao-Tsung and Elahi, Tariq and Sarkar, Rik}, + year = {2023}, + month = nov, + number = {arXiv:2311.05346}, + eprint = {2311.05346}, + primaryclass = {cs}, + publisher = {arXiv}, + doi = {10.48550/arXiv.2311.05346}, + urldate = {2023-12-07}, + abstract = {Data valuation has found various applications in machine learning, such as data filtering, efficient learning and incentives for data sharing. The most popular current approach to data valuation is the Shapley value. While popular for its various applications, Shapley value is computationally expensive even to approximate, as it requires repeated iterations of training models on different subsets of data. In this paper we show that the Shapley value of data points can be approximated more efficiently by leveraging the structural properties of machine learning problems. We derive convergence guarantees on the accuracy of the approximate Shapley value for different learning settings including Stochastic Gradient Descent with convex and non-convex loss functions. Our analysis suggests that in fact models trained on small subsets are more important in the context of data valuation. Based on this idea, we describe \${\textbackslash}delta\$-Shapley -- a strategy of only using small subsets for the approximation. Experiments show that this approach preserves approximate value and rank of data, while achieving speedup of up to 9.9x. In pre-trained networks the approach is found to bring more efficiency in terms of accurate evaluation using small subsets.}, + archiveprefix = {arxiv} +} + @inproceedings{wu_davinz_2022, title = {{{DAVINZ}}: {{Data Valuation}} Using {{Deep Neural Networks}} at {{Initialization}}}, shorttitle = {{{DAVINZ}}}, booktitle = {Proceedings of the 39th {{International Conference}} on {{Machine Learning}}}, author = {Wu, Zhaoxuan and Shu, Yao and Low, Bryan Kian Hsiang}, - date = {2022-06-28}, + year = {2022}, + month = jun, pages = {24150--24176}, - publisher = {{PMLR}}, - url = {https://proceedings.mlr.press/v162/wu22j.html}, + publisher = {PMLR}, urldate = {2022-10-29}, abstract = {Recent years have witnessed a surge of interest in developing trustworthy methods to evaluate the value of data in many real-world applications (e.g., collaborative machine learning, data marketplaces). Existing data valuation methods typically valuate data using the generalization performance of converged machine learning models after their long-term model training, hence making data valuation on large complex deep neural networks (DNNs) unaffordable. To this end, we theoretically derive a domain-aware generalization bound to estimate the generalization performance of DNNs without model training. We then exploit this theoretically derived generalization bound to develop a novel training-free data valuation method named data valuation at initialization (DAVINZ) on DNNs, which consistently achieves remarkable effectiveness and efficiency in practice. 
Moreover, our training-free DAVINZ, surprisingly, can even theoretically and empirically enjoy the desirable properties that training-based data valuation methods usually attain, thus making it more trustworthy in practice.}, - eventtitle = {International {{Conference}} on {{Machine Learning}}}, langid = {english}, keywords = {notion} } @inproceedings{yan_if_2021, - title = {If {{You Like Shapley Then You}}’ll {{Love}} the {{Core}}}, + title = {If {{You Like Shapley Then You}}'ll {{Love}} the {{Core}}}, booktitle = {Proceedings of the 35th {{AAAI Conference}} on {{Artificial Intelligence}}, 2021}, author = {Yan, Tom and Procaccia, Ariel D.}, - date = {2021-05-18}, + year = {2021}, + month = may, volume = {6}, pages = {5751--5759}, - publisher = {{Association for the Advancement of Artificial Intelligence}}, - location = {{Virtual conference}}, + publisher = {Association for the Advancement of Artificial Intelligence}, + address = {Virtual conference}, doi = {10.1609/aaai.v35i6.16721}, - url = {https://ojs.aaai.org/index.php/AAAI/article/view/16721}, urldate = {2021-04-23}, - abstract = {The prevalent approach to problems of credit assignment in machine learning — such as feature and data valuation— is to model the problem at hand as a cooperative game and apply the Shapley value. But cooperative game theory offers a rich menu of alternative solution concepts, which famously includes the core and its variants. Our goal is to challenge the machine learning community’s current consensus around the Shapley value, and make a case for the core as a viable alternative. To that end, we prove that arbitrarily good approximations to the least core — a core relaxation that is always feasible — can be computed efficiently (but prove an impossibility for a more refined solution concept, the nucleolus). We also perform experiments that corroborate these theoretical results and shed light on settings where the least core may be preferable to the Shapley value.}, - eventtitle = {{{AAAI Conference}} on {{Artificial Intelligence}}}, + abstract = {The prevalent approach to problems of credit assignment in machine learning --- such as feature and data valuation--- is to model the problem at hand as a cooperative game and apply the Shapley value. But cooperative game theory offers a rich menu of alternative solution concepts, which famously includes the core and its variants. Our goal is to challenge the machine learning community's current consensus around the Shapley value, and make a case for the core as a viable alternative. To that end, we prove that arbitrarily good approximations to the least core --- a core relaxation that is always feasible --- can be computed efficiently (but prove an impossibility for a more refined solution concept, the nucleolus). We also perform experiments that corroborate these theoretical results and shed light on settings where the least core may be preferable to the Shapley value.}, + copyright = {Copyright (c) 2021, Association for the Advancement of Artificial Intelligence (www.aaai.org). 
All rights reserved.}, langid = {english}, keywords = {notion} } - -@InProceedings{kwon_data_2023, - title = {Data-{OOB}: Out-of-bag Estimate as a Simple and Efficient Data Value}, - author = {Kwon, Yongchan and Zou, James}, - booktitle = {Proceedings of the 40th International Conference on Machine Learning}, - pages = {18135--18152}, - year = {2023}, - editor = {Krause, Andreas and Brunskill, Emma and Cho, Kyunghyun and Engelhardt, Barbara and Sabato, Sivan and Scarlett, Jonathan}, - volume = {202}, - series = {Proceedings of Machine Learning Research}, - month = {23--29 Jul}, - publisher = {PMLR}, - pdf = {https://proceedings.mlr.press/v202/kwon23e/kwon23e.pdf}, - url = {https://proceedings.mlr.press/v202/kwon23e.html}, - abstract = {Data valuation is a powerful framework for providing statistical insights into which data are beneficial or detrimental to model training. Many Shapley-based data valuation methods have shown promising results in various downstream tasks, however, they are well known to be computationally challenging as it requires training a large number of models. As a result, it has been recognized as infeasible to apply to large datasets. To address this issue, we propose Data-OOB, a new data valuation method for a bagging model that utilizes the out-of-bag estimate. The proposed method is computationally efficient and can scale to millions of data by reusing trained weak learners. Specifically, Data-OOB takes less than $2.25$ hours on a single CPU processor when there are $10^6$ samples to evaluate and the input dimension is $100$. Furthermore, Data-OOB has solid theoretical interpretations in that it identifies the same important data point as the infinitesimal jackknife influence function when two different points are compared. We conduct comprehensive experiments using 12 classification datasets, each with thousands of sample sizes. We demonstrate that the proposed method significantly outperforms existing state-of-the-art data valuation methods in identifying mislabeled data and finding a set of helpful (or harmful) data points, highlighting the potential for applying data values in real-world applications.} -} - -@article{george2018fast, - title={Fast approximate natural gradient descent in a kronecker factored eigenbasis}, - author={George, Thomas and Laurent, C{\'e}sar and Bouthillier, Xavier and Ballas, Nicolas and Vincent, Pascal}, - journal={Advances in Neural Information Processing Systems}, - volume={31}, - year={2018} -} - -@inproceedings{martens2015optimizing, - title={Optimizing neural networks with kronecker-factored approximate curvature}, - author={Martens, James and Grosse, Roger}, - booktitle={International conference on machine learning}, - pages={2408--2417}, - year={2015}, - organization={PMLR} -} - -@misc{frangella_randomized_2021, - title = {Randomized {{Nystr}}{\textbackslash}"om {{Preconditioning}}}, - author = {Frangella, Zachary and Tropp, Joel A. and Udell, Madeleine}, - year = {2021}, - month = dec, - number = {arXiv:2110.02820}, - eprint = {2110.02820}, - primaryclass = {cs, math}, - publisher = {{arXiv}}, - doi = {10.48550/arXiv.2110.02820}, - urldate = {2023-06-04}, - abstract = {This paper introduces the Nystr{\textbackslash}"om PCG algorithm for solving a symmetric positive-definite linear system. The algorithm applies the randomized Nystr{\textbackslash}"om method to form a low-rank approximation of the matrix, which leads to an efficient preconditioner that can be deployed with the conjugate gradient algorithm. 
Theoretical analysis shows that preconditioned system has constant condition number as soon as the rank of the approximation is comparable with the number of effective degrees of freedom in the matrix. The paper also develops adaptive methods that provably achieve similar performance without knowledge of the effective dimension. Numerical tests show that Nystr{\textbackslash}"om PCG can rapidly solve large linear systems that arise in data analysis problems, and it surpasses several competing methods from the literature.}, - archiveprefix = {arxiv}, -} \ No newline at end of file diff --git a/docs/influence/influence_function_model.md b/docs/influence/influence_function_model.md index b4c95e0ba..951ea8420 100644 --- a/docs/influence/influence_function_model.md +++ b/docs/influence/influence_function_model.md @@ -99,7 +99,7 @@ if_model = ArnoldiInfluence( ### Eigenvalue Corrected K-FAC K-FAC, short for Kronecker-Factored Approximate Curvature, is a method that approximates the Fisher information matrix [FIM](https://en.wikipedia.org/wiki/Fisher_information) of a model. It is possible to show that for classification models with appropriate loss functions the FIM is equal to the Hessian of the model’s loss over the dataset. In this restricted but nonetheless important context K-FAC offers an efficient way to approximate the Hessian and hence the influence scores. -For more info and details refer to the original paper [@martens2015optimizing]. +For more info and details refer to the original paper [@martens_optimizing_2015]. The K-FAC method is implemented in the class [EkfacInfluence](pydvl/influence/torch/influence_function_model.py). The following code snippet shows how to use the K-FAC method to calculate the influence function of a model. Note that, in contrast to the other methods for influence function calculation, K-FAC does not require the loss function as an input. This is because the current implementation is only applicable to classification models with a cross entropy loss function. @@ -112,7 +112,7 @@ if_model = EkfacInfluence( ``` Upon initialization, the K-FAC method will parse the model and extract which layers require grad and which do not. Then it will only calculate the influence scores for the layers that require grad. The current implementation of the K-FAC method is only available for linear layers, and therefore if the model contains non-linear layers that require gradient the K-FAC method will raise a NotImplementedLayerRepresentationException. -A further improvement of the K-FAC method is the Eigenvalue Corrected K-FAC (EKFAC) method [@george2018fast], which allows to further re-fit the eigenvalues of the Hessian, thus providing a more accurate approximation. On top of the K-FAC method, the EKFAC method is implemented by setting `update_diagonal=True` when initialising [EkfacInfluence](pydvl/influence/torch/influence_function_model.py). The following code snippet shows how to use the EKFAC method to calculate the influence function of a model. +A further improvement of the K-FAC method is the Eigenvalue Corrected K-FAC (EKFAC) method [@george_fast_2018], which allows to further re-fit the eigenvalues of the Hessian, thus providing a more accurate approximation. On top of the K-FAC method, the EKFAC method is implemented by setting `update_diagonal=True` when initialising [EkfacInfluence](pydvl/influence/torch/influence_function_model.py). The following code snippet shows how to use the EKFAC method to calculate the influence function of a model. 
```python from pydvl.influence.torch import EkfacInfluence From 114f87a2352404280900208970125fca2bddf43a Mon Sep 17 00:00:00 2001 From: Kristof Schroeder Date: Tue, 12 Mar 2024 10:01:38 +0100 Subject: [PATCH 2/3] Switch to BetterBibLatex format to include url field --- docs/assets/pydvl.bib | 215 +++++++++++++++++++++++------------------- 1 file changed, 117 insertions(+), 98 deletions(-) diff --git a/docs/assets/pydvl.bib b/docs/assets/pydvl.bib index c0b6df378..b96ee442a 100644 --- a/docs/assets/pydvl.bib +++ b/docs/assets/pydvl.bib @@ -1,26 +1,28 @@ @article{agarwal_secondorder_2017, title = {Second-{{Order Stochastic Optimization}} for {{Machine Learning}} in {{Linear Time}}}, author = {Agarwal, Naman and Bullins, Brian and Hazan, Elad}, - year = {2017}, - journal = {Journal of Machine Learning Research}, + date = {2017}, + journaltitle = {Journal of Machine Learning Research}, + shortjournal = {JMLR}, volume = {18}, eprint = {1602.03943}, + eprinttype = {arxiv}, pages = {1--40}, + url = {https://www.jmlr.org/papers/v18/16-491.html}, abstract = {First-order stochastic methods are the state-of-the-art in large-scale machine learning optimization owing to efficient per-iteration complexity. Second-order methods, while able to provide faster convergence, have been much less explored due to the high cost of computing the second-order information. In this paper we develop second-order stochastic methods for optimization problems in machine learning that match the per-iteration cost of gradient based methods, and in certain settings improve upon the overall running time over popular first-order methods. Furthermore, our algorithm has the desirable property of being implementable in time linear in the sparsity of the input data.}, - archiveprefix = {arxiv}, langid = {english} } @article{benmerzoug_re_2023, title = {[{{Re}}] {{If}} You like {{Shapley}}, Then You'll Love the Core}, author = {Benmerzoug, Anes and Delgado, Miguel de Benito}, - year = {2023}, - month = jul, - journal = {ReScience C}, + date = {2023-07-31}, + journaltitle = {ReScience C}, volume = {9}, number = {2}, pages = {\#32}, doi = {10.5281/zenodo.8173733}, + url = {https://zenodo.org/record/8173733}, urldate = {2023-08-27}, abstract = {We investigate the results of [1] in the field of data valuation. We repeat their experiments and conclude that the (Monte Carlo) Least Core is sensitive to important characteristics of the ML problem of interest, making it difficult to apply.}, keywords = {notion} @@ -28,48 +30,49 @@ @article{benmerzoug_re_2023 @article{castro_polynomial_2009, title = {Polynomial Calculation of the {{Shapley}} Value Based on Sampling}, - author = {Castro, Javier and G{\'o}mez, Daniel and Tejada, Juan}, - year = {2009}, - month = may, - journal = {Computers \& Operations Research}, + author = {Castro, Javier and Gómez, Daniel and Tejada, Juan}, + date = {2009-05-01}, + journaltitle = {Computers \& Operations Research}, + shortjournal = {Computers \& Operations Research}, series = {Selected Papers Presented at the {{Tenth International Symposium}} on {{Locational Decisions}} ({{ISOLDE X}})}, volume = {36}, number = {5}, pages = {1726--1730}, issn = {0305-0548}, doi = {10.1016/j.cor.2008.04.004}, + url = {http://www.sciencedirect.com/science/article/pii/S0305054808000804}, urldate = {2020-11-21}, abstract = {In this paper we develop a polynomial method based on sampling theory that can be used to estimate the Shapley value (or any semivalue) for cooperative games. 
Besides analyzing the complexity problem, we examine some desirable statistical properties of the proposed approach and provide some computational results.}, langid = {english}, keywords = {notion} } -@misc{frangella_randomized_2021, - title = {Randomized {{Nystr{\"o}m Preconditioning}}}, +@online{frangella_randomized_2021, + title = {Randomized {{Nyström Preconditioning}}}, author = {Frangella, Zachary and Tropp, Joel A. and Udell, Madeleine}, - year = {2021}, - month = dec, - number = {arXiv:2110.02820}, + date = {2021-12-17}, eprint = {2110.02820}, - primaryclass = {cs, math}, - publisher = {arXiv}, + eprinttype = {arxiv}, + eprintclass = {cs, math}, doi = {10.48550/arXiv.2110.02820}, + url = {http://arxiv.org/abs/2110.02820}, urldate = {2023-06-04}, - abstract = {This paper introduces the Nystr{\textbackslash}"om PCG algorithm for solving a symmetric positive-definite linear system. The algorithm applies the randomized Nystr{\textbackslash}"om method to form a low-rank approximation of the matrix, which leads to an efficient preconditioner that can be deployed with the conjugate gradient algorithm. Theoretical analysis shows that preconditioned system has constant condition number as soon as the rank of the approximation is comparable with the number of effective degrees of freedom in the matrix. The paper also develops adaptive methods that provably achieve similar performance without knowledge of the effective dimension. Numerical tests show that Nystr{\textbackslash}"om PCG can rapidly solve large linear systems that arise in data analysis problems, and it surpasses several competing methods from the literature.}, - archiveprefix = {arxiv} + abstract = {This paper introduces the Nystr\textbackslash "om PCG algorithm for solving a symmetric positive-definite linear system. The algorithm applies the randomized Nystr\textbackslash "om method to form a low-rank approximation of the matrix, which leads to an efficient preconditioner that can be deployed with the conjugate gradient algorithm. Theoretical analysis shows that preconditioned system has constant condition number as soon as the rank of the approximation is comparable with the number of effective degrees of freedom in the matrix. The paper also develops adaptive methods that provably achieve similar performance without knowledge of the effective dimension. Numerical tests show that Nystr\textbackslash "om PCG can rapidly solve large linear systems that arise in data analysis problems, and it surpasses several competing methods from the literature.}, + pubstate = {preprint} } @inproceedings{george_fast_2018, title = {Fast {{Approximate Natural Gradient Descent}} in a {{Kronecker Factored Eigenbasis}}}, booktitle = {Advances in {{Neural Information Processing Systems}}}, - author = {George, Thomas and Laurent, C{\'e}sar and Bouthillier, Xavier and Ballas, Nicolas and Vincent, Pascal}, - year = {2018}, + author = {George, Thomas and Laurent, César and Bouthillier, Xavier and Ballas, Nicolas and Vincent, Pascal}, + date = {2018}, volume = {31}, eprint = {1806.03884}, + eprinttype = {arxiv}, publisher = {Curran Associates, Inc.}, + url = {https://proceedings.neurips.cc/paper/2018/hash/48000647b315f6f00f913caa757a70b3-Abstract.html}, urldate = {2024-01-12}, - abstract = {Optimization algorithms that leverage gradient covariance information, such as variants of natural gradient descent (Amari, 1998), offer the prospect of yielding more effective descent directions. 
For models with many parameters, the covari- ance matrix they are based on becomes gigantic, making them inapplicable in their original form. This has motivated research into both simple diagonal approxima- tions and more sophisticated factored approximations such as KFAC (Heskes, 2000; Martens \& Grosse, 2015; Grosse \& Martens, 2016). In the present work we draw inspiration from both to propose a novel approximation that is provably better than KFAC and amendable to cheap partial updates. It consists in tracking a diagonal variance, not in parameter coordinates, but in a Kronecker-factored eigenbasis, in which the diagonal approximation is likely to be more effective. Experiments show improvements over KFAC in optimization speed for several deep network architectures.}, - archiveprefix = {arxiv} + abstract = {Optimization algorithms that leverage gradient covariance information, such as variants of natural gradient descent (Amari, 1998), offer the prospect of yielding more effective descent directions. For models with many parameters, the covari- ance matrix they are based on becomes gigantic, making them inapplicable in their original form. This has motivated research into both simple diagonal approxima- tions and more sophisticated factored approximations such as KFAC (Heskes, 2000; Martens \& Grosse, 2015; Grosse \& Martens, 2016). In the present work we draw inspiration from both to propose a novel approximation that is provably better than KFAC and amendable to cheap partial updates. It consists in tracking a diagonal variance, not in parameter coordinates, but in a Kronecker-factored eigenbasis, in which the diagonal approximation is likely to be more effective. Experiments show improvements over KFAC in optimization speed for several deep network architectures.} } @inproceedings{ghorbani_data_2019, @@ -77,15 +80,16 @@ @inproceedings{ghorbani_data_2019 shorttitle = {Data {{Shapley}}}, booktitle = {Proceedings of the 36th {{International Conference}} on {{Machine Learning}}, {{PMLR}}}, author = {Ghorbani, Amirata and Zou, James}, - year = {2019}, - month = may, + date = {2019-05-24}, eprint = {1904.02868}, + eprinttype = {arxiv}, pages = {2242--2251}, publisher = {PMLR}, issn = {2640-3498}, + url = {http://proceedings.mlr.press/v97/ghorbani19c.html}, urldate = {2020-11-01}, abstract = {As data becomes the fuel driving technological and economic growth, a fundamental challenge is how to quantify the value of data in algorithmic predictions and decisions. For example, in healthcare and consumer markets, it has been suggested that individuals should be compensated for the data that they generate, but it is not clear what is an equitable valuation for individual data. In this work, we develop a principled framework to address data valuation in the context of supervised machine learning. Given a learning algorithm trained on n data points to produce a predictor, we propose data Shapley as a metric to quantify the value of each training datum to the predictor performance. Data Shapley uniquely satisfies several natural properties of equitable data valuation. We develop Monte Carlo and gradient-based methods to efficiently estimate data Shapley values in practical settings where complex learning algorithms, including neural networks, are trained on large datasets. 
In addition to being equitable, extensive experiments across biomedical, image and synthetic data demonstrate that data Shapley has several other benefits: 1) it is more powerful than the popular leave-one-out or leverage score in providing insight on what data is more valuable for a given learning task; 2) low Shapley value data effectively capture outliers and corruptions; 3) high Shapley value data inform what type of new data to acquire to improve the predictor.}, - archiveprefix = {arxiv}, + eventtitle = {International {{Conference}} on {{Machine Learning}} ({{ICML}} 2019)}, langid = {english}, keywords = {notion} } @@ -93,8 +97,9 @@ @inproceedings{ghorbani_data_2019 @article{hampel_influence_1974, title = {The {{Influence Curve}} and {{Its Role}} in {{Robust Estimation}}}, author = {Hampel, Frank R.}, - year = {1974}, - journal = {Journal of the American Statistical Association}, + date = {1974}, + journaltitle = {Journal of the American Statistical Association}, + shortjournal = {J. Am. Stat. Assoc.}, volume = {69}, number = {346}, eprint = {2285666}, @@ -103,35 +108,38 @@ @article{hampel_influence_1974 publisher = {[American Statistical Association, Taylor \& Francis, Ltd.]}, issn = {0162-1459}, doi = {10.2307/2285666}, + url = {https://www.jstor.org/stable/2285666}, urldate = {2022-05-09}, abstract = {This paper treats essentially the first derivative of an estimator viewed as functional and the ways in which it can be used to study local robustness properties. A theory of robust estimation "near" strict parametric models is briefly sketched and applied to some classical situations. Relations between von Mises functionals, the jackknife and U-statistics are indicated. A number of classical and new estimators are discussed, including trimmed and Winsorized means, Huber-estimators, and more generally maximum likelihood and M-estimators. Finally, a table with some numerical robustness properties is given.} } @inproceedings{hataya_nystrom_2023, - title = {Nystr{\"o}m {{Method}} for {{Accurate}} and {{Scalable Implicit Differentiation}}}, + title = {Nyström {{Method}} for {{Accurate}} and {{Scalable Implicit Differentiation}}}, booktitle = {Proceedings of {{The}} 26th {{International Conference}} on {{Artificial Intelligence}} and {{Statistics}}}, author = {Hataya, Ryuichiro and Yamada, Makoto}, - year = {2023}, - month = apr, + date = {2023-04-11}, pages = {4643--4654}, publisher = {PMLR}, issn = {2640-3498}, + url = {https://proceedings.mlr.press/v206/hataya23a.html}, urldate = {2024-02-26}, - abstract = {The essential difficulty of gradient-based bilevel optimization using implicit differentiation is to estimate the inverse Hessian vector product with respect to neural network parameters. This paper proposes to tackle this problem by the Nystr{\"o}m method and the Woodbury matrix identity, exploiting the low-rankness of the Hessian. Compared to existing methods using iterative approximation, such as conjugate gradient and the Neumann series approximation, the proposed method avoids numerical instability and can be efficiently computed in matrix operations without iterations. As a result, the proposed method works stably in various tasks and is faster than iterative approximations. Throughout experiments including large-scale hyperparameter optimization and meta learning, we demonstrate that the Nystr{\"o}m method consistently achieves comparable or even superior performance to other approaches. 
The source code is available from https://github.com/moskomule/hypergrad.}, + abstract = {The essential difficulty of gradient-based bilevel optimization using implicit differentiation is to estimate the inverse Hessian vector product with respect to neural network parameters. This paper proposes to tackle this problem by the Nyström method and the Woodbury matrix identity, exploiting the low-rankness of the Hessian. Compared to existing methods using iterative approximation, such as conjugate gradient and the Neumann series approximation, the proposed method avoids numerical instability and can be efficiently computed in matrix operations without iterations. As a result, the proposed method works stably in various tasks and is faster than iterative approximations. Throughout experiments including large-scale hyperparameter optimization and meta learning, we demonstrate that the Nyström method consistently achieves comparable or even superior performance to other approaches. The source code is available from https://github.com/moskomule/hypergrad.}, + eventtitle = {International {{Conference}} on {{Artificial Intelligence}} and {{Statistics}}}, langid = {english} } @article{ji_breakdownfree_2017, title = {A Breakdown-Free Block Conjugate Gradient Method}, author = {Ji, Hao and Li, Yaohang}, - year = {2017}, - month = jun, - journal = {BIT Numerical Mathematics}, + date = {2017-06}, + journaltitle = {BIT Numerical Mathematics}, + shortjournal = {Bit Numer Math}, volume = {57}, number = {2}, pages = {379--403}, issn = {0006-3835, 1572-9125}, doi = {10.1007/s10543-016-0631-z}, + url = {http://link.springer.com/10.1007/s10543-016-0631-z}, urldate = {2024-02-28}, abstract = {In this paper, we analyze all possible situations of rank deficiency that cause breakdown in block conjugate gradient (BCG) solvers. A simple solution, breakdownfree block conjugate gradient (BFBCG), is designed to address the rank deficiency problem. The rationale of the BFBCG algorithm is to derive new forms of parameter matrices based on the potentially reduced search subspace to handle rank deficiency. Orthogonality properties and convergence of BFBCG in case of rank deficiency are justified accordingly with mathematical rigor. BFBCG yields faster convergence than restarting BCG when breakdown occurs. Numerical examples suffering from rank deficiency are provided to demonstrate the robustness of BFBCG.}, langid = {english} @@ -140,14 +148,15 @@ @article{ji_breakdownfree_2017 @inproceedings{jia_efficient_2019, title = {Towards {{Efficient Data Valuation Based}} on the {{Shapley Value}}}, booktitle = {Proceedings of the 22nd {{International Conference}} on {{Artificial Intelligence}} and {{Statistics}}}, - author = {Jia, Ruoxi and Dao, David and Wang, Boxin and Hubis, Frances Ann and Hynes, Nick and G{\"u}rel, Nezihe Merve and Li, Bo and Zhang, Ce and Song, Dawn and Spanos, Costas J.}, - year = {2019}, - month = apr, + author = {Jia, Ruoxi and Dao, David and Wang, Boxin and Hubis, Frances Ann and Hynes, Nick and Gürel, Nezihe Merve and Li, Bo and Zhang, Ce and Song, Dawn and Spanos, Costas J.}, + date = {2019-04-11}, pages = {1167--1176}, publisher = {PMLR}, issn = {2640-3498}, + url = {http://proceedings.mlr.press/v89/jia19a.html}, urldate = {2021-02-12}, - abstract = {``How much is my data worth?'' is an increasingly common question posed by organizations and individuals alike. 
An answer to this question could allow, for instance, fairly distributing profits...}, + abstract = {“How much is my data worth?” is an increasingly common question posed by organizations and individuals alike. An answer to this question could allow, for instance, fairly distributing profits...}, + eventtitle = {International {{Conference}} on {{Artificial Intelligence}} and {{Statistics}} ({{AISTATS}})}, langid = {english}, keywords = {notion} } @@ -156,16 +165,17 @@ @article{jia_efficient_2019a title = {Efficient Task-Specific Data Valuation for Nearest Neighbor Algorithms}, shorttitle = {{{VLDB}} 2019}, author = {Jia, Ruoxi and Dao, David and Wang, Boxin and Hubis, Frances Ann and Gurel, Nezihe Merve and Li, Bo and Zhang, Ce and Spanos, Costas and Song, Dawn}, - year = {2019}, - month = jul, - journal = {Proceedings of the VLDB Endowment}, + date = {2019-07-01}, + journaltitle = {Proceedings of the VLDB Endowment}, + shortjournal = {Proc. VLDB Endow.}, volume = {12}, number = {11}, pages = {1610--1623}, issn = {2150-8097}, doi = {10.14778/3342263.3342637}, + url = {https://doi.org/10.14778/3342263.3342637}, urldate = {2021-02-12}, - abstract = {Given a data set D containing millions of data points and a data consumer who is willing to pay {\textbackslash}\$X to train a machine learning (ML) model over D, how should we distribute this {\textbackslash}\$X to each data point to reflect its "value"? In this paper, we define the "relative value of data" via the Shapley value, as it uniquely possesses properties with appealing real-world interpretations, such as fairness, rationality and decentralizability. For general, bounded utility functions, the Shapley value is known to be challenging to compute: to get Shapley values for all N data points, it requires O(2N) model evaluations for exact computation and O(N log N) for ({$\epsilon$}, {$\delta$})-approximation. In this paper, we focus on one popular family of ML models relying on K-nearest neighbors (KNN). The most surprising result is that for unweighted KNN classifiers and regressors, the Shapley value of all N data points can be computed, exactly, in O(N log N) time - an exponential improvement on computational complexity! Moreover, for ({$\epsilon$}, {$\delta$})-approximation, we are able to develop an algorithm based on Locality Sensitive Hashing (LSH) with only sublinear complexity O(Nh({$\epsilon$}, K) log N) when {$\epsilon$} is not too small and K is not too large. We empirically evaluate our algorithms on up to 10 million data points and even our exact algorithm is up to three orders of magnitude faster than the baseline approximation algorithm. The LSH-based approximation algorithm can accelerate the value calculation process even further. We then extend our algorithm to other scenarios such as (1) weighed KNN classifiers, (2) different data points are clustered by different data curators, and (3) there are data analysts providing computation who also requires proper valuation. Some of these extensions, although also being improved exponentially, are less practical for exact computation (e.g., O(NK) complexity for weigthed KNN). 
We thus propose an Monte Carlo approximation algorithm, which is O(N(log N)2/(log K)2) times more efficient than the baseline approximation algorithm.}, + abstract = {Given a data set D containing millions of data points and a data consumer who is willing to pay \textbackslash\$X to train a machine learning (ML) model over D, how should we distribute this \textbackslash\$X to each data point to reflect its "value"? In this paper, we define the "relative value of data" via the Shapley value, as it uniquely possesses properties with appealing real-world interpretations, such as fairness, rationality and decentralizability. For general, bounded utility functions, the Shapley value is known to be challenging to compute: to get Shapley values for all N data points, it requires O(2N) model evaluations for exact computation and O(N log N) for (ϵ, δ)-approximation. In this paper, we focus on one popular family of ML models relying on K-nearest neighbors (KNN). The most surprising result is that for unweighted KNN classifiers and regressors, the Shapley value of all N data points can be computed, exactly, in O(N log N) time - an exponential improvement on computational complexity! Moreover, for (ϵ, δ)-approximation, we are able to develop an algorithm based on Locality Sensitive Hashing (LSH) with only sublinear complexity O(Nh(ϵ, K) log N) when ϵ is not too small and K is not too large. We empirically evaluate our algorithms on up to 10 million data points and even our exact algorithm is up to three orders of magnitude faster than the baseline approximation algorithm. The LSH-based approximation algorithm can accelerate the value calculation process even further. We then extend our algorithm to other scenarios such as (1) weighed KNN classifiers, (2) different data points are clustered by different data curators, and (3) there are data analysts providing computation who also requires proper valuation. Some of these extensions, although also being improved exponentially, are less practical for exact computation (e.g., O(NK) complexity for weigthed KNN). We thus propose an Monte Carlo approximation algorithm, which is O(N(log N)2/(log K)2) times more efficient than the baseline approximation algorithm.}, langid = {english}, keywords = {notion} } @@ -173,12 +183,12 @@ @article{jia_efficient_2019a @inproceedings{just_lava_2023, title = {{{LAVA}}: {{Data Valuation}} without {{Pre-Specified Learning Algorithms}}}, shorttitle = {{{LAVA}}}, - booktitle = {The {{Eleventh International Conference}} on {{Learning Representations}} ({{ICLR}} 2023)}, author = {Just, Hoang Anh and Kang, Feiyang and Wang, Tianhao and Zeng, Yi and Ko, Myeongseob and Jin, Ming and Jia, Ruoxi}, - year = {2023}, - month = feb, + date = {2023-02-01}, + url = {https://openreview.net/forum?id=JJuP86nBl4q}, urldate = {2023-04-25}, - abstract = {Traditionally, data valuation is posed as a problem of equitably splitting the validation performance of a learning algorithm among the training data. As a result, the calculated data values depend on many design choices of the underlying learning algorithm. However, this dependence is undesirable for many use cases of data valuation, such as setting priorities over different data sources in a data acquisition process and informing pricing mechanisms in a data marketplace. In these scenarios, data needs to be valued before the actual analysis and the choice of the learning algorithm is still undetermined then. 
Another side-effect of the dependence is that to assess the value of individual points, one needs to re-run the learning algorithm with and without a point, which incurs a large computation burden. This work leapfrogs over the current limits of data valuation methods by introducing a new framework that can value training data in a way that is oblivious to the downstream learning algorithm. Our main results are as follows. \${\textbackslash}textbf\{(1)\}\$ We develop a proxy for the validation performance associated with a training set based on a non-conventional \${\textbackslash}textit\{class-wise\}\$ \${\textbackslash}textit\{Wasserstein distance\}\$ between the training and the validation set. We show that the distance characterizes the upper bound of the validation performance for any given model under certain Lipschitz conditions. \${\textbackslash}textbf\{(2)\}\$ We develop a novel method to value individual data based on the sensitivity analysis of the \${\textbackslash}textit\{class-wise\}\$ Wasserstein distance. Importantly, these values can be directly obtained \${\textbackslash}textit\{for free\}\$ from the output of off-the-shelf optimization solvers once the Wasserstein distance is computed. \${\textbackslash}textbf\{(3) \}\$We evaluate our new data valuation framework over various use cases related to detecting low-quality data and show that, surprisingly, the learning-agnostic feature of our framework enables a \${\textbackslash}textit\{significant improvement\}\$ over the state-of-the-art performance while being \${\textbackslash}textit\{orders of magnitude faster.\}\$}, + abstract = {Traditionally, data valuation is posed as a problem of equitably splitting the validation performance of a learning algorithm among the training data. As a result, the calculated data values depend on many design choices of the underlying learning algorithm. However, this dependence is undesirable for many use cases of data valuation, such as setting priorities over different data sources in a data acquisition process and informing pricing mechanisms in a data marketplace. In these scenarios, data needs to be valued before the actual analysis and the choice of the learning algorithm is still undetermined then. Another side-effect of the dependence is that to assess the value of individual points, one needs to re-run the learning algorithm with and without a point, which incurs a large computation burden. This work leapfrogs over the current limits of data valuation methods by introducing a new framework that can value training data in a way that is oblivious to the downstream learning algorithm. Our main results are as follows. \$\textbackslash textbf\{(1)\}\$ We develop a proxy for the validation performance associated with a training set based on a non-conventional \$\textbackslash textit\{class-wise\}\$ \$\textbackslash textit\{Wasserstein distance\}\$ between the training and the validation set. We show that the distance characterizes the upper bound of the validation performance for any given model under certain Lipschitz conditions. \$\textbackslash textbf\{(2)\}\$ We develop a novel method to value individual data based on the sensitivity analysis of the \$\textbackslash textit\{class-wise\}\$ Wasserstein distance. Importantly, these values can be directly obtained \$\textbackslash textit\{for free\}\$ from the output of off-the-shelf optimization solvers once the Wasserstein distance is computed. 
\$\textbackslash textbf\{(3) \}\$We evaluate our new data valuation framework over various use cases related to detecting low-quality data and show that, surprisingly, the learning-agnostic feature of our framework enables a \$\textbackslash textit\{significant improvement\}\$ over the state-of-the-art performance while being \$\textbackslash textit\{orders of magnitude faster.\}\$}, + eventtitle = {The {{Eleventh International Conference}} on {{Learning Representations}} ({{ICLR}} 2023)}, langid = {english}, keywords = {notion} } @@ -187,14 +197,15 @@ @inproceedings{koh_understanding_2017 title = {Understanding {{Black-box Predictions}} via {{Influence Functions}}}, booktitle = {Proceedings of the 34th {{International Conference}} on {{Machine Learning}}}, author = {Koh, Pang Wei and Liang, Percy}, - year = {2017}, - month = jul, + date = {2017-07-17}, eprint = {1703.04730}, + eprinttype = {arxiv}, pages = {1885--1894}, publisher = {PMLR}, + url = {https://proceedings.mlr.press/v70/koh17a.html}, urldate = {2022-05-09}, - abstract = {How can we explain the predictions of a black-box model? In this paper, we use influence functions --- a classic technique from robust statistics --- to trace a model's prediction through the learning algorithm and back to its training data, thereby identifying training points most responsible for a given prediction. To scale up influence functions to modern machine learning settings, we develop a simple, efficient implementation that requires only oracle access to gradients and Hessian-vector products. We show that even on non-convex and non-differentiable models where the theory breaks down, approximations to influence functions can still provide valuable information. On linear models and convolutional neural networks, we demonstrate that influence functions are useful for multiple purposes: understanding model behavior, debugging models, detecting dataset errors, and even creating visually-indistinguishable training-set attacks.}, - archiveprefix = {arxiv}, + abstract = {How can we explain the predictions of a black-box model? In this paper, we use influence functions — a classic technique from robust statistics — to trace a model’s prediction through the learning algorithm and back to its training data, thereby identifying training points most responsible for a given prediction. To scale up influence functions to modern machine learning settings, we develop a simple, efficient implementation that requires only oracle access to gradients and Hessian-vector products. We show that even on non-convex and non-differentiable models where the theory breaks down, approximations to influence functions can still provide valuable information. 
On linear models and convolutional neural networks, we demonstrate that influence functions are useful for multiple purposes: understanding model behavior, debugging models, detecting dataset errors, and even creating visually-indistinguishable training-set attacks.}, + eventtitle = {International {{Conference}} on {{Machine Learning}}}, langid = {english}, keywords = {notion} } @@ -204,15 +215,16 @@ @inproceedings{kwon_beta_2022 shorttitle = {Beta {{Shapley}}}, booktitle = {Proceedings of the 25th {{International Conference}} on {{Artificial Intelligence}} and {{Statistics}} ({{AISTATS}}) 2022,}, author = {Kwon, Yongchan and Zou, James}, - year = {2022}, - month = jan, + date = {2022-01-18}, volume = {151}, eprint = {2110.14049}, + eprinttype = {arxiv}, publisher = {PMLR}, - address = {Valencia, Spain}, + location = {Valencia, Spain}, + url = {http://arxiv.org/abs/2110.14049}, urldate = {2022-04-06}, abstract = {Data Shapley has recently been proposed as a principled framework to quantify the contribution of individual datum in machine learning. It can effectively identify helpful or harmful data points for a learning algorithm. In this paper, we propose Beta Shapley, which is a substantial generalization of Data Shapley. Beta Shapley arises naturally by relaxing the efficiency axiom of the Shapley value, which is not critical for machine learning settings. Beta Shapley unifies several popular data valuation methods and includes data Shapley as a special case. Moreover, we prove that Beta Shapley has several desirable statistical properties and propose efficient algorithms to estimate it. We demonstrate that Beta Shapley outperforms state-of-the-art data valuation methods on several downstream ML tasks such as: 1) detecting mislabeled training data; 2) learning with subsamples; and 3) identifying points whose addition or removal have the largest positive or negative impact on the model.}, - archiveprefix = {arxiv}, + eventtitle = {{{AISTATS}} 2022}, langid = {english}, keywords = {notion} } @@ -222,13 +234,14 @@ @inproceedings{kwon_dataoob_2023 shorttitle = {Data-{{OOB}}}, booktitle = {Proceedings of the 40th {{International Conference}} on {{Machine Learning}}}, author = {Kwon, Yongchan and Zou, James}, - year = {2023}, - month = jul, + date = {2023-07-03}, pages = {18135--18152}, publisher = {PMLR}, issn = {2640-3498}, + url = {https://proceedings.mlr.press/v202/kwon23e.html}, urldate = {2023-09-06}, abstract = {Data valuation is a powerful framework for providing statistical insights into which data are beneficial or detrimental to model training. Many Shapley-based data valuation methods have shown promising results in various downstream tasks, however, they are well known to be computationally challenging as it requires training a large number of models. As a result, it has been recognized as infeasible to apply to large datasets. To address this issue, we propose Data-OOB, a new data valuation method for a bagging model that utilizes the out-of-bag estimate. The proposed method is computationally efficient and can scale to millions of data by reusing trained weak learners. Specifically, Data-OOB takes less than 2.25 hours on a single CPU processor when there are \$10\^{}6\$ samples to evaluate and the input dimension is 100. Furthermore, Data-OOB has solid theoretical interpretations in that it identifies the same important data point as the infinitesimal jackknife influence function when two different points are compared. 
We conduct comprehensive experiments using 12 classification datasets, each with thousands of sample sizes. We demonstrate that the proposed method significantly outperforms existing state-of-the-art data valuation methods in identifying mislabeled data and finding a set of helpful (or harmful) data points, highlighting the potential for applying data values in real-world applications.}, + eventtitle = {International {{Conference}} on {{Machine Learning}}}, langid = {english}, keywords = {notion} } @@ -237,15 +250,16 @@ @inproceedings{kwon_efficient_2021 title = {Efficient {{Computation}} and {{Analysis}} of {{Distributional Shapley Values}}}, booktitle = {Proceedings of the 24th {{International Conference}} on {{Artificial Intelligence}} and {{Statistics}}}, author = {Kwon, Yongchan and Rivas, Manuel A. and Zou, James}, - year = {2021}, - month = mar, + date = {2021-03-18}, eprint = {2007.01357}, + eprinttype = {arxiv}, pages = {793--801}, publisher = {PMLR}, issn = {2640-3498}, + url = {http://proceedings.mlr.press/v130/kwon21a.html}, urldate = {2021-04-23}, abstract = {Distributional data Shapley value (DShapley) has recently been proposed as a principled framework to quantify the contribution of individual datum in machine learning. DShapley develops the founda...}, - archiveprefix = {arxiv}, + eventtitle = {International {{Conference}} on {{Artificial Intelligence}} and {{Statistics}}}, langid = {english} } @@ -253,59 +267,63 @@ @inproceedings{martens_optimizing_2015 title = {Optimizing {{Neural Networks}} with {{Kronecker-factored Approximate Curvature}}}, booktitle = {Proceedings of the 32nd {{International Conference}} on {{Machine Learning}}}, author = {Martens, James and Grosse, Roger}, - year = {2015}, - month = jun, + date = {2015-06-01}, pages = {2408--2417}, publisher = {PMLR}, issn = {1938-7228}, + url = {https://proceedings.mlr.press/v37/martens15.html}, urldate = {2022-11-26}, - abstract = {We propose an efficient method for approximating natural gradient descent in neural networks which we call Kronecker-factored Approximate Curvature (K-FAC). K-FAC is based on an efficiently invertible approximation of a neural network's Fisher information matrix which is neither diagonal nor low-rank, and in some cases is completely non-sparse. It is derived by approximating various large blocks of the Fisher (corresponding to entire layers) as being the Kronecker product of two much smaller matrices. While only several times more expensive to compute than the plain stochastic gradient, the updates produced by K-FAC make much more progress optimizing the objective, which results in an algorithm that can be much faster than stochastic gradient descent with momentum in practice. And unlike some previously proposed approximate natural-gradient/Newton methods which use high-quality non-diagonal curvature matrices (such as Hessian-free optimization), K-FAC works very well in highly stochastic optimization regimes. This is because the cost of storing and inverting K-FAC's approximation to the curvature matrix does not depend on the amount of data used to estimate it, which is a feature typically associated only with diagonal or low-rank approximations to the curvature matrix.}, + abstract = {We propose an efficient method for approximating natural gradient descent in neural networks which we call Kronecker-factored Approximate Curvature (K-FAC). 
K-FAC is based on an efficiently invertible approximation of a neural network’s Fisher information matrix which is neither diagonal nor low-rank, and in some cases is completely non-sparse. It is derived by approximating various large blocks of the Fisher (corresponding to entire layers) as being the Kronecker product of two much smaller matrices. While only several times more expensive to compute than the plain stochastic gradient, the updates produced by K-FAC make much more progress optimizing the objective, which results in an algorithm that can be much faster than stochastic gradient descent with momentum in practice. And unlike some previously proposed approximate natural-gradient/Newton methods which use high-quality non-diagonal curvature matrices (such as Hessian-free optimization), K-FAC works very well in highly stochastic optimization regimes. This is because the cost of storing and inverting K-FAC’s approximation to the curvature matrix does not depend on the amount of data used to estimate it, which is a feature typically associated only with diagonal or low-rank approximations to the curvature matrix.}, + eventtitle = {International {{Conference}} on {{Machine Learning}}}, langid = {english} } @article{mitchell_sampling_2022, title = {Sampling {{Permutations}} for {{Shapley Value Estimation}}}, author = {Mitchell, Rory and Cooper, Joshua and Frank, Eibe and Holmes, Geoffrey}, - year = {2022}, - journal = {Journal of Machine Learning Research}, + date = {2022}, + journaltitle = {Journal of Machine Learning Research}, + shortjournal = {J. Mach. Learn. Res.}, volume = {23}, number = {43}, pages = {1--46}, issn = {1533-7928}, + url = {http://jmlr.org/papers/v23/21-0439.html}, urldate = {2022-10-23}, - abstract = {Game-theoretic attribution techniques based on Shapley values are used to interpret black-box machine learning models, but their exact calculation is generally NP-hard, requiring approximation methods for non-trivial models. As the computation of Shapley values can be expressed as a summation over a set of permutations, a common approach is to sample a subset of these permutations for approximation. Unfortunately, standard Monte Carlo sampling methods can exhibit slow convergence, and more sophisticated quasi-Monte Carlo methods have not yet been applied to the space of permutations. To address this, we investigate new approaches based on two classes of approximation methods and compare them empirically. First, we demonstrate quadrature techniques in a RKHS containing functions of permutations, using the Mallows kernel in combination with kernel herding and sequential Bayesian quadrature. The RKHS perspective also leads to quasi-Monte Carlo type error bounds, with a tractable discrepancy measure defined on permutations. Second, we exploit connections between the hypersphere S d-2 Sd-2 and permutations to create practical algorithms for generating permutation samples with good properties. Experiments show the above techniques provide significant improvements for Shapley value estimates over existing methods, converging to a smaller RMSE in the same number of model evaluations.} + abstract = {Game-theoretic attribution techniques based on Shapley values are used to interpret black-box machine learning models, but their exact calculation is generally NP-hard, requiring approximation methods for non-trivial models. 
As the computation of Shapley values can be expressed as a summation over a set of permutations, a common approach is to sample a subset of these permutations for approximation. Unfortunately, standard Monte Carlo sampling methods can exhibit slow convergence, and more sophisticated quasi-Monte Carlo methods have not yet been applied to the space of permutations. To address this, we investigate new approaches based on two classes of approximation methods and compare them empirically. First, we demonstrate quadrature techniques in a RKHS containing functions of permutations, using the Mallows kernel in combination with kernel herding and sequential Bayesian quadrature. The RKHS perspective also leads to quasi-Monte Carlo type error bounds, with a tractable discrepancy measure defined on permutations. Second, we exploit connections between the hypersphere S d−2 Sd−2 and permutations to create practical algorithms for generating permutation samples with good properties. Experiments show the above techniques provide significant improvements for Shapley value estimates over existing methods, converging to a smaller RMSE in the same number of model evaluations.} } @inproceedings{okhrati_multilinear_2021, title = {A {{Multilinear Sampling Algorithm}} to {{Estimate Shapley Values}}}, booktitle = {2020 25th {{International Conference}} on {{Pattern Recognition}} ({{ICPR}})}, author = {Okhrati, Ramin and Lipani, Aldo}, - year = {2021}, - month = jan, + date = {2021-01}, eprint = {2010.12082}, + eprinttype = {arxiv}, pages = {7992--7999}, publisher = {IEEE}, issn = {1051-4651}, doi = {10.1109/ICPR48806.2021.9412511}, + url = {https://ieeexplore.ieee.org/abstract/document/9412511}, abstract = {Shapley values are great analytical tools in game theory to measure the importance of a player in a game. Due to their axiomatic and desirable properties such as efficiency, they have become popular for feature importance analysis in data science and machine learning. However, the time complexity to compute Shapley values based on the original formula is exponential, and as the number of features increases, this becomes infeasible. Castro et al. [1] developed a sampling algorithm, to estimate Shapley values. In this work, we propose a new sampling method based on a multilinear extension technique as applied in game theory. The aim is to provide a more efficient (sampling) method for estimating Shapley values. Our method is applicable to any machine learning model, in particular for either multiclass classifications or regression problems. 
We apply the method to estimate Shapley values for multilayer perceptrons (MLPs) and through experimentation on two datasets, we demonstrate that our method provides more accurate estimations of the Shapley values by reducing the variance of the sampling statistics.}, - archiveprefix = {arxiv}, + eventtitle = {2020 25th {{International Conference}} on {{Pattern Recognition}} ({{ICPR}})}, langid = {english}, keywords = {notion} } @inproceedings{schioppa_scaling_2021, title = {Scaling {{Up Influence Functions}}}, - booktitle = {{{AAAI-22}}}, author = {Schioppa, Andrea and Zablotskaia, Polina and Vilar, David and Sokolov, Artem}, - year = {2021}, - month = dec, + date = {2021-12-06}, eprint = {2112.03052}, - primaryclass = {cs}, + eprinttype = {arxiv}, + eprintclass = {cs}, publisher = {arXiv}, doi = {10.48550/arXiv.2112.03052}, + url = {http://arxiv.org/abs/2112.03052}, urldate = {2023-03-10}, abstract = {We address efficient calculation of influence functions for tracking predictions back to the training data. We propose and analyze a new approach to speeding up the inverse Hessian calculation based on Arnoldi iteration. With this improvement, we achieve, to the best of our knowledge, the first successful implementation of influence functions that scales to full-size (language and vision) Transformer models with several hundreds of millions of parameters. We evaluate our approach on image classification and sequence-to-sequence tasks with tens to a hundred of millions of training examples. Our code will be available at https://github.com/google-research/jax-influence.}, - archiveprefix = {arxiv}, + eventtitle = {{{AAAI-22}}}, keywords = {notion} } @@ -314,44 +332,44 @@ @inproceedings{schoch_csshapley_2022 shorttitle = {{{CS-Shapley}}}, booktitle = {Proc. of the Thirty-Sixth {{Conference}} on {{Neural Information Processing Systems}} ({{NeurIPS}})}, author = {Schoch, Stephanie and Xu, Haifeng and Ji, Yangfeng}, - year = {2022}, - month = oct, - address = {New Orleans, Louisiana, USA}, + date = {2022-10-31}, + location = {New Orleans, Louisiana, USA}, + url = {https://openreview.net/forum?id=KTOcrOR5mQ9}, urldate = {2022-11-23}, - abstract = {Data valuation, or the valuation of individual datum contributions, has seen growing interest in machine learning due to its demonstrable efficacy for tasks such as noisy label detection. In particular, due to the desirable axiomatic properties, several Shapley value approximations have been proposed. In these methods, the value function is usually defined as the predictive accuracy over the entire development set. However, this limits the ability to differentiate between training instances that are helpful or harmful to their own classes. Intuitively, instances that harm their own classes may be noisy or mislabeled and should receive a lower valuation than helpful instances. In this work, we propose CS-Shapley, a Shapley value with a new value function that discriminates between training instances' in-class and out-of-class contributions. Our theoretical analysis shows the proposed value function is (essentially) the unique function that satisfies two desirable properties for evaluating data values in classification. Further, our experiments on two benchmark evaluation tasks (data removal and noisy label detection) and four classifiers demonstrate the effectiveness of CS-Shapley over existing methods. 
Lastly, we evaluate the ``transferability'' of data values estimated from one classifier to others, and our results suggest Shapley-based data valuation is transferable for application across different models.}, + abstract = {Data valuation, or the valuation of individual datum contributions, has seen growing interest in machine learning due to its demonstrable efficacy for tasks such as noisy label detection. In particular, due to the desirable axiomatic properties, several Shapley value approximations have been proposed. In these methods, the value function is usually defined as the predictive accuracy over the entire development set. However, this limits the ability to differentiate between training instances that are helpful or harmful to their own classes. Intuitively, instances that harm their own classes may be noisy or mislabeled and should receive a lower valuation than helpful instances. In this work, we propose CS-Shapley, a Shapley value with a new value function that discriminates between training instances’ in-class and out-of-class contributions. Our theoretical analysis shows the proposed value function is (essentially) the unique function that satisfies two desirable properties for evaluating data values in classification. Further, our experiments on two benchmark evaluation tasks (data removal and noisy label detection) and four classifiers demonstrate the effectiveness of CS-Shapley over existing methods. Lastly, we evaluate the “transferability” of data values estimated from one classifier to others, and our results suggest Shapley-based data valuation is transferable for application across different models.}, + eventtitle = {Advances in {{Neural Information Processing Systems}} ({{NeurIPS}} 2022)}, langid = {english}, keywords = {notion} } @inproceedings{wang_improving_2022, title = {Improving {{Cooperative Game Theory-based Data Valuation}} via {{Data Utility Learning}}}, - booktitle = {International {{Conference}} on {{Learning Representations}} ({{ICLR}} 2022). {{Workshop}} on {{Socially Responsible Machine Learning}}}, author = {Wang, Tianhao and Yang, Yu and Jia, Ruoxi}, - year = {2022}, - month = apr, + date = {2022-04-07}, eprint = {2107.06336v2}, + eprinttype = {arxiv}, publisher = {arXiv}, doi = {10.48550/arXiv.2107.06336}, + url = {http://arxiv.org/abs/2107.06336v2}, urldate = {2022-05-19}, abstract = {The Shapley value (SV) and Least core (LC) are classic methods in cooperative game theory for cost/profit sharing problems. Both methods have recently been proposed as a principled solution for data valuation tasks, i.e., quantifying the contribution of individual datum in machine learning. However, both SV and LC suffer computational challenges due to the need for retraining models on combinatorially many data subsets. In this work, we propose to boost the efficiency in computing Shapley value or Least core by learning to estimate the performance of a learning algorithm on unseen data combinations. Theoretically, we derive bounds relating the error in the predicted learning performance to the approximation error in SV and LC. Empirically, we show that the proposed method can significantly improve the accuracy of SV and LC estimation.}, - archiveprefix = {arxiv}, + eventtitle = {International {{Conference}} on {{Learning Representations}} ({{ICLR}} 2022). 
{{Workshop}} on {{Socially Responsible Machine Learning}}}, langid = {english}, keywords = {notion} } -@misc{watson_accelerated_2023, +@online{watson_accelerated_2023, title = {Accelerated {{Shapley Value Approximation}} for {{Data Evaluation}}}, author = {Watson, Lauren and Kujawa, Zeno and Andreeva, Rayna and Yang, Hao-Tsung and Elahi, Tariq and Sarkar, Rik}, - year = {2023}, - month = nov, - number = {arXiv:2311.05346}, + date = {2023-11-09}, eprint = {2311.05346}, - primaryclass = {cs}, - publisher = {arXiv}, + eprinttype = {arxiv}, + eprintclass = {cs}, doi = {10.48550/arXiv.2311.05346}, + url = {http://arxiv.org/abs/2311.05346}, urldate = {2023-12-07}, - abstract = {Data valuation has found various applications in machine learning, such as data filtering, efficient learning and incentives for data sharing. The most popular current approach to data valuation is the Shapley value. While popular for its various applications, Shapley value is computationally expensive even to approximate, as it requires repeated iterations of training models on different subsets of data. In this paper we show that the Shapley value of data points can be approximated more efficiently by leveraging the structural properties of machine learning problems. We derive convergence guarantees on the accuracy of the approximate Shapley value for different learning settings including Stochastic Gradient Descent with convex and non-convex loss functions. Our analysis suggests that in fact models trained on small subsets are more important in the context of data valuation. Based on this idea, we describe \${\textbackslash}delta\$-Shapley -- a strategy of only using small subsets for the approximation. Experiments show that this approach preserves approximate value and rank of data, while achieving speedup of up to 9.9x. In pre-trained networks the approach is found to bring more efficiency in terms of accurate evaluation using small subsets.}, - archiveprefix = {arxiv} + abstract = {Data valuation has found various applications in machine learning, such as data filtering, efficient learning and incentives for data sharing. The most popular current approach to data valuation is the Shapley value. While popular for its various applications, Shapley value is computationally expensive even to approximate, as it requires repeated iterations of training models on different subsets of data. In this paper we show that the Shapley value of data points can be approximated more efficiently by leveraging the structural properties of machine learning problems. We derive convergence guarantees on the accuracy of the approximate Shapley value for different learning settings including Stochastic Gradient Descent with convex and non-convex loss functions. Our analysis suggests that in fact models trained on small subsets are more important in the context of data valuation. Based on this idea, we describe \$\textbackslash delta\$-Shapley -- a strategy of only using small subsets for the approximation. Experiments show that this approach preserves approximate value and rank of data, while achieving speedup of up to 9.9x. 
In pre-trained networks the approach is found to bring more efficiency in terms of accurate evaluation using small subsets.}, + pubstate = {preprint} } @inproceedings{wu_davinz_2022, @@ -359,30 +377,31 @@ @inproceedings{wu_davinz_2022 shorttitle = {{{DAVINZ}}}, booktitle = {Proceedings of the 39th {{International Conference}} on {{Machine Learning}}}, author = {Wu, Zhaoxuan and Shu, Yao and Low, Bryan Kian Hsiang}, - year = {2022}, - month = jun, + date = {2022-06-28}, pages = {24150--24176}, publisher = {PMLR}, + url = {https://proceedings.mlr.press/v162/wu22j.html}, urldate = {2022-10-29}, abstract = {Recent years have witnessed a surge of interest in developing trustworthy methods to evaluate the value of data in many real-world applications (e.g., collaborative machine learning, data marketplaces). Existing data valuation methods typically valuate data using the generalization performance of converged machine learning models after their long-term model training, hence making data valuation on large complex deep neural networks (DNNs) unaffordable. To this end, we theoretically derive a domain-aware generalization bound to estimate the generalization performance of DNNs without model training. We then exploit this theoretically derived generalization bound to develop a novel training-free data valuation method named data valuation at initialization (DAVINZ) on DNNs, which consistently achieves remarkable effectiveness and efficiency in practice. Moreover, our training-free DAVINZ, surprisingly, can even theoretically and empirically enjoy the desirable properties that training-based data valuation methods usually attain, thus making it more trustworthy in practice.}, + eventtitle = {International {{Conference}} on {{Machine Learning}}}, langid = {english}, keywords = {notion} } @inproceedings{yan_if_2021, - title = {If {{You Like Shapley Then You}}'ll {{Love}} the {{Core}}}, + title = {If {{You Like Shapley Then You}}’ll {{Love}} the {{Core}}}, booktitle = {Proceedings of the 35th {{AAAI Conference}} on {{Artificial Intelligence}}, 2021}, author = {Yan, Tom and Procaccia, Ariel D.}, - year = {2021}, - month = may, + date = {2021-05-18}, volume = {6}, pages = {5751--5759}, publisher = {Association for the Advancement of Artificial Intelligence}, - address = {Virtual conference}, + location = {Virtual conference}, doi = {10.1609/aaai.v35i6.16721}, + url = {https://ojs.aaai.org/index.php/AAAI/article/view/16721}, urldate = {2021-04-23}, - abstract = {The prevalent approach to problems of credit assignment in machine learning --- such as feature and data valuation--- is to model the problem at hand as a cooperative game and apply the Shapley value. But cooperative game theory offers a rich menu of alternative solution concepts, which famously includes the core and its variants. Our goal is to challenge the machine learning community's current consensus around the Shapley value, and make a case for the core as a viable alternative. To that end, we prove that arbitrarily good approximations to the least core --- a core relaxation that is always feasible --- can be computed efficiently (but prove an impossibility for a more refined solution concept, the nucleolus). We also perform experiments that corroborate these theoretical results and shed light on settings where the least core may be preferable to the Shapley value.}, - copyright = {Copyright (c) 2021, Association for the Advancement of Artificial Intelligence (www.aaai.org). 
All rights reserved.}, + abstract = {The prevalent approach to problems of credit assignment in machine learning — such as feature and data valuation— is to model the problem at hand as a cooperative game and apply the Shapley value. But cooperative game theory offers a rich menu of alternative solution concepts, which famously includes the core and its variants. Our goal is to challenge the machine learning community’s current consensus around the Shapley value, and make a case for the core as a viable alternative. To that end, we prove that arbitrarily good approximations to the least core — a core relaxation that is always feasible — can be computed efficiently (but prove an impossibility for a more refined solution concept, the nucleolus). We also perform experiments that corroborate these theoretical results and shed light on settings where the least core may be preferable to the Shapley value.}, + eventtitle = {{{AAAI Conference}} on {{Artificial Intelligence}}}, langid = {english}, keywords = {notion} } From de89aa7157c5113cc5d5f830437d8a6cbaf6ae79 Mon Sep 17 00:00:00 2001 From: Miguel de Benito Delgado Date: Tue, 12 Mar 2024 11:55:11 +0100 Subject: [PATCH 3/3] Use https links only in bib --- docs/assets/pydvl.bib | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/docs/assets/pydvl.bib b/docs/assets/pydvl.bib index b96ee442a..dacf52e53 100644 --- a/docs/assets/pydvl.bib +++ b/docs/assets/pydvl.bib @@ -40,7 +40,7 @@ @article{castro_polynomial_2009 pages = {1726--1730}, issn = {0305-0548}, doi = {10.1016/j.cor.2008.04.004}, - url = {http://www.sciencedirect.com/science/article/pii/S0305054808000804}, + url = {https://www.sciencedirect.com/science/article/pii/S0305054808000804}, urldate = {2020-11-21}, abstract = {In this paper we develop a polynomial method based on sampling theory that can be used to estimate the Shapley value (or any semivalue) for cooperative games. Besides analyzing the complexity problem, we examine some desirable statistical properties of the proposed approach and provide some computational results.}, langid = {english}, @@ -55,7 +55,7 @@ @online{frangella_randomized_2021 eprinttype = {arxiv}, eprintclass = {cs, math}, doi = {10.48550/arXiv.2110.02820}, - url = {http://arxiv.org/abs/2110.02820}, + url = {https://arxiv.org/abs/2110.02820}, urldate = {2023-06-04}, abstract = {This paper introduces the Nystr\textbackslash "om PCG algorithm for solving a symmetric positive-definite linear system. The algorithm applies the randomized Nystr\textbackslash "om method to form a low-rank approximation of the matrix, which leads to an efficient preconditioner that can be deployed with the conjugate gradient algorithm. Theoretical analysis shows that preconditioned system has constant condition number as soon as the rank of the approximation is comparable with the number of effective degrees of freedom in the matrix. The paper also develops adaptive methods that provably achieve similar performance without knowledge of the effective dimension. 
Numerical tests show that Nystr\textbackslash "om PCG can rapidly solve large linear systems that arise in data analysis problems, and it surpasses several competing methods from the literature.}, pubstate = {preprint} @@ -86,7 +86,7 @@ @inproceedings{ghorbani_data_2019 pages = {2242--2251}, publisher = {PMLR}, issn = {2640-3498}, - url = {http://proceedings.mlr.press/v97/ghorbani19c.html}, + url = {https://proceedings.mlr.press/v97/ghorbani19c.html}, urldate = {2020-11-01}, abstract = {As data becomes the fuel driving technological and economic growth, a fundamental challenge is how to quantify the value of data in algorithmic predictions and decisions. For example, in healthcare and consumer markets, it has been suggested that individuals should be compensated for the data that they generate, but it is not clear what is an equitable valuation for individual data. In this work, we develop a principled framework to address data valuation in the context of supervised machine learning. Given a learning algorithm trained on n data points to produce a predictor, we propose data Shapley as a metric to quantify the value of each training datum to the predictor performance. Data Shapley uniquely satisfies several natural properties of equitable data valuation. We develop Monte Carlo and gradient-based methods to efficiently estimate data Shapley values in practical settings where complex learning algorithms, including neural networks, are trained on large datasets. In addition to being equitable, extensive experiments across biomedical, image and synthetic data demonstrate that data Shapley has several other benefits: 1) it is more powerful than the popular leave-one-out or leverage score in providing insight on what data is more valuable for a given learning task; 2) low Shapley value data effectively capture outliers and corruptions; 3) high Shapley value data inform what type of new data to acquire to improve the predictor.}, eventtitle = {International {{Conference}} on {{Machine Learning}} ({{ICML}} 2019)}, @@ -139,7 +139,7 @@ @article{ji_breakdownfree_2017 pages = {379--403}, issn = {0006-3835, 1572-9125}, doi = {10.1007/s10543-016-0631-z}, - url = {http://link.springer.com/10.1007/s10543-016-0631-z}, + url = {https://link.springer.com/10.1007/s10543-016-0631-z}, urldate = {2024-02-28}, abstract = {In this paper, we analyze all possible situations of rank deficiency that cause breakdown in block conjugate gradient (BCG) solvers. A simple solution, breakdownfree block conjugate gradient (BFBCG), is designed to address the rank deficiency problem. The rationale of the BFBCG algorithm is to derive new forms of parameter matrices based on the potentially reduced search subspace to handle rank deficiency. Orthogonality properties and convergence of BFBCG in case of rank deficiency are justified accordingly with mathematical rigor. BFBCG yields faster convergence than restarting BCG when breakdown occurs. Numerical examples suffering from rank deficiency are provided to demonstrate the robustness of BFBCG.}, langid = {english} @@ -153,7 +153,7 @@ @inproceedings{jia_efficient_2019 pages = {1167--1176}, publisher = {PMLR}, issn = {2640-3498}, - url = {http://proceedings.mlr.press/v89/jia19a.html}, + url = {https://proceedings.mlr.press/v89/jia19a.html}, urldate = {2021-02-12}, abstract = {“How much is my data worth?” is an increasingly common question posed by organizations and individuals alike. 
An answer to this question could allow, for instance, fairly distributing profits...}, eventtitle = {International {{Conference}} on {{Artificial Intelligence}} and {{Statistics}} ({{AISTATS}})}, @@ -221,7 +221,7 @@ @inproceedings{kwon_beta_2022 eprinttype = {arxiv}, publisher = {PMLR}, location = {Valencia, Spain}, - url = {http://arxiv.org/abs/2110.14049}, + url = {https://arxiv.org/abs/2110.14049}, urldate = {2022-04-06}, abstract = {Data Shapley has recently been proposed as a principled framework to quantify the contribution of individual datum in machine learning. It can effectively identify helpful or harmful data points for a learning algorithm. In this paper, we propose Beta Shapley, which is a substantial generalization of Data Shapley. Beta Shapley arises naturally by relaxing the efficiency axiom of the Shapley value, which is not critical for machine learning settings. Beta Shapley unifies several popular data valuation methods and includes data Shapley as a special case. Moreover, we prove that Beta Shapley has several desirable statistical properties and propose efficient algorithms to estimate it. We demonstrate that Beta Shapley outperforms state-of-the-art data valuation methods on several downstream ML tasks such as: 1) detecting mislabeled training data; 2) learning with subsamples; and 3) identifying points whose addition or removal have the largest positive or negative impact on the model.}, eventtitle = {{{AISTATS}} 2022}, @@ -256,7 +256,7 @@ @inproceedings{kwon_efficient_2021 pages = {793--801}, publisher = {PMLR}, issn = {2640-3498}, - url = {http://proceedings.mlr.press/v130/kwon21a.html}, + url = {https://proceedings.mlr.press/v130/kwon21a.html}, urldate = {2021-04-23}, abstract = {Distributional data Shapley value (DShapley) has recently been proposed as a principled framework to quantify the contribution of individual datum in machine learning. DShapley develops the founda...}, eventtitle = {International {{Conference}} on {{Artificial Intelligence}} and {{Statistics}}}, @@ -288,7 +288,7 @@ @article{mitchell_sampling_2022 number = {43}, pages = {1--46}, issn = {1533-7928}, - url = {http://jmlr.org/papers/v23/21-0439.html}, + url = {https://jmlr.org/papers/v23/21-0439.html}, urldate = {2022-10-23}, abstract = {Game-theoretic attribution techniques based on Shapley values are used to interpret black-box machine learning models, but their exact calculation is generally NP-hard, requiring approximation methods for non-trivial models. As the computation of Shapley values can be expressed as a summation over a set of permutations, a common approach is to sample a subset of these permutations for approximation. Unfortunately, standard Monte Carlo sampling methods can exhibit slow convergence, and more sophisticated quasi-Monte Carlo methods have not yet been applied to the space of permutations. To address this, we investigate new approaches based on two classes of approximation methods and compare them empirically. First, we demonstrate quadrature techniques in a RKHS containing functions of permutations, using the Mallows kernel in combination with kernel herding and sequential Bayesian quadrature. The RKHS perspective also leads to quasi-Monte Carlo type error bounds, with a tractable discrepancy measure defined on permutations. Second, we exploit connections between the hypersphere S d−2 Sd−2 and permutations to create practical algorithms for generating permutation samples with good properties. 
Experiments show the above techniques provide significant improvements for Shapley value estimates over existing methods, converging to a smaller RMSE in the same number of model evaluations.} } @@ -320,7 +320,7 @@ @inproceedings{schioppa_scaling_2021 eprintclass = {cs}, publisher = {arXiv}, doi = {10.48550/arXiv.2112.03052}, - url = {http://arxiv.org/abs/2112.03052}, + url = {https://arxiv.org/abs/2112.03052}, urldate = {2023-03-10}, abstract = {We address efficient calculation of influence functions for tracking predictions back to the training data. We propose and analyze a new approach to speeding up the inverse Hessian calculation based on Arnoldi iteration. With this improvement, we achieve, to the best of our knowledge, the first successful implementation of influence functions that scales to full-size (language and vision) Transformer models with several hundreds of millions of parameters. We evaluate our approach on image classification and sequence-to-sequence tasks with tens to a hundred of millions of training examples. Our code will be available at https://github.com/google-research/jax-influence.}, eventtitle = {{{AAAI-22}}}, @@ -350,7 +350,7 @@ @inproceedings{wang_improving_2022 eprinttype = {arxiv}, publisher = {arXiv}, doi = {10.48550/arXiv.2107.06336}, - url = {http://arxiv.org/abs/2107.06336v2}, + url = {https://arxiv.org/abs/2107.06336v2}, urldate = {2022-05-19}, abstract = {The Shapley value (SV) and Least core (LC) are classic methods in cooperative game theory for cost/profit sharing problems. Both methods have recently been proposed as a principled solution for data valuation tasks, i.e., quantifying the contribution of individual datum in machine learning. However, both SV and LC suffer computational challenges due to the need for retraining models on combinatorially many data subsets. In this work, we propose to boost the efficiency in computing Shapley value or Least core by learning to estimate the performance of a learning algorithm on unseen data combinations. Theoretically, we derive bounds relating the error in the predicted learning performance to the approximation error in SV and LC. Empirically, we show that the proposed method can significantly improve the accuracy of SV and LC estimation.}, eventtitle = {International {{Conference}} on {{Learning Representations}} ({{ICLR}} 2022). {{Workshop}} on {{Socially Responsible Machine Learning}}}, @@ -366,7 +366,7 @@ @online{watson_accelerated_2023 eprinttype = {arxiv}, eprintclass = {cs}, doi = {10.48550/arXiv.2311.05346}, - url = {http://arxiv.org/abs/2311.05346}, + url = {https://arxiv.org/abs/2311.05346}, urldate = {2023-12-07}, abstract = {Data valuation has found various applications in machine learning, such as data filtering, efficient learning and incentives for data sharing. The most popular current approach to data valuation is the Shapley value. While popular for its various applications, Shapley value is computationally expensive even to approximate, as it requires repeated iterations of training models on different subsets of data. In this paper we show that the Shapley value of data points can be approximated more efficiently by leveraging the structural properties of machine learning problems. We derive convergence guarantees on the accuracy of the approximate Shapley value for different learning settings including Stochastic Gradient Descent with convex and non-convex loss functions. 
Our analysis suggests that in fact models trained on small subsets are more important in the context of data valuation. Based on this idea, we describe $\delta$-Shapley -- a strategy of only using small subsets for the approximation. Experiments show that this approach preserves approximate value and rank of data, while achieving speedup of up to 9.9x. In pre-trained networks the approach is found to bring more efficiency in terms of accurate evaluation using small subsets.},
  pubstate = {preprint}