From c1549a7dc3fee894d8a646d3f1aec373c9bd2e70 Mon Sep 17 00:00:00 2001
From: jasonjabbour
Date: Mon, 11 Nov 2024 14:44:48 -0500
Subject: [PATCH] benchmark engineering

---
 contents/core/benchmarking/benchmarking.bib |  8 ++++++++
 contents/core/benchmarking/benchmarking.qmd | 10 +++++-----
 2 files changed, 13 insertions(+), 5 deletions(-)

diff --git a/contents/core/benchmarking/benchmarking.bib b/contents/core/benchmarking/benchmarking.bib
index 863e8d38..5923a4cb 100644
--- a/contents/core/benchmarking/benchmarking.bib
+++ b/contents/core/benchmarking/benchmarking.bib
@@ -50,6 +50,14 @@ @article{banbury2020benchmarking
   year = {2020},
 }
 
+@article{banbury2021mlperf,
+  title = {MLPerf Tiny benchmark},
+  author = {Banbury, Colby and Reddi, Vijay Janapa and Torelli, Peter and Holleman, Jeremy and Jeffries, Nat and Kiraly, Csaba and Montino, Pietro and Kanter, David and Ahmed, Sebastian and Pau, Danilo and others},
+  journal = {arXiv preprint arXiv:2106.07597},
+  year = {2021},
+  url = {https://arxiv.org/pdf/2106.07597},
+}
+
 @article{beyer2020we,
   author = {Beyer, Lucas and H\'enaff, Olivier J and Kolesnikov, Alexander and Zhai, Xiaohua and Oord, A\"aron van den},
   journal = {ArXiv preprint},
diff --git a/contents/core/benchmarking/benchmarking.qmd b/contents/core/benchmarking/benchmarking.qmd
index f6650036..e6327c75 100644
--- a/contents/core/benchmarking/benchmarking.qmd
+++ b/contents/core/benchmarking/benchmarking.qmd
@@ -517,7 +517,7 @@ Hardware lottery occurs when a machine learning model unintentionally performs e
 
 In contrast to the accidental hardware lottery, benchmark engineering involves deliberately optimizing or designing a machine learning model to perform exceptionally well on specific hardware, often to win benchmarks or competitions. This intentional optimization might include tweaking the model's architecture, algorithms, or parameters to exploit the hardware's features and capabilities fully.
 
-#### Problem
+##### Problem
 
 Benchmark engineering refers to tweaking or modifying an AI system to optimize performance on specific benchmark tests, often at the expense of generalizability or real-world performance. This can include adjusting hyperparameters, training data, or other aspects of the system specifically to achieve high scores on benchmark metrics without necessarily improving the overall functionality or utility of the system.
@@ -527,7 +527,7 @@ It can lead to several risks and challenges. One of the primary risks is that th
 
 The AI community must prioritize transparency and accountability to mitigate the risks associated with benchmark engineering. This can include disclosing any optimizations or adjustments made specifically for benchmark tests and providing more comprehensive evaluations of AI systems that include real-world performance metrics and benchmark scores. Researchers and developers must prioritize holistic improvements to AI systems that improve their generalizability and functionality across various applications rather than focusing solely on benchmark-specific optimizations.
 
-#### Issues
+##### Issues
 
 One of the primary problems with benchmark engineering is that it can compromise the real-world performance of AI systems. When developers focus on optimizing their systems to achieve high scores on specific benchmark tests, they may neglect other important system performance aspects crucial in real-world applications. For example, an AI system designed for image recognition might be engineered to perform exceptionally well on a benchmark test that includes a specific set of images but may struggle to accurately recognize images that differ slightly from those in the test set.
 
@@ -535,15 +535,15 @@ Another area for improvement with benchmark engineering is that it can result in
 
 It can also lead to misleading results. When AI systems are engineered to perform well on benchmark tests, the results may not accurately reflect the system's true capabilities. This can be problematic for users or investors who rely on benchmark scores to make informed decisions about which AI systems to use or invest in. For example, an AI system engineered to achieve high scores on a benchmark test for speech recognition might be far less capable of accurately recognizing speech in real-world situations, leading users or investors to make decisions based on inaccurate information.
 
-#### Mitigation
+##### Mitigation
 
 There are several ways to mitigate benchmark engineering. Transparency in the benchmarking process is crucial to maintaining benchmark accuracy and reliability. This involves clearly disclosing the methodologies, data sets, and evaluation criteria used in benchmark tests, as well as any optimizations or adjustments made to the AI system for the purpose of the benchmark.
 
 One way to achieve transparency is through the use of open-source benchmarks. Open-source benchmarks are made publicly available, allowing researchers, developers, and other stakeholders to review, critique, and contribute to them, thereby ensuring their accuracy and reliability. This collaborative approach also facilitates sharing best practices and developing more robust and comprehensive benchmarks.
 
-One example is the MLPerf Tiny. It's an open-source framework designed to make it easy to compare different solutions in the world of TinyML. Its modular design allows components to be swapped out for comparison or improvement. The reference implementations, shown in green and orange in @fig-ml-perf, act as the baseline for results. TinyML often needs optimization across the entire system, and users can contribute by focusing on specific parts, like quantization. The modular benchmark design allows users to showcase their contributions and competitive advantage by modifying a reference implementation. In short, MLPerf Tiny offers a flexible and modular way to assess and improve TinyML applications, making it easier to compare and improve different aspects of the technology.
+The modular design of MLPerf Tiny connects to the problem of benchmark engineering by providing a structured yet flexible approach that encourages a balanced evaluation of TinyML systems. In benchmark engineering, systems may be overly optimized for a specific benchmark, leading to inflated scores that do not necessarily translate to real-world effectiveness. MLPerf Tiny's modular design addresses this by allowing contributors to swap out and test specific components, such as hardware, quantization techniques, or inference models, within a standardized framework. The reference implementations, highlighted in green and orange in @fig-ml-perf, provide a baseline for results and enable flexible yet controlled testing by specifying which components can be modified. This structure supports transparency and flexibility, keeping the focus on genuine improvements rather than benchmark-specific optimizations.
 
-![MLPerf Tiny modular design. Source: @mattson2020mlperf.](images/png/mlperf_tiny.png){#fig-ml-perf}
+![Modular design of the MLPerf Tiny benchmark, showing the reference implementation with modifiable components. This modular approach enables flexible, targeted testing while maintaining a standardized baseline. Source: @banbury2021mlperf.](images/png/mlperf_tiny.png){#fig-ml-perf}
 
 Another method for achieving transparency is through peer review of benchmarks. This involves having independent experts review and validate the benchmark's methodology, data sets, and results to ensure their credibility and reliability. Peer review can provide a valuable means of verifying the accuracy of benchmark tests and help build confidence in the results.
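
Below is a minimal, hypothetical Python sketch of the idea the revised text argues for: a modular harness in which only one declared component may be swapped against a fixed reference pipeline, and every submission is scored on both the benchmark split and a held-out real-world split, so benchmark-specific tuning shows up as a gap between the two numbers. All names here (`Submission`, `identity_stage`, `memorize_benchmark`, `evaluate`) are invented for illustration and do not reflect MLPerf Tiny's actual harness or API.

```python
from dataclasses import dataclass
from typing import Callable, Dict, List, Tuple

# A "model" here is just a callable mapping a feature vector to a class label.
Model = Callable[[List[float]], int]
Dataset = List[Tuple[List[float], int]]


@dataclass
class Submission:
    """One benchmark entry: a name plus the single swappable stage it modifies."""
    name: str
    stage: Callable[[Model], Model]  # everything else in the pipeline stays fixed


def reference_model(features: List[float]) -> int:
    """Fixed reference implementation: a trivial sign-based 'classifier'."""
    return 1 if sum(features) > 0 else 0


def identity_stage(model: Model) -> Model:
    """Baseline stage: leaves the reference model untouched."""
    return model


def memorize_benchmark(benchmark: Dataset) -> Callable[[Model], Model]:
    """A deliberately 'engineered' stage: hard-codes answers for the exact
    benchmark inputs and otherwise guesses class 0, ignoring the model."""
    lookup = {tuple(x): y for x, y in benchmark}

    def stage(_model: Model) -> Model:
        return lambda features: lookup.get(tuple(features), 0)

    return stage


def accuracy(model: Model, data: Dataset) -> float:
    return sum(1 for x, y in data if model(x) == y) / len(data)


def evaluate(sub: Submission, benchmark: Dataset, real_world: Dataset) -> Dict[str, float]:
    """Report both scores; a large gap is a red flag for benchmark engineering."""
    model = sub.stage(reference_model)  # only the declared component differs
    bench = accuracy(model, benchmark)
    field = accuracy(model, real_world)
    return {"benchmark": bench, "real_world": field, "gap": bench - field}


if __name__ == "__main__":
    benchmark_set: Dataset = [([0.2, 0.3], 1), ([-0.5, 0.1], 0),
                              ([0.7, -0.1], 1), ([-0.3, -0.2], 0)]
    real_world_set: Dataset = [([0.1, 0.2], 1), ([-0.4, 0.1], 0),
                               ([0.6, -0.2], 1), ([-0.1, -0.3], 0)]

    for sub in (Submission("reference", identity_stage),
                Submission("benchmark-engineered", memorize_benchmark(benchmark_set))):
        print(sub.name, evaluate(sub, benchmark_set, real_world_set))
```

Running the sketch prints identical benchmark scores for the reference and the "benchmark-engineered" entry, but the engineered entry loses half its accuracy on the held-out split; that gap is exactly the discrepancy the transparency measures above are meant to surface.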