diff --git a/contents/core/benchmarking/benchmarking.bib b/contents/core/benchmarking/benchmarking.bib index 863e8d38..c785dd69 100644 --- a/contents/core/benchmarking/benchmarking.bib +++ b/contents/core/benchmarking/benchmarking.bib @@ -1,406 +1,454 @@ %comment{This file was created with betterbib v5.0.11.} - @article{bianco2018benchmark, - author = {Bianco, Simone and Cadene, Remi and Celona, Luigi and Napoletano, Paolo}, - title = {Benchmark analysis of representative deep neural network architectures}, - journal = {IEEE access}, - volume = {6}, - pages = {64270--64277}, - year = {2018}, - publisher = {IEEE}, + doi = {10.1109/access.2018.2877890}, + pages = {64270--64277}, + source = {Crossref}, + volume = {6}, + author = {Bianco, Simone and Cadene, Remi and Celona, Luigi and Napoletano, Paolo}, + year = {2018}, + url = {https://doi.org/10.1109/access.2018.2877890}, + issn = {2169-3536}, + journal = {IEEE Access}, + publisher = {Institute of Electrical and Electronics Engineers (IEEE)}, + title = {Benchmark Analysis of Representative Deep Neural Network Architectures}, } @inproceedings{adolf2016fathom, - author = {Adolf, Robert and Rama, Saketh and Reagen, Brandon and Wei, Gu-yeon and Brooks, David}, - booktitle = {2016 IEEE International Symposium on Workload Characterization (IISWC)}, - doi = {10.1109/iiswc.2016.7581275}, - organization = {IEEE}, - pages = {1--10}, - publisher = {IEEE}, - source = {Crossref}, - title = {Fathom: {Reference} workloads for modern deep learning methods}, - url = {https://doi.org/10.1109/iiswc.2016.7581275}, - year = {2016}, - month = sep, + doi = {10.1109/iiswc.2016.7581275}, + pages = {1--10}, + source = {Crossref}, + author = {Adolf, Robert and Rama, Saketh and Reagen, Brandon and Wei, Gu-yeon and Brooks, David}, + year = {2016}, + month = sep, + url = {https://doi.org/10.1109/iiswc.2016.7581275}, + booktitle = {2016 IEEE International Symposium on Workload Characterization (IISWC)}, + publisher = {IEEE}, + title = {Fathom: reference workloads for modern deep learning methods}, + organization = {IEEE}, } @inproceedings{antol2015vqa, - author = {Antol, Stanislaw and Agrawal, Aishwarya and Lu, Jiasen and Mitchell, Margaret and Batra, Dhruv and Zitnick, C. Lawrence and Parikh, Devi}, - bibsource = {dblp computer science bibliography, https://dblp.org}, - biburl = {https://dblp.org/rec/conf/iccv/AntolALMBZP15.bib}, - booktitle = {2015 IEEE International Conference on Computer Vision (ICCV)}, - doi = {10.1109/iccv.2015.279}, - pages = {2425--2433}, - publisher = {IEEE}, - timestamp = {Wed, 24 May 2017 01:00:00 +0200}, - title = {{VQA:} {Visual} Question Answering}, - url = {https://doi.org/10.1109/iccv.2015.279}, - year = {2015}, - source = {Crossref}, - month = dec, + doi = {10.1109/iccv.2015.279}, + pages = {2425--2433}, + source = {Crossref}, + author = {Antol, Stanislaw and Agrawal, Aishwarya and Lu, Jiasen and Mitchell, Margaret and Batra, Dhruv and Zitnick, C. 
Lawrence and Parikh, Devi}, + year = {2015}, + month = dec, + url = {https://doi.org/10.1109/iccv.2015.279}, + booktitle = {2015 IEEE International Conference on Computer Vision (ICCV)}, + publisher = {IEEE}, + title = {VQA: Visual Question Answering}, + bibsource = {dblp computer science bibliography, https://dblp.org}, + biburl = {https://dblp.org/rec/conf/iccv/AntolALMBZP15.bib}, + timestamp = {Wed, 24 May 2017 01:00:00 +0200}, } @article{banbury2020benchmarking, - author = {Banbury, Colby R and Reddi, Vijay Janapa and Lam, Max and Fu, William and Fazel, Amin and Holleman, Jeremy and Huang, Xinyuan and Hurtado, Robert and Kanter, David and Lokhmotov, Anton and others}, - journal = {ArXiv preprint}, - title = {Benchmarking tinyml systems: {Challenges} and direction}, - url = {https://arxiv.org/abs/2003.04821}, - volume = {abs/2003.04821}, - year = {2020}, + url = {http://arxiv.org/abs/2003.04821v4}, + year = {2020}, + month = mar, + title = {Benchmarking TinyML Systems: Challenges and Direction}, + author = {Banbury, Colby R. and Reddi, Vijay Janapa and Lam, Max and Fu, William and Fazel, Amin and Holleman, Jeremy and Huang, Xinyuan and Hurtado, Robert and Kanter, David and Lokhmotov, Anton and Patterson, David and Pau, Danilo and Seo, Jae-sun and Sieracki, Jeff and Thakker, Urmish and Verhelst, Marian and Yadav, Poonam}, + primaryclass = {cs.PF}, + archiveprefix = {arXiv}, + journal = {ArXiv preprint}, + volume = {abs/2003.04821}, +} + +@article{banbury2021mlperf, + url = {http://arxiv.org/abs/2106.07597v4}, + year = {2021}, + month = jun, + title = {MLPerf Tiny Benchmark}, + author = {Banbury, Colby and Reddi, Vijay Janapa and Torelli, Peter and Holleman, Jeremy and Jeffries, Nat and Kiraly, Csaba and Montino, Pietro and Kanter, David and Ahmed, Sebastian and Pau, Danilo and Thakker, Urmish and Torrini, Antonio and Warden, Peter and Cordaro, Jay and Guglielmo, Giuseppe Di and Duarte, Javier and Gibellini, Stephen and Parekh, Videet and Tran, Honson and Tran, Nhan and Wenxu, Niu and Xuesong, Xu}, + primaryclass = {cs.LG}, + archiveprefix = {arXiv}, + journal = {arXiv preprint arXiv:2106.07597}, } @article{beyer2020we, - author = {Beyer, Lucas and H\'enaff, Olivier J and Kolesnikov, Alexander and Zhai, Xiaohua and Oord, A\"aron van den}, - journal = {ArXiv preprint}, - title = {Are we done with imagenet?}, - url = {https://arxiv.org/abs/2006.07159}, - volume = {abs/2006.07159}, - year = {2020}, + url = {http://arxiv.org/abs/2006.07159v1}, + year = {2020}, + month = jun, + title = {Are we done with ImageNet?}, + author = {Beyer, Lucas and H\'enaff, Olivier J. and Kolesnikov, Alexander and Zhai, Xiaohua and van den Oord, A\"aron}, + primaryclass = {cs.CV}, + archiveprefix = {arXiv}, + journal = {ArXiv preprint}, + volume = {abs/2006.07159}, } @inproceedings{brown2020language, - author = {Brown, Tom B. and Mann, Benjamin and Ryder, Nick and Subbiah, Melanie and Kaplan, Jared and Dhariwal, Prafulla and Neelakantan, Arvind and Shyam, Pranav and Sastry, Girish and Askell, Amanda and Agarwal, Sandhini and Herbert-Voss, Ariel and Krueger, Gretchen and Henighan, Tom and Child, Rewon and Ramesh, Aditya and Ziegler, Daniel M. 
and Wu, Jeffrey and Winter, Clemens and Hesse, Christopher and Chen, Mark and Sigler, Eric and Litwin, Mateusz and Gray, Scott and Chess, Benjamin and Clark, Jack and Berner, Christopher and McCandlish, Sam and Radford, Alec and Sutskever, Ilya and Amodei, Dario}, - editor = {Larochelle, Hugo and Ranzato, Marc'Aurelio and Hadsell, Raia and Balcan, Maria-Florina and Lin, Hsuan-Tien}, - bibsource = {dblp computer science bibliography, https://dblp.org}, - biburl = {https://dblp.org/rec/conf/nips/BrownMRSKDNSSAA20.bib}, - booktitle = {Advances in Neural Information Processing Systems 33: Annual Conference on Neural Information Processing Systems 2020, NeurIPS 2020, December 6-12, 2020, virtual}, - timestamp = {Tue, 19 Jan 2021 00:00:00 +0100}, - title = {Language Models are Few-Shot Learners}, - url = {https://proceedings.neurips.cc/paper/2020/hash/1457c0d6bfcb4967418bfb8ac142f64a-Abstract.html}, - year = {2020}, + author = {Brown, Tom B. and Mann, Benjamin and Ryder, Nick and Subbiah, Melanie and Kaplan, Jared and Dhariwal, Prafulla and Neelakantan, Arvind and Shyam, Pranav and Sastry, Girish and Askell, Amanda and Agarwal, Sandhini and Herbert-Voss, Ariel and Krueger, Gretchen and Henighan, Tom and Child, Rewon and Ramesh, Aditya and Ziegler, Daniel M. and Wu, Jeffrey and Winter, Clemens and Hesse, Christopher and Chen, Mark and Sigler, Eric and Litwin, Mateusz and Gray, Scott and Chess, Benjamin and Clark, Jack and Berner, Christopher and McCandlish, Sam and Radford, Alec and Sutskever, Ilya and Amodei, Dario}, + editor = {Larochelle, Hugo and Ranzato, Marc'Aurelio and Hadsell, Raia and Balcan, Maria-Florina and Lin, Hsuan-Tien}, + bibsource = {dblp computer science bibliography, https://dblp.org}, + biburl = {https://dblp.org/rec/conf/nips/BrownMRSKDNSSAA20.bib}, + booktitle = {Advances in Neural Information Processing Systems 33: Annual Conference on Neural Information Processing Systems 2020, NeurIPS 2020, December 6-12, 2020, virtual}, + timestamp = {Tue, 19 Jan 2021 00:00:00 +0100}, + title = {Language Models are Few-Shot Learners}, + url = {https://proceedings.neurips.cc/paper/2020/hash/1457c0d6bfcb4967418bfb8ac142f64a-Abstract.html}, + year = {2020}, } @article{10.1145/3467017, -author = {Hooker, Sara}, -title = {The hardware lottery}, -year = {2021}, -issue_date = {December 2021}, -publisher = {Association for Computing Machinery}, -address = {New York, NY, USA}, -volume = {64}, -number = {12}, -issn = {0001-0782}, -url = {https://doi.org/10.1145/3467017}, -doi = {10.1145/3467017}, -abstract = {After decades of incentivizing the isolation of hardware, software, and algorithm development, the catalysts for closer collaboration are changing the paradigm.}, -journal = {Commun. 
ACM}, -month = nov, -pages = {58-65}, -numpages = {8} + number = {12}, + doi = {10.1145/3467017}, + pages = {58--65}, + source = {Crossref}, + volume = {64}, + author = {Hooker, Sara}, + year = {2021}, + month = nov, + url = {https://doi.org/10.1145/3467017}, + issn = {0001-0782,1557-7317}, + journal = {Communications of the ACM}, + publisher = {Association for Computing Machinery (ACM)}, + title = {The hardware lottery}, + issue_date = {December 2021}, + address = {New York, NY, USA}, + abstract = {After decades of incentivizing the isolation of hardware, software, and algorithm development, the catalysts for closer collaboration are changing the paradigm.}, + numpages = {8}, } @inproceedings{chu2021discovering, - author = {Chu, Grace and Arikan, Okan and Bender, Gabriel and Wang, Weijun and Brighton, Achille and Kindermans, Pieter-Jan and Liu, Hanxiao and Akin, Berkin and Gupta, Suyog and Howard, Andrew}, - bibsource = {dblp computer science bibliography, https://dblp.org}, - biburl = {https://dblp.org/rec/conf/cvpr/ChuABWBKLAG021.bib}, - booktitle = {2021 IEEE/CVF Conference on Computer Vision and Pattern Recognition Workshops (CVPRW)}, - doi = {10.1109/cvprw53098.2021.00337}, - pages = {3022--3031}, - publisher = {IEEE}, - timestamp = {Mon, 18 Jul 2022 01:00:00 +0200}, - title = {Discovering Multi-Hardware Mobile Models via Architecture Search}, - url = {https://doi.org/10.1109/cvprw53098.2021.00337}, - year = {2021}, - source = {Crossref}, - month = jun, + doi = {10.1109/cvprw53098.2021.00337}, + pages = {3016--3025}, + source = {Crossref}, + author = {Chu, Grace and Arikan, Okan and Bender, Gabriel and Wang, Weijun and Brighton, Achille and Kindermans, Pieter-Jan and Liu, Hanxiao and Akin, Berkin and Gupta, Suyog and Howard, Andrew}, + year = {2021}, + month = jun, + url = {https://doi.org/10.1109/cvprw53098.2021.00337}, + booktitle = {2021 IEEE/CVF Conference on Computer Vision and Pattern Recognition Workshops (CVPRW)}, + publisher = {IEEE}, + title = {Discovering Multi-Hardware Mobile Models via Architecture Search}, + bibsource = {dblp computer science bibliography, https://dblp.org}, + biburl = {https://dblp.org/rec/conf/cvpr/ChuABWBKLAG021.bib}, + timestamp = {Mon, 18 Jul 2022 01:00:00 +0200}, } @article{coleman2017dawnbench, - author = {Coleman, Cody and Kang, Daniel and Narayanan, Deepak and Nardi, Luigi and Zhao, Tian and Zhang, Jian and Bailis, Peter and Olukotun, Kunle and R\'e, Chris and Zaharia, Matei}, - doi = {10.1145/3352020.3352024}, - issn = {0163-5980}, - journal = {ACM SIGOPS Operating Systems Review}, - number = {1}, - pages = {14--25}, - publisher = {Association for Computing Machinery (ACM)}, - source = {Crossref}, - title = {Analysis of {DAWNBench,} a Time-to-Accuracy Machine Learning Performance Benchmark}, - url = {https://doi.org/10.1145/3352020.3352024}, - volume = {53}, - year = {2019}, - month = jul, + number = {1}, + doi = {10.1145/3352020.3352024}, + pages = {14--25}, + source = {Crossref}, + volume = {53}, + author = {Coleman, Cody and Kang, Daniel and Narayanan, Deepak and Nardi, Luigi and Zhao, Tian and Zhang, Jian and Bailis, Peter and Olukotun, Kunle and R\'e, Chris and Zaharia, Matei}, + year = {2019}, + month = jul, + url = {https://doi.org/10.1145/3352020.3352024}, + issn = {0163-5980}, + journal = {ACM SIGOPS Operating Systems Review}, + publisher = {Association for Computing Machinery (ACM)}, + title = {Analysis of DAWNBench, a Time-to-Accuracy Machine Learning Performance Benchmark}, } -@inproceedings{coleman2022similarity, - author = 
{Coleman, Cody and Chou, Edward and Katz-Samuels, Julian and Culatana, Sean and Bailis, Peter and Berg, Alexander C. and Nowak, Robert D. and Sumbaly, Roshan and Zaharia, Matei and Yalniz, I. Zeki}, - bibsource = {dblp computer science bibliography, https://dblp.org}, - biburl = {https://dblp.org/rec/conf/aaai/ColemanCKCBBNSZ22.bib}, - booktitle = {Thirty-Sixth AAAI Conference on Artificial Intelligence, AAAI 2022, Thirty-Fourth Conference on Innovative Applications of Artificial Intelligence, IAAI 2022, The Twelveth Symposium on Educational Advances in Artificial Intelligence, EAAI 2022 Virtual Event, February 22 - March 1, 2022}, - pages = {6402--6410}, - publisher = {AAAI Press}, - timestamp = {Mon, 11 Jul 2022 01:00:00 +0200}, - title = {Similarity Search for Efficient Active Learning and Search of Rare Concepts}, - url = {https://ojs.aaai.org/index.php/AAAI/article/view/20591}, - year = {2022}, +@article{coleman2022similarity, + number = {6}, + doi = {10.1609/aaai.v36i6.20591}, + pages = {6402--6410}, + source = {Crossref}, + volume = {36}, + author = {Coleman, Cody and Chou, Edward and Katz-Samuels, Julian and Culatana, Sean and Bailis, Peter and Berg, Alexander C. and Nowak, Robert and Sumbaly, Roshan and Zaharia, Matei and Yalniz, I. Zeki}, + year = {2022}, + month = jun, + url = {https://doi.org/10.1609/aaai.v36i6.20591}, + issn = {2374-3468,2159-5399}, + journal = {Proceedings of the AAAI Conference on Artificial Intelligence}, + publisher = {Association for the Advancement of Artificial Intelligence (AAAI)}, + title = {Similarity Search for Efficient Active Learning and Search of Rare Concepts}, + bibsource = {dblp computer science bibliography, https://dblp.org}, + biburl = {https://dblp.org/rec/conf/aaai/ColemanCKCBBNSZ22.bib}, + booktitle = {Thirty-Sixth AAAI Conference on Artificial Intelligence, AAAI 2022, Thirty-Fourth Conference on Innovative Applications of Artificial Intelligence, IAAI 2022, The Twelveth Symposium on Educational Advances in Artificial Intelligence, EAAI 2022 Virtual Event, February 22 - March 1, 2022}, + timestamp = {Mon, 11 Jul 2022 01:00:00 +0200}, } @article{david2021tensorflow, - author = {David, Robert and Duke, Jared and Jain, Advait and Janapa Reddi, Vijay and Jeffries, Nat and Li, Jian and Kreeger, Nick and Nappier, Ian and Natraj, Meghna and Wang, Tiezhen and others}, - journal = {Proceedings of Machine Learning and Systems}, - pages = {800--811}, - title = {Tensorflow lite micro: {Embedded} machine learning for tinyml systems}, - volume = {3}, - year = {2021}, + author = {David, Robert and Duke, Jared and Jain, Advait and Janapa Reddi, Vijay and Jeffries, Nat and Li, Jian and Kreeger, Nick and Nappier, Ian and Natraj, Meghna and Wang, Tiezhen and others}, + journal = {Proceedings of Machine Learning and Systems}, + pages = {800--811}, + title = {Tensorflow lite micro: Embedded machine learning for tinyml systems}, + volume = {3}, + year = {2021}, } @article{davies2018loihi, - author = {Davies, Mike and Srinivasa, Narayan and Lin, Tsung-Han and Chinya, Gautham and Cao, Yongqiang and Choday, Sri Harsha and Dimou, Georgios and Joshi, Prasad and Imam, Nabil and Jain, Shweta and Liao, Yuyun and Lin, Chit-Kwan and Lines, Andrew and Liu, Ruokun and Mathaikutty, Deepak and McCoy, Steven and Paul, Arnab and Tse, Jonathan and Venkataramanan, Guruguhanathan and Weng, Yi-Hsin and Wild, Andreas and Yang, Yoonseok and Wang, Hong}, - doi = {10.1109/mm.2018.112130359}, - issn = {0272-1732, 1937-4143}, - journal = {IEEE Micro}, - number = {1}, - pages = 
{82--99}, - publisher = {Institute of Electrical and Electronics Engineers (IEEE)}, - source = {Crossref}, - title = {Loihi: {A} Neuromorphic Manycore Processor with On-Chip Learning}, - url = {https://doi.org/10.1109/mm.2018.112130359}, - volume = {38}, - year = {2018}, - month = jan, + number = {1}, + doi = {10.1109/mm.2018.112130359}, + pages = {82--99}, + source = {Crossref}, + volume = {38}, + author = {Davies, Mike and Srinivasa, Narayan and Lin, Tsung-Han and Chinya, Gautham and Cao, Yongqiang and Choday, Sri Harsha and Dimou, Georgios and Joshi, Prasad and Imam, Nabil and Jain, Shweta and Liao, Yuyun and Lin, Chit-Kwan and Lines, Andrew and Liu, Ruokun and Mathaikutty, Deepak and McCoy, Steven and Paul, Arnab and Tse, Jonathan and Venkataramanan, Guruguhanathan and Weng, Yi-Hsin and Wild, Andreas and Yang, Yoonseok and Wang, Hong}, + year = {2018}, + month = jan, + url = {https://doi.org/10.1109/mm.2018.112130359}, + issn = {0272-1732,1937-4143}, + journal = {IEEE Micro}, + publisher = {Institute of Electrical and Electronics Engineers (IEEE)}, + title = {Loihi: A Neuromorphic Manycore Processor with On-Chip Learning}, } @inproceedings{devlin2018bert, - author = {Devlin, Jacob and Chang, Ming-Wei and Lee, Kenton and Toutanova, Kristina}, - address = {Minneapolis, Minnesota}, - booktitle = {Proceedings of the 2019 Conference of the North}, - doi = {10.18653/v1/n19-1423}, - pages = {4171--4186}, - publisher = {Association for Computational Linguistics}, - title = {{BERT:} {Pre-training} of Deep Bidirectional Transformers for Language Understanding}, - url = {https://doi.org/10.18653/v1/n19-1423}, - year = {2019}, - source = {Crossref}, + doi = {10.18653/v1/n19-1423}, + source = {Crossref}, + author = {Devlin, Jacob and Chang, Ming-Wei and Lee, Kenton and Toutanova, Kristina}, + year = {2019}, + url = {https://doi.org/10.18653/v1/n19-1423}, + booktitle = {Proceedings of the 2019 Conference of the North}, + publisher = {Association for Computational Linguistics}, + title = {BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding}, + address = {Minneapolis, Minnesota}, + pages = {4171--4186}, } @article{gaviria2022dollar, - author = {Mattson, Peter and Reddi, Vijay Janapa and Cheng, Christine and Coleman, Cody and Diamos, Greg and Kanter, David and Micikevicius, Paulius and Patterson, David and Schmuelling, Guenther and Tang, Hanlin and Wei, Gu-Yeon and Wu, Carole-Jean}, - doi = {10.1109/mm.2020.2974843}, - issn = {0272-1732, 1937-4143}, - journal = {IEEE Micro}, - number = {2}, - pages = {8--16}, - publisher = {Institute of Electrical and Electronics Engineers (IEEE)}, - source = {Crossref}, - title = {{MLPerf:} {An} Industry Standard Benchmark Suite for Machine Learning Performance}, - url = {https://doi.org/10.1109/mm.2020.2974843}, - volume = {40}, - year = {2020}, - month = mar, + number = {2}, + doi = {10.1109/mm.2020.2974843}, + pages = {8--16}, + source = {Crossref}, + volume = {40}, + author = {Mattson, Peter and Reddi, Vijay Janapa and Cheng, Christine and Coleman, Cody and Diamos, Greg and Kanter, David and Micikevicius, Paulius and Patterson, David and Schmuelling, Guenther and Tang, Hanlin and Wei, Gu-Yeon and Wu, Carole-Jean}, + year = {2020}, + month = mar, + url = {https://doi.org/10.1109/mm.2020.2974843}, + issn = {0272-1732,1937-4143}, + journal = {IEEE Micro}, + publisher = {Institute of Electrical and Electronics Engineers (IEEE)}, + title = {MLPerf: An Industry Standard Benchmark Suite for Machine Learning Performance}, } @inproceedings{hendrycks2021natural, - author = {Hendrycks, Dan and Zhao, Kevin and Basart, 
Steven and Steinhardt, Jacob and Song, Dawn}, - bibsource = {dblp computer science bibliography, https://dblp.org}, - biburl = {https://dblp.org/rec/conf/cvpr/HendrycksZBSS21.bib}, - booktitle = {2021 IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, - doi = {10.1109/cvpr46437.2021.01501}, - pages = {15262--15271}, - publisher = {IEEE}, - timestamp = {Mon, 18 Jul 2022 01:00:00 +0200}, - title = {Natural Adversarial Examples}, - url = {https://doi.org/10.1109/cvpr46437.2021.01501}, - year = {2021}, - source = {Crossref}, - month = jun, + doi = {10.1109/cvpr46437.2021.01501}, + pages = {15257--15266}, + source = {Crossref}, + author = {Hendrycks, Dan and Zhao, Kevin and Basart, Steven and Steinhardt, Jacob and Song, Dawn}, + year = {2021}, + month = jun, + url = {https://doi.org/10.1109/cvpr46437.2021.01501}, + booktitle = {2021 IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, + publisher = {IEEE}, + title = {Natural Adversarial Examples}, + bibsource = {dblp computer science bibliography, https://dblp.org}, + biburl = {https://dblp.org/rec/conf/cvpr/HendrycksZBSS21.bib}, + timestamp = {Mon, 18 Jul 2022 01:00:00 +0200}, } @inproceedings{ignatov2018ai, - author = {Ignatov, Andrey and Timofte, Radu and Kulik, Andrei and Yang, Seungsoo and Wang, Ke and Baum, Felix and Wu, Max and Xu, Lirong and Van Gool, Luc}, - booktitle = {2019 IEEE/CVF International Conference on Computer Vision Workshop (ICCVW)}, - doi = {10.1109/iccvw.2019.00447}, - pages = {0--0}, - publisher = {IEEE}, - source = {Crossref}, - title = {{AI} Benchmark: {All} About Deep Learning on Smartphones in 2019}, - url = {https://doi.org/10.1109/iccvw.2019.00447}, - year = {2019}, - month = oct, + doi = {10.1109/iccvw.2019.00447}, + pages = {3617--3635}, + source = {Crossref}, + author = {Ignatov, Andrey and Timofte, Radu and Kulik, Andrei and Yang, Seungsoo and Wang, Ke and Baum, Felix and Wu, Max and Xu, Lirong and Van Gool, Luc}, + year = {2019}, + month = oct, + url = {https://doi.org/10.1109/iccvw.2019.00447}, + booktitle = {2019 IEEE/CVF International Conference on Computer Vision Workshop (ICCVW)}, + publisher = {IEEE}, + title = {AI Benchmark: All About Deep Learning on Smartphones in 2019}, } @inproceedings{kiela2021dynabench, - author = {Kiela, Douwe and Bartolo, Max and Nie, Yixin and Kaushik, Divyansh and Geiger, Atticus and Wu, Zhengxuan and Vidgen, Bertie and Prasad, Grusha and Singh, Amanpreet and Ringshia, Pratik and Ma, Zhiyi and Thrush, Tristan and Riedel, Sebastian and Waseem, Zeerak and Stenetorp, Pontus and Jia, Robin and Bansal, Mohit and Potts, Christopher and Williams, Adina}, - address = {Online}, - booktitle = {Proceedings of the 2021 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies}, - doi = {10.18653/v1/2021.naacl-main.324}, - pages = {4110--4124}, - publisher = {Association for Computational Linguistics}, - title = {Dynabench: {Rethinking} Benchmarking in {NLP}}, - url = {https://doi.org/10.18653/v1/2021.naacl-main.324}, - year = {2021}, - source = {Crossref}, + doi = {10.18653/v1/2021.naacl-main.324}, + source = {Crossref}, + author = {Kiela, Douwe and Bartolo, Max and Nie, Yixin and Kaushik, Divyansh and Geiger, Atticus and Wu, Zhengxuan and Vidgen, Bertie and Prasad, Grusha and Singh, Amanpreet and Ringshia, Pratik and Ma, Zhiyi and Thrush, Tristan and Riedel, Sebastian and Waseem, Zeerak and Stenetorp, Pontus and Jia, Robin and Bansal, Mohit and Potts, Christopher and Williams, Adina}, + 
year = {2021}, + url = {https://doi.org/10.18653/v1/2021.naacl-main.324}, + booktitle = {Proceedings of the 2021 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies}, + publisher = {Association for Computational Linguistics}, + title = {Dynabench: Rethinking Benchmarking in NLP}, + address = {Online}, + pages = {4110--4124}, } @inproceedings{koh2021wilds, - author = {Koh, Pang Wei and Sagawa, Shiori and Marklund, Henrik and Xie, Sang Michael and Zhang, Marvin and Balsubramani, Akshay and Hu, Weihua and Yasunaga, Michihiro and Phillips, Richard Lanas and Gao, Irena and Lee, Tony and David, Etienne and Stavness, Ian and Guo, Wei and Earnshaw, Berton and Haque, Imran S. and Beery, Sara M. and Leskovec, Jure and Kundaje, Anshul and Pierson, Emma and Levine, Sergey and Finn, Chelsea and Liang, Percy}, - editor = {Meila, Marina and Zhang, Tong}, - bibsource = {dblp computer science bibliography, https://dblp.org}, - biburl = {https://dblp.org/rec/conf/icml/KohSMXZBHYPGLDS21.bib}, - booktitle = {Proceedings of the 38th International Conference on Machine Learning, ICML 2021, 18-24 July 2021, Virtual Event}, - pages = {5637--5664}, - publisher = {PMLR}, - series = {Proceedings of Machine Learning Research}, - timestamp = {Tue, 13 Dec 2022 00:00:00 +0100}, - title = {{WILDS:} {A} Benchmark of in-the-Wild Distribution Shifts}, - url = {http://proceedings.mlr.press/v139/koh21a.html}, - volume = {139}, - year = {2021}, + author = {Koh, Pang Wei and Sagawa, Shiori and Marklund, Henrik and Xie, Sang Michael and Zhang, Marvin and Balsubramani, Akshay and Hu, Weihua and Yasunaga, Michihiro and Phillips, Richard Lanas and Gao, Irena and Lee, Tony and David, Etienne and Stavness, Ian and Guo, Wei and Earnshaw, Berton and Haque, Imran S. and Beery, Sara M. and Leskovec, Jure and Kundaje, Anshul and Pierson, Emma and Levine, Sergey and Finn, Chelsea and Liang, Percy}, + title = {WILDS: A Benchmark of in-the-Wild Distribution Shifts}, + journal = {ICML}, + pages = {5637--5664}, + year = {2021}, + url = {http://proceedings.mlr.press/v139/koh21a.html}, + source = {DBLP}, + editor = {Meila, Marina and Zhang, Tong}, + bibsource = {dblp computer science bibliography, https://dblp.org}, + biburl = {https://dblp.org/rec/conf/icml/KohSMXZBHYPGLDS21.bib}, + booktitle = {Proceedings of the 38th International Conference on Machine Learning, ICML 2021, 18-24 July 2021, Virtual Event}, + publisher = {PMLR}, + series = {Proceedings of Machine Learning Research}, + timestamp = {Tue, 13 Dec 2022 00:00:00 +0100}, + volume = {139}, } -@inproceedings{lin2014microsoft, - author = {Lin, Tsung-Yi and Maire, Michael and Belongie, Serge and Hays, James and Perona, Pietro and Ramanan, Deva and Doll\'ar, Piotr and Zitnick, C Lawrence}, - booktitle = {Computer Vision{\textendash}ECCV 2014: 13th European Conference, Zurich, Switzerland, September 6-12, 2014, Proceedings, Part V 13}, - organization = {Springer}, - pages = {740--755}, - title = {Microsoft coco: {Common} objects in context}, - year = {2014}, +@incollection{lin2014microsoft, + doi = {10.1007/978-3-319-10602-1\_48}, + pages = {740--755}, + source = {Crossref}, + author = {Lin, Tsung-Yi and Maire, Michael and Belongie, Serge and Hays, James and Perona, Pietro and Ramanan, Deva and Doll\'ar, Piotr and Zitnick, C. 
Lawrence}, + year = {2014}, + isbn = {9783319106014,9783319106021}, + url = {https://doi.org/10.1007/978-3-319-10602-1\_48}, + issn = {0302-9743,1611-3349}, + booktitle = {Computer Vision -- ECCV 2014}, + publisher = {Springer International Publishing}, + title = {Microsoft COCO: Common Objects in Context}, + organization = {Springer}, } @inproceedings{lundberg2017unified, - author = {Lundberg, Scott M. and Lee, Su-In}, - editor = {Guyon, Isabelle and von Luxburg, Ulrike and Bengio, Samy and Wallach, Hanna M. and Fergus, Rob and Vishwanathan, S. V. N. and Garnett, Roman}, - bibsource = {dblp computer science bibliography, https://dblp.org}, - biburl = {https://dblp.org/rec/conf/nips/LundbergL17.bib}, - booktitle = {Advances in Neural Information Processing Systems 30: Annual Conference on Neural Information Processing Systems 2017, December 4-9, 2017, Long Beach, CA, USA}, - pages = {4765--4774}, - timestamp = {Thu, 21 Jan 2021 00:00:00 +0100}, - title = {A Unified Approach to Interpreting Model Predictions}, - url = {https://proceedings.neurips.cc/paper/2017/hash/8a20a8621978632d76c43dfd28b67767-Abstract.html}, - year = {2017}, + author = {Lundberg, Scott M. and Lee, Su-In}, + editor = {Guyon, Isabelle and von Luxburg, Ulrike and Bengio, Samy and Wallach, Hanna M. and Fergus, Rob and Vishwanathan, S. V. N. and Garnett, Roman}, + bibsource = {dblp computer science bibliography, https://dblp.org}, + biburl = {https://dblp.org/rec/conf/nips/LundbergL17.bib}, + booktitle = {Advances in Neural Information Processing Systems 30: Annual Conference on Neural Information Processing Systems 2017, December 4-9, 2017, Long Beach, CA, USA}, + pages = {4765--4774}, + timestamp = {Thu, 21 Jan 2021 00:00:00 +0100}, + title = {A Unified Approach to Interpreting Model Predictions}, + url = {https://proceedings.neurips.cc/paper/2017/hash/8a20a8621978632d76c43dfd28b67767-Abstract.html}, + year = {2017}, } @article{maass1997networks, - author = {Maass, Wolfgang}, - doi = {10.1016/s0893-6080(97)00011-7}, - issn = {0893-6080}, - journal = {Neural Networks}, - number = {9}, - pages = {1659--1671}, - publisher = {Elsevier BV}, - source = {Crossref}, - title = {Networks of spiking neurons: {The} third generation of neural network models}, - url = {https://doi.org/10.1016/s0893-6080(97)00011-7}, - volume = {10}, - year = {1997}, - month = dec, + number = {9}, + doi = {10.1016/s0893-6080(97)00011-7}, + pages = {1659--1671}, + source = {Crossref}, + volume = {10}, + author = {Maass, Wolfgang}, + year = {1997}, + month = dec, + url = {https://doi.org/10.1016/s0893-6080(97)00011-7}, + issn = {0893-6080}, + journal = {Neural Networks}, + publisher = {Elsevier BV}, + title = {Networks of spiking neurons: The third generation of neural network models}, } @article{mattson2020mlperf, - author = {Mattson, Peter and Reddi, Vijay Janapa and Cheng, Christine and Coleman, Cody and Diamos, Greg and Kanter, David and Micikevicius, Paulius and Patterson, David and Schmuelling, Guenther and Tang, Hanlin and Wei, Gu-Yeon and Wu, Carole-Jean}, - doi = {10.1109/mm.2020.2974843}, - issn = {0272-1732, 1937-4143}, - journal = {IEEE Micro}, - number = {2}, - pages = {8--16}, - publisher = {Institute of Electrical and Electronics Engineers (IEEE)}, - source = {Crossref}, - title = {{MLPerf:} {An} Industry Standard Benchmark Suite for Machine Learning Performance}, - url = {https://doi.org/10.1109/mm.2020.2974843}, - volume = {40}, - year = {2020}, - month = mar, + number = {2}, + doi = {10.1109/mm.2020.2974843}, + pages = {8--16}, + source 
= {Crossref}, + volume = {40}, + author = {Mattson, Peter and Reddi, Vijay Janapa and Cheng, Christine and Coleman, Cody and Diamos, Greg and Kanter, David and Micikevicius, Paulius and Patterson, David and Schmuelling, Guenther and Tang, Hanlin and Wei, Gu-Yeon and Wu, Carole-Jean}, + year = {2020}, + month = mar, + url = {https://doi.org/10.1109/mm.2020.2974843}, + issn = {0272-1732,1937-4143}, + journal = {IEEE Micro}, + publisher = {Institute of Electrical and Electronics Engineers (IEEE)}, + title = {MLPerf: An Industry Standard Benchmark Suite for Machine Learning Performance}, } @article{modha2023neural, - author = {Modha, Dharmendra S. and Akopyan, Filipp and Andreopoulos, Alexander and Appuswamy, Rathinakumar and Arthur, John V. and Cassidy, Andrew S. and Datta, Pallab and DeBole, Michael V. and Esser, Steven K. and Otero, Carlos Ortega and Sawada, Jun and Taba, Brian and Amir, Arnon and Bablani, Deepika and Carlson, Peter J. and Flickner, Myron D. and Gandhasri, Rajamohan and Garreau, Guillaume J. and Ito, Megumi and Klamo, Jennifer L. and Kusnitz, Jeffrey A. and McClatchey, Nathaniel J. and McKinstry, Jeffrey L. and Nakamura, Yutaka and Nayak, Tapan K. and Risk, William P. and Schleupen, Kai and Shaw, Ben and Sivagnaname, Jay and Smith, Daniel F. and Terrizzano, Ignacio and Ueda, Takanori}, - doi = {10.1126/science.adh1174}, - issn = {0036-8075, 1095-9203}, - journal = {Science}, - number = {6668}, - pages = {329--335}, - publisher = {American Association for the Advancement of Science (AAAS)}, - source = {Crossref}, - title = {Neural inference at the frontier of energy, space, and time}, - url = {https://doi.org/10.1126/science.adh1174}, - volume = {382}, - year = {2023}, - month = oct, + number = {6668}, + doi = {10.1126/science.adh1174}, + pages = {329--335}, + source = {Crossref}, + volume = {382}, + author = {Modha, Dharmendra S. and Akopyan, Filipp and Andreopoulos, Alexander and Appuswamy, Rathinakumar and Arthur, John V. and Cassidy, Andrew S. and Datta, Pallab and DeBole, Michael V. and Esser, Steven K. and Otero, Carlos Ortega and Sawada, Jun and Taba, Brian and Amir, Arnon and Bablani, Deepika and Carlson, Peter J. and Flickner, Myron D. and Gandhasri, Rajamohan and Garreau, Guillaume J. and Ito, Megumi and Klamo, Jennifer L. and Kusnitz, Jeffrey A. and McClatchey, Nathaniel J. and McKinstry, Jeffrey L. and Nakamura, Yutaka and Nayak, Tapan K. and Risk, William P. and Schleupen, Kai and Shaw, Ben and Sivagnaname, Jay and Smith, Daniel F. and Terrizzano, Ignacio and Ueda, Takanori}, + year = {2023}, + month = oct, + url = {https://doi.org/10.1126/science.adh1174}, + issn = {0036-8075,1095-9203}, + journal = {Science}, + publisher = {American Association for the Advancement of Science (AAAS)}, + title = {Neural inference at the frontier of energy, space, and time}, } @inproceedings{reddi2020mlperf, - author = {Reddi, Vijay Janapa and Cheng, Christine and Kanter, David and Mattson, Peter and Schmuelling, Guenther and Wu, Carole-Jean and Anderson, Brian and Breughe, Maximilien and Charlebois, Mark and Chou, William and Chukka, Ramesh and Coleman, Cody and Davis, Sam and Deng, Pan and Diamos, Greg and Duke, Jared and Fick, Dave and Gardner, J. Scott and Hubara, Itay and Idgunji, Sachin and Jablin, Thomas B. and Jiao, Jeff and John, Tom St. 
and Kanwar, Pankaj and Lee, David and Liao, Jeffery and Lokhmotov, Anton and Massa, Francisco and Meng, Peng and Micikevicius, Paulius and Osborne, Colin and Pekhimenko, Gennady and Rajan, Arun Tejusve Raghunath and Sequeira, Dilip and Sirasao, Ashish and Sun, Fei and Tang, Hanlin and Thomson, Michael and Wei, Frank and Wu, Ephrem and Xu, Lingjie and Yamada, Koichi and Yu, Bing and Yuan, George and Zhong, Aaron and Zhang, Peizhao and Zhou, Yuchen}, - booktitle = {2020 ACM/IEEE 47th Annual International Symposium on Computer Architecture (ISCA)}, - doi = {10.1109/isca45697.2020.00045}, - organization = {IEEE}, - pages = {446--459}, - publisher = {IEEE}, - source = {Crossref}, - title = {{MLPerf} Inference Benchmark}, - url = {https://doi.org/10.1109/isca45697.2020.00045}, - year = {2020}, - month = may, + doi = {10.1109/isca45697.2020.00045}, + pages = {446--459}, + source = {Crossref}, + author = {Reddi, Vijay Janapa and Cheng, Christine and Kanter, David and Mattson, Peter and Schmuelling, Guenther and Wu, Carole-Jean and Anderson, Brian and Breughe, Maximilien and Charlebois, Mark and Chou, William and Chukka, Ramesh and Coleman, Cody and Davis, Sam and Deng, Pan and Diamos, Greg and Duke, Jared and Fick, Dave and Gardner, J. Scott and Hubara, Itay and Idgunji, Sachin and Jablin, Thomas B. and Jiao, Jeff and John, Tom St. and Kanwar, Pankaj and Lee, David and Liao, Jeffery and Lokhmotov, Anton and Massa, Francisco and Meng, Peng and Micikevicius, Paulius and Osborne, Colin and Pekhimenko, Gennady and Rajan, Arun Tejusve Raghunath and Sequeira, Dilip and Sirasao, Ashish and Sun, Fei and Tang, Hanlin and Thomson, Michael and Wei, Frank and Wu, Ephrem and Xu, Lingjie and Yamada, Koichi and Yu, Bing and Yuan, George and Zhong, Aaron and Zhang, Peizhao and Zhou, Yuchen}, + year = {2020}, + month = may, + url = {https://doi.org/10.1109/isca45697.2020.00045}, + booktitle = {2020 ACM/IEEE 47th Annual International Symposium on Computer Architecture (ISCA)}, + publisher = {IEEE}, + title = {MLPerf Inference Benchmark}, + organization = {IEEE}, } @inproceedings{ribeiro2016should, - author = {Ribeiro, Marco Tulio and Singh, Sameer and Guestrin, Carlos}, - booktitle = {Proceedings of the 22nd ACM SIGKDD international conference on knowledge discovery and data mining}, - pages = {1135--1144}, - title = {{\textquotedblright} Why should i trust you?{\textquotedblright} Explaining the predictions of any classifier}, - year = {2016}, + author = {Ribeiro, Marco Tulio and Singh, Sameer and Guestrin, Carlos}, + booktitle = {Proceedings of the 22nd ACM SIGKDD international conference on knowledge discovery and data mining}, + pages = {1135--1144}, + title = {'' Why should i trust you?'' Explaining the predictions of any classifier}, + year = {2016}, } @article{schuman2022opportunities, - author = {Schuman, Catherine D. and Kulkarni, Shruti R. and Parsa, Maryam and Mitchell, J. Parker and Date, Prasanna and Kay, Bill}, - doi = {10.1038/s43588-021-00184-y}, - issn = {2662-8457}, - journal = {Nature Computational Science}, - number = {1}, - pages = {10--19}, - publisher = {Springer Science and Business Media LLC}, - source = {Crossref}, - title = {Opportunities for neuromorphic computing algorithms and applications}, - url = {https://doi.org/10.1038/s43588-021-00184-y}, - volume = {2}, - year = {2022}, - month = jan, + number = {1}, + doi = {10.1038/s43588-021-00184-y}, + pages = {10--19}, + source = {Crossref}, + volume = {2}, + author = {Schuman, Catherine D. and Kulkarni, Shruti R. 
and Parsa, Maryam and Mitchell, J. Parker and Date, Prasanna and Kay, Bill}, + year = {2022}, + month = jan, + url = {https://doi.org/10.1038/s43588-021-00184-y}, + issn = {2662-8457}, + journal = {Nature Computational Science}, + publisher = {Springer Science and Business Media LLC}, + title = {Opportunities for neuromorphic computing algorithms and applications}, } @article{warden2018speech, - author = {Warden, Pete}, - journal = {ArXiv preprint}, - title = {Speech commands: {A} dataset for limited-vocabulary speech recognition}, - url = {https://arxiv.org/abs/1804.03209}, - volume = {abs/1804.03209}, - year = {2018}, + url = {http://arxiv.org/abs/1804.03209v1}, + year = {2018}, + month = apr, + title = {Speech Commands: A Dataset for Limited-Vocabulary Speech Recognition}, + author = {Warden, Pete}, + primaryclass = {cs.CL}, + archiveprefix = {arXiv}, + journal = {ArXiv preprint}, + volume = {abs/1804.03209}, } @inproceedings{xie2020adversarial, - author = {Xie, Cihang and Tan, Mingxing and Gong, Boqing and Wang, Jiang and Yuille, Alan L. and Le, Quoc V.}, - bibsource = {dblp computer science bibliography, https://dblp.org}, - biburl = {https://dblp.org/rec/conf/cvpr/XieTGWYL20.bib}, - booktitle = {2020 IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, - doi = {10.1109/cvpr42600.2020.00090}, - pages = {816--825}, - publisher = {IEEE}, - timestamp = {Tue, 13 Oct 2020 01:00:00 +0200}, - title = {Adversarial Examples Improve Image Recognition}, - url = {https://doi.org/10.1109/cvpr42600.2020.00090}, - year = {2020}, - source = {Crossref}, - month = jun, + doi = {10.1109/cvpr42600.2020.00090}, + source = {Crossref}, + author = {Xie, Cihang and Tan, Mingxing and Gong, Boqing and Wang, Jiang and Yuille, Alan L. and Le, Quoc V.}, + year = {2020}, + month = jun, + url = {https://doi.org/10.1109/cvpr42600.2020.00090}, + booktitle = {2020 IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, + publisher = {IEEE}, + title = {Adversarial Examples Improve Image Recognition}, + bibsource = {dblp computer science bibliography, https://dblp.org}, + biburl = {https://dblp.org/rec/conf/cvpr/XieTGWYL20.bib}, + pages = {816--825}, + timestamp = {Tue, 13 Oct 2020 01:00:00 +0200}, } @article{xu2023demystifying, - author = {Xu, Hu and Xie, Saining and Tan, Xiaoqing Ellen and Huang, Po-Yao and Howes, Russell and Sharma, Vasu and Li, Shang-Wen and Ghosh, Gargi and Zettlemoyer, Luke and Feichtenhofer, Christoph}, - journal = {ArXiv preprint}, - title = {Demystifying {CLIP} Data}, - url = {https://arxiv.org/abs/2309.16671}, - volume = {abs/2309.16671}, - year = {2023}, + url = {http://arxiv.org/abs/2309.16671v4}, + year = {2023}, + month = sep, + title = {Demystifying CLIP Data}, + author = {Xu, Hu and Xie, Saining and Tan, Xiaoqing Ellen and Huang, Po-Yao and Howes, Russell and Sharma, Vasu and Li, Shang-Wen and Ghosh, Gargi and Zettlemoyer, Luke and Feichtenhofer, Christoph}, + primaryclass = {cs.CV}, + archiveprefix = {arXiv}, + journal = {ArXiv preprint}, + volume = {abs/2309.16671}, } -@misc{yik2023neurobench, - author = {Yik, Jason and Ahmed, Soikat Hasan and Ahmed, Zergham and Anderson, Brian and Andreou, Andreas G. 
and Bartolozzi, Chiara and Basu, Arindam and den Blanken, Douwe and Bogdan, Petrut and Bohte, Sander and Bouhadjar, Younes and Buckley, Sonia and Cauwenberghs, Gert and Corradi, Federico and de Croon, Guido and Danielescu, Andreea and Daram, Anurag and Davies, Mike and Demirag, Yigit and Eshraghian, Jason and Forest, Jeremy and Furber, Steve and Furlong, Michael and Gilra, Aditya and Indiveri, Giacomo and Joshi, Siddharth and Karia, Vedant and Khacef, Lyes and Knight, James C. and Kriener, Laura and Kubendran, Rajkumar and Kudithipudi, Dhireesha and Lenz, Gregor and Manohar, Rajit and Mayr, Christian and Michmizos, Konstantinos and Muir, Dylan and Neftci, Emre and Nowotny, Thomas and Ottati, Fabrizio and Ozcelikkale, Ayca and Pacik-Nelson, Noah and Panda, Priyadarshini and Pao-Sheng, Sun and Payvand, Melika and Pehle, Christian and Petrovici, Mihai A. and Posch, Christoph and Renner, Alpha and Sandamirskaya, Yulia and Schaefer, Clemens JS and van Schaik, Andr\'e and Schemmel, Johannes and Schuman, Catherine and Seo, Jae-sun and Sheik, Sadique and Shrestha, Sumit Bam and Sifalakis, Manolis and Sironi, Amos and Stewart, Kenneth and Stewart, Terrence C. and Stratmann, Philipp and Tang, Guangzhi and Timcheck, Jonathan and Verhelst, Marian and Vineyard, Craig M. and Vogginger, Bernhard and Yousefzadeh, Amirreza and Zhou, Biyan and Zohora, Fatima Tuz and Frenkel, Charlotte and Reddi, Vijay Janapa}, - archiveprefix = {arXiv}, - eprint = {2304.04640}, - primaryclass = {cs.AI}, - title = {{NeuroBench:} {Advancing} Neuromorphic Computing through Collaborative, Fair and Representative Benchmarking}, - year = {2023}, +@article{yik2023neurobench, + url = {http://arxiv.org/abs/2304.04640v3}, + year = {2023}, + month = apr, + title = {NeuroBench: A Framework for Benchmarking Neuromorphic Computing Algorithms and Systems}, + author = {Yik, Jason and den Berghe, Korneel Van and den Blanken, Douwe and Bouhadjar, Younes and Fabre, Maxime and Hueber, Paul and Kleyko, Denis and Pacik-Nelson, Noah and Sun, Pao-Sheng Vincent and Tang, Guangzhi and Wang, Shenqi and Zhou, Biyan and Ahmed, Soikat Hasan and Joseph, George Vathakkattil and Leto, Benedetto and Micheli, Aurora and Mishra, Anurag Kumar and Lenz, Gregor and Sun, Tao and Ahmed, Zergham and Akl, Mahmoud and Anderson, Brian and Andreou, Andreas G. and Bartolozzi, Chiara and Basu, Arindam and Bogdan, Petrut and Bohte, Sander and Buckley, Sonia and Cauwenberghs, Gert and Chicca, Elisabetta and Corradi, Federico and de Croon, Guido and Danielescu, Andreea and Daram, Anurag and Davies, Mike and Demirag, Yigit and Eshraghian, Jason and Fischer, Tobias and Forest, Jeremy and Fra, Vittorio and Furber, Steve and Furlong, P. Michael and Gilpin, William and Gilra, Aditya and Gonzalez, Hector A. and Indiveri, Giacomo and Joshi, Siddharth and Karia, Vedant and Khacef, Lyes and Knight, James C. and Kriener, Laura and Kubendran, Rajkumar and Kudithipudi, Dhireesha and Liu, Yao-Hong and Liu, Shih-Chii and Ma, Haoyuan and Manohar, Rajit and Margarit-Taul\'e, Josep Maria and Mayr, Christian and Michmizos, Konstantinos and Muir, Dylan and Neftci, Emre and Nowotny, Thomas and Ottati, Fabrizio and Ozcelikkale, Ayca and Panda, Priyadarshini and Park, Jongkil and Payvand, Melika and Pehle, Christian and Petrovici, Mihai A. 
and Pierro, Alessandro and Posch, Christoph and Renner, Alpha and Sandamirskaya, Yulia and Schaefer, Clemens JS and van Schaik, Andr\'e and Schemmel, Johannes and Schmidgall, Samuel and Schuman, Catherine and Seo, Jae-sun and Sheik, Sadique and Shrestha, Sumit Bam and Sifalakis, Manolis and Sironi, Amos and Stewart, Matthew and Stewart, Kenneth and Stewart, Terrence C. and Stratmann, Philipp and Timcheck, Jonathan and T\"omen, Nergis and Urgese, Gianvito and Verhelst, Marian and Vineyard, Craig M. and Vogginger, Bernhard and Yousefzadeh, Amirreza and Zohora, Fatima Tuz and Frenkel, Charlotte and Reddi, Vijay Janapa}, + primaryclass = {cs.AI}, + archiveprefix = {arXiv}, + eprint = {2304.04640}, } @article{tschand2024mlperf, - title={MLPerf Power: Benchmarking the Energy Efficiency of Machine Learning Systems from $\{$$\backslash$mu$\}$ Watts to MWatts for Sustainable AI}, - author={Tschand, Arya and Rajan, Arun Tejusve Raghunath and Idgunji, Sachin and Ghosh, Anirban and Holleman, Jeremy and Kiraly, Csaba and Ambalkar, Pawan and Borkar, Ritika and Chukka, Ramesh and Cockrell, Trevor and others}, - journal={arXiv preprint arXiv:2410.12032}, - year={2024} + url = {http://arxiv.org/abs/2410.12032v1}, + year = {2024}, + month = oct, + title = {MLPerf Power: Benchmarking the Energy Efficiency of Machine Learning Systems from Microwatts to Megawatts for Sustainable AI}, + author = {Tschand, Arya and Rajan, Arun Tejusve Raghunath and Idgunji, Sachin and Ghosh, Anirban and Holleman, Jeremy and Kiraly, Csaba and Ambalkar, Pawan and Borkar, Ritika and Chukka, Ramesh and Cockrell, Trevor and Curtis, Oliver and Fursin, Grigori and Hodak, Miro and Kassa, Hiwot and Lokhmotov, Anton and Miskovic, Dejan and Pan, Yuechao and Manmathan, Manu Prasad and Raymond, Liz and John, Tom St. and Suresh, Arjun and Taubitz, Rowan and Zhan, Sean and Wasson, Scott and Kanter, David and Reddi, Vijay Janapa}, + primaryclass = {cs.AR}, + archiveprefix = {arXiv}, + journal = {arXiv preprint arXiv:2410.12032}, } \ No newline at end of file diff --git a/contents/core/benchmarking/benchmarking.qmd b/contents/core/benchmarking/benchmarking.qmd index e0fe1ddb..9a889ef0 100644 --- a/contents/core/benchmarking/benchmarking.qmd +++ b/contents/core/benchmarking/benchmarking.qmd @@ -108,7 +108,7 @@ A key prerogative for any benchmark to be impactful is that it must reflect the Furthermore, benchmarks published with broad co-authorship from respected institutions carry authority and validity that convinces the community to adopt them as trusted standards. Benchmarks perceived as biased by particular corporate or institutional interests breed skepticism. Ongoing community engagement through workshops and challenges is also key after the initial release, and that is what, for instance, led to the success of ImageNet. As research progresses, collective participation enables continual refinement and expansion of benchmarks over time. -Finally, community-developed benchmarks released with open access accelerate adoption and consistent implementation. We shared open-source code, documentation, models, and infrastructure to lower barriers for groups to benchmark solutions on an equal footing using standardized implementations. This consistency is critical for fair comparisons. Without coordination, labs and companies may implement benchmarks differently, reducing result reproducibility. +Finally, releasing community-developed benchmarks with open access promotes their adoption and consistent use. 
By providing open-source code, documentation, models, and infrastructure, we reduce barriers to entry, enabling groups to benchmark solutions on an equal footing with standardized implementations. This consistency is essential for fair comparisons. Without coordination, labs and companies might implement benchmarks differently, which can undermine reproducibility and comparability of results. Community consensus brings benchmarks lasting relevance, while fragmentation confuses. Through collaborative development and transparent operation, benchmarks can become authoritative standards for tracking progress. Several of the benchmarks that we discuss in this chapter were developed and built by the community, for the community, and that is what ultimately led to their success. @@ -126,7 +126,7 @@ The architecture, size, and complexity of AI models vary widely. Different model ### Data Benchmarks -AI, particularly machine learning, is inherently data-driven. The quality, size, and diversity of data influence AI models' training efficacy and generalization capability. Data benchmarks focus on the datasets used in AI training and evaluation. They provide standardized datasets the community can use to train and test models, ensuring a level playing field for comparisons. Moreover, these benchmarks highlight data quality, diversity, and representation challenges, pushing the community to address biases and gaps in AI training data. By understanding data benchmarks, researchers can also gauge how models might perform in real-world scenarios, ensuring robustness and reliability. +In machine learning, data is foundational because the quality, scale, and diversity of datasets directly impact model efficacy and generalization. Data benchmarks focus on the datasets used in training and evaluation. They provide standardized datasets the community can use to train and test models, ensuring a level playing field for comparisons. Moreover, these benchmarks highlight data quality, diversity, and representation challenges, pushing the community to address biases and gaps in training data. By understanding data benchmarks, researchers can also gauge how models might perform in real-world scenarios, ensuring robustness and reliability. In the remainder of the sections, we will discuss each of these benchmark types. The focus will be an in-depth exploration of system benchmarks, as these are critical to understanding and advancing machine learning system performance. We will briefly cover model and data benchmarks for a comprehensive perspective, but the emphasis and majority of the content will be devoted to system benchmarks. @@ -143,7 +143,7 @@ Machine learning system benchmarking provides a structured and systematic approa #### Micro Benchmarks -Micro-benchmarks in AI are specialized, evaluating distinct components or specific operations within a broader machine learning process. These benchmarks zero in on individual tasks, offering insights into the computational demands of a particular neural network layer, the efficiency of a unique optimization technique, or the throughput of a specific activation function. For instance, practitioners might use micro-benchmarks to measure the computational time required by a convolutional layer in a deep learning model or to evaluate the speed of data preprocessing that feeds data into the model. Such granular assessments are instrumental in fine-tuning and optimizing discrete aspects of AI models, ensuring that each component operates at its peak potential. 
+Micro-benchmarks are specialized, evaluating distinct components or specific operations within a broader machine learning process. These benchmarks focus on individual tasks, offering insights into the computational demands of a particular neural network layer, the efficiency of a unique optimization technique, or the throughput of a specific activation function. For instance, practitioners might use micro-benchmarks to measure the computational time required by a convolutional layer in a deep learning model or to evaluate the speed of data preprocessing that feeds data into the model. Such granular assessments are instrumental in fine-tuning and optimizing discrete aspects of models, ensuring that each component operates at its peak potential. These types of microbenchmarks include zooming into very specific operations or components of the AI pipeline, such as the following: @@ -153,7 +153,7 @@ These types of microbenchmarks include zooming into very specific operations or * **Layer Benchmarks:** Evaluations of the computational efficiency of distinct neural network layers, such as LSTM or Transformer blocks, when operating on standardized input sizes. -Example: [DeepBench](https://github.com/baidu-research/DeepBench), introduced by Baidu, is a good example of something that assesses the above. DeepBench assesses the performance of basic operations in deep learning models, providing insights into how different hardware platforms handle neural network training and inference. +Example: [DeepBench](https://github.com/baidu-research/DeepBench), introduced by Baidu, is a good benchmark that evaluates fundamental deep learning operations, such as those mentioned above. DeepBench assesses the performance of basic operations in deep learning models, providing insights into how different hardware platforms handle neural network training and inference. :::{#exr-cuda .callout-caution collapse="true"} @@ -167,7 +167,7 @@ Ever wonder how your image filters get so fast? Special libraries like cuDNN sup #### Macro Benchmarks -Macro benchmarks provide a holistic view, assessing the end-to-end performance of entire machine learning models or comprehensive AI systems. Rather than focusing on individual operations, macro-benchmarks evaluate the collective efficacy of models under real-world scenarios or tasks. For example, a macro-benchmark might assess the complete performance of a deep learning model undertaking image classification on a dataset like [ImageNet](https://www.image-net.org/). This includes gauging accuracy, computational speed, and resource consumption. Similarly, one might measure the cumulative time and resources needed to train a natural language processing model on extensive text corpora or evaluate the performance of an entire recommendation system, from data ingestion to final user-specific outputs. +Macro benchmarks provide a holistic view, assessing the end-to-end performance of entire machine learning models or comprehensive ML systems. Rather than focusing on individual operations, macro-benchmarks evaluate the collective efficacy of models under real-world scenarios or tasks. For example, a macro-benchmark might assess the complete performance of a deep learning model undertaking image classification on a dataset like [ImageNet](https://www.image-net.org/). This includes gauging accuracy, computational speed, and resource consumption. 
Similarly, one might measure the cumulative time and resources needed to train a natural language processing model on extensive text corpora or evaluate the performance of an entire recommendation system, from data ingestion to final user-specific outputs. Examples: These benchmarks evaluate the AI model: @@ -179,7 +179,7 @@ Examples: These benchmarks evaluate the AI model: #### End-to-end Benchmarks -End-to-end benchmarks provide an all-inclusive evaluation that extends beyond the boundaries of the AI model itself. Instead of focusing solely on a machine learning model's computational efficiency or accuracy, these benchmarks encompass the entire pipeline of an AI system. This includes initial data preprocessing, the core model's performance, post-processing of the model's outputs, and other integral components like storage and network interactions. +End-to-end benchmarks provide an all-inclusive evaluation that extends beyond the boundaries of the ML model itself. Instead of focusing solely on a machine learning model's computational efficiency or accuracy, these benchmarks encompass the entire pipeline of an AI system. This includes initial data preprocessing, the core model's performance, post-processing of the model's outputs, and other integral components like storage and network interactions. Data preprocessing is the first stage in many AI systems, transforming raw data into a format suitable for model training or inference. These preprocessing steps' efficiency, scalability, and accuracy are vital for the overall system's performance. End-to-end benchmarks assess this phase, ensuring that data cleaning, normalization, augmentation, or any other transformation process doesn't become a bottleneck. @@ -254,28 +254,19 @@ Beyond raw scores or metrics, benchmarks often provide guidelines or context to Example: A benchmark might highlight that while Model A scored higher than Model B in accuracy, it offers better real-time performance, making it more suitable for time-sensitive applications. -### Training vs. Inference - -The development life cycle of a machine learning model involves two critical phases - training and inference. [Training](../training/training.qmd), as you may recall, is the process of learning patterns from data to create the model. Inference refers to the model making predictions on new unlabeled data. Both phases play indispensable yet distinct roles. Consequently, each phase warrants rigorous benchmarking to evaluate performance metrics like speed, accuracy, and computational efficiency. - -Benchmarking the training phase provides insights into how different model architectures, hyperparameter values, and optimization algorithms impact the time and resources needed to train the model. For instance, benchmarking shows how neural network depth affects training time on a given dataset. Benchmarking also reveals how hardware accelerators like GPUs and TPUs can speed up training. - -On the other hand, benchmarking inference evaluates model performance in real-world conditions after deployment. Key metrics include latency, throughput, memory footprint, and power consumption. This type of benchmarking determines if a model meets the requirements of its target application regarding response time and device constraints. However, we will discuss these broadly to ensure a general understanding. - - ### Training Benchmarks -Training represents the phase where the system processes and ingests raw data to adjust and refine its parameters. 
Therefore, it is an algorithmic activity and involves system-level considerations, including data pipelines, storage, computing resources, and orchestration mechanisms. The goal is to ensure that the ML system can efficiently learn from data, optimizing both the model's performance and the system's resource utilization.
+The development life cycle of a machine learning model involves two critical phases: training and inference. Training represents the phase where the system processes and ingests raw data to adjust and refine its parameters. Benchmarking the training phase reveals how choices in data pipelines, storage solutions, model architectures, computing resources, hyperparameter settings, and optimization algorithms affect the efficiency and resource demands of model training. The goal is to ensure that the ML system can efficiently learn from data, optimizing both the model's performance and the system's resource utilization.

#### Purpose

-From an ML systems perspective, training benchmarks evaluate how well the system scales with increasing data volumes and computational demands. It's about understanding the interplay between hardware, software, and the data pipeline in the training process.
+From a systems perspective, training machine learning models is resource-intensive, especially when working with large models. These models often contain billions or even trillions of trainable parameters and require enormous amounts of data, often on the scale of many terabytes. For example, [OpenAI's GPT-3](https://arxiv.org/abs/2005.14165) [@brown2020language] has 175 billion parameters, was trained on 45 TB of compressed plaintext data, and required 3,640 petaflop-days of compute for pretraining. ML training benchmarks evaluate the systems and resources required to manage the computational load of training such models.

-Consider a distributed ML system designed to train on vast datasets, like those used in large-scale e-commerce product recommendations. A training benchmark would assess how efficiently the system scales across multiple nodes, manage data sharding and handle failures or node drop-offs during training.
+Efficient data storage and delivery also play a major role in the training process. For instance, training a model that predicts bounding boxes around objects in images may require thousands of images. However, loading an entire image dataset into memory is typically infeasible, so practitioners rely on data loaders (as discussed in @sec-frameworks-data-loaders) from ML frameworks. Successful model training depends on timely and efficient data delivery, making it essential to benchmark tools like data loaders, data pipelines, preprocessing speed, and storage retrieval times to understand their impact on training performance.

-Training benchmarks evaluate CPU, GPU, memory, and network utilization during the training phase, guiding system optimizations. When training a model in a cloud-based ML system, it's crucial to understand how resources are being utilized. Are GPUs being fully leveraged? Is there unnecessary memory overhead? Benchmarks can highlight bottlenecks or inefficiencies in resource utilization, leading to cost savings and performance improvements.
+Hardware selection is another key factor in training machine learning systems, as it can significantly impact training time. Training benchmarks evaluate CPU, GPU, memory, and network utilization during the training phase to guide system optimizations. 
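As a rough illustration, a per-step measurement like the sketch below can expose some of these signals. It assumes PyTorch with a CUDA-capable GPU, a model and optimizer already placed on that device, and the third-party `psutil` package for CPU-side sampling; production benchmarks would instead sample continuously with dedicated profilers or tools such as nvidia-smi.

```python
import time
import torch
import psutil  # assumed available for CPU-side utilization sampling

def benchmark_training_step(model, batch, loss_fn, optimizer, device="cuda"):
    """Time one training step and sample memory/CPU utilization.

    Sketch only: assumes the model already lives on `device`; real
    benchmarks sample utilization continuously rather than once per step.
    """
    inputs, targets = (t.to(device) for t in batch)
    torch.cuda.reset_peak_memory_stats(device)
    start = time.perf_counter()

    optimizer.zero_grad()
    loss = loss_fn(model(inputs), targets)
    loss.backward()
    optimizer.step()
    torch.cuda.synchronize(device)  # wait for queued GPU work to finish

    step_time = time.perf_counter() - start
    return {
        "step_time_s": step_time,
        "samples_per_s": inputs.shape[0] / step_time,
        "gpu_peak_mem_mb": torch.cuda.max_memory_allocated(device) / 1e6,
        "cpu_util_percent": psutil.cpu_percent(interval=None),
    }
```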
Understanding how resources are used is essential: Are GPUs being fully leveraged? Is there unnecessary memory overhead? Benchmarks can uncover bottlenecks or inefficiencies in resource utilization, leading to cost savings and performance improvements. -Training an ML model is contingent on timely and efficient data delivery. Benchmarks in this context would also assess the efficiency of data pipelines, data preprocessing speed, and storage retrieval times. For real-time analytics systems, like those used in fraud detection, the speed at which training data is ingested, preprocessed, and fed into the model can be critical. Benchmarks would evaluate the latency of data pipelines, the efficiency of storage systems (like SSDs vs. HDDs), and the speed of data augmentation or transformation tasks. +In many cases, using a single hardware accelerator, such as a single GPU, is insufficient to meet the computational demands of large-scale model training. Machine learning models are often trained in data centers with multiple GPUs or TPUs, where distributed computing enables parallel processing across nodes. Training benchmarks assess how efficiently the system scales across multiple nodes, manages data sharding, and handles challenges like node failures or drop-offs during training. #### Metrics @@ -285,13 +276,13 @@ The following metrics are often considered important: 1. **Training Time:** The time it takes to train a model from scratch until it reaches a satisfactory performance level. It directly measures the computational resources required to train a model. For example, [Google's BERT](https://arxiv.org/abs/1810.04805) [@devlin2018bert] is a natural language processing model that requires several days to train on a massive corpus of text data using multiple GPUs. The long training time is a significant resource consumption and cost challenge. In some cases, benchmarks can instead measure the training throughput (training samples per unit of time). Throughput can be calculated much faster and easier than training time but may obscure the metrics we really care about (e.g. time to train). -2. **Scalability:** How well the training process can handle increases in data size or model complexity. Scalability can be assessed by measuring training time, memory usage, and other resource consumption as data size or model complexity increases. [OpenAI's GPT-3](https://arxiv.org/abs/2005.14165) [@brown2020language] model has 175 billion parameters, making it one of the largest language models in existence. Training GPT-3 required extensive engineering efforts to scale the training process to handle the massive model size. This involved using specialized hardware, distributed training, and other techniques to ensure the model could be trained efficiently. +2. **Scalability:** How well the training process can handle increases in data size or model complexity. Scalability can be assessed by measuring training time, memory usage, and other resource consumption as data size or model complexity increases. For instance, training OpenAI's GPT-3 required extensive engineering efforts to scale the training process across many GPU nodes to handle the massive model size. This involved using specialized hardware, distributed training, and other techniques to ensure the model could be trained efficiently. 3. **Resource Utilization:** The extent to which the training process utilizes available computational resources such as CPU, GPU, memory, and disk I/O. 
High resource utilization can indicate an efficient training process, while low utilization can suggest bottlenecks or inefficiencies. For instance, training a convolutional neural network (CNN) for image classification requires significant GPU resources. Utilizing multi-GPU setups and optimizing the training code for GPU acceleration can greatly improve resource utilization and training efficiency. 4. **Memory Consumption:** The amount of memory the training process uses. Memory consumption can be a limiting factor for training large models or datasets. For example, Google researchers faced significant memory consumption challenges when training BERT. The model has hundreds of millions of parameters, requiring large amounts of memory. The researchers had to develop techniques to reduce memory consumption, such as gradient checkpointing and model parallelism. -5. **Energy Consumption:** The energy consumed during training. As machine learning models become more complex, energy consumption has become an important consideration. Training large machine learning models can consume significant energy, leading to a large carbon footprint. For instance, the training of OpenAI's GPT-3 was estimated to have a carbon footprint equivalent to traveling by car for 700,000 kilometers. +5. **Energy Consumption:** The energy consumed during training. As machine learning models become more complex, energy consumption has become an important consideration. Training large machine learning models can consume significant energy, leading to a large carbon footprint. For instance, the training of OpenAI's GPT-3 was estimated to have a carbon footprint equivalent to traveling by car for 700,000 kilometers (~435,000 miles). 6. **Throughput:** The number of training samples processed per unit time. Higher throughput generally indicates a more efficient training process. The throughput is an important metric to consider when training a recommendation system for an e-commerce platform. A high throughput ensures that the model can process large volumes of user interaction data promptly, which is crucial for maintaining the relevance and accuracy of the recommendations. But it's also important to understand how to balance throughput with latency bounds. Therefore, a latency-bounded throughput constraint is often imposed on service-level agreements for data center application deployments. @@ -305,27 +296,11 @@ The following metrics are often considered important: By benchmarking for these types of metrics, we can obtain a comprehensive view of the training process's performance and efficiency from a systems perspective. This can help identify areas for improvement and ensure that resources are used effectively. -#### Tasks - -Selecting a handful of representative tasks for benchmarking machine learning systems is challenging because machine learning is applied to various domains with unique characteristics and requirements. Here are some of the challenges faced in selecting representative tasks: - -1. **Diversity of Applications:** Machine learning is used in numerous fields such as healthcare, finance, natural language processing, computer vision, and many more. Each field has specific tasks that may not be representative of other fields. For example, image classification tasks in computer vision may not be relevant to financial fraud detection. -2. **Variability in Data Types and Quality:** Different tasks require different data types, such as text, images, videos, or numerical data. 
Data quality and availability can vary greatly between tasks, making it difficult to select tasks that are representative of the general challenges faced in machine learning. -3. **Task Complexity and Difficulty:** The complexity of tasks varies greatly. Some are relatively straightforward, while others are highly complex and require sophisticated models and techniques. Selecting representative tasks that cover the complexities encountered in machine learning is challenging. -4. **Ethical and Privacy Concerns:** Some tasks may involve sensitive or private data, such as medical records or personal information. These tasks may have ethical and privacy concerns that need to be addressed, making them less suitable as representative tasks for benchmarking. -5. **Scalability and Resource Requirements:** Different tasks may have different scalability and resource requirements. Some tasks may require extensive computational resources, while others can be performed with minimal resources. Selecting tasks that represent the general resource requirements in machine learning is difficult. -6. **Evaluation Metrics:** The metrics used to evaluate the performance of machine learning models vary between tasks. Some tasks may have well-established evaluation metrics, while others lack clear or standardized metrics. This can make it challenging to compare performance across different tasks. -7. **Generalizability of Results:** The results obtained from benchmarking on a specific task may not be generalizable to other tasks. This means that a machine learning system's performance on a selected task may not be indicative of its performance on other tasks. - -It is important to carefully consider these factors when designing benchmarks to ensure they are meaningful and relevant to the diverse range of tasks encountered in machine learning. - #### Benchmarks Here are some original works that laid the fundamental groundwork for developing systematic benchmarks for training machine learning systems. -*[MLPerf Training Benchmark](https://github.com/mlcommons/training)* - -MLPerf is a suite of benchmarks designed to measure the performance of machine learning hardware, software, and services. The MLPerf Training benchmark [@mattson2020mlperf] focuses on the time it takes to train models to a target quality metric. It includes diverse workloads, such as image classification, object detection, translation, and reinforcement learning. @fig-perf-trend highlights the performance improvements in progressive versions of MLPerf Training benchmarks, which have all outpaced Moore's Law. Using standardized benchmarking trends enables us to rigorously showcase the rapid evolution of ML computing. +**[MLPerf Training Benchmark](https://github.com/mlcommons/training)**: MLPerf is a suite of benchmarks designed to measure the performance of machine learning hardware, software, and services. The MLPerf Training benchmark [@mattson2020mlperf] focuses on the time it takes to train models to a target quality metric. It includes diverse workloads, such as image classification, object detection, translation, and reinforcement learning. @fig-perf-trend highlights the performance improvements in progressive versions of MLPerf Training benchmarks, which have all outpaced Moore's Law. Using standardized benchmarking trends enables us to rigorously showcase the rapid evolution of ML computing. ![MLPerf Training performance trends. 
Source: @mattson2020mlperf.](images/png/mlperf_perf_trend.png){#fig-perf-trend} @@ -335,9 +310,7 @@ Metrics: * Throughput (examples per second) * Resource utilization (CPU, GPU, memory, disk I/O) -*[DAWNBench](https://dawn.cs.stanford.edu/benchmark/)* - -DAWNBench [@coleman2017dawnbench] is a benchmark suite focusing on end-to-end deep learning training time and inference performance. It includes common tasks such as image classification and question answering. +**[DAWNBench](https://dawn.cs.stanford.edu/benchmark/)**: DAWNBench [@coleman2017dawnbench] is a benchmark suite focusing on end-to-end deep learning training time and inference performance. It includes common tasks such as image classification and question answering. Metrics: @@ -345,9 +318,7 @@ Metrics: * Inference latency * Cost (in terms of cloud computing and storage resources) -*[Fathom](https://github.com/rdadolf/fathom)* - -Fathom [@adolf2016fathom] is a benchmark from Harvard University that evaluates the performance of deep learning models using a diverse set of workloads. These include common tasks such as image classification, speech recognition, and language modeling. +**[Fathom](https://github.com/rdadolf/fathom)**: Fathom [@adolf2016fathom] is a benchmark from Harvard University that evaluates the performance of deep learning models using a diverse set of workloads. These include common tasks such as image classification, speech recognition, and language modeling. Metrics: @@ -357,17 +328,18 @@ Metrics: #### Example Use Case -Consider a scenario where we want to benchmark the training of an image classification model on a specific hardware platform. +Imagine you have been tasked with benchmarking the training performance of an image classification model on a specific hardware platform. Let’s break down how you might approach this: + +1. **Define the Task**: First, choose a model and dataset. In this case, you’ll be training a CNN to classify images in the [CIFAR-10](https://www.cs.toronto.edu/kriz/cifar.html) dataset, a widely used benchmark in computer vision. -1. **Task:** The task is to train a convolutional neural network (CNN) for image classification on the CIFAR-10 dataset. -2. **Benchmark:** We can use the MLPerf Training benchmark for this task. It includes an image classification workload that is relevant to our task. -3. **Metrics:** We will measure the following metrics: +2. **Select the Benchmark**: Choosing a widely accepted benchmark helps ensure your setup is comparable with other real-world evaluations. You could choose to use the MLPerf Training benchmark because it provides a structured image classification workload, making it a relevant and standardized option for assessing training performance on CIFAR-10. Using MLPerf enables you to evaluate your system against industry-standard metrics, helping to ensure that results are meaningful and comparable to those achieved on other hardware platforms. -* Training time to reach a target accuracy of 90%. -* Throughput in terms of images processed per second. -* GPU and CPU utilization during training. +3. **Identify Key Metrics**: Now, decide on the metrics that will help you evaluate the system’s training performance. For this example, you might track: + - **Training Time**: How long does it take to reach 90% accuracy? + - **Throughput**: How many images are processed per second? + - **Resource Utilization**: What’s the GPU and CPU usage throughout training? 
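Putting these three steps together, a minimal harness might look like the sketch below. It assumes a PyTorch model, CIFAR-10 data loaders built elsewhere (for example with torchvision), and a caller-supplied `evaluate_fn` that returns validation accuracy; it is illustrative rather than an MLPerf-compliant implementation.

```python
import time
import torch

def benchmark_time_to_accuracy(model, train_loader, evaluate_fn,
                               optimizer, loss_fn, target_acc=0.90,
                               max_epochs=50, device="cuda"):
    """Train until a target validation accuracy and report benchmark metrics.

    Sketch only: assumes the model and optimizer are already set up on
    `device`, and that `evaluate_fn(model)` returns validation accuracy.
    Throughput here is end to end, including evaluation time.
    """
    samples_seen, start = 0, time.perf_counter()
    for epoch in range(max_epochs):
        model.train()
        for images, labels in train_loader:
            images, labels = images.to(device), labels.to(device)
            optimizer.zero_grad()
            loss = loss_fn(model(images), labels)
            loss.backward()
            optimizer.step()
            samples_seen += labels.size(0)
        acc = evaluate_fn(model)
        if acc >= target_acc:
            break
    elapsed = time.perf_counter() - start
    return {
        "time_to_target_s": elapsed,
        "epochs_run": epoch + 1,
        "throughput_samples_per_s": samples_seen / elapsed,
        "reached_target": acc >= target_acc,
    }
```

Resource utilization (GPU and CPU usage) would be collected alongside this loop with a sampling tool, as sketched earlier in the training purpose discussion.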
-By measuring these metrics, we can assess the performance and efficiency of the training process on the selected hardware platform. This information can then be used to identify potential bottlenecks or areas for improvement. +By analyzing these metrics, you’ll gain insights into the model's training performance on your chosen hardware platform. Consider whether training time meets your expectations, if there are any bottlenecks, such as underutilized GPUs or slow data loading. This process helps identify areas for potential optimization, like improving data handling or adjusting resource allocation, and can guide future benchmarking decisions. ### Inference Benchmarks @@ -397,26 +369,6 @@ Finally, it is vital to ensure that the model's predictions are not only accurat 6. **Memory Usage:** Memory usage quantifies the volume of RAM needed by a machine learning model to carry out inference tasks. A relevant example to illustrate this would be a face recognition system based on a CNN; if such a system requires 150 MB of RAM to process and recognize faces within an image, its memory usage is 150 MB. -#### Tasks - -The challenges in picking representative tasks for benchmarking inference machine learning systems are, by and large, somewhat similar to the taxonomy we have provided for training. Nevertheless, to be pedantic, let's discuss those in the context of inference machine learning systems. - -1. **Diversity of Applications:** Inference machine learning is employed across numerous domains such as healthcare, finance, entertainment, security, and more. Each domain has unique tasks, and what's representative in one domain might not be in another. For example, an inference task for predicting stock prices in the financial domain might differ from image recognition tasks in the medical domain. - -2. **Variability in Data Types:** Different inference tasks require different types of data—text, images, videos, numerical data, etc. Ensuring that benchmarks address the wide variety of data types used in real-world applications is challenging. For example, voice recognition systems process audio data, which is vastly different from the visual data processed by facial recognition systems. - -3. **Task Complexity:** The complexity of inference tasks can differ immensely, from basic classification tasks to intricate tasks requiring state-of-the-art models. For example, differentiating between two categories (binary classification) is typically simpler than detecting hundreds of object types in a crowded scene. - -4. **Real-time Requirements:** Some applications demand immediate or real-time responses, while others may allow for some delay. In autonomous driving, real-time object detection and decision-making are paramount, whereas a recommendation engine for a shopping website might tolerate slight delays. - -5. **Scalability Concerns:** Given the varied scale of applications, from edge devices to cloud-based servers, tasks must represent the diverse computational environments where inference occurs. For example, an inference task running on a smartphone's limited resources differs from a powerful cloud server. - -6. **Evaluation Metrics Diversity:** The metrics used to evaluate performance can differ significantly depending on the task. Finding a common ground or universally accepted metric for diverse tasks is challenging. For example, precision and recall might be vital for a medical diagnosis task, whereas throughput (inferences per second) might be more crucial for video processing tasks. - -7. 
**Ethical and Privacy Concerns:** Concerns related to ethics and privacy exist, especially in sensitive areas like facial recognition or personal data processing. These concerns can impact the selection and nature of tasks used for benchmarking. For example, using real-world facial data for benchmarking can raise privacy issues, whereas synthetic data might not replicate real-world challenges. - -8. **Hardware Diversity:** With a wide range of devices from GPUs, CPUs, and TPUs to custom ASICs used for inference, ensuring that tasks are representative across varied hardware is challenging. For example, a task optimized for inference on a GPU might perform sub-optimally on an edge device. - #### Benchmarks Here are some original works that laid the fundamental groundwork for developing systematic benchmarks for inference machine learning systems. @@ -456,20 +408,19 @@ Metrics: #### Example Use Case -Consider a scenario where we want to evaluate the inference performance of an object detection model on a specific edge device. - -Task: The task is to perform real-time object detection on video streams, detecting and identifying objects such as vehicles, pedestrians, and traffic signs. +Suppose you were tasked with evaluating the inference performance of an object detection model on a specific edge device. Here’s how you might approach structuring this benchmark: -Benchmark: We can use the AI Benchmark for this task as it evaluates inference performance on edge devices, which suits our scenario. +1. **Define the Task**: In this case, the task is real-time object detection on video streams, identifying objects such as vehicles, pedestrians, and traffic signs. -Metrics: We will measure the following metrics: +2. **Select the Benchmark**: To align with your goal of evaluating inference on an edge device, the AI Benchmark is a suitable choice. It provides a standardized framework specifically for assessing inference performance on edge hardware, making it relevant to this scenario. -* Inference time to process each video frame -* Latency to generate the bounding boxes for detected objects -* Energy consumption during the inference process -* Throughput in terms of video frames processed per second +3. **Identify Key Metrics**: Now, determine the metrics that will help evaluate the model’s inference performance. For this example, you might track: + - **Inference Time**: How long does it take to process each video frame? + - **Latency**: What is the delay in generating bounding boxes for detected objects? + - **Energy Consumption**: How much power is used during inference? + - **Throughput**: How many video frames are processed per second? -By measuring these metrics, we can assess the performance of the object detection model on the edge device and identify any potential bottlenecks or areas for optimization to improve real-time processing capabilities. +By measuring these metrics, you’ll gain insights into how well the object detection model performs on the edge device. This can help identify any bottlenecks, such as slow frame processing or high energy consumption, and highlight areas for potential optimization to improve real-time performance. :::{#exr-perf .callout-caution collapse="true"} @@ -481,6 +432,19 @@ Get ready to put your AI models to the ultimate test! MLPerf is like the Olympic ::: + +### Benchmark Task Selection + +Selecting representative tasks for benchmarking machine learning systems is complex due to the varied applications, data types, and requirements across different domains. 
Machine learning is applied in fields such as healthcare, finance, natural language processing, and computer vision, each with unique tasks that may not be relevant or comparable to others. Key challenges in task selection include: + +1. **Diversity of Applications and Data Types:** Tasks across domains involve different data types (e.g., text, images, video) and qualities, making it difficult to find benchmarks that universally represent ML challenges. +2. **Task Complexity and Resource Needs:** Tasks vary in complexity and resource demands, with some requiring substantial computational power and sophisticated models, while others can be addressed with simpler resources and methods. +3. **Privacy Concerns:** Tasks involving sensitive data, such as medical records or personal information, introduce ethical and privacy issues, making them unsuitable for general benchmarks. +4. **Evaluation Metrics:** Performance metrics vary widely across tasks, and results from one task often do not generalize to others, complicating comparisons and limiting insights from one benchmarked task to another. + +Addressing these challenges is essential to designing meaningful benchmarks that are relevant across the diverse tasks encountered in machine learning, ensuring benchmarks provide useful, generalizable insights for both training and inference. + + ### Measuring Energy Efficiency As machine learning capabilities expand, both in training and inference, concerns about increased power consumption and its ecological footprint have intensified. Addressing the sustainability of ML systems, a topic explored in more depth in the [Sustainable AI](../sustainable_ai/sustainable_ai.qmd) chapter, has thus become a key priority. This focus on sustainability has led to the development of standardized benchmarks designed to accurately measure energy efficiency. However, standardizing these methodologies poses challenges due to the need to accommodate vastly different scales—from the microwatt consumption of TinyML devices to the megawatt demands of data center training systems. Moreover, ensuring that benchmarking is fair and reproducible requires accommodating the diverse range of hardware configurations and architectures in use today. @@ -553,7 +517,7 @@ Hardware lottery occurs when a machine learning model unintentionally performs e In contrast to the accidental hardware lottery, benchmark engineering involves deliberately optimizing or designing a machine learning model to perform exceptionally well on specific hardware, often to win benchmarks or competitions. This intentional optimization might include tweaking the model's architecture, algorithms, or parameters to exploit the hardware's features and capabilities fully. -#### Problem +##### Problem Benchmark engineering refers to tweaking or modifying an AI system to optimize performance on specific benchmark tests, often at the expense of generalizability or real-world performance. This can include adjusting hyperparameters, training data, or other aspects of the system specifically to achieve high scores on benchmark metrics without necessarily improving the overall functionality or utility of the system. @@ -563,7 +527,7 @@ It can lead to several risks and challenges. One of the primary risks is that th The AI community must prioritize transparency and accountability to mitigate the risks associated with benchmark engineering. 
This can include disclosing any optimizations or adjustments made specifically for benchmark tests and providing more comprehensive evaluations of AI systems that include real-world performance metrics and benchmark scores. Researchers and developers must prioritize holistic improvements to AI systems that improve their generalizability and functionality across various applications rather than focusing solely on benchmark-specific optimizations. -#### Issues +##### Issues One of the primary problems with benchmark engineering is that it can compromise the real-world performance of AI systems. When developers focus on optimizing their systems to achieve high scores on specific benchmark tests, they may neglect other important system performance aspects crucial in real-world applications. For example, an AI system designed for image recognition might be engineered to perform exceptionally well on a benchmark test that includes a specific set of images but needs help to recognize images slightly different from those in the test set accurately. @@ -571,15 +535,15 @@ Another area for improvement with benchmark engineering is that it can result in It can also lead to misleading results. When AI systems are engineered to perform well on benchmark tests, the results may not accurately reflect the system's true capabilities. This can be problematic for users or investors who rely on benchmark scores to make informed decisions about which AI systems to use or invest in. For example, an AI system engineered to achieve high scores on a benchmark test for speech recognition might need to be more capable of accurately recognizing speech in real-world situations, leading users or investors to make decisions based on inaccurate information. -#### Mitigation +##### Mitigation There are several ways to mitigate benchmark engineering. Transparency in the benchmarking process is crucial to maintaining benchmark accuracy and reliability. This involves clearly disclosing the methodologies, data sets, and evaluation criteria used in benchmark tests, as well as any optimizations or adjustments made to the AI system for the purpose of the benchmark. One way to achieve transparency is through the use of open-source benchmarks. Open-source benchmarks are made publicly available, allowing researchers, developers, and other stakeholders to review, critique, and contribute to them, thereby ensuring their accuracy and reliability. This collaborative approach also facilitates sharing best practices and developing more robust and comprehensive benchmarks. -One example is the MLPerf Tiny. It's an open-source framework designed to make it easy to compare different solutions in the world of TinyML. Its modular design allows components to be swapped out for comparison or improvement. The reference implementations, shown in green and orange in @fig-ml-perf, act as the baseline for results. TinyML often needs optimization across the entire system, and users can contribute by focusing on specific parts, like quantization. The modular benchmark design allows users to showcase their contributions and competitive advantage by modifying a reference implementation. In short, MLPerf Tiny offers a flexible and modular way to assess and improve TinyML applications, making it easier to compare and improve different aspects of the technology. +The modular design of MLPerf Tiny connects to the problem of benchmark engineering by providing a structured yet flexible approach that encourages a balanced evaluation of TinyML. 
In benchmark engineering, systems may be overly optimized for specific benchmarks, leading to inflated performance scores that don’t necessarily translate to real-world effectiveness. MLPerf Tiny’s modular design aims to address this issue by allowing contributors to swap out and test specific components within a standardized framework, such as hardware, quantization techniques, or inference models. The reference implementations, highlighted in green and orange in @fig-ml-perf, provide a baseline for results, enabling flexible yet controlled testing by specifying which components can be modified. This structure supports transparency and flexibility, enabling a focus on genuine improvements rather than benchmark-specific optimizations. -![MLPerf Tiny modular design. Source: @mattson2020mlperf.](images/png/mlperf_tiny.png){#fig-ml-perf} +![Modular design of the MLPerf Tiny benchmark, showing the reference implementation with modifiable components. This modular approach enables flexible, targeted testing while maintaining a standardized baseline. Source: @banbury2021mlperf.](images/png/mlperf_tiny.png){#fig-ml-perf} Another method for achieving transparency is through peer review of benchmarks. This involves having independent experts review and validate the benchmark's methodology, data sets, and results to ensure their credibility and reliability. Peer review can provide a valuable means of verifying the accuracy of benchmark tests and help build confidence in the results. @@ -639,59 +603,45 @@ Machine learning model evaluation has evolved from a narrow focus on accuracy to #### Accuracy -Accuracy is one of the most intuitive and commonly used metrics for evaluating machine learning models. At its core, accuracy measures the proportion of correct predictions made by the model out of all predictions. For example, imagine we have developed a machine learning model to classify images as either containing a cat or not. If we test this model on a dataset of 100 images, and it correctly identifies 90 of them, we would calculate its accuracy as 90%. - -In the initial stages of machine learning, accuracy was often the primary, if not the only, metric considered when evaluating model performance. This is understandable, given its straightforward nature and ease of interpretation. However, as the field has progressed, the limitations of relying solely on accuracy have become more apparent. - -Consider the example of a medical diagnosis model with an accuracy of 95%. While at first glance this may seem impressive, we must look deeper to assess the model's performance fully. Suppose the model fails to accurately diagnose severe conditions that, while rare, can have severe consequences; its high accuracy may not be as meaningful. A pertinent example of this is [Google's retinopathy machine learning model](https://about.google/intl/ALL_us/stories/seeingpotential/), which was designed to diagnose diabetic retinopathy and diabetic macular edema from retinal photographs. - -The Google model demonstrated impressive accuracy levels in lab settings. Still, when deployed in real-world clinical environments in Thailand, [it faced significant challenges](https://www.technologyreview.com/2020/04/27/1000658/google-medical-ai-accurate-lab-real-life-clinic-covid-diabetes-retina-disease/). In the real-world setting, the model encountered diverse patient populations, varying image quality, and a range of different medical conditions that it had not been exposed to during its training. 
Consequently, its performance could have been better, and it struggled to maintain the same accuracy levels observed in lab settings. This example serves as a clear reminder that while high accuracy is an important and desirable attribute for a medical diagnosis model, it must be evaluated in conjunction with other factors, such as the model's ability to generalize to different populations and handle diverse and unpredictable real-world conditions, to understand its value and potential impact on patient care truly. +Accuracy is one of the most intuitive and commonly used metrics for evaluating machine learning models. In the early stages of machine learning, accuracy was often the primary, if not the only, metric considered when evaluating model performance. However, as the field has evolved, it’s become clear that relying solely on accuracy can be misleading, especially in applications where certain types of errors carry significant consequences. -Similarly, if the model performs well on average but exhibits significant disparities in performance across different demographic groups, this, too, would be cause for concern. +Consider the example of a medical diagnosis model with an accuracy of 95%. While at first glance this may seem impressive, we must look deeper to assess the model's performance fully. Suppose the model fails to accurately diagnose severe conditions that, while rare, can have severe consequences; its high accuracy may not be as meaningful. A well-known example of this limitation is [Google’s diabetic retinopathy model](https://about.google/intl/ALL_us/stories/seeingpotential/). While it achieved high accuracy in lab settings, it encountered challenges when deployed in real-world clinics in Thailand, where variations in patient populations, image quality, and environmental factors reduced its effectiveness. This example illustrates that even models with high accuracy need to be tested for their ability to generalize across diverse, unpredictable conditions to ensure reliability and impact in real-world settings. -The evolution of machine learning has thus seen a shift towards a more holistic approach to model evaluation, taking into account not just accuracy, but also other crucial factors such as fairness, transparency, and real-world applicability. A prime example is the [Gender Shades project](http://gendershades.org/) at MIT Media Lab, led by Joy Buolamwini, highlighting significant racial and gender biases in commercial facial recognition systems. The project evaluated the performance of three facial recognition technologies developed by IBM, Microsoft, and Face++. It found that they all exhibited biases, performing better on lighter-skinned and male faces compared to darker-skinned and female faces. +Similarly, if the model performs well on average but exhibits significant disparities in performance across different demographic groups, this, too, would be cause for concern. The evolution of machine learning has thus seen a shift towards a more holistic approach to model evaluation, taking into account not just accuracy, but also other crucial factors such as fairness, transparency, and real-world applicability. A prime example is the [Gender Shades project](http://gendershades.org/) at MIT Media Lab, led by Joy Buolamwini, highlighting biases by performing better on lighter-skinned and male faces compared to darker-skinned and female faces. 
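One simple way to surface such disparities is to report accuracy per demographic group rather than a single aggregate number, as in the sketch below. The `groups` argument is an assumed list of group labels (for example, skin type or gender) aligned with the predictions and ground-truth labels; it is a minimal illustration, not a full fairness audit.

```python
from collections import defaultdict

def accuracy_by_group(predictions, labels, groups):
    """Compute overall and per-group accuracy to surface performance gaps."""
    correct = defaultdict(int)
    total = defaultdict(int)
    for pred, label, group in zip(predictions, labels, groups):
        correct[group] += int(pred == label)
        total[group] += 1
    per_group = {g: correct[g] / total[g] for g in total}
    overall = sum(correct.values()) / sum(total.values())
    # A large spread between the best and worst per-group accuracy signals
    # a disparity that the overall number hides.
    return overall, per_group
```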
-While accuracy remains a fundamental and valuable metric for evaluating machine learning models, a more comprehensive approach is required to fully assess a model's performance. This means considering additional metrics that account for fairness, transparency, and real-world applicability, as well as conducting rigorous testing across diverse datasets to uncover and mitigate any potential biases. The move towards a more holistic approach to model evaluation reflects the maturation of the field and its increasing recognition of the real-world implications and ethical considerations associated with deploying machine learning models.
+While accuracy remains essential for evaluating machine learning models, a comprehensive approach is needed to fully assess performance. This includes additional metrics for fairness, transparency, and real-world applicability, along with rigorous testing across diverse datasets to identify and address biases. This holistic evaluation approach reflects the field's growing awareness of real-world implications in deploying models.

#### Fairness

-Fairness in machine learning models is a multifaceted and critical aspect that requires careful attention, particularly in high-stakes applications that significantly affect people's lives, such as in loan approval processes, hiring, and criminal justice. It refers to the equitable treatment of all individuals, irrespective of their demographic or social attributes such as race, gender, age, or socioeconomic status.
+Fairness in machine learning involves ensuring that models perform consistently across diverse groups, especially in high-impact applications like loan approvals, hiring, and criminal justice. Relying solely on accuracy can be misleading if the model exhibits biased outcomes across demographic groups. For example, a loan approval model with high accuracy may still consistently deny loans to certain groups, raising questions about its fairness.

-Simply relying on accuracy can be insufficient and potentially misleading when evaluating models. For instance, consider a loan approval model with a 95% accuracy rate. While this figure may appear impressive at first glance, it does not reveal how the model performs across different demographic groups. If this model consistently discriminates against a particular group, its accuracy is less commendable, and its fairness is questioned.
+Bias in models can arise directly, when sensitive attributes like race or gender influence decisions, or indirectly, when seemingly neutral features correlate with these attributes and affect outcomes. A well-known example of the latter is the COMPAS tool used in the US criminal justice system, which showed racial biases in predicting recidivism despite not explicitly using race as a variable.

-Discrimination can manifest in various forms, such as direct discrimination, where a model explicitly uses sensitive attributes like race or gender in its decision-making process, or indirect discrimination, where seemingly neutral variables correlate with sensitive attributes, indirectly influencing the model's outcomes. 
An infamous example of the latter is the COMPAS tool used in the US criminal justice system, which exhibited racial biases in predicting recidivism rates despite not explicitly using race as a variable. +Addressing fairness requires analyzing a model’s performance across groups, identifying biases, and applying corrective measures like re-balancing datasets or using fairness-aware algorithms. Researchers and practitioners continuously develop metrics and methodologies tailored to specific use cases to evaluate fairness in real-world scenarios. For example, disparate impact analysis, demographic parity, and equal opportunity are some of the metrics employed to assess fairness. Additionally, transparency and interpretability of models are fundamental to achieving fairness. Tools like [AI Fairness 360](https://ai-fairness-360.org/) and [Fairness Indicators](https://www.tensorflow.org/tfx/guide/fairness_indicators) help explain how a model makes decisions, allowing developers to detect and correct fairness issues in machine learning models. -Addressing fairness involves careful examination of the model's performance across diverse groups, identifying potential biases, and rectifying disparities through corrective measures such as re-balancing datasets, adjusting model parameters, and implementing fairness-aware algorithms. Researchers and practitioners continuously develop metrics and methodologies tailored to specific use cases to evaluate fairness in real-world scenarios. For example, disparate impact analysis, demographic parity, and equal opportunity are some of the metrics employed to assess fairness. - -Additionally, transparency and interpretability of models are fundamental to achieving fairness. Understanding how a model makes decisions can reveal potential biases and enable stakeholders to hold developers accountable. Open-source tools like [AI Fairness 360](https://ai-fairness-360.org/) by IBM and [Fairness Indicators](https://www.tensorflow.org/tfx/guide/fairness_indicators) by TensorFlow are being developed to facilitate fairness assessments and mitigation of biases in machine learning models. - -Ensuring fairness in machine learning models, particularly in applications that significantly impact people's lives, requires rigorous evaluation of the model's performance across diverse groups, careful identification and mitigation of biases, and implementation of transparency and interpretability measures. By comprehensively addressing fairness, we can work towards developing machine learning models that are equitable, just, and beneficial for society. +While accuracy is a valuable metric, it doesn’t always provide the full picture; assessing fairness ensures models are effective across real-world scenarios. Ensuring fairness in machine learning models, particularly in applications that significantly impact people's lives, requires rigorous evaluation of the model's performance across diverse groups, careful identification and mitigation of biases, and implementation of transparency and interpretability measures. #### Complexity ##### Parameters -In the initial stages of machine learning, model benchmarking often relied on parameter counts as a proxy for model complexity. The rationale was that more parameters typically lead to a more complex model, which should, in turn, deliver better performance. However, this approach has proven inadequate as it needs to account for the computational cost associated with processing many parameters. 
+In the initial stages of machine learning, model benchmarking often relied on parameter counts as a proxy for model complexity. The rationale was that more parameters typically lead to a more complex model, which should, in turn, deliver better performance. However, this approach overlooks the practical costs associated with processing large models. As parameter counts increase, so do the computational resources required, making such models impractical for deployment in real-world scenarios, particularly on devices with limited processing power. -For example, GPT-3, developed by OpenAI, is a language model that boasts an astounding 175 billion parameters. While it achieves state-of-the-art performance on various natural language processing tasks, its size and the computational resources required to run it make it impractical for deployment in many real-world scenarios, especially those with limited computational capabilities. +Relying on parameter counts as a proxy for model complexity also fails to consider the model's efficiency. A well-optimized model with fewer parameters can often achieve comparable or even superior performance to a larger model. For instance, MobileNets, developed by Google, is a family of models designed specifically for mobile and edge devices. They used depth-wise separable convolutions to reduce parameter counts and computational demands while still maintaining strong performance. -Relying on parameter counts as a proxy for model complexity also fails to consider the model's efficiency. If optimized for efficiency, a model with fewer parameters might be just as effective, if not more so, than a model with a higher parameter count. For instance, MobileNets, developed by Google, is a family of models designed specifically for mobile and edge devices. They use depth-wise separable convolutions to reduce the number of parameters and computational costs while still achieving competitive performance. - -In light of these limitations, the field has moved towards a more holistic approach to model benchmarking that considers parameter counts and other crucial factors such as floating-point operations per second (FLOPs), memory consumption, and latency. FLOPs, in particular, have emerged as an important metric as they provide a more accurate representation of the computational load a model imposes. This shift towards a more comprehensive approach to model benchmarking reflects a recognition of the need to balance performance with practicality, ensuring that models are effective, efficient, and deployable in real-world scenarios. +In light of these limitations, the field has moved towards a more holistic approach to model benchmarking that considers parameter counts and other crucial factors such as floating-point operations per second (FLOPs), memory consumption, and latency. This comprehensive approach balances performance with deployability, ensuring that models are not only accurate but also efficient and suitable for real-world applications. ##### FLOPS -The size of a machine learning model is an essential aspect that directly impacts its usability in practical scenarios, especially when computational resources are limited. Traditionally, the number of parameters in a model was often used as a proxy for its size, with the underlying assumption being that more parameters would translate to better performance. However, this simplistic view does not consider the computational cost of processing these parameters. 
This is where the concept of floating-point operations per second (FLOPs) comes into play, providing a more accurate representation of the computational load a model imposes. +FLOPs, or floating-point operations per second, have become a critical metric for representing a model’s computational load. Traditionally, parameter count was used as a proxy for model complexity, based on the assumption that more parameters would yield better performance. However, this approach overlooks the computational cost of processing these parameters, which can impact a model’s usability in real-world scenarios with limited resources. -FLOPs measure the number of floating-point operations a model performs to generate a prediction. A model with many FLOPs requires substantial computational resources to process the vast number of operations, which may render it impractical for certain applications. Conversely, a model with a lower FLOP count is more lightweight and can be easily deployed in scenarios where computational resources are limited. @fig-flops, from [@bianco2018benchmark], shows the relationship between Top-1 Accuracy on ImageNet (_y_-axis), the model's G-FLOPs (_x_-axis), and the model's parameter count (circle-size). +FLOPs measure the number of floating-point operations a model performs to generate a prediction. A model with many FLOPs requires substantial computational resources to process the vast number of operations, which may render it impractical for certain applications. Conversely, a model with a lower FLOP count is more lightweight and can be easily deployed in scenarios where computational resources are limited. @fig-flops, from [@bianco2018benchmark], illustrates the trade-off between ImageNet accuracy, FLOPs, and parameter count, showing that some architectures achieve higher efficiency than others. ![A graph that depicts the top-1 imagenet accuracy vs. the FLOP count of a model along with the model's parameter count. The figure shows a overall tradeoff between model complexity and accuracy, although some model architectures are more efficiency than others. Source: @bianco2018benchmark.](images/png/model_FLOPS_VS_TOP_1.png){#fig-flops} -Let's consider an example. BERT---Bidirectional Encoder Representations from Transformers [@devlin2018bert]---is a popular natural language processing model, has over 340 million parameters, making it a large model with high accuracy and impressive performance across various tasks. However, the sheer size of BERT, coupled with its high FLOP count, makes it a computationally intensive model that may not be suitable for real-time applications or deployment on edge devices with limited computational capabilities. - -In light of this, there has been a growing interest in developing smaller models that can achieve similar performance levels as their larger counterparts while being more efficient in computational load. DistilBERT, for instance, is a smaller version of BERT that retains 97% of its performance while being 40% smaller in terms of parameter count. The size reduction also translates to a lower FLOP count, making DistilBERT a more practical choice for resource-constrained scenarios. +Let's consider an example. BERT---Bidirectional Encoder Representations from Transformers [@devlin2018bert]---is a popular natural language processing model, has over 340 million parameters, making it a large model with high accuracy and impressive performance across various tasks. 
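To see why parameter count and FLOPs are related but distinct quantities, the sketch below counts trainable parameters for a PyTorch model and makes a deliberately rough FLOP estimate that covers only its dense layers; real measurements would use a profiler or a dedicated FLOP-counting tool.

```python
import torch.nn as nn

def count_parameters(model: nn.Module) -> int:
    """Total number of trainable parameters in the model."""
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

def estimate_linear_flops(model: nn.Module, seq_len: int = 1) -> int:
    """Very rough FLOP estimate per forward pass.

    Counts 2 * in_features * out_features per token for each Linear layer
    (one multiply and one add per weight), ignoring attention, activations,
    and normalization, so it understates the true cost.
    """
    flops = 0
    for module in model.modules():
        if isinstance(module, nn.Linear):
            flops += 2 * module.in_features * module.out_features * seq_len
    return flops
```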
However, the sheer size of BERT, coupled with its high FLOP count, makes it a computationally intensive model that may not be suitable for real-time applications or deployment on edge devices with limited computational capabilities. In light of this, there has been a growing interest in developing smaller models that can achieve similar performance levels as their larger counterparts while being more efficient in computational load. DistilBERT, for instance, is a smaller version of BERT that retains 97% of its performance while being 40% smaller in terms of parameter count. The size reduction also translates to a lower FLOP count, making DistilBERT a more practical choice for resource-constrained scenarios. -In summary, while parameter count provides a useful indication of model size, it is not a comprehensive metric as it needs to consider the computational cost associated with processing these parameters. FLOPs, on the other hand, offer a more accurate representation of a model's computational load and are thus an essential consideration when deploying machine learning models in real-world scenarios, particularly when computational resources are limited. The evolution from relying solely on parameter count to considering FLOPs signifies a maturation in the field, reflecting a greater awareness of the practical constraints and challenges of deploying machine learning models in diverse settings. +While parameter count indicates model size, it does not fully capture the computational cost. FLOPs provide a more accurate measure of computational load, highlighting the practical trade-offs in model deployment. This shift from parameter count to FLOPs reflects the field’s growing awareness of deployment challenges in diverse settings. ##### Efficiency @@ -818,7 +768,7 @@ Benchmarking the triad of system, model, and data in an integrated fashion will @fig-benchmarking-trifecta illustrates the many potential ways to interplay data benchmarking, model benchmarking, and system infrastructure benchmarking together. Exploring these intricate interactions is likely to uncover new optimization opportunities and enhancement capabilities. The data, model, and system benchmark triad offers a rich space for co-design and co-optimization. -![Benchmarking trifecta.](images/png/trifecta.png){#fig-benchmarking-trifecta} +![Benchmarking trifecta.](images/png/benchmarking_trifecta.png){#fig-benchmarking-trifecta} While this integrated perspective represents an emerging trend, the field has much more to discover about the synergies and trade-offs between these components. As we iteratively benchmark combinations of data, models, and systems, new insights that remain hidden when these elements are studied in isolation will emerge. This multifaceted benchmarking approach charting the intersections of data, algorithms, and hardware promises to be a fruitful avenue for major progress in AI, even though it is still in its early stages. 
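As a first step in that direction, even a simple harness can separate where time goes across the triad, as sketched below. It assumes an already-trained model that accepts each batch directly and a caller-supplied `evaluate_fn` for model quality; splitting data-loading time from model compute time is what makes the data, model, and system contributions comparable.

```python
import time

def trifecta_benchmark(data_loader, model, evaluate_fn):
    """Jointly time the data pipeline, the model, and the whole system.

    Sketch only: `data_loader` yields batches the model can consume
    directly, and `evaluate_fn(model)` returns a quality metric such as
    validation accuracy.
    """
    data_time, compute_time, n_batches = 0.0, 0.0, 0
    t0 = time.perf_counter()
    fetch_start = time.perf_counter()
    for batch in data_loader:
        data_time += time.perf_counter() - fetch_start   # data pipeline cost
        compute_start = time.perf_counter()
        model(batch)                                      # model-only cost
        compute_time += time.perf_counter() - compute_start
        n_batches += 1
        fetch_start = time.perf_counter()
    total_time = time.perf_counter() - t0
    n_batches = max(n_batches, 1)
    return {
        "data_s_per_batch": data_time / n_batches,
        "model_s_per_batch": compute_time / n_batches,
        "end_to_end_s": total_time,                       # system-level view
        "model_quality": evaluate_fn(model),
    }
```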
diff --git a/contents/core/benchmarking/images/png/benchmarking_trifecta.png b/contents/core/benchmarking/images/png/benchmarking_trifecta.png new file mode 100644 index 00000000..3b2b9e56 Binary files /dev/null and b/contents/core/benchmarking/images/png/benchmarking_trifecta.png differ diff --git a/contents/core/frameworks/frameworks.qmd b/contents/core/frameworks/frameworks.qmd index d35bf4d8..908d1b3f 100644 --- a/contents/core/frameworks/frameworks.qmd +++ b/contents/core/frameworks/frameworks.qmd @@ -397,7 +397,7 @@ Recently, the distinction has blurred as frameworks adopt both modes. TensorFlow Computational graphs can only be as good as the data they learn from and work on. Therefore, feeding training data efficiently is crucial for optimizing deep neural network performance, though it is often overlooked as one of the core functionalities. Many modern AI frameworks provide specialized pipelines to ingest, process, and augment datasets for model training. -#### Data Loaders +#### Data Loaders {#sec-frameworks-data-loaders} At the core of these pipelines are data loaders, which handle reading training examples from sources like files, databases, and object storage. Data loaders facilitate efficient data loading and preprocessing, crucial for deep learning models. For instance, TensorFlow's [tf.data](https://www.tensorflow.org/guide/data) dataloading pipeline is designed to manage this process. Depending on the application, deep learning models require diverse data formats such as CSV files or image folders. Some popular formats include: