From 11d54234b6a7b0931f9a9ca01933f57d088bbf15 Mon Sep 17 00:00:00 2001 From: Vijay Janapa Reddi Date: Tue, 24 Oct 2023 17:22:04 -0400 Subject: [PATCH] Fixed missing references, links, bibtex etc. --- frameworks.qmd | 329 +++++++++++++++++++++++-------------------------- references.bib | 167 +++++++++++++++++++++++++ 2 files changed, 324 insertions(+), 172 deletions(-) diff --git a/frameworks.qmd b/frameworks.qmd index 1c64f52bd..2de452761 100644 --- a/frameworks.qmd +++ b/frameworks.qmd @@ -1,25 +1,28 @@ # AI Frameworks -Learning Objectives +::: {.callout-tip collapse="true"} +## Learning Objectives -- The evolution, core components, and advanced features of ML frameworks +* The evolution, core components, and advanced features of ML frameworks -- How frameworks specialize for cloud, edge, and tinyML environments +* How frameworks specialize for cloud, edge, and tinyML environments -- Challenges of embedded ML and how frameworks optimize models +* Challenges of embedded ML and how frameworks optimize models -- Criteria for selecting the right framework based on models, hardware, software factors +* Criteria for selecting the right framework based on models, hardware, software factors -- How to match framework capabilities to the constraints and requirements of a project +* How to match framework capabilities to the constraints and requirements of a project -- Ongoing innovations in frameworks for next-generation machine learning +* Ongoing innovations in frameworks for next-generation machine learning + +::: ## Introduction Machine learning frameworks provide the tools and infrastructure to efficiently build, train, and deploy machine learning models. 
In this chapter, we will explore the evolution and key capabilities of major -frameworks like [[TensorFlow (TF)]{.underline}](https://www.tensorflow.org/), [[PyTorch]{.underline}](https://pytorch.org/), and specialized frameworks for +frameworks like [TensorFlow (TF)](https://www.tensorflow.org/), [PyTorch](https://pytorch.org/), and specialized frameworks for embedded devices. We will dive into the components like computational graphs, optimization algorithms, hardware acceleration, and more that enable developers to quickly construct performant models. Understanding @@ -42,15 +45,15 @@ specialized hardware like NVIDIA GPUs to further accelerate training via optimizations like parallelization and efficient matrix operations. In addition, frameworks simplify deploying finished models into -production through tools like [[TensorFlow Serving]{.underline}](https://www.tensorflow.org/tfx/guide/serving) for scalable model -serving and [[TensorFlow Lite]{.underline}](https://www.tensorflow.org/lite) for optimization on mobile and edge devices. +production through tools like [TensorFlow Serving](https://www.tensorflow.org/tfx/guide/serving) for scalable model +serving and [TensorFlow Lite](https://www.tensorflow.org/lite) for optimization on mobile and edge devices. Other valuable capabilities include visualization, model optimization techniques like quantization and pruning, and monitoring metrics during training. -Leading open source frameworks like TensorFlow, PyTorch, and [[MXNet]{.underline}](https://mxnet.apache.org/versions/1.9.1/) power +Leading open source frameworks like TensorFlow, PyTorch, and [MXNet](https://mxnet.apache.org/versions/1.9.1/) power much of AI research and development today. 
Commercial offerings like -[[Amazon SageMaker]{.underline}](https://aws.amazon.com/pm/sagemaker/?trk=b6c2fafb-22b1-4a97-a2f7-7e4ab2c7aa28&sc_channel=ps&ef_id=CjwKCAjws9ipBhB1EiwAccEi1JpbBz6j4t7sRUoAiKFDc0mi59faZYge5MuFecAU6zGDQYTFz9NnaBoCV-wQAvD_BwE:G:s&s_kwcid=AL!4422!3!651751060692!e!!g!!amazon%20sagemaker!19852662230!145019225977) and [[Microsoft Azure Machine Learning]{.underline}](https://azure.microsoft.com/en-us/free/machine-learning/search/?ef_id=_k_CjwKCAjws9ipBhB1EiwAccEi1JVOThls797Sj3Li96_GYjoJQDx_EWaXNsDaEWeFbIaRkESUCkq64xoCSmwQAvD_BwE_k_&OCID=AIDcmm5edswduu_SEM__k_CjwKCAjws9ipBhB1EiwAccEi1JVOThls797Sj3Li96_GYjoJQDx_EWaXNsDaEWeFbIaRkESUCkq64xoCSmwQAvD_BwE_k_&gad=1&gclid=CjwKCAjws9ipBhB1EiwAccEi1JVOThls797Sj3Li96_GYjoJQDx_EWaXNsDaEWeFbIaRkESUCkq64xoCSmwQAvD_BwE) integrate these +[Amazon SageMaker](https://aws.amazon.com/pm/sagemaker/?trk=b6c2fafb-22b1-4a97-a2f7-7e4ab2c7aa28&sc_channel=ps&ef_id=CjwKCAjws9ipBhB1EiwAccEi1JpbBz6j4t7sRUoAiKFDc0mi59faZYge5MuFecAU6zGDQYTFz9NnaBoCV-wQAvD_BwE:G:s&s_kwcid=AL!4422!3!651751060692!e!!g!!amazon%20sagemaker!19852662230!145019225977) and [Microsoft Azure Machine Learning](https://azure.microsoft.com/en-us/free/machine-learning/search/?ef_id=_k_CjwKCAjws9ipBhB1EiwAccEi1JVOThls797Sj3Li96_GYjoJQDx_EWaXNsDaEWeFbIaRkESUCkq64xoCSmwQAvD_BwE_k_&OCID=AIDcmm5edswduu_SEM__k_CjwKCAjws9ipBhB1EiwAccEi1JVOThls797Sj3Li96_GYjoJQDx_EWaXNsDaEWeFbIaRkESUCkq64xoCSmwQAvD_BwE_k_&gad=1&gclid=CjwKCAjws9ipBhB1EiwAccEi1JVOThls797Sj3Li96_GYjoJQDx_EWaXNsDaEWeFbIaRkESUCkq64xoCSmwQAvD_BwE) integrate these open source frameworks with proprietary capabilities and enterprise tools. @@ -77,19 +80,18 @@ meet the expanding needs of practitioners and rapid advances in deep learning techniques. Early neural network research was constrained by insufficient data and compute power. Building and training machine learning models required extensive low-level coding and infrastructure. 
-But the release of large datasets like [[ImageNet]{.underline}](https://www.image-net.org/) in 2009 and advancements +But the release of large datasets like [ImageNet](https://www.image-net.org/) [@deng2009imagenet] and advancements in parallel GPU computing unlocked the potential for far deeper neural networks. -The first ML frameworks, [[Theano]{.underline}](https://pypi.org/project/Theano/#:~:text=Theano%20is%20a%20Python%20library,a%20similar%20interface%20to%20NumPy's.) (2007) and [[Caffe]{.underline}](https://caffe.berkeleyvision.org/) (2014), were developed +The first ML frameworks, [Theano](https://pypi.org/project/Theano/#:~:text=Theano%20is%20a%20Python%20library,a%20similar%20interface%20to%20NumPy's.) by @al2016theano and [Caffe](https://caffe.berkeleyvision.org/) by @jia2014caffe, were developed by academic institutions (Montreal Institute for Learning Algorithms, Berkeley Vision and Learning Center). Amid a growing interest in deep -learning due to state-of-the-art performance of AlexNet (2012) on the +learning due to state-of-the-art performance of AlexNet [@krizhevsky2012imagenet] on the ImageNet dataset, private companies and individuals began developing ML -frameworks, resulting in frameworks such as [[Keras]{.underline}](https://keras.io/) by Google researcher -François Chollet (2015), [[Chainer]{.underline}](https://chainer.org/) by Preferred Networks (2015), -TensorFlow by Google (2015), [[CNTK]{.underline}](https://learn.microsoft.com/en-us/cognitive-toolkit/) by Microsoft (2016), and PyTorch by -Facebook (2016). +frameworks, resulting in frameworks such as [Keras](https://keras.io/) by @chollet2018keras, [Chainer](https://chainer.org/) by @tokui2015chainer, +TensorFlow from Google [@abadi2016tensorflow], [CNTK](https://learn.microsoft.com/en-us/cognitive-toolkit/) by Microsoft [@seide2016cntk], and PyTorch by +Facebook [@paszke2019pytorch]. Many of these ML frameworks can be divided into categories, namely high-level vs. 
low-level frameworks and static vs. dynamic computational @@ -118,9 +120,9 @@ on in the AI Training section. The development of these frameworks facilitated an explosion in model size and complexity over time---from early multilayer perceptrons and convolutional networks to modern transformers with billions or trillions -of parameters. In 2017, ResNet models achieved record ImageNet accuracy +of parameters. In 2016, ResNet models by @he2016deep achieved record ImageNet accuracy with over 150 layers and 25 million parameters. Then in 2020, the GPT-3 -language model pushed parameters to an astonishing 175 billion using +language model from OpenAI [@brown2020language] pushed parameters to an astonishing 175 billion using model parallelism in frameworks to train across thousands of GPUs and TPUs. @@ -184,31 +186,31 @@ package. ### TF Ecosystem -1. [[TensorFlow Core]{.underline}](https://www.tensorflow.org/tutorials): primary package that most developers engage with. It provides a comprehensive, flexible platform for defining, training, and deploying machine learning models. It includes tf.keras as its high-level API. +1. [TensorFlow Core](https://www.tensorflow.org/tutorials): primary package that most developers engage with. It provides a comprehensive, flexible platform for defining, training, and deploying machine learning models. It includes tf.keras as its high-level API. -2. [[TensorFlow Lite]{.underline}](https://www.tensorflow.org/lite): designed for deploying lightweight models on mobile, embedded, and edge devices. It offers tools to convert TensorFlow models to a more compact format suitable for limited-resource devices and provides optimized pre-trained models for mobile. +2. [TensorFlow Lite](https://www.tensorflow.org/lite): designed for deploying lightweight models on mobile, embedded, and edge devices. 
It offers tools to convert TensorFlow models to a more compact format suitable for limited-resource devices and provides optimized pre-trained models for mobile. -3. [[TensorFlow.js]{.underline}](https://www.tensorflow.org/js): JavaScript library that allows training and deployment of machine learning models directly in the browser or on Node.js. It also provides tools for porting pre-trained TensorFlow models to the browser-friendly format. +3. [TensorFlow.js](https://www.tensorflow.org/js): JavaScript library that allows training and deployment of machine learning models directly in the browser or on Node.js. It also provides tools for porting pre-trained TensorFlow models to the browser-friendly format. -4. [[TensorFlow on Edge Devices (Coral)]{.underline}](https://developers.googleblog.com/2019/03/introducing-coral-our-platform-for.html): platform of hardware components and software tools from Google that allows the execution of TensorFlow models on edge devices, leveraging Edge TPUs for acceleration. +4. [TensorFlow on Edge Devices (Coral)](https://developers.googleblog.com/2019/03/introducing-coral-our-platform-for.html): platform of hardware components and software tools from Google that allows the execution of TensorFlow models on edge devices, leveraging Edge TPUs for acceleration. -5. [[TensorFlow Federated (TFF)]{.underline}](https://www.tensorflow.org/federated): framework for machine learning and other computations on decentralized data. TFF facilitates federated learning, allowing model training across many devices without centralizing the data. +5. [TensorFlow Federated (TFF)](https://www.tensorflow.org/federated): framework for machine learning and other computations on decentralized data. TFF facilitates federated learning, allowing model training across many devices without centralizing the data. -6. 
[[TensorFlow Graphics]{.underline}](https://www.tensorflow.org/graphics): library for using TensorFlow to carry out graphics-related tasks, including 3D shapes and point clouds processing, using deep learning. +6. [TensorFlow Graphics](https://www.tensorflow.org/graphics): library for using TensorFlow to carry out graphics-related tasks, including 3D shapes and point clouds processing, using deep learning. -7. [[TensorFlow Hub]{.underline}](https://www.tensorflow.org/hub): repository of reusable machine learning model components to allow developers to reuse pre-trained model components, facilitating transfer learning and model composition +7. [TensorFlow Hub](https://www.tensorflow.org/hub): repository of reusable machine learning model components to allow developers to reuse pre-trained model components, facilitating transfer learning and model composition -8. [[TensorFlow Serving]{.underline}](https://www.tensorflow.org/tfx/guide/serving): framework designed for serving and deploying machine learning models for inference in production environments. It provides tools for versioning and dynamically updating deployed models without service interruption. +8. [TensorFlow Serving](https://www.tensorflow.org/tfx/guide/serving): framework designed for serving and deploying machine learning models for inference in production environments. It provides tools for versioning and dynamically updating deployed models without service interruption. -9. [[TensorFlow Extended (TFX)]{.underline}](https://www.tensorflow.org/tfx): end-to-end platform designed to deploy and manage machine learning pipelines in production settings. TFX encompasses components for data validation, preprocessing, model training, validation, and serving. +9. [TensorFlow Extended (TFX)](https://www.tensorflow.org/tfx): end-to-end platform designed to deploy and manage machine learning pipelines in production settings. 
TFX encompasses components for data validation, preprocessing, model training, validation, and serving. -TensorFlow was developed to address the limitations of DistBelief[^2]---the +TensorFlow was developed to address the limitations of DistBelief [@abadi2016tensorflow]---the framework in use at Google from 2011 to 2015---by providing flexibility along three axes: 1) defining new layers, 2) refining training algorithms, and 3) defining new training algorithms. To understand what limitations in DistBelief led to the development of TensorFlow, we will first give a brief overview of the Parameter Server Architecture that -DistBelief employed.[^3] +DistBelief employed [@dean2012large]. The Parameter Server (PS) architecture is a popular design for distributing the training of machine learning models, especially deep @@ -225,7 +227,7 @@ maintain and manage this state across the training process. **Computation**: The worker processes, which could be run in parallel, were stateless and purely computational, processing data and computing -gradients without maintaining any state or long-term memory.[^4] +gradients without maintaining any state or long-term memory [@li2014communication]. DistBelief and its architecture defined above were crucial in enabling distributed deep learning at Google but also introduced limitations that @@ -244,7 +246,7 @@ ones, complicating the management and synchronization tasks of the parameter servers. This made it harder to implement models outside the neural framework or models that required dynamic computation graphs. -TensorFlow was designed to be a more general computation framework[^2] where +TensorFlow was designed to be a more general computation framework where the computation is expressed as a data flow graph. This allows for a wider variety of machine learning models and algorithms outside of just neural networks, and provides flexibility in refining models. 
@@ -419,9 +421,9 @@ within it representing a certain color value in the certain location of the image. Extending even further, if we wanted to store a series of images, we can simply extend the dimensions such that the new dimension (to create a 4-dimensional tensor) represents the different images that -we have. This is exactly what the famous MNIST dataset does, +we have. This is exactly what the famous [MNIST](https://www.tensorflow.org/datasets/catalog/mnist) dataset does, loading a single 4-dimensional tensor when one calls to load the -dataset, allowing a compact representation of all the data in one place. [^5] +dataset, allowing a compact representation of all the data in one place. ### Computational graphs @@ -480,7 +482,7 @@ objects. The layer abstraction makes building and training neural networks much more intuitive. This sort of abstraction enables developers to build models by stacking these layers together, without having to implement the layer logic themselves. For example, calling -tf.keras.layers.Conv2D in TensorFlow creates a convolutional layer. The +`tf.keras.layers.Conv2D` in TensorFlow creates a convolutional layer. The framework handles computing the convolutions, managing parameters, etc. This simplifies model development, allowing developers to focus on architecture rather than low-level implementations. Layer abstractions @@ -536,8 +538,7 @@ one of the core functionalities that is offered by a good ML framework: - Automatic differentiation for training -- Language agnosticism - graph can be translated to run on GPUs, TPUs, - > etc +- Language agnosticism - graph can be translated to run on GPUs, TPUs, etc. - Portability - graph can be serialized, saved, and restored later @@ -570,9 +571,10 @@ graph require re-declaring the full model. 
For example: -``x = tf.placeholder(tf.float32)`` - -``y = tf.matmul(x, weights) + biases`` +```{{python}} +x = tf.placeholder(tf.float32) +y = tf.matmul(x, weights) + biases +``` The model is defined separately from execution, like building a blueprint. For TensorFlow 1.x, this is done using tf.Graph(). All ops @@ -590,9 +592,10 @@ PyTorch uses dynamic graphs, building the graph on-the-fly as execution happens. For example, consider the following code snippet, where the graph is built as the execution is taking place: -``x = torch.randn(4,784)`` - -``y = torch.matmul(x, weights) + biases`` +```{{python}} +x = torch.randn(4,784) +y = torch.matmul(x, weights) + biases +``` In the above example, there are no separate compile/build/run phases. Ops define and execute immediately. With dynamic graphs, definition is @@ -636,8 +639,8 @@ formats are CSV: A versatile, simple format often used for tabular data. TFRecord: TensorFlow\'s proprietary format, optimized for performance. Parquet: Columnar storage, offering efficient data compression and retrieval. JPEG/PNG: Commonly used for image data. WAV/MP3: Prevalent -formats for audio data. For instance, tf.data is TensorFlows's -dataloading pipeline: https://www.tensorflow.org/guide/data +formats for audio data. For instance, `tf.data` is TensorFlow's +dataloading pipeline: [https://www.tensorflow.org/guide/data](https://www.tensorflow.org/guide/data). Data loaders batch examples to leverage vectorization support in hardware. Batching refers to grouping multiple data points for @@ -783,7 +786,7 @@ processing methods, and utilities such as checkpointing and early stopping. These resources manage the complex aspects of performance, enabling practitioners to zero in on model development and training. As a result, developers experience both speed and ease when utilizing the -capabilities of neural networks. [^6] +capabilities of neural networks. ### Validation and Analysis @@ -827,8 +830,8 @@ Visualization tools provide insight into models: - Precision-recall curves - Assess classification tradeoffs. 
-Tools like [[TensorBoard]{.underline}](https://www.tensorflow.org/tensorboard/scalars_and_keras) -for TensorFlow and [[TensorWatch]{.underline}](https://github.com/microsoft/tensorwatch)for PyTorch enable +Tools like [TensorBoard](https://www.tensorflow.org/tensorboard/scalars_and_keras) +for TensorFlow and [TensorWatch](https://github.com/microsoft/tensorwatch) for PyTorch enable real-time metrics and visualization during training. ### Differentiable programming @@ -859,7 +862,7 @@ differentiation, computational complexity of computing the gradient is proportional to computing the function itself. Intricacies of automatic differentiation are not dealt with by end users now, but resources to learn more can be found widely, such as from -[[here]{.underline}](https://www.cs.toronto.edu/~rgrosse/courses/csc321_2018/slides/lec10.pdf). +[here](https://www.cs.toronto.edu/~rgrosse/courses/csc321_2018/slides/lec10.pdf). Automatic differentiation and differentiable programming today is ubiquitous and is done efficiently and automatically by modern machine learning frameworks. @@ -871,12 +874,12 @@ models has essentially made hardware acceleration support a necessity for machine learning platforms. Deep layers of neural networks require many matrix multiplications, which attracts hardware that can compute matrix operations fast and in parallel. In this landscape, two types of -hardware architectures, the [[GPU and -TPU]{.underline}](https://cloud.google.com/tpu/docs/intro-to-tpu), have +hardware architectures, the [GPU and +TPU](https://cloud.google.com/tpu/docs/intro-to-tpu), have emerged as leading choices for training machine learning models. 
The use of hardware accelerators began with -[[AlexNet]{.underline}](https://proceedings.neurips.cc/paper_files/paper/2012/file/c399862d3b9d6b76c8436e924a68c45b-Paper.pdf), +[AlexNet](https://proceedings.neurips.cc/paper_files/paper/2012/file/c399862d3b9d6b76c8436e924a68c45b-Paper.pdf), which paved the way for future works to utilize GPUs as hardware accelerators for training computer vision models. GPUs, or Graphics Processing Units, excel in handling a large number of computations at once, making them @@ -887,15 +890,15 @@ machine learning. While they are very useful for machine learning tasks and have been implemented in many hardware platforms, GPU's are still general purpose in that they can be used for other applications. -On the other hand, [[Tensor Processing -Units]{.underline}](https://cloud.google.com/tpu/docs/intro-to-tpu) +On the other hand, [Tensor Processing +Units](https://cloud.google.com/tpu/docs/intro-to-tpu) (TPU) are hardware units designed specifically for neural networks. They focus on the multiply and accumulate (MAC) operation, and their hardware essentially consists of a large hardware matrix that contains elements -efficiently computing the MAC operation. This, called the [[systolic +efficiently computing the MAC operation. This concept, called the [systolic array -architecture]{.underline}](https://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=1653825), -was pioneered in 1979 by HT Kung and Charles E. Leiserson, but has +architecture](https://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=1653825), +was pioneered by @kung1979systolic and has proven to be a useful structure to efficiently compute matrix products and other operations within neural networks (such as convolutions). @@ -907,9 +910,9 @@ custom operations from the machine learning frameworks, and the network design must closely align to the hardware capabilities. 
Today, NVIDIA GPUs dominate training, aided by software libraries like -[[CUDA]{.underline}](https://developer.nvidia.com/cuda-toolkit), -[[cuDNN]{.underline}](https://developer.nvidia.com/cudnn), and -[[TensorRT.]{.underline}](https://developer.nvidia.com/tensorrt#:~:text=NVIDIA%20TensorRT%2DLLM%20is%20an,knowledge%20of%20C%2B%2B%20or%20CUDA.) +[CUDA](https://developer.nvidia.com/cuda-toolkit), +[cuDNN](https://developer.nvidia.com/cudnn), and +[TensorRT](https://developer.nvidia.com/tensorrt#:~:text=NVIDIA%20TensorRT%2DLLM%20is%20an,knowledge%20of%20C%2B%2B%20or%20CUDA.). Frameworks also tend to include optimizations to maximize performance on these hardware types, like pruning unimportant connections and fusing layers. Combining these techniques with hardware acceleration provides @@ -918,7 +921,7 @@ towards optimized ASICs and SoCs. Google\'s TPUs accelerate models in data centers. Apple, Qualcomm, and others now produce AI-focused mobile chips. The NVIDIA Jetson family targets autonomous robots. -## Advanced Features +## Advanced Features {#sec-ai_frameworks-advanced} ### Distributed training @@ -941,13 +944,13 @@ two mentioned here are active research areas. 
ML frameworks that support distributed learning include TensorFlow (through its -[[tf.distribute]{.underline}](https://www.tensorflow.org/api_docs/python/tf/distribute) +[tf.distribute](https://www.tensorflow.org/api_docs/python/tf/distribute) module), PyTorch (through its -[[torch.nn.DataParallel]{.underline}](https://pytorch.org/docs/stable/generated/torch.nn.DataParallel.html) +[torch.nn.DataParallel](https://pytorch.org/docs/stable/generated/torch.nn.DataParallel.html) and -[[torch.nn.DistributedDataParallel]{.underline}](https://pytorch.org/docs/stable/generated/torch.nn.parallel.DistributedDataParallel.html) +[torch.nn.DistributedDataParallel](https://pytorch.org/docs/stable/generated/torch.nn.parallel.DistributedDataParallel.html) modules), and MXNet (through its -[[gluon]{.underline}](https://mxnet.apache.org/versions/1.9.1/api/python/docs/api/gluon/index.html) +[gluon](https://mxnet.apache.org/versions/1.9.1/api/python/docs/api/gluon/index.html) API). ### Model Conversion @@ -970,9 +973,8 @@ converted to TensorFlow Lite format. TensorFlow Lite uses a compact flatbuffer representation and optimizations for fast inference on mobile hardware. -Model optimizations like quantization (see Optimizations chapter) can -further optimize models for target architectures like mobile. This -reduces precision of weights and activations to uint8 or int8 for a +Model optimizations like quantization (see [Optimizations](./optimizations.qmd) chapter) can further optimize models for target architectures like mobile. This +reduces precision of weights and activations to `uint8` or `int8` for a smaller footprint and faster execution with supported hardware accelerators. For post-training quantization, TensorFlow\'s converter handles analysis and conversion automatically. @@ -987,7 +989,7 @@ learning. TensorFlow\'s conversion utilities handle these scenarios to streamline end-to-end workflows. 
More information about model conversion in TensorFlow is linked -[[here]{.underline}](https://www.tensorflow.org/lite/models/convert). +[here](https://www.tensorflow.org/lite/models/convert). ### AutoML, No-Code/Low-Code ML @@ -1028,11 +1030,13 @@ ImageNet datasets such as MobileNet and ResNet can help classify other image datasets. To do so, one may freeze the pretrained model, utilizing it as a feature extractor to train a much smaller model that is built on top of the feature extraction. One can also fine tune the entire model -to fit the new task. Transfer learning has a series of challenges, in +to fit the new task. + +Transfer learning has a series of challenges, in that the modified model may not be able to conduct its original tasks -after transfer learning. Papers such as [["Learning without -Forgetting"]{.underline}](https://browse.arxiv.org/pdf/1606.09282.pdf) -paper aims to address these challenges and have been implemented in +after transfer learning. Papers such as ["Learning without +Forgetting"](https://browse.arxiv.org/pdf/1606.09282.pdf) by @li2017learning +aim to address these challenges and have been implemented in modern machine learning platforms. #### Federated Learning @@ -1048,7 +1052,7 @@ is often not feasible, and very costly. Furthermore, there are privacy challenges associated with moving personal data, such as Photos central servers. -[[Federated learning]{.underline}](https://arxiv.org/abs/1602.05629) is +Federated learning by @mcmahan2023communicationefficient is a form of distributed computing that resolves these issues by distributing the models into personal devices for them to be trained on device. At the beginning, a base global model is trained on a central @@ -1061,19 +1065,14 @@ the set of images that are on personal devices), without the need to transfer a large amount of potentially sensitive data. However, federated learning also comes with a series of challenges. 
- In many real-world situations, data collected from devices may not come with suitable labels. This issue is compounded by the fact that users, who are often the primary source of data, can be unreliable. This unreliability means that even when data is labeled, there's no guarantee of its accuracy or relevance. Furthermore, each user's data is unique, resulting in a significant variance in the data generated by different users. This non-IID nature of data, coupled with the unbalanced data production where some users generate more data than others, can adversely impact the performance of the global model. Researchers have worked to compensate for this, such as by adding a proximal term to achieve a balance between the local and global -model, and adding a frozen [[global hypersphere -classifier]{.underline}](https://arxiv.org/abs/2207.09413). +model, and adding a frozen [global hypersphere +classifier](https://arxiv.org/abs/2207.09413). There are additional challenges associated with federated learning. The number of mobile device owners can far exceed the average number of training samples on each device, leading to substantial communication overhead. This issue is particularly pronounced in the context of mobile networks, which are often used for such communication and can be unstable. This instability can result in delayed or failed transmission of model updates, thereby affecting the overall training process. -The heterogeneity of device resources is another hurdle. Devices participating in Federated Learning can have varying computational powers and memory capacities. This diversity makes it challenging to design algorithms that are efficient across all devices. Privacy and security issues are not a guarantee for federated learning. Techniques such as inversion gradient attacks can be used to extract information about the training data from the model parameters. 
Despite these challenges, the large amount of potential benefits continue to make it a popular research area. Open source programs such as [[Flower]{.underline}](https://flower.dev/) have been developed to make it simpler to implement federated learning with a variety of machine learning frameworks. - - - - +The heterogeneity of device resources is another hurdle. Devices participating in Federated Learning can have varying computational powers and memory capacities. This diversity makes it challenging to design algorithms that are efficient across all devices. Privacy and security issues are not a guarantee for federated learning. Techniques such as inversion gradient attacks can be used to extract information about the training data from the model parameters. Despite these challenges, the large amount of potential benefits continue to make it a popular research area. Open source programs such as [Flower](https://flower.dev/) have been developed to make it simpler to implement federated learning with a variety of machine learning frameworks. ## Framework Specialization @@ -1153,7 +1152,7 @@ and tinyML environments: - TinyML fits models into extremely low memory and compute > environments like microcontrollers -## Embedded AI Frameworks +## Embedded AI Frameworks {#sec-ai_frameworks_embedded} ### Resource Constraints @@ -1163,19 +1162,19 @@ computing platforms. For example, microcontroller units (MCUs) commonly used in IoT devices often have: - **RAM** in the range of tens of kilobytes to a few megabytes. The - > popular ESP8266 MCU has around 80KB RAM available to developers. - > This contrasts with 8GB or more on typical laptops and desktops - > today. + popular [ESP8266 MCU](https://www.espressif.com/en/products/socs/esp8266) has around 80KB RAM available to developers. + This contrasts with 8GB or more on typical laptops and desktops + today. - **Flash storage** ranging from hundreds of kilobytes to a few - > megabytes. 
The Arduino Uno microcontroller provides just 32KB of - > storage for code. Standard computers today have disk storage in - > the order of terabytes. + megabytes. The Arduino Uno microcontroller provides just 32KB of + storage for code. Standard computers today have disk storage in + the order of terabytes. - **Processing power** from just a few MHz to approximately 200MHz. - > The ESP8266 operates at 80MHz. This is several orders of magnitude - > slower than multi-GHz multi-core CPUs in servers and high-end - > laptops. + The ESP8266 operates at 80MHz. This is several orders of magnitude + slower than multi-GHz multi-core CPUs in servers and high-end + laptops. These tight constraints make training machine learning models directly on microcontrollers infeasible in most cases. The limited RAM precludes @@ -1185,32 +1184,31 @@ trained on resource-rich systems and deployed on microcontrollers for optimized inference. But even inference poses challenges: 1. **Model Size:** AI models are too large to fit on embedded and IoT - > devices. This necessitates the need for model compression - > techniques, such as quantization, pruning, and knowledge - > distillation. Additionally, as we will see in the Embedded AI - > Frameworks section, many of the frameworks used by developers for - > AI development have large amounts of overhead, and built in - > libraries that embedded systems can't support. + devices. This necessitates the need for model compression + techniques, such as quantization, pruning, and knowledge + distillation. Additionally, as we will see, many of the frameworks used by developers for + AI development have large amounts of overhead, and built in + libraries that embedded systems can't support. 2. **Complexity of Tasks:** With only tens of KBs to a few MBs of RAM, - > IoT devices and embedded systems are constrained in the complexity - > of tasks they can handle. 
Tasks that require large datasets or - > sophisticated algorithms-- for example LLMs-- which would run - > smoothly on traditional computing platforms, might be infeasible - > on embedded systems without compression or other optimization - > techniques due to memory limitations. + IoT devices and embedded systems are constrained in the complexity + of tasks they can handle. Tasks that require large datasets or + sophisticated algorithms-- for example LLMs-- which would run + smoothly on traditional computing platforms, might be infeasible + on embedded systems without compression or other optimization + techniques due to memory limitations. 3. **Data Storage and Processing:** Embedded systems often process data - > in real-time and might not store large amounts of data locally. - > Conversely, traditional computing systems can hold and process - > large datasets in memory, enabling faster data operations and - > analysis as well as real-time updates. + in real-time and might not store large amounts of data locally. + Conversely, traditional computing systems can hold and process + large datasets in memory, enabling faster data operations and + analysis as well as real-time updates. 4. **Security and Privacy:** Limited memory also restricts the - > complexity of security algorithms and protocols, data encryption, - > reverse engineering protections, and more that can be implemented - > on the device. This can potentially make some IoT devices more - > vulnerable to attacks. + complexity of security algorithms and protocols, data encryption, + reverse engineering protections, and more that can be implemented + on the device. This can potentially make some IoT devices more + vulnerable to attacks. Consequently, specialized software optimizations and ML frameworks tailored for microcontrollers are necessary to work within these tight @@ -1220,14 +1218,14 @@ memory (see Optimizations section). Learnings from neural architecture search help guide model designs. 
Hardware improvements like dedicated ML accelerators on microcontrollers -also help alleviate constraints. For instance, Qualcomm\'s Hexagon DSP +also help alleviate constraints. For instance, [Qualcomm's Hexagon DSP](https://developer.qualcomm.com/software/hexagon-dsp-sdk/dsp-processor) provides acceleration for TensorFlow Lite models on Snapdragon mobile -chips. Google\'s Edge TPU packs ML performance into a tiny ASIC for edge -devices. ARM Ethos-U55 offers efficient inference on Cortex-M class +chips. [Google's Edge TPU](https://cloud.google.com/edge-tpu) packs ML performance into a tiny ASIC for edge +devices. [ARM Ethos-U55](https://www.arm.com/products/silicon-ip-cpu/ethos/ethos-u55) offers efficient inference on Cortex-M class microcontrollers. These customized ML chips unlock advanced capabilities for resource-constrained applications. -Generally, due to the limited processing power, it\'s almost always +Generally, due to the limited processing power, it's almost always infeasible to train AI models on IoT or embedded systems. Instead, models are trained on powerful traditional computers (often with GPUs) and then deployed on the embedded device for inference. TinyML @@ -1237,9 +1235,8 @@ real-time inference on these constrained devices. ### Frameworks & Libraries Embedded AI frameworks are software tools and libraries designed to -enable artificial intelligence (AI) and machine learning (ML) -capabilities on embedded systems. These frameworks are essential for -bringing AI to IoT (Internet of Things) devices, robotics, and other +enable AI and ML capabilities on embedded systems. These frameworks are essential for +bringing AI to IoT devices, robotics, and other edge computing platforms and they are designed to work where computational resources, memory, and power consumption are limited. @@ -1256,28 +1253,28 @@ server clusters with abundant resources do not directly translate to embedded systems. 
This section uncovers some of the challenges and opportunities for embedded systems and ML frameworks. -#### Fragmented Ecosystem +**Fragmented Ecosystem** The lack of a unified ML framework led to a highly fragmented ecosystem. -Engineers at companies like STMicroelectronics, NXP Semiconductors, and -Renesas had to develop custom solutions tailored to their specific +Engineers at companies like [STMicroelectronics](https://www.st.com/), [NXP Semiconductors](https://www.nxp.com/), and +[Renesas](https://www.renesas.com/) had to develop custom solutions tailored to their specific microcontroller and DSP architectures. These ad-hoc frameworks required extensive manual optimization for each low-level hardware platform. This made porting models extremely difficult, requiring redevelopment for new Arm, RISC-V or proprietary architectures. -#### Disparate Hardware Needs +**Disparate Hardware Needs** Without a shared framework, there was no standard way to assess -hardware\'s capabilities. Vendors like Intel, Qualcomm and NVIDIA +hardware's capabilities. Vendors like Intel, Qualcomm and NVIDIA created integrated solutions blending model, software and hardware improvements. This made it hard to discern the sources of performance -gains - whether new chip designs like Intel\'s low-power x86 cores or +gains - whether new chip designs like Intel's low-power x86 cores or software optimizations were responsible. A standard framework was needed -so vendors could evaluate their hardware\'s capabilities in a fair, +so vendors could evaluate their hardware's capabilities in a fair, reproducible way. -#### Lack of Portability +**Lack of Portability** Adapting models trained in common frameworks like TensorFlow or PyTorch to run efficiently on microcontrollers was very challenging without @@ -1286,7 +1283,7 @@ models to run on specialized DSPs from companies like CEVA or low-power Arm M-series cores.
There were no turnkey tools enabling portable deployment across different architectures. -#### Incomplete Infrastructure +**Incomplete Infrastructure** The infrastructure to support key model development workflows was lacking. There was minimal support for compression techniques to fit @@ -1297,16 +1294,16 @@ functionality like on-device debugging, metrics, and performance profiling was absent. These gaps increased the cost and difficulty of embedded ML development. -#### No Standard Benchmark +**No Standard Benchmark** Without unified benchmarks, there was no standard way to assess and compare the capabilities of different hardware platforms from vendors like NVIDIA, Arm and Ambiq Micro. Existing evaluations relied on proprietary benchmarks tailored to showcased strengths of particular chips. This made it impossible to objectively measure hardware -improvements in a fair, neutral manner. +improvements in a fair, neutral manner. This topic is discussed in more detail in the [Benchmarking AI](./benchmarking.qmd) chapter. -#### Minimal Real-World Testing +**Minimal Real-World Testing** Much of the benchmarks relied on synthetic data. Rigorously testing models on real-world embedded applications was difficult without @@ -1321,7 +1318,7 @@ improved portability, performance profiling, and benchmarking support. But ongoing innovation is still needed to enable seamless, cost-effective deployment of AI to edge devices. -### Summary +**Summary** The absence of standardized frameworks, benchmarks, and infrastructure for embedded ML has traditionally hampered adoption. However, recent @@ -1330,7 +1327,7 @@ Lite Micro and benchmark suites like MLPerf Tiny that aim to accelerate the proliferation of TinyML solutions. But overcoming the fragmentation and difficulty of embedded deployment remains an ongoing process.
-#### Examples +## Examples Machine learning deployment on microcontrollers and other embedded devices often requires specially optimized software libraries and @@ -1364,12 +1361,12 @@ systems and microcontrollers. In the following sections, we will dive into understanding each of these in greater detail. -##### TFLM (Interpreter) +### Interpreter -TensorFlow Lite Micro (TFLM) is a machine learning inference framework +[TensorFlow Lite Micro (TFLM)](https://www.tensorflow.org/lite/microcontrollers) is a machine learning inference framework designed for embedded devices with limited resources. It uses an interpreter to load and execute machine learning models, which provides -flexibility and ease of updating models in the field. +flexibility and ease of updating models in the field [@david2021tensorflow]. Traditional interpreters often have significant branching overhead, which can reduce performance. However, machine learning model @@ -1393,16 +1390,16 @@ of an interpreter-based model execution framework. An interpreter-based approach offers several benefits over code generation for machine learning inference on embedded devices: -- Flexibility: Models can be updated in the field without recompiling +- **Flexibility:** Models can be updated in the field without recompiling the entire application. -- Portability: The interpreter can be used to execute models on +- **Portability:** The interpreter can be used to execute models on different target platforms without porting the code. -- Memory efficiency: The interpreter can share code across multiple +- **Memory efficiency:** The interpreter can share code across multiple models, reducing memory usage. -- Ease of development: Interpreters are easier to develop and maintain +- **Ease of development:** Interpreters are easier to develop and maintain than code generators. TensorFlow Lite Micro is a powerful and flexible framework for machine @@ -1410,12 +1407,12 @@ learning inference on embedded devices. 
Its interpreter-based approach offers several benefits over code generation, including flexibility, portability, memory efficiency, and ease of development. -##### TinyEngine (Compiler-based) +### Compiler-based -TinyEngine is an ML inference framework designed specifically for +[TinyEngine](https://github.com/mit-han-lab/tinyengine) is an ML inference framework designed specifically for resource-constrained microcontrollers. It employs several optimizations to enable high-accuracy neural network execution within the tight -constraints of memory, compute, and storage on microcontrollers. +constraints of memory, compute, and storage on microcontrollers [@lin2020mcunet]. While inference frameworks like TFLite Micro use interpreters to execute the neural network graph dynamically at runtime, this adds significant @@ -1460,13 +1457,13 @@ model-based scheduling, specialized kernels, and co-design with NAS, TinyEngine enables high-accuracy deep learning inference within the tight resource constraints of microcontrollers. -##### CMSIS-NN (Library) +### Library -CMSIS-NN, standing for Cortex Microcontroller Software Interface +[CMSIS-NN](https://www.keil.com/pack/doc/CMSIS/NN/html/index.html), standing for Cortex Microcontroller Software Interface Standard for Neural Networks, is a software library devised by ARM. It offers a standardized interface for deploying neural network inference on microcontrollers and embedded systems, with a particular focus on -optimization for ARM Cortex-M processors. +optimization for ARM Cortex-M processors [@lai2018cmsis]. **Neural Network Kernels:** CMSIS-NN is equipped with highly efficient kernels that handle fundamental neural network operations such as @@ -1649,29 +1646,28 @@ computational graphs, (2) tensor programs, (3) libraries and runtimes, and (4) hardware primitives.
-![](images_ml_frameworks/image8.png){width="2.557292213473316in" -height="2.9092125984251966in"} +![](images_ml_frameworks/image8.png){fig-align="center" width=70%} This has led to vertical (i.e. between abstraction levels) and horizontal (i.e. library-driven vs. compilation-driven approaches to tensor computation) boundaries, which hinder innovation for ML. Future work in ML frameworks can look toward breaking these boundaries. In -December 2021, Apache TVM Unity was proposed, which aimed to facilitate +December 2021, [Apache TVM](https://tvm.apache.org/2021/12/15/tvm-unity) Unity was proposed, which aimed to facilitate interactions between the different abstraction levels (as well as the people behind them, such as ML scientists, ML engineers, and hardware -engineers) and co-optimize decisions in all four abstraction levels.[^1] +engineers) and co-optimize decisions in all four abstraction levels. ### High-Performance Compilers & Libraries As ML frameworks further develop, high-performance compilers and libraries will continue to emerge. Some current examples include -[[TensorFlow -XLA]{.underline}](https://www.tensorflow.org/xla/architecture) and +[TensorFlow +XLA](https://www.tensorflow.org/xla/architecture) and Nvidia's -[[CUTLASS]{.underline}](https://developer.nvidia.com/blog/cutlass-linear-algebra-cuda/), +[CUTLASS](https://developer.nvidia.com/blog/cutlass-linear-algebra-cuda/), which accelerate linear algebra operations in computational graphs, and Nvidia's -[[TensorRT]{.underline}](https://developer.nvidia.com/tensorrt), which +[TensorRT](https://developer.nvidia.com/tensorrt), which accelerates and optimizes inference. ### ML for ML Frameworks @@ -1680,13 +1676,13 @@ We can also use ML to improve ML frameworks in the future. 
Some current uses of ML for ML frameworks include: - hyperparameter optimization using techniques such as Bayesian - > optimization, random search, and grid search + optimization, random search, and grid search - neural architecture search (NAS) to automatically search for optimal - > network architectures + network architectures -- AutoML, which as described in the Advanced Features section, - > automates the ML pipeline. +- AutoML, which as described in the [Advanced Features][@sec-ai_frameworks-advanced] section, + automates the ML pipeline. ## Conclusion @@ -1724,15 +1720,4 @@ This requires balancing tradeoffs between performance needs, hardware constraints, model complexity, and other factors. Thoroughly assessing intended models, use cases, and evaluating options against key metrics will guide developers towards picking the ideal framework for their -embedded ML application. - -[^1]: Sampson et al. 2021. "Apache TVM Unity: a vision for the ML software & hardware ecosystem in 2022." [[https://tvm.apache.org/2021/12/15/tvm-unity]{.underline}](https://tvm.apache.org/2021/12/15/tvm-unity). -[^2]: Abadi et al. 2015. "TensorFlow: Large-Scale Machine Learning on Heterogeneous Distributed Systems." [[https://arxiv.org/pdf/1603.04467.pdf]{.underline}](https://arxiv.org/pdf/1603.04467.pdf). -[^3]: Dean et al. 2012. "Large Scale Distributed Deep Networks." *Proceedings of the 25th International Conference on Neural Information Processing Systems* 1: 1223–1231. [[https://storage.googleapis.com/pub-tools-public-publication-data/pdf/40565.pdf]{.underline}](https://storage.googleapis.com/pub-tools-public-publication-data/pdf/40565.pdf). -[^4]: Li et al. 2014. "Communication Efficient Distributed Machine Learning with the Parameter Server." *Proceedings of the 27th International Conference on Neural Information Processing Systems* 1: 19–27. 
[[https://proceedings.neurips.cc/paper_files/paper/2014/file/1ff1de774005f8da13f42943881c655f-Paper.pdf -]{.underline}](https://proceedings.neurips.cc/paper_files/paper/2014/file/1ff1de774005f8da13f42943881c655f-Paper.pdf -). -[^5]: [[TensorFlow: Large-scale machine learning on heterogeneous systems, -2015.]{.underline}](https://www.tensorflow.org/datasets/catalog/mnist) - -[^6]: [[Patrick McClanahan, Introduction to Operating Systems, 2023]{.underline}](https://eng.libretexts.org/Courses/Delta_College/Introduction_to_Operating_Systems/03%3A_The_Operating_System/3.06%3A_Types_of_Operating_Systems) +embedded ML application. \ No newline at end of file diff --git a/references.bib b/references.bib index ad5daaf2e..815ae31a7 100644 --- a/references.bib +++ b/references.bib @@ -7,6 +7,153 @@ @misc{Thefutur92:online note = {(Accessed on 09/16/2023)} } +@inproceedings{deng2009imagenet, + title={Imagenet: A large-scale hierarchical image database}, + author={Deng, Jia and Dong, Wei and Socher, Richard and Li, Li-Jia and Li, Kai and Fei-Fei, Li}, + booktitle={2009 IEEE conference on computer vision and pattern recognition}, + pages={248--255}, + year={2009}, + organization={Ieee} +} + +@article{david2021tensorflow, + title={Tensorflow lite micro: Embedded machine learning for tinyml systems}, + author={David, Robert and Duke, Jared and Jain, Advait and Janapa Reddi, Vijay and Jeffries, Nat and Li, Jian and Kreeger, Nick and Nappier, Ian and Natraj, Meghna and Wang, Tiezhen and others}, + journal={Proceedings of Machine Learning and Systems}, + volume={3}, + pages={800--811}, + year={2021} +} + + +@article{al2016theano, + title={Theano: A Python framework for fast computation of mathematical expressions}, + author={Al-Rfou, Rami and Alain, Guillaume and Almahairi, Amjad and Angermueller, Christof and Bahdanau, Dzmitry and Ballas, Nicolas and Bastien, Fr{\'e}d{\'e}ric and Bayer, Justin and Belikov, Anatoly and Belopolsky, Alexander and others}, + journal={arXiv e-prints}, + 
pages={arXiv--1605}, + year={2016} +} + + + +@inproceedings{jia2014caffe, + title={Caffe: Convolutional architecture for fast feature embedding}, + author={Jia, Yangqing and Shelhamer, Evan and Donahue, Jeff and Karayev, Sergey and Long, Jonathan and Girshick, Ross and Guadarrama, Sergio and Darrell, Trevor}, + booktitle={Proceedings of the 22nd ACM international conference on Multimedia}, + pages={675--678}, + year={2014} +} + +@article{brown2020language, + title={Language models are few-shot learners}, + author={Brown, Tom and Mann, Benjamin and Ryder, Nick and Subbiah, Melanie and Kaplan, Jared D and Dhariwal, Prafulla and Neelakantan, Arvind and Shyam, Pranav and Sastry, Girish and Askell, Amanda and others}, + journal={Advances in neural information processing systems}, + volume={33}, + pages={1877--1901}, + year={2020} +} + +@inproceedings{he2016deep, + title={Deep residual learning for image recognition}, + author={He, Kaiming and Zhang, Xiangyu and Ren, Shaoqing and Sun, Jian}, + booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition}, + pages={770--778}, + year={2016} +} + + +@article{krizhevsky2012imagenet, + title={Imagenet classification with deep convolutional neural networks}, + author={Krizhevsky, Alex and Sutskever, Ilya and Hinton, Geoffrey E}, + journal={Advances in neural information processing systems}, + volume={25}, + year={2012} +} + +@article{paszke2019pytorch, + title={Pytorch: An imperative style, high-performance deep learning library}, + author={Paszke, Adam and Gross, Sam and Massa, Francisco and Lerer, Adam and Bradbury, James and Chanan, Gregory and Killeen, Trevor and Lin, Zeming and Gimelshein, Natalia and Antiga, Luca and others}, + journal={Advances in neural information processing systems}, + volume={32}, + year={2019} +} + +@inproceedings{seide2016cntk, + title={CNTK: Microsoft's open-source deep-learning toolkit}, + author={Seide, Frank and Agarwal, Amit}, + booktitle={Proceedings of the 22nd 
ACM SIGKDD international conference on knowledge discovery and data mining}, + pages={2135--2135}, + year={2016} +} + +@inproceedings{kung1979systolic, + title={Systolic arrays (for VLSI)}, + author={Kung, Hsiang Tsung and Leiserson, Charles E}, + booktitle={Sparse Matrix Proceedings 1978}, + volume={1}, + pages={256--282}, + year={1979}, + organization={Society for industrial and applied mathematics Philadelphia, PA, USA} +} + + +@article{li2014communication, + title={Communication efficient distributed machine learning with the parameter server}, + author={Li, Mu and Andersen, David G and Smola, Alexander J and Yu, Kai}, + journal={Advances in Neural Information Processing Systems}, + volume={27}, + year={2014} +} + +@inproceedings{abadi2016tensorflow, + title={{TensorFlow}: A System for {Large-Scale} Machine Learning}, + author={Abadi, Mart{\'\i}n and Barham, Paul and Chen, Jianmin and Chen, Zhifeng and Davis, Andy and Dean, Jeffrey and Devin, Matthieu and Ghemawat, Sanjay and Irving, Geoffrey and Isard, Michael and others}, + booktitle={12th USENIX symposium on operating systems design and implementation (OSDI 16)}, + pages={265--283}, + year={2016} +} + +@article{dean2012large, + title={Large scale distributed deep networks}, + author={Dean, Jeffrey and Corrado, Greg and Monga, Rajat and Chen, Kai and Devin, Matthieu and Mao, Mark and Ranzato, Marc'aurelio and Senior, Andrew and Tucker, Paul and Yang, Ke and others}, + journal={Advances in neural information processing systems}, + volume={25}, + year={2012} +} + +@inproceedings{tokui2015chainer, + title={Chainer: a next-generation open source framework for deep learning}, + author={Tokui, Seiya and Oono, Kenta and Hido, Shohei and Clayton, Justin}, + booktitle={Proceedings of workshop on machine learning systems (LearningSys) in the twenty-ninth annual conference on neural information processing systems (NIPS)}, + volume={5}, + pages={1--6}, + year={2015} +} + +@article{chollet2018keras, +
title={Keras: The python deep learning library}, + author={Chollet, Fran{\c{c}}ois and others}, + journal={Astrophysics source code library}, + pages={ascl--1806}, + year={2018} +} + +@article{lai2018cmsis, + title={Cmsis-nn: Efficient neural network kernels for arm cortex-m cpus}, + author={Lai, Liangzhen and Suda, Naveen and Chandra, Vikas}, + journal={arXiv preprint arXiv:1801.06601}, + year={2018} +} + +@article{lin2020mcunet, + title={Mcunet: Tiny deep learning on iot devices}, + author={Lin, Ji and Chen, Wei-Ming and Lin, Yujun and Gan, Chuang and Han, Song and others}, + journal={Advances in Neural Information Processing Systems}, + volume={33}, + pages={11711--11722}, + year={2020} +} + @article{ramcharan2017deep, title={Deep learning for image-based cassava disease detection}, author={Ramcharan, Amanda and Baranowski, Kelsee and McCloskey, Peter and Ahmed, Babuali and Legg, James and Hughes, David P}, @@ -170,6 +317,26 @@ @inproceedings{jouppi2017datacenter year={2017} } +@misc{mcmahan2023communicationefficient, + title={Communication-Efficient Learning of Deep Networks from Decentralized Data}, + author={McMahan, H. Brendan and Moore, Eider and Ramage, Daniel and Hampson, Seth and {Ag{\"u}era y Arcas}, Blaise}, + year={2023}, + eprint={1602.05629}, + archivePrefix={arXiv}, + primaryClass={cs.LG} +} + +@article{li2017learning, + title={Learning without forgetting}, + author={Li, Zhizhong and Hoiem, Derek}, + journal={IEEE transactions on pattern analysis and machine intelligence}, + volume={40}, + number={12}, + pages={2935--2947}, + year={2017}, + publisher={IEEE} +} + @article{krizhevsky2012imagenet, title={Imagenet classification with deep convolutional neural networks}, author={Krizhevsky, Alex and Sutskever, Ilya and Hinton, Geoffrey E},