\documentclass[12pt, a4paper, oneside]{article}
\usepackage{amsmath, amsthm, amssymb, bm, graphicx, hyperref, mathrsfs, listings, xcolor, indentfirst}
\title{{\Huge{\textbf{Notes on MIT Introduction to Deep Learning}}}}
\author{Tianzong Cheng}
\date{\today}
\linespread{1.5}
\definecolor{code_background_color}{rgb}{0.95,0.95,0.92}
\lstset{language=Python, basicstyle=\ttfamily\small, backgroundcolor=\color{code_background_color}}
\begin{document}
\maketitle
\thispagestyle{empty}
\newpage
\thispagestyle{empty}
\begin{center}
\Huge\textbf{Preface}
\end{center}
Many thanks to Professor Ban for introducing this lecture to me.
\begin{flushright}
\begin{tabular}{c}
Tianzong Cheng \\
\today
\end{tabular}
\end{flushright}
\newpage
\pagenumbering{Roman}
\setcounter{page}{1}
\tableofcontents
\newpage
\setcounter{page}{1}
\pagenumbering{arabic}
\section{Intro to Deep Learning}
\subsection{Perceptron}
\begin{equation*}
\hat{y}=g(w_{0}+\sum_{i=1}^{m}x_{i}w_{i})
\end{equation*}
$x_{i}$ is the input, $w_{i}$ is the weight of each input, $w_{0}$ is the bias term, and $g$ is a non-linear activation function.
Or we can rewrite the equation in linear algebra language.
\begin{equation*}
\hat{y} = g(w_{0}+X^{T}W)
\end{equation*}
where: $X=\begin{bmatrix}
x_{1} \\
\vdots \\
x_{m}
\end{bmatrix}$ and $W=\begin{bmatrix}
w_{1} \\
\vdots \\
w_{m}
\end{bmatrix}$
An example of an activation function is the sigmoid function, which can be interpreted as a continuous version of the threshold function.
\begin{equation*}
g(z)=\sigma(z)=\frac{1}{1+e^{-z}}
\end{equation*}
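As a minimal sketch (assuming NumPy; the variable names and values are illustrative), the forward pass of a single perceptron with a sigmoid activation can be written as:
\begin{lstlisting}
import numpy as np

def sigmoid(z):
    # continuous version of the threshold function
    return 1.0 / (1.0 + np.exp(-z))

def perceptron(x, w, w0):
    # x: inputs, w: weights, w0: bias term
    return sigmoid(w0 + np.dot(x, w))

x = np.array([1.0, 2.0])
w = np.array([0.5, -0.3])
print(perceptron(x, w, w0=0.1))  # y_hat, a value between 0 and 1
\end{lstlisting}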
\subsection{Building Neural Networks with Perceptrons}
Perceptron: dot product, bias, non-linearity.
Because all inputs are densely connected to all outputs, these layers are called \textbf{Dense} layers.
The first layer, which is a hidden layer, can be expressed as follows:
\begin{equation*}
z_{i}=w_{0,i}^{(1)}+\sum_{j=1}^{m}x_{j}w_{j,i}^{(1)}
\end{equation*}
Then, the second layer, which calculates the final outputs, can be expressed as follows:
\begin{equation*}
\hat{y}_{i}=g(w_{0,i}^{(2)}+\sum_{j=1}^{d_{1}}g(z_{j})w_{j,i}^{(2)})
\end{equation*}
where $d_{1}$ is the number of hidden units.
By stacking these layers on top of each other, we create a sequential model.
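A sketch of this two-layer network with NumPy (randomly initialized weights; the layer sizes are illustrative):
\begin{lstlisting}
import numpy as np

def dense(x, W, b, g):
    # one dense layer: dot product, bias, non-linearity
    return g(b + x @ W)

def sigmoid(z):
    return 1.0 / (1.0 + np.exp(-z))

m, d_hidden, d_out = 3, 4, 2                   # illustrative sizes
W1, b1 = np.random.randn(m, d_hidden), np.zeros(d_hidden)
W2, b2 = np.random.randn(d_hidden, d_out), np.zeros(d_out)

x = np.random.randn(m)
z = dense(x, W1, b1, sigmoid)                  # hidden layer
y_hat = dense(z, W2, b2, sigmoid)              # output layer
\end{lstlisting}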
\subsection{Loss Functions and Gradient Descent}
The \textbf{empirical loss} measures the total loss over our entire dataset. It is also known as the objective function, the cost function, and the empirical risk.
\begin{equation*}
J(\bm{W})=\frac{1}{n}\sum_{i=1}^{n}\mathcal{L}(f(x^{(i)};\bm{W}),y^{(i)})
\end{equation*}
In this equation, $J$ is a function of the weights of the neural network. \textbf{Cross entropy loss} can be used with models that output a probability between $0$ and $1$. \textbf{Mean squared error loss} can be used with regression models that output continuous real numbers.
Next, we want to find the network weights that \textbf{achieve the lowest loss}.
First, we randomly pick an initial $(w_{0},w_{1})$. Then, by taking a small step in the opposite direction of the gradient at this point, we get closer to where the loss is lowest. Repeat this step until convergence.
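A sketch of this loop on a toy loss with a hand-coded gradient (the loss function, learning rate, and number of steps are illustrative):
\begin{lstlisting}
import numpy as np

def loss(w):
    return np.sum((w - 3.0) ** 2)  # toy loss with its minimum at w = [3, 3]

def grad(w):
    return 2.0 * (w - 3.0)         # gradient of the toy loss

w = np.random.randn(2)             # randomly pick an initial point
eta = 0.1                          # learning rate
for _ in range(100):
    w = w - eta * grad(w)          # small step against the gradient
print(w, loss(w))                  # w ends up close to [3, 3]
\end{lstlisting}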
\subsection{Backpropagation}
Backpropagation is the process of computing gradients with respect to the weights and biases. The process is given the name because gradients are calculated layer by layer, starting from the output layer.
\begin{equation*}
\frac{\partial J(\bm{W})}{\partial w_{1}}=\frac{\partial J(\bm{W})}{\partial \hat{y}}\frac{\partial\hat{y}}{\partial z_{1}}\frac{\partial z_{1}}{\partial w_{1}}
\end{equation*}
Repeat this for \textbf{every weight in the network} using gradients from later layers. We can see backpropagation is simply an instantiation of the chain rule.
\subsection{Optimization}
\begin{equation*}
W\leftarrow W-\eta\frac{\partial J(\bm{W})}{\partial \bm{W}}
\end{equation*}
The equation shows how the weights are optimized through gradient descent. Note that $\eta$ denotes the learning rate. A small learning rate converges slowly and gets stuck in false local minima, while a large learning rate overshoots, becomes unstable, and diverges. So we need an algorithm that adapts the learning rate to the landscape of the loss function.
\subsection{Batched Gradient Descent}
Compute the gradient on a batch of (usually around a hundred) data points, rather than on a single data point or on the entire dataset. This gives a fast yet reasonably accurate estimate of the gradient.
\subsection{Regularization}
Overfitting means that the model is too complex and does not generalize well to unseen data.
Regularization is a technique that constrains our optimization problem to discourage complex models. It helps to improve generalization of our model on unseen data.
The idea of \textbf{dropout} is to randomly set some (typically half) of the activations to $0$.
The idea of \textbf{early stopping} is to stop training before we have a chance to overfit. To be more specific, as the number of iterations increases, the loss on the training set keeps decreasing. However, the loss on the test set decreases at first and then starts to increase again at a certain point. The training process should stop before this happens.
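A minimal sketch of the dropout idea above (the inverted-dropout rescaling shown here is a common convention, not something spelled out in the lecture):
\begin{lstlisting}
import numpy as np

def dropout(activations, p_drop=0.5):
    # randomly zero out a fraction p_drop of the activations
    mask = (np.random.rand(*activations.shape) >= p_drop)
    # rescale the survivors so the expected value is unchanged
    return activations * mask / (1.0 - p_drop)

a = np.random.randn(8)
print(dropout(a))   # roughly half the entries are zero
\end{lstlisting}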
\section{Deep Sequence Modeling}
\subsection{Neurons with Recurrence}
\begin{equation*}
\hat{y}_{t}=f(x_{t},h_{t-1})
\end{equation*}
The output is a function of the input and \textbf{a past memory}, represented by the symbol $h$. This is the intuitive foundation behind \textbf{Recurrent Neural Networks} (RNNs).
\begin{equation*}
h_{t}=f_{W}(x_{t},h_{t-1})
\end{equation*}
The cell state is a function of the input and the old state. Note that the same function and set of parameters are used at every time step.
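A sketch of one recurrent step with NumPy, using $\tanh$ as the non-linearity (the weight names and sizes are illustrative):
\begin{lstlisting}
import numpy as np

def rnn_step(x_t, h_prev, W_xh, W_hh, b_h):
    # new state from the current input and the old state
    return np.tanh(x_t @ W_xh + h_prev @ W_hh + b_h)

d_in, d_h = 5, 8                             # illustrative sizes
W_xh = np.random.randn(d_in, d_h) * 0.1
W_hh = np.random.randn(d_h, d_h) * 0.1
b_h = np.zeros(d_h)

h = np.zeros(d_h)                            # initial state h_0
for x_t in np.random.randn(10, d_in):        # a sequence of 10 inputs
    h = rnn_step(x_t, h, W_xh, W_hh, b_h)    # same weights at every time step
\end{lstlisting}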
\subsection{Encoding Language for a Neural Network}
\begin{itemize}
\item Indexing: each word is given a number index
\item Embedding: Index to fixed-sized vector
\begin{itemize}
\item One-hot embedding: $[0,\cdots,0,1,0,\cdots,0]$
\item Learned embedding: words with similar meanings are mapped to nearby vectors
\end{itemize}
\end{itemize}
\subsection{Dealing with Gradient Issues}
Computing the gradient with respect to $h_{0}$ involves many factors of $W_{hh}$ and repeated gradient computation. Consider the $W_{hh}$ matrix:
\begin{itemize}
\item Many values $>1$: exploding gradients. Solution: Gradient clipping
\item Many values $<1$: vanishing gradients.
\end{itemize}
Solving the vanishing gradient problem:
\begin{itemize}
\item Activation Functions: Using ReLU prevents $f'$ from shrinking the gradients when $x>0$, because the derivative of the ReLU function is $1$ there.
\item Parameter Initialization: Initialize weights to identity matrix and initialize biases to zero.
\item Gated Cells, Long Short Term Memory (LSTMs): Use gates to selectively add or remove information within each recurrent unit. This is \textbf{the most robust method} to solve the vanishing gradient problem.
\end{itemize}
\subsection{Long Short Term Memory (LSTMs)}
\begin{itemize}
\item Maintain a cell state
\item Use gates to control the flow of information
\begin{itemize}
\item \textbf{Forget} gate gets rid of irrelevant information
\item \textbf{Store} relevant information from current input
\item Selectively \textbf{update} cell state
\item \textbf{Output} gate returns a filtered version of the cell state
\end{itemize}
\item Backpropagation through time with partially uninterrupted gradient flow
\end{itemize}
There are three gates, which are all functions of the input and the last output, controlling the information flow.
\begin{equation*}
\text{gate}=\text{sigmoid}(W[x_{t},h_{t-1}]+b)
\end{equation*}
\begin{itemize}
\item The old cell state goes through \textbf{the forget gate} to get rid of the irrelevant information.
\item The input goes through \textbf{the input gate}, getting rid of the irrelevant information, and is added upon the filtered old cell state to obtain the current cell state.
\item The current cell state goes through \textbf{the output gate} to get the output at current time.
\end{itemize}
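A sketch of one LSTM step following the gate equation and the steps above (each gate is given its own weight matrix for clarity; the names and sizes are illustrative):
\begin{lstlisting}
import numpy as np

def sigmoid(z):
    return 1.0 / (1.0 + np.exp(-z))

def lstm_step(x_t, h_prev, c_prev, W_f, W_i, W_o, W_c, b_f, b_i, b_o, b_c):
    xh = np.concatenate([x_t, h_prev])   # [x_t, h_{t-1}]
    f = sigmoid(W_f @ xh + b_f)          # forget gate: drop irrelevant history
    i = sigmoid(W_i @ xh + b_i)          # input gate: store relevant input
    o = sigmoid(W_o @ xh + b_o)          # output gate
    c_tilde = np.tanh(W_c @ xh + b_c)    # candidate values for the cell state
    c = f * c_prev + i * c_tilde         # selectively update the cell state
    h = o * np.tanh(c)                   # output a filtered version of the cell state
    return h, c

d_in, d_h = 4, 6                         # illustrative sizes
Ws = [np.random.randn(d_h, d_in + d_h) * 0.1 for _ in range(4)]
bs = [np.zeros(d_h) for _ in range(4)]
h, c = np.zeros(d_h), np.zeros(d_h)
h, c = lstm_step(np.random.randn(d_in), h, c, *Ws, *bs)
\end{lstlisting}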
\subsection{Limitations of RNNs}
Limitations of RNNs:
\begin{itemize}
\item Encoding bottleneck
\item Slow, no parallelization
\item Not long memory
\end{itemize}
Idea 1: Feed everything into dense network:
\begin{itemize}
\item Not scalable
\item No order
\item No long memory
\end{itemize}
Idea 2: Identify and attend to what's important
\subsection{Attention}
\begin{itemize}
\item Encode position information
\item Extract \textbf{query}, \textbf{key}, \textbf{value} for search: we use neural network layers to extract \textbf{query}, \textbf{key}, \textbf{value}.
\item Compute attention weighting: $\text{softmax}(\frac{Q\cdot K^{T}}{\text{scaling}})$; the softmax function constrains the values to be between $0$ and $1$
\item Extract features with high attention
\end{itemize}
\begin{equation*}
A(Q,K,V)=\text{softmax}(\frac{Q\cdot K^{T}}{\text{scaling}})\cdot V
\end{equation*}
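A sketch of this computation with NumPy, scaling by the square root of the key dimension (a common choice for the scaling term, which is left unspecified above):
\begin{lstlisting}
import numpy as np

def softmax(z, axis=-1):
    z = z - z.max(axis=axis, keepdims=True)    # for numerical stability
    e = np.exp(z)
    return e / e.sum(axis=axis, keepdims=True)

def attention(Q, K, V):
    d_k = K.shape[-1]
    weights = softmax(Q @ K.T / np.sqrt(d_k))  # attention weighting
    return weights @ V                         # weighted sum of the values

n, d = 4, 8                                    # illustrative sizes
Q, K, V = np.random.randn(n, d), np.random.randn(n, d), np.random.randn(n, d)
print(attention(Q, K, V).shape)                # (4, 8)
\end{lstlisting}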
\section{Deep Computer Vision}
\subsection{Learning Visual Features}
If we use a fully connected neural network to deal with images, there are two main problems. Firstly, since the image is flattened down into one dimension, all the spatial information is lost. Secondly, there are too many parameters.
The idea of convolution:
\begin{itemize}
\item Connect patch in input layer to a single neuron in subsequent layer.
\item Use a \textbf{sliding window} to define connections.
\end{itemize}
The convolution operation is an element-wise multiplication between the filter and each patch of the input image, followed by a sum; the result is a feature map.
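A sketch of this operation for a single filter, with no padding and stride $1$ (the sizes and the example filter are illustrative):
\begin{lstlisting}
import numpy as np

def conv2d(image, kernel):
    kh, kw = kernel.shape
    oh, ow = image.shape[0] - kh + 1, image.shape[1] - kw + 1
    feature_map = np.zeros((oh, ow))
    for i in range(oh):
        for j in range(ow):
            patch = image[i:i + kh, j:j + kw]           # sliding window
            feature_map[i, j] = np.sum(patch * kernel)  # multiply and sum
    return feature_map

image = np.random.rand(6, 6)
kernel = np.array([[1.0, 0.0, -1.0]] * 3)  # a simple vertical-edge filter
print(conv2d(image, kernel).shape)         # (4, 4)
\end{lstlisting}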
\subsection{Convolutional Neural Networks}
\begin{itemize}
\item Convolution: Apply filters to generate feature maps.
\item Non-linearity: Often ReLU.
\item Pooling: Down-sampling operation on each feature map.
\end{itemize}
The idea of output volume introduces the dimension of depth, representing the number of filters. Thus, the network can learn a number of different features.
\subsubsection{Non-linearity}
The ReLU function replaces all negative values with zero, introducing non-linearity at very low computational cost.
\subsubsection{Pooling}
Pooling is a way of down-sampling the feature map so that the CNN can deal with bigger images. Max pooling with $2\times 2$ filters and stride $2$ takes the maximum value in each $2\times 2$ block, reducing the scale by $2$ in each dimension. Note that the spatial information is preserved.
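A sketch of max pooling with $2\times 2$ filters and stride $2$ (assuming the feature map has even height and width):
\begin{lstlisting}
import numpy as np

def max_pool_2x2(feature_map):
    h, w = feature_map.shape
    pooled = np.zeros((h // 2, w // 2))
    for i in range(0, h - 1, 2):
        for j in range(0, w - 1, 2):
            # keep the maximum of each 2x2 block
            pooled[i // 2, j // 2] = feature_map[i:i + 2, j:j + 2].max()
    return pooled

fmap = np.random.rand(4, 4)
print(max_pool_2x2(fmap).shape)  # (2, 2), half the size in each dimension
\end{lstlisting}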
\subsection{Applications}
\subsubsection{Basic Architecture}
Use CNNs to learn features in the input image first and then feed the result into downstream networks.
\subsubsection{Classification}
After the features are learned, the arrays are flattened and fed into a fully connected network. Remember to use the softmax function to normalize the outputs to values between $0$ and $1$ that represent class probabilities.
\subsubsection{Object Detection}
Faster R-CNN learns region proposals. A region proposal network learns candidate regions and feeds them into downstream networks.
\subsubsection{Semantic Segmentation}
Semantic segmentation uses a fully convolutional network that classifies every pixel of the input image. The network is designed with both down-sampling and up-sampling.
\section{Deep Reinforcement Learning}
There are three classes of learning problems:
\begin{itemize}
\item Supervised Learning: learn a function to map $x\rightarrow y$
\item Unsupervised Learning: learn the underlying structure
\item Reinforcement Learning: maximize future rewards over many steps
\end{itemize}
\subsection{Key Concepts}
\begin{itemize}
\item Agent: take actions
\item Environment: the world in which the agent exists and operates
\item Action space: the set of possible actions an agent can make in the Environment
\item Observations: of the environment after taking actions
\item State: a situation which the agent perceives
\item Reward: feedback that measures the success or failure of the agent's action
\item Total reward: $R_{t}=\sum_{i=t}^{\infty}r_{i}$
\item Discounted total reward: $R_{t}=\sum_{i=t}^{\infty}\gamma^{i}r_{i}$, where $\gamma$ is the discount factor, $0<\gamma <1$
\item \textbf{Q-function}: The Q-function captures the expected total future reward an agent in state $s$ can receive by executing a certain action $a$. $Q(s_{t},a_{t})=\mathbb{E}[R_{t}|s_{t},a_{t}]$
\item \textbf{Policy}: The policy function infers the best action to take in a given state. The policy should choose an action that maximizes future reward. $\pi^{*}(s)=\text{argmax}_{a}Q(s,a)$
\end{itemize}
\subsection{Deep Reinforcement Learning Algorithms}
\begin{itemize}
\item Value learning: Find $Q(s,a)$, then act with $a=\text{argmax}_{a}Q(s,a)$
\item Policy learning: Find $\pi (s)$
\end{itemize}
\subsection{Deep Q Networks (DQN)}
The Q-Loss function is:
\begin{equation*}
\mathcal{L}=\mathbb{E}\left[\left\|\left(r+\gamma\max_{a'}Q(s',a')\right)-Q(s,a)\right\|^{2}\right]
\end{equation*}
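A sketch of this loss on a batch of transitions, assuming the Q-values have already been computed by the network (the array names and shapes are illustrative, and terminal states are ignored):
\begin{lstlisting}
import numpy as np

def q_loss(q_sa, q_next, rewards, gamma=0.99):
    # q_sa:   Q(s, a) for the actions actually taken, shape (batch,)
    # q_next: Q(s', a') for every action a', shape (batch, n_actions)
    target = rewards + gamma * q_next.max(axis=1)  # r + gamma * max Q(s', a')
    return np.mean((target - q_sa) ** 2)           # mean squared Q-loss

q_sa = np.random.rand(32)        # from the Q network
q_next = np.random.rand(32, 4)   # from the Q network on the next states
rewards = np.random.rand(32)
print(q_loss(q_sa, q_next, rewards))
\end{lstlisting}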
Downsides of Q-learning:
\begin{itemize}
\item Complexity:
\begin{itemize}
\item Can model scenarios where the action space is discrete and small
\item Cannot handle continuous action spaces
\end{itemize}
\item Flexibility:
\begin{itemize}
\item Policy is deterministically computed from the Q function by maximizing the reward, thus it cannot learn stochastic policies.
\end{itemize}
\end{itemize}
\subsection{Policy Learning}
The function $P(a|s)$ represents the probability that selecting the corresponding action will lead to the highest reward. Note that $\sum_{a_{i}\in A}P(a_{i}|s)=1$, that is, the probabilities over all actions sum to $1$.
The $P$ function can be continuous, so it can deal with continuous action spaces, such as how fast the agent should move.
Policy Gradients Training Algorithm
\begin{lstlisting}
1. Initialize the agent
2. Run a policy until termination
3. Record all states, actions, rewards
4. Decrease probability of actions that resulted
in low rewards
5. Increase probability of actions that resulted
in high rewards
\end{lstlisting}
The loss function is defined as the negative of the log-likelihood of the action multiplied by the reward.
\begin{equation*}
\text{loss}=-\log P(a_{t}|s_{t})\,R_{t}
\end{equation*}
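A sketch of this loss over one recorded episode, assuming the selected-action probabilities and the discounted returns are given as NumPy arrays (the names and values are illustrative):
\begin{lstlisting}
import numpy as np

def policy_gradient_loss(action_probs, returns):
    # action_probs: P(a_t | s_t) for the actions that were taken
    # returns:      the (discounted) total reward R_t from each step
    return -np.mean(np.log(action_probs) * returns)

action_probs = np.array([0.7, 0.2, 0.9])  # from the policy network
returns = np.array([1.0, 0.5, 2.0])       # rewards recorded in the episode
print(policy_gradient_loss(action_probs, returns))
\end{lstlisting}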
\end{document}