\documentclass{article} % use "amsart" instead of "article" for AMSLaTeX format
\usepackage{geometry} % See geometry.pdf to learn the layout options. There are lots.
\geometry{letterpaper} % ... or a4paper or a5paper or ...
%\geometry{landscape} % Activate for rotated page geometry
%\usepackage[parfill]{parskip} % Activate to begin paragraphs with an empty line rather than an indent
\usepackage{graphicx} % Use pdf, png, jpg, or eps with pdflatex; use eps in DVI mode
% TeX will automatically convert eps --> pdf in pdflatex
\usepackage{amssymb}
\usepackage[UTF8, heading = false, scheme = plain]{ctex}
\usepackage[colorlinks = false]{hyperref}
\usepackage{amsmath}
\usepackage{bbm}
\DeclareMathOperator*{\argmin}{arg\,min}
\DeclareMathOperator*{\argmax}{arg\,max}
\title{2016 Spring Notes}
\author{Yue Yu}
\begin{document}
\maketitle
\tableofcontents
\newpage
\section{Background Material}
\subsection{Probability Theory}
\subsubsection{Conditional Expectation (Law of Total Expectation)}
\begin{eqnarray}
E(X) = E(E(X|Y))
\end{eqnarray}
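This tower property is easy to check with a quick Monte Carlo simulation. A minimal Python/NumPy sketch (illustrative only; the particular model for $X$ and $Y$ is an arbitrary choice, not from these notes):
\begin{verbatim}
import numpy as np

rng = np.random.default_rng(0)
n = 1_000_000

# Illustrative model: Y ~ Exponential(1), X | Y ~ Normal(mean=Y, sd=2).
y = rng.exponential(scale=1.0, size=n)
x = rng.normal(loc=y, scale=2.0)

lhs = x.mean()   # E(X) estimated directly
rhs = y.mean()   # E(E(X|Y)) = E(Y) here, since E(X|Y) = Y

print(lhs, rhs)  # both approximately 1 = E(Y)
\end{verbatim}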
\subsubsection{Correlation Coefficient}
\begin{eqnarray}
\rho_{\scriptscriptstyle X,\scriptscriptstyle Y} &=& \frac{Cov(X,Y)}{\sigma_{\scriptscriptstyle X}\sigma_{\scriptscriptstyle Y}}\\
&=& \frac{E\big[(X-E(X))(Y-E(Y))\big]}{\sigma_{\scriptscriptstyle X}\sigma_{\scriptscriptstyle Y}}\\
&=& \frac{E(XY)-E(X)E(Y)}{\sigma_{\scriptscriptstyle X}\sigma_{\scriptscriptstyle Y}}
\end{eqnarray}
\subsubsection{Covariance Matrix}
\begin{eqnarray}
\mathrm{X} &=& [X_1,\ldots,X_n]^T\\
\Sigma &=& \mathrm{E}\Big[(\mathrm{X}-\mathrm{E}(\mathrm{X}))(\mathrm{X}-\mathrm{E}(\mathrm{X}))^T\Big]\\
\Sigma_{i,j} &=& Cov(X_i, X_j)\\
&=& \mathrm{E}\Big[(X_i - \mu_i)(X_j - \mu_j)\Big]
\end{eqnarray}
\subsubsection{Multivariate normal distribution}
\begin{itemize}
\item
Joint density of the multivariate Gaussian:
$$f_X(x_1,\ldots,x_n) = \frac{1}{\sqrt{(2\pi)^n|\Sigma|}}\mathrm{exp}(-\frac{1}{2}(x-\mu)^T\Sigma^{-1}(x-\mu))$$
\item
In the bivariate case ($\rho$ is the correlation coefficient):\\
$$f(x,y) = \frac{1}{2\pi\sigma_x\sigma_y\sqrt{1-\rho^2}}\mathrm{exp}\Big( -\frac{1}{2(1-\rho^2)}
\Big[\frac{(x-\mu_x)^2}{\sigma_x^2}+\frac{(y-\mu_y)^2}{\sigma_y^2} - \frac{2\rho(x-\mu_x)(y-\mu_y)}{\sigma_x\sigma_y}\Big]\Big)$$
The conditional distribution is:
$$P(X_1\big|X_2 =x_2)\sim N\Big(\mu_1 +\frac{\sigma_1}{\sigma_2}\rho(x_2-\mu_2),\,(1-\rho^2)\sigma_1^2\Big)$$
\item
Product of multivariate Gaussians (a numerical check follows this list)
\begin{eqnarray*}
N_X(\mu_a,\Sigma_a)\cdot N_X(\mu_b,\Sigma_b) &=& z_cN_X(\mu_c,\Sigma_c)\\
\Sigma_c &=& (\Sigma_a^{-1}+\Sigma_b^{-1})^{-1}\\
\mu_c &=& \Sigma_c(\Sigma_a^{-1}\mu_a +\Sigma_b^{-1}\mu_b)\\
z_c &=& |2\pi(\Sigma_a+\Sigma_b)|^{-\frac{1}{2}}\,\mathrm{exp} \left(-\frac{1}{2}(\mu_a-\mu_b)^T(\Sigma_a+\Sigma_b)^{-1}(\mu_a-\mu_b)\right)
\end{eqnarray*}
\end{itemize}
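The product formula above can be verified numerically. A minimal Python/NumPy sketch (illustrative only; it uses SciPy's multivariate normal density with randomly generated parameters):
\begin{verbatim}
import numpy as np
from scipy.stats import multivariate_normal

rng = np.random.default_rng(1)
d = 3

def random_cov(d):
    # Random symmetric positive definite matrix.
    A = rng.normal(size=(d, d))
    return A @ A.T + d * np.eye(d)

mu_a, mu_b = rng.normal(size=d), rng.normal(size=d)
S_a, S_b = random_cov(d), random_cov(d)

# Formulas from the notes.
S_c = np.linalg.inv(np.linalg.inv(S_a) + np.linalg.inv(S_b))
mu_c = S_c @ (np.linalg.inv(S_a) @ mu_a + np.linalg.inv(S_b) @ mu_b)
z_c = (np.linalg.det(2 * np.pi * (S_a + S_b)) ** -0.5
       * np.exp(-0.5 * (mu_a - mu_b) @ np.linalg.solve(S_a + S_b, mu_a - mu_b)))

# Check N(x; mu_a, S_a) * N(x; mu_b, S_b) = z_c * N(x; mu_c, S_c) at a random x.
x = rng.normal(size=d)
lhs = multivariate_normal.pdf(x, mu_a, S_a) * multivariate_normal.pdf(x, mu_b, S_b)
rhs = z_c * multivariate_normal.pdf(x, mu_c, S_c)
print(np.isclose(lhs, rhs))   # True
\end{verbatim}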
\subsubsection{Laplace Distribution}
\begin{itemize}
\item
Probability density function\\
$$f(x|\mu,b)= \displaystyle\frac{1}{2b} \mathrm{exp}\big(-\frac{|x-\mu|}{b}\big)$$
\item
Mean\\
$$\mu$$
\item
Variance\\
$$2b^2$$
\end{itemize}
\subsection{Linear Algebra}
\begin{itemize}
\item
Trace\\
$$\mathrm{Tr}(\mathbf{ABC}) = \mathrm{Tr}(\mathbf{BCA}) =\mathrm{Tr}(\mathbf{CAB})$$
$$||\mathbf{X}||^2_{\mathrm{F}} = \mathrm{Tr}(\mathbf{X^TX})$$
\end{itemize}
\subsection{Matrix Calculus}
\begin{itemize}
\item
Jacobian matrix\\
Suppose a function maps $\mathbbm{R}^n$ to $\mathbbm{R}^m$:
\begin{eqnarray}
J_{F}(x_1,\ldots,x_n) = \frac{\partial(y_1,\ldots,y_m)}{\partial(x_1,\ldots,x_n)} =
\left(
\begin{array}{ccc}
\frac{\partial y_1}{\partial x_1}&\ldots&\frac{\partial y_1}{\partial x_n}\\
\vdots&\ddots&\vdots\\
\frac{\partial y_m}{\partial x_1}&\ldots&\frac{\partial y_m}{\partial x_n}\\
\end{array}
\right)
\end{eqnarray}
\item
Chain Rules:\\
$\mathrm{Suppose}\ \mathbf{y} = f(\mathbf{u}), \mathbf{u} = g(\mathbf{x})$\\
\begin{eqnarray}
\frac{\partial(y_1,\ldots,y_m)}{\partial(x_1,\ldots,x_n)} =
\frac{\partial(y_1,\ldots,y_m)}{\partial(u_1,\ldots,u_k)} \frac{\partial(u_1,\ldots,u_k)}{\partial(x_1,\ldots,x_n)}
\end{eqnarray}
\item
Derivatives with respect to a vector (convention: the derivative of a column vector with respect to a scalar is a row vector):
\begin{eqnarray}
\mathbf{\frac{\partial u^TAv}{\partial x}} &=& \mathbf{\frac{\partial u}{\partial x}Av + \frac{\partial v}{\partial x}A^Tu}\\
\mathbf{\frac{\partial(u(x)+v(x))}{\partial x}} &=& \mathbf{\frac{\partial u(x)}{\partial x}+ \frac{\partial v(x)}{\partial x} }\\
\mathbf{\frac{\partial Ax}{\partial x}} &=& \mathbf{A^T}\\
\mathbf{\frac{\partial a}{\partial x}} &=& \mathbf{0} \\
\mathbf{\frac{\partial x^T Ab}{\partial x}} &=& \mathbf{Ab}\\
\mathbf{\frac{\partial x^TAx}{\partial x}} &=& \mathbf{(A+A^T)x}\\
&=& \mathbf{2Ax} \textrm{\qquad if $A$ is symmetric}
\end{eqnarray}
\item
Derivative of a determinant with respect to a matrix:
\begin{eqnarray}
\mathbf{\frac{\partial |X|}{\partial X}} &=& \mathbf{|X|(X^{-1})^T} \\
\mathbf{\frac{\partial \ln |X|}{\partial X}} &=& \mathbf{(X^{-1})^T}
%\mathbf{\frac{\partial}{\partial x}} &=& \mathbf{\frac{\partial}{\partial x}}
\end{eqnarray}
\item
Derivatives with respect to a matrix (a numerical check of one of these identities follows this list)\\
\begin{eqnarray}
\mathbf{\frac{\partial a^TXb}{\partial X}} &=& \mathbf{ab^T} \\
\mathbf{\frac{\partial a^TX^Tb}{\partial X}} &=& \mathbf{ba^T} \\
\mathbf{\partial X^T} &=& \mathbf{(\partial X)^T}\\
\partial(\mathrm{Tr}(\mathbf{X} )) &=& \mathrm{Tr}(\partial \mathbf{X})\\
\frac{\partial}{\partial\mathbf{X}}\mathrm{Tr}(\mathbf{X^TBX} ) &=& \mathbf{B^TX+BX }\\
\frac{\partial}{\partial\mathbf{X}}||\mathbf{X}||^2_{\mathrm{F}} &=& 2\mathbf{X}\\
\mathbf{\frac{\partial b^TX^TDXc}{\partial X}} &=& \mathbf{D^TXbc^T + DXcb^T}\\
\mathbf{\frac{\partial (Xb+c)^TD(Xb+c)}{\partial X}} &=& \mathbf{(D+D^T)(Xb+c)b^T}
\end{eqnarray}
\end{itemize}
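These identities can be sanity-checked with finite differences. A minimal sketch (illustrative) that verifies $\partial\,\mathrm{Tr}(\mathbf{X^TBX})/\partial\mathbf{X} = \mathbf{B^TX+BX}$ on random matrices:
\begin{verbatim}
import numpy as np

rng = np.random.default_rng(2)
n = 4
X = rng.normal(size=(n, n))
B = rng.normal(size=(n, n))

f = lambda M: np.trace(M.T @ B @ M)
analytic = B.T @ X + B @ X

# Numerical gradient by central differences, one entry at a time.
eps = 1e-6
numeric = np.zeros_like(X)
for i in range(n):
    for j in range(n):
        E = np.zeros_like(X)
        E[i, j] = eps
        numeric[i, j] = (f(X + E) - f(X - E)) / (2 * eps)

print(np.allclose(analytic, numeric))   # True
\end{verbatim}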
\newpage
\section{Information Theory}
\subsection{Differential Entropy}
\subsubsection{Common Differential Entropies}
\begin{itemize}
\item
\emph{Uniform distribution (from $0$ to $a$)}
\[
h(X) = \log(a)
\]
\item
\emph{Normal Distribution}
\[
X\sim N(0,\sigma^2)
\]
\[
h(X) = \frac{1}{2} \log (2\pi e \sigma^2)
\]
\item
\emph{Multivariate Normal Distribution} (a numerical check of these entropies follows this list)
$$X\sim N_n(\mu, K)$$
where $\mu$ is the mean and $K$ is the covariance matrix
\[
h(X_1,\ldots,X_n) = \frac{1}{2}\log\big((2\pi e )^n |K|\big)
\]
\end{itemize}
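These closed forms agree with SciPy's built-in differential entropy computations (in nats). A minimal sketch (illustrative):
\begin{verbatim}
import numpy as np
from scipy.stats import norm, multivariate_normal

# Univariate Gaussian: h(X) = 0.5 * log(2*pi*e*sigma^2).
sigma = 1.7
print(0.5 * np.log(2 * np.pi * np.e * sigma**2),
      norm(scale=sigma).entropy())

# Multivariate Gaussian: h(X) = 0.5 * log((2*pi*e)^n * |K|).
K = np.array([[2.0, 0.3],
              [0.3, 1.0]])
n = K.shape[0]
print(0.5 * np.log((2 * np.pi * np.e) ** n * np.linalg.det(K)),
      multivariate_normal(mean=np.zeros(n), cov=K).entropy())
\end{verbatim}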
\subsubsection{Properties}
\begin{itemize}
\item $h(X+c) = h(X)$\\
\item$h(aX) = h(X) + \log|a|$\\
\item$h(AX) = h(X) + \log\big|\mathrm{det}(A)\big|$
\end{itemize}
\newpage
\section{Machine Learning}
\subsection{MAXIMUM LIKELIHOOD ESTIMATION (MLE)}
\subsubsection{GAUSSIAN MLE}
$$\hat{\mu}_{\scriptscriptstyle ML} = \frac{1}{n}\sum_{i=1}^nx_i$$
$$\hat{\Sigma}_{\scriptscriptstyle ML} = \frac{1}{n}\sum_{i=1}^n(x_i - \hat{\mu}_{\scriptscriptstyle ML})(x_i - \hat{\mu}_{\scriptscriptstyle ML})^T$$
$$\mathbb{E}(\hat{\mu}_{\scriptscriptstyle ML}) = \mu$$
$$\mathbb{E}(\hat{\Sigma}_{\scriptscriptstyle ML}) = \frac{n-1}{n}\Sigma$$
So the mean estimate is unbiased, while the covariance estimate is biased.
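A quick simulation illustrates the $\frac{n-1}{n}$ bias factor. A minimal NumPy sketch (illustrative, in one dimension):
\begin{verbatim}
import numpy as np

rng = np.random.default_rng(3)
true_var, n, trials = 4.0, 5, 200_000

# Draw many datasets of size n and compute the ML variance (divide by n, not n-1).
x = rng.normal(scale=np.sqrt(true_var), size=(trials, n))
var_ml = ((x - x.mean(axis=1, keepdims=True)) ** 2).mean(axis=1)

print(var_ml.mean())            # approximately (n-1)/n * true_var
print((n - 1) / n * true_var)   # = 3.2
\end{verbatim}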
\subsection{Linear Regression}
\subsubsection{Least Squares}
Usually, for linear regression (and classification) we include an intercept term $w_0$ that does not interact with any element of the vector $x$. It will be
convenient to attach a 1 to the first dimension of each vector $x_i$:
\[
x_i = \left [
\begin{array}{c}
1\\
x_{i1}\\
\vdots\\
x_{id}\\
\end{array}
\right ]
, \qquad
\mathbf{X} = \left [
\begin{array}{cccc}
1 &x_{11}& \ldots & x_{1d}\\
1 &x_{21}& \ldots & x_{2d}\\
\vdots &\vdots& & \vdots\\
1 &x_{n1}& \ldots & x_{nd}\\
\end{array}
\right]
\]
In this case the solution is:
\[
w_{\scriptscriptstyle \mathrm{ML}} = (X^TX)^{-1}X^Ty
\]
The prediction at a new point is:
$$y_{\scriptscriptstyle\mathrm{new}} = x_{\scriptscriptstyle\mathrm{new}}^Tw_{\scriptscriptstyle \mathrm{ML}} $$
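A minimal sketch of the least-squares fit and prediction (illustrative; in practice a linear solve or np.linalg.lstsq is preferable to forming $(X^TX)^{-1}$ explicitly):
\begin{verbatim}
import numpy as np

rng = np.random.default_rng(4)
n, d = 100, 3
X = np.hstack([np.ones((n, 1)), rng.normal(size=(n, d))])  # prepend the column of 1s
w_true = rng.normal(size=d + 1)
y = X @ w_true + 0.1 * rng.normal(size=n)

# w_ML = (X^T X)^{-1} X^T y, computed via a linear solve.
w_ml = np.linalg.solve(X.T @ X, X.T @ y)

# Prediction at a new point (with the leading 1 attached).
x_new = np.concatenate([[1.0], rng.normal(size=d)])
y_new = x_new @ w_ml
print(w_ml, y_new)
\end{verbatim}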
\subsection{Classification}
\subsubsection{Bayes Classifier}
The Bayes classifier has the smallest prediction error of all classifiers.\\
Assume the pairs $(X,Y)$ are i.i.d.; then the optimal classifier is:
\begin{eqnarray}
f(x) = \argmax_{y\in \mathcal{Y}} P(Y=y|X=x)
\end{eqnarray}
Using Bayes rule and ignoring $P(X = x)$ we equivalently have:
\begin{eqnarray}
f(x) = \argmax_{y\in \mathcal{Y}} P(Y=y)\times P(X=x|Y=y)
\end{eqnarray}
where $P(Y=y)$ is called the class prior and $P(X=x|Y=y)$ is called the data likelihood given the class.
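As a concrete instance, with Gaussian class-conditional densities the classifier picks the class maximizing prior times likelihood. A minimal sketch (illustrative; the priors, means, and covariances are made up):
\begin{verbatim}
import numpy as np
from scipy.stats import multivariate_normal

# Illustrative two-class model: class priors and Gaussian class conditionals.
priors = {0: 0.7, 1: 0.3}
means  = {0: np.array([0.0, 0.0]), 1: np.array([2.0, 2.0])}
covs   = {0: np.eye(2),            1: np.eye(2)}

def bayes_classify(x):
    # f(x) = argmax_y P(Y=y) * P(X=x | Y=y)
    scores = {y: priors[y] * multivariate_normal.pdf(x, means[y], covs[y])
              for y in priors}
    return max(scores, key=scores.get)

print(bayes_classify(np.array([0.2, -0.1])))   # 0
print(bayes_classify(np.array([2.1, 1.8])))    # 1
\end{verbatim}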
\subsubsection{THE PERCEPTRON ALGORITHM}
\begin{itemize}
\item
Suppose there is a linear classifier with zero training error:
$$y_i = \mathrm{sign}(x_i^Tw)$$
Then the data is linearly separable.\\
\item
By using the linear classifier $y = \mathrm{sign}(x^Tw)$ the Perceptron seeks to minimize
$$\mathcal{L} = -\sum_{i=1}^n(y_{i}\cdot x_i^Tw)\mathbbm{1} \{y_i\not = \mathrm{sign}(x_i^Tw)\}$$
Because $y\in \{-1,+1\}$,
\[
y_i\cdot x_i^Tw \quad \mathrm{is}\quad
\left \{
\begin{array}{l}
> 0\quad \mathrm{if}\ y_i = \mathrm{sign}(x_i^Tw)\\
< 0\quad \mathrm{if}\ y_i \not= \mathrm{sign}(x_i^Tw)\\
\end{array}
\right .
\]
By minimizing $\mathcal{L}$ we are trying to always predict the correct label.
\item
$\nabla_w\mathcal{L} = 0 $ cannot be solved in closed form, so we minimize $\mathcal{L}$ iteratively by gradient descent, as sketched after this list
(here $\mathcal{M}_t$ denotes the index set of the examples misclassified at step $t$):
\begin{eqnarray}
\nabla_w\mathcal{L} = \sum_{i \in \mathcal{M}_t} -y_ix_i\\
w'\gets w -\eta \nabla_w\mathcal{L} \\
\mathcal{L}(w')<\mathcal{L}(w)
\end{eqnarray}
\item
Some issues with the Perceptron algorithm:\\
When the data is separable, there are infinitely many separating hyperplanes.\\
The algorithm does not take the ``quality'' of a hyperplane into account; it converges to the first one it finds.\\
When the data is not separable, the algorithm does not converge, and the hyperplane defined by $w$ keeps moving around.\\
\end{itemize}
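A minimal sketch of the batch gradient update described above, on a toy separable dataset (illustrative; labels are in $\{-1,+1\}$ and the intercept is absorbed into $w$ via a leading column of 1s):
\begin{verbatim}
import numpy as np

def perceptron(X, y, eta=1.0, max_iter=1000):
    """X: (n, d) with a leading column of 1s; y: labels in {-1, +1}."""
    w = np.zeros(X.shape[1])
    for _ in range(max_iter):
        # Indices misclassified by the current w (treat a score of 0 as wrong).
        mis = np.where(y * (X @ w) <= 0)[0]
        if len(mis) == 0:              # zero training error: stop
            return w
        # w <- w - eta * sum_{i in M_t} (-y_i x_i)
        w = w + eta * (y[mis, None] * X[mis]).sum(axis=0)
    return w                           # may not converge if data is not separable

rng = np.random.default_rng(5)
X = np.hstack([np.ones((40, 1)), rng.normal(size=(40, 2))])
y = np.sign(X @ np.array([0.2, 1.0, -1.5]))    # labels from a true hyperplane
w = perceptron(X, y)
print(np.mean(np.sign(X @ w) == y))            # typically 1.0 on separable data
\end{verbatim}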
\subsubsection{LOGISTIC REGRESSION}
\begin{itemize}
\item
The distance of $x$ to a hyperplane $x^Tw + w_0$ is:
$$\Big|\frac{x^Tw}{||w||_2} + \frac{w_0}{||w||_2}\Big|$$
\item
Since a hyperplane shares many properties with the log odds, we can model the log odds directly by a hyperplane. Note that, unlike the Bayes classifier above, where $w$ and $w_0$
were derived from the class prior and the class-conditional likelihood, here we place no assumptions on those distributions; this is a discriminative
classifier:
$$\ln \frac{P(y = +1|x)}{P(y = -1|x)} = x^Tw+w_0$$
$$P(y=+1|x) = \frac{\mathrm{exp}(x^Tw+w_0)}{1+\mathrm{exp}(x^Tw+w_0)}=\sigma(x^Tw+w_0)$$
where $\sigma$ is called the \emph{sigmoid function}; in this case $x^Tw+w_0$ is the \emph{link function} for the log odds.
\item
As with linear regression, we can absorb $w_0$ into $w$ by attaching a constant 1 as the first dimension of $x$:
$$
w\gets\left[
\begin{array}{c}
w_0\\w
\end{array}\right],\quad
x\gets\left[
\begin{array}{c}
1\\x
\end{array}
\right]
$$
$$x^Tw + w_0\gets x^Tw$$
\item
Data likelihood\\
The joint distribution of $y_1,\ldots,y_n$ is the product of the individual probabilities (assuming the $P(Y = y_i|X=x_i)$ are i.i.d.). Taking the log of the joint distribution gives the objective $\mathcal{L}$,
and the goal is to find the parameter
$w$ that maximizes it (this is exactly maximum likelihood)\\
$$\mathcal{L} = \sum_{i=1}^n\ln \sigma_i(y_i\cdot w)$$
$$w_{\scriptscriptstyle{ML}} = \argmax_{w}\mathcal{L}$$
\item
As with the Perceptron, we use the gradient to find the maximizer (here $\sigma_i(y_i\cdot w)$ denotes $P(Y = y_i|X=x_i)$); a minimal implementation sketch follows this list.\\
$$w^{(t+1)}=w^t + \eta \nabla_w\mathcal{L},\quad \quad \nabla_w\mathcal{L} = \sum_{i=1}^n\{1-\sigma_i(y_i\cdot w)\}y_ix_i$$
Sometimes, to avoid overfitting, the following regularized objective is used instead:
$$\mathcal{L} = \sum_{i=1}^n\ln \sigma_i(y_i\cdot w) - \lambda w^Tw$$
\item
The method above is called \emph{steepest ascent}; there is also a Newton method that uses the second derivative of $\mathcal{L}$, see lecture 9, page 14.\\
\item
For the Laplace approximation for logistic regression, see lecture 9, page 24.\\
\end{itemize}
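A minimal sketch of the steepest-ascent update (illustrative; it uses the gradient above with $\sigma_i(y_i\cdot w) = \sigma(y_i\,x_i^Tw)$, the ridge penalty $\lambda w^Tw$, and SciPy's expit as a numerically stable sigmoid):
\begin{verbatim}
import numpy as np
from scipy.special import expit   # sigmoid

def logistic_regression(X, y, eta=0.005, steps=2000, lam=0.1):
    """X: (n, d) with a leading column of 1s; y: labels in {-1, +1}."""
    w = np.zeros(X.shape[1])
    for _ in range(steps):
        s = expit(y * (X @ w))                    # sigma_i(y_i . w)
        grad = ((1.0 - s) * y) @ X - 2 * lam * w  # gradient of the regularized L
        w = w + eta * grad                        # steepest ascent
    return w

rng = np.random.default_rng(6)
X = np.hstack([np.ones((200, 1)), rng.normal(size=(200, 2))])
y = np.sign(X @ np.array([0.5, 2.0, -1.0]) + 0.3 * rng.normal(size=200))
w = logistic_regression(X, y)
print(np.mean(np.sign(X @ w) == y))               # training accuracy, close to 1
\end{verbatim}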
\subsection{Kernels}
\subsubsection{Definition}
A kernel $K(\cdot,\cdot):\mathbb{R}^d\times \mathbb{R}^d \to \mathbb{R}$\ is a symmetric function with the following property:\\
for any set of data $x_1,\ldots, x_n \in \mathbb{R}^d$, the $n \times n$ matrix $K$ with $K_{i,j} = K(x_i,x_j)$ is positive semidefinite; equivalently, there is a mapping
$\phi:\mathbb{R}^d \to \mathbb{R}^D$ such that $K(x_i,x_j) = \phi(x_i)^T\phi(x_j)$.
\subsubsection{Gaussian Kernel}
Also called \emph{radial basis function} (RBF),
$$K(x,x') = a\,\mathrm{exp}(-\frac{1}{b}||x-x'||^2)$$
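A minimal sketch that builds the RBF kernel matrix for a random dataset and checks that it is positive semidefinite (illustrative):
\begin{verbatim}
import numpy as np

def rbf_kernel(X1, X2, a=1.0, b=1.0):
    # K(x, x') = a * exp(-||x - x'||^2 / b)
    sq = ((X1[:, None, :] - X2[None, :, :]) ** 2).sum(-1)
    return a * np.exp(-sq / b)

rng = np.random.default_rng(7)
X = rng.normal(size=(50, 4))
K = rbf_kernel(X, X)
print(np.linalg.eigvalsh(K).min() >= -1e-10)   # True: eigenvalues (numerically) nonnegative
\end{verbatim}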
\subsubsection{Constructing New Kernels}
Suppose $K_1,K_2$ are two kernels; new kernels $K$ can be built from them:
\begin{eqnarray}
K(x,x') &=& K_1(x,x') + K_2(x,x')\\
K(x,x') &=& K_1(x,x')K_2(x,x')\\
K(x,x') &=& \mathrm{exp}\big(K_1(x,x')\big)
\end{eqnarray}
\subsubsection{Kernelized Perceptron}
\begin{itemize}
\item
For a Perceptron classifier,
$$w = \sum_{i \in \mathcal{M}}y_ix_i$$
where $\mathcal{M}$ is \emph{sequentially constructed} set of misclassified examples.
$$y_0 =\mathrm{sign}(x_0^Tw) = \mathrm{sign}(\sum_{i \in \mathcal{M}}y_ix_0^T x_i)$$
\item
Using feature expansions, we can rewrite the function into the form below:
$$y_0 =\mathrm{sign}(\phi(x_0)^Tw) = \mathrm{sign}\big (\sum_{i \in \mathcal{M}}y_i\phi(x_0)^T \phi(x_i)\big)$$
\item
Use the kernel replace the $\phi(x_0)^T \phi(x_i)$
$$y_0 = \mathrm{sign}\big (\sum_{i \in \mathcal{M}}y_i K(x_0,x_i)\big)$$
\item
Learning the kernelized Perceptron\\
This is essentially the same as the ordinary Perceptron, except that at each step $\mathcal{M}$ is updated with the points misclassified under the kernelized prediction rule; see lecture 10, page 15, and the sketch after this list.
\end{itemize}
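A minimal sketch of the kernelized Perceptron (illustrative; it sequentially grows the set $\mathcal{M}$ of misclassified training points and predicts with the kernel sum above):
\begin{verbatim}
import numpy as np

def train_kernel_perceptron(X, y, kernel, passes=10):
    """Returns the list M of (y_i, x_i) pairs entering the prediction sum."""
    M = []
    for _ in range(passes):
        for xi, yi in zip(X, y):
            score = sum(ym * kernel(xi, xm) for ym, xm in M)
            if yi * score <= 0:        # misclassified (or M still empty)
                M.append((yi, xi))     # sequentially grow M
    return M

def predict(x0, M, kernel):
    return np.sign(sum(ym * kernel(x0, xm) for ym, xm in M))

rbf = lambda u, v, b=1.0: np.exp(-np.sum((u - v) ** 2) / b)

rng = np.random.default_rng(8)
X = rng.normal(size=(60, 2))
y = np.sign(X[:, 0] ** 2 + X[:, 1] ** 2 - 1.5)   # not linearly separable in R^2
M = train_kernel_perceptron(X, y, rbf)
print(np.mean([predict(x, M, rbf) == t for x, t in zip(X, y)]))  # training accuracy
\end{verbatim}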
\subsubsection{Kernel k-NN}
See lecture 10, page 16.
$$ y_0 = \mathrm{sign}\big(\frac{1}{Z}\sum^n_{i=1}y_ie^{-\frac{1}{b}||x_0-x_i||^2}\big)$$
where $Z = \sum_{i=1}^n e^{-\frac{1}{b}||x_0 - x_i||^2}$ (since $Z>0$, the normalization does not change the sign).
\subsubsection{Gaussian Processes}
See lecture 10, page 19 onward, for details.
\begin{itemize}
\item
Definition\\
$f(x)$ is a Gaussian process and $y(x)$ is the noise-added process
where $y = (y_1,\ldots,y_n)^T$ and $K$ is $n\times n$ matrix with $K_{ij} = K(x_i,x_j)$:
$$y|f \sim N(f,\sigma^2I),\ f\sim N(0,K) \Leftrightarrow y\sim N(0,\sigma^2I+K)$$
\textbf{One way to think about it: }\emph{$f (x)$ is the GP and $y(x)$ equals $f (x)$ plus i.i.d.\ noise}
\item
Predictions with Gaussian Processes\\
Given measured data $\mathcal{D}_n = \{(x_1,y_1),\ldots,(x_n,y_n)\}$, the distribution of $y(x)$
can be calculated at any \emph{new x} to make predictions.\\
Let $K(x,\mathcal{D}_n) = [K(x,x_1),\ldots,K(x,x_n)]$ and let $K_n$ be the $n\times n$ kernel matrix
restricted to the points in $\mathcal{D}_n$:
\begin{eqnarray}
y(x)|\mathcal{D}_n &\sim& N(\mu(x),\Sigma(x))\\
\mu(x) &=& K(x,\mathcal{D}_n)(\sigma^2I + K_n)^{-1}y\\
\Sigma(x) &=& \sigma^2 + K(x,x) - K(x,\mathcal{D}_n) (\sigma^2I + K_n)^{-1} K(x,\mathcal{D}_n)^T
\end{eqnarray}
For the posterior of $f(x)$ instead of $y(x)$, simply drop the leading $\sigma^2$ in $\Sigma(x)$; a minimal prediction sketch follows this list.
\end{itemize}
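A minimal sketch of GP prediction on noisy one-dimensional data (illustrative; it uses an RBF kernel and the predictive equations above):
\begin{verbatim}
import numpy as np

def rbf(X1, X2, b=1.0):
    # K(x, x') = exp(-(x - x')^2 / b) for scalar inputs
    return np.exp(-(X1[:, None] - X2[None, :]) ** 2 / b)

rng = np.random.default_rng(9)
sigma2 = 0.1 ** 2
x_train = np.sort(rng.uniform(0, 5, size=30))
y_train = np.sin(x_train) + np.sqrt(sigma2) * rng.normal(size=30)
x_test = np.linspace(0, 5, 100)

K_n = rbf(x_train, x_train)                       # n x n kernel matrix on D_n
K_s = rbf(x_test, x_train)                        # K(x, D_n), one row per test x
A   = np.linalg.inv(sigma2 * np.eye(len(x_train)) + K_n)

mu    = K_s @ A @ y_train                         # predictive mean of y(x)
var_y = (sigma2 + rbf(x_test, x_test).diagonal()
         - np.einsum('ij,jk,ik->i', K_s, A, K_s)) # predictive variance of y(x)
print(mu[:3], var_y[:3])
\end{verbatim}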
\end{document}