\documentclass{article} % use "amsart" instead of "article" for AMSLaTeX format
\usepackage{geometry} % See geometry.pdf to learn the layout options. There are lots.
\geometry{letterpaper} % ... or a4paper or a5paper or ...
%\geometry{landscape} % Activate for rotated page geometry
%\usepackage[parfill]{parskip} % Activate to begin paragraphs with an empty line rather than an indent
\usepackage{graphicx} % Use pdf, png, jpg, or eps with pdflatex; use eps in DVI mode
% TeX will automatically convert eps --> pdf in pdflatex
\usepackage{amssymb}
\usepackage[UTF8, heading = false, scheme = plain]{ctex}
\usepackage[colorlinks = false]{hyperref}
\usepackage{amsmath}
\usepackage{bbm}
\DeclareMathOperator*{\argmin}{arg\,min}
\DeclareMathOperator*{\argmax}{arg\,max}
\title{2016 Spring Notes}
\author{Yue Yu}
\begin{document}
\maketitle
\tableofcontents
\newpage
\section{Background Material}
\subsection{Probability Theory}
\subsubsection{Conditional Expectation (Law of Total Expectation)}
\begin{eqnarray}
E(X) = E(E(X|Y))
\end{eqnarray}
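This tower property is easy to check with a quick Monte Carlo simulation. A minimal Python/NumPy sketch (illustrative only; the particular model for $X$ and $Y$ is an arbitrary choice, not from these notes):
\begin{verbatim}
import numpy as np

rng = np.random.default_rng(0)
n = 1_000_000

# Illustrative model: Y ~ Exponential(1), X | Y ~ Normal(mean=Y, sd=2).
y = rng.exponential(scale=1.0, size=n)
x = rng.normal(loc=y, scale=2.0)

lhs = x.mean()   # E(X) estimated directly
rhs = y.mean()   # E(E(X|Y)) = E(Y) here, since E(X|Y) = Y

print(lhs, rhs)  # both approximately 1 = E(Y)
\end{verbatim}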
\subsubsection{Correlation Coefficient}
\begin{eqnarray}
\rho_{\scriptscriptstyle X,\scriptscriptstyle Y} &=& \frac{Cov(X,Y)}{\sigma_{\scriptscriptstyle X}\sigma_{\scriptscriptstyle Y}}\\
&=& \frac{E\big[(X-E(X))(Y-E(Y))\big]}{\sigma_{\scriptscriptstyle X}\sigma_{\scriptscriptstyle Y}}\\
&=& \frac{E(XY)-E(X)E(Y)}{\sigma_{\scriptscriptstyle X}\sigma_{\scriptscriptstyle Y}}
\end{eqnarray}
\subsubsection{Covariance Matrix}
\begin{eqnarray}
\mathrm{X} &=& [X_1,\ldots,X_n]^T\\
\Sigma &=& \mathrm{E}\Big[(\mathrm{X}-\mathrm{E}(\mathrm{X}))(\mathrm{X}-\mathrm{E}(\mathrm{X}))^T\Big]\\
\Sigma_{i,j} &=& Cov(X_i, X_j)\\
&=& \mathrm{E}\Big[(X_i - \mu_i)(X_j - \mu_j)\Big]
\end{eqnarray}
\subsubsection{Multivariate normal distribution}
\begin{itemize}
\item
Joint density of the multivariate Gaussian:
$$f_X(x_1,\ldots,x_n) = \frac{1}{\sqrt{(2\pi)^n|\Sigma|}}\mathrm{exp}(-\frac{1}{2}(x-\mu)^T\Sigma^{-1}(x-\mu))$$
\item
In the bivariate case ($\rho$ is the correlation coefficient):\\
$$f(x,y) = \frac{1}{2\pi\sigma_x\sigma_y\sqrt{1-\rho^2}}\mathrm{exp}\Big( -\frac{1}{2(1-\rho^2)}
\Big[\frac{(x-\mu_x)^2}{\sigma_x^2}+\frac{(y-\mu_y)^2}{\sigma_y^2} - \frac{2\rho(x-\mu_x)(y-\mu_y)}{\sigma_x\sigma_y}\Big]\Big)$$
The conditional distribution is:
$$P(X_1\big|X_2 =x_2)\sim N\Big(\mu_1 +\frac{\sigma_1}{\sigma_2}\rho(x_2-\mu_2),\,(1-\rho^2)\sigma_1^2\Big)$$
\item
Product of multivariate Gaussians (a numerical check follows this list)
\begin{eqnarray*}
N_X(\mu_a,\Sigma_a)\cdot N_X(\mu_b,\Sigma_b) &=& z_cN_X(\mu_c,\Sigma_c)\\
\Sigma_c &=& (\Sigma_a^{-1}+\Sigma_b^{-1})^{-1}\\
\mu_c &=& \Sigma_c(\Sigma_a^{-1}\mu_a +\Sigma_b^{-1}\mu_b)\\
z_c &=& |2\pi(\Sigma_a+\Sigma_b)|^{-\frac{1}{2}}\,\mathrm{exp} \left(-\frac{1}{2}(\mu_a-\mu_b)^T(\Sigma_a+\Sigma_b)^{-1}(\mu_a-\mu_b)\right)
\end{eqnarray*}
\end{itemize}
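The product formula above can be verified numerically. A minimal Python/NumPy sketch (illustrative only; it uses SciPy's multivariate normal density with randomly generated parameters):
\begin{verbatim}
import numpy as np
from scipy.stats import multivariate_normal

rng = np.random.default_rng(1)
d = 3

def random_cov(d):
    # Random symmetric positive definite matrix.
    A = rng.normal(size=(d, d))
    return A @ A.T + d * np.eye(d)

mu_a, mu_b = rng.normal(size=d), rng.normal(size=d)
S_a, S_b = random_cov(d), random_cov(d)

# Formulas from the notes.
S_c = np.linalg.inv(np.linalg.inv(S_a) + np.linalg.inv(S_b))
mu_c = S_c @ (np.linalg.inv(S_a) @ mu_a + np.linalg.inv(S_b) @ mu_b)
z_c = (np.linalg.det(2 * np.pi * (S_a + S_b)) ** -0.5
       * np.exp(-0.5 * (mu_a - mu_b) @ np.linalg.solve(S_a + S_b, mu_a - mu_b)))

# Check N(x; mu_a, S_a) * N(x; mu_b, S_b) = z_c * N(x; mu_c, S_c) at a random x.
x = rng.normal(size=d)
lhs = multivariate_normal.pdf(x, mu_a, S_a) * multivariate_normal.pdf(x, mu_b, S_b)
rhs = z_c * multivariate_normal.pdf(x, mu_c, S_c)
print(np.isclose(lhs, rhs))   # True
\end{verbatim}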
\subsubsection{Laplace Distribution}
\begin{itemize}
\item
Probability density function\\
$$f(x|\mu,b)= \displaystyle\frac{1}{2b} \mathrm{exp}\big(-\frac{|x-\mu|}{b}\big)$$
\item
Mean\\
$$\mu$$
\item
Variance\\
$$2b^2$$
\end{itemize}
\subsection{Linear Algebra}
\begin{itemize}
\item
Trace\\
$$\mathrm{Tr}(\mathbf{ABC}) = \mathrm{Tr}(\mathbf{BCA}) =\mathrm{Tr}(\mathbf{CAB})$$
$$||\mathbf{X}||^2_{\mathrm{F}} = \mathrm{Tr}(\mathbf{X^TX})$$
\end{itemize}
\subsection{Matrix Calculus}
\begin{itemize}
\item
Jacobian matrix\\
Suppose a function maps $\mathbbm{R}^n$ to $\mathbbm{R}^m$:
\begin{eqnarray}
J_{F}(x_1,\ldots,x_n) = \frac{\partial(y_1,\ldots,y_m)}{\partial(x_1,\ldots,x_n)} =
\left(
\begin{array}{ccc}
\frac{\partial y_1}{\partial x_1}&\ldots&\frac{\partial y_1}{\partial x_n}\\
\vdots&\ddots&\vdots\\
\frac{\partial y_m}{\partial x_1}&\ldots&\frac{\partial y_m}{\partial x_n}\\
\end{array}
\right)
\end{eqnarray}
\item
Chain Rules:\\
$\mathrm{Suppose}\ \mathbf{y} = f(\mathbf{u}), \mathbf{u} = g(\mathbf{x})$\\
\begin{eqnarray}
\frac{\partial(y_1,\ldots,y_m)}{\partial(x_1,\ldots,x_n)} =
\frac{\partial(y_1,\ldots,y_m)}{\partial(u_1,\ldots,u_k)} \frac{\partial(u_1,\ldots,u_k)}{\partial(x_1,\ldots,x_n)}
\end{eqnarray}
\item
Derivatives with respect to a vector (convention: the derivative of a column vector with respect to a scalar is a row vector):
\begin{eqnarray}
\mathbf{\frac{\partial u^TAv}{\partial x}} &=& \mathbf{\frac{\partial u}{\partial x}Av + \frac{\partial v}{\partial x}A^Tu}\\
\mathbf{\frac{\partial(u(x)+v(x))}{\partial x}} &=& \mathbf{\frac{\partial u(x)}{\partial x}+ \frac{\partial v(x)}{\partial x} }\\
\mathbf{\frac{\partial Ax}{\partial x}} &=& \mathbf{A^T}\\
\mathbf{\frac{\partial a}{\partial x}} &=& \mathbf{0} \\
\mathbf{\frac{\partial x^T Ab}{\partial x}} &=& \mathbf{Ab}\\
\mathbf{\frac{\partial x^TAx}{\partial x}} &=& \mathbf{(A+A^T)x}\\
&=& \mathbf{2Ax} \textrm{\qquad if $A$ is symmetric}
\end{eqnarray}
\item
Derivative of a determinant with respect to a matrix:
\begin{eqnarray}
\mathbf{\frac{\partial |X|}{\partial X}} &=& \mathbf{|X|(X^{-1})^T} \\
\mathbf{\frac{\partial \ln |X|}{\partial X}} &=& \mathbf{(X^{-1})^T}
%\mathbf{\frac{\partial}{\partial x}} &=& \mathbf{\frac{\partial}{\partial x}}
\end{eqnarray}
\item
Derivatives with respect to a matrix (a numerical check of one of these identities follows this list)\\
\begin{eqnarray}
\mathbf{\frac{\partial a^TXb}{\partial X}} &=& \mathbf{ab^T} \\
\mathbf{\frac{\partial a^TX^Tb}{\partial X}} &=& \mathbf{ba^T} \\
\mathbf{\partial X^T} &=& \mathbf{(\partial X)^T}\\
\partial(\mathrm{Tr}(\mathbf{X} )) &=& \mathrm{Tr}(\partial \mathbf{X})\\
\frac{\partial}{\partial\mathbf{X}}\mathrm{Tr}(\mathbf{X^TBX} ) &=& \mathbf{B^TX+BX }\\
\frac{\partial}{\partial\mathbf{X}}||\mathbf{X}||^2_{\mathrm{F}} &=& 2\mathbf{X}\\
\mathbf{\frac{\partial b^TX^TDXc}{\partial X}} &=& \mathbf{D^TXbc^T + DXcb^T}\\
\mathbf{\frac{\partial (Xb+c)^TD(Xb+c)}{\partial X}} &=& \mathbf{(D+D^T)(Xb+c)b^T}
\end{eqnarray}
\end{itemize}
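These identities can be sanity-checked with finite differences. A minimal sketch (illustrative) that verifies $\partial\,\mathrm{Tr}(\mathbf{X^TBX})/\partial\mathbf{X} = \mathbf{B^TX+BX}$ on random matrices:
\begin{verbatim}
import numpy as np

rng = np.random.default_rng(2)
n = 4
X = rng.normal(size=(n, n))
B = rng.normal(size=(n, n))

f = lambda M: np.trace(M.T @ B @ M)
analytic = B.T @ X + B @ X

# Numerical gradient by central differences, one entry at a time.
eps = 1e-6
numeric = np.zeros_like(X)
for i in range(n):
    for j in range(n):
        E = np.zeros_like(X)
        E[i, j] = eps
        numeric[i, j] = (f(X + E) - f(X - E)) / (2 * eps)

print(np.allclose(analytic, numeric))   # True
\end{verbatim}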
\newpage
\section{Information Theory}
\subsection{Differential Entropy}
\subsubsection{Common Differential Entropies}
\begin{itemize}
\item
\emph{Uniform distribution (from $0$ to $a$)}
\[
h(X) = \log(a)
\]
\item
\emph{Normal Distribution}
\[
X\sim N(0,\sigma^2)
\]
\[
h(X) = \frac{1}{2} \log (2\pi e \sigma^2)
\]
\item
\emph{Multivariate Normal Distribution} (a numerical check of these entropies follows this list)
$$X\sim N_n(\mu, K)$$
where $\mu$ is the mean and $K$ is the covariance matrix
\[
h(X_1,\ldots,X_n) = \frac{1}{2}\log\big((2\pi e )^n |K|\big)
\]
\end{itemize}
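These closed forms agree with SciPy's built-in differential entropy computations (in nats). A minimal sketch (illustrative):
\begin{verbatim}
import numpy as np
from scipy.stats import norm, multivariate_normal

# Univariate Gaussian: h(X) = 0.5 * log(2*pi*e*sigma^2).
sigma = 1.7
print(0.5 * np.log(2 * np.pi * np.e * sigma**2),
      norm(scale=sigma).entropy())

# Multivariate Gaussian: h(X) = 0.5 * log((2*pi*e)^n * |K|).
K = np.array([[2.0, 0.3],
              [0.3, 1.0]])
n = K.shape[0]
print(0.5 * np.log((2 * np.pi * np.e) ** n * np.linalg.det(K)),
      multivariate_normal(mean=np.zeros(n), cov=K).entropy())
\end{verbatim}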
\subsubsection{Properties}
\begin{itemize}
\item $h(X+c) = h(X)$\\
\item$h(aX) = h(X) + \log|a|$\\
\item$h(AX) = h(X) + \log\big|\mathrm{det}(A)\big|$
\end{itemize}
\newpage
\section{Machine Learning}
\subsection{MAXIMUM LIKELIHOOD ESTIMATION (MLE)}
\subsubsection{GAUSSIAN MLE}
$$\hat{\mu}_{\scriptscriptstyle ML} = \frac{1}{n}\sum_{i=1}^nx_i$$
$$\hat{\Sigma}_{\scriptscriptstyle ML} = \frac{1}{n}\sum_{i=1}^n(x_i - \hat{\mu}_{\scriptscriptstyle ML})(x_i - \hat{\mu}_{\scriptscriptstyle ML})^T$$
$$\mathbb{E}(\hat{\mu}_{\scriptscriptstyle ML}) = \mu$$
$$\mathbb{E}(\hat{\Sigma}_{\scriptscriptstyle ML}) = \frac{n-1}{n}\Sigma$$
So the mean estimate is unbiased, while the covariance estimate is biased.
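A quick simulation illustrates the $\frac{n-1}{n}$ bias factor. A minimal NumPy sketch (illustrative, in one dimension):
\begin{verbatim}
import numpy as np

rng = np.random.default_rng(3)
true_var, n, trials = 4.0, 5, 200_000

# Draw many datasets of size n and compute the ML variance (divide by n, not n-1).
x = rng.normal(scale=np.sqrt(true_var), size=(trials, n))
var_ml = ((x - x.mean(axis=1, keepdims=True)) ** 2).mean(axis=1)

print(var_ml.mean())            # approximately (n-1)/n * true_var
print((n - 1) / n * true_var)   # = 3.2
\end{verbatim}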
\subsection{Linear Regression}
\subsubsection{Least Squares}
Usually, for linear regression (and classification) we include an intercept term $w_0$ that does not interact with any element of the vector $x$. It will be
convenient to attach a 1 to the first dimension of each vector $x_i$:
\[
x_i = \left [
\begin{array}{c}
1\\
x_{i1}\\
\vdots\\
x_{id}\\
\end{array}
\right ]
, \qquad
\mathbf{X} = \left [
\begin{array}{cccc}
1 &x_{11}& \ldots & x_{1d}\\
1 &x_{21}& \ldots & x_{2d}\\
\vdots &\vdots& & \vdots\\
1 &x_{n1}& \ldots & x_{nd}\\
\end{array}
\right]
\]
In this case the solution is:
\[
w_{\scriptscriptstyle \mathrm{ML}} = (X^TX)^{-1}X^Ty
\]
The prediction at a new point is:
$$y_{\scriptscriptstyle\mathrm{new}} = x_{\scriptscriptstyle\mathrm{new}}^Tw_{\scriptscriptstyle \mathrm{ML}} $$
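A minimal sketch of the least-squares fit and prediction (illustrative; in practice a linear solve or np.linalg.lstsq is preferable to forming $(X^TX)^{-1}$ explicitly):
\begin{verbatim}
import numpy as np

rng = np.random.default_rng(4)
n, d = 100, 3
X = np.hstack([np.ones((n, 1)), rng.normal(size=(n, d))])  # prepend the column of 1s
w_true = rng.normal(size=d + 1)
y = X @ w_true + 0.1 * rng.normal(size=n)

# w_ML = (X^T X)^{-1} X^T y, computed via a linear solve.
w_ml = np.linalg.solve(X.T @ X, X.T @ y)

# Prediction at a new point (with the leading 1 attached).
x_new = np.concatenate([[1.0], rng.normal(size=d)])
y_new = x_new @ w_ml
print(w_ml, y_new)
\end{verbatim}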
\subsection{Classification}
\subsubsection{Bayes Classifier}
The Bayes classifier has the smallest prediction error of all classifiers.\\
Assume the pairs $(X,Y)$ are i.i.d.; then the optimal classifier is:
\begin{eqnarray}
f(x) = \argmax_{y\in \mathcal{Y}} P(Y=y|X=x)
\end{eqnarray}
Using Bayes rule and ignoring $P(X = x)$ we equivalently have:
\begin{eqnarray}
f(x) = \argmax_{y\in \mathcal{Y}} P(Y=y)\times P(X=x|Y=y)
\end{eqnarray}
where $P(Y=y)$ is called the class prior and $P(X=x|Y=y)$ is called the data likelihood given the class.
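As a concrete instance, with Gaussian class-conditional densities the classifier picks the class maximizing prior times likelihood. A minimal sketch (illustrative; the priors, means, and covariances are made up):
\begin{verbatim}
import numpy as np
from scipy.stats import multivariate_normal

# Illustrative two-class model: class priors and Gaussian class conditionals.
priors = {0: 0.7, 1: 0.3}
means  = {0: np.array([0.0, 0.0]), 1: np.array([2.0, 2.0])}
covs   = {0: np.eye(2),            1: np.eye(2)}

def bayes_classify(x):
    # f(x) = argmax_y P(Y=y) * P(X=x | Y=y)
    scores = {y: priors[y] * multivariate_normal.pdf(x, means[y], covs[y])
              for y in priors}
    return max(scores, key=scores.get)

print(bayes_classify(np.array([0.2, -0.1])))   # 0
print(bayes_classify(np.array([2.1, 1.8])))    # 1
\end{verbatim}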
\subsubsection{THE PERCEPTRON ALGORITHM}
\begin{itemize}
\item
Suppose there is a linear classifier with zero training error:
$$y_i = \mathrm{sign}(x_i^Tw)$$
Then the data is linearly separable.\\
\item
By using the linear classifier $y = \mathrm{sign}(x^Tw)$ the Perceptron seeks to minimize
$$\mathcal{L} = -\sum_{i=1}^n(y_{i}\cdot x_i^Tw)\mathbbm{1} \{y_i\not = \mathrm{sign}(x_i^Tw)\}$$
Because $y\in \{-1,+1\}$,
\[
y_i\cdot x_i^Tw \quad \mathrm{is}\quad
\left \{
\begin{array}{l}
> 0\quad \mathrm{if}\ y_i = \mathrm{sign}(x_i^Tw)\\
< 0\quad \mathrm{if}\ y_i \not= \mathrm{sign}(x_i^Tw)\\
\end{array}
\right .
\]
By minimizing $\mathcal{L}$ we are trying to always predict the correct label.
\item
$\nabla_w\mathcal{L} = 0 $ cannot be solved in closed form, so we minimize $\mathcal{L}$ iteratively by gradient descent, as sketched after this list
(here $\mathcal{M}_t$ denotes the index set of the examples misclassified at step $t$):
\begin{eqnarray}
\nabla_w\mathcal{L} = \sum_{i \in \mathcal{M}_t} -y_ix_i\\
w'\gets w -\eta \nabla_w\mathcal{L} \\
\mathcal{L}(w')<\mathcal{L}(w)
\end{eqnarray}
\item
Some issues with the Perceptron algorithm:\\
When the data is separable, there are infinitely many separating hyperplanes.\\
The algorithm does not take the ``quality'' of a hyperplane into account; it converges to the first one it finds.\\
When the data is not separable, the algorithm does not converge, and the hyperplane defined by $w$ keeps moving around.\\
\end{itemize}
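A minimal sketch of the batch gradient update described above, on a toy separable dataset (illustrative; labels are in $\{-1,+1\}$ and the intercept is absorbed into $w$ via a leading column of 1s):
\begin{verbatim}
import numpy as np

def perceptron(X, y, eta=1.0, max_iter=1000):
    """X: (n, d) with a leading column of 1s; y: labels in {-1, +1}."""
    w = np.zeros(X.shape[1])
    for _ in range(max_iter):
        # Indices misclassified by the current w (treat a score of 0 as wrong).
        mis = np.where(y * (X @ w) <= 0)[0]
        if len(mis) == 0:              # zero training error: stop
            return w
        # w <- w - eta * sum_{i in M_t} (-y_i x_i)
        w = w + eta * (y[mis, None] * X[mis]).sum(axis=0)
    return w                           # may not converge if data is not separable

rng = np.random.default_rng(5)
X = np.hstack([np.ones((40, 1)), rng.normal(size=(40, 2))])
y = np.sign(X @ np.array([0.2, 1.0, -1.5]))    # labels from a true hyperplane
w = perceptron(X, y)
print(np.mean(np.sign(X @ w) == y))            # typically 1.0 on separable data
\end{verbatim}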
\subsubsection{LOGISTIC REGRESSION}
\begin{itemize}
\item
The distance of $x$ to a hyperplane $x^Tw + w_0$ is:
$$\Big|\frac{x^Tw}{||w||_2} + \frac{w_0}{||w||_2}\Big|$$
\item
Since a hyperplane shares many properties with the log odds, we can model the log odds directly by a hyperplane. Note that, unlike the Bayes classifier above, where $w$ and $w_0$
were derived from the class prior and the class-conditional likelihood, here we place no assumptions on those distributions; this is a discriminative
classifier:
$$\ln \frac{P(y = +1|x)}{P(y = -1|x)} = x^Tw+w_0$$
$$P(y=+1|x) = \frac{\mathrm{exp}(x^Tw+w_0)}{1+\mathrm{exp}(x^Tw+w_0)}=\sigma(x^Tw+w_0)$$
where $\sigma$ is called the \emph{sigmoid function}; in this case $x^Tw+w_0$ is the \emph{link function} for the log odds.
\item
As with linear regression, we can absorb $w_0$ into $w$ by attaching a constant 1 as the first dimension of $x$:
$$
w\gets\left[
\begin{array}{c}
w_0\\w
\end{array}\right],\quad
x\gets\left[
\begin{array}{c}
1\\x
\end{array}
\right]
$$
$$x^Tw + w_0\gets x^Tw$$
\item
Data likelihood\\
The joint distribution of $y_1,\ldots,y_n$ is the product of the individual probabilities (assuming the $P(Y = y_i|X=x_i)$ are i.i.d.). Taking the log of the joint distribution gives the objective $\mathcal{L}$,
and the goal is to find the parameter
$w$ that maximizes it (this is exactly maximum likelihood)\\
$$\mathcal{L} = \sum_{i=1}^n\ln \sigma_i(y_i\cdot w)$$
$$w_{\scriptscriptstyle{ML}} = \argmax_{w}\mathcal{L}$$
\item
As with the Perceptron, we use the gradient to find the maximizer (here $\sigma_i(y_i\cdot w)$ denotes $P(Y = y_i|X=x_i)$); a minimal implementation sketch follows this list.\\
$$w^{(t+1)}=w^t + \eta \nabla_w\mathcal{L},\quad \quad \nabla_w\mathcal{L} = \sum_{i=1}^n\{1-\sigma_i(y_i\cdot w)\}y_ix_i$$
Sometimes, to avoid overfitting, the following regularized objective is used instead:
$$\mathcal{L} = \sum_{i=1}^n\ln \sigma_i(y_i\cdot w) - \lambda w^Tw$$
\item
The method above is called \emph{steepest ascent}; there is also a Newton method that uses the second derivative of $\mathcal{L}$, see lecture 9, page 14.\\
\item
For the Laplace approximation for logistic regression, see lecture 9, page 24.\\
\end{itemize}
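A minimal sketch of the steepest-ascent update (illustrative; it uses the gradient above with $\sigma_i(y_i\cdot w) = \sigma(y_i\,x_i^Tw)$, the ridge penalty $\lambda w^Tw$, and SciPy's expit as a numerically stable sigmoid):
\begin{verbatim}
import numpy as np
from scipy.special import expit   # sigmoid

def logistic_regression(X, y, eta=0.005, steps=2000, lam=0.1):
    """X: (n, d) with a leading column of 1s; y: labels in {-1, +1}."""
    w = np.zeros(X.shape[1])
    for _ in range(steps):
        s = expit(y * (X @ w))                    # sigma_i(y_i . w)
        grad = ((1.0 - s) * y) @ X - 2 * lam * w  # gradient of the regularized L
        w = w + eta * grad                        # steepest ascent
    return w

rng = np.random.default_rng(6)
X = np.hstack([np.ones((200, 1)), rng.normal(size=(200, 2))])
y = np.sign(X @ np.array([0.5, 2.0, -1.0]) + 0.3 * rng.normal(size=200))
w = logistic_regression(X, y)
print(np.mean(np.sign(X @ w) == y))               # training accuracy, close to 1
\end{verbatim}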
\subsection{Kernels}
\subsubsection{Definition}
A kernel $K(\cdot,\cdot):\mathbb{R}^d\times \mathbb{R}^d \to \mathbb{R}$\ is a symmetric function with the following property:\\
for any set of data $x_1,\ldots, x_n \in \mathbb{R}^d$, the $n \times n$ matrix $K$ with $K_{i,j} = K(x_i,x_j)$ is positive semidefinite; equivalently, there is a mapping
$\phi:\mathbb{R}^d \to \mathbb{R}^D$ such that $K(x_i,x_j) = \phi(x_i)^T\phi(x_j)$.
\subsubsection{Gaussian Kernel}
Also called \emph{radial basis function} (RBF),
$$K(x,x') = a\,\mathrm{exp}(-\frac{1}{b}||x-x'||^2)$$
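A minimal sketch that builds the RBF kernel matrix for a random dataset and checks that it is positive semidefinite (illustrative):
\begin{verbatim}
import numpy as np

def rbf_kernel(X1, X2, a=1.0, b=1.0):
    # K(x, x') = a * exp(-||x - x'||^2 / b)
    sq = ((X1[:, None, :] - X2[None, :, :]) ** 2).sum(-1)
    return a * np.exp(-sq / b)

rng = np.random.default_rng(7)
X = rng.normal(size=(50, 4))
K = rbf_kernel(X, X)
print(np.linalg.eigvalsh(K).min() >= -1e-10)   # True: eigenvalues (numerically) nonnegative
\end{verbatim}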
\subsubsection{Constructing New Kernels}
Suppose $K_1,K_2$ are two kernels; new kernels $K$ can be built from them:
\begin{eqnarray}
K(x,x') &=& K_1(x,x') + K_2(x,x')\\
K(x,x') &=& K_1(x,x')K_2(x,x')\\
K(x,x') &=& \mathrm{exp}\big(K_1(x,x')\big)
\end{eqnarray}
\subsubsection{Kernelized Perceptron}
\begin{itemize}
\item
For a Perceptron classifier,
$$w = \sum_{i \in \mathcal{M}}y_ix_i$$
where $\mathcal{M}$ is \emph{sequentially constructed} set of misclassified examples.
$$y_0 =\mathrm{sign}(x_0^Tw) = \mathrm{sign}(\sum_{i \in \mathcal{M}}y_ix_0^T x_i)$$
\item
Using feature expansions, we can rewrite the function into the form below:
$$y_0 =\mathrm{sign}(\phi(x_0)^Tw) = \mathrm{sign}\big (\sum_{i \in \mathcal{M}}y_i\phi(x_0)^T \phi(x_i)\big)$$
\item
Use the kernel replace the $\phi(x_0)^T \phi(x_i)$
$$y_0 = \mathrm{sign}\big (\sum_{i \in \mathcal{M}}y_i K(x_0,x_i)\big)$$
\item
Learning the kernelized Perceptron\\
This is essentially the same as the ordinary Perceptron, except that at each step $\mathcal{M}$ is updated with the points misclassified under the kernelized prediction rule; see lecture 10, page 15, and the sketch after this list.
\end{itemize}
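A minimal sketch of the kernelized Perceptron (illustrative; it sequentially grows the set $\mathcal{M}$ of misclassified training points and predicts with the kernel sum above):
\begin{verbatim}
import numpy as np

def train_kernel_perceptron(X, y, kernel, passes=10):
    """Returns the list M of (y_i, x_i) pairs entering the prediction sum."""
    M = []
    for _ in range(passes):
        for xi, yi in zip(X, y):
            score = sum(ym * kernel(xi, xm) for ym, xm in M)
            if yi * score <= 0:        # misclassified (or M still empty)
                M.append((yi, xi))     # sequentially grow M
    return M

def predict(x0, M, kernel):
    return np.sign(sum(ym * kernel(x0, xm) for ym, xm in M))

rbf = lambda u, v, b=1.0: np.exp(-np.sum((u - v) ** 2) / b)

rng = np.random.default_rng(8)
X = rng.normal(size=(60, 2))
y = np.sign(X[:, 0] ** 2 + X[:, 1] ** 2 - 1.5)   # not linearly separable in R^2
M = train_kernel_perceptron(X, y, rbf)
print(np.mean([predict(x, M, rbf) == t for x, t in zip(X, y)]))  # training accuracy
\end{verbatim}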
\subsubsection{Kernel k-NN}
See lecture 10, page 16.
$$ y_0 = \mathrm{sign}\big(\frac{1}{Z}\sum^n_{i=1}y_ie^{-\frac{1}{b}||x_0-x_i||^2}\big)$$
where $Z = \sum_{i=1}^n e^{-\frac{1}{b}||x_0 - x_i||^2}$ (since $Z>0$, the normalization does not change the sign).
\subsubsection{Gaussian Processes}
See lecture 10, page 19 onward, for details.
\begin{itemize}
\item
Definition\\
$f(x)$ is a Gaussian process and $y(x)$ is the noise-added process
where $y = (y_1,\ldots,y_n)^T$ and $K$ is $n\times n$ matrix with $K_{ij} = K(x_i,x_j)$:
$$y|f \sim N(f,\sigma^2I),\ f\sim N(0,K) \Leftrightarrow y\sim N(0,\sigma^2I+K)$$
\textbf{One way to think about it: }\emph{$f (x)$ is the GP and $y(x)$ equals $f (x)$ plus i.i.d.\ noise}
\item
Predictions with Gaussian Processes\\
Given measured data $\mathcal{D}_n = \{(x_1,y_1),\ldots,(x_n,y_n)\}$, the distribution of $y(x)$
can be calculated at any \emph{new x} to make predictions.\\
Let $K(x,\mathcal{D}_n) = [K(x,x_1),\ldots,K(x,x_n)]$ and let $K_n$ be the $n\times n$ kernel matrix
restricted to the points in $\mathcal{D}_n$:
\begin{eqnarray}
y(x)|\mathcal{D}_n &\sim& N(\mu(x),\Sigma(x))\\
\mu(x) &=& K(x,\mathcal{D}_n)(\sigma^2I + K_n)^{-1}y\\
\Sigma(x) &=& \sigma^2 + K(x,x) - K(x,\mathcal{D}_n) (\sigma^2I + K_n)^{-1} K(x,\mathcal{D}_n)^T
\end{eqnarray}
For the posterior of $f(x)$ instead of $y(x)$, simply drop the leading $\sigma^2$ in $\Sigma(x)$; a minimal prediction sketch follows this list.
\end{itemize}
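A minimal sketch of GP prediction on noisy one-dimensional data (illustrative; it uses an RBF kernel and the predictive equations above):
\begin{verbatim}
import numpy as np

def rbf(X1, X2, b=1.0):
    # K(x, x') = exp(-(x - x')^2 / b) for scalar inputs
    return np.exp(-(X1[:, None] - X2[None, :]) ** 2 / b)

rng = np.random.default_rng(9)
sigma2 = 0.1 ** 2
x_train = np.sort(rng.uniform(0, 5, size=30))
y_train = np.sin(x_train) + np.sqrt(sigma2) * rng.normal(size=30)
x_test = np.linspace(0, 5, 100)

K_n = rbf(x_train, x_train)                       # n x n kernel matrix on D_n
K_s = rbf(x_test, x_train)                        # K(x, D_n), one row per test x
A   = np.linalg.inv(sigma2 * np.eye(len(x_train)) + K_n)

mu    = K_s @ A @ y_train                         # predictive mean of y(x)
var_y = (sigma2 + rbf(x_test, x_test).diagonal()
         - np.einsum('ij,jk,ik->i', K_s, A, K_s)) # predictive variance of y(x)
print(mu[:3], var_y[:3])
\end{verbatim}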
\end{document}