## Group members:
## 1. Aditya Prabaswara Mardjikoen (s2264710)
## 2. Xiangtian Duan (s2248742)
## 3. Huiying Chen (s2264943)
## GitHub repository link:
## https://github.com/aprabaswara/BFGS-Optimizer.git
## Overview:
##---------------------------------------------------------------------------------------------------------------------------------------------
## Finite difference approximation reduces reliability, accuracy, and efficiency a little,
## but in many cases it is good enough. In this code we implement the BFGS quasi-Newton method
## to minimize an objective function starting from a supplied initial value. One advantage of
## this method is that we mostly deal with first order derivatives instead of second order
## derivatives. Here is a short overview of how the algorithm works:
## 1. Initialize the parameter vector, the gradient, and the approximate inverse Hessian matrix B,
##    which is usually set to the identity matrix or any matrix that approximates the inverse Hessian.
##
## 2. Check whether the gradient is approximately zero. If yes, terminate the process; if not, continue
##    the process until the gradient is approximately zero.
##
## 3. Compute the step delta by multiplying the negative of the approximate inverse Hessian with the
##    gradient at the current parameter value.
##
## 4. Check whether delta reduces the objective. If not, decrease delta until it reduces the objective.
##
## 5. Check whether delta satisfies the second Wolfe condition (written out after this overview). If not,
##    increase delta slightly until it satisfies this condition.
##
## 6. Set an upper limit on the number of delta adjustments, otherwise we could be stuck in this process
##    for too long. If we still cannot obtain an appropriate delta once the number of adjustments exceeds
##    the limit, we move on to the next step anyway.
##
## 7. Update theta by adding the new delta to the previous theta.
##
## 8. Compute the new matrix B using the BFGS update (formula given after this overview).
##
## 9. Repeat the whole process until the gradient approaches zero.
##
## 10. Throughout the process, we issue an error if the objective or its derivative is not finite at some
##     theta, if the objective function value is not decreased before convergence, or if the maximum
##     number of iterations is reached before convergence.
##
## 11. If no error occurs in point 10, compute the Hessian matrix at the optimal values by finite
##     differencing the gradient.
##
## 12. If no error occurs in point 10, also report the optimal value of the objective function, the
##     gradient, and the number of iterations taken to reach convergence.
##----------------------------------------------------------------------------------------------------------
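## For reference, the two formulas that the code below relies on (a restatement of what is implemented,
## not extra functionality): with step s = delta, gradient difference y = g(theta + s) - g(theta) and
## rho = 1 / (s^T y), the second Wolfe condition checked in step 5 is
##     g(theta + delta)^T delta >= c2 * g(theta)^T delta,  with c2 = 0.9,
## and the BFGS update of the approximate inverse Hessian used in step 8 is
##     B_new = (I - rho*s*y^T) B (I - rho*y*s^T) + rho*s*s^T.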
g <- function(theta,f,...){
## function: calculate gradient given the value of theta and function f
##
## input: theta: vector of initial values for the optimization parameter
## f : objective function
## ... : any argument of f after the parameter vector and gradient logical indicator;
## output: grad: gradient of f evaluated at theta
f0 <- f(theta,...) ## evaluate f once at theta (reused below)
if (is.null(attr(f0,"gradient"))){ ## compute gradient by finite difference approximation if the user didn't supply it
eps <- 1e-7 ## finite difference interval
grad <- numeric(length(theta)) ## initialize vector to store gradient values
for (i in 1:length(theta)) { ## loop over parameters
th <- theta; th[i] <- th[i] + eps ## increase theta by eps
f1 <- f(th,...) ## compute f at theta+eps
grad[i] <- (f1 - f0) / eps ## approximate first order derivative
}
}
else{ ## if the user supplies gradient
grad <- attr(f0,'gradient') ## get the gradient from f
}
return(grad)
}
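## Illustrative check of g() (an assumed example, not part of the original script): the hypothetical
## functions quad_fd and quad_exact below only demonstrate the two branches of g(), i.e. finite
## differencing versus a user-supplied "gradient" attribute.
quad_fd <- function(theta) sum(theta^2) ## simple quadratic, no gradient attribute supplied
quad_exact <- function(theta) { ## same quadratic, with its exact gradient attached as an attribute
  val <- sum(theta^2)
  attr(val, "gradient") <- 2 * theta ## exact gradient of sum(theta^2)
  val
}
## g(c(1, 2), quad_fd)    ## approximately c(2, 4), by finite differencing
## g(c(1, 2), quad_exact) ## exactly c(2, 4), taken from the "gradient" attribute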
bfgs <- function(theta, f, ..., tol = 1e-5, fscale = 1, maxit = 100){
## function: implementing the BFGS Quasi-Newton minimization method.
## input:
## theta = vector of initial values for the optimization parameter
## f = objective function to minimize
## Its first argument is the vector of optimization parameters
## Its second argument is a logical indicating whether or not gradients of the objective should also be returned (as a "gradient" attribute of the returned value)
## ... = any argument of f after the parameter vector and gradient logical indicator
## tol = the convergence tolerance;
## fscale = a rough estimate of the magnitude of f at the optimum (used in convergence testing)
## maxit = the maximum number of BFGS iterations to try before giving up
##
## output:(named list containing following element)
##
## f = scalar value of the objective function at the minimum
## theta = vector of values of the parameters at the minimum
## iter = number of iterations taken to reach the minimum
## g = gradient vector at the minimum
## H = approximate Hessian matrix (obtained by finite difference method) at the minimum.
c2 <- 0.9 ## constant for checking second Wolfe condition
param_num <- length(theta) ## length of vector theta
B <- diag(param_num) ## initialize the approximate inverse Hessian B as the identity matrix
iter <- 0 ## initialize iteration
f0 <- f(theta,...) ## initialize objective function values
grad <- g(theta,f,...) ## initialize gradient
## Before starting the whole process we have to check whether the initial objective function
## value and gradient are finite. If not, we cannot continue the BFGS algorithm.
if (!is.finite(f0)){ ## issue error if the objective function value is not finite
stop("Objective function is not finite at given initial theta value!")
}
else if (any(!is.finite(grad))){ ## issue error if the derivative is not finite
stop("Derivative is not finite at given initial theta value!")
}
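## Convergence test used below: the iteration stops when max(abs(grad)) falls below
## (abs(f(theta)) + fscale) * tol, i.e. when the gradient is approximately zero relative
## to the magnitude of the objective, or when the iteration limit maxit is exceeded.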
while(iter <= maxit && max(abs(grad)) >= (abs(f(theta,...)) + fscale) * tol){ ## while the iteration count is within
## the limit and convergence has not been
## reached, run the BFGS iteration
iter <- iter + 1 ## record the number of iterations
f0 <- f(theta,...) ## objective function value with initial value of parameters
grad <- g(theta,f,...) ## gradient of objective function with initial value of parameters
## In this part, we check whether the objective function value or any gradient
## component has become non-finite during the BFGS iterations. If so, we cannot
## continue the BFGS algorithm.
if (!is.finite(f0)){ ## issue error if the objective function value is not finite
stop("Objective function is not finite at the current theta value!")
}
else if (any(!is.finite(grad))){ ## issue error if the derivative is not finite
stop("Derivative is not finite at the current theta value!")
}
## propose a quasi-Newton step and compute the quantities needed to check the second Wolfe condition
delta <- drop(- B %*% grad) ## step direction: negative approximate inverse Hessian times gradient
f1 <- f(theta + delta,...) ## objective value at the proposed step
grad_delta1 <- drop(crossprod(grad,delta)) ## g(theta)^T delta (right side of Wolfe condition 2)
grad_delta2 <- drop(crossprod(g(theta + delta,f,...),delta)) ## g(theta + delta)^T delta (left side of Wolfe condition 2)
count <- 0 ## record the number of step adjustments in order to limit the number of cycles
while(!is.finite(f1) | f1 > f0 | grad_delta2 < c2 * grad_delta1) { ## cases where delta adjustment is required
count <- count + 1 ## update the number of adjustment trials
if(count > maxit){ ## terminate the loop if the number of trials exceeds the upper limit
break
}
else if(!is.finite(f1) | f1 > f0){ ## if the objective is not finite or delta
## does not decrease the objective function value
delta <- delta / 2 ## reduce delta
}
else if(grad_delta2 < c2 * grad_delta1){ ## if delta makes the objective function decrease but
## does not satisfy the second Wolfe condition
delta <- (1 + 0.1) * delta ## increase delta slightly
}
else{ ## terminate the adjustment if delta makes the objective function
## decrease and satisfies the second Wolfe condition
break
}
f1 <- f(theta + delta,...) ## update f1 using the new delta before re-checking the loop condition
grad_delta1 <- drop(crossprod(grad,delta)) ## the right side of Wolfe condition 2: g(theta)^T delta
grad_delta2 <- drop(crossprod(g(theta + delta,f,...),delta)) ## the left side of Wolfe condition 2: g(theta + delta)^T delta
}
s <- delta ## s is the final delta from the adjustment loop
theta_new <- theta + s ## update the new theta with s
## issue error if the step failed to reduce the objective function value before convergence
if (f(theta_new,...) > f(theta,...)){
stop('Step failed to reduce the objective value but convergence has not occurred!')
}
## compute the B matrix (approximate inverse Hessian)
y <- g(theta_new,f,...) - g(theta,f,...) ## difference in gradients at old and new parameters
inv_rho <- drop(crossprod(delta,y)) ## rho^(-1) = s^T*y
rho <- 1 / inv_rho ## 1 / rho^(-1)
s_yt <- tcrossprod(s,y) ## s*y^T
y_st <- tcrossprod(y,s) ## y*s^T
By <- drop(B %*% y) ## the vector B %*% y, so the update below only needs vector outer products
B <- B - rho * tcrossprod(By,s) - rho * tcrossprod(s,By) +
(rho^2 * drop(crossprod(y,By)) + rho) * tcrossprod(s) ## expanded BFGS update of B; working with the vector B %*% y
## keeps the cost at O(p^2) rather than O(p^3)
theta <- theta_new ## update parameter value
grad <- g(theta,f,...) ## the gradient at the updated parameters (used in the convergence test)
}
## issue error if convergence is not reached within the maximum number of iterations
if (max(abs(grad)) >= (abs(f(theta,...)) + fscale) * tol){
stop('Convergence not reached within the maximum number of iterations!')
}
H <- matrix(0, nrow = param_num, ncol = param_num) ## initialize Hessian matrix
g0 <- g(theta,f,...) ## gradient value at the minimum theta value
eps <- 1e-7 ## finite difference interval
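## Each row of H below is a forward-difference approximation of the gradient's derivative:
## H[i,] is approximately (g(theta + eps*e_i) - g(theta)) / eps, where e_i is the i-th unit vector.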
for (i in 1:param_num) { ## loop over parameters
th <- theta; th[i] <- th[i] + eps ## increase theta by eps
g1 <- g(th,f,...) ## gradient value at theta + eps
H[i,] <- (g1 - g0) / eps ## approximate second derivatives in Hessian matrix
}
## symmetrize the finite-difference Hessian if it is not already symmetric
if (!isSymmetric(H)){
H <- 0.5 * (t(H) + H)
}
f_optimum <- f(theta,...) ## scalar value of objective function at the minimum
bfgs_summary <- list(f = f_optimum, theta = theta, iter = iter, g = g0, H = H) ## create the named list
return(bfgs_summary) ## return the list
}
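## Illustrative test of bfgs() (an assumed example, not part of the original script): minimize
## Rosenbrock's function, whose minimum is at theta = c(1, 1) with objective value 0. The function
## rb below is a hypothetical test objective; its gradient is left to be approximated by finite
## differencing inside g().
rb <- function(theta, k = 10) {
  ## Rosenbrock's function: k*(theta2 - theta1^2)^2 + (1 - theta1)^2
  k * (theta[2] - theta[1]^2)^2 + (1 - theta[1])^2
}
## result <- bfgs(c(-1, 2), rb, k = 10)
## result$theta ## expected to be close to c(1, 1) if the run converges
## result$f     ## expected to be close to 0
## result$iter  ## number of BFGS iterations taken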