"""
Versions of Gradient Temporal Difference Learning
Donghwan Lee, Han-Dong Lim, Jihoon Park, and Okyong Choi
"""
import numpy as np
import matplotlib.pyplot as plt

from lin_approx import LinApprox
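
# NOTE: lin_approx.LinApprox is defined elsewhere in this repository and is not
# shown here. Based on how it is used below, it is assumed to expose at least:
#   feature_vector_size, state_size, action_size : problem dimensions
#   gamma   : discount factor
#   d       : state distribution used to sample states (1-D, length state_size)
#   beta    : behavior policy; beta[s] is a distribution over actions
#   target  : target policy; target[s][a] is the probability of action a in state s
#   P_beta  : state transition probabilities under the behavior policy
#   phi     : feature representation; phi[s] is the feature vector of state s
#   reward  : reward[s] is the reward associated with state s
#   sol     : reference solution used to measure the error of each iterate
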
if __name__ == '__main__':
    mdp = LinApprox()
    # GTD2 parameters
    theta1 = np.random.rand(mdp.feature_vector_size, 1)
    lambda1 = np.random.rand(mdp.feature_vector_size, 1)
    # GTD3 parameters
    theta2 = np.random.rand(mdp.feature_vector_size, 1)
    lambda2 = np.random.rand(mdp.feature_vector_size, 1)
    # GTD4 parameters
    theta3 = np.random.rand(mdp.feature_vector_size, 1)
    lambda3 = np.random.rand(mdp.feature_vector_size, 1)
    steps = 50000
    error_vec1 = np.zeros(steps)
    error_vec2 = np.zeros(steps)
    error_vec3 = np.zeros(steps)
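    # Simulation loop: each iteration independently samples a transition
    # (state, action, next_state), with the state drawn from the distribution d
    # and the action from the behavior policy beta, then applies one update of
    # each of the three algorithms.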
    for step in range(steps):
        # Sample a state index in 0, 1, ..., state_size - 1 from the state
        # distribution d, an action from the behavior policy beta, and the
        # next state from the transition probabilities under beta.
        state = np.random.choice(mdp.state_size, p=mdp.d)
        action = np.random.choice(mdp.action_size, p=mdp.beta[state])
        next_state = np.random.choice(mdp.state_size, p=mdp.P_beta[:, state])
        # Feature vectors of the current and next state as column vectors
        phi_s = mdp.phi[state].reshape(-1, 1)
        phi_next = mdp.phi[next_state].reshape(-1, 1)
        # Importance sampling ratio
        rho = mdp.target[state][action] / mdp.beta[state][action]
        # Diminishing step size
        step_size = 10 / (step + 100)
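        # Each variant below performs a coupled stochastic update of a primary
        # weight vector theta_k and an auxiliary weight vector lambda_k; delta
        # is the importance-weighted temporal-difference error for the sampled
        # transition.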
        # GTD2 (off-policy)
        delta = rho * mdp.reward[state] + mdp.gamma * rho * (phi_next.T @ theta1) - phi_s.T @ theta1
        theta1 = theta1 + step_size * (phi_s - mdp.gamma * rho * phi_next) * (phi_s.T @ lambda1)
        lambda1 = lambda1 + step_size * (delta - phi_s.T @ lambda1) * phi_s
        # GTD3
        delta = rho * mdp.reward[state] + mdp.gamma * rho * (phi_next.T @ theta2) - phi_s.T @ theta2
        theta2 = theta2 + step_size * ((phi_s - mdp.gamma * rho * phi_next) * (phi_s.T @ lambda2)
                                       - phi_s * (phi_s.T @ theta2))
        lambda2 = lambda2 + step_size * delta * phi_s
        # GTD4
        # Diminishing regularization weight for the theta update
        sigma1 = 100 / (step + 1000)
        delta = rho * mdp.reward[state] + mdp.gamma * rho * (phi_next.T @ theta3) - phi_s.T @ theta3
        theta3 = theta3 + step_size * ((phi_s - mdp.gamma * rho * phi_next) * (phi_s.T @ lambda3)
                                       - sigma1 * phi_s * (phi_s.T @ theta3))
        lambda3 = lambda3 + step_size * (delta - phi_s.T @ lambda3) * phi_s
        # Distance of each iterate to the reference solution
        error_vec1[step] = np.linalg.norm(mdp.sol - theta1, 2)
        error_vec2[step] = np.linalg.norm(mdp.sol - theta2, 2)
        error_vec3[step] = np.linalg.norm(mdp.sol - theta3, 2)
    # Plot the error curves of the three algorithms on a log scale
    plt.plot(error_vec1, 'b', label='GTD2')
    plt.plot(error_vec2, 'r', label='GTD3')
    plt.plot(error_vec3, 'g', label='GTD4')
    plt.legend()
    plt.yscale("log")
    plt.savefig('result.png')
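    # plt.show() could be called here as well if an interactive display of the
    # figure is desired in addition to the saved result.png.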