lesson_2_code.py
import numpy as np; np.random.seed(6)
import matplotlib.pyplot as plt

class MultiArmedBandit():
    """
    A class that implements the N-armed testbed environment

    Attributes
    ----------
    levers : int
        number of levers, i.e., the number of possible actions in the environment
    q_star : numpy.ndarray
        value of each action, i.e., the mean reward obtained when the corresponding action is selected
    sampling_variance : int
        variance of the reward returned when an action is selected; the mean is q_star[action]

    Methods
    -------
    action( action )
        return the reward obtained by selecting the action 'action'; this value is obtained by
        sampling from a distribution with mean=q_star[action] and variance=self.sampling_variance
    """

    def __init__( self, levers ):
        # A possible implementation (sketch): as in the classic N-armed testbed,
        # the true action values q_star are drawn from a standard normal distribution
        # and the reward noise has unit variance.
        self.levers = levers
        self.q_star = np.random.normal( loc=0.0, scale=1.0, size=levers )
        self.sampling_variance = 1

    def action( self, action ):
        # Sample the reward around the true value of the selected lever; np.random.normal
        # takes the standard deviation, i.e., the square root of self.sampling_variance.
        return np.random.normal( loc=self.q_star[action], scale=np.sqrt(self.sampling_variance) )
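
# Quick sanity check (a sketch, assuming the implementation above): rewards for a
# fixed lever should scatter around its true value q_star[lever], e.g.:
#
#   env = MultiArmedBandit( levers=10 )
#   samples = [ env.action(3) for _ in range(1000) ]
#   print( np.mean(samples), env.q_star[3] )   # the two values should be close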

def banditAlgorithm( env, eps=0, maxiters=1000 ):
    """
    Implements the Simple Bandit Algorithm

    Args:
        env: instance of the multi-armed bandit environment
        eps: epsilon value for the eps-greedy policy (probability of selecting a random action)
        maxiters: number of steps to perform in the environment

    Returns:
        avg_reward: list of the rewards obtained during the training, averaged from the first step to the current one
        Q: the updated value function after the training
    """
    levers = env.levers
    Q = np.array( [0 for _ in range(levers)], dtype=float )
    N = np.array( [0 for _ in range(levers)], dtype=int )
    ep_reward = []; avg_reward = []

    for _ in range(maxiters):
        # A possible implementation (sketch) of the eps-greedy action selection:
        # explore with probability eps, otherwise pick the current greedy action.
        if np.random.random() < eps:
            a = np.random.randint( levers )
        else:
            a = Q.argmax()
        r = env.action( a )

        # Incremental sample-average update of the action-value estimate.
        N[a] += 1
        Q[a] += (r - Q[a]) / N[a]

        ep_reward.append( r )
        avg_reward.append( np.mean(ep_reward) )

    return avg_reward, Q
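
# Note on the update used in banditAlgorithm above (a standard identity):
# the incremental rule
#     Q[a] <- Q[a] + (r - Q[a]) / N[a]
# is equivalent to keeping the running average of all rewards observed for action a,
# without storing the full reward history.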

def main():
    print( "\n**************************************************" )
    print( "*  Welcome to the second lesson of the RL-Lab!  *" )
    print( "*             (Multi-Armed Bandit)              *" )
    print( "**************************************************" )

    # Hyperparameters
    n_armed = 10
    training_steps = 1000

    # Training phase
    print( f"\n{n_armed}-armed testbed with {training_steps} training_steps" )
    bandit = MultiArmedBandit( levers=n_armed )
    eps_00, q_00 = banditAlgorithm( bandit, eps=0.00, maxiters=training_steps )
    eps_01, q_01 = banditAlgorithm( bandit, eps=0.01, maxiters=training_steps )
    eps_10, q_10 = banditAlgorithm( bandit, eps=0.10, maxiters=training_steps )

    # Computing and printing the average reward of the last steps
    print( "\n\tLast episodes reward (with eps=0   ):", np.mean(eps_00[-20:]) )
    print( "\tLast episodes reward (with eps=0.01):", np.mean(eps_01[-20:]) )
    print( "\tLast episodes reward (with eps=0.1 ):", np.mean(eps_10[-20:]) )

    # Computing and printing the optimal action found
    print( "\n\tThe real optimal action is: ", bandit.q_star.argmax() )
    print( f"\tThe optimal actions found are: {q_00.argmax()} (eps=0), {q_01.argmax()} (eps=0.01), and {q_10.argmax()} (eps=0.1)" )

    # Repeat the experiment over 500 independent runs and average the learning curves
    eps_00_average, eps_01_average, eps_10_average = [], [], []
    for _ in range(500):
        bandit = MultiArmedBandit( levers=n_armed )
        eps_00_average.append( banditAlgorithm( bandit, eps=0.00, maxiters=training_steps )[0] )
        eps_01_average.append( banditAlgorithm( bandit, eps=0.01, maxiters=training_steps )[0] )
        eps_10_average.append( banditAlgorithm( bandit, eps=0.10, maxiters=training_steps )[0] )
    eps_00_average = np.average( np.array(eps_00_average), axis=0 )
    eps_01_average = np.average( np.array(eps_01_average), axis=0 )
    eps_10_average = np.average( np.array(eps_10_average), axis=0 )

    # Plot the results
    print( "\n\tPlotting data..." )
    t = np.arange( 0, training_steps )
    _, ax = plt.subplots()
    ax.plot( t, eps_00_average, label="eps: 0", linewidth=3 )
    ax.plot( t, eps_01_average, label="eps: 0.01", linewidth=3 )
    ax.plot( t, eps_10_average, label="eps: 0.1", linewidth=3 )
    plt.xlabel( "steps", fontsize=16 )
    plt.ylabel( "average reward", fontsize=16 )
    ax.grid()
    plt.ylim( [0, None] )
    plt.legend()
    plt.show()


if __name__ == "__main__":
    main()