# DQNAgent.py
# Forked from vuoristo/dqn-agent.
import gym
import numpy as np
import random
from collections import deque
from ExperienceMemory import ExperienceMemory
import time
import cv2
class DQNAgent(object):
    """Deep Q-learning agent that drives an OpenAI gym style environment.

    The agent relies on the following collaborators (as used by the code
    below):

    env   -- provides ``action_space.n``, ``reset()``, ``step(action)``
             returning a 4-tuple ``(observation, reward, done, info)`` and
             ``render()``
    model -- provides ``window_size``, ``grayscale``,
             ``reshape_observation(observation)``, ``get_q_value(obs)`` and
             ``train_net(ob0, actions, rewards, ob1, terminals)``
    """

    # Training-only reward shaping: when an episode's step counter equals
    # BONUS_STEP at the moment a step is taken, BONUS_REWARD is added to the
    # environment reward before the experience is stored.
    BONUS_STEP = 9999
    BONUS_REWARD = 300

    def __init__(self, env, model, max_episodes=200000, max_steps=1000000,
                 exp_buffer_size=40000, epsilon=0.9,
                 linear_epsilon_decay=True, epsilon_decay_steps=1.e6,
                 exponential_epsilon_decay=0.99, min_epsilon=0.01,
                 batch_size=512, render=True, warmup_steps=3e4,
                 update_freq=1, random_starts=1):
        """Set up the agent.

        arguments:
        env -- the OpenAI gym environment
        model -- model for Q-function approximation

        keyword arguments:
        max_episodes -- default 200000
        max_steps -- max number of steps per episode. default 1000000
        exp_buffer_size -- how many experiences to remember. default 40000
        epsilon -- initial probability to take random action. default 0.9
        linear_epsilon_decay -- enable linear decay. True: linear,
                                False: exponential. default True
        epsilon_decay_steps -- how many steps for the epsilon to decay from
                               its initial value to the minimum (linear
                               mode). default 1000000
        exponential_epsilon_decay -- exponential decay factor for epsilon.
                                     default 0.99
        min_epsilon -- minimum epsilon value. default 0.01
        batch_size -- number of elements in minibatch. default 512
        render -- enable environment rendering every timestep. default True
        warmup_steps -- how many steps to run before training and epsilon
                        decay start; all actions are random until then.
                        default 30000
        update_freq -- train the model whenever the global step count is a
                       multiple of this value. default 1
        random_starts -- stored on the agent; no random start steps are
                         currently performed. default 1
        """
        self.n_actions = env.action_space.n
        self.max_episodes = max_episodes
        self.max_steps = max_steps
        self.batch_size = batch_size
        self.exp_buffer_size = exp_buffer_size
        self.eps = epsilon
        # Remembered so the linear schedule spans exactly
        # epsilon_decay_steps steps from the real starting value.
        self._initial_epsilon = epsilon
        self.min_epsilon = min_epsilon
        self.linear_epsilon_decay = linear_epsilon_decay
        self.epsilon_decay_steps = epsilon_decay_steps
        self.exponential_epsilon_decay = exponential_epsilon_decay
        self.update_freq = update_freq
        # NOTE: pre-filled with 100 zeros, so the reported mean-100 is
        # pulled towards 0 until 100 episodes have been logged.
        self.reward_log = deque(100*[0], 100)
        self.window_size = model.window_size
        self.experiences = ExperienceMemory(exp_buffer_size)
        self.recent_observations = deque(maxlen=self.window_size)
        self.model = model
        self.env = env
        self.warmup_steps = warmup_steps
        self.warmup = True
        self.random_starts = random_starts
        self.render = render
        self.start_time = time.time()
        self.prev_time = time.time()

    def train(self):
        """ The training loop of the DQNAgent. Steps the environment,
        saves observations and calls model training.
        """
        self._run_episodes(training=True)

    def evaluate(self):
        """ Evaluate the agent. Runs the environment exactly like in
        training, but no experiences are saved and no model training is
        conducted. Warmup is switched off and epsilon is set to 0, so
        every action is the greedy one; both settings stay in effect on
        the agent afterwards.
        """
        self.warmup = False
        self.eps = 0
        self._run_episodes(training=False)

    def _run_episodes(self, training):
        """Run up to max_episodes episodes; learn only when training."""
        total_steps = 0
        for ep in range(self.max_episodes):
            step = 0
            rewards = 0
            first_observation = self.new_random_game()
            # recent_observations are episode specific
            self.recent_observations = deque(maxlen=self.window_size)
            self.append_to_recent_observations(first_observation)

            while step < self.max_steps:
                # select action according to the recent_observations
                action = self.select_action()
                observation, reward, done, _ = self.env.step(action)

                if training:
                    if step == self.BONUS_STEP:
                        reward += self.BONUS_REWARD
                    # Must happen before the new observation is appended:
                    # observations are saved with the same index as the
                    # action, reward and done following them.
                    self.save_experience(action, reward, done)

                self.append_to_recent_observations(observation)
                rewards += reward
                step += 1
                total_steps += 1

                if self.render:
                    self.env.render()

                if total_steps > self.warmup_steps:
                    self.warmup = False

                if training and not self.warmup:
                    if total_steps % self.update_freq == 0:
                        self.train_model()
                    self._decay_epsilon()

                if done:
                    now = time.time()
                    fps = self._steps_per_second(step, now - self.prev_time)
                    self.report(total_steps, step, rewards, ep, now, fps)
                    self.prev_time = time.time()
                    break

    def _decay_epsilon(self):
        """Apply one decay step to eps, never dropping below min_epsilon."""
        if self.eps <= self.min_epsilon:
            return
        if self.linear_epsilon_decay:
            span = self._initial_epsilon - self.min_epsilon
            self.eps -= span / self.epsilon_decay_steps
        else:
            self.eps *= self.exponential_epsilon_decay
        # The last decay step may overshoot the floor; clamp it.
        self.eps = max(self.eps, self.min_epsilon)

    @staticmethod
    def _steps_per_second(steps, elapsed):
        """Return steps / elapsed, or inf when no time has elapsed."""
        if elapsed <= 0:
            return float("inf")
        return float(steps) / float(elapsed)

    def new_random_game(self):
        """Reset the environment and return the first observation.

        NOTE(review): the value returned by env.reset() is indexed with
        [0]; this presumably expects reset() to return a sequence whose
        first element is the observation -- confirm against the
        environment in use. random_starts is not used here.
        """
        ob = self.env.reset()
        return ob[0]

    def select_action(self):
        """Selects action according to the recent observations.

        During warmup, or with probability eps, a uniformly random action
        index is returned. Otherwise the index of the largest value
        returned by model.get_q_value is returned.
        """
        if self.warmup or np.random.uniform(0, 1) < self.eps:
            return np.random.randint(self.n_actions)

        obs = list(self.recent_observations)
        missing = self.window_size - len(obs)
        if missing > 0:
            # Early in an episode the window is not full yet; pad the
            # front by repeating the oldest observation.
            obs = [obs[0]] * missing + obs
        q = self.model.get_q_value(obs)
        return np.argmax(q)

    def append_to_recent_observations(self, observation):
        """Reshape an observation with the model and append it to the window.

        When the model is not grayscale, the observation is split into
        three channels with cv2.split and the channels are concatenated
        along axis 0 before being stored.
        """
        observation = self.model.reshape_observation(observation)
        if not self.model.grayscale:
            # Names follow the original code; presumably the channel
            # order is BGR -- TODO confirm.
            b, g, r = cv2.split(observation)
            observation = np.concatenate((b, g, r), axis=0)
        self.recent_observations.append(observation)

    def save_experience(self, action, reward, done):
        """Store the newest observation with the given action, reward, done."""
        self.experiences.save_experience(self.recent_observations[-1],
                                         action, reward, done)

    def train_model(self):
        """Sample a minibatch from the experience memory and train the model."""
        mb_ob0, mb_ac, mb_re, mb_ob1, mb_term = \
            self.experiences.sample_minibatch(self.batch_size,
                                              self.window_size)
        self.model.train_net(mb_ob0, mb_ac, mb_re, mb_ob1, mb_term)

    def report(self, total_steps, steps, rewards, episode, time, fps):
        """Log an episode's reward and print a one-line progress summary.

        arguments:
        total_steps -- steps taken so far in the current run
        steps -- steps taken in the finished episode
        rewards -- summed reward of the finished episode
        episode -- index of the finished episode
        time -- current timestamp in seconds; the run time is computed
                relative to start_time (this name shadows the time module
                inside this method)
        fps -- steps per second to print
        """
        self.reward_log.append(rewards)
        m, s = divmod(int(time - self.start_time), 60)
        h, m = divmod(m, 60)
        print('Episode: {} Total steps: {}, steps: {}, reward: {} mean-100: '
              '{} epsilon: {}, fps: {}, time: {}:{}:{}'.format(
                  episode, total_steps, steps, rewards,
                  np.mean(self.reward_log), self.eps, fps, h, m, s))