Agent.py
from parameters import constants
from ale_python_interface import ALEInterface
from random import random, randrange
from improc import BilinearInterpolator2D
from utils import LockManager
from network import AgentComputation
import os
import numpy as np

def AgentProcess(rwlock, mainNet, criticNet, T_glob, T_lock, game_path, ident, init_learning_rate, barrier):
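    """Worker loop of one asynchronous learner process.

    Pins itself to CPU core `ident`, plays the Atari ROM at `game_path`
    through ALE, and acts epsilon-greedily using the shared networks wrapped
    in `mainNet` and `criticNet`. Gradients are accumulated locally and
    applied to the shared weights under `rwlock`; `T_glob` is the global
    frame counter protected by `T_lock`, and `barrier` synchronises the
    start of training across workers.
    """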
    # Pin this agent process to its own CPU core
    os.system('taskset -p -c ' + str(ident) + ' ' + str(os.getpid()))

    # Set up the game environment
    ale = ALEInterface()
    ale.setInt(b'random_seed', randrange(0, 256, 1))
    #ale.setBool(b'display_screen', True)
    ale.setBool(b'color_averaging', False)
    ale.loadROM(game_path)
    actions = ale.getMinimalActionSet()

    # Create the agent's computation graph on top of the shared weights wrapped in mainNet and criticNet
    computation = AgentComputation(mainNet, criticNet, 'computation_' + str(ident))

    # Per-agent log file
    f = open(constants.filebase + str(ident), 'w')

    t_lock = LockManager(T_lock.acquire, T_lock.release, constants.lock_T)
    writer_lock = LockManager(rwlock.writer_acquire, rwlock.writer_release, constants.lock_write)
    reader_lock = LockManager(rwlock.reader_acquire, rwlock.reader_release, constants.lock_read)
    t = 0
    scores = []

    # Determine this agent's final epsilon-greediness level
    rnd = random()
    if rnd < 0.4:
        epsilon_end = 0.1
    elif rnd < 0.7:
        epsilon_end = 0.01
    else:
        epsilon_end = 0.5

    # Downscale the 210x160 ALE frames to 84x84
    interpolator = BilinearInterpolator2D([210, 160], [84, 84])

    current_frame = np.empty([210, 160, 1], dtype=np.uint8)
    ale.getScreenGrayscale(current_frame)

    # A state is a stack of constants.action_repeat consecutive 84x84 frames
    next_state = np.empty([constants.action_repeat, 84, 84, 1], dtype=np.float32)
    interpolator.interpolate(current_frame, next_state[0])
    next_state[1:4] = next_state[0]
    score = 0

    # Epsilon decreases linearly from 1.0 to epsilon_end over the first
    # constants.final_e_frame frames
    epsilon = 1.0
    eps_decrease = (epsilon_end - 1.0) / constants.final_e_frame

    if ident == 0:
        computation.initialisedRMSVals = False

    with t_lock:
        T = T_glob.value
        T_glob.value += 1

    barrier.wait()
    f.write("After the barrier!\n")
    f.flush()
    while T < constants.nb_max_frames:
        state = next_state
        next_state = np.empty_like(state)

        # Determine epsilon for the current frame: it decreases linearly
        # from one to epsilon_end between frame 0 and frame constants.final_e_frame
        epsilon += eps_decrease
        epsilon = max(epsilon, epsilon_end)

        # Choose the current action following an epsilon-greedy policy
        rnd = random()
        if rnd < epsilon:
            action = randrange(0, len(actions))
        else:
            with reader_lock:
                action = computation.getBestAction(state.transpose(0, 3, 1, 2))[0]
        t += 1
        reward = 0
        i = 0
        # Repeat the same action constants.action_repeat times and accumulate the rewards
        while i < constants.action_repeat and not ale.game_over():
            reward += ale.act(actions[action])
            ale.getScreenGrayscale(current_frame)
            interpolator.interpolate(current_frame, next_state[i])
            i += 1
        # If the episode ended early, pad the state with the last observed frame
        while i < constants.action_repeat:
            next_state[i] = next_state[i - 1]
            i += 1
        score += reward

        # Clip the reward to {-1, 0, 1}
        discounted_reward = 0
        if reward > 0:
            discounted_reward = 1
        elif reward < 0:
            discounted_reward = -1

        if not ale.game_over():
            # Bootstrap with the critic's estimated value of the new state
            with reader_lock:
                discounted_reward += constants.discount_factor * computation.getCriticScore(next_state.transpose(0, 3, 1, 2))[0]

        # Accumulate the gradient of this transition locally
        computation.cumulateGradient(
            state.transpose(0, 3, 1, 2),
            action,
            discounted_reward, ident)

        if t != 0 and (t % constants.batch_size == 0 or ale.game_over()):
            # Learning rate is annealed linearly towards zero over training
            lr = init_learning_rate * (1 - T / constants.nb_max_frames)
            with writer_lock:
                computation.applyGradient(lr)
            t = 0
        # Periodically synchronise the critic (target) network with the main network
        if T % constants.critic_up_freq == 0:
            f.write("Update critic!\n")
            f.flush()
            with writer_lock:
                computation.update_critic()

        # Log some statistics about played games
        if ale.game_over():
            f.write("[" + str(ident) + "] Game ended with score of: " + str(score) + "\n")
            f.write("[" + str(ident) + "] T: " + str(T) + "\n")
            ale.reset_game()
            interpolator.interpolate(current_frame, next_state[0])
            next_state[1:4] = next_state[0]

            scores.append(score)
            if len(scores) >= constants.lenmoy:
                moy = sum(scores) / len(scores)
                f.write("Average score over the last " + str(constants.lenmoy) + " games for Agent " + str(ident) + ": " + str(moy) + "\n")
                f.flush()
                scores = []
            score = 0

        # Read and advance the shared frame counter
        with t_lock:
            T = T_glob.value
            T_glob.value += 1