hierarchical_DAgger.py
#!/usr/bin/env python
import numpy as np
from Sawyer_DynaGoalEnv_DAgger import demoEnv # REFACTOR TO REMOVE GOALS
import rospy
# Leveraging demonstration is same as TD3
from std_srvs.srv import Empty, EmptyRequest
from running_mean_std import RunningMeanStd
from SetupSummary import SummaryManager_HIRO as SummaryManager
import pickle
import os
from collections import OrderedDict
from state_action_space import *
# intera imports
from intera_interface import CHECK_VERSION
from intera_interface import Limb
from intera_interface import Gripper
from tf_conversions import posemath
from tf.msg import tfMessage
from tf.transformations import quaternion_from_euler
from intera_core_msgs.msg import (
DigitalIOState,
DigitalOutputCommand,
IODeviceStatus
)
from geometry_msgs.msg import (
PoseStamped,
Pose,
Point,
Quaternion,
)
import time as timer
import time
from std_msgs.msg import Header
# paths for saving the running mean/std stats and the demo transition buffers
rms_path = '/home/irobot/catkin_ws/src/ddpg/scripts/ecsac_aux/'
demo_path = '/home/irobot/catkin_ws/src/ddpg/scripts/ecsac_aux/'
MAN_BUF_FNAME = 'demo_manager_buffer.bin'
CON_BUF_FNAME = 'demo_controller_buffer.bin'
class DemoReplayBuffer(object):
"""
A simple FIFO experience replay buffer for ECSAC + HIRO agents.
    For HIRO, we need separate replay buffers for the manager and controller policies.
    The low-level controller does not require state-action sequences;
    the high-level manager requires an extra buffer for state-action sequences.
"""
def __init__(self, obs_dim, stt_dim, act_dim, aux_dim, size, manager=False):
# 9 types of data in buffer
self.obs_buf = np.zeros(shape=(size,)+ obs_dim, dtype=np.float32) # o_t, (-1,100,100,3)
self.obs1_buf = np.zeros(shape=(size,)+ obs_dim, dtype=np.float32) # o_t+1, (-1,100,100,3)
self.g_buf = np.zeros(shape=(size , stt_dim), dtype=np.float32) # g_t, (-1,21)
self.g1_buf = np.zeros(shape=(size , stt_dim), dtype=np.float32) # g_t+1, (-1,21)
self.stt_buf = np.zeros(shape=(size , stt_dim), dtype=np.float32) # s_t (-1,21), joint pos, vel, eff
        self.stt1_buf = np.zeros(shape=(size, stt_dim), dtype=np.float32) # s_t+1, (-1,21), joint pos, vel, eff
if not manager:
self.act_buf = np.zeros(shape=(size , act_dim), dtype=np.float32) # a_t (-1,8)
else:
self.act_buf = np.zeros(shape=(size , stt_dim), dtype=np.float32)
self.rews_buf = np.zeros(shape=(size,), dtype=np.float32)
self.done_buf = np.zeros(shape=(size,), dtype=np.float32)
self.aux_buf = np.zeros(shape=(size , aux_dim), dtype=np.float32)
self.aux1_buf = np.zeros( shape=(size , aux_dim), dtype=np.float32)
self.ptr, self.size, self.max_size = 0, 0, size
def store(self, obs, obs1, g, g1, stt, stt1, act, aux, aux1, rew, done, manager=False):
"""store step transition in the buffer
"""
self.obs_buf[self.ptr] = obs
self.obs1_buf[self.ptr] = obs1
self.g_buf[self.ptr] = g
self.g1_buf[self.ptr] = g1
self.stt_buf[self.ptr] = stt
self.stt1_buf[self.ptr] = stt1
self.act_buf[self.ptr] = act
self.rews_buf[self.ptr] = rew
self.done_buf[self.ptr] = done
self.aux_buf[self.ptr] = aux
        self.aux1_buf[self.ptr] = aux1
if not manager:
self.ptr = (self.ptr+1) % self.max_size
self.size = min(self.size+1, self.max_size)
def get_episodic_subgoal(self, global_step, ep_len):
""" Return the subgoal and fullstate batch of the episode.
<s, s', g, g'>
        slicing index : buf[global_step - ep_len:global_step] => slice out only the transitions of this episode.
"""
return [self.stt_buf[global_step-ep_len:global_step], self.stt1_buf[global_step-ep_len:global_step],
self.g_buf[global_step-ep_len:global_step], self.g1_buf[global_step-ep_len:global_step]]
def return_buffer(self):
""" Returns the whole numpy arrays containing demo transitions.
"""
return {'data':[self.obs_buf[:self.ptr], self.obs1_buf[:self.ptr], self.g_buf[:self.ptr],
self.g1_buf[:self.ptr], self.stt_buf[:self.ptr], self.stt1_buf[:self.ptr],
self.act_buf[:self.ptr], self.rews_buf[:self.ptr], self.done_buf[:self.ptr],
self.aux_buf[:self.ptr], self.aux1_buf[:self.ptr]],
'size':self.size}
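# Illustrative usage sketch for DemoReplayBuffer (comments only, not executed by this script;
# the concrete dimensions are assumptions mirroring the defaults set in __main__ below):
#
#   buf = DemoReplayBuffer(obs_dim=(100, 100, 3), stt_dim=29, act_dim=8, aux_dim=3, size=int(1e4))
#   buf.store(obs, obs1, g, g1, stt, stt1, act, aux, aux1, rew, done)  # one transition per call
#   demo = buf.return_buffer()  # {'data': [obs, obs1, g, g1, stt, stt1, act, rews, done, aux, aux1], 'size': N}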
class DemoManagerReplayBuffer(DemoReplayBuffer):
"""
A simple FIFO experience replay buffer for ECSAC + HIRO agents.
    For HIRO, we need separate replay buffers for the manager and controller policies.
    The low-level controller does not require state-action sequences;
    the high-level manager requires an extra buffer for state-action sequences.
"""
def __init__(self, obs_dim, stt_dim, act_dim, aux_dim, size, seq_len):
"""full-state/ color_observation sequence lists' shape[1] is +1 longer than that of
action seqence -> they are stored as s_t:t+c/o_t:t+c, while action is stored as a_t:t+c
"""
super(DemoManagerReplayBuffer, self).__init__(obs_dim, stt_dim, act_dim, aux_dim, size, manager=True)
self.stt_seq_buf = np.zeros(shape=(size, seq_len+1, stt_dim), dtype=np.float32) # s_t (-1, 10+1, 21), joint pos, vel, eff
self.obs_seq_buf = np.zeros(shape=(size, seq_len+1,)+ obs_dim, dtype=np.float32) # o_t, (-1, 10+1, 100, 100, 3)
self.act_seq_buf = np.zeros(shape=(size, seq_len, act_dim), dtype=np.float32) # a_t (-1,10, 8)
def store(self, stt_seq, obs_seq, act_seq, *args, **kwargs):
"""store step transition in the buffer
"""
super(DemoManagerReplayBuffer, self).store(manager=True, *args, **kwargs)
self.stt_seq_buf[self.ptr] = np.array(stt_seq)
self.obs_seq_buf[self.ptr] = np.array(obs_seq)
self.act_seq_buf[self.ptr] = np.array(act_seq)
self.ptr = (self.ptr+1) % self.max_size
self.size = min(self.size+1, self.max_size)
def return_buffer(self):
""" Returns the whole numpy arrays containing demo transitions, for manager transitions
"""
return {'data':[self.obs_buf[:self.ptr], self.obs1_buf[:self.ptr], self.g_buf[:self.ptr],
self.g1_buf[:self.ptr], self.stt_buf[:self.ptr], self.stt1_buf[:self.ptr],
self.act_buf[:self.ptr], self.rews_buf[:self.ptr], self.done_buf[:self.ptr],
self.aux_buf[:self.ptr], self.aux1_buf[:self.ptr]],
'seq_data':[self.stt_seq_buf[:self.ptr], self.obs_seq_buf[:self.ptr], self.act_seq_buf[:self.ptr]],
'size':self.size}
def normalize(x, stats):
if stats is None:
return x
return (x - stats.mean) / stats.std
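# Hedged example of how `normalize` pairs with a RunningMeanStd tracker (comments only,
# not executed; the constructor signature below is an assumption about
# running_mean_std.RunningMeanStd, which must expose .mean, .std, and .update()):
#
#   stt_rms = RunningMeanStd(shape=(29,))      # running stats for the 29-dim full state
#   stt_rms.update(full_state_batch)           # full_state_batch: (batch, 29) array
#   normalized_stt = normalize(full_state, stt_rms)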
# def normalize_action(action_arr):
# lb_array = ACTION_LOW_BOUND*np.ones(action_arr.shape)
# hb_array = ACTION_HIGH_BOUND*np.ones(action_arr.shape)
# _norm_action = lb_array + (action_arr+1.0*np.ones(action_arr.shape))*0.5*(hb_array - lb_array)
# _norm_action = np.clip(_norm_action, lb_array, hb_array)
# _norm_action = _norm_action.reshape(action_arr.shape)
# return _norm_action
def randomize_world():
""" Domain randomization for the environment's light and the color of robot link.
"""
rospy.wait_for_service('/dynamic_world_service') # randomize the light in the gazebo world
dynamic_world_service_call = rospy.ServiceProxy('/dynamic_world_service', Empty)
change_env_request = EmptyRequest()
dynamic_world_service_call(change_env_request)
# rospy.wait_for_service('/colorize_world_service') # randomize the model colors in the gazebo world
# colorize_world_service_call = rospy.ServiceProxy('/colorize_world_service', Empty)
# colorize_env_request = EmptyRequest()
# colorize_world_service_call(colorize_env_request)
if __name__ == '__main__':
USE_CARTESIAN = True
USE_GRIPPER = True
IS_TRAIN = True # always true for DAgger script
# define observation dimensions
# for low_level controller
obs_dim = (100, 100, 3) # for actor in POMDP
    stt_dim = 21 # full state of the robot: joint positions, velocities, and efforts (ee pose/gripper appended below)
act_dim = 8 # 7 joint vels and gripper position
aux_dim = 3 # target object's position
# define joint vel_limits
action_space = (-1.0, 1.0)
    ee_dim = 7 # end-effector pose: 3-dim position + 4-dim quaternion
grip_dim = 1 # 1 dimension for the gripper position
# for high_level controller
    des_goal_dim = 21 # desired goal: joint positions, velocities, and efforts (ee pose/gripper appended below)
    sub_goal_dim = 21 # subgoal: joint positions, velocities, and efforts (ee pose/gripper appended below)
if USE_CARTESIAN: # append 7-dim
stt_dim += ee_dim
des_goal_dim += ee_dim
sub_goal_dim += ee_dim
if USE_GRIPPER: # append 1-dim
stt_dim += grip_dim
des_goal_dim += grip_dim
sub_goal_dim += grip_dim
# init node
# rospy.init_node('hierarchical_DAgger')
# demo quantity related
total_epi = 1
max_ep_len = 500
total_steps = total_epi * max_ep_len
    buffer_size = int(1e4) # 10,000 transitions: is it enough?
manager_propose_freq = 10
isDemo = True
ep_ret, ep_len = 0, 0
t = 0 # counted steps [0:total_steps - 1]
timesteps_since_manager = 0 # to count c-step elapse for training manager
timesteps_since_subgoal = 0 # to count c-step elapse for subgoal proposal
episode_num = 0 # incremental episode counter
done = True
reset = False
manager_temp_transition = list() # temp manager transition
# demoEnv inherits robotEnv
env = demoEnv(max_steps=max_ep_len, control_mode='velocity', isPOMDP=True, isGripper=USE_GRIPPER, isCartesian=USE_CARTESIAN, train_indicator=IS_TRAIN)
controller_buffer = DemoReplayBuffer(obs_dim=obs_dim, stt_dim=stt_dim, act_dim=act_dim, aux_dim=aux_dim, size=buffer_size)
manager_buffer = DemoManagerReplayBuffer(obs_dim=obs_dim, stt_dim=stt_dim, act_dim=act_dim, aux_dim=aux_dim, size=buffer_size, seq_len=manager_propose_freq)
obs_shape_list = [(100,100,3), (7), (7), (7), (1), (7), (3)]
summary_manager = SummaryManager(obs_shape_list=obs_shape_list) # manager for rms
# create instances for the arm and gripper
limb = Limb()
gripper = Gripper()
def update_rms(full_stt=None, c_obs=None, aux=None, act=None):
"""Update the mean/stddev of the running mean-std normalizers.
        Tracks running statistics for the color observation, full state, auxiliary observation, and action.
        Mind the shapes!
"""
summary_manager.s_t0_rms.update(c_obs) # c_obs
summary_manager.s_t1_rms.update(full_stt[:7]) # joint_pos
summary_manager.s_t2_rms.update(full_stt[7:14]) # joint_vel
summary_manager.s_t3_rms.update(full_stt[14:21]) # joint_eff
summary_manager.s_t4_rms.update(full_stt[21:22]) # gripper_position
summary_manager.s_t5_rms.update(full_stt[22:]) # ee_pose
summary_manager.s_t6_rms.update(aux) # aux
        summary_manager.a_t_rms.update(act) # action
def load_rms():
rospy.logwarn('Loads the mean and stddev for test time')
summary_manager.s_t0_rms.load_mean_std(rms_path+'mean_std0_demo.bin')
summary_manager.s_t1_rms.load_mean_std(rms_path+'mean_std1_demo.bin')
summary_manager.s_t2_rms.load_mean_std(rms_path+'mean_std2_demo.bin')
summary_manager.s_t3_rms.load_mean_std(rms_path+'mean_std3_demo.bin')
summary_manager.s_t4_rms.load_mean_std(rms_path+'mean_std4_demo.bin')
summary_manager.s_t5_rms.load_mean_std(rms_path+'mean_std5_demo.bin')
summary_manager.s_t6_rms.load_mean_std(rms_path+'mean_std6_demo.bin')
summary_manager.a_t_rms.load_mean_std(rms_path+'mean_std7_demo.bin')
def save_rms(step):
rospy.logwarn('Saves the mean and stddev @ step %d', step)
summary_manager.s_t0_rms.save_mean_std(rms_path+'mean_std0_demo.bin')
summary_manager.s_t1_rms.save_mean_std(rms_path+'mean_std1_demo.bin')
summary_manager.s_t2_rms.save_mean_std(rms_path+'mean_std2_demo.bin')
summary_manager.s_t3_rms.save_mean_std(rms_path+'mean_std3_demo.bin')
summary_manager.s_t4_rms.save_mean_std(rms_path+'mean_std4_demo.bin')
summary_manager.s_t5_rms.save_mean_std(rms_path+'mean_std5_demo.bin')
summary_manager.s_t6_rms.save_mean_std(rms_path+'mean_std6_demo.bin')
summary_manager.a_t_rms.save_mean_std(rms_path+'mean_std7_demo.bin')
def get_demo_temp_subgoal():
""" return the temporary subgoal for demo transition.
"""
return np.zeros(sub_goal_dim)
def demo_manager_sg_transition(manager_transition):
""" This function is called every C-step.
replace the temp subgoals in temp manager buffer with expert subgoals for the rollout batches.
The demo agent achieves the subgoal for every step, i.e. h = s + g - s'
g_t == s_t+c (action of the manager)
*manager_transition :
[s_seq, o_seq, a_seq, obs, obs_1, dg, dg_1, stt, stt_1, act, aux, aux_1, rew, done]
        replace the temp. subgoal (a_t of the manager) with s_t+c, the last state in the stored state sequence
"""
manager_transition[-5] = manager_transition[0][-1]
return manager_transition
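    # Index sketch for the 14-element manager transition handled above (layout taken from the
    # buffer-store comments in this file; explanatory comments only, nothing is executed):
    #
    #   [s_seq, o_seq, a_seq, obs, obs_1, dg, dg_1, stt, stt_1, act, aux, aux_1, rew, done]
    #      0      1      2     3     4     5    6    7     8     9    10    11    12    13
    #
    # so manager_transition[-5] is index 9 (the manager's action, i.e. the temp subgoal), and
    # manager_transition[0][-1] is the last state in the stored state sequence, s_t+c.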
def demo_controller_sg_transition(controller_buffer, global_step, ep_len):
""" This function is called at the end of each episode.
*controller_buffer :
controller_buffer.store(c_obs, next_c_obs, subgoal,
next_subgoal, full_stt, next_full_stt, action, aux, next_aux, intrinsic_reward, done)
subgoal for the controller -> for every C-step sequence,
* h = s + g - s'
g_t == s_t+c, g' = h(s,g,s'), g'' = h(s',g',s'') ...
*don't do subgoal_transition and intrinsic reward computation while doing rollouts.
controller transition (s_t||s_t+c & s_t+1||s_t+c+1) -> TD learning
in terms of controller -> g_t:t+c-1
index : 2 & 3
"""
# s_t+c should be the g_t for s_t. (e.g. s_10 == g_0 -> induces new proposal)
sb, s1b, gb, g1b = controller_buffer.get_episodic_subgoal(global_step, ep_len) # returns each batch of the transition e<s, s', g, g'>
# ep_len - ep_len % manager_propose_freq
        # Example: if an episode is 647 steps long, iterate until index 640 (last relabeled index 639). Then gb[640:647] should all be the terminal state.
# 1. replace the subgoal proposals in 'gb'
remainder = ep_len % manager_propose_freq
        for idx in range(0, ep_len - remainder, manager_propose_freq): # iterate over every full proposal window
gb[idx] = s1b[idx + manager_propose_freq - 1] # s1b[idx + manager_propose_freq - 1] has the s_(idx + manager_propose_freq)
for i in range(1, manager_propose_freq): #[t+1:t+c-1]
gb[idx + i] = env.env_goal_transition(sb[idx + i], s1b[idx + i], gb[idx])
        # 2. fill the remaining transitions with terminal-state observations
# here, gb[-1], gb[-2], ... gb[-7] = sT in example.
sT = s1b[-1]
for idx in range(1,remainder + 1):
gb[-idx] = sT
# 3. copy the gb into g1b, with the index offset of 1. Then the last element of g1b is sT.
g1b[:-1] = gb[1:]
g1b[-1] = sT
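    # Worked sketch of the relabeling above, assuming env.env_goal_transition(s, s', g)
    # implements the goal transition h = s + g - s' mentioned in the comments of this file
    # (explanatory comments only, nothing is executed). With c = 3 and scalar states:
    #
    #   gb[0] = s_3                                      # the window's proposal is the state c steps ahead
    #   gb[1] = h(s_1, s_2, gb[0]) = s_1 + gb[0] - s_2
    #   gb[2] = h(s_2, s_3, gb[0]) = s_2 + gb[0] - s_3
    #
    # The leftover tail of the episode (the last ep_len % c transitions) is filled with the
    # terminal state sT, and g1b is gb shifted forward by one step (g1b[i] = gb[i+1]) with its
    # last entry set to sT.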
def get_action():
""" return the action inference from the external controller.
"""
return env._get_demo_action()
    # divide the loop into two phases.
# 1. rollout (collecting normal transition data (s, a, r, s', d))
while not rospy.is_shutdown() and t <int(total_steps):
if done or ep_len== max_ep_len: # if an episode is finished (no matter done==True)
if t != 0:
# Process final state/obs, store manager transition i.e. state/obs @ t+c
if len(manager_temp_transition[2]) != 1:
# 0 1 2 3 4 5 6 7 8 9 10 11 12 13
# buffer store arguments : s_seq, o_seq, a_seq, obs, obs_1, dg, dg_1, stt, stt_1, act, aux, aux_1, rew, done
manager_temp_transition[4] = c_obs # save o_t+c
manager_temp_transition[8] = full_stt # save s_t+c
manager_temp_transition[11] = aux # save aux_t+c
manager_temp_transition[-1] = float(True) # done = True for manager, regardless of the episode
                    # make sure every manager transition has the same sequence length
# TODO: debug here...
                    if len(manager_temp_transition[0]) <= manager_propose_freq: # len(state_seq) should be propose_freq + 1 since we save s_t:t+c as a seq.
while len(manager_temp_transition[0]) <= manager_propose_freq:
manager_temp_transition[0].append(full_stt) # s_seq
manager_temp_transition[1].append(c_obs) # o_seq
                            manager_temp_transition[2].append(action) # a_seq
manager_temp_transition = demo_manager_sg_transition(manager_temp_transition)
demo_controller_sg_transition(controller_buffer, t, ep_len) # verify if the member of the buffer class is successfully modified.
# buffer store arguments : s_seq, o_seq, a_seq, obs, obs_1, dg, dg_1, stt, stt_1, act, aux, aux_1, rew, done
# print (manager_temp_transition[2])
manager_buffer.store(*manager_temp_transition) # off policy correction is done inside the manager_buffer
save_rms(step=t)
# reset the environment since an episode has been finished
obs = env.reset_demo() #'observation', 'desired_goal', g_des -> task specific goal!
done = False
reset = False
ep_len = 0 # length of the episode
ep_ret = 0 # episode return for the manager
ep_low_ret = 0 # return of the intrinsic reward for low level controller
episode_num += 1 # for every env.reset()
# process observations
full_stt = np.concatenate(obs['observation']['full_state'], axis=0) # s_0
c_obs = obs['observation']['color_obs'] #o_0
des_goal = np.concatenate(obs['desired_goal']['full_state'], axis=0) # g_des
            aux = obs['auxiliary'] # aux_0 (target object position)
# infer subgoal for low-level policy
# args for get_subgoal : obs, next_obs, sub_goal
            subgoal = get_demo_temp_subgoal() # shape = (sub_goal_dim,) -> 29-dim with Cartesian pose and gripper appended
timesteps_since_subgoal = 0
# apply noise on the subgoal
# create a temporal high-level transition : requires off-policy correction
# buffer store arguments : s_seq, o_seq, a_seq, obs, obs_1, dg, dg_1, stt, stt_1, act, aux, aux_1, rew, done
manager_temp_transition = [[full_stt], [c_obs], [], c_obs, None, des_goal, des_goal, full_stt, None, subgoal, aux, None, 0, False]
action = get_action() # a_t
# TODO: make action on the gripper as categorical policy
# action[-1] = reloc_rescale_gripper(action[-1])
next_obs, manager_reward, done = env.step_demo(action, time_step=ep_len) # reward R_t-> for high-level manager -> for sum(R_t:t+c-1)
randomize_world()
# update episodic logs
# Ignore the "done" signal if it comes from hitting the time
# horizon (that is, when it's an artificial terminal signal
        # that isn't based on the agent's state) -> keep done = False at the max timestep;
        # i.e., do NOT set done = True when the episode merely hits the timeout
ep_ret += manager_reward # reward in terms of achieving episodic task
done = False if ep_len== max_ep_len else done
if done:
            rospy.logwarn('=============== Now episode %d ends with done signal! ====================', episode_num)
        next_full_stt = np.concatenate(next_obs['observation']['full_state']) # s_t+1
        next_c_obs = next_obs['observation']['color_obs'] # o_t+1
        next_aux = next_obs['auxiliary'] # aux_t+1 (target object position)
# append manager transition
manager_temp_transition[-1] = float(True)
manager_temp_transition[-2] += manager_reward # sum(R_t:t+c)
manager_temp_transition[0].append(next_full_stt) # append s_seq
manager_temp_transition[1].append(next_c_obs) # append o_seq
manager_temp_transition[2].append(action) # append a_seq
# compute intrinsic reward
intrinsic_reward = env.compute_intrinsic_reward(full_stt, next_full_stt, subgoal)
# subgoal transition
next_subgoal = env.env_goal_transition(full_stt, next_full_stt, subgoal)
# add transition for low-level policy
        # (obs, obs1, sg, sg1, stt, stt1, act, aux, aux1, rew, done)
controller_buffer.store(c_obs, next_c_obs, subgoal,
next_subgoal, full_stt, next_full_stt, action, aux, next_aux, intrinsic_reward, done)
# update observations and subgoal
obs = next_obs
subgoal = next_subgoal
aux = next_aux
# update logging steps
ep_len += 1
t +=1
timesteps_since_manager += 1
timesteps_since_subgoal += 1
if timesteps_since_subgoal % manager_propose_freq == 0:
# for every c-step, renew the subgoal estimation from the manager policy.
timesteps_since_subgoal = 0
manager_temp_transition[4] = c_obs # save o_t+c
manager_temp_transition[8] = full_stt # save s_t+c
manager_temp_transition[11] = aux # save aux_t+c
manager_temp_transition[-1] = float(True) # done = True for manager, regardless of the episode
            # intentionally appending to the sequences is not required here since they always reach the full c steps.
# rospy.logwarn('Debug manager transitions. Debug manager transitions.')
# print (manager_temp_transition[2])
# rospy.logwarn('Debug manager transitions. Debug manager transitions.')
manager_buffer.store(*manager_temp_transition)
            subgoal = get_demo_temp_subgoal() # shape = (sub_goal_dim,) -> 29-dim with Cartesian pose and gripper appended
# Create a high level transition : note that the action of manager policy is subgoal
# buffer store arguments : s_seq, o_seq, a_seq, obs, obs_1, dg, dg_1, stt, stt_1, act, aux, aux_1, rew, done
manager_temp_transition = [[full_stt], [c_obs], [], c_obs, None, des_goal, des_goal, full_stt, None, subgoal, aux, None, 0, False]
# update running mean-std normalizer
update_rms(full_stt=full_stt, c_obs=c_obs, aux=aux, act=action) # do it.
# if all the demo episodes have ended.
os.chdir(demo_path)
if os.path.exists(MAN_BUF_FNAME):
        print ('Found an existing manager buffer file')
        ans = input("Delete the manager buffer? '1' / '0' ")
        rospy.logwarn('=======================================================')
        if str(ans).strip() == '1':
            os.remove(MAN_BUF_FNAME)
if os.path.exists(CON_BUF_FNAME):
        print ('Found an existing controller buffer file')
        ans = input("Delete the controller buffer? '1' / '0' ")
        if str(ans).strip() == '1':
            os.remove(CON_BUF_FNAME)
rospy.logwarn('=======================================================')
print ('Now saves the manager buffer in pickle format')
with open (MAN_BUF_FNAME, 'wb') as f:
pickle.dump(manager_buffer.return_buffer(), f)
print ('Now saves the controller buffer in pickle format')
with open (CON_BUF_FNAME, 'wb') as f2:
pickle.dump(controller_buffer.return_buffer(), f2)
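    # To consume the saved demos elsewhere (illustrative sketch; the dictionary layout follows
    # the return_buffer() outputs pickled above):
    #
    #   with open(os.path.join(demo_path, CON_BUF_FNAME), 'rb') as f:
    #       controller_demo = pickle.load(f)   # {'data': [...11 arrays...], 'size': N}
    #   with open(os.path.join(demo_path, MAN_BUF_FNAME), 'rb') as f:
    #       manager_demo = pickle.load(f)      # additionally holds 'seq_data': [stt_seq, obs_seq, act_seq]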