From 17f37225f68ef2a1e640541bba62c5abf066f6da Mon Sep 17 00:00:00 2001 From: Wojciech Indyk Date: Fri, 28 Aug 2020 19:22:23 +0200 Subject: [PATCH 1/6] Automatic conversion via tf_upgrade_v2. --- btgym/algorithms/aac.py | 108 ++--- btgym/algorithms/math_utils.py | 16 +- btgym/algorithms/nn/ae.py | 24 +- btgym/algorithms/nn/layers.py | 62 +-- btgym/algorithms/nn/losses.py | 92 ++-- btgym/algorithms/nn/networks.py | 36 +- btgym/algorithms/policy/base.py | 40 +- btgym/algorithms/policy/meta.py | 42 +- btgym/algorithms/policy/stacked_lstm.py | 52 +-- btgym/algorithms/utils.py | 12 +- btgym/algorithms/worker.py | 32 +- btgym/monitor/tensorboard2.py | 20 +- btgym/research/b_vae_a3c.py | 64 +-- btgym/research/casual/aac.py | 16 +- btgym/research/casual_conv/layers.py | 20 +- btgym/research/casual_conv/networks.py | 14 +- btgym/research/encoder_test/aac.py | 70 +-- btgym/research/encoder_test/networks.py | 10 +- btgym/research/encoder_test/policy.py | 38 +- btgym/research/gps/aac.py | 2 +- btgym/research/gps/loss.py | 24 +- btgym/research/metalearn_2/_fwrnn_aac.py | 188 -------- btgym/research/metalearn_2/_mldg_batch.py | 530 ---------------------- btgym/research/metalearn_2/loss.py | 28 +- btgym/research/mldg/aac.py | 30 +- btgym/research/mldg/aac_1.py | 18 +- btgym/research/mldg/aac_1d.py | 10 +- btgym/research/mldg/aac_1s.py | 38 +- btgym/research/mldg/policy.py | 2 +- btgym/research/model_based/aac.py | 68 +-- btgym/research/policy_rl2.py | 2 +- setup.py | 2 +- 32 files changed, 496 insertions(+), 1214 deletions(-) diff --git a/btgym/algorithms/aac.py b/btgym/algorithms/aac.py index 1f00b47c..43a0539a 100644 --- a/btgym/algorithms/aac.py +++ b/btgym/algorithms/aac.py @@ -198,7 +198,7 @@ def __init__(self, self.random_seed = random_seed if self.random_seed is not None: np.random.seed(self.random_seed) - tf.set_random_seed(self.random_seed) + tf.compat.v1.set_random_seed(self.random_seed) self.log.debug('rnd_seed:{}, log_u_sample_(0,1]x5: {}'. format(random_seed, log_uniform([1e-10,1], 5))) @@ -405,7 +405,7 @@ def __init__(self, self.log.debug('started building graphs...') if self.use_global_network: # PS: - with tf.device(tf.train.replica_device_setter(1, worker_device=self.worker_device)): + with tf.device(tf.compat.v1.train.replica_device_setter(1, worker_device=self.worker_device)): self.network = pi_global = self._make_policy('global') if self.use_target_policy: self.network_prime = self._make_policy('global_prime') @@ -417,7 +417,7 @@ def __init__(self, # Worker: with tf.device(self.worker_device): - with tf.variable_scope(self.name): + with tf.compat.v1.variable_scope(self.name): self.local_network = pi = self._make_policy('local') if self.use_target_policy: @@ -429,7 +429,7 @@ def __init__(self, self.worker_device_callback_0() # if need more networks etc. 
# Meant for Batch-norm layers: - pi.update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS, scope='.*local.*') + pi.update_ops = tf.compat.v1.get_collection(tf.compat.v1.GraphKeys.UPDATE_OPS, scope='.*local.*') # Just in case: self.dummy_pi = self._make_dummy_policy() @@ -440,7 +440,7 @@ def __init__(self, self.log.debug('{}: {}'.format(v.name, v.get_shape())) # Learning rate annealing: - self.learn_rate_decayed = tf.train.polynomial_decay( + self.learn_rate_decayed = tf.compat.v1.train.polynomial_decay( self.opt_learn_rate, self.global_step + 1, self.opt_decay_steps, @@ -507,13 +507,13 @@ def _make_base_loss(self, pi, pi_prime, name='base', verbose=True): tensor holding estimated loss graph list of related summaries """ - with tf.name_scope(name): + with tf.compat.v1.name_scope(name): # On-policy AAC loss definition: - pi.on_pi_act_target = tf.placeholder( + pi.on_pi_act_target = tf.compat.v1.placeholder( tf.float32, [None, self.ref_env.action_space.one_hot_depth], name="on_policy_action_pl" ) - pi.on_pi_adv_target = tf.placeholder(tf.float32, [None], name="on_policy_advantage_pl") - pi.on_pi_r_target = tf.placeholder(tf.float32, [None], name="on_policy_return_pl") + pi.on_pi_adv_target = tf.compat.v1.placeholder(tf.float32, [None], name="on_policy_advantage_pl") + pi.on_pi_r_target = tf.compat.v1.placeholder(tf.float32, [None], name="on_policy_return_pl") clip_epsilon = tf.cast(self.clip_epsilon * self.learn_rate_decayed / self.opt_learn_rate, tf.float32) @@ -534,10 +534,10 @@ def _make_base_loss(self, pi, pi_prime, name='base', verbose=True): model_summaries = on_pi_summaries # Off-policy losses: - pi.off_pi_act_target = tf.placeholder( + pi.off_pi_act_target = tf.compat.v1.placeholder( tf.float32, [None, self.ref_env.action_space.one_hot_depth], name="off_policy_action_pl") - pi.off_pi_adv_target = tf.placeholder(tf.float32, [None], name="off_policy_advantage_pl") - pi.off_pi_r_target = tf.placeholder(tf.float32, [None], name="off_policy_return_pl") + pi.off_pi_adv_target = tf.compat.v1.placeholder(tf.float32, [None], name="off_policy_advantage_pl") + pi.off_pi_r_target = tf.compat.v1.placeholder(tf.float32, [None], name="off_policy_return_pl") if self.use_off_policy_aac: # Off-policy AAC loss graph mirrors on-policy: @@ -558,8 +558,8 @@ def _make_base_loss(self, pi, pi_prime, name='base', verbose=True): if self.use_pixel_control: # Pixel control loss: - pi.pc_action = tf.placeholder(tf.float32, [None, self.ref_env.action_space.tensor_shape[0]], name="pc_action") - pi.pc_target = tf.placeholder(tf.float32, [None, None, None], name="pc_target") + pi.pc_action = tf.compat.v1.placeholder(tf.float32, [None, self.ref_env.action_space.tensor_shape[0]], name="pc_action") + pi.pc_target = tf.compat.v1.placeholder(tf.float32, [None, None, None], name="pc_target") pc_loss, pc_summaries = self.pc_loss( actions=pi.pc_action, @@ -574,7 +574,7 @@ def _make_base_loss(self, pi, pi_prime, name='base', verbose=True): if self.use_value_replay: # Value function replay loss: - pi.vr_target = tf.placeholder(tf.float32, [None], name="vr_target") + pi.vr_target = tf.compat.v1.placeholder(tf.float32, [None], name="vr_target") vr_loss, vr_summaries = self.vr_loss( r_target=pi.vr_target, pi_vf=pi.vr_value, @@ -586,7 +586,7 @@ def _make_base_loss(self, pi, pi_prime, name='base', verbose=True): if self.use_reward_prediction: # Reward prediction loss: - pi.rp_target = tf.placeholder(tf.float32, [None, 3], name="rp_target") + pi.rp_target = tf.compat.v1.placeholder(tf.float32, [None, 3], name="rp_target") rp_loss, 
rp_summaries = self.rp_loss( rp_targets=pi.rp_target, @@ -613,7 +613,7 @@ def _make_train_op(self, pi, pi_prime, pi_global): """ # Each worker gets a different set of adam optimizer parameters: - self.optimizer = tf.train.AdamOptimizer(self.train_learn_rate, epsilon=1e-5) + self.optimizer = tf.compat.v1.train.AdamOptimizer(self.train_learn_rate, epsilon=1e-5) # self.optimizer = tf.train.RMSPropOptimizer( # learning_rate=train_learn_rate, @@ -624,10 +624,10 @@ def _make_train_op(self, pi, pi_prime, pi_global): # Clipped gradients: self.grads, _ = tf.clip_by_global_norm( - tf.gradients(self.loss, pi.var_list), + tf.gradients(ys=self.loss, xs=pi.var_list), 40.0 ) - self.grads_global_norm = tf.global_norm(self.grads) + self.grads_global_norm = tf.linalg.global_norm(self.grads) # Copy weights from the parameter server to the local model self.sync = self.sync_pi = tf.group( *[v1.assign(v2) for v1, v2 in zip(pi.var_list, pi_global.var_list)] @@ -649,7 +649,7 @@ def _make_train_op(self, pi, pi_prime, pi_global): stream = pi.on_state_in['external'][list(pi.on_state_in['external'].keys())[0]] else: stream = pi.on_state_in['external'] - self.inc_step = self.global_step.assign_add(tf.shape(stream)[0]) + self.inc_step = self.global_step.assign_add(tf.shape(input=stream)[0]) train_op = self.optimizer.apply_gradients(grads_and_vars) self.log.debug('train_op defined') @@ -666,71 +666,71 @@ def _combine_summaries(self, policy=None, model_summaries=None): if model_summaries is not None: if self.use_global_network: # Model-wide statistics: - with tf.name_scope('model'): + with tf.compat.v1.name_scope('model'): model_summaries += [ - tf.summary.scalar("grad_global_norm", self.grads_global_norm), + tf.compat.v1.summary.scalar("grad_global_norm", self.grads_global_norm), # TODO: add gradient variance summary #tf.summary.scalar("learn_rate", self.train_learn_rate), - tf.summary.scalar("learn_rate", self.learn_rate_decayed), # cause actual rate is a jaggy due to test freezes - tf.summary.scalar("total_loss", self.loss), + tf.compat.v1.summary.scalar("learn_rate", self.learn_rate_decayed), # cause actual rate is a jaggy due to test freezes + tf.compat.v1.summary.scalar("total_loss", self.loss), # tf.summary.scalar('roll_reward', tf.reduce_mean(self.local_network.on_last_reward_in)), # tf.summary.scalar('roll_advantage', tf.reduce_mean(self.local_network.on_pi_adv_target)), ] if policy is not None: - model_summaries += [tf.summary.scalar("var_global_norm", tf.global_norm(policy.var_list))] + model_summaries += [tf.compat.v1.summary.scalar("var_global_norm", tf.linalg.global_norm(policy.var_list))] else: model_summaries = [] # Model stat. 
summary: - model_summary = tf.summary.merge(model_summaries, name='model_summary') + model_summary = tf.compat.v1.summary.merge(model_summaries, name='model_summary') # Episode-related summaries: ep_summary = dict( # Summary placeholders - render_atari=tf.placeholder(tf.uint8, [None, None, None, 1]), - total_r=tf.placeholder(tf.float32, ), - cpu_time=tf.placeholder(tf.float32, ), - final_value=tf.placeholder(tf.float32, ), - steps=tf.placeholder(tf.int32, ), + render_atari=tf.compat.v1.placeholder(tf.uint8, [None, None, None, 1]), + total_r=tf.compat.v1.placeholder(tf.float32, ), + cpu_time=tf.compat.v1.placeholder(tf.float32, ), + final_value=tf.compat.v1.placeholder(tf.float32, ), + steps=tf.compat.v1.placeholder(tf.int32, ), ) if self.test_mode: # For Atari: - ep_summary['render_op'] = tf.summary.image("model/state", ep_summary['render_atari']) + ep_summary['render_op'] = tf.compat.v1.summary.image("model/state", ep_summary['render_atari']) else: # BTGym rendering: ep_summary.update( { - mode: tf.placeholder(tf.uint8, [None, None, None, None], name=mode + '_pl') + mode: tf.compat.v1.placeholder(tf.uint8, [None, None, None, None], name=mode + '_pl') for mode in self.env_list[0].render_modes + self.aux_render_modes } ) - ep_summary['render_op'] = tf.summary.merge( - [tf.summary.image(mode, ep_summary[mode]) + ep_summary['render_op'] = tf.compat.v1.summary.merge( + [tf.compat.v1.summary.image(mode, ep_summary[mode]) for mode in self.env_list[0].render_modes + self.aux_render_modes] ) # Episode stat. summary: - ep_summary['btgym_stat_op'] = tf.summary.merge( + ep_summary['btgym_stat_op'] = tf.compat.v1.summary.merge( [ - tf.summary.scalar('episode_train/total_reward', ep_summary['total_r']), - tf.summary.scalar('episode_train/cpu_time_sec', ep_summary['cpu_time']), - tf.summary.scalar('episode_train/final_value', ep_summary['final_value']), - tf.summary.scalar('episode_train/env_steps', ep_summary['steps']) + tf.compat.v1.summary.scalar('episode_train/total_reward', ep_summary['total_r']), + tf.compat.v1.summary.scalar('episode_train/cpu_time_sec', ep_summary['cpu_time']), + tf.compat.v1.summary.scalar('episode_train/final_value', ep_summary['final_value']), + tf.compat.v1.summary.scalar('episode_train/env_steps', ep_summary['steps']) ], name='episode_train_btgym' ) # Test episode stat. summary: - ep_summary['test_btgym_stat_op'] = tf.summary.merge( + ep_summary['test_btgym_stat_op'] = tf.compat.v1.summary.merge( [ - tf.summary.scalar('episode_test/total_reward', ep_summary['total_r']), - tf.summary.scalar('episode_test/final_value', ep_summary['final_value']), - tf.summary.scalar('episode_test/env_steps', ep_summary['steps']) + tf.compat.v1.summary.scalar('episode_test/total_reward', ep_summary['total_r']), + tf.compat.v1.summary.scalar('episode_test/final_value', ep_summary['final_value']), + tf.compat.v1.summary.scalar('episode_test/env_steps', ep_summary['steps']) ], name='episode_test_btgym' ) - ep_summary['atari_stat_op'] = tf.summary.merge( + ep_summary['atari_stat_op'] = tf.compat.v1.summary.merge( [ - tf.summary.scalar('episode/total_reward', ep_summary['total_r']), - tf.summary.scalar('episode/steps', ep_summary['steps']) + tf.compat.v1.summary.scalar('episode/total_reward', ep_summary['total_r']), + tf.compat.v1.summary.scalar('episode/steps', ep_summary['steps']) ], name='episode_atari' ) @@ -798,24 +798,24 @@ def _make_step_counters(self): Returns: None, sets attrs. 
""" - self.global_step = tf.get_variable( + self.global_step = tf.compat.v1.get_variable( "global_step", [], tf.int32, - initializer=tf.constant_initializer( + initializer=tf.compat.v1.constant_initializer( 0, dtype=tf.int32 ), trainable=False ) - tf.add_to_collection(tf.GraphKeys.GLOBAL_STEP, self.global_step) + tf.compat.v1.add_to_collection(tf.compat.v1.GraphKeys.GLOBAL_STEP, self.global_step) self.reset_global_step = self.global_step.assign(0) - self.global_episode = tf.get_variable( + self.global_episode = tf.compat.v1.get_variable( "global_episode", [], tf.int32, - initializer=tf.constant_initializer( + initializer=tf.compat.v1.constant_initializer( 0, dtype=tf.int32 ), @@ -837,7 +837,7 @@ def _make_policy(self, scope): Returns: policy instance """ - with tf.variable_scope(scope): + with tf.compat.v1.variable_scope(scope): # Make policy instance: network = self.policy_class(**self.policy_kwargs) if 'global' not in scope: @@ -1313,7 +1313,7 @@ def process_summary(self, sess, data, model_data=None, step=None, episode=None): # Every worker writes train episode summaries: if model_data is not None: - self.summary_writer.add_summary(tf.Summary.FromString(model_data), step) + self.summary_writer.add_summary(tf.compat.v1.Summary.FromString(model_data), step) self.summary_writer.flush() def process(self, sess, **kwargs): diff --git a/btgym/algorithms/math_utils.py b/btgym/algorithms/math_utils.py index 0bdb0c1b..94829a64 100644 --- a/btgym/algorithms/math_utils.py +++ b/btgym/algorithms/math_utils.py @@ -36,22 +36,22 @@ def log_uniform(lo_hi, size): def cat_entropy(logits): - a0 = logits - tf.reduce_max(logits, 1, keepdims=True) + a0 = logits - tf.reduce_max(input_tensor=logits, axis=1, keepdims=True) ea0 = tf.exp(a0) - z0 = tf.reduce_sum(ea0, 1, keepdims=True) + z0 = tf.reduce_sum(input_tensor=ea0, axis=1, keepdims=True) p0 = ea0 / z0 - return tf.reduce_sum(p0 * (tf.log(z0) - a0), 1) + return tf.reduce_sum(input_tensor=p0 * (tf.math.log(z0) - a0), axis=1) def kl_divergence(logits_1, logits_2): - a0 = logits_1 - tf.reduce_max(logits_1, axis=-1, keepdims=True) - a1 = logits_2 - tf.reduce_max(logits_2, axis=-1, keepdims=True) + a0 = logits_1 - tf.reduce_max(input_tensor=logits_1, axis=-1, keepdims=True) + a1 = logits_2 - tf.reduce_max(input_tensor=logits_2, axis=-1, keepdims=True) ea0 = tf.exp(a0) ea1 = tf.exp(a1) - z0 = tf.reduce_sum(ea0, axis=-1, keepdims=True) - z1 = tf.reduce_sum(ea1, axis=-1, keepdims=True) + z0 = tf.reduce_sum(input_tensor=ea0, axis=-1, keepdims=True) + z1 = tf.reduce_sum(input_tensor=ea1, axis=-1, keepdims=True) p0 = ea0 / z0 - return tf.reduce_sum(p0 * (a0 - tf.log(z0) - a1 + tf.log(z1)), axis=-1) + return tf.reduce_sum(input_tensor=p0 * (a0 - tf.math.log(z0) - a1 + tf.math.log(z1)), axis=-1) # def softmax(x): diff --git a/btgym/algorithms/nn/ae.py b/btgym/algorithms/nn/ae.py index 3cd25409..f88b60b9 100644 --- a/btgym/algorithms/nn/ae.py +++ b/btgym/algorithms/nn/ae.py @@ -31,7 +31,7 @@ def conv2d_encoder(x, level-wise list of encoding layers shapes, first ro last. 
""" - with tf.variable_scope(name, reuse=reuse): + with tf.compat.v1.variable_scope(name, reuse=reuse): layer_shapes = [x.get_shape()] layer_outputs = [] for i, layer_spec in enumerate(layer_config, 1): @@ -83,7 +83,7 @@ def conv2d_decoder(z, list of tensors holding decoded features for every layer inner to outer """ - with tf.variable_scope(name, reuse=reuse): + with tf.compat.v1.variable_scope(name, reuse=reuse): x = z layer_shapes = list(layer_shapes) layer_shapes.reverse() @@ -91,7 +91,7 @@ def conv2d_decoder(z, layer_config.reverse() layer_output = [] for i, (layer_spec, layer_shape) in enumerate(zip(layer_config,layer_shapes[1:]), 1): - x = tf.image.resize_images( + x = tf.image.resize( images=x, size=[int(layer_shape[1]), int(layer_shape[2])], method=resize_method, @@ -156,7 +156,7 @@ def conv2d_autoencoder( None value """ - with tf.variable_scope(name, reuse=reuse): + with tf.compat.v1.variable_scope(name, reuse=reuse): # Encode: encoder_layers, shapes = conv2d_encoder( x=inputs, @@ -223,7 +223,7 @@ def cw_conv2d_autoencoder( None value """ - with tf.variable_scope(name, reuse=reuse): + with tf.compat.v1.variable_scope(name, reuse=reuse): ae_bank = [] for i in range(inputs.get_shape().as_list()[-1]): # Making list of list of AE's: @@ -313,7 +313,7 @@ def beta_var_conv2d_autoencoder( tensor holding estimated KL divergence """ - with tf.variable_scope(name, reuse=reuse): + with tf.compat.v1.variable_scope(name, reuse=reuse): # Encode: encoder_layers, shapes = conv2d_encoder( @@ -358,8 +358,8 @@ def beta_var_conv2d_autoencoder( # Oversized noise generator: #eps = tf.random_normal(shape=[max_batch_size, half_size_z], mean=0., stddev=1.) - eps = tf.random_normal(shape=[max_batch_size, size_z], mean=0., stddev=1.) - eps = eps[:tf.shape(z)[0],:] + eps = tf.random.normal(shape=[max_batch_size, size_z], mean=0., stddev=1.) 
+ eps = eps[:tf.shape(input=z)[0],:] # Get sample z ~ Q(z|X): z_sampled = mu + tf.exp(log_sigma / 2) * eps @@ -405,18 +405,18 @@ def __init__(self, conv_input, layer_output): conv_input: convolution stack input tensor layer_output: tensor holding output of layer of interest from stack """ - self.idx = tf.placeholder(tf.int32, name='kernel_index') + self.idx = tf.compat.v1.placeholder(tf.int32, name='kernel_index') self.conv_input = conv_input self.layer_output = layer_output # Build a loss function that maximizes the activation # of the n-th filter of the layer considered: - self.vis_loss = tf.reduce_mean(self.layer_output[:, :, :, self.idx]) + self.vis_loss = tf.reduce_mean(input_tensor=self.layer_output[:, :, :, self.idx]) # Gradient of the input picture wrt this loss: - self.vis_grads = tf.gradients(self.vis_loss, self.conv_input)[0] + self.vis_grads = tf.gradients(ys=self.vis_loss, xs=self.conv_input)[0] # Normalization trick: - self.vis_grads /= (tf.sqrt(tf.reduce_mean(tf.square(self.vis_grads))) + 1e-5) + self.vis_grads /= (tf.sqrt(tf.reduce_mean(input_tensor=tf.square(self.vis_grads))) + 1e-5) def _iterate(self, sess, signal, kernel_index): """ diff --git a/btgym/algorithms/nn/layers.py b/btgym/algorithms/nn/layers.py index 6dcf01cf..19dd966b 100644 --- a/btgym/algorithms/nn/layers.py +++ b/btgym/algorithms/nn/layers.py @@ -31,7 +31,7 @@ def categorical_sample(logits, depth): tensor of shape [batch_dim, logits_depth] """ # print('categorical_sample_logits: ', logits) - value = tf.squeeze(tf.multinomial(logits, 1), [1]) + value = tf.squeeze(tf.random.categorical(logits=logits, num_samples=1), [1]) one_hot = tf.one_hot(value, depth, name='sample_one_hot') return one_hot @@ -40,9 +40,9 @@ def linear(x, size, name, initializer=None, bias_init=0, reuse=False): """ Linear network layer. 
""" - with tf.variable_scope(name, reuse=reuse): - w = tf.get_variable("/w", [x.get_shape()[1], size], initializer=initializer) - b = tf.get_variable("/b", [size], initializer=tf.constant_initializer(bias_init)) + with tf.compat.v1.variable_scope(name, reuse=reuse): + w = tf.compat.v1.get_variable("/w", [x.get_shape()[1], size], initializer=initializer) + b = tf.compat.v1.get_variable("/b", [size], initializer=tf.compat.v1.constant_initializer(bias_init)) return tf.matmul(x, w) + b @@ -56,29 +56,29 @@ def noisy_linear(x, size, name, bias=True, activation_fn=tf.identity, reuse=Fals https://arxiv.org/abs/1706.01905 """ - with tf.variable_scope(name, reuse=reuse): + with tf.compat.v1.variable_scope(name, reuse=reuse): # the function used in eq.7,8 def f(x): return tf.multiply(tf.sign(x), tf.pow(tf.abs(x), 0.5)) # Initializer of \mu and \sigma - mu_init = tf.random_uniform_initializer(minval=-1*1/np.power(x.get_shape().as_list()[1], 0.5), + mu_init = tf.compat.v1.random_uniform_initializer(minval=-1*1/np.power(x.get_shape().as_list()[1], 0.5), maxval=1*1/np.power(x.get_shape().as_list()[1], 0.5)) - sigma_init = tf.constant_initializer(0.4/np.power(x.get_shape().as_list()[1], 0.5)) + sigma_init = tf.compat.v1.constant_initializer(0.4/np.power(x.get_shape().as_list()[1], 0.5)) # Sample noise from gaussian - p = tf.random_normal([x.get_shape().as_list()[1], 1]) - q = tf.random_normal([1, size]) + p = tf.random.normal([x.get_shape().as_list()[1], 1]) + q = tf.random.normal([1, size]) f_p = f(p); f_q = f(q) w_epsilon = f_p*f_q; b_epsilon = tf.squeeze(f_q) # w = w_mu + w_sigma*w_epsilon - w_mu = tf.get_variable("/w_mu", [x.get_shape()[1], size], initializer=mu_init) - w_sigma = tf.get_variable("/w_sigma", [x.get_shape()[1], size], initializer=sigma_init) + w_mu = tf.compat.v1.get_variable("/w_mu", [x.get_shape()[1], size], initializer=mu_init) + w_sigma = tf.compat.v1.get_variable("/w_sigma", [x.get_shape()[1], size], initializer=sigma_init) w = w_mu + tf.multiply(w_sigma, w_epsilon) ret = tf.matmul(x, w) if bias: # b = b_mu + b_sigma*b_epsilon - b_mu = tf.get_variable("/b_mu", [size], initializer=mu_init) - b_sigma = tf.get_variable("/b_sigma", [size], initializer=sigma_init) + b_mu = tf.compat.v1.get_variable("/b_mu", [size], initializer=mu_init) + b_sigma = tf.compat.v1.get_variable("/b_sigma", [size], initializer=sigma_init) b = b_mu + tf.multiply(b_sigma, b_epsilon) return activation_fn(ret + b) else: @@ -90,15 +90,15 @@ def conv2d(x, num_filters, name, filter_size=(3, 3), stride=(1, 1), pad="SAME", """ 2D convolution layer. 
""" - with tf.variable_scope(name, reuse=reuse): + with tf.compat.v1.variable_scope(name, reuse=reuse): stride_shape = [1, stride[0], stride[1], 1] filter_shape = [filter_size[0], filter_size[1], int(x.get_shape()[3]), num_filters] - w = tf.get_variable("W", filter_shape, dtype, initializer=tf.contrib.layers.xavier_initializer(), + w = tf.compat.v1.get_variable("W", filter_shape, dtype, initializer=tf.compat.v1.keras.initializers.VarianceScaling(scale=1.0, mode="fan_avg", distribution="uniform"), collections=collections) - b = tf.get_variable("b", [1, 1, 1, num_filters], initializer=tf.constant_initializer(0.0), + b = tf.compat.v1.get_variable("b", [1, 1, 1, num_filters], initializer=tf.compat.v1.constant_initializer(0.0), collections=collections) - return tf.nn.conv2d(x, w, stride_shape, pad) + b + return tf.nn.conv2d(input=x, filters=w, strides=stride_shape, padding=pad) + b def deconv2d(x, output_channels, name, filter_size=(4, 4), stride=(2, 2), @@ -107,10 +107,10 @@ def deconv2d(x, output_channels, name, filter_size=(4, 4), stride=(2, 2), Deconvolution layer, paper: http://www.matthewzeiler.com/wp-content/uploads/2017/07/cvpr2010.pdf """ - with tf.variable_scope(name, reuse=reuse): + with tf.compat.v1.variable_scope(name, reuse=reuse): stride_shape = [1, stride[0], stride[1], 1] - batch_size = tf.shape(x)[0] + batch_size = tf.shape(input=x)[0] input_height = int(x.get_shape()[1]) input_width = int(x.get_shape()[2]) input_channels = int(x.get_shape()[3]) @@ -126,9 +126,9 @@ def deconv2d(x, output_channels, name, filter_size=(4, 4), stride=(2, 2), # initialize weights with random weights w_bound = np.sqrt(6. / (fan_in + fan_out)) - w = tf.get_variable("d_W", filter_shape, dtype, initializer=tf.contrib.layers.xavier_initializer(), + w = tf.compat.v1.get_variable("d_W", filter_shape, dtype, initializer=tf.compat.v1.keras.initializers.VarianceScaling(scale=1.0, mode="fan_avg", distribution="uniform"), collections=collections) - b = tf.get_variable("d_b", [1, 1, 1, output_channels], initializer=tf.constant_initializer(0.0), + b = tf.compat.v1.get_variable("d_b", [1, 1, 1, output_channels], initializer=tf.compat.v1.constant_initializer(0.0), collections=collections) return tf.nn.conv2d_transpose(x, w, output_shape, @@ -141,7 +141,7 @@ def conv1d(x, num_filters, name, filter_size=3, stride=2, pad="SAME", dtype=tf.f """ 1D convolution layer. 
""" - with tf.variable_scope(name, reuse=reuse): + with tf.compat.v1.variable_scope(name, reuse=reuse): stride_shape = stride # print('stride_shape:',stride_shape) @@ -150,11 +150,11 @@ def conv1d(x, num_filters, name, filter_size=3, stride=2, pad="SAME", dtype=tf.f # print('filter_shape:', filter_shape) - w = tf.get_variable("W", filter_shape, dtype, initializer=tf.contrib.layers.xavier_initializer(), + w = tf.compat.v1.get_variable("W", filter_shape, dtype, initializer=tf.compat.v1.keras.initializers.VarianceScaling(scale=1.0, mode="fan_avg", distribution="uniform"), collections=collections) - b = tf.get_variable("b", [1, 1, num_filters], initializer=tf.constant_initializer(0.0), + b = tf.compat.v1.get_variable("b", [1, 1, num_filters], initializer=tf.compat.v1.constant_initializer(0.0), collections=collections) - return tf.nn.conv1d(x, w, stride_shape, pad) + b + return tf.nn.conv1d(input=x, filters=w, stride=stride_shape, padding=pad) + b def conv2d_dw(x, num_filters, name='conv2d_dw', filter_size=(3, 3), stride=(1, 1), pad="SAME", dtype=tf.float32, @@ -162,7 +162,7 @@ def conv2d_dw(x, num_filters, name='conv2d_dw', filter_size=(3, 3), stride=(1, 1 """ Depthwise 2D convolution layer. Slow, do not use. """ - with tf.variable_scope(name, reuse=reuse): + with tf.compat.v1.variable_scope(name, reuse=reuse): stride_shape = [1, stride[0], stride[1], 1] filter_shape = [filter_size[0], filter_size[1], int(x.get_shape()[-1]), num_filters] fan_in = np.prod(filter_shape[:3]) @@ -170,8 +170,8 @@ def conv2d_dw(x, num_filters, name='conv2d_dw', filter_size=(3, 3), stride=(1, 1 # initialize weights with random weights w_bound = np.sqrt(6. / (fan_in + fan_out)) - w = tf.get_variable("W", filter_shape, dtype, - tf.contrib.layers.xavier_initializer(), collections=collections) - b = tf.get_variable("b", [1, 1, 1, num_filters * int(x.get_shape()[-1])], - initializer=tf.constant_initializer(0.0), collections=collections) - return tf.nn.depthwise_conv2d(x, w, stride_shape, pad, [1, 1]) + b + w = tf.compat.v1.get_variable("W", filter_shape, dtype, + tf.compat.v1.keras.initializers.VarianceScaling(scale=1.0, mode="fan_avg", distribution="uniform"), collections=collections) + b = tf.compat.v1.get_variable("b", [1, 1, 1, num_filters * int(x.get_shape()[-1])], + initializer=tf.compat.v1.constant_initializer(0.0), collections=collections) + return tf.nn.depthwise_conv2d(input=x, filter=w, strides=stride_shape, padding=pad, dilations=[1, 1]) + b diff --git a/btgym/algorithms/nn/losses.py b/btgym/algorithms/nn/losses.py index 1592b152..dc421067 100644 --- a/btgym/algorithms/nn/losses.py +++ b/btgym/algorithms/nn/losses.py @@ -25,28 +25,28 @@ def aac_loss_def(act_target, adv_target, r_target, pi_logits, pi_vf, pi_prime_lo tensor holding estimated AAC loss; list of related tensorboard summaries. 
""" - with tf.name_scope(name + '/aac'): - neg_pi_log_prob = tf.nn.softmax_cross_entropy_with_logits_v2( + with tf.compat.v1.name_scope(name + '/aac'): + neg_pi_log_prob = tf.nn.softmax_cross_entropy_with_logits( logits=pi_logits, labels=act_target ) - pi_loss = tf.reduce_mean(neg_pi_log_prob * adv_target) - vf_loss = 0.5 * tf.losses.mean_squared_error(r_target, pi_vf) - entropy = tf.reduce_mean(cat_entropy(pi_logits)) + pi_loss = tf.reduce_mean(input_tensor=neg_pi_log_prob * adv_target) + vf_loss = 0.5 * tf.compat.v1.losses.mean_squared_error(r_target, pi_vf) + entropy = tf.reduce_mean(input_tensor=cat_entropy(pi_logits)) loss = pi_loss + vf_loss - entropy * entropy_beta - mean_vf = tf.reduce_mean(pi_vf) - mean_t_target = tf.reduce_mean(r_target) + mean_vf = tf.reduce_mean(input_tensor=pi_vf) + mean_t_target = tf.reduce_mean(input_tensor=r_target) summaries = [ - tf.summary.scalar('policy_loss', pi_loss), - tf.summary.scalar('value_loss', vf_loss), + tf.compat.v1.summary.scalar('policy_loss', pi_loss), + tf.compat.v1.summary.scalar('value_loss', vf_loss), ] if verbose: summaries += [ - tf.summary.scalar('entropy', entropy), - tf.summary.scalar('value_fn', mean_vf), + tf.compat.v1.summary.scalar('entropy', entropy), + tf.compat.v1.summary.scalar('value_fn', mean_vf), # tf.summary.scalar('empirical_return',mean_t_target), # tf.summary.histogram('value_fn', pi_vf), # tf.summary.histogram('empirical_return', r_target), @@ -79,13 +79,13 @@ def ppo_loss_def(act_target, adv_target, r_target, pi_logits, pi_vf, pi_prime_lo #act_target = tf.placeholder(tf.float32, [None, env.action_space.n], name="on_policy_action_pl") #adv_target = tf.placeholder(tf.float32, [None], name="on_policy_advantage_pl") #r_target = tf.placeholder(tf.float32, [None], name="on_policy_return_pl") - with tf.name_scope(name + '/ppo'): - pi_log_prob = - tf.nn.softmax_cross_entropy_with_logits_v2( + with tf.compat.v1.name_scope(name + '/ppo'): + pi_log_prob = - tf.nn.softmax_cross_entropy_with_logits( logits=pi_logits, labels=act_target ) pi_old_log_prob = tf.stop_gradient( - - tf.nn.softmax_cross_entropy_with_logits_v2( + - tf.nn.softmax_cross_entropy_with_logits( logits=pi_prime_logits, labels=act_target ) @@ -95,27 +95,27 @@ def ppo_loss_def(act_target, adv_target, r_target, pi_logits, pi_vf, pi_prime_lo surr1 = pi_ratio * adv_target # surrogate from conservative policy iteration surr2 = tf.clip_by_value(pi_ratio, 1.0 - epsilon, 1.0 + epsilon) * adv_target - pi_surr_loss = - tf.reduce_mean(tf.minimum(surr1, surr2)) # PPO's pessimistic surrogate (L^CLIP) - vf_loss = tf.losses.mean_squared_error(r_target, pi_vf) # V.fn. loss - entropy = tf.reduce_mean(cat_entropy(pi_logits)) + pi_surr_loss = - tf.reduce_mean(input_tensor=tf.minimum(surr1, surr2)) # PPO's pessimistic surrogate (L^CLIP) + vf_loss = tf.compat.v1.losses.mean_squared_error(r_target, pi_vf) # V.fn. 
loss + entropy = tf.reduce_mean(input_tensor=cat_entropy(pi_logits)) loss = pi_surr_loss + vf_loss - entropy * entropy_beta # Info: - mean_pi_ratio = tf.reduce_mean(pi_ratio) - mean_vf = tf.reduce_mean(pi_vf) - mean_kl_old_new = tf.reduce_mean(kl_divergence(pi_prime_logits, pi_logits)) + mean_pi_ratio = tf.reduce_mean(input_tensor=pi_ratio) + mean_vf = tf.reduce_mean(input_tensor=pi_vf) + mean_kl_old_new = tf.reduce_mean(input_tensor=kl_divergence(pi_prime_logits, pi_logits)) summaries = [ - tf.summary.scalar('l_clip_loss', pi_surr_loss), - tf.summary.scalar('value_loss', vf_loss), + tf.compat.v1.summary.scalar('l_clip_loss', pi_surr_loss), + tf.compat.v1.summary.scalar('value_loss', vf_loss), ] if verbose: summaries += [ - tf.summary.scalar('entropy', entropy), - tf.summary.scalar('Dkl_old_new', mean_kl_old_new), - tf.summary.scalar('pi_ratio', mean_pi_ratio), - tf.summary.scalar('value_fn', mean_vf), + tf.compat.v1.summary.scalar('entropy', entropy), + tf.compat.v1.summary.scalar('Dkl_old_new', mean_kl_old_new), + tf.compat.v1.summary.scalar('pi_ratio', mean_pi_ratio), + tf.compat.v1.summary.scalar('value_fn', mean_vf), ] return loss, summaries @@ -136,11 +136,11 @@ def value_fn_loss_def(r_target, pi_vf, name='_vr_', verbose=False): list of related tensorboard summaries. """ # r_target = tf.placeholder(tf.float32, [None], name="vr_target") - with tf.name_scope(name + '/value_replay'): - loss = tf.losses.mean_squared_error(r_target, pi_vf) + with tf.compat.v1.name_scope(name + '/value_replay'): + loss = tf.compat.v1.losses.mean_squared_error(r_target, pi_vf) if verbose: - summaries = [tf.summary.scalar('v_loss', loss)] + summaries = [tf.compat.v1.summary.scalar('v_loss', loss)] else: summaries = [] @@ -172,17 +172,17 @@ def pc_loss_def(actions, targets, pi_pc_q, name='_pc_', verbose=False): """ #actions = tf.placeholder(tf.float32, [None, env.action_space.n], name="pc_action") #targets = tf.placeholder(tf.float32, [None, None, None], name="pc_target") - with tf.name_scope(name + '/pixel_control'): + with tf.compat.v1.name_scope(name + '/pixel_control'): # Get Q-value features for actions been taken and define loss: - pc_action_reshaped = tf.reshape(actions, [-1, 1, 1, tf.shape(actions)[-1]]) + pc_action_reshaped = tf.reshape(actions, [-1, 1, 1, tf.shape(input=actions)[-1]]) pc_q_action = tf.multiply(pi_pc_q, pc_action_reshaped) - pc_q_action = tf.reduce_sum(pc_q_action, axis=-1, keepdims=False) + pc_q_action = tf.reduce_sum(input_tensor=pc_q_action, axis=-1, keepdims=False) - batch_size = tf.shape(targets)[0] - loss = tf.reduce_sum(tf.square(targets - pc_q_action)) / tf.cast(batch_size, tf.float32) + batch_size = tf.shape(input=targets)[0] + loss = tf.reduce_sum(input_tensor=tf.square(targets - pc_q_action)) / tf.cast(batch_size, tf.float32) #loss = tf.losses.absolute_difference(targets, pc_q_action) if verbose: - summaries = [tf.summary.scalar('q_loss', loss)] + summaries = [tf.compat.v1.summary.scalar('q_loss', loss)] else: summaries = [] @@ -213,13 +213,13 @@ def rp_loss_def(rp_targets, pi_rp_logits, name='_rp_', verbose=False): list of related tensorboard summaries. 
""" #rp_targets = tf.placeholder(tf.float32, [1, 3], name="rp_target") - with tf.name_scope(name + '/reward_prediction'): - loss = tf.nn.softmax_cross_entropy_with_logits_v2( + with tf.compat.v1.name_scope(name + '/reward_prediction'): + loss = tf.nn.softmax_cross_entropy_with_logits( labels=rp_targets, logits=pi_rp_logits )[0] if verbose: - summaries = [tf.summary.scalar('class_loss', loss), ] + summaries = [tf.compat.v1.summary.scalar('class_loss', loss), ] else: summaries = [] @@ -241,11 +241,11 @@ def ae_loss_def(targets, logits, alpha=1.0, name='ae_loss', verbose=False, **kwa tensor holding estimated reconstruction loss list of summarues """ - with tf.name_scope(name + '/ae'): - loss = tf.losses.mean_squared_error(targets, logits) + with tf.compat.v1.name_scope(name + '/ae'): + loss = tf.compat.v1.losses.mean_squared_error(targets, logits) if verbose: - summaries = [tf.summary.scalar('reconstruct_loss', loss)] + summaries = [tf.compat.v1.summary.scalar('reconstruct_loss', loss)] else: summaries = [] @@ -274,14 +274,14 @@ def beta_vae_loss_def(targets, logits, d_kl, alpha=1.0, beta=1.0, name='beta_vae list of summarues """ - with tf.name_scope(name + '/b_vae'): - r_loss = tf.losses.mean_squared_error(targets, logits) - vae_loss = tf.reduce_mean(d_kl) + with tf.compat.v1.name_scope(name + '/b_vae'): + r_loss = tf.compat.v1.losses.mean_squared_error(targets, logits) + vae_loss = tf.reduce_mean(input_tensor=d_kl) loss = alpha * r_loss + beta * vae_loss if verbose: summaries = [ - tf.summary.scalar('reconstruct_loss', r_loss), - tf.summary.scalar('d_kl_loss', vae_loss), + tf.compat.v1.summary.scalar('reconstruct_loss', r_loss), + tf.compat.v1.summary.scalar('d_kl_loss', vae_loss), ] else: summaries = [] diff --git a/btgym/algorithms/nn/networks.py b/btgym/algorithms/nn/networks.py index a077db07..2d998db9 100644 --- a/btgym/algorithms/nn/networks.py +++ b/btgym/algorithms/nn/networks.py @@ -36,7 +36,7 @@ def conv_2d_network(x, Returns: tensor holding state features; """ - with tf.variable_scope(name, reuse=reuse): + with tf.compat.v1.variable_scope(name, reuse=reuse): for i, num_filters in enumerate(conv_2d_num_filters): x = tf.nn.elu( norm_layer( @@ -55,7 +55,7 @@ def conv_2d_network(x, ) ) if keep_prob is not None: - x = tf.nn.dropout(x, keep_prob=keep_prob, name="_layer_{}_with_dropout".format(i + 1)) + x = tf.nn.dropout(x, rate=1 - (keep_prob), name="_layer_{}_with_dropout".format(i + 1)) # A3c/BaseAAC original paper design: # x = tf.nn.elu(conv2d(x, 16, 'conv2d_1', [8, 8], [4, 4], pad, dtype, collections, reuse)) @@ -123,27 +123,27 @@ def lstm_network( lstm state output tensor; lstm flattened feed placeholders as tuple. 
""" - with tf.variable_scope(name, reuse=reuse): + with tf.compat.v1.variable_scope(name, reuse=reuse): # Prepare rnn type: if static: - rnn_net = tf.nn.static_rnn + rnn_net = tf.compat.v1.nn.static_rnn # Remove time dimension (suppose always get one) and wrap to list: x = [x[:, 0, :]] else: - rnn_net = tf.nn.dynamic_rnn + rnn_net = tf.compat.v1.nn.dynamic_rnn # Define LSTM layers: lstm = [] for size in lstm_layers: layer = lstm_class(size) if keep_prob is not None: - layer = tf.nn.rnn_cell.DropoutWrapper(layer, output_keep_prob=keep_prob) + layer = tf.compat.v1.nn.rnn_cell.DropoutWrapper(layer, output_keep_prob=keep_prob) lstm.append(layer) lstm = rnn.MultiRNNCell(lstm, state_is_tuple=True) # Get time_dimension as [1]-shaped tensor: - step_size = tf.expand_dims(tf.shape(x)[1], [0]) + step_size = tf.expand_dims(tf.shape(input=x)[1], [0]) lstm_init_state = lstm.zero_state(1, dtype=tf.float32) @@ -183,7 +183,7 @@ def dense_aac_network(x, ac_space_depth, name='dense_aac', linear_layer_ref=nois for every space in ac_space_shape dictionary """ - with tf.variable_scope(name, reuse=reuse): + with tf.compat.v1.variable_scope(name, reuse=reuse): # Center-logits: logits = norm_layer( linear_layer_ref( @@ -247,8 +247,8 @@ def pixel_change_2d_estimator(ob_space, pc_estimator_stride=(2, 2), **kwargs): Note: crops input array by one pix from either side; --> 1D signal to be shaped as [signal_length, 3] """ - input_state = tf.placeholder(tf.float32, list(ob_space), name='pc_change_est_state_in') - input_last_state = tf.placeholder(tf.float32, list(ob_space), name='pc_change_est_last_state_in') + input_state = tf.compat.v1.placeholder(tf.float32, list(ob_space), name='pc_change_est_state_in') + input_last_state = tf.compat.v1.placeholder(tf.float32, list(ob_space), name='pc_change_est_last_state_in') x = tf.abs(tf.subtract(input_state, input_last_state)) # TODO: tf.square? @@ -258,13 +258,13 @@ def pixel_change_2d_estimator(ob_space, pc_estimator_stride=(2, 2), **kwargs): else: x = tf.expand_dims(x, 0)[:, 1:-1, 1:-1, :] # True 2D, fake batch dim and crop H, W dims - x = tf.reduce_mean(x, axis=-1, keepdims=True) + x = tf.reduce_mean(input_tensor=x, axis=-1, keepdims=True) - x_out = tf.nn.max_pool( - x, - [1, pc_estimator_stride[0], pc_estimator_stride[1], 1], - [1, pc_estimator_stride[0], pc_estimator_stride[1], 1], - 'SAME' + x_out = tf.nn.max_pool2d( + input=x, + ksize=[1, pc_estimator_stride[0], pc_estimator_stride[1], 1], + strides=[1, pc_estimator_stride[0], pc_estimator_stride[1], 1], + padding='SAME' ) return input_state, input_last_state, x_out @@ -285,7 +285,7 @@ def duelling_pc_network(x, x=x, size=np.prod(duell_pc_x_inner_shape), name='pc_dense', - initializer=tf.contrib.layers.xavier_initializer(), + initializer=tf.compat.v1.keras.initializers.VarianceScaling(scale=1.0, mode="fan_avg", distribution="uniform"), reuse=reuse ) ) @@ -296,7 +296,7 @@ def duelling_pc_network(x, # Q-value estimate using advantage mean, # as (9) in "Dueling Network Architectures..." 
paper: # https://arxiv.org/pdf/1511.06581.pdf - pc_a_mean = tf.reduce_mean(pc_a, axis=-1, keepdims=True) + pc_a_mean = tf.reduce_mean(input_tensor=pc_a, axis=-1, keepdims=True) pc_q = pc_v + pc_a - pc_a_mean # [None, 20, 20, ac_size] return pc_q diff --git a/btgym/algorithms/policy/base.py b/btgym/algorithms/policy/base.py index 91c07243..a7dd4b93 100644 --- a/btgym/algorithms/policy/base.py +++ b/btgym/algorithms/policy/base.py @@ -78,33 +78,33 @@ def __init__(self, self.rp_state_in = nested_placeholders(self.ob_space.shape, batch_dim=None, name='rp_state_in') # Placeholders for previous step action[multi-categorical vector encoding] and reward [scalar]: - self.on_last_a_in = tf.placeholder( + self.on_last_a_in = tf.compat.v1.placeholder( tf.float32, [None, self.ac_space.encoded_depth], name='on_policy_last_action_in_pl' ) - self.on_last_reward_in = tf.placeholder(tf.float32, [None], name='on_policy_last_reward_in_pl') + self.on_last_reward_in = tf.compat.v1.placeholder(tf.float32, [None], name='on_policy_last_reward_in_pl') - self.off_last_a_in = tf.placeholder( + self.off_last_a_in = tf.compat.v1.placeholder( tf.float32, [None, self.ac_space.encoded_depth], name='off_policy_last_action_in_pl' ) - self.off_last_reward_in = tf.placeholder(tf.float32, [None], name='off_policy_last_reward_in_pl') + self.off_last_reward_in = tf.compat.v1.placeholder(tf.float32, [None], name='off_policy_last_reward_in_pl') # Placeholders for rnn batch and time-step dimensions: - self.on_batch_size = tf.placeholder(tf.int32, name='on_policy_batch_size') - self.on_time_length = tf.placeholder(tf.int32, name='on_policy_sequence_size') + self.on_batch_size = tf.compat.v1.placeholder(tf.int32, name='on_policy_batch_size') + self.on_time_length = tf.compat.v1.placeholder(tf.int32, name='on_policy_sequence_size') - self.off_batch_size = tf.placeholder(tf.int32, name='off_policy_batch_size') - self.off_time_length = tf.placeholder(tf.int32, name='off_policy_sequence_size') + self.off_batch_size = tf.compat.v1.placeholder(tf.int32, name='off_policy_batch_size') + self.off_time_length = tf.compat.v1.placeholder(tf.int32, name='off_policy_sequence_size') try: if self.train_phase is not None: pass except AttributeError: - self.train_phase = tf.placeholder_with_default( + self.train_phase = tf.compat.v1.placeholder_with_default( tf.constant(False, dtype=tf.bool), shape=(), name='train_phase_flag_pl' @@ -114,7 +114,7 @@ def __init__(self, on_aac_x = conv_2d_network(self.on_state_in['external'], self.ob_space.shape['external'], ac_space, **kwargs) # Reshape rnn inputs for batch training as [rnn_batch_dim, rnn_time_dim, flattened_depth]: - x_shape_dynamic = tf.shape(on_aac_x) + x_shape_dynamic = tf.shape(input=on_aac_x) max_seq_len = tf.cast(x_shape_dynamic[0] / self.on_batch_size, tf.int32) x_shape_static = on_aac_x.get_shape().as_list() @@ -167,7 +167,7 @@ def __init__(self, off_aac_x = conv_2d_network(self.off_state_in['external'], self.ob_space.shape['external'], ac_space, reuse=True, **kwargs) # Reshape rnn inputs for batch training as [rnn_batch_dim, rnn_time_dim, flattened_depth]: - x_shape_dynamic = tf.shape(off_aac_x) + x_shape_dynamic = tf.shape(input=off_aac_x) max_seq_len = tf.cast(x_shape_dynamic[0] / self.off_batch_size, tf.int32) x_shape_static = off_aac_x.get_shape().as_list() @@ -239,7 +239,7 @@ def __init__(self, self.vr_value = self.off_vf # Aux3: `Reward prediction` network: - self.rp_batch_size = tf.placeholder(tf.int32, name='rp_batch_size') + self.rp_batch_size = tf.compat.v1.placeholder(tf.int32, 
name='rp_batch_size') # Shared conv. output: rp_x = conv_2d_network(self.rp_state_in['external'], self.ob_space.shape['external'], ac_space, reuse=True, **kwargs) @@ -252,13 +252,13 @@ def __init__(self, self.rp_logits = dense_rp_network(rp_x) # Batch-norm related : - self.update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS) + self.update_ops = tf.compat.v1.get_collection(tf.compat.v1.GraphKeys.UPDATE_OPS) # Add moving averages to save list: - moving_var_list = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, tf.get_variable_scope().name + '.*moving.*') - renorm_var_list = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, tf.get_variable_scope().name + '.*renorm.*') + moving_var_list = tf.compat.v1.get_collection(tf.compat.v1.GraphKeys.GLOBAL_VARIABLES, tf.compat.v1.get_variable_scope().name + '.*moving.*') + renorm_var_list = tf.compat.v1.get_collection(tf.compat.v1.GraphKeys.GLOBAL_VARIABLES, tf.compat.v1.get_variable_scope().name + '.*renorm.*') # What to save: - self.var_list = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, tf.get_variable_scope().name) + self.var_list = tf.compat.v1.get_collection(tf.compat.v1.GraphKeys.TRAINABLE_VARIABLES, tf.compat.v1.get_variable_scope().name) self.var_list += moving_var_list + renorm_var_list # Callbacks: @@ -273,7 +273,7 @@ def get_initial_features(self, **kwargs): LSTM zero-state tuple. """ # TODO: rework as in: AacStackedMetaPolicy --> base runner, verbose runner; synchro_runner ok - sess = tf.get_default_session() + sess = tf.compat.v1.get_default_session() return sess.run(self.on_lstm_init_state) def act(self, observation, lstm_state, last_action, last_reward, deterministic=False): @@ -293,7 +293,7 @@ def act(self, observation, lstm_state, last_action, last_reward, deterministic=F Action as dictionary of several action encodings, actions logits, V-fn value, output RNN state """ try: - sess = tf.get_default_session() + sess = tf.compat.v1.get_default_session() feeder = {pl: value for pl, value in zip(self.on_lstm_state_pl_flatten, flatten_nested(lstm_state))} feeder.update(feed_dict_from_nested(self.on_state_in, observation, expand_batch=True)) feeder.update( @@ -353,7 +353,7 @@ def get_value(self, observation, lstm_state, last_action, last_reward): Returns: V-function value """ - sess = tf.get_default_session() + sess = tf.compat.v1.get_default_session() feeder = feed_dict_rnn_context(self.on_lstm_state_pl_flatten, lstm_state) feeder.update(feed_dict_from_nested(self.on_state_in, observation, expand_batch=True)) feeder.update( @@ -380,7 +380,7 @@ def get_pc_target(self, state, last_state, **kwargs): Returns: Estimated absolute difference between two subsampled states. 
""" - sess = tf.get_default_session() + sess = tf.compat.v1.get_default_session() feeder = {self.pc_change_state_in: state['external'], self.pc_change_last_state_in: last_state['external']} return sess.run(self.pc_target, feeder)[0,...,0] diff --git a/btgym/algorithms/policy/meta.py b/btgym/algorithms/policy/meta.py index d84ff43e..37ea0fdd 100644 --- a/btgym/algorithms/policy/meta.py +++ b/btgym/algorithms/policy/meta.py @@ -19,14 +19,14 @@ def __init__(self, task, num_host_policies, learn_rate, name='SubMetaPolicy'): learn_rate: meta-policy learning rate name: name scope """ - with tf.variable_scope(name_or_scope=name): + with tf.compat.v1.variable_scope(name_or_scope=name): self.task = task self.learn_rate = learn_rate self.num_host_policies = num_host_policies - self.input_stat_pl = tf.placeholder(dtype=tf.float32, name='in_stat_pl') + self.input_stat_pl = tf.compat.v1.placeholder(dtype=tf.float32, name='in_stat_pl') - self.input_stat = tf.reduce_mean(self.input_stat_pl) + self.input_stat = tf.reduce_mean(input_tensor=self.input_stat_pl) self.initial_cluster_value = tf.concat( [ @@ -43,38 +43,38 @@ def __init__(self, task, num_host_policies, learn_rate, name='SubMetaPolicy'): name='cluster_wide_averages_slot' ) - update_task_iteration = tf.scatter_nd_add(self.cluster_averages_slot, [[0, task]], [1]) + update_task_iteration = tf.compat.v1.scatter_nd_add(self.cluster_averages_slot, [[0, task]], [1]) with tf.control_dependencies([update_task_iteration]): avg_prev = self.cluster_averages_slot[1, task] k = self.cluster_averages_slot[0, task] avg = avg_prev + (self.input_stat - avg_prev) / k - self.update_op = tf.scatter_nd_update(self.cluster_averages_slot, [[1, task]], [avg]) + self.update_op = tf.compat.v1.scatter_nd_update(self.cluster_averages_slot, [[1, task]], [avg]) - self.reset_op = tf.assign( + self.reset_op = tf.compat.v1.assign( self.cluster_averages_slot, self.initial_cluster_value ) # Toy network: - prob = tf.layers.dense( + prob = tf.compat.v1.layers.dense( tf.expand_dims(self.cluster_averages_slot[1, :], axis=-1), units=10, activation=tf.nn.sigmoid, use_bias=False, ) - self.next_step_prob = tf.layers.dense( + self.next_step_prob = tf.compat.v1.layers.dense( prob, units=1, activation=tf.nn.sigmoid, use_bias=False, ) - self.distribution = tf.distributions.Bernoulli( - probs=tf.reduce_max(self.next_step_prob) + self.distribution = tf.compat.v1.distributions.Bernoulli( + probs=tf.reduce_max(input_tensor=self.next_step_prob) ) self.sample = self.distribution.sample() - self.var_list = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, tf.get_variable_scope().name) + self.var_list = tf.compat.v1.get_collection(tf.compat.v1.GraphKeys.TRAINABLE_VARIABLES, tf.compat.v1.get_variable_scope().name) self.cluster_stat = tf.clip_by_value( # tf.reduce_mean( @@ -86,26 +86,26 @@ def __init__(self, task, num_host_policies, learn_rate, name='SubMetaPolicy'): ) bound_avg = tf.sigmoid(- self.cluster_stat) self.loss = tf.reduce_mean( - bound_avg * (1 - self.next_step_prob) + (1 - bound_avg) * self.next_step_prob + input_tensor=bound_avg * (1 - self.next_step_prob) + (1 - bound_avg) * self.next_step_prob ) - self.grads = tf.gradients(self.loss, self.var_list) + self.grads = tf.gradients(ys=self.loss, xs=self.var_list) self.summaries = [ - tf.summary.scalar('worker_avg_stat', self.cluster_averages_slot[1, task]), - tf.summary.scalar('worker_iterations', self.cluster_averages_slot[0, task]), + tf.compat.v1.summary.scalar('worker_avg_stat', self.cluster_averages_slot[1, task]), + 
tf.compat.v1.summary.scalar('worker_iterations', self.cluster_averages_slot[0, task]), #tf.summary.histogram('clipped_cluster_stat', self.cluster_stat), - tf.summary.scalar('loss', self.loss), - tf.summary.histogram('next_step_prob', self.next_step_prob), - tf.summary.scalar('grads_norm', tf.global_norm(self.grads)) + tf.compat.v1.summary.scalar('loss', self.loss), + tf.compat.v1.summary.histogram('next_step_prob', self.next_step_prob), + tf.compat.v1.summary.scalar('grads_norm', tf.linalg.global_norm(self.grads)) ] def update(self, input_stat): - sess = tf.get_default_session() + sess = tf.compat.v1.get_default_session() feed_dict = {self.input_stat_pl: input_stat} sess.run(self.update_op, feed_dict) def reset(self): - sess = tf.get_default_session() + sess = tf.compat.v1.get_default_session() sess.run(self.reset_op) def global_reset(self): @@ -122,7 +122,7 @@ def act(self): Returns: """ - sess = tf.get_default_session() + sess = tf.compat.v1.get_default_session() fetched = sess.run([self.sample]) return fetched[-1] diff --git a/btgym/algorithms/policy/stacked_lstm.py b/btgym/algorithms/policy/stacked_lstm.py index c4750a3a..160728e8 100644 --- a/btgym/algorithms/policy/stacked_lstm.py +++ b/btgym/algorithms/policy/stacked_lstm.py @@ -77,7 +77,7 @@ def __init__(self, # self.encode_internal_state = encode_internal_state self.share_encoder_params = share_encoder_params if self.share_encoder_params: - self.reuse_encoder_params = tf.AUTO_REUSE + self.reuse_encoder_params = tf.compat.v1.AUTO_REUSE else: self.reuse_encoder_params = False @@ -93,26 +93,26 @@ def __init__(self, self.rp_state_in = nested_placeholders(self.ob_space.shape, batch_dim=None, name='rp_state_in') # Placeholders for previous step action[multi-categorical vector encoding] and reward [scalar]: - self.on_last_a_in = tf.placeholder( + self.on_last_a_in = tf.compat.v1.placeholder( tf.float32, [None, self.ac_space.encoded_depth], name='on_policy_last_action_in_pl' ) - self.on_last_reward_in = tf.placeholder(tf.float32, [None], name='on_policy_last_reward_in_pl') + self.on_last_reward_in = tf.compat.v1.placeholder(tf.float32, [None], name='on_policy_last_reward_in_pl') - self.off_last_a_in = tf.placeholder( + self.off_last_a_in = tf.compat.v1.placeholder( tf.float32, [None, self.ac_space.encoded_depth], name='off_policy_last_action_in_pl' ) - self.off_last_reward_in = tf.placeholder(tf.float32, [None], name='off_policy_last_reward_in_pl') + self.off_last_reward_in = tf.compat.v1.placeholder(tf.float32, [None], name='off_policy_last_reward_in_pl') # Placeholders for rnn batch and time-step dimensions: - self.on_batch_size = tf.placeholder(tf.int32, name='on_policy_batch_size') - self.on_time_length = tf.placeholder(tf.int32, name='on_policy_sequence_size') + self.on_batch_size = tf.compat.v1.placeholder(tf.int32, name='on_policy_batch_size') + self.on_time_length = tf.compat.v1.placeholder(tf.int32, name='on_policy_sequence_size') - self.off_batch_size = tf.placeholder(tf.int32, name='off_policy_batch_size') - self.off_time_length = tf.placeholder(tf.int32, name='off_policy_sequence_size') + self.off_batch_size = tf.compat.v1.placeholder(tf.int32, name='off_policy_batch_size') + self.off_time_length = tf.compat.v1.placeholder(tf.int32, name='off_policy_sequence_size') self.debug['on_state_in_keys'] = list(self.on_state_in.keys()) @@ -122,7 +122,7 @@ def __init__(self, pass except AttributeError: - self.train_phase = tf.placeholder_with_default( + self.train_phase = tf.compat.v1.placeholder_with_default( tf.constant(False, 
dtype=tf.bool), shape=(), name='train_phase_flag_pl' @@ -161,7 +161,7 @@ def __init__(self, else: layer_name_template = 'encoded_{}_{}' encoded_streams = { - name: tf.layers.flatten( + name: tf.compat.v1.layers.flatten( self.state_encoder_class_ref( x=stream, ob_space=self.ob_space.shape[key][name], @@ -180,7 +180,7 @@ def __init__(self, ) else: # Got single data stream: - encoded_mode = tf.layers.flatten( + encoded_mode = tf.compat.v1.layers.flatten( self.state_encoder_class_ref( x=self.on_state_in[key], ob_space=self.ob_space.shape[key], @@ -200,14 +200,14 @@ def __init__(self, # TODO: for encoder prediction test, output `naive` estimates for logits and value directly from encoder: [self.on_simple_logits, self.on_simple_value, _] = dense_aac_network( - tf.layers.flatten(on_aac_x), + tf.compat.v1.layers.flatten(on_aac_x), ac_space_depth=self.ac_space.one_hot_depth, linear_layer_ref=linear_layer_ref, name='aac_dense_simple_pi_v' ) # Reshape rnn inputs for batch training as: [rnn_batch_dim, rnn_time_dim, flattened_depth]: - x_shape_dynamic = tf.shape(on_aac_x) + x_shape_dynamic = tf.shape(input=on_aac_x) max_seq_len = tf.cast(x_shape_dynamic[0] / self.on_batch_size, tf.int32) x_shape_static = on_aac_x.get_shape().as_list() @@ -308,7 +308,7 @@ def __init__(self, self.debug['self.on_lstm_1_state_pl_flatten'] = self.on_lstm_1_state_pl_flatten # For time_flat only: Reshape on_lstm_1_state_out from [1,2,20,size] -->[20,1,2,size] --> [20,1, 2xsize]: - reshape_lstm_1_state_out = tf.transpose(self.on_lstm_1_state_out, [2, 0, 1, 3]) + reshape_lstm_1_state_out = tf.transpose(a=self.on_lstm_1_state_out, perm=[2, 0, 1, 3]) reshape_lstm_1_state_out_shape_static = reshape_lstm_1_state_out.get_shape().as_list() # Take policy logits off first LSTM-dense layer: @@ -390,7 +390,7 @@ def __init__(self, else: layer_name_template = 'encoded_{}_{}' encoded_streams = { - name: tf.layers.flatten( + name: tf.compat.v1.layers.flatten( self.state_encoder_class_ref( x=stream, ob_space=self.ob_space.shape[key][name], @@ -409,7 +409,7 @@ def __init__(self, ) else: # Got single data stream: - encoded_mode = tf.layers.flatten( + encoded_mode = tf.compat.v1.layers.flatten( self.state_encoder_class_ref( x=self.off_state_in[key], ob_space=self.ob_space.shape[key], @@ -426,7 +426,7 @@ def __init__(self, off_aac_x = self.off_aac_x_encoded['external'] # Reshape rnn inputs for batch training as [rnn_batch_dim, rnn_time_dim, flattened_depth]: - x_shape_dynamic = tf.shape(off_aac_x) + x_shape_dynamic = tf.shape(input=off_aac_x) max_seq_len = tf.cast(x_shape_dynamic[0] / self.off_batch_size, tf.int32) x_shape_static = off_aac_x.get_shape().as_list() @@ -602,7 +602,7 @@ def __init__(self, # Aux3: # `Reward prediction` network. 
- self.rp_batch_size = tf.placeholder(tf.int32, name='rp_batch_size') + self.rp_batch_size = tf.compat.v1.placeholder(tf.int32, name='rp_batch_size') # Shared encoded output: rp_x = {} @@ -614,7 +614,7 @@ def __init__(self, else: layer_name_template = 'encoded_{}_{}' encoded_streams = { - name: tf.layers.flatten( + name: tf.compat.v1.layers.flatten( self.state_encoder_class_ref( x=stream, ob_space=self.ob_space.shape[key][name], @@ -633,7 +633,7 @@ def __init__(self, ) else: # Got single data stream: - encoded_mode = tf.layers.flatten( + encoded_mode = tf.compat.v1.layers.flatten( self.state_encoder_class_ref( x=self.rp_state_in[key], ob_space=self.ob_space.shape, @@ -655,13 +655,13 @@ def __init__(self, self.rp_logits = dense_rp_network(rp_x, linear_layer_ref=linear_layer_ref) # Batch-norm related: - self.update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS) + self.update_ops = tf.compat.v1.get_collection(tf.compat.v1.GraphKeys.UPDATE_OPS) # Add moving averages to save list: - moving_var_list = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, tf.get_variable_scope().name + '.*moving.*') - renorm_var_list = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, tf.get_variable_scope().name + '.*renorm.*') + moving_var_list = tf.compat.v1.get_collection(tf.compat.v1.GraphKeys.GLOBAL_VARIABLES, tf.compat.v1.get_variable_scope().name + '.*moving.*') + renorm_var_list = tf.compat.v1.get_collection(tf.compat.v1.GraphKeys.GLOBAL_VARIABLES, tf.compat.v1.get_variable_scope().name + '.*renorm.*') # What to save: - self.var_list = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, tf.get_variable_scope().name) + self.var_list = tf.compat.v1.get_collection(tf.compat.v1.GraphKeys.TRAINABLE_VARIABLES, tf.compat.v1.get_variable_scope().name) self.var_list += moving_var_list + renorm_var_list # Callbacks: @@ -721,7 +721,7 @@ def get_initial_features(self, state, context=None): KeyError if [`metadata`]:[`trial_num`,`type`] keys not found """ try: - sess = tf.get_default_session() + sess = tf.compat.v1.get_default_session() new_context = list(sess.run(self.on_lstm_init_state)) if state['metadata']['trial_num'] != self.current_trial_num\ or context is None\ diff --git a/btgym/algorithms/utils.py b/btgym/algorithms/utils.py index 34aae914..da323aea 100644 --- a/btgym/algorithms/utils.py +++ b/btgym/algorithms/utils.py @@ -19,14 +19,14 @@ def rnn_placeholders(state): Returns: tuple of placeholders """ - if isinstance(state, tf.contrib.rnn.LSTMStateTuple): + if isinstance(state, tf.nn.rnn_cell.LSTMStateTuple): c, h = state - c = tf.placeholder(tf.float32, tf.TensorShape([None]).concatenate(c.get_shape()[1:]), c.op.name + '_c_pl') - h = tf.placeholder(tf.float32, tf.TensorShape([None]).concatenate(h.get_shape()[1:]), h.op.name + '_h_pl') - return tf.contrib.rnn.LSTMStateTuple(c, h) + c = tf.compat.v1.placeholder(tf.float32, tf.TensorShape([None]).concatenate(c.get_shape()[1:]), c.op.name + '_c_pl') + h = tf.compat.v1.placeholder(tf.float32, tf.TensorShape([None]).concatenate(h.get_shape()[1:]), h.op.name + '_h_pl') + return tf.nn.rnn_cell.LSTMStateTuple(c, h) elif isinstance(state, tf.Tensor): h = state - h = tf.placeholder(tf.float32, tf.TensorShape([None]).concatenate(h.get_shape()[1:]), h.op.name + '_h_pl') + h = tf.compat.v1.placeholder(tf.float32, tf.TensorShape([None]).concatenate(h.get_shape()[1:]), h.op.name + '_h_pl') return h else: structure = [rnn_placeholders(x) for x in state] @@ -49,7 +49,7 @@ def nested_placeholders(ob_space, batch_dim=None, name='nested'): out = {key: nested_placeholders(value, 
batch_dim, name + '_' + key) for key, value in ob_space.items()} return out else: - out = tf.placeholder(tf.float32, [batch_dim] + list(ob_space), name + '_pl') + out = tf.compat.v1.placeholder(tf.float32, [batch_dim] + list(ob_space), name + '_pl') return out diff --git a/btgym/algorithms/worker.py b/btgym/algorithms/worker.py index 5eec019a..76bf0865 100644 --- a/btgym/algorithms/worker.py +++ b/btgym/algorithms/worker.py @@ -16,10 +16,10 @@ import tensorflow as tf sys.path.insert(0, '..') -tf.logging.set_verbosity(tf.logging.INFO) +tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.INFO) -class FastSaver(tf.train.Saver): +class FastSaver(tf.compat.v1.train.Saver): """ Disables write_meta_graph argument, which freezes entire process and is mostly useless. @@ -188,7 +188,7 @@ def run(self): StreamHandler(sys.stdout).push_application() self.log = Logger('Worker_{}'.format(self.task), level=self.log_level) try: - tf.reset_default_graph() + tf.compat.v1.reset_default_graph() if self.test_mode: import gym @@ -198,22 +198,22 @@ def run(self): # Start tf.server: if self.job_name in 'ps': - server = tf.train.Server( + server = tf.distribute.Server( cluster, job_name=self.job_name, task_index=self.task, - config=tf.ConfigProto(device_filters=["/job:ps"]) + config=tf.compat.v1.ConfigProto(device_filters=["/job:ps"]) ) self.log.debug('parameters_server started.') # Just block here: server.join() else: - server = tf.train.Server( + server = tf.distribute.Server( cluster, job_name='worker', task_index=self.task, - config=tf.ConfigProto( + config=tf.compat.v1.ConfigProto( intra_op_parallelism_threads=4, # original was: 1 inter_op_parallelism_threads=4, # original was: 2 ) @@ -308,11 +308,11 @@ def run(self): self.log.debug('trainer ok.') # Saver-related: - variables_to_save = [v for v in tf.global_variables() if not 'local' in v.name] - local_variables = [v for v in tf.global_variables() if 'local' in v.name] + tf.local_variables() - init_op = tf.initializers.variables(variables_to_save) - local_init_op = tf.initializers.variables(local_variables) - init_all_op = tf.global_variables_initializer() + variables_to_save = [v for v in tf.compat.v1.global_variables() if not 'local' in v.name] + local_variables = [v for v in tf.compat.v1.global_variables() if 'local' in v.name] + tf.compat.v1.local_variables() + init_op = tf.compat.v1.initializers.variables(variables_to_save) + local_init_op = tf.compat.v1.initializers.variables(local_variables) + init_all_op = tf.compat.v1.global_variables_initializer() def init_fn(_sess): self.log.notice("initializing all parameters...") @@ -332,12 +332,12 @@ def init_fn(_sess): self.saver = FastSaver(var_list=variables_to_save, max_to_keep=1, save_relative_paths=True) - self.config = tf.ConfigProto(device_filters=["/job:ps", "/job:worker/task:{}/cpu:0".format(self.task)]) + self.config = tf.compat.v1.ConfigProto(device_filters=["/job:ps", "/job:worker/task:{}/cpu:0".format(self.task)]) - sess_manager = tf.train.SessionManager( + sess_manager = tf.compat.v1.train.SessionManager( local_init_op=local_init_op, ready_op=None, - ready_for_local_init_op=tf.report_uninitialized_variables(variables_to_save), + ready_for_local_init_op=tf.compat.v1.report_uninitialized_variables(variables_to_save), graph=None, recovery_wait_secs=90, ) @@ -364,7 +364,7 @@ def init_fn(_sess): self.log.info("connecting to the parameter server... 
") - self.summary_writer = tf.summary.FileWriter(self.summary_dir, sess.graph) + self.summary_writer = tf.compat.v1.summary.FileWriter(self.summary_dir, sess.graph) trainer.start(sess, self.summary_writer) # Note: `self.global_step` refers to number of environment steps diff --git a/btgym/monitor/tensorboard2.py b/btgym/monitor/tensorboard2.py index e1cad240..93021bd3 100644 --- a/btgym/monitor/tensorboard2.py +++ b/btgym/monitor/tensorboard2.py @@ -69,7 +69,7 @@ def __init__(self, p = psutil.Popen(['rm', '-R', ] + files, stdout=PIPE, stderr=PIPE) # Prepare writer: - self.writer = tf.summary.FileWriter(self.logdir, graph=tf.get_default_graph()) + self.writer = tf.compat.v1.summary.FileWriter(self.logdir, graph=tf.compat.v1.get_default_graph()) # Create summary: @@ -77,25 +77,25 @@ def __init__(self, for entry in scalars: assert type(entry) == str - self.feed_holder[entry] = tf.placeholder(tf.float32) - summaries += [tf.summary.scalar(entry, self.feed_holder[entry],)] + self.feed_holder[entry] = tf.compat.v1.placeholder(tf.float32) + summaries += [tf.compat.v1.summary.scalar(entry, self.feed_holder[entry],)] for entry in images: assert type(entry) == str - self.feed_holder[entry] = tf.placeholder(tf.uint8, [None, None, None, 3]) - summaries += [tf.summary.image(entry, self.feed_holder[entry], )] + self.feed_holder[entry] = tf.compat.v1.placeholder(tf.uint8, [None, None, None, 3]) + summaries += [tf.compat.v1.summary.image(entry, self.feed_holder[entry], )] for entry in histograms: assert type(entry) == str - self.feed_holder[entry] = tf.placeholder(tf.float32,[None, None],) - summaries += [tf.summary.histogram(entry, self.feed_holder[entry], )] + self.feed_holder[entry] = tf.compat.v1.placeholder(tf.float32,[None, None],) + summaries += [tf.compat.v1.summary.histogram(entry, self.feed_holder[entry], )] for entry in text: assert type(entry) == str - self.feed_holder[entry] = tf.placeholder(tf.string) - summaries += [tf.summary.histogram(entry, self.feed_holder[entry], )] + self.feed_holder[entry] = tf.compat.v1.placeholder(tf.string) + summaries += [tf.compat.v1.summary.histogram(entry, self.feed_holder[entry], )] - self.summary = tf.summary.merge(summaries) + self.summary = tf.compat.v1.summary.merge(summaries) def write(self, sess, feed_dict, global_step): """ diff --git a/btgym/research/b_vae_a3c.py b/btgym/research/b_vae_a3c.py index d6e13097..b1dc90fb 100644 --- a/btgym/research/b_vae_a3c.py +++ b/btgym/research/b_vae_a3c.py @@ -107,15 +107,15 @@ def __init__(self, self.rp_state_in = nested_placeholders(ob_space, batch_dim=None, name='rp_state_in') # Placeholders for concatenated action [one-hot] and reward [scalar]: - self.on_a_r_in = tf.placeholder(tf.float32, [None, ac_space + 1], name='on_policy_action_reward_in_pl') - self.off_a_r_in = tf.placeholder(tf.float32, [None, ac_space + 1], name='off_policy_action_reward_in_pl') + self.on_a_r_in = tf.compat.v1.placeholder(tf.float32, [None, ac_space + 1], name='on_policy_action_reward_in_pl') + self.off_a_r_in = tf.compat.v1.placeholder(tf.float32, [None, ac_space + 1], name='off_policy_action_reward_in_pl') # Placeholders for rnn batch and time-step dimensions: - self.on_batch_size = tf.placeholder(tf.int32, name='on_policy_batch_size') - self.on_time_length = tf.placeholder(tf.int32, name='on_policy_sequence_size') + self.on_batch_size = tf.compat.v1.placeholder(tf.int32, name='on_policy_batch_size') + self.on_time_length = tf.compat.v1.placeholder(tf.int32, name='on_policy_sequence_size') - self.off_batch_size = 
tf.placeholder(tf.int32, name='off_policy_batch_size') - self.off_time_length = tf.placeholder(tf.int32, name='off_policy_sequence_size') + self.off_batch_size = tf.compat.v1.placeholder(tf.int32, name='off_policy_batch_size') + self.off_time_length = tf.compat.v1.placeholder(tf.int32, name='off_policy_sequence_size') # ============= Base on-policy AAC network =========== @@ -133,7 +133,7 @@ def __init__(self, self.on_vae_d_kl_ext = on_d_kl_ext # Reshape rnn inputs for batch training as [rnn_batch_dim, rnn_time_dim, flattened_depth]: - x_shape_dynamic = tf.shape(on_aac_x_ext) + x_shape_dynamic = tf.shape(input=on_aac_x_ext) max_seq_len = tf.cast(x_shape_dynamic[0] / self.on_batch_size, tf.int32) x_shape_static = on_aac_x_ext.get_shape().as_list() @@ -158,7 +158,7 @@ def __init__(self, x_int_shape_static = on_x_int.get_shape().as_list() on_x_int = [ tf.reshape(on_x_int, [self.on_batch_size, max_seq_len, np.prod(x_int_shape_static[1:])])] - self.debug['state_internal_enc'] = tf.shape(on_x_int) + self.debug['state_internal_enc'] = tf.shape(input=on_x_int) else: # Feed as is: @@ -167,7 +167,7 @@ def __init__(self, self.on_state_in['internal'], [self.on_batch_size, max_seq_len, np.prod(x_int_shape_static[1:])] ) - self.debug['state_internal'] = tf.shape(self.on_state_in['internal']) + self.debug['state_internal'] = tf.shape(input=self.on_state_in['internal']) on_x_int = [on_x_int] self.on_state_decoded_int = None self.on_vae_d_kl_int = None @@ -177,7 +177,7 @@ def __init__(self, self.on_state_decoded_int = None self.on_vae_d_kl_int = None - self.debug['conv_input_to_lstm1'] = tf.shape(on_aac_x_ext) + self.debug['conv_input_to_lstm1'] = tf.shape(input=on_aac_x_ext) # Feed last last_reward into LSTM_1 layer along with encoded `external` state features: on_stage2_1_input = [on_aac_x_ext, on_a_r_in[..., -1][..., None]] #+ on_x_internal @@ -188,18 +188,18 @@ def __init__(self, # LSTM_1 full input: on_aac_x_ext = tf.concat(on_stage2_1_input, axis=-1) - self.debug['concat_input_to_lstm1'] = tf.shape(on_aac_x_ext) + self.debug['concat_input_to_lstm1'] = tf.shape(input=on_aac_x_ext) # First LSTM layer takes encoded `external` state: [on_x_lstm_1_out, self.on_lstm_1_init_state, self.on_lstm_1_state_out, self.on_lstm_1_state_pl_flatten] =\ lstm_network(on_aac_x_ext, self.on_time_length, lstm_class_ref, (lstm_layers[0],), name='lstm_1') - self.debug['on_x_lstm_1_out'] = tf.shape(on_x_lstm_1_out) - self.debug['self.on_lstm_1_state_out'] = tf.shape(self.on_lstm_1_state_out) - self.debug['self.on_lstm_1_state_pl_flatten'] = tf.shape(self.on_lstm_1_state_pl_flatten) + self.debug['on_x_lstm_1_out'] = tf.shape(input=on_x_lstm_1_out) + self.debug['self.on_lstm_1_state_out'] = tf.shape(input=self.on_lstm_1_state_out) + self.debug['self.on_lstm_1_state_pl_flatten'] = tf.shape(input=self.on_lstm_1_state_pl_flatten) # For time_flat only: Reshape on_lstm_1_state_out from [1,2,20,size] -->[20,1,2,size] --> [20,1, 2xsize]: - reshape_lstm_1_state_out = tf.transpose(self.on_lstm_1_state_out, [2, 0, 1, 3]) + reshape_lstm_1_state_out = tf.transpose(a=self.on_lstm_1_state_out, perm=[2, 0, 1, 3]) reshape_lstm_1_state_out_shape_static = reshape_lstm_1_state_out.get_shape().as_list() reshape_lstm_1_state_out = tf.reshape( reshape_lstm_1_state_out, @@ -212,7 +212,7 @@ def __init__(self, x_shape_static = on_x_lstm_1_out.get_shape().as_list() rsh_on_x_lstm_1_out = tf.reshape(on_x_lstm_1_out, [x_shape_dynamic[0], x_shape_static[-1]]) - self.debug['reshaped_on_x_lstm_1_out'] = tf.shape(rsh_on_x_lstm_1_out) + 
self.debug['reshaped_on_x_lstm_1_out'] = tf.shape(input=rsh_on_x_lstm_1_out) # Aac policy output and action-sampling function: [self.on_logits, _, self.on_sample] = dense_aac_network( @@ -231,20 +231,20 @@ def __init__(self, # LSTM_2 full input: on_aac_x_ext = tf.concat(on_stage2_2_input, axis=-1) - self.debug['on_stage2_2_input'] = tf.shape(on_aac_x_ext) + self.debug['on_stage2_2_input'] = tf.shape(input=on_aac_x_ext) [on_x_lstm_2_out, self.on_lstm_2_init_state, self.on_lstm_2_state_out, self.on_lstm_2_state_pl_flatten] = \ lstm_network(on_aac_x_ext, self.on_time_length, lstm_class_ref, (lstm_layers[-1],), name='lstm_2') - self.debug['on_x_lstm_2_out'] = tf.shape(on_x_lstm_2_out) - self.debug['self.on_lstm_2_state_out'] = tf.shape(self.on_lstm_2_state_out) - self.debug['self.on_lstm_2_state_pl_flatten'] = tf.shape(self.on_lstm_2_state_pl_flatten) + self.debug['on_x_lstm_2_out'] = tf.shape(input=on_x_lstm_2_out) + self.debug['self.on_lstm_2_state_out'] = tf.shape(input=self.on_lstm_2_state_out) + self.debug['self.on_lstm_2_state_pl_flatten'] = tf.shape(input=self.on_lstm_2_state_pl_flatten) # Reshape back to [batch, flattened_depth], where batch = rnn_batch_dim * rnn_time_dim: x_shape_static = on_x_lstm_2_out.get_shape().as_list() on_x_lstm_out = tf.reshape(on_x_lstm_2_out, [x_shape_dynamic[0], x_shape_static[-1]]) - self.debug['reshaped_on_x_lstm_out'] = tf.shape(on_x_lstm_out) + self.debug['reshaped_on_x_lstm_out'] = tf.shape(input=on_x_lstm_out) # Aac value function: [_, self.on_vf, _] = dense_aac_network( @@ -274,7 +274,7 @@ def __init__(self, self.off_vae_d_kl_ext = off_d_kl_ext # Reshape rnn inputs for batch training as [rnn_batch_dim, rnn_time_dim, flattened_depth]: - x_shape_dynamic = tf.shape(off_aac_x) + x_shape_dynamic = tf.shape(input=off_aac_x) max_seq_len = tf.cast(x_shape_dynamic[0] / self.off_batch_size, tf.int32) x_shape_static = off_aac_x.get_shape().as_list() @@ -397,7 +397,7 @@ def __init__(self, # Aux3: # `Reward prediction` network. - self.rp_batch_size = tf.placeholder(tf.int32, name='rp_batch_size') + self.rp_batch_size = tf.compat.v1.placeholder(tf.int32, name='rp_batch_size') # Shared conv. 
output: rp_encoded_layers, rp_x, rp_decoded_layers, _, rp_d_kl = encoder_class_ref( @@ -421,18 +421,18 @@ def __init__(self, pass except AttributeError: - self.train_phase = tf.placeholder_with_default( + self.train_phase = tf.compat.v1.placeholder_with_default( tf.constant(False, dtype=tf.bool), shape=(), name='train_phase_flag_pl' ) - self.update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS) + self.update_ops = tf.compat.v1.get_collection(tf.compat.v1.GraphKeys.UPDATE_OPS) # Add moving averages to save list: - moving_var_list = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, tf.get_variable_scope().name + '.*moving.*') - renorm_var_list = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, tf.get_variable_scope().name + '.*renorm.*') + moving_var_list = tf.compat.v1.get_collection(tf.compat.v1.GraphKeys.GLOBAL_VARIABLES, tf.compat.v1.get_variable_scope().name + '.*moving.*') + renorm_var_list = tf.compat.v1.get_collection(tf.compat.v1.GraphKeys.GLOBAL_VARIABLES, tf.compat.v1.get_variable_scope().name + '.*renorm.*') # What to save: - self.var_list = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, tf.get_variable_scope().name) + self.var_list = tf.compat.v1.get_collection(tf.compat.v1.GraphKeys.TRAINABLE_VARIABLES, tf.compat.v1.get_variable_scope().name) self.var_list += moving_var_list + renorm_var_list # Callbacks: @@ -465,7 +465,7 @@ def get_initial_features(self, state, context=None): KeyError if [`metadata`]:[`trial_num`,`type`] keys not found """ try: - sess = tf.get_default_session() + sess = tf.compat.v1.get_default_session() new_context = list(sess.run(self.on_lstm_init_state)) if state['metadata']['trial_num'] != self.current_trial_num\ or context is None\ @@ -503,7 +503,7 @@ def __init__(self, ae_loss=beta_vae_loss_def, ae_alpha=1.0, ae_beta=1.0, _log_na try: super(bVAENA3C, self).__init__(name=_log_name, **kwargs) with tf.device(self.worker_device): - with tf.variable_scope('local'): + with tf.compat.v1.variable_scope('local'): on_vae_loss_ext, on_ae_summary_ext = ae_loss( targets=self.local_network.on_state_in['external'], logits=self.local_network.on_state_decoded_ext, @@ -531,7 +531,7 @@ def __init__(self, ae_loss=beta_vae_loss_def, ae_alpha=1.0, ae_beta=1.0, _log_na # Override train op def: self.grads, _ = tf.clip_by_global_norm( - tf.gradients(self.loss, self.local_network.var_list), + tf.gradients(ys=self.loss, xs=self.local_network.var_list), 40.0 ) grads_and_vars = list(zip(self.grads, self.network.var_list)) @@ -539,7 +539,7 @@ def __init__(self, ae_loss=beta_vae_loss_def, ae_alpha=1.0, ae_beta=1.0, _log_na # Merge summary: extended_summary.append(self.model_summary_op) - self.model_summary_op = tf.summary.merge(extended_summary, name='extended_summary') + self.model_summary_op = tf.compat.v1.summary.merge(extended_summary, name='extended_summary') except: msg = 'Child 0.0 class __init()__ exception occurred' + \ diff --git a/btgym/research/casual/aac.py b/btgym/research/casual/aac.py index 7f27de5a..e389e959 100644 --- a/btgym/research/casual/aac.py +++ b/btgym/research/casual/aac.py @@ -279,7 +279,7 @@ def _make_loss(self, **kwargs): # Guidance annealing: if self.guided_decay_steps is not None: - self.guided_lambda_decayed = tf.train.polynomial_decay( + self.guided_lambda_decayed = tf.compat.v1.train.polynomial_decay( self.guided_lambda, self.global_step + 1, self.guided_decay_steps, @@ -327,14 +327,14 @@ def _make_train_op(self, pi, pi_prime, pi_global): """ # Each worker gets a different set of adam optimizer parameters: - self.optimizer = 
tf.train.AdamOptimizer(self.train_learn_rate, epsilon=1e-5) + self.optimizer = tf.compat.v1.train.AdamOptimizer(self.train_learn_rate, epsilon=1e-5) # Clipped gradients: self.grads, _ = tf.clip_by_global_norm( - tf.gradients(self.loss, pi.var_list), + tf.gradients(ys=self.loss, xs=pi.var_list), 40.0 ) - self.grads_global_norm = tf.global_norm(self.grads) + self.grads_global_norm = tf.linalg.global_norm(self.grads) # Copy weights from the parameter server to the local model self.sync = self.sync_pi = tf.group( *[v1.assign(v2) for v1, v2 in zip(pi.var_list, pi_global.var_list)] @@ -351,21 +351,21 @@ def _make_train_op(self, pi, pi_prime, pi_global): assert 'external' in obs_space_keys, \ 'Expected observation space to contain `external` mode, got: {}'.format(obs_space_keys) - self.inc_step = self.global_step.assign_add(tf.shape(pi.on_state_in['external'])[0]) + self.inc_step = self.global_step.assign_add(tf.shape(input=pi.on_state_in['external'])[0]) self.local_network.meta.grads_and_vars = list( zip(self.local_network.meta.grads, self.network.meta.var_list) ) - self.meta_opt = tf.train.GradientDescentOptimizer(self.local_network.meta.learn_rate) + self.meta_opt = tf.compat.v1.train.GradientDescentOptimizer(self.local_network.meta.learn_rate) self.meta_train_op = self.meta_opt.apply_gradients(self.local_network.meta.grads_and_vars) - self.local_network.meta.sync_slot_op = tf.assign( + self.local_network.meta.sync_slot_op = tf.compat.v1.assign( self.local_network.meta.cluster_averages_slot, self.network.meta.cluster_averages_slot, ) - self.local_network.meta.send_stat_op = tf.scatter_nd_update( + self.local_network.meta.send_stat_op = tf.compat.v1.scatter_nd_update( self.network.meta.cluster_averages_slot, [[0, self.task], [1, self.task]], [ diff --git a/btgym/research/casual_conv/layers.py b/btgym/research/casual_conv/layers.py index 8fa01ca4..d47e75a8 100644 --- a/btgym/research/casual_conv/layers.py +++ b/btgym/research/casual_conv/layers.py @@ -4,22 +4,22 @@ def time_to_batch(value, dilation, name=None): - with tf.name_scope('time_to_batch'): - shape = tf.shape(value) + with tf.compat.v1.name_scope('time_to_batch'): + shape = tf.shape(input=value) pad_elements = dilation - 1 - (shape[1] + dilation - 1) % dilation - padded = tf.pad(value, [[0, 0], [0, pad_elements], [0, 0]]) + padded = tf.pad(tensor=value, paddings=[[0, 0], [0, pad_elements], [0, 0]]) reshaped = tf.reshape(padded, [-1, dilation, shape[2]]) - transposed = tf.transpose(reshaped, perm=[1, 0, 2]) + transposed = tf.transpose(a=reshaped, perm=[1, 0, 2]) return tf.reshape(transposed, [shape[0] * dilation, -1, shape[2]]) def batch_to_time(value, dilation, name=None): - with tf.name_scope('batch_to_time'): - shape = tf.shape(value) + with tf.compat.v1.name_scope('batch_to_time'): + shape = tf.shape(input=value) prepared = tf.reshape(value, [dilation, -1, shape[2]]) - transposed = tf.transpose(prepared, perm=[1, 0, 2]) + transposed = tf.transpose(a=prepared, perm=[1, 0, 2]) return tf.reshape(transposed, - [tf.div(shape[0], dilation), -1, shape[2]]) + [tf.compat.v1.div(shape[0], dilation), -1, shape[2]]) def dilated_conv1d( @@ -32,7 +32,7 @@ def dilated_conv1d( name='dialted_conv_1d', reuse=False ): - with tf.name_scope(name): + with tf.compat.v1.name_scope(name): if dilation_rate > 1: transformed = time_to_batch(inputs, dilation_rate) conv = conv1d( @@ -56,7 +56,7 @@ def dilated_conv1d( reuse=reuse ) # Remove excess elements at the end. 
- out_width = tf.shape(inputs)[1] - (filter_width - 1) * dilation_rate + out_width = tf.shape(input=inputs)[1] - (filter_width - 1) * dilation_rate result = tf.slice(restored, [0, 0, 0], [-1, out_width, -1]) diff --git a/btgym/research/casual_conv/networks.py b/btgym/research/casual_conv/networks.py index cef1c333..ac48c641 100644 --- a/btgym/research/casual_conv/networks.py +++ b/btgym/research/casual_conv/networks.py @@ -32,7 +32,7 @@ def conv_1d_casual_encoder( tensor holding state features; """ - with tf.variable_scope(name_or_scope=name, reuse=reuse): + with tf.compat.v1.variable_scope(name_or_scope=name, reuse=reuse): shape = x.get_shape().as_list() if len(shape) > 3: # remove pseudo 2d dimension x = x[:, :, 0, :] @@ -53,7 +53,7 @@ def conv_1d_casual_encoder( if tail != 0: pad = conv_1d_filter_size - tail paddings = [[0, 0], [pad, 0], [0, 0]] - y = tf.pad(y, paddings) + y = tf.pad(tensor=y, paddings=paddings) length += pad # print('padded_length: ', length) @@ -83,7 +83,7 @@ def conv_1d_casual_encoder( y = conv_1d_activation(y) if keep_prob is not None: - y = tf.nn.dropout(y, keep_prob=keep_prob, name="_layer_{}_with_dropout".format(i)) + y = tf.nn.dropout(y, rate=1 - (keep_prob), name="_layer_{}_with_dropout".format(i)) layers.append(y) @@ -171,7 +171,7 @@ def attention_layer(inputs, attention_ref=tf.contrib.seq2seq.LuongAttention, nam # Suppose there is no previous context for attention (uhm?): alignments = attention_mechanism( query_state, - attention_mechanism.initial_alignments(tf.shape(inputs)[0], dtype=tf.float32) + attention_mechanism.initial_alignments(tf.shape(input=inputs)[0], dtype=tf.float32) ) # Somehow attention call returns tuple of twin tensors (wtf?): if isinstance(alignments, tuple): @@ -219,7 +219,7 @@ def conv_1d_casual_attention_encoder( tensor holding state features; """ - with tf.variable_scope(name_or_scope=name, reuse=reuse): + with tf.compat.v1.variable_scope(name_or_scope=name, reuse=reuse): shape = x.get_shape().as_list() if len(shape) > 3: # remove pseudo 2d dimension x = x[:, :, 0, :] @@ -240,7 +240,7 @@ def conv_1d_casual_attention_encoder( if tail != 0: pad = conv_1d_filter_size - tail paddings = [[0, 0], [pad, 0], [0, 0]] - y = tf.pad(y, paddings) + y = tf.pad(tensor=y, paddings=paddings) length += pad # print('padded_length: ', length) @@ -268,7 +268,7 @@ def conv_1d_casual_attention_encoder( y = conv_1d_activation(y) if keep_prob is not None: - y = tf.nn.dropout(y, keep_prob=keep_prob, name="_layer_{}_with_dropout".format(i)) + y = tf.nn.dropout(y, rate=1 - (keep_prob), name="_layer_{}_with_dropout".format(i)) if conv_1d_gated: split_size = int(conv_1d_num_filters / 2) diff --git a/btgym/research/encoder_test/aac.py b/btgym/research/encoder_test/aac.py index d27d82dc..a9997d3e 100644 --- a/btgym/research/encoder_test/aac.py +++ b/btgym/research/encoder_test/aac.py @@ -553,13 +553,13 @@ def _make_loss(self, pi, pi_prime, name='base', verbose=True, **kwargs): tensor holding estimated loss graph list of related summaries """ - with tf.name_scope(name): + with tf.compat.v1.name_scope(name): # On-policy AAC loss definition: - pi.on_pi_act_target = tf.placeholder( + pi.on_pi_act_target = tf.compat.v1.placeholder( tf.float32, [None, self.ref_env.action_space.one_hot_depth], name="on_policy_action_pl" ) - pi.on_pi_adv_target = tf.placeholder(tf.float32, [None], name="on_policy_advantage_pl") - pi.on_pi_r_target = tf.placeholder(tf.float32, [None], name="on_policy_return_pl") + pi.on_pi_adv_target = tf.compat.v1.placeholder(tf.float32, [None], 
name="on_policy_advantage_pl") + pi.on_pi_r_target = tf.compat.v1.placeholder(tf.float32, [None], name="on_policy_return_pl") # clip_epsilon = tf.cast(self.clip_epsilon * self.learn_rate_decayed / self.opt_learn_rate, tf.float32) # @@ -576,20 +576,20 @@ def _make_loss(self, pi, pi_prime, name='base', verbose=True, **kwargs): # verbose=verbose # ) pi_regression = tf.exp(pi.regression) - regress_loss = tf.losses.mean_squared_error( + regress_loss = tf.compat.v1.losses.mean_squared_error( labels=pi.regression_targets, predictions=pi_regression, weights=self.regress_lambda, ) - self.mse = tf.metrics.mean_squared_error( + self.mse = tf.compat.v1.metrics.mean_squared_error( labels=pi.regression_targets, predictions=pi_regression ) model_summaries = [ - tf.summary.scalar('regress_loss', regress_loss), - tf.summary.scalar('mse_metric', self.mse[0]) + tf.compat.v1.summary.scalar('regress_loss', regress_loss), + tf.compat.v1.summary.scalar('mse_metric', self.mse[0]) ] # Accumulate total loss: # loss = float(self.class_lambda) * regress_loss + float(self.aac_lambda) * on_pi_loss\ @@ -615,14 +615,14 @@ def _make_train_op(self, pi, pi_prime, pi_global): """ # Each worker gets a different set of adam optimizer parameters: - self.optimizer = tf.train.AdamOptimizer(self.train_learn_rate, epsilon=1e-5) + self.optimizer = tf.compat.v1.train.AdamOptimizer(self.train_learn_rate, epsilon=1e-5) # Clipped gradients: self.grads, _ = tf.clip_by_global_norm( - tf.gradients(self.loss, pi.var_list), + tf.gradients(ys=self.loss, xs=pi.var_list), 40.0 ) - self.grads_global_norm = tf.global_norm(self.grads) + self.grads_global_norm = tf.linalg.global_norm(self.grads) # Copy weights from the parameter server to the local model: self.sync = self.sync_pi = tf.group( @@ -647,7 +647,7 @@ def _make_train_op(self, pi, pi_prime, pi_global): stream = pi.on_state_in['external'][list(pi.on_state_in['external'].keys())[0]] else: stream = pi.on_state_in['external'] - self.inc_step = self.global_step.assign_add(tf.shape(stream)[0]) + self.inc_step = self.global_step.assign_add(tf.shape(input=stream)[0]) train_op = [self.optimizer.apply_gradients(grads_and_vars), self.mse] @@ -665,64 +665,64 @@ def _combine_summaries(self, policy=None, model_summaries=None): if model_summaries is not None: if self.use_global_network: # Model-wide statistics: - with tf.name_scope('model'): + with tf.compat.v1.name_scope('model'): model_summaries += [ - tf.summary.scalar("grad_global_norm", self.grads_global_norm), - tf.summary.scalar("learn_rate", self.learn_rate_decayed), + tf.compat.v1.summary.scalar("grad_global_norm", self.grads_global_norm), + tf.compat.v1.summary.scalar("learn_rate", self.learn_rate_decayed), # cause actual rate is a jaggy due to test freezes - tf.summary.scalar("total_loss", self.loss), + tf.compat.v1.summary.scalar("total_loss", self.loss), ] if policy is not None: - model_summaries += [tf.summary.scalar("var_global_norm", tf.global_norm(policy.var_list))] + model_summaries += [tf.compat.v1.summary.scalar("var_global_norm", tf.linalg.global_norm(policy.var_list))] else: model_summaries = [] # Model stat. 
summary: - model_summary = tf.summary.merge(model_summaries, name='model_summary') + model_summary = tf.compat.v1.summary.merge(model_summaries, name='model_summary') # Episode-related summaries: ep_summary = dict( # Summary placeholders - render_atari=tf.placeholder(tf.uint8, [None, None, None, 1]), - total_r=tf.placeholder(tf.float32, ), - cpu_time=tf.placeholder(tf.float32, ), - final_value=tf.placeholder(tf.float32, ), - steps=tf.placeholder(tf.int32, ), + render_atari=tf.compat.v1.placeholder(tf.uint8, [None, None, None, 1]), + total_r=tf.compat.v1.placeholder(tf.float32, ), + cpu_time=tf.compat.v1.placeholder(tf.float32, ), + final_value=tf.compat.v1.placeholder(tf.float32, ), + steps=tf.compat.v1.placeholder(tf.int32, ), ) if self.test_mode: # For Atari: - ep_summary['render_op'] = tf.summary.image("model/state", ep_summary['render_atari']) + ep_summary['render_op'] = tf.compat.v1.summary.image("model/state", ep_summary['render_atari']) else: # BTGym rendering: ep_summary.update( { - mode: tf.placeholder(tf.uint8, [None, None, None, None], name=mode + '_pl') + mode: tf.compat.v1.placeholder(tf.uint8, [None, None, None, None], name=mode + '_pl') for mode in self.env_list[0].render_modes + self.aux_render_modes } ) - ep_summary['render_op'] = tf.summary.merge( - [tf.summary.image(mode, ep_summary[mode]) + ep_summary['render_op'] = tf.compat.v1.summary.merge( + [tf.compat.v1.summary.image(mode, ep_summary[mode]) for mode in self.env_list[0].render_modes + self.aux_render_modes] ) # Episode stat. summary: - ep_summary['btgym_stat_op'] = tf.summary.merge( + ep_summary['btgym_stat_op'] = tf.compat.v1.summary.merge( [ - tf.summary.scalar('episode_train/cpu_time_sec', ep_summary['cpu_time']), - tf.summary.scalar('episode_train/total_reward', ep_summary['total_r']), + tf.compat.v1.summary.scalar('episode_train/cpu_time_sec', ep_summary['cpu_time']), + tf.compat.v1.summary.scalar('episode_train/total_reward', ep_summary['total_r']), ], name='episode_train_btgym' ) # Test episode stat. 
summary: - ep_summary['test_btgym_stat_op'] = tf.summary.merge( + ep_summary['test_btgym_stat_op'] = tf.compat.v1.summary.merge( [ - tf.summary.scalar('episode_test/total_reward', ep_summary['total_r']), + tf.compat.v1.summary.scalar('episode_test/total_reward', ep_summary['total_r']), ], name='episode_test_btgym' ) - ep_summary['atari_stat_op'] = tf.summary.merge( + ep_summary['atari_stat_op'] = tf.compat.v1.summary.merge( [ - tf.summary.scalar('episode/total_reward', ep_summary['total_r']), - tf.summary.scalar('episode/steps', ep_summary['steps']) + tf.compat.v1.summary.scalar('episode/total_reward', ep_summary['total_r']), + tf.compat.v1.summary.scalar('episode/steps', ep_summary['steps']) ], name='episode_atari' ) diff --git a/btgym/research/encoder_test/networks.py b/btgym/research/encoder_test/networks.py index b115cc82..24852510 100644 --- a/btgym/research/encoder_test/networks.py +++ b/btgym/research/encoder_test/networks.py @@ -33,7 +33,7 @@ def conv_2d_network_skip(x, """ assert conv_2d_num_filters[-1] % 2 == 0 layers = [] - with tf.variable_scope(name, reuse=reuse): + with tf.compat.v1.variable_scope(name, reuse=reuse): for i, num_filters in enumerate(conv_2d_num_filters): x = tf.nn.elu( norm_layer( @@ -52,7 +52,7 @@ def conv_2d_network_skip(x, ) ) if keep_prob is not None: - x = tf.nn.dropout(x, keep_prob=keep_prob, name="_layer_{}_with_dropout".format(i + 1)) + x = tf.nn.dropout(x, rate=1 - (keep_prob), name="_layer_{}_with_dropout".format(i + 1)) layers.append(x) @@ -72,7 +72,7 @@ def conv_2d_network_skip(x, # print('{}.shape = {}'.format(x.name, x.get_shape().as_list())) if conv_2d_enable_skip: - x = tf.concat([tf.layers.flatten(l) for l in layers], axis=-1, name='flattened_encoded_state') + x = tf.concat([tf.compat.v1.layers.flatten(l) for l in layers], axis=-1, name='flattened_encoded_state') # print('{}.shape = {}'.format(x.name, x.get_shape().as_list())) return x @@ -85,7 +85,7 @@ def identity_encoder(x, name='identity_encoder', **kwargs): Returns: tensor holding state features; """ - with tf.variable_scope(name,): - x = tf.layers.flatten(x) + with tf.compat.v1.variable_scope(name,): + x = tf.compat.v1.layers.flatten(x) return x \ No newline at end of file diff --git a/btgym/research/encoder_test/policy.py b/btgym/research/encoder_test/policy.py index 9383595b..e5fb05d0 100644 --- a/btgym/research/encoder_test/policy.py +++ b/btgym/research/encoder_test/policy.py @@ -72,7 +72,7 @@ def __init__(self, self.encode_internal_state = encode_internal_state self.share_encoder_params = share_encoder_params if self.share_encoder_params: - self.reuse_encoder_params = tf.AUTO_REUSE + self.reuse_encoder_params = tf.compat.v1.AUTO_REUSE else: self.reuse_encoder_params = False @@ -88,16 +88,16 @@ def __init__(self, self.on_state_in = nested_placeholders(self.ob_space.shape, batch_dim=None, name='on_policy_state_in') # Placeholders for previous step action[multi-categorical vector encoding] and reward [scalar]: - self.on_last_a_in = tf.placeholder( + self.on_last_a_in = tf.compat.v1.placeholder( tf.float32, [None, self.ac_space.encoded_depth], name='on_policy_last_action_in_pl' ) - self.on_last_reward_in = tf.placeholder(tf.float32, [None], name='on_policy_last_reward_in_pl') + self.on_last_reward_in = tf.compat.v1.placeholder(tf.float32, [None], name='on_policy_last_reward_in_pl') # Placeholders for rnn batch and time-step dimensions: - self.on_batch_size = tf.placeholder(tf.int32, name='on_policy_batch_size') - self.on_time_length = tf.placeholder(tf.int32, 
name='on_policy_sequence_size') + self.on_batch_size = tf.compat.v1.placeholder(tf.int32, name='on_policy_batch_size') + self.on_time_length = tf.compat.v1.placeholder(tf.int32, name='on_policy_sequence_size') self.debug['on_state_in_keys'] = list(self.on_state_in.keys()) @@ -111,7 +111,7 @@ def __init__(self, pass except AttributeError: - self.train_phase = tf.placeholder_with_default( + self.train_phase = tf.compat.v1.placeholder_with_default( tf.constant(False, dtype=tf.bool), shape=(), name='train_phase_flag_pl' @@ -146,7 +146,7 @@ def __init__(self, else: layer_name_template = 'encoded_{}_{}' encoded_streams = { - name: tf.layers.flatten( + name: tf.compat.v1.layers.flatten( self.state_encoder_class_ref( x=stream, ob_space=self.ob_space.shape[key][name], @@ -165,7 +165,7 @@ def __init__(self, ) else: # Got single data stream: - encoded_mode = tf.layers.flatten( + encoded_mode = tf.compat.v1.layers.flatten( self.state_encoder_class_ref( x=self.on_state_in[key], ob_space=self.ob_space.shape[key], @@ -184,14 +184,14 @@ def __init__(self, # TODO: for encoder prediction test, output `naive` estimates for logits and value directly from encoder: [self.on_simple_logits, self.on_simple_value, _] = dense_aac_network( - tf.layers.flatten(on_aac_x), + tf.compat.v1.layers.flatten(on_aac_x), ac_space_depth=self.ac_space.one_hot_depth, linear_layer_ref=linear_layer_ref, name='aac_dense_simple_pi_v' ) # Reshape rnn inputs for batch training as: [rnn_batch_dim, rnn_time_dim, flattened_depth]: - x_shape_dynamic = tf.shape(on_aac_x) + x_shape_dynamic = tf.shape(input=on_aac_x) max_seq_len = tf.cast(x_shape_dynamic[0] / self.on_batch_size, tf.int32) x_shape_static = on_aac_x.get_shape().as_list() @@ -280,7 +280,7 @@ def __init__(self, self.debug['self.on_lstm_1_state_pl_flatten'] = self.on_lstm_1_state_pl_flatten # For time_flat only: Reshape on_lstm_1_state_out from [1,2,20,size] -->[20,1,2,size] --> [20,1, 2xsize]: - reshape_lstm_1_state_out = tf.transpose(self.on_lstm_1_state_out, [2, 0, 1, 3]) + reshape_lstm_1_state_out = tf.transpose(a=self.on_lstm_1_state_out, perm=[2, 0, 1, 3]) reshape_lstm_1_state_out_shape_static = reshape_lstm_1_state_out.get_shape().as_list() # Take policy logits off first LSTM-dense layer: @@ -363,7 +363,7 @@ def __init__(self, # ) #self.regression = tf.layers.flatten(self.debug['on_state_external_encoded']) self.regression = linear( - x=tf.layers.flatten(self.debug['on_state_external_encoded']), + x=tf.compat.v1.layers.flatten(self.debug['on_state_external_encoded']), size=self.regression_targets.shape.as_list()[-1], initializer=normalized_columns_initializer(0.1), name='on_dense_simple_regression', @@ -379,7 +379,7 @@ def __init__(self, # name='on_dense_rnn_regression' # ) self.regression = linear( - x=tf.layers.flatten(self.debug['reshaped_on_x_lstm_2_out']), + x=tf.compat.v1.layers.flatten(self.debug['reshaped_on_x_lstm_2_out']), size=self.regression_targets.shape.as_list()[-1], initializer=normalized_columns_initializer(0.1), name='on_dense_rnn_regression', @@ -394,13 +394,13 @@ def __init__(self, self.on_lstm_state_pl_flatten = self.on_lstm_1_state_pl_flatten + self.on_lstm_2_state_pl_flatten # Batch-norm related: - self.update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS) + self.update_ops = tf.compat.v1.get_collection(tf.compat.v1.GraphKeys.UPDATE_OPS) # Add moving averages to save list: - moving_var_list = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, tf.get_variable_scope().name + '.*moving.*') - renorm_var_list = 
tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, tf.get_variable_scope().name + '.*renorm.*') + moving_var_list = tf.compat.v1.get_collection(tf.compat.v1.GraphKeys.GLOBAL_VARIABLES, tf.compat.v1.get_variable_scope().name + '.*moving.*') + renorm_var_list = tf.compat.v1.get_collection(tf.compat.v1.GraphKeys.GLOBAL_VARIABLES, tf.compat.v1.get_variable_scope().name + '.*renorm.*') # What to save: - self.var_list = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, tf.get_variable_scope().name) + self.var_list = tf.compat.v1.get_collection(tf.compat.v1.GraphKeys.TRAINABLE_VARIABLES, tf.compat.v1.get_variable_scope().name) self.var_list += moving_var_list + renorm_var_list # RL2 related: @@ -434,7 +434,7 @@ def get_initial_features(self, state, context=None): KeyError if [`metadata`]:[`trial_num`,`type`] keys not found """ try: - sess = tf.get_default_session() + sess = tf.compat.v1.get_default_session() new_context = list(sess.run(self.on_lstm_init_state)) if state['metadata']['trial_num'] != self.current_trial_num\ or context is None\ @@ -473,7 +473,7 @@ def act(self, observation, lstm_state, last_action, last_reward): Returns: Action as dictionary of several action encodings, actions logits, V-fn value, output RNN state """ - sess = tf.get_default_session() + sess = tf.compat.v1.get_default_session() feeder = {pl: value for pl, value in zip(self.on_lstm_state_pl_flatten, flatten_nested(lstm_state))} feeder.update(feed_dict_from_nested(self.on_state_in, observation, expand_batch=True)) feeder.update( diff --git a/btgym/research/gps/aac.py b/btgym/research/gps/aac.py index 73cac65b..68c6ae54 100644 --- a/btgym/research/gps/aac.py +++ b/btgym/research/gps/aac.py @@ -93,7 +93,7 @@ def _make_loss(self, **kwargs): # Guidance annealing: if self.guided_decay_steps is not None: - self.guided_lambda_decayed = tf.train.polynomial_decay( + self.guided_lambda_decayed = tf.compat.v1.train.polynomial_decay( self.guided_lambda, self.global_step + 1, self.guided_decay_steps, diff --git a/btgym/research/gps/loss.py b/btgym/research/gps/loss.py index cf221afa..056c081a 100644 --- a/btgym/research/gps/loss.py +++ b/btgym/research/gps/loss.py @@ -14,17 +14,17 @@ def guided_aac_loss_def_0_0(pi_actions, expert_actions, name='on_policy/aac', ve tensor holding estimated imitation loss; list of related tensorboard summaries. """ - with tf.name_scope(name + '/guided_loss'): + with tf.compat.v1.name_scope(name + '/guided_loss'): # Loss over expert action's distribution: - neg_pi_log_prob = tf.nn.softmax_cross_entropy_with_logits_v2( + neg_pi_log_prob = tf.nn.softmax_cross_entropy_with_logits( logits=pi_actions, - labels=tf.argmax(expert_actions, axis=-1) + labels=tf.argmax(input=expert_actions, axis=-1) ) - loss = tf.reduce_mean(neg_pi_log_prob) + loss = tf.reduce_mean(input_tensor=neg_pi_log_prob) if verbose: - summaries = [tf.summary.scalar('actions_ce', loss)] + summaries = [tf.compat.v1.summary.scalar('actions_ce', loss)] else: summaries = [] @@ -44,18 +44,18 @@ def guided_aac_loss_def_0_1(pi_actions, expert_actions, name='on_policy/aac', ve tensor holding estimated imitation loss; list of related tensorboard summaries. """ - with tf.name_scope(name + '/guided_loss'): + with tf.compat.v1.name_scope(name + '/guided_loss'): # Loss over expert buy/ sell: # Cross-entropy on subset?... 
- neg_pi_log_prob = tf.nn.softmax_cross_entropy_with_logits_v2( + neg_pi_log_prob = tf.nn.softmax_cross_entropy_with_logits( logits=pi_actions[..., 1:3], labels=expert_actions[..., 1:3] ) - loss = tf.reduce_mean(neg_pi_log_prob) + loss = tf.reduce_mean(input_tensor=neg_pi_log_prob) if verbose: - summaries = [tf.summary.scalar('actions_ce', loss)] + summaries = [tf.compat.v1.summary.scalar('actions_ce', loss)] else: summaries = [] @@ -75,20 +75,20 @@ def guided_aac_loss_def_0_3(pi_actions, expert_actions, name='on_policy/aac', ve tensor holding estimated imitation loss; list of related tensorboard summaries. """ - with tf.name_scope(name + '/guided_loss'): + with tf.compat.v1.name_scope(name + '/guided_loss'): if 'guided_lambda' in kwargs.keys(): guided_lambda = kwargs['guided_lambda'] else: guided_lambda = 1.0 # Loss over expert buy/ sell: - loss = tf.losses.mean_squared_error( + loss = tf.compat.v1.losses.mean_squared_error( labels=expert_actions[..., 1:3], predictions=tf.nn.softmax(pi_actions)[..., 1:3], ) * guided_lambda if verbose: - summaries = [tf.summary.scalar('actions_mse', loss)] + summaries = [tf.compat.v1.summary.scalar('actions_mse', loss)] else: summaries = [] diff --git a/btgym/research/metalearn_2/_fwrnn_aac.py b/btgym/research/metalearn_2/_fwrnn_aac.py index 52cf1a54..e69de29b 100644 --- a/btgym/research/metalearn_2/_fwrnn_aac.py +++ b/btgym/research/metalearn_2/_fwrnn_aac.py @@ -1,188 +0,0 @@ -import tensorflow as tf -import time - -import sys -from logbook import Logger, StreamHandler - -from btgym.research.gps.aac import GuidedAAC -from btgym.algorithms.runner.synchro import BaseSynchroRunner - - -class MetaAAC_2_0(GuidedAAC): - """ - RNN adaptation experiment - """ - - def __init__( - self, - trial_source_target_cycle=(1, 0), - num_episodes_per_trial=1, - name='AAC_FWRNN_Ada', - **kwargs - ): - runner_config = { - 'class_ref': BaseSynchroRunner, - 'kwargs': { - 'data_sample_config': {'mode': 0}, - 'name': '', - }, - } - super(MetaAAC_2_0, self).__init__( - runner_config=runner_config, - name=name, - **kwargs - ) - - self.current_data = None - self.current_feed_dict = None - - # Trials sampling control: - self.num_source_trials = trial_source_target_cycle[0] - self.num_target_trials = trial_source_target_cycle[-1] - self.num_episodes_per_trial = num_episodes_per_trial - - # Note that only master (test runner) is requesting trials - - self.current_source_trial = 0 - self.current_target_trial = 0 - self.current_trial_mode = 0 # source - self.current_episode = 0 - - def get_sample_config(self, mode=0, **kwargs): - """ - Returns environment configuration parameters for next episode to sample. 
- - Args: - mode: bool, False for slave (train data), True for master (test data) - - Returns: - configuration dictionary of type `btgym.datafeed.base.EnvResetConfig` - """ - - new_trial = 0 - - # Only master environment updates counters: - if self.current_episode >= self.num_episodes_per_trial: - # Reset episode counter: - self.current_episode = 0 - - # Request new trial: - new_trial = 1 - # Decide on trial type (source/target): - if self.current_source_trial >= self.num_source_trials: - # Time to switch to target mode: - self.current_trial_mode = 1 - # Reset counters: - self.current_source_trial = 0 - self.current_target_trial = 0 - - if self.current_target_trial >= self.num_target_trials: - # Vise versa: - self.current_trial_mode = 0 - self.current_source_trial = 0 - self.current_target_trial = 0 - - # Update counter: - if self.current_trial_mode: - self.current_target_trial += 1 - else: - self.current_source_trial += 1 - - self.current_episode += 1 - - - # Compose btgym.datafeed.base.EnvResetConfig-consistent dict: - sample_config = dict( - episode_config=dict( - get_new=True, - sample_type=mode, - b_alpha=1.0, - b_beta=1.0 - ), - trial_config=dict( - get_new=new_trial, - sample_type=self.current_trial_mode, - b_alpha=1.0, - b_beta=1.0 - ) - ) - return sample_config - - def get_episode(self, **kwargs): - data_streams = [runner.get_episode(**kwargs) for runner in self.runners] - return {key: [stream[key] for stream in data_streams] for key in data_streams[0].keys()} - - def process(self, sess, **kwargs): - if self.task < 1: - self.process_test(sess) - - else: - self.process_train(sess) - - def process_test(self, sess): - """ - test step. - - Args: - sess (tensorflow.Session): tf session obj. - - """ - # Copy from parameter server: - sess.run(self.sync_pi) - for i in range(1): - test_data = self.get_episode(init_context=0) - self.process_summary(sess, test_data) - #self.log.warning('self.current_episode: {}'.format(self.current_episode)) - - #time.sleep(5) - - def process_train(self, sess): - """ - Train step. - - Args: - sess (tensorflow.Session): tf session obj. 
- - """ - try: - # Say `No` to redundant summaries: - wirte_model_summary = \ - self.local_steps % self.model_summary_freq == 0 - - # Collect train trajectory: - train_data = self.get_data() - feed_dict = self.process_data(sess,,,,, train_data,, - # self.log.warning('Train data ok.') - - # Copy from parameter server: - sess.run(self.sync_pi) - # self.log.warning('Sync ok.') - - # Update pi_prime parameters wrt collected data: - if wirte_model_summary: - fetches = [self.train_op, self.model_summary_op, self.inc_step] - else: - fetches = [self.train_op, self.inc_step] - - fetched = sess.run(fetches, feed_dict=feed_dict) - - # self.log.warning('Train gradients ok.') - - if wirte_model_summary: - model_summary = fetched[-2] - - else: - model_summary = None - - # Write down summaries: - self.process_summary(sess, train_data, model_summary) - self.local_steps += 1 - - except: - msg = 'process() exception occurred' + \ - '\n\nPress `Ctrl-C` or jupyter:[Kernel]->[Interrupt] for clean exit.\n' - self.log.exception(msg) - raise RuntimeError(msg) - - - diff --git a/btgym/research/metalearn_2/_mldg_batch.py b/btgym/research/metalearn_2/_mldg_batch.py index f3e5ae11..e69de29b 100644 --- a/btgym/research/metalearn_2/_mldg_batch.py +++ b/btgym/research/metalearn_2/_mldg_batch.py @@ -1,530 +0,0 @@ -import tensorflow as tf -import numpy as np - -import sys -from logbook import Logger, StreamHandler - -from btgym.research.mldg.aac import SubAAC -from btgym.algorithms.runner.synchro import BaseSynchroRunner - - -class MLDG(): - """ - Asynchronous implementation of MLDG algorithm - for continuous adaptation in dynamically changing environments. - - Papers: - Da Li et al., - "Learning to Generalize: Meta-Learning for Domain Generalization" - https://arxiv.org/abs/1710.03463 - - Maruan Al-Shedivat et al., - "Continuous Adaptation via Meta-Learning in Nonstationary and Competitive Environments" - https://arxiv.org/abs/1710.03641 - - """ - def __init__( - self, - env, - task, - log_level, - aac_class_ref=SubAAC, - runner_config=None, - aac_lambda=1.0, - guided_lambda=1.0, - rollout_length=20, - train_support=300, - fast_adapt_num_steps=10, - fast_adapt_batch_size=32, - trial_source_target_cycle=(1, 0), - num_episodes_per_trial=1, # one-shot adaptation - _aux_render_modes=('action_prob', 'value_fn', 'lstm_1_h', 'lstm_2_h'), - name='MLDG', - **kwargs - ): - try: - self.aac_class_ref = aac_class_ref - self.task = task - self.name = name - self.summary_writer = None - - StreamHandler(sys.stdout).push_application() - self.log = Logger('{}_{}'.format(name, task), level=log_level) - - self.rollout_length = rollout_length - self.train_support = train_support # number of train experiences to collect - self.train_batch_size = int(self.train_support / self.rollout_length) - self.fast_adapt_num_steps = fast_adapt_num_steps - self.fast_adapt_batch_size = fast_adapt_batch_size - - if runner_config is None: - self.runner_config = { - 'class_ref': BaseSynchroRunner, - 'kwargs': {}, - } - else: - self.runner_config = runner_config - - self.env_list = env - - assert isinstance(self.env_list, list) and len(self.env_list) == 2, \ - 'Expected pair of environments, got: {}'.format(self.env_list) - - # Instantiate two sub-trainers: one for test and one for train environments: - - self.runner_config['kwargs']['data_sample_config'] = {'mode': 1} # master - self.runner_config['kwargs']['name'] = 'master' - - self.train_aac = aac_class_ref( - env=self.env_list[0], # train data will be master environment TODO: really dumb data control. 
improve. - task=self.task, - log_level=log_level, - runner_config=self.runner_config, - aac_lambda=aac_lambda, - guided_lambda=guided_lambda, - rollout_length=self.rollout_length, - trial_source_target_cycle=trial_source_target_cycle, - num_episodes_per_trial=num_episodes_per_trial, - _use_target_policy=False, - _use_global_network=True, - _aux_render_modes=_aux_render_modes, - name=self.name + '_sub_Train', - **kwargs - ) - - self.runner_config['kwargs']['data_sample_config'] = {'mode': 0} # master - self.runner_config['kwargs']['name'] = 'slave' - - self.test_aac = aac_class_ref( - env=self.env_list[-1], # test data -> slave env. - task=self.task, - log_level=log_level, - runner_config=self.runner_config, - aac_lambda=aac_lambda, - guided_lambda=guided_lambda, - rollout_length=self.rollout_length, - trial_source_target_cycle=trial_source_target_cycle, - num_episodes_per_trial=num_episodes_per_trial, - _use_target_policy=False, - _use_global_network=False, - global_step_op=self.train_aac.global_step, - global_episode_op=self.train_aac.global_episode, - inc_episode_op=self.train_aac.inc_episode, - _aux_render_modes=_aux_render_modes, - name=self.name + '_sub_Test', - **kwargs - ) - - self.local_steps = self.train_aac.local_steps - self.model_summary_freq = self.train_aac.model_summary_freq - #self.model_summary_op = self.train_aac.model_summary_op - - self._make_train_op() - self.test_aac.model_summary_op = tf.summary.merge( - [self.test_aac.model_summary_op, self._combine_meta_summaries()], - name='meta_model_summary' - ) - - except: - msg = 'MLDG.__init()__ exception occurred' + \ - '\n\nPress `Ctrl-C` or jupyter:[Kernel]->[Interrupt] for clean exit.\n' - self.log.exception(msg) - raise RuntimeError(msg) - - def _make_train_op(self): - """ - - Defines: - tensors holding training op graph for sub trainers and self; - """ - pi = self.train_aac.local_network - pi_prime = self.test_aac.local_network - - self.test_aac.sync = self.test_aac.sync_pi = tf.group( - *[v1.assign(v2) for v1, v2 in zip(pi_prime.var_list, pi.var_list)] - ) - - self.global_step = self.train_aac.global_step - self.global_episode = self.train_aac.global_episode - - self.test_aac.global_step = self.train_aac.global_step - self.test_aac.global_episode = self.train_aac.global_episode - self.test_aac.inc_episode = self.train_aac.inc_episode - self.train_aac.inc_episode = None - self.inc_step = self.train_aac.inc_step - - # Meta-loss: - self.loss = 0.5 * self.train_aac.loss + 0.5 * self.test_aac.loss - - # Clipped gradients: - self.train_aac.grads, _ = tf.clip_by_global_norm( - tf.gradients(self.train_aac.loss, pi.var_list), - 40.0 - ) - self.log.warning('self.train_aac.grads: {}'.format(len(list(self.train_aac.grads)))) - - # self.test_aac.grads, _ = tf.clip_by_global_norm( - # tf.gradients(self.test_aac.loss, pi_prime.var_list), - # 40.0 - # ) - # Meta-gradient: - grads_i, _ = tf.clip_by_global_norm( - tf.gradients(self.train_aac.loss, pi.var_list), - 40.0 - ) - - grads_i_next, _ = tf.clip_by_global_norm( - tf.gradients(self.test_aac.loss, pi_prime.var_list), - 40.0 - ) - - self.grads = [] - for g1, g2 in zip(grads_i, grads_i_next): - if g1 is not None and g2 is not None: - meta_g = 0.5 * g1 + 0.5 * g2 - else: - meta_g = None - - self.grads.append(meta_g) - - #self.log.warning('self.grads_len: {}'.format(len(list(self.grads)))) - - # Gradients to update local copy of pi_prime (from train data): - train_grads_and_vars = list(zip(self.train_aac.grads, pi_prime.var_list)) - - # self.log.warning('train_grads_and_vars_len: 
{}'.format(len(train_grads_and_vars))) - - # Meta-gradients to be sent to parameter server: - meta_grads_and_vars = list(zip(self.grads, self.train_aac.network.var_list)) - - # self.log.warning('meta_grads_and_vars_len: {}'.format(len(meta_grads_and_vars))) - - # Set global_step increment equal to observation space batch size: - obs_space_keys = list(self.train_aac.local_network.on_state_in.keys()) - - assert 'external' in obs_space_keys, \ - 'Expected observation space to contain `external` mode, got: {}'.format(obs_space_keys) - self.train_aac.inc_step = self.train_aac.global_step.assign_add( - tf.shape(self.train_aac.local_network.on_state_in['external'])[0] - ) - - self.train_op = self.train_aac.optimizer.apply_gradients(train_grads_and_vars) - - # Optimizer for meta-update: - self.optimizer = tf.train.AdamOptimizer(self.train_aac.train_learn_rate, epsilon=1e-5) - # TODO: own alpha-leran rate - self.meta_train_op = self.optimizer.apply_gradients(meta_grads_and_vars) - - self.log.debug('meta_train_op defined') - - def _combine_meta_summaries(self): - - meta_model_summaries = [ - tf.summary.scalar("meta_grad_global_norm", tf.global_norm(self.grads)), - tf.summary.scalar("total_meta_loss", self.loss), - ] - - return meta_model_summaries - - def start(self, sess, summary_writer, **kwargs): - """ - Executes all initializing operations, - starts environment runner[s]. - Supposed to be called by parent worker just before training loop starts. - - Args: - sess: tf session object. - kwargs: not used by default. - """ - try: - # Copy weights from global to local: - sess.run(self.train_aac.sync_pi) - sess.run(self.test_aac.sync_pi) - - # Start thread_runners: - self.train_aac._start_runners( # master first - sess, - summary_writer, - init_context=None, - data_sample_config=self.train_aac.get_sample_config(mode=1) - ) - self.test_aac._start_runners( - sess, - summary_writer, - init_context=None, - data_sample_config=self.test_aac.get_sample_config(mode=0) - ) - - self.summary_writer = summary_writer - self.log.notice('Runners started.') - - except: - msg = 'start() exception occurred' + \ - '\n\nPress `Ctrl-C` or jupyter:[Kernel]->[Interrupt] for clean exit.\n' - self.log.exception(msg) - raise RuntimeError(msg) - - def fast_adapt_step(self, sess, batch_size, on_policy_batch, off_policy_batch, rp_batch, make_summary=False): - """ - One step of test_policy adaptation. - - Args: - sess: tensorflow.Session obj. 
- batch_size: train mini-batch size - on_policy_batch: `on_policy` train data - off_policy_batch: `off_policy` train data or None - rp_batch: 'reward_prediction` train data or None - make_summary: bool, if True - compute model summary - - Returns: - model summary or None - """ - # Sample from train distribution: - on_mini_batch = self.train_aac.sample_batch(on_policy_batch, batch_size) - off_mini_batch = self.train_aac.sample_batch(off_policy_batch, batch_size) - rp_mini_batch = self.train_aac.sample_batch(rp_batch, batch_size) - - feed_dict = self.train_aac._get_main_feeder(sess, on_mini_batch, off_mini_batch, rp_mini_batch, True) - - if make_summary: - fetches = [self.train_op, self.train_aac.model_summary_op] - else: - fetches = [self.train_op] - - # Update pi_prime parameters wrt sampled data: - fetched = sess.run(fetches, feed_dict=feed_dict) - - # self.log.warning('Train gradients ok.') - - if make_summary: - summary = fetched[-1] - - else: - summary = None - - return summary - - def train_step(self, sess, data_config): - """ - Collects train task data and updates test policy parameters (fast adaptation). - - Args: - sess: tensorflow.Session obj. - data_config: configuration dictionary of type `btgym.datafeed.base.EnvResetConfig` - - Returns: - batched train data - - """ - # Collect train distribution: - train_batch = self.train_aac.get_batch( - size=self.train_batch_size, - require_terminal=True, - same_trial=True, - data_sample_config=data_config - ) - - # for rollout in train_batch['on_policy']: - # self.log.warning( - # 'Train data trial_num: {}'.format( - # np.asarray(rollout['state']['metadata']['trial_num']) - # ) - # ) - - # Process time-flat-alike (~iid) to treat as empirical data distribution over train task: - on_policy_batch, off_policy_batch, rp_batch = self.train_aac.process_batch(sess, train_batch) - - # self.log.warning('Train data ok.') - - local_step = sess.run(self.global_step) - local_episode = sess.run(self.global_episode) - model_summary = None - - # Extract all non-empty summaries: - ep_summary = [summary for summary in train_batch['ep_summary'] if summary is not None] - - # Perform number of test policy updates wrt. collected train data: - for i in range(self.fast_adapt_num_steps): - model_summary = self.fast_adapt_step( - sess, - batch_size=self.fast_adapt_batch_size, - on_policy_batch=on_policy_batch, - off_policy_batch=off_policy_batch, - rp_batch=rp_batch, - make_summary=(local_step + i) % self.model_summary_freq == 0 - ) - # self.log.warning('Batch {} Train gradients ok.'.format(i)) - - # Write down summaries: - train_summary = dict( - render_summary=[None], - test_ep_summary=[None], - ep_summary=[ep_summary.pop() if len(ep_summary) > 0 else None] - ) - self.train_aac.process_summary( - sess, - train_summary, - model_summary, - step=local_step + i, - episode=local_episode + i - ) - - return on_policy_batch, off_policy_batch, rp_batch - - def meta_train_step(self, sess, data_config, on_policy_batch, off_policy_batch, rp_batch): - """ - Collects data from source domain test task and performs meta-update to shared parameters vector. - Writes down relevant summaries. - - Args: - sess: tensorflow.Session obj. 
- data_config: configuration dictionary of type `btgym.datafeed.base.EnvResetConfig` - on_policy_batch: `on_policy` train data - off_policy_batch: `off_policy` train data or None - rp_batch: 'reward_prediction` train data or None - - """ - done = False - while not done: - # Say `No` to redundant summaries: - wirte_model_summary = \ - self.local_steps % self.model_summary_freq == 0 - - # Collect test trajectory wrt updated test_policy parameters: - test_data = self.test_aac.get_data( - init_context=None, - data_sample_config=data_config - ) - test_batch_size = 0 # TODO: adjust on/off/rp sizes - for rollout in test_data['on_policy']: - test_batch_size += len(rollout['position']) - - test_feed_dict = self.test_aac.process_data(sess,,,,, test_data,, - - # self.log.warning('Test data rollout for step {} ok.'.format(self.local_steps)) - # - # self.log.warning( - # 'Test data trial_num: {}'.format( - # np.asarray(test_data['on_policy'][0]['state']['metadata']['trial_num']) - # ) - # ) - - # Sample train data of same size: - feed_dict = self.train_aac._get_main_feeder( - sess, - self.train_aac.sample_batch(on_policy_batch, test_batch_size), - self.train_aac.sample_batch(off_policy_batch, test_batch_size), - self.train_aac.sample_batch(rp_batch, test_batch_size), - True - ) - # Add test trajectory: - feed_dict.update(test_feed_dict) - - # Perform meta-update: - if wirte_model_summary: - meta_fetches = [self.meta_train_op, self.test_aac.model_summary_op, self.inc_step] - else: - meta_fetches = [self.meta_train_op, self.inc_step] - - meta_fetched = sess.run(meta_fetches, feed_dict=feed_dict) - - # self.log.warning('Meta-gradients ok.') - - if wirte_model_summary: - meta_model_summary = meta_fetched[-2] - - else: - meta_model_summary = None - - # Write down summaries: - self.test_aac.process_summary(sess, test_data, meta_model_summary) - self.local_steps += 1 - - # If test episode ended? - done = np.asarray(test_data['terminal']).any() - - def meta_test_step(self, sess, data_config, on_policy_batch, off_policy_batch, rp_batch): - """ - Validates adapted policy on data from target domain test task. - Writes down relevant summaries. - - Args: - sess: tensorflow.Session obj. - data_config: configuration dictionary of type `btgym.datafeed.base.EnvResetConfig` - on_policy_batch: `on_policy` train data - off_policy_batch: `off_policy` train data or None - rp_batch: 'reward_prediction` train data or None - - """ - done = False - while not done: - # Collect test trajectory: - test_data = self.test_aac.get_data( - init_context=None, - data_sample_config=data_config - ) - - # self.log.warning('Target test rollout ok.') - # self.log.warning( - # 'Test data target trial_num: {}'.format( - # np.asarray(test_data['on_policy'][0]['state']['metadata']['trial_num']) - # ) - # ) - # self.log.warning('target_render_ep_summary: {}'.format(test_data['render_summary'])) - - # Write down summaries: - self.test_aac.process_summary(sess, test_data) - - # If test episode ended? - done = np.asarray(test_data['terminal']).any() - - def process(self, sess): - """ - Meta-train procedure for one-shot learning/ - - Args: - sess (tensorflow.Session): tf session obj. 
- - """ - try: - # Copy from parameter server: - sess.run(self.train_aac.sync_pi) - sess.run(self.test_aac.sync_pi) - - #self.log.warning('Sync ok.') - - # Decide on data configuration for train/test trajectories, - # such as all data will come from same trial (maybe different episodes) - # and trial type as well (~from source or target domain): - # note: data_config counters get updated once per process() call - train_data_config = self.train_aac.get_sample_config(mode=1) # master env., draws trial - test_data_config = self.train_aac.get_sample_config(mode=0) # slave env, catches up with same trial - - # If data comes from source or target domain: - is_target = train_data_config['trial_config']['sample_type'] - - # self.log.warning('PROCESS_train_data_config: {}'.format(train_data_config)) - # self.log.warning('PROCESS_test_data_config: {}'.format(test_data_config)) - - # Fast adaptation step: - # collect train trajectories, process time-flat-alike (~iid) to treat as empirical data distribution - # over train task and adapt test_policy wrt. train experience: - on_policy_batch, off_policy_batch, rp_batch = self.train_step(sess, train_data_config) - - # Slow adaptation step: - if is_target: - # Meta-test: - # self.log.warning('Running meta-test episode...') - self.meta_test_step(sess,test_data_config, on_policy_batch, off_policy_batch, rp_batch) - - else: - # Meta-train: - # self.log.warning('Running meta-train episode...') - self.meta_train_step(sess,test_data_config, on_policy_batch, off_policy_batch, rp_batch) - - except: - msg = 'process() exception occurred' + \ - '\n\nPress `Ctrl-C` or jupyter:[Kernel]->[Interrupt] for clean exit.\n' - self.log.exception(msg) - raise RuntimeError(msg) - diff --git a/btgym/research/metalearn_2/loss.py b/btgym/research/metalearn_2/loss.py index cb845436..15f709bf 100644 --- a/btgym/research/metalearn_2/loss.py +++ b/btgym/research/metalearn_2/loss.py @@ -19,37 +19,37 @@ def meta_loss_def_1_0( name='_meta_', verbose=False ): - with tf.name_scope(name + '/meta'): - neg_pi_log_prob_train = tf.nn.softmax_cross_entropy_with_logits_v2( + with tf.compat.v1.name_scope(name + '/meta'): + neg_pi_log_prob_train = tf.nn.softmax_cross_entropy_with_logits( logits=pi_logits_train, labels=act_target_train ) - neg_pi_log_prob_test = tf.nn.softmax_cross_entropy_with_logits_v2( + neg_pi_log_prob_test = tf.nn.softmax_cross_entropy_with_logits( logits=pi_logits_test, labels=act_target_test ) pi_loss = tf.reduce_mean( - (neg_pi_log_prob_train + neg_pi_log_prob_test) * adv_target_test + input_tensor=(neg_pi_log_prob_train + neg_pi_log_prob_test) * adv_target_test ) - vf_loss_train = 0.5 * tf.losses.mean_squared_error(r_target_test, pi_vf_train) - vf_loss_test = 0.5 * tf.losses.mean_squared_error(r_target_test, pi_vf_test) + vf_loss_train = 0.5 * tf.compat.v1.losses.mean_squared_error(r_target_test, pi_vf_train) + vf_loss_test = 0.5 * tf.compat.v1.losses.mean_squared_error(r_target_test, pi_vf_test) - entropy = tf.reduce_mean(cat_entropy(pi_logits_test)) + entropy = tf.reduce_mean(input_tensor=cat_entropy(pi_logits_test)) loss = pi_loss + vf_loss_test + vf_loss_train - entropy * entropy_beta - mean_vf_test = tf.reduce_mean(pi_vf_test) - mean_vf_train = tf.reduce_mean(pi_vf_train) + mean_vf_test = tf.reduce_mean(input_tensor=pi_vf_test) + mean_vf_train = tf.reduce_mean(input_tensor=pi_vf_train) summaries = [ - tf.summary.scalar('meta_policy_loss', pi_loss), - tf.summary.scalar('meta_value_loss_test', vf_loss_test), + tf.compat.v1.summary.scalar('meta_policy_loss', pi_loss), + 
tf.compat.v1.summary.scalar('meta_value_loss_test', vf_loss_test), ] if verbose: summaries += [ - tf.summary.scalar('entropy', entropy), - tf.summary.scalar('value_fn_test', mean_vf_test), - tf.summary.scalar('value_fn_train', mean_vf_train) + tf.compat.v1.summary.scalar('entropy', entropy), + tf.compat.v1.summary.scalar('value_fn_test', mean_vf_test), + tf.compat.v1.summary.scalar('value_fn_train', mean_vf_train) ] return loss, summaries diff --git a/btgym/research/mldg/aac.py b/btgym/research/mldg/aac.py index 93aedad1..8c52d5a3 100644 --- a/btgym/research/mldg/aac.py +++ b/btgym/research/mldg/aac.py @@ -235,7 +235,7 @@ def __init__( self._make_train_op() - self.test_aac.model_summary_op = tf.summary.merge( + self.test_aac.model_summary_op = tf.compat.v1.summary.merge( [self.test_aac.model_summary_op, self._combine_meta_summaries()], name='meta_model_summary' ) @@ -279,11 +279,11 @@ def _make_train_op(self): # Clipped gradients: self.train_aac.grads, _ = tf.clip_by_global_norm( - tf.gradients(self.train_aac.loss, pi.var_list), + tf.gradients(ys=self.train_aac.loss, xs=pi.var_list), 40.0 ) self.test_aac.grads, _ = tf.clip_by_global_norm( - tf.gradients(self.test_aac.loss, pi_prime.var_list), + tf.gradients(ys=self.test_aac.loss, xs=pi_prime.var_list), 40.0 ) # Aliases: @@ -320,18 +320,18 @@ def _make_train_op(self): assert 'external' in obs_space_keys, \ 'Expected observation space to contain `external` mode, got: {}'.format(obs_space_keys) self.train_aac.inc_step = self.train_aac.global_step.assign_add( - tf.shape(self.test_aac.local_network.on_state_in['external'])[0] + tf.shape(input=self.test_aac.local_network.on_state_in['external'])[0] ) self.inc_step = self.train_aac.inc_step # Pi to pi_prime local adaptation op: # self.train_op = self.train_aac.optimizer.apply_gradients(train_grads_and_vars) # self.fast_opt = tf.train.GradientDescentOptimizer(self.alpha_rate) - self.fast_opt = tf.train.GradientDescentOptimizer(self.fast_opt_learn_rate) + self.fast_opt = tf.compat.v1.train.GradientDescentOptimizer(self.fast_opt_learn_rate) self.train_op = self.fast_opt.apply_gradients(train_grads_and_vars) # Learning rate annealing: - self.learn_rate_decayed = tf.train.polynomial_decay( + self.learn_rate_decayed = tf.compat.v1.train.polynomial_decay( self.opt_learn_rate, self.global_step + 1, self.opt_decay_steps, @@ -341,7 +341,7 @@ def _make_train_op(self): ) # Optimizer for meta-update, sharing same learn rate (change?): - self.optimizer = tf.train.AdamOptimizer(self.learn_rate_decayed, epsilon=1e-5) + self.optimizer = tf.compat.v1.train.AdamOptimizer(self.learn_rate_decayed, epsilon=1e-5) # Global meta-optimisation op: self.meta_train_op = self.optimizer.apply_gradients(meta_grads_and_vars) @@ -353,8 +353,8 @@ def _combine_meta_summaries(self): Additional summaries here. 
""" meta_model_summaries = [ - tf.summary.scalar('meta_grad_global_norm', tf.global_norm(self.grads)), - tf.summary.scalar('total_meta_loss', self.loss), + tf.compat.v1.summary.scalar('meta_grad_global_norm', tf.linalg.global_norm(self.grads)), + tf.compat.v1.summary.scalar('total_meta_loss', self.loss), #tf.summary.scalar('alpha_learn_rate', self.alpha_rate), #tf.summary.scalar('alpha_learn_rate_loss', self.alpha_rate_loss) ] @@ -848,11 +848,11 @@ def _make_train_op(self): # Clipped gradients: self.train_aac.grads, _ = tf.clip_by_global_norm( - tf.gradients(self.train_aac.loss, pi.var_list), + tf.gradients(ys=self.train_aac.loss, xs=pi.var_list), 40.0 ) self.test_aac.grads, _ = tf.clip_by_global_norm( - tf.gradients(self.test_aac.loss, pi_prime.var_list), + tf.gradients(ys=self.test_aac.loss, xs=pi_prime.var_list), 40.0 ) # Aliases: @@ -889,17 +889,17 @@ def _make_train_op(self): assert 'external' in obs_space_keys, \ 'Expected observation space to contain `external` mode, got: {}'.format(obs_space_keys) self.train_aac.inc_step = self.train_aac.global_step.assign_add( - tf.shape(self.train_aac.local_network.on_state_in['external'])[0] + tf.shape(input=self.train_aac.local_network.on_state_in['external'])[0] ) # Simple SGD, no average statisitics: - self.fast_optimizer_train = tf.train.GradientDescentOptimizer(self.fast_learn_rate_train) - self.fast_optimizer_test = tf.train.GradientDescentOptimizer(self.fast_learn_rate_test) + self.fast_optimizer_train = tf.compat.v1.train.GradientDescentOptimizer(self.fast_learn_rate_train) + self.fast_optimizer_test = tf.compat.v1.train.GradientDescentOptimizer(self.fast_learn_rate_test) # Pi to pi_prime local adaptation op: self.train_op = self.fast_optimizer_train.apply_gradients(train_grads_and_vars) # Optimizer for meta-update, sharing same learn rate (change?): - self.optimizer = tf.train.AdamOptimizer(self.train_aac.train_learn_rate, epsilon=1e-5) + self.optimizer = tf.compat.v1.train.AdamOptimizer(self.train_aac.train_learn_rate, epsilon=1e-5) # Global meta-optimisation op: self.meta_train_op = self.optimizer.apply_gradients(meta_grads_and_vars) diff --git a/btgym/research/mldg/aac_1.py b/btgym/research/mldg/aac_1.py index 45a1d7d8..4d34d1cd 100644 --- a/btgym/research/mldg/aac_1.py +++ b/btgym/research/mldg/aac_1.py @@ -73,7 +73,7 @@ def __init__( name=name, **kwargs ) - self.model_summary_op = tf.summary.merge( + self.model_summary_op = tf.compat.v1.summary.merge( [self.model_summary_op, self._combine_meta_summaries()], name='meta_model_summary' ) @@ -98,7 +98,7 @@ def _make_loss(self, pi, pi_prime): ) # Guidance annealing: if self.guided_decay_steps is not None: - self.guided_lambda_decayed = tf.train.polynomial_decay( + self.guided_lambda_decayed = tf.compat.v1.train.polynomial_decay( self.guided_lambda, self.global_step + 1, self.guided_decay_steps, @@ -152,16 +152,16 @@ def _make_train_op(self, pi, pi_prime, pi_global): *[v1.assign(v2) for v1, v2 in zip(pi.var_list, pi_prime.var_list)] ) self.sync = [self.sync_pi, self.sync_pi_prime] - self.optimizer = tf.train.AdamOptimizer(self.train_learn_rate, epsilon=1e-5) - self.fast_optimizer = tf.train.GradientDescentOptimizer(self.fast_opt_learn_rate) + self.optimizer = tf.compat.v1.train.AdamOptimizer(self.train_learn_rate, epsilon=1e-5) + self.fast_optimizer = tf.compat.v1.train.GradientDescentOptimizer(self.fast_opt_learn_rate) # Clipped gradients: pi.grads, _ = tf.clip_by_global_norm( - tf.gradients(self.meta_train_loss, pi.var_list), + tf.gradients(ys=self.meta_train_loss, xs=pi.var_list), 
40.0 ) pi_prime.grads, _ = tf.clip_by_global_norm( - tf.gradients(self.meta_test_loss, pi_prime.var_list), + tf.gradients(ys=self.meta_test_loss, xs=pi_prime.var_list), 40.0 ) # Meta_optimisation gradients as sum of meta-train and meta-test gradients: @@ -189,7 +189,7 @@ def _make_train_op(self, pi, pi_prime, pi_global): assert 'external' in obs_space_keys, \ 'Expected observation space to contain `external` mode, got: {}'.format(obs_space_keys) - self.inc_step = self.global_step.assign_add(tf.shape(self.local_network.on_state_in['external'])[0]) + self.inc_step = self.global_step.assign_add(tf.shape(input=self.local_network.on_state_in['external'])[0]) # Local fast optimisation op: self.fast_train_op = self.fast_optimizer.apply_gradients(train_grads_and_vars) @@ -204,9 +204,9 @@ def _combine_meta_summaries(self): """ Additional summaries here. """ - with tf.name_scope(self.name): + with tf.compat.v1.name_scope(self.name): meta_model_summaries = [ - tf.summary.scalar('meta_grad_global_norm', tf.global_norm(self.grads)), + tf.compat.v1.summary.scalar('meta_grad_global_norm', tf.linalg.global_norm(self.grads)), # tf.summary.scalar('total_meta_loss', self.loss), # tf.summary.scalar('alpha_learn_rate', self.alpha_rate), # tf.summary.scalar('alpha_learn_rate_loss', self.alpha_rate_loss) diff --git a/btgym/research/mldg/aac_1d.py b/btgym/research/mldg/aac_1d.py index 1a24ab40..005b1023 100644 --- a/btgym/research/mldg/aac_1d.py +++ b/btgym/research/mldg/aac_1d.py @@ -97,16 +97,16 @@ def _make_train_op(self, pi, pi_prime, pi_global): *[v1.assign(v2) for v1, v2 in zip(pi.var_list, pi_prime.var_list)] ) self.sync = [self.sync_pi, self.sync_pi_prime] - self.optimizer = tf.train.AdamOptimizer(self.train_learn_rate, epsilon=1e-5) - self.fast_optimizer = tf.train.GradientDescentOptimizer(self.fast_opt_learn_rate) + self.optimizer = tf.compat.v1.train.AdamOptimizer(self.train_learn_rate, epsilon=1e-5) + self.fast_optimizer = tf.compat.v1.train.GradientDescentOptimizer(self.fast_opt_learn_rate) # Clipped gradients: pi.grads, _ = tf.clip_by_global_norm( - tf.gradients(self.meta_train_loss, pi.var_list), + tf.gradients(ys=self.meta_train_loss, xs=pi.var_list), 40.0 ) pi_prime.grads, _ = tf.clip_by_global_norm( - tf.gradients(self.meta_test_loss, pi_prime.var_list), + tf.gradients(ys=self.meta_test_loss, xs=pi_prime.var_list), 40.0 ) # Meta_optimisation gradients as sum of meta-train and meta-test gradients: @@ -134,7 +134,7 @@ def _make_train_op(self, pi, pi_prime, pi_global): assert 'external' in obs_space_keys, \ 'Expected observation space to contain `external` mode, got: {}'.format(obs_space_keys) - self.inc_step = self.global_step.assign_add(tf.shape(pi_prime.on_state_in['external'])[0]) + self.inc_step = self.global_step.assign_add(tf.shape(input=pi_prime.on_state_in['external'])[0]) # Local fast optimisation op: self.fast_train_op = self.fast_optimizer.apply_gradients(train_grads_and_vars) diff --git a/btgym/research/mldg/aac_1s.py b/btgym/research/mldg/aac_1s.py index 2f3312ea..4ab55db8 100644 --- a/btgym/research/mldg/aac_1s.py +++ b/btgym/research/mldg/aac_1s.py @@ -223,11 +223,11 @@ def _make_loss(self, pi, pi_prime, name='base', verbose=True): tensor holding estimated loss graph list of related summaries """ - with tf.name_scope(name): + with tf.compat.v1.name_scope(name): # Guidance annealing: if self.guided_decay_steps is not None: - self.guided_lambda_decayed = tf.train.polynomial_decay( + self.guided_lambda_decayed = tf.compat.v1.train.polynomial_decay( self.guided_lambda, self.global_step 
+ 1, self.guided_decay_steps, @@ -249,11 +249,11 @@ def _make_loss(self, pi, pi_prime, name='base', verbose=True): ) # On-policy AAC loss definition: - pi.on_pi_act_target = tf.placeholder( + pi.on_pi_act_target = tf.compat.v1.placeholder( tf.float32, [None, self.ref_env.action_space.n], name="on_policy_action_pl" ) - pi.on_pi_adv_target = tf.placeholder(tf.float32, [None], name="on_policy_advantage_pl") - pi.on_pi_r_target = tf.placeholder(tf.float32, [None], name="on_policy_return_pl") + pi.on_pi_adv_target = tf.compat.v1.placeholder(tf.float32, [None], name="on_policy_advantage_pl") + pi.on_pi_r_target = tf.compat.v1.placeholder(tf.float32, [None], name="on_policy_return_pl") clip_epsilon = tf.cast(self.clip_epsilon * self.learn_rate_decayed / self.opt_learn_rate, tf.float32) @@ -273,10 +273,10 @@ def _make_loss(self, pi, pi_prime, name='base', verbose=True): model_summaries = on_pi_summaries + g_summary # Off-policy losses: - pi.off_pi_act_target = tf.placeholder( + pi.off_pi_act_target = tf.compat.v1.placeholder( tf.float32, [None, self.ref_env.action_space.n], name="off_policy_action_pl") - pi.off_pi_adv_target = tf.placeholder(tf.float32, [None], name="off_policy_advantage_pl") - pi.off_pi_r_target = tf.placeholder(tf.float32, [None], name="off_policy_return_pl") + pi.off_pi_adv_target = tf.compat.v1.placeholder(tf.float32, [None], name="off_policy_advantage_pl") + pi.off_pi_r_target = tf.compat.v1.placeholder(tf.float32, [None], name="off_policy_return_pl") if self.use_off_policy_aac: # Off-policy AAC loss graph mirrors on-policy: @@ -296,7 +296,7 @@ def _make_loss(self, pi, pi_prime, name='base', verbose=True): if self.use_value_replay: # Value function replay loss: - pi.vr_target = tf.placeholder(tf.float32, [None], name="vr_target") + pi.vr_target = tf.compat.v1.placeholder(tf.float32, [None], name="vr_target") self.vr_loss, self.vr_summaries = self.vr_loss( r_target=pi.vr_target, pi_vf=pi.vr_value, @@ -323,16 +323,16 @@ def _make_train_op(self, pi, pi_prime, pi_global): *[v1.assign(v2) for v1, v2 in zip(pi.var_list, pi_global.var_list)] ) self.sync = self.sync_pi - self.optimizer = tf.train.AdamOptimizer(self.train_learn_rate, epsilon=1e-5) - self.fast_optimizer = tf.train.GradientDescentOptimizer(self.fast_opt_learn_rate) + self.optimizer = tf.compat.v1.train.AdamOptimizer(self.train_learn_rate, epsilon=1e-5) + self.fast_optimizer = tf.compat.v1.train.GradientDescentOptimizer(self.fast_opt_learn_rate) # Clipped gradients: pi.on_grads, _ = tf.clip_by_global_norm( - tf.gradients(self.on_pi_loss, pi.var_list), + tf.gradients(ys=self.on_pi_loss, xs=pi.var_list), 40.0 ) pi.off_grads, _ = tf.clip_by_global_norm( - tf.gradients(self.off_pi_loss, pi.var_list), + tf.gradients(ys=self.off_pi_loss, xs=pi.var_list), 40.0 ) @@ -354,7 +354,7 @@ def _make_train_op(self, pi, pi_prime, pi_global): assert 'external' in obs_space_keys, \ 'Expected observation space to contain `external` mode, got: {}'.format(obs_space_keys) - self.inc_step = self.global_step.assign_add(tf.shape(pi.on_state_in['external'])[0]) + self.inc_step = self.global_step.assign_add(tf.shape(input=pi.on_state_in['external'])[0]) # Local fast optimisation op: self.local_train_op = self.fast_optimizer.apply_gradients(local_grads_and_vars) @@ -501,22 +501,22 @@ def _make_train_op(self, pi, pi_prime, pi_global): *[v1.assign(v2) for v1, v2 in zip(pi.var_list, pi_global.var_list)] ) self.sync = self.sync_pi - self.optimizer = tf.train.AdamOptimizer(self.train_learn_rate, epsilon=1e-5) + self.optimizer = 
tf.compat.v1.train.AdamOptimizer(self.train_learn_rate, epsilon=1e-5) # Clipped gradients: pi.on_grads, _ = tf.clip_by_global_norm( - tf.gradients(self.on_pi_loss, pi.var_list), + tf.gradients(ys=self.on_pi_loss, xs=pi.var_list), 40.0 ) pi.off_grads, _ = tf.clip_by_global_norm( - tf.gradients(self.off_pi_loss, pi.var_list), + tf.gradients(ys=self.off_pi_loss, xs=pi.var_list), 40.0 ) self.grads = pi.on_grads # Learnable fast rate: #self.fast_learn_rate = tf.reduce_mean(pi.off_learn_alpha, name='mean_alpha_rate') / 10 - self.fast_optimizer = tf.train.GradientDescentOptimizer(self.fast_opt_learn_rate) + self.fast_optimizer = tf.compat.v1.train.GradientDescentOptimizer(self.fast_opt_learn_rate) # self.alpha_rate_loss = tf.global_norm(pi.off_grads) # self.alpha_grads, _ = tf.clip_by_global_norm( # tf.gradients(self.alpha_rate_loss, pi.var_list), @@ -539,7 +539,7 @@ def _make_train_op(self, pi, pi_prime, pi_global): assert 'external' in obs_space_keys, \ 'Expected observation space to contain `external` mode, got: {}'.format(obs_space_keys) - self.inc_step = self.global_step.assign_add(tf.shape(pi.on_state_in['external'])[0]) + self.inc_step = self.global_step.assign_add(tf.shape(input=pi.on_state_in['external'])[0]) # Local fast optimisation op: self.local_train_op = self.fast_optimizer.apply_gradients(local_grads_and_vars) diff --git a/btgym/research/mldg/policy.py b/btgym/research/mldg/policy.py index 09a2e7ec..71899a59 100644 --- a/btgym/research/mldg/policy.py +++ b/btgym/research/mldg/policy.py @@ -46,7 +46,7 @@ def get_initial_features(self, state, context=None): #print('Meta_policy_init_metadata:', state['metadata']) #print('self.current_trial_num:', self.current_trial_num) try: - sess = tf.get_default_session() + sess = tf.compat.v1.get_default_session() new_context = list(sess.run(self.on_lstm_init_state)) if context is not None: if state['metadata']['trial_num'] == self.current_trial_num or state['metadata']['type']: diff --git a/btgym/research/model_based/aac.py b/btgym/research/model_based/aac.py index facb3e9a..33573776 100644 --- a/btgym/research/model_based/aac.py +++ b/btgym/research/model_based/aac.py @@ -44,67 +44,67 @@ def _combine_summaries(self, policy=None, model_summaries=None): if model_summaries is not None: if self.use_global_network: # Model-wide statistics: - with tf.name_scope('model'): + with tf.compat.v1.name_scope('model'): model_summaries += [ - tf.summary.scalar("grad_global_norm", self.grads_global_norm), - tf.summary.scalar("learn_rate", self.learn_rate_decayed), + tf.compat.v1.summary.scalar("grad_global_norm", self.grads_global_norm), + tf.compat.v1.summary.scalar("learn_rate", self.learn_rate_decayed), # cause actual rate is a jaggy due to test freezes - tf.summary.scalar("total_loss", self.loss), + tf.compat.v1.summary.scalar("total_loss", self.loss), ] if policy is not None: - model_summaries += [tf.summary.scalar("var_global_norm", tf.global_norm(policy.var_list))] + model_summaries += [tf.compat.v1.summary.scalar("var_global_norm", tf.linalg.global_norm(policy.var_list))] else: model_summaries = [] # Model stat. 
summary: - model_summary = tf.summary.merge(model_summaries, name='model_summary') + model_summary = tf.compat.v1.summary.merge(model_summaries, name='model_summary') # Episode-related summaries: ep_summary = dict( # Summary placeholders - render_atari=tf.placeholder(tf.uint8, [None, None, None, 1]), - total_r=tf.placeholder(tf.float32, ), - cpu_time=tf.placeholder(tf.float32, ), - final_value=tf.placeholder(tf.float32, ), - steps=tf.placeholder(tf.int32, ), - ou_lambda=tf.placeholder(tf.float32, ), - ou_sigma=tf.placeholder(tf.float32, ), - ou_mu=tf.placeholder(tf.float32, ), + render_atari=tf.compat.v1.placeholder(tf.uint8, [None, None, None, 1]), + total_r=tf.compat.v1.placeholder(tf.float32, ), + cpu_time=tf.compat.v1.placeholder(tf.float32, ), + final_value=tf.compat.v1.placeholder(tf.float32, ), + steps=tf.compat.v1.placeholder(tf.int32, ), + ou_lambda=tf.compat.v1.placeholder(tf.float32, ), + ou_sigma=tf.compat.v1.placeholder(tf.float32, ), + ou_mu=tf.compat.v1.placeholder(tf.float32, ), ) ep_summary.update( { - mode: tf.placeholder(tf.uint8, [None, None, None, None], name=mode + '_pl') + mode: tf.compat.v1.placeholder(tf.uint8, [None, None, None, None], name=mode + '_pl') for mode in self.env_list[0].render_modes + self.aux_render_modes } ) - ep_summary['render_op'] = tf.summary.merge( - [tf.summary.image(mode, ep_summary[mode]) + ep_summary['render_op'] = tf.compat.v1.summary.merge( + [tf.compat.v1.summary.image(mode, ep_summary[mode]) for mode in self.env_list[0].render_modes + self.aux_render_modes] ) # Episode stat. summary: - ep_summary['btgym_stat_op'] = tf.summary.merge( + ep_summary['btgym_stat_op'] = tf.compat.v1.summary.merge( [ - tf.summary.scalar('episode_train/cpu_time_sec', ep_summary['cpu_time']), - tf.summary.scalar('episode_train/final_value', ep_summary['final_value']), - tf.summary.scalar('episode_train/total_reward', ep_summary['total_r']), - tf.summary.scalar('episode_train/ou_lambda', ep_summary['ou_lambda']), - tf.summary.scalar('episode_train/ou_sigma', ep_summary['ou_sigma']), - tf.summary.scalar('episode_train/ou_mu', ep_summary['ou_mu']), + tf.compat.v1.summary.scalar('episode_train/cpu_time_sec', ep_summary['cpu_time']), + tf.compat.v1.summary.scalar('episode_train/final_value', ep_summary['final_value']), + tf.compat.v1.summary.scalar('episode_train/total_reward', ep_summary['total_r']), + tf.compat.v1.summary.scalar('episode_train/ou_lambda', ep_summary['ou_lambda']), + tf.compat.v1.summary.scalar('episode_train/ou_sigma', ep_summary['ou_sigma']), + tf.compat.v1.summary.scalar('episode_train/ou_mu', ep_summary['ou_mu']), ], name='episode_train_btgym' ) # Test episode stat. 
summary: - ep_summary['test_btgym_stat_op'] = tf.summary.merge( + ep_summary['test_btgym_stat_op'] = tf.compat.v1.summary.merge( [ - tf.summary.scalar('episode_test/total_reward', ep_summary['total_r']), - tf.summary.scalar('episode_test/final_value', ep_summary['final_value']), + tf.compat.v1.summary.scalar('episode_test/total_reward', ep_summary['total_r']), + tf.compat.v1.summary.scalar('episode_test/final_value', ep_summary['final_value']), ], name='episode_test_btgym' ) - ep_summary['atari_stat_op'] = tf.summary.merge( + ep_summary['atari_stat_op'] = tf.compat.v1.summary.merge( [ - tf.summary.scalar('episode/total_reward', ep_summary['total_r']), - tf.summary.scalar('episode/steps', ep_summary['steps']) + tf.compat.v1.summary.scalar('episode/total_reward', ep_summary['total_r']), + tf.compat.v1.summary.scalar('episode/steps', ep_summary['steps']) ], name='episode_atari' ) @@ -280,16 +280,16 @@ def _make_train_op(self): # Clipped gradients for critic (critic's train op is disabled by `_use_global_network=False` # to avoid actor's name scope violation): self.critic_aac.grads, _ = tf.clip_by_global_norm( - tf.gradients(self.critic_aac.loss, pi_critic.var_list), + tf.gradients(ys=self.critic_aac.loss, xs=pi_critic.var_list), 40.0 ) # Placeholders for stored gradients values, include None's to correctly map Vars: self.actor_aac.grads_placeholders = [ - tf.placeholder(shape=grad.shape, dtype=grad.dtype) if grad is not None else None + tf.compat.v1.placeholder(shape=grad.shape, dtype=grad.dtype) if grad is not None else None for grad in self.actor_aac.grads ] self.critic_aac.grads_placeholders = [ - tf.placeholder(shape=grad.shape, dtype=grad.dtype) if grad is not None else None + tf.compat.v1.placeholder(shape=grad.shape, dtype=grad.dtype) if grad is not None else None for grad in self.critic_aac.grads ] @@ -317,7 +317,7 @@ def _make_train_op(self): self.inc_step = self.actor_aac.inc_step # Op to update critic with gradients from actor: - self.critic_aac.optimizer = tf.train.AdamOptimizer(self.actor_aac.learn_rate_decayed, epsilon=1e-5) + self.critic_aac.optimizer = tf.compat.v1.train.AdamOptimizer(self.actor_aac.learn_rate_decayed, epsilon=1e-5) self.update_critic_op = self.critic_aac.optimizer.apply_gradients(critic_grads_and_vars) # Use actor optimizer to update global policy instance: diff --git a/btgym/research/policy_rl2.py b/btgym/research/policy_rl2.py index 94e058c9..3fd24c69 100644 --- a/btgym/research/policy_rl2.py +++ b/btgym/research/policy_rl2.py @@ -41,7 +41,7 @@ def get_initial_features(self, state, context=None): try: if state['metadata']['trial_num'] != self.current_trial_num or context is None or state['metadata']['type']: # Assume new/initial trial or test, reset context: - sess = tf.get_default_session() + sess = tf.compat.v1.get_default_session() new_context = sess.run(self.on_lstm_init_state) print('RL^2 policy context reset') diff --git a/setup.py b/setup.py index 95b3e342..f627e0c5 100644 --- a/setup.py +++ b/setup.py @@ -52,7 +52,7 @@ ], version='0.0.8', install_requires=[ - 'tensorflow>=1.5', + 'tensorflow>=2.3', 'opencv-python', 'gym[atari]', 'backtrader', From 8addf9cead9b1b728164effd09f443835f4eac0a Mon Sep 17 00:00:00 2001 From: Wojciech Indyk Date: Fri, 28 Aug 2020 20:56:07 +0200 Subject: [PATCH 2/6] Tf1 -> Tf2: LayerNormBasicLSTMCell to tfa.rnn.LayerNormLSTMCell --- btgym/algorithms/policy/stacked_lstm.py | 6 +++--- btgym/research/b_vae_a3c.py | 6 +++--- btgym/research/encoder_test/policy.py | 6 ++++-- btgym/research/gps/policy.py | 4 ++-- setup.py | 1 + 5 
files changed, 13 insertions(+), 10 deletions(-) diff --git a/btgym/algorithms/policy/stacked_lstm.py b/btgym/algorithms/policy/stacked_lstm.py index 160728e8..fca44d41 100644 --- a/btgym/algorithms/policy/stacked_lstm.py +++ b/btgym/algorithms/policy/stacked_lstm.py @@ -1,4 +1,4 @@ -from tensorflow.contrib.layers import flatten as batch_flatten +import tensorflow_addons as tfa from btgym.algorithms.policy.base import BaseAacPolicy from btgym.algorithms.nn.networks import * @@ -27,7 +27,7 @@ def __init__(self, ac_space, rp_sequence_size, state_encoder_class_ref=conv_2d_network, - lstm_class_ref=tf.contrib.rnn.LayerNormBasicLSTMCell, + lstm_class_ref=tfa.rnn.LayerNormLSTMCell, lstm_layers=(256, 256), linear_layer_ref=noisy_linear, share_encoder_params=False, @@ -47,7 +47,7 @@ def __init__(self, ob_space: instance of btgym.spaces.DictSpace ac_space: instance of btgym.spaces.ActionDictSpace rp_sequence_size: reward prediction sample length - lstm_class_ref: tf.nn.lstm class to use + lstm_class_ref: tfa.rnn class to use lstm_layers: tuple of LSTM layers sizes linear_layer_ref: linear layer class to use share_encoder_params: bool, whether to share encoder parameters for every 'external' data stream diff --git a/btgym/research/b_vae_a3c.py b/btgym/research/b_vae_a3c.py index b1dc90fb..99342cb5 100644 --- a/btgym/research/b_vae_a3c.py +++ b/btgym/research/b_vae_a3c.py @@ -1,4 +1,4 @@ -from tensorflow.contrib.layers import flatten as batch_flatten +import tensorflow_addons as tfa from btgym.algorithms.policy.base import BaseAacPolicy from btgym.algorithms.policy.stacked_lstm import AacStackedRL2Policy @@ -51,7 +51,7 @@ def __init__(self, (32, (3, 1), (2, 1)), (32, (3, 1), (2, 1)) ), - lstm_class_ref=tf.contrib.rnn.LayerNormBasicLSTMCell, + lstm_class_ref=tfa.rnn.LayerNormLSTMCell, lstm_layers=(256, 256), lstm_2_init_period=50, linear_layer_ref=noisy_linear, @@ -68,7 +68,7 @@ def __init__(self, ob_space: dictionary of observation state shapes ac_space: discrete action space shape (length) rp_sequence_size: reward prediction sample length - lstm_class_ref: tf.nn.lstm class to use + lstm_class_ref: tfa.rnn class to use lstm_layers: tuple of LSTM layers sizes lstm_2_init_period: number of `get_initial_context()` method calls before force LSTM_2 context reset. 
linear_layer_ref: linear layer class to use diff --git a/btgym/research/encoder_test/policy.py b/btgym/research/encoder_test/policy.py index e5fb05d0..92de41c6 100644 --- a/btgym/research/encoder_test/policy.py +++ b/btgym/research/encoder_test/policy.py @@ -1,3 +1,5 @@ +import tensorflow_addons as tfa + from btgym.algorithms.policy.base import BaseAacPolicy from btgym.algorithms.nn.networks import * from btgym.algorithms.utils import * @@ -17,7 +19,7 @@ def __init__(self, ac_space, rp_sequence_size=4, state_encoder_class_ref=conv_2d_network, - lstm_class_ref=tf.contrib.rnn.LayerNormBasicLSTMCell, + lstm_class_ref=tfa.rnn.LayerNormLSTMCell, lstm_layers=(256, 256), linear_layer_ref=noisy_linear, share_encoder_params=False, @@ -39,7 +41,7 @@ def __init__(self, ob_space: instance of btgym.spaces.DictSpace ac_space: instance of btgym.spaces.ActionDictSpace rp_sequence_size: reward prediction sample length - lstm_class_ref: tf.nn.lstm class to use + lstm_class_ref: tfa.rnn class to use lstm_layers: tuple of LSTM layers sizes linear_layer_ref: linear layer class to use share_encoder_params: bool, whether to share encoder parameters for every 'external' data stream diff --git a/btgym/research/gps/policy.py b/btgym/research/gps/policy.py index ca271506..e90dd2fa 100644 --- a/btgym/research/gps/policy.py +++ b/btgym/research/gps/policy.py @@ -1,4 +1,4 @@ -import tensorflow as tf +import tensorflow_addons as tfa from btgym.algorithms.policy.stacked_lstm import AacStackedRL2Policy from btgym.algorithms.nn.layers import noisy_linear @@ -16,7 +16,7 @@ def __init__( (64, (3, 1), (2, 1)), (64, (3, 1), (2, 1)) ), - lstm_class_ref=tf.contrib.rnn.LayerNormBasicLSTMCell, + lstm_class_ref=tfa.rnn.LayerNormLSTMCell, lstm_layers=(256, 256), lstm_2_init_period=50, linear_layer_ref=noisy_linear, diff --git a/setup.py b/setup.py index f627e0c5..3c1e2715 100644 --- a/setup.py +++ b/setup.py @@ -53,6 +53,7 @@ version='0.0.8', install_requires=[ 'tensorflow>=2.3', + 'tensorflow_addons>=0.11', 'opencv-python', 'gym[atari]', 'backtrader', From 1032bc1085f8322de4ab7bca5649b972937ba595 Mon Sep 17 00:00:00 2001 From: Wojciech Indyk Date: Fri, 28 Aug 2020 21:36:56 +0200 Subject: [PATCH 3/6] Tf1 -> Tf2: layer_norm to tf.keras.layers, LuongAttention --- btgym/algorithms/nn/ae.py | 2 +- btgym/algorithms/nn/networks.py | 2 +- btgym/research/casual_conv/networks.py | 9 +++++---- btgym/research/encoder_test/networks.py | 2 +- 4 files changed, 8 insertions(+), 7 deletions(-) diff --git a/btgym/algorithms/nn/ae.py b/btgym/algorithms/nn/ae.py index f88b60b9..3c36d368 100644 --- a/btgym/algorithms/nn/ae.py +++ b/btgym/algorithms/nn/ae.py @@ -1,7 +1,6 @@ import numpy as np import tensorflow as tf from tensorflow.contrib.layers import flatten as batch_flatten -from tensorflow.contrib.layers import layer_norm as norm_layer from btgym.algorithms.nn.layers import normalized_columns_initializer, linear, conv2d @@ -35,6 +34,7 @@ def conv2d_encoder(x, layer_shapes = [x.get_shape()] layer_outputs = [] for i, layer_spec in enumerate(layer_config, 1): + norm_layer = tf.keras.layers.LayerNormalization() x = tf.nn.elu( norm_layer( conv2d( diff --git a/btgym/algorithms/nn/networks.py b/btgym/algorithms/nn/networks.py index 2d998db9..03007e09 100644 --- a/btgym/algorithms/nn/networks.py +++ b/btgym/algorithms/nn/networks.py @@ -7,7 +7,6 @@ import numpy as np import tensorflow as tf import tensorflow.contrib.rnn as rnn -from tensorflow.contrib.layers import layer_norm as norm_layer from tensorflow.python.util.nest import flatten as flatten_nested 
from btgym.algorithms.nn.layers import normalized_columns_initializer, categorical_sample @@ -38,6 +37,7 @@ def conv_2d_network(x, """ with tf.compat.v1.variable_scope(name, reuse=reuse): for i, num_filters in enumerate(conv_2d_num_filters): + norm_layer = tf.keras.layers.LayerNormalization() x = tf.nn.elu( norm_layer( conv_2d_layer_ref( diff --git a/btgym/research/casual_conv/networks.py b/btgym/research/casual_conv/networks.py index ac48c641..e8e1f6e3 100644 --- a/btgym/research/casual_conv/networks.py +++ b/btgym/research/casual_conv/networks.py @@ -1,6 +1,6 @@ import tensorflow as tf -from tensorflow.contrib.layers import layer_norm as norm_layer +import tensorflow_addons as tfa import numpy as np import math @@ -78,7 +78,8 @@ def conv_1d_casual_encoder( # b2t: y = tf.reshape(y, [-1, num_time_batches, conv_1d_num_filters], name='layer_{}_output'.format(i)) - y = norm_layer(y) + normalization_layer = tf.keras.layers.LayerNormalization() + y = normalization_layer(y) if conv_1d_activation is not None: y = conv_1d_activation(y) @@ -137,7 +138,7 @@ def conv_1d_casual_encoder( return encoded -def attention_layer(inputs, attention_ref=tf.contrib.seq2seq.LuongAttention, name='attention_layer', **kwargs): +def attention_layer(inputs, attention_ref=tfa.seq2seq.LuongAttention, name='attention_layer', **kwargs): """ Temporal attention layer. Computes attention context based on last(left) value in time dim. @@ -201,7 +202,7 @@ def conv_1d_casual_attention_encoder( conv_1d_num_filters=32, conv_1d_filter_size=2, conv_1d_activation=tf.nn.elu, - conv_1d_attention_ref=tf.contrib.seq2seq.LuongAttention, + conv_1d_attention_ref=tfa.seq2seq.LuongAttention, name='casual_encoder', keep_prob=None, conv_1d_gated=False, diff --git a/btgym/research/encoder_test/networks.py b/btgym/research/encoder_test/networks.py index 24852510..7938a1c3 100644 --- a/btgym/research/encoder_test/networks.py +++ b/btgym/research/encoder_test/networks.py @@ -1,7 +1,6 @@ import numpy as np import tensorflow as tf import tensorflow.contrib.rnn as rnn -from tensorflow.contrib.layers import layer_norm as norm_layer from tensorflow.python.util.nest import flatten as flatten_nested from btgym.algorithms.nn.layers import normalized_columns_initializer, categorical_sample @@ -35,6 +34,7 @@ def conv_2d_network_skip(x, layers = [] with tf.compat.v1.variable_scope(name, reuse=reuse): for i, num_filters in enumerate(conv_2d_num_filters): + norm_layer = tf.keras.layers.LayerNormalization() x = tf.nn.elu( norm_layer( conv_2d_layer_ref( From d591bea1358c7e2115c41b95baa84e012574f607 Mon Sep 17 00:00:00 2001 From: Wojciech Indyk Date: Sat, 29 Aug 2020 05:54:56 +0200 Subject: [PATCH 4/6] Tf1 -> Tf2: migrate tf.contrib --- btgym/algorithms/nn/ae.py | 3 +-- btgym/algorithms/nn/networks.py | 7 ++++--- btgym/algorithms/rollout.py | 2 +- btgym/algorithms/utils.py | 15 +++++++-------- btgym/research/encoder_test/networks.py | 1 - 5 files changed, 13 insertions(+), 15 deletions(-) diff --git a/btgym/algorithms/nn/ae.py b/btgym/algorithms/nn/ae.py index 3c36d368..6bb2e5dd 100644 --- a/btgym/algorithms/nn/ae.py +++ b/btgym/algorithms/nn/ae.py @@ -1,6 +1,5 @@ import numpy as np import tensorflow as tf -from tensorflow.contrib.layers import flatten as batch_flatten from btgym.algorithms.nn.layers import normalized_columns_initializer, linear, conv2d @@ -165,7 +164,7 @@ def conv2d_autoencoder( reuse=reuse ) # Flatten hidden state, pass through dense : - z = batch_flatten(encoder_layers[-1]) + z = tf.reshape(encoder_layers[-1], [tf.shape(encoder_layers[-1])[0], 
-1]) h, w, c = encoder_layers[-1].get_shape().as_list()[1:] z = linear_layer_ref( diff --git a/btgym/algorithms/nn/networks.py b/btgym/algorithms/nn/networks.py index 03007e09..0841ee7d 100644 --- a/btgym/algorithms/nn/networks.py +++ b/btgym/algorithms/nn/networks.py @@ -6,7 +6,7 @@ import numpy as np import tensorflow as tf -import tensorflow.contrib.rnn as rnn +import tensorflow_addons as tfa from tensorflow.python.util.nest import flatten as flatten_nested from btgym.algorithms.nn.layers import normalized_columns_initializer, categorical_sample @@ -105,7 +105,7 @@ def conv_1d_network(x, def lstm_network( x, lstm_sequence_length, - lstm_class=rnn.BasicLSTMCell, + lstm_class=tf.compat.v1.nn.rnn_cell.BasicLSTMCell, lstm_layers=(256,), static=False, keep_prob=None, @@ -141,7 +141,7 @@ def lstm_network( lstm.append(layer) - lstm = rnn.MultiRNNCell(lstm, state_is_tuple=True) + lstm = tf.compat.v1.nn.rnn_cell.MultiRNNCell(lstm, state_is_tuple=True) # Get time_dimension as [1]-shaped tensor: step_size = tf.expand_dims(tf.shape(input=x)[1], [0]) @@ -185,6 +185,7 @@ def dense_aac_network(x, ac_space_depth, name='dense_aac', linear_layer_ref=nois with tf.compat.v1.variable_scope(name, reuse=reuse): # Center-logits: + norm_layer = tf.keras.layers.LayerNormalization() logits = norm_layer( linear_layer_ref( x=x, diff --git a/btgym/algorithms/rollout.py b/btgym/algorithms/rollout.py index 1eea6028..4b9282ba 100644 --- a/btgym/algorithms/rollout.py +++ b/btgym/algorithms/rollout.py @@ -8,7 +8,7 @@ import numpy as np -from tensorflow.contrib.rnn import LSTMStateTuple +from tensorflow._api.v2.compat.v1.nn.rnn_cell import LSTMStateTuple from btgym.algorithms.math_utils import discount from btgym.algorithms.utils import batch_pad diff --git a/btgym/algorithms/utils.py b/btgym/algorithms/utils.py index da323aea..82c61645 100644 --- a/btgym/algorithms/utils.py +++ b/btgym/algorithms/utils.py @@ -3,7 +3,6 @@ import tensorflow as tf from tensorflow.python.util.nest import flatten as flatten_nested from tensorflow.python.util.nest import assert_same_structure -from tensorflow.contrib.rnn import LSTMStateTuple from gym.spaces import Discrete, Dict @@ -19,11 +18,11 @@ def rnn_placeholders(state): Returns: tuple of placeholders """ - if isinstance(state, tf.nn.rnn_cell.LSTMStateTuple): + if isinstance(state, tf.compat.v1.nn.rnn_cell.LSTMStateTuple): c, h = state c = tf.compat.v1.placeholder(tf.float32, tf.TensorShape([None]).concatenate(c.get_shape()[1:]), c.op.name + '_c_pl') h = tf.compat.v1.placeholder(tf.float32, tf.TensorShape([None]).concatenate(h.get_shape()[1:]), h.op.name + '_h_pl') - return tf.nn.rnn_cell.LSTMStateTuple(c, h) + return tf.compat.v1.nn.rnn_cell.LSTMStateTuple(c, h) elif isinstance(state, tf.Tensor): h = state h = tf.compat.v1.placeholder(tf.float32, tf.TensorShape([None]).concatenate(h.get_shape()[1:]), h.op.name + '_h_pl') @@ -180,10 +179,10 @@ def batch_stack(dict_list, _top=True): value_list = [value[key] for value in dict_list] batch[key] = batch_stack(value_list, False) - elif isinstance(master, LSTMStateTuple): + elif isinstance(master, tf.compat.v1.nn.rnn_cell.LSTMStateTuple): c = batch_stack([state[0] for state in dict_list], False) h = batch_stack([state[1] for state in dict_list], False) - batch = LSTMStateTuple(c=c, h=h) + batch = tf.compat.v1.nn.rnn_cell.LSTMStateTuple(c=c, h=h) elif isinstance(master, tuple): batch = tuple([batch_stack([struct[i] for struct in dict_list], False) for i in range(len(master))]) @@ -220,10 +219,10 @@ def batch_gather(batch_dict, indices, 
_top=True): for key, value in batch_dict.items(): batch[key] = batch_gather(value, indices, False) - elif isinstance(batch_dict, LSTMStateTuple): + elif isinstance(batch_dict, tf.compat.v1.nn.rnn_cell.LSTMStateTuple): c = batch_gather(batch_dict[0], indices, False) h = batch_gather(batch_dict[1], indices, False) - batch = LSTMStateTuple(c=c, h=h) + batch = tf.compat.v1.nn.rnn_cell.LSTMStateTuple(c=c, h=h) elif isinstance(batch_dict, tuple): batch = tuple([batch_gather(struct, indices, False) for struct in batch_dict]) @@ -312,7 +311,7 @@ def _show_struct(struct): print(key) _show_struct(value) - elif type(struct) in [LSTMStateTuple, tuple, list]: + elif type(struct) in [tf.compat.v1.nn.rnn_cell.LSTMStateTuple, tuple, list]: print('LSTM/tuple/list:', type(struct), len(struct)) for i in struct: _show_struct(i) diff --git a/btgym/research/encoder_test/networks.py b/btgym/research/encoder_test/networks.py index 7938a1c3..abaae2c0 100644 --- a/btgym/research/encoder_test/networks.py +++ b/btgym/research/encoder_test/networks.py @@ -1,6 +1,5 @@ import numpy as np import tensorflow as tf -import tensorflow.contrib.rnn as rnn from tensorflow.python.util.nest import flatten as flatten_nested from btgym.algorithms.nn.layers import normalized_columns_initializer, categorical_sample From 640fb55a77a5126a8b8d71600fbbe20aaeb3d990 Mon Sep 17 00:00:00 2001 From: Wojciech Indyk Date: Sun, 27 Sep 2020 14:21:54 +0200 Subject: [PATCH 5/6] Tf1 -> Tf2: LSTMCell --- btgym/algorithms/policy/base.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/btgym/algorithms/policy/base.py b/btgym/algorithms/policy/base.py index a7dd4b93..3c0dcece 100644 --- a/btgym/algorithms/policy/base.py +++ b/btgym/algorithms/policy/base.py @@ -10,6 +10,7 @@ # from gym.spaces import Discrete, Dict +from tensorflow._api.v2.compat.v1.nn.rnn_cell import BasicLSTMCell from btgym.algorithms.nn.networks import * from btgym.algorithms.utils import * @@ -34,7 +35,7 @@ def __init__(self, ob_space, ac_space, rp_sequence_size, - lstm_class=rnn.BasicLSTMCell, + lstm_class=BasicLSTMCell, lstm_layers=(256,), action_dp_alpha=200.0, aux_estimate=False, @@ -405,7 +406,7 @@ def __init__(self, ob_space, ac_space, rp_sequence_size, - lstm_class=rnn.BasicLSTMCell, + lstm_class=BasicLSTMCell, lstm_layers=(256,), action_dp_alpha=200.0, aux_estimate=True, From 57efd45c77f55ccd0b943ee78535050c01ee377a Mon Sep 17 00:00:00 2001 From: Wojciech Indyk Date: Sun, 27 Sep 2020 16:57:35 +0200 Subject: [PATCH 6/6] Tf1 -> Tf2: MLDG --- btgym/research/metalearn_2/_mldg_batch.py | 531 ++++++++++++++++++++++ 1 file changed, 531 insertions(+) diff --git a/btgym/research/metalearn_2/_mldg_batch.py b/btgym/research/metalearn_2/_mldg_batch.py index e69de29b..3a22701a 100644 --- a/btgym/research/metalearn_2/_mldg_batch.py +++ b/btgym/research/metalearn_2/_mldg_batch.py @@ -0,0 +1,531 @@ +import tensorflow as tf +import numpy as np + +import sys +from logbook import Logger, StreamHandler + +from btgym.research.mldg.aac import SubAAC +from btgym.algorithms.runner.synchro import BaseSynchroRunner + + +class MLDG(): + """ + Asynchronous implementation of MLDG algorithm + for continuous adaptation in dynamically changing environments. 
+ + Papers: + Da Li et al., + "Learning to Generalize: Meta-Learning for Domain Generalization" + https://arxiv.org/abs/1710.03463 + + Maruan Al-Shedivat et al., + "Continuous Adaptation via Meta-Learning in Nonstationary and Competitive Environments" + https://arxiv.org/abs/1710.03641 + + """ + def __init__( + self, + env, + task, + log_level, + aac_class_ref=SubAAC, + runner_config=None, + aac_lambda=1.0, + guided_lambda=1.0, + rollout_length=20, + train_support=300, + fast_adapt_num_steps=10, + fast_adapt_batch_size=32, + trial_source_target_cycle=(1, 0), + num_episodes_per_trial=1, # one-shot adaptation + _aux_render_modes=('action_prob', 'value_fn', 'lstm_1_h', 'lstm_2_h'), + name='MLDG', + **kwargs + ): + try: + self.aac_class_ref = aac_class_ref + self.task = task + self.name = name + self.summary_writer = None + + StreamHandler(sys.stdout).push_application() + self.log = Logger('{}_{}'.format(name, task), level=log_level) + + self.rollout_length = rollout_length + self.train_support = train_support # number of train experiences to collect + self.train_batch_size = int(self.train_support / self.rollout_length) + self.fast_adapt_num_steps = fast_adapt_num_steps + self.fast_adapt_batch_size = fast_adapt_batch_size + + if runner_config is None: + self.runner_config = { + 'class_ref': BaseSynchroRunner, + 'kwargs': {}, + } + else: + self.runner_config = runner_config + + self.env_list = env + + assert isinstance(self.env_list, list) and len(self.env_list) == 2, \ + 'Expected pair of environments, got: {}'.format(self.env_list) + + # Instantiate two sub-trainers: one for test and one for train environments: + + self.runner_config['kwargs']['data_sample_config'] = {'mode': 1} # master + self.runner_config['kwargs']['name'] = 'master' + + self.train_aac = aac_class_ref( + env=self.env_list[0], # train data will be master environment TODO: really dumb data control. improve. + task=self.task, + log_level=log_level, + runner_config=self.runner_config, + aac_lambda=aac_lambda, + guided_lambda=guided_lambda, + rollout_length=self.rollout_length, + trial_source_target_cycle=trial_source_target_cycle, + num_episodes_per_trial=num_episodes_per_trial, + _use_target_policy=False, + _use_global_network=True, + _aux_render_modes=_aux_render_modes, + name=self.name + '_sub_Train', + **kwargs + ) + + self.runner_config['kwargs']['data_sample_config'] = {'mode': 0} # master + self.runner_config['kwargs']['name'] = 'slave' + + self.test_aac = aac_class_ref( + env=self.env_list[-1], # test data -> slave env. 
+ task=self.task, + log_level=log_level, + runner_config=self.runner_config, + aac_lambda=aac_lambda, + guided_lambda=guided_lambda, + rollout_length=self.rollout_length, + trial_source_target_cycle=trial_source_target_cycle, + num_episodes_per_trial=num_episodes_per_trial, + _use_target_policy=False, + _use_global_network=False, + global_step_op=self.train_aac.global_step, + global_episode_op=self.train_aac.global_episode, + inc_episode_op=self.train_aac.inc_episode, + _aux_render_modes=_aux_render_modes, + name=self.name + '_sub_Test', + **kwargs + ) + + self.local_steps = self.train_aac.local_steps + self.model_summary_freq = self.train_aac.model_summary_freq + #self.model_summary_op = self.train_aac.model_summary_op + + self._make_train_op() + self.test_aac.model_summary_op = tf.summary.merge( + [self.test_aac.model_summary_op, self._combine_meta_summaries()], + name='meta_model_summary' + ) + + except: + msg = 'MLDG.__init()__ exception occurred' + \ + '\n\nPress `Ctrl-C` or jupyter:[Kernel]->[Interrupt] for clean exit.\n' + self.log.exception(msg) + raise RuntimeError(msg) + + def _make_train_op(self): + """ + + Defines: + tensors holding training op graph for sub trainers and self; + """ + pi = self.train_aac.local_network + pi_prime = self.test_aac.local_network + + self.test_aac.sync = self.test_aac.sync_pi = tf.group( + *[v1.assign(v2) for v1, v2 in zip(pi_prime.var_list, pi.var_list)] + ) + + self.global_step = self.train_aac.global_step + self.global_episode = self.train_aac.global_episode + + self.test_aac.global_step = self.train_aac.global_step + self.test_aac.global_episode = self.train_aac.global_episode + self.test_aac.inc_episode = self.train_aac.inc_episode + self.train_aac.inc_episode = None + self.inc_step = self.train_aac.inc_step + + # Meta-loss: + self.loss = 0.5 * self.train_aac.loss + 0.5 * self.test_aac.loss + + # Clipped gradients: + self.train_aac.grads, _ = tf.clip_by_global_norm( + tf.gradients(self.train_aac.loss, pi.var_list), + 40.0 + ) + self.log.warning('self.train_aac.grads: {}'.format(len(list(self.train_aac.grads)))) + + # self.test_aac.grads, _ = tf.clip_by_global_norm( + # tf.gradients(self.test_aac.loss, pi_prime.var_list), + # 40.0 + # ) + # Meta-gradient: + grads_i, _ = tf.clip_by_global_norm( + tf.gradients(self.train_aac.loss, pi.var_list), + 40.0 + ) + + grads_i_next, _ = tf.clip_by_global_norm( + tf.gradients(self.test_aac.loss, pi_prime.var_list), + 40.0 + ) + + self.grads = [] + for g1, g2 in zip(grads_i, grads_i_next): + if g1 is not None and g2 is not None: + meta_g = 0.5 * g1 + 0.5 * g2 + else: + meta_g = None + + self.grads.append(meta_g) + + #self.log.warning('self.grads_len: {}'.format(len(list(self.grads)))) + + # Gradients to update local copy of pi_prime (from train data): + train_grads_and_vars = list(zip(self.train_aac.grads, pi_prime.var_list)) + + # self.log.warning('train_grads_and_vars_len: {}'.format(len(train_grads_and_vars))) + + # Meta-gradients to be sent to parameter server: + meta_grads_and_vars = list(zip(self.grads, self.train_aac.network.var_list)) + + # self.log.warning('meta_grads_and_vars_len: {}'.format(len(meta_grads_and_vars))) + + # Set global_step increment equal to observation space batch size: + obs_space_keys = list(self.train_aac.local_network.on_state_in.keys()) + + assert 'external' in obs_space_keys, \ + 'Expected observation space to contain `external` mode, got: {}'.format(obs_space_keys) + self.train_aac.inc_step = self.train_aac.global_step.assign_add( + 
tf.shape(self.train_aac.local_network.on_state_in['external'])[0] + ) + + self.train_op = self.train_aac.optimizer.apply_gradients(train_grads_and_vars) + + # Optimizer for meta-update: + self.optimizer = tf.train.AdamOptimizer(self.train_aac.train_learn_rate, epsilon=1e-5) + # TODO: own alpha-leran rate + self.meta_train_op = self.optimizer.apply_gradients(meta_grads_and_vars) + + self.log.debug('meta_train_op defined') + + def _combine_meta_summaries(self): + + meta_model_summaries = [ + tf.summary.scalar("meta_grad_global_norm", tf.global_norm(self.grads)), + tf.summary.scalar("total_meta_loss", self.loss), + ] + + return meta_model_summaries + + def start(self, sess, summary_writer, **kwargs): + """ + Executes all initializing operations, + starts environment runner[s]. + Supposed to be called by parent worker just before training loop starts. + + Args: + sess: tf session object. + kwargs: not used by default. + """ + try: + # Copy weights from global to local: + sess.run(self.train_aac.sync_pi) + sess.run(self.test_aac.sync_pi) + + # Start thread_runners: + self.train_aac._start_runners( # master first + sess, + summary_writer, + init_context=None, + data_sample_config=self.train_aac.get_sample_config(mode=1) + ) + self.test_aac._start_runners( + sess, + summary_writer, + init_context=None, + data_sample_config=self.test_aac.get_sample_config(mode=0) + ) + + self.summary_writer = summary_writer + self.log.notice('Runners started.') + + except: + msg = 'start() exception occurred' + \ + '\n\nPress `Ctrl-C` or jupyter:[Kernel]->[Interrupt] for clean exit.\n' + self.log.exception(msg) + raise RuntimeError(msg) + + def fast_adapt_step(self, sess, batch_size, on_policy_batch, off_policy_batch, rp_batch, make_summary=False): + """ + One step of test_policy adaptation. + + Args: + sess: tensorflow.Session obj. + batch_size: train mini-batch size + on_policy_batch: `on_policy` train data + off_policy_batch: `off_policy` train data or None + rp_batch: 'reward_prediction` train data or None + make_summary: bool, if True - compute model summary + + Returns: + model summary or None + """ + # Sample from train distribution: + on_mini_batch = self.train_aac.sample_batch(on_policy_batch, batch_size) + off_mini_batch = self.train_aac.sample_batch(off_policy_batch, batch_size) + rp_mini_batch = self.train_aac.sample_batch(rp_batch, batch_size) + + feed_dict = self.train_aac._get_main_feeder(sess, on_mini_batch, off_mini_batch, rp_mini_batch, True) + + if make_summary: + fetches = [self.train_op, self.train_aac.model_summary_op] + else: + fetches = [self.train_op] + + # Update pi_prime parameters wrt sampled data: + fetched = sess.run(fetches, feed_dict=feed_dict) + + # self.log.warning('Train gradients ok.') + + if make_summary: + summary = fetched[-1] + + else: + summary = None + + return summary + + def train_step(self, sess, data_config): + """ + Collects train task data and updates test policy parameters (fast adaptation). + + Args: + sess: tensorflow.Session obj. 
+ data_config: configuration dictionary of type `btgym.datafeed.base.EnvResetConfig` + + Returns: + batched train data + + """ + # Collect train distribution: + train_batch = self.train_aac.get_batch( + size=self.train_batch_size, + require_terminal=True, + same_trial=True, + data_sample_config=data_config + ) + + # for rollout in train_batch['on_policy']: + # self.log.warning( + # 'Train data trial_num: {}'.format( + # np.asarray(rollout['state']['metadata']['trial_num']) + # ) + # ) + + # Process time-flat-alike (~iid) to treat as empirical data distribution over train task: + on_policy_batch, off_policy_batch, rp_batch = self.train_aac.process_batch(sess, train_batch) + + # self.log.warning('Train data ok.') + + local_step = sess.run(self.global_step) + local_episode = sess.run(self.global_episode) + model_summary = None + + # Extract all non-empty summaries: + ep_summary = [summary for summary in train_batch['ep_summary'] if summary is not None] + + # Perform number of test policy updates wrt. collected train data: + for i in range(self.fast_adapt_num_steps): + model_summary = self.fast_adapt_step( + sess, + batch_size=self.fast_adapt_batch_size, + on_policy_batch=on_policy_batch, + off_policy_batch=off_policy_batch, + rp_batch=rp_batch, + make_summary=(local_step + i) % self.model_summary_freq == 0 + ) + # self.log.warning('Batch {} Train gradients ok.'.format(i)) + + # Write down summaries: + train_summary = dict( + render_summary=[None], + test_ep_summary=[None], + ep_summary=[ep_summary.pop() if len(ep_summary) > 0 else None] + ) + self.train_aac.process_summary( + sess, + train_summary, + model_summary, + step=local_step + i, + episode=local_episode + i + ) + + return on_policy_batch, off_policy_batch, rp_batch + + def meta_train_step(self, sess, data_config, on_policy_batch, off_policy_batch, rp_batch): + """ + Collects data from source domain test task and performs meta-update to shared parameters vector. + Writes down relevant summaries. + + Args: + sess: tensorflow.Session obj. 
+            data_config: configuration dictionary of type `btgym.datafeed.base.EnvResetConfig`
+            on_policy_batch: `on_policy` train data
+            off_policy_batch: `off_policy` train data or None
+            rp_batch: `reward_prediction` train data or None
+
+        """
+        done = False
+        while not done:
+            # Say `No` to redundant summaries:
+            wirte_model_summary = \
+                self.local_steps % self.model_summary_freq == 0
+
+            # Collect test trajectory wrt updated test_policy parameters:
+            test_data = self.test_aac.get_data(
+                init_context=None,
+                data_sample_config=data_config
+            )
+            test_batch_size = 0  # TODO: adjust on/off/rp sizes
+            for rollout in test_data['on_policy']:
+                test_batch_size += len(rollout['position'])
+
+            test_feed_dict = self.test_aac.process_data(sess, test_data, is_train=True)
+
+            # self.log.warning('Test data rollout for step {} ok.'.format(self.local_steps))
+            #
+            # self.log.warning(
+            #     'Test data trial_num: {}'.format(
+            #         np.asarray(test_data['on_policy'][0]['state']['metadata']['trial_num'])
+            #     )
+            # )
+
+            # Sample train data of same size:
+            feed_dict = self.train_aac._get_main_feeder(
+                sess,
+                self.train_aac.sample_batch(on_policy_batch, test_batch_size),
+                self.train_aac.sample_batch(off_policy_batch, test_batch_size),
+                self.train_aac.sample_batch(rp_batch, test_batch_size),
+                True
+            )
+            # Add test trajectory:
+            feed_dict.update(test_feed_dict)
+
+            # Perform meta-update:
+            if wirte_model_summary:
+                meta_fetches = [self.meta_train_op, self.test_aac.model_summary_op, self.inc_step]
+            else:
+                meta_fetches = [self.meta_train_op, self.inc_step]
+
+            meta_fetched = sess.run(meta_fetches, feed_dict=feed_dict)
+
+            # self.log.warning('Meta-gradients ok.')
+
+            if wirte_model_summary:
+                meta_model_summary = meta_fetched[-2]
+
+            else:
+                meta_model_summary = None
+
+            # Write down summaries:
+            self.test_aac.process_summary(sess, test_data, meta_model_summary)
+            self.local_steps += 1
+
+            # If test episode ended?
+            done = np.asarray(test_data['terminal']).any()
+
+    def meta_test_step(self, sess, data_config, on_policy_batch, off_policy_batch, rp_batch):
+        """
+        Validates adapted policy on data from target domain test task.
+        Writes down relevant summaries.
+
+        Args:
+            sess: tensorflow.Session obj.
+            data_config: configuration dictionary of type `btgym.datafeed.base.EnvResetConfig`
+            on_policy_batch: `on_policy` train data
+            off_policy_batch: `off_policy` train data or None
+            rp_batch: `reward_prediction` train data or None
+
+        """
+        done = False
+        while not done:
+            # Collect test trajectory:
+            test_data = self.test_aac.get_data(
+                init_context=None,
+                data_sample_config=data_config
+            )
+
+            # self.log.warning('Target test rollout ok.')
+            # self.log.warning(
+            #     'Test data target trial_num: {}'.format(
+            #         np.asarray(test_data['on_policy'][0]['state']['metadata']['trial_num'])
+            #     )
+            # )
+            # self.log.warning('target_render_ep_summary: {}'.format(test_data['render_summary']))
+
+            # Write down summaries:
+            self.test_aac.process_summary(sess, test_data)
+
+            # If test episode ended?
+            done = np.asarray(test_data['terminal']).any()
+
+    def process(self, sess):
+        """
+        Meta-train procedure for one-shot learning.
+
+        Args:
+            sess (tensorflow.Session): tf session obj.
+ + """ + try: + # Copy from parameter server: + sess.run(self.train_aac.sync_pi) + sess.run(self.test_aac.sync_pi) + + #self.log.warning('Sync ok.') + + # Decide on data configuration for train/test trajectories, + # such as all data will come from same trial (maybe different episodes) + # and trial type as well (~from source or target domain): + # note: data_config counters get updated once per process() call + train_data_config = self.train_aac.get_sample_config(mode=1) # master env., draws trial + test_data_config = self.train_aac.get_sample_config(mode=0) # slave env, catches up with same trial + + # If data comes from source or target domain: + is_target = train_data_config['trial_config']['sample_type'] + + # self.log.warning('PROCESS_train_data_config: {}'.format(train_data_config)) + # self.log.warning('PROCESS_test_data_config: {}'.format(test_data_config)) + + # Fast adaptation step: + # collect train trajectories, process time-flat-alike (~iid) to treat as empirical data distribution + # over train task and adapt test_policy wrt. train experience: + on_policy_batch, off_policy_batch, rp_batch = self.train_step(sess, train_data_config) + + # Slow adaptation step: + if is_target: + # Meta-test: + # self.log.warning('Running meta-test episode...') + self.meta_test_step(sess,test_data_config, on_policy_batch, off_policy_batch, rp_batch) + + else: + # Meta-train: + # self.log.warning('Running meta-train episode...') + self.meta_train_step(sess,test_data_config, on_policy_batch, off_policy_batch, rp_batch) + + except: + msg = 'process() exception occurred' + \ + '\n\nPress `Ctrl-C` or jupyter:[Kernel]->[Interrupt] for clean exit.\n' + self.log.exception(msg) + raise RuntimeError(msg) + +
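
Note on the API mapping applied in patches 1-5 (this note and the sketch below are an editorial illustration, not part of any patch hunk): graph-mode TF1 symbols move under tf.compat.v1, tf.contrib.layers.layer_norm becomes tf.keras.layers.LayerNormalization, tf.contrib.rnn.LayerNormBasicLSTMCell becomes tfa.rnn.LayerNormLSTMCell from tensorflow_addons, and tf.global_norm becomes tf.linalg.global_norm. The toy graph below exercises those substitutions end to end; the placeholder names, shapes and hyperparameters are invented for the example, and it assumes TF >= 2.3 with tensorflow_addons >= 0.11 installed and v2 behavior disabled up front (the hunks shown here do not include that call).

    import numpy as np
    import tensorflow as tf
    import tensorflow_addons as tfa

    # The converted graphs are still TF1-style (placeholders, Session, v1 optimizers),
    # so v2 behavior has to be switched off before any graph is built.
    tf.compat.v1.disable_v2_behavior()
    tf.compat.v1.set_random_seed(0)

    # Toy inputs: [batch, time, features] observations and a scalar return target per sample.
    x_pl = tf.compat.v1.placeholder(tf.float32, [None, 8, 16], name='x_pl')
    r_pl = tf.compat.v1.placeholder(tf.float32, [None], name='return_pl')

    # tf.contrib.layers.layer_norm -> tf.keras.layers.LayerNormalization (patch 3):
    h = tf.keras.layers.LayerNormalization()(x_pl)

    # tf.contrib.rnn.LayerNormBasicLSTMCell -> tfa.rnn.LayerNormLSTMCell (patches 2 and 5):
    h = tf.keras.layers.RNN(tfa.rnn.LayerNormLSTMCell(32))(h)  # last-step output, [batch, 32]

    value = tf.compat.v1.layers.dense(h, 1, name='value')[:, 0]

    # tf.losses.*, tf.train.*, tf.summary.* -> tf.compat.v1.*; tf.global_norm -> tf.linalg.global_norm:
    loss = 0.5 * tf.compat.v1.losses.mean_squared_error(r_pl, value)
    var_list = tf.compat.v1.trainable_variables()
    grads, _ = tf.clip_by_global_norm(tf.gradients(ys=loss, xs=var_list), 40.0)
    train_op = tf.compat.v1.train.AdamOptimizer(1e-4, epsilon=1e-5).apply_gradients(
        list(zip(grads, var_list))
    )
    summary_op = tf.compat.v1.summary.merge([
        tf.compat.v1.summary.scalar('total_loss', loss),
        tf.compat.v1.summary.scalar('grad_global_norm', tf.linalg.global_norm(grads)),
    ])

    with tf.compat.v1.Session() as sess:
        sess.run(tf.compat.v1.global_variables_initializer())
        feed = {x_pl: np.random.randn(4, 8, 16).astype(np.float32),
                r_pl: np.random.randn(4).astype(np.float32)}
        _, summary_str = sess.run([train_op, summary_op], feed_dict=feed)

The compat.v1 and keyword-argument rewrites are what tf_upgrade_v2 produced mechanically in patch 1; the LSTM-cell and attention swaps in patches 2-3 are what pull in the tensorflow_addons dependency added to setup.py, while the layer-norm swap only needs tf.keras.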