From 0c859715d36c218f375cc136fa149a954085d5d1 Mon Sep 17 00:00:00 2001 From: gianluigigrandesso <50986305+gianluigigrandesso@users.noreply.github.com> Date: Thu, 17 Nov 2022 17:47:06 +0100 Subject: [PATCH] Uploaded scripts --- CACTO_manipulator3DoF_pyomo.py | 916 ++++++++++++++++++++++++++++++++ TO_manipulator3DoF_pyomo.py | 538 +++++++++++++++++++ config_manipulator3DoF_pyomo.py | 111 ++++ dynamics_manipulator3DoF.py | 538 +++++++++++++++++++ inits.py | 57 ++ replay_buffer.py | 187 +++++++ 6 files changed, 2347 insertions(+) create mode 100644 CACTO_manipulator3DoF_pyomo.py create mode 100644 TO_manipulator3DoF_pyomo.py create mode 100644 config_manipulator3DoF_pyomo.py create mode 100644 dynamics_manipulator3DoF.py create mode 100644 inits.py create mode 100644 replay_buffer.py diff --git a/CACTO_manipulator3DoF_pyomo.py b/CACTO_manipulator3DoF_pyomo.py new file mode 100644 index 0000000..ba51216 --- /dev/null +++ b/CACTO_manipulator3DoF_pyomo.py @@ -0,0 +1,916 @@ +import os +import tensorflow as tf +from tensorflow.keras import layers, regularizers +import numpy as np +import matplotlib.pyplot as plt +from matplotlib.patches import Ellipse +import math +import random +from pyomo.environ import * +from pyomo.dae import * +import pinocchio as pin +from TO_manipulator3DoF_pyomo import TO_manipulator, plot_results_TO +import config_manipulator3DoF_pyomo as conf +from dynamics_manipulator3DoF import RobotWrapper, RobotSimulator, load_urdf +from replay_buffer import PrioritizedReplayBuffer +from inits import init_tau0,init_tau1,init_tau2,init_q0,init_q1,init_q2,init_v0,init_v1,init_v2,init_q0_ICS,init_q1_ICS,init_q2_ICS,init_v0_ICS,init_v1_ICS,init_v2_ICS,init_0 + +# os.environ["CUDA_VISIBLE_DEVICES"]="-1" # Uncomment to run TF on CPU rather than GPU + +# Class implementing the NNs update +class Training: + def __init__(self, batch_size=conf.BATCH_SIZE): + self.batch_size = batch_size + return + + # Update both critic and actor + def update(self, episode, red_state_batch_norm, red_state_batch, state_batch_norm, state_batch, reward_batch, next_state_batch, next_state_batch_norm, d_batch, weights_batch=None): + + with tf.GradientTape() as tape: + + # Update the critic + if conf.TD_N: + y = reward_batch # When using n-step TD, reward_batch is the batch of costs-to-go and not the batch of single step rewards + else: + if conf.NORMALIZE_INPUTS: + target_values = target_critic(next_state_batch_norm, training=True) # Compute Value at next state after conf.nsteps_TD_N steps given by target critic + else: + target_values = target_critic(next_state_batch, training=True) + y = reward_batch + (1-d_batch)*target_values # Compute batch of 1-step targets for the critic loss + + if conf.NORMALIZE_INPUTS: + critic_value = critic_model(state_batch_norm, training=True) # Compute batch of Values associated to the sampled batch of states + else: + critic_value = critic_model(state_batch, training=True) + + if weights_batch is None: + critic_loss = tf.math.reduce_mean(tf.math.square(y - critic_value)) # Critic loss function (tf.math.reduce_mean computes the mean of elements across dimensions of a tensor, in this case across the batch) + else: + critic_loss = tf.math.reduce_mean(tf.math.square(tf.math.multiply(weights_batch,(y - critic_value)))) + + # Compute the gradients of the critic loss w.r.t. 
critic's parameters + critic_grad = tape.gradient(critic_loss, critic_model.trainable_variables) + + # Update the critic backpropagating the gradients + critic_optimizer.apply_gradients(zip(critic_grad, critic_model.trainable_variables)) + + # Update the actor after the critic pre-training + if episode >= conf.EPISODE_CRITIC_PRETRAINING: + if conf.NORMALIZE_INPUTS: + actions = actor_model(red_state_batch_norm, training=True) + else: + actions = actor_model(red_state_batch, training=True) + + # Both take into account normalization, ds_next_da is the gradient of the dynamics w.r.t. policy actions (ds'_da) + next_state_tf, ds_next_da = simulate_and_derivative_tf(red_state_batch,actions.numpy(),self.batch_size) + + with tf.GradientTape() as tape: + tape.watch(next_state_tf) + critic_value_next = critic_model(next_state_tf, training=True) # next_state_batch = next state after applying policy's action, already normalized if conf.NORMALIZE_INPUTS=1 + dV_ds_next = tape.gradient(critic_value_next, next_state_tf) # dV_ds' = gradient of V w.r.t. s', where s'=f(s,a) a=policy(s) + + with tf.GradientTape() as tape1: + with tf.GradientTape() as tape2: + tape1.watch(actions) + tape2.watch(next_state_tf) + rewards_tf = reward_tf(next_state_tf, actions, self.batch_size, d_batch) + dr_da = tape1.gradient(rewards_tf,actions) # dr_da = gradient of reward r w.r.t. policy's action a + dr_ds_next = tape2.gradient(rewards_tf,next_state_tf) # dr_ds' = gradient of reward r w.r.t. next state s' after performing policy's action a + + dr_ds_next_dV_ds_next_reshaped = tf.reshape(dr_ds_next+dV_ds_next,(self.batch_size,1,num_states)) # dr_ds' + dV_ds' + dr_ds_next_dV_ds_next = tf.matmul(dr_ds_next_dV_ds_next_reshaped,ds_next_da) # (dr_ds' + dV_ds')*ds'_da + dr_da_reshaped = tf.reshape(dr_da,(self.batch_size,1,num_actions)) + tf_sum = dr_ds_next_dV_ds_next+dr_da_reshaped # (dr_ds' + dV_ds')*ds'_da + dr_da + + # Now let's multiply -[(dr_ds' + dV_ds')*ds'_da + dr_da] by the actions a and then let's autodifferentiate w.r.t theta_A (actor NN's parameters) to finally get -dQ/dtheta_A + with tf.GradientTape() as tape: + tape.watch(actor_model.trainable_variables) + if conf.NORMALIZE_INPUTS: + actions = actor_model(state_batch_norm, training=True) + else: + actions = actor_model(state_batch, training=True) + actions_reshaped = tf.reshape(actions,(self.batch_size,num_actions,1)) + tf_sum_reshaped = tf.reshape(tf_sum,(self.batch_size,1,num_actions)) + Q_neg = tf.matmul(-tf_sum_reshaped,actions_reshaped) + + mean_Qneg = tf.math.reduce_mean(Q_neg) # Also here we need a scalar so we compute the mean -Q across the batch + + # Gradients of the actor loss w.r.t. 
actor's parameters + actor_grad = tape.gradient(mean_Qneg, actor_model.trainable_variables) + + # Update the actor by backpropagating the gradients + actor_optimizer.apply_gradients(zip(actor_grad, actor_model.trainable_variables)) + + def learn(self,episode, prioritized_buffer): + # Sample batch of transitions from the buffer + experience = prioritized_buffer.sample(self.batch_size, beta=conf.prioritized_replay_eps) # Bias annealing not performed, that's why beta is equal to a very small number (0 not accepted by PrioritizedReplayBuffer) + x_batch, a_next_batch, a_batch, r_batch, x2_batch, d_batch, weights, batch_idxes = experience # Importance sampling weights (actually not used) should anneal the bias (see Prioritized Experience Replay paper) + + # Convert batch of transitions into a tensor + x_batch = x_batch.reshape(self.batch_size,num_states) + a_next_batch = a_next_batch.reshape(self.batch_size,num_actions) + a_batch = a_batch.reshape(self.batch_size,num_actions) + r_batch = r_batch.reshape(self.batch_size,1) + x2_batch = x2_batch.reshape(self.batch_size,num_states) + d_batch = d_batch.reshape(self.batch_size,1) + weights = weights.reshape(self.batch_size,1) + state_batch = tf.convert_to_tensor(x_batch) + state_batch = tf.cast(state_batch, dtype=tf.float32) + action_batch = tf.convert_to_tensor(a_batch) + action_batch = tf.cast(action_batch, dtype=tf.float32) + reward_batch = tf.convert_to_tensor(r_batch) + reward_batch = tf.cast(reward_batch, dtype=tf.float32) + d_batch = tf.cast(d_batch, dtype=tf.float32) + next_state_batch = tf.convert_to_tensor(x2_batch) + next_state_batch = tf.cast(next_state_batch, dtype=tf.float32) + if conf.prioritized_replay_alpha == 0: + weights_batch = tf.convert_to_tensor(np.ones_like(weights), dtype=tf.float32) + else: + weights_batch = tf.convert_to_tensor(weights, dtype=tf.float32) + + if conf.NORMALIZE_INPUTS: + x2_batch_norm = x2_batch / conf.state_norm_arr + x2_batch_norm[:,-1] = 2*x2_batch_norm[:,-1] -1 + next_state_batch_norm = tf.convert_to_tensor(x2_batch_norm) + next_state_batch_norm = tf.cast(next_state_batch_norm, dtype=tf.float32) + x_batch_norm = x_batch / conf.state_norm_arr + x_batch_norm[:,-1] = 2*x_batch_norm[:,-1] -1 + state_batch_norm = tf.convert_to_tensor(x_batch_norm) + state_batch_norm = tf.cast(state_batch_norm, dtype=tf.float32) + + # Update priorities + if conf.prioritized_replay_alpha != 0: + if conf.NORMALIZE_INPUTS: + v_batch = critic_model(state_batch_norm, training=True) # Compute batch of Values associated to the sampled batch of states + v2_batch = target_critic(next_state_batch_norm, training=True) # Compute batch of Values from target critic associated to sampled batch of next states + vref_batch = reward_batch + (1-d_batch)*(v2_batch) # Compute the targets for the TD error + else: + v_batch = critic_model(state_batch, training=True) + v2_batch = target_critic(next_state_batch, training=True) + vref_batch = reward_batch + (1-d_batch)*(v2_batch) + + td_errors = tf.math.abs(tf.math.subtract(vref_batch,v_batch)) + new_priorities = td_errors.numpy() + conf.prioritized_replay_eps # Proportional prioritization where p_i = |TD_error_i| + conf.prioritized_replay_eps + prioritized_buffer.update_priorities(batch_idxes, new_priorities) + + # Update NNs + self.update(episode, state_batch_norm, state_batch, state_batch_norm, state_batch, reward_batch, next_state_batch, next_state_batch_norm, d_batch, weights_batch=weights_batch) + +# Update target critic NN +# Eager execution is turned on by default in TensorFlow 2.
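+# (i.e. each operation is dispatched and executed from Python as soon as it is called, instead of first being compiled into a graph.)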
Decorating with tf.function allows TensorFlow to build a static graph out of the logic and computations in our function. +# This provides a large speed up for blocks of code that contain many small TensorFlow operations such as this one. Working only with TensorFlow tensors (e.g. not working with Numpy arrays) +@tf.function +def update_target(target_weights, weights, tau): + for (a, b) in zip(target_weights, weights): + a.assign(b * tau + a * (1 - tau)) + +# Create actor NN +def get_actor(): + + inputs = layers.Input(shape=(conf.num_states,)) + lay1 = layers.Dense(conf.NH1,kernel_regularizer=regularizers.l1_l2(conf.wreg_l1_A,conf.wreg_l2_A),bias_regularizer=regularizers.l1_l2(conf.wreg_l1_A,conf.wreg_l2_A))(inputs) + leakyrelu1 = layers.LeakyReLU()(lay1) + + lay2 = layers.Dense(conf.NH2, kernel_regularizer=regularizers.l1_l2(conf.wreg_l1_A,conf.wreg_l2_A),bias_regularizer=regularizers.l1_l2(conf.wreg_l1_A,conf.wreg_l2_A))(leakyrelu1) + leakyrelu2 = layers.LeakyReLU()(lay2) + + outputs = layers.Dense(conf.num_actions, activation="tanh", kernel_regularizer=regularizers.l1_l2(conf.wreg_l1_A,conf.wreg_l2_A),bias_regularizer=regularizers.l1_l2(conf.wreg_l1_A,conf.wreg_l2_A))(leakyrelu2) + + outputs = outputs * conf.tau_upper_bound # Bound actions + model = tf.keras.Model(inputs, outputs) + return model + +# Create critic NN +def get_critic(): + state_input = layers.Input(shape=(num_states,)) + state_out1 = layers.Dense(16, kernel_regularizer=regularizers.l1_l2(conf.wreg_l1_C,conf.wreg_l2_C),bias_regularizer=regularizers.l1_l2(conf.wreg_l1_C,conf.wreg_l2_C))(state_input) + leakyrelu1 = layers.LeakyReLU()(state_out1) + + state_out2 = layers.Dense(32, kernel_regularizer=regularizers.l1_l2(conf.wreg_l1_C,conf.wreg_l2_C),bias_regularizer=regularizers.l1_l2(conf.wreg_l1_C,conf.wreg_l2_C))(leakyrelu1) + leakyrelu2 = layers.LeakyReLU()(state_out2) + + out_lay1 = layers.Dense(conf.NH1, kernel_regularizer=regularizers.l1_l2(conf.wreg_l1_C,conf.wreg_l2_C),bias_regularizer=regularizers.l1_l2(conf.wreg_l1_C,conf.wreg_l2_C))(leakyrelu2) + leakyrelu3 = layers.LeakyReLU()(out_lay1) + + out_lay2 = layers.Dense(conf.NH2, kernel_regularizer=regularizers.l1_l2(conf.wreg_l1_C,conf.wreg_l2_C),bias_regularizer=regularizers.l1_l2(conf.wreg_l1_C,conf.wreg_l2_C))(leakyrelu3) + leakyrelu4 = layers.LeakyReLU()(out_lay2) + + outputs = layers.Dense(1, kernel_regularizer=regularizers.l1_l2(conf.wreg_l1_C,conf.wreg_l2_C),bias_regularizer=regularizers.l1_l2(conf.wreg_l1_C,conf.wreg_l2_C))(leakyrelu4) + + model = tf.keras.Model([state_input], outputs) + + return model + +# Get optimal actions at timestep "step" in the current episode +def get_TO_actions(step, action0_TO=None, action1_TO=None, action2_TO=None): + + actions = np.array([action0_TO[step], action1_TO[step], action2_TO[step]]) + + # Buond actions in case they are not already bounded in the TO problem + if step < NSTEPS_SH: + next_actions = np.array([np.clip(action0_TO[step+1], conf.tau_lower_bound, conf.tau_upper_bound), np.clip(action1_TO[step+1], conf.tau_lower_bound, conf.tau_upper_bound), np.clip(action2_TO[step+1], conf.tau_lower_bound, conf.tau_upper_bound)]) + else: + next_actions = np.copy(actions) # Actions at last step of the episode are not performed + + return actions, next_actions + +# Define reward (-cost) function +def reward(x2,u=None): + # End-effector coordinates + x_ee = conf.x_base + conf.l*(math.cos(x2[0]) + math.cos(x2[0]+x2[1]) + math.cos(x2[0]+x2[1]+x2[2])) + y_ee = conf.y_base + conf.l*(math.sin(x2[0]) + math.sin(x2[0]+x2[1]) + 
math.sin(x2[0]+x2[1]+x2[2])) + + # Penalties for the ellipses representing the obstacle + ell1_pen = math.log(math.exp(conf.alpha*-(((x_ee-conf.XC1)**2)/((conf.A1/2)**2) + ((y_ee-conf.YC1)**2)/((conf.B1/2)**2) - 1.0)) + 1)/conf.alpha + ell2_pen = math.log(math.exp(conf.alpha*-(((x_ee-conf.XC2)**2)/((conf.A2/2)**2) + ((y_ee-conf.YC2)**2)/((conf.B2/2)**2) - 1.0)) + 1)/conf.alpha + ell3_pen = math.log(math.exp(conf.alpha*-(((x_ee-conf.XC3)**2)/((conf.A3/2)**2) + ((y_ee-conf.YC3)**2)/((conf.B3/2)**2) - 1.0)) + 1)/conf.alpha + + # Term pushing the agent to stay in the neighborhood of target + peak_reward = math.log(math.exp(conf.alpha2*-(math.sqrt((x_ee-conf.TARGET_STATE[0])**2 +0.1) - math.sqrt(0.1) - 0.1 + math.sqrt((y_ee-conf.TARGET_STATE[1])**2 +0.1) - math.sqrt(0.1) - 0.1)) + 1)/conf.alpha2 + + # Term penalizing the FINAL joint velocity + if x2[-1] == conf.dt*round(rand_time/conf.dt): + vel_joint = x2[3]**2 + x2[4]**2 + x2[5]**2 - 10000/conf.w_v + else: + vel_joint = 0 + + r = (conf.w_d*(-(x_ee-conf.TARGET_STATE[0])**2 -(y_ee-conf.TARGET_STATE[1])**2) + conf.w_peak*peak_reward - conf.w_v*vel_joint - conf.w_ob1*ell1_pen - conf.w_ob2*ell2_pen - conf.w_ob3*ell3_pen - conf.w_u*(u[0]**2 + u[1]**2 + u[2]**2))/100 + + return r + +# Define reward function with TensorFlow tensors needed in the actor update (to compute its derivatives with TensorFlow autodifferentiation). Batch-wise computation +def reward_tf(x2,u,BATCH_SIZE,last_ts): + + # De-normalize x2 because it is normalized if conf.NORMALIZE_INPUTS=1. (Mask trick needed because TensorFlow's autodifferentiation doesn't work if tensors' elements are directly modified by accessing them) + if conf.NORMALIZE_INPUTS: + x2_time = tf.stack([np.zeros(BATCH_SIZE), np.zeros(BATCH_SIZE), np.zeros(BATCH_SIZE), np.zeros(BATCH_SIZE), np.zeros(BATCH_SIZE), np.zeros(BATCH_SIZE), (x2[:,-1]+1)*conf.state_norm_arr[-1]/2],1) + x2_no_time = x2 * conf.state_norm_arr + mask = tf.cast(tf.stack([np.ones(BATCH_SIZE), np.ones(BATCH_SIZE), np.ones(BATCH_SIZE), np.ones(BATCH_SIZE), np.ones(BATCH_SIZE), np.ones(BATCH_SIZE), np.zeros(BATCH_SIZE)],1),tf.float32) + x2_not_norm = x2_no_time * mask + x2_time * (1 - mask) + else: + x2_not_norm = x2 + + x_ee = conf.x_base + conf.l*(tf.math.cos(x2_not_norm[:,0]) + tf.math.cos(x2_not_norm[:,0]+x2_not_norm[:,1]) + tf.math.cos(x2_not_norm[:,0]+x2_not_norm[:,1]+x2_not_norm[:,2])) + y_ee = conf.y_base + conf.l*(tf.math.sin(x2_not_norm[:,0]) + tf.math.sin(x2_not_norm[:,0]+x2_not_norm[:,1]) + tf.math.sin(x2_not_norm[:,0]+x2_not_norm[:,1]+x2_not_norm[:,2])) + + ell1_pen = tf.math.log(tf.math.exp(conf.alpha*-(((x_ee[:]-conf.XC1)**2)/((conf.A1/2)**2) + ((y_ee[:]-conf.YC1)**2)/((conf.B1/2)**2) - 1.0)) + 1)/conf.alpha + ell2_pen = tf.math.log(tf.math.exp(conf.alpha*-(((x_ee[:]-conf.XC2)**2)/((conf.A2/2)**2) + ((y_ee[:]-conf.YC2)**2)/((conf.B2/2)**2) - 1.0)) + 1)/conf.alpha + ell3_pen = tf.math.log(tf.math.exp(conf.alpha*-(((x_ee[:]-conf.XC3)**2)/((conf.A3/2)**2) + ((y_ee[:]-conf.YC3)**2)/((conf.B3/2)**2) - 1.0)) + 1)/conf.alpha + + peak_reward = tf.math.log(tf.math.exp(conf.alpha2*-(tf.math.sqrt((x_ee[:]-conf.TARGET_STATE[0])**2 +0.1) - tf.math.sqrt(0.1) - 0.1 + tf.math.sqrt((y_ee[:]-conf.TARGET_STATE[1])**2 +0.1) - tf.math.sqrt(0.1) - 0.1)) + 1)/conf.alpha2 + + vel_joint_list = [] + for i in range(BATCH_SIZE): + if last_ts[i][0] == 1.0: + vel_joint_list.append(x2_not_norm[i,3]**2 + x2_not_norm[i,4]**2 + x2_not_norm[i,5]**2 - 10000/conf.w_v) + else: + vel_joint_list.append(0) + vel_joint = tf.cast(tf.stack(vel_joint_list),tf.float32) + + r = 
(conf.w_d*(-(x_ee[:]-conf.TARGET_STATE[0])**2 -(y_ee[:]-conf.TARGET_STATE[1])**2) + conf.w_peak*peak_reward -conf.w_v*vel_joint - conf.w_ob1*ell1_pen - conf.w_ob2*ell2_pen - conf.w_ob3*ell3_pen - conf.w_u*(u[:,0]**2 + u[:,1]**2 + u[:,2]**2))/100 + + return r + +# Simulate dynamics to get next state +def simulate(dt,state,u): + + # Create robot model in Pinocchio with q_init as initial configuration and v_init as initial velocities + q_init = np.zeros(int((len(state)-1)/2)) + for state_index in range(int((len(state)-1)/2)): + q_init[state_index] = state[state_index] + q_init = q_init.T + v_init = np.zeros(int((len(state)-1)/2)) + for state_index in range(int((len(state)-1)/2)): + v_init[state_index] = state[int((len(state)-1)/2)+state_index] + v_init = v_init.T + r = load_urdf(URDF_PATH) + robot = RobotWrapper(r.model, r.collision_model, r.visual_model) + simu = RobotSimulator(robot, q_init, v_init, conf.simulation_type, conf.tau_coulomb_max) + + # Simulate control u + simu.simulate(u, conf.dt, 1) + q_next, v_next, a_next = np.copy(simu.q), np.copy(simu.v), np.copy(simu.dv) + q0_next, q1_next, q2_next = q_next[0],q_next[1],q_next[2] + v0_next, v1_next, v2_next = v_next[0],v_next[1],v_next[2] + + t_next = state[-1] + conf.dt + + return np.array([q0_next,q1_next,q2_next,v0_next,v1_next,v2_next,t_next]) + +# Simulate dynamics using tensors and compute its gradient w.r.t control. Batch-wise computation +def simulate_and_derivative_tf(state,u,BATCH_SIZE): + + q0_next, q1_next, q2_next = np.zeros(BATCH_SIZE), np.zeros(BATCH_SIZE), np.zeros(BATCH_SIZE) + v0_next, v1_next, v2_next = np.zeros(BATCH_SIZE), np.zeros(BATCH_SIZE), np.zeros(BATCH_SIZE) + Fu = np.zeros((conf.BATCH_SIZE,num_states,num_actions)) + + for sample_indx in range(BATCH_SIZE): + state_np = state[sample_indx] + + # Create robot model in Pinocchio with q_init as initial configuration + q_init = np.zeros(int((len(state_np)-1)/2)) + for state_index in range(int((len(state_np)-1)/2)): + q_init[state_index] = state_np[state_index] + q_init = q_init.T + v_init = np.zeros(int((len(state_np)-1)/2)) + for state_index in range(int((len(state_np)-1)/2)): + v_init[state_index] = state_np[int((len(state_np)-1)/2)+state_index] + v_init = v_init.T + r = load_urdf(URDF_PATH) + robot = RobotWrapper(r.model, r.collision_model, r.visual_model) + simu = RobotSimulator(robot, q_init, v_init, conf.simulation_type, conf.tau_coulomb_max) + + # Dynamics gradient w.r.t control (1st order euler) + nq = robot.nq + nv = robot.nv + nu = robot.na + nx = nq+nv + pin.computeABADerivatives(robot.model, robot.data, q_init, v_init, u[sample_indx]) + Fu_sample = np.zeros((nx, nu)) + Fu_sample[nv:, :] = robot.data.Minv + Fu_sample *= conf.dt + if conf.NORMALIZE_INPUTS: + Fu_sample *= (1/conf.state_norm_arr[3]) + + Fu[sample_indx] = np.vstack((Fu_sample, np.zeros(num_actions))) + + # Simulate control u + simu.simulate(u[sample_indx], conf.dt, 1) + q_next, v_next, = np.copy(simu.q), np.copy(simu.v) + q0_next_sample, q1_next_sample, q2_next_sample = q_next[0],q_next[1],q_next[2] + v0_next_sample, v1_next_sample, v2_next_sample = v_next[0],v_next[1],v_next[2] + q0_next[sample_indx] = q0_next_sample + q1_next[sample_indx] = q1_next_sample + q2_next[sample_indx] = q2_next_sample + v0_next[sample_indx] = v0_next_sample + v1_next[sample_indx] = v1_next_sample + v2_next[sample_indx] = v2_next_sample + + t_next = state[:,-1] + conf.dt + + if conf.NORMALIZE_INPUTS: + q0_next = q0_next / conf.state_norm_arr[0] + q1_next = q1_next / conf.state_norm_arr[1] + q2_next = q2_next / 
conf.state_norm_arr[2] + v0_next = v0_next / conf.state_norm_arr[3] + v1_next = v1_next / conf.state_norm_arr[4] + v2_next = v2_next / conf.state_norm_arr[5] + t_next = 2*t_next/conf.state_norm_arr[-1] -1 + + Fu = tf.convert_to_tensor(Fu,dtype=tf.float32) + + return tf.cast(tf.stack([q0_next,q1_next,q2_next,v0_next,v1_next,v2_next,t_next],1),dtype=tf.float32), Fu + +# Plot results from TO and episode to check consistency +def plot_results(tau0,tau1,tau2,x_TO,y_TO,x_RL,y_RL,steps,to=0): + + timesteps = conf.dt*np.arange(steps+1) + timesteps2 = conf.dt*np.arange(steps) + fig = plt.figure(figsize=(12,8)) + if to: + plt.suptitle('TO EXPLORATION: N try = {}'.format(N_try), y=1, fontsize=20) + else: + plt.suptitle('POLICY EXPLORATION: N try = {}'.format(N_try), y=1, fontsize=20) + + ax1 = fig.add_subplot(2, 2, 1) + ax1.plot(timesteps, x_TO, 'ro', linewidth=1, markersize=1) + ax1.plot(timesteps, y_TO, 'bo', linewidth=1, markersize=1) + ax1.plot(timesteps, x_RL, 'go', linewidth=1, markersize=1) + ax1.plot(timesteps, y_RL, 'ko', linewidth=1, markersize=1) + ax1.set_title('End-Effector Position',fontsize=20) + ax1.legend(["x_TO","y_TO","x_RL","y_RL"],fontsize=20) + ax1.set_xlabel('Time [s]',fontsize=20) + ax1.set_ylabel('[m]',fontsize=20) + + ax2 = fig.add_subplot(2, 2, 3) + ax2.plot(timesteps2, tau0, 'ro', linewidth=1, markersize=1) + ax2.plot(timesteps2, tau1, 'bo', linewidth=1, markersize=1) + ax2.plot(timesteps2, tau2, 'go', linewidth=1, markersize=1) + ax2.legend(['tau0','tau1','tau2'],fontsize=20) + ax2.set_xlabel('Time [s]',fontsize=20) + ax2.set_title('Controls',fontsize=20) + + ell1 = Ellipse((conf.XC1, conf.YC1), conf.A1, conf.B1, 0.0) + ell1.set_facecolor([30/255, 130/255, 76/255, 1]) + ell2 = Ellipse((conf.XC2, conf.YC2), conf.A2, conf.B2, 0.0) + ell2.set_facecolor([30/255, 130/255, 76/255, 1]) + ell3 = Ellipse((conf.XC3, conf.YC3), conf.A3, conf.B3, 0.0) + ell3.set_facecolor([30/255, 130/255, 76/255, 1]) + ax3 = fig.add_subplot(1, 2, 2) + ax3.plot(x_TO, y_TO, 'ro', linewidth=1, markersize=1) + ax3.plot(x_RL, y_RL, 'bo', linewidth=1, markersize=1) + ax3.legend(['TO','RL'],fontsize=20) + ax3.plot([x_TO[0]],[y_TO[0]],'ko',markersize=5) + ax3.add_artist(ell1) + ax3.add_artist(ell2) + ax3.add_artist(ell3) + ax3.plot([conf.TARGET_STATE[0]],[conf.TARGET_STATE[1]],'bo',markersize=5) + ax3.set_xlim(-41, 31) + ax3.set_aspect('equal', 'box') + ax3.set_title('Plane',fontsize=20) + ax3.set_xlabel('X [m]',fontsize=20) + ax3.set_ylabel('Y [m]',fontsize=20) + ax3.set_ylim(-35, 35) + + for ax in [ax1, ax2, ax3]: + ax.grid(True) + + fig.tight_layout() + plt.show() + +# Plot policy rollout from a single initial state as well as state and control trajectories +def plot_policy(tau0,tau1,tau2,x,y,steps,n_updates, diff_loc=0, PRETRAIN=0): + + timesteps = conf.dt*np.arange(steps) + fig = plt.figure(figsize=(12,8)) + plt.suptitle('POLICY: Discrete model, N try = {} N updates = {}'.format(N_try,n_updates), y=1, fontsize=20) + + ax1 = fig.add_subplot(2, 2, 1) + ax1.plot(timesteps, x, 'ro', linewidth=1, markersize=1) + ax1.plot(timesteps, y, 'bo', linewidth=1, markersize=1) + ax1.set_title('End-Effector Position',fontsize=20) + ax1.legend(["x","y"],fontsize=20) + ax1.set_xlabel('Time [s]',fontsize=20) + ax1.set_ylabel('[m]',fontsize=20) + + ax2 = fig.add_subplot(2, 2, 3) + ax2.plot(timesteps, tau0, 'ro', linewidth=1, markersize=1) + ax2.plot(timesteps, tau1, 'bo', linewidth=1, markersize=1) + ax2.plot(timesteps, tau2, 'go', linewidth=1, markersize=1) + ax2.legend(['tau0','tau1','tau2'],fontsize=20) + 
ax2.set_xlabel('Time [s]',fontsize=20) + ax2.set_title('Controls',fontsize=20) + + ell1 = Ellipse((conf.XC1, conf.YC1), conf.A1, conf.B1, 0.0) + ell1.set_facecolor([30/255, 130/255, 76/255, 1]) + ell2 = Ellipse((conf.XC2, conf.YC2), conf.A2, conf.B2, 0.0) + ell2.set_facecolor([30/255, 130/255, 76/255, 1]) + ell3 = Ellipse((conf.XC3, conf.YC3), conf.A3, conf.B3, 0.0) + ell3.set_facecolor([30/255, 130/255, 76/255, 1]) + ax3 = fig.add_subplot(1, 2, 2) + ax3.plot(x, y, 'ro', linewidth=1, markersize=1) + ax3.add_artist(ell1) + ax3.add_artist(ell2) + ax3.add_artist(ell3) + ax3.plot([conf.x_base],[3*conf.l],'ko',markersize=5) + ax3.plot([conf.TARGET_STATE[0]],[conf.TARGET_STATE[1]],'b*',markersize=10) + ax3.set_xlim([-41, 31]) + ax3.set_aspect('equal', 'box') + ax3.set_title('Plane',fontsize=20) + ax3.set_xlabel('X [m]',fontsize=20) + ax3.set_ylabel('Y [m]',fontsize=20) + ax3.set_ylim(-35, 35) + + for ax in [ax1, ax2, ax3]: + ax.grid(True) + + fig.tight_layout() + if PRETRAIN: + plt.savefig(Fig_path+'/PolicyEvaluation_Pretrain_Manipulator3DoF_3OBS_{}_{}'.format(N_try,n_updates)) + else: + if diff_loc==0: + plt.savefig(Fig_path+'/PolicyEvaluation_Manipulator3DoF_3OBS_{}_{}'.format(N_try,n_updates)) + else: + plt.savefig(Fig_path+'/Actor/PolicyEvaluation_Manipulator3DoF_3OBS_{}_{}'.format(N_try,n_updates)) + plt.clf() + plt.close(fig) + +# Plot only policy rollouts from multiple initial states +def plot_policy_eval(x_list,y_list,n_updates, diff_loc=0, PRETRAIN=0): + + fig = plt.figure(figsize=(12,8)) + plt.suptitle('POLICY: Discrete model, N try = {} N updates = {}'.format(N_try,n_updates), y=1, fontsize=20) + ell1 = Ellipse((conf.XC1, conf.YC1), conf.A1, conf.B1, 0.0) + ell1.set_facecolor([30/255, 130/255, 76/255, 1]) + ell2 = Ellipse((conf.XC2, conf.YC2), conf.A2, conf.B2, 0.0) + ell2.set_facecolor([30/255, 130/255, 76/255, 1]) + ell3 = Ellipse((conf.XC3, conf.YC3), conf.A3, conf.B3, 0.0) + ell3.set_facecolor([30/255, 130/255, 76/255, 1]) + + ax = fig.add_subplot(1, 1, 1) + for idx in range(len(x_list)): + if idx == 0: + ax.plot(x_list[idx], y_list[idx], 'ro', linewidth=1, markersize=1) + elif idx == 1: + ax.plot(x_list[idx], y_list[idx], 'bo', linewidth=1, markersize=1) + elif idx == 2: + ax.plot(x_list[idx], y_list[idx], 'go', linewidth=1, markersize=1) + elif idx == 3: + ax.plot(x_list[idx], y_list[idx], 'co', linewidth=1, markersize=1) + elif idx == 4: + ax.plot(x_list[idx], y_list[idx], 'yo', linewidth=1, markersize=1) + elif idx == 5: + ax.plot(x_list[idx], y_list[idx], color='tab:orange', marker='o', markersize=1, linestyle='None') + elif idx == 6: + ax.plot(x_list[idx], y_list[idx], color='grey', marker='o', markersize=1, linestyle='None') + elif idx == 7: + ax.plot(x_list[idx], y_list[idx], color='tab:pink', marker='o', markersize=1, linestyle='None') + elif idx == 8: + ax.plot(x_list[idx], y_list[idx], color='lime', marker='o', markersize=1, linestyle='None') + ax.plot(x_list[idx][0],y_list[idx][0],'ko',markersize=5) + ax.plot(conf.TARGET_STATE[0],conf.TARGET_STATE[1],'b*',markersize=10) + ax.add_artist(ell1) + ax.add_artist(ell2) + ax.add_artist(ell3) + ax.set_xlim([-41, 31]) + ax.set_aspect('equal', 'box') + ax.set_xlabel('X [m]',fontsize=20) + ax.set_ylabel('Y [m]',fontsize=20) + ax.set_ylim(-35, 35) + ax.grid(True) + fig.tight_layout() + if PRETRAIN: + plt.savefig(Fig_path+'/PolicyEvaluationMultiInit_Pretrain_Manipulator3DoF_3OBS_{}_{}'.format(N_try,n_updates)) + else: + if diff_loc==0: + 
plt.savefig(Fig_path+'/PolicyEvaluationMultiInit_Manipulator3DoF_3OBS_{}_{}'.format(N_try,n_updates)) + else: + plt.savefig(Fig_path+'/Actor/PolicyEvaluationMultiInit_Manipulator3DoF_3OBS_{}_{}'.format(N_try,n_updates)) + plt.clf() + plt.close(fig) + +# Plot rollout of the actor from some initial states. It generates the results and then calls plot_policy() and plot_policy_eval() +def rollout(update_step_cntr, diff_loc=0, PRETRAIN=0): + + init_states_sim = [np.array([math.pi/4,-math.pi/8,-math.pi/8,0.0,0.0,0.0,0.0]),np.array([-math.pi/4,math.pi/8,math.pi/8,0.0,0.0,0.0,0.0]),np.array([math.pi/2,0.0,0.0,0.0,0.0,0.0,0.0]),np.array([-math.pi/2,0.0,0.0,0.0,0.0,0.0,0.0]),np.array([3*math.pi/4,0.0,0.0,0.0,0.0,0.0,0.0]),np.array([-3*math.pi/4,0.0,0.0,0.0,0.0,0.0,0.0]),np.array([math.pi/4,0.0,0.0,0.0,0.0,0.0,0.0]),np.array([-math.pi/4,0.0,0.0,0.0,0.0,0.0,0.0]),np.array([math.pi,0.0,0.0,0.0,0.0,0.0,0.0])] + tau0_all_sim,tau1_all_sim,tau2_all_sim = [],[],[] + x_ee_all_sim, y_ee_all_sim = [], [] + + for k in range(len(init_states_sim)): + tau0_arr_sim,tau1_arr_sim,tau2_arr_sim = [],[],[] + x_ee_arr_sim = [conf.x_base + conf.l*(math.cos(init_states_sim[k][0]) + math.cos(init_states_sim[k][0]+init_states_sim[k][1]) + math.cos(init_states_sim[k][0]+init_states_sim[k][1]+init_states_sim[k][2]))] + y_ee_arr_sim = [conf.y_base + conf.l*(math.sin(init_states_sim[k][0]) + math.sin(init_states_sim[k][0]+init_states_sim[k][1]) + math.sin(init_states_sim[k][0]+init_states_sim[k][1]+init_states_sim[k][2]))] + prev_state_sim = np.copy(init_states_sim[k]) + episodic_reward_sim = 0 + + for i in range(conf.NSTEPS-1): + if conf.NORMALIZE_INPUTS: + prev_state_sim_norm = prev_state_sim / conf.state_norm_arr + prev_state_sim_norm[-1] = 2*prev_state_sim_norm[-1] -1 + tf_x_sim = tf.expand_dims(tf.convert_to_tensor(prev_state_sim_norm), 0) + else: + tf_x_sim = tf.expand_dims(tf.convert_to_tensor(prev_state_sim), 0) + ctrl = actor_model(tf_x_sim) + ctrl_sim = tf.squeeze(ctrl).numpy() + next_state_sim = simulate(conf.dt,prev_state_sim,ctrl_sim) + rwrd_sim = reward(next_state_sim,ctrl_sim) + episodic_reward_sim += rwrd_sim + tau0_arr_sim.append(ctrl_sim[0]) + tau1_arr_sim.append(ctrl_sim[1]) + tau2_arr_sim.append(ctrl_sim[2]) + x_ee_arr_sim.append(conf.x_base + conf.l*(math.cos(next_state_sim[0]) + math.cos(next_state_sim[0]+next_state_sim[1]) + math.cos(next_state_sim[0]+next_state_sim[1]+next_state_sim[2]))) + y_ee_arr_sim.append(conf.y_base + conf.l*(math.sin(next_state_sim[0]) + math.sin(next_state_sim[0]+next_state_sim[1]) + math.sin(next_state_sim[0]+next_state_sim[1]+next_state_sim[2]))) + prev_state_sim = np.copy(next_state_sim) + + if i==conf.NSTEPS-2: + if conf.NORMALIZE_INPUTS: + prev_state_sim_norm = prev_state_sim / conf.state_norm_arr + prev_state_sim_norm[-1] = 2*prev_state_sim_norm[-1] -1 + tf_x_sim = tf.expand_dims(tf.convert_to_tensor(prev_state_sim_norm), 0) + else: + tf_x_sim = tf.expand_dims(tf.convert_to_tensor(prev_state_sim), 0) + ctrl = actor_model(tf_x_sim) + ctrl_sim = tf.squeeze(ctrl).numpy() + tau0_arr_sim.append(ctrl_sim[0]) + tau1_arr_sim.append(ctrl_sim[1]) + tau2_arr_sim.append(ctrl_sim[2]) + if k==2: + plot_policy(tau0_arr_sim,tau1_arr_sim,tau2_arr_sim,x_ee_arr_sim,y_ee_arr_sim,conf.NSTEPS,update_step_cntr, diff_loc=diff_loc, PRETRAIN=PRETRAIN) + print("N try = {}: Simulation Return @ N updates = {} ==> {}".format(N_try,update_step_cntr,episodic_reward_sim)) + + tau0_all_sim.append(np.copy(tau0_arr_sim)) + tau1_all_sim.append(np.copy(tau1_arr_sim)) + tau2_all_sim.append(np.copy(tau2_arr_sim)) 
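+ # Also collect the end-effector trajectory of this initial state, so that plot_policy_eval() below can overlay the rollouts from all initial states in a single plane plot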
+ x_ee_all_sim.append(np.copy(x_ee_arr_sim)) + y_ee_all_sim.append(np.copy(y_ee_arr_sim)) + + plot_policy_eval(x_ee_all_sim,y_ee_all_sim,update_step_cntr, diff_loc=diff_loc, PRETRAIN=PRETRAIN) + + return tau0_all_sim, tau1_all_sim, tau2_all_sim, x_ee_all_sim, y_ee_all_sim + +# Plot returns (not so meaningful given that the initial state, so also the time horizon, of each episode is randomized) +def plot_Return(): + fig = plt.figure(figsize=(15,8)) + ax = fig.add_subplot(1, 1, 1) + ax.plot(ep_reward_list) + ax.set_xlabel("Episode") + ax.set_ylabel("Return") + ax.set_title("N_try = {}".format(N_try)) + ax.grid(True) + plt.savefig(Fig_path+'/EpReturn_Manipulator3DoF_{}'.format(N_try)) + plt.close() + +# Plot average return considering 40 episodes +def plot_AvgReturn(): + fig = plt.figure(figsize=(15,8)) + ax = fig.add_subplot(1, 1, 1) + ax.plot(avg_reward_list) + ax.set_xlabel("Episode") + ax.set_ylabel("Avg. Return") + ax.set_title("N_try = {}".format(N_try)) + ax.grid(True) + plt.savefig(Fig_path+'/AvgReturn_Manipulator3DoF_{}'.format(N_try)) + plt.close() + + +# To run TF on CPU rather than GPU (seems faster since the NNs are small and some gradients are computed with Pinocchio on CPU --> bottleneck = communication CPU-GPU?) +# os.environ["CUDA_VISIBLE_DEVICES"]="-1" +tf.config.experimental.list_physical_devices('GPU') + +tf.random.set_seed(123) # Set seed for reproducibility + +# Set the ticklabel font size globally +plt.rcParams['xtick.labelsize'] = 22 +plt.rcParams['ytick.labelsize'] = 22 + +URDF_PATH = "urdf/planar_manipulator_3dof.urdf" # Path to the urdf file + +N_try = 1 # Number of test run +Fig_path = './Results/Figures/N_try_{}'.format(N_try) +NNs_path = './Results/NNs/N_try_{}'.format(N_try) +Config_path = './Results/Configs/' + +# Create folders to store the results and the trained NNs +try: + os.makedirs(Fig_path) +except: + print("N try = {} Figures folder already existing".format(N_try)) + pass +try: + os.makedirs(Fig_path+'/Actor') +except: + print("N try = {} Actor folder already existing".format(N_try)) + pass +try: + os.makedirs(NNs_path) +except: + print("N try = {} NNs folder already existing".format(N_try)) + pass +try: + os.makedirs(Config_path) +except: + print("N try = {} Configs folder already existing".format(N_try)) + pass + +num_states = 7 # Number of states +num_actions = 3 # Number of actions + +if __name__ == "__main__": + + # Create actor, critic and target NNs + actor_model = get_actor() + critic_model = get_critic() + target_critic = get_critic() + + # Set initial weights of targets equal to those of actor and critic. 
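+# (the target critic is then pulled towards the online critic by the soft update in update_target(): each target weight moves a fraction conf.UPDATE_RATE of the way towards the corresponding critic weight, i.e. w_target <- conf.UPDATE_RATE*w_critic + (1 - conf.UPDATE_RATE)*w_target.)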
Comment if recovering from a stopped training + target_critic.set_weights(critic_model.get_weights()) + + # Uncomment if recovering from a stopped training + # update_step_counter = 102400 + # NNs_path+"/Manipulator_{}.h5".format(update_step_counter)) + # actor_model.load_weights(NNs_path+"/Manipulator_{}.h5".format(update_step_counter)) + # critic_model.load_weights(NNs_path+"/Manipulator_critic{}.h5".format(update_step_counter)) + # target_critic.load_weights(NNs_path+"/Manipulator_target_critic{}.h5".format(update_step_counter)) + + # Set optimizer specifying the learning rates + if conf.LR_SCHEDULE: + CRITIC_LR_SCHEDULE = tf.keras.optimizers.schedules.PiecewiseConstantDecay(conf.boundaries_schedule_LR_C, conf.values_schedule_LR_C) # Piecewise constant decay schedule + ACTOR_LR_SCHEDULE = tf.keras.optimizers.schedules.PiecewiseConstantDecay(conf.boundaries_schedule_LR_A, conf.values_schedule_LR_A) + critic_optimizer = tf.keras.optimizers.Adam(CRITIC_LR_SCHEDULE) + actor_optimizer = tf.keras.optimizers.Adam(ACTOR_LR_SCHEDULE) + else: + critic_optimizer = tf.keras.optimizers.Adam(conf.CRITIC_LEARNING_RATE) + actor_optimizer = tf.keras.optimizers.Adam(conf.ACTOR_LEARNING_RATE) + + # Save config file + f=open(Config_path+'/Manipulator_config{}.txt'.format(N_try), 'w') + f.write("conf.NEPISODES = {}, conf.NSTEPS = {}, conf.CRITIC_LEARNING_RATE = {}, conf.ACTOR_LEARNING_RATE = {}, conf.UPDATE_RATE = {}, conf.REPLAY_SIZE = {}, conf.BATCH_SIZE = {}, conf.NH1 = {}, conf.NH2 = {}, conf.dt = {}".format(conf.NEPISODES,conf.NSTEPS,conf.CRITIC_LEARNING_RATE,conf.ACTOR_LEARNING_RATE,conf.UPDATE_RATE,conf.REPLAY_SIZE,conf.BATCH_SIZE,conf.NH1,conf.NH2,conf.dt)+ + "\n"+str(conf.UPDATE_LOOPS)+" updates every {} episodes".format(conf.EP_UPDATE)+ + "\n\nReward = ({}*(-(x_ee-conf.TARGET_STATE[0])**2 -(y_ee-conf.TARGET_STATE[1])**2) + {}*peak_reward - {}*vel_joint - {}*ell1_pen - {}*ell2_pen - {}*ell3_pen - {}*(u[0]**2 + u[1]**2 + u[2]**2))/100, vel_joint = x2[3]**2 + x2[4]**2 + x2[5]**2 - 10000/{} if final step else 0, peak reward = math.log(math.exp({}*-(x_err-0.1 + y_err-0.1)) + 1)/{}, x_err = math.sqrt((x_ee-conf.TARGET_STATE[0])**2 +0.1) - math.sqrt(0.1), y_err = math.sqrt((y_ee-conf.TARGET_STATE[1])**2 +0.1) - math.sqrt(0.1), ell_pen = log(exp({}*-(((x_ee-XC)**2)/((a/2)**2) + ((y_ee-YC)**2)/((conf.B1/2)**2) - 1.0)) + 1)/{}".format(conf.w_d,conf.w_peak,conf.w_v,conf.w_ob1,conf.w_ob2,conf.w_ob3,conf.w_u,conf.w_v,conf.alpha2,conf.alpha2,conf.alpha,conf.alpha)+ + "\n\nBase = [{},{}]".format(conf.x_base,conf.y_base)+", target (conf.TARGET_STATE) = "+str(conf.TARGET_STATE)+ + "\nPrioritized_replay_alpha = "+str(conf.prioritized_replay_alpha)+", conf.prioritized_replay_beta0 = "+str(conf.prioritized_replay_beta0)+", conf.prioritized_replay_eps = "+str(conf.prioritized_replay_eps)+ + "\nActor: kernel_and_bias_regularizer = l1_l2({}), Critic: kernel_and_bias_regularizer = l1_l2({}) (in each layer)".format(conf.wreg_l1_A,conf.wreg_l2_A,conf.wreg_l1_C,conf.wreg_l2_C)+ + "\nScheduled step LR decay = {}: critic values = {} and boundaries = {}, policy values = {} and boundaries = {}".format(conf.LR_SCHEDULE,conf.values_schedule_LR_C,conf.boundaries_schedule_LR_C,conf.values_schedule_LR_A,conf.boundaries_schedule_LR_A)+ + "\nRandom initial state -> [uniform(-pi,pi), uniform(-pi,pi), uniform(-pi,pi), uniform(-pi/4,pi/4), uniform(-pi/4,pi/4), uniform(-pi/4,pi/4),uniform(0,(NSTEPS_SH-1)*conf.dt)"+ + "\nNormalized inputs = {}, q by {} and qdot by 
{}".format(conf.NORMALIZE_INPUTS,conf.state_norm_arr[0],conf.state_norm_arr[3])+ + "\nEpisodes of critic pretraining = {}".format(conf.EPISODE_CRITIC_PRETRAINING)+ + "\nn-step TD = {} with {} lookahead steps".format(conf.TD_N, conf.nsteps_TD_N)) + f.close() + + # Create training instance and an empty (prioritized) replay buffer + training = Training(conf.BATCH_SIZE) + prioritized_buffer = PrioritizedReplayBuffer(conf.REPLAY_SIZE, alpha=conf.prioritized_replay_alpha) + + # Lists to store the reward history of each episode and the average reward history of last few episodes + ep_reward_list = [] + avg_reward_list = [] + + # Initialize the counter of the updates. Comment if recovering from a stopped training + update_step_counter = 0 + + # START TRAINING + # If recovering from a stopped training, starting episode must be ((update_step_counter/conf.UPDATE_LOOPS)*conf.EP_UPDATE)+1 + for ep in range(conf.NEPISODES): + + TO = 0 # Flag to indicate if the TO problem has been solved + DONE = 0 # Flag indicating if the episode has terminated + ep_return = 0 # Initialize the return + step_counter = 0 # Initialize the counter of episode steps + + # START TO PROBLEM + while TO==0: + + # Randomize initial state + rand_time = random.uniform(0,(conf.NSTEPS-1)*conf.dt) + prev_state = np.array([random.uniform(-math.pi,math.pi), random.uniform(-math.pi,math.pi), random.uniform(-math.pi,math.pi), + random.uniform(-math.pi/4,math.pi/4), random.uniform(-math.pi/4,math.pi/4), random.uniform(-math.pi/4,math.pi/4), + conf.dt*round(rand_time/conf.dt)]) + if conf.NORMALIZE_INPUTS: + prev_state_norm = prev_state / conf.state_norm_arr + prev_state_norm[-1] = 2*prev_state_norm[-1] - 1 + tf_prev_state = tf.expand_dims(tf.convert_to_tensor(prev_state_norm), 0) + else: + tf_prev_state = tf.expand_dims(tf.convert_to_tensor(prev_state), 0) + + # Set the horizon of TO problem / RL episode + NSTEPS_SH = conf.NSTEPS - int(round(rand_time/conf.dt)) + + # Lists to store TO state and control trajectories + tau0_arr,tau1_arr,tau2_arr = [],[],[] + q0_arr, q1_arr, q2_arr = [prev_state[0]], [prev_state[1]], [prev_state[2]] + v0_arr, v1_arr, v2_arr = [prev_state[3]], [prev_state[4]], [prev_state[5]] + t_arr = [prev_state[-1]] + x_ee_arr = [conf.x_base + conf.l*(math.cos(prev_state[0]) + math.cos(prev_state[0]+prev_state[1]) + math.cos(prev_state[0]+prev_state[1]+prev_state[2]))] + y_ee_arr = [conf.y_base + conf.l*(math.sin(prev_state[0]) + math.sin(prev_state[0]+prev_state[1]) + math.sin(prev_state[0]+prev_state[1]+prev_state[2]))] + + # Actor rollout used to initialize TO state and control variables + init_TO_states = np.zeros((num_states, NSTEPS_SH+1)) + init_TO_states[0][0] = prev_state[0] + init_TO_states[1][0] = prev_state[1] + init_TO_states[2][0] = prev_state[2] + init_TO_states[3][0] = prev_state[3] + init_TO_states[4][0] = prev_state[4] + init_TO_states[5][0] = prev_state[5] + init_TO_controls = np.zeros((num_actions, NSTEPS_SH+1)) + init_TO_controls[0][0] = tf.squeeze(actor_model(tf_prev_state)).numpy()[0] + init_TO_controls[1][0] = tf.squeeze(actor_model(tf_prev_state)).numpy()[1] + init_TO_controls[2][0] = tf.squeeze(actor_model(tf_prev_state)).numpy()[2] + init_prev_state = np.copy(prev_state) + + # Simulate actor's actions to compute the state trajectory used to initialize TO state variables + for i in range(1, NSTEPS_SH+1): + init_next_state = simulate(conf.dt,init_prev_state,np.array([init_TO_controls[0][i-1],init_TO_controls[1][i-1],init_TO_controls[2][i-1]])) + init_TO_states[0][i] = init_next_state[0] + 
init_TO_states[1][i] = init_next_state[1] + init_TO_states[2][i] = init_next_state[2] + init_TO_states[3][i] = init_next_state[3] + init_TO_states[4][i] = init_next_state[4] + init_TO_states[5][i] = init_next_state[5] + if conf.NORMALIZE_INPUTS: + init_next_state_norm = init_next_state / conf.state_norm_arr + init_next_state_norm[-1] = 2*init_next_state_norm[-1] - 1 + init_tf_next_state = tf.expand_dims(tf.convert_to_tensor(init_next_state_norm), 0) + else: + init_tf_next_state = tf.expand_dims(tf.convert_to_tensor(init_next_state), 0) + init_TO_controls[0][i] = tf.squeeze(actor_model(init_tf_next_state)).numpy()[0] + init_TO_controls[1][i] = tf.squeeze(actor_model(init_tf_next_state)).numpy()[1] + init_TO_controls[2][i] = tf.squeeze(actor_model(init_tf_next_state)).numpy()[2] + init_prev_state = np.copy(init_next_state) + + # Create TO problem + if ep < conf.EPISODE_CRITIC_PRETRAINING or ep < conf.EPISODE_ICS_INIT: + TO_mdl = TO_manipulator(prev_state, init_q0_ICS, init_q1_ICS, init_q2_ICS, init_v0_ICS, init_v1_ICS, init_v2_ICS, init_0, init_0, init_0, init_0, init_0, init_0, NSTEPS_SH) + else: + if ep == conf.EPISODE_ICS_INIT and conf.LR_SCHEDULE: + # Re-initialize Adam otherwise it keeps being affected by the estimates of first-order and second-order moments computed previously with ICS warm-starting + critic_optimizer = tf.keras.optimizers.Adam(CRITIC_LR_SCHEDULE) + actor_optimizer = tf.keras.optimizers.Adam(ACTOR_LR_SCHEDULE) + TO_mdl = TO_manipulator(prev_state, init_q0, init_q1, init_q2, init_v0, init_v1, init_v2, init_0, init_0, init_0, init_tau0, init_tau1, init_tau2, NSTEPS_SH, init_TO = [init_TO_controls, init_TO_states]) + + # Indexes of TO variables + K = np.array([k for k in TO_mdl.k]) + + # Select solver + solver = SolverFactory('ipopt') + solver.options['linear_solver'] = "ma57" + + # Try to solve TO problem + try: + results = solver.solve(TO_mdl) + if str(results.solver.termination_condition) == "optimal": + #Retrieve control trajectory + tau0_TO = [TO_mdl.tau0[k]() for k in K] + tau1_TO = [TO_mdl.tau1[k]() for k in K] + tau2_TO = [TO_mdl.tau2[k]() for k in K] + TO = 1 + else: + print('TO solution not optimal') + raise Exception() + except: + print("*** TO failed ***") + + # Plot TO solution + # plot_results_TO(TO_mdl) + + ep_arr = [] + + # START RL EPISODE + while True: + # Get current and next TO actions + action, next_TO_action = get_TO_actions(step_counter, action0_TO=tau0_TO, action1_TO=tau1_TO, action2_TO=tau2_TO) + + # Simulate actions and retrieve next state + next_state = simulate(conf.dt,prev_state,action) + + # Store performed action and next state + tau0_arr.append(action[0]) + tau1_arr.append(action[1]) + tau2_arr.append(action[2]) + q0_arr.append(next_state[0]) + q1_arr.append(next_state[1]) + q2_arr.append(next_state[2]) + v0_arr.append(next_state[3]) + v1_arr.append(next_state[4]) + v2_arr.append(next_state[5]) + t_arr.append(next_state[-1]) + x_ee_arr.append(conf.x_base + conf.l*(math.cos(q0_arr[-1]) + math.cos(q0_arr[-1]+q1_arr[-1]) + math.cos(q0_arr[-1]+q1_arr[-1]+q2_arr[-1]))) + y_ee_arr.append(conf.y_base + conf.l*(math.sin(q0_arr[-1]) + math.sin(q0_arr[-1]+q1_arr[-1]) + math.sin(q0_arr[-1]+q1_arr[-1]+q2_arr[-1]))) + + # Compute reward + rwrd = reward(next_state, action) + ep_arr.append(rwrd) + + if step_counter==NSTEPS_SH-1: + DONE = 1 + + # Store transition if you want to use 1-step TD + if conf.TD_N==0: + prioritized_buffer.add(prev_state, next_TO_action, action, rwrd, next_state, float(DONE)) + + ep_return += rwrd # Increment the episodic return by 
the reward just recived + step_counter += 1 + prev_state = np.copy(next_state) # Next state becomes the prev state at the next episode step + + # End episode + if DONE == 1: + break + + # Plot the state and control trajectories of this episode + # plot_results(tau0_arr,tau1_arr,tau2_arr,x_ee,y_ee,x_ee_arr,y_ee_arr,NSTEPS_SH,to=TO) + + # Store transition after computing the (partial) cost-to go when using n-step TD (up to Monte Carlo) + if conf.TD_N: + DONE = 0 + cost_to_go_arr = [] + for i in range(len(ep_arr)): + final_i = min(i+conf.nsteps_TD_N,len(ep_arr)) + if final_i == len(ep_arr): + V_final = 0.0 + else: + if conf.NORMALIZE_INPUTS: + next_state_rollout = np.array([q0_arr[final_i],q1_arr[final_i],q2_arr[final_i],v0_arr[final_i],v1_arr[final_i],v2_arr[final_i],t_arr[final_i]]) / conf.state_norm_arr + next_state_rollout[-1] = 2*next_state_rollout[-1] - 1 + else: + next_state_rollout = np.array([q0_arr[final_i],q1_arr[final_i],q2_arr[final_i],v0_arr[final_i],v1_arr[final_i],v2_arr[final_i],t_arr[final_i]]) / conf.state_norm_arr + tf_next_state_rollout = tf.expand_dims(tf.convert_to_tensor(next_state_rollout), 0) + V_final = target_critic(tf_next_state_rollout, training=False).numpy()[0][0] + cost_to_go = sum(ep_arr[i:final_i+1]) + V_final + cost_to_go_arr.append(np.float32(cost_to_go)) + if i == len(ep_arr)-1: + DONE = 1 + prioritized_buffer.add(np.array([q0_arr[i],q1_arr[i],q2_arr[i],v0_arr[i],v1_arr[i],v2_arr[i],t_arr[i]]), next_TO_action, action, cost_to_go_arr[i], np.array([0.0,0.0,0.0,0.0,0.0,0.0,0.0]), float(DONE)) + + # Update the NNs + for i in range(conf.UPDATE_LOOPS): + training.learn(ep, prioritized_buffer) # Update critic and actor + update_target(target_critic.variables, critic_model.variables, conf.UPDATE_RATE) # Update target critic + update_step_counter += 1 + + # Plot rollouts every 0.5% of the training (saved in a separate folder) + if ep>=conf.ep_no_update and ep%int((conf.NEPISODES-conf.ep_no_update)/200)==0: + _, _, _, _, _ = rollout(update_step_counter, diff_loc=1) + + # Plot rollouts and save the NNs every 5% of the training + if ep>=conf.ep_no_update and int((conf.NEPISODES-conf.ep_no_update)/20)!=1 and int((conf.NEPISODES-conf.ep_no_update)/20)!=0 and ep%int((conf.NEPISODES-conf.ep_no_update)/20)==0: + _, _, _, _, _ = rollout(update_step_counter) + actor_model.save_weights(NNs_path+"/Manipulator_{}.h5".format(update_step_counter)) + critic_model.save_weights(NNs_path+"/Manipulator_critic_{}.h5".format(update_step_counter)) + target_critic.save_weights(NNs_path+"/Manipulator_target_critic_{}.h5".format(update_step_counter)) + + # ep_reward_list.append(ep_return) + # avg_reward = np.mean(ep_reward_list[-40:]) # Mean of last 40 episodes + # avg_reward_list.append(avg_reward) + + print("Episode {} ---> Return = {}".format(ep, ep_return)) + + # Plot returns + # plot_AvgReturn() + # plot_Return() + + # Save networks at the end of the training + actor_model.save_weights(NNs_path+"/Manipulator_actor_final.h5") + critic_model.save_weights(NNs_path+"/Manipulator_critic_final.h5") + target_critic.save_weights(NNs_path+"/Manipulator_target_critic_final.h5") + + # Simulate the final policy + tau0_all_final_sim, tau1_all_final_sim, tau2_all_final_sim, x_ee_all_final_sim, y_ee_all_final_sim = rollout(update_step_counter) \ No newline at end of file diff --git a/TO_manipulator3DoF_pyomo.py b/TO_manipulator3DoF_pyomo.py new file mode 100644 index 0000000..ef023f7 --- /dev/null +++ b/TO_manipulator3DoF_pyomo.py @@ -0,0 +1,538 @@ +import tensorflow as tf +from tensorflow.keras 
import layers, regularizers +from pyomo.environ import * +import numpy as np +import random +import matplotlib as mpl +import matplotlib.pyplot as plt +from matplotlib.patches import Ellipse +from math import pi +import math +from dynamics_manipulator3DoF import RobotWrapper, RobotSimulator, load_urdf +import config_manipulator3DoF_pyomo as conf +from inits import init_tau0,init_tau1,init_tau2,init_q0,init_q1,init_q2,init_v0,init_v1,init_v2,init_q0_ICS,init_q1_ICS,init_q2_ICS,init_v0_ICS,init_v1_ICS,init_v2_ICS,init_0 + +# Plot EE trajectories of all TO problems related to the set of initial states considered +def plot_results_TO_all(q0_all,q1_all,x_init_all_sim,y_init_all_sim,x_all_sim,y_all_sim): + fig = plt.figure(figsize=(12,8)) + ell1 = Ellipse((conf.XC1, conf.YC1), conf.A1, conf.B1, 0.0) + ell1.set_facecolor([30/255, 130/255, 76/255, 1]) + ell2 = Ellipse((conf.XC2, conf.YC2), conf.A2, conf.B2, 0.0) + ell2.set_facecolor([30/255, 130/255, 76/255, 1]) + ell3 = Ellipse((conf.XC3, conf.YC3), conf.A3, conf.B3, 0.0) + ell3.set_facecolor([30/255, 130/255, 76/255, 1]) + + K = range(len(q0_all)) + q0,q1,x1,x2,y1,y2 = [],[],[],[],[],[] + for i in K: + q0.append(q0_all[i][0]) + q1.append(q1_all[i][0]) + + x0 = -7.0 + y0 = 0.0 + for k in K: + x1.append(x0 + conf.l*cos(q0[k])) + x2.append(x0 + conf.l*(cos(q0[k]) + cos(q0[k]+q1[k]))) + y1.append(conf.l*sin(q0[k])) + y2.append(conf.l*(sin(q0[k]) + sin(q0[k]+q1[k]))) + + ax = fig.add_subplot(1, 1, 1) + for idx in range(len(x_all_sim)): + ax.plot(x_all_sim[idx], y_all_sim[idx], 'ro', linewidth=1, markersize=1) + init_states, = ax.plot(-7,0,'tab:orange',marker = 'o', ms = 15, mec = 'r') + ax.plot(x_init_all_sim[idx], y_init_all_sim[idx], 'mo', linewidth=1, markersize=1, alpha=0.4) + targ_state, = ax.plot(conf.x_des,conf.y_des,'k*',markersize=15) + ax.legend([init_states, targ_state], ['Base', 'Target point EE'], fontsize=20, loc='lower right') + ax.add_artist(ell1) + ax.add_artist(ell2) + ax.add_artist(ell3) + for i in K: + ax.plot([x0,x1[i],x2[i]],[y0,y1[i],y2[i]],'b',marker = 'o', ms = 5, mec = 'r', linewidth=1, alpha=0.8) #xp, yp, 'r--', + ax.plot([x2[i],x_all_sim[i][0]],[y2[i],y_all_sim[i][0]],'b', linewidth=1, alpha=0.8) + if abs(x_all_sim[i][0]-x2[i]) >= 1e-5: + angle = math.degrees(math.atan((y_all_sim[i][0]-y2[i])/(x_all_sim[i][0]-x2[i]))) + elif y_all_sim[i][0]>0: + angle = -90 + else: + angle = 90 + if angle<=1e5 and angle>=-1e5 and x_all_sim[i][0]<0: + angle+=180 + t = mpl.markers.MarkerStyle(marker='$\sqsubset$') + t._transform = t.get_transform().rotate_deg(angle) + ax.plot(x_all_sim[i][0],y_all_sim[i][0],'b', marker=t, ms = 15, linewidth=1) + ax.set_xlim([-41, 31]) + ax.set_aspect('equal', 'box') + ax.tick_params(axis='x') + ax.tick_params(axis='y') + ax.set_xlabel('X [m]',fontsize=20) + ax.set_ylabel('Y [m]',fontsize=20) + ax.set_ylim(-35, 35) + ax.grid(True) + + fig.tight_layout() + plt.show() + +# Plot trajectories of joint angles and velocities of all TO problems related to the set of initial states considered +def plot_results_TO_all_trajs(q0_all,q1_all,q2_all,v0_all,v1_all,v2_all,tau0_all,tau1_all,tau2_all): + fig = plt.figure(figsize=(12,10)) + + K = range(len(q0_all[0])) + time = [conf.dt*k for k in K] + + ax2 = fig.add_subplot(3, 1, 1) + for idx in range(len(q0_all)): + ax2.plot(time, q0_all[idx], 'r', linewidth=1, markersize=1) + ax2.plot(time, q1_all[idx], 'b', linewidth=1, markersize=1) + ax2.plot(time, q2_all[idx], 'g', linewidth=1, markersize=1) + ax2.legend(['q0','q1','q2'],fontsize=16) + ax2.set_ylabel('[rad]',fontsize=20) 
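+ # The two panels below show the corresponding joint velocities [rad/s] and joint torques [Nm] for the same set of TO solutions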
+ ax2.grid(True) + + ax3 = fig.add_subplot(3, 1, 2) + for idx in range(len(q0_all)): + ax3.plot(time, v0_all[idx], 'r', linewidth=1, markersize=1) + ax3.plot(time, v1_all[idx], 'b', linewidth=1, markersize=1) + ax3.plot(time, v2_all[idx], 'g', linewidth=1, markersize=1) + ax3.legend(['v0','v1','v2'],fontsize=16) + ax3.set_ylabel('[rad/s]',fontsize=20) + ax3.grid(True) + + ax4 = fig.add_subplot(3, 1, 3) + for idx in range(len(q0_all)): + ax4.plot(time, tau0_all[idx], 'r', linewidth=1, markersize=1) + ax4.plot(time, tau1_all[idx], 'b', linewidth=1, markersize=1) + ax4.plot(time, tau2_all[idx], 'g', linewidth=1, markersize=1) + ax4.legend(['tau0','tau1','tau2'],fontsize=16) + ax4.set_ylabel('[Nm]',fontsize=20) + ax4.set_xlabel('[s]',fontsize=20) + ax4.grid(True) + + fig.tight_layout() + plt.show() + +# Plot result of one TO problem +def plot_results_TO(m): + K = np.array([k for k in m.k]) + tau0 = [m.tau0[k]() for k in K] + tau1 = [m.tau1[k]() for k in K] + tau2 = [m.tau2[k]() for k in K] + x_ee = [(-7 + conf.l*(cos(m.q0[k]) + cos(m.q0[k]+m.q1[k]) + cos(m.q0[k]+m.q1[k]+m.q2[k])))() for k in K] + y_ee = [(conf.l*(sin(m.q0[k]) + sin(m.q0[k]+m.q1[k]) + sin(m.q0[k]+m.q1[k]+m.q2[k])))() for k in K] + q0 = [m.q0[k]() for k in K] + q1 = [m.q1[k]() for k in K] + q2 = [m.q2[k]() for k in K] + v0 = [m.v0[k]() for k in K] + v1 = [m.v1[k]() for k in K] + v2 = [m.v2[k]() for k in K] + + x0 = -7.0 + x1 = [x0 + conf.l*cos(q0[k]) for k in K] + x2 = [x0 + conf.l*(cos(q0[k]) + cos(q0[k]+q1[k])) for k in K] + y0 = 0.0 + y1 = [conf.l*sin(q0[k]) for k in K] + y2 = [conf.l*(sin(q0[k]) + sin(q0[k]+q1[k])) for k in K] + + cost=m.obj() + + fig = plt.figure(figsize=(12,10)) + plt.suptitle('Discrete model', y=1, fontsize=16) + + ax1 = fig.add_subplot(4, 2, 1) + ax1.plot(conf.dt*K, x_ee, 'ro', linewidth=1, markersize=1) + ax1.plot(conf.dt*K, y_ee, 'bo', linewidth=1, markersize=1) + ax1.set_title('Position',fontsize=16) + ax1.legend(['x','y'],fontsize=16) + ax1.set_ylim(-31, 31) + ax1.grid(True) + + ax2 = fig.add_subplot(4, 2, 3) + ax2.plot(conf.dt*K, q0, 'ro', linewidth=1, markersize=1) + ax2.plot(conf.dt*K, q1, 'bo', linewidth=1, markersize=1) + ax2.plot(conf.dt*K, q2, 'go', linewidth=1, markersize=1) + ax2.set_title('Joint position',fontsize=16) + ax2.legend(['q0','q1','q2'],fontsize=16) + ax2.grid(True) + + ax3 = fig.add_subplot(4, 2, 5) + ax3.plot(conf.dt*K, v0, 'ro', linewidth=1, markersize=1) + ax3.plot(conf.dt*K, v1, 'bo', linewidth=1, markersize=1) + ax3.plot(conf.dt*K, v2, 'go', linewidth=1, markersize=1) + ax3.set_title('Joint velocity',fontsize=16) + ax3.legend(['v0','v1','v2'],fontsize=16) + ax3.grid(True) + + ax4 = fig.add_subplot(4, 2, 7) + ax4.plot(conf.dt*K, tau0, 'ro', linewidth=1, markersize=1) + ax4.plot(conf.dt*K, tau1, 'bo', linewidth=1, markersize=1) + ax4.plot(conf.dt*K, tau2, 'go', linewidth=1, markersize=1) + ax4.legend(['tau0','tau1','tau2'],fontsize=16) + ax4.set_title('Controls',fontsize=16) + + ell1 = Ellipse((conf.XC1, conf.YC1), conf.A1, conf.B1, 0.0) + ell1.set_facecolor([30/255, 130/255, 76/255, 1]) + ell2 = Ellipse((conf.XC2, conf.YC2), conf.A2, conf.B2, 0.0) + ell2.set_facecolor([30/255, 130/255, 76/255, 1]) + ell3 = Ellipse((conf.XC3, conf.YC3), conf.A3, conf.B3, 0.0) + ell3.set_facecolor([30/255, 130/255, 76/255, 1]) + ax3 = fig.add_subplot(1, 2, 2) + ax3.plot(x_ee, y_ee, 'ro', linewidth=1, markersize=2) + N_points_conf = 3 + for g in range(N_points_conf): + index = int(g*K[-1]/(N_points_conf-1)) + if g==0: + 
ax3.plot([x0,x1[index],x2[index],x_ee[index]],[y0,y1[index],y2[index],y_ee[index]],'b',marker = 'o', ms = 5, mec = 'r', linewidth=1, alpha=0.8) #xp, yp, 'r--', + elif g==1: + ax3.plot([x0,x1[index],x2[index],x_ee[index]],[y0,y1[index],y2[index],y_ee[index]],'dimgrey',marker = 'o', ms = 5, mec = 'r', linewidth=1, alpha=0.8) #xp, yp, 'r--', + else: + ax3.plot([x0,x1[index],x2[index],x_ee[index]],[y0,y1[index],y2[index],y_ee[index]],'orange',marker = 'o', ms = 5, mec = 'r', linewidth=1, alpha=0.8) #xp, yp, 'r--', + ax3.add_artist(ell1) + ax3.add_artist(ell2) + ax3.add_artist(ell3) + ax3.set_xlim(-41, 31) + ax3.set_aspect('equal', 'box') + ax3.set_title("Plane, OBJ = {}".format(cost),fontsize=16) + ax3.set_xlabel('X',fontsize=16) + ax3.set_ylabel('Y',fontsize=16) + ax3.set_ylim(-35, 35) + + + for ax in [ax1, ax2, ax3]: + ax.grid(True) + + fig.tight_layout() + plt.show() + +# Create TO problem +def TO_manipulator(ICS, init_q0, init_q1, init_q2, init_v0, init_v1, init_v2, init_a0, init_a1, init_a2, init_tau0, init_tau1, init_tau2, N, init_TO=None): + m = ConcreteModel() + m.k = RangeSet(0, N) + + if init_TO != None: + init_TO_controls = init_TO[0] + init_TO_states = init_TO[1] + m.tau0 = Var(m.k, initialize=init_tau0(m,m.k,init_TO_controls), bounds=(-conf.tau_upper_bound, conf.tau_upper_bound)) + m.tau1 = Var(m.k, initialize=init_tau1(m,m.k,init_TO_controls), bounds=(-conf.tau_upper_bound, conf.tau_upper_bound)) + m.tau2 = Var(m.k, initialize=init_tau2(m,m.k,init_TO_controls), bounds=(-conf.tau_upper_bound, conf.tau_upper_bound)) + m.q0 = Var(m.k, initialize=init_q0(m,m.k,init_TO_states)) + m.q1 = Var(m.k, initialize=init_q1(m,m.k,init_TO_states)) + m.q2 = Var(m.k, initialize=init_q2(m,m.k,init_TO_states)) + m.v0 = Var(m.k, initialize=init_v0(m,m.k,init_TO_states)) + m.v1 = Var(m.k, initialize=init_v1(m,m.k,init_TO_states)) + m.v2 = Var(m.k, initialize=init_v2(m,m.k,init_TO_states)) + else: + m.tau0 = Var(m.k, initialize=init_tau0(m,m.k), bounds=(-conf.tau_upper_bound, conf.tau_upper_bound)) + m.tau1 = Var(m.k, initialize=init_tau1(m,m.k), bounds=(-conf.tau_upper_bound, conf.tau_upper_bound)) + m.tau2 = Var(m.k, initialize=init_tau2(m,m.k), bounds=(-conf.tau_upper_bound, conf.tau_upper_bound)) + m.q0 = Var(m.k, initialize=init_q0(m,m.k,ICS)) + m.q1 = Var(m.k, initialize=init_q1(m,m.k,ICS)) + m.q2 = Var(m.k, initialize=init_q2(m,m.k,ICS)) + m.v0 = Var(m.k, initialize=init_v0(m,m.k,ICS)) + m.v1 = Var(m.k, initialize=init_v1(m,m.k,ICS)) + m.v2 = Var(m.k, initialize=init_v2(m,m.k,ICS)) + + m.a0 = Var(m.k, initialize=init_a0) + m.a1 = Var(m.k, initialize=init_a1) + m.a2 = Var(m.k, initialize=init_a2) + + m.icfix_q0 = Constraint(rule = lambda m: m.q0[0] == ICS[0]) + m.icfix_q1 = Constraint(rule = lambda m: m.q1[0] == ICS[1]) + m.icfix_q2 = Constraint(rule = lambda m: m.q2[0] == ICS[2]) + m.icfix_v0 = Constraint(rule = lambda m: m.v0[0] == ICS[3]) + m.icfix_v1 = Constraint(rule = lambda m: m.v1[0] == ICS[4]) + m.icfix_v2 = Constraint(rule = lambda m: m.v2[0] == ICS[5]) + + ### Integration ### + m.v0_update = Constraint(m.k, rule = lambda m, k: + m.v0[k+1] == m.v0[k] + conf.dt*m.a0[k] if k < N else Constraint.Skip) + + m.v1_update = Constraint(m.k, rule = lambda m, k: + m.v1[k+1] == m.v1[k] + conf.dt*m.a1[k] if k < N else Constraint.Skip) + + m.v2_update = Constraint(m.k, rule = lambda m, k: + m.v2[k+1] == m.v2[k] + conf.dt*m.a2[k] if k < N else Constraint.Skip) + + m.q0_update = Constraint(m.k, rule = lambda m, k: + m.q0[k+1] == m.q0[k] + conf.dt*m.v0[k] if k < N else Constraint.Skip) + + m.q1_update = 
Constraint(m.k, rule = lambda m, k: + m.q1[k+1] == m.q1[k] + conf.dt*m.v1[k] if k < N else Constraint.Skip) + + m.q2_update = Constraint(m.k, rule = lambda m, k: + m.q2[k+1] == m.q2[k] + conf.dt*m.v2[k] if k < N else Constraint.Skip) + + + ### Dyamics ### + m.EoM_0 = Constraint(m.k, rule = lambda m, k: + (1/4)*(4*conf.Iz*m.a2[k] + conf.l**2*conf.M*m.a2[k] + 2*conf.l**2*conf.M*m.a2[k]*cos(m.q2[k]) + 2*conf.l**2*conf.M*m.a2[k]*cos(m.q1[k] + m.q2[k]) + 2*m.a1[k]*(4*conf.Iz + 3*conf.l**2*conf.M + 3*conf.l**2*conf.M*cos(m.q1[k]) + 2*conf.l**2*conf.M*cos(m.q2[k]) + conf.l**2*conf.M*cos(m.q1[k] + m.q2[k])) + m.a0[k]*(12*conf.Iz + 15*conf.l**2*conf.M + + 12*conf.l**2*conf.M*cos(m.q1[k]) + 4*conf.l**2*conf.M*cos(m.q2[k]) + 4*conf.l**2*conf.M*cos(m.q1[k] + m.q2[k])) - 4*m.tau0[k] - 12*conf.l**2*conf.M*sin(m.q1[k])*m.v0[k]*m.v1[k] - 4*conf.l**2*conf.M*sin(m.q1[k] + m.q2[k])*m.v0[k]*m.v1[k] - 6*conf.l**2*conf.M*sin(m.q1[k])*m.v1[k]**2 - 2*conf.l**2*conf.M*sin(m.q1[k] + m.q2[k])*m.v1[k]**2 + - 4*conf.l**2*conf.M*sin(m.q2[k])*m.v0[k]*m.v2[k] - 4*conf.l**2*conf.M*sin(m.q1[k] + m.q2[k])*m.v0[k]*m.v2[k] - 4*conf.l**2*conf.M*sin(m.q2[k])*m.v1[k]*m.v2[k] - 4*conf.l**2*conf.M*sin(m.q1[k] + m.q2[k])*m.v1[k]*m.v2[k] - 2*conf.l**2*conf.M*sin(m.q2[k])*m.v2[k]**2 - 2*conf.l**2*conf.M*sin(m.q1[k] + m.q2[k])*m.v2[k]**2) == 0 if k < N else Constraint.Skip) + + m.EoM_1 = Constraint(m.k, rule = lambda m, k: + (1/4)*(4*conf.Iz*m.a2[k] + conf.l**2*conf.M*m.a2[k] + 2*conf.l**2*conf.M*m.a2[k]*cos(m.q2[k]) + m.a1[k]*(8*conf.Iz + 6*conf.l**2*conf.M + 4*conf.l**2*conf.M*cos(m.q2[k])) + 2*m.a0[k]*(4*conf.Iz + 3*conf.l**2*conf.M + 3*conf.l**2*conf.M*cos(m.q1[k]) + 2*conf.l**2*conf.M*cos(m.q2[k]) + conf.l**2*conf.M*cos(m.q1[k] + m.q2[k])) + - 4*m.tau1[k] + 6*conf.l**2*conf.M*sin(m.q1[k])*m.v0[k]**2 + 2*conf.l**2*conf.M*sin(m.q1[k] + m.q2[k])*m.v0[k]**2 - 4*conf.l**2*conf.M*sin(m.q2[k])*m.v0[k]*m.v2[k] - 4*conf.l**2*conf.M*sin(m.q2[k])*m.v1[k]*m.v2[k] - 2*conf.l**2*conf.M*sin(m.q2[k])*m.v2[k]**2) == 0 if k < N else Constraint.Skip) + + m.EoM_2 = Constraint(m.k, rule = lambda m, k: + (1/4)*(4*conf.Iz*m.a2[k] + conf.l**2*conf.M*m.a2[k] + m.a1[k]*(4*conf.Iz + conf.l**2*conf.M + 2*conf.l**2*conf.M*cos(m.q2[k])) + m.a0[k]*(4*conf.Iz + conf.l**2*conf.M + 2*conf.l**2*conf.M*cos(m.q2[k]) + 2*conf.l**2*conf.M*cos(m.q1[k] + m.q2[k])) - 4*m.tau2[k] + 2*conf.l**2*conf.M*sin(m.q2[k])*m.v0[k]**2 + 2*conf.l**2*conf.M*sin(m.q1[k] + + m.q2[k])*m.v0[k]**2 + 4*conf.l**2*conf.M*sin(m.q2[k])*m.v0[k]*m.v1[k] + 2*conf.l**2*conf.M*sin(m.q2[k])*m.v1[k]**2) == 0 if k < N else Constraint.Skip) + + ### Penalties representing the obstacle ### + m.ell1_penalty = sum((log(exp(conf.alpha*-((((-7 + conf.l*(cos(m.q0[k]) + cos(m.q0[k]+m.q1[k]) + cos(m.q0[k]+m.q1[k]+m.q2[k])))-conf.XC1)**2)/((conf.A1/2)**2) + (((conf.l*(sin(m.q0[k]) + sin(m.q0[k]+m.q1[k]) + sin(m.q0[k]+m.q1[k]+m.q2[k])))-conf.YC1)**2)/((conf.B1/2)**2) - 1.0)) + 1)/conf.alpha) for k in m.k) - (log(exp(conf.alpha*-(((-7 + conf.l*(cos(m.q0[0]) + cos(m.q0[0]+m.q1[0]) + cos(m.q0[0]+m.q1[0]+m.q2[0]))-conf.XC1)**2)/((conf.A1/2)**2) + ((conf.l*(sin(m.q0[0]) + sin(m.q0[0]+m.q1[0]) + sin(m.q0[0]+m.q1[0]+m.q2[0]))-conf.YC1)**2)/((conf.B1/2)**2) - 1.0)) + 1)/conf.alpha) + m.ell2_penalty = sum((log(exp(conf.alpha*-((((-7 + conf.l*(cos(m.q0[k]) + cos(m.q0[k]+m.q1[k]) + cos(m.q0[k]+m.q1[k]+m.q2[k])))-conf.XC2)**2)/((conf.A2/2)**2) + (((conf.l*(sin(m.q0[k]) + sin(m.q0[k]+m.q1[k]) + sin(m.q0[k]+m.q1[k]+m.q2[k])))-conf.YC2)**2)/((conf.B2/2)**2) - 1.0)) + 1)/conf.alpha) for k in m.k) - (log(exp(conf.alpha*-(((-7 + 
conf.l*(cos(m.q0[0]) + cos(m.q0[0]+m.q1[0]) + cos(m.q0[0]+m.q1[0]+m.q2[0]))-conf.XC2)**2)/((conf.A2/2)**2) + ((conf.l*(sin(m.q0[0]) + sin(m.q0[0]+m.q1[0]) + sin(m.q0[0]+m.q1[0]+m.q2[0]))-conf.YC2)**2)/((conf.B2/2)**2) - 1.0)) + 1)/conf.alpha) + m.ell3_penalty = sum((log(exp(conf.alpha*-((((-7 + conf.l*(cos(m.q0[k]) + cos(m.q0[k]+m.q1[k]) + cos(m.q0[k]+m.q1[k]+m.q2[k])))-conf.XC3)**2)/((conf.A3/2)**2) + (((conf.l*(sin(m.q0[k]) + sin(m.q0[k]+m.q1[k]) + sin(m.q0[k]+m.q1[k]+m.q2[k])))-conf.YC3)**2)/((conf.B3/2)**2) - 1.0)) + 1)/conf.alpha) for k in m.k) - (log(exp(conf.alpha*-(((-7 + conf.l*(cos(m.q0[0]) + cos(m.q0[0]+m.q1[0]) + cos(m.q0[0]+m.q1[0]+m.q2[0]))-conf.XC3)**2)/((conf.A3/2)**2) + ((conf.l*(sin(m.q0[0]) + sin(m.q0[0]+m.q1[0]) + sin(m.q0[0]+m.q1[0]+m.q2[0]))-conf.YC3)**2)/((conf.B3/2)**2) - 1.0)) + 1)/conf.alpha) + + ### Control effort term ### + m.u_obj = sum((m.tau0[k]**2 + m.tau1[k]**2 + m.tau2[k]**2) for k in m.k) - (m.tau0[N]**2 + m.tau1[N]**2 + m.tau2[N]**2) + + ### Distence to target term (quadratic term + log valley centered at target) ### + m.dist_cost = sum((conf.w_d*(((-7 + conf.l*(cos(m.q0[k]) + cos(m.q0[k]+m.q1[k]) + cos(m.q0[k]+m.q1[k]+m.q2[k])))-conf.x_des)**2 + ((conf.l*(sin(m.q0[k]) + sin(m.q0[k]+m.q1[k]) + sin(m.q0[k]+m.q1[k]+m.q2[k])))-conf.y_des)**2) - conf.w_peak*(log(exp(conf.alpha2*-(sqrt(((-7 + conf.l*(cos(m.q0[k]) + cos(m.q0[k]+m.q1[k]) + cos(m.q0[k]+m.q1[k]+m.q2[k])))-conf.x_des)**2 +0.1) - 0.1 + sqrt(((conf.l*(sin(m.q0[k]) + sin(m.q0[k]+m.q1[k]) + sin(m.q0[k]+m.q1[k]+m.q2[k])))-conf.y_des)**2 +0.1) - 0.1 -2*sqrt(0.1))) + 1)/conf.alpha2)) for k in m.k) - (conf.w_d*((-7 + conf.l*(cos(m.q0[0]) + cos(m.q0[0]+m.q1[0]) + cos(m.q0[0]+m.q1[0]+m.q2[0]))-conf.x_des)**2 + (conf.l*(sin(m.q0[0]) + sin(m.q0[0]+m.q1[0]) + sin(m.q0[0]+m.q1[0]+m.q2[0]))-conf.y_des)**2) - conf.w_peak*(log(exp(conf.alpha2*-(sqrt((-7 + conf.l*(cos(m.q0[0]) + cos(m.q0[0]+m.q1[0]) + cos(m.q0[0]+m.q1[0]+m.q2[0]))-conf.x_des)**2 +0.1) - 0.1 + sqrt((conf.l*(sin(m.q0[0]) + sin(m.q0[0]+m.q1[0]) + sin(m.q0[0]+m.q1[0]+m.q2[0]))-conf.y_des)**2 +0.1) - 0.1 -2*sqrt(0.1))) + 1)/conf.alpha2)) + + # m.v = sum(((m.v0[k])**2 + (m.v1[k])**2 + (m.v2[k])**2)*(k/m.k[-1])**2 for k in m.k) + m.v = (m.v0[N])**2 + (m.v1[N])**2 + (m.v2[N])**2 + + m.obj = Objective(expr = (m.dist_cost + conf.w_v*m.v + conf.w_ob1*m.ell1_penalty + conf.w_ob2*m.ell2_penalty + conf.w_ob3*m.ell3_penalty + conf.w_u*m.u_obj - 10000)/100, sense=minimize) + return m + +# Simulate dynamics +def simulate(dt,state,u): + + # Create robot model in Pinocchio with q_init as initial configuration and v_init as initial velocities + q_init = np.zeros(int((len(state)-1)/2)) + for state_index in range(int((len(state)-1)/2)): + q_init[state_index] = state[state_index] + q_init = q_init.T + v_init = np.zeros(int((len(state)-1)/2)) + for state_index in range(int((len(state)-1)/2)): + v_init[state_index] = state[int((len(state)-1)/2)+state_index] + v_init = v_init.T + r = load_urdf(URDF_PATH) + robot = RobotWrapper(r.model, r.collision_model, r.visual_model) + simu = RobotSimulator(robot, q_init, v_init, simulation_type, tau_coulomb_max) + + # Simulate control u + simu.simulate(u, dt, 1) + q_next, v_next, a_next = np.copy(simu.q), np.copy(simu.v), np.copy(simu.dv) + q0_next, q1_next, q2_next = q_next[0],q_next[1],q_next[2] + v0_next, v1_next, v2_next = v_next[0],v_next[1],v_next[2] + + t_next = state[-1] + dt + + return np.array([q0_next,q1_next,q2_next,v0_next,v1_next,v2_next,t_next]) + +if __name__ == "__main__": + from CACTO_manipulator3DoF_pyomo import 
get_actor + plt.rcParams['xtick.labelsize'] = 22 + plt.rcParams['ytick.labelsize'] = 22 + + N = conf.NSTEPS # Number of timesteps + num_states = 7 # Number of states + num_actions = 3 # Number of actions + q_norm = 15 # Joint angle normalization + v_norm = 10 # Velocity normalization + + simulate_coulomb_friction = 0 # To simulate friction + simulation_type = 'euler' # Either 'timestepping' or 'euler' + tau_coulomb_max = 0*np.ones(3) # Expressed as percentage of torque max + + + # Path to the urdf file + URDF_PATH = "urdf/planar_manipulator_3dof.urdf" + + ## Inverse kinematics considering the EE being in ICS_ee + # init_states_sim = [] + # ICS_ee_list = [[10,0],[-10,0],[0,10],[0,-10],[7,7],[-7,7],[7,-7],[-7,-7],[6,0]] + # for i in range(len(ICS_ee_list)): + # ICS_ee = ICS_ee_list[i] + # phi = math.atan2(ICS_ee[1],(ICS_ee[0]+7)) # SUM OF JOINT ANGLES FIXED = orientation of the segment connecting the base with the EE + # X3rd_joint = (ICS_ee[0]+7) - conf.l* math.cos(phi) + # Y3rd_joint = (ICS_ee[1]) - conf.l* math.sin(phi) + # c2 = (X3rd_joint**2 + Y3rd_joint**2 -2*conf.l**2)/(2*conf.l**2) + # if ICS_ee[1]>=0: + # s2 = math.sqrt(1-c2**2) + # else: + # s2 = -math.sqrt(1-c2**2) + # s1 = ((conf.l + conf.l*c2)*Y3rd_joint - conf.l*s2*X3rd_joint)/(X3rd_joint**2 + Y3rd_joint**2) + # c1 = ((conf.l + conf.l*c2)*X3rd_joint - conf.l*s2*Y3rd_joint)/(X3rd_joint**2 + Y3rd_joint**2) + # ICS_q0 = math.atan2(s1,c1) + # ICS_q1 = math.atan2(s2,c2) + # init_states_sim.append(np.array([ICS_q0,ICS_q1,phi-ICS_q0-ICS_q1,0.0,0.0,0.0,0.0])) # Initial state + + obj_arr = np.array([]) + + # Set of initial states + # init_states_sim = [np.array([random.uniform(-math.pi,math.pi), random.uniform(-math.pi,math.pi), random.uniform(-math.pi,math.pi), + # random.uniform(-math.pi/4,math.pi/4), random.uniform(-math.pi/4,math.pi/4), random.uniform(-math.pi/4,math.pi/4)])] + init_states_sim = [np.array([pi/4,-pi/8,-pi/8,0.0,0.0,0.0,0.0]),np.array([-pi/4,pi/8,pi/8,0.0,0.0,0.0,0.0]),np.array([pi/2,0.0,0.0,0.0,0.0,0.0,0.0]), + np.array([-pi/2,0.0,0.0,0.0,0.0,0.0,0.0]),np.array([3*pi/4,0.0,0.0,0.0,0.0,0.0,0.0]),np.array([-3*pi/4,0.0,0.0,0.0,0.0,0.0,0.0]), + np.array([pi/4,0.0,0.0,0.0,0.0,0.0,0.0]),np.array([-pi/4,0.0,0.0,0.0,0.0,0.0,0.0]),np.array([pi,0.0,0.0,0.0,0.0,0.0,0.0])] + + # Lists to store TO solutions + x_ee_all_sim, y_ee_all_sim, q0_all_sim, q1_all_sim, q2_all_sim, v0_all_sim, v1_all_sim, v2_all_sim, a0_all_sim, a1_all_sim, a2_all_sim, tau0_all_sim, tau1_all_sim, tau2_all_sim = [],[],[],[],[],[],[],[],[],[],[],[],[],[] + x_ee_init_all_sim, y_ee_init_all_sim, q0_init_all_sim, q1_init_all_sim, q2_init_all_sim, v0_init_all_sim, v1_init_all_sim, v2_init_all_sim, a0_init_all_sim, a1_init_all_sim, a2_init_all_sim = [],[],[],[],[],[],[],[],[],[],[] + + CACTO_ROLLOUT = 1 # Flag to use CACTO warm-start + N_try = 1 # Number of CACTO training whose policy is going to be rolled-out + NORMALIZE_INPUTS = 1 # If states were normalized during training + state_norm_arr = np.array([q_norm,q_norm,q_norm, + v_norm,v_norm,v_norm,int(N*conf.dt)]) + + if CACTO_ROLLOUT: + NNs_path = './Results/NNs/N_try_{}'.format(N_try) + + actor_model = get_actor() + + ## If loading final weights + # actor_model.load_weights(NNs_path+"Manipulator3DoF_final_actor.h5") + + ## If loading weights saved before completing the training + nupdates = 128000 + actor_model.load_weights(NNs_path+"/Manipulator_{}.h5".format(nupdates)) + + + # START SOLVING TO PROBLEMS + for i in range(len(init_states_sim)): + + ICS = init_states_sim[i] + + if CACTO_ROLLOUT: + # POLICY 
ROLLOUT + q0_CACTO = [init_states_sim[i][0]] + q1_CACTO = [init_states_sim[i][1]] + q2_CACTO = [init_states_sim[i][2]] + v0_CACTO = [init_states_sim[i][3]] + v1_CACTO = [init_states_sim[i][4]] + v2_CACTO = [init_states_sim[i][5]] + a0_CACTO = [0.0] + a1_CACTO = [0.0] + a2_CACTO = [0.0] + tau0_CACTO = [] + tau1_CACTO = [] + tau2_CACTO = [] + step_counter_local = 0 + prev_state_local = np.copy(init_states_sim[i]) + + if NORMALIZE_INPUTS: + prev_state_norm = prev_state_local / state_norm_arr + prev_state_norm[-1] = 2*prev_state_norm[-1] - 1 + tf_prev_state = tf.expand_dims(tf.convert_to_tensor(prev_state_norm), 0) + else: + tf_prev_state = tf.expand_dims(tf.convert_to_tensor(prev_state_local), 0) + + init_TO_states = np.zeros((num_states, N+1)) + init_TO_states[0][0] = prev_state_local[0] + init_TO_states[1][0] = prev_state_local[1] + init_TO_states[2][0] = prev_state_local[2] + init_TO_states[3][0] = prev_state_local[3] + init_TO_states[4][0] = prev_state_local[4] + init_TO_states[5][0] = prev_state_local[5] + init_TO_controls = np.zeros((num_actions, N+1)) + init_TO_controls[0][0] = tf.squeeze(actor_model(tf_prev_state)).numpy()[0] + init_TO_controls[1][0] = tf.squeeze(actor_model(tf_prev_state)).numpy()[1] + init_TO_controls[2][0] = tf.squeeze(actor_model(tf_prev_state)).numpy()[2] + init_prev_state = np.copy(prev_state_local) + + for i in range(1, N+1): + init_next_state = simulate(conf.dt,init_prev_state,np.array([init_TO_controls[0][i-1],init_TO_controls[1][i-1],init_TO_controls[2][i-1]])) + init_TO_states[0][i] = init_next_state[0] + init_TO_states[1][i] = init_next_state[1] + init_TO_states[2][i] = init_next_state[2] + init_TO_states[3][i] = init_next_state[3] + init_TO_states[4][i] = init_next_state[4] + init_TO_states[5][i] = init_next_state[5] + if NORMALIZE_INPUTS: + init_next_state_norm = init_next_state / state_norm_arr + init_next_state_norm[-1] = 2*init_next_state_norm[-1] - 1 + init_tf_next_state = tf.expand_dims(tf.convert_to_tensor(init_next_state_norm), 0) + else: + init_tf_next_state = tf.expand_dims(tf.convert_to_tensor(init_next_state), 0) + init_TO_controls[0][i] = tf.squeeze(actor_model(init_tf_next_state)).numpy()[0] + init_TO_controls[1][i] = tf.squeeze(actor_model(init_tf_next_state)).numpy()[1] + init_TO_controls[2][i] = tf.squeeze(actor_model(init_tf_next_state)).numpy()[2] + init_prev_state = np.copy(init_next_state) + + mdl = TO_manipulator(ICS, init_q0, init_q1, init_q2, init_v0, init_v1, init_v2, init_0, init_0, init_0, init_tau0, init_tau1, init_tau2, N, init_TO = [init_TO_controls, init_TO_states]) + else: + # Create TO problem choosing how to warm-start the variables between 1)ICS; 2)0s; 3)Random + mdl = TO_manipulator(ICS, init_q0_ICS, init_q1_ICS, init_q2_ICS, init_v0_ICS, init_v1_ICS, init_v2_ICS, init_0, init_0, init_0, init_0, init_0, init_0, N) + # mdl = TO_manipulator(ICS, init_0, init_0, init_0, init_0, init_0, init_0, init_0, init_0, init_0, init_0, init_0, init_0, conf.tau_upper_bound, conf.Iz, conf.M, conf.l, conf.w_d, conf.w_v, conf.w_peak, conf.w_ob1, conf.w_ob2, conf.w_ob3, conf.w_u, conf.x_des, conf.y_des, conf.XC1, conf.YC1, conf.XC2, conf.YC2, conf.XC3, conf.YC3, conf.A1, conf.B1, conf.A2, conf.B2, conf.A3, conf.B3, N, conf.dt, conf.alpha, conf.alpha2) + # mdl = TO_manipulator(ICS, init_rand_q0, init_rand_q1, init_rand_q2, init_rand_v0, init_rand_v1, init_rand_v2, init_rand_a0, init_rand_a1, init_rand_a2, init_rand_tau, init_rand_tau, init_rand_tau, conf.tau_upper_bound, conf.Iz, conf.M, conf.l, conf.w_d, conf.w_v, conf.w_peak, conf.w_ob1, 
conf.w_ob2, conf.w_ob3, conf.w_u,conf.x_des, conf.y_des, conf.XC1, conf.YC1, conf.XC2, conf.YC2, conf.XC3, conf.YC3, conf.A1, conf.B1, conf.A2, conf.B2, conf.A3, conf.B3, N, conf.dt, conf.alpha, conf.alpha2) + + # Lists storing values to be plotted + K = np.array([k for k in mdl.k]) + x_ee_init = [-7+conf.l*(cos(mdl.q0[k]()) + cos(mdl.q0[k]()+mdl.q1[k]()) + cos(mdl.q0[k]()+mdl.q1[k]()+mdl.q2[k]())) for k in K] + y_ee_init = [(conf.l*(sin(mdl.q0[k]()) + sin(mdl.q0[k]()+mdl.q1[k]()) + sin(mdl.q0[k]()+mdl.q1[k]()+mdl.q2[k]()))) for k in K] + q0_init = [mdl.q0[k]() for k in K] + q1_init = [mdl.q1[k]() for k in K] + q2_init = [mdl.q2[k]() for k in K] + v0_init = [mdl.v0[k]() for k in K] + v1_init = [mdl.v1[k]() for k in K] + v2_init = [mdl.v2[k]() for k in K] + a0_init = [mdl.a0[k]() for k in K] + a1_init = [mdl.a1[k]() for k in K] + a2_init = [mdl.a2[k]() for k in K] + x_ee_init_all_sim.append(x_ee_init) + y_ee_init_all_sim.append(y_ee_init) + q0_init_all_sim.append(q0_init) + q1_init_all_sim.append(q1_init) + q2_init_all_sim.append(q2_init) + v0_init_all_sim.append(v0_init) + v1_init_all_sim.append(v1_init) + v2_init_all_sim.append(v2_init) + a0_init_all_sim.append(a0_init) + a1_init_all_sim.append(a1_init) + a2_init_all_sim.append(a2_init) + + # Solve TO problem + solver = SolverFactory('ipopt') + solver.options['linear_solver'] = "ma57" + try: + results = solver.solve(mdl,tee=True) + print('***** OBJ = ',mdl.obj(),'*****') + if str(results.solver.termination_condition) != "optimal": + print(results.solver.termination_condition) + continue + else: + obj_arr = np.append(obj_arr,mdl.obj()) + except: + print("***** IPOPT FAILED *****") + continue + + x_ee = [-7+conf.l*(cos(mdl.q0[k]()) + cos(mdl.q0[k]()+mdl.q1[k]()) + cos(mdl.q0[k]()+mdl.q1[k]()+mdl.q2[k]())) for k in K] + y_ee = [(conf.l*(sin(mdl.q0[k]()) + sin(mdl.q0[k]()+mdl.q1[k]()) + sin(mdl.q0[k]()+mdl.q1[k]()+mdl.q2[k]()))) for k in K] + q0 = [mdl.q0[k]() for k in K] + q1 = [mdl.q1[k]() for k in K] + q2 = [mdl.q2[k]() for k in K] + v0 = [mdl.v0[k]() for k in K] + v1 = [mdl.v1[k]() for k in K] + v2 = [mdl.v2[k]() for k in K] + a0 = [mdl.a0[k]() for k in K] + a1 = [mdl.a1[k]() for k in K] + a2 = [mdl.a2[k]() for k in K] + tau0 = [mdl.tau0[k]() for k in K] + tau1 = [mdl.tau1[k]() for k in K] + tau2 = [mdl.tau2[k]() for k in K] + x_ee_all_sim.append(x_ee) + y_ee_all_sim.append(y_ee) + q0_all_sim.append(q0) + q1_all_sim.append(q1) + q2_all_sim.append(q2) + v0_all_sim.append(v0) + v1_all_sim.append(v1) + v2_all_sim.append(v2) + a0_all_sim.append(a0) + a1_all_sim.append(a1) + a2_all_sim.append(a2) + tau0_all_sim.append(tau0) + tau1_all_sim.append(tau1) + tau2_all_sim.append(tau2) + + print(obj_arr) + + ## Print trajectories + # print("sol_tau0 = ",mdl.tau0[:](),"\nsol_tau1 = ",mdl.tau1[:](),"\nsol_tau2 = ",mdl.tau2[:](),"\nsol_q0 = ",mdl.q0[:](),"\nsol_q1 = ",mdl.q1[:](),"\nsol_q2 = ",mdl.q2[:](),"\nsol_v0 = ",mdl.v0[:](),"\nsol_v1 = ",mdl.v1[:](),"\nsol_v2 = ",mdl.v2[:](),"\nsol_a0 = ",mdl.a0[:](),"\nsol_a1 = ",mdl.a1[:](),"\nsol_a2 = ",mdl.a2[:]()) + + # Plot stuff + plot_results_TO_all(q0_all_sim,q1_all_sim,x_ee_init_all_sim,y_ee_init_all_sim,x_ee_all_sim,y_ee_all_sim) + # plot_results_TO_all_trajs(q0_all_sim,q1_all_sim,q2_all_sim,v0_all_sim,v1_all_sim,v2_all_sim,tau0_all_sim,tau1_all_sim,tau2_all_sim) + # plot_results_TO(mdl) + + # pickle.dump({"q0":q0_all_sim,"q1":q1_all_sim,"q2":q2_all_sim,"v0":v0_all_sim,"v1":v1_all_sim,"v2":v2_all_sim,"tau0":tau0_all_sim,"tau1":tau1_all_sim,"tau2":tau2_all_sim},open('9_trajectories_ICSinit', 'wb')) \ No 
newline at end of file diff --git a/config_manipulator3DoF_pyomo.py b/config_manipulator3DoF_pyomo.py new file mode 100644 index 0000000..f251e78 --- /dev/null +++ b/config_manipulator3DoF_pyomo.py @@ -0,0 +1,111 @@ +import numpy as np + +ep_no_update = 100 # Episodes to wait before starting to update the NNs +NEPISODES = 50000+ep_no_update # Max training episodes +EP_UPDATE = 25 # Number of episodes before updating critic and actor +NSTEPS = 100 # Max episode length +CRITIC_LEARNING_RATE = 0.001 # Learning rate for the critic network +ACTOR_LEARNING_RATE = 0.0005 # Learning rate for the policy network +UPDATE_RATE = 0.0005 # Homotopy rate to update the target critic network +UPDATE_LOOPS = 160 # Number of updates of both critic and actor performed every EP_UPDATE episodes +REPLAY_SIZE = 2**15 # Size of the replay buffer +BATCH_SIZE = 64 # Size of the mini-batch +NH1 = 256 # 1st hidden layer size +NH2 = 256 # 2nd hidden layer size +dt = 0.05 # Timestep + +LR_SCHEDULE = 1 # Flag to use a scheduler for the learning rates + +boundaries_schedule_LR_C = [200*REPLAY_SIZE/BATCH_SIZE, + 300*REPLAY_SIZE/BATCH_SIZE, + 400*REPLAY_SIZE/BATCH_SIZE, + 500*REPLAY_SIZE/BATCH_SIZE] +# Values of critic LR +values_schedule_LR_C = [CRITIC_LEARNING_RATE, + CRITIC_LEARNING_RATE/2, + CRITIC_LEARNING_RATE/4, + CRITIC_LEARNING_RATE/8, + CRITIC_LEARNING_RATE/16] + +# Numbers of critic updates after which the actor LR is changed (based on values_schedule_LR_A) +boundaries_schedule_LR_A = [200*REPLAY_SIZE/BATCH_SIZE, + 300*REPLAY_SIZE/BATCH_SIZE, + 400*REPLAY_SIZE/BATCH_SIZE, + 500*REPLAY_SIZE/BATCH_SIZE] +# Values of actor LR +values_schedule_LR_A = [ACTOR_LEARNING_RATE, + ACTOR_LEARNING_RATE/2, + ACTOR_LEARNING_RATE/4, + ACTOR_LEARNING_RATE/8, + ACTOR_LEARNING_RATE/16] + +prioritized_replay_alpha = 0 # α determines how much prioritization is used, set to 0 to use a normal buffer. + # Used to define the probability of sampling transition i --> P(i) = p_i**α / sum(p_k**α) where p_i is the priority of transition i +''' +Prioritized replay introduces bias because it changes the sample distribution in an uncontrolled fashion (while the expectation is estimated with the mean +so it would need a uniform sample distribution), and therefore changes the solution that the estimates will converge to. We can correct this bias by using importance-sampling weights: w_i = (1 / (N*P(i)) )**β +that fully compensates for the non-uniform probabilities P(i) if β = 1. The unbiased nature of the updates is most important near convergence at the end of training, as the process is highly non-stationary anyway, +due to changing policies, state distributions and bootstrap targets; so a small bias can be ignored in this context. +''' +prioritized_replay_beta0 = 0.6 +prioritized_replay_beta_iters = None # Therefore let's exploit the flexibility of annealing the amount of IS correction over time, by defining a schedule on the exponent β + # that from its initial value β0 reaches 1 only at the end of learning. 
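A minimal sketch (not part of this configuration file) of how the prioritized-replay exponents defined here are typically combined: β is annealed linearly from prioritized_replay_beta0 towards 1 over training, and the resulting value is passed to PrioritizedReplayBuffer.sample(), which computes the importance-sampling weights w_i = (N*P(i))**(-β) (see replay_buffer.py later in this patch). The helper name linear_beta_schedule and its arguments are illustrative only.

def linear_beta_schedule(update_idx, total_updates=prioritized_replay_beta_iters, beta0=prioritized_replay_beta0):
    # Linearly anneal the IS exponent from beta0 to 1, so that the bias correction
    # is complete (beta = 1) near the end of training, when it matters most.
    if total_updates is None:   # no schedule configured: keep beta fixed at beta0
        return beta0
    fraction = min(float(update_idx) / total_updates, 1.0)
    return beta0 + fraction * (1.0 - beta0)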
+prioritized_replay_eps = 1e-5 # It's a small positive constant that prevents the edge-case of transitions not being revisited once their error is zero + +NORMALIZE_INPUTS = 1 # Flag to normalize inputs (state and action) + +num_states = 7 # Number of states +num_actions = 3 # Number of actions +tau_upper_bound = 200 # Action upper bound +tau_lower_bound = -200 # Action lower bound +state_norm_arr = np.array([15,15,15,10, + 10,10,int(NSTEPS*dt)]) # Array used to normalize states + +EPISODE_CRITIC_PRETRAINING = 0 # Episodes of critic pretraining +EPISODE_ICS_INIT = 0 # Episodes where ICS warm-starting is used instead of actor rollout + +wreg_l1_A = 1e-2 # Weight of L1 regularization in actor's network +wreg_l2_A = 1e-2 # Weight of L2 regularization in actor's network +wreg_l1_C = 1e-2 # Weight of L1 regularization in critic's network +wreg_l2_C = 1e-2 # Weight of L2 regularization in critic's network + +TD_N = 1 # Flag to use n-step TD rather than 1-step TD +nsteps_TD_N = 1 # Number of lookahead steps + +XC1 = -2.0 # X coord center ellipse 1 +YC1 = 0.0 # Y coord center ellipse 1 +A1 = 6 # Width ellipse 1 +B1 = 10 # Height ellipse 1 +XC2 = 3.0 # X coord center ellipse 2 +YC2 = 4.0 # Y coord center ellipse 2 +A2 = 12 # Width ellipse 2 +B2 = 4 # Height ellipse 2 +XC3 = 3.0 # X coord center ellipse 3 +YC3 = -4.0 # Y coord center ellipse 3 +A3 = 12 # Width ellipse 3 +B3 = 4 # Height ellipse 3 + +w_d = 100 # Distance from target weight +w_v = 10000 # Velocity weight +w_u = 1 # Control effort weight +w_peak = 500000 # Target threshold weight +w_ob1 = 5000000 # Ellipse 1 weight +w_ob2 = 5000000 # Ellipse 2 weight +w_ob3 = 5000000 # Ellipse 3 weight +alpha = 50 # Soft abs coefficient (obstacle) +alpha2 = 50 # Soft abs coefficient (peak) + +M = 0.5 # Link mass +l = 10 # Link length +Iz = (M*l**2)/3 # Inertia around z axis passing through an end of a link of mass M and length l +x_base = -7.0 # x coord base +y_base = 0.0 # y coord base +x_des = -20.0 # Target x position +y_des = 0.0 # Target y position +TARGET_STATE = np.array([x_des,y_des]) # Target state + + +''' Needed in dynamics_manipulator3DoF ''' +simulate_coulomb_friction = 0 # To simulate friction +simulation_type = 'euler' # Either 'timestepping' or 'euler' +tau_coulomb_max = 0*np.ones(3) # Expressed as percentage of torque max diff --git a/dynamics_manipulator3DoF.py b/dynamics_manipulator3DoF.py new file mode 100644 index 0000000..4126ccf --- /dev/null +++ b/dynamics_manipulator3DoF.py @@ -0,0 +1,538 @@ +import os +import math +import gepetto.corbaserver +import time +import subprocess +import pinocchio as pin +from pinocchio.robot_wrapper import RobotWrapper as PinocchioRobotWrapper +import numpy as np +from numpy.linalg import norm +import matplotlib.pyplot as plt +import random + +class Contact: + ''' A contact between a contact-point and a contact-surface + ''' + def __init__(self, contact_point, contact_surface): + self.cp = contact_point + self.cs = contact_surface + self.reset_contact_position() + + def reset_contact_position(self): + # Initialize anchor point p0, that is the initial (0-load) position of the spring + self.p0 = self.cp.get_position() + self.in_contact = True + + def compute_force(self): + self.f, self.p0 = self.cs.compute_force(self.cp, self.p0) + return self.f + + def get_jacobian(self): + return self.cp.get_jacobian() + +class RobotWrapper(PinocchioRobotWrapper): + + @staticmethod + def BuildFromURDF(filename, package_dirs=None, root_joint=None, verbose=False, meshLoader=None): + robot = RobotWrapper() + 
robot.initFromURDF(filename, package_dirs, root_joint, verbose, meshLoader) + return robot + + @property + def na(self): + if(self.model.joints[0].nq==7): + return self.model.nv-6 + return self.model.nv + + def mass(self, q, update=True): + if(update): + return pin.crba(self.model, self.data, q) + return self.data.M + + def nle(self, q, v, update=True): + if(update): + return pin.nonLinearEffects(self.model, self.data, q, v) + return self.data.nle + + def com(self, q=None, v=None, a=None, update=True): + if(update==False or q is None): + return PinocchioRobotWrapper.com(self, q) + if a is None: + if v is None: + return PinocchioRobotWrapper.com(self, q) + return PinocchioRobotWrapper.com(self, q, v) + return PinocchioRobotWrapper.com(self, q, v,a) + + def Jcom(self, q, update=True): + if(update): + return pin.jacobianCenterOfMass(self.model, self.data, q) + return self.data.Jcom + + def momentumJacobian(self, q, v, update=True): + if(update): + pin.ccrba(self.model, self.data, q, v) + return self.data.Ag + + def computeAllTerms(self, q, v): + ''' pin.computeAllTerms is equivalent to calling: + pinocchio::forwardKinematics + pinocchio::crba + pinocchio::nonLinearEffects + pinocchio::computeJointJacobians + pinocchio::centerOfMass + pinocchio::jacobianCenterOfMass + pinocchio::kineticEnergy + pinocchio::potentialEnergy + This is too much for our needs, so we call only the functions + we need, including those for the frame kinematics + ''' + #pin.computeAllTerms(self.model, self.data, q, v); + pin.forwardKinematics(self.model, self.data, q, v, np.zeros(self.model.nv)) + pin.computeJointJacobians(self.model, self.data) + pin.updateFramePlacements(self.model, self.data) + pin.crba(self.model, self.data, q) + pin.nonLinearEffects(self.model, self.data, q, v) + + def forwardKinematics(self, q, v=None, a=None): + if v is not None: + if a is not None: + pin.forwardKinematics(self.model, self.data, q, v, a) + else: + pin.forwardKinematics(self.model, self.data, q, v) + else: + pin.forwardKinematics(self.model, self.data, q) + + def inverseKinematics(self, ee_coords): + ''' Inverse kinematics algorithm to compute a joint configuration given the EE coordinates''' + + oMdes = pin.SE3(np.eye(3), np.array([ee_coords[0], ee_coords[1], 0.0])) + for j in range(100): + q = np.array([random.uniform(-math.pi,math.pi),random.uniform(-math.pi,math.pi),random.uniform(-math.pi,math.pi)]) + # q = pin.neutral(self.model) + + pin.forwardKinematics(self.model, self.data, q) + pin.updateFramePlacements(self.model, self.data) + eps = 1e-4 + IT_MAX = 1000 + DT = 1e-1 + damp = 1e-12 + frame_id = self.model.getFrameId('fixed_ee') + i=0 + while True: + pin.forwardKinematics(self.model,self.data,q) + pin.updateFramePlacements(self.model, self.data) + dMi = oMdes.actInv(self.data.oMf[frame_id]) + err = pin.log(dMi).vector + if norm(err) < eps: + success = True + break + if i >= IT_MAX: + success = False + break + J = pin.computeFrameJacobian(self.model,self.data,q,frame_id) + v = - J.T.dot(np.linalg.solve(J.dot(J.T) + damp * np.eye(6), err)) + q = pin.integrate(self.model,q,v*DT) + if not i % 10: + print('%d: error = %s' % (i, err.T)) + i += 1 + + if success: + print("Convergence achieved!") + break + else: + print("\nWarning: the iterative algorithm has not reached convergence to the desired precision. 
Retry ({}) with another initial configuration".format(j)) + + return q, success + + def frameJacobian(self, q, index, update=True, ref_frame=pin.ReferenceFrame.LOCAL_WORLD_ALIGNED): + ''' Call computeFrameJacobian if update is true. If not, user should call computeFrameJacobian first. + Then call getFrameJacobian and return the Jacobian matrix. + ref_frame can be: ReferenceFrame.LOCAL, ReferenceFrame.WORLD, ReferenceFrame.LOCAL_WORLD_ALIGNED + ''' + if(update): + pin.computeFrameJacobian(self.model, self.data, q, index) + return pin.getFrameJacobian(self.model, self.data, index, ref_frame) + + def frameVelocity(self, q, v, index, update_kinematics=True, ref_frame=pin.ReferenceFrame.LOCAL_WORLD_ALIGNED): + if update_kinematics: + pin.forwardKinematics(self.model, self.data, q, v) + v_local = pin.getFrameVelocity(self.model, self.data, index) + if ref_frame==pin.ReferenceFrame.LOCAL: + return v_local + + H = self.data.oMf[index] + if ref_frame==pin.ReferenceFrame.WORLD: + v_world = H.act(v_local) + return v_world + + Hr = pin.pin(H.rotation, np.zeros(3)) + v = Hr.act(v_local) + return v + + def frameAcceleration(self, q, v, a, index, update_kinematics=True, ref_frame=pin.ReferenceFrame.LOCAL_WORLD_ALIGNED): + if update_kinematics: + pin.forwardKinematics(self.model, self.data, q, v, a) + a_local = pin.getFrameAcceleration(self.model, self.data, index) + if ref_frame==pin.ReferenceFrame.LOCAL: + return a_local + + H = self.data.oMf[index] + if ref_frame==pin.ReferenceFrame.WORLD: + a_world = H.act(a_local) + return a_world + + Hr = pin.pin(H.rotation, np.zeros(3)) + a = Hr.act(a_local) + return a + + def frameClassicAcceleration(self, q, v, a, index, update_kinematics=True, ref_frame=pin.ReferenceFrame.LOCAL_WORLD_ALIGNED): + if update_kinematics: + pin.forwardKinematics(self.model, self.data, q, v, a) + v = pin.getFrameVelocity(self.model, self.data, index) + a_local = pin.getFrameAcceleration(self.model, self.data, index) + a_local.linear += np.cross(v.angular, v.linear, axis=0) + if ref_frame==pin.ReferenceFrame.LOCAL: + return a_local + + H = self.data.oMf[index] + if ref_frame==pin.ReferenceFrame.WORLD: + a_world = H.act(a_local) + return a_world + + Hr = pin.pin(H.rotation, np.zeros(3)) + a = Hr.act(a_local) + return a + + def deactivateCollisionPairs(self, collision_pair_indexes): + for i in collision_pair_indexes: + self.collision_data.deactivateCollisionPair(i) + + def addAllCollisionPairs(self): + self.collision_model.addAllCollisionPairs() + self.collision_data = pin.GeometryData(self.collision_model) + + def isInCollision(self, q, stop_at_first_collision=True): + return pin.computeCollisions(self.model, self.data, self.collision_model, self.collision_data, np.asmatrix(q).reshape((self.model.nq,1)), stop_at_first_collision) + + def findFirstCollisionPair(self, consider_only_active_collision_pairs=True): + for i in range(len(self.collision_model.collisionPairs)): + if(not consider_only_active_collision_pairs or self.collision_data.activeCollisionPairs[i]): + if(pin.computeCollision(self.collision_model, self.collision_data, i)): + return (i, self.collision_model.collisionPairs[i]) + return None + + def findAllCollisionPairs(self, consider_only_active_collision_pairs=True): + res = [] + for i in range(len(self.collision_model.collisionPairs)): + if(not consider_only_active_collision_pairs or self.collision_data.activeCollisionPairs[i]): + if(pin.computeCollision(self.collision_model, self.collision_data, i)): + res += [(i, self.collision_model.collisionPairs[i])] + return res + 
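The wrapper above only adds thin convenience methods on top of Pinocchio's RobotWrapper. A minimal usage sketch (not part of the patch, assuming the planar 3-DoF URDF and the 'fixed_ee' frame used elsewhere in this file, and load_urdf defined later in this file):

import numpy as np
r = load_urdf("urdf/planar_manipulator_3dof.urdf")      # helper defined later in this file
robot = RobotWrapper(r.model, r.collision_model, r.visual_model)
q = np.array([np.pi/2, 0.0, 0.0])                       # joint configuration
v = np.zeros(robot.nv)                                  # joint velocities
robot.computeAllTerms(q, v)                             # forward kinematics, frame placements, M, nle
frame_id = robot.model.getFrameId('fixed_ee')           # end-effector frame
J_ee = robot.frameJacobian(q, frame_id)                 # 6 x nv Jacobian (LOCAL_WORLD_ALIGNED by default)
v_ee = robot.frameVelocity(q, v, frame_id)              # end-effector spatial velocity
ee_pos = robot.data.oMf[frame_id].translation           # end-effector position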
+class RobotSimulator: + + def __init__(self, robot, q0_init, v0_init, simulation_type, tau_coulomb_max, use_viewer=False, DISPLAY_T=None, CAMERA_TRANSFORM=None, show_floor=False): + self.robot = robot + self.model = self.robot.model + self.data = self.model.createData() + self.t = 0.0 # time + self.nv = nv = self.model.nv # Dimension of joint velocities vector + self.na = na = robot.na # number of actuated joints + self.S = np.hstack((np.zeros((na, nv-na)), np.eye(na, na))) # Matrix S used as filter of vector of inputs U + + self.contacts = [] + self.candidate_contact_points = [] # candidate contact points + self.contact_surfaces = [] + + self.DISPLAY_T = DISPLAY_T # refresh period for viewer + self.use_viewer = use_viewer + self.tau_coulomb_max = tau_coulomb_max + self.display_counter = self.DISPLAY_T + self.init(q0_init, v0_init, True) + + self.tau_c = np.zeros(na) # Coulomb friction torque + self.simulation_type = simulation_type + self.set_coulomb_friction(tau_coulomb_max) + + # for gepetto viewer + self.gui = None + if(use_viewer): + try: + prompt = subprocess.getstatusoutput("ps aux |grep 'gepetto-gui'|grep -v 'grep'|wc -l") + if int(prompt[1]) == 0: + os.system('gepetto-gui &') + time.sleep(1) + except: + pass + gepetto.corbaserver.Client() + self.robot.initViewer(loadModel=True) + self.gui = self.robot.viewer.gui + if(show_floor): + self.robot.viewer.gui.createSceneWithFloor('world') + self.gui.setLightingMode('world/floor', 'OFF') + self.robot.displayCollisions(False) + self.robot.displayVisuals(True) + self.robot.display(self.q) + try: + self.gui.setCameraTransform("python-pinocchio", CAMERA_TRANSFORM) + except: + self.gui.setCameraTransform(0, CAMERA_TRANSFORM) + + # Re-initialize the simulator + def init(self, q0=None, v0=None, reset_contact_positions=False): + self.first_iter = True + + if q0 is not None: + self.q = q0.copy() + + if(v0 is None): + self.v = np.zeros(self.robot.nv) + else: + self.v = v0.copy() + self.dv = np.zeros(self.robot.nv) + self.resize_contact_data(reset_contact_positions) + + def resize_contact_data(self, reset_contact_positions=False): + self.nc = len(self.contacts) # number of contacts + self.nk = 3*self.nc # size of contact force vector + self.f = np.zeros(self.nk) # contact forces + self.Jc = np.zeros((self.nk, self.model.nv)) # contact Jacobian + + # reset contact position + if(reset_contact_positions): + pin.forwardKinematics(self.model, self.data, self.q) + pin.updateFramePlacements(self.model, self.data) + for c in self.contacts: + c.reset_contact_position() + + self.compute_forces(compute_data=True) + + def set_coulomb_friction(self, tau_max): + self.tau_coulomb_max = 1e-2*tau_max*self.model.effortLimit + self.simulate_coulomb_friction = (norm(self.tau_coulomb_max)!=0.0) + + def collision_detection(self): + for s in self.contact_surfaces: # for each contact surface + for cp in self.candidate_contact_points: # for each candidate contact point + p = cp.get_position() + if(s.check_collision(p)): # check whether the point is colliding with the surface + if(not cp.active): # if the contact was not already active + print("Collision detected between point", cp.frame_name, " at ", p) + cp.active = True + cp.contact = Contact(cp, s) + self.contacts += [cp.contact] + self.resize_contact_data() + else: + if(cp.active): + print("Contact lost between point", cp.frame_name, " at ", p) + cp.active = False + self.contacts.remove(cp.contact) + self.resize_contact_data() + + def compute_forces(self, compute_data=True): + '''Compute the contact forces from q, v 
and elastic model''' + if compute_data: + pin.forwardKinematics(self.model, self.data, self.q, self.v) + # Computes the placements of all the operational frames according to the current joint placement stored in data + pin.updateFramePlacements(self.model, self.data) + self.collision_detection() + + i = 0 + for c in self.contacts: + self.f[i:i+3] = c.compute_force() + self.Jc[i:i+3, :] = c.get_jacobian() + i += 3 + return self.f + + def step(self, u, dt=None): + if dt is None: + dt = self.dt + + # (Forces are directly in the world frame, and aba wants them in the end effector frame) + pin.computeAllTerms(self.model, self.data, self.q, self.v) + pin.updateFramePlacements(self.model, self.data) + M = self.data.M + h = self.data.nle + self.collision_detection() + self.compute_forces(False) + + if(self.simulate_coulomb_friction and self.simulation_type=='timestepping'): + # minimize kinetic energy using time stepping + from quadprog import solve_qp + ''' + Solve a strictly convex quadratic program + + Minimize 1/2 x^T G x - a^T x + Subject to C.T x >= b + + Input Parameters + ---------- + G : array, shape=(n, n) + a : array, shape=(n,) + C : array, shape=(n, m) matrix defining the constraints + b : array, shape=(m), default=None, vector defining the constraints + meq : int, default=0 + the first meq constraints are treated as equality constraints, + all further as inequality constraints + ''' + # M (v' - v) = dt*S^T*(tau - tau_c) - dt*h + dt*J^T*f + # M v' = M*v + dt*(S^T*tau - h + J^T*f) - dt*S^T*tau_c + # M v' = b + B*tau_c + # v' = Minv*(b + B*tau_c) + b = M.dot(self.v) + dt*(self.S.T.dot(u) - h + self.Jc.T.dot(self.f)) + B = - dt*self.S.T + # Minimize kinetic energy: + # min v'.T * M * v' + # min (b+B*tau_c​).T*Minv*(b+B*tau_c​) + # min tau_c.T * B.T*Minv*B* tau_C + 2*b.T*Minv*B*tau_c + Minv = np.linalg.inv(M) + G = B.T.dot(Minv.dot(B)) + a = -b.T.dot(Minv.dot(B)) + C = np.vstack((np.eye(self.na), -np.eye(self.na))) + c = np.concatenate((-self.tau_coulomb_max, -self.tau_coulomb_max)) + solution = solve_qp(G, a, C.T, c, 0) + self.tau_c = solution[0] + self.v = Minv.dot(b + B.dot(self.tau_c)) + self.q = pin.integrate(self.model, self.q, self.v*dt) + self.dv = np.linalg.solve(M, self.S.T.dot(u-self.tau_c) - h + self.Jc.T.dot(self.f)) + elif(self.simulation_type=='euler' or self.simulate_coulomb_friction==False): + self.tau_c = self.tau_coulomb_max*np.sign(self.v[-self.na:]) + self.dv = np.linalg.solve(M, self.S.T.dot(u-self.tau_c) - h + self.Jc.T.dot(self.f)) + # v_mean = np.copy(self.v) + 0.5*dt*self.dv + self.q = pin.integrate(self.model, self.q, self.v*dt) + # self.q += dt*v_mean + self.v += self.dv*dt + else: + print("[ERROR] Unknown simulation type:", self.simulation_type) + + self.t += dt + return self.q, self.v, self.dv + + def reset(self): + self.first_iter = True + + def simulate(self, u, dt=0.05, ndt=1): + ''' Perform ndt steps, each lasting dt/ndt seconds ''' + tau_c_avg = 0*self.tau_c + for i in range(ndt): + self.q, self.v, self.dv = self.step(u, dt/ndt) + tau_c_avg += self.tau_c + self.tau_c = tau_c_avg / ndt + + if (self.use_viewer): + self.display_counter -= dt + if self.display_counter <= 0.0: + self.robot.display(self.q) + self.display_counter = self.DISPLAY_T + + return self.q, self.v, self.f + + def display(self, q): + if(self.use_viewer): + self.robot.display(q) + +def load_urdf(URDF_PATH): + robot = RobotWrapper.BuildFromURDF(URDF_PATH) + return robot + +def create_empty_figure(nRows=1, nCols=1,sharex=True): + f, ax = plt.subplots(nRows,nCols,sharex=sharex) + return (f, 
ax) + + +if __name__=='__main__': + + N = 200 # Number of time steps + dt = 0.05 # Controller time step + T_SIMULATION = N*dt # Number of seconds simulated + ndt = 1 + q_init = np.array([math.pi/2,0.,0.]) # Initial joint position + v_init = np.array([ 0.,0.,0.]).T # Initial joint velocity + + simulate_coulomb_friction = 0 # To simulate friction + simulation_type = 'euler' # Either 'timestepping' or 'euler' + tau_coulomb_max = 0*np.ones(3) # Expressed as percentage of torque max + + use_viewer = True + simulate_real_time = True + show_floor = False + PRINT_T = 1 # Print every PRINT_N time steps + DISPLAY_T = 0.02 # Update robot configuration in viewer every DISPLAY_N time steps + CAMERA_TRANSFORM = [3.3286049365997314, -11.498767852783203, 121.38613891601562, 0.051378559321165085, 0.023286784067749977, -0.001170438714325428, 0.9984070062637329] + + PLOT_JOINT_POS = 1 + PLOT_JOINT_VEL = 1 + PLOT_JOINT_ACC = 1 + PLOT_TORQUES = 1 + + URDF_PATH = "urdf/planar_manipulator_3dof.urdf" + + r = load_urdf(URDF_PATH) + robot = RobotWrapper(r.model, r.collision_model, r.visual_model) + simu = RobotSimulator(robot, q_init, v_init, simulation_type, tau_coulomb_max, use_viewer, DISPLAY_T, CAMERA_TRANSFORM, show_floor) + + frame_id = robot.model.getFrameId('fixed_ee') # End-effector frame + + # Example of TO solution from [np.array([pi/2,0.,0.,0.,0.,0.])], target=[-20,0] + tau0 = [50.0, 50.0, 50.0, 50.0, 50.0, 50.0, 50.0, 50.0, 50.0, 50.0, 50.0, 50.0, 50.0, 50.0, 50.0, 50.0, 50.0, 50.0, 50.0, 50.0, 50.0, 50.0, 50.0, 50.0, 50.0, 50.0, 50.0, 50.0, 50.0, 50.0, 50.0, 50.0, 50.0, 50.0, 50.0, 50.0, 50.0, 50.0, 50.0, 50.0, 50.0, 50.0, 50.0, 3.913573721018818, -49.99999997812837, -50.0, -50.0, -50.0, -50.0, -50.0, -50.0, -50.0, -50.0, -50.0, -50.0, -50.0, -50.0, -50.0, -50.0, -50.0, -50.0, -50.0, -50.0, -50.0, -49.99999998263544, -49.99999995861845, -49.99999993221877, -49.99999990306833, -49.99999987071753, -49.99999983461132, -49.9999997940566, -49.99999974817655, -49.999999695845204, -49.999999635591124, -49.999999565453635, -49.99999948277056, -49.9999993838735, -49.99999926362621, -49.99999911450014, -49.99999892406386, -49.99999866788871, -49.999998291144024, -49.99999766106636, -49.99999641971001, -49.99999334107301, -49.99998117668781, -46.3305280659364, -24.952363144765123, -19.338494352862362, -17.665869556802168, -15.969375750675875, -13.875635089856688, -11.802969516419605, -10.018418774230609, -8.566108149374008, -7.391570403941093, -6.42809739367328, -5.62500889060553, -4.94790811736938, -4.3727372800157545, -3.8813594725379144, -3.459361037489522, -3.0950636583315094, -2.7789782267165024, -2.5033915044260913, -2.262015684970941, -2.049698117228155, -1.8621904342485147, -1.6959687118390823, -1.5480935908624742, -1.4161006130642122, -1.2979134200206421, -1.1917745049867159, -1.0961896285185433, -1.0098829686449564, -0.9317607540079113, -0.8608816318066894, -0.7964324077286956, -0.7377080914713674, -0.6840954101556022, -0.6350591283370008, -0.59013065006311, -0.5488984844539794, -0.5110002398113251, -0.47611587644719633, -0.443962000211009, -0.4142870197805363, -0.38686702350141183, -0.3615022576843117, -0.33801410955749306, -0.31624251481435955, -0.2960437238304552, -0.2772883714424923, -0.25985980487578647, -0.24365263123064845, -0.22857145298785428, -0.21452976412923114, -0.20144898449681217, -0.18925761312116307, -0.17789048415522468, -0.1672881117027469, -0.1573961116191082, -0.14816469026299792, -0.1395481916748371, -0.13150469555377498, -0.12399565976918234, -0.11698560197296447, 
-0.11044181542538248, -0.10433411498268876, -0.0986346096734115, -0.09331749870240803, -0.08835888827625434, -0.08373662679008599, -0.07943015630294098, -0.0754203785771209, -0.07168953402935123, -0.06822109216157565, -0.06499965235029037, -0.06201085377818968, -0.05924129365424839, -0.05667845278067173, -0.05431062782792054, -0.05212686954785943, -0.05011692632728752, -0.04827119268177411, -0.046580662063378186, -0.045036883658417244, -0.043631922768426736, -0.042358324419103935, -0.041209079956523405, -0.040177596288934546, -0.03925766757960086, -0.03844344915052737, -0.03772943342401835, -0.037110427659416016, -0.0365815334480049, -0.03613812768636835, -0.0357758449797231, -0.03549056133594179, -0.03527837907343018, -0.03513561283161326, -0.03505877652434707, -0.03504457090814823, -0.035089871084497695, -0.035191714362526536, -0.035347293877997926, -0.03555397404494562, -0.035809340997228704, -0.0361112098283853, -0.036457214815460895, -0.03684329262668612, -0.03726157437932053, -0.03770484714126743, -0.03819829113291652, -0.03887398161427306, -0.03998048529109933, -0.04134077898704407, -0.04040053594758688, -0.02937217479892824, 9.310123181419991e-22, 3.2867662109711427e-21] + tau1 = [-50.0, -50.0, -50.0, -50.0, -50.0, -50.0, -50.0, -50.0, -50.0, -50.0, -50.0, -50.0, -50.0, -50.0, -50.0, -50.0, -50.0, -49.999999925160694, -49.99999623355654, 49.99999661317066, 49.99999933991629, 49.99999958845886, 49.99999953495631, 49.9999991795935, 49.99999757171713, -13.88363883189849, -49.999997992976596, -49.99999919548091, -49.999999489843475, -49.99999954363985, -49.9999993577583, -49.99999849772317, -29.756444147424194, 49.99999851297247, 49.99999962440779, 49.9999999322578, 50.0, 50.0, 50.0, 50.0, 50.0, 50.0, 50.0, 49.999999540051434, 49.999945429392575, 13.523097736728628, 49.99999982418196, 50.0, 50.0, 50.0, 50.0, 50.0, 50.0, 50.0, 49.99999984633092, 49.99999847815977, 45.367144514865466, -33.83630427701238, -13.867233954583172, -0.5150175332480839, 1.6956169602865967, 0.9012440381465341, 0.3187378661553381, 0.26707626201201495, 0.5171759442383039, 0.9285422723436255, 1.4510231388471262, 2.06302507674324, 2.7476945320290667, 3.4893591250457145, 4.273988285544616, 5.089016044958304, 5.921661917254661, 6.756124677893429, 7.574948431143914, 8.379001318796224, 9.236308477361941, 10.315414824693468, 11.75923166010499, 13.268471377299782, 13.62397567301964, 10.907637822540988, 3.969572656941659, -5.830225659311305, -14.779022841260165, -19.47803573384028, -19.589654048970615, -17.768356513401585, -16.36754183864119, -15.36199581612403, -14.35125645595521, -13.244510677137141, -12.135092802473562, -11.11344736797613, -10.211734347258979, -9.423094529313513, -8.72906392583842, -8.113100899129789, -7.5631701490718495, -7.070407986273108, -6.627657375747211, -6.228745775955465, -5.868258643336417, -5.5414765828689125, -5.244321457631866, -4.973282798621214, -4.725337382788586, -4.497875009074871, -4.288635258573854, -4.095654894536202, -3.9172240426050853, -3.751849416948938, -3.5982233037430147, -3.455197342753047, -3.321760340634281, -3.19701947317666, -3.0801843310492707, -2.970553349808765, -2.867502240312227, -2.770474100184701, -2.6789709408909914, -2.5925464096857915, -2.51079952199875, -2.4333692507602214, -2.359929843838086, -2.2901867617590623, -2.223873145112396, -2.1607467353941847, -2.1005871847778277, -2.0431937006102228, -1.9883829781302658, -1.9359873826438572, -1.8858533472115067, -1.8378399580151306, -1.791817702152308, -1.7476673580051276, -1.705279009121051, -1.6645511668765645, 
-1.625389988380853, -1.5877085778626319, -1.5514263618540947, -1.5164685291262032, -1.4827655280009002, -1.4502526146865036, -1.4188694463462252, -1.3885597141467942, -1.3592708120698516, -1.3309535372229344, -1.303561818469381, -1.2770524703289947, -1.2513849693768573, -1.2265212510808312, -1.202425524762064, -1.1790641047486285, -1.1564052564097662, -1.1344190554708453, -1.1130772590507383, -1.092353187839831, -1.0722216177572568, -1.052658680675167, -1.033641772964263, -1.0151494715756355, -0.9971614566465445, -0.9796584399147069, -0.9626220990198487, -0.946035016537702, -0.9298806236668248, -0.9141431481429039, -0.8988075658099768, -0.8838595559304274, -0.8692854595130881, -0.8550722406754522, -0.8412074506631448, -0.8276791944692483, -0.814476099441573, -0.8015872864325896, -0.7890023425591132, -0.7767112958181723, -0.7647045914228321, -0.7529730698099669, -0.7415079462538057, -0.730300791389307, -0.7193435108300817, -0.7086283186315634, -0.6981477071505003, -0.6878944562286702, -0.6778618247056026, -0.6680440769047711, -0.6584367080561528, -0.649032784005356, -0.6398077540538428, -0.630695334893884, -0.6216287801285251, -0.6128950581280358, -0.6060640106567209, -0.6043160857918533, -0.6056359031766879, -0.5752845647421657, -0.40450819960279844, -5.263219031959085e-22, -1.5380345359532698e-21] + tau2 = [-50.0, -50.0, -50.0, -50.0, -50.0, -50.0, -50.0, -50.0, -50.0, -50.0, -50.0, -50.0, -50.0, -50.0, -50.0, -50.0, -50.0, -50.0, -50.0, -50.0, -50.0, -50.0, -50.0, -50.0, -50.0, -50.0, -50.0, -50.0, -50.0, -50.0, 15.176472068098633, 50.0, 50.0, 50.0, 50.0, 50.0, 50.0, 50.0, 50.0, 50.0, 50.0, 50.0, 50.0, 50.0, 50.0, 50.0, 50.0, 50.0, 50.0, 49.99999934388885, -29.77803281309609, -16.221752389165278, 4.122727684604918, 4.166393767837994, 3.514732370695024, 3.5692856129789194, 2.4256753138456904, -1.452743994278073, -1.2080631473419272, -0.16824452899613926, 0.029834958070621483, -0.09106383508912538, -0.21308198787284016, -0.29430046226428763, -0.35447656421814966, -0.40657909246598867, -0.45473497787083195, -0.5002374257267803, -0.5439659948642026, -0.5867233286245945, -0.6291705892311026, -0.6718331473214855, -0.7151390073252673, -0.7592564187013162, -0.8036391351722044, -0.8475769438021128, -0.8952290113173695, -0.9649183468551131, -1.0844424965204236, -1.2399487595565644, -1.2931714157860015, -0.995371696188642, -0.22354939862974435, 0.7437711833682358, 1.3358156125712366, 1.1994824770073702, 0.7863938473922568, 1.3479870385766368, 1.4423112083688652, 1.2691014140109311, 1.0840586470440485, 0.9515668250469531, 0.8585881498965094, 0.785163907513267, 0.7211673137280206, 0.6636031227311561, 0.6119758492465908, 0.566025511310649, 0.5252350824888673, 0.4889536610814826, 0.4565551896804238, 0.42750295577466474, 0.40135140687931065, 0.3777287206766371, 0.3563203706244419, 0.336857711997533, 0.31911016327731473, 0.30287928134679887, 0.28799387108429686, 0.2743058332244163, 0.26168665101999106, 0.2500244422575814, 0.23922149608186133, 0.22919221341436574, 0.21986138001835878, 0.21116271364150332, 0.20303763764195223, 0.19543424288068909, 0.18830640613489275, 0.181613039559615, 0.17531744973111885, 0.1693867885113119, 0.16379158166933863, 0.15850532265275655, 0.1535041219366205, 0.14876640321139548, 0.1442726397412392, 0.14000512485046981, 0.13594777169912195, 0.1320859379448401, 0.128406272477954, 0.12489658021600719, 0.1215457035178347, 0.11834341711682667, 0.11528033564763057, 0.11234783120603883, 0.10953796063043401, 0.10684340052928179, 0.10425738941269863, 0.10177367615415638, 
0.09938647369364768, 0.09709041781637545, 0.09488052996924161, 0.09275218385042916, 0.09070107543134344, 0.08872319590873598, 0.08681480716783921, 0.08497241977806003, 0.08319277288486598, 0.08147281610339675, 0.07980969304210336, 0.07820072634929624, 0.07664340395951144, 0.07513536677545843, 0.07367439736441252, 0.07225840936640654, 0.07088543832539371, 0.06955363273033474, 0.06826124625714236, 0.06700663034399097, 0.06578822753486432, 0.0646045653322765, 0.06345425040158229, 0.06233596348969316, 0.061248454517693826, 0.06019053807870064, 0.059161089434241546, 0.0581590405595334, 0.057183376768826046, 0.05623313332378255, 0.05530739245838624, 0.05440528053964406, 0.05352596548789028, 0.05266865427903696, 0.05183259077469347, 0.05101705355436031, 0.05022135390940832, 0.049444834134151606, 0.04868686570280136, 0.04794684774067304, 0.04722420541324774, 0.04651838864563251, 0.04582887084885529, 0.04515514861748155, 0.044496741667771206, 0.0438531899725731, 0.04322403595870467, 0.04260877802808388, 0.04200684603254139, 0.041417895698825716, 0.040843079005469345, 0.04028715618387913, 0.039755323695318014, 0.0392231914332549, 0.03855375230108693, 0.03747277495639135, 0.036335197108619516, 0.03874870351250417, 0.04599993272961439, -1.1268571878427161e-21, -1.7791824201931178e-21] + + tau = [tau0,tau1,tau2] + + t = 0.0 + q = np.zeros((3,N+1)) + v = np.zeros((3,N+1)) + dv = np.zeros((3,N+1)) + q[:,0] = q_init + v[:,0] = v_init + + # Simulation + for i in range(len(tau0)): + time_start = time.time() + + # Send joint torques to simulator + simu.simulate([tau0[i],tau1[i],tau2[i]], dt, ndt) + q[:,i], v[:,i], dv[:,i] = simu.q, simu.v, simu.dv + t += dt + + time_spent = time.time() - time_start + if(simulate_real_time and time_spent < dt): + time.sleep(dt-time_spent) + + # PLOT STUFF + time = np.arange(0.0, N*dt+dt, dt) + + if PLOT_JOINT_POS: + (f, ax) = create_empty_figure(int(robot.nv/2),3) + ax = ax.reshape(robot.nv) + for i in range(robot.nv): + ax[i].plot(time, q[i,:], label='q') + ax[i].set_xlabel('Time [s]') + ax[i].set_ylabel(r'$q_'+str(i)+'$ [rad]') + + if PLOT_JOINT_VEL: + (f, ax) = create_empty_figure(int(robot.nv/2),3) + ax = ax.reshape(robot.nv) + for i in range(robot.nv): + ax[i].plot(time, v[i,:], label='v') + ax[i].set_xlabel('Time [s]') + ax[i].set_ylabel(r'v_'+str(i)+' [rad/s]') + + if PLOT_JOINT_ACC: + (f, ax) = create_empty_figure(int(robot.nv/2),3) + ax = ax.reshape(robot.nv) + for i in range(robot.nv): + ax[i].plot(time, dv[i,:], label=r'$\dot{v}$') + ax[i].set_xlabel('Time [s]') + ax[i].set_ylabel(r'$\dot{v}_'+str(i)+'$ [rad/s^2]') + + if PLOT_TORQUES: + (f, ax) = create_empty_figure(int(robot.nv/2),3) + ax = ax.reshape(robot.nv) + for i in range(robot.nv): + ax[i].plot(time, tau[i], label=r'$\tau_$ '+str(i)) + ax[i].set_xlabel('Time [s]') + ax[i].set_ylabel(r'$tau_'+str(i)+'$ [Nm]') + + plt.show() diff --git a/inits.py b/inits.py new file mode 100644 index 0000000..f51935e --- /dev/null +++ b/inits.py @@ -0,0 +1,57 @@ +''' Init functions to warm-start TO (Pyomo needs python functions to warm-start its variables) +''' + +# Warmstarting with CACTO rollout + +def init_tau0(m,k,init_TO_controls): + return init_TO_controls[0][k] + +def init_tau1(m,k,init_TO_controls): + return init_TO_controls[1][k] + +def init_tau2(m,k,init_TO_controls): + return init_TO_controls[2][k] + +def init_q0(m,k,init_TO_states): + return init_TO_states[0][k] + +def init_q1(m,k,init_TO_states): + return init_TO_states[1][k] + +def init_q2(m,k,init_TO_states): + return init_TO_states[2][k] + +def 
init_v0(m,k,init_TO_states): + return init_TO_states[3][k] + +def init_v1(m,k,init_TO_states): + return init_TO_states[4][k] + +def init_v2(m,k,init_TO_states): + return init_TO_states[5][k] + +# Warmstarting with initial conditions + +def init_q0_ICS(m,k,prev_state): + return prev_state[0] + +def init_q1_ICS(m,k,prev_state): + return prev_state[1] + +def init_q2_ICS(m,k,prev_state): + return prev_state[2] + +def init_v0_ICS(m,k,prev_state): + return prev_state[3] + +def init_v1_ICS(m,k,prev_state): + return prev_state[4] + +def init_v2_ICS(m,k,prev_state): + return prev_state[5] + +# Warmstarting with zeros + +def init_0(m,k): + return 0.0 + diff --git a/replay_buffer.py b/replay_buffer.py new file mode 100644 index 0000000..2676f08 --- /dev/null +++ b/replay_buffer.py @@ -0,0 +1,187 @@ +import numpy as np +import random +import math + +from segment_tree import SumSegmentTree, MinSegmentTree + + +class ReplayBuffer(object): + def __init__(self, size): + """Create Replay buffer. + Parameters + ---------- + size: int + Max number of transitions to store in the buffer. When the buffer + overflows the old memories are dropped. + """ + self._storage = [] + self._maxsize = size + self._next_idx = 0 + + def __len__(self): + return len(self._storage) + + def add(self, obs_t, action_pol, action_expl, reward, obs_tp1, done): + data = (obs_t, action_pol, action_expl, reward, obs_tp1, done) + + if self._next_idx >= len(self._storage): + self._storage.append(data) + else: + self._storage[self._next_idx] = data + self._next_idx = (self._next_idx + 1) % self._maxsize + + def _encode_sample(self, idxes): + obses_t, actions_pol, actions_expl, rewards, obses_tp1, dones = [], [], [], [], [], [] + for i in idxes: + data = self._storage[i] + obs_t, action_pol, action_expl, reward, obs_tp1, done = data + obses_t.append(np.array(obs_t, copy=False)) + actions_pol.append(np.array(action_pol, copy=False)) + actions_expl.append(np.array(action_expl, copy=False)) + rewards.append(reward) + obses_tp1.append(np.array(obs_tp1, copy=False)) + dones.append(done) + return np.array(obses_t), np.array(actions_pol), np.array(actions_expl), np.array(rewards), np.array(obses_tp1), np.array(dones) + + def sample(self, batch_size): + """Sample a batch of experiences. + Parameters + ---------- + batch_size: int + How many transitions to sample. + Returns + ------- + obs_batch: np.array + batch of observations + act_batch: np.array + batch of actions executed given obs_batch + rew_batch: np.array + rewards received as results of executing act_batch + next_obs_batch: np.array + next set of observations seen after executing act_batch + done_mask: np.array + done_mask[i] = 1 if executing act_batch[i] resulted in + the end of an episode and 0 otherwise. + TO_mask: np.array + TO_mask[i] = 1 if exectuting TO action + """ + idxes = [random.randint(0, len(self._storage) - 1) for _ in range(batch_size)] + return self._encode_sample(idxes) + + +class PrioritizedReplayBuffer(ReplayBuffer): + def __init__(self, size, alpha): + """Create Prioritized Replay buffer. + Parameters + ---------- + size: int + Max number of transitions to store in the buffer. When the buffer + overflows the old memories are dropped. 
+ alpha: float + how much prioritization is used + (0 - no prioritization, 1 - full prioritization) + See Also + -------- + ReplayBuffer.__init__ + """ + super(PrioritizedReplayBuffer, self).__init__(size) + assert alpha >= 0 + self._alpha = alpha + + it_capacity = 1 + while it_capacity < size: + it_capacity *= 2 + + self._it_sum = SumSegmentTree(it_capacity) + self._it_min = MinSegmentTree(it_capacity) + self._max_priority = 1.0 + + def add(self, *args, **kwargs): + """See ReplayBuffer.store_effect""" + idx = self._next_idx + super().add(*args, **kwargs) + self._it_sum[idx] = self._max_priority ** self._alpha # Call to __setitem__ + self._it_min[idx] = self._max_priority ** self._alpha + + def _sample_proportional(self, batch_size): + res = [] + p_total = self._it_sum.sum(0, len(self._storage) - 1) + every_range_len = p_total / batch_size + for i in range(batch_size): + mass = random.random() * every_range_len + i * every_range_len + idx = self._it_sum.find_prefixsum_idx(mass) + res.append(idx) + return res + + def sample(self, batch_size, beta): + """Sample a batch of experiences. + compared to ReplayBuffer.sample + it also returns importance weights and idxes + of sampled experiences. + Parameters + ---------- + batch_size: int + How many transitions to sample. + beta: float + To what degree to use importance weights + (0 - no corrections, 1 - full correction) + Returns + ------- + obs_batch: np.array + batch of observations + act_batch: np.array + batch of actions executed given obs_batch + rew_batch: np.array + rewards received as results of executing act_batch + next_obs_batch: np.array + next set of observations seen after executing act_batch + done_mask: np.array + done_mask[i] = 1 if executing act_batch[i] resulted in + the end of an episode and 0 otherwise. + weights: np.array + Array of shape (batch_size,) and dtype np.float32 + denoting importance weight of each sampled transition + idxes: np.array + Array of shape (batch_size,) and dtype np.int32 + idexes in buffer of sampled experiences + """ + assert beta > 0 + + idxes = self._sample_proportional(batch_size) + + weights = [] + p_min = self._it_min.min() / self._it_sum.sum() + max_weight = (p_min * len(self._storage)) ** (-beta) + + for idx in idxes: + p_sample = self._it_sum[idx] / self._it_sum.sum() + weight = (p_sample * len(self._storage)) ** (-beta) + weights.append(weight / max_weight) + weights = np.array(weights) + encoded_sample = self._encode_sample(idxes) + return tuple(list(encoded_sample) + [weights, idxes]) + + def update_priorities(self, idxes, priorities): + """Update priorities of sampled transitions. + sets priority of transition at index idxes[i] in buffer + to priorities[i]. + Parameters + ---------- + idxes: [int] + List of idxes of sampled transitions + priorities: [float] + List of updated priorities corresponding to + transitions at the sampled idxes denoted by + variable `idxes`. + """ + assert len(idxes) == len(priorities) + for idx, priority in zip(idxes, priorities): + if math.isnan(priority): + print("\n ###################################### PRIORITY IS NAN #######################################\n") + priority = self._max_priority + assert priority > 0 + assert 0 <= idx < len(self._storage) + self._it_sum[idx] = priority ** self._alpha + self._it_min[idx] = priority ** self._alpha + + self._max_priority = max(self._max_priority, priority) \ No newline at end of file
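For reference, a minimal usage sketch of the buffer defined above (not part of the patch). It assumes the segment_tree module imported at the top of this file is available, uses the six-field transition layout of ReplayBuffer.add and the values from config_manipulator3DoF_pyomo.py, and td_errors is only a placeholder for the TD errors computed by the training loop:

import numpy as np
import config_manipulator3DoF_pyomo as conf
from replay_buffer import PrioritizedReplayBuffer

buffer = PrioritizedReplayBuffer(conf.REPLAY_SIZE, alpha=conf.prioritized_replay_alpha)

# Store one transition: (obs_t, action_pol, action_expl, reward, obs_tp1, done)
obs_t, obs_tp1 = np.zeros(conf.num_states), np.zeros(conf.num_states)
action = np.zeros(conf.num_actions)
buffer.add(obs_t, action, action, 0.0, obs_tp1, False)

# Sample a mini-batch together with importance-sampling weights and buffer indices
# (in practice the buffer is filled with many transitions before sampling)
(obses_t, actions_pol, actions_expl, rewards, obses_tp1, dones,
 weights, idxes) = buffer.sample(conf.BATCH_SIZE, beta=conf.prioritized_replay_beta0)

# Refresh the priorities of the sampled transitions with their new TD errors;
# the small eps keeps every priority strictly positive
td_errors = np.zeros(len(idxes))
buffer.update_priorities(idxes, np.abs(td_errors) + conf.prioritized_replay_eps)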