From 0c859715d36c218f375cc136fa149a954085d5d1 Mon Sep 17 00:00:00 2001 From: gianluigigrandesso <50986305+gianluigigrandesso@users.noreply.github.com> Date: Thu, 17 Nov 2022 17:47:06 +0100 Subject: [PATCH] Uploaded scripts --- CACTO_manipulator3DoF_pyomo.py | 916 ++++++++++++++++++++++++++++++++ TO_manipulator3DoF_pyomo.py | 538 +++++++++++++++++++ config_manipulator3DoF_pyomo.py | 111 ++++ dynamics_manipulator3DoF.py | 538 +++++++++++++++++++ inits.py | 57 ++ replay_buffer.py | 187 +++++++ 6 files changed, 2347 insertions(+) create mode 100644 CACTO_manipulator3DoF_pyomo.py create mode 100644 TO_manipulator3DoF_pyomo.py create mode 100644 config_manipulator3DoF_pyomo.py create mode 100644 dynamics_manipulator3DoF.py create mode 100644 inits.py create mode 100644 replay_buffer.py diff --git a/CACTO_manipulator3DoF_pyomo.py b/CACTO_manipulator3DoF_pyomo.py new file mode 100644 index 0000000..ba51216 --- /dev/null +++ b/CACTO_manipulator3DoF_pyomo.py @@ -0,0 +1,916 @@ +import os +import tensorflow as tf +from tensorflow.keras import layers, regularizers +import numpy as np +import matplotlib.pyplot as plt +from matplotlib.patches import Ellipse +import math +import random +from pyomo.environ import * +from pyomo.dae import * +import pinocchio as pin +from TO_manipulator3DoF_pyomo import TO_manipulator, plot_results_TO +import config_manipulator3DoF_pyomo as conf +from dynamics_manipulator3DoF import RobotWrapper, RobotSimulator, load_urdf +from replay_buffer import PrioritizedReplayBuffer +from inits import init_tau0,init_tau1,init_tau2,init_q0,init_q1,init_q2,init_v0,init_v1,init_v2,init_q0_ICS,init_q1_ICS,init_q2_ICS,init_v0_ICS,init_v1_ICS,init_v2_ICS,init_0 + +# os.environ["CUDA_VISIBLE_DEVICES"]="-1" # Uncomment to run TF on CPU rather than GPU + +# Class implementing the NNs update +class Training: + def __init__(self, batch_size=conf.BATCH_SIZE): + self.batch_size = batch_size + return + + # Update both critic and actor + def update(self, episode, red_state_batch_norm, red_state_batch, state_batch_norm, state_batch, reward_batch, next_state_batch, next_state_batch_norm, d_batch, weights_batch=None): + + with tf.GradientTape() as tape: + + # Update the critic + if conf.TD_N: + y = reward_batch # When using n-step TD, reward_batch is the batch of costs-to-go and not the batch of single step rewards + else: + if conf.NORMALIZE_INPUTS: + target_values = target_critic(next_state_batch_norm, training=True) # Compute Value at next state after conf.nsteps_TD_N steps given by target critic + else: + target_values = target_critic(next_state_batch, training=True) + y = reward_batch + (1-d_batch)*target_values # Compute batch of 1-step targets for the critic loss + + if conf.NORMALIZE_INPUTS: + critic_value = critic_model(state_batch_norm, training=True) # Compute batch of Values associated to the sampled batch of states + else: + critic_value = critic_model(state_batch, training=True) + + if weights_batch is None: + critic_loss = tf.math.reduce_mean(tf.math.square(y - critic_value)) # Critic loss function (tf.math.reduce_mean computes the mean of elements across dimensions of a tensor, in this case across the batch) + else: + critic_loss = tf.math.reduce_mean(tf.math.square(tf.math.multiply(weights_batch,(y - critic_value)))) + + # Compute the gradients of the critic loss w.r.t. 
critic's parameters + critic_grad = tape.gradient(critic_loss, critic_model.trainable_variables) + + # Update the critic backpropagating the gradients + critic_optimizer.apply_gradients(zip(critic_grad, critic_model.trainable_variables)) + + # Update the actor after the critic pre-training + if episode >= conf.EPISODE_CRITIC_PRETRAINING: + if conf.NORMALIZE_INPUTS: + actions = actor_model(red_state_batch_norm, training=True) + else: + actions = actor_model(red_state_batch, training=True) + + # Both take into account normalization, ds_next_da is the gradient of the dynamics w.r.t. policy actions (ds'_da) + next_state_tf, ds_next_da = simulate_and_derivative_tf(red_state_batch,actions.numpy(),self.batch_size) + + with tf.GradientTape() as tape: + tape.watch(next_state_tf) + critic_value_next = critic_model(next_state_tf, training=True) # next_state_batch = next state after applying policy's action, already normalized if conf.NORMALIZE_INPUTS=1 + dV_ds_next = tape.gradient(critic_value_next, next_state_tf) # dV_ds' = gradient of V w.r.t. s', where s'=f(s,a) a=policy(s) + + with tf.GradientTape() as tape1: + with tf.GradientTape() as tape2: + tape1.watch(actions) + tape2.watch(next_state_tf) + rewards_tf = reward_tf(next_state_tf, actions, self.batch_size, d_batch) + dr_da = tape1.gradient(rewards_tf,actions) # dr_da = gradient of reward r w.r.t. policy's action a + dr_ds_next = tape2.gradient(rewards_tf,next_state_tf) # dr_ds' = gradient of reward r w.r.t. next state s' after performing policy's action a + + dr_ds_next_dV_ds_next_reshaped = tf.reshape(dr_ds_next+dV_ds_next,(self.batch_size,1,num_states)) # dr_ds' + dV_ds' + dr_ds_next_dV_ds_next = tf.matmul(dr_ds_next_dV_ds_next_reshaped,ds_next_da) # (dr_ds' + dV_ds')*ds'_da + dr_da_reshaped = tf.reshape(dr_da,(self.batch_size,1,num_actions)) + tf_sum = dr_ds_next_dV_ds_next+dr_da_reshaped # (dr_ds' + dV_ds')*ds'_da + dr_da + + # Now let's multiply -[(dr_ds' + dV_ds')*ds'_da + dr_da] by the actions a and then let's autodifferentiate w.r.t theta_A (actor NN's parameters) to finally get -dQ/dtheta_A + with tf.GradientTape() as tape: + tape.watch(actor_model.trainable_variables) + if conf.NORMALIZE_INPUTS: + actions = actor_model(state_batch_norm, training=True) + else: + actions = actor_model(state_batch, training=True) + actions_reshaped = tf.reshape(actions,(self.batch_size,num_actions,1)) + tf_sum_reshaped = tf.reshape(tf_sum,(self.batch_size,1,num_actions)) + Q_neg = tf.matmul(-tf_sum_reshaped,actions_reshaped) + + mean_Qneg = tf.math.reduce_mean(Q_neg) # Also here we need a scalar so we compute the mean -Q across the batch + + # Gradients of the actor loss w.r.t. 
actor's parameters + actor_grad = tape.gradient(mean_Qneg, actor_model.trainable_variables) + + # Update the actor by backpropagating the gradients + actor_optimizer.apply_gradients(zip(actor_grad, actor_model.trainable_variables)) + + def learn(self,episode, prioritized_buffer): + # Sample batch of transitions from the buffer + experience = prioritized_buffer.sample(self.batch_size, beta=conf.prioritized_replay_eps) # Bias annealing not performed, that's why beta is equal to a very small number (0 not accepted by PrioritizedReplayBuffer) + x_batch, a_next_batch, a_batch, r_batch, x2_batch, d_batch, weights, batch_idxes = experience # Importance sampling weights (actually not used) should anneal the bias (see Prioritized Experience Replay paper) + + # Convert batch of transitions into a tensor + x_batch = x_batch.reshape(self.batch_size,num_states) + a_next_batch = a_next_batch.reshape(self.batch_size,num_actions) + a_batch = a_batch.reshape(self.batch_size,num_actions) + r_batch = r_batch.reshape(self.batch_size,1) + x2_batch = x2_batch.reshape(self.batch_size,num_states) + d_batch = d_batch.reshape(self.batch_size,1) + weights = weights.reshape(self.batch_size,1) + state_batch = tf.convert_to_tensor(x_batch) + state_batch = tf.cast(state_batch, dtype=tf.float32) + action_batch = tf.convert_to_tensor(a_batch) + action_batch = tf.cast(action_batch, dtype=tf.float32) + reward_batch = tf.convert_to_tensor(r_batch) + reward_batch = tf.cast(reward_batch, dtype=tf.float32) + d_batch = tf.cast(d_batch, dtype=tf.float32) + next_state_batch = tf.convert_to_tensor(x2_batch) + next_state_batch = tf.cast(next_state_batch, dtype=tf.float32) + if conf.prioritized_replay_alpha == 0: + weights_batch = tf.convert_to_tensor(np.ones_like(weights), dtype=tf.float32) + else: + weights_batch = tf.convert_to_tensor(weights, dtype=tf.float32) + + if conf.NORMALIZE_INPUTS: + x2_batch_norm = x2_batch / conf.state_norm_arr + x2_batch_norm[:,-1] = 2*x2_batch_norm[:,-1] -1 + next_state_batch_norm = tf.convert_to_tensor(x2_batch_norm) + next_state_batch_norm = tf.cast(next_state_batch_norm, dtype=tf.float32) + x_batch_norm = x_batch / conf.state_norm_arr + x_batch_norm[:,-1] = 2*x_batch_norm[:,-1] -1 + state_batch_norm = tf.convert_to_tensor(x_batch_norm) + state_batch_norm = tf.cast(state_batch_norm, dtype=tf.float32) + + # Update priorities + if conf.prioritized_replay_alpha != 0: + if conf.NORMALIZE_INPUTS: + v_batch = critic_model(state_batch_norm, training=True) # Compute batch of Values associated to the sampled batch of states + v2_batch = target_critic(next_state_batch_norm, training=True) # Compute batch of Values from target critic associated to sampled batch of next states + vref_batch = reward_batch + (1-d_batch)*(v2_batch) # Compute the targets for the TD error + else: + v_batch = critic_model(state_batch, training=True) + v2_batch = target_critic(next_state_batch, training=True) + vref_batch = reward_batch + (1-d_batch)*(v2_batch) + + td_errors = tf.math.abs(tf.math.subtract(vref_batch,v_batch)) + new_priorities = td_errors.numpy() + conf.prioritized_replay_eps # Proportional prioritization where p_i = |TD_error_i| + conf.prioritized_replay_eps + prioritized_buffer.update_priorities(batch_idxes, new_priorities) + + # Update NNs + self.update(episode, state_batch_norm, state_batch, state_batch_norm, state_batch, reward_batch, next_state_batch, next_state_batch_norm, d_batch, weights_batch=weights_batch) + +# Update target critic NN +# Eager execution is turned on by default in TensorFlow 2.
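+# (i.e. each operation is dispatched and executed from Python as soon as it is called, instead of first being compiled into a graph.)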
Decorating with tf.function allows TensorFlow to build a static graph out of the logic and computations in our function. +# This provides a large speed up for blocks of code that contain many small TensorFlow operations such as this one. Working only with TensorFlow tensors (e.g. not working with Numpy arrays) +@tf.function +def update_target(target_weights, weights, tau): + for (a, b) in zip(target_weights, weights): + a.assign(b * tau + a * (1 - tau)) + +# Create actor NN +def get_actor(): + + inputs = layers.Input(shape=(conf.num_states,)) + lay1 = layers.Dense(conf.NH1,kernel_regularizer=regularizers.l1_l2(conf.wreg_l1_A,conf.wreg_l2_A),bias_regularizer=regularizers.l1_l2(conf.wreg_l1_A,conf.wreg_l2_A))(inputs) + leakyrelu1 = layers.LeakyReLU()(lay1) + + lay2 = layers.Dense(conf.NH2, kernel_regularizer=regularizers.l1_l2(conf.wreg_l1_A,conf.wreg_l2_A),bias_regularizer=regularizers.l1_l2(conf.wreg_l1_A,conf.wreg_l2_A))(leakyrelu1) + leakyrelu2 = layers.LeakyReLU()(lay2) + + outputs = layers.Dense(conf.num_actions, activation="tanh", kernel_regularizer=regularizers.l1_l2(conf.wreg_l1_A,conf.wreg_l2_A),bias_regularizer=regularizers.l1_l2(conf.wreg_l1_A,conf.wreg_l2_A))(leakyrelu2) + + outputs = outputs * conf.tau_upper_bound # Bound actions + model = tf.keras.Model(inputs, outputs) + return model + +# Create critic NN +def get_critic(): + state_input = layers.Input(shape=(num_states,)) + state_out1 = layers.Dense(16, kernel_regularizer=regularizers.l1_l2(conf.wreg_l1_C,conf.wreg_l2_C),bias_regularizer=regularizers.l1_l2(conf.wreg_l1_C,conf.wreg_l2_C))(state_input) + leakyrelu1 = layers.LeakyReLU()(state_out1) + + state_out2 = layers.Dense(32, kernel_regularizer=regularizers.l1_l2(conf.wreg_l1_C,conf.wreg_l2_C),bias_regularizer=regularizers.l1_l2(conf.wreg_l1_C,conf.wreg_l2_C))(leakyrelu1) + leakyrelu2 = layers.LeakyReLU()(state_out2) + + out_lay1 = layers.Dense(conf.NH1, kernel_regularizer=regularizers.l1_l2(conf.wreg_l1_C,conf.wreg_l2_C),bias_regularizer=regularizers.l1_l2(conf.wreg_l1_C,conf.wreg_l2_C))(leakyrelu2) + leakyrelu3 = layers.LeakyReLU()(out_lay1) + + out_lay2 = layers.Dense(conf.NH2, kernel_regularizer=regularizers.l1_l2(conf.wreg_l1_C,conf.wreg_l2_C),bias_regularizer=regularizers.l1_l2(conf.wreg_l1_C,conf.wreg_l2_C))(leakyrelu3) + leakyrelu4 = layers.LeakyReLU()(out_lay2) + + outputs = layers.Dense(1, kernel_regularizer=regularizers.l1_l2(conf.wreg_l1_C,conf.wreg_l2_C),bias_regularizer=regularizers.l1_l2(conf.wreg_l1_C,conf.wreg_l2_C))(leakyrelu4) + + model = tf.keras.Model([state_input], outputs) + + return model + +# Get optimal actions at timestep "step" in the current episode +def get_TO_actions(step, action0_TO=None, action1_TO=None, action2_TO=None): + + actions = np.array([action0_TO[step], action1_TO[step], action2_TO[step]]) + + # Buond actions in case they are not already bounded in the TO problem + if step < NSTEPS_SH: + next_actions = np.array([np.clip(action0_TO[step+1], conf.tau_lower_bound, conf.tau_upper_bound), np.clip(action1_TO[step+1], conf.tau_lower_bound, conf.tau_upper_bound), np.clip(action2_TO[step+1], conf.tau_lower_bound, conf.tau_upper_bound)]) + else: + next_actions = np.copy(actions) # Actions at last step of the episode are not performed + + return actions, next_actions + +# Define reward (-cost) function +def reward(x2,u=None): + # End-effector coordinates + x_ee = conf.x_base + conf.l*(math.cos(x2[0]) + math.cos(x2[0]+x2[1]) + math.cos(x2[0]+x2[1]+x2[2])) + y_ee = conf.y_base + conf.l*(math.sin(x2[0]) + math.sin(x2[0]+x2[1]) + 
math.sin(x2[0]+x2[1]+x2[2])) + + # Penalties for the ellipses representing the obstacle + ell1_pen = math.log(math.exp(conf.alpha*-(((x_ee-conf.XC1)**2)/((conf.A1/2)**2) + ((y_ee-conf.YC1)**2)/((conf.B1/2)**2) - 1.0)) + 1)/conf.alpha + ell2_pen = math.log(math.exp(conf.alpha*-(((x_ee-conf.XC2)**2)/((conf.A2/2)**2) + ((y_ee-conf.YC2)**2)/((conf.B2/2)**2) - 1.0)) + 1)/conf.alpha + ell3_pen = math.log(math.exp(conf.alpha*-(((x_ee-conf.XC3)**2)/((conf.A3/2)**2) + ((y_ee-conf.YC3)**2)/((conf.B3/2)**2) - 1.0)) + 1)/conf.alpha + + # Term pushing the agent to stay in the neighborhood of target + peak_reward = math.log(math.exp(conf.alpha2*-(math.sqrt((x_ee-conf.TARGET_STATE[0])**2 +0.1) - math.sqrt(0.1) - 0.1 + math.sqrt((y_ee-conf.TARGET_STATE[1])**2 +0.1) - math.sqrt(0.1) - 0.1)) + 1)/conf.alpha2 + + # Term penalizing the FINAL joint velocity + if x2[-1] == conf.dt*round(rand_time/conf.dt): + vel_joint = x2[3]**2 + x2[4]**2 + x2[5]**2 - 10000/conf.w_v + else: + vel_joint = 0 + + r = (conf.w_d*(-(x_ee-conf.TARGET_STATE[0])**2 -(y_ee-conf.TARGET_STATE[1])**2) + conf.w_peak*peak_reward - conf.w_v*vel_joint - conf.w_ob1*ell1_pen - conf.w_ob2*ell2_pen - conf.w_ob3*ell3_pen - conf.w_u*(u[0]**2 + u[1]**2 + u[2]**2))/100 + + return r + +# Define reward function with TensorFlow tensors needed in the actor update (to compute its derivatives with TensorFlow autodifferentiation). Batch-wise computation +def reward_tf(x2,u,BATCH_SIZE,last_ts): + + # De-normalize x2 because it is normalized if conf.NORMALIZE_INPUTS=1. (Mask trick needed because TensorFlow's autodifferentiation doesn't work if tensors' elements are directly modified by accessing them) + if conf.NORMALIZE_INPUTS: + x2_time = tf.stack([np.zeros(BATCH_SIZE), np.zeros(BATCH_SIZE), np.zeros(BATCH_SIZE), np.zeros(BATCH_SIZE), np.zeros(BATCH_SIZE), np.zeros(BATCH_SIZE), (x2[:,-1]+1)*conf.state_norm_arr[-1]/2],1) + x2_no_time = x2 * conf.state_norm_arr + mask = tf.cast(tf.stack([np.ones(BATCH_SIZE), np.ones(BATCH_SIZE), np.ones(BATCH_SIZE), np.ones(BATCH_SIZE), np.ones(BATCH_SIZE), np.ones(BATCH_SIZE), np.zeros(BATCH_SIZE)],1),tf.float32) + x2_not_norm = x2_no_time * mask + x2_time * (1 - mask) + else: + x2_not_norm = x2 + + x_ee = conf.x_base + conf.l*(tf.math.cos(x2_not_norm[:,0]) + tf.math.cos(x2_not_norm[:,0]+x2_not_norm[:,1]) + tf.math.cos(x2_not_norm[:,0]+x2_not_norm[:,1]+x2_not_norm[:,2])) + y_ee = conf.y_base + conf.l*(tf.math.sin(x2_not_norm[:,0]) + tf.math.sin(x2_not_norm[:,0]+x2_not_norm[:,1]) + tf.math.sin(x2_not_norm[:,0]+x2_not_norm[:,1]+x2_not_norm[:,2])) + + ell1_pen = tf.math.log(tf.math.exp(conf.alpha*-(((x_ee[:]-conf.XC1)**2)/((conf.A1/2)**2) + ((y_ee[:]-conf.YC1)**2)/((conf.B1/2)**2) - 1.0)) + 1)/conf.alpha + ell2_pen = tf.math.log(tf.math.exp(conf.alpha*-(((x_ee[:]-conf.XC2)**2)/((conf.A2/2)**2) + ((y_ee[:]-conf.YC2)**2)/((conf.B2/2)**2) - 1.0)) + 1)/conf.alpha + ell3_pen = tf.math.log(tf.math.exp(conf.alpha*-(((x_ee[:]-conf.XC3)**2)/((conf.A3/2)**2) + ((y_ee[:]-conf.YC3)**2)/((conf.B3/2)**2) - 1.0)) + 1)/conf.alpha + + peak_reward = tf.math.log(tf.math.exp(conf.alpha2*-(tf.math.sqrt((x_ee[:]-conf.TARGET_STATE[0])**2 +0.1) - tf.math.sqrt(0.1) - 0.1 + tf.math.sqrt((y_ee[:]-conf.TARGET_STATE[1])**2 +0.1) - tf.math.sqrt(0.1) - 0.1)) + 1)/conf.alpha2 + + vel_joint_list = [] + for i in range(BATCH_SIZE): + if last_ts[i][0] == 1.0: + vel_joint_list.append(x2_not_norm[i,3]**2 + x2_not_norm[i,4]**2 + x2_not_norm[i,5]**2 - 10000/conf.w_v) + else: + vel_joint_list.append(0) + vel_joint = tf.cast(tf.stack(vel_joint_list),tf.float32) + + r = 
(conf.w_d*(-(x_ee[:]-conf.TARGET_STATE[0])**2 -(y_ee[:]-conf.TARGET_STATE[1])**2) + conf.w_peak*peak_reward -conf.w_v*vel_joint - conf.w_ob1*ell1_pen - conf.w_ob2*ell2_pen - conf.w_ob3*ell3_pen - conf.w_u*(u[:,0]**2 + u[:,1]**2 + u[:,2]**2))/100 + + return r + +# Simulate dynamics to get next state +def simulate(dt,state,u): + + # Create robot model in Pinocchio with q_init as initial configuration and v_init as initial velocities + q_init = np.zeros(int((len(state)-1)/2)) + for state_index in range(int((len(state)-1)/2)): + q_init[state_index] = state[state_index] + q_init = q_init.T + v_init = np.zeros(int((len(state)-1)/2)) + for state_index in range(int((len(state)-1)/2)): + v_init[state_index] = state[int((len(state)-1)/2)+state_index] + v_init = v_init.T + r = load_urdf(URDF_PATH) + robot = RobotWrapper(r.model, r.collision_model, r.visual_model) + simu = RobotSimulator(robot, q_init, v_init, conf.simulation_type, conf.tau_coulomb_max) + + # Simulate control u + simu.simulate(u, conf.dt, 1) + q_next, v_next, a_next = np.copy(simu.q), np.copy(simu.v), np.copy(simu.dv) + q0_next, q1_next, q2_next = q_next[0],q_next[1],q_next[2] + v0_next, v1_next, v2_next = v_next[0],v_next[1],v_next[2] + + t_next = state[-1] + conf.dt + + return np.array([q0_next,q1_next,q2_next,v0_next,v1_next,v2_next,t_next]) + +# Simulate dynamics using tensors and compute its gradient w.r.t control. Batch-wise computation +def simulate_and_derivative_tf(state,u,BATCH_SIZE): + + q0_next, q1_next, q2_next = np.zeros(BATCH_SIZE), np.zeros(BATCH_SIZE), np.zeros(BATCH_SIZE) + v0_next, v1_next, v2_next = np.zeros(BATCH_SIZE), np.zeros(BATCH_SIZE), np.zeros(BATCH_SIZE) + Fu = np.zeros((conf.BATCH_SIZE,num_states,num_actions)) + + for sample_indx in range(BATCH_SIZE): + state_np = state[sample_indx] + + # Create robot model in Pinocchio with q_init as initial configuration + q_init = np.zeros(int((len(state_np)-1)/2)) + for state_index in range(int((len(state_np)-1)/2)): + q_init[state_index] = state_np[state_index] + q_init = q_init.T + v_init = np.zeros(int((len(state_np)-1)/2)) + for state_index in range(int((len(state_np)-1)/2)): + v_init[state_index] = state_np[int((len(state_np)-1)/2)+state_index] + v_init = v_init.T + r = load_urdf(URDF_PATH) + robot = RobotWrapper(r.model, r.collision_model, r.visual_model) + simu = RobotSimulator(robot, q_init, v_init, conf.simulation_type, conf.tau_coulomb_max) + + # Dynamics gradient w.r.t control (1st order euler) + nq = robot.nq + nv = robot.nv + nu = robot.na + nx = nq+nv + pin.computeABADerivatives(robot.model, robot.data, q_init, v_init, u[sample_indx]) + Fu_sample = np.zeros((nx, nu)) + Fu_sample[nv:, :] = robot.data.Minv + Fu_sample *= conf.dt + if conf.NORMALIZE_INPUTS: + Fu_sample *= (1/conf.state_norm_arr[3]) + + Fu[sample_indx] = np.vstack((Fu_sample, np.zeros(num_actions))) + + # Simulate control u + simu.simulate(u[sample_indx], conf.dt, 1) + q_next, v_next, = np.copy(simu.q), np.copy(simu.v) + q0_next_sample, q1_next_sample, q2_next_sample = q_next[0],q_next[1],q_next[2] + v0_next_sample, v1_next_sample, v2_next_sample = v_next[0],v_next[1],v_next[2] + q0_next[sample_indx] = q0_next_sample + q1_next[sample_indx] = q1_next_sample + q2_next[sample_indx] = q2_next_sample + v0_next[sample_indx] = v0_next_sample + v1_next[sample_indx] = v1_next_sample + v2_next[sample_indx] = v2_next_sample + + t_next = state[:,-1] + conf.dt + + if conf.NORMALIZE_INPUTS: + q0_next = q0_next / conf.state_norm_arr[0] + q1_next = q1_next / conf.state_norm_arr[1] + q2_next = q2_next / 
conf.state_norm_arr[2] + v0_next = v0_next / conf.state_norm_arr[3] + v1_next = v1_next / conf.state_norm_arr[4] + v2_next = v2_next / conf.state_norm_arr[5] + t_next = 2*t_next/conf.state_norm_arr[-1] -1 + + Fu = tf.convert_to_tensor(Fu,dtype=tf.float32) + + return tf.cast(tf.stack([q0_next,q1_next,q2_next,v0_next,v1_next,v2_next,t_next],1),dtype=tf.float32), Fu + +# Plot results from TO and episode to check consistency +def plot_results(tau0,tau1,tau2,x_TO,y_TO,x_RL,y_RL,steps,to=0): + + timesteps = conf.dt*np.arange(steps+1) + timesteps2 = conf.dt*np.arange(steps) + fig = plt.figure(figsize=(12,8)) + if to: + plt.suptitle('TO EXPLORATION: N try = {}'.format(N_try), y=1, fontsize=20) + else: + plt.suptitle('POLICY EXPLORATION: N try = {}'.format(N_try), y=1, fontsize=20) + + ax1 = fig.add_subplot(2, 2, 1) + ax1.plot(timesteps, x_TO, 'ro', linewidth=1, markersize=1) + ax1.plot(timesteps, y_TO, 'bo', linewidth=1, markersize=1) + ax1.plot(timesteps, x_RL, 'go', linewidth=1, markersize=1) + ax1.plot(timesteps, y_RL, 'ko', linewidth=1, markersize=1) + ax1.set_title('End-Effector Position',fontsize=20) + ax1.legend(["x_TO","y_TO","x_RL","y_RL"],fontsize=20) + ax1.set_xlabel('Time [s]',fontsize=20) + ax1.set_ylabel('[m]',fontsize=20) + + ax2 = fig.add_subplot(2, 2, 3) + ax2.plot(timesteps2, tau0, 'ro', linewidth=1, markersize=1) + ax2.plot(timesteps2, tau1, 'bo', linewidth=1, markersize=1) + ax2.plot(timesteps2, tau2, 'go', linewidth=1, markersize=1) + ax2.legend(['tau0','tau1','tau2'],fontsize=20) + ax2.set_xlabel('Time [s]',fontsize=20) + ax2.set_title('Controls',fontsize=20) + + ell1 = Ellipse((conf.XC1, conf.YC1), conf.A1, conf.B1, 0.0) + ell1.set_facecolor([30/255, 130/255, 76/255, 1]) + ell2 = Ellipse((conf.XC2, conf.YC2), conf.A2, conf.B2, 0.0) + ell2.set_facecolor([30/255, 130/255, 76/255, 1]) + ell3 = Ellipse((conf.XC3, conf.YC3), conf.A3, conf.B3, 0.0) + ell3.set_facecolor([30/255, 130/255, 76/255, 1]) + ax3 = fig.add_subplot(1, 2, 2) + ax3.plot(x_TO, y_TO, 'ro', linewidth=1, markersize=1) + ax3.plot(x_RL, y_RL, 'bo', linewidth=1, markersize=1) + ax3.legend(['TO','RL'],fontsize=20) + ax3.plot([x_TO[0]],[y_TO[0]],'ko',markersize=5) + ax3.add_artist(ell1) + ax3.add_artist(ell2) + ax3.add_artist(ell3) + ax3.plot([conf.TARGET_STATE[0]],[conf.TARGET_STATE[1]],'bo',markersize=5) + ax3.set_xlim(-41, 31) + ax3.set_aspect('equal', 'box') + ax3.set_title('Plane',fontsize=20) + ax3.set_xlabel('X [m]',fontsize=20) + ax3.set_ylabel('Y [m]',fontsize=20) + ax3.set_ylim(-35, 35) + + for ax in [ax1, ax2, ax3]: + ax.grid(True) + + fig.tight_layout() + plt.show() + +# Plot policy rollout from a single initial state as well as state and control trajectories +def plot_policy(tau0,tau1,tau2,x,y,steps,n_updates, diff_loc=0, PRETRAIN=0): + + timesteps = conf.dt*np.arange(steps) + fig = plt.figure(figsize=(12,8)) + plt.suptitle('POLICY: Discrete model, N try = {} N updates = {}'.format(N_try,n_updates), y=1, fontsize=20) + + ax1 = fig.add_subplot(2, 2, 1) + ax1.plot(timesteps, x, 'ro', linewidth=1, markersize=1) + ax1.plot(timesteps, y, 'bo', linewidth=1, markersize=1) + ax1.set_title('End-Effector Position',fontsize=20) + ax1.legend(["x","y"],fontsize=20) + ax1.set_xlabel('Time [s]',fontsize=20) + ax1.set_ylabel('[m]',fontsize=20) + + ax2 = fig.add_subplot(2, 2, 3) + ax2.plot(timesteps, tau0, 'ro', linewidth=1, markersize=1) + ax2.plot(timesteps, tau1, 'bo', linewidth=1, markersize=1) + ax2.plot(timesteps, tau2, 'go', linewidth=1, markersize=1) + ax2.legend(['tau0','tau1','tau2'],fontsize=20) + 
ax2.set_xlabel('Time [s]',fontsize=20) + ax2.set_title('Controls',fontsize=20) + + ell1 = Ellipse((conf.XC1, conf.YC1), conf.A1, conf.B1, 0.0) + ell1.set_facecolor([30/255, 130/255, 76/255, 1]) + ell2 = Ellipse((conf.XC2, conf.YC2), conf.A2, conf.B2, 0.0) + ell2.set_facecolor([30/255, 130/255, 76/255, 1]) + ell3 = Ellipse((conf.XC3, conf.YC3), conf.A3, conf.B3, 0.0) + ell3.set_facecolor([30/255, 130/255, 76/255, 1]) + ax3 = fig.add_subplot(1, 2, 2) + ax3.plot(x, y, 'ro', linewidth=1, markersize=1) + ax3.add_artist(ell1) + ax3.add_artist(ell2) + ax3.add_artist(ell3) + ax3.plot([conf.x_base],[3*conf.l],'ko',markersize=5) + ax3.plot([conf.TARGET_STATE[0]],[conf.TARGET_STATE[1]],'b*',markersize=10) + ax3.set_xlim([-41, 31]) + ax3.set_aspect('equal', 'box') + ax3.set_title('Plane',fontsize=20) + ax3.set_xlabel('X [m]',fontsize=20) + ax3.set_ylabel('Y [m]',fontsize=20) + ax3.set_ylim(-35, 35) + + for ax in [ax1, ax2, ax3]: + ax.grid(True) + + fig.tight_layout() + if PRETRAIN: + plt.savefig(Fig_path+'/PolicyEvaluation_Pretrain_Manipulator3DoF_3OBS_{}_{}'.format(N_try,n_updates)) + else: + if diff_loc==0: + plt.savefig(Fig_path+'/PolicyEvaluation_Manipulator3DoF_3OBS_{}_{}'.format(N_try,n_updates)) + else: + plt.savefig(Fig_path+'/Actor/PolicyEvaluation_Manipulator3DoF_3OBS_{}_{}'.format(N_try,n_updates)) + plt.clf() + plt.close(fig) + +# Plot only policy rollouts from multiple initial states +def plot_policy_eval(x_list,y_list,n_updates, diff_loc=0, PRETRAIN=0): + + fig = plt.figure(figsize=(12,8)) + plt.suptitle('POLICY: Discrete model, N try = {} N updates = {}'.format(N_try,n_updates), y=1, fontsize=20) + ell1 = Ellipse((conf.XC1, conf.YC1), conf.A1, conf.B1, 0.0) + ell1.set_facecolor([30/255, 130/255, 76/255, 1]) + ell2 = Ellipse((conf.XC2, conf.YC2), conf.A2, conf.B2, 0.0) + ell2.set_facecolor([30/255, 130/255, 76/255, 1]) + ell3 = Ellipse((conf.XC3, conf.YC3), conf.A3, conf.B3, 0.0) + ell3.set_facecolor([30/255, 130/255, 76/255, 1]) + + ax = fig.add_subplot(1, 1, 1) + for idx in range(len(x_list)): + if idx == 0: + ax.plot(x_list[idx], y_list[idx], 'ro', linewidth=1, markersize=1) + elif idx == 1: + ax.plot(x_list[idx], y_list[idx], 'bo', linewidth=1, markersize=1) + elif idx == 2: + ax.plot(x_list[idx], y_list[idx], 'go', linewidth=1, markersize=1) + elif idx == 3: + ax.plot(x_list[idx], y_list[idx], 'co', linewidth=1, markersize=1) + elif idx == 4: + ax.plot(x_list[idx], y_list[idx], 'yo', linewidth=1, markersize=1) + elif idx == 5: + ax.plot(x_list[idx], y_list[idx], color='tab:orange', marker='o', markersize=1, linestyle='None') + elif idx == 6: + ax.plot(x_list[idx], y_list[idx], color='grey', marker='o', markersize=1, linestyle='None') + elif idx == 7: + ax.plot(x_list[idx], y_list[idx], color='tab:pink', marker='o', markersize=1, linestyle='None') + elif idx == 8: + ax.plot(x_list[idx], y_list[idx], color='lime', marker='o', markersize=1, linestyle='None') + ax.plot(x_list[idx][0],y_list[idx][0],'ko',markersize=5) + ax.plot(conf.TARGET_STATE[0],conf.TARGET_STATE[1],'b*',markersize=10) + ax.add_artist(ell1) + ax.add_artist(ell2) + ax.add_artist(ell3) + ax.set_xlim([-41, 31]) + ax.set_aspect('equal', 'box') + ax.set_xlabel('X [m]',fontsize=20) + ax.set_ylabel('Y [m]',fontsize=20) + ax.set_ylim(-35, 35) + ax.grid(True) + fig.tight_layout() + if PRETRAIN: + plt.savefig(Fig_path+'/PolicyEvaluationMultiInit_Pretrain_Manipulator3DoF_3OBS_{}_{}'.format(N_try,n_updates)) + else: + if diff_loc==0: + 
plt.savefig(Fig_path+'/PolicyEvaluationMultiInit_Manipulator3DoF_3OBS_{}_{}'.format(N_try,n_updates)) + else: + plt.savefig(Fig_path+'/Actor/PolicyEvaluationMultiInit_Manipulator3DoF_3OBS_{}_{}'.format(N_try,n_updates)) + plt.clf() + plt.close(fig) + +# Plot rollout of the actor from some initial states. It generates the results and then calls plot_policy() and plot_policy_eval() +def rollout(update_step_cntr, diff_loc=0, PRETRAIN=0): + + init_states_sim = [np.array([math.pi/4,-math.pi/8,-math.pi/8,0.0,0.0,0.0,0.0]),np.array([-math.pi/4,math.pi/8,math.pi/8,0.0,0.0,0.0,0.0]),np.array([math.pi/2,0.0,0.0,0.0,0.0,0.0,0.0]),np.array([-math.pi/2,0.0,0.0,0.0,0.0,0.0,0.0]),np.array([3*math.pi/4,0.0,0.0,0.0,0.0,0.0,0.0]),np.array([-3*math.pi/4,0.0,0.0,0.0,0.0,0.0,0.0]),np.array([math.pi/4,0.0,0.0,0.0,0.0,0.0,0.0]),np.array([-math.pi/4,0.0,0.0,0.0,0.0,0.0,0.0]),np.array([math.pi,0.0,0.0,0.0,0.0,0.0,0.0])] + tau0_all_sim,tau1_all_sim,tau2_all_sim = [],[],[] + x_ee_all_sim, y_ee_all_sim = [], [] + + for k in range(len(init_states_sim)): + tau0_arr_sim,tau1_arr_sim,tau2_arr_sim = [],[],[] + x_ee_arr_sim = [conf.x_base + conf.l*(math.cos(init_states_sim[k][0]) + math.cos(init_states_sim[k][0]+init_states_sim[k][1]) + math.cos(init_states_sim[k][0]+init_states_sim[k][1]+init_states_sim[k][2]))] + y_ee_arr_sim = [conf.y_base + conf.l*(math.sin(init_states_sim[k][0]) + math.sin(init_states_sim[k][0]+init_states_sim[k][1]) + math.sin(init_states_sim[k][0]+init_states_sim[k][1]+init_states_sim[k][2]))] + prev_state_sim = np.copy(init_states_sim[k]) + episodic_reward_sim = 0 + + for i in range(conf.NSTEPS-1): + if conf.NORMALIZE_INPUTS: + prev_state_sim_norm = prev_state_sim / conf.state_norm_arr + prev_state_sim_norm[-1] = 2*prev_state_sim_norm[-1] -1 + tf_x_sim = tf.expand_dims(tf.convert_to_tensor(prev_state_sim_norm), 0) + else: + tf_x_sim = tf.expand_dims(tf.convert_to_tensor(prev_state_sim), 0) + ctrl = actor_model(tf_x_sim) + ctrl_sim = tf.squeeze(ctrl).numpy() + next_state_sim = simulate(conf.dt,prev_state_sim,ctrl_sim) + rwrd_sim = reward(next_state_sim,ctrl_sim) + episodic_reward_sim += rwrd_sim + tau0_arr_sim.append(ctrl_sim[0]) + tau1_arr_sim.append(ctrl_sim[1]) + tau2_arr_sim.append(ctrl_sim[2]) + x_ee_arr_sim.append(conf.x_base + conf.l*(math.cos(next_state_sim[0]) + math.cos(next_state_sim[0]+next_state_sim[1]) + math.cos(next_state_sim[0]+next_state_sim[1]+next_state_sim[2]))) + y_ee_arr_sim.append(conf.y_base + conf.l*(math.sin(next_state_sim[0]) + math.sin(next_state_sim[0]+next_state_sim[1]) + math.sin(next_state_sim[0]+next_state_sim[1]+next_state_sim[2]))) + prev_state_sim = np.copy(next_state_sim) + + if i==conf.NSTEPS-2: + if conf.NORMALIZE_INPUTS: + prev_state_sim_norm = prev_state_sim / conf.state_norm_arr + prev_state_sim_norm[-1] = 2*prev_state_sim_norm[-1] -1 + tf_x_sim = tf.expand_dims(tf.convert_to_tensor(prev_state_sim_norm), 0) + else: + tf_x_sim = tf.expand_dims(tf.convert_to_tensor(prev_state_sim), 0) + ctrl = actor_model(tf_x_sim) + ctrl_sim = tf.squeeze(ctrl).numpy() + tau0_arr_sim.append(ctrl_sim[0]) + tau1_arr_sim.append(ctrl_sim[1]) + tau2_arr_sim.append(ctrl_sim[2]) + if k==2: + plot_policy(tau0_arr_sim,tau1_arr_sim,tau2_arr_sim,x_ee_arr_sim,y_ee_arr_sim,conf.NSTEPS,update_step_cntr, diff_loc=diff_loc, PRETRAIN=PRETRAIN) + print("N try = {}: Simulation Return @ N updates = {} ==> {}".format(N_try,update_step_cntr,episodic_reward_sim)) + + tau0_all_sim.append(np.copy(tau0_arr_sim)) + tau1_all_sim.append(np.copy(tau1_arr_sim)) + tau2_all_sim.append(np.copy(tau2_arr_sim)) 
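+ # Also collect the end-effector trajectory of this initial state, so that plot_policy_eval() below can overlay the rollouts from all initial states in a single plane plot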
+ x_ee_all_sim.append(np.copy(x_ee_arr_sim)) + y_ee_all_sim.append(np.copy(y_ee_arr_sim)) + + plot_policy_eval(x_ee_all_sim,y_ee_all_sim,update_step_cntr, diff_loc=diff_loc, PRETRAIN=PRETRAIN) + + return tau0_all_sim, tau1_all_sim, tau2_all_sim, x_ee_all_sim, y_ee_all_sim + +# Plot returns (not so meaningful given that the initial state, so also the time horizon, of each episode is randomized) +def plot_Return(): + fig = plt.figure(figsize=(15,8)) + ax = fig.add_subplot(1, 1, 1) + ax.plot(ep_reward_list) + ax.set_xlabel("Episode") + ax.set_ylabel("Return") + ax.set_title("N_try = {}".format(N_try)) + ax.grid(True) + plt.savefig(Fig_path+'/EpReturn_Manipulator3DoF_{}'.format(N_try)) + plt.close() + +# Plot average return considering 40 episodes +def plot_AvgReturn(): + fig = plt.figure(figsize=(15,8)) + ax = fig.add_subplot(1, 1, 1) + ax.plot(avg_reward_list) + ax.set_xlabel("Episode") + ax.set_ylabel("Avg. Return") + ax.set_title("N_try = {}".format(N_try)) + ax.grid(True) + plt.savefig(Fig_path+'/AvgReturn_Manipulator3DoF_{}'.format(N_try)) + plt.close() + + +# To run TF on CPU rather than GPU (seems faster since the NNs are small and some gradients are computed with Pinocchio on CPU --> bottleneck = communication CPU-GPU?) +# os.environ["CUDA_VISIBLE_DEVICES"]="-1" +tf.config.experimental.list_physical_devices('GPU') + +tf.random.set_seed(123) # Set seed for reproducibility + +# Set the ticklabel font size globally +plt.rcParams['xtick.labelsize'] = 22 +plt.rcParams['ytick.labelsize'] = 22 + +URDF_PATH = "urdf/planar_manipulator_3dof.urdf" # Path to the urdf file + +N_try = 1 # Number of test run +Fig_path = './Results/Figures/N_try_{}'.format(N_try) +NNs_path = './Results/NNs/N_try_{}'.format(N_try) +Config_path = './Results/Configs/' + +# Create folders to store the results and the trained NNs +try: + os.makedirs(Fig_path) +except: + print("N try = {} Figures folder already existing".format(N_try)) + pass +try: + os.makedirs(Fig_path+'/Actor') +except: + print("N try = {} Actor folder already existing".format(N_try)) + pass +try: + os.makedirs(NNs_path) +except: + print("N try = {} NNs folder already existing".format(N_try)) + pass +try: + os.makedirs(Config_path) +except: + print("N try = {} Configs folder already existing".format(N_try)) + pass + +num_states = 7 # Number of states +num_actions = 3 # Number of actions + +if __name__ == "__main__": + + # Create actor, critic and target NNs + actor_model = get_actor() + critic_model = get_critic() + target_critic = get_critic() + + # Set initial weights of targets equal to those of actor and critic. 
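+# (the target critic is then pulled towards the online critic by the soft update in update_target(): each target weight moves a fraction conf.UPDATE_RATE of the way towards the corresponding critic weight, i.e. w_target <- conf.UPDATE_RATE*w_critic + (1 - conf.UPDATE_RATE)*w_target.)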
Comment if recovering from a stopped training + target_critic.set_weights(critic_model.get_weights()) + + # Uncomment if recovering from a stopped training + # update_step_counter = 102400 + # NNs_path+"/Manipulator_{}.h5".format(update_step_counter)) + # actor_model.load_weights(NNs_path+"/Manipulator_{}.h5".format(update_step_counter)) + # critic_model.load_weights(NNs_path+"/Manipulator_critic{}.h5".format(update_step_counter)) + # target_critic.load_weights(NNs_path+"/Manipulator_target_critic{}.h5".format(update_step_counter)) + + # Set optimizer specifying the learning rates + if conf.LR_SCHEDULE: + CRITIC_LR_SCHEDULE = tf.keras.optimizers.schedules.PiecewiseConstantDecay(conf.boundaries_schedule_LR_C, conf.values_schedule_LR_C) # Piecewise constant decay schedule + ACTOR_LR_SCHEDULE = tf.keras.optimizers.schedules.PiecewiseConstantDecay(conf.boundaries_schedule_LR_A, conf.values_schedule_LR_A) + critic_optimizer = tf.keras.optimizers.Adam(CRITIC_LR_SCHEDULE) + actor_optimizer = tf.keras.optimizers.Adam(ACTOR_LR_SCHEDULE) + else: + critic_optimizer = tf.keras.optimizers.Adam(conf.CRITIC_LEARNING_RATE) + actor_optimizer = tf.keras.optimizers.Adam(conf.ACTOR_LEARNING_RATE) + + # Save config file + f=open(Config_path+'/Manipulator_config{}.txt'.format(N_try), 'w') + f.write("conf.NEPISODES = {}, conf.NSTEPS = {}, conf.CRITIC_LEARNING_RATE = {}, conf.ACTOR_LEARNING_RATE = {}, conf.UPDATE_RATE = {}, conf.REPLAY_SIZE = {}, conf.BATCH_SIZE = {}, conf.NH1 = {}, conf.NH2 = {}, conf.dt = {}".format(conf.NEPISODES,conf.NSTEPS,conf.CRITIC_LEARNING_RATE,conf.ACTOR_LEARNING_RATE,conf.UPDATE_RATE,conf.REPLAY_SIZE,conf.BATCH_SIZE,conf.NH1,conf.NH2,conf.dt)+ + "\n"+str(conf.UPDATE_LOOPS)+" updates every {} episodes".format(conf.EP_UPDATE)+ + "\n\nReward = ({}*(-(x_ee-conf.TARGET_STATE[0])**2 -(y_ee-conf.TARGET_STATE[1])**2) + {}*peak_reward - {}*vel_joint - {}*ell1_pen - {}*ell2_pen - {}*ell3_pen - {}*(u[0]**2 + u[1]**2 + u[2]**2))/100, vel_joint = x2[3]**2 + x2[4]**2 + x2[5]**2 - 10000/{} if final step else 0, peak reward = math.log(math.exp({}*-(x_err-0.1 + y_err-0.1)) + 1)/{}, x_err = math.sqrt((x_ee-conf.TARGET_STATE[0])**2 +0.1) - math.sqrt(0.1), y_err = math.sqrt((y_ee-conf.TARGET_STATE[1])**2 +0.1) - math.sqrt(0.1), ell_pen = log(exp({}*-(((x_ee-XC)**2)/((a/2)**2) + ((y_ee-YC)**2)/((conf.B1/2)**2) - 1.0)) + 1)/{}".format(conf.w_d,conf.w_peak,conf.w_v,conf.w_ob1,conf.w_ob2,conf.w_ob3,conf.w_u,conf.w_v,conf.alpha2,conf.alpha2,conf.alpha,conf.alpha)+ + "\n\nBase = [{},{}]".format(conf.x_base,conf.y_base)+", target (conf.TARGET_STATE) = "+str(conf.TARGET_STATE)+ + "\nPrioritized_replay_alpha = "+str(conf.prioritized_replay_alpha)+", conf.prioritized_replay_beta0 = "+str(conf.prioritized_replay_beta0)+", conf.prioritized_replay_eps = "+str(conf.prioritized_replay_eps)+ + "\nActor: kernel_and_bias_regularizer = l1_l2({}), Critic: kernel_and_bias_regularizer = l1_l2({}) (in each layer)".format(conf.wreg_l1_A,conf.wreg_l2_A,conf.wreg_l1_C,conf.wreg_l2_C)+ + "\nScheduled step LR decay = {}: critic values = {} and boundaries = {}, policy values = {} and boundaries = {}".format(conf.LR_SCHEDULE,conf.values_schedule_LR_C,conf.boundaries_schedule_LR_C,conf.values_schedule_LR_A,conf.boundaries_schedule_LR_A)+ + "\nRandom initial state -> [uniform(-pi,pi), uniform(-pi,pi), uniform(-pi,pi), uniform(-pi/4,pi/4), uniform(-pi/4,pi/4), uniform(-pi/4,pi/4),uniform(0,(NSTEPS_SH-1)*conf.dt)"+ + "\nNormalized inputs = {}, q by {} and qdot by 
{}".format(conf.NORMALIZE_INPUTS,conf.state_norm_arr[0],conf.state_norm_arr[3])+ + "\nEpisodes of critic pretraining = {}".format(conf.EPISODE_CRITIC_PRETRAINING)+ + "\nn-step TD = {} with {} lookahead steps".format(conf.TD_N, conf.nsteps_TD_N)) + f.close() + + # Create training instance and an empty (prioritized) replay buffer + training = Training(conf.BATCH_SIZE) + prioritized_buffer = PrioritizedReplayBuffer(conf.REPLAY_SIZE, alpha=conf.prioritized_replay_alpha) + + # Lists to store the reward history of each episode and the average reward history of last few episodes + ep_reward_list = [] + avg_reward_list = [] + + # Initialize the counter of the updates. Comment if recovering from a stopped training + update_step_counter = 0 + + # START TRAINING + # If recovering from a stopped training, starting episode must be ((update_step_counter/conf.UPDATE_LOOPS)*conf.EP_UPDATE)+1 + for ep in range(conf.NEPISODES): + + TO = 0 # Flag to indicate if the TO problem has been solved + DONE = 0 # Flag indicating if the episode has terminated + ep_return = 0 # Initialize the return + step_counter = 0 # Initialize the counter of episode steps + + # START TO PROBLEM + while TO==0: + + # Randomize initial state + rand_time = random.uniform(0,(conf.NSTEPS-1)*conf.dt) + prev_state = np.array([random.uniform(-math.pi,math.pi), random.uniform(-math.pi,math.pi), random.uniform(-math.pi,math.pi), + random.uniform(-math.pi/4,math.pi/4), random.uniform(-math.pi/4,math.pi/4), random.uniform(-math.pi/4,math.pi/4), + conf.dt*round(rand_time/conf.dt)]) + if conf.NORMALIZE_INPUTS: + prev_state_norm = prev_state / conf.state_norm_arr + prev_state_norm[-1] = 2*prev_state_norm[-1] - 1 + tf_prev_state = tf.expand_dims(tf.convert_to_tensor(prev_state_norm), 0) + else: + tf_prev_state = tf.expand_dims(tf.convert_to_tensor(prev_state), 0) + + # Set the horizon of TO problem / RL episode + NSTEPS_SH = conf.NSTEPS - int(round(rand_time/conf.dt)) + + # Lists to store TO state and control trajectories + tau0_arr,tau1_arr,tau2_arr = [],[],[] + q0_arr, q1_arr, q2_arr = [prev_state[0]], [prev_state[1]], [prev_state[2]] + v0_arr, v1_arr, v2_arr = [prev_state[3]], [prev_state[4]], [prev_state[5]] + t_arr = [prev_state[-1]] + x_ee_arr = [conf.x_base + conf.l*(math.cos(prev_state[0]) + math.cos(prev_state[0]+prev_state[1]) + math.cos(prev_state[0]+prev_state[1]+prev_state[2]))] + y_ee_arr = [conf.y_base + conf.l*(math.sin(prev_state[0]) + math.sin(prev_state[0]+prev_state[1]) + math.sin(prev_state[0]+prev_state[1]+prev_state[2]))] + + # Actor rollout used to initialize TO state and control variables + init_TO_states = np.zeros((num_states, NSTEPS_SH+1)) + init_TO_states[0][0] = prev_state[0] + init_TO_states[1][0] = prev_state[1] + init_TO_states[2][0] = prev_state[2] + init_TO_states[3][0] = prev_state[3] + init_TO_states[4][0] = prev_state[4] + init_TO_states[5][0] = prev_state[5] + init_TO_controls = np.zeros((num_actions, NSTEPS_SH+1)) + init_TO_controls[0][0] = tf.squeeze(actor_model(tf_prev_state)).numpy()[0] + init_TO_controls[1][0] = tf.squeeze(actor_model(tf_prev_state)).numpy()[1] + init_TO_controls[2][0] = tf.squeeze(actor_model(tf_prev_state)).numpy()[2] + init_prev_state = np.copy(prev_state) + + # Simulate actor's actions to compute the state trajectory used to initialize TO state variables + for i in range(1, NSTEPS_SH+1): + init_next_state = simulate(conf.dt,init_prev_state,np.array([init_TO_controls[0][i-1],init_TO_controls[1][i-1],init_TO_controls[2][i-1]])) + init_TO_states[0][i] = init_next_state[0] + 
init_TO_states[1][i] = init_next_state[1] + init_TO_states[2][i] = init_next_state[2] + init_TO_states[3][i] = init_next_state[3] + init_TO_states[4][i] = init_next_state[4] + init_TO_states[5][i] = init_next_state[5] + if conf.NORMALIZE_INPUTS: + init_next_state_norm = init_next_state / conf.state_norm_arr + init_next_state_norm[-1] = 2*init_next_state_norm[-1] - 1 + init_tf_next_state = tf.expand_dims(tf.convert_to_tensor(init_next_state_norm), 0) + else: + init_tf_next_state = tf.expand_dims(tf.convert_to_tensor(init_next_state), 0) + init_TO_controls[0][i] = tf.squeeze(actor_model(init_tf_next_state)).numpy()[0] + init_TO_controls[1][i] = tf.squeeze(actor_model(init_tf_next_state)).numpy()[1] + init_TO_controls[2][i] = tf.squeeze(actor_model(init_tf_next_state)).numpy()[2] + init_prev_state = np.copy(init_next_state) + + # Create TO problem + if ep < conf.EPISODE_CRITIC_PRETRAINING or ep < conf.EPISODE_ICS_INIT: + TO_mdl = TO_manipulator(prev_state, init_q0_ICS, init_q1_ICS, init_q2_ICS, init_v0_ICS, init_v1_ICS, init_v2_ICS, init_0, init_0, init_0, init_0, init_0, init_0, NSTEPS_SH) + else: + if ep == conf.EPISODE_ICS_INIT and conf.LR_SCHEDULE: + # Re-initialize Adam otherwise it keeps being affected by the estimates of first-order and second-order moments computed previously with ICS warm-starting + critic_optimizer = tf.keras.optimizers.Adam(CRITIC_LR_SCHEDULE) + actor_optimizer = tf.keras.optimizers.Adam(ACTOR_LR_SCHEDULE) + TO_mdl = TO_manipulator(prev_state, init_q0, init_q1, init_q2, init_v0, init_v1, init_v2, init_0, init_0, init_0, init_tau0, init_tau1, init_tau2, NSTEPS_SH, init_TO = [init_TO_controls, init_TO_states]) + + # Indexes of TO variables + K = np.array([k for k in TO_mdl.k]) + + # Select solver + solver = SolverFactory('ipopt') + solver.options['linear_solver'] = "ma57" + + # Try to solve TO problem + try: + results = solver.solve(TO_mdl) + if str(results.solver.termination_condition) == "optimal": + #Retrieve control trajectory + tau0_TO = [TO_mdl.tau0[k]() for k in K] + tau1_TO = [TO_mdl.tau1[k]() for k in K] + tau2_TO = [TO_mdl.tau2[k]() for k in K] + TO = 1 + else: + print('TO solution not optimal') + raise Exception() + except: + print("*** TO failed ***") + + # Plot TO solution + # plot_results_TO(TO_mdl) + + ep_arr = [] + + # START RL EPISODE + while True: + # Get current and next TO actions + action, next_TO_action = get_TO_actions(step_counter, action0_TO=tau0_TO, action1_TO=tau1_TO, action2_TO=tau2_TO) + + # Simulate actions and retrieve next state + next_state = simulate(conf.dt,prev_state,action) + + # Store performed action and next state + tau0_arr.append(action[0]) + tau1_arr.append(action[1]) + tau2_arr.append(action[2]) + q0_arr.append(next_state[0]) + q1_arr.append(next_state[1]) + q2_arr.append(next_state[2]) + v0_arr.append(next_state[3]) + v1_arr.append(next_state[4]) + v2_arr.append(next_state[5]) + t_arr.append(next_state[-1]) + x_ee_arr.append(conf.x_base + conf.l*(math.cos(q0_arr[-1]) + math.cos(q0_arr[-1]+q1_arr[-1]) + math.cos(q0_arr[-1]+q1_arr[-1]+q2_arr[-1]))) + y_ee_arr.append(conf.y_base + conf.l*(math.sin(q0_arr[-1]) + math.sin(q0_arr[-1]+q1_arr[-1]) + math.sin(q0_arr[-1]+q1_arr[-1]+q2_arr[-1]))) + + # Compute reward + rwrd = reward(next_state, action) + ep_arr.append(rwrd) + + if step_counter==NSTEPS_SH-1: + DONE = 1 + + # Store transition if you want to use 1-step TD + if conf.TD_N==0: + prioritized_buffer.add(prev_state, next_TO_action, action, rwrd, next_state, float(DONE)) + + ep_return += rwrd # Increment the episodic return by 
the reward just recived + step_counter += 1 + prev_state = np.copy(next_state) # Next state becomes the prev state at the next episode step + + # End episode + if DONE == 1: + break + + # Plot the state and control trajectories of this episode + # plot_results(tau0_arr,tau1_arr,tau2_arr,x_ee,y_ee,x_ee_arr,y_ee_arr,NSTEPS_SH,to=TO) + + # Store transition after computing the (partial) cost-to go when using n-step TD (up to Monte Carlo) + if conf.TD_N: + DONE = 0 + cost_to_go_arr = [] + for i in range(len(ep_arr)): + final_i = min(i+conf.nsteps_TD_N,len(ep_arr)) + if final_i == len(ep_arr): + V_final = 0.0 + else: + if conf.NORMALIZE_INPUTS: + next_state_rollout = np.array([q0_arr[final_i],q1_arr[final_i],q2_arr[final_i],v0_arr[final_i],v1_arr[final_i],v2_arr[final_i],t_arr[final_i]]) / conf.state_norm_arr + next_state_rollout[-1] = 2*next_state_rollout[-1] - 1 + else: + next_state_rollout = np.array([q0_arr[final_i],q1_arr[final_i],q2_arr[final_i],v0_arr[final_i],v1_arr[final_i],v2_arr[final_i],t_arr[final_i]]) / conf.state_norm_arr + tf_next_state_rollout = tf.expand_dims(tf.convert_to_tensor(next_state_rollout), 0) + V_final = target_critic(tf_next_state_rollout, training=False).numpy()[0][0] + cost_to_go = sum(ep_arr[i:final_i+1]) + V_final + cost_to_go_arr.append(np.float32(cost_to_go)) + if i == len(ep_arr)-1: + DONE = 1 + prioritized_buffer.add(np.array([q0_arr[i],q1_arr[i],q2_arr[i],v0_arr[i],v1_arr[i],v2_arr[i],t_arr[i]]), next_TO_action, action, cost_to_go_arr[i], np.array([0.0,0.0,0.0,0.0,0.0,0.0,0.0]), float(DONE)) + + # Update the NNs + for i in range(conf.UPDATE_LOOPS): + training.learn(ep, prioritized_buffer) # Update critic and actor + update_target(target_critic.variables, critic_model.variables, conf.UPDATE_RATE) # Update target critic + update_step_counter += 1 + + # Plot rollouts every 0.5% of the training (saved in a separate folder) + if ep>=conf.ep_no_update and ep%int((conf.NEPISODES-conf.ep_no_update)/200)==0: + _, _, _, _, _ = rollout(update_step_counter, diff_loc=1) + + # Plot rollouts and save the NNs every 5% of the training + if ep>=conf.ep_no_update and int((conf.NEPISODES-conf.ep_no_update)/20)!=1 and int((conf.NEPISODES-conf.ep_no_update)/20)!=0 and ep%int((conf.NEPISODES-conf.ep_no_update)/20)==0: + _, _, _, _, _ = rollout(update_step_counter) + actor_model.save_weights(NNs_path+"/Manipulator_{}.h5".format(update_step_counter)) + critic_model.save_weights(NNs_path+"/Manipulator_critic_{}.h5".format(update_step_counter)) + target_critic.save_weights(NNs_path+"/Manipulator_target_critic_{}.h5".format(update_step_counter)) + + # ep_reward_list.append(ep_return) + # avg_reward = np.mean(ep_reward_list[-40:]) # Mean of last 40 episodes + # avg_reward_list.append(avg_reward) + + print("Episode {} ---> Return = {}".format(ep, ep_return)) + + # Plot returns + # plot_AvgReturn() + # plot_Return() + + # Save networks at the end of the training + actor_model.save_weights(NNs_path+"/Manipulator_actor_final.h5") + critic_model.save_weights(NNs_path+"/Manipulator_critic_final.h5") + target_critic.save_weights(NNs_path+"/Manipulator_target_critic_final.h5") + + # Simulate the final policy + tau0_all_final_sim, tau1_all_final_sim, tau2_all_final_sim, x_ee_all_final_sim, y_ee_all_final_sim = rollout(update_step_counter) \ No newline at end of file diff --git a/TO_manipulator3DoF_pyomo.py b/TO_manipulator3DoF_pyomo.py new file mode 100644 index 0000000..ef023f7 --- /dev/null +++ b/TO_manipulator3DoF_pyomo.py @@ -0,0 +1,538 @@ +import tensorflow as tf +from tensorflow.keras 
import layers, regularizers +from pyomo.environ import * +import numpy as np +import random +import matplotlib as mpl +import matplotlib.pyplot as plt +from matplotlib.patches import Ellipse +from math import pi +import math +from dynamics_manipulator3DoF import RobotWrapper, RobotSimulator, load_urdf +import config_manipulator3DoF_pyomo as conf +from inits import init_tau0,init_tau1,init_tau2,init_q0,init_q1,init_q2,init_v0,init_v1,init_v2,init_q0_ICS,init_q1_ICS,init_q2_ICS,init_v0_ICS,init_v1_ICS,init_v2_ICS,init_0 + +# Plot EE trajectories of all TO problems related to the set of initial states considered +def plot_results_TO_all(q0_all,q1_all,x_init_all_sim,y_init_all_sim,x_all_sim,y_all_sim): + fig = plt.figure(figsize=(12,8)) + ell1 = Ellipse((conf.XC1, conf.YC1), conf.A1, conf.B1, 0.0) + ell1.set_facecolor([30/255, 130/255, 76/255, 1]) + ell2 = Ellipse((conf.XC2, conf.YC2), conf.A2, conf.B2, 0.0) + ell2.set_facecolor([30/255, 130/255, 76/255, 1]) + ell3 = Ellipse((conf.XC3, conf.YC3), conf.A3, conf.B3, 0.0) + ell3.set_facecolor([30/255, 130/255, 76/255, 1]) + + K = range(len(q0_all)) + q0,q1,x1,x2,y1,y2 = [],[],[],[],[],[] + for i in K: + q0.append(q0_all[i][0]) + q1.append(q1_all[i][0]) + + x0 = -7.0 + y0 = 0.0 + for k in K: + x1.append(x0 + conf.l*cos(q0[k])) + x2.append(x0 + conf.l*(cos(q0[k]) + cos(q0[k]+q1[k]))) + y1.append(conf.l*sin(q0[k])) + y2.append(conf.l*(sin(q0[k]) + sin(q0[k]+q1[k]))) + + ax = fig.add_subplot(1, 1, 1) + for idx in range(len(x_all_sim)): + ax.plot(x_all_sim[idx], y_all_sim[idx], 'ro', linewidth=1, markersize=1) + init_states, = ax.plot(-7,0,'tab:orange',marker = 'o', ms = 15, mec = 'r') + ax.plot(x_init_all_sim[idx], y_init_all_sim[idx], 'mo', linewidth=1, markersize=1, alpha=0.4) + targ_state, = ax.plot(conf.x_des,conf.y_des,'k*',markersize=15) + ax.legend([init_states, targ_state], ['Base', 'Target point EE'], fontsize=20, loc='lower right') + ax.add_artist(ell1) + ax.add_artist(ell2) + ax.add_artist(ell3) + for i in K: + ax.plot([x0,x1[i],x2[i]],[y0,y1[i],y2[i]],'b',marker = 'o', ms = 5, mec = 'r', linewidth=1, alpha=0.8) #xp, yp, 'r--', + ax.plot([x2[i],x_all_sim[i][0]],[y2[i],y_all_sim[i][0]],'b', linewidth=1, alpha=0.8) + if abs(x_all_sim[i][0]-x2[i]) >= 1e-5: + angle = math.degrees(math.atan((y_all_sim[i][0]-y2[i])/(x_all_sim[i][0]-x2[i]))) + elif y_all_sim[i][0]>0: + angle = -90 + else: + angle = 90 + if angle<=1e5 and angle>=-1e5 and x_all_sim[i][0]<0: + angle+=180 + t = mpl.markers.MarkerStyle(marker='$\sqsubset$') + t._transform = t.get_transform().rotate_deg(angle) + ax.plot(x_all_sim[i][0],y_all_sim[i][0],'b', marker=t, ms = 15, linewidth=1) + ax.set_xlim([-41, 31]) + ax.set_aspect('equal', 'box') + ax.tick_params(axis='x') + ax.tick_params(axis='y') + ax.set_xlabel('X [m]',fontsize=20) + ax.set_ylabel('Y [m]',fontsize=20) + ax.set_ylim(-35, 35) + ax.grid(True) + + fig.tight_layout() + plt.show() + +# Plot trajectories of joint angles and velocities of all TO problems related to the set of initial states considered +def plot_results_TO_all_trajs(q0_all,q1_all,q2_all,v0_all,v1_all,v2_all,tau0_all,tau1_all,tau2_all): + fig = plt.figure(figsize=(12,10)) + + K = range(len(q0_all[0])) + time = [conf.dt*k for k in K] + + ax2 = fig.add_subplot(3, 1, 1) + for idx in range(len(q0_all)): + ax2.plot(time, q0_all[idx], 'r', linewidth=1, markersize=1) + ax2.plot(time, q1_all[idx], 'b', linewidth=1, markersize=1) + ax2.plot(time, q2_all[idx], 'g', linewidth=1, markersize=1) + ax2.legend(['q0','q1','q2'],fontsize=16) + ax2.set_ylabel('[rad]',fontsize=20) 
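+ # The two panels below show the corresponding joint velocities [rad/s] and joint torques [Nm] for the same set of TO solutions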
+ ax2.grid(True) + + ax3 = fig.add_subplot(3, 1, 2) + for idx in range(len(q0_all)): + ax3.plot(time, v0_all[idx], 'r', linewidth=1, markersize=1) + ax3.plot(time, v1_all[idx], 'b', linewidth=1, markersize=1) + ax3.plot(time, v2_all[idx], 'g', linewidth=1, markersize=1) + ax3.legend(['v0','v1','v2'],fontsize=16) + ax3.set_ylabel('[rad/s]',fontsize=20) + ax3.grid(True) + + ax4 = fig.add_subplot(3, 1, 3) + for idx in range(len(q0_all)): + ax4.plot(time, tau0_all[idx], 'r', linewidth=1, markersize=1) + ax4.plot(time, tau1_all[idx], 'b', linewidth=1, markersize=1) + ax4.plot(time, tau2_all[idx], 'g', linewidth=1, markersize=1) + ax4.legend(['tau0','tau1','tau2'],fontsize=16) + ax4.set_ylabel('[Nm]',fontsize=20) + ax4.set_xlabel('[s]',fontsize=20) + ax4.grid(True) + + fig.tight_layout() + plt.show() + +# Plot result of one TO problem +def plot_results_TO(m): + K = np.array([k for k in m.k]) + tau0 = [m.tau0[k]() for k in K] + tau1 = [m.tau1[k]() for k in K] + tau2 = [m.tau2[k]() for k in K] + x_ee = [(-7 + conf.l*(cos(m.q0[k]) + cos(m.q0[k]+m.q1[k]) + cos(m.q0[k]+m.q1[k]+m.q2[k])))() for k in K] + y_ee = [(conf.l*(sin(m.q0[k]) + sin(m.q0[k]+m.q1[k]) + sin(m.q0[k]+m.q1[k]+m.q2[k])))() for k in K] + q0 = [m.q0[k]() for k in K] + q1 = [m.q1[k]() for k in K] + q2 = [m.q2[k]() for k in K] + v0 = [m.v0[k]() for k in K] + v1 = [m.v1[k]() for k in K] + v2 = [m.v2[k]() for k in K] + + x0 = -7.0 + x1 = [x0 + conf.l*cos(q0[k]) for k in K] + x2 = [x0 + conf.l*(cos(q0[k]) + cos(q0[k]+q1[k])) for k in K] + y0 = 0.0 + y1 = [conf.l*sin(q0[k]) for k in K] + y2 = [conf.l*(sin(q0[k]) + sin(q0[k]+q1[k])) for k in K] + + cost=m.obj() + + fig = plt.figure(figsize=(12,10)) + plt.suptitle('Discrete model', y=1, fontsize=16) + + ax1 = fig.add_subplot(4, 2, 1) + ax1.plot(conf.dt*K, x_ee, 'ro', linewidth=1, markersize=1) + ax1.plot(conf.dt*K, y_ee, 'bo', linewidth=1, markersize=1) + ax1.set_title('Position',fontsize=16) + ax1.legend(['x','y'],fontsize=16) + ax1.set_ylim(-31, 31) + ax1.grid(True) + + ax2 = fig.add_subplot(4, 2, 3) + ax2.plot(conf.dt*K, q0, 'ro', linewidth=1, markersize=1) + ax2.plot(conf.dt*K, q1, 'bo', linewidth=1, markersize=1) + ax2.plot(conf.dt*K, q2, 'go', linewidth=1, markersize=1) + ax2.set_title('Joint position',fontsize=16) + ax2.legend(['q0','q1','q2'],fontsize=16) + ax2.grid(True) + + ax3 = fig.add_subplot(4, 2, 5) + ax3.plot(conf.dt*K, v0, 'ro', linewidth=1, markersize=1) + ax3.plot(conf.dt*K, v1, 'bo', linewidth=1, markersize=1) + ax3.plot(conf.dt*K, v2, 'go', linewidth=1, markersize=1) + ax3.set_title('Joint velocity',fontsize=16) + ax3.legend(['v0','v1','v2'],fontsize=16) + ax3.grid(True) + + ax4 = fig.add_subplot(4, 2, 7) + ax4.plot(conf.dt*K, tau0, 'ro', linewidth=1, markersize=1) + ax4.plot(conf.dt*K, tau1, 'bo', linewidth=1, markersize=1) + ax4.plot(conf.dt*K, tau2, 'go', linewidth=1, markersize=1) + ax4.legend(['tau0','tau1','tau2'],fontsize=16) + ax4.set_title('Controls',fontsize=16) + + ell1 = Ellipse((conf.XC1, conf.YC1), conf.A1, conf.B1, 0.0) + ell1.set_facecolor([30/255, 130/255, 76/255, 1]) + ell2 = Ellipse((conf.XC2, conf.YC2), conf.A2, conf.B2, 0.0) + ell2.set_facecolor([30/255, 130/255, 76/255, 1]) + ell3 = Ellipse((conf.XC3, conf.YC3), conf.A3, conf.B3, 0.0) + ell3.set_facecolor([30/255, 130/255, 76/255, 1]) + ax3 = fig.add_subplot(1, 2, 2) + ax3.plot(x_ee, y_ee, 'ro', linewidth=1, markersize=2) + N_points_conf = 3 + for g in range(N_points_conf): + index = int(g*K[-1]/(N_points_conf-1)) + if g==0: + 
ax3.plot([x0,x1[index],x2[index],x_ee[index]],[y0,y1[index],y2[index],y_ee[index]],'b',marker = 'o', ms = 5, mec = 'r', linewidth=1, alpha=0.8) #xp, yp, 'r--', + elif g==1: + ax3.plot([x0,x1[index],x2[index],x_ee[index]],[y0,y1[index],y2[index],y_ee[index]],'dimgrey',marker = 'o', ms = 5, mec = 'r', linewidth=1, alpha=0.8) #xp, yp, 'r--', + else: + ax3.plot([x0,x1[index],x2[index],x_ee[index]],[y0,y1[index],y2[index],y_ee[index]],'orange',marker = 'o', ms = 5, mec = 'r', linewidth=1, alpha=0.8) #xp, yp, 'r--', + ax3.add_artist(ell1) + ax3.add_artist(ell2) + ax3.add_artist(ell3) + ax3.set_xlim(-41, 31) + ax3.set_aspect('equal', 'box') + ax3.set_title("Plane, OBJ = {}".format(cost),fontsize=16) + ax3.set_xlabel('X',fontsize=16) + ax3.set_ylabel('Y',fontsize=16) + ax3.set_ylim(-35, 35) + + + for ax in [ax1, ax2, ax3]: + ax.grid(True) + + fig.tight_layout() + plt.show() + +# Create TO problem +def TO_manipulator(ICS, init_q0, init_q1, init_q2, init_v0, init_v1, init_v2, init_a0, init_a1, init_a2, init_tau0, init_tau1, init_tau2, N, init_TO=None): + m = ConcreteModel() + m.k = RangeSet(0, N) + + if init_TO != None: + init_TO_controls = init_TO[0] + init_TO_states = init_TO[1] + m.tau0 = Var(m.k, initialize=init_tau0(m,m.k,init_TO_controls), bounds=(-conf.tau_upper_bound, conf.tau_upper_bound)) + m.tau1 = Var(m.k, initialize=init_tau1(m,m.k,init_TO_controls), bounds=(-conf.tau_upper_bound, conf.tau_upper_bound)) + m.tau2 = Var(m.k, initialize=init_tau2(m,m.k,init_TO_controls), bounds=(-conf.tau_upper_bound, conf.tau_upper_bound)) + m.q0 = Var(m.k, initialize=init_q0(m,m.k,init_TO_states)) + m.q1 = Var(m.k, initialize=init_q1(m,m.k,init_TO_states)) + m.q2 = Var(m.k, initialize=init_q2(m,m.k,init_TO_states)) + m.v0 = Var(m.k, initialize=init_v0(m,m.k,init_TO_states)) + m.v1 = Var(m.k, initialize=init_v1(m,m.k,init_TO_states)) + m.v2 = Var(m.k, initialize=init_v2(m,m.k,init_TO_states)) + else: + m.tau0 = Var(m.k, initialize=init_tau0(m,m.k), bounds=(-conf.tau_upper_bound, conf.tau_upper_bound)) + m.tau1 = Var(m.k, initialize=init_tau1(m,m.k), bounds=(-conf.tau_upper_bound, conf.tau_upper_bound)) + m.tau2 = Var(m.k, initialize=init_tau2(m,m.k), bounds=(-conf.tau_upper_bound, conf.tau_upper_bound)) + m.q0 = Var(m.k, initialize=init_q0(m,m.k,ICS)) + m.q1 = Var(m.k, initialize=init_q1(m,m.k,ICS)) + m.q2 = Var(m.k, initialize=init_q2(m,m.k,ICS)) + m.v0 = Var(m.k, initialize=init_v0(m,m.k,ICS)) + m.v1 = Var(m.k, initialize=init_v1(m,m.k,ICS)) + m.v2 = Var(m.k, initialize=init_v2(m,m.k,ICS)) + + m.a0 = Var(m.k, initialize=init_a0) + m.a1 = Var(m.k, initialize=init_a1) + m.a2 = Var(m.k, initialize=init_a2) + + m.icfix_q0 = Constraint(rule = lambda m: m.q0[0] == ICS[0]) + m.icfix_q1 = Constraint(rule = lambda m: m.q1[0] == ICS[1]) + m.icfix_q2 = Constraint(rule = lambda m: m.q2[0] == ICS[2]) + m.icfix_v0 = Constraint(rule = lambda m: m.v0[0] == ICS[3]) + m.icfix_v1 = Constraint(rule = lambda m: m.v1[0] == ICS[4]) + m.icfix_v2 = Constraint(rule = lambda m: m.v2[0] == ICS[5]) + + ### Integration ### + m.v0_update = Constraint(m.k, rule = lambda m, k: + m.v0[k+1] == m.v0[k] + conf.dt*m.a0[k] if k < N else Constraint.Skip) + + m.v1_update = Constraint(m.k, rule = lambda m, k: + m.v1[k+1] == m.v1[k] + conf.dt*m.a1[k] if k < N else Constraint.Skip) + + m.v2_update = Constraint(m.k, rule = lambda m, k: + m.v2[k+1] == m.v2[k] + conf.dt*m.a2[k] if k < N else Constraint.Skip) + + m.q0_update = Constraint(m.k, rule = lambda m, k: + m.q0[k+1] == m.q0[k] + conf.dt*m.v0[k] if k < N else Constraint.Skip) + + m.q1_update = 
Constraint(m.k, rule = lambda m, k: + m.q1[k+1] == m.q1[k] + conf.dt*m.v1[k] if k < N else Constraint.Skip) + + m.q2_update = Constraint(m.k, rule = lambda m, k: + m.q2[k+1] == m.q2[k] + conf.dt*m.v2[k] if k < N else Constraint.Skip) + + + ### Dyamics ### + m.EoM_0 = Constraint(m.k, rule = lambda m, k: + (1/4)*(4*conf.Iz*m.a2[k] + conf.l**2*conf.M*m.a2[k] + 2*conf.l**2*conf.M*m.a2[k]*cos(m.q2[k]) + 2*conf.l**2*conf.M*m.a2[k]*cos(m.q1[k] + m.q2[k]) + 2*m.a1[k]*(4*conf.Iz + 3*conf.l**2*conf.M + 3*conf.l**2*conf.M*cos(m.q1[k]) + 2*conf.l**2*conf.M*cos(m.q2[k]) + conf.l**2*conf.M*cos(m.q1[k] + m.q2[k])) + m.a0[k]*(12*conf.Iz + 15*conf.l**2*conf.M + + 12*conf.l**2*conf.M*cos(m.q1[k]) + 4*conf.l**2*conf.M*cos(m.q2[k]) + 4*conf.l**2*conf.M*cos(m.q1[k] + m.q2[k])) - 4*m.tau0[k] - 12*conf.l**2*conf.M*sin(m.q1[k])*m.v0[k]*m.v1[k] - 4*conf.l**2*conf.M*sin(m.q1[k] + m.q2[k])*m.v0[k]*m.v1[k] - 6*conf.l**2*conf.M*sin(m.q1[k])*m.v1[k]**2 - 2*conf.l**2*conf.M*sin(m.q1[k] + m.q2[k])*m.v1[k]**2 + - 4*conf.l**2*conf.M*sin(m.q2[k])*m.v0[k]*m.v2[k] - 4*conf.l**2*conf.M*sin(m.q1[k] + m.q2[k])*m.v0[k]*m.v2[k] - 4*conf.l**2*conf.M*sin(m.q2[k])*m.v1[k]*m.v2[k] - 4*conf.l**2*conf.M*sin(m.q1[k] + m.q2[k])*m.v1[k]*m.v2[k] - 2*conf.l**2*conf.M*sin(m.q2[k])*m.v2[k]**2 - 2*conf.l**2*conf.M*sin(m.q1[k] + m.q2[k])*m.v2[k]**2) == 0 if k < N else Constraint.Skip) + + m.EoM_1 = Constraint(m.k, rule = lambda m, k: + (1/4)*(4*conf.Iz*m.a2[k] + conf.l**2*conf.M*m.a2[k] + 2*conf.l**2*conf.M*m.a2[k]*cos(m.q2[k]) + m.a1[k]*(8*conf.Iz + 6*conf.l**2*conf.M + 4*conf.l**2*conf.M*cos(m.q2[k])) + 2*m.a0[k]*(4*conf.Iz + 3*conf.l**2*conf.M + 3*conf.l**2*conf.M*cos(m.q1[k]) + 2*conf.l**2*conf.M*cos(m.q2[k]) + conf.l**2*conf.M*cos(m.q1[k] + m.q2[k])) + - 4*m.tau1[k] + 6*conf.l**2*conf.M*sin(m.q1[k])*m.v0[k]**2 + 2*conf.l**2*conf.M*sin(m.q1[k] + m.q2[k])*m.v0[k]**2 - 4*conf.l**2*conf.M*sin(m.q2[k])*m.v0[k]*m.v2[k] - 4*conf.l**2*conf.M*sin(m.q2[k])*m.v1[k]*m.v2[k] - 2*conf.l**2*conf.M*sin(m.q2[k])*m.v2[k]**2) == 0 if k < N else Constraint.Skip) + + m.EoM_2 = Constraint(m.k, rule = lambda m, k: + (1/4)*(4*conf.Iz*m.a2[k] + conf.l**2*conf.M*m.a2[k] + m.a1[k]*(4*conf.Iz + conf.l**2*conf.M + 2*conf.l**2*conf.M*cos(m.q2[k])) + m.a0[k]*(4*conf.Iz + conf.l**2*conf.M + 2*conf.l**2*conf.M*cos(m.q2[k]) + 2*conf.l**2*conf.M*cos(m.q1[k] + m.q2[k])) - 4*m.tau2[k] + 2*conf.l**2*conf.M*sin(m.q2[k])*m.v0[k]**2 + 2*conf.l**2*conf.M*sin(m.q1[k] + + m.q2[k])*m.v0[k]**2 + 4*conf.l**2*conf.M*sin(m.q2[k])*m.v0[k]*m.v1[k] + 2*conf.l**2*conf.M*sin(m.q2[k])*m.v1[k]**2) == 0 if k < N else Constraint.Skip) + + ### Penalties representing the obstacle ### + m.ell1_penalty = sum((log(exp(conf.alpha*-((((-7 + conf.l*(cos(m.q0[k]) + cos(m.q0[k]+m.q1[k]) + cos(m.q0[k]+m.q1[k]+m.q2[k])))-conf.XC1)**2)/((conf.A1/2)**2) + (((conf.l*(sin(m.q0[k]) + sin(m.q0[k]+m.q1[k]) + sin(m.q0[k]+m.q1[k]+m.q2[k])))-conf.YC1)**2)/((conf.B1/2)**2) - 1.0)) + 1)/conf.alpha) for k in m.k) - (log(exp(conf.alpha*-(((-7 + conf.l*(cos(m.q0[0]) + cos(m.q0[0]+m.q1[0]) + cos(m.q0[0]+m.q1[0]+m.q2[0]))-conf.XC1)**2)/((conf.A1/2)**2) + ((conf.l*(sin(m.q0[0]) + sin(m.q0[0]+m.q1[0]) + sin(m.q0[0]+m.q1[0]+m.q2[0]))-conf.YC1)**2)/((conf.B1/2)**2) - 1.0)) + 1)/conf.alpha) + m.ell2_penalty = sum((log(exp(conf.alpha*-((((-7 + conf.l*(cos(m.q0[k]) + cos(m.q0[k]+m.q1[k]) + cos(m.q0[k]+m.q1[k]+m.q2[k])))-conf.XC2)**2)/((conf.A2/2)**2) + (((conf.l*(sin(m.q0[k]) + sin(m.q0[k]+m.q1[k]) + sin(m.q0[k]+m.q1[k]+m.q2[k])))-conf.YC2)**2)/((conf.B2/2)**2) - 1.0)) + 1)/conf.alpha) for k in m.k) - (log(exp(conf.alpha*-(((-7 + 
conf.l*(cos(m.q0[0]) + cos(m.q0[0]+m.q1[0]) + cos(m.q0[0]+m.q1[0]+m.q2[0]))-conf.XC2)**2)/((conf.A2/2)**2) + ((conf.l*(sin(m.q0[0]) + sin(m.q0[0]+m.q1[0]) + sin(m.q0[0]+m.q1[0]+m.q2[0]))-conf.YC2)**2)/((conf.B2/2)**2) - 1.0)) + 1)/conf.alpha) + m.ell3_penalty = sum((log(exp(conf.alpha*-((((-7 + conf.l*(cos(m.q0[k]) + cos(m.q0[k]+m.q1[k]) + cos(m.q0[k]+m.q1[k]+m.q2[k])))-conf.XC3)**2)/((conf.A3/2)**2) + (((conf.l*(sin(m.q0[k]) + sin(m.q0[k]+m.q1[k]) + sin(m.q0[k]+m.q1[k]+m.q2[k])))-conf.YC3)**2)/((conf.B3/2)**2) - 1.0)) + 1)/conf.alpha) for k in m.k) - (log(exp(conf.alpha*-(((-7 + conf.l*(cos(m.q0[0]) + cos(m.q0[0]+m.q1[0]) + cos(m.q0[0]+m.q1[0]+m.q2[0]))-conf.XC3)**2)/((conf.A3/2)**2) + ((conf.l*(sin(m.q0[0]) + sin(m.q0[0]+m.q1[0]) + sin(m.q0[0]+m.q1[0]+m.q2[0]))-conf.YC3)**2)/((conf.B3/2)**2) - 1.0)) + 1)/conf.alpha) + + ### Control effort term ### + m.u_obj = sum((m.tau0[k]**2 + m.tau1[k]**2 + m.tau2[k]**2) for k in m.k) - (m.tau0[N]**2 + m.tau1[N]**2 + m.tau2[N]**2) + + ### Distence to target term (quadratic term + log valley centered at target) ### + m.dist_cost = sum((conf.w_d*(((-7 + conf.l*(cos(m.q0[k]) + cos(m.q0[k]+m.q1[k]) + cos(m.q0[k]+m.q1[k]+m.q2[k])))-conf.x_des)**2 + ((conf.l*(sin(m.q0[k]) + sin(m.q0[k]+m.q1[k]) + sin(m.q0[k]+m.q1[k]+m.q2[k])))-conf.y_des)**2) - conf.w_peak*(log(exp(conf.alpha2*-(sqrt(((-7 + conf.l*(cos(m.q0[k]) + cos(m.q0[k]+m.q1[k]) + cos(m.q0[k]+m.q1[k]+m.q2[k])))-conf.x_des)**2 +0.1) - 0.1 + sqrt(((conf.l*(sin(m.q0[k]) + sin(m.q0[k]+m.q1[k]) + sin(m.q0[k]+m.q1[k]+m.q2[k])))-conf.y_des)**2 +0.1) - 0.1 -2*sqrt(0.1))) + 1)/conf.alpha2)) for k in m.k) - (conf.w_d*((-7 + conf.l*(cos(m.q0[0]) + cos(m.q0[0]+m.q1[0]) + cos(m.q0[0]+m.q1[0]+m.q2[0]))-conf.x_des)**2 + (conf.l*(sin(m.q0[0]) + sin(m.q0[0]+m.q1[0]) + sin(m.q0[0]+m.q1[0]+m.q2[0]))-conf.y_des)**2) - conf.w_peak*(log(exp(conf.alpha2*-(sqrt((-7 + conf.l*(cos(m.q0[0]) + cos(m.q0[0]+m.q1[0]) + cos(m.q0[0]+m.q1[0]+m.q2[0]))-conf.x_des)**2 +0.1) - 0.1 + sqrt((conf.l*(sin(m.q0[0]) + sin(m.q0[0]+m.q1[0]) + sin(m.q0[0]+m.q1[0]+m.q2[0]))-conf.y_des)**2 +0.1) - 0.1 -2*sqrt(0.1))) + 1)/conf.alpha2)) + + # m.v = sum(((m.v0[k])**2 + (m.v1[k])**2 + (m.v2[k])**2)*(k/m.k[-1])**2 for k in m.k) + m.v = (m.v0[N])**2 + (m.v1[N])**2 + (m.v2[N])**2 + + m.obj = Objective(expr = (m.dist_cost + conf.w_v*m.v + conf.w_ob1*m.ell1_penalty + conf.w_ob2*m.ell2_penalty + conf.w_ob3*m.ell3_penalty + conf.w_u*m.u_obj - 10000)/100, sense=minimize) + return m + +# Simulate dynamics +def simulate(dt,state,u): + + # Create robot model in Pinocchio with q_init as initial configuration and v_init as initial velocities + q_init = np.zeros(int((len(state)-1)/2)) + for state_index in range(int((len(state)-1)/2)): + q_init[state_index] = state[state_index] + q_init = q_init.T + v_init = np.zeros(int((len(state)-1)/2)) + for state_index in range(int((len(state)-1)/2)): + v_init[state_index] = state[int((len(state)-1)/2)+state_index] + v_init = v_init.T + r = load_urdf(URDF_PATH) + robot = RobotWrapper(r.model, r.collision_model, r.visual_model) + simu = RobotSimulator(robot, q_init, v_init, simulation_type, tau_coulomb_max) + + # Simulate control u + simu.simulate(u, dt, 1) + q_next, v_next, a_next = np.copy(simu.q), np.copy(simu.v), np.copy(simu.dv) + q0_next, q1_next, q2_next = q_next[0],q_next[1],q_next[2] + v0_next, v1_next, v2_next = v_next[0],v_next[1],v_next[2] + + t_next = state[-1] + dt + + return np.array([q0_next,q1_next,q2_next,v0_next,v1_next,v2_next,t_next]) + +if __name__ == "__main__": + from CACTO_manipulator3DoF_pyomo import 
get_actor + plt.rcParams['xtick.labelsize'] = 22 + plt.rcParams['ytick.labelsize'] = 22 + + N = conf.NSTEPS # Number of timesteps + num_states = 7 # Number of states + num_actions = 3 # Number of actions + q_norm = 15 # Joint angle normalization + v_norm = 10 # Velocity normalization + + simulate_coulomb_friction = 0 # To simulate friction + simulation_type = 'euler' # Either 'timestepping' or 'euler' + tau_coulomb_max = 0*np.ones(3) # Expressed as percentage of torque max + + + # Path to the urdf file + URDF_PATH = "urdf/planar_manipulator_3dof.urdf" + + ## Inverse kinematics considering the EE being in ICS_ee + # init_states_sim = [] + # ICS_ee_list = [[10,0],[-10,0],[0,10],[0,-10],[7,7],[-7,7],[7,-7],[-7,-7],[6,0]] + # for i in range(len(ICS_ee_list)): + # ICS_ee = ICS_ee_list[i] + # phi = math.atan2(ICS_ee[1],(ICS_ee[0]+7)) # SUM OF JOINT ANGLES FIXED = orientation of the segment connecting the base with the EE + # X3rd_joint = (ICS_ee[0]+7) - conf.l* math.cos(phi) + # Y3rd_joint = (ICS_ee[1]) - conf.l* math.sin(phi) + # c2 = (X3rd_joint**2 + Y3rd_joint**2 -2*conf.l**2)/(2*conf.l**2) + # if ICS_ee[1]>=0: + # s2 = math.sqrt(1-c2**2) + # else: + # s2 = -math.sqrt(1-c2**2) + # s1 = ((conf.l + conf.l*c2)*Y3rd_joint - conf.l*s2*X3rd_joint)/(X3rd_joint**2 + Y3rd_joint**2) + # c1 = ((conf.l + conf.l*c2)*X3rd_joint - conf.l*s2*Y3rd_joint)/(X3rd_joint**2 + Y3rd_joint**2) + # ICS_q0 = math.atan2(s1,c1) + # ICS_q1 = math.atan2(s2,c2) + # init_states_sim.append(np.array([ICS_q0,ICS_q1,phi-ICS_q0-ICS_q1,0.0,0.0,0.0,0.0])) # Initial state + + obj_arr = np.array([]) + + # Set of initial states + # init_states_sim = [np.array([random.uniform(-math.pi,math.pi), random.uniform(-math.pi,math.pi), random.uniform(-math.pi,math.pi), + # random.uniform(-math.pi/4,math.pi/4), random.uniform(-math.pi/4,math.pi/4), random.uniform(-math.pi/4,math.pi/4)])] + init_states_sim = [np.array([pi/4,-pi/8,-pi/8,0.0,0.0,0.0,0.0]),np.array([-pi/4,pi/8,pi/8,0.0,0.0,0.0,0.0]),np.array([pi/2,0.0,0.0,0.0,0.0,0.0,0.0]), + np.array([-pi/2,0.0,0.0,0.0,0.0,0.0,0.0]),np.array([3*pi/4,0.0,0.0,0.0,0.0,0.0,0.0]),np.array([-3*pi/4,0.0,0.0,0.0,0.0,0.0,0.0]), + np.array([pi/4,0.0,0.0,0.0,0.0,0.0,0.0]),np.array([-pi/4,0.0,0.0,0.0,0.0,0.0,0.0]),np.array([pi,0.0,0.0,0.0,0.0,0.0,0.0])] + + # Lists to store TO solutions + x_ee_all_sim, y_ee_all_sim, q0_all_sim, q1_all_sim, q2_all_sim, v0_all_sim, v1_all_sim, v2_all_sim, a0_all_sim, a1_all_sim, a2_all_sim, tau0_all_sim, tau1_all_sim, tau2_all_sim = [],[],[],[],[],[],[],[],[],[],[],[],[],[] + x_ee_init_all_sim, y_ee_init_all_sim, q0_init_all_sim, q1_init_all_sim, q2_init_all_sim, v0_init_all_sim, v1_init_all_sim, v2_init_all_sim, a0_init_all_sim, a1_init_all_sim, a2_init_all_sim = [],[],[],[],[],[],[],[],[],[],[] + + CACTO_ROLLOUT = 1 # Flag to use CACTO warm-start + N_try = 1 # Number of CACTO training whose policy is going to be rolled-out + NORMALIZE_INPUTS = 1 # If states were normalized during training + state_norm_arr = np.array([q_norm,q_norm,q_norm, + v_norm,v_norm,v_norm,int(N*conf.dt)]) + + if CACTO_ROLLOUT: + NNs_path = './Results/NNs/N_try_{}'.format(N_try) + + actor_model = get_actor() + + ## If loading final weights + # actor_model.load_weights(NNs_path+"Manipulator3DoF_final_actor.h5") + + ## If loading weights saved before completing the training + nupdates = 128000 + actor_model.load_weights(NNs_path+"/Manipulator_{}.h5".format(nupdates)) + + + # START SOLVING TO PROBLEMS + for i in range(len(init_states_sim)): + + ICS = init_states_sim[i] + + if CACTO_ROLLOUT: + # POLICY 
ROLLOUT + q0_CACTO = [init_states_sim[i][0]] + q1_CACTO = [init_states_sim[i][1]] + q2_CACTO = [init_states_sim[i][2]] + v0_CACTO = [init_states_sim[i][3]] + v1_CACTO = [init_states_sim[i][4]] + v2_CACTO = [init_states_sim[i][5]] + a0_CACTO = [0.0] + a1_CACTO = [0.0] + a2_CACTO = [0.0] + tau0_CACTO = [] + tau1_CACTO = [] + tau2_CACTO = [] + step_counter_local = 0 + prev_state_local = np.copy(init_states_sim[i]) + + if NORMALIZE_INPUTS: + prev_state_norm = prev_state_local / state_norm_arr + prev_state_norm[-1] = 2*prev_state_norm[-1] - 1 + tf_prev_state = tf.expand_dims(tf.convert_to_tensor(prev_state_norm), 0) + else: + tf_prev_state = tf.expand_dims(tf.convert_to_tensor(prev_state_local), 0) + + init_TO_states = np.zeros((num_states, N+1)) + init_TO_states[0][0] = prev_state_local[0] + init_TO_states[1][0] = prev_state_local[1] + init_TO_states[2][0] = prev_state_local[2] + init_TO_states[3][0] = prev_state_local[3] + init_TO_states[4][0] = prev_state_local[4] + init_TO_states[5][0] = prev_state_local[5] + init_TO_controls = np.zeros((num_actions, N+1)) + init_TO_controls[0][0] = tf.squeeze(actor_model(tf_prev_state)).numpy()[0] + init_TO_controls[1][0] = tf.squeeze(actor_model(tf_prev_state)).numpy()[1] + init_TO_controls[2][0] = tf.squeeze(actor_model(tf_prev_state)).numpy()[2] + init_prev_state = np.copy(prev_state_local) + + for i in range(1, N+1): + init_next_state = simulate(conf.dt,init_prev_state,np.array([init_TO_controls[0][i-1],init_TO_controls[1][i-1],init_TO_controls[2][i-1]])) + init_TO_states[0][i] = init_next_state[0] + init_TO_states[1][i] = init_next_state[1] + init_TO_states[2][i] = init_next_state[2] + init_TO_states[3][i] = init_next_state[3] + init_TO_states[4][i] = init_next_state[4] + init_TO_states[5][i] = init_next_state[5] + if NORMALIZE_INPUTS: + init_next_state_norm = init_next_state / state_norm_arr + init_next_state_norm[-1] = 2*init_next_state_norm[-1] - 1 + init_tf_next_state = tf.expand_dims(tf.convert_to_tensor(init_next_state_norm), 0) + else: + init_tf_next_state = tf.expand_dims(tf.convert_to_tensor(init_next_state), 0) + init_TO_controls[0][i] = tf.squeeze(actor_model(init_tf_next_state)).numpy()[0] + init_TO_controls[1][i] = tf.squeeze(actor_model(init_tf_next_state)).numpy()[1] + init_TO_controls[2][i] = tf.squeeze(actor_model(init_tf_next_state)).numpy()[2] + init_prev_state = np.copy(init_next_state) + + mdl = TO_manipulator(ICS, init_q0, init_q1, init_q2, init_v0, init_v1, init_v2, init_0, init_0, init_0, init_tau0, init_tau1, init_tau2, N, init_TO = [init_TO_controls, init_TO_states]) + else: + # Create TO problem choosing how to warm-start the variables between 1)ICS; 2)0s; 3)Random + mdl = TO_manipulator(ICS, init_q0_ICS, init_q1_ICS, init_q2_ICS, init_v0_ICS, init_v1_ICS, init_v2_ICS, init_0, init_0, init_0, init_0, init_0, init_0, N) + # mdl = TO_manipulator(ICS, init_0, init_0, init_0, init_0, init_0, init_0, init_0, init_0, init_0, init_0, init_0, init_0, conf.tau_upper_bound, conf.Iz, conf.M, conf.l, conf.w_d, conf.w_v, conf.w_peak, conf.w_ob1, conf.w_ob2, conf.w_ob3, conf.w_u, conf.x_des, conf.y_des, conf.XC1, conf.YC1, conf.XC2, conf.YC2, conf.XC3, conf.YC3, conf.A1, conf.B1, conf.A2, conf.B2, conf.A3, conf.B3, N, conf.dt, conf.alpha, conf.alpha2) + # mdl = TO_manipulator(ICS, init_rand_q0, init_rand_q1, init_rand_q2, init_rand_v0, init_rand_v1, init_rand_v2, init_rand_a0, init_rand_a1, init_rand_a2, init_rand_tau, init_rand_tau, init_rand_tau, conf.tau_upper_bound, conf.Iz, conf.M, conf.l, conf.w_d, conf.w_v, conf.w_peak, conf.w_ob1, 
conf.w_ob2, conf.w_ob3, conf.w_u,conf.x_des, conf.y_des, conf.XC1, conf.YC1, conf.XC2, conf.YC2, conf.XC3, conf.YC3, conf.A1, conf.B1, conf.A2, conf.B2, conf.A3, conf.B3, N, conf.dt, conf.alpha, conf.alpha2) + + # Lists storing values to be plotted + K = np.array([k for k in mdl.k]) + x_ee_init = [-7+conf.l*(cos(mdl.q0[k]()) + cos(mdl.q0[k]()+mdl.q1[k]()) + cos(mdl.q0[k]()+mdl.q1[k]()+mdl.q2[k]())) for k in K] + y_ee_init = [(conf.l*(sin(mdl.q0[k]()) + sin(mdl.q0[k]()+mdl.q1[k]()) + sin(mdl.q0[k]()+mdl.q1[k]()+mdl.q2[k]()))) for k in K] + q0_init = [mdl.q0[k]() for k in K] + q1_init = [mdl.q1[k]() for k in K] + q2_init = [mdl.q2[k]() for k in K] + v0_init = [mdl.v0[k]() for k in K] + v1_init = [mdl.v1[k]() for k in K] + v2_init = [mdl.v2[k]() for k in K] + a0_init = [mdl.a0[k]() for k in K] + a1_init = [mdl.a1[k]() for k in K] + a2_init = [mdl.a2[k]() for k in K] + x_ee_init_all_sim.append(x_ee_init) + y_ee_init_all_sim.append(y_ee_init) + q0_init_all_sim.append(q0_init) + q1_init_all_sim.append(q1_init) + q2_init_all_sim.append(q2_init) + v0_init_all_sim.append(v0_init) + v1_init_all_sim.append(v1_init) + v2_init_all_sim.append(v2_init) + a0_init_all_sim.append(a0_init) + a1_init_all_sim.append(a1_init) + a2_init_all_sim.append(a2_init) + + # Solve TO problem + solver = SolverFactory('ipopt') + solver.options['linear_solver'] = "ma57" + try: + results = solver.solve(mdl,tee=True) + print('***** OBJ = ',mdl.obj(),'*****') + if str(results.solver.termination_condition) != "optimal": + print(results.solver.termination_condition) + continue + else: + obj_arr = np.append(obj_arr,mdl.obj()) + except: + print("***** IPOPT FAILED *****") + continue + + x_ee = [-7+conf.l*(cos(mdl.q0[k]()) + cos(mdl.q0[k]()+mdl.q1[k]()) + cos(mdl.q0[k]()+mdl.q1[k]()+mdl.q2[k]())) for k in K] + y_ee = [(conf.l*(sin(mdl.q0[k]()) + sin(mdl.q0[k]()+mdl.q1[k]()) + sin(mdl.q0[k]()+mdl.q1[k]()+mdl.q2[k]()))) for k in K] + q0 = [mdl.q0[k]() for k in K] + q1 = [mdl.q1[k]() for k in K] + q2 = [mdl.q2[k]() for k in K] + v0 = [mdl.v0[k]() for k in K] + v1 = [mdl.v1[k]() for k in K] + v2 = [mdl.v2[k]() for k in K] + a0 = [mdl.a0[k]() for k in K] + a1 = [mdl.a1[k]() for k in K] + a2 = [mdl.a2[k]() for k in K] + tau0 = [mdl.tau0[k]() for k in K] + tau1 = [mdl.tau1[k]() for k in K] + tau2 = [mdl.tau2[k]() for k in K] + x_ee_all_sim.append(x_ee) + y_ee_all_sim.append(y_ee) + q0_all_sim.append(q0) + q1_all_sim.append(q1) + q2_all_sim.append(q2) + v0_all_sim.append(v0) + v1_all_sim.append(v1) + v2_all_sim.append(v2) + a0_all_sim.append(a0) + a1_all_sim.append(a1) + a2_all_sim.append(a2) + tau0_all_sim.append(tau0) + tau1_all_sim.append(tau1) + tau2_all_sim.append(tau2) + + print(obj_arr) + + ## Print trajectories + # print("sol_tau0 = ",mdl.tau0[:](),"\nsol_tau1 = ",mdl.tau1[:](),"\nsol_tau2 = ",mdl.tau2[:](),"\nsol_q0 = ",mdl.q0[:](),"\nsol_q1 = ",mdl.q1[:](),"\nsol_q2 = ",mdl.q2[:](),"\nsol_v0 = ",mdl.v0[:](),"\nsol_v1 = ",mdl.v1[:](),"\nsol_v2 = ",mdl.v2[:](),"\nsol_a0 = ",mdl.a0[:](),"\nsol_a1 = ",mdl.a1[:](),"\nsol_a2 = ",mdl.a2[:]()) + + # Plot stuff + plot_results_TO_all(q0_all_sim,q1_all_sim,x_ee_init_all_sim,y_ee_init_all_sim,x_ee_all_sim,y_ee_all_sim) + # plot_results_TO_all_trajs(q0_all_sim,q1_all_sim,q2_all_sim,v0_all_sim,v1_all_sim,v2_all_sim,tau0_all_sim,tau1_all_sim,tau2_all_sim) + # plot_results_TO(mdl) + + # pickle.dump({"q0":q0_all_sim,"q1":q1_all_sim,"q2":q2_all_sim,"v0":v0_all_sim,"v1":v1_all_sim,"v2":v2_all_sim,"tau0":tau0_all_sim,"tau1":tau1_all_sim,"tau2":tau2_all_sim},open('9_trajectories_ICSinit', 'wb')) \ No 
newline at end of file diff --git a/config_manipulator3DoF_pyomo.py b/config_manipulator3DoF_pyomo.py new file mode 100644 index 0000000..f251e78 --- /dev/null +++ b/config_manipulator3DoF_pyomo.py @@ -0,0 +1,111 @@ +import numpy as np + +ep_no_update = 100 # Episodes to wait before starting to update the NNs +NEPISODES = 50000+ep_no_update # Max training episodes +EP_UPDATE = 25 # Number of episodes before updating critic and actor +NSTEPS = 100 # Max episode length +CRITIC_LEARNING_RATE = 0.001 # Learning rate for the critic network +ACTOR_LEARNING_RATE = 0.0005 # Learning rate for the policy network +UPDATE_RATE = 0.0005 # Homotopy rate to update the target critic network +UPDATE_LOOPS = 160 # Number of updates of both critic and actor performed every EP_UPDATE episodes +REPLAY_SIZE = 2**15 # Size of the replay buffer +BATCH_SIZE = 64 # Size of the mini-batch +NH1 = 256 # 1st hidden layer size +NH2 = 256 # 2nd hidden layer size +dt = 0.05 # Timestep + +LR_SCHEDULE = 1 # Flag to use a scheduler for the learning rates + +boundaries_schedule_LR_C = [200*REPLAY_SIZE/BATCH_SIZE, + 300*REPLAY_SIZE/BATCH_SIZE, + 400*REPLAY_SIZE/BATCH_SIZE, + 500*REPLAY_SIZE/BATCH_SIZE] +# Values of critic LR +values_schedule_LR_C = [CRITIC_LEARNING_RATE, + CRITIC_LEARNING_RATE/2, + CRITIC_LEARNING_RATE/4, + CRITIC_LEARNING_RATE/8, + CRITIC_LEARNING_RATE/16] + +# Numbers of critic updates after which the actor LR is changed (based on values_schedule_LR_A) +boundaries_schedule_LR_A = [200*REPLAY_SIZE/BATCH_SIZE, + 300*REPLAY_SIZE/BATCH_SIZE, + 400*REPLAY_SIZE/BATCH_SIZE, + 500*REPLAY_SIZE/BATCH_SIZE] +# Values of actor LR +values_schedule_LR_A = [ACTOR_LEARNING_RATE, + ACTOR_LEARNING_RATE/2, + ACTOR_LEARNING_RATE/4, + ACTOR_LEARNING_RATE/8, + ACTOR_LEARNING_RATE/16] + +prioritized_replay_alpha = 0 # α determines how much prioritization is used, set to 0 to use a normal buffer. + # Used to define the probability of sampling transition i --> P(i) = p_i**α / sum(p_k**α) where p_i is the priority of transition i +''' +Prioritized replay introduces bias because it changes the sample distribution in an uncontrolled fashion (while the expectation is estimated with the mean +so it would need a uniform sample distribution), and therefore changes the solution that the estimates will converge to. We can correct this bias by using importance-sampling weights: w_i = (1 / (N*P(i)) )**β +that fully compensates for the non-uniform probabilities P(i) if β = 1. The unbiased nature of the updates is most important near convergence at the end of training, as the process is highly non-stationary anyway, +due to changing policies, state distributions and bootstrap targets; so a small bias can be ignored in this context. +''' +prioritized_replay_beta0 = 0.6 +prioritized_replay_beta_iters = None # Therefore let's exploit the flexibility of annealing the amount of IS correction over time, by defining a schedule on the exponent β + # that from its initial value β0 reaches 1 only at the end of learning. 
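A minimal sketch (not part of this configuration file) of how the prioritized-replay exponents defined here are typically combined: β is annealed linearly from prioritized_replay_beta0 towards 1 over training, and the resulting value is passed to PrioritizedReplayBuffer.sample(), which computes the importance-sampling weights w_i = (N*P(i))**(-β) (see replay_buffer.py later in this patch). The helper name linear_beta_schedule and its arguments are illustrative only.

def linear_beta_schedule(update_idx, total_updates=prioritized_replay_beta_iters, beta0=prioritized_replay_beta0):
    # Linearly anneal the IS exponent from beta0 to 1, so that the bias correction
    # is complete (beta = 1) near the end of training, when it matters most.
    if total_updates is None:   # no schedule configured: keep beta fixed at beta0
        return beta0
    fraction = min(float(update_idx) / total_updates, 1.0)
    return beta0 + fraction * (1.0 - beta0)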
+prioritized_replay_eps = 1e-5 # It's a small positive constant that prevents the edge-case of transitions not being revisited once their error is zero + +NORMALIZE_INPUTS = 1 # Flag to normalize inputs (state and action) + +num_states = 7 # Number of states +num_actions = 3 # Number of actions +tau_upper_bound = 200 # Action upper bound +tau_lower_bound = -200 # Action lower bound +state_norm_arr = np.array([15,15,15,10, + 10,10,int(NSTEPS*dt)]) # Array used to normalize states + +EPISODE_CRITIC_PRETRAINING = 0 # Episodes of critic pretraining +EPISODE_ICS_INIT = 0 # Episodes where ICS warm-starting is used instead of actor rollout + +wreg_l1_A = 1e-2 # Weight of L1 regularization in actor's network +wreg_l2_A = 1e-2 # Weight of L2 regularization in actor's network +wreg_l1_C = 1e-2 # Weight of L1 regularization in critic's network +wreg_l2_C = 1e-2 # Weight of L2 regularization in critic's network + +TD_N = 1 # Flag to use n-step TD rather than 1-step TD +nsteps_TD_N = 1 # Number of lookahead steps + +XC1 = -2.0 # X coord center ellipse 1 +YC1 = 0.0 # Y coord center ellipse 1 +A1 = 6 # Width ellipse 1 +B1 = 10 # Height ellipse 1 +XC2 = 3.0 # X coord center ellipse 2 +YC2 = 4.0 # Y coord center ellipse 2 +A2 = 12 # Width ellipse 2 +B2 = 4 # Height ellipse 2 +XC3 = 3.0 # X coord center ellipse 3 +YC3 = -4.0 # Y coord center ellipse 3 +A3 = 12 # Width ellipse 3 +B3 = 4 # Height ellipse 3 + +w_d = 100 # Distance from target weight +w_v = 10000 # Velocity weight +w_u = 1 # Control effort weight +w_peak = 500000 # Target threshold weight +w_ob1 = 5000000 # Ellipse 1 weight +w_ob2 = 5000000 # Ellipse 2 weight +w_ob3 = 5000000 # Ellipse 3 weight +alpha = 50 # Soft abs coefficient (obstacle) +alpha2 = 50 # Soft abs coefficient (peak) + +M = 0.5 # Link mass +l = 10 # Link length +Iz = (M*l**2)/3 # Inertia around z axis passing through an end of a link of mass M and length l +x_base = -7.0 # x coord base +y_base = 0.0 # y coord base +x_des = -20.0 # Target x position +y_des = 0.0 # Target y position +TARGET_STATE = np.array([x_des,y_des]) # Target state + + +''' Needed in dynamics_manipulator3DoF ''' +simulate_coulomb_friction = 0 # To simulate friction +simulation_type = 'euler' # Either 'timestepping' or 'euler' +tau_coulomb_max = 0*np.ones(3) # Expressed as percentage of torque max diff --git a/dynamics_manipulator3DoF.py b/dynamics_manipulator3DoF.py new file mode 100644 index 0000000..4126ccf --- /dev/null +++ b/dynamics_manipulator3DoF.py @@ -0,0 +1,538 @@ +import os +import math +import gepetto.corbaserver +import time +import subprocess +import pinocchio as pin +from pinocchio.robot_wrapper import RobotWrapper as PinocchioRobotWrapper +import numpy as np +from numpy.linalg import norm +import matplotlib.pyplot as plt +import random + +class Contact: + ''' A contact between a contact-point and a contact-surface + ''' + def __init__(self, contact_point, contact_surface): + self.cp = contact_point + self.cs = contact_surface + self.reset_contact_position() + + def reset_contact_position(self): + # Initialize anchor point p0, that is the initial (0-load) position of the spring + self.p0 = self.cp.get_position() + self.in_contact = True + + def compute_force(self): + self.f, self.p0 = self.cs.compute_force(self.cp, self.p0) + return self.f + + def get_jacobian(self): + return self.cp.get_jacobian() + +class RobotWrapper(PinocchioRobotWrapper): + + @staticmethod + def BuildFromURDF(filename, package_dirs=None, root_joint=None, verbose=False, meshLoader=None): + robot = RobotWrapper() + 
robot.initFromURDF(filename, package_dirs, root_joint, verbose, meshLoader) + return robot + + @property + def na(self): + if(self.model.joints[0].nq==7): + return self.model.nv-6 + return self.model.nv + + def mass(self, q, update=True): + if(update): + return pin.crba(self.model, self.data, q) + return self.data.M + + def nle(self, q, v, update=True): + if(update): + return pin.nonLinearEffects(self.model, self.data, q, v) + return self.data.nle + + def com(self, q=None, v=None, a=None, update=True): + if(update==False or q is None): + return PinocchioRobotWrapper.com(self, q) + if a is None: + if v is None: + return PinocchioRobotWrapper.com(self, q) + return PinocchioRobotWrapper.com(self, q, v) + return PinocchioRobotWrapper.com(self, q, v,a) + + def Jcom(self, q, update=True): + if(update): + return pin.jacobianCenterOfMass(self.model, self.data, q) + return self.data.Jcom + + def momentumJacobian(self, q, v, update=True): + if(update): + pin.ccrba(self.model, self.data, q, v) + return self.data.Ag + + def computeAllTerms(self, q, v): + ''' pin.computeAllTerms is equivalent to calling: + pinocchio::forwardKinematics + pinocchio::crba + pinocchio::nonLinearEffects + pinocchio::computeJointJacobians + pinocchio::centerOfMass + pinocchio::jacobianCenterOfMass + pinocchio::kineticEnergy + pinocchio::potentialEnergy + This is too much for our needs, so we call only the functions + we need, including those for the frame kinematics + ''' + #pin.computeAllTerms(self.model, self.data, q, v); + pin.forwardKinematics(self.model, self.data, q, v, np.zeros(self.model.nv)) + pin.computeJointJacobians(self.model, self.data) + pin.updateFramePlacements(self.model, self.data) + pin.crba(self.model, self.data, q) + pin.nonLinearEffects(self.model, self.data, q, v) + + def forwardKinematics(self, q, v=None, a=None): + if v is not None: + if a is not None: + pin.forwardKinematics(self.model, self.data, q, v, a) + else: + pin.forwardKinematics(self.model, self.data, q, v) + else: + pin.forwardKinematics(self.model, self.data, q) + + def inverseKinematics(self, ee_coords): + ''' Inverse kinematics algorithm to compute a joint configuration given the EE coordinates''' + + oMdes = pin.SE3(np.eye(3), np.array([ee_coords[0], ee_coords[1], 0.0])) + for j in range(100): + q = np.array([random.uniform(-math.pi,math.pi),random.uniform(-math.pi,math.pi),random.uniform(-math.pi,math.pi)]) + # q = pin.neutral(self.model) + + pin.forwardKinematics(self.model, self.data, q) + pin.updateFramePlacements(self.model, self.data) + eps = 1e-4 + IT_MAX = 1000 + DT = 1e-1 + damp = 1e-12 + frame_id = self.model.getFrameId('fixed_ee') + i=0 + while True: + pin.forwardKinematics(self.model,self.data,q) + pin.updateFramePlacements(self.model, self.data) + dMi = oMdes.actInv(self.data.oMf[frame_id]) + err = pin.log(dMi).vector + if norm(err) < eps: + success = True + break + if i >= IT_MAX: + success = False + break + J = pin.computeFrameJacobian(self.model,self.data,q,frame_id) + v = - J.T.dot(np.linalg.solve(J.dot(J.T) + damp * np.eye(6), err)) + q = pin.integrate(self.model,q,v*DT) + if not i % 10: + print('%d: error = %s' % (i, err.T)) + i += 1 + + if success: + print("Convergence achieved!") + break + else: + print("\nWarning: the iterative algorithm has not reached convergence to the desired precision. 
Retry ({}) with another initial configuration".format(j)) + + return q, success + + def frameJacobian(self, q, index, update=True, ref_frame=pin.ReferenceFrame.LOCAL_WORLD_ALIGNED): + ''' Call computeFrameJacobian if update is true. If not, user should call computeFrameJacobian first. + Then call getFrameJacobian and return the Jacobian matrix. + ref_frame can be: ReferenceFrame.LOCAL, ReferenceFrame.WORLD, ReferenceFrame.LOCAL_WORLD_ALIGNED + ''' + if(update): + pin.computeFrameJacobian(self.model, self.data, q, index) + return pin.getFrameJacobian(self.model, self.data, index, ref_frame) + + def frameVelocity(self, q, v, index, update_kinematics=True, ref_frame=pin.ReferenceFrame.LOCAL_WORLD_ALIGNED): + if update_kinematics: + pin.forwardKinematics(self.model, self.data, q, v) + v_local = pin.getFrameVelocity(self.model, self.data, index) + if ref_frame==pin.ReferenceFrame.LOCAL: + return v_local + + H = self.data.oMf[index] + if ref_frame==pin.ReferenceFrame.WORLD: + v_world = H.act(v_local) + return v_world + + Hr = pin.pin(H.rotation, np.zeros(3)) + v = Hr.act(v_local) + return v + + def frameAcceleration(self, q, v, a, index, update_kinematics=True, ref_frame=pin.ReferenceFrame.LOCAL_WORLD_ALIGNED): + if update_kinematics: + pin.forwardKinematics(self.model, self.data, q, v, a) + a_local = pin.getFrameAcceleration(self.model, self.data, index) + if ref_frame==pin.ReferenceFrame.LOCAL: + return a_local + + H = self.data.oMf[index] + if ref_frame==pin.ReferenceFrame.WORLD: + a_world = H.act(a_local) + return a_world + + Hr = pin.pin(H.rotation, np.zeros(3)) + a = Hr.act(a_local) + return a + + def frameClassicAcceleration(self, q, v, a, index, update_kinematics=True, ref_frame=pin.ReferenceFrame.LOCAL_WORLD_ALIGNED): + if update_kinematics: + pin.forwardKinematics(self.model, self.data, q, v, a) + v = pin.getFrameVelocity(self.model, self.data, index) + a_local = pin.getFrameAcceleration(self.model, self.data, index) + a_local.linear += np.cross(v.angular, v.linear, axis=0) + if ref_frame==pin.ReferenceFrame.LOCAL: + return a_local + + H = self.data.oMf[index] + if ref_frame==pin.ReferenceFrame.WORLD: + a_world = H.act(a_local) + return a_world + + Hr = pin.pin(H.rotation, np.zeros(3)) + a = Hr.act(a_local) + return a + + def deactivateCollisionPairs(self, collision_pair_indexes): + for i in collision_pair_indexes: + self.collision_data.deactivateCollisionPair(i) + + def addAllCollisionPairs(self): + self.collision_model.addAllCollisionPairs() + self.collision_data = pin.GeometryData(self.collision_model) + + def isInCollision(self, q, stop_at_first_collision=True): + return pin.computeCollisions(self.model, self.data, self.collision_model, self.collision_data, np.asmatrix(q).reshape((self.model.nq,1)), stop_at_first_collision) + + def findFirstCollisionPair(self, consider_only_active_collision_pairs=True): + for i in range(len(self.collision_model.collisionPairs)): + if(not consider_only_active_collision_pairs or self.collision_data.activeCollisionPairs[i]): + if(pin.computeCollision(self.collision_model, self.collision_data, i)): + return (i, self.collision_model.collisionPairs[i]) + return None + + def findAllCollisionPairs(self, consider_only_active_collision_pairs=True): + res = [] + for i in range(len(self.collision_model.collisionPairs)): + if(not consider_only_active_collision_pairs or self.collision_data.activeCollisionPairs[i]): + if(pin.computeCollision(self.collision_model, self.collision_data, i)): + res += [(i, self.collision_model.collisionPairs[i])] + return res + 
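The wrapper above only adds thin convenience methods on top of Pinocchio's RobotWrapper. A minimal usage sketch (not part of the patch, assuming the planar 3-DoF URDF and the 'fixed_ee' frame used elsewhere in this file, and load_urdf defined later in this file):

import numpy as np
r = load_urdf("urdf/planar_manipulator_3dof.urdf")      # helper defined later in this file
robot = RobotWrapper(r.model, r.collision_model, r.visual_model)
q = np.array([np.pi/2, 0.0, 0.0])                       # joint configuration
v = np.zeros(robot.nv)                                  # joint velocities
robot.computeAllTerms(q, v)                             # forward kinematics, frame placements, M, nle
frame_id = robot.model.getFrameId('fixed_ee')           # end-effector frame
J_ee = robot.frameJacobian(q, frame_id)                 # 6 x nv Jacobian (LOCAL_WORLD_ALIGNED by default)
v_ee = robot.frameVelocity(q, v, frame_id)              # end-effector spatial velocity
ee_pos = robot.data.oMf[frame_id].translation           # end-effector position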
+class RobotSimulator: + + def __init__(self, robot, q0_init, v0_init, simulation_type, tau_coulomb_max, use_viewer=False, DISPLAY_T=None, CAMERA_TRANSFORM=None, show_floor=False): + self.robot = robot + self.model = self.robot.model + self.data = self.model.createData() + self.t = 0.0 # time + self.nv = nv = self.model.nv # Dimension of joint velocities vector + self.na = na = robot.na # number of actuated joints + self.S = np.hstack((np.zeros((na, nv-na)), np.eye(na, na))) # Matrix S used as filter of vector of inputs U + + self.contacts = [] + self.candidate_contact_points = [] # candidate contact points + self.contact_surfaces = [] + + self.DISPLAY_T = DISPLAY_T # refresh period for viewer + self.use_viewer = use_viewer + self.tau_coulomb_max = tau_coulomb_max + self.display_counter = self.DISPLAY_T + self.init(q0_init, v0_init, True) + + self.tau_c = np.zeros(na) # Coulomb friction torque + self.simulation_type = simulation_type + self.set_coulomb_friction(tau_coulomb_max) + + # for gepetto viewer + self.gui = None + if(use_viewer): + try: + prompt = subprocess.getstatusoutput("ps aux |grep 'gepetto-gui'|grep -v 'grep'|wc -l") + if int(prompt[1]) == 0: + os.system('gepetto-gui &') + time.sleep(1) + except: + pass + gepetto.corbaserver.Client() + self.robot.initViewer(loadModel=True) + self.gui = self.robot.viewer.gui + if(show_floor): + self.robot.viewer.gui.createSceneWithFloor('world') + self.gui.setLightingMode('world/floor', 'OFF') + self.robot.displayCollisions(False) + self.robot.displayVisuals(True) + self.robot.display(self.q) + try: + self.gui.setCameraTransform("python-pinocchio", CAMERA_TRANSFORM) + except: + self.gui.setCameraTransform(0, CAMERA_TRANSFORM) + + # Re-initialize the simulator + def init(self, q0=None, v0=None, reset_contact_positions=False): + self.first_iter = True + + if q0 is not None: + self.q = q0.copy() + + if(v0 is None): + self.v = np.zeros(self.robot.nv) + else: + self.v = v0.copy() + self.dv = np.zeros(self.robot.nv) + self.resize_contact_data(reset_contact_positions) + + def resize_contact_data(self, reset_contact_positions=False): + self.nc = len(self.contacts) # number of contacts + self.nk = 3*self.nc # size of contact force vector + self.f = np.zeros(self.nk) # contact forces + self.Jc = np.zeros((self.nk, self.model.nv)) # contact Jacobian + + # reset contact position + if(reset_contact_positions): + pin.forwardKinematics(self.model, self.data, self.q) + pin.updateFramePlacements(self.model, self.data) + for c in self.contacts: + c.reset_contact_position() + + self.compute_forces(compute_data=True) + + def set_coulomb_friction(self, tau_max): + self.tau_coulomb_max = 1e-2*tau_max*self.model.effortLimit + self.simulate_coulomb_friction = (norm(self.tau_coulomb_max)!=0.0) + + def collision_detection(self): + for s in self.contact_surfaces: # for each contact surface + for cp in self.candidate_contact_points: # for each candidate contact point + p = cp.get_position() + if(s.check_collision(p)): # check whether the point is colliding with the surface + if(not cp.active): # if the contact was not already active + print("Collision detected between point", cp.frame_name, " at ", p) + cp.active = True + cp.contact = Contact(cp, s) + self.contacts += [cp.contact] + self.resize_contact_data() + else: + if(cp.active): + print("Contact lost between point", cp.frame_name, " at ", p) + cp.active = False + self.contacts.remove(cp.contact) + self.resize_contact_data() + + def compute_forces(self, compute_data=True): + '''Compute the contact forces from q, v 
and elastic model''' + if compute_data: + pin.forwardKinematics(self.model, self.data, self.q, self.v) + # Computes the placements of all the operational frames according to the current joint placement stored in data + pin.updateFramePlacements(self.model, self.data) + self.collision_detection() + + i = 0 + for c in self.contacts: + self.f[i:i+3] = c.compute_force() + self.Jc[i:i+3, :] = c.get_jacobian() + i += 3 + return self.f + + def step(self, u, dt=None): + if dt is None: + dt = self.dt + + # (Forces are directly in the world frame, and aba wants them in the end effector frame) + pin.computeAllTerms(self.model, self.data, self.q, self.v) + pin.updateFramePlacements(self.model, self.data) + M = self.data.M + h = self.data.nle + self.collision_detection() + self.compute_forces(False) + + if(self.simulate_coulomb_friction and self.simulation_type=='timestepping'): + # minimize kinetic energy using time stepping + from quadprog import solve_qp + ''' + Solve a strictly convex quadratic program + + Minimize 1/2 x^T G x - a^T x + Subject to C.T x >= b + + Input Parameters + ---------- + G : array, shape=(n, n) + a : array, shape=(n,) + C : array, shape=(n, m) matrix defining the constraints + b : array, shape=(m), default=None, vector defining the constraints + meq : int, default=0 + the first meq constraints are treated as equality constraints, + all further as inequality constraints + ''' + # M (v' - v) = dt*S^T*(tau - tau_c) - dt*h + dt*J^T*f + # M v' = M*v + dt*(S^T*tau - h + J^T*f) - dt*S^T*tau_c + # M v' = b + B*tau_c + # v' = Minv*(b + B*tau_c) + b = M.dot(self.v) + dt*(self.S.T.dot(u) - h + self.Jc.T.dot(self.f)) + B = - dt*self.S.T + # Minimize kinetic energy: + # min v'.T * M * v' + # min (b+B*tau_c​).T*Minv*(b+B*tau_c​) + # min tau_c.T * B.T*Minv*B* tau_C + 2*b.T*Minv*B*tau_c + Minv = np.linalg.inv(M) + G = B.T.dot(Minv.dot(B)) + a = -b.T.dot(Minv.dot(B)) + C = np.vstack((np.eye(self.na), -np.eye(self.na))) + c = np.concatenate((-self.tau_coulomb_max, -self.tau_coulomb_max)) + solution = solve_qp(G, a, C.T, c, 0) + self.tau_c = solution[0] + self.v = Minv.dot(b + B.dot(self.tau_c)) + self.q = pin.integrate(self.model, self.q, self.v*dt) + self.dv = np.linalg.solve(M, self.S.T.dot(u-self.tau_c) - h + self.Jc.T.dot(self.f)) + elif(self.simulation_type=='euler' or self.simulate_coulomb_friction==False): + self.tau_c = self.tau_coulomb_max*np.sign(self.v[-self.na:]) + self.dv = np.linalg.solve(M, self.S.T.dot(u-self.tau_c) - h + self.Jc.T.dot(self.f)) + # v_mean = np.copy(self.v) + 0.5*dt*self.dv + self.q = pin.integrate(self.model, self.q, self.v*dt) + # self.q += dt*v_mean + self.v += self.dv*dt + else: + print("[ERROR] Unknown simulation type:", self.simulation_type) + + self.t += dt + return self.q, self.v, self.dv + + def reset(self): + self.first_iter = True + + def simulate(self, u, dt=0.05, ndt=1): + ''' Perform ndt steps, each lasting dt/ndt seconds ''' + tau_c_avg = 0*self.tau_c + for i in range(ndt): + self.q, self.v, self.dv = self.step(u, dt/ndt) + tau_c_avg += self.tau_c + self.tau_c = tau_c_avg / ndt + + if (self.use_viewer): + self.display_counter -= dt + if self.display_counter <= 0.0: + self.robot.display(self.q) + self.display_counter = self.DISPLAY_T + + return self.q, self.v, self.f + + def display(self, q): + if(self.use_viewer): + self.robot.display(q) + +def load_urdf(URDF_PATH): + robot = RobotWrapper.BuildFromURDF(URDF_PATH) + return robot + +def create_empty_figure(nRows=1, nCols=1,sharex=True): + f, ax = plt.subplots(nRows,nCols,sharex=sharex) + return (f, 
ax) + + +if __name__=='__main__': + + N = 200 # Number of time steps + dt = 0.05 # Controller time step + T_SIMULATION = N*dt # Number of seconds simulated + ndt = 1 + q_init = np.array([math.pi/2,0.,0.]) # Initial joint position + v_init = np.array([ 0.,0.,0.]).T # Initial joint velocity + + simulate_coulomb_friction = 0 # To simulate friction + simulation_type = 'euler' # Either 'timestepping' or 'euler' + tau_coulomb_max = 0*np.ones(3) # Expressed as percentage of torque max + + use_viewer = True + simulate_real_time = True + show_floor = False + PRINT_T = 1 # Print every PRINT_N time steps + DISPLAY_T = 0.02 # Update robot configuration in viewer every DISPLAY_N time steps + CAMERA_TRANSFORM = [3.3286049365997314, -11.498767852783203, 121.38613891601562, 0.051378559321165085, 0.023286784067749977, -0.001170438714325428, 0.9984070062637329] + + PLOT_JOINT_POS = 1 + PLOT_JOINT_VEL = 1 + PLOT_JOINT_ACC = 1 + PLOT_TORQUES = 1 + + URDF_PATH = "urdf/planar_manipulator_3dof.urdf" + + r = load_urdf(URDF_PATH) + robot = RobotWrapper(r.model, r.collision_model, r.visual_model) + simu = RobotSimulator(robot, q_init, v_init, simulation_type, tau_coulomb_max, use_viewer, DISPLAY_T, CAMERA_TRANSFORM, show_floor) + + frame_id = robot.model.getFrameId('fixed_ee') # End-effector frame + + # Example of TO solution from [np.array([pi/2,0.,0.,0.,0.,0.])], target=[-20,0] + tau0 = [50.0, 50.0, 50.0, 50.0, 50.0, 50.0, 50.0, 50.0, 50.0, 50.0, 50.0, 50.0, 50.0, 50.0, 50.0, 50.0, 50.0, 50.0, 50.0, 50.0, 50.0, 50.0, 50.0, 50.0, 50.0, 50.0, 50.0, 50.0, 50.0, 50.0, 50.0, 50.0, 50.0, 50.0, 50.0, 50.0, 50.0, 50.0, 50.0, 50.0, 50.0, 50.0, 50.0, 3.913573721018818, -49.99999997812837, -50.0, -50.0, -50.0, -50.0, -50.0, -50.0, -50.0, -50.0, -50.0, -50.0, -50.0, -50.0, -50.0, -50.0, -50.0, -50.0, -50.0, -50.0, -50.0, -49.99999998263544, -49.99999995861845, -49.99999993221877, -49.99999990306833, -49.99999987071753, -49.99999983461132, -49.9999997940566, -49.99999974817655, -49.999999695845204, -49.999999635591124, -49.999999565453635, -49.99999948277056, -49.9999993838735, -49.99999926362621, -49.99999911450014, -49.99999892406386, -49.99999866788871, -49.999998291144024, -49.99999766106636, -49.99999641971001, -49.99999334107301, -49.99998117668781, -46.3305280659364, -24.952363144765123, -19.338494352862362, -17.665869556802168, -15.969375750675875, -13.875635089856688, -11.802969516419605, -10.018418774230609, -8.566108149374008, -7.391570403941093, -6.42809739367328, -5.62500889060553, -4.94790811736938, -4.3727372800157545, -3.8813594725379144, -3.459361037489522, -3.0950636583315094, -2.7789782267165024, -2.5033915044260913, -2.262015684970941, -2.049698117228155, -1.8621904342485147, -1.6959687118390823, -1.5480935908624742, -1.4161006130642122, -1.2979134200206421, -1.1917745049867159, -1.0961896285185433, -1.0098829686449564, -0.9317607540079113, -0.8608816318066894, -0.7964324077286956, -0.7377080914713674, -0.6840954101556022, -0.6350591283370008, -0.59013065006311, -0.5488984844539794, -0.5110002398113251, -0.47611587644719633, -0.443962000211009, -0.4142870197805363, -0.38686702350141183, -0.3615022576843117, -0.33801410955749306, -0.31624251481435955, -0.2960437238304552, -0.2772883714424923, -0.25985980487578647, -0.24365263123064845, -0.22857145298785428, -0.21452976412923114, -0.20144898449681217, -0.18925761312116307, -0.17789048415522468, -0.1672881117027469, -0.1573961116191082, -0.14816469026299792, -0.1395481916748371, -0.13150469555377498, -0.12399565976918234, -0.11698560197296447, 
-0.11044181542538248, -0.10433411498268876, -0.0986346096734115, -0.09331749870240803, -0.08835888827625434, -0.08373662679008599, -0.07943015630294098, -0.0754203785771209, -0.07168953402935123, -0.06822109216157565, -0.06499965235029037, -0.06201085377818968, -0.05924129365424839, -0.05667845278067173, -0.05431062782792054, -0.05212686954785943, -0.05011692632728752, -0.04827119268177411, -0.046580662063378186, -0.045036883658417244, -0.043631922768426736, -0.042358324419103935, -0.041209079956523405, -0.040177596288934546, -0.03925766757960086, -0.03844344915052737, -0.03772943342401835, -0.037110427659416016, -0.0365815334480049, -0.03613812768636835, -0.0357758449797231, -0.03549056133594179, -0.03527837907343018, -0.03513561283161326, -0.03505877652434707, -0.03504457090814823, -0.035089871084497695, -0.035191714362526536, -0.035347293877997926, -0.03555397404494562, -0.035809340997228704, -0.0361112098283853, -0.036457214815460895, -0.03684329262668612, -0.03726157437932053, -0.03770484714126743, -0.03819829113291652, -0.03887398161427306, -0.03998048529109933, -0.04134077898704407, -0.04040053594758688, -0.02937217479892824, 9.310123181419991e-22, 3.2867662109711427e-21] + tau1 = [-50.0, -50.0, -50.0, -50.0, -50.0, -50.0, -50.0, -50.0, -50.0, -50.0, -50.0, -50.0, -50.0, -50.0, -50.0, -50.0, -50.0, -49.999999925160694, -49.99999623355654, 49.99999661317066, 49.99999933991629, 49.99999958845886, 49.99999953495631, 49.9999991795935, 49.99999757171713, -13.88363883189849, -49.999997992976596, -49.99999919548091, -49.999999489843475, -49.99999954363985, -49.9999993577583, -49.99999849772317, -29.756444147424194, 49.99999851297247, 49.99999962440779, 49.9999999322578, 50.0, 50.0, 50.0, 50.0, 50.0, 50.0, 50.0, 49.999999540051434, 49.999945429392575, 13.523097736728628, 49.99999982418196, 50.0, 50.0, 50.0, 50.0, 50.0, 50.0, 50.0, 49.99999984633092, 49.99999847815977, 45.367144514865466, -33.83630427701238, -13.867233954583172, -0.5150175332480839, 1.6956169602865967, 0.9012440381465341, 0.3187378661553381, 0.26707626201201495, 0.5171759442383039, 0.9285422723436255, 1.4510231388471262, 2.06302507674324, 2.7476945320290667, 3.4893591250457145, 4.273988285544616, 5.089016044958304, 5.921661917254661, 6.756124677893429, 7.574948431143914, 8.379001318796224, 9.236308477361941, 10.315414824693468, 11.75923166010499, 13.268471377299782, 13.62397567301964, 10.907637822540988, 3.969572656941659, -5.830225659311305, -14.779022841260165, -19.47803573384028, -19.589654048970615, -17.768356513401585, -16.36754183864119, -15.36199581612403, -14.35125645595521, -13.244510677137141, -12.135092802473562, -11.11344736797613, -10.211734347258979, -9.423094529313513, -8.72906392583842, -8.113100899129789, -7.5631701490718495, -7.070407986273108, -6.627657375747211, -6.228745775955465, -5.868258643336417, -5.5414765828689125, -5.244321457631866, -4.973282798621214, -4.725337382788586, -4.497875009074871, -4.288635258573854, -4.095654894536202, -3.9172240426050853, -3.751849416948938, -3.5982233037430147, -3.455197342753047, -3.321760340634281, -3.19701947317666, -3.0801843310492707, -2.970553349808765, -2.867502240312227, -2.770474100184701, -2.6789709408909914, -2.5925464096857915, -2.51079952199875, -2.4333692507602214, -2.359929843838086, -2.2901867617590623, -2.223873145112396, -2.1607467353941847, -2.1005871847778277, -2.0431937006102228, -1.9883829781302658, -1.9359873826438572, -1.8858533472115067, -1.8378399580151306, -1.791817702152308, -1.7476673580051276, -1.705279009121051, -1.6645511668765645, 
-1.625389988380853, -1.5877085778626319, -1.5514263618540947, -1.5164685291262032, -1.4827655280009002, -1.4502526146865036, -1.4188694463462252, -1.3885597141467942, -1.3592708120698516, -1.3309535372229344, -1.303561818469381, -1.2770524703289947, -1.2513849693768573, -1.2265212510808312, -1.202425524762064, -1.1790641047486285, -1.1564052564097662, -1.1344190554708453, -1.1130772590507383, -1.092353187839831, -1.0722216177572568, -1.052658680675167, -1.033641772964263, -1.0151494715756355, -0.9971614566465445, -0.9796584399147069, -0.9626220990198487, -0.946035016537702, -0.9298806236668248, -0.9141431481429039, -0.8988075658099768, -0.8838595559304274, -0.8692854595130881, -0.8550722406754522, -0.8412074506631448, -0.8276791944692483, -0.814476099441573, -0.8015872864325896, -0.7890023425591132, -0.7767112958181723, -0.7647045914228321, -0.7529730698099669, -0.7415079462538057, -0.730300791389307, -0.7193435108300817, -0.7086283186315634, -0.6981477071505003, -0.6878944562286702, -0.6778618247056026, -0.6680440769047711, -0.6584367080561528, -0.649032784005356, -0.6398077540538428, -0.630695334893884, -0.6216287801285251, -0.6128950581280358, -0.6060640106567209, -0.6043160857918533, -0.6056359031766879, -0.5752845647421657, -0.40450819960279844, -5.263219031959085e-22, -1.5380345359532698e-21] + tau2 = [-50.0, -50.0, -50.0, -50.0, -50.0, -50.0, -50.0, -50.0, -50.0, -50.0, -50.0, -50.0, -50.0, -50.0, -50.0, -50.0, -50.0, -50.0, -50.0, -50.0, -50.0, -50.0, -50.0, -50.0, -50.0, -50.0, -50.0, -50.0, -50.0, -50.0, 15.176472068098633, 50.0, 50.0, 50.0, 50.0, 50.0, 50.0, 50.0, 50.0, 50.0, 50.0, 50.0, 50.0, 50.0, 50.0, 50.0, 50.0, 50.0, 50.0, 49.99999934388885, -29.77803281309609, -16.221752389165278, 4.122727684604918, 4.166393767837994, 3.514732370695024, 3.5692856129789194, 2.4256753138456904, -1.452743994278073, -1.2080631473419272, -0.16824452899613926, 0.029834958070621483, -0.09106383508912538, -0.21308198787284016, -0.29430046226428763, -0.35447656421814966, -0.40657909246598867, -0.45473497787083195, -0.5002374257267803, -0.5439659948642026, -0.5867233286245945, -0.6291705892311026, -0.6718331473214855, -0.7151390073252673, -0.7592564187013162, -0.8036391351722044, -0.8475769438021128, -0.8952290113173695, -0.9649183468551131, -1.0844424965204236, -1.2399487595565644, -1.2931714157860015, -0.995371696188642, -0.22354939862974435, 0.7437711833682358, 1.3358156125712366, 1.1994824770073702, 0.7863938473922568, 1.3479870385766368, 1.4423112083688652, 1.2691014140109311, 1.0840586470440485, 0.9515668250469531, 0.8585881498965094, 0.785163907513267, 0.7211673137280206, 0.6636031227311561, 0.6119758492465908, 0.566025511310649, 0.5252350824888673, 0.4889536610814826, 0.4565551896804238, 0.42750295577466474, 0.40135140687931065, 0.3777287206766371, 0.3563203706244419, 0.336857711997533, 0.31911016327731473, 0.30287928134679887, 0.28799387108429686, 0.2743058332244163, 0.26168665101999106, 0.2500244422575814, 0.23922149608186133, 0.22919221341436574, 0.21986138001835878, 0.21116271364150332, 0.20303763764195223, 0.19543424288068909, 0.18830640613489275, 0.181613039559615, 0.17531744973111885, 0.1693867885113119, 0.16379158166933863, 0.15850532265275655, 0.1535041219366205, 0.14876640321139548, 0.1442726397412392, 0.14000512485046981, 0.13594777169912195, 0.1320859379448401, 0.128406272477954, 0.12489658021600719, 0.1215457035178347, 0.11834341711682667, 0.11528033564763057, 0.11234783120603883, 0.10953796063043401, 0.10684340052928179, 0.10425738941269863, 0.10177367615415638, 
0.09938647369364768, 0.09709041781637545, 0.09488052996924161, 0.09275218385042916, 0.09070107543134344, 0.08872319590873598, 0.08681480716783921, 0.08497241977806003, 0.08319277288486598, 0.08147281610339675, 0.07980969304210336, 0.07820072634929624, 0.07664340395951144, 0.07513536677545843, 0.07367439736441252, 0.07225840936640654, 0.07088543832539371, 0.06955363273033474, 0.06826124625714236, 0.06700663034399097, 0.06578822753486432, 0.0646045653322765, 0.06345425040158229, 0.06233596348969316, 0.061248454517693826, 0.06019053807870064, 0.059161089434241546, 0.0581590405595334, 0.057183376768826046, 0.05623313332378255, 0.05530739245838624, 0.05440528053964406, 0.05352596548789028, 0.05266865427903696, 0.05183259077469347, 0.05101705355436031, 0.05022135390940832, 0.049444834134151606, 0.04868686570280136, 0.04794684774067304, 0.04722420541324774, 0.04651838864563251, 0.04582887084885529, 0.04515514861748155, 0.044496741667771206, 0.0438531899725731, 0.04322403595870467, 0.04260877802808388, 0.04200684603254139, 0.041417895698825716, 0.040843079005469345, 0.04028715618387913, 0.039755323695318014, 0.0392231914332549, 0.03855375230108693, 0.03747277495639135, 0.036335197108619516, 0.03874870351250417, 0.04599993272961439, -1.1268571878427161e-21, -1.7791824201931178e-21] + + tau = [tau0,tau1,tau2] + + t = 0.0 + q = np.zeros((3,N+1)) + v = np.zeros((3,N+1)) + dv = np.zeros((3,N+1)) + q[:,0] = q_init + v[:,0] = v_init + + # Simulation + for i in range(len(tau0)): + time_start = time.time() + + # Send joint torques to simulator + simu.simulate([tau0[i],tau1[i],tau2[i]], dt, ndt) + q[:,i], v[:,i], dv[:,i] = simu.q, simu.v, simu.dv + t += dt + + time_spent = time.time() - time_start + if(simulate_real_time and time_spent < dt): + time.sleep(dt-time_spent) + + # PLOT STUFF + time = np.arange(0.0, N*dt+dt, dt) + + if PLOT_JOINT_POS: + (f, ax) = create_empty_figure(int(robot.nv/2),3) + ax = ax.reshape(robot.nv) + for i in range(robot.nv): + ax[i].plot(time, q[i,:], label='q') + ax[i].set_xlabel('Time [s]') + ax[i].set_ylabel(r'$q_'+str(i)+'$ [rad]') + + if PLOT_JOINT_VEL: + (f, ax) = create_empty_figure(int(robot.nv/2),3) + ax = ax.reshape(robot.nv) + for i in range(robot.nv): + ax[i].plot(time, v[i,:], label='v') + ax[i].set_xlabel('Time [s]') + ax[i].set_ylabel(r'v_'+str(i)+' [rad/s]') + + if PLOT_JOINT_ACC: + (f, ax) = create_empty_figure(int(robot.nv/2),3) + ax = ax.reshape(robot.nv) + for i in range(robot.nv): + ax[i].plot(time, dv[i,:], label=r'$\dot{v}$') + ax[i].set_xlabel('Time [s]') + ax[i].set_ylabel(r'$\dot{v}_'+str(i)+'$ [rad/s^2]') + + if PLOT_TORQUES: + (f, ax) = create_empty_figure(int(robot.nv/2),3) + ax = ax.reshape(robot.nv) + for i in range(robot.nv): + ax[i].plot(time, tau[i], label=r'$\tau_$ '+str(i)) + ax[i].set_xlabel('Time [s]') + ax[i].set_ylabel(r'$tau_'+str(i)+'$ [Nm]') + + plt.show() diff --git a/inits.py b/inits.py new file mode 100644 index 0000000..f51935e --- /dev/null +++ b/inits.py @@ -0,0 +1,57 @@ +''' Init functions to warm-start TO (Pyomo needs python functions to warm-start its variables) +''' + +# Warmstarting with CACTO rollout + +def init_tau0(m,k,init_TO_controls): + return init_TO_controls[0][k] + +def init_tau1(m,k,init_TO_controls): + return init_TO_controls[1][k] + +def init_tau2(m,k,init_TO_controls): + return init_TO_controls[2][k] + +def init_q0(m,k,init_TO_states): + return init_TO_states[0][k] + +def init_q1(m,k,init_TO_states): + return init_TO_states[1][k] + +def init_q2(m,k,init_TO_states): + return init_TO_states[2][k] + +def 
init_v0(m,k,init_TO_states): + return init_TO_states[3][k] + +def init_v1(m,k,init_TO_states): + return init_TO_states[4][k] + +def init_v2(m,k,init_TO_states): + return init_TO_states[5][k] + +# Warmstarting with initial conditions + +def init_q0_ICS(m,k,prev_state): + return prev_state[0] + +def init_q1_ICS(m,k,prev_state): + return prev_state[1] + +def init_q2_ICS(m,k,prev_state): + return prev_state[2] + +def init_v0_ICS(m,k,prev_state): + return prev_state[3] + +def init_v1_ICS(m,k,prev_state): + return prev_state[4] + +def init_v2_ICS(m,k,prev_state): + return prev_state[5] + +# Warmstarting with zeros + +def init_0(m,k): + return 0.0 + diff --git a/replay_buffer.py b/replay_buffer.py new file mode 100644 index 0000000..2676f08 --- /dev/null +++ b/replay_buffer.py @@ -0,0 +1,187 @@ +import numpy as np +import random +import math + +from segment_tree import SumSegmentTree, MinSegmentTree + + +class ReplayBuffer(object): + def __init__(self, size): + """Create Replay buffer. + Parameters + ---------- + size: int + Max number of transitions to store in the buffer. When the buffer + overflows the old memories are dropped. + """ + self._storage = [] + self._maxsize = size + self._next_idx = 0 + + def __len__(self): + return len(self._storage) + + def add(self, obs_t, action_pol, action_expl, reward, obs_tp1, done): + data = (obs_t, action_pol, action_expl, reward, obs_tp1, done) + + if self._next_idx >= len(self._storage): + self._storage.append(data) + else: + self._storage[self._next_idx] = data + self._next_idx = (self._next_idx + 1) % self._maxsize + + def _encode_sample(self, idxes): + obses_t, actions_pol, actions_expl, rewards, obses_tp1, dones = [], [], [], [], [], [] + for i in idxes: + data = self._storage[i] + obs_t, action_pol, action_expl, reward, obs_tp1, done = data + obses_t.append(np.array(obs_t, copy=False)) + actions_pol.append(np.array(action_pol, copy=False)) + actions_expl.append(np.array(action_expl, copy=False)) + rewards.append(reward) + obses_tp1.append(np.array(obs_tp1, copy=False)) + dones.append(done) + return np.array(obses_t), np.array(actions_pol), np.array(actions_expl), np.array(rewards), np.array(obses_tp1), np.array(dones) + + def sample(self, batch_size): + """Sample a batch of experiences. + Parameters + ---------- + batch_size: int + How many transitions to sample. + Returns + ------- + obs_batch: np.array + batch of observations + act_batch: np.array + batch of actions executed given obs_batch + rew_batch: np.array + rewards received as results of executing act_batch + next_obs_batch: np.array + next set of observations seen after executing act_batch + done_mask: np.array + done_mask[i] = 1 if executing act_batch[i] resulted in + the end of an episode and 0 otherwise. + TO_mask: np.array + TO_mask[i] = 1 if exectuting TO action + """ + idxes = [random.randint(0, len(self._storage) - 1) for _ in range(batch_size)] + return self._encode_sample(idxes) + + +class PrioritizedReplayBuffer(ReplayBuffer): + def __init__(self, size, alpha): + """Create Prioritized Replay buffer. + Parameters + ---------- + size: int + Max number of transitions to store in the buffer. When the buffer + overflows the old memories are dropped. 
+ alpha: float + how much prioritization is used + (0 - no prioritization, 1 - full prioritization) + See Also + -------- + ReplayBuffer.__init__ + """ + super(PrioritizedReplayBuffer, self).__init__(size) + assert alpha >= 0 + self._alpha = alpha + + it_capacity = 1 + while it_capacity < size: + it_capacity *= 2 + + self._it_sum = SumSegmentTree(it_capacity) + self._it_min = MinSegmentTree(it_capacity) + self._max_priority = 1.0 + + def add(self, *args, **kwargs): + """See ReplayBuffer.store_effect""" + idx = self._next_idx + super().add(*args, **kwargs) + self._it_sum[idx] = self._max_priority ** self._alpha # Call to __setitem__ + self._it_min[idx] = self._max_priority ** self._alpha + + def _sample_proportional(self, batch_size): + res = [] + p_total = self._it_sum.sum(0, len(self._storage) - 1) + every_range_len = p_total / batch_size + for i in range(batch_size): + mass = random.random() * every_range_len + i * every_range_len + idx = self._it_sum.find_prefixsum_idx(mass) + res.append(idx) + return res + + def sample(self, batch_size, beta): + """Sample a batch of experiences. + compared to ReplayBuffer.sample + it also returns importance weights and idxes + of sampled experiences. + Parameters + ---------- + batch_size: int + How many transitions to sample. + beta: float + To what degree to use importance weights + (0 - no corrections, 1 - full correction) + Returns + ------- + obs_batch: np.array + batch of observations + act_batch: np.array + batch of actions executed given obs_batch + rew_batch: np.array + rewards received as results of executing act_batch + next_obs_batch: np.array + next set of observations seen after executing act_batch + done_mask: np.array + done_mask[i] = 1 if executing act_batch[i] resulted in + the end of an episode and 0 otherwise. + weights: np.array + Array of shape (batch_size,) and dtype np.float32 + denoting importance weight of each sampled transition + idxes: np.array + Array of shape (batch_size,) and dtype np.int32 + idexes in buffer of sampled experiences + """ + assert beta > 0 + + idxes = self._sample_proportional(batch_size) + + weights = [] + p_min = self._it_min.min() / self._it_sum.sum() + max_weight = (p_min * len(self._storage)) ** (-beta) + + for idx in idxes: + p_sample = self._it_sum[idx] / self._it_sum.sum() + weight = (p_sample * len(self._storage)) ** (-beta) + weights.append(weight / max_weight) + weights = np.array(weights) + encoded_sample = self._encode_sample(idxes) + return tuple(list(encoded_sample) + [weights, idxes]) + + def update_priorities(self, idxes, priorities): + """Update priorities of sampled transitions. + sets priority of transition at index idxes[i] in buffer + to priorities[i]. + Parameters + ---------- + idxes: [int] + List of idxes of sampled transitions + priorities: [float] + List of updated priorities corresponding to + transitions at the sampled idxes denoted by + variable `idxes`. + """ + assert len(idxes) == len(priorities) + for idx, priority in zip(idxes, priorities): + if math.isnan(priority): + print("\n ###################################### PRIORITY IS NAN #######################################\n") + priority = self._max_priority + assert priority > 0 + assert 0 <= idx < len(self._storage) + self._it_sum[idx] = priority ** self._alpha + self._it_min[idx] = priority ** self._alpha + + self._max_priority = max(self._max_priority, priority) \ No newline at end of file
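For reference, a minimal usage sketch of the buffer defined above (not part of the patch). It assumes the segment_tree module imported at the top of this file is available, uses the six-field transition layout of ReplayBuffer.add and the values from config_manipulator3DoF_pyomo.py, and td_errors is only a placeholder for the TD errors computed by the training loop:

import numpy as np
import config_manipulator3DoF_pyomo as conf
from replay_buffer import PrioritizedReplayBuffer

buffer = PrioritizedReplayBuffer(conf.REPLAY_SIZE, alpha=conf.prioritized_replay_alpha)

# Store one transition: (obs_t, action_pol, action_expl, reward, obs_tp1, done)
obs_t, obs_tp1 = np.zeros(conf.num_states), np.zeros(conf.num_states)
action = np.zeros(conf.num_actions)
buffer.add(obs_t, action, action, 0.0, obs_tp1, False)

# Sample a mini-batch together with importance-sampling weights and buffer indices
# (in practice the buffer is filled with many transitions before sampling)
(obses_t, actions_pol, actions_expl, rewards, obses_tp1, dones,
 weights, idxes) = buffer.sample(conf.BATCH_SIZE, beta=conf.prioritized_replay_beta0)

# Refresh the priorities of the sampled transitions with their new TD errors;
# the small eps keeps every priority strictly positive
td_errors = np.zeros(len(idxes))
buffer.update_priorities(idxes, np.abs(td_errors) + conf.prioritized_replay_eps)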