fix ilqr max step and add step counter (#173)
* fix ilqr max step and add step counter

* remove redundancy in ilqr init
MingxuanChe authored Nov 5, 2024
1 parent 0d91ee2 commit 441b65b
Showing 1 changed file with 22 additions and 10 deletions.
safe_control_gym/controllers/lqr/ilqr.py (22 additions, 10 deletions)
@@ -1,4 +1,4 @@
- '''Linear Quadratic Regulator (LQR)
+ '''iterative Linear Quadratic Regulator (iLQR)
[1] https://studywolf.wordpress.com/2016/02/03/the-iterative-linear-quadratic-regulator-method/
[2] https://arxiv.org/pdf/1708.09342.pdf
@@ -39,7 +39,10 @@ def __init__(
max_iterations (int): The number of iterations to train iLQR.
lamb_factor (float): The amount for which to increase lambda when training fails.
lamb_max (float): The maximum lambda allowed.
- epsilon (float): The convergence tolerance.
+ epsilon (float): The convergence tolerance of the cost function.
+ Note: This implementation has a Hessian regularization term lambda
+ to make sure the Hessian H is well-conditioned for inversion. See [1] for more details.
'''

super().__init__(env_func, **kwargs)
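The note above refers to the regularization trick from [1]: before the control Hessian is inverted in the backward pass, negative eigenvalues are discarded and the remaining spectrum is shifted by lambda, and lambda is increased by lamb_factor (up to lamb_max) whenever an iteration fails to reduce the cost. A minimal numpy sketch of that inversion, using illustrative names (H, lamb) rather than the identifiers in ilqr.py:

import numpy as np

def regularized_inverse(H, lamb):
    '''Sketch of the Levenberg-Marquardt-style regularization described in [1],
    not the code in this commit: drop negative curvature, shift the spectrum by
    lamb, and rebuild the inverse of the (symmetric) control Hessian.'''
    eigvals, eigvecs = np.linalg.eigh(H)     # assumes H is symmetric
    eigvals = np.clip(eigvals, 0.0, None)    # discard negative curvature
    eigvals += lamb                          # keep the eigenvalues away from zero
    return eigvecs @ np.diag(1.0 / eigvals) @ eigvecs.T

In [1], lambda is also decreased again after a successful iteration, so the regularization fades once the backward pass is well behaved.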
@@ -98,9 +101,13 @@ def learn(self, env=None, **kwargs):
# Initialize previous cost
self.previous_total_cost = -float('inf')

+ # determine the maximum number of steps
+ self.max_steps = int(self.env.CTRL_FREQ * self.env.EPISODE_LEN_SEC)

# Loop through iLQR iterations
while self.ite_counter < self.max_iterations:
- self.run(env=env, training=True)
+ self.traj_step = 0
+ self.run(env=env, max_steps=self.max_steps, training=True)

# Save data and update policy if iteration is finished.
self.state_stack = np.vstack((self.state_stack, self.final_obs))
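Before this change, learn() called run() without a step budget, so each training rollout fell back on run()'s default of max_steps=500 (visible in the last hunk below) regardless of the configured episode length. The new lines cap the rollout at the episode length expressed in control steps and restart the trajectory counter at the start of every iLQR iteration. A small illustration with made-up numbers (50 Hz and 5 s are assumptions, not values taken from the repository's configs):

CTRL_FREQ = 50                                # control frequency in Hz (hypothetical)
EPISODE_LEN_SEC = 5                           # episode length in seconds (hypothetical)
max_steps = int(CTRL_FREQ * EPISODE_LEN_SEC)  # 250 control steps per rollout

for iteration in range(3):                    # stands in for the while-loop above
    traj_step = 0                             # trajectory index restarts every iteration
    # run(env, max_steps=max_steps, ...) would then step the env at most 250 times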
@@ -174,6 +181,8 @@ def learn(self, env=None, **kwargs):

self.ite_counter += 1

+ self.reset()

def update_policy(self, env):
'''Updates policy.
@@ -275,29 +284,31 @@ def select_action(self, obs, info=None, training=False):
Args:
obs (ndarray): The observation at this timestep.
info (dict): The info at this timestep.
training (bool): Whether the algorithm is training or evaluating.
Returns:
action (ndarray): The action chosen by the controller.
'''

step = self.extract_step(info)

if training:
if self.ite_counter == 0:
- action, gains_fb, input_ff = self.calculate_lqr_action(obs, step)
+ action, gains_fb, input_ff = self.calculate_lqr_action(obs, self.traj_step)
# Save gains and feedforward term
- if step == 0:
+ if self.traj_step == 0:
self.gains_fb = gains_fb.reshape((1, self.model.nu, self.model.nx))
self.input_ff = input_ff.reshape(self.model.nu, 1)
else:
self.gains_fb = np.append(self.gains_fb, gains_fb.reshape((1, self.model.nu, self.model.nx)), axis=0)
self.input_ff = np.append(self.input_ff, input_ff.reshape(self.model.nu, 1), axis=1)
else:
- action = self.gains_fb[step].dot(obs) + self.input_ff[:, step]
+ action = self.gains_fb[self.traj_step].dot(obs) + self.input_ff[:, self.traj_step]
elif self.gains_fb_best is not None:
- action = self.gains_fb_best[step].dot(obs) + self.input_ff_best[:, step]
+ action = self.gains_fb_best[self.traj_step].dot(obs) + self.input_ff_best[:, self.traj_step]
else:
- action, _, _ = self.calculate_lqr_action(obs, step)
+ action, _, _ = self.calculate_lqr_action(obs, self.traj_step)

+ if self.traj_step < self.max_steps - 1:
+     self.traj_step += 1

return action
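
In this hunk, once the first training iteration has produced gains (and during evaluation, when gains_fb_best is available), the action is a time-varying LQR law indexed by the controller's own traj_step instead of the step extracted from info, and the counter saturates at max_steps - 1 so the last stored gain is reused rather than indexing past the end of the arrays. A self-contained sketch of that indexing pattern; the function and argument names are placeholders, not the class attributes:

import numpy as np

def time_varying_feedback(obs, gains_fb, input_ff, traj_step, max_steps):
    '''Illustrative version of the indexing above: apply the stored feedback gain
    and feedforward term for the current step, then advance the counter,
    saturating at max_steps - 1. Names are placeholders, not ilqr.py attributes.'''
    action = gains_fb[traj_step] @ obs + input_ff[:, traj_step]
    if traj_step < max_steps - 1:
        traj_step += 1
    return action, traj_step

# Shapes mirror the reshapes in the hunk: gains_fb is (T, nu, nx), input_ff is (nu, T).
nx, nu, T = 4, 2, 250
action, step = time_varying_feedback(np.zeros(nx), np.zeros((T, nu, nx)), np.zeros((nu, T)), 0, T)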

@@ -330,6 +341,7 @@ def reset(self):
'''Prepares for evaluation.'''
self.env.reset()
self.ite_counter = 0
+ self.traj_step = 0

def run(self, env=None, max_steps=500, training=True):
'''Runs evaluation with current policy.
