diff --git a/assume/reinforcement_learning/algorithms/matd3.py b/assume/reinforcement_learning/algorithms/matd3.py
index 857bc4e0..0d900497 100644
--- a/assume/reinforcement_learning/algorithms/matd3.py
+++ b/assume/reinforcement_learning/algorithms/matd3.py
@@ -398,6 +398,21 @@ def update_policy(self):
         logger.debug("Updating Policy")
 
         n_rl_agents = len(self.learning_role.rl_strats.keys())
+
+        # Update noise decay and learning rate
+        updated_noise_decay = self.learning_role.noise_schedule(
+            self.learning_role.get_progress_remaining()
+        )
+
+        # loop over all units here so the schedule update runs once per policy update instead of once per gradient step
+        for u_id, unit_strategy in self.learning_role.rl_strats.items():
+            critic = self.learning_role.critics[u_id]
+            actor = self.learning_role.rl_strats[u_id].actor
+            unit_strategy.action_noise.update_noise_decay(updated_noise_decay)
+            # Update learning rate
+            self.update_learning_rate([critic.optimizer, actor.optimizer])
+
+
         for _ in range(self.gradient_steps):
             self.n_updates += 1
             i = 0
@@ -408,9 +423,7 @@ def update_policy(self):
                 actor = self.learning_role.rl_strats[u_id].actor
                 actor_target = self.learning_role.rl_strats[u_id].actor_target
 
-                # Update learning rate
-                self.update_learning_rate([critic.optimizer, actor.optimizer])
-
+
                 if i % 100 == 0:
                     # only update target networks every 100 steps, to have delayed network update
                     transitions = self.learning_role.buffer.sample(self.batch_size)
@@ -531,9 +544,4 @@ def update_policy(self):
                 )
 
                 i += 1
-        # Update noise decay
-        updated_noise_decay = self.learning_role.noise_schedule(
-            self.learning_role.get_progress_remaining()
-        )
-        for unit_strategy in self.learning_role.rl_strats.values():
-            unit_strategy.action_noise.update_noise_decay(updated_noise_decay)
+
\ No newline at end of file
diff --git a/assume/reinforcement_learning/learning_role.py b/assume/reinforcement_learning/learning_role.py
index c63ab6c4..de664d3f 100644
--- a/assume/reinforcement_learning/learning_role.py
+++ b/assume/reinforcement_learning/learning_role.py
@@ -87,6 +87,7 @@ def __init__(
         if use_lr_schedule:
             self.lr_schedule = get_schedule_fn(linear_schedule(self.learning_rate))
         else:
+            # constant learning rate, since the config does not request a schedule
             self.lr_schedule = get_schedule_fn(self.learning_rate)
 
         noise_dt = learning_config.get("noise_dt", 1)
diff --git a/examples/inputs/example_02a/config.yaml b/examples/inputs/example_02a/config.yaml
index 8da7961f..588e8667 100644
--- a/examples/inputs/example_02a/config.yaml
+++ b/examples/inputs/example_02a/config.yaml
@@ -17,12 +17,13 @@ tiny:
     actor_architecture: mlp
     learning_rate: 0.001
     training_episodes: 10
-    episodes_collecting_initial_experience: 3
+    episodes_collecting_initial_experience: 1
     train_freq: 24h
     gradient_steps: -1
     batch_size: 64
     gamma: 0.99
     device: cpu
+    use_lr_schedule: True
     noise_sigma: 0.1
     noise_scale: 1
     noise_dt: 1
@@ -62,11 +63,12 @@ base:
     learning_rate: 0.001
     training_episodes: 50
     episodes_collecting_initial_experience: 5
-    train_freq: 24h
+    train_freq: 12h
    gradient_steps: -1
     batch_size: 256
     gamma: 0.99
     device: cpu
+    use_lr_schedule: True
     noise_sigma: 0.1
     noise_scale: 1
     noise_dt: 1
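
For context, a minimal sketch of the pattern the matd3.py change applies: the noise-decay and learning-rate schedules depend only on training progress, so they are evaluated once per update_policy() call and pushed to every unit up front, rather than being recomputed inside the gradient-step loop. This is not the ASSUME implementation; the function and parameter names and the noise/optimizer interfaces are assumptions that mirror the diff.

# Sketch only: evaluate progress-based schedules once, then run the gradient steps.
def update_policy_sketch(
    progress_remaining,   # 1.0 at the start of training, 0.0 at the end
    noise_schedule,       # callable: progress_remaining -> noise decay
    lr_schedule,          # callable: progress_remaining -> learning rate
    noise_processes,      # objects exposing update_noise_decay() (assumed interface)
    optimizers,           # torch.optim optimizers, one entry per unit
    gradient_steps,
    do_gradient_step,     # callable performing one TD3 critic/actor update
):
    noise_decay = noise_schedule(progress_remaining)
    learning_rate = lr_schedule(progress_remaining)

    # Apply the new values to every unit once, before any gradient step runs.
    for noise, optimizer in zip(noise_processes, optimizers):
        noise.update_noise_decay(noise_decay)
        for param_group in optimizer.param_groups:
            param_group["lr"] = learning_rate

    # The gradient-step loop then only performs the actual network updates.
    for _ in range(gradient_steps):
        do_gradient_step()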
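
The use_lr_schedule flag added to the example configs selects between a linearly decaying and a constant learning rate in learning_role.py. The sketch below only mimics the behaviour suggested by linear_schedule and get_schedule_fn in the diff; it is not the library code itself, and the real helpers may differ in detail.

from typing import Callable

Schedule = Callable[[float], float]


def linear_schedule(initial_value: float) -> Schedule:
    # Decay linearly from initial_value to 0 as progress_remaining goes 1.0 -> 0.0.
    return lambda progress_remaining: progress_remaining * initial_value


def get_schedule_fn(value_or_schedule) -> Schedule:
    # A plain float becomes a constant schedule; a callable is passed through unchanged.
    if callable(value_or_schedule):
        return value_or_schedule
    return lambda progress_remaining: float(value_or_schedule)


# use_lr_schedule: True  -> the rate has halved halfway through training
# use_lr_schedule: False -> the rate stays at its configured value
assert get_schedule_fn(linear_schedule(0.001))(0.5) == 0.0005
assert get_schedule_fn(0.001)(0.5) == 0.001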