Implement standardized advantages; converges like a boss

frank-lsf · Sep 26, 2020 · ab39493
1 parent 856adab
commit ab39493
Show file tree

Hide file tree

Showing 2 changed files with 8 additions and 4 deletions.
diff --git a/hw2/cs285/agents/pg_agent.py b/hw2/cs285/agents/pg_agent.py
@@ -3,6 +3,7 @@
 from .base_agent import BaseAgent
 from cs285.policies.MLP_policy import MLPPolicyPG
 from cs285.infrastructure.replay_buffer import ReplayBuffer
+from cs285.infrastructure.utils import normalize
 
 
 class PGAgent(BaseAgent):
@@ -97,10 +98,9 @@ def estimate_advantage(self, obs, q_values):
 
         # Normalize the resulting advantages
         if self.standardize_advantages:
-            ## TODO: standardize the advantages to have a mean of zero
+            ## standardize the advantages to have a mean of zero
             ## and a standard deviation of one
-            ## HINT: there is a `normalize` function in `infrastructure.utils`
-            advantages = TODO
+            advantages = normalize(advantages)
 
         return advantages
 

diff --git a/hw2/cs285/infrastructure/utils.py b/hw2/cs285/infrastructure/utils.py
@@ -190,7 +190,11 @@ def get_pathlength(path):
     return len(path["reward"])
 
 
-def normalize(data, mean, std, eps=1e-8):
+def normalize(data, mean=None, std=None, eps=1e-8):
+    if mean is None:
+        mean = np.mean(data)
+    if std is None:
+        std = np.std(data)
     return (data - mean) / (std + eps)