Make train on lunar lander #14

Merged: 17 commits, Mar 29, 2022
10 changes: 6 additions & 4 deletions emote/nn/action_value_mlp.py
@@ -2,6 +2,8 @@
 from torch import nn
 from torch import Tensor
 
+from emote.nn.initialization import ortho_init_
+
 
 class ActionValue(nn.Module):
     def __init__(self, observation_dim, action_dim, hidden_dims):
@@ -14,9 +16,10 @@ def __init__(self, observation_dim, action_dim, hidden_dims):
                 for n_in, n_out in zip(
                     [observation_dim + action_dim] + hidden_dims, hidden_dims
                 )
-            ]
+            ],
+            nn.Linear(hidden_dims[-1], 1)
         )
-        self.head = nn.Linear(hidden_dims[-1], 1)
+        self.seq.apply(ortho_init_)
 
     def forward(self, action: Tensor, obs: Tensor) -> Tensor:
         bsz, obs_d = obs.shape
@@ -25,7 +28,6 @@ def forward(self, action: Tensor, obs: Tensor) -> Tensor:
         assert obs_d == self.obs_d
         assert act_d == self.act_d
         x = torch.cat([obs, action], dim=1)
-        x = self.seq(x)
-        out = self.head(x)
+        out = self.seq(x)
        assert (bsz, 1) == out.shape
        return out
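
For readers skimming the diff, here is a minimal, self-contained sketch of what ActionValue looks like after this change: the output layer is folded into the Sequential and orthogonal initialization is applied to the whole stack. The _ortho_init helper below is a hypothetical stand-in for emote's ortho_init_ (assumed to orthogonally initialize Linear layers) so the snippet runs without the library.

import torch
from torch import Tensor, nn


def _ortho_init(m):
    # Hypothetical stand-in for emote.nn.initialization.ortho_init_ (assumption).
    if isinstance(m, nn.Linear):
        nn.init.orthogonal_(m.weight)
        nn.init.zeros_(m.bias)


class ActionValueSketch(nn.Module):
    def __init__(self, observation_dim, action_dim, hidden_dims):
        super().__init__()
        self.seq = nn.Sequential(
            *[
                nn.Sequential(nn.Linear(n_in, n_out), nn.ReLU())
                for n_in, n_out in zip(
                    [observation_dim + action_dim] + hidden_dims, hidden_dims
                )
            ],
            nn.Linear(hidden_dims[-1], 1),  # the head now lives inside the Sequential
        )
        self.seq.apply(_ortho_init)

    def forward(self, action: Tensor, obs: Tensor) -> Tensor:
        x = torch.cat([obs, action], dim=1)
        return self.seq(x)  # single pass; no separate self.head


q = ActionValueSketch(observation_dim=8, action_dim=2, hidden_dims=[256, 256])
print(q(torch.zeros(4, 2), torch.zeros(4, 8)).shape)  # torch.Size([4, 1])
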
11 changes: 6 additions & 5 deletions emote/nn/gaussian_policy.py
@@ -8,6 +8,8 @@
 import torch.nn.functional as F
 from torch import Tensor
 
+from emote.nn.initialization import ortho_init_
+
 
 class SquashStretchTransform(transforms.Transform):
     r"""
@@ -111,11 +113,10 @@ def __init__(self, observation_dim, action_dim, hidden_dims):
             *[
                 nn.Sequential(nn.Linear(n_in, n_out), nn.ReLU())
                 for n_in, n_out in zip([observation_dim] + hidden_dims, hidden_dims)
-            ]
+            ],
+            GaussianPolicyHead(hidden_dims[-1], action_dim)
         )
-        self.head = GaussianPolicyHead(hidden_dims[-1], action_dim)
+        self.seq.apply(ortho_init_)
 
     def forward(self, obs):
-        x = self.seq(obs)
-        pre_actions, neg_log_probs = self.head(x)
-        return pre_actions, neg_log_probs
+        return self.seq(obs)
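
A quick aside on why forward can now simply return self.seq(obs): nn.Sequential returns whatever its last module produces, so a head that emits a (pre_actions, neg_log_probs) tuple can sit at the end of the stack. ToyHead below is a hypothetical stand-in for GaussianPolicyHead, used only to make the point runnable.

import torch
from torch import nn


class ToyHead(nn.Module):
    # Hypothetical stand-in for GaussianPolicyHead (assumption): returns a tuple.
    def forward(self, x):
        return x, -x.sum(dim=1)


seq = nn.Sequential(nn.Linear(8, 4), nn.ReLU(), ToyHead())
pre_actions, neg_log_probs = seq(torch.zeros(2, 8))
print(pre_actions.shape, neg_log_probs.shape)  # torch.Size([2, 4]) torch.Size([2])
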
2 changes: 2 additions & 0 deletions emote/trainer.py
@@ -57,7 +57,9 @@ def train(self, shutdown_signal: Callable = None):
         try:
 
             for bp_step, batch in zip(count(1), self.dataloader):
+                batch_size = batch['batch_size']
                 self.state["bp_step"] = bp_step
+                self.state["bp_samples"] = bp_step * batch_size
                 self.state.update(batch)
 
                 if shutdown_signal():
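
To illustrate the new bookkeeping, here is a small stand-alone sketch of how bp_step and bp_samples evolve over the training loop, assuming each batch dict yielded by the dataloader carries a 'batch_size' entry (the toy batches below are hypothetical, not library output).

# Toy batches standing in for what the dataloader yields (assumption: each dict
# includes a 'batch_size' entry alongside the tensors).
batches = [{"batch_size": 500}, {"batch_size": 500}, {"batch_size": 500}]

state = {}
for bp_step, batch in enumerate(batches, start=1):
    state["bp_step"] = bp_step
    state["bp_samples"] = bp_step * batch["batch_size"]

print(state)  # {'bp_step': 3, 'bp_samples': 1500}
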
3 changes: 0 additions & 3 deletions pip-requirements.txt

This file was deleted.

18 changes: 18 additions & 0 deletions setup.py
@@ -0,0 +1,18 @@
from distutils.core import setup

setup(
    name='emote',
    version='0.1',
    description='A modular reinforcement learning library',
    author='Martin Singh-Blom, Tom Solberg, Jack Harmer, Jorge Del Val, Riley Miladi',
    author_email='[email protected], [email protected], [email protected], [email protected], [email protected]',
    packages=[],
    install_requires=[
        'gym',
        'gym[atari]',
        'gym[box2d]',
        'gym[classic_control]',
        'sphinx-rtd-theme',
        'black'
    ]
)
11 changes: 10 additions & 1 deletion tests/gym/collector.py
@@ -30,6 +30,8 @@ def __init__(
 
     def collect_data(self):
         """Collect a single rollout"""
+        if self._render:
+            self._env.render()
         actions = self._agent(self._obs)
         next_obs = self._env.dict_step(actions)
         self._memory.add(self._obs, actions)
@@ -47,7 +49,7 @@ def begin_training(self):
         "Runs through the init, step cycle once on main thread to make sure all envs work."
         self._obs = self._env.dict_reset()
         actions = self._agent(self._obs)
-        _ = self._env.step(actions)
+        _ = self._env.dict_step(actions)
         self._obs = self._env.dict_reset()
 
 
@@ -58,8 +60,10 @@ def __init__(
         agent: AgentProxy,
         memory: MemoryProxy,
         render=True,
+        warmup_steps=0,
     ):
         super().__init__(env, agent, memory, render)
+        self._warmup_steps = warmup_steps
         self._stop = False
         self._thread = None
 
@@ -81,6 +85,11 @@ def collect_forever(self):
             self.collect_data()
 
     def begin_training(self):
+        # Collect trajectories for warmup steps before starting training
+        super().begin_training()
+        iterations_required = self._warmup_steps
+        self.collect_multiple(iterations_required)
+
         self._thread = threading.Thread(target=self.collect_forever)
         self._thread.start()
 
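
As an illustration of the warmup flow (a hypothetical ToyCollector, not library code): begin_training first gathers warmup_steps rollouts synchronously on the calling thread, and only then starts the background collection thread, so the replay memory already holds data when training begins.

import threading
import time


class ToyCollector:
    def __init__(self, warmup_steps=0):
        self._warmup_steps = warmup_steps
        self._stop = False
        self._collected = 0

    def collect_data(self):
        self._collected += 1  # stand-in for one environment rollout

    def collect_multiple(self, count):
        for _ in range(count):
            self.collect_data()

    def collect_forever(self):
        while not self._stop:
            self.collect_data()
            time.sleep(0.001)

    def begin_training(self):
        self.collect_multiple(self._warmup_steps)  # warmup before going async
        self._thread = threading.Thread(target=self.collect_forever, daemon=True)
        self._thread.start()

    def end_training(self):
        self._stop = True
        self._thread.join()


c = ToyCollector(warmup_steps=1000)
c.begin_training()
print(c._collected >= 1000)  # True: warmup finished before the thread started
c.end_training()
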
4 changes: 2 additions & 2 deletions tests/test_htm.py
@@ -1,7 +1,7 @@
 import torch
 from torch import nn
 from torch.optim import Adam
-from gym.vector import AsyncVectorEnv
+from gym.vector import AsyncVectorEnv, SyncVectorEnv
 
 from emote import Trainer
 from emote.callbacks import (
@@ -78,7 +78,7 @@ def test_htm():
     ]
 
     callbacks = logged_cbs + [
-        SimpleGymCollector(env, agent_proxy, memory_proxy, warmup_steps=500),
+        SimpleGymCollector(env, agent_proxy, memory_proxy, warmup_steps=500, render=False),
         TerminalLogger(logged_cbs, 400),
         FinalLossTestCheck([logged_cbs[2]], [10.0], 2000),
     ]
82 changes: 82 additions & 0 deletions tests/test_lunar_lander.py
@@ -0,0 +1,82 @@
import torch
from torch.utils.tensorboard import SummaryWriter
from torch import nn
from torch.optim import Adam
from gym.vector import AsyncVectorEnv, SyncVectorEnv
import gym
import numpy as np

from emote import Trainer
from emote.callbacks import (
    FinalLossTestCheck,
    TensorboardLogger
)
from emote.nn import GaussianMLPPolicy, ActionValue
from emote.memory.builder import DictObsTable
from emote.sac import (
    QLoss,
    QTarget,
    PolicyLoss,
    AlphaLoss,
    FeatureAgentProxy,
)
from emote.memory import TableMemoryProxy, MemoryLoader

from .gym import SimpleGymCollector, DictGymWrapper


N_HIDDEN = 256


def test_lunar_lander():

    experiment_name = "Lunar-lander_test2"

    hidden_layers = [256, 256]

    batch_size = 500
    rollout_len = 2

    n_env = 60

    learning_rate = 1e-3

    env = DictGymWrapper(SyncVectorEnv([_make_env(i) for i in range(n_env)]))
    table = DictObsTable(spaces=env.dict_space, maxlen=4_000_000)
    memory_proxy = TableMemoryProxy(table)
    dataloader = MemoryLoader(table, batch_size, rollout_len, "batch_size")

    num_actions = env.dict_space.actions.shape[0]
    num_obs = list(env.dict_space.state.spaces.values())[0].shape[0]

    q1 = ActionValue(num_obs, num_actions, hidden_layers)
Member:
This might be controversial but I really think we should kill the ActionValue and GaussianMLPPolicy files and all the contents in them. Just do the network construction explicitly in this file instead. Otherwise we're hiding one of the things I'd like to make obvious, which is which network you're using. In the branch I was working on I have done this already and I think it's much better and more obvious what's happening. So, at the beginning just pop in a small

class QNet(nn.Module):
    def __init__(self):
        super().__init__()
        self.q = nn.Sequential(
            nn.Linear(N_OBSERVATIONS + N_ACTIONS, N_HIDDEN),
            nn.ReLU(),
            nn.Linear(N_HIDDEN, N_HIDDEN),
            nn.ReLU(),
            nn.Linear(N_HIDDEN, 1),
        )
        self.q.apply(ortho_init_)

    def forward(self, action, obs):
        x = torch.cat([obs, action], dim=1)
        return self.q(x)


class Policy(nn.Module):
    def __init__(self):
        super().__init__()
        self.pi = nn.Sequential(
            nn.Linear(N_OBSERVATIONS, N_HIDDEN),
            nn.ReLU(),
            nn.Linear(N_HIDDEN, N_HIDDEN),
            nn.ReLU(),
            GaussianPolicyHead(N_HIDDEN, N_ACTIONS),
        )
        self.pi.apply(ortho_init_)

    def forward(self, obs):
        return self.pi(obs)

This goes with the philosophy of making test and experiment code the configuration, which I really think will make our iterations faster and easier in the future.

Contributor Author:
I see your point, but I'm not sure I agree. As you said, we did have an issue where it wasn't always obvious what the network architecture was, and perhaps that should have been clearer, but I'm not convinced this is the right answer. For lunar lander and hit the middle, sure, it makes sense to add a few lines of code for clarity's sake. But I'm imagining a bit down the line, where we might use bigger/more complicated networks, plus the other networks we need for CURL, GAIL, world models etc.; we might end up writing so many lines just to define the networks that I'm not sure it makes sense. There is also the risk of losing performance because we forget an implementation detail of the network for that specific test/experiment.
Maybe the solution is to keep ActionValue and GaussianMLPPolicy and other types of networks and give the user both options, to use as they see fit.
I changed this to what you suggested for now, as I think that's a bit outside the scope of this PR, but maybe this is a conversation we should have, with changes made if needed in a separate PR.

    q2 = ActionValue(num_obs, num_actions, hidden_layers)
    policy = GaussianMLPPolicy(num_obs, num_actions, hidden_layers)

    ln_alpha = torch.tensor(1.0, requires_grad=True)
    agent_proxy = FeatureAgentProxy(policy)

    logged_cbs = [
        QLoss(name="q1", q=q1, opt=Adam(q1.parameters(), lr=learning_rate)),
        QLoss(name="q2", q=q2, opt=Adam(q2.parameters(), lr=learning_rate)),
        PolicyLoss(pi=policy, ln_alpha=ln_alpha, q=q1, opt=Adam(policy.parameters(), lr=learning_rate)),
        AlphaLoss(pi=policy, ln_alpha=ln_alpha, opt=Adam([ln_alpha]), n_actions=num_actions),
        QTarget(pi=policy, ln_alpha=ln_alpha, q1=q1, q2=q2),
    ]

    callbacks = logged_cbs + [
        SimpleGymCollector(env, agent_proxy, memory_proxy, warmup_steps=batch_size*rollout_len),
        TensorboardLogger(logged_cbs, SummaryWriter("runs/"+experiment_name), 2000),
        FinalLossTestCheck([logged_cbs[2]], [10.0], 300_000_000),
    ]

    trainer = Trainer(callbacks, dataloader)
    trainer.train()

def _make_env(rank):
    def _thunk():
        env = gym.make("LunarLander-v2", continuous=True)
        env.seed(rank)
        return env
    return _thunk
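
Finally, a note on the _make_env/_thunk pattern used above: each sub-environment is built lazily by a closure and seeded with its rank, so the vectorized workers do not share a random stream. Below is a lighter sketch of the same idea, hypothetically using CartPole-v1 so it runs without Box2D, and assuming the 2022-era gym API where Env.seed() still exists.

import gym
from gym.vector import SyncVectorEnv


def make_env(rank):
    def _thunk():
        env = gym.make("CartPole-v1")
        env.seed(rank)  # per-worker seed, mirroring the lunar lander test
        return env
    return _thunk


vec_env = SyncVectorEnv([make_env(i) for i in range(4)])
obs = vec_env.reset()
print(vec_env.observation_space.shape)  # (4, 4): 4 envs x 4 obs dims each
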