Make train on lunar lander #14

Merged: 17 commits, Mar 29, 2022
10 changes: 6 additions & 4 deletions emote/nn/action_value_mlp.py
@@ -2,6 +2,8 @@
 from torch import nn
 from torch import Tensor
 
+from emote.nn.initialization import ortho_init_
+
 
 class ActionValue(nn.Module):
     def __init__(self, observation_dim, action_dim, hidden_dims):
@@ -14,9 +16,10 @@ def __init__(self, observation_dim, action_dim, hidden_dims):
                 for n_in, n_out in zip(
                     [observation_dim + action_dim] + hidden_dims, hidden_dims
                 )
-            ]
+            ],
+            nn.Linear(hidden_dims[-1], 1)
         )
-        self.head = nn.Linear(hidden_dims[-1], 1)
+        self.seq.apply(ortho_init_)
 
     def forward(self, action: Tensor, obs: Tensor) -> Tensor:
         bsz, obs_d = obs.shape
@@ -25,7 +28,6 @@ def forward(self, action: Tensor, obs: Tensor) -> Tensor:
         assert obs_d == self.obs_d
         assert act_d == self.act_d
         x = torch.cat([obs, action], dim=1)
-        x = self.seq(x)
-        out = self.head(x)
+        out = self.seq(x)
        assert (bsz, 1) == out.shape
        return out
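
For readers skimming the diff, here is a minimal, self-contained sketch of what ActionValue looks like after this change: the output layer is folded into the Sequential and orthogonal initialization is applied to the whole stack. The _ortho_init helper below is a hypothetical stand-in for emote's ortho_init_ (assumed to orthogonally initialize Linear layers) so the snippet runs without the library.

import torch
from torch import Tensor, nn


def _ortho_init(m):
    # Hypothetical stand-in for emote.nn.initialization.ortho_init_ (assumption).
    if isinstance(m, nn.Linear):
        nn.init.orthogonal_(m.weight)
        nn.init.zeros_(m.bias)


class ActionValueSketch(nn.Module):
    def __init__(self, observation_dim, action_dim, hidden_dims):
        super().__init__()
        self.seq = nn.Sequential(
            *[
                nn.Sequential(nn.Linear(n_in, n_out), nn.ReLU())
                for n_in, n_out in zip(
                    [observation_dim + action_dim] + hidden_dims, hidden_dims
                )
            ],
            nn.Linear(hidden_dims[-1], 1),  # the head now lives inside the Sequential
        )
        self.seq.apply(_ortho_init)

    def forward(self, action: Tensor, obs: Tensor) -> Tensor:
        x = torch.cat([obs, action], dim=1)
        return self.seq(x)  # single pass; no separate self.head


q = ActionValueSketch(observation_dim=8, action_dim=2, hidden_dims=[256, 256])
print(q(torch.zeros(4, 2), torch.zeros(4, 8)).shape)  # torch.Size([4, 1])
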
11 changes: 6 additions & 5 deletions emote/nn/gaussian_policy.py
@@ -8,6 +8,8 @@
 import torch.nn.functional as F
 from torch import Tensor
 
+from emote.nn.initialization import ortho_init_
+
 
 class SquashStretchTransform(transforms.Transform):
     r"""
@@ -111,11 +113,10 @@ def __init__(self, observation_dim, action_dim, hidden_dims):
             *[
                 nn.Sequential(nn.Linear(n_in, n_out), nn.ReLU())
                 for n_in, n_out in zip([observation_dim] + hidden_dims, hidden_dims)
-            ]
+            ],
+            GaussianPolicyHead(hidden_dims[-1], action_dim)
         )
-        self.head = GaussianPolicyHead(hidden_dims[-1], action_dim)
+        self.seq.apply(ortho_init_)
 
     def forward(self, obs):
-        x = self.seq(obs)
-        pre_actions, neg_log_probs = self.head(x)
-        return pre_actions, neg_log_probs
+        return self.seq(obs)
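
A quick aside on why forward can now simply return self.seq(obs): nn.Sequential returns whatever its last module produces, so a head that emits a (pre_actions, neg_log_probs) tuple can sit at the end of the stack. ToyHead below is a hypothetical stand-in for GaussianPolicyHead, used only to make the point runnable.

import torch
from torch import nn


class ToyHead(nn.Module):
    # Hypothetical stand-in for GaussianPolicyHead (assumption): returns a tuple.
    def forward(self, x):
        return x, -x.sum(dim=1)


seq = nn.Sequential(nn.Linear(8, 4), nn.ReLU(), ToyHead())
pre_actions, neg_log_probs = seq(torch.zeros(2, 8))
print(pre_actions.shape, neg_log_probs.shape)  # torch.Size([2, 4]) torch.Size([2])
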
2 changes: 2 additions & 0 deletions emote/trainer.py
@@ -57,7 +57,9 @@ def train(self, shutdown_signal: Callable = None):
         try:
 
             for bp_step, batch in zip(count(1), self.dataloader):
+                batch_size = batch['batch_size']
                 self.state["bp_step"] = bp_step
+                self.state["bp_samples"] = bp_step * batch_size
                 self.state.update(batch)
 
                 if shutdown_signal():
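
To illustrate the new bookkeeping, here is a small stand-alone sketch of how bp_step and bp_samples evolve over the training loop, assuming each batch dict yielded by the dataloader carries a 'batch_size' entry (the toy batches below are hypothetical, not library output).

# Toy batches standing in for what the dataloader yields (assumption: each dict
# includes a 'batch_size' entry alongside the tensors).
batches = [{"batch_size": 500}, {"batch_size": 500}, {"batch_size": 500}]

state = {}
for bp_step, batch in enumerate(batches, start=1):
    state["bp_step"] = bp_step
    state["bp_samples"] = bp_step * batch["batch_size"]

print(state)  # {'bp_step': 3, 'bp_samples': 1500}
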
3 changes: 0 additions & 3 deletions pip-requirements.txt

This file was deleted.

18 changes: 18 additions & 0 deletions setup.py
@@ -0,0 +1,18 @@
from distutils.core import setup

setup(
    name='emote',
    version='0.1',
    description='A modular reinforcement learning library',
    author='Martin Singh-Blom, Tom Solberg, Jack Harmer, Jorge Del Val, Riley Miladi',
    author_email='[email protected], [email protected], [email protected], [email protected], [email protected]',
    packages=[],
    install_requires=[
        'gym',
        'gym[atari]',
        'gym[box2d]',
        'gym[classic_control]',
        'sphinx-rtd-theme',
        'black'
    ]
)
11 changes: 10 additions & 1 deletion tests/gym/collector.py
@@ -30,6 +30,8 @@ def __init__(
 
     def collect_data(self):
         """Collect a single rollout"""
+        if self._render:
+            self._env.render()
         actions = self._agent(self._obs)
         next_obs = self._env.dict_step(actions)
         self._memory.add(self._obs, actions)
@@ -47,7 +49,7 @@ def begin_training(self):
         "Runs through the init, step cycle once on main thread to make sure all envs work."
         self._obs = self._env.dict_reset()
         actions = self._agent(self._obs)
-        _ = self._env.step(actions)
+        _ = self._env.dict_step(actions)
         self._obs = self._env.dict_reset()
 
 
@@ -58,8 +60,10 @@ def __init__(
         agent: AgentProxy,
         memory: MemoryProxy,
         render=True,
+        warmup_steps=0,
     ):
         super().__init__(env, agent, memory, render)
+        self._warmup_steps = warmup_steps
         self._stop = False
         self._thread = None
 
@@ -81,6 +85,11 @@ def collect_forever(self):
             self.collect_data()
 
     def begin_training(self):
+        # Collect trajectories for warmup steps before starting training
+        super().begin_training()
+        iterations_required = self._warmup_steps
+        self.collect_multiple(iterations_required)
+
         self._thread = threading.Thread(target=self.collect_forever)
         self._thread.start()
 
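
As an illustration of the warmup flow (a hypothetical ToyCollector, not library code): begin_training first gathers warmup_steps rollouts synchronously on the calling thread, and only then starts the background collection thread, so the replay memory already holds data when training begins.

import threading
import time


class ToyCollector:
    def __init__(self, warmup_steps=0):
        self._warmup_steps = warmup_steps
        self._stop = False
        self._collected = 0

    def collect_data(self):
        self._collected += 1  # stand-in for one environment rollout

    def collect_multiple(self, count):
        for _ in range(count):
            self.collect_data()

    def collect_forever(self):
        while not self._stop:
            self.collect_data()
            time.sleep(0.001)

    def begin_training(self):
        self.collect_multiple(self._warmup_steps)  # warmup before going async
        self._thread = threading.Thread(target=self.collect_forever, daemon=True)
        self._thread.start()

    def end_training(self):
        self._stop = True
        self._thread.join()


c = ToyCollector(warmup_steps=1000)
c.begin_training()
print(c._collected >= 1000)  # True: warmup finished before the thread started
c.end_training()
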
4 changes: 2 additions & 2 deletions tests/test_htm.py
@@ -1,7 +1,7 @@
 import torch
 from torch import nn
 from torch.optim import Adam
-from gym.vector import AsyncVectorEnv
+from gym.vector import AsyncVectorEnv, SyncVectorEnv
 
 from emote import Trainer
 from emote.callbacks import (
@@ -78,7 +78,7 @@ def test_htm():
     ]
 
     callbacks = logged_cbs + [
-        SimpleGymCollector(env, agent_proxy, memory_proxy, warmup_steps=500),
+        SimpleGymCollector(env, agent_proxy, memory_proxy, warmup_steps=500, render=False),
         TerminalLogger(logged_cbs, 400),
         FinalLossTestCheck([logged_cbs[2]], [10.0], 2000),
     ]
82 changes: 82 additions & 0 deletions tests/test_lunar_lander.py
@@ -0,0 +1,82 @@
import torch
from torch.utils.tensorboard import SummaryWriter
from torch import nn
from torch.optim import Adam
from gym.vector import AsyncVectorEnv, SyncVectorEnv
import gym
import numpy as np

from emote import Trainer
from emote.callbacks import (
    FinalLossTestCheck,
    TensorboardLogger
)
from emote.nn import GaussianMLPPolicy, ActionValue
from emote.memory.builder import DictObsTable
from emote.sac import (
    QLoss,
    QTarget,
    PolicyLoss,
    AlphaLoss,
    FeatureAgentProxy,
)
from emote.memory import TableMemoryProxy, MemoryLoader

from .gym import SimpleGymCollector, DictGymWrapper


N_HIDDEN = 256


def test_lunar_lander():

    experiment_name = "Lunar-lander_test2"

    hidden_layers = [256, 256]

    batch_size = 500
    rollout_len = 2

    n_env = 60

    learning_rate = 1e-3

    env = DictGymWrapper(SyncVectorEnv([_make_env(i) for i in range(n_env)]))
    table = DictObsTable(spaces=env.dict_space, maxlen=4_000_000)
    memory_proxy = TableMemoryProxy(table)
    dataloader = MemoryLoader(table, batch_size, rollout_len, "batch_size")

    num_actions = env.dict_space.actions.shape[0]
    num_obs = list(env.dict_space.state.spaces.values())[0].shape[0]

    q1 = ActionValue(num_obs, num_actions, hidden_layers)
Member:
This might be controversial but I really think we should kill the ActionValue and GaussianMLPPolicy files and all the contents in them. Just do the network construction explicitly in this file instead. Otherwise we're hiding one of the things I'd like to make obvious, which is which network you're using. In the branch I was working on I have done this already and I think it's much better and more obvious what's happening. So, at the beginning just pop in a small

class QNet(nn.Module):
    def __init__(self):
        super().__init__()
        self.q = nn.Sequential(
            nn.Linear(N_OBSERVATIONS + N_ACTIONS, N_HIDDEN),
            nn.ReLU(),
            nn.Linear(N_HIDDEN, N_HIDDEN),
            nn.ReLU(),
            nn.Linear(N_HIDDEN, 1),
        )
        self.q.apply(ortho_init_)

    def forward(self, action, obs):
        x = torch.cat([obs, action], dim=1)
        return self.q(x)


class Policy(nn.Module):
    def __init__(self):
        super().__init__()
        self.pi = nn.Sequential(
            nn.Linear(N_OBSERVATIONS, N_HIDDEN),
            nn.ReLU(),
            nn.Linear(N_HIDDEN, N_HIDDEN),
            nn.ReLU(),
            GaussianPolicyHead(N_HIDDEN, N_ACTIONS),
        )
        self.pi.apply(ortho_init_)

    def forward(self, obs):
        return self.pi(obs)

This goes with the philosophy of making test and experiment code the configuration, which I really think will make our iterations faster and easier in the future.

Contributor Author:
I see your point, but I'm not sure I agree. As you said, we did have an issue where it wasn't always obvious what the network architecture was, and perhaps that should have been clearer, but I'm not convinced this is the right answer. For lunar lander and hit the middle, sure, it makes sense to add a few lines of code for clarity's sake. But I'm imagining a bit down the line, where we might use bigger/more complicated networks, plus the other networks we need for CURL, GAIL, world models etc.; we might end up writing so many lines just to define the networks that I'm not sure it makes sense. There is also the risk of losing performance because we forget an implementation detail of the network for that specific test/experiment.
Maybe the solution is to keep ActionValue and GaussianMLPPolicy and other types of networks and give the user both options, to use as they see fit.
I changed this to what you suggested for now, as I think that's a bit outside the scope of this PR, but maybe this is a conversation we should have, with changes made if needed in a separate PR.

    q2 = ActionValue(num_obs, num_actions, hidden_layers)
    policy = GaussianMLPPolicy(num_obs, num_actions, hidden_layers)

    ln_alpha = torch.tensor(1.0, requires_grad=True)
    agent_proxy = FeatureAgentProxy(policy)

    logged_cbs = [
        QLoss(name="q1", q=q1, opt=Adam(q1.parameters(), lr=learning_rate)),
        QLoss(name="q2", q=q2, opt=Adam(q2.parameters(), lr=learning_rate)),
        PolicyLoss(pi=policy, ln_alpha=ln_alpha, q=q1, opt=Adam(policy.parameters(), lr=learning_rate)),
        AlphaLoss(pi=policy, ln_alpha=ln_alpha, opt=Adam([ln_alpha]), n_actions=num_actions),
        QTarget(pi=policy, ln_alpha=ln_alpha, q1=q1, q2=q2),
    ]

    callbacks = logged_cbs + [
        SimpleGymCollector(env, agent_proxy, memory_proxy, warmup_steps=batch_size*rollout_len),
        TensorboardLogger(logged_cbs, SummaryWriter("runs/"+experiment_name), 2000),
        FinalLossTestCheck([logged_cbs[2]], [10.0], 300_000_000),
    ]

    trainer = Trainer(callbacks, dataloader)
    trainer.train()

def _make_env(rank):
    def _thunk():
        env = gym.make("LunarLander-v2", continuous=True)
        env.seed(rank)
        return env
    return _thunk
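
Finally, a note on the _make_env/_thunk pattern used above: each sub-environment is built lazily by a closure and seeded with its rank, so the vectorized workers do not share a random stream. Below is a lighter sketch of the same idea, hypothetically using CartPole-v1 so it runs without Box2D, and assuming the 2022-era gym API where Env.seed() still exists.

import gym
from gym.vector import SyncVectorEnv


def make_env(rank):
    def _thunk():
        env = gym.make("CartPole-v1")
        env.seed(rank)  # per-worker seed, mirroring the lunar lander test
        return env
    return _thunk


vec_env = SyncVectorEnv([make_env(i) for i in range(4)])
obs = vec_env.reset()
print(vec_env.observation_space.shape)  # (4, 4): 4 envs x 4 obs dims each
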