
Commit

Improve replay buffer documentation
Summary: A diff improving comments in replay buffer classes.

Reviewed By: jb3618columbia

Differential Revision: D61639097

fbshipit-source-id: b0964200ed8433d90d319fe4fc5edd094373b696
rodrigodesalvobraz authored and facebook-github-bot committed Sep 9, 2024
1 parent bd509b8 commit 479c959
Showing 4 changed files with 48 additions and 36 deletions.
@@ -7,7 +7,7 @@

# pyre-strict

-from typing import Any, Dict, List, Optional, Tuple, Type
+from typing import Any, List, Optional, Tuple, Type

import torch
from pearl.action_representation_modules.action_representation_module import (
@@ -20,11 +20,14 @@


class FIFOOnPolicyReplayBuffer(TensorBasedReplayBuffer):
"""
This replay buffer is used to delay push for SARSA.
It waits until next action is available and only then does it push a transition
that contains that information.
"""

def __init__(self, capacity: int) -> None:
super(FIFOOnPolicyReplayBuffer, self).__init__(capacity)
-        # this is used to delay push SARS
-        # wait for next action is available and then final push
-        # this is designed for single transition for now
self.cache: Optional[Transition] = None

def _store_transition(
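
Note: to illustrate the delayed-push behavior described in the new FIFOOnPolicyReplayBuffer docstring, here is a minimal, self-contained sketch. It is not Pearl's actual implementation; the class name and plain-tuple transitions are made up for illustration. The buffer caches an incomplete transition and completes it with the action of the following step before storing it.

    from typing import Any, List, Optional, Tuple


    class DelayedPushBufferSketch:
        """Minimal sketch of a delayed-push (SARSA-style) buffer. Illustrative only."""

        def __init__(self, capacity: int) -> None:
            self.capacity = capacity
            self.memory: List[Tuple[Any, Any, Any, Any, Any]] = []
            # Holds the latest incomplete transition (s, a, r, s') until a' is known.
            self.cache: Optional[Tuple[Any, Any, Any, Any]] = None

        def push(self, state: Any, action: Any, reward: float, next_state: Any) -> None:
            if self.cache is not None:
                s, a, r, s_next = self.cache
                # The action chosen at the current step is the "next action" of the
                # cached step, completing the SARSA tuple (s, a, r, s', a').
                self.memory.append((s, a, r, s_next, action))
                if len(self.memory) > self.capacity:
                    self.memory.pop(0)  # FIFO eviction
            self.cache = (state, action, reward, next_state)


    buffer = DelayedPushBufferSketch(capacity=100)
    buffer.push("s0", 1, 0.0, "s1")  # cached only, nothing stored yet
    buffer.push("s1", 0, 1.0, "s2")  # completes and stores ("s0", 1, 0.0, "s1", 0)
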
69 changes: 39 additions & 30 deletions pearl/replay_buffers/tensor_based_replay_buffer.py
@@ -144,41 +144,50 @@ def _process_single_cost(self, cost: Optional[float]) -> Optional[torch.Tensor]:
def _process_single_terminated(self, terminated: bool) -> torch.Tensor:
return torch.tensor([terminated]) # (1,)

"""
This function is only used for discrete action space.
An example:
----------------------------------------------------------
Suppose the environment at every step has a maximum number of 5 actions, and
the agent uses a onehot action representation module. At time step t, if the agent offers
2 actions, [0, 3], then the result of this function will be:
available_actions_tensor_with_padding = [
[0],
[3],
[0],
[0],
[0],
]
unavailable_actions_mask = [0, 0, 1, 1, 1]
Note that although the actions and padding can have overlap, the mask will always disable the
unavailable actions so won't impact algorithm.
The same goes to the case where the agent uses an identity action representation
(assuming some random features for action 0 and 3), then it would be
available_actions_tensor_with_padding = [
[0.1, 0.6, 0.3, 1.8, 2.0],
[0.8, -0.3, 0.6, 1.9, 3.0],
[0, 0, 0, 0, 0],
[0, 0, 0, 0, 0],
[0, 0, 0, 0, 0],
]
unavailable_actions_mask = [0, 0, 1, 1, 1]
"""

def _create_action_tensor_and_mask(
self,
max_number_actions: Optional[int],
available_action_space: Optional[ActionSpace],
) -> Tuple[Optional[torch.Tensor], Optional[torch.Tensor]]:
"""
Takes an action space containing only available actions,
and the maximum number of actions possible.
If the action space is continuous, returns (None, None).
If the action space is discrete, returns a pair of tensors:
1. A tensor of shape (1 x action_space_size x action_dim) that contains the available
actions.
2. A mask tensor of shape (1 x action_space_size) that contains 0 for available actions.
Example:
----------------------------------------------------------
Suppose the environment at every step has a maximum number of 5 actions, and
the agent uses a onehot action representation module. At time step t, if the agent offers
2 actions, [0, 3], then the result of this function will be:
available_actions_tensor_with_padding = [
[0],
[3],
[0],
[0],
[0],
]
unavailable_actions_mask = [0, 0, 1, 1, 1]
Note that although the actions and padding can overlap, the mask will always disable
the unavailable actions so won't impact algorithm.
The same goes to the case where the agent uses an identity action representation
(assuming some random features for action 0 and 3), then it would be
available_actions_tensor_with_padding = [
[0.1, 0.6, 0.3, 1.8, 2.0],
[0.8, -0.3, 0.6, 1.9, 3.0],
[0, 0, 0, 0, 0],
[0, 0, 0, 0, 0],
[0, 0, 0, 0, 0],
]
unavailable_actions_mask = [0, 0, 1, 1, 1]
"""
if (
self._is_action_continuous
or max_number_actions is None
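
Note: the one-hot example in the new docstring can be reproduced with a short sketch. This is illustrative only and does not call Pearl's actual helper; it simply builds tensors with the documented shapes for 5 maximum actions with actions 0 and 3 available.

    import torch

    max_number_actions = 5
    # Indices of the two available actions at this step
    # (number_of_available_actions x action_dim, with action_dim == 1 here).
    available_actions = torch.tensor([[0], [3]])

    action_dim = available_actions.shape[1]
    available_actions_tensor_with_padding = torch.zeros(
        1, max_number_actions, action_dim, dtype=available_actions.dtype
    )  # shape (1, 5, 1); padded rows stay zero
    available_actions_tensor_with_padding[0, : available_actions.shape[0]] = available_actions

    unavailable_actions_mask = torch.zeros(1, max_number_actions)  # shape (1, 5)
    unavailable_actions_mask[0, available_actions.shape[0] :] = 1

    print(available_actions_tensor_with_padding.squeeze(-1))  # tensor([[0, 3, 0, 0, 0]])
    print(unavailable_actions_mask)                           # tensor([[0., 0., 1., 1., 1.]])
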
4 changes: 2 additions & 2 deletions test/unit/with_pytorch/test_replay_buffer.py
@@ -32,10 +32,10 @@ def test_replay_buffer_not_stored_on_gpu(self) -> None:
OnPolicyReplayBuffer(replay_buffer_size),
FIFOOffPolicyReplayBuffer(replay_buffer_size),
BootstrapReplayBuffer(replay_buffer_size, p=0.5, ensemble_size=3),
-# We meant to test FIFOOnPolicyReplayBuffer, but we observe
+# We meant to test FIFOOnPolicyReplayBuffer as well, but we observe
# that it does not get filled when input is random, because
# it requires the next state of one transition
-# to be the current state of the next transition.
+# to be equal to the current state of the next transition.
# TODO: verify that we really need this restriction on this
# replay buffer.
# FIFOOnPolicyReplayBuffer,
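
Note: a minimal sketch of the chaining restriction mentioned in the comment above (illustrative only; the helper function below is hypothetical and not part of Pearl). The buffer can only complete a cached transition when the next state of one push is exactly the current state of the following push, which random, unrelated tensors never satisfy.

    import torch


    def is_chained(prev_next_state: torch.Tensor, curr_state: torch.Tensor) -> bool:
        # FIFOOnPolicyReplayBuffer relies on this equality to attach the action of
        # the current step to the transition cached at the previous step.
        return torch.equal(prev_next_state, curr_state)


    states = [torch.randn(4) for _ in range(3)]  # a tiny sequential rollout
    print(is_chained(states[1], states[1]))            # True: consecutive pushes are chained
    print(is_chained(torch.randn(4), torch.randn(4)))  # False: random inputs never chain,
                                                       # so the buffer never gets filled
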
