
Commit

Improve replay buffer documentation
Summary: A diff improving comments in replay buffer classes.

Reviewed By: jb3618columbia

Differential Revision: D61639097

fbshipit-source-id: b0964200ed8433d90d319fe4fc5edd094373b696
rodrigodesalvobraz authored and facebook-github-bot committed Sep 9, 2024
1 parent bd509b8 commit 479c959
Showing 4 changed files with 48 additions and 36 deletions.
@@ -7,7 +7,7 @@

# pyre-strict

-from typing import Any, Dict, List, Optional, Tuple, Type
+from typing import Any, List, Optional, Tuple, Type

import torch
from pearl.action_representation_modules.action_representation_module import (
@@ -20,11 +20,14 @@


class FIFOOnPolicyReplayBuffer(TensorBasedReplayBuffer):
"""
This replay buffer is used to delay push for SARSA.
It waits until next action is available and only then does it push a transition
that contains that information.
"""

def __init__(self, capacity: int) -> None:
super(FIFOOnPolicyReplayBuffer, self).__init__(capacity)
-        # this is used to delay push SARS
-        # wait for next action is available and then final push
-        # this is designed for single transition for now
self.cache: Optional[Transition] = None

def _store_transition(
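
Note: to illustrate the delayed-push behavior described in the new FIFOOnPolicyReplayBuffer docstring, here is a minimal, self-contained sketch. It is not Pearl's actual implementation; the class name and plain-tuple transitions are made up for illustration. The buffer caches an incomplete transition and completes it with the action of the following step before storing it.

    from typing import Any, List, Optional, Tuple


    class DelayedPushBufferSketch:
        """Minimal sketch of a delayed-push (SARSA-style) buffer. Illustrative only."""

        def __init__(self, capacity: int) -> None:
            self.capacity = capacity
            self.memory: List[Tuple[Any, Any, Any, Any, Any]] = []
            # Holds the latest incomplete transition (s, a, r, s') until a' is known.
            self.cache: Optional[Tuple[Any, Any, Any, Any]] = None

        def push(self, state: Any, action: Any, reward: float, next_state: Any) -> None:
            if self.cache is not None:
                s, a, r, s_next = self.cache
                # The action chosen at the current step is the "next action" of the
                # cached step, completing the SARSA tuple (s, a, r, s', a').
                self.memory.append((s, a, r, s_next, action))
                if len(self.memory) > self.capacity:
                    self.memory.pop(0)  # FIFO eviction
            self.cache = (state, action, reward, next_state)


    buffer = DelayedPushBufferSketch(capacity=100)
    buffer.push("s0", 1, 0.0, "s1")  # cached only, nothing stored yet
    buffer.push("s1", 0, 1.0, "s2")  # completes and stores ("s0", 1, 0.0, "s1", 0)
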
69 changes: 39 additions & 30 deletions pearl/replay_buffers/tensor_based_replay_buffer.py
@@ -144,41 +144,50 @@ def _process_single_cost(self, cost: Optional[float]) -> Optional[torch.Tensor]:
def _process_single_terminated(self, terminated: bool) -> torch.Tensor:
return torch.tensor([terminated]) # (1,)

"""
This function is only used for discrete action space.
An example:
----------------------------------------------------------
Suppose the environment at every step has a maximum number of 5 actions, and
the agent uses a onehot action representation module. At time step t, if the agent offers
2 actions, [0, 3], then the result of this function will be:
available_actions_tensor_with_padding = [
[0],
[3],
[0],
[0],
[0],
]
unavailable_actions_mask = [0, 0, 1, 1, 1]
Note that although the actions and padding can have overlap, the mask will always disable the
unavailable actions so won't impact algorithm.
The same goes to the case where the agent uses an identity action representation
(assuming some random features for action 0 and 3), then it would be
available_actions_tensor_with_padding = [
[0.1, 0.6, 0.3, 1.8, 2.0],
[0.8, -0.3, 0.6, 1.9, 3.0],
[0, 0, 0, 0, 0],
[0, 0, 0, 0, 0],
[0, 0, 0, 0, 0],
]
unavailable_actions_mask = [0, 0, 1, 1, 1]
"""

def _create_action_tensor_and_mask(
self,
max_number_actions: Optional[int],
available_action_space: Optional[ActionSpace],
) -> Tuple[Optional[torch.Tensor], Optional[torch.Tensor]]:
"""
Takes an action space containing only available actions,
and the maximum number of actions possible.
If the action space is continuous, returns (None, None).
If the action space is discrete, returns a pair of tensors:
1. A tensor of shape (1 x action_space_size x action_dim) that contains the available
actions.
2. A mask tensor of shape (1 x action_space_size) that contains 0 for available actions.
Example:
----------------------------------------------------------
Suppose the environment at every step has a maximum number of 5 actions, and
the agent uses a onehot action representation module. At time step t, if the agent offers
2 actions, [0, 3], then the result of this function will be:
available_actions_tensor_with_padding = [
[0],
[3],
[0],
[0],
[0],
]
unavailable_actions_mask = [0, 0, 1, 1, 1]
Note that although the actions and padding can overlap, the mask will always disable
the unavailable actions so won't impact algorithm.
The same goes to the case where the agent uses an identity action representation
(assuming some random features for action 0 and 3), then it would be
available_actions_tensor_with_padding = [
[0.1, 0.6, 0.3, 1.8, 2.0],
[0.8, -0.3, 0.6, 1.9, 3.0],
[0, 0, 0, 0, 0],
[0, 0, 0, 0, 0],
[0, 0, 0, 0, 0],
]
unavailable_actions_mask = [0, 0, 1, 1, 1]
"""
if (
self._is_action_continuous
or max_number_actions is None
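
Note: the one-hot example in the new docstring can be reproduced with a short sketch. This is illustrative only and does not call Pearl's actual helper; it simply builds tensors with the documented shapes for 5 maximum actions with actions 0 and 3 available.

    import torch

    max_number_actions = 5
    # Indices of the two available actions at this step
    # (number_of_available_actions x action_dim, with action_dim == 1 here).
    available_actions = torch.tensor([[0], [3]])

    action_dim = available_actions.shape[1]
    available_actions_tensor_with_padding = torch.zeros(
        1, max_number_actions, action_dim, dtype=available_actions.dtype
    )  # shape (1, 5, 1); padded rows stay zero
    available_actions_tensor_with_padding[0, : available_actions.shape[0]] = available_actions

    unavailable_actions_mask = torch.zeros(1, max_number_actions)  # shape (1, 5)
    unavailable_actions_mask[0, available_actions.shape[0] :] = 1

    print(available_actions_tensor_with_padding.squeeze(-1))  # tensor([[0, 3, 0, 0, 0]])
    print(unavailable_actions_mask)                           # tensor([[0., 0., 1., 1., 1.]])
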
4 changes: 2 additions & 2 deletions test/unit/with_pytorch/test_replay_buffer.py
@@ -32,10 +32,10 @@ def test_replay_buffer_not_stored_on_gpu(self) -> None:
OnPolicyReplayBuffer(replay_buffer_size),
FIFOOffPolicyReplayBuffer(replay_buffer_size),
BootstrapReplayBuffer(replay_buffer_size, p=0.5, ensemble_size=3),
-# We meant to test FIFOOnPolicyReplayBuffer, but we observe
+# We meant to test FIFOOnPolicyReplayBuffer as well, but we observe
# that it does not get filled when input is random, because
# it requires the next state of one transition
-# to be the current state of the next transition.
+# to be equal to the current state of the next transition.
# TODO: verify that we really need this restriction on this
# replay buffer.
# FIFOOnPolicyReplayBuffer,
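
Note: a minimal sketch of the chaining restriction mentioned in the comment above (illustrative only; the helper function below is hypothetical and not part of Pearl). The buffer can only complete a cached transition when the next state of one push is exactly the current state of the following push, which random, unrelated tensors never satisfy.

    import torch


    def is_chained(prev_next_state: torch.Tensor, curr_state: torch.Tensor) -> bool:
        # FIFOOnPolicyReplayBuffer relies on this equality to attach the action of
        # the current step to the transition cached at the previous step.
        return torch.equal(prev_next_state, curr_state)


    states = [torch.randn(4) for _ in range(3)]  # a tiny sequential rollout
    print(is_chained(states[1], states[1]))            # True: consecutive pushes are chained
    print(is_chained(torch.randn(4), torch.randn(4)))  # False: random inputs never chain,
                                                       # so the buffer never gets filled
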
