From da3dc2dc788a8d483e75abc306ebb94d562e2c37 Mon Sep 17 00:00:00 2001
From: kenjyoung
Date: Wed, 12 Oct 2022 16:22:31 -0600
Subject: [PATCH] Fixed a bug in Seaquest which caused the oxygen bar to flash
to full one time step before oxygen ran out. Thanks to @ehsan-fmb for
pointing this out!
---
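Note for reviewers (this text sits between the "---" marker and the diffstat, so `git am` will ignore it): the "flash to full" is a consequence of Python's negative-slice semantics in the gauge rendering. With the old `oxygen < 0` check, the state could be rendered once with `oxygen == -1`, and floor division turns that into the slice `0:-1`, which lights nearly the whole bar. A minimal standalone sketch of the effect; the `max_oxygen = 200` constant is assumed to match seaquest.py:

    import numpy as np

    max_oxygen = 200  # assumed to match the constant in seaquest.py

    def oxygen_row(oxygen):
        # Mirrors the pre-patch gauge rendering:
        #   state[9, 0:self.oxygen*10//max_oxygen, self.channels['oxygen_guage']] = 1
        row = np.zeros(10, dtype=int)
        row[0:oxygen * 10 // max_oxygen] = 1
        return row

    print(oxygen_row(max_oxygen))  # [1 1 1 1 1 1 1 1 1 1] -- full bar
    print(oxygen_row(0))           # [0 0 0 0 0 0 0 0 0 0] -- empty bar
    # Pre-patch, termination was only flagged once oxygen < 0, so the state
    # was rendered once with oxygen == -1. Floor division gives
    # -1 * 10 // 200 == -1, and the slice 0:-1 lights cells 0 through 8:
    print(oxygen_row(-1))          # [1 1 1 1 1 1 1 1 1 0] -- the one-step flash

The patch closes this in two places: terminating at `oxygen <= 0` so a negative value is never rendered, and clamping the slice start with `max(0, self.oxygen)` as a defensive guard.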
README.md | 7 ++++++-
minatar/environments/seaquest.py | 6 +++---
setup.py | 2 +-
3 files changed, 10 insertions(+), 5 deletions(-)
diff --git a/README.md b/README.md
index 2554775..a28ef2d 100644
--- a/README.md
+++ b/README.md
@@ -11,6 +11,7 @@ MinAtar is a testbed for AI agents which implements miniaturized versions of sev
+
## Quick Start
To use MinAtar, you need python3 installed; make sure pip is also up to date. To run the included `DQN` and `AC_lambda` examples, you need `PyTorch`. To install MinAtar, please follow the steps below:
@@ -113,7 +114,9 @@ This will enter the agent environment interaction loop and then run the GUI thre
- [Julia](https://github.com/mkschleg/MinAtar.jl/blob/master/README.md)
## Results
-The following plots display results for DQN (Mnih et al., 2015) and actor-critic (AC) with eligibility traces. Our DQN agent uses a significantly smaller network compared to that of Mnih et al., 2015. We display results for DQN with and without experience reply. Our AC agent uses a similar architecture to DQN, but does not use experience replay. We display results for two values of the trace decay parameter, 0.8 and 0.0. Each curve is the average of 30 independent runs with different random seeds. The top plots display the sensitivity of final performance to the step-size parameter, while the bottom plots display the average return during training as a function of training frames. For further information, see the paper on MinAtar available [here](https://arxiv.org/abs/1903.03176).
+The following plots display results for DQN (Mnih et al., 2015) and actor-critic (AC) with eligibility traces. Our DQN agent uses a significantly smaller network compared to that of Mnih et al., 2015. We display results for DQN with and without experience replay. Our AC agent uses a similar architecture to DQN, but does not use experience replay. We display results for two values of the trace decay parameter, 0.8 and 0.0. Each curve is the average of 30 independent runs with different random seeds. The top plots display the sensitivity of final performance to the step-size parameter, while the bottom plots display the average return during training as a function of training frames. For further information, see the paper on MinAtar available [here](https://arxiv.org/abs/1903.03176).
+
+**Note: the currently displayed results for Seaquest are for MinAtar v1.0.10 and lower, where a bug caused the oxygen bar to flash to full one step before oxygen ran out.** Results for the updated version may differ.
@@ -139,6 +142,8 @@ The player begins at the bottom of the screen and the motion is restricted to tr
### Seaquest
The player controls a submarine consisting of two cells, front and back, to allow direction to be determined. The player can also fire bullets from the front of the submarine. Enemies consist of submarines and fish, distinguished by the fact that submarines shoot bullets and fish do not. A reward of +1 is given each time an enemy is struck by one of the player's bullets, at which point the enemy is also removed. There are also divers which the player can move onto to pick up, doing so increments a bar indicated by another channel along the bottom of the screen. The player also has a limited supply of oxygen indicated by another bar in another channel. Oxygen degrades over time and is replenished whenever the player moves to the top of the screen as long as the player has at least one rescued diver on board. The player can carry a maximum of 6 divers. When surfacing with less than 6, one diver is removed. When surfacing with 6, all divers are removed and a reward is given for each active cell in the oxygen bar. Each time the player surfaces the difficulty is increased by increasing the spawn rate and movement speed of enemies. Termination occurs when the player is hit by an enemy fish, sub or bullet; or when oxygen reaches 0; or when the player attempts to surface with no rescued divers. Enemy and diver directions are indicated by a trail channel active in their previous location to reduce partial observability.
+**Note: MinAtar v1.0.10 and lower have a bug in Seaquest which causes the oxygen bar to flash to full one time-step before termination occurred due to oxygen running out. This could have a significant impact on agents which learn from one-step transitions, as a full oxygen bar could mean either full oxygen or imminent termination due to no oxygen. For this reason, Seaquest results obtained prior to v1.0.11 may not be consistent with results obtained from v1.0.11 onward.**
+
[Video](https://www.youtube.com/watch?v=W9k38b5QPxA&t)
### Space Invaders
diff --git a/minatar/environments/seaquest.py b/minatar/environments/seaquest.py
index d42276d..f718919 100644
--- a/minatar/environments/seaquest.py
+++ b/minatar/environments/seaquest.py
@@ -66,7 +66,7 @@ def act(self, a):
r = 0
if(self.terminal):
return r, self.terminal
-
+
a = self.action_map[a]
# Spawn enemy if timer is up
@@ -194,7 +194,7 @@ def act(self, a):
self.e_spawn_timer -= self.e_spawn_timer>0
self.d_spawn_timer -= self.d_spawn_timer>0
self.shot_timer -= self.shot_timer>0
- if(self.oxygen<0):
+ if(self.oxygen<=0):
self.terminal = True
if(self.sub_y>0):
self.oxygen-=1
@@ -260,7 +260,7 @@ def state(self):
state[self.sub_y,self.sub_x,self.channels['sub_front']] = 1
back_x = self.sub_x-1 if self.sub_or else self.sub_x+1
state[self.sub_y,back_x,self.channels['sub_back']] = 1
- state[9,0:self.oxygen*10//max_oxygen, self.channels['oxygen_guage']] = 1
+ state[9,0:max(0,self.oxygen)*10//max_oxygen, self.channels['oxygen_guage']] = 1
state[9,9-self.diver_count:9, self.channels['diver_guage']] = 1
for bullet in self.f_bullets:
state[bullet[1],bullet[0], self.channels['friendly_bullet']] = 1
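Aside for reviewers (not part of the applied diff): the one-step delay also follows from the order of operations in `act`, where the terminal flag is checked before oxygen is decremented. A hypothetical driver contrasting the old and new checks:

    def steps_until_terminal(oxygen, check):
        # Mimic act()'s ordering: test the terminal condition first,
        # then decrement oxygen (player below the surface).
        steps = 0
        while not check(oxygen):
            oxygen -= 1
            steps += 1
        return steps, oxygen

    # Old check (v1.0.10 and lower): survives one extra step and terminates
    # at oxygen == -1, the value that rendered as a nearly full bar.
    print(steps_until_terminal(3, lambda o: o < 0))   # (4, -1)

    # New check (v1.0.11): terminates the step oxygen is exhausted, so a
    # negative value never reaches the renderer.
    print(steps_until_terminal(3, lambda o: o <= 0))  # (3, 0)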
diff --git a/setup.py b/setup.py
index b2e9f3e..f827fb5 100644
--- a/setup.py
+++ b/setup.py
@@ -25,7 +25,7 @@
setup(
name='MinAtar',
- version='1.0.10',
+ version='1.0.11',
description='A miniaturized version of the arcade learning environment.',
url='https://github.com/kenjyoung/MinAtar',
author='Kenny Young',