Submission #51 (Open)

wants to merge 4 commits into `master`
50 changes: 50 additions & 0 deletions PRIETO/0_Lambda SARSA/FlappyAgent.py
@@ -0,0 +1,50 @@
import numpy as np

Q = np.load("Q_matrix_LSARSA_BEST.npy")

actions = [None, 119]

state_min = np.array([0,
                      -48,
                      -10])

bin_c = np.array([4,
                  8,
                  2])

state_shape = np.array([round(284/4 + 1),
                        round(704/8 + 1),
                        round(40/2 + 1)])

print(f"The tabular Q contains {Q.size:_} state-action values.")


def FlappyPolicy(state, screen):

    # Discretize state
    s = get_engineered_state(state)

    # Ravel indexes
    s = np.ravel_multi_index(s, state_shape)

    # Greedy policy
    a = Q[s, :].argmax()

    # Return greedy action
    return actions[a]


def get_engineered_state(state):
    y = state['player_y']
    speed = state['player_vel']
    next_y_b = state['next_pipe_bottom_y']
    next_dist = state['next_pipe_dist_to_player']

    engineered_state = np.array([next_dist,
                                 next_y_b - y,
                                 speed])

    # Divide each feature by its bin width, then shift by the minimum binned
    # value so that indices start at 0
    s = np.round(engineered_state/bin_c - state_min).astype(int)

    return s
Binary file added PRIETO/0_Lambda SARSA/Q_matrix_LSARSA_BEST.npy
Binary file not shown.
Binary file not shown.
31 changes: 31 additions & 0 deletions PRIETO/0_Lambda SARSA/run.py
@@ -0,0 +1,31 @@
# You're not allowed to change this file
from ple.games.flappybird import FlappyBird
from ple import PLE
import numpy as np
from FlappyAgent import FlappyPolicy

game = FlappyBird(graphics="fancy") # use "fancy" for full background, random bird color and random pipe color, use "fixed" (default) for black background and constant bird and pipe colors.
p = PLE(game, fps=30, frame_skip=1, num_steps=1, force_fps=True, display_screen=True)
# Note: if you want to see your agent act in real time, set force_fps to False. But don't use this setting for learning, just for display purposes.

p.init()
reward = 0.0

nb_games = 10
cumulated = np.zeros((nb_games))

for i in range(nb_games):
    p.reset_game()

    while not p.game_over():
        state = game.getGameState()
        action = FlappyPolicy(state, None)  ### Your job is to define this function.

        reward = p.act(action)
        cumulated[i] = cumulated[i] + reward

average_score = np.mean(cumulated)
max_score = np.max(cumulated)

print(f"average_score: {average_score}")
print(f"max_score: {max_score}")
50 changes: 50 additions & 0 deletions PRIETO/1_Q learning/FlappyAgent.py
@@ -0,0 +1,50 @@
import numpy as np

Q = np.load("Q_matrix_QLEARNING_BEST.npy")

actions = [None, 119]

state_min = np.array([0,
                      -48,
                      -10])

bin_c = np.array([4,
                  8,
                  2])

state_shape = np.array([round(284/4 + 1),
                        round(704/8 + 1),
                        round(40/2 + 1)])

print(f"The tabular Q contains {Q.size:_} state-action values.")


def FlappyPolicy(state, screen):

    # Discretize state
    s = get_engineered_state(state)

    # Ravel indexes
    s = np.ravel_multi_index(s, state_shape)

    # Greedy policy
    a = Q[s, :].argmax()

    # Return greedy action
    return actions[a]


def get_engineered_state(state):
    y = state['player_y']
    speed = state['player_vel']
    next_y_b = state['next_pipe_bottom_y']
    next_dist = state['next_pipe_dist_to_player']

    engineered_state = np.array([next_dist,
                                 next_y_b - y,
                                 speed])

    # Divide each feature by its bin width, then shift by the minimum binned
    # value so that indices start at 0
    s = np.round(engineered_state/bin_c - state_min).astype(int)

    return s
Binary file added PRIETO/1_Q learning/Q_matrix_QLEARNING_BEST.npy
Binary file not shown.
Binary file not shown.
31 changes: 31 additions & 0 deletions PRIETO/1_Q learning/run.py
@@ -0,0 +1,31 @@
# You're not allowed to change this file
from ple.games.flappybird import FlappyBird
from ple import PLE
import numpy as np
from FlappyAgent import FlappyPolicy

game = FlappyBird(graphics="fancy") # use "fancy" for full background, random bird color and random pipe color, use "fixed" (default) for black background and constant bird and pipe colors.
p = PLE(game, fps=30, frame_skip=1, num_steps=1, force_fps=True, display_screen=True)
# Note: if you want to see your agent act in real time, set force_fps to False. But don't use this setting for learning, just for display purposes.

p.init()
reward = 0.0

nb_games = 10
cumulated = np.zeros((nb_games))

for i in range(nb_games):
    p.reset_game()

    while not p.game_over():
        state = game.getGameState()
        action = FlappyPolicy(state, None)  ### Your job is to define this function.

        reward = p.act(action)
        cumulated[i] = cumulated[i] + reward

average_score = np.mean(cumulated)
max_score = np.max(cumulated)

print(f"average_score: {average_score}")
print(f"max_score: {max_score}")
Binary file added PRIETO/2_DQN/DQN.h5
Binary file not shown.
19 changes: 19 additions & 0 deletions PRIETO/2_DQN/FlappyAgent.py
@@ -0,0 +1,19 @@
import numpy as np
from skimage.color import rgb2gray
from skimage.transform import resize

def FlappyPolicy(state, X, Q):

    actions = [119, None]

    # Stack the last 4 processed frames as channels and add a batch dimension
    X = np.expand_dims(np.stack(X, axis=-1), axis=0)

    # Greedy policy
    qa = Q.predict(X)
    a = np.argmax(qa)

    # Return greedy action
    return actions[a]


def process_screen(screen):
    # Crop the playing area, convert to grayscale, resize to 80x80 and rescale to [0, 255]
    return 255*resize(rgb2gray(screen[60:, 25:310, :]), (80, 80))
Binary file added PRIETO/2_DQN/__pycache__/FlappyAgent.cpython-36.pyc
Binary file not shown.
48 changes: 48 additions & 0 deletions PRIETO/2_DQN/run.py
@@ -0,0 +1,48 @@
# You're not allowed to change this file
from ple.games.flappybird import FlappyBird
from ple import PLE
import numpy as np
from FlappyAgent import FlappyPolicy, process_screen
from collections import deque
from keras.models import load_model

game = FlappyBird(graphics="fixed") # use "fancy" for full background, random bird color and random pipe color, use "fixed" (default) for black background and constant bird and pipe colors.
p = PLE(game, fps=30, frame_skip=1, num_steps=1, force_fps=True, display_screen=True)
# Note: if you want to see your agent act in real time, set force_fps to False. But don't use this setting for learning, just for display purposes.

p.init()
reward = 0.0

nb_games = 100
cumulated = np.zeros((nb_games))

Q = load_model("DQN.h5")

Q.summary()

for i in range(nb_games):
    p.reset_game()
    X = deque([np.zeros((80, 80)),
               np.zeros((80, 80)),
               np.zeros((80, 80)),
               np.zeros((80, 80))], maxlen=4)

    while not p.game_over():
        state = game.getGameState()

        # Process screen
        s = process_screen(p.getScreenRGB())

        X.append(s)

        action = FlappyPolicy(state, X, Q)  ### Your job is to define this function.

        reward = p.act(action)
        cumulated[i] = cumulated[i] + reward
        start = False

average_score = np.mean(cumulated)
max_score = np.max(cumulated)

print(f"average_score: {average_score}")
print(f"max_score: {max_score}")
28 changes: 28 additions & 0 deletions PRIETO/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
# RL challenge

Several reinforcement learning algorithms and extensions were explored while conducting this assignment. Most of them achieved above-human performance (well above the score of 15 required by the original assignment), while one did not converge. The source code, containing the training scripts, reflects all the exploration that was done. The explored algorithms and their status are listed below, followed by a small illustrative sketch of the tabular update used by approaches 1 and 2:

1. Lambda SARSA - Status: OK
    * on engineered state
2. Q-learning - Status: OK
    * on engineered state
3. Q-learning with NN - Status: NOT OK
    * on full state
    * with Prioritized Experience Replay \[ICLR 2016\]
    * and Double Q-learning \[Deep Reinforcement Learning with Double Q-learning, AAAI 2016\]
4. DQN - Status: OK
    * on pixels
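
As a rough illustration of approaches 1 and 2 (this is a minimal sketch, not the author's training code, which lives in `training/`; the names `alpha`, `gamma`, `q_learning_update` and the hyperparameter values are assumptions), a one-step tabular Q-learning update over the raveled engineered state could look like this:

```python
import numpy as np

# Sketch only: table sized to the raveled engineered state (72 x 89 x 21 bins)
# and two actions (do nothing, flap). Hyperparameters are illustrative.
alpha, gamma = 0.1, 0.99
n_states, n_actions = 72 * 89 * 21, 2
Q = np.zeros((n_states, n_actions))

def q_learning_update(s, a, r, s_next, done):
    # Bootstrap from the greedy value of the next state unless the episode ended
    target = r if done else r + gamma * Q[s_next].max()
    Q[s, a] += alpha * (target - Q[s, a])
```

SARSA(lambda) differs by backing up the value of the action actually taken and spreading the update over recently visited state-action pairs through eligibility traces.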

## To run

In order to evaluate the performance of the trained algorithms, you should open the corresponding folder:
* `0_Lambda SARSA/`
* `1_Q learning/`
* `2_DQN/`

and run `run.py` (see the example below). Note that only the algorithms that converged successfully are available to evaluate.
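
For instance, assuming the command is issued from the repository root and that `python` points to an environment with PLE and Keras installed (both assumptions), one evaluation can be launched as:

```python
import subprocess

# Run the Lambda SARSA evaluation from its own folder so the relative paths
# inside run.py (e.g. the saved Q matrix) resolve correctly.
subprocess.run(["python", "run.py"], cwd="PRIETO/0_Lambda SARSA", check=True)
```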


## Source code

All the source code is contained in the folder `training/`.