Submission #51 (Open)

wants to merge 4 commits into `master`
50 changes: 50 additions & 0 deletions PRIETO/0_Lambda SARSA/FlappyAgent.py
@@ -0,0 +1,50 @@
import numpy as np

Q = np.load("Q_matrix_LSARSA_BEST.npy")

actions = [None, 119]

state_min = np.array([0,
                      -48,
                      -10])

bin_c = np.array([4,
                  8,
                  2])

state_shape = np.array([round(284/4 + 1),
                        round(704/8 + 1),
                        round(40/2 + 1)])

print(f"The tabular Q contains {Q.size:_} state-action values.")


def FlappyPolicy(state, screen):

    # Discretize state
    s = get_engineered_state(state)

    # Ravel indexes
    s = np.ravel_multi_index(s, state_shape)

    # Greedy policy
    a = Q[s, :].argmax()

    # Return greedy action
    return actions[a]


def get_engineered_state(state):
    y = state['player_y']
    speed = state['player_vel']
    next_y_b = state['next_pipe_bottom_y']
    next_dist = state['next_pipe_dist_to_player']

    engineered_state = np.array([next_dist,
                                 next_y_b - y,
                                 speed])

    # Divide each feature by its bin width, then shift by the minimum binned
    # value so that indices start at 0
    s = np.round(engineered_state/bin_c - state_min).astype(int)

    return s
Binary file added PRIETO/0_Lambda SARSA/Q_matrix_LSARSA_BEST.npy
Binary file not shown.
Binary file not shown.
31 changes: 31 additions & 0 deletions PRIETO/0_Lambda SARSA/run.py
@@ -0,0 +1,31 @@
# You're not allowed to change this file
from ple.games.flappybird import FlappyBird
from ple import PLE
import numpy as np
from FlappyAgent import FlappyPolicy

game = FlappyBird(graphics="fancy") # use "fancy" for full background, random bird color and random pipe color, use "fixed" (default) for black background and constant bird and pipe colors.
p = PLE(game, fps=30, frame_skip=1, num_steps=1, force_fps=True, display_screen=True)
# Note: if you want to see your agent act in real time, set force_fps to False. But don't use this setting for learning, just for display purposes.

p.init()
reward = 0.0

nb_games = 10
cumulated = np.zeros((nb_games))

for i in range(nb_games):
    p.reset_game()

    while not p.game_over():
        state = game.getGameState()
        action = FlappyPolicy(state, None)  ### Your job is to define this function.

        reward = p.act(action)
        cumulated[i] = cumulated[i] + reward

average_score = np.mean(cumulated)
max_score = np.max(cumulated)

print(f"average_score: {average_score}")
print(f"max_score: {max_score}")
50 changes: 50 additions & 0 deletions PRIETO/1_Q learning/FlappyAgent.py
@@ -0,0 +1,50 @@
import numpy as np

Q = np.load("Q_matrix_QLEARNING_BEST.npy")

actions = [None, 119]

state_min = np.array([0,
                      -48,
                      -10])

bin_c = np.array([4,
                  8,
                  2])

state_shape = np.array([round(284/4 + 1),
                        round(704/8 + 1),
                        round(40/2 + 1)])

print(f"The tabular Q contains {Q.size:_} state-action values.")


def FlappyPolicy(state, screen):

    # Discretize state
    s = get_engineered_state(state)

    # Ravel indexes
    s = np.ravel_multi_index(s, state_shape)

    # Greedy policy
    a = Q[s, :].argmax()

    # Return greedy action
    return actions[a]


def get_engineered_state(state):
    y = state['player_y']
    speed = state['player_vel']
    next_y_b = state['next_pipe_bottom_y']
    next_dist = state['next_pipe_dist_to_player']

    engineered_state = np.array([next_dist,
                                 next_y_b - y,
                                 speed])

    # Divide each feature by its bin width, then shift by the minimum binned
    # value so that indices start at 0
    s = np.round(engineered_state/bin_c - state_min).astype(int)

    return s
Binary file added PRIETO/1_Q learning/Q_matrix_QLEARNING_BEST.npy
Binary file not shown.
Binary file not shown.
31 changes: 31 additions & 0 deletions PRIETO/1_Q learning/run.py
@@ -0,0 +1,31 @@
# You're not allowed to change this file
from ple.games.flappybird import FlappyBird
from ple import PLE
import numpy as np
from FlappyAgent import FlappyPolicy

game = FlappyBird(graphics="fancy") # use "fancy" for full background, random bird color and random pipe color, use "fixed" (default) for black background and constant bird and pipe colors.
p = PLE(game, fps=30, frame_skip=1, num_steps=1, force_fps=True, display_screen=True)
# Note: if you want to see your agent act in real time, set force_fps to False. But don't use this setting for learning, just for display purposes.

p.init()
reward = 0.0

nb_games = 10
cumulated = np.zeros((nb_games))

for i in range(nb_games):
    p.reset_game()

    while not p.game_over():
        state = game.getGameState()
        action = FlappyPolicy(state, None)  ### Your job is to define this function.

        reward = p.act(action)
        cumulated[i] = cumulated[i] + reward

average_score = np.mean(cumulated)
max_score = np.max(cumulated)

print(f"average_score: {average_score}")
print(f"max_score: {max_score}")
Binary file added PRIETO/2_DQN/DQN.h5
Binary file not shown.
19 changes: 19 additions & 0 deletions PRIETO/2_DQN/FlappyAgent.py
@@ -0,0 +1,19 @@
import numpy as np
from skimage.color import rgb2gray
from skimage.transform import resize

def FlappyPolicy(state, X, Q):

    actions = [119, None]

    # Stack the last 4 processed frames as channels and add a batch dimension
    X = np.expand_dims(np.stack(X, axis=-1), axis=0)

    # Greedy policy
    qa = Q.predict(X)
    a = np.argmax(qa)

    # Return greedy action
    return actions[a]


def process_screen(screen):
    # Crop the playing area, convert to grayscale, resize to 80x80 and rescale to [0, 255]
    return 255*resize(rgb2gray(screen[60:, 25:310, :]), (80, 80))
Binary file added PRIETO/2_DQN/__pycache__/FlappyAgent.cpython-36.pyc
Binary file not shown.
48 changes: 48 additions & 0 deletions PRIETO/2_DQN/run.py
@@ -0,0 +1,48 @@
# You're not allowed to change this file
from ple.games.flappybird import FlappyBird
from ple import PLE
import numpy as np
from FlappyAgent import FlappyPolicy, process_screen
from collections import deque
from keras.models import load_model

game = FlappyBird(graphics="fixed") # use "fancy" for full background, random bird color and random pipe color, use "fixed" (default) for black background and constant bird and pipe colors.
p = PLE(game, fps=30, frame_skip=1, num_steps=1, force_fps=True, display_screen=True)
# Note: if you want to see your agent act in real time, set force_fps to False. But don't use this setting for learning, just for display purposes.

p.init()
reward = 0.0

nb_games = 100
cumulated = np.zeros((nb_games))

Q = load_model("DQN.h5")

Q.summary()

for i in range(nb_games):
    p.reset_game()
    X = deque([np.zeros((80, 80)),
               np.zeros((80, 80)),
               np.zeros((80, 80)),
               np.zeros((80, 80))], maxlen=4)

    while not p.game_over():
        state = game.getGameState()

        # Process screen
        s = process_screen(p.getScreenRGB())

        X.append(s)

        action = FlappyPolicy(state, X, Q)  ### Your job is to define this function.

        reward = p.act(action)
        cumulated[i] = cumulated[i] + reward
        start = False

average_score = np.mean(cumulated)
max_score = np.max(cumulated)

print(f"average_score: {average_score}")
print(f"max_score: {max_score}")
28 changes: 28 additions & 0 deletions PRIETO/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
# RL challenge

Several reinforcement learning algorithms and extensions were explored while conducting this assignment. Most of them achieved above-human performance (well above the score of 15 required by the original assignment), while one did not converge. The source code, containing the training scripts, reflects all the exploration that was done. The explored algorithms and their status are listed below, followed by a small illustrative sketch of the tabular update used by approaches 1 and 2:

1. Lambda SARSA - Status: OK
    * on engineered state
2. Q-learning - Status: OK
    * on engineered state
3. Q-learning with NN - Status: NOT OK
    * on full state
    * with Prioritized Experience Replay \[ICLR 2016\]
    * and Double Q-learning \[Deep Reinforcement Learning with Double Q-learning, AAAI 2016\]
4. DQN - Status: OK
    * on pixels
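
As a rough illustration of approaches 1 and 2 (this is a minimal sketch, not the author's training code, which lives in `training/`; the names `alpha`, `gamma`, `q_learning_update` and the hyperparameter values are assumptions), a one-step tabular Q-learning update over the raveled engineered state could look like this:

```python
import numpy as np

# Sketch only: table sized to the raveled engineered state (72 x 89 x 21 bins)
# and two actions (do nothing, flap). Hyperparameters are illustrative.
alpha, gamma = 0.1, 0.99
n_states, n_actions = 72 * 89 * 21, 2
Q = np.zeros((n_states, n_actions))

def q_learning_update(s, a, r, s_next, done):
    # Bootstrap from the greedy value of the next state unless the episode ended
    target = r if done else r + gamma * Q[s_next].max()
    Q[s, a] += alpha * (target - Q[s, a])
```

SARSA(lambda) differs by backing up the value of the action actually taken and spreading the update over recently visited state-action pairs through eligibility traces.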

## To run

In order to evaluate the performance of the trained algorithms, you should open the corresponding folder:
* `0_Lambda SARSA/`
* `1_Q learning/`
* `2_DQN/`

and run `run.py` (see the example below). Note that only the algorithms that converged successfully are available to evaluate.
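
For instance, assuming the command is issued from the repository root and that `python` points to an environment with PLE and Keras installed (both assumptions), one evaluation can be launched as:

```python
import subprocess

# Run the Lambda SARSA evaluation from its own folder so the relative paths
# inside run.py (e.g. the saved Q matrix) resolve correctly.
subprocess.run(["python", "run.py"], cwd="PRIETO/0_Lambda SARSA", check=True)
```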


## Source code

All the source code is contained in the folder `training/`.