diff --git a/Caulet/FlappyAgent.py b/Caulet/FlappyAgent.py
new file mode 100644
index 0000000..7e52807
--- /dev/null
+++ b/Caulet/FlappyAgent.py
@@ -0,0 +1,34 @@
+import numpy as np
+import _pickle as cPickle
+
+def discrete_state(state):
+    x = str(int(round(state['next_pipe_dist_to_player']/20)))
+    y = str(int(round((state['player_y'] - state['next_pipe_bottom_y'])/20)))
+    v = str(int(round(state['player_vel'])))
+    return x+"-"+y+"-"+v
+
+flag_dict = False
+Q = dict()
+
+def FlappyPolicy(state, screen):
+    action = None
+    global flag_dict
+    global Q
+
+    if not flag_dict:
+        Q = cPickle.load(open("Qql", 'rb'))
+        flag_dict = True  # mark the Q-table as loaded so the pickle is only read once
+    s = discrete_state(state)
+
+    if s in Q.keys():
+        a = np.argmax(Q[s][:])
+    else:
+        a = 0
+
+    if a == 0:
+        action = 0
+    else:
+        action = 119
+
+    return action
diff --git a/Caulet/Note.txt b/Caulet/Note.txt
new file mode 100644
index 0000000..89aad1a
--- /dev/null
+++ b/Caulet/Note.txt
@@ -0,0 +1,5 @@
+Implementation of a Q-learning algorithm for the Flappy Bird game.
+Training is done with the training.py script and the result is stored as a dictionary.
+The game space is discretized into 20-pixel blocks relative to the next pipe.
+This discretization is encoded as a string (x-y-vel), which is used as the dictionary key.
+The structure of the Q-learning algorithm is largely inspired by the one seen in RL3.
diff --git a/Caulet/Qql b/Caulet/Qql
new file mode 100644
index 0000000..6878699
Binary files /dev/null and b/Caulet/Qql differ
diff --git a/Caulet/run.py b/Caulet/run.py
new file mode 100644
index 0000000..20761a8
--- /dev/null
+++ b/Caulet/run.py
@@ -0,0 +1,31 @@
+# You're not allowed to change this file
+from ple.games.flappybird import FlappyBird
+from ple import PLE
+import numpy as np
+from FlappyAgent import FlappyPolicy
+
+game = FlappyBird(graphics="fixed") # use "fancy" for full background, random bird color and random pipe color, use "fixed" (default) for black background and constant bird and pipe colors.
+p = PLE(game, fps=30, frame_skip=1, num_steps=1, force_fps=True, display_screen=False)
+# Note: if you want to see you agent act in real time, set force_fps to False. But don't use this setting for learning, just for display purposes.
+
+p.init()
+reward = 0.0
+
+nb_games = 100
+cumulated = np.zeros((nb_games))
+
+for i in range(nb_games):
+    p.reset_game()
+
+    while(not p.game_over()):
+        state = game.getGameState()
+        screen = p.getScreenRGB()
+        action = FlappyPolicy(state, screen) ### Your job is to define this function.
+
+        reward = p.act(action)
+        cumulated[i] = cumulated[i] + reward
+
+average_score = np.mean(cumulated)
+max_score = np.max(cumulated)
+print(average_score)
+print(max_score)
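A minimal sketch of the discretization described in Note.txt and of the greedy lookup in FlappyAgent.py (not part of the patch; the state values and the one-entry Q-table below are made up for illustration):

# discretization sketch -- hypothetical state values, toy Q-table
import numpy as np

def discrete_state(state):
    # 20-pixel blocks relative to the next pipe, as in FlappyAgent.py
    x = str(int(round(state['next_pipe_dist_to_player']/20)))
    y = str(int(round((state['player_y'] - state['next_pipe_bottom_y'])/20)))
    v = str(int(round(state['player_vel'])))
    return x+"-"+y+"-"+v

example_state = {'next_pipe_dist_to_player': 143.0,  # hypothetical values
                 'player_y': 210.0,
                 'next_pipe_bottom_y': 244.0,
                 'player_vel': -8.0}

s = discrete_state(example_state)          # "7--2--8" for the values above
Q = {s: [1.2, 0.4]}                        # toy Q-table: [value of "do nothing", value of "flap"]
a = int(np.argmax(Q[s])) if s in Q else 0  # unseen states default to "do nothing"
action = 119 if a == 1 else 0              # same mapping as FlappyAgent.py: 119 = flap, 0 = do nothing
print(s, a, action)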
diff --git a/Caulet/training.py b/Caulet/training.py
new file mode 100644
index 0000000..02da4a8
--- /dev/null
+++ b/Caulet/training.py
@@ -0,0 +1,110 @@
+import numpy as np
+from ple.games.flappybird import FlappyBird
+from ple import PLE
+import numpy as np
+import _pickle as cPickle
+
+# Discretization of the state space
+def discrete_state(state):
+    x = str(int(round(state['next_pipe_dist_to_player']/20)))
+    y = str(int(round((state['player_y'] - state['next_pipe_bottom_y'])/20)))
+    v = str(int(round(state['player_vel'])))
+    return x+"-"+y+"-"+v
+
+# GLIE actor; the raw game state was added as an extra argument to guide exploration
+def epsilon_greedy(Q, s, epsilon, state):
+    a = 0
+
+    if s in Q.keys():
+        a = np.argmax(Q[s][:])
+    random_act = np.random.rand()
+    if random_act <= epsilon:
+        if random_act <= 0.5 * epsilon:
+            if state['next_pipe_bottom_y'] - state['player_y'] < 50:
+                a = 1
+            else:
+                a = 0
+        else:
+            if state['player_y'] - state['next_pipe_top_y'] > 50:
+                a = 0
+            else:
+                a = 1
+    return a
+
+# map action index 1 to the key code 119 expected by PLE
+def call_action(a):
+    if a == 0:
+        action = 0
+    else:
+        action = 119
+
+    return action
+
+# Init
+gamma = 0.95
+alpha = 0.9
+epsilon = 0.1
+nb_games = 60000
+resolution = 10
+Q = dict()
+game = FlappyBird(graphics="fixed")
+p = PLE(game, fps=30, frame_skip=1, num_steps=1, force_fps=True, display_screen=False)
+score = 0
+score_100 = 0
+# Q-learning
+for i in range(1, nb_games):
+    if i % 100 == 0:
+        print('average over the last 100 games: %.2f' % (5 + score_100/100))  # each game ends with a -5 reward, hence the +5
+        if score_100/100 > 200:
+            break
+        score_100 = 0  # reset score_100
+
+    if i % 1000 == 0:
+        if alpha > 0.1:
+            alpha /= 2
+        print('games played: %d, states visited: %d' % (i, len(Q)))
+        print('average over the last 1000 games: %.2f' % (5 + score/1000))  # each game ends with a -5 reward, hence the +5
+        if score/1000 > 100:
+            break
+        score = 0  # reset score
+
+    if i % 4000 == 0:
+        epsilon /= 2
+
+    # episode initialization
+    p.init()
+    p.reset_game()
+    state = game.getGameState()
+    reward = training_reward = 0
+
+    s = discrete_state(state)
+    action = epsilon_greedy(Q, s, epsilon, state)
+    if s not in Q.keys(): Q[s] = [0.0, 0.0]  # only initialize unseen states, do not overwrite learned values
+
+    while not p.game_over():  # repeat
+
+        reward = p.act(call_action(action))  # returns the reward of the action: 0 if it has no immediate effect, 1 when a pipe is passed, -5 on death
+        if reward == -5:
+            training_reward = -1000  # strongly penalize the action that led to death
+        else:
+            training_reward = 1
+
+        state_ = game.getGameState()  # s'
+        s_ = discrete_state(state_)  # discrete s'
+        action_ = epsilon_greedy(Q, s_, epsilon, state_)  # in s', choose a' (GLIE actor)
+        # the raw state s' is passed to help the action choice in obvious situations
+
+        if s_ not in Q.keys():
+            Q[s_] = [0.0, 0.0]
+
+        delta = (training_reward + gamma * np.max(Q[s_][:]) - Q[s][action])  # temporal difference: δ = r + γ max_a' Q(s',a') − Q(s,a)
+        Q[s][action] = Q[s][action] + alpha * delta  # update: Q(s,a) ← Q(s,a) + αδ
+
+        s = s_  # s ← s'
+        action = action_
+
+        score += reward
+        score_100 += reward
+
+with open('Qql', 'wb') as f:
+    cPickle.dump(Q, f)
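To make the update inside training.py's while loop easier to follow, here is the same temporal-difference step isolated on a single toy transition (not part of the patch; the state keys and Q-values are invented, while gamma and alpha match the script's initial settings):

# one Q-learning step on a made-up transition
import numpy as np

gamma, alpha = 0.95, 0.9            # same discount and initial learning rate as training.py
Q = {"7--2--8": [0.0, 0.0],         # Q[s] = [value of "do nothing", value of "flap"]
     "6--1--7": [2.0, 1.0]}

s, action = "7--2--8", 0            # state and action taken
training_reward = 1                 # reshaped reward: +1 per surviving step, -1000 on death
s_ = "6--1--7"                      # discretized next state

delta = training_reward + gamma * np.max(Q[s_]) - Q[s][action]  # δ = r + γ max_a' Q(s',a') − Q(s,a)
Q[s][action] = Q[s][action] + alpha * delta                     # Q(s,a) ← Q(s,a) + αδ
print(Q[s])                         # roughly [2.61, 0.0]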
diff --git a/RandomBird/FlappyAgent.py b/RandomBird/FlappyAgent.py
index 9f3ec84..7e52807 100644
--- a/RandomBird/FlappyAgent.py
+++ b/RandomBird/FlappyAgent.py
@@ -1,9 +1,34 @@
 import numpy as np
+import _pickle as cPickle
+
+def discrete_state(state):
+    x = str(int(round(state['next_pipe_dist_to_player']/20)))
+    y = str(int(round((state['player_y'] - state['next_pipe_bottom_y'])/20)))
+    v = str(int(round(state['player_vel'])))
+    return x+"-"+y+"-"+v
+
+flag_dict = False
+Q = dict()
 
 def FlappyPolicy(state, screen):
-    action=None
-    if(np.random.randint(0,2)<1):
-        action=119
-    return action
+    action = None
+    global flag_dict
+    global Q
+
+    if not flag_dict:
+        Q = cPickle.load(open("Qql", 'rb'))
+        flag_dict = True  # mark the Q-table as loaded so the pickle is only read once
+    s = discrete_state(state)
+
+    if s in Q.keys():
+        a = np.argmax(Q[s][:])
+    else:
+        a = 0
+
+    if a == 0:
+        action = 0
+    else:
+        action = 119
+
+    return action
diff --git a/RandomBird/Note.txt b/RandomBird/Note.txt
new file mode 100644
index 0000000..89aad1a
--- /dev/null
+++ b/RandomBird/Note.txt
@@ -0,0 +1,5 @@
+Implementation of a Q-learning algorithm for the Flappy Bird game.
+Training is done with the training.py script and the result is stored as a dictionary.
+The game space is discretized into 20-pixel blocks relative to the next pipe.
+This discretization is encoded as a string (x-y-vel), which is used as the dictionary key.
+The structure of the Q-learning algorithm is largely inspired by the one seen in RL3.
diff --git a/RandomBird/Qql b/RandomBird/Qql
new file mode 100644
index 0000000..6878699
Binary files /dev/null and b/RandomBird/Qql differ
diff --git a/RandomBird/run.py b/RandomBird/run.py
index 39b5801..20761a8 100644
--- a/RandomBird/run.py
+++ b/RandomBird/run.py
@@ -5,7 +5,7 @@ from FlappyAgent import FlappyPolicy
 
 game = FlappyBird(graphics="fixed") # use "fancy" for full background, random bird color and random pipe color, use "fixed" (default) for black background and constant bird and pipe colors.
-p = PLE(game, fps=30, frame_skip=1, num_steps=1, force_fps=False, display_screen=True)
+p = PLE(game, fps=30, frame_skip=1, num_steps=1, force_fps=True, display_screen=False)
 # Note: if you want to see you agent act in real time, set force_fps to False. But don't use this setting for learning, just for display purposes.
 
 p.init()
@@ -27,3 +27,5 @@
 average_score = np.mean(cumulated)
 max_score = np.max(cumulated)
+print(average_score)
+print(max_score)
diff --git a/RandomBird/training.py b/RandomBird/training.py
new file mode 100644
index 0000000..02da4a8
--- /dev/null
+++ b/RandomBird/training.py
@@ -0,0 +1,110 @@
+import numpy as np
+from ple.games.flappybird import FlappyBird
+from ple import PLE
+import numpy as np
+import _pickle as cPickle
+
+# Discretization of the state space
+def discrete_state(state):
+    x = str(int(round(state['next_pipe_dist_to_player']/20)))
+    y = str(int(round((state['player_y'] - state['next_pipe_bottom_y'])/20)))
+    v = str(int(round(state['player_vel'])))
+    return x+"-"+y+"-"+v
+
+# GLIE actor; the raw game state was added as an extra argument to guide exploration
+def epsilon_greedy(Q, s, epsilon, state):
+    a = 0
+
+    if s in Q.keys():
+        a = np.argmax(Q[s][:])
+    random_act = np.random.rand()
+    if random_act <= epsilon:
+        if random_act <= 0.5 * epsilon:
+            if state['next_pipe_bottom_y'] - state['player_y'] < 50:
+                a = 1
+            else:
+                a = 0
+        else:
+            if state['player_y'] - state['next_pipe_top_y'] > 50:
+                a = 0
+            else:
+                a = 1
+    return a
+
+# map action index 1 to the key code 119 expected by PLE
+def call_action(a):
+    if a == 0:
+        action = 0
+    else:
+        action = 119
+
+    return action
+
+# Init
+gamma = 0.95
+alpha = 0.9
+epsilon = 0.1
+nb_games = 60000
+resolution = 10
+Q = dict()
+game = FlappyBird(graphics="fixed")
+p = PLE(game, fps=30, frame_skip=1, num_steps=1, force_fps=True, display_screen=False)
+score = 0
+score_100 = 0
+# Q-learning
+for i in range(1, nb_games):
+    if i % 100 == 0:
+        print('average over the last 100 games: %.2f' % (5 + score_100/100))  # each game ends with a -5 reward, hence the +5
+        if score_100/100 > 200:
+            break
+        score_100 = 0  # reset score_100
+
+    if i % 1000 == 0:
+        if alpha > 0.1:
+            alpha /= 2
+        print('games played: %d, states visited: %d' % (i, len(Q)))
+        print('average over the last 1000 games: %.2f' % (5 + score/1000))  # each game ends with a -5 reward, hence the +5
+        if score/1000 > 100:
+            break
+        score = 0  # reset score
+
+    if i % 4000 == 0:
+        epsilon /= 2
+
+    # episode initialization
+    p.init()
+    p.reset_game()
+    state = game.getGameState()
+    reward = training_reward = 0
+
+    s = discrete_state(state)
+    action = epsilon_greedy(Q, s, epsilon, state)
+    if s not in Q.keys(): Q[s] = [0.0, 0.0]  # only initialize unseen states, do not overwrite learned values
+
+    while not p.game_over():  # repeat
+
+        reward = p.act(call_action(action))  # returns the reward of the action: 0 if it has no immediate effect, 1 when a pipe is passed, -5 on death
+        if reward == -5:
+            training_reward = -1000  # strongly penalize the action that led to death
+        else:
+            training_reward = 1
+
+        state_ = game.getGameState()  # s'
+        s_ = discrete_state(state_)  # discrete s'
+        action_ = epsilon_greedy(Q, s_, epsilon, state_)  # in s', choose a' (GLIE actor)
+        # the raw state s' is passed to help the action choice in obvious situations
+
+        if s_ not in Q.keys():
+            Q[s_] = [0.0, 0.0]
+
+        delta = (training_reward + gamma * np.max(Q[s_][:]) - Q[s][action])  # temporal difference: δ = r + γ max_a' Q(s',a') − Q(s,a)
+        Q[s][action] = Q[s][action] + alpha * delta  # update: Q(s,a) ← Q(s,a) + αδ
+
+        s = s_  # s ← s'
+        action = action_
+
+        score += reward
+        score_100 += reward
+
+with open('Qql', 'wb') as f:
+    cPickle.dump(Q, f)
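Since Qql ships as an opaque binary, here is a short way to sanity-check it after training (not part of the patch; the printed key and values are illustrative and depend on the training run):

# inspect the pickled Q dictionary written by training.py
import _pickle as cPickle

with open("Qql", "rb") as f:
    Q = cPickle.load(f)

print(len(Q), "discretized states recorded")  # number of "x-y-vel" keys seen during training
some_key = next(iter(Q))
print(some_key, Q[some_key])                  # e.g. '7--2--8' [2.61, 0.0] -- actual values vary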