# ddqn_Agent.py

import tensorflow as tf
from keras.layers import Dense, Activation
from keras.models import Sequential, load_model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.utils import plot_model
import numpy as np
import time

# Ask TensorFlow for the GPU devices it can see. The resulting list is not
# referenced anywhere else in the visible part of this file.
physical_devices = tf.config.list_physical_devices('GPU')
# Replay buffer: lets the agent sample (state, action, reward, next state, done)
# transitions collected across many different episodes, which also helps keep it from getting stuck.
class ReplayBuffer(object):
    """Fixed-capacity circular store of (s, a, r, s', not-done) transitions.

    Once ``max_size`` transitions have been written, new ones overwrite the
    oldest slots. Discrete actions are kept one-hot encoded as ``int8``;
    otherwise the action values are stored as ``float32``.
    """

    def __init__(self, max_size, input_shape, n_actions, discrete=False):
        """Allocate zero-filled storage for ``max_size`` transitions.

        ``input_shape`` is used as the second dimension of the state arrays
        and ``n_actions`` as the second dimension of the action array.
        """
        self.mem_size = max_size
        # Total number of transitions ever stored (keeps growing past max_size).
        self.mem_cntr = 0
        self.discrete = discrete

        state_dims = (max_size, input_shape)
        self.state_memory = np.zeros(state_dims)
        # State observed after the action was taken.
        self.new_state_memory = np.zeros(state_dims)

        if discrete:
            action_dtype = np.int8
        else:
            action_dtype = np.float32
        self.action_memory = np.zeros((max_size, n_actions),
                                      dtype=action_dtype)

        self.reward_memory = np.zeros(max_size)
        # Holds 1 - done, i.e. 0 for terminal transitions, so it can be used
        # as a multiplicative mask on the bootstrapped future value.
        self.terminal_memory = np.zeros(max_size, dtype=np.float32)

    def store_transition(self, state, action, reward, state_, done):
        """Write one transition into the next slot, wrapping when full."""
        slot = self.mem_cntr % self.mem_size

        self.state_memory[slot] = state
        self.new_state_memory[slot] = state_

        if not self.discrete:
            self.action_memory[slot] = action
        else:
            # Discrete actions arrive as an index and are stored one-hot.
            one_hot = np.zeros(self.action_memory.shape[1])
            one_hot[action] = 1.0
            self.action_memory[slot] = one_hot

        self.reward_memory[slot] = reward
        self.terminal_memory[slot] = 1 - done
        self.mem_cntr += 1

    def sample_buffer(self, batch_size):
        """Return a random batch ``(states, actions, rewards, states_, terminal)``.

        Indices are drawn with ``np.random.choice`` from the slots that have
        been written so far, so untouched zero rows are never returned.
        """
        filled = min(self.mem_cntr, self.mem_size)
        picks = np.random.choice(filled, batch_size)

        return (
            self.state_memory[picks],
            self.action_memory[picks],
            self.reward_memory[picks],
            self.new_state_memory[picks],
            self.terminal_memory[picks],
        )

def Model(lr, n_actions, input_dims, fc_dims):
    """Build and compile the fully connected Q-network.

    The network has three ReLU hidden layers of ``fc_dims`` units each,
    followed by a linear output layer with one unit per action. It is
    compiled with Adam (learning rate ``lr``, decay 0.001) and an MSE loss.
    """
    network = Sequential()

    # First hidden layer also declares the input size.
    network.add(Dense(fc_dims, input_shape=(input_dims,), activation='relu'))
    for _ in range(2):
        network.add(Dense(fc_dims, activation='relu'))

    # Linear output: one Q-value per action.
    network.add(Dense(n_actions))

    optimizer = Adam(learning_rate=lr, decay=0.001)
    network.compile(optimizer=optimizer, loss='mse')

    return network

class DDQNAgent(object):
    """Double-DQN agent with an epsilon-greedy policy and a replay buffer.

    Two networks are kept: ``q_eval`` (trained every step, used to pick
    actions) and ``q_target`` (a periodically synced copy used to evaluate
    the value of the action that ``q_eval`` selects for the next state).

    ``gamma`` discounts the bootstrapped future value, since the predicted
    future reward may or may not actually be obtained.
    """

    def __init__(self, alpha, gamma, n_actions, epsilon, batch_size,
                 input_dims, epsilon_dec=0.9995,  epsilon_end=0.01,
                 mem_size=1000000, fname='Model',
                 replace_target=100):
        """Create the replay memory and the online/target networks.

        Args:
            alpha: learning rate passed to ``Model``.
            gamma: discount factor applied to the next-state value.
            n_actions: number of discrete actions.
            epsilon: initial exploration probability.
            batch_size: number of transitions sampled per ``learn`` call.
            input_dims: size of a state vector.
            epsilon_dec: multiplicative epsilon decay applied per ``learn``.
            epsilon_end: lower bound for epsilon.
            mem_size: capacity of the replay buffer.
            fname: prefix of the file name used by ``save_model``.
            replace_target: the target network is synced whenever the
                replay counter is a multiple of this value.
        """
        self.action_space = list(range(n_actions))
        self.n_actions = n_actions
        self.gamma = gamma
        self.epsilon = epsilon
        self.epsilon_dec = epsilon_dec
        self.epsilon_min = epsilon_end
        self.batch_size = batch_size
        self.model_file = fname
        self.replace_target = replace_target
        self.memory = ReplayBuffer(mem_size, input_dims, n_actions,
                                   discrete=True)
        # Both networks use hidden layers of 32 units.
        self.q_eval = Model(alpha, n_actions, input_dims, 32)
        self.q_target = Model(alpha, n_actions, input_dims, 32)

    def remember(self, state, action, reward, new_state, done):
        """Store one transition in the replay buffer."""
        self.memory.store_transition(state, action, reward, new_state, done)

    def choose_action(self, state):
        """Return an action index chosen epsilon-greedily for ``state``.

        With probability ``epsilon`` a uniformly random action is returned;
        otherwise the action with the highest ``q_eval`` prediction.
        """
        # Add a leading batch axis so the network sees a batch of one.
        state = np.array(state)[np.newaxis, :]

        if np.random.random() < self.epsilon:
            return np.random.choice(self.action_space)

        # verbose=0 avoids a progress bar being printed on every step.
        actions = self.q_eval.predict(state, verbose=0)
        return np.argmax(actions)

    def learn(self):
        """Run one Double-DQN update on a batch sampled from the buffer.

        Does nothing until the buffer has received more than ``batch_size``
        transitions. After fitting, epsilon is decayed (never below
        ``epsilon_min``) and the target network is synced when the replay
        counter is a multiple of ``replace_target``.
        """
        if self.memory.mem_cntr <= self.batch_size:
            return

        # The last element is the buffer's terminal mask (1 - done).
        state, action, reward, new_state, not_done = \
            self.memory.sample_buffer(self.batch_size)

        # Actions are stored one-hot; argmax recovers the index directly and,
        # unlike a dot product with an int8 index table, cannot overflow when
        # there are more than 127 actions.
        action_indices = np.argmax(action, axis=1)

        q_next = self.q_target.predict(new_state, verbose=0)
        q_eval = self.q_eval.predict(new_state, verbose=0)
        q_pred = self.q_eval.predict(state, verbose=0)

        # Double DQN: the online network picks the next action, the target
        # network supplies its value.
        max_actions = np.argmax(q_eval, axis=1)

        # Start from the current predictions so that only the Q-value of the
        # action actually taken contributes to the loss.
        q_target = q_pred
        batch_index = np.arange(self.batch_size, dtype=np.int32)
        q_target[batch_index, action_indices] = reward + \
            self.gamma * q_next[batch_index, max_actions] * not_done

        self.q_eval.fit(state, q_target, verbose=0)

        # Clamp so a decay step can never push epsilon below its floor.
        self.epsilon = max(self.epsilon * self.epsilon_dec, self.epsilon_min)

        if self.memory.mem_cntr % self.replace_target == 0:
            self.update_network_parameters()

    def update_network_parameters(self):
        """Copy the online network's weights into the target network."""
        self.q_target.set_weights(self.q_eval.get_weights())

    def save_model(self):
        """Save ``q_eval`` to ``Models/<fname><dd-mm-YYYY-HH-MM>.h5``."""
        import os

        timestr = time.strftime("%d-%m-%Y-%H-%M")
        path = "Models/"+self.model_file+timestr+".h5"
        # Make sure the destination directory exists before saving, since a
        # missing directory would otherwise make the save fail.
        os.makedirs(os.path.dirname(path), exist_ok=True)
        self.q_eval.save(path)

    def load_model(self, path):
        """Load both networks from the model file at ``path``.

        Prints the summary of the loaded online network.
        """
        self.q_eval = load_model(path)
        self.q_eval.summary()
        self.q_target = load_model(path)
        # When epsilon is exactly 0.0 (evaluation mode) the target network is
        # re-synced from q_eval.
        # NOTE(review): both networks were just loaded from the same file, so
        # this sync looks redundant - confirm before removing.
        if self.epsilon == 0.0:
            self.update_network_parameters()

    def Plotit(self):
        """Write a diagram of ``q_eval`` to ``dot_img_file.png``."""
        plot_model(self.q_eval, to_file="dot_img_file.png", show_shapes=True)