#!/usr/bin/python
# -*- coding: utf-8 -*-
import os

import torch
import torch.nn as nn
import numpy as np
import numpy.random as rd
from copy import deepcopy
from core import *
from torch.utils.tensorboard import SummaryWriter

class AgentBase:
    def __init__(self):
        self.learning_rate = 1e-4
        self.soft_update_tau = 2 ** -8  # 5e-3 ~= 2 ** -8
        self.state = None  # set in explore_env(); initialize it before training
        self.device = None

        self.act = self.act_target = None
        self.cri = self.cri_target = None
        self.act_optimizer = None
        self.cri_optimizer = None
        self.criterion = None

        self.writer = SummaryWriter()
        self.update_num = 0

    def init(self, net_dim, state_dim, action_dim):
        """initialize the attributes declared in `__init__()`

        replaced by different DRL algorithms.
        explicitly call self.init() for multiprocessing.

        :int net_dim: the dimension of the networks (the width of the neural networks)
        :int state_dim: the dimension of the state (the length of the state vector)
        :int action_dim: the dimension of the action (the number of discrete actions)
        """

    def select_action(self, state) -> np.ndarray:
        """Select an action for exploration

        :array state: state.shape == (state_dim, )
        :return array action: action.shape == (action_dim, ), (action.min(), action.max()) == (-1, +1)
        """
        states = torch.as_tensor((state,), dtype=torch.float32, device=self.device).detach_()
        action = self.act(states)[0]
        return action.cpu().numpy()

    def select_actions(self, states) -> np.ndarray:
        """Select a batch of actions for exploration

        :array states: (state, ) or (state, state, ...) or states.shape == (n, *state_dim)
        :return array actions: actions.shape == (n, action_dim), (actions.min(), actions.max()) == (-1, +1)
        """
        states = torch.as_tensor(states, dtype=torch.float32, device=self.device).detach_()
        actions = self.act(states)
        return actions.cpu().numpy()  # -1 < action < +1

    def explore_env(self, env, buffer, target_step, reward_scale, gamma) -> int:
        """actor explores in env, then stores the env transitions in the ReplayBuffer

        :env: RL training environment. env.reset() env.step()
        :buffer: Experience Replay Buffer. buffer.append_buffer() buffer.extend_buffer()
        :int target_step: explore for target_step steps in env
        :float reward_scale: scale the reward, 'reward * reward_scale'
        :float gamma: discount factor, 'mask = 0.0 if done else gamma'
        :return int target_step: the number of collected steps in env
        """
        for _ in range(target_step):
            action = self.select_action(self.state)
            next_s, reward, done, _ = env.step(action)
            other = (reward * reward_scale, 0.0 if done else gamma, *action)  # (reward, mask, *action)
            buffer.append_buffer(self.state, other)
            self.state = env.reset() if done else next_s
        return target_step

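    # Note on the buffer row layout implied above (a remark, not part of the algorithm):
    #     (state, other)  with  other = (reward * reward_scale, mask, *action),
    # where mask = 0.0 at terminal states and gamma otherwise. update_net() later samples
    # these rows back as (reward, mask, action, state, next_state) batches.
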
    def update_net(self, buffer, target_step, batch_size, repeat_times) -> (float, float):
        """update the neural networks by sampling batch data from the ReplayBuffer

        replaced by different DRL algorithms.
        return the objective values as training information to help fine-tuning.

        :buffer: Experience Replay Buffer. buffer.append_buffer() buffer.extend_buffer()
        :int target_step: explore for target_step steps in env
        :int batch_size: sample batch_size of data for Stochastic Gradient Descent
        :float repeat_times: the number of sampled batches = int(target_step * repeat_times) in off-policy
        :return float obj_a: the objective value of the actor
        :return float obj_c: the objective value of the critic
        """

    def save_load_model(self, cwd, if_save):
        """save or load model files

        :str cwd: current working directory, where the model files are saved
        :bool if_save: True: save the model; False: load the model
        """
        act_save_path = '{}/actor.pth'.format(cwd)
        cri_save_path = '{}/critic.pth'.format(cwd)

        def load_torch_file(network, save_path):
            network_dict = torch.load(save_path, map_location=lambda storage, loc: storage)
            network.load_state_dict(network_dict)

        if if_save:
            if self.act is not None:
                torch.save(self.act.state_dict(), act_save_path)
            if self.cri is not None:
                torch.save(self.cri.state_dict(), cri_save_path)
        elif (self.act is not None) and os.path.exists(act_save_path):
            # for the DQN-family agents below, self.act is self.cri, so loading the actor is enough
            load_torch_file(self.act, act_save_path)
            print("Loaded act:", cwd)
        elif (self.cri is not None) and os.path.exists(cri_save_path):
            load_torch_file(self.cri, cri_save_path)
            print("Loaded cri:", cwd)
        else:
            print("FileNotFound when load_model: {}".format(cwd))

    @staticmethod
    def soft_update(target_net, current_net, tau):
        """soft update a target network via the current network

        target_param <- tau * current_param + (1 - tau) * target_param

        :nn.Module target_net: the target network, updated via the current network; it is more stable
        :nn.Module current_net: the current network, updated via an optimizer
        :float tau: the soft update rate, e.g. 2 ** -8
        """
        for tar, cur in zip(target_net.parameters(), current_net.parameters()):
            tar.data.copy_(cur.data * tau + tar.data * (1 - tau))

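# A typical way to drive an AgentBase subclass (a sketch for orientation, not part of the
# original training pipeline; `env` and `buffer` are assumed to provide the reset()/step()
# and append_buffer()/sample_batch() interfaces referenced in the methods above):
#
#     agent.init(net_dim, state_dim, action_dim)
#     agent.state = env.reset()
#     for _ in range(num_iterations):
#         agent.explore_env(env, buffer, target_step, reward_scale, gamma)
#         obj_a, obj_c = agent.update_net(buffer, target_step, batch_size, repeat_times)
#     agent.save_load_model(cwd, if_save=True)
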

class AgentDQN(AgentBase):
    def __init__(self):
        super().__init__()
        self.explore_rate = 0.1  # the probability of choosing an action randomly in epsilon-greedy
        self.action_dim = None  # choose a discrete action randomly in epsilon-greedy

    def init(self, net_dim, state_dim, action_dim):
        self.action_dim = action_dim
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        self.cri = QNet(net_dim, state_dim, action_dim).to(self.device)
        self.cri_target = deepcopy(self.cri)
        self.act = self.cri  # to keep the same interface as the Actor-Critic framework

        self.criterion = torch.nn.MSELoss()
        self.cri_optimizer = torch.optim.Adam(self.cri.parameters(), lr=self.learning_rate)

    def select_action(self, state) -> int:  # for a discrete action space
        if rd.rand() < self.explore_rate:  # epsilon-greedy
            a_int = rd.randint(self.action_dim)  # choose an action randomly
        else:
            states = torch.as_tensor((state,), dtype=torch.float32, device=self.device).detach_()
            action = self.act(states)[0]
            a_int = action.argmax(dim=0).cpu().numpy()
        return a_int

    def explore_env(self, env, buffer, target_step, reward_scale, gamma) -> int:
        for _ in range(target_step):
            action = self.select_action(self.state)
            next_s, reward, done, _ = env.step(action)

            other = (reward * reward_scale, 0.0 if done else gamma, action)  # action is an int
            buffer.append_buffer(self.state, other)
            self.state = env.reset() if done else next_s
        return target_step

    def update_net(self, buffer, target_step, batch_size, repeat_times) -> (float, float):
        buffer.update_now_len_before_sample()

        next_q = obj_critic = None
        for _ in range(int(target_step * repeat_times)):
            with torch.no_grad():
                reward, mask, action, state, next_s = buffer.sample_batch(batch_size)  # next_state
                next_q = self.cri_target(next_s).max(dim=1, keepdim=True)[0]
                q_label = reward + mask * next_q  # Q-learning target: r + gamma * max_a' Q_target(s', a')
            q_eval = self.cri(state).gather(1, action.type(torch.long))
            obj_critic = self.criterion(q_eval, q_label)

            self.cri_optimizer.zero_grad()
            obj_critic.backward()
            self.cri_optimizer.step()
            self.soft_update(self.cri_target, self.cri, self.soft_update_tau)
        return next_q.mean().item(), obj_critic.item()


class AgentDoubleDQN(AgentDQN):
    def __init__(self):
        super().__init__()
        self.explore_rate = 0.25  # the probability of choosing an action randomly in epsilon-greedy
        self.softmax = torch.nn.Softmax(dim=1)

    def init(self, net_dim, state_dim, action_dim):
        self.action_dim = action_dim
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        self.cri = QNetTwin(net_dim, state_dim, action_dim).to(self.device)
        self.cri_target = deepcopy(self.cri)
        self.act = self.cri

        self.criterion = torch.nn.SmoothL1Loss()
        self.cri_optimizer = torch.optim.Adam(self.act.parameters(), lr=self.learning_rate)

    def select_action(self, state) -> int:  # for a discrete action space
        states = torch.as_tensor((state,), dtype=torch.float32, device=self.device).detach_()
        actions = self.act(states)
        if rd.rand() < self.explore_rate:  # epsilon-greedy
            action = self.softmax(actions)[0]
            a_prob = action.detach().cpu().numpy()  # choose an action according to the Q values
            a_int = rd.choice(self.action_dim, p=a_prob)
        else:
            action = actions[0]
            a_int = action.argmax(dim=0).cpu().numpy()
        return a_int

    def update_net(self, buffer, target_step, batch_size, repeat_times) -> (float, float):
        """Contribution of DDQN (Double DQN)

        Twin Q-Networks. Use min(q1, q2) to reduce over-estimation.
        """
        buffer.update_now_len_before_sample()

        next_q = obj_critic = None
        for _ in range(int(target_step * repeat_times)):
            with torch.no_grad():
                reward, mask, action, state, next_s = buffer.sample_batch(batch_size)
                next_q = torch.min(*self.cri_target.get_q1_q2(next_s))
                next_q = next_q.max(dim=1, keepdim=True)[0]
                q_label = reward + mask * next_q
            act_int = action.type(torch.long)
            q1, q2 = [qs.gather(1, act_int) for qs in self.act.get_q1_q2(state)]
            obj_critic = self.criterion(q1, q_label) + self.criterion(q2, q_label)

            self.cri_optimizer.zero_grad()
            obj_critic.backward()
            self.cri_optimizer.step()
            self.soft_update(self.cri_target, self.cri, self.soft_update_tau)

            self.update_num += 1
            self.writer.add_scalar('loss_Q', obj_critic, self.update_num)
        return next_q.mean().item(), obj_critic.item() / 2

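# The target used in AgentDoubleDQN.update_net() above, written out for reference:
#     q_label = reward + mask * max_a' min(Q1_target(s', a'), Q2_target(s', a'))
# with mask = 0.0 at terminal states and gamma otherwise.
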

class AgentD3QN(AgentDoubleDQN):  # D3QN: Dueling Double DQN
    def __init__(self):
        super().__init__()

    def init(self, net_dim, state_dim, action_dim):
        """Contribution of D3QN (Dueling Double DQN)

        D3QN makes no new contribution of its own:
        DoubleDQN is obviously compatible with DuelingDQN,
        and any beginner can come up with this idea (D3QN) independently.
        """
        self.action_dim = action_dim
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        self.cri = QNetTwinDuel(net_dim, state_dim, action_dim).to(self.device)
        self.cri_target = deepcopy(self.cri)
        self.act = self.cri

        self.criterion = torch.nn.SmoothL1Loss()
        self.cri_optimizer = torch.optim.Adam(self.act.parameters(), lr=self.learning_rate)


if __name__ == "__main__":
    agent = AgentD3QN()
    agent.init(128, 3, 1)
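    # Minimal smoke test (a sketch added for illustration, not part of the training
    # pipeline): sample one discrete action for a random 3-dimensional state.
    demo_state = np.random.randn(3).astype(np.float32)
    print("sampled discrete action:", agent.select_action(demo_state))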