
Commit 546203b

Super DQN agent and a small change to the env
1 parent cb83517 commit 546203b

9 files changed (+3276, −1 lines)

Buffer_module.py (+163)
@@ -0,0 +1,163 @@
# -*- coding: utf-8 -*-
# Buffer_priority.py
# author: yangrui
# description:
# created: 2019-11-01T11:52:44.496Z+08:00
# last-modified: 2019-11-01T11:52:44.496Z+08:00


import numpy as np


class SumTree(object):
    data_pointer = 0

    def __init__(self, capacity):
        self.capacity = capacity  # number of priority values (leaves)
        self.tree = np.zeros(2 * capacity - 1)
        # [--------------Parent nodes-------------][-------leaves to record priority-------]
        #             size: capacity - 1                       size: capacity
        self.data = np.zeros(capacity, dtype=object)  # all transitions, stored as objects (effectively pointers)
        # [--------------data frame-------------]
        #             size: capacity

    def add(self, p, data):
        tree_idx = self.data_pointer + self.capacity - 1  # position of the corresponding leaf in the tree
        self.data[self.data_pointer] = data  # update data_frame
        self.update(tree_idx, p)  # update tree_frame

        self.data_pointer += 1
        if self.data_pointer >= self.capacity:  # overwrite oldest entries when capacity is exceeded
            self.data_pointer = 0

    def update(self, tree_idx, p):
        change = p - self.tree[tree_idx]
        self.tree[tree_idx] = p
        # then propagate the change up through the tree
        while tree_idx != 0:  # this loop is faster than the recursion in the reference code
            tree_idx = (tree_idx - 1) // 2
            self.tree[tree_idx] += change

    def get_leaf(self, v):
        """
        Tree structure and array storage:

        Tree index:
             0         -> storing priority sum
            / \
           1   2
          / \ / \
         3  4 5  6     -> storing priority for transitions

        Array type for storing:
        [0, 1, 2, 3, 4, 5, 6]
        """
        parent_idx = 0
        while True:  # the while loop is faster than the method in the reference code
            cl_idx = 2 * parent_idx + 1  # this node's left and right children
            cr_idx = cl_idx + 1
            if cl_idx >= len(self.tree):  # reached the bottom, end the search
                leaf_idx = parent_idx
                break
            else:  # downward search, always descend toward a higher-priority node
                if v <= self.tree[cl_idx]:
                    parent_idx = cl_idx
                else:
                    v -= self.tree[cl_idx]
                    parent_idx = cr_idx

        data_idx = leaf_idx - self.capacity + 1
        return leaf_idx, self.tree[leaf_idx], self.data[data_idx]

    @property
    def total_p(self):
        return self.tree[0]  # the root


class Buffer_PER(object):  # stored as ( s, a, r, s_ ) in SumTree
    epsilon = 0.01  # small amount to avoid zero priority
    alpha = 0.6  # [0~1] converts the magnitude of the TD error to a priority
    beta = 0.4  # importance-sampling exponent, annealed from its initial value to 1
    beta_increment_per_sampling = 0.001
    abs_err_upper = 1.  # clipped abs error

    def __init__(self, capacity):
        self.tree = SumTree(capacity)

    def store(self, transition):
        max_p = np.max(self.tree.tree[-self.tree.capacity:])
        if max_p == 0:
            max_p = self.abs_err_upper
        self.tree.add(max_p, transition)  # assign the current max priority to the new transition

    def sample(self, n):
        b_idx, b_memory, ISWeights = np.empty((n,), dtype=np.int32), np.empty((n, self.tree.data[0].size)), np.empty((n, 1))
        pri_seg = self.tree.total_p / n  # priority segment
        self.beta = np.min([1., self.beta + self.beta_increment_per_sampling])  # max = 1

        min_prob = np.min(self.tree.tree[-self.tree.capacity:]) / self.tree.total_p  # for later ISWeights calculation
        for i in range(n):
            a, b = pri_seg * i, pri_seg * (i + 1)
            v = np.random.uniform(a, b)
            idx, p, data = self.tree.get_leaf(v)
            prob = p / self.tree.total_p
            ISWeights[i, 0] = np.power(prob / min_prob, -self.beta)
            b_idx[i], b_memory[i, :] = idx, data

        return b_idx, b_memory, ISWeights

    def batch_update(self, tree_idx, abs_errors):
        abs_errors += self.epsilon  # avoid zero priority
        clipped_errors = np.minimum(abs_errors, self.abs_err_upper)
        ps = np.power(clipped_errors, self.alpha)
        for ti, p in zip(tree_idx, ps):
            self.tree.update(ti, p)


# top-level buffer class
class Buffer():
    def __init__(self, n_features, buffer_type='', capacity=int(1e4)):
        self.memory_size = int(capacity)  # cast to int so it can be used as an array dimension
        self.n_features = n_features
        self.type = buffer_type
        self.memory_counter = 0

        if self.type == 'priority':
            self.memory = Buffer_PER(capacity=self.memory_size)
        else:
            self.memory = np.zeros((self.memory_size, n_features * 2 + 2))

    def store(self, transition):
        self.memory_counter += 1

        if self.type == 'priority':
            self.memory.store(transition)
        else:
            index = self.memory_counter % self.memory_size
            self.memory[index, :] = transition

    def sample(self, batch_size):
        info = None
        if self.type == 'priority':
            tree_idx, batch_memory, ISWeights = self.memory.sample(batch_size)
            info = (tree_idx, ISWeights)
        else:
            sample_index = np.random.choice(self.memory_size, size=batch_size)  # assumes the buffer has already been filled
            batch_memory = self.memory[sample_index, :]

        return batch_memory, info

    def update(self, tree_idx, td_errors):
        assert self.type == 'priority'
        self.memory.batch_update(tree_idx, td_errors)
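
For reference, a minimal usage sketch of the Buffer wrapper defined above; the feature size, capacity, and random transitions are illustrative assumptions, not part of this commit:

import numpy as np
from Buffer_module import Buffer

n_features = 16  # e.g. a flattened 4x4 board
buf = Buffer(n_features, buffer_type='priority', capacity=8)

# fill every SumTree leaf with a real transition before sampling
for _ in range(8):
    s, s_ = np.random.rand(n_features), np.random.rand(n_features)
    a, r = 1, 0.5
    buf.store(np.hstack((s, [a, r], s_)))  # layout: (s, a, r, s_)

batch, info = buf.sample(4)                  # batch has shape (4, n_features * 2 + 2)
tree_idx, ISWeights = info                   # info is only populated for the priority buffer
buf.update(tree_idx, np.random.rand(4, 1))   # feed back |TD errors| as new priorities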

NN_module.py (+72)
@@ -0,0 +1,72 @@
# -*- coding: utf-8 -*-
# NN.py
# author: yangrui
# description:
# created: 2019-10-30T16:32:31.081Z+08:00
# last-modified: 2019-10-30T16:32:31.081Z+08:00


import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np

# CNN network
class CNN_Net(nn.Module):
    def __init__(self, input_len, output_num, conv_size=(32, 64), fc_size=(1024, 128), out_softmax=False):
        super(CNN_Net, self).__init__()
        self.input_len = input_len
        self.output_num = output_num
        self.out_softmax = out_softmax

        self.conv1 = nn.Sequential(
            nn.Conv2d(1, conv_size[0], kernel_size=3, stride=1, padding=1),
            # nn.BatchNorm2d(32),
            nn.ReLU(inplace=True)
        )
        self.conv2 = nn.Sequential(
            nn.Conv2d(conv_size[0], conv_size[1], kernel_size=3, stride=1, padding=1),
            # nn.BatchNorm2d(64),
            nn.ReLU(inplace=True),
            # nn.MaxPool2d(kernel_size=2, stride=2)
        )

        self.fc1 = nn.Linear(conv_size[1] * self.input_len * self.input_len, fc_size[0])
        self.fc2 = nn.Linear(fc_size[0], fc_size[1])
        self.head = nn.Linear(fc_size[1], self.output_num)

    def forward(self, x):
        x = x.reshape(-1, 1, self.input_len, self.input_len)
        x = self.conv1(x)
        x = self.conv2(x)
        x = x.view(x.size(0), -1)
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))

        output = self.head(x)
        if self.out_softmax:
            output = F.softmax(output, dim=1)  # value-function estimation should not use a softmax
        return output


# fully-connected network
class FC_Net(nn.Module):
    def __init__(self, input_num, output_num, fc_size=(1024, 128), out_softmax=False):
        super(FC_Net, self).__init__()
        self.input_num = input_num
        self.output_num = output_num
        self.out_softmax = out_softmax

        self.fc1 = nn.Linear(self.input_num, fc_size[0])
        self.fc2 = nn.Linear(fc_size[0], fc_size[1])
        self.head = nn.Linear(fc_size[1], self.output_num)

    def forward(self, x):
        x = x.reshape(-1, self.input_num)
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))

        output = self.head(x)
        if self.out_softmax:
            output = F.softmax(output, dim=1)  # value-function estimation should not use a softmax
        return output

dqn_agent.py (+137)
@@ -0,0 +1,137 @@
# -*- coding: utf-8 -*-
# dqn_agent.py
# author: yangrui
# description:
# created: 2019-10-12T11:07:45.524Z+08:00
# last-modified: 2019-10-12T11:07:45.524Z+08:00

import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import copy
import utils
from NN_module import CNN_Net, FC_Net
from Buffer_module import Buffer


class DQN():
    batch_size = 128
    lr = 1e-4
    epsilon = 0.15
    memory_capacity = int(1e4)
    gamma = 0.99
    q_network_iteration = 200
    save_path = "./save/"
    soft_update_theta = 0.1
    clip_norm_max = 1
    train_interval = 5
    conv_size = (32, 64)  # number of filters per conv layer
    fc_size = (512, 128)

    def __init__(self, num_state, num_action, enable_double=False, enable_priority=True):
        super(DQN, self).__init__()
        self.num_state = num_state
        self.num_action = num_action
        self.state_len = int(np.sqrt(self.num_state))
        self.enable_double = enable_double
        self.enable_priority = enable_priority

        self.eval_net, self.target_net = CNN_Net(self.state_len, num_action, self.conv_size, self.fc_size), CNN_Net(self.state_len, num_action, self.conv_size, self.fc_size)
        # self.eval_net, self.target_net = FC_Net(self.num_state, self.num_action), FC_Net(self.num_state, self.num_action)

        self.learn_step_counter = 0
        # only use the prioritized buffer when enable_priority is set
        self.buffer = Buffer(self.num_state, 'priority' if self.enable_priority else '', self.memory_capacity)
        # self.memory = np.zeros((self.memory_capacity, num_state * 2 + 2))
        self.initial_epsilon = self.epsilon
        self.optimizer = torch.optim.Adam(self.eval_net.parameters(), lr=self.lr)

    def select_action(self, state, random=False, deterministic=False):
        state = torch.unsqueeze(torch.FloatTensor(state), 0)
        if (not random and np.random.random() > self.epsilon) or deterministic:  # greedy policy
            action_value = self.eval_net.forward(state)
            action = torch.max(action_value.reshape(-1, self.num_action), 1)[1].data.numpy()
        else:  # random policy
            action = np.random.randint(0, self.num_action)
        return action

    def store_transition(self, state, action, reward, next_state):
        state = state.reshape(-1)
        next_state = next_state.reshape(-1)

        transition = np.hstack((state, [action, reward], next_state))
        self.buffer.store(transition)
        # index = self.memory_counter % self.memory_capacity
        # self.memory[index, :] = transition
        # self.memory_counter += 1

    def update(self):
        # soft-update the target network parameters
        if self.learn_step_counter % self.q_network_iteration == 0 and self.learn_step_counter:
            for p_e, p_t in zip(self.eval_net.parameters(), self.target_net.parameters()):
                p_t.data = self.soft_update_theta * p_e.data + (1 - self.soft_update_theta) * p_t.data

        self.learn_step_counter += 1

        # sample a batch from memory
        if self.enable_priority:
            batch_memory, (tree_idx, ISWeights) = self.buffer.sample(self.batch_size)
        else:
            batch_memory, _ = self.buffer.sample(self.batch_size)

        batch_state = torch.FloatTensor(batch_memory[:, :self.num_state])
        batch_action = torch.LongTensor(batch_memory[:, self.num_state: self.num_state + 1].astype(int))
        batch_reward = torch.FloatTensor(batch_memory[:, self.num_state + 1: self.num_state + 2])
        batch_next_state = torch.FloatTensor(batch_memory[:, -self.num_state:])

        # q_eval
        q_eval_total = self.eval_net(batch_state)
        q_eval = q_eval_total.gather(1, batch_action)
        q_next = self.target_net(batch_next_state).detach()

        if self.enable_double:
            # Double DQN: pick the greedy action with the online net on the *next* state,
            # then evaluate that action with the target net
            q_eval_argmax = self.eval_net(batch_next_state).detach().max(1)[1].view(self.batch_size, 1)
            q_max = q_next.gather(1, q_eval_argmax).view(self.batch_size, 1)
        else:
            q_max = q_next.max(1)[0].view(self.batch_size, 1)

        q_target = batch_reward + self.gamma * q_max

        if self.enable_priority:
            abs_errors = (q_target - q_eval.data).abs()
            self.buffer.update(tree_idx, abs_errors)
            # loss = (torch.FloatTensor(ISWeights) * (q_target - q_eval).pow(2)).mean()
            loss = (q_target - q_eval).pow(2).mean()  # dropping the ISWeights might work better??

            # print(ISWeights)
            # print(loss)
            # import pdb; pdb.set_trace()
        else:
            loss = F.mse_loss(q_eval, q_target)

        self.optimizer.zero_grad()
        loss.backward()
        nn.utils.clip_grad_norm_(self.eval_net.parameters(), self.clip_norm_max)
        self.optimizer.step()

        return loss

    def save(self, path=None, name='dqn_net.pkl'):
        path = self.save_path if not path else path
        utils.check_path_exist(path)
        torch.save(self.eval_net.state_dict(), path + name)

    def load(self, path=None, name='dqn_net.pkl'):
        path = self.save_path if not path else path
        self.eval_net.load_state_dict(torch.load(path + name))

    def epsilon_decay(self, episode, total_episode):
        self.epsilon = self.initial_epsilon * (1 - episode / total_episode)
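
A sketch of how the agent above might be driven against a gym-style environment. The environment interface (reset/step over a flattened 4x4 board) and the episode count are assumptions for illustration, not part of this commit:

import numpy as np
from dqn_agent import DQN

def train(env, total_episode=1000):
    # assumes 2048-like observations: 16 state features, 4 actions
    agent = DQN(num_state=16, num_action=4, enable_double=True, enable_priority=True)
    for episode in range(total_episode):
        state = env.reset()
        done, step = False, 0
        while not done:
            action = int(agent.select_action(state))
            next_state, reward, done, _ = env.step(action)
            agent.store_transition(state, action, reward, next_state)
            # train every few environment steps once enough transitions are stored
            if step % agent.train_interval == 0 and agent.buffer.memory_counter > agent.batch_size:
                agent.update()
            state, step = next_state, step + 1
        agent.epsilon_decay(episode, total_episode)
    agent.save()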

gym_2048.py (+1 −1)
@@ -57,7 +57,7 @@ def __init__(self):
         self.set_illegal_move_reward(0.)
         self.set_max_tile(None)

-        self.max_illegal = 50  # max number of illegal actions
+        self.max_illegal = 10  # max number of illegal actions
         self.num_illegal = 0

         # Initialise seed
