
Commit

updates for version 0.3.1: Higher NPS 250->300, enabled transposition table, added time management regime by spending less time on obvious moves, added opening guard moves to avoid exploration of moves < 5% for a given number of moves in the opening, added increasing cpuct value as described by recent DeepMind publication
QueensGambit committed Dec 26, 2018
1 parent 8c705dc commit ddbc816
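
The growing cpuct value mentioned in the commit message follows the schedule described in DeepMind's AlphaZero publication; below is a minimal sketch, assuming the constants from the published pseudocode (the values actually used by this engine may differ):

import math

# constants as given in the AlphaZero pseudocode; assumptions, not taken from this diff
CPUCT_INIT = 1.25
CPUCT_BASE = 19652

def current_cpuct(parent_visits: int) -> float:
    # the exploration constant grows logarithmically with the parent node's visit count
    return math.log((parent_visits + CPUCT_BASE + 1) / CPUCT_BASE) + CPUCT_INIT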
Showing 30 changed files with 1,135 additions and 473 deletions.
Empty file modified DeepCrazyhouse/src/domain/__init__.py
100644 → 100755
Empty file.
16 changes: 15 additions & 1 deletion DeepCrazyhouse/src/domain/abstract_cls/_GameState.py
100644 → 100755
@@ -17,7 +17,7 @@ def __init__(self, board):
self.board = board
self._fen_dic = {}

def apply_move(self, move: chess.Move, remember_state=False):
def apply_move(self, move: chess.Move): #, remember_state=False):
self.board.push(move)

def get_state_planes(self):
@@ -52,5 +52,19 @@ def get_board_fen(self):
return self.board.fen()
#return self.board.fen().rsplit(' ', 1)[0]

def get_transposition_key(self):
"""
Returns an identifier key for the current board state excluding move counters.
Calling ._transposition_key() is faster than .fen()
:return:
"""
return self.board._transposition_key()

def new_game(self):
raise NotImplementedError

def get_halfmove_counter(self):
return self.board.halfmove_clock

def get_fullmove_number(self):
return self.board.fullmove_number
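
A minimal sketch (not part of this commit) of how such a key can back a transposition table during search, assuming a plain dict shared across the tree; node_factory is an illustrative callable:

transposition_table = {}

def get_or_create_node(state, node_factory):
    # identical positions reached through different move orders share one entry
    key = state.get_transposition_key()
    node = transposition_table.get(key)
    if node is None:
        node = node_factory(state)
        transposition_table[key] = node
    return node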
Empty file modified DeepCrazyhouse/src/domain/abstract_cls/__init__.py
100644 → 100755
Empty file.
8 changes: 7 additions & 1 deletion DeepCrazyhouse/src/domain/agent/NeuralNetAPI.py
100644 → 100755
@@ -1,6 +1,5 @@
import logging
import numpy as np
import DeepCrazyhouse.src.runtime.Colorer
import time
import json
import glob
@@ -79,6 +78,13 @@ def __init__(self, ctx='cpu', batch_size=1):
grad_req='null', force_rebind=True)
self.executor.copy_params_from(arg_params, aux_params)

self.executors = []
for i in range(batch_size):
executor = sym.simple_bind(ctx=self.ctx, data=(i+1, NB_CHANNELS_FULL, BOARD_HEIGHT, BOARD_WIDTH),
grad_req='null', force_rebind=True)
executor.copy_params_from(arg_params, aux_params)
self.executors.append(executor)

def get_executor(self):
"""
Returns the executor object used for inference
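
The loop above binds one executor per possible batch size (1 to batch_size), so a partially filled batch can be evaluated without zero-padding. A minimal sketch of the intended lookup, under the assumption that the list is indexed by batch size minus one (as the NetPredService diff further below suggests):

def predict_batch(net, planes_batch):
    # planes_batch: mx.nd.array with n <= batch_size samples
    executor = net.executors[len(planes_batch) - 1]  # executor bound to exactly n samples
    pred = executor.forward(is_train=False, data=planes_batch)
    # value head output and softmax-activated policy head output
    return pred[0].asnumpy(), pred[1].softmax().asnumpy()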
Empty file modified DeepCrazyhouse/src/domain/agent/README.md
100644 → 100755
Empty file.
Empty file modified DeepCrazyhouse/src/domain/agent/__init__.py
100644 → 100755
Empty file.
720 changes: 552 additions & 168 deletions DeepCrazyhouse/src/domain/agent/player/MCTSAgent.py
100644 → 100755

Large diffs are not rendered by default.

28 changes: 17 additions & 11 deletions DeepCrazyhouse/src/domain/agent/player/RawNetAgent.py
100644 → 100755
@@ -13,32 +13,38 @@
from DeepCrazyhouse.src.domain.agent.NeuralNetAPI import NeuralNetAPI
from DeepCrazyhouse.src.domain.crazyhouse.output_representation import get_probs_of_move_list, value_to_centipawn
from time import time

import sys

class RawNetAgent(_Agent):

def __init__(self, net: NeuralNetAPI, temperature=0., clip_quantil=0., verbose=True):
super().__init__(temperature, clip_quantil, verbose)
def __init__(self, net: NeuralNetAPI, temperature=0., temperature_moves=4, verbose=True):
super().__init__(temperature, temperature_moves, verbose)
self._net = net

def evaluate_board_state(self, state: _GameState, verbose=True):
def evaluate_board_state(self, state: _GameState):
"""
:param state:
:return:
"""

t_start_eval = time()
pred_value, pred_policy = self._net.predict_single(state.get_state_planes())

legal_moves = list(state.get_legal_moves())

p_vec_small = get_probs_of_move_list(pred_policy, legal_moves, state.is_white_to_move())

if verbose is True:
# use the move with the highest probability as the best move for logging
instinct_move = legal_moves[p_vec_small.argmax()]
# use the move with the highest probability as the best move for logging
instinct_move = legal_moves[p_vec_small.argmax()]

# show the best calculated line
print('info score cp %d depth %d nodes %d time %d pv %s' % (
value_to_centipawn(pred_value), 1, 1, (time() - t_start_eval) * 1000, instinct_move.uci()))
# define the remaining return variables
time_e = (time() - t_start_eval)
cp = value_to_centipawn(pred_value)
depth = 1
nodes = 1
time_elapsed_s = time_e * 1000
nps = nodes/time_e
pv = instinct_move.uci()

return pred_value, legal_moves, p_vec_small
return pred_value, legal_moves, p_vec_small, cp, depth, nodes, time_elapsed_s, nps, pv
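
A minimal sketch (not part of this diff) of how a caller could turn the extra return values into the UCI info line that the removed print statement used to emit; the helper name is illustrative:

def format_uci_info(cp, depth, nodes, time_elapsed_s, nps, pv):
    # mirrors the 'info score cp ... pv ...' string previously printed inside the agent
    return 'info score cp %d depth %d nodes %d time %d nps %d pv %s' % (
        cp, depth, nodes, time_elapsed_s, nps, pv)

Note that despite its name, time_elapsed_s already holds milliseconds (time_e * 1000), which matches what the UCI time field expects.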
92 changes: 37 additions & 55 deletions DeepCrazyhouse/src/domain/agent/player/_Agent.py
100644 → 100755
@@ -17,10 +17,11 @@ class _Agent:
The greedy agent always performs the first legal move with the highest move probability
"""

def __init__(self, temperature=0., clip_quantil=0., verbose=True):
def __init__(self, temperature=0, temperature_moves=4, verbose=True):
self.temperature = temperature
self.p_vec_small = None
self.clip_quantil = clip_quantil
self.temperature_current = temperature
self.temperature_moves = temperature_moves
#self.p_vec_small = None
self.verbose = verbose

def evaluate_board_state(self, state: _GameState):
@@ -29,76 +30,57 @@ def evaluate_board_state(self, state: _GameState):
def perform_action(self, state: _GameState):

# the first step is to call your policy agent to evaluate the given position
value, legal_moves, self.p_vec_small = self.evaluate_board_state(state)
value, legal_moves, p_vec_small, cp, depth, nodes, time_elapsed_s, nps, pv = self.evaluate_board_state(state)

if len(legal_moves) != len(self.p_vec_small):
raise Exception('Legal move list %s is incompatible with policy vector %s' % (legal_moves, self.p_vec_small))
if len(legal_moves) != len(p_vec_small):
raise Exception('Legal move list %s is incompatible with policy vector %s' % (legal_moves, p_vec_small))

if state.get_fullmove_number() <= self.temperature_moves:
self.temperature_current = self.temperature
else:
self.temperature_current = 0

if len(legal_moves) == 1:
selected_move = legal_moves[0]
confidence = 1.
idx = 0
else:
if self.temperature <= 0.01:
idx = self.p_vec_small.argmax()
if self.temperature_current <= 0.01:
idx = p_vec_small.argmax()
else:
self._apply_temperature_to_policy()
self._apply_quantil_clipping()
idx = np.random.choice(range(len(legal_moves)), p=self.p_vec_small)
p_vec_small = self._apply_temperature_to_policy(p_vec_small)
idx = np.random.choice(range(len(legal_moves)), p=p_vec_small)

selected_move = legal_moves[idx]
confidence = self.p_vec_small[idx]
confidence = p_vec_small[idx]

return value, selected_move, confidence, idx

def _apply_quantil_clipping(self):
"""
if value > 0:
# check for draw and decline if value is greater 0
state_future = deepcopy(state)
state_future.apply_move(selected_move)
if state_future.get_pythonchess_board().can_claim_threefold_repetition() is True:
p_vec_small[idx] = 0
idx = p_vec_small.argmax()
selected_move = legal_moves[idx]
confidence = p_vec_small[idx]

:param p_vec_small:
:param clip_quantil:
:return:
"""
return value, selected_move, confidence, idx, cp, depth, nodes, time_elapsed_s, nps, pv

if self.clip_quantil > 0:
# remove the lower percentage values in order to avoid strange blunders for moves with low confidence
p_vec_small_clipped = deepcopy(self.p_vec_small)

# get the sorted indices in ascending order
idx_order = np.argsort(self.p_vec_small)
# create a quantil tank which measures how much quantil power is left
quantil_tank = self.clip_quantil

# iterate over the indices (ascending) and apply the quantil clipping to it
for idx in idx_order:
if quantil_tank >= p_vec_small_clipped[idx]:
# remove the prob from the quantil tank
quantil_tank -= p_vec_small_clipped[idx]
# clip the index to 0
p_vec_small_clipped[idx] = 0
else:
# the target prob is greater than the current quantil tank
p_vec_small_clipped[idx] -= quantil_tank
# stop the for loop
break

# renormalize the policy
p_vec_small_clipped /= p_vec_small_clipped.sum()

# apply the changes
self.p_vec_small = p_vec_small_clipped

def _apply_temperature_to_policy(self):
def _apply_temperature_to_policy(self, p_vec_small):
"""
:return:
"""
# treat a very small temperature value as a deterministic policy
if self.temperature <= 0.01:
p_vec_one_hot = np.zeros_like(self.p_vec_small)
p_vec_one_hot[np.argmax(self.p_vec_small)] = 1.
self.p_vec_small = p_vec_one_hot
if self.temperature_current <= 0.01:
p_vec_one_hot = np.zeros_like(p_vec_small)
p_vec_one_hot[np.argmax(p_vec_small)] = 1.
p_vec_small = p_vec_one_hot
else:
# apply exponential scaling
self.p_vec_small = np.power(self.p_vec_small, 1/self.temperature)
p_vec_small = p_vec_small ** (1/self.temperature_current)
# renormalize the values to probabilities again
self.p_vec_small /= self.p_vec_small.sum()
p_vec_small /= p_vec_small.sum()

return p_vec_small
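
As a concrete illustration of the exponential temperature scaling above (numbers chosen purely for illustration): with p_vec_small = [0.5, 0.3, 0.2] and temperature_current = 0.5, the exponent 1/T = 2 yields [0.25, 0.09, 0.04], which renormalizes to roughly [0.658, 0.237, 0.105], sharpening the distribution toward the strongest move; a temperature above 1 would flatten it instead.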

Empty file modified DeepCrazyhouse/src/domain/agent/player/__init__.py
100644 → 100755
Empty file.
89 changes: 69 additions & 20 deletions DeepCrazyhouse/src/domain/agent/player/util/NetPredService.py
100644 → 100755
@@ -15,17 +15,25 @@
import numpy as np
from DeepCrazyhouse.src.domain.crazyhouse.output_representation import NB_LABELS, LABELS
from time import time
import cython


class NetPredService:

def __init__(self, pipe_endings: [connection], net: NeuralNetAPI, batch_size, enable_timeout=False):
def __init__(self, pipe_endings: [connection], net: NeuralNetAPI, batch_size, batch_state_planes: np.ndarray,
batch_value_results: np.ndarray, batch_policy_results: np.ndarray):
"""
:param pipe_endings: List of pipe endings used for communicating with the thread workers.
:param net: Neural Network API object which provides the reference for the neural network.
:param batch_size: Constant batch_size used for inference.
:param enable_timeout: Decides whether to enable a timeout if a batch didn't occur within 1 second.
:param batch_state_planes: Shared numpy memory in which all threads set their state plane request for the
prediction service. Each thread has its own channel.
:param batch_value_results: Shared numpy memory in which the value results of all threads are stored.
Each thread has its own channel.
:param batch_policy_results: Shared numpy memory in which the policy results of all threads are stored.
Each thread has its own channel.
#:param enable_timeout: Decides whether to enable a timeout if a batch didn't occur within 1 second.
"""
self.net = net
self.my_pipe_endings = pipe_endings
@@ -34,41 +42,72 @@ def __init__(self, pipe_endings: [connection], net: NeuralNetAPI, batch_size, en
self.thread_inference = Thread(target=self._provide_inference, args=(pipe_endings,), daemon=True)
self.batch_size = batch_size

self.time_start = None
self.timeout_second = 1
#self.enable_timeout = enable_timeout
self.batch_state_planes = batch_state_planes
self.batch_value_results = batch_value_results
self.batch_policy_results = batch_policy_results


#@cython.boundscheck(False)
#@cython.wraparound(False)
def _provide_inference(self, pipe_endings):

print('provide inference...')
#use_random = False
#use_random = True

#cdef double[:, :, :, ::1] batch_state_planes_view = self.batch_state_planes
#cdef double[::1] batch_value_results_view = self.batch_value_results
#cdef double[:, ::1] batch_policy_results = self.batch_policy_results

send_batches = False #True

while self.running is True:

filled_pipes = connection.wait(pipe_endings)

if filled_pipes:

if True or len(filled_pipes) >= self.batch_size:
if True or len(filled_pipes) >= self.batch_size: # 1

if send_batches is True:
planes_batch = []
pipes_pred_output = []

for pipe in filled_pipes[:self.batch_size]:
while pipe.poll():
planes_batch.append(pipe.recv())
pipes_pred_output.append(pipe)

planes_batch = []
pipes_pred_output = []
# logging.debug('planes_batch length: %d %d' % (len(planes_batch), len(filled_pipes)))
state_planes_mxnet = mx.nd.array(planes_batch, ctx=self.net.get_ctx())
else:
planes_ids = []
pipes_pred_output = []

for pipe in filled_pipes[:self.batch_size]:
while pipe.poll():
planes_batch.append(pipe.recv())
pipes_pred_output.append(pipe)
for pipe in filled_pipes[:self.batch_size]:
while pipe.poll():
planes_ids.append(pipe.recv())
pipes_pred_output.append(pipe)

#logging.debug('planes_batch length: %d %d' % (len(planes_batch), len(filled_pipes)))
planes_batch = mx.nd.array(planes_batch, ctx=self.net.get_ctx())
#logging.debug('planes_batch length: %d %d' % (len(planes_batch), len(filled_pipes)))
state_planes_mxnet = mx.nd.array(self.batch_state_planes[planes_ids], ctx=self.net.get_ctx())

#pred = self.net.get_executor().forward(is_train=False, data=planes_batch)
pred = self.net.get_net()(planes_batch)

#print(len(state_planes_mxnet))
executor = self.net.executors[len(state_planes_mxnet)-1]
pred = executor.forward(is_train=False, data=state_planes_mxnet)
#pred = self.net.get_net()(state_planes_mxnet)
#print('pred: %.3f' % (time()-t_s)*1000)
#t_s = time()

value_preds = pred[0].asnumpy()

# renormalize to [0,1]
#value_preds += 1
#value_preds /= 2

# for the policy prediction we still have to apply the softmax activation
# because it's not done by the neural net
#policy_preds = pred[1].softmax().asnumpy()
policy_preds = pred[1].softmax().asnumpy()

#if use_random is True:
Expand All @@ -77,10 +116,20 @@ def _provide_inference(self, pipe_endings):

# send the predictions back to the according workers
for i, pipe in enumerate(pipes_pred_output):
pipe.send([value_preds[i], policy_preds[i]])

# reset the timer
self.time_start = time()
if send_batches is True:
pipe.send([value_preds[i], policy_preds[i]])
else:
# get the according channel index for setting the result
channel_idx = planes_ids[i]

# set the value result
self.batch_value_results[channel_idx] = value_preds[i]
self.batch_policy_results[channel_idx] = policy_preds[i]
# give the thread the signal that the result has been set by sending back its channel_idx
pipe.send(channel_idx)

#print('send back res: %.3f' % (time()-t_s)*1000)

def start(self):
print('start inference thread...')
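
A minimal sketch of the worker-thread side of this shared-memory protocol, assuming each search thread owns one fixed channel index and the arrays described in the constructor docstring (the helper name is illustrative, not part of the diff):

def request_prediction(pipe, channel_idx, state_planes,
                       batch_state_planes, batch_value_results, batch_policy_results):
    # write the request into this thread's channel of the shared planes array
    batch_state_planes[channel_idx] = state_planes
    # announce the request by sending the channel index to the prediction service
    pipe.send(channel_idx)
    # block until the service signals that the results for this channel were written back
    pipe.recv()
    return batch_value_results[channel_idx], batch_policy_results[channel_idx]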
