Merge remote-tracking branch 'upstream/master' into clean_implement_distribution
takuseno committed Jan 9, 2020
2 parents 96cf86e + 5618029 commit 2aa8449
Showing 37 changed files with 175 additions and 7 deletions.
17 changes: 14 additions & 3 deletions doc/python/api/function.rst
@@ -59,6 +59,7 @@ Neural Network Activation
.. autofunction:: tanh
.. autofunction:: relu
.. autofunction:: softmax
.. autofunction:: log_softmax
.. autofunction:: elu
.. autofunction:: selu
.. autofunction:: crelu
@@ -80,6 +81,7 @@ Normalization
-------------

.. autofunction:: batch_normalization
.. autofunction:: fused_batch_normalization
.. autofunction:: sync_batch_normalization
.. autofunction:: mean_subtraction
.. autofunction:: clip_by_value
@@ -146,6 +148,11 @@ Logical
.. autofunction:: maximum2
.. autofunction:: minimum_scalar
.. autofunction:: maximum_scalar
.. autofunction:: isnan
.. autofunction:: isinf
.. autofunction:: reset_nan
.. autofunction:: reset_inf
.. autofunction:: where


Math
@@ -200,6 +207,8 @@ Array Manipulation
.. autofunction:: batch_inv
.. autofunction:: batch_det
.. autofunction:: assign
.. autofunction:: top_k_data
.. autofunction:: top_k_grad


Stochasticity
@@ -209,8 +218,6 @@ Stochasticity
.. autofunction:: randint
.. autofunction:: randn
.. autofunction:: dropout
.. autofunction:: top_k_data
.. autofunction:: top_k_grad
.. autofunction:: random_choice
.. autofunction:: random_crop
.. autofunction:: random_flip
@@ -253,7 +260,9 @@ Quantized Neural Network Layers
.. autofunction:: min_max_quantize
.. autofunction:: pow2_quantize
.. autofunction:: prune

.. autofunction:: inq_affine
.. autofunction:: inq_convolution


Unsupported, Special Use
------------------------
@@ -262,6 +271,7 @@ Unsupported, Special Use
.. autofunction:: unlink
.. autofunction:: sink
.. autofunction:: warp_by_flow
.. autofunction:: confusion_matrix


Image Object Detection
@@ -274,3 +284,4 @@ Validation
----------

.. autofunction:: top_n_error
.. autofunction:: binary_error
13 changes: 10 additions & 3 deletions doc/python/api/parametric_function.rst
@@ -71,6 +71,7 @@ Here is the list of parametric functions.
.. autofunction:: deconvolution
.. autofunction:: depthwise_deconvolution
.. autofunction:: batch_normalization
.. autofunction:: fused_batch_normalization
.. autofunction:: sync_batch_normalization
.. autofunction:: mean_subtraction
.. autofunction:: layer_normalization
@@ -113,9 +114,9 @@ Here is the list of parametric functions.
.. autofunction:: spectral_norm
.. autofunction:: weight_normalization
.. autofunction:: multi_head_attention
.. autoclass:: transformer
.. autoclass:: transformer_encode
.. autoclass:: transformer_decode
.. autofunction:: transformer
.. autofunction:: transformer_encode
.. autofunction:: transformer_decode

Parameter Initializer
---------------------
@@ -138,6 +139,12 @@ listed below.
.. autoclass:: UniformInitializer
:show-inheritance:

.. autoclass:: UniformIntInitializer
:show-inheritance:

.. autoclass:: RangeInitializer
:show-inheritance:

.. autoclass:: OrthogonalInitializer
:show-inheritance:

2 changes: 2 additions & 0 deletions doc/python/api/solver.rst
@@ -28,3 +28,5 @@ List of solvers
.. autofunction:: Adamax
.. autofunction:: AMSGRAD
.. autofunction:: AMSBound
.. autofunction:: AdamW
.. autofunction:: SgdW
25 changes: 25 additions & 0 deletions include/nbla/solver.hpp
@@ -165,6 +165,11 @@ class NBLA_API Solver {
*/
void weight_decay(float decay_rate);

/** Clip gradients by norm.
The norm is calculated at each variable.
*/
void clip_grad_by_norm(float norm);

/** Check if there is any inf on the gradients which were setup.
*/
bool check_inf_grad();
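Spelled out, the per-variable clipping declared in this hunk (and, reading the implementation added in include/nbla/solver/clip_grad.hpp further down in this diff) rescales a gradient g only when its L2 norm exceeds the threshold:

$$
g \leftarrow
\begin{cases}
\dfrac{\text{clip\_norm}}{\lVert g \rVert_2}\, g, & \text{if } \lVert g \rVert_2 > \text{clip\_norm}, \\
g, & \text{otherwise.}
\end{cases}
$$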
@@ -225,6 +230,15 @@ class NBLA_API Solver {
virtual void weight_decay_impl(const string &key, VariablePtr param,
float decay_rate) = 0;

/** Clip gradients by norm implementation.
@param key Key of parameter.
@param param A parameter Variable.
@param norm A value of norm.
*/
virtual void clip_grad_by_norm_impl(const string &key, VariablePtr param,
float clip_norm) = 0;

/** Check if there is any inf on the gradients which were setup.
*/
virtual bool check_inf_grad_impl(const string &key, VariablePtr param) = 0;
@@ -258,6 +272,17 @@ class NBLA_API Solver {
WEIGHT_DECAY_FUNC<T>(this->ctx_, param, decay_rate); \
}

#define NBLA_DECL_CLIP_GRAD_BY_NORM() \
virtual void clip_grad_by_norm_impl(const string &key, VariablePtr param, \
float clip_norm)

#define NBLA_DEF_CLIP_GRAD_BY_NORM(SOLVER, CLIP_GRAD_BY_NORM_FUNC) \
template <typename T> \
void SOLVER<T>::clip_grad_by_norm_impl(const string &key, VariablePtr param, \
float clip_norm) { \
CLIP_GRAD_BY_NORM_FUNC<T>(this->ctx_, param, clip_norm); \
}

#define NBLA_DECL_CHECK_INF_GRAD() \
virtual bool check_inf_grad_impl(const string &key, VariablePtr param)

1 change: 1 addition & 0 deletions include/nbla/solver/adabound.hpp
@@ -54,6 +54,7 @@ template <typename T> class NBLA_API AdaBound : public Solver {
virtual void remove_state_impl(const string &key);
virtual void update_impl(const string &key, VariablePtr param);
NBLA_DECL_WEIGHT_DECAY();
NBLA_DECL_CLIP_GRAD_BY_NORM();
NBLA_DECL_CHECK_INF_GRAD();
NBLA_DECL_CHECK_NAN_GRAD();
NBLA_DECL_CHECK_INF_OR_NAN_GRAD();
1 change: 1 addition & 0 deletions include/nbla/solver/adadelta.hpp
@@ -61,6 +61,7 @@ template <typename T> class NBLA_API Adadelta : public Solver {
virtual void remove_state_impl(const string &key);
virtual void update_impl(const string &key, VariablePtr param);
NBLA_DECL_WEIGHT_DECAY();
NBLA_DECL_CLIP_GRAD_BY_NORM();
NBLA_DECL_CHECK_INF_GRAD();
NBLA_DECL_CHECK_NAN_GRAD();
NBLA_DECL_CHECK_INF_OR_NAN_GRAD();
1 change: 1 addition & 0 deletions include/nbla/solver/adagrad.hpp
@@ -59,6 +59,7 @@ template <typename T> class NBLA_API Adagrad : public Solver {
virtual void remove_state_impl(const string &key);
virtual void update_impl(const string &key, VariablePtr param);
NBLA_DECL_WEIGHT_DECAY();
NBLA_DECL_CLIP_GRAD_BY_NORM();
NBLA_DECL_CHECK_INF_GRAD();
NBLA_DECL_CHECK_NAN_GRAD();
NBLA_DECL_CHECK_INF_OR_NAN_GRAD();
1 change: 1 addition & 0 deletions include/nbla/solver/adam.hpp
@@ -62,6 +62,7 @@ template <typename T> class NBLA_API Adam : public Solver {
virtual void remove_state_impl(const string &key);
virtual void update_impl(const string &key, VariablePtr param);
NBLA_DECL_WEIGHT_DECAY();
NBLA_DECL_CLIP_GRAD_BY_NORM();
NBLA_DECL_CHECK_INF_GRAD();
NBLA_DECL_CHECK_NAN_GRAD();
NBLA_DECL_CHECK_INF_OR_NAN_GRAD();
1 change: 1 addition & 0 deletions include/nbla/solver/adamax.hpp
@@ -63,6 +63,7 @@ template <typename T> class NBLA_API Adamax : public Solver {
virtual void remove_state_impl(const string &key);
virtual void update_impl(const string &key, VariablePtr param);
NBLA_DECL_WEIGHT_DECAY();
NBLA_DECL_CLIP_GRAD_BY_NORM();
NBLA_DECL_CHECK_INF_GRAD();
NBLA_DECL_CHECK_NAN_GRAD();
NBLA_DECL_CHECK_INF_OR_NAN_GRAD();
1 change: 1 addition & 0 deletions include/nbla/solver/adamw.hpp
@@ -65,6 +65,7 @@ template <typename T> class NBLA_API AdamW : public Solver {
virtual void remove_state_impl(const string &key);
virtual void update_impl(const string &key, VariablePtr param);
NBLA_DECL_WEIGHT_DECAY();
NBLA_DECL_CLIP_GRAD_BY_NORM();
NBLA_DECL_CHECK_INF_GRAD();
NBLA_DECL_CHECK_NAN_GRAD();
NBLA_DECL_CHECK_INF_OR_NAN_GRAD();
1 change: 1 addition & 0 deletions include/nbla/solver/amsbound.hpp
@@ -56,6 +56,7 @@ template <typename T> class NBLA_API AMSBound : public Solver {
virtual void remove_state_impl(const string &key);
virtual void update_impl(const string &key, VariablePtr param);
NBLA_DECL_WEIGHT_DECAY();
NBLA_DECL_CLIP_GRAD_BY_NORM();
NBLA_DECL_CHECK_INF_GRAD();
NBLA_DECL_CHECK_NAN_GRAD();
NBLA_DECL_CHECK_INF_OR_NAN_GRAD();
1 change: 1 addition & 0 deletions include/nbla/solver/amsgrad.hpp
@@ -70,6 +70,7 @@ template <typename T> class NBLA_API AMSGRAD : public Solver {
virtual void remove_state_impl(const string &key);
virtual void update_impl(const string &key, VariablePtr param);
NBLA_DECL_WEIGHT_DECAY();
NBLA_DECL_CLIP_GRAD_BY_NORM();
NBLA_DECL_CHECK_INF_GRAD();
NBLA_DECL_CHECK_NAN_GRAD();
NBLA_DECL_CHECK_INF_OR_NAN_GRAD();
41 changes: 41 additions & 0 deletions include/nbla/solver/clip_grad.hpp
@@ -0,0 +1,41 @@
// Copyright (c) 2017 Sony Corporation. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#ifndef __NBLA_SOLVER_CLIP_GRAD_HPP__
#define __NBLA_SOLVER_CLIP_GRAD_HPP__

#include <nbla/context.hpp>
#include <nbla/variable.hpp>

#include <memory>

namespace nbla {

template <typename T>
void clip_grad_by_norm_cpu(const Context &ctx, const shared_ptr<Variable> param,
float clip_norm) {
Size_t size = param->size();
T *grad = param->cast_grad_and_get_pointer<T>(ctx);
T sum = 0;
for (int i = 0; i < size; ++i)
sum += grad[i] * grad[i];
// The sum > 0.0 check avoids taking sqrt of zero (and dividing by it) for an all-zero gradient.
if (sum > 0.0 && sum > clip_norm * clip_norm) {
T norm = std::sqrt(sum);
for (int i = 0; i < size; ++i)
grad[i] = clip_norm * grad[i] / norm;
}
}
}
#endif
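For readers following along in Python, here is a minimal NumPy sketch mirroring clip_grad_by_norm_cpu above; the function name and the sample values are illustrative only and not part of the library:

import numpy as np

def clip_grad_by_norm_ref(grad, clip_norm):
    # Mirror of clip_grad_by_norm_cpu: rescale only when the squared L2 norm exceeds clip_norm^2.
    sq_sum = float(np.sum(grad * grad))
    if sq_sum > 0.0 and sq_sum > clip_norm * clip_norm:
        return clip_norm * grad / np.sqrt(sq_sum)
    return grad

g = np.array([3.0, 4.0])              # L2 norm 5.0
print(clip_grad_by_norm_ref(g, 0.5))  # -> [0.3 0.4], i.e. rescaled to norm 0.5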
1 change: 1 addition & 0 deletions include/nbla/solver/lars.hpp
@@ -64,6 +64,7 @@ template <typename T> class NBLA_API Lars : public Solver {
virtual void remove_state_impl(const string &key) override;
virtual void update_impl(const string &key, VariablePtr param) override;
NBLA_DECL_WEIGHT_DECAY();
NBLA_DECL_CLIP_GRAD_BY_NORM();
NBLA_DECL_CHECK_INF_GRAD();
NBLA_DECL_CHECK_NAN_GRAD();
NBLA_DECL_CHECK_INF_OR_NAN_GRAD();
1 change: 1 addition & 0 deletions include/nbla/solver/momentum.hpp
@@ -56,6 +56,7 @@ template <typename T> class NBLA_API Momentum : public Solver {
virtual void remove_state_impl(const string &key);
virtual void update_impl(const string &key, VariablePtr param);
NBLA_DECL_WEIGHT_DECAY();
NBLA_DECL_CLIP_GRAD_BY_NORM();
NBLA_DECL_CHECK_INF_GRAD();
NBLA_DECL_CHECK_NAN_GRAD();
NBLA_DECL_CHECK_INF_OR_NAN_GRAD();
1 change: 1 addition & 0 deletions include/nbla/solver/nesterov.hpp
@@ -55,6 +55,7 @@ template <typename T> class NBLA_API Nesterov : public Solver {
virtual void remove_state_impl(const string &key);
virtual void update_impl(const string &key, VariablePtr param);
NBLA_DECL_WEIGHT_DECAY();
NBLA_DECL_CLIP_GRAD_BY_NORM();
NBLA_DECL_CHECK_INF_GRAD();
NBLA_DECL_CHECK_NAN_GRAD();
NBLA_DECL_CHECK_INF_OR_NAN_GRAD();
1 change: 1 addition & 0 deletions include/nbla/solver/rmsprop.hpp
@@ -60,6 +60,7 @@ template <typename T> class NBLA_API RMSprop : public Solver {
virtual void remove_state_impl(const string &key);
virtual void update_impl(const string &key, VariablePtr param);
NBLA_DECL_WEIGHT_DECAY();
NBLA_DECL_CLIP_GRAD_BY_NORM();
NBLA_DECL_CHECK_INF_GRAD();
NBLA_DECL_CHECK_NAN_GRAD();
NBLA_DECL_CHECK_INF_OR_NAN_GRAD();
1 change: 1 addition & 0 deletions include/nbla/solver/sgd.hpp
@@ -45,6 +45,7 @@ template <typename T> class NBLA_API Sgd : public Solver {
virtual void remove_state_impl(const string &key);
virtual void update_impl(const string &key, VariablePtr param);
NBLA_DECL_WEIGHT_DECAY();
NBLA_DECL_CLIP_GRAD_BY_NORM();
NBLA_DECL_CHECK_INF_GRAD();
NBLA_DECL_CHECK_NAN_GRAD();
NBLA_DECL_CHECK_INF_OR_NAN_GRAD();
1 change: 1 addition & 0 deletions include/nbla/solver/sgdw.hpp
@@ -52,6 +52,7 @@ template <typename T> class NBLA_API SgdW : public Solver {
virtual void remove_state_impl(const string &key);
virtual void update_impl(const string &key, VariablePtr param);
NBLA_DECL_WEIGHT_DECAY();
NBLA_DECL_CLIP_GRAD_BY_NORM();
NBLA_DECL_CHECK_INF_GRAD();
NBLA_DECL_CHECK_NAN_GRAD();
NBLA_DECL_CHECK_INF_OR_NAN_GRAD();
1 change: 1 addition & 0 deletions python/src/nnabla/solver.pxd.tmpl
@@ -47,6 +47,7 @@ cdef extern from "nbla/solver.hpp" namespace "nbla":
void set_states(vector[pair[string, CSolverState]]) except +
void update() nogil except +
void weight_decay(float decay_rate) nogil except +
void clip_grad_by_norm(float clip_norm) nogil except +
cpp_bool check_inf_grad() nogil except +
cpp_bool check_nan_grad() nogil except +
cpp_bool check_inf_or_nan_grad() nogil except +
13 changes: 13 additions & 0 deletions python/src/nnabla/solver.pyx.tmpl
@@ -76,6 +76,7 @@ cdef class Solver:
solver.zero_grad() # All gradient buffer being 0
loss.backward()
solver.weight_decay(decay_rate) # Apply weight decay
solver.clip_grad_by_norm(clip_norm) # Apply clip grad by norm
solver.update() # updating parameters

Note:
@@ -353,6 +354,18 @@ cdef class Solver:
with nogil:
self.solverp.weight_decay(decay_rate)

def clip_grad_by_norm(self, float clip_norm):
"""
Clip gradients by norm.
The norm is computed for each parameter variable; a gradient whose L2 norm exceeds ``clip_norm`` is rescaled so that its norm equals ``clip_norm``.

Args:
clip_norm (float): Maximum L2 norm allowed for each parameter's gradient.
"""

with nogil:
self.solverp.clip_grad_by_norm(clip_norm)

def check_inf_grad(self, ):
"""
Check if there is any inf on the gradients which were setup.
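For orientation, here is a slightly fuller version of the training-step example from the docstring above, as a hedged sketch: the toy network, data, and hyperparameter values are placeholders, and only the order of the solver calls follows the docstring.

import numpy as np
import nnabla as nn
import nnabla.functions as F
import nnabla.parametric_functions as PF
import nnabla.solvers as S

x = nn.Variable((8, 10))
t = nn.Variable((8, 1))
y = PF.affine(x, 1)
loss = F.mean(F.squared_error(y, t))

solver = S.Sgd(lr=0.1)
solver.set_parameters(nn.get_parameters())

x.d = np.random.randn(*x.shape)
t.d = np.random.randn(*t.shape)

solver.zero_grad()               # zero all gradient buffers
loss.forward()
loss.backward()
solver.weight_decay(1e-4)        # apply weight decay
solver.clip_grad_by_norm(0.5)    # clip each parameter's gradient to L2 norm <= 0.5
solver.update()                  # update parameters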
15 changes: 14 additions & 1 deletion python/test/solver/solver_test_utils.py
@@ -46,9 +46,14 @@ def weight_decay(self, grads, decay_rate):
param = self.params[key]
grad[...] = grad + decay_rate * param

def clip_grad_by_norm(self, grads, clip_norm):
for key, grad in iteritems(grads):
norm = np.sqrt(np.sum(grad ** 2))
grad[...] = clip_norm * grad / max(clip_norm, norm)


def solver_tester(rng, solver, ref_solver, solver_args=[], solver_kwargs={},
num_itr=5, decay=1e-4, atol=1e-6,
num_itr=5, decay=1e-4, clip_norm=0.5, atol=1e-6,
ctx=None, solver_name=None):
if ctx is None:
ctx = nn.Context()
@@ -89,6 +94,14 @@ def solver_tester(rng, solver, ref_solver, solver_args=[], solver_kwargs={},
for p, ref_p in zip(params.values(), grad_copy.values()):
assert_allclose(ref_p, p.g, atol=atol)

# Check clip grad by norm.
grad_copy = OrderedDict([(k, p.g.copy())
for k, p in iteritems(params)])
s.clip_grad_by_norm(clip_norm)
ref_s.clip_grad_by_norm(grad_copy, clip_norm)
for p, ref_p in zip(params.values(), grad_copy.values()):
assert np.allclose(ref_p, p.g, atol=atol)

# Check solver update.
for i in range(num_itr):
grads = OrderedDict([(k, rng.randn(*p.shape))
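A note on the reference formula in the test helper above: dividing by max(clip_norm, norm) is a branch-free form of the same rule, giving a factor of 1 when norm <= clip_norm and clip_norm/norm otherwise. A small self-contained check (with arbitrary values) that it agrees with the conditional form used in clip_grad.hpp:

import numpy as np

rng = np.random.RandomState(0)
clip_norm = 0.5
for _ in range(5):
    g = rng.randn(10)
    norm = np.sqrt(np.sum(g ** 2))
    conditional = clip_norm * g / norm if norm > clip_norm else g  # form used in clip_grad.hpp
    branch_free = clip_norm * g / max(clip_norm, norm)             # form used in the test helper above
    assert np.allclose(conditional, branch_free)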
13 changes: 13 additions & 0 deletions src/nbla/solver.cpp
@@ -145,6 +145,19 @@ void Solver::weight_decay(float decay_rate) {
}
}

void Solver::clip_grad_by_norm(float norm) {
if (norm == 0)
return;
for (auto &kv : params_) {
SyncedArrayPtr g = kv.second.p->grad()->array();
if (g->zeroing()) {
// The gradient is not computed. Skip.
continue;
}
clip_grad_by_norm_impl(kv.first, kv.second.p, norm);
}
}

bool Solver::check_inf_grad() {
for (auto &kv : params_) {
SyncedArrayPtr g = kv.second.p->grad()->array();
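One behavioural detail from the early return in Solver::clip_grad_by_norm above: passing 0 makes the call a no-op, so a caller can switch clipping off without branching. The snippet below is a hypothetical caller-side pattern, not code from this commit; use_clipping is a placeholder flag.

import nnabla as nn
import nnabla.parametric_functions as PF
import nnabla.solvers as S

_ = PF.affine(nn.Variable((4, 3)), 2)   # create a parameter so the solver has something to hold
solver = S.Sgd(lr=0.01)
solver.set_parameters(nn.get_parameters())

use_clipping = False                    # placeholder flag
solver.clip_grad_by_norm(0.5 if use_clipping else 0.0)  # 0 returns immediately; nothing is clipped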