Add allow_rebase_history flag and fix codegen functions for multiple views (pytorch#32790)

Summary:
Pull Request resolved: pytorch#32790

Same as pytorch#31990 but without the first commit in the stack that is problematic for a lot of people.

Test Plan: Imported from OSS

Differential Revision: D19814116

Pulled By: albanD

fbshipit-source-id: d104911a5b098a5807b4bc08b69803ebd4f69fa6
albanD authored and facebook-github-bot committed Feb 11, 2020
1 parent 330d051 commit 3655975
Showing 9 changed files with 124 additions and 96 deletions.
4 changes: 2 additions & 2 deletions test/cpp/api/init.cpp
@@ -105,7 +105,7 @@ TEST(InitTest, CanInitializeTensorThatRequiresGrad) {
ASSERT_THROWS_WITH(
tensor.fill_(1),
"a leaf Variable that requires grad "
"has been used in an in-place operation");
"is being used in an in-place operation");
ASSERT_EQ(torch::nn::init::ones_(tensor).sum().item<int32_t>(), 12);
}

@@ -168,4 +168,4 @@ TEST(InitTest, FanModeLegacyEnum) {

FANMODE_ENUM_LEGACY_WARNING_CHECK(kaiming_uniform_, FanIn, "torch::kFanIn")
FANMODE_ENUM_LEGACY_WARNING_CHECK(kaiming_uniform_, FanOut, "torch::kFanOut")
}
}
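The following is a minimal Python sketch of the behavior this C++ test exercises, assuming a PyTorch build that includes this change; the tensor shape is illustrative and only the error wording comes from the diff above.

import torch

# In-place on a leaf that requires grad is rejected; this commit only rewords
# the message from "has been used" to "is being used".
t = torch.empty(3, 4, requires_grad=True)
try:
    t.fill_(1)
except RuntimeError as err:
    print(err)  # "a leaf Variable that requires grad is being used in an in-place operation."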
6 changes: 3 additions & 3 deletions test/test_autograd.py
Expand Up @@ -3516,11 +3516,11 @@ def maybe_check_raise(fn, should_raise):
# The 3 elements are for view_as, first output of unbind and second output of unbind
run_test(grad_mode=True, requires_grad=False, is_view=True,
should_raise_tuple=(None, None, None))
# TODO: Second should_raise should not be None below, third one should not raise an internal assert
inp_change_err = "The {}th output of UnbindBackward is being modified inplace but this is not allowed"
run_test(grad_mode=True, requires_grad=True, is_view=True,
should_raise_tuple=(None, None, "diff_view_meta->output_nr_ == 0 INTERNAL ASSERT FAILED"))
should_raise_tuple=(None, inp_change_err.format("0"), inp_change_err.format("1")))
# TODO: views require gradients when created in no_grad mode but their grad_fn is not populated
leaf_grad_err = "a leaf Variable that requires grad has been used in an in-place operation."
leaf_grad_err = "a leaf Variable that requires grad is being used in an in-place operation."
run_test(grad_mode=False, requires_grad=True, is_view=True,
should_raise_tuple=(leaf_grad_err, leaf_grad_err, leaf_grad_err))
run_test(grad_mode=False, requires_grad=False, is_view=True,
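A hedged Python sketch of the scenario the updated test now covers: in-place modification of one output of a multi-output view op such as unbind raises a descriptive error (per the inp_change_err template above) instead of tripping an internal assert. Variable names are illustrative.

import torch

base = torch.randn(2, 3, requires_grad=True).clone()  # non-leaf base
first, second = base.unbind(0)
try:
    second.add_(1)
except RuntimeError as err:
    # e.g. "The 1th output of UnbindBackward is being modified inplace but this is not allowed ..."
    print(err)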
11 changes: 11 additions & 0 deletions tools/autograd/derivatives.yaml
@@ -94,6 +94,17 @@
# used by our frontend. You MUST check the frontend code; search for
# OpName.apply to see if it's still using a legacy Python style API.
#
# Note: Returning views.
# The following cases exist:
# - If a function returns no view, it can have arbitrary outputs.
# - If a function returns at least one Tensor that is a differentiable view
#   of one of its inputs:
#   - If there is only one differentiable output, this Tensor is marked as a
#     differentiable view. (alias or transpose for example)
#   - If there is more than one differentiable output, by default all the views are
#     marked as differentiable views and created with allow_rebase_history=false,
#     meaning that any inplace operation on them will raise an error. (unbind for example)
#
# NB: The parameter names here MUST be consistent with the parameter names
# in Declarations.yaml
- name: abs(Tensor self) -> Tensor
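For contrast with the multi-output case, here is a small Python sketch of the single-differentiable-view case described in the note above (transpose), where in-place modification stays allowed and the view's history is rebased; the printed gradient follows from standard autograd semantics.

import torch

leaf = torch.ones(2, 3, requires_grad=True)
base = leaf.clone()   # non-leaf base so the in-place op is permitted
view = base.t()       # single differentiable view -> rebasing its history is allowed
view.mul_(2)          # ok: autograd rewires the graph through the modified view
view.sum().backward()
print(leaf.grad)      # every entry is 2., i.e. d(sum(2 * leaf)) / d(leaf)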
5 changes: 1 addition & 4 deletions tools/autograd/gen_autograd.py
@@ -30,9 +30,7 @@
from .utils import YamlLoader, split_name_params

# See NOTE [ Autograd View Variables ] in variable.h for details.
# A map: function name => two options:
# 1. name of the argument that all outputs are view of
# 2. map: output idx => name of the argument that this result is view of
# A map: function name => name of the argument that all outputs are view of
VIEW_FUNCTIONS = {
'numpy_T': 'self',
'alias': 'self',
@@ -67,7 +65,6 @@
# returns a view of its inputs
RETURNS_VIEWS_OF_INPUT = set(VIEW_FUNCTIONS.keys()).union({'chunk', 'split'})


def format_return_type(returns):
if len(returns) == 0:
return 'void'
81 changes: 30 additions & 51 deletions tools/autograd/gen_variable_type.py
@@ -562,6 +562,9 @@ def emit_body(declaration):

base_name = name[:-1] if inplace else name[:-4] if is_out_fn else name
view_info = VIEW_FUNCTIONS.get(base_name, None)
# TODO: Add back when https://github.com/pytorch/pytorch/pull/32044 lands again
# if view_info is None and base_name in RETURNS_VIEWS_OF_INPUT:
# view_info = "self"

def is_differentiable(arg):
if 'TensorOptions' in arg['type']:
@@ -759,59 +762,38 @@ def declare_returned_variables():
return '\n'.join(names)

def wrap_output(call):
# Returns a 2-tuple `(wrapped_call, extra_wrapping_stmts)`, where
# `wrapped_call` is to drop-in replace `call`, and
# `extra_wrapping_stmts` is a list of extra statements to run after
# `call`.
# Returns `wrapped_call` which is a drop-in replacement for `call`
if 'Tensor' not in declaration['return_type']:
return call, []
return call
elif view_info is not None:
# See NOTE [ Autograd View Variables ] in variable.h for details.
differentiable_output_vars = {r['name'] for r in differentiable_outputs}
tensor_output_vars = {r['name'] for r in returns if 'Tensor' in r['type']}
if not isinstance(view_info, dict):
if len(differentiable_output_vars) == len(tensor_output_vars):
# all outputs are differentiable
return 'as_view({}, {}, true)'.format(view_info, call), []
elif len(differentiable_output_vars) == 0:
# no output is differentiable
return 'as_view({}, {}, false)'.format(view_info, call), []
else:
# some of the outputs are differentiable
# need to expand to dict mode, i.e., one entry per output
base_name = view_info
view_info_dict = {}
for i, return_info in enumerate(returns):
if 'Tensor' in return_info['type']:
view_info_dict[i] = base_name

if not isinstance(view_info, str):
raise TypeError("The view info should be a string for {}, but it is: {}".format(base_name, view_info))

if len(differentiable_output_vars) == 0:
# no output is differentiable (.indices() for SparseTensors for example)
return 'as_view({}, {}, /* is_differentiable */ false)'.format(view_info, call)
elif len(differentiable_output_vars) == 1:
# Single differentiable output (Tensor or Tensor[])
return_info = differentiable_outputs[0]
# We only support simple Tensor or a TensorList for functions that return views
if not return_info['dynamic_type'] in ['Tensor', 'TensorList']:
raise RuntimeError("{} that return differentiable views can only return Tensor or Tensor[]".format(base_name))
# Only allow rebasing of the history if we return a single Tensor
allow_rebase_history = 'true'
if return_info['dynamic_type'] == 'TensorList':
allow_rebase_history = 'false'
wrapped_call = ("as_view(/* base */{}, /* output */ {}, /* is_differentiable */ true, "
"/* allow_rebase_history */ {})").format(view_info, call, allow_rebase_history)
return wrapped_call
else:
view_info_dict = view_info

def wrap_view_single(output_var, base_var):
fmt = '{output_var} = as_view({base_var}, {output_var}, {is_differentiable});'
if output_var in differentiable_output_vars:
# If `GradMode::is_enabled()` is False, this is a
# non-differentiable view. Gradients should not flow through.
is_differentiable = 'true'
else:
# This output is non-differentiable, so it is a
# non-differentiable view. Gradients should not flow through.
is_differentiable = 'false'
return fmt.format(output_var=output_var, base_var=base_var,
is_differentiable=is_differentiable)

extra_wrapping_stmts = []
for output_idx, return_info in enumerate(returns):
if 'Tensor' not in return_info['type']:
assert output_idx not in view_info_dict, 'Can not wrap non-Tensor output as a view'
continue
output_var = return_info['name']
if output_idx in view_info_dict:
stmt = wrap_view_single(output_var, view_info_dict[output_idx])
extra_wrapping_stmts.append(stmt)
return call, extra_wrapping_stmts
# This could be supported but we don't need it at the moment, so keeping things simple.
raise RuntimeError("Functions that return multiple differentiable outputs "
"when at least one of them is a view are not supported.")
else:
return 'std::move({})'.format(call), []
return 'std::move({})'.format(call)

def enforce_same_tensorimpl_and_storage(env, call):
save_ptrs_stmts = []
@@ -838,7 +820,6 @@ def enforce_same_tensorimpl_and_storage(env, call):

def emit_call(env):
combined = nested_dict(env, declaration)
extra_wrapping_stmts = []
if strategy == 'use_derived':
# We only care about adding `at::AutoNonVariableTypeMode` guard for non-variable dispatch
# (which corresponds to 'use_derived' strategy). The purpose of this guard is to make sure
@@ -852,7 +833,7 @@
base_type_call = CALL_DISPATCH_VIA_METHOD.substitute(
combined, unpacked_method_args=unpacked_method_args)
if not modifies_arguments and not returns_void:
rhs_value, extra_wrapping_stmts = wrap_output('tmp')
rhs_value = wrap_output('tmp')
call = DISPATCH_TO_NON_VAR_TYPE_WITH_RETURN_VALUES.substitute(
base_type_call=base_type_call,
return_values=tie_return_values(),
@@ -865,8 +846,6 @@
if not modifies_arguments and not returns_void:
call = '{} = {}'.format(tie_return_values(), call)
call = call + ';'
for stmt in extra_wrapping_stmts:
call += '\n' + stmt
call = enforce_same_tensorimpl_and_storage(env, call)
return call

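To make the new single-view branch concrete, here is a standalone Python sketch (not the real codegen) of the call strings wrap_output now emits for a Tensor view versus a TensorList view; the helper name and example arguments are invented for illustration.

def sketch_wrap_output(view_info, call, return_dynamic_type):
    # Mirrors the Tensor/TensorList branch of wrap_output in the diff above.
    allow_rebase_history = 'false' if return_dynamic_type == 'TensorList' else 'true'
    return ("as_view(/* base */ {}, /* output */ {}, /* is_differentiable */ true, "
            "/* allow_rebase_history */ {})").format(view_info, call, allow_rebase_history)

# A single-Tensor view op (alias, transpose, ...): rebasing stays enabled.
print(sketch_wrap_output('self', 'tmp', 'Tensor'))
# A TensorList view op (unbind, ...): rebasing is disabled.
print(sketch_wrap_output('self', 'tmp', 'TensorList'))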
2 changes: 1 addition & 1 deletion torch/csrc/autograd/VariableTypeManual.cpp
@@ -255,7 +255,7 @@ Tensor detach(const Tensor & self) {

}
// <NON_GENERATED_CODE>
auto result = make_variable_view(self, self, /*is_differentiable=*/false, /*allow_tensor_metadata_change=*/false);
auto result = make_variable_non_differentiable_view(self, self, /*allow_tensor_metadata_change=*/false);
namedinference::propagate_names(result, self);
// </NON_GENERATED_CODE>
if (jit::tracer::isTracing()) {
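detach() is the canonical non-differentiable view now built through make_variable_non_differentiable_view; a quick Python illustration of its standard semantics, with illustrative names.

import torch

x = torch.ones(4, requires_grad=True)
d = x.detach()                        # non-differentiable view of x
print(d.requires_grad)                # False: gradients do not flow through d
print(d.data_ptr() == x.data_ptr())   # True: storage is shared with x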
37 changes: 29 additions & 8 deletions torch/csrc/autograd/VariableTypeUtils.h
@@ -41,9 +41,20 @@ namespace torch { namespace autograd {

inline void check_inplace(const Tensor& tensor) {
auto& var = static_cast<const Variable&>(tensor);
if (var.requires_grad() && var.is_leaf() && GradMode::is_enabled()) {
AT_ERROR(
"a leaf Variable that requires grad has been used in an in-place operation.");
if (var.requires_grad() && GradMode::is_enabled()) {
if (var.is_leaf()) {
AT_ERROR(
"a leaf Variable that requires grad is being used in an in-place operation.");
} else if (var.is_view()) {
// NB: is_view() ==> get_autograd_meta()
auto diff_view_meta = static_cast<DifferentiableViewMeta*>(impl::get_autograd_meta(var));
auto grad_fn = impl::grad_fn_unsafe(var);
// NB: !var.is_leaf() ==> grad_fn != nullptr
TORCH_INTERNAL_ASSERT(grad_fn);
TORCH_CHECK(diff_view_meta->allow_rebase_history,
"The ", diff_view_meta->output_nr_, "th output of ", grad_fn->name(),
" is being modified inplace but this is not allowed as it would prevent correct gradient computation.");
}
}
}

@@ -97,23 +108,33 @@ template<typename... Args> inline variable_list flatten_tensor_args(Args&&... ar
}

// See NOTE [ Autograd View Variables ] for details.
inline Tensor as_view(const Tensor & base, Tensor tensor, bool is_differentiable = true) {
inline Tensor as_view(const Tensor & base, Tensor tensor, bool is_differentiable, bool allow_rebase_history=true) {
auto base_var = Variable(base);
if (base_var.is_view()) {
base_var = base_var.base();
}
return make_variable_view(std::move(base_var), std::move(tensor), is_differentiable);
if (is_differentiable) {
return make_variable_differentiable_view(std::move(base_var), std::move(tensor), allow_rebase_history);
} else {
TORCH_CHECK(allow_rebase_history, "Non-differentiable views cannot set allow_rebase_history=false");
return make_variable_non_differentiable_view(std::move(base_var), std::move(tensor));
}
}

// See NOTE [ Autograd View Variables ] for details.
inline std::vector<Tensor> as_view(const Tensor & base, std::vector<Tensor> tensors,
bool is_differentiable = true) {
inline std::vector<Tensor> as_view(const Tensor & base, std::vector<Tensor> tensors, bool is_differentiable,
bool allow_rebase_history=true) {
auto base_var = Variable(base);
if (base_var.is_view()) {
base_var = base_var.base();
}
for(Tensor &tensor : tensors) {
tensor = make_variable_view(base_var, std::move(tensor), is_differentiable);
if (is_differentiable) {
tensor = make_variable_differentiable_view(base_var, std::move(tensor), allow_rebase_history);
} else {
TORCH_CHECK(allow_rebase_history, "Non-differentiable views cannot set allow_rebase_history=false");
tensor = make_variable_non_differentiable_view(base_var, std::move(tensor));
}
}
return tensors;
}
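Note that the whole check_inplace body above is guarded by GradMode::is_enabled(), so performing the in-place op with gradient recording disabled sidesteps both errors, at the cost of that op not being tracked. A minimal sketch, assuming the guard behaves as written in the diff:

import torch

base = torch.randn(2, 3, requires_grad=True).clone()
rows = base.unbind(0)
with torch.no_grad():
    rows[0].add_(1)   # no error: check_inplace returns early when grad mode is off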
13 changes: 7 additions & 6 deletions torch/csrc/autograd/variable.cpp
@@ -25,8 +25,8 @@ namespace torch {
namespace autograd {


DifferentiableViewMeta::DifferentiableViewMeta(at::TensorImpl* self_impl, Variable base)
: AutogradMeta(self_impl, false) {
DifferentiableViewMeta::DifferentiableViewMeta(at::TensorImpl* self_impl, Variable base, bool allow_rebase_history)
: AutogradMeta(self_impl), allow_rebase_history(allow_rebase_history) {
base_ = std::move(base);
TORCH_CHECK(base_.defined(), "base is undefined");
if (base_.is_view()) {
@@ -72,12 +72,13 @@ namespace impl {
}

void rebase_history(const Variable& self, Edge gradient_edge) {
AT_ASSERT(gradient_edge.function != nullptr);
TORCH_INTERNAL_ASSERT(gradient_edge.function != nullptr);
if (self.is_view()) {
// NB: is_view() ==> get_autograd_meta()
auto diff_view_meta = static_cast<DifferentiableViewMeta*>(get_autograd_meta(self));
AT_ASSERT(gradient_edge.input_nr == 0);
AT_ASSERT(gradient_edge.function);
TORCH_INTERNAL_ASSERT(diff_view_meta->allow_rebase_history);
TORCH_INTERNAL_ASSERT(gradient_edge.input_nr == 0);
TORCH_INTERNAL_ASSERT(gradient_edge.function);
TORCH_CHECK(
gradient_edge.function->num_inputs() == 1,
"Functions which modify views in-place must return a single Variable");
@@ -323,7 +324,7 @@ const std::shared_ptr<torch::autograd::Node>& VariableHooks::grad_fn(const Tenso
}
auto current_version = self._version();
if (diff_view_meta->attr_version != current_version) {
AT_ASSERT(diff_view_meta->output_nr_ == 0);
TORCH_INTERNAL_ASSERT(diff_view_meta->output_nr_ == 0);
auto fn = std::make_shared<torch::autograd::generated::AsStridedBackward>();
fn->self_geometry = at::TensorGeometry(diff_view_meta->base_);
fn->size = self.sizes().vec();
61 changes: 40 additions & 21 deletions torch/csrc/autograd/variable.h
@@ -295,6 +295,14 @@ struct TORCH_API AutogradMeta : public c10::AutogradMetaInterface {
/// + if an in-place op is done on view, in rebase_history() of view, which is
/// called after every in-place op in VariableType.cpp, the grad_fn of base
/// is updated.
/// + if a single autograd Node returns multiple differentiable views, and any of
/// these outputs is modified by an inplace operation, the autograd engine will
/// build an equivalent graph (corresponding to the view operations) in which
/// each output is treated as if it were produced by a distinct view operation.
/// This discards the original (e.g., user provided) grad_fn. If the provided
/// grad_fn does more than the backward of the view, then the DifferentiableViewMeta
/// must be created with allow_rebase_history=false to prevent the engine from
/// ignoring the provided grad_fn.
///
///
/// Non-Differentiable Views
Expand All @@ -313,8 +321,9 @@ struct TORCH_API AutogradMeta : public c10::AutogradMetaInterface {
/// function are non-differentiable.
/// These are called non-differentiable views as the gradients do not flow
/// through the view relation.
/// Relevant logic for non-differentiable views is implemented in
/// make_variable_view below, and wrap_output of gen_variable_type.py.
///
/// Relevant logic for both differentiable and non-differentiable views is implemented in
/// make_variable_(non_)differentiable_view below, and wrap_output of gen_variable_type.py.
struct TORCH_API DifferentiableViewMeta : public AutogradMeta {
/// The base `Variable` (never a view).
Variable base_;
Expand All @@ -324,11 +333,15 @@ struct TORCH_API DifferentiableViewMeta : public AutogradMeta {
/// version_counter.current_version().
uint32_t attr_version;

/// Boolean flag that signifies if the history of this Tensor can be rebased
/// or if it is forbidden.
bool allow_rebase_history;

bool requires_grad() const override {
return requires_grad_ || grad_fn_ || (is_view_ && base_.requires_grad());
}

DifferentiableViewMeta(at::TensorImpl* self_impl, Variable base);
DifferentiableViewMeta(at::TensorImpl* self_impl, Variable base, bool allow_rebase_history=true);
~DifferentiableViewMeta();
};

@@ -353,28 +366,34 @@ struct TORCH_API DifferentiableViewMeta : public AutogradMeta {
/// prevent those changes from happening and is undesirable.

// See NOTE [ Autograd View Variables ] for details.
inline Variable make_variable_view(
// Differentiable view. Track history with DifferentiableViewMeta.
inline Variable make_variable_differentiable_view(
Variable base,
at::Tensor data,
bool is_differentiable = true,
bool allow_tensor_metadata_change = true) {
bool allow_rebase_history) {
if (data.defined()) {
if (is_differentiable) {
/// Differentiable view. Track history with DifferentiableViewMeta.
auto data_impl_copy = data.getIntrusivePtr()->shallow_copy_and_detach(
/*version_counter=*/0,
/*allow_tensor_metadata_change=*/allow_tensor_metadata_change);
data_impl_copy->set_autograd_meta(std::make_unique<DifferentiableViewMeta>(
data_impl_copy.get(), std::move(base)));
return Variable(data_impl_copy);
} else {
/// Non-differentiable view. Just share version counter.
auto data_impl_copy = data.getIntrusivePtr()->shallow_copy_and_detach(
/*version_counter=*/impl::version_counter(base),
/*allow_tensor_metadata_change=*/allow_tensor_metadata_change);
data_impl_copy->set_autograd_meta(nullptr);
return Variable(data_impl_copy);
auto data_impl_copy = data.getIntrusivePtr()->shallow_copy_and_detach(
/*version_counter=*/0,
/*allow_tensor_metadata_change=*/true);
data_impl_copy->set_autograd_meta(std::make_unique<DifferentiableViewMeta>(
data_impl_copy.get(), std::move(base), allow_rebase_history));
return Variable(data_impl_copy);
}
return Variable();
}

// See NOTE [ Autograd View Variables ] for details.
// Non-differentiable view. Just share version counter.
inline Variable make_variable_non_differentiable_view(
Variable base,
at::Tensor data,
bool allow_tensor_metadata_change = true) {
if (data.defined()) {
auto data_impl_copy = data.getIntrusivePtr()->shallow_copy_and_detach(
/*version_counter=*/impl::version_counter(base),
/*allow_tensor_metadata_change=*/allow_tensor_metadata_change);
data_impl_copy->set_autograd_meta(nullptr);
return Variable(data_impl_copy);
}
return Variable();
}
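The non-differentiable view above still shares the base's version counter (impl::version_counter(base)), which is what lets autograd detect stale saved tensors; a short Python illustration of why that sharing matters, with illustrative names.

import torch

x = torch.ones(3, requires_grad=True)
y = x ** 2        # pow saves x for its backward pass
d = x.detach()    # non-differentiable view sharing x's version counter
d.add_(1)         # modifies x's data and bumps the shared version counter
try:
    y.sum().backward()
except RuntimeError as err:
    print(err)    # version-mismatch error: a saved tensor was modified in place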
