Lightning-AI · IvanYashchuk · Jan 16, 2025 · Nov 19, 2024 · Nov 19, 2024 · Nov 20, 2024
@@ -1,32 +1,24 @@
 from __future__ import annotations
 import operator
 import importlib
-from dataclasses import replace
-from contextlib import ContextDecorator
-from functools import wraps, partial
-from inspect import signature
-from itertools import groupby
+from functools import partial, wraps
 from numbers import Number
 from typing import TYPE_CHECKING
 from collections.abc import Callable
 from collections.abc import Hashable, Sequence
 from collections.abc import Sequence
 from types import ModuleType
-from enum import Enum, auto
 
 import torch
-import math
-from looseversion import LooseVersion
 
+from thunder.core.compile_data import get_compile_data
 import thunder.core.dtypes as dtypes
 from thunder.core.dtypes import to_torch_dtype, to_dtype
 import thunder.core.devices as devices
 from thunder.core.devices import to_torch_device, to_device
 import thunder.core.prims as prims
-from thunder.core.trace import TraceCtx, set_tracectx, reset_tracectx, from_trace
-from thunder.core.proxies import NumberProxy, TensorProxy, FutureTensorProxy, variableify, pytype
-from thunder.core.pytree import tree_flatten, tree_unflatten
-from thunder.core.symbol import Symbol, BoundSymbol
+from thunder.core.proxies import NumberProxy, TensorProxy, FutureTensorProxy, pytype
+from thunder.core.symbol import Symbol
 from thunder.distributed.prims import DistributedReduceOps
 import thunder.distributed.prims as dist_prims
 import thunder.core.utils as utils
@@ -2190,6 +2182,9 @@ def is_float_type(self, input):
 
 
 def _copy__impl(copy_from, copy_to):
+    if (compile_data := get_compile_data()) is not None and compile_data.is_grad_enabled:
+        if copy_to.is_leaf and copy_to.requires_grad:  # in-place write into a grad-requiring leaf
+            raise RuntimeError("a leaf Variable that requires grad is being used in an in-place operation.")
     copy_to.copy_(copy_from)
     return copy_to
 

@@ -476,31 +476,27 @@ def f(xs, ys, z):
     dtypes=NOTHING,
 )
 def test_inplace_to_tensors_with_grad(executor, device, _):
-    @torch.no_grad
     def add_y(x, y):
-        x.add_(y, alpha=0.1)
+        # in-place ops on leaves requiring grad are illegal; abs(x) * sgn(x) equals x but is a non-leaf
+        z = torch.abs(x) * torch.sgn(x)
+        return z.add_(y, alpha=0.1)  # returned so res/res_ref compare tensors, not None with None
 
-    @torch.no_grad
-    def add_grad(x, y):
-        x.add_(x.grad, alpha=0.1)
+    jitted_f = executor.make_callable(add_y)
+    x = make_tensor((2, 2), device=device, dtype=torch.float32, requires_grad=True)
+    x.grad = make_tensor((2, 2), device=device, dtype=torch.float32)
+    y = make_tensor((2, 2), device=device, dtype=torch.float32)
 
-    for f in (add_y, add_grad):
-        jitted_f = executor.make_callable(f)
-        x = make_tensor((2, 2), device=device, dtype=torch.float32, requires_grad=True)
-        x.grad = make_tensor((2, 2), device=device, dtype=torch.float32)
-        y = make_tensor((2, 2), device=device, dtype=torch.float32)
+    x_ref = x.clone().detach().requires_grad_(True)
+    x_ref.grad = x.grad.clone().detach()
+    y_ref = y.clone().detach()
 
-        x_ref = x.clone().detach().requires_grad_(True)
-        x_ref.grad = x.grad.clone().detach()
-        y_ref = y.clone().detach()
+    res = jitted_f(x, y)
+    res_ref = add_y(x_ref, y_ref)
 
-        res = jitted_f(x, y)
-        res_ref = f(x_ref, y_ref)
-
-        torch.testing.assert_close(x, x_ref)
-        torch.testing.assert_close(x.grad, x_ref.grad)
-        torch.testing.assert_close(y, y_ref)
-        torch.testing.assert_close(res, res_ref)
+    torch.testing.assert_close(x, x_ref)
+    torch.testing.assert_close(x.grad, x_ref.grad)
+    torch.testing.assert_close(y, y_ref)
+    torch.testing.assert_close(res, res_ref)
 
 
 @instantiate(
@@ -551,6 +547,8 @@ def single_tensor_adam(
 
     jitted = executor.make_callable(single_tensor_adam)
     params, grads, exp_avgs, exp_avg_sqs = tensors
+    cd = thunder.compile_data(jitted)
+    cd.compile_options["torch_compile_fullgraph"] = False
 
     jitted(params, grads, exp_avgs, exp_avg_sqs, state_steps)
     torch.testing.assert_close(actual=tensors + [state_steps], expected=ref_tensors + [ref_state_steps])