From aa7515ac9c17c0f4f07d7e1545bc9077dacd1060 Mon Sep 17 00:00:00 2001 From: Ivan Yashchuk Date: Wed, 27 Nov 2024 09:47:55 +0200 Subject: [PATCH 1/3] Skip model_flop_per_sec report if it's not computed (#1469) --- thunder/benchmarks/benchmark_litgpt.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/thunder/benchmarks/benchmark_litgpt.py b/thunder/benchmarks/benchmark_litgpt.py index 8bcaf575e..4d6a7feb6 100644 --- a/thunder/benchmarks/benchmark_litgpt.py +++ b/thunder/benchmarks/benchmark_litgpt.py @@ -851,7 +851,8 @@ def benchmark_main(return_metrics_as_json=False, json_path="", **kwargs) -> None if tokens_per_sec: print(f"Tokens/s: {tokens_per_sec:.02f}") print(f"Tokens/s/GPU: {(tokens_per_sec / world_size):.02f}") - print(f"TFLOP/s: {benchmark.perf_metrics['model_flop_per_sec'] / 1e12:.02f}") + if benchmark.throughput: + print(f"TFLOP/s: {benchmark.perf_metrics['model_flop_per_sec'] / 1e12:.02f}") if benchmark.dump_memory_snapshot: file_name = f"{benchmark.model_name}_{benchmark.compile}_{benchmark.distributed_mode}" From 825c60e8ba23e714f5ddb4e2783590c3ddb0f730 Mon Sep 17 00:00:00 2001 From: Yan Wang Date: Wed, 27 Nov 2024 10:04:17 +0100 Subject: [PATCH 2/3] ThunderFX: Save the reproducer script into files (#1380) --- thunder/core/transform_common.py | 3 + thunder/dynamo/compiler.py | 46 +++- thunder/dynamo/compiler_graph_benchmark.py | 26 +- thunder/dynamo/splitter.py | 13 +- thunder/dynamo/utils.py | 305 ++++++++++++++++++--- thunder/tests/test_dynamo.py | 86 +++++- 6 files changed, 433 insertions(+), 46 deletions(-) diff --git a/thunder/core/transform_common.py b/thunder/core/transform_common.py index c9449f9f8..bfe2cb376 100644 --- a/thunder/core/transform_common.py +++ b/thunder/core/transform_common.py @@ -404,6 +404,9 @@ def reverse_transform_state_dict_for_submodule( ) -> dict[str, Any]: return state_dict + def __repr__(self) -> str: + return f"{self.__class__.__module__}.{self.__class__.__name__}()" + def order_proxies(bsyms: Sequence[BoundSymbol]) -> dict[str, int]: """computes a canonical ordering of proxies in the bound symbols based on the order of appearance diff --git a/thunder/dynamo/compiler.py b/thunder/dynamo/compiler.py index d61aa1081..9eb7dd851 100644 --- a/thunder/dynamo/compiler.py +++ b/thunder/dynamo/compiler.py @@ -7,11 +7,13 @@ import torch from thunder.core.baseutils import run_once -from thunder.dynamo.utils import recompile_graph, remove_empty_autocast +from thunder.core.utils import safe_zip +from thunder.dynamo.utils import recompile_graph, remove_empty_autocast, reproducer, CompilerType from thunder.dynamo.splitter import _splitter if TYPE_CHECKING: from thunder.dynamo.utils import SubgraphInfo + from os import PathLike @run_once @@ -83,3 +85,45 @@ def __call__(self, gm: torch.fx.GraphModule, sample_args: list[torch.SymInt, tor split_module, subgraph_info = _splitter(gm, self._thunder_jit, self._torch_compile, sample_args) self.subgraph_infos.append(subgraph_info) return split_module + + def save_reproducer_to_folder(self, reproducer_folder: str | PathLike, use_pytest_benchmark: bool = False): + """ + Save the reproducer script for the GraphModule executed by Thunder to the specified `reproducer_folder`. + Each saved script is named as "graph[graph_id]_thunder_[module_id]", where: + + - `graph_id` indexes the graph generated by Dynamo, which is then passed to Thunder. + - `module_id` indexes the submodule split by the :func:`thunder.dynamo.utils._splitter`. 
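        For orientation, a minimal usage sketch of the API described here (the toy module and folder name are only illustrative); with a single fully Thunder-supported graph this would emit one script named per the scheme above, e.g. repro_scripts/graph0_thunder_0.py:

        import torch
        from thunder.dynamo import ThunderCompiler

        backend = ThunderCompiler()
        model = torch.nn.Linear(4, 4)                        # hypothetical toy module
        cfn = torch.compile(model, backend=backend)
        cfn(torch.randn(2, 4))                               # populates backend.subgraph_infos
        backend.save_reproducer_to_folder("repro_scripts")   # writes repro_scripts/graph0_thunder_0.py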
+ + Args: + reproducer_folder (str | PathLike): The folder where the reproducer code will be written. Can be specified as an absolute or relative path. + use_pytest_benchmark (str): Determines the type of script to create: + + - If use_pytest_benchmark=False: Creates a reproducer script. + - If use_pytest_benchmark=True: Creates a benchmark script to compare the reproducer's performance with other backends, including Torch eager, torch.compile, and torch.compile with `backend="eager"`. + """ + if not self.subgraph_infos: + raise TypeError(f"{self} doesn't seem to have been called yet.") + + for graph_idx, subgraph_info in enumerate(self.subgraph_infos): + thunder_module_names = [] + for node in subgraph_info.split_graph_module.graph.nodes: + target = node.target + if isinstance(target, str) and target.startswith("thunder_"): + thunder_module_names.append(target) + original_thunder_modules = ( + m + for m, compiled_m in subgraph_info.submodule_to_compiled_functions.items() + if compiled_m.compiler == CompilerType.THUNDER + ) + example_inputs = subgraph_info.thunder_compiled_fns_example_inputs + for cur_module, example_input, cur_name in safe_zip( + original_thunder_modules, example_inputs, thunder_module_names + ): + reproducer( + cur_module, + self.thunder_options, + example_input, + reproducer_folder, + f"graph{graph_idx}_{cur_name}", + use_pytest_benchmark, + ) diff --git a/thunder/dynamo/compiler_graph_benchmark.py b/thunder/dynamo/compiler_graph_benchmark.py index ddb7f80e5..5dc6eecd3 100644 --- a/thunder/dynamo/compiler_graph_benchmark.py +++ b/thunder/dynamo/compiler_graph_benchmark.py @@ -103,19 +103,25 @@ def run_bench(self, gm: torch.fx.GraphModule, name: str, *sample_args): if self.post_graph: compiled_fn = self.post_graph(compiled_fn, sample_args) - with record_peak_allocated_memory(self.bench): + # This guard ensures compatibility with CPU-only PyTorch builds. + if torch.cuda.is_available(): + with record_peak_allocated_memory(self.bench): + self.bench(compiled_fn, *sample_args) + else: self.bench(compiled_fn, *sample_args) # BenchmarkFixture.stats is created each time bench is called (ref: https://github.com/pybenchmark/pytest-benchmark/blob/8c9a5faa1dd178b53ab7b2a66f5364a77e903d74/src/pytest_benchmark/fixture.py#L150) # Adds the graph number, split module name and executor suffix to the name string gid_key, module_name_key, ex_key = GRAPH_BY_GRAPH_BENCHMARK_PARAMS_KEYS - self.bench.stats.name += f"-{gid_key}[{self.graph_idx+1}]-{module_name_key}[{name}]-{ex_key}[{ex_name}]" - assert MAX_ALLOCATED_MEMORY_KEYWORD in self.bench.extra_info - assert f"{self.bench.stats.name}_{MAX_ALLOCATED_MEMORY_KEYWORD}" not in self.bench.extra_info - # NOTE: A benchmark can include multiple stats, but only one extra_info field is allowed per benchmark. - # Therefore, we use the current stats name as a prefix to distinguish memory usage for each stats. - self.bench.extra_info[f"{self.bench.stats.name}_{MAX_ALLOCATED_MEMORY_KEYWORD}"] = ( - self.bench.extra_info.pop(MAX_ALLOCATED_MEMORY_KEYWORD) - ) + self.bench.stats.name += f"-{gid_key}[{self.graph_idx}]-{module_name_key}[{name}]-{ex_key}[{ex_name}]" + + if torch.cuda.is_available(): + assert MAX_ALLOCATED_MEMORY_KEYWORD in self.bench.extra_info + assert f"{self.bench.stats.name}_{MAX_ALLOCATED_MEMORY_KEYWORD}" not in self.bench.extra_info + # NOTE: A benchmark can include multiple stats, but only one extra_info field is allowed per benchmark. + # Therefore, we use the current stats name as a prefix to distinguish memory usage for each stats. 
+ self.bench.extra_info[f"{self.bench.stats.name}_{MAX_ALLOCATED_MEMORY_KEYWORD}"] = ( + self.bench.extra_info.pop(MAX_ALLOCATED_MEMORY_KEYWORD) + ) # when the graph is segmented, the self.bench run multiple times, pybenchmark throws an error: # `FixtureAlreadyUsed("Fixture can only be used once. Previously it was used in %s mode." % self._mode)` @@ -158,7 +164,7 @@ def has_checkpoint_node(g): cur_nodes = cur_module.graph.nodes # Greates random input values for the current module based on the faketensor 'example_value' of the placeholder node placeholders = list(n for n in cur_nodes if n.op == "placeholder") - args = chain(*map(_get_example_inputs_from_placeholder, placeholders)) + args = list(map(_get_example_inputs_from_placeholder, placeholders)) # Runs the benchmark on the original module with the generated random inputs self.run_bench(compiled_functions_to_submodule[cur_module], target, *args) self.graph_idx += 1 diff --git a/thunder/dynamo/splitter.py b/thunder/dynamo/splitter.py index b128357b9..4b455f60b 100644 --- a/thunder/dynamo/splitter.py +++ b/thunder/dynamo/splitter.py @@ -1,6 +1,7 @@ from __future__ import annotations from typing import TYPE_CHECKING import copy +from functools import partial import torch from torch.fx.passes.split_module import split_module @@ -16,6 +17,7 @@ update_node_and_submodule, recompile_graph, checkpoint_converter, + _get_example_inputs_from_placeholder, ) if TYPE_CHECKING: @@ -124,8 +126,9 @@ def callback(node) -> int: return partition_cnt # There is a flip. Either from supported to unsupported or unsupported to supported. + if prev_value is not None: + partition_cnt += 1 # Bump the region cnt. prev_value = is_thunder_supported - partition_cnt += 1 # Bump the region cnt. if is_thunder_supported: supported_partitions.add(partition_cnt) @@ -142,11 +145,18 @@ def is_thunder_supported_partition(node: torch.fx.Node) -> bool: # Call compile on the split region/s. 
thunder_compiled_fns = [] + example_input_metadatas = [] submodule_to_compiled_fns = {} for node in split_gm.graph.nodes: node_name = node.name if is_thunder_supported_partition(node): graph_module = getattr(split_gm, node.name) + # Record the input tensor metadata of the current module based on the faketensor 'example_value' of the placeholder node + placeholders = list(n for n in graph_module.graph.nodes if n.op == "placeholder") + example_input_metadata = map( + partial(_get_example_inputs_from_placeholder, only_metadata=True), placeholders + ) + example_input_metadatas.append(list(example_input_metadata)) # Replace PyTorch operators within the checkpointed function with the corresponding Thunder operators checkpoint_converter(split_gm, graph_module) jit_fn = thunder_jit(graph_module) @@ -176,6 +186,7 @@ def is_thunder_supported_partition(node: torch.fx.Node) -> bool: original_split_gm, split_gm, thunder_compiled_fns, + example_input_metadatas, submodule_to_compiled_fns, split_reasons, ) diff --git a/thunder/dynamo/utils.py b/thunder/dynamo/utils.py index 668f2ef0b..4bab617cd 100644 --- a/thunder/dynamo/utils.py +++ b/thunder/dynamo/utils.py @@ -6,8 +6,11 @@ import inspect import itertools import copy +from pathlib import Path import torch +from torch.nn.modules.module import _addindent +from torch._subclasses.fake_tensor import FakeTensor from thunder.torch.default_torch_ops import torch_auto_registered_ops from thunder.torch import _torch_to_thunder_function_map @@ -16,6 +19,9 @@ if TYPE_CHECKING: from thunder.core.symbol import Symbol + import os + from typing import Any + from collections.abc import Sequence auto_register_ops = set(itertools.chain(*torch_auto_registered_ops.values())) @@ -74,6 +80,26 @@ class SplitReason: exception: Exception | None = None +@dataclasses.dataclass(frozen=True) +class ExampleInputMetaData: + """ + Describes the metadata of a tensor, used to generate a random tensor with matching properties + """ + + requires_grad: bool + layout: torch.layout + device: str | torch.device + dtype: torch.dtype + shape: list[int] + storage_shape: list[int] + strides: list[int] + min_val: int | None = None + max_val: int | None = None + + def stride(self) -> list[int]: + return self.strides + + @dataclasses.dataclass(frozen=True) class SubgraphInfo: """A dataclass containing information about a subgraph. @@ -87,6 +113,8 @@ class SubgraphInfo: thunder_compiled_fns: List of thunder optimized callables. This could be :obj:`None` if there the graph module was not supported by thunder. Look at the :attr:`split_reasons` for further information. + thunder_compiled_fns_example_inputs: List containing metadata of sample inputs for `thunder_compiled_fns`. + These inputs are used to generate random test inputs in the reproducer script. submodule_to_compiled_functions: Dict from subgraph in :attr:`original_split_graph_module` to compiled function. This will be a dict with one pair in case the graph was not split. split_reasons: List of reasons explaining why the subgraph was split. 
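To make the recorded input metadata concrete, here is a minimal sketch (the helper name is illustrative, assuming an ExampleInputMetaData-like record as defined above) of how such metadata can be turned back into a random input tensor; it follows the same make_tensor(...).as_strided(...) recipe the reproducer machinery below relies on:

import torch

def random_tensor_from_metadata(meta):
    # Allocate the flat storage first, then re-view it with the original shape
    # and strides so non-contiguous inputs are reproduced faithfully.
    base = torch.testing.make_tensor(
        meta.storage_shape,
        dtype=meta.dtype,
        device=meta.device,
        requires_grad=meta.requires_grad,
    )
    return base.as_strided(meta.shape, meta.stride())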
@@ -97,13 +125,14 @@ class SubgraphInfo: original_split_graph_module: torch.fx.GraphModule | None split_graph_module: torch.fx.GraphModule | None thunder_compiled_fns: list[Callable] | None + thunder_compiled_fns_example_inputs: list[list[ExampleInputMetaData]] | None submodule_to_compiled_functions: dict[torch.fx.GraphModule, CompiledFunction] split_reasons: list | None = None -def _concrete_shape(x): +def _concrete_value(vals: torch.Size | Sequence): """ - Get the concrete shape for a FakeTensor if it has `torch.SymInt` in its shape. + Get the concrete value from the input `vals` if it contains `torch.SymInt`. """ def get_backed_value(s): @@ -112,7 +141,7 @@ def get_backed_value(s): # Value is already concrete. return s - return tuple(map(get_backed_value, x.shape)) + return tuple(map(get_backed_value, vals)) def get_proxy_inputs_from_node(node: torch.fx.Node) -> tuple[tuple, dict]: @@ -147,11 +176,12 @@ def make_tensor_proxy(arg_node): # Here, we only want to verify that thunder can run an operation. # So, it is ok to verify with concrete value. example_value = example_value.new_ones( - _concrete_shape(example_value), device=example_value.device, dtype=example_value.dtype + _concrete_value(example_value.shape), device=example_value.device, dtype=example_value.dtype ) elif isinstance(example_value, tuple): example_value = tuple( - e_v.new_ones(_concrete_shape(e_v), device=e_v.device, dtype=e_v.dtype) for e_v in example_value + e_v.new_ones(_concrete_value(e_v.shape), device=e_v.device, dtype=e_v.dtype) + for e_v in example_value ) else: # NOTE - This will be caught will be caught and be part of the SplitReason. @@ -424,43 +454,78 @@ def recompile_graph(gm: torch.fx.GraphModule): return gm.recompile() -def _get_example_inputs_from_placeholder(node) -> tuple[torch.Tensor]: +def _get_storage_shape(t: torch.Tensor): + shape = _concrete_value(t.shape) + if t.is_contiguous(): + return shape + strides = _concrete_value(t.stride()) + storage_size = sum(strides[i] * (shape[i] - 1) for i in range(len(shape))) + 1 + return (storage_size,) + + +def _get_example_input_tensor_metadata(t: torch.Tensor) -> ExampleInputMetaData: + min_val = None + max_val = None + if not isinstance(t, FakeTensor) and t.numel() != 0: + minmax: tuple[torch.Tensor, torch.Tensor] = torch.aminmax(t) + min_val = minmax[0].cpu().item() + max_val = minmax[1].cpu().item() + meta_ev = ExampleInputMetaData( + t.requires_grad, + t.layout, + t.device, + t.dtype, + _concrete_value(t.shape), + _get_storage_shape(t), + _concrete_value(t.stride()), + min_val, + max_val, + ) + return meta_ev + + +def _create_random_tensor_from_tensor_metadata(t: ExampleInputMetaData) -> torch.Tensor: from thunder.tests.make_tensor import make_tensor + return make_tensor(t.storage_shape, dtype=t.dtype, device=t.device, requires_grad=t.requires_grad).as_strided( + t.shape, t.stride() + ) + + +def _get_example_inputs_from_placeholder( + node: torch.fx.Node, only_metadata=False +) -> tuple[torch.Tensor | ExampleInputMetaData] | torch.Tensor | ExampleInputMetaData: + """Retrieves example input data for a given placeholder `torch.fx.Node`. + - When `only_metadata` is `False`: Generates and returns a random example tensor based on the node's expected shape and data type, etc. + - When `only_metadata` is `True`: Returns only the tensor's metadata (e.g., shape, data type) without generating an actual tensor. 
+ """ check(node.op == "placeholder", lambda: f"The node must be placeholder type", ValueError) # Prefers to use actual example value in GraphArg if available if "grapharg" in node.meta: - example_value = node.meta["grapharg"].example - if isinstance(example_value, torch.Tensor): - return (example_value.detach().clone().requires_grad_(example_value.requires_grad),) - - check("example_value" in node.meta, lambda: "example_value does not exist in the meta of {node}", ValueError) + ev = node.meta["grapharg"].example + if isinstance(ev, torch.Tensor): + if only_metadata: + return _get_example_input_tensor_metadata(ev) + return ev.detach().clone().requires_grad_(ev.requires_grad) + + if "example_value" not in node.meta: + return None example_value = node.meta["example_value"] if isinstance(example_value, torch.Tensor): - sz = _concrete_shape(example_value) - return ( - make_tensor( - sz, - dtype=example_value.dtype, - device=example_value.device, - requires_grad=example_value.requires_grad, - ).as_strided(sz, example_value.stride()), - ) + ev_metadata = _get_example_input_tensor_metadata(example_value) + if only_metadata: + return ev_metadata + return _create_random_tensor_from_tensor_metadata(ev_metadata) elif isinstance(example_value, tuple): - return tuple( - make_tensor( - _concrete_shape(e_v), - dtype=e_v.dtype, - device=e_v.device, - requires_grad=e_v.requires_grad, - ).as_strided(_concrete_shape(e_v), e_v.stride()) - for e_v in example_value - ) + ev_metadatas = tuple(_get_example_input_tensor_metadata(e_v) for e_v in example_value) + if only_metadata: + return ev_metadatas + return tuple(_create_random_tensor_from_tensor_metadata(ev_metadata) for ev_metadata in ev_metadatas) + elif isinstance(example_value, torch.types.py_sym_types): + return example_value.node.hint else: - raise TypeError( - "The 'example_value' in the placeholder node is expected to be either a Tensor or a Tuple of Tensors." - ) + raise TypeError(f"Unsupported example_value type: {type(example_value)}") def _checkpoint_function_converter(gm: torch.fx.GraphModule): @@ -555,3 +620,179 @@ def remove_empty_autocast(graph_module: torch.fx.GraphModule) -> torch.fx.GraphM empty_autocast_removed_graph_module.graph.erase_node(node) return empty_autocast_removed_graph_module + + +def arg_like_tensor(arg: torch.Tensor | ExampleInputMetaData): + """Creates a new argument like the given tensor or tensor metadata""" + min_val = None + max_val = None + if isinstance(arg, torch.Tensor): + if arg.numel() != 0: + min_val, max_val = torch.aminmax(arg) + min_val = min_val.cpu().item() + max_val = max_val.cpu().item() + else: + min_val, max_val = arg.min_val, arg.max_val + storage_shape = _get_storage_shape(arg) if isinstance(arg, torch.Tensor) else arg.storage_shape + if min_val is not None and min_val == max_val: + meta = f"{storage_shape}, {min_val}, dtype={arg.dtype}, device='{arg.device}', requires_grad={arg.requires_grad}, layout={arg.layout}" + return f"torch.full({meta}).as_strided({arg.shape}, {arg.stride()})," + meta = f"{storage_shape}, dtype={arg.dtype}, device='{arg.device}', requires_grad={arg.requires_grad}," + meta = f"{meta} low={min_val}, high={max_val}," + return f"torch.testing.make_tensor({meta}).as_strided({arg.shape}, {arg.stride()})," + + +def arg_like(arg: Any): + """Creates a new argument that is similar to the given arg.""" + if isinstance(arg, (torch.Tensor, ExampleInputMetaData)): + return f"{arg_like_tensor(arg)}" + else: + # Assume it's a literal that we can just print directly. 
+ return f"{arg}," + + +def _readable( + module: torch.fx.GraphModule, + module_name: str, + print_output: bool = False, + include_stride: bool = True, + include_device: bool = True, + colored: bool = False, +): + """Modified from `torch.fx.graph_module._print_readable` (https://github.com/pytorch/pytorch/blob/3192bdeea428f2bf3a95274ee59ea41c4f8e31e9/torch/fx/graph_module.py#L297). + This is basically print_readable but it sets verbose=False (torch hardcodes it to True).""" + graph = module.graph + assert graph is not None and isinstance( + graph, torch.fx.Graph + ), "print_readable must be used on a module with a graph" + + verbose_python_code = graph.python_code( + root_module="self", + verbose=False, + include_stride=include_stride, + include_device=include_device, + colored=colored, + ) + module_code = verbose_python_code.src + module_code = module_code.lstrip("\n") + module_code = f"class {module_name}(torch.nn.Module):\n" + module_code + module_code = _addindent(module_code, 2) + + submodule_code_list = [""] + for submodule_name, submodule in module.named_children(): + if hasattr(submodule, "graph"): + submodule_code_list.append( + _readable( + submodule, + submodule_name, + print_output=False, + include_stride=include_stride, + include_device=include_device, + colored=colored, + ) + ) + submodule_code = "\n".join(submodule_code_list) + submodule_code = _addindent(submodule_code, 2) + + output = module_code + submodule_code + if print_output: + print(module_code + submodule_code) + return output + + +def get_env() -> tuple[str, str]: + """Retrieve detailed environment information using `torch.utils.collect_env.get_pretty_env_info()`. + Additionally, include the installed versions of Thunder and NvFuser (if available via pip). + """ + + from torch.utils.collect_env import run, get_pretty_env_info, get_pip_packages + + torch_env = get_pretty_env_info() + _, thunder_packages = get_pip_packages(run, {"lightning-thunder", "nvfuser"}) + return torch_env, thunder_packages + + +def thunder_options_to_str(thunder_options: dict) -> str: + from thunder import resolve_executors + + option_str = "" + for key, value in thunder_options.items(): + if key == "executors": + executors = resolve_executors(value) + option_str += f"{key}=[" + ",".join(f"thunder.extend.get_executor('{ex.name}')" for ex in executors) + "]" + else: + option_str += f"{key}={repr(value)}" + option_str += "," + return option_str + + +def reproducer( + gm: torch.fx.GraphModule, + thunder_options: dict, + args: tuple[torch.Tensor | ExampleInputMetaData], + folder: str | os.PathLike, + graph_name: str, + use_pytest_benchmark: bool = False, +): + folder = Path(folder) + folder.mkdir(exist_ok=True) + torch_env, thunder_pkgs = get_env() + # Ideally we'd use print_readable, but we want verbose=False and there's no + # way to set that with print_readable. + readable = _readable(gm, "DynamoModule", print_output=False) + has_cuda_args = any(hasattr(arg, "device") and arg.device.type == "cuda" for arg in args) + thunder_options_str = thunder_options_to_str(thunder_options) + with open(folder / f"{graph_name}.py", "w") as f: + comment_str = f'''""" +Environment information get from `torch.utils.collect_env.get_pretty_env_info()`: +{torch_env} + +Versions of Thunder related libraries: +{thunder_pkgs} + +The torch.fx.Graph: +{gm.graph} +""" +''' + if use_pytest_benchmark: + comment_str += f"""# NOTE: This script requires `pytest-benchmark==4.0.0` to be installed. 
+# To execute the script, run `pytest {graph_name}.py`""" + import_str = f"from functools import partial\n\nimport torch\nimport thunder\n" + if has_cuda_args: + import_str += "from thunder.transforms.cudagraph import CUDAGraphTransform\n" + import_str += "from thunder.dev_utils.nvtx_profile_transform import NvtxProfileTransform\n" + if use_pytest_benchmark: + code_str = f"def test_{graph_name}(benchmark):\n{readable}\n" + else: + code_str = f"def test_{graph_name}():\n{readable}\n" + + if any(arg is None for arg in args): + code_str += f"# Warning: The inputs that cannot be inferred are set to None, requiring the user to manually give inputs according to the code\n" + input_str = f"""inputs = [\n{chr(10).join(arg_like(a) for a in args)}\n""" + code_str += f"{_addindent(input_str, 4)}\n]\n" + + if not use_pytest_benchmark: + code_str += f"compiled = thunder.jit(DynamoModule(), {thunder_options_str})\n" + code_str += "compiled(*inputs)" + else: + code_str += "from thunder.dynamo.compiler_graph_benchmark import ThunderCompilerGraphBenchmarking\n" + code_str = f"""{code_str} +bench_executors_dict = {{}} +bench_executors_dict["thunder"]=partial(thunder.jit, {thunder_options_str}) +bench_executors_dict["torch.compile"]=torch.compile +bench_executors_dict["dynamo_eager"]=partial(torch.compile, backend="eager") +bench_executors_dict["eager"]=None +""" + if has_cuda_args: + code_str = f"""{code_str}bench_executors_dict["thunder_cugraph"]=partial(thunder.jit, transform=CUDAGraphTransform())\n""" + code_str += f""" +backend = ThunderCompilerGraphBenchmarking(benchmark, executors=bench_executors_dict) +compiled = torch.compile(backend=backend)(DynamoModule()) +compiled(*inputs) +""" + print(comment_str, file=f) + print(import_str, file=f) + print(_addindent(code_str, 4), file=f) + + if not use_pytest_benchmark: + print(f"\ntest_{graph_name}()", file=f) diff --git a/thunder/tests/test_dynamo.py b/thunder/tests/test_dynamo.py index 65e6603f5..cc740ff40 100644 --- a/thunder/tests/test_dynamo.py +++ b/thunder/tests/test_dynamo.py @@ -1,6 +1,8 @@ import pytest import warnings import itertools +import os +from subprocess import run import torch import torch.fx import torch.nn as nn @@ -558,7 +560,7 @@ def f(x): all_nodes = itertools.chain( backend.subgraph_infos[0].split_graph_module.graph.nodes, - backend.subgraph_infos[0].split_graph_module.thunder_1.graph.nodes, + backend.subgraph_infos[0].split_graph_module.thunder_0.graph.nodes, ) assert all(node.target not in autocast_ops for node in all_nodes) @@ -575,7 +577,7 @@ def f(x): backend = _call_thunder_backend(f, (x,)) all_nodes = itertools.chain( backend.subgraph_infos[0].split_graph_module.graph.nodes, - backend.subgraph_infos[0].split_graph_module.thunder_1.graph.nodes, + backend.subgraph_infos[0].split_graph_module.thunder_0.graph.nodes, ) assert sum(node.target in autocast_ops for node in all_nodes) == 2 @@ -782,3 +784,83 @@ def find_target_module(model, target_module_name): for n in submodule.graph.nodes: if n.op == "call_function": assert isinstance(n.target, Symbol) + + +@instantiate( + dtypes=NOTHING, + executors=[DynamoThunderExecutor], + decorators=(pytest.mark.parametrize("use_pytest_benchmark", (True, False), ids=("benchmark", "repro")),), +) +def test_dynamo_reproducer_2graph(executor, device: str, dtype: dtypes.dtype, use_pytest_benchmark, tmp_path): + from thunder.dev_utils.nvtx_profile_transform import NvtxProfileTransform + from thunder import nvfuser_executor + from thunder.transforms.cudagraph import CUDAGraphTransform + + if 
device.startswith("cuda"): + backend = ThunderCompiler( + transforms=[ + NvtxProfileTransform(), + CUDAGraphTransform(), + ], + executors=[nvfuser_executor], + cache="constant values", + langctx=None, + record_history=False, + ) + else: + backend = ThunderCompiler(executors=None) + # Test non-contiguous input tensor + x = make_tensor((4, 4), low=3, high=10, dtype=torch.int64, device=device, noncontiguous=True) + + @torch.compile(backend=backend) + def func(x): + x = torch.sin(x) + if x.sum() > 0: + return x + 1 + else: + return x - 1 + + out = func(x) + backend.save_reproducer_to_folder(tmp_path, use_pytest_benchmark=use_pytest_benchmark) + + s1 = f"{tmp_path}/graph0_thunder_0.py" + s2 = f"{tmp_path}/graph1_thunder_0.py" + assert os.path.exists(s1) + assert os.path.exists(s2) + cmd = "pytest" if use_pytest_benchmark else "python" + result1 = run([cmd, s1], capture_output=True, text=True) + result2 = run([cmd, s2], capture_output=True, text=True) + + assert result1.returncode == 0, f"Reproducer {s1} failed with return code {result1.returncode}" + assert result2.returncode == 0, f"Reproducer {s2} failed with return code {result2.returncode}" + + +@requiresCUDA +@pytest.mark.parametrize("use_pytest_benchmark", (True, False), ids=("benchmark", "repro")) +def test_dynamo_reproducer_submodules(use_pytest_benchmark, tmp_path): + from thunder.tests.distributed.helper import ToyModel + import torch.nn as nn + + class SimpleModel(torch.nn.Module): + def __init__(self) -> None: + super().__init__() + self.sub_mod = ToyModel() + self.seq = nn.Sequential(self.sub_mod, nn.ReLU()) + + def forward(self, x): + x = torch.sin(x) + x = self.seq(x) + return x + + x = torch.randn(1, ToyModel.N_IN, device="cuda", requires_grad=True) + model = SimpleModel().cuda() + backend = ThunderCompiler() + jf = torch.compile(backend=backend)(model) + out = jf(x) + backend.save_reproducer_to_folder(tmp_path, use_pytest_benchmark=use_pytest_benchmark) + + s1 = f"{tmp_path}/graph0_thunder_0.py" + assert os.path.exists(s1) + cmd = "pytest" if use_pytest_benchmark else "python" + result1 = run([cmd, s1], capture_output=True, text=True) + assert result1.returncode == 0, f"Reproducer {s1} failed with return code {result1.returncode}" From ea0159ca20b4109a84f572020b73220fb3c85328 Mon Sep 17 00:00:00 2001 From: beverlylytle <57254617+beverlylytle@users.noreply.github.com> Date: Wed, 27 Nov 2024 11:05:52 +0100 Subject: [PATCH 3/3] add leaky_relu op (#1459) Co-authored-by: Thomas Viehmann --- thunder/executors/torchex.py | 2 ++ thunder/tests/opinfos.py | 36 +++++++++++++++++++++--------- thunder/torch/__init__.py | 11 +++++++++ thunder/torch/default_torch_ops.py | 1 - 4 files changed, 39 insertions(+), 11 deletions(-) diff --git a/thunder/executors/torchex.py b/thunder/executors/torchex.py index f354e9abf..9385eb397 100644 --- a/thunder/executors/torchex.py +++ b/thunder/executors/torchex.py @@ -836,6 +836,7 @@ def _erfcinv_impl(a: torch.Tensor) -> torch.Tensor: celu = _register_torch_operation("celu", module=torch.nn.functional) elu = _register_torch_operation("elu", module=torch.nn.functional) gelu = _register_torch_operation("gelu", module=torch.nn.functional) +leaky_relu = _register_torch_operation("leaky_relu", module=torch.nn.functional) relu = _register_torch_operation("relu", module=torch.nn.functional) relu6 = _register_torch_operation("relu6", module=torch.nn.functional) hardswish = _register_torch_operation("hardswish", module=torch.nn.functional) @@ -850,6 +851,7 @@ def _elementwise_unary_with_inplace_checker(a: 
TensorProxy, /, inplace: bool = F _register_elementwise_unary_implementation(ltorch.elu, elu, checker=_always_executable) _register_elementwise_unary_implementation(ltorch.celu, celu, checker=_always_executable) _register_elementwise_unary_implementation(ltorch.gelu, gelu, checker=_always_executable) +_register_elementwise_unary_implementation(ltorch.leaky_relu, leaky_relu, checker=_always_executable) _register_elementwise_unary_implementation(ltorch.relu, relu, checker=_elementwise_unary_with_inplace_checker) _register_elementwise_unary_implementation(ltorch.relu6, relu6, checker=_elementwise_unary_with_inplace_checker) _register_elementwise_unary_implementation(ltorch.hardswish, hardswish, checker=_elementwise_unary_with_inplace_checker) diff --git a/thunder/tests/opinfos.py b/thunder/tests/opinfos.py index b8daca7de..5cc5c8907 100644 --- a/thunder/tests/opinfos.py +++ b/thunder/tests/opinfos.py @@ -1633,20 +1633,24 @@ def _abs_torch(x: torch.Tensor | Number): elementwise_unary_ops.append(reciprocal_opinfo) -def elementwise_unary_with_alpha_generator(op, device, dtype, requires_grad): - alphas = (None, -1.0, 0.5) - samples = elementwise_unary_generator(op, device, dtype, requires_grad) - for alpha, sample in itertools.product(alphas, samples): - if alpha is None: - yield sample - else: - yield SampleInput(*sample.args, alpha=alpha, **sample.kwargs) +def get_elementwise_unary_with_alpha_generator(): + kwargs_list = [{}, {"alpha": -1.0}, {"alpha": 0.5}] + return get_elementwise_unary_with_kwargs_generator(kwargs_list) + + +def get_elementwise_unary_with_kwargs_generator(kwargs_list): + def gen(op, device, dtype, requires_grad): + samples = elementwise_unary_generator(op, device, dtype, requires_grad) + for kwargs, sample in itertools.product(kwargs_list, samples): + yield SampleInput(*sample.args, **kwargs, **sample.kwargs) + + return gen celu_opinfo = OpInfo( ltorch.celu, dtypes=(datatypes.floating,), - sample_input_generator=elementwise_unary_with_alpha_generator, + sample_input_generator=get_elementwise_unary_with_alpha_generator(), torch_reference=_elementwise_unary_torch(torch.celu), test_directives=(), ) @@ -1656,7 +1660,7 @@ def elementwise_unary_with_alpha_generator(op, device, dtype, requires_grad): elu_opinfo = OpInfo( ltorch.elu, dtypes=(datatypes.floating,), - sample_input_generator=elementwise_unary_with_alpha_generator, + sample_input_generator=get_elementwise_unary_with_alpha_generator(), torch_reference=torch.nn.functional.elu, # fdm.jvp, which is used in test_vjp_correctness, behaves badly on (-1e-6, 1e-6) for this function singularity_fn=lambda x: x, @@ -1665,6 +1669,18 @@ def elementwise_unary_with_alpha_generator(op, device, dtype, requires_grad): elementwise_unary_ops.append(elu_opinfo) +leaky_relu_opinfo = OpInfo( + ltorch.leaky_relu, + dtypes=(datatypes.floating,), + sample_input_generator=get_elementwise_unary_with_kwargs_generator([{}, {"negative_slope": 0.5}]), + torch_reference=torch.nn.functional.leaky_relu, + # fdm.jvp, which is used in test_vjp_correctness, behaves badly on (-1e-6, 1e-6) for this function + singularity_fn=lambda x: x, + test_directives=(), +) +elementwise_unary_ops.append(leaky_relu_opinfo) + + relu_opinfo = OpInfo( ltorch.relu, sample_input_generator=elementwise_unary_generator, diff --git a/thunder/torch/__init__.py b/thunder/torch/__init__.py index a94ada1cc..5ec568e02 100644 --- a/thunder/torch/__init__.py +++ b/thunder/torch/__init__.py @@ -1801,6 +1801,17 @@ def gelu(a: TensorProxy, /, *, approximate: str = "none") -> TensorLike: raise 
ValueError(f"gelu does not support the approximate={approximate} argument") +@torchsymbol(torch.nn.functional.leaky_relu, is_method=False) +def leaky_relu(a: TensorProxy, /, negative_slope: float = 0.01, inplace: bool = False) -> TensorLike: + out = where(a > 0, a, a * negative_slope) + if inplace: + return prims.copy_(out, a) + return out + + +_inplace_to_out_of_place[leaky_relu] = leaky_relu, 2 + + # TODO Should this use clamp? -- Would that propagate NaNs properly? @torchsymbol(torch.relu, torch.nn.functional.relu, id="torch.relu", is_method=True) def relu(a: TensorLike, /, inplace: bool = False) -> TensorLike: diff --git a/thunder/torch/default_torch_ops.py b/thunder/torch/default_torch_ops.py index 84e0ae0f9..91ea98adf 100644 --- a/thunder/torch/default_torch_ops.py +++ b/thunder/torch/default_torch_ops.py @@ -356,7 +356,6 @@ torch.nn.functional.instance_norm, torch.nn.functional.kl_div, torch.nn.functional.l1_loss, - torch.nn.functional.leaky_relu, torch.nn.functional.local_response_norm, torch.nn.functional.logsigmoid, torch.nn.functional.lp_pool1d,