From 233376d8f111e2571f745e6f31729db9bc2183ac Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Tue, 5 Nov 2024 16:37:40 +0100 Subject: [PATCH] Add warmup runs and profile all iterations to benchmarks (#1402) Add support for initial warmup runs in benchmarks and allow profiling all iterations or just the last one. This is technically a breaking change since `--profile` now profiles all iterations, and the new `--profile-last` option profiles only the last one as `--profile` used to behave. Authors: - Peter Andreas Entschev (https://github.com/pentschev) Approvers: - Mads R. B. Kristensen (https://github.com/madsbk) URL: https://github.com/rapidsai/dask-cuda/pull/1402 --- dask_cuda/benchmarks/common.py | 21 ++++++++++++----- dask_cuda/benchmarks/local_cudf_groupby.py | 13 +++-------- dask_cuda/benchmarks/local_cudf_merge.py | 8 +------ dask_cuda/benchmarks/local_cudf_shuffle.py | 13 +++-------- dask_cuda/benchmarks/local_cupy.py | 15 ++++-------- .../benchmarks/local_cupy_map_overlap.py | 15 ++++-------- dask_cuda/benchmarks/utils.py | 23 ++++++++++++++++++- 7 files changed, 52 insertions(+), 56 deletions(-) diff --git a/dask_cuda/benchmarks/common.py b/dask_cuda/benchmarks/common.py index 7f48d4fa..49676fee 100644 --- a/dask_cuda/benchmarks/common.py +++ b/dask_cuda/benchmarks/common.py @@ -1,3 +1,4 @@ +import contextlib from argparse import Namespace from functools import partial from typing import Any, Callable, List, Mapping, NamedTuple, Optional, Tuple @@ -7,7 +8,7 @@ import pandas as pd import dask -from distributed import Client +from distributed import Client, performance_report from dask_cuda.benchmarks.utils import ( address_to_index, @@ -87,12 +88,20 @@ def run_benchmark(client: Client, args: Namespace, config: Config): If ``args.profile`` is set, the final run is profiled. 
""" + results = [] - for _ in range(max(1, args.runs) - 1): - res = config.bench_once(client, args, write_profile=None) - results.append(res) - results.append(config.bench_once(client, args, write_profile=args.profile)) - return results + for _ in range(max(0, args.warmup_runs)): + config.bench_once(client, args, write_profile=None) + + ctx = contextlib.nullcontext() + if args.profile is not None: + ctx = performance_report(filename=args.profile) + with ctx: + for _ in range(max(1, args.runs) - 1): + res = config.bench_once(client, args, write_profile=None) + results.append(res) + results.append(config.bench_once(client, args, write_profile=args.profile_last)) + return results def gather_bench_results(client: Client, args: Namespace, config: Config): diff --git a/dask_cuda/benchmarks/local_cudf_groupby.py b/dask_cuda/benchmarks/local_cudf_groupby.py index f094ff18..a9e7d833 100644 --- a/dask_cuda/benchmarks/local_cudf_groupby.py +++ b/dask_cuda/benchmarks/local_cudf_groupby.py @@ -98,10 +98,9 @@ def bench_once(client, args, write_profile=None): "False": False, }.get(args.shuffle, args.shuffle) - if write_profile is None: - ctx = contextlib.nullcontext() - else: - ctx = performance_report(filename=args.profile) + ctx = contextlib.nullcontext() + if write_profile is not None: + ctx = performance_report(filename=write_profile) with ctx: t1 = clock() @@ -260,12 +259,6 @@ def parse_args(): "type": str, "help": "Do shuffle with GPU or CPU dataframes (default 'gpu')", }, - { - "name": "--runs", - "default": 3, - "type": int, - "help": "Number of runs", - }, ] return parse_benchmark_args( diff --git a/dask_cuda/benchmarks/local_cudf_merge.py b/dask_cuda/benchmarks/local_cudf_merge.py index e2b03520..6ebe005a 100644 --- a/dask_cuda/benchmarks/local_cudf_merge.py +++ b/dask_cuda/benchmarks/local_cudf_merge.py @@ -190,7 +190,7 @@ def bench_once(client, args, write_profile=None): if args.backend == "explicit-comms": ctx1 = dask.config.set(explicit_comms=True) if write_profile 
is not None: - ctx2 = performance_report(filename=args.profile) + ctx2 = performance_report(filename=write_profile) with ctx1: with ctx2: @@ -346,12 +346,6 @@ def parse_args(): "action": "store_true", "help": "Don't shuffle the keys of the left (base) dataframe.", }, - { - "name": "--runs", - "default": 3, - "type": int, - "help": "Number of runs", - }, { "name": [ "-s", diff --git a/dask_cuda/benchmarks/local_cudf_shuffle.py b/dask_cuda/benchmarks/local_cudf_shuffle.py index 25f42e59..3a0955c4 100644 --- a/dask_cuda/benchmarks/local_cudf_shuffle.py +++ b/dask_cuda/benchmarks/local_cudf_shuffle.py @@ -121,10 +121,9 @@ def create_data( def bench_once(client, args, write_profile=None): data_processed, df = create_data(client, args) - if write_profile is None: - ctx = contextlib.nullcontext() - else: - ctx = performance_report(filename=args.profile) + ctx = contextlib.nullcontext() + if write_profile is not None: + ctx = performance_report(filename=write_profile) with ctx: if args.backend in {"dask", "dask-noop"}: @@ -228,12 +227,6 @@ def parse_args(): "type": str, "help": "Do shuffle with GPU or CPU dataframes (default 'gpu')", }, - { - "name": "--runs", - "default": 3, - "type": int, - "help": "Number of runs", - }, { "name": "--ignore-index", "action": "store_true", diff --git a/dask_cuda/benchmarks/local_cupy.py b/dask_cuda/benchmarks/local_cupy.py index c9c8fe1c..ba88db30 100644 --- a/dask_cuda/benchmarks/local_cupy.py +++ b/dask_cuda/benchmarks/local_cupy.py @@ -141,12 +141,11 @@ def bench_once(client, args, write_profile=None): chunksize = x.chunksize data_processed = sum(arg.nbytes for arg in func_args) - # Execute the operations to benchmark - if args.profile is not None and write_profile is not None: - ctx = performance_report(filename=args.profile) - else: - ctx = contextlib.nullcontext() + ctx = contextlib.nullcontext() + if write_profile is not None: + ctx = performance_report(filename=write_profile) + # Execute the operations to benchmark with ctx: rng = 
start_range(message=args.operation, color="purple") result = func(*func_args) @@ -297,12 +296,6 @@ def parse_args(): "type": int, "help": "Chunk size (default 2500).", }, - { - "name": "--runs", - "default": 3, - "type": int, - "help": "Number of runs (default 3).", - }, { "name": [ "-b", diff --git a/dask_cuda/benchmarks/local_cupy_map_overlap.py b/dask_cuda/benchmarks/local_cupy_map_overlap.py index 8b975a24..ecefa52a 100644 --- a/dask_cuda/benchmarks/local_cupy_map_overlap.py +++ b/dask_cuda/benchmarks/local_cupy_map_overlap.py @@ -42,12 +42,11 @@ def bench_once(client, args, write_profile=None): data_processed = x.nbytes - # Execute the operations to benchmark - if args.profile is not None and write_profile is not None: - ctx = performance_report(filename=args.profile) - else: - ctx = contextlib.nullcontext() + ctx = contextlib.nullcontext() + if write_profile is not None: + ctx = performance_report(filename=write_profile) + # Execute the operations to benchmark with ctx: result = x.map_overlap(mean_filter, args.kernel_size, shape=ks) if args.backend == "dask-noop": @@ -168,12 +167,6 @@ def parse_args(): "type": int, "help": "Kernel size, 2*k+1, in each dimension (default 1)", }, - { - "name": "--runs", - "default": 3, - "type": int, - "help": "Number of runs", - }, { "name": [ "-b", diff --git a/dask_cuda/benchmarks/utils.py b/dask_cuda/benchmarks/utils.py index de7e2ae1..4f87a025 100644 --- a/dask_cuda/benchmarks/utils.py +++ b/dask_cuda/benchmarks/utils.py @@ -323,7 +323,16 @@ def parse_benchmark_args( metavar="PATH", default=None, type=str, - help="Write dask profile report (E.g. dask-report.html)", + help="Write dask profile report (E.g. dask-report.html) on all " + "iterations (excluding warmup).", + ) + parser.add_argument( + "--profile-last", + metavar="PATH", + default=None, + type=str, + help="Write dask profile report (E.g. 
dask-report.html) on last " + "iteration only.", ) # See save_benchmark_data for more information parser.add_argument( @@ -344,6 +353,18 @@ def parse_benchmark_args( type=parse_bytes, help="Bandwidth statistics: ignore messages smaller than this (default '1 MB')", ) + parser.add_argument( + "--runs", + default=3, + type=int, + help="Number of runs", + ) + parser.add_argument( + "--warmup-runs", + default=1, + type=int, + help="Number of warmup runs", + ) for args in args_list: name = args.pop("name")