From 77e3649a24520f1ebd85ce180f5d6996bffc8c0f Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Fri, 4 Oct 2024 14:43:35 -0500 Subject: [PATCH 1/5] Add utility to submit empty task into the queue specifying dependent events --- dpctl/utils/src/order_keeper.cpp | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/dpctl/utils/src/order_keeper.cpp b/dpctl/utils/src/order_keeper.cpp index f54b21780c..7a1e881001 100644 --- a/dpctl/utils/src/order_keeper.cpp +++ b/dpctl/utils/src/order_keeper.cpp @@ -26,4 +26,17 @@ PYBIND11_MODULE(_seq_order_keeper, m) &SequentialOrder::add_to_submitted_events) .def("wait", &SequentialOrder::wait, py::call_guard()); + + auto submit_empty_task_fn = + [](sycl::queue &exec_q, + const std::vector &depends) -> sycl::event { + return exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + cgh.single_task([]() { + // empty body + }); + }); + }; + m.def("_submit_empty_task", submit_empty_task_fn, py::arg("sycl_queue"), + py::arg("depends") = py::list()); } From 8139fece8ffc54b9ec1cbc5f6f6b202497a6d0c3 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Fri, 4 Oct 2024 14:46:23 -0500 Subject: [PATCH 2/5] Extend SyclTimer SyclTimer now supports device_timer keyword argument, a legacy behavior "queue_barrier", and new one based on sequential order manager, which inserts an empty task into the manager to record start and end of block of timed code. Docstring of SyclTimer updated. All data attributes needed for functioning of the timer are created during class instance construction now. --- dpctl/_sycl_timer.py | 88 ++++++++++++++++++++++++++++++++++++++------ 1 file changed, 77 insertions(+), 11 deletions(-) diff --git a/dpctl/_sycl_timer.py b/dpctl/_sycl_timer.py index 1f8dfa7e00..056a04ef01 100644 --- a/dpctl/_sycl_timer.py +++ b/dpctl/_sycl_timer.py @@ -44,6 +44,44 @@ def device_dt(self): return self._device_dt +class BaseDeviceTimer: + __slots__ = ["queue"] + + def __init__(self, sycl_queue): + if not isinstance(sycl_queue, SyclQueue): + raise TypeError(f"Expected type SyclQueue, got {type(sycl_queue)}") + self.queue = sycl_queue + + +class QueueBarrierDeviceTimer(BaseDeviceTimer): + __slots__ = [] + + def __init__(self, sycl_queue): + super(QueueBarrierDeviceTimer, self).__init__(sycl_queue) + + def get_event(self): + return self.queue.submit_barrier() + + +class OrderManagerDeviceTimer(BaseDeviceTimer): + __slots__ = ["_order_manager", "_submit_empty_task_fn"] + + def __init__(self, sycl_queue): + import dpctl.utils._seq_order_keeper as s_ok + from dpctl.utils import SequentialOrderManager as seq_om + + super(OrderManagerDeviceTimer, self).__init__(sycl_queue) + self._order_manager = seq_om[self.queue] + self._submit_empty_task_fn = s_ok._submit_empty_task + + def get_event(self): + ev = self._submit_empty_task_fn( + sycl_queue=self.queue, depends=self._order_manager.submitted_events + ) + self._order_manager.add_event_pair(ev, ev) + return ev + + class SyclTimer: """ Context to measure device time and host wall-time of execution @@ -58,7 +96,7 @@ class SyclTimer: q = dpctl.SyclQueue(property="enable_profiling") # create the timer - milliseconds_sc = 1e-3 + milliseconds_sc = 1e3 timer = dpctl.SyclTimer(time_scale = milliseconds_sc) # use the timer @@ -73,7 +111,7 @@ class SyclTimer: wall_dt, device_dt = timer.dt .. note:: - The timer submits barriers to the queue at the entrance and the + The timer submits tasks to the queue at the entrance and the exit of the context and uses profiling information from events associated with these submissions to perform the timing. Thus :class:`dpctl.SyclTimer` requires the queue with ``"enable_profiling"`` @@ -81,17 +119,28 @@ class SyclTimer: the ``dt`` property ensures that both submitted barriers complete their execution and thus effectively synchronizes the queue. + `device_timer` keyword argument controls the type of tasks submitted. + With `device_timer="queue_barrier"`, queue barrier tasks are used. With + `device_timer="order_manager"`, a single empty body task is inserted + instead relying on order manager (used by `dpctl.tensor` operations) to + order these tasks so that they fence operations performed within + timer's context. + Args: host_timer (callable, optional): A callable such that host_timer() returns current host time in seconds. Default: :py:func:`timeit.default_timer`. + device_timer (Literal["queue_barrier", "order_manager"], optional): + Device timing method. Default: "queue_barrier". time_scale (Union[int, float], optional): Ratio of the unit of time of interest and one second. Default: ``1``. """ - def __init__(self, host_timer=timeit.default_timer, time_scale=1): + def __init__( + self, host_timer=timeit.default_timer, device_timer=None, time_scale=1 + ): """ Create new instance of :class:`.SyclTimer`. @@ -100,6 +149,8 @@ def __init__(self, host_timer=timeit.default_timer, time_scale=1): A function that takes no arguments and returns a value measuring time. Default: :meth:`timeit.default_timer`. + device_timer (Literal["queue_barrier", "order_manager"], optional): + Device timing method. Default: "queue_barrier" time_scale (Union[int, float], optional): Scaling factor applied to durations measured by the host_timer. Default: ``1``. @@ -109,11 +160,26 @@ def __init__(self, host_timer=timeit.default_timer, time_scale=1): self.queue = None self.host_times = [] self.bracketing_events = [] + self._context_data = list() + if device_timer is None: + device_timer = "queue_barrier" + if device_timer == "queue_barrier": + self._device_timer_class = QueueBarrierDeviceTimer + elif device_timer == "order_manager": + self._device_timer_class = OrderManagerDeviceTimer + else: + raise ValueError( + "Supported values for device_timer keyword are " + "'queue_barrier', 'order_manager', got " + f"'{device_timer}'" + ) + self._device_timer = None def __call__(self, queue=None): if isinstance(queue, SyclQueue): if queue.has_enable_profiling: self.queue = queue + self._device_timer = self._device_timer_class(queue) else: raise ValueError( "The given queue was not created with the " @@ -127,17 +193,17 @@ def __call__(self, queue=None): return self def __enter__(self): - self._event_start = self.queue.submit_barrier() - self._host_start = self.timer() + _event_start = self._device_timer.get_event() + _host_start = self.timer() + self._context_data.append((_event_start, _host_start)) return self def __exit__(self, *args): - self.host_times.append((self._host_start, self.timer())) - self.bracketing_events.append( - (self._event_start, self.queue.submit_barrier()) - ) - del self._event_start - del self._host_start + _event_end = self._device_timer.get_event() + _host_end = self.timer() + _event_start, _host_start = self._context_data.pop() + self.host_times.append((_host_start, _host_end)) + self.bracketing_events.append((_event_start, _event_end)) @property def dt(self): From ec6ad3bf25451eb61f3c96e6c71bb85b6304333a Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Tue, 22 Oct 2024 08:26:53 -0500 Subject: [PATCH 3/5] Add test_sycl_timer file Check different device_timer values, test argument validation, and test cumulative timing. --- dpctl/tests/test_sycl_timer.py | 108 +++++++++++++++++++++++++++++++++ 1 file changed, 108 insertions(+) create mode 100644 dpctl/tests/test_sycl_timer.py diff --git a/dpctl/tests/test_sycl_timer.py b/dpctl/tests/test_sycl_timer.py new file mode 100644 index 0000000000..899bbb2153 --- /dev/null +++ b/dpctl/tests/test_sycl_timer.py @@ -0,0 +1,108 @@ +# Data Parallel Control (dpctl) +# +# Copyright 2020-2024 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import time + +import pytest + +import dpctl +import dpctl.tensor as dpt + + +@pytest.fixture +def profiling_queue(): + try: + q = dpctl.SyclQueue(property="enable_profiling") + except dpctl.SyclQueueCreationError: + pytest.skip( + "Could not created profiling queue " "for default-selected device" + ) + return q + + +@pytest.mark.parametrize( + "device_timer", [None, "queue_barrier", "order_manager"] +) +def test_sycl_timer_queue_barrier(profiling_queue, device_timer): + dev = dpt.Device.create_device(profiling_queue) + + timer = dpctl.SyclTimer( + host_timer=time.perf_counter, device_timer=device_timer, time_scale=1e3 + ) + + with timer(dev.sycl_queue): + x = dpt.linspace(0, 1, num=10**6, device=dev) + y = 3.0 - dpt.square(x - 0.5) + z = dpt.sort(y) + res1 = z[-1] + res2 = dpt.max(y) + + host_dt, device_dt = timer.dt + + assert dpt.all(res1 == res2) + assert host_dt > 0 + assert device_dt > 0 + + +def test_sycl_timer_accumulation(profiling_queue): + q = profiling_queue + + timer = dpctl.SyclTimer( + host_timer=time.perf_counter, + device_timer="order_manager", + time_scale=1e3, + ) + + # initial condition + x = dpt.linspace(0, 1, num=10**6, sycl_queue=q) + + aitkens_data = [ + x, + ] + + # 16 iterations of Aitken's accelerated Newton's method + # x <- x - f(x)/f'(x) for f(x) = x - cos(x) + for _ in range(16): + # only time Newton step + with timer(q): + s = dpt.sin(x) + x = (dpt.cos(x) + x * s) / (1 + s) + aitkens_data.append(x) + aitkens_data = aitkens_data[-3:] + if len(aitkens_data) == 3: + # apply Aitkens acceleration + d1 = aitkens_data[-1] - aitkens_data[-2] + d2 = aitkens_data[-2] - aitkens_data[-3] + if not dpt.any(d1 == d2): + x = aitkens_data[-1] - dpt.square(d1) / (d1 - d2) + + # Total time for 16 iterations + dev_dt = timer.dt.device_dt + assert dev_dt > 0 + + # check convergence + assert dpt.max(x) - dpt.min(x) < 1e-5 + + +def test_sycl_timer_validation(): + with pytest.raises(ValueError): + dpctl.SyclTimer(device_timer="invalid") + + timer = dpctl.SyclTimer() + mock_queue = Ellipsis + + with pytest.raises(TypeError): + timer(mock_queue) From d1011c55aef788ce6ae0b47f4f5daaf162e5492e Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Tue, 22 Oct 2024 09:03:08 -0500 Subject: [PATCH 4/5] Add change log entry for change to SyclTimer --- CHANGELOG.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 85602f23e9..598f6d70a4 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,13 +8,14 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Added -### Change +### Changed * Improved performance of copy-and-cast operations from `numpy.ndarray` to `tensor.usm_ndarray` for contiguous inputs [gh-1829](https://github.com/IntelPython/dpctl/pull/1829) * Improved performance of copying operation to C-/F-contig array, with optimization for batch of square matrices [gh-1850](https://github.com/IntelPython/dpctl/pull/1850) * Improved performance of `tensor.argsort` function for all types [gh-1859](https://github.com/IntelPython/dpctl/pull/1859) * Improved performance of `tensor.sort` and `tensor.argsort` for short arrays in the range [16, 64] elements [gh-1866](https://github.com/IntelPython/dpctl/pull/1866) * Implement radix sort algorithm to be used in `dpt.sort` and `dpt.argsort` [gh-1867](https://github.com/IntelPython/dpctl/pull/1867) +* Extended `dpctl.SyclTimer` with `device_timer` keyword, implementing different methods of collecting device times [gh-1872](https://github.com/IntelPython/dpctl/pull/1872) ### Fixed * Fix for `tensor.result_type` when all inputs are Python built-in scalars [gh-1877](https://github.com/IntelPython/dpctl/pull/1877) From 77520787ec005645a41baa66c1fc495db1b970ab Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Fri, 15 Nov 2024 06:59:08 -0800 Subject: [PATCH 5/5] Expanded docstring as suggested in review --- dpctl/_sycl_timer.py | 53 ++++++++++++++++++++++++++++++++++---------- 1 file changed, 41 insertions(+), 12 deletions(-) diff --git a/dpctl/_sycl_timer.py b/dpctl/_sycl_timer.py index 056a04ef01..f53836bd47 100644 --- a/dpctl/_sycl_timer.py +++ b/dpctl/_sycl_timer.py @@ -84,8 +84,7 @@ def get_event(self): class SyclTimer: """ - Context to measure device time and host wall-time of execution - of commands submitted to :class:`dpctl.SyclQueue`. + Context to time execution of tasks submitted to :class:`dpctl.SyclQueue`. :Example: .. code-block:: python @@ -99,13 +98,18 @@ class SyclTimer: milliseconds_sc = 1e3 timer = dpctl.SyclTimer(time_scale = milliseconds_sc) + untimed_code_block_1 # use the timer with timer(queue=q): - code_block1 + timed_code_block1 + + untimed_code_block_2 # use the timer with timer(queue=q): - code_block2 + timed_code_block2 + + untimed_code_block_3 # retrieve elapsed times in milliseconds wall_dt, device_dt = timer.dt @@ -116,16 +120,41 @@ class SyclTimer: associated with these submissions to perform the timing. Thus :class:`dpctl.SyclTimer` requires the queue with ``"enable_profiling"`` property. In order to be able to collect the profiling information, - the ``dt`` property ensures that both submitted barriers complete their - execution and thus effectively synchronizes the queue. - - `device_timer` keyword argument controls the type of tasks submitted. - With `device_timer="queue_barrier"`, queue barrier tasks are used. With - `device_timer="order_manager"`, a single empty body task is inserted - instead relying on order manager (used by `dpctl.tensor` operations) to + the ``dt`` property ensures that both tasks submitted by the timer + complete their execution and thus effectively synchronizes the queue. + + Execution of the above example results in the following task graph, + where each group of tasks is ordered after the one preceding it, + ``[tasks_of_untimed_block1]``, ``[timer_fence_start_task]``, + ``[tasks_of_timed_block1]``, ``[timer_fence_finish_task]``, + ``[tasks_of_untimed_block2]``, ``[timer_fence_start_task]``, + ``[tasks_of_timed_block2]``, ``[timer_fence_finish_task]``, + ``[tasks_of_untimed_block3]``. + + ``device_timer`` keyword argument controls the type of tasks submitted. + With ``"queue_barrier"`` value, queue barrier tasks are used. With + ``"order_manager"`` value, a single empty body task is inserted + and order manager (used by all `dpctl.tensor` operations) is used to order these tasks so that they fence operations performed within timer's context. + Timing offloading operations that do not use the order manager with + the timer that uses ``"order_manager"`` as ``device_timer`` value + will be misleading becaused the tasks submitted by the timer will not + be ordered with respect to tasks we intend to time. + + Note, that host timer effectively measures the time of task + submissions. To measure host timer wall-time that includes execution + of submitted tasks, make sure to include synchronization point in + the timed block. + + :Example: + .. code-block:: python + + with timer(q): + timed_block + q.wait() + Args: host_timer (callable, optional): A callable such that host_timer() returns current @@ -134,7 +163,7 @@ class SyclTimer: device_timer (Literal["queue_barrier", "order_manager"], optional): Device timing method. Default: "queue_barrier". time_scale (Union[int, float], optional): - Ratio of the unit of time of interest and one second. + Ratio of one second and the unit of time-scale of interest. Default: ``1``. """