diff --git a/source/adapters/level_zero/event.cpp b/source/adapters/level_zero/event.cpp
index f58db37753..30840f5385 100644
--- a/source/adapters/level_zero/event.cpp
+++ b/source/adapters/level_zero/event.cpp
@@ -171,48 +171,63 @@ ur_result_t urEnqueueEventsWaitWithBarrier(
   std::scoped_lock<ur_shared_mutex> lock(Queue->Mutex);
 
   // Helper function for appending a barrier to a command list.
-  auto insertBarrierIntoCmdList =
-      [&Queue](ur_command_list_ptr_t CmdList,
-               const _ur_ze_event_list_t &EventWaitList,
-               ur_event_handle_t &Event, bool IsInternal) {
-        UR_CALL(createEventAndAssociateQueue(
-            Queue, &Event, UR_COMMAND_EVENTS_WAIT_WITH_BARRIER, CmdList,
-            IsInternal, false));
-
-        Event->WaitList = EventWaitList;
-
-        // For in-order queue we don't need a real barrier, just wait for
-        // requested events in potentially different queues and add a "barrier"
-        // event signal because it is already guaranteed that previous commands
-        // in this queue are completed when the signal is started.
-        //
-        // Only consideration here is that when profiling is used, signalEvent
-        // cannot be used if EventWaitList.Lenght == 0. In those cases, we need
-        // to fallback directly to barrier to have correct timestamps. See here:
-        // https://spec.oneapi.io/level-zero/latest/core/api.html?highlight=appendsignalevent#_CPPv430zeCommandListAppendSignalEvent24ze_command_list_handle_t17ze_event_handle_t
-        //
-        // TODO: this and other special handling of in-order queues to be
-        // updated when/if Level Zero adds native support for in-order queues.
-        //
-        if (Queue->isInOrderQueue() && InOrderBarrierBySignal &&
-            !Queue->isProfilingEnabled()) {
-          // If we are using driver in order lists, then append wait on events
-          // is unnecessary and we can signal the event created.
-          if (EventWaitList.Length && !CmdList->second.IsInOrderList) {
-            ZE2UR_CALL(zeCommandListAppendWaitOnEvents,
-                       (CmdList->first, EventWaitList.Length,
-                        EventWaitList.ZeEventList));
+  auto insertBarrierIntoCmdList = [&Queue](ur_command_list_ptr_t CmdList,
+                                           _ur_ze_event_list_t &EventWaitList,
+                                           ur_event_handle_t &Event,
+                                           bool IsInternal) {
+    UR_CALL(createEventAndAssociateQueue(Queue, &Event,
+                                         UR_COMMAND_EVENTS_WAIT_WITH_BARRIER,
+                                         CmdList, IsInternal, false));
+
+    Event->WaitList = EventWaitList;
+
+    // For in-order queue we don't need a real barrier, just wait for
+    // requested events in potentially different queues and add a "barrier"
+    // event signal because it is already guaranteed that previous commands
+    // in this queue are completed when the signal is started.
+    //
+    // Only consideration here is that when profiling is used, signalEvent
+    // cannot be used if EventWaitList.Length == 0. In those cases, we need
+    // to fall back directly to barrier to have correct timestamps. See here:
+    // https://spec.oneapi.io/level-zero/latest/core/api.html?highlight=appendsignalevent#_CPPv430zeCommandListAppendSignalEvent24ze_command_list_handle_t17ze_event_handle_t
+    //
+    // TODO: this and other special handling of in-order queues to be
+    // updated when/if Level Zero adds native support for in-order queues.
+    //
+    if (Queue->isInOrderQueue() && InOrderBarrierBySignal &&
+        !Queue->isProfilingEnabled()) {
+      // If we are using driver in-order lists, appending a wait on events
+      // is unnecessary if the cmdlists match.
+      if (EventWaitList.Length) {
+        if (CmdList->second.IsInOrderList) {
+          for (unsigned i = EventWaitList.Length; i-- > 0;) {
+            // If the event is from the same cmdlist, we can remove it
+            // from the waitlist.
+            if (EventWaitList.UrEventList[i]->CommandList == CmdList) {
+              EventWaitList.Length--;
+              if (EventWaitList.Length != i) {
+                std::swap(EventWaitList.UrEventList[i],
+                          EventWaitList.UrEventList[EventWaitList.Length]);
+                std::swap(EventWaitList.ZeEventList[i],
+                          EventWaitList.ZeEventList[EventWaitList.Length]);
+              }
+            }
           }
-          ZE2UR_CALL(zeCommandListAppendSignalEvent,
-                     (CmdList->first, Event->ZeEvent));
-        } else {
-          ZE2UR_CALL(zeCommandListAppendBarrier,
-                     (CmdList->first, Event->ZeEvent, EventWaitList.Length,
-                      EventWaitList.ZeEventList));
         }
+        ZE2UR_CALL(
+            zeCommandListAppendWaitOnEvents,
+            (CmdList->first, EventWaitList.Length, EventWaitList.ZeEventList));
+      }
+      ZE2UR_CALL(zeCommandListAppendSignalEvent,
+                 (CmdList->first, Event->ZeEvent));
+    } else {
+      ZE2UR_CALL(zeCommandListAppendBarrier,
+                 (CmdList->first, Event->ZeEvent, EventWaitList.Length,
+                  EventWaitList.ZeEventList));
+    }
 
-        return UR_RESULT_SUCCESS;
-      };
+    return UR_RESULT_SUCCESS;
+  };
 
   // If the queue is in-order then each command in it effectively acts as a
   // barrier, so we don't need to do anything except if we were requested
@@ -349,9 +364,9 @@ ur_result_t urEnqueueEventsWaitWithBarrier(
     // command-lists.
     std::vector<ur_event_handle_t> EventWaitVector(CmdLists.size());
     for (size_t I = 0; I < CmdLists.size(); ++I) {
-      UR_CALL(insertBarrierIntoCmdList(CmdLists[I], _ur_ze_event_list_t{},
-                                       EventWaitVector[I],
-                                       true /*IsInternal*/));
+      _ur_ze_event_list_t waitlist;
+      UR_CALL(insertBarrierIntoCmdList(
+          CmdLists[I], waitlist, EventWaitVector[I], true /*IsInternal*/));
     }
     // If there were multiple queues we need to create a "convergence" event to
     // be our active barrier. This convergence event is signalled by a barrier
@@ -376,8 +391,9 @@ ur_result_t urEnqueueEventsWaitWithBarrier(
     // If there is only a single queue then insert a barrier and the single
     // result event can be used as our active barrier and used as the return
     // event. Take into account whether output event is discarded or not.
-    UR_CALL(insertBarrierIntoCmdList(CmdLists[0], _ur_ze_event_list_t{},
-                                     ResultEvent, IsInternal));
+    _ur_ze_event_list_t waitlist;
+    UR_CALL(insertBarrierIntoCmdList(CmdLists[0], waitlist, ResultEvent,
+                                     IsInternal));
   }
 
   // Execute each command list so the barriers can be encountered.
diff --git a/test/adapters/level_zero/CMakeLists.txt b/test/adapters/level_zero/CMakeLists.txt
index b1c34b8916..a497b8e1a2 100644
--- a/test/adapters/level_zero/CMakeLists.txt
+++ b/test/adapters/level_zero/CMakeLists.txt
@@ -15,6 +15,7 @@ if(UR_BUILD_ADAPTER_L0)
         SOURCES
             urProgramLink.cpp
             urKernelCreateWithNativeHandle.cpp
+            urEventCreateWithNativeHandle.cpp
         ENVIRONMENT
             "UR_ADAPTERS_FORCE_LOAD=\"$<TARGET_FILE:ur_adapter_level_zero>\""
     )
diff --git a/test/adapters/level_zero/urEventCreateWithNativeHandle.cpp b/test/adapters/level_zero/urEventCreateWithNativeHandle.cpp
new file mode 100644
index 0000000000..7e667bfe30
--- /dev/null
+++ b/test/adapters/level_zero/urEventCreateWithNativeHandle.cpp
@@ -0,0 +1,109 @@
+// Copyright (C) 2024 Intel Corporation
+// Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions.
+// See LICENSE.TXT
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "ur_api.h"
+#include "uur/checks.h"
+#include "ze_api.h"
+#include <chrono>
+#include <thread>
+#include <uur/fixtures.h>
+
+using namespace std::chrono_literals;
+using urLevelZeroEventNativeHandleTest = uur::urQueueTest;
+UUR_INSTANTIATE_DEVICE_TEST_SUITE_P(urLevelZeroEventNativeHandleTest);
+
+#define TEST_MEMCPY_SIZE 4096
+
+TEST_P(urLevelZeroEventNativeHandleTest, WaitForNative) {
+  ze_event_pool_desc_t desc;
+  desc.stype = ZE_STRUCTURE_TYPE_EVENT_POOL_DESC;
+  desc.pNext = nullptr;
+  desc.count = 1;
+  desc.flags = ZE_EVENT_POOL_FLAG_HOST_VISIBLE;
+
+  ur_native_handle_t nativeContext;
+  ASSERT_SUCCESS(urContextGetNativeHandle(context, &nativeContext));
+
+  ur_native_handle_t nativeDevice;
+  ASSERT_SUCCESS(urDeviceGetNativeHandle(device, &nativeDevice));
+
+  ze_event_pool_handle_t pool = nullptr;
+
+  ASSERT_EQ(zeEventPoolCreate((ze_context_handle_t)nativeContext, &desc, 1,
+                              (ze_device_handle_t *)&nativeDevice, &pool),
+            ZE_RESULT_SUCCESS);
+
+  ze_event_desc_t eventDesc;
+  eventDesc.pNext = nullptr;
+  eventDesc.stype = ZE_STRUCTURE_TYPE_EVENT_DESC;
+  eventDesc.index = 0;
+  eventDesc.signal = ZE_EVENT_SCOPE_FLAG_HOST;
+  eventDesc.wait = 0;
+
+  ze_event_handle_t zeEvent;
+  ASSERT_EQ(zeEventCreate(pool, &eventDesc, &zeEvent), ZE_RESULT_SUCCESS);
+
+  ur_event_native_properties_t pprops;
+  pprops.isNativeHandleOwned = false;
+  pprops.pNext = nullptr;
+  pprops.stype = UR_STRUCTURE_TYPE_EVENT_NATIVE_PROPERTIES;
+
+  ur_event_handle_t urEvent;
+  ASSERT_SUCCESS(urEventCreateWithNativeHandle((ur_native_handle_t)zeEvent,
+                                               context, &pprops, &urEvent));
+
+  int *src = (int *)malloc(TEST_MEMCPY_SIZE);
+  memset(src, 0xc, TEST_MEMCPY_SIZE);
+
+  int *dst = (int *)malloc(TEST_MEMCPY_SIZE);
+  memset(dst, 0, TEST_MEMCPY_SIZE);
+
+  int *dst2 = (int *)malloc(TEST_MEMCPY_SIZE);
+  memset(dst2, 0, TEST_MEMCPY_SIZE);
+
+  ur_event_handle_t memcpyEvent2;
+  ASSERT_SUCCESS(urEnqueueUSMMemcpy(queue, false, dst2, src, TEST_MEMCPY_SIZE,
+                                    0, nullptr, &memcpyEvent2));
+
+  ur_event_handle_t memcpyEvent3;
+  ASSERT_SUCCESS(urEnqueueUSMMemcpy(queue, false, dst2, src, TEST_MEMCPY_SIZE,
+                                    0, nullptr, &memcpyEvent3));
+
+  // just to make wait lists contain more than 1 event
+  ur_event_handle_t events[] = {memcpyEvent2, urEvent, memcpyEvent3};
+
+  ur_event_handle_t waitEvent;
+  ASSERT_SUCCESS(
+      urEnqueueEventsWaitWithBarrier(queue, 3, events, &waitEvent));
+
+  ur_event_handle_t memcpyEvent;
+  ASSERT_SUCCESS(urEnqueueUSMMemcpy(queue, false, dst, src, TEST_MEMCPY_SIZE,
+                                    1, &waitEvent, &memcpyEvent));
+
+  // urQueueFinish would hang, so we flush and then wait
+  // some time to make sure the gpu had plenty of time
+  // to do the memcpy.
+  urQueueFlush(queue);
+  std::this_thread::sleep_for(500ms);
+
+  ASSERT_NE(memcmp(src, dst, TEST_MEMCPY_SIZE), 0);
+
+  zeEventHostSignal(zeEvent);
+
+  urQueueFinish(queue);
+
+  ASSERT_EQ(memcmp(src, dst, TEST_MEMCPY_SIZE), 0);
+
+  free(src);
+  free(dst);
+  free(dst2);
+  urEventRelease(urEvent);
+  urEventRelease(waitEvent);
+  urEventRelease(memcpyEvent);
+  urEventRelease(memcpyEvent2);
+  urEventRelease(memcpyEvent3);
+  zeEventDestroy(zeEvent);
+  zeEventPoolDestroy(pool);
+}
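
Note on the waitlist pruning added to insertBarrierIntoCmdList above: an event recorded on the same in-order command list is dropped by swapping it with the last live entry and decrementing Length, and the loop runs backwards so the entry swapped into slot i has already been examined. Both parallel arrays (UrEventList and ZeEventList) are swapped with the same indices, so they stay paired. The following is a minimal standalone sketch of that pattern only; FakeEvent, FakeWaitList and pruneSameCmdListEvents are hypothetical stand-ins for illustration, not the adapter's real _ur_ze_event_list_t.

  #include <cassert>
  #include <utility>
  #include <vector>

  // Simplified stand-ins for the adapter's bookkeeping (illustration only).
  struct FakeEvent {
    int CommandList; // id of the command list the event was recorded on
  };

  struct FakeWaitList {
    std::vector<FakeEvent *> UrEventList;
    std::vector<int> ZeEventList; // index-parallel with UrEventList
    unsigned Length = 0;          // number of "live" entries at the front
  };

  // Swap-with-last removal of events recorded on SameCmdList, mirroring the
  // loop in the patch. Iterating backwards guarantees that the entry swapped
  // into slot i was already checked on an earlier iteration.
  void pruneSameCmdListEvents(FakeWaitList &WaitList, int SameCmdList) {
    for (unsigned i = WaitList.Length; i-- > 0;) {
      if (WaitList.UrEventList[i]->CommandList == SameCmdList) {
        WaitList.Length--;
        if (WaitList.Length != i) {
          std::swap(WaitList.UrEventList[i],
                    WaitList.UrEventList[WaitList.Length]);
          std::swap(WaitList.ZeEventList[i],
                    WaitList.ZeEventList[WaitList.Length]);
        }
      }
    }
  }

  int main() {
    FakeEvent a{1}, b{2}, c{1};
    FakeWaitList wl;
    wl.UrEventList = {&a, &b, &c};
    wl.ZeEventList = {10, 20, 30};
    wl.Length = 3;

    pruneSameCmdListEvents(wl, /*SameCmdList=*/1);

    // Only the event from command list 2 survives in the live prefix, and
    // its Level Zero handle stays paired with it.
    assert(wl.Length == 1);
    assert(wl.UrEventList[0]->CommandList == 2);
    assert(wl.ZeEventList[0] == 20);
    return 0;
  }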