From 39483ab51fb67b3ac50a82001a719ed261f57596 Mon Sep 17 00:00:00 2001 From: Lorenc Bushi Date: Tue, 19 Nov 2024 16:15:10 -0500 Subject: [PATCH 1/4] [SYCL] Add support for work group memory free function kernel parameter (#15861) This PR concludes the implementation of the work group memory [extension](https://github.com/intel/llvm/blob/sycl/sycl/doc/extensions/proposed/sycl_ext_oneapi_work_group_memory.asciidoc). It adds support for work group memory parameters when using free function kernels. --------- Co-authored-by: lorenc.bushi --- clang/lib/Sema/SemaSYCL.cpp | 141 +++++++++++++++--- .../CodeGenSYCL/free_function_int_header.cpp | 40 ++++- .../free_function_kernel_params.cpp | 17 ++- .../SemaSYCL/free_function_kernel_params.cpp | 22 ++- ...sycl_ext_oneapi_work_group_memory.asciidoc | 10 +- .../oneapi/experimental/work_group_memory.hpp | 4 +- sycl/include/sycl/handler.hpp | 4 +- sycl/source/feature_test.hpp.in | 1 + .../WorkGroupMemory/common_free_function.hpp | 1 - .../reduction_free_function.cpp | 3 - 10 files changed, 210 insertions(+), 33 deletions(-) rename sycl/doc/extensions/{proposed => experimental}/sycl_ext_oneapi_work_group_memory.asciidoc (98%) diff --git a/clang/lib/Sema/SemaSYCL.cpp b/clang/lib/Sema/SemaSYCL.cpp index a4cf8c20058f8..242e6c8a9d7d4 100644 --- a/clang/lib/Sema/SemaSYCL.cpp +++ b/clang/lib/Sema/SemaSYCL.cpp @@ -1522,7 +1522,7 @@ class KernelObjVisitor { void visitParam(ParmVarDecl *Param, QualType ParamTy, HandlerTys &...Handlers) { if (isSyclSpecialType(ParamTy, SemaSYCLRef)) - KP_FOR_EACH(handleOtherType, Param, ParamTy); + KP_FOR_EACH(handleSyclSpecialType, Param, ParamTy); else if (ParamTy->isStructureOrClassType()) { if (KP_FOR_EACH(handleStructType, Param, ParamTy)) { CXXRecordDecl *RD = ParamTy->getAsCXXRecordDecl(); @@ -2075,8 +2075,11 @@ class SyclKernelFieldChecker : public SyclKernelFieldHandler { } bool handleSyclSpecialType(ParmVarDecl *PD, QualType ParamTy) final { - Diag.Report(PD->getLocation(), diag::err_bad_kernel_param_type) << ParamTy; - IsInvalid = true; + if (!SemaSYCL::isSyclType(ParamTy, SYCLTypeAttr::work_group_memory)) { + Diag.Report(PD->getLocation(), diag::err_bad_kernel_param_type) + << ParamTy; + IsInvalid = true; + } return isValid(); } @@ -2228,8 +2231,8 @@ class SyclKernelUnionChecker : public SyclKernelFieldHandler { } bool handleSyclSpecialType(ParmVarDecl *PD, QualType ParamTy) final { - // TODO - unsupportedFreeFunctionParamType(); + if (!SemaSYCL::isSyclType(ParamTy, SYCLTypeAttr::work_group_memory)) + unsupportedFreeFunctionParamType(); // TODO return true; } @@ -3013,9 +3016,26 @@ class SyclKernelDeclCreator : public SyclKernelFieldHandler { return handleSpecialType(FD, FieldTy); } - bool handleSyclSpecialType(ParmVarDecl *, QualType) final { - // TODO - unsupportedFreeFunctionParamType(); + bool handleSyclSpecialType(ParmVarDecl *PD, QualType ParamTy) final { + if (SemaSYCL::isSyclType(ParamTy, SYCLTypeAttr::work_group_memory)) { + const auto *RecordDecl = ParamTy->getAsCXXRecordDecl(); + assert(RecordDecl && "The type must be a RecordDecl"); + CXXMethodDecl *InitMethod = getMethodByName(RecordDecl, InitMethodName); + assert(InitMethod && "The type must have the __init method"); + // Don't do -1 here because we count on this to be the first parameter + // added (if any). + size_t ParamIndex = Params.size(); + for (const ParmVarDecl *Param : InitMethod->parameters()) { + QualType ParamTy = Param->getType(); + addParam(Param, ParamTy.getCanonicalType()); + // Propagate add_ir_attributes_kernel_parameter attribute. + if (const auto *AddIRAttr = + Param->getAttr()) + Params.back()->addAttr(AddIRAttr->clone(SemaSYCLRef.getASTContext())); + } + LastParamIndex = ParamIndex; + } else // TODO + unsupportedFreeFunctionParamType(); return true; } @@ -3291,9 +3311,7 @@ class SyclKernelArgsSizeChecker : public SyclKernelFieldHandler { } bool handleSyclSpecialType(ParmVarDecl *PD, QualType ParamTy) final { - // TODO - unsupportedFreeFunctionParamType(); - return true; + return handleSpecialType(ParamTy); } bool handleSyclSpecialType(const CXXRecordDecl *, const CXXBaseSpecifier &BS, @@ -4442,6 +4460,45 @@ class FreeFunctionKernelBodyCreator : public SyclKernelFieldHandler { {}); } + MemberExpr *buildMemberExpr(Expr *Base, ValueDecl *Member) { + DeclAccessPair MemberDAP = DeclAccessPair::make(Member, AS_none); + MemberExpr *Result = SemaSYCLRef.SemaRef.BuildMemberExpr( + Base, /*IsArrow */ false, FreeFunctionSrcLoc, NestedNameSpecifierLoc(), + FreeFunctionSrcLoc, Member, MemberDAP, + /*HadMultipleCandidates*/ false, + DeclarationNameInfo(Member->getDeclName(), FreeFunctionSrcLoc), + Member->getType(), VK_LValue, OK_Ordinary); + return Result; + } + + void createSpecialMethodCall(const CXXRecordDecl *RD, StringRef MethodName, + Expr *MemberBaseExpr, + SmallVectorImpl &AddTo) { + CXXMethodDecl *Method = getMethodByName(RD, MethodName); + if (!Method) + return; + unsigned NumParams = Method->getNumParams(); + llvm::SmallVector ParamDREs(NumParams); + llvm::ArrayRef KernelParameters = + DeclCreator.getParamVarDeclsForCurrentField(); + for (size_t I = 0; I < NumParams; ++I) { + QualType ParamType = KernelParameters[I]->getOriginalType(); + ParamDREs[I] = SemaSYCLRef.SemaRef.BuildDeclRefExpr( + KernelParameters[I], ParamType, VK_LValue, FreeFunctionSrcLoc); + } + MemberExpr *MethodME = buildMemberExpr(MemberBaseExpr, Method); + QualType ResultTy = Method->getReturnType(); + ExprValueKind VK = Expr::getValueKindForType(ResultTy); + ResultTy = ResultTy.getNonLValueExprType(SemaSYCLRef.getASTContext()); + llvm::SmallVector ParamStmts; + const auto *Proto = cast(Method->getType()); + SemaSYCLRef.SemaRef.GatherArgumentsForCall(FreeFunctionSrcLoc, Method, + Proto, 0, ParamDREs, ParamStmts); + AddTo.push_back(CXXMemberCallExpr::Create( + SemaSYCLRef.getASTContext(), MethodME, ParamStmts, ResultTy, VK, + FreeFunctionSrcLoc, FPOptionsOverride())); + } + public: static constexpr const bool VisitInsideSimpleContainers = false; @@ -4461,9 +4518,53 @@ class FreeFunctionKernelBodyCreator : public SyclKernelFieldHandler { return true; } - bool handleSyclSpecialType(ParmVarDecl *, QualType) final { - // TODO - unsupportedFreeFunctionParamType(); + // Default inits the type, then calls the init-method in the body. + // A type may not have a public default constructor as per its spec so + // typically if this is the case the default constructor will be private and + // in such cases we must manually override the access specifier from private + // to public just for the duration of this default initialization. + // TODO: Revisit this approach once https://github.com/intel/llvm/issues/16061 + // is closed. + bool handleSyclSpecialType(ParmVarDecl *PD, QualType ParamTy) final { + if (SemaSYCL::isSyclType(ParamTy, SYCLTypeAttr::work_group_memory)) { + const auto *RecordDecl = ParamTy->getAsCXXRecordDecl(); + AccessSpecifier DefaultConstructorAccess; + auto DefaultConstructor = + std::find_if(RecordDecl->ctor_begin(), RecordDecl->ctor_end(), + [](auto it) { return it->isDefaultConstructor(); }); + DefaultConstructorAccess = DefaultConstructor->getAccess(); + DefaultConstructor->setAccess(AS_public); + + QualType Ty = PD->getOriginalType(); + ASTContext &Ctx = SemaSYCLRef.SemaRef.getASTContext(); + VarDecl *WorkGroupMemoryClone = VarDecl::Create( + Ctx, DeclCreator.getKernelDecl(), FreeFunctionSrcLoc, + FreeFunctionSrcLoc, PD->getIdentifier(), PD->getType(), + Ctx.getTrivialTypeSourceInfo(Ty), SC_None); + InitializedEntity VarEntity = + InitializedEntity::InitializeVariable(WorkGroupMemoryClone); + InitializationKind InitKind = + InitializationKind::CreateDefault(FreeFunctionSrcLoc); + InitializationSequence InitSeq(SemaSYCLRef.SemaRef, VarEntity, InitKind, + std::nullopt); + ExprResult Init = InitSeq.Perform(SemaSYCLRef.SemaRef, VarEntity, + InitKind, std::nullopt); + WorkGroupMemoryClone->setInit( + SemaSYCLRef.SemaRef.MaybeCreateExprWithCleanups(Init.get())); + WorkGroupMemoryClone->setInitStyle(VarDecl::CallInit); + DefaultConstructor->setAccess(DefaultConstructorAccess); + + Stmt *DS = new (SemaSYCLRef.getASTContext()) + DeclStmt(DeclGroupRef(WorkGroupMemoryClone), FreeFunctionSrcLoc, + FreeFunctionSrcLoc); + BodyStmts.push_back(DS); + Expr *MemberBaseExpr = SemaSYCLRef.SemaRef.BuildDeclRefExpr( + WorkGroupMemoryClone, Ty, VK_PRValue, FreeFunctionSrcLoc); + createSpecialMethodCall(RecordDecl, InitMethodName, MemberBaseExpr, + BodyStmts); + ArgExprs.push_back(MemberBaseExpr); + } else // TODO + unsupportedFreeFunctionParamType(); return true; } @@ -4748,9 +4849,11 @@ class SyclKernelIntHeaderCreator : public SyclKernelFieldHandler { return true; } - bool handleSyclSpecialType(ParmVarDecl *, QualType) final { - // TODO - unsupportedFreeFunctionParamType(); + bool handleSyclSpecialType(ParmVarDecl *PD, QualType ParamTy) final { + if (SemaSYCL::isSyclType(ParamTy, SYCLTypeAttr::work_group_memory)) + addParam(PD, ParamTy, SYCLIntegrationHeader::kind_work_group_memory); + else + unsupportedFreeFunctionParamType(); // TODO return true; } @@ -6227,7 +6330,6 @@ void SYCLIntegrationHeader::emit(raw_ostream &O) { O << "#include \n"; O << "#include \n"; O << "#include \n"; - O << "\n"; LangOptions LO; @@ -6502,6 +6604,7 @@ void SYCLIntegrationHeader::emit(raw_ostream &O) { O << "\n"; O << "// Forward declarations of kernel and its argument types:\n"; + Policy.SuppressDefaultTemplateArgs = false; FwdDeclEmitter.Visit(K.SyclKernel->getType()); O << "\n"; @@ -6579,6 +6682,8 @@ void SYCLIntegrationHeader::emit(raw_ostream &O) { } O << ";\n"; O << "}\n"; + Policy.SuppressDefaultTemplateArgs = true; + Policy.EnforceDefaultTemplateArgs = false; // Generate is_kernel, is_single_task_kernel and nd_range_kernel functions. O << "namespace sycl {\n"; diff --git a/clang/test/CodeGenSYCL/free_function_int_header.cpp b/clang/test/CodeGenSYCL/free_function_int_header.cpp index ccaf85aa897ca..6a196dedc2fc2 100644 --- a/clang/test/CodeGenSYCL/free_function_int_header.cpp +++ b/clang/test/CodeGenSYCL/free_function_int_header.cpp @@ -2,7 +2,7 @@ // RUN: FileCheck -input-file=%t.h %s // // This test checks integration header contents for free functions with scalar, -// pointer and non-decomposed struct parameters. +// pointer, non-decomposed struct parameters and work group memory parameters. #include "mock_properties.hpp" #include "sycl.hpp" @@ -96,6 +96,12 @@ void ff_7(KArgWithPtrArray KArg) { template void ff_7(KArgWithPtrArray KArg); +__attribute__((sycl_device)) +[[__sycl_detail__::add_ir_attributes_function("sycl-nd-range-kernel", 0)]] +void ff_8(sycl::work_group_memory) { +} + + // CHECK: const char* const kernel_names[] = { // CHECK-NEXT: {{.*}}__sycl_kernel_ff_2Piii // CHECK-NEXT: {{.*}}__sycl_kernel_ff_2Piiii @@ -105,6 +111,7 @@ template void ff_7(KArgWithPtrArray KArg); // CHECK-NEXT: {{.*}}__sycl_kernel_ff_410NoPointers8Pointers3Agg // CHECK-NEXT: {{.*}}__sycl_kernel_ff_6I3Agg7DerivedEvT_T0_i // CHECK-NEXT: {{.*}}__sycl_kernel_ff_7ILi3EEv16KArgWithPtrArrayIXT_EE +// CHECK-NEXT: {{.*}}__sycl_kernel_ff_8N4sycl3_V117work_group_memoryIiEE // CHECK-NEXT: "" // CHECK-NEXT: }; @@ -148,6 +155,9 @@ template void ff_7(KArgWithPtrArray KArg); // CHECK: //--- _Z18__sycl_kernel_ff_7ILi3EEv16KArgWithPtrArrayIXT_EE // CHECK-NEXT: { kernel_param_kind_t::kind_std_layout, 48, 0 }, +// CHECK: //--- _Z18__sycl_kernel_ff_8N4sycl3_V117work_group_memoryIiEE +// CHECK-NEXT: { kernel_param_kind_t::kind_work_group_memory, 8, 0 }, + // CHECK: { kernel_param_kind_t::kind_invalid, -987654321, -987654321 }, // CHECK-NEXT: }; @@ -294,6 +304,26 @@ template void ff_7(KArgWithPtrArray KArg); // CHECK-NEXT: }; // CHECK-NEXT: } +// CHECK: Definition of _Z18__sycl_kernel_ff_8N4sycl3_V117work_group_memoryIiEE as a free function kernel + +// CHECK: Forward declarations of kernel and its argument types: +// CHECK: template class work_group_memory; + +// CHECK: void ff_8(sycl::work_group_memory); +// CHECK-NEXT: static constexpr auto __sycl_shim9() { +// CHECK-NEXT: return (void (*)(class sycl::work_group_memory))ff_8; +// CHECK-NEXT: } +// CHECK-NEXT: namespace sycl { +// CHECK-NEXT: template <> +// CHECK-NEXT: struct ext::oneapi::experimental::is_kernel<__sycl_shim9()> { +// CHECK-NEXT: static constexpr bool value = true; +// CHECK-NEXT: }; +// CHECK-NEXT: template <> +// CHECK-NEXT: struct ext::oneapi::experimental::is_single_task_kernel<__sycl_shim9()> { +// CHECK-NEXT: static constexpr bool value = true; +// CHECK-NEXT: }; +// CHECK-NEXT: } + // CHECK: #include // CHECK: Definition of kernel_id of _Z18__sycl_kernel_ff_2Piii @@ -359,3 +389,11 @@ template void ff_7(KArgWithPtrArray KArg); // CHECK-NEXT: return sycl::detail::get_kernel_id_impl(std::string_view{"_Z18__sycl_kernel_ff_7ILi3EEv16KArgWithPtrArrayIXT_EE"}); // CHECK-NEXT: } // CHECK-NEXT: } + +// CHECK: Definition of kernel_id of _Z18__sycl_kernel_ff_8N4sycl3_V117work_group_memoryIiEE +// CHECK-NEXT: namespace sycl { +// CHECK-NEXT: template <> +// CHECK-NEXT: kernel_id ext::oneapi::experimental::get_kernel_id<__sycl_shim9()>() { +// CHECK-NEXT: return sycl::detail::get_kernel_id_impl(std::string_view{"_Z18__sycl_kernel_ff_8N4sycl3_V117work_group_memoryIiEE"}); +// CHECK-NEXT: } +// CHECK-NEXT: } diff --git a/clang/test/CodeGenSYCL/free_function_kernel_params.cpp b/clang/test/CodeGenSYCL/free_function_kernel_params.cpp index a11d55f483966..2e78116824ad2 100644 --- a/clang/test/CodeGenSYCL/free_function_kernel_params.cpp +++ b/clang/test/CodeGenSYCL/free_function_kernel_params.cpp @@ -1,7 +1,7 @@ // RUN: %clang_cc1 -internal-isystem %S/Inputs -fsycl-is-device -triple spir64 \ // RUN: -emit-llvm %s -o - | FileCheck %s // This test checks parameter IR generation for free functions with parameters -// of non-decomposed struct type. +// of non-decomposed struct type and work group memory type. #include "sycl.hpp" @@ -56,3 +56,18 @@ template void ff_6(KArgWithPtrArray KArg); // CHECK: %struct.KArgWithPtrArray = type { [3 x ptr addrspace(4)], [3 x i32], [3 x i32] } // CHECK: define dso_local spir_kernel void @{{.*}}__sycl_kernel{{.*}}(ptr noundef byval(%struct.NoPointers) align 4 %__arg_S1, ptr noundef byval(%struct.__generated_Pointers) align 8 %__arg_S2, ptr noundef byval(%struct.__generated_Agg) align 8 %__arg_S3) // CHECK: define dso_local spir_kernel void @{{.*}}__sycl_kernel_ff_6{{.*}}(ptr noundef byval(%struct.__generated_KArgWithPtrArray) align 8 %__arg_KArg) + +__attribute__((sycl_device)) +[[__sycl_detail__::add_ir_attributes_function("sycl-nd-range-kernel", 0)]] +void ff_7(sycl::work_group_memory mem) { +} + +// CHECK: define dso_local spir_kernel void @{{.*}}__sycl_kernel_ff_7{{.*}}(ptr addrspace(3) noundef align 4 %__arg_Ptr) +// CHECK: %__arg_Ptr.addr = alloca ptr addrspace(3), align 8 +// CHECK-NEXT: %mem = alloca %"class.sycl::_V1::work_group_memory", align 8 +// CHECK: %__arg_Ptr.addr.ascast = addrspacecast ptr %__arg_Ptr.addr to ptr addrspace(4) +// CHECK-NEXT: %mem.ascast = addrspacecast ptr %mem to ptr addrspace(4) +// CHECK: store ptr addrspace(3) %__arg_Ptr, ptr addrspace(4) %__arg_Ptr.addr.ascast, align 8 +// CHECK-NEXT: [[REGISTER:%[a-zA-Z0-9_]+]] = load ptr addrspace(3), ptr addrspace(4) %__arg_Ptr.addr.ascast, align 8 +// CHECK-NEXT: call spir_func void @{{.*}}work_group_memory{{.*}}__init{{.*}}(ptr addrspace(4) noundef align 8 dereferenceable_or_null(8) %mem.ascast, ptr addrspace(3) noundef [[REGISTER]]) + diff --git a/clang/test/SemaSYCL/free_function_kernel_params.cpp b/clang/test/SemaSYCL/free_function_kernel_params.cpp index 2de4f896a1513..da229145a34ad 100644 --- a/clang/test/SemaSYCL/free_function_kernel_params.cpp +++ b/clang/test/SemaSYCL/free_function_kernel_params.cpp @@ -1,7 +1,7 @@ // RUN: %clang_cc1 -internal-isystem %S/Inputs -fsycl-is-device -ast-dump \ // RUN: %s -o - | FileCheck %s // This test checks parameter rewriting for free functions with parameters -// of type scalar, pointer and non-decomposed struct. +// of type scalar, pointer, non-decomposed struct and work group memory. #include "sycl.hpp" @@ -171,3 +171,23 @@ template void ff_6(Agg S1, Derived1 S2, int); // CHECK-NEXT: DeclRefExpr {{.*}} '__generated_Derived1' lvalue ParmVar {{.*}} '__arg_S2' '__generated_Derived1' // CHECK-NEXT: ImplicitCastExpr {{.*}} 'int' // CHECK-NEXT: DeclRefExpr {{.*}} 'int' lvalue ParmVar {{.*}} '__arg_end' 'int' + +__attribute__((sycl_device)) +[[__sycl_detail__::add_ir_attributes_function("sycl-nd-range-kernel", 0)]] +void ff_7(sycl::work_group_memory mem) { +} +// CHECK: FunctionDecl {{.*}}__sycl_kernel{{.*}}'void (__local int *)' +// CHECK-NEXT: ParmVarDecl {{.*}} used __arg_Ptr '__local int *' +// CHECK-NEXT: CompoundStmt +// CHECK-NEXT: DeclStmt +// CHECK-NEXT: VarDecl {{.*}} used mem 'sycl::work_group_memory' callinit +// CHECK-NEXT: CXXConstructExpr {{.*}} 'sycl::work_group_memory' 'void () noexcept' +// CHECK-NEXT: CXXMemberCallExpr {{.*}} 'void' +// CHECK-NEXT: MemberExpr {{.*}} 'void (__local int *)' lvalue .__init +// CHECK-NEXT: DeclRefExpr {{.*}} 'sycl::work_group_memory' Var {{.*}} 'mem' 'sycl::work_group_memory' +// CHECK-NEXT: ImplicitCastExpr {{.*}} '__local int *' +// CHECK-NEXT: DeclRefExpr {{.*}} '__local int *' lvalue ParmVar {{.*}} '__arg_Ptr' '__local int *' +// CHECK-NEXT: CallExpr {{.*}} 'void' +// CHECK-NEXT: ImplicitCastExpr {{.*}} 'void (*)(sycl::work_group_memory)' +// CHECK-NEXT: DeclRefExpr {{.*}} 'void (sycl::work_group_memory)' lvalue Function {{.*}} 'ff_7' 'void (sycl::work_group_memory)' +// CHECK-NEXT: DeclRefExpr {{.*}} 'sycl::work_group_memory' Var {{.*}} 'mem' 'sycl::work_group_memory' diff --git a/sycl/doc/extensions/proposed/sycl_ext_oneapi_work_group_memory.asciidoc b/sycl/doc/extensions/experimental/sycl_ext_oneapi_work_group_memory.asciidoc similarity index 98% rename from sycl/doc/extensions/proposed/sycl_ext_oneapi_work_group_memory.asciidoc rename to sycl/doc/extensions/experimental/sycl_ext_oneapi_work_group_memory.asciidoc index 296b77acf82fb..2cbc9d0b2d28b 100644 --- a/sycl/doc/extensions/proposed/sycl_ext_oneapi_work_group_memory.asciidoc +++ b/sycl/doc/extensions/experimental/sycl_ext_oneapi_work_group_memory.asciidoc @@ -49,12 +49,10 @@ This extension also depends on the following other SYCL extensions: == Status -This is a proposed extension specification, intended to gather community -feedback. -Interfaces defined in this specification may not be implemented yet or may be -in a preliminary state. -The specification itself may also change in incompatible ways before it is -finalized. +This is an experimental extension specification, intended to provide early +access to features and gather community feedback. Interfaces defined in this +specification are implemented in {dpcpp}, but they are not finalized and may +change incompatibly in future versions of {dpcpp} without prior notice. *Shipping software products should not rely on APIs defined in this specification.* diff --git a/sycl/include/sycl/ext/oneapi/experimental/work_group_memory.hpp b/sycl/include/sycl/ext/oneapi/experimental/work_group_memory.hpp index c156c484f539d..254fd8d877f8e 100644 --- a/sycl/include/sycl/ext/oneapi/experimental/work_group_memory.hpp +++ b/sycl/include/sycl/ext/oneapi/experimental/work_group_memory.hpp @@ -1,5 +1,4 @@ //===-------------------- work_group_memory.hpp ---------------------------===// -// // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception @@ -103,6 +102,9 @@ class __SYCL_SPECIAL_CLASS __SYCL_TYPE(work_group_memory) work_group_memory } private: + friend class sycl::handler; // needed in order for handler class to be aware + // of the private inheritance with + // work_group_memory_impl as base class decoratedPtr ptr = nullptr; }; } // namespace ext::oneapi::experimental diff --git a/sycl/include/sycl/handler.hpp b/sycl/include/sycl/handler.hpp index 4e8f62d53c36d..d0a9867ec4c40 100644 --- a/sycl/include/sycl/handler.hpp +++ b/sycl/include/sycl/handler.hpp @@ -1849,7 +1849,9 @@ class __SYCL_EXPORT handler { void set_arg( int ArgIndex, ext::oneapi::experimental::work_group_memory &Arg) { - setArgHelper(ArgIndex, Arg); + // slice the base class object out of Arg + detail::work_group_memory_impl &ArgImpl = Arg; + setArgHelper(ArgIndex, ArgImpl); } // set_arg for graph dynamic_parameters diff --git a/sycl/source/feature_test.hpp.in b/sycl/source/feature_test.hpp.in index 8f4fb05752efc..c1e62f5492abe 100644 --- a/sycl/source/feature_test.hpp.in +++ b/sycl/source/feature_test.hpp.in @@ -109,6 +109,7 @@ inline namespace _V1 { #define SYCL_EXT_ONEAPI_PROFILING_TAG 1 #define SYCL_EXT_ONEAPI_ENQUEUE_NATIVE_COMMAND 1 #define SYCL_EXT_ONEAPI_GET_KERNEL_INFO 1 +#define SYCL_EXT_ONEAPI_WORK_GROUP_MEMORY 1 // In progress yet #define SYCL_EXT_ONEAPI_ATOMIC16 0 diff --git a/sycl/test-e2e/WorkGroupMemory/common_free_function.hpp b/sycl/test-e2e/WorkGroupMemory/common_free_function.hpp index e13f50214593d..7cc1b6008bd78 100644 --- a/sycl/test-e2e/WorkGroupMemory/common_free_function.hpp +++ b/sycl/test-e2e/WorkGroupMemory/common_free_function.hpp @@ -1,7 +1,6 @@ #pragma once #include "common.hpp" -#include "common_lambda.hpp" #include #include #include diff --git a/sycl/test-e2e/WorkGroupMemory/reduction_free_function.cpp b/sycl/test-e2e/WorkGroupMemory/reduction_free_function.cpp index ff2aa8aa19385..1f2f5ccd0c5e1 100644 --- a/sycl/test-e2e/WorkGroupMemory/reduction_free_function.cpp +++ b/sycl/test-e2e/WorkGroupMemory/reduction_free_function.cpp @@ -5,9 +5,6 @@ // UNSUPPORTED: cuda // UNSUPPORTED-TRACKER: https://github.com/intel/llvm/issues/16004 -// XFAIL: * -// XFAIL-TRACKER: https://github.com/intel/llvm/issues/15927 - #include "common_free_function.hpp" // Basic usage reduction test using free function kernels. From f0899ff9b208221e12e98137a1514c3c4ab73c0d Mon Sep 17 00:00:00 2001 From: aelovikov-intel Date: Tue, 19 Nov 2024 22:55:08 -0800 Subject: [PATCH 2/4] [NFC][SYCL] Modernize (idiomatic C++17) `SingleNontypePropertyValueBase` (#16128) --- sycl/include/sycl/ext/oneapi/properties/property_value.hpp | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/sycl/include/sycl/ext/oneapi/properties/property_value.hpp b/sycl/include/sycl/ext/oneapi/properties/property_value.hpp index c3b825e6054d6..813e4ecf964ea 100644 --- a/sycl/include/sycl/ext/oneapi/properties/property_value.hpp +++ b/sycl/include/sycl/ext/oneapi/properties/property_value.hpp @@ -17,16 +17,11 @@ inline namespace _V1 { namespace ext::oneapi::experimental { namespace detail { -// Checks if a type T has a static value member variable. -template struct HasValue : std::false_type {}; -template -struct HasValue : std::true_type {}; - // Base class for property values with a single non-type value template struct SingleNontypePropertyValueBase {}; template -struct SingleNontypePropertyValueBase::value>> { +struct SingleNontypePropertyValueBase> { static constexpr auto value = T::value; }; From 0e2094de551ff3a2c4acb87c5ff250487867468e Mon Sep 17 00:00:00 2001 From: Udit Agarwal Date: Tue, 19 Nov 2024 23:03:23 -0800 Subject: [PATCH 3/4] [SYCL] Implement eviction for in-memory program cache (#16062) Fixes: CMPLRLLVM-27640, https://github.com/intel/llvm/issues/2517 The PR implements LRU cache eviction policy for in-memory program caches. The high-level idea is to store programs in a linked-list, called eviction list. When the program is first added to the cache, it is also added to the eviction list. When a program is fetched from cache, we move the program to the end of the eviction list. So, that the programs at the beginning of the eviction list are always least recently used. When adding a new program to cache, we check if the size of the program cache exceeds the threshold, if so, we evict the program from cache and corresponding kernels from Kernel and fast kernel cache. This PR also adds a new environment variable, `SYCL_IN_MEM_CACHE_EVICTION_THRESHOLD` that user can use to control the size of in-memory cache. By default, cache eviction is disabled. --- sycl/doc/EnvironmentVariables.md | 1 + sycl/source/detail/config.def | 2 + sycl/source/detail/config.hpp | 50 +++ sycl/source/detail/kernel_program_cache.hpp | 341 +++++++++++++++++- .../program_manager/program_manager.cpp | 24 +- sycl/unittests/assert/assert.cpp | 12 + sycl/unittests/config/ConfigTests.cpp | 63 ++++ .../kernel-and-program/CMakeLists.txt | 1 + .../kernel-and-program/InMemCacheEviction.cpp | 225 ++++++++++++ 9 files changed, 708 insertions(+), 11 deletions(-) create mode 100644 sycl/unittests/kernel-and-program/InMemCacheEviction.cpp diff --git a/sycl/doc/EnvironmentVariables.md b/sycl/doc/EnvironmentVariables.md index 3172bc2446aee..5ee2c40542ced 100644 --- a/sycl/doc/EnvironmentVariables.md +++ b/sycl/doc/EnvironmentVariables.md @@ -14,6 +14,7 @@ compiler and runtime. | `SYCL_CACHE_DISABLE_PERSISTENT (deprecated)` | Any(\*) | Has no effect. | | `SYCL_CACHE_PERSISTENT` | Integer | Controls persistent device compiled code cache. Turns it on if set to '1' and turns it off if set to '0'. When cache is enabled SYCL runtime will try to cache and reuse JIT-compiled binaries. Default is off. | | `SYCL_CACHE_IN_MEM` | '1' or '0' | Enable ('1') or disable ('0') in-memory caching of device compiled code. When cache is enabled SYCL runtime will try to cache and reuse JIT-compiled binaries. Default is '1'. | +| `SYCL_IN_MEM_CACHE_EVICTION_THRESHOLD` | Positive integer | `SYCL_IN_MEM_CACHE_EVICTION_THRESHOLD` accepts an integer that specifies the maximum size of the in-memory program cache in bytes. Eviction is performed when the cache size exceeds the threshold. The default value is 0 which means that eviction is disabled. | | `SYCL_CACHE_EVICTION_DISABLE` | Any(\*) | Switches persistent cache eviction off when the variable is set. | | `SYCL_CACHE_MAX_SIZE` | Positive integer | Persistent cache eviction is triggered once total size of cached images exceeds the value in megabytes (default - 8 192 for 8 GB). Set to 0 to disable size-based cache eviction. | | `SYCL_CACHE_THRESHOLD` | Positive integer | Persistent cache eviction threshold in days (default value is 7 for 1 week). Set to 0 for disabling time-based cache eviction. | diff --git a/sycl/source/detail/config.def b/sycl/source/detail/config.def index 9172df2a1497b..f459a2dffa50d 100644 --- a/sycl/source/detail/config.def +++ b/sycl/source/detail/config.def @@ -27,6 +27,8 @@ CONFIG(SYCL_HOST_UNIFIED_MEMORY, 1, __SYCL_HOST_UNIFIED_MEMORY) // 260 (Windows limit) - 12 (filename) - 84 (cache directory structure) CONFIG(SYCL_CACHE_DIR, 164, __SYCL_CACHE_DIR) CONFIG(SYCL_CACHE_TRACE, 4, __SYCL_CACHE_TRACE) +CONFIG(SYCL_IN_MEM_CACHE_EVICTION_THRESHOLD, 16, + __SYCL_IN_MEM_CACHE_EVICTION_THRESHOLD) CONFIG(SYCL_CACHE_DISABLE_PERSISTENT, 1, __SYCL_CACHE_DISABLE_PERSISTENT) CONFIG(SYCL_CACHE_PERSISTENT, 1, __SYCL_CACHE_PERSISTENT) CONFIG(SYCL_CACHE_EVICTION_DISABLE, 1, __SYCL_CACHE_EVICTION_DISABLE) diff --git a/sycl/source/detail/config.hpp b/sycl/source/detail/config.hpp index 3c1f2f6822807..ace69d0a9420e 100644 --- a/sycl/source/detail/config.hpp +++ b/sycl/source/detail/config.hpp @@ -756,6 +756,56 @@ template <> class SYCLConfig { } }; +// SYCL_IN_MEM_CACHE_EVICTION_THRESHOLD accepts an integer that specifies +// the maximum size of the in-memory Program cache. +// Cache eviction is performed when the cache size exceeds the threshold. +// The thresholds are specified in bytes. +// The default value is "0" which means that eviction is disabled. +template <> class SYCLConfig { + using BaseT = SYCLConfigBase; + +public: + static int get() { return getCachedValue(); } + static void reset() { (void)getCachedValue(true); } + + static int getProgramCacheSize() { return getCachedValue(); } + + static bool isProgramCacheEvictionEnabled() { + return getProgramCacheSize() > 0; + } + +private: + static int getCachedValue(bool ResetCache = false) { + const auto Parser = []() { + const char *ValStr = BaseT::getRawValue(); + + // Disable eviction by default. + if (!ValStr) + return 0; + + int CacheSize = 0; + try { + CacheSize = std::stoi(ValStr); + if (CacheSize < 0) + throw INVALID_CONFIG_EXCEPTION(BaseT, "Value must be non-negative"); + } catch (...) { + std::string Msg = std::string{ + "Invalid input to SYCL_IN_MEM_CACHE_EVICTION_THRESHOLD. Please try " + "a positive integer."}; + throw exception(make_error_code(errc::runtime), Msg); + } + + return CacheSize; + }; + + static auto EvictionThresholds = Parser(); + if (ResetCache) + EvictionThresholds = Parser(); + + return EvictionThresholds; + } +}; + #undef INVALID_CONFIG_EXCEPTION } // namespace detail diff --git a/sycl/source/detail/kernel_program_cache.hpp b/sycl/source/detail/kernel_program_cache.hpp index f58cda059bcce..9f06d0ebcde8d 100644 --- a/sycl/source/detail/kernel_program_cache.hpp +++ b/sycl/source/detail/kernel_program_cache.hpp @@ -21,7 +21,9 @@ #include #include #include +#include #include +#include #include #include #include @@ -36,6 +38,20 @@ namespace sycl { inline namespace _V1 { namespace detail { class context_impl; + +// During SYCL program execution SYCL runtime will create internal objects +// representing kernels and programs, it may also invoke JIT compiler to bring +// kernels in a program to executable state. Those runtime operations are quite +// expensive. To avoid redundant operations and to speed up the execution, SYCL +// runtime employs in-memory cache for kernels and programs. When a kernel is +// invoked multiple times, the runtime will fetch the kernel from the cache +// instead of creating it from scratch. +// By default, there is no upper bound on the cache size. +// When the system runs out of memory, the cache will be cleared. Alternatively, +// the cache size can be limited by setting SYCL_IN_MEM_CACHE_EVICTION_THRESHOLD +// to a positive value. When the cache size exceeds the threshold, the least +// recently used programs, and associated kernels, will be evicted from the +// cache. class KernelProgramCache { public: /// Denotes build error data. The data is filled in from sycl::exception @@ -127,10 +143,51 @@ class KernelProgramCache { using CommonProgramKeyT = std::pair>; + // A custom hashing and equality function for ProgramCacheKeyT. + // These are used to compare and hash the keys in the cache. + struct ProgramCacheKeyHash { + std::size_t operator()(const ProgramCacheKeyT &Key) const { + std::size_t Hash = 0; + // Hash the serialized object, representing spec consts. + for (const auto &Elem : Key.first.first) + Hash ^= std::hash{}(Elem); + + // Hash the imageId. + Hash ^= std::hash{}(Key.first.second); + + // Hash the devices. + for (const auto &Elem : Key.second) + Hash ^= std::hash{}(static_cast(Elem)); + return Hash; + } + }; + + struct ProgramCacheKeyEqual { + bool operator()(const ProgramCacheKeyT &LHS, + const ProgramCacheKeyT &RHS) const { + // Check equality of SerializedObj (Spec const) + return std::equal(LHS.first.first.begin(), LHS.first.first.end(), + RHS.first.first.begin()) && + // Check equality of imageId + LHS.first.second == RHS.first.second && + // Check equality of devices + std::equal(LHS.second.begin(), LHS.second.end(), + RHS.second.begin(), RHS.second.end()); + } + }; + struct ProgramCache { ::boost::unordered_map Cache; ::boost::unordered_multimap KeyMap; + // Mapping between a UR program and its size. + std::unordered_map ProgramSizeMap; + + size_t ProgramCacheSizeInBytes = 0; + inline size_t GetProgramCacheSizeInBytes() const noexcept { + return ProgramCacheSizeInBytes; + } + // Returns number of entries in the cache. size_t size() const noexcept { return Cache.size(); } }; @@ -184,6 +241,62 @@ class KernelProgramCache { using KernelFastCacheT = ::boost::unordered_flat_map; + // DS to hold data and functions related to Program cache eviction. + struct EvictionList { + private: + // Linked list of cache entries to be evicted in case of cache overflow. + std::list MProgramEvictionList; + + // Mapping between program handle and the iterator to the eviction list. + std::unordered_map::iterator, + ProgramCacheKeyHash, ProgramCacheKeyEqual> + MProgramToEvictionListMap; + + public: + std::list &getProgramEvictionList() { + return MProgramEvictionList; + } + + void clear() { + MProgramEvictionList.clear(); + MProgramToEvictionListMap.clear(); + } + + void emplaceBack(const ProgramCacheKeyT &CacheKey) { + MProgramEvictionList.emplace_back(CacheKey); + + // In std::list, the iterators are not invalidated when elements are + // added/removed/moved to the list. So, we can safely store the iterators. + MProgramToEvictionListMap[CacheKey] = + std::prev(MProgramEvictionList.end()); + traceProgram("Program added to the end of eviction list.", CacheKey); + } + + // This function is called on the hot path, whenever a kernel/program + // is accessed. So, it should be very fast. + void moveToEnd(const ProgramCacheKeyT &CacheKey) { + auto It = MProgramToEvictionListMap.find(CacheKey); + if (It != MProgramToEvictionListMap.end()) { + MProgramEvictionList.splice(MProgramEvictionList.end(), + MProgramEvictionList, It->second); + traceProgram("Program moved to the end of eviction list.", CacheKey); + } + // else: This can happen if concurrently the program is removed from + // eviction list by another thread. + } + + bool empty() { return MProgramEvictionList.empty(); } + + size_t size() { return MProgramEvictionList.size(); } + + void popFront() { + if (!MProgramEvictionList.empty()) { + MProgramToEvictionListMap.erase(MProgramEvictionList.front()); + MProgramEvictionList.pop_front(); + } + } + }; + ~KernelProgramCache() = default; void setContextPtr(const ContextPtr &AContext) { MParentContext = AContext; } @@ -197,12 +310,24 @@ class KernelProgramCache { int ImageId = CacheKey.first.second; std::stringstream DeviceList; + std::vector SerializedObjVec = CacheKey.first.first; + + // Convert spec constants to string. Spec constants are stored as + // ASCII values, so we need need to convert them to int and then to + // string. + std::string SerializedObjString; + SerializedObjString.reserve(SerializedObjVec.size() * sizeof(size_t)); + for (unsigned char c : SerializedObjVec) + SerializedObjString += std::to_string((int)c) + ","; + for (const auto &Device : CacheKey.second) DeviceList << "0x" << std::setbase(16) << reinterpret_cast(Device) << ","; std::string Identifier = "[Key:{imageId = " + std::to_string(ImageId) + - ",urDevice = " + DeviceList.str() + "}]: "; + ",urDevice = " + DeviceList.str() + + ", serializedObj = " + SerializedObjString + + "}]: "; std::cerr << "[In-Memory Cache][Thread Id:" << std::this_thread::get_id() << "][Program Cache]" << Identifier << Msg << std::endl; @@ -232,6 +357,10 @@ class KernelProgramCache { return {MKernelsPerProgramCache, MKernelsPerProgramCacheMutex}; } + Locked acquireEvictionList() { + return {MEvictionList, MProgramEvictionListMutex}; + } + std::pair getOrInsertProgram(const ProgramCacheKeyT &CacheKey) { auto LockedCache = acquireCachedPrograms(); @@ -268,8 +397,7 @@ class KernelProgramCache { std::make_pair(CacheKey.first.second, CacheKey.second); ProgCache.KeyMap.emplace(CommonKey, CacheKey); traceProgram("Program inserted.", CacheKey); - } else - traceProgram("Program fetched.", CacheKey); + } return DidInsert; } @@ -300,6 +428,23 @@ class KernelProgramCache { template void saveKernel(KeyT &&CacheKey, ValT &&CacheVal) { + + if (SYCLConfig:: + isProgramCacheEvictionEnabled()) { + + ur_program_handle_t Program = std::get<3>(CacheVal); + // Save kernel in fast cache only if the corresponding program is also + // in the cache. + auto LockedCache = acquireCachedPrograms(); + auto &ProgCache = LockedCache.get(); + if (ProgCache.ProgramSizeMap.find(Program) == + ProgCache.ProgramSizeMap.end()) + return; + + // Save reference between the program and the fast cache key. + MProgramToKernelFastCacheKeyMap[Program].emplace_back(CacheKey); + } + std::unique_lock Lock(MKernelFastCacheMutex); // if no insertion took place, thus some other thread has already inserted // smth in the cache @@ -307,6 +452,167 @@ class KernelProgramCache { MKernelFastCache.emplace(CacheKey, CacheVal); } + // Evict programs from cache to free up space. + void evictPrograms(size_t DesiredCacheSize, size_t CurrentCacheSize) { + + // Figure out how many programs from the beginning we need to evict. + if (CurrentCacheSize < DesiredCacheSize || MCachedPrograms.Cache.empty()) + return; + + // Evict programs from the beginning of the cache. + { + std::lock_guard Lock(MProgramEvictionListMutex); + auto &ProgramEvictionList = MEvictionList.getProgramEvictionList(); + size_t CurrCacheSize = MCachedPrograms.ProgramCacheSizeInBytes; + + // Traverse the eviction list and remove the LRU programs. + // The LRU programs will be at the front of the list. + while (CurrCacheSize > DesiredCacheSize && !MEvictionList.empty()) { + ProgramCacheKeyT CacheKey = ProgramEvictionList.front(); + auto LockedCache = acquireCachedPrograms(); + auto &ProgCache = LockedCache.get(); + auto It = ProgCache.Cache.find(CacheKey); + + if (It != ProgCache.Cache.end()) { + // We are about to remove this program now. + // (1) Remove it from KernelPerProgram cache. + // (2) Remove corresponding entries from KernelFastCache. + // (3) Remove it from ProgramCache KeyMap. + // (4) Remove it from the ProgramCache. + // (5) Remove it from ProgramSizeMap. + // (6) Update the cache size. + + // Remove entry from the KernelsPerProgram cache. + ur_program_handle_t NativePrg = It->second->Val; + { + auto LockedCacheKP = acquireKernelsPerProgramCache(); + // List kernels that are to be removed from the cache, if tracing is + // enabled. + if (SYCLConfig::isTraceInMemCache()) { + for (const auto &Kernel : LockedCacheKP.get()[NativePrg]) + traceKernel("Kernel evicted.", Kernel.first); + } + LockedCacheKP.get().erase(NativePrg); + } + + // Remove corresponding entries from KernelFastCache. + auto FastCacheKeyItr = + MProgramToKernelFastCacheKeyMap.find(NativePrg); + if (FastCacheKeyItr != MProgramToKernelFastCacheKeyMap.end()) { + for (const auto &FastCacheKey : FastCacheKeyItr->second) { + std::unique_lock Lock(MKernelFastCacheMutex); + MKernelFastCache.erase(FastCacheKey); + traceKernel("Kernel evicted.", std::get<2>(FastCacheKey), true); + } + MProgramToKernelFastCacheKeyMap.erase(FastCacheKeyItr); + } + + // Remove entry from ProgramCache KeyMap. + CommonProgramKeyT CommonKey = + std::make_pair(CacheKey.first.second, CacheKey.second); + // Since KeyMap is a multi-map, we need to iterate over all entries + // with this CommonKey and remove those that match the CacheKey. + auto KeyMapItrRange = LockedCache.get().KeyMap.equal_range(CommonKey); + for (auto KeyMapItr = KeyMapItrRange.first; + KeyMapItr != KeyMapItrRange.second; ++KeyMapItr) { + if (KeyMapItr->second == CacheKey) { + LockedCache.get().KeyMap.erase(KeyMapItr); + break; + } + } + + // Get size of the program. + size_t ProgramSize = MCachedPrograms.ProgramSizeMap[It->second->Val]; + // Evict program from the cache. + ProgCache.Cache.erase(It); + // Remove program size from the cache size. + MCachedPrograms.ProgramCacheSizeInBytes -= ProgramSize; + MCachedPrograms.ProgramSizeMap.erase(NativePrg); + + traceProgram("Program evicted.", CacheKey); + } else + // This should never happen. + throw sycl::exception(sycl::make_error_code(sycl::errc::runtime), + "Program not found in the cache."); + + CurrCacheSize = MCachedPrograms.ProgramCacheSizeInBytes; + // Remove the program from the eviction list. + MEvictionList.popFront(); + } + } + } + + // Register that a program has been fetched from the cache. + // If it is the first time the program is fetched, add it to the eviction + // list. + void registerProgramFetch(const ProgramCacheKeyT &CacheKey, + const ur_program_handle_t &Program, + const bool IsBuilt) { + + size_t ProgramCacheEvictionThreshold = + SYCLConfig::getProgramCacheSize(); + + // No need to populate the eviction list if eviction is disabled. + if (ProgramCacheEvictionThreshold == 0) + return; + + // If the program is not in the cache, add it to the cache. + if (IsBuilt) { + // This is the first time we are adding this entry. Add it to the end of + // eviction list. + { + std::lock_guard Lock(MProgramEvictionListMutex); + MEvictionList.emplaceBack(CacheKey); + } + + // Store size of the program and check if we need to evict some entries. + // Get Size of the program. + size_t ProgramSize = 0; + auto Adapter = getAdapter(); + + try { + // Get number of devices this program was built for. + unsigned int DeviceNum = 0; + Adapter->call( + Program, UR_PROGRAM_INFO_NUM_DEVICES, sizeof(DeviceNum), &DeviceNum, + nullptr); + + // Get binary sizes for each device. + std::vector BinarySizes(DeviceNum); + Adapter->call( + Program, UR_PROGRAM_INFO_BINARY_SIZES, + sizeof(size_t) * BinarySizes.size(), BinarySizes.data(), nullptr); + + // Sum up binary sizes. + ProgramSize = + std::accumulate(BinarySizes.begin(), BinarySizes.end(), 0); + } catch (const exception &Ex) { + std::cerr << "Failed to get program size: " << Ex.what() << std::endl; + std::rethrow_exception(std::current_exception()); + } + // Store program size in the cache. + size_t CurrCacheSize = 0; + { + std::lock_guard Lock(MProgramCacheMutex); + MCachedPrograms.ProgramSizeMap[Program] = ProgramSize; + MCachedPrograms.ProgramCacheSizeInBytes += ProgramSize; + CurrCacheSize = MCachedPrograms.ProgramCacheSizeInBytes; + } + + // Evict programs if the cache size exceeds the threshold. + if (CurrCacheSize > ProgramCacheEvictionThreshold) + evictPrograms(ProgramCacheEvictionThreshold, CurrCacheSize); + } + // If the program is already in the cache, move it to the end of the list. + // Since we are following LRU eviction policy, we need to move the program + // to the end of the list. Items in the front of the list are the least + // recently This code path is "hot" and should be very fast. + else { + std::lock_guard Lock(MProgramEvictionListMutex); + MEvictionList.moveToEnd(CacheKey); + } + } + /// Clears cache state. /// /// This member function should only be used in unit tests. @@ -317,6 +623,11 @@ class KernelProgramCache { MCachedPrograms = ProgramCache{}; MKernelsPerProgramCache = KernelCacheT{}; MKernelFastCache = KernelFastCacheT{}; + MProgramToKernelFastCacheKeyMap.clear(); + + // Clear the eviction lists and its mutexes. + std::lock_guard EvictionListLock(MProgramEvictionListMutex); + MEvictionList.clear(); } /// Try to fetch entity (kernel or program) from cache. If there is no such @@ -341,8 +652,10 @@ class KernelProgramCache { /// /// \return a pointer to cached build result, return value must not be /// nullptr. - template - auto getOrBuild(GetCachedBuildFT &&GetCachedBuild, BuildFT &&Build) { + template + auto getOrBuild(GetCachedBuildFT &&GetCachedBuild, BuildFT &&Build, + EvictFT &&EvictFunc = nullptr) { using BuildState = KernelProgramCache::BuildState; constexpr size_t MaxAttempts = 2; for (size_t AttemptCounter = 0;; ++AttemptCounter) { @@ -356,8 +669,11 @@ class KernelProgramCache { BuildState NewState = BuildResult->waitUntilTransition(); // Build succeeded. - if (NewState == BuildState::BS_Done) + if (NewState == BuildState::BS_Done) { + if constexpr (!std::is_same_v) + EvictFunc(BuildResult->Val, /*IsBuilt=*/false); return BuildResult; + } // Build failed, or this is the last attempt. if (NewState == BuildState::BS_Failed || @@ -381,6 +697,9 @@ class KernelProgramCache { try { BuildResult->Val = Build(); + if constexpr (!std::is_same_v) + EvictFunc(BuildResult->Val, /*IsBuilt=*/true); + BuildResult->updateAndNotify(BuildState::BS_Done); return BuildResult; } catch (const exception &Ex) { @@ -414,6 +733,16 @@ class KernelProgramCache { std::mutex MKernelFastCacheMutex; KernelFastCacheT MKernelFastCache; + + // Map between fast kernel cache keys and program handle. + // MKernelFastCacheMutex will be used for synchronization. + std::unordered_map> + MProgramToKernelFastCacheKeyMap; + + EvictionList MEvictionList; + // Mutexes that will be used when accessing the eviction lists. + std::mutex MProgramEvictionListMutex; + friend class ::MockKernelProgramCache; const AdapterPtr &getAdapter(); diff --git a/sycl/source/detail/program_manager/program_manager.cpp b/sycl/source/detail/program_manager/program_manager.cpp index 8f13c0745ad21..dfc5d019051a9 100644 --- a/sycl/source/detail/program_manager/program_manager.cpp +++ b/sycl/source/detail/program_manager/program_manager.cpp @@ -925,7 +925,13 @@ ur_program_handle_t ProgramManager::getBuiltURProgram( if (!SYCLConfig::get()) return BuildF(); - auto BuildResult = Cache.getOrBuild(GetCachedBuildF, BuildF); + auto EvictFunc = [&Cache, &CacheKey](ur_program_handle_t Program, + bool isBuilt) { + return Cache.registerProgramFetch(CacheKey, Program, isBuilt); + }; + + auto BuildResult = + Cache.getOrBuild(GetCachedBuildF, BuildF, EvictFunc); // getOrBuild is not supposed to return nullptr assert(BuildResult != nullptr && "Invalid build result"); @@ -939,10 +945,12 @@ ur_program_handle_t ProgramManager::getBuiltURProgram( // update it here and re-use that lambda. CacheKey.first.second = BImg->getImageID(); bool DidInsert = Cache.insertBuiltProgram(CacheKey, ResProgram); - if (DidInsert) { + + // Add to the eviction list. + Cache.registerProgramFetch(CacheKey, ResProgram, DidInsert); + if (DidInsert) // For every cached copy of the program, we need to increment its refcount Adapter->call(ResProgram); - } } // If caching is enabled, one copy of the program handle will be @@ -2699,7 +2707,13 @@ device_image_plain ProgramManager::build(const device_image_plain &DeviceImage, return Cache.getOrInsertProgram(CacheKey); }; - auto BuildResult = Cache.getOrBuild(GetCachedBuildF, BuildF); + auto EvictFunc = [&Cache, &CacheKey](ur_program_handle_t Program, + bool isBuilt) { + return Cache.registerProgramFetch(CacheKey, Program, isBuilt); + }; + + auto BuildResult = + Cache.getOrBuild(GetCachedBuildF, BuildF, EvictFunc); // getOrBuild is not supposed to return nullptr assert(BuildResult != nullptr && "Invalid build result"); @@ -2728,7 +2742,7 @@ device_image_plain ProgramManager::build(const device_image_plain &DeviceImage, } // Change device in the cache key to reduce copying of spec const data. CacheKey.second = Subset; - Cache.getOrBuild(GetCachedBuildF, CacheSubsets); + Cache.getOrBuild(GetCachedBuildF, CacheSubsets, EvictFunc); // getOrBuild is not supposed to return nullptr assert(BuildResult != nullptr && "Invalid build result"); } diff --git a/sycl/unittests/assert/assert.cpp b/sycl/unittests/assert/assert.cpp index b45996238358f..e11184d3a24d2 100644 --- a/sycl/unittests/assert/assert.cpp +++ b/sycl/unittests/assert/assert.cpp @@ -319,6 +319,18 @@ static ur_result_t redefinedProgramGetInfo(void *pParams) { return UR_RESULT_SUCCESS; } + // Required if program cache eviction is enabled. + if (UR_PROGRAM_INFO_BINARY_SIZES == *params.ppropName) { + size_t BinarySize = 1; + + if (*params.ppPropValue) + memcpy(*params.ppPropValue, &BinarySize, sizeof(size_t)); + if (*params.ppPropSizeRet) + **params.ppPropSizeRet = sizeof(size_t); + + return UR_RESULT_SUCCESS; + } + return UR_RESULT_ERROR_UNKNOWN; } diff --git a/sycl/unittests/config/ConfigTests.cpp b/sycl/unittests/config/ConfigTests.cpp index 3022ccbd52e65..756a340c8f82d 100644 --- a/sycl/unittests/config/ConfigTests.cpp +++ b/sycl/unittests/config/ConfigTests.cpp @@ -324,3 +324,66 @@ TEST(ConfigTests, CheckSyclCacheTraceTest) { sycl::detail::SYCLConfig::reset(); TestConfig(0, 0, 0, 0); } + +// SYCL_IN_MEM_CACHE_EVICTION_THRESHOLD accepts an integer that specifies +// the maximum size of the in-memory Program cache. +// Cache eviction is performed when the cache size exceeds the threshold. +// The thresholds are specified in bytes. +// The default value is "0" which means that eviction is disabled. +TEST(ConfigTests, CheckSyclCacheEvictionThresholdTest) { + + using InMemEvicType = + sycl::detail::SYCLConfig; + + // Lambda to test parsing of SYCL_IN_MEM_CACHE_EVICTION_THRESHOLD. + auto TestConfig = [](int expectedProgramCacheSize) { + EXPECT_EQ(expectedProgramCacheSize, InMemEvicType::getProgramCacheSize()); + EXPECT_EQ(expectedProgramCacheSize > 0, + InMemEvicType::isProgramCacheEvictionEnabled()); + }; + + // Lambda to set SYCL_IN_MEM_CACHE_EVICTION_THRESHOLD. + auto SetSyclInMemCacheEvictionThresholdEnv = [](const char *value) { +#ifdef _WIN32 + _putenv_s("SYCL_IN_MEM_CACHE_EVICTION_THRESHOLD", value); +#else + setenv("SYCL_IN_MEM_CACHE_EVICTION_THRESHOLD", value, 1); +#endif + }; + + // Lambda to test invalid inputs. An exception should be thrown + // when parsing invalid values. + auto TestInvalidValues = [&](const char *value, const char *errMsg) { + SetSyclInMemCacheEvictionThresholdEnv(value); + try { + InMemEvicType::reset(); + TestConfig(0); + FAIL() << errMsg; + } catch (...) { + } + }; + + // Test eviction threshold with zero. + SetSyclInMemCacheEvictionThresholdEnv("0"); + sycl::detail::readConfig(true); + TestConfig(0); + + // Test invalid values. + TestInvalidValues("-1", "Should throw exception for negative value"); + TestInvalidValues("a", "Should throw exception for non-integer value"); + + // Test valid values. + SetSyclInMemCacheEvictionThresholdEnv("1024"); + InMemEvicType::reset(); + TestConfig(1024); + + // When SYCL_IN_MEM_CACHE_EVICTION_THRESHOLD is not set, it should default to + // 0:0:0. +#ifdef _WIN32 + _putenv_s("SYCL_IN_MEM_CACHE_EVICTION_THRESHOLD", ""); +#else + unsetenv("SYCL_IN_MEM_CACHE_EVICTION_THRESHOLD"); +#endif + InMemEvicType::reset(); + TestConfig(0); +} diff --git a/sycl/unittests/kernel-and-program/CMakeLists.txt b/sycl/unittests/kernel-and-program/CMakeLists.txt index 8736f6f60a76a..0d06d2fc29aa0 100644 --- a/sycl/unittests/kernel-and-program/CMakeLists.txt +++ b/sycl/unittests/kernel-and-program/CMakeLists.txt @@ -7,5 +7,6 @@ add_sycl_unittest(KernelAndProgramTests OBJECT PersistentDeviceCodeCache.cpp KernelBuildOptions.cpp OutOfResources.cpp + InMemCacheEviction.cpp ) target_compile_definitions(KernelAndProgramTests PRIVATE -D__SYCL_INTERNAL_API) diff --git a/sycl/unittests/kernel-and-program/InMemCacheEviction.cpp b/sycl/unittests/kernel-and-program/InMemCacheEviction.cpp new file mode 100644 index 0000000000000..70c121053cee9 --- /dev/null +++ b/sycl/unittests/kernel-and-program/InMemCacheEviction.cpp @@ -0,0 +1,225 @@ +//==----- InMemCacheEviction.cpp --- In-memory cache eviction tests -------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// This file contains tests covering eviction in in-memory program cache. + +#define SYCL2020_DISABLE_DEPRECATION_WARNINGS + +#include "../thread_safety/ThreadUtils.h" +#include "detail/context_impl.hpp" +#include "detail/kernel_program_cache.hpp" +#include +#include +#include +#include + +#include + +#include + +using namespace sycl; + +class Kernel1; +class Kernel2; +class Kernel3; + +MOCK_INTEGRATION_HEADER(Kernel1) +MOCK_INTEGRATION_HEADER(Kernel2) +MOCK_INTEGRATION_HEADER(Kernel3) + +static sycl::unittest::MockDeviceImage Img[] = { + sycl::unittest::generateDefaultImage({"Kernel1"}), + sycl::unittest::generateDefaultImage({"Kernel2"}), + sycl::unittest::generateDefaultImage({"Kernel3"})}; + +static sycl::unittest::MockDeviceImageArray<3> ImgArray{Img}; + +// Number of times urProgramCreateWithIL is called. This is used to check +// if the program is created or fetched from the cache. +static int NumProgramBuild = 0; + +constexpr int ProgramSize = 10000; + +static ur_result_t redefinedProgramCreateWithIL(void *) { + ++NumProgramBuild; + return UR_RESULT_SUCCESS; +} + +static ur_result_t redefinedProgramGetInfoAfter(void *pParams) { + auto params = *static_cast(pParams); + if (*params.ppropName == UR_PROGRAM_INFO_NUM_DEVICES) { + auto value = reinterpret_cast(*params.ppPropValue); + *value = 1; + } + + if (*params.ppropName == UR_PROGRAM_INFO_BINARY_SIZES) { + auto value = reinterpret_cast(*params.ppPropValue); + value[0] = ProgramSize; + } + + if (*params.ppropName == UR_PROGRAM_INFO_BINARIES) { + auto value = reinterpret_cast(*params.ppPropValue); + value[0] = 0; + } + + return UR_RESULT_SUCCESS; +} + +// Function to set SYCL_IN_MEM_CACHE_EVICTION_THRESHOLD. +static void setCacheEvictionEnv(const char *value) { +#ifdef _WIN32 + _putenv_s("SYCL_IN_MEM_CACHE_EVICTION_THRESHOLD", value); +#else + if (value) + setenv("SYCL_IN_MEM_CACHE_EVICTION_THRESHOLD", value, 1); + else + (void)unsetenv("SYCL_IN_MEM_CACHE_EVICTION_THRESHOLD"); +#endif + + sycl::detail::readConfig(true); + sycl::detail::SYCLConfig< + sycl::detail::SYCL_IN_MEM_CACHE_EVICTION_THRESHOLD>::reset(); +} + +// Function to check number of entries in the cache and eviction list. +static inline void +CheckNumberOfEntriesInCacheAndEvictionList(detail::context_impl &CtxImpl, + size_t ExpectedNumEntries) { + auto &KPCache = CtxImpl.getKernelProgramCache(); + EXPECT_EQ(KPCache.acquireCachedPrograms().get().size(), ExpectedNumEntries) + << "Unexpected number of entries in the cache"; + auto EvcList = KPCache.acquireEvictionList(); + EXPECT_EQ(EvcList.get().size(), ExpectedNumEntries) + << "Unexpected number of entries in the eviction list"; +} + +class InMemCacheEvictionTests : public ::testing::Test { +protected: + void TearDown() override { setCacheEvictionEnv(""); } +}; + +TEST(InMemCacheEvictionTests, TestBasicEvictionAndLRU) { + NumProgramBuild = 0; + sycl::unittest::UrMock<> Mock; + mock::getCallbacks().set_before_callback("urProgramCreateWithIL", + &redefinedProgramCreateWithIL); + mock::getCallbacks().set_after_callback("urProgramGetInfo", + &redefinedProgramGetInfoAfter); + + sycl::platform Plt{sycl::platform()}; + sycl::context Ctx{Plt}; + auto CtxImpl = detail::getSyclObjImpl(Ctx); + queue q(Ctx, default_selector_v); + + // One program is of 10000 bytes, so 20005 eviction threshold can + // accommodate two programs. + setCacheEvictionEnv("20005"); + + // Cache is empty, so one urProgramCreateWithIL call. + q.single_task([] {}); + EXPECT_EQ(NumProgramBuild, 1); + CheckNumberOfEntriesInCacheAndEvictionList(*CtxImpl, 1); + + q.single_task([] {}); + EXPECT_EQ(NumProgramBuild, 2); + CheckNumberOfEntriesInCacheAndEvictionList(*CtxImpl, 2); + + // Move first program to end of eviction list. + q.single_task([] {}); + EXPECT_EQ(NumProgramBuild, 2); + + // Calling Kernel3, Kernel2, and Kernel1 in a cyclic manner to + // verify LRU's working. + + // Kernel2's program should have been evicted. + q.single_task([] {}); + EXPECT_EQ(NumProgramBuild, 3); + CheckNumberOfEntriesInCacheAndEvictionList(*CtxImpl, 2); + + // Calling Kernel2 again should trigger urProgramCreateWithIL and + // should evict Kernel1's program. + q.single_task([] {}); + EXPECT_EQ(NumProgramBuild, 3); + CheckNumberOfEntriesInCacheAndEvictionList(*CtxImpl, 2); + + // Calling Kernel1 again should trigger urProgramCreateWithIL and + // should evict Kernel3's program. + q.single_task([] {}); + EXPECT_EQ(NumProgramBuild, 4); + CheckNumberOfEntriesInCacheAndEvictionList(*CtxImpl, 2); +} + +// Test to verify eviction using concurrent kernel invocation. +TEST(InMemCacheEvictionTests, TestConcurrentEvictionSameQueue) { + NumProgramBuild = 0; + sycl::unittest::UrMock<> Mock; + mock::getCallbacks().set_before_callback("urProgramCreateWithIL", + &redefinedProgramCreateWithIL); + mock::getCallbacks().set_after_callback("urProgramGetInfo", + &redefinedProgramGetInfoAfter); + + sycl::platform Plt{sycl::platform()}; + context Ctx{Plt}; + auto CtxImpl = detail::getSyclObjImpl(Ctx); + queue q(Ctx, default_selector_v); + + // One program is of 10000 bytes, so 20005 eviction threshold can + // accommodate two programs. + setCacheEvictionEnv("20005"); + + constexpr size_t ThreadCount = 200; + Barrier barrier(ThreadCount); + { + auto ConcurrentInvokeKernels = [&](std::size_t threadId) { + barrier.wait(); + q.single_task([] {}); + q.single_task([] {}); + q.single_task([] {}); + }; + + ThreadPool MPool(ThreadCount, ConcurrentInvokeKernels); + } + q.wait_and_throw(); + + CheckNumberOfEntriesInCacheAndEvictionList(*CtxImpl, 2); +} + +// Test to verify eviction using concurrent kernel invocation when +// cache size is very less so as to trigger immediate eviction. +TEST(InMemCacheEvictionTests, TestConcurrentEvictionSmallCache) { + NumProgramBuild = 0; + sycl::unittest::UrMock<> Mock; + mock::getCallbacks().set_before_callback("urProgramCreateWithIL", + &redefinedProgramCreateWithIL); + mock::getCallbacks().set_after_callback("urProgramGetInfo", + &redefinedProgramGetInfoAfter); + + context Ctx{platform()}; + auto CtxImpl = detail::getSyclObjImpl(Ctx); + queue q(Ctx, default_selector_v); + + // One program is of 10000 bytes, so 100 eviction threshold will + // trigger immediate eviction. + setCacheEvictionEnv("100"); + + // Fetch the same kernel concurrently from multiple threads. + // This should cause some threads to insert a program and other + // threads to evict the same program. + constexpr size_t ThreadCount = 300; + Barrier barrier(ThreadCount); + { + auto ConcurrentInvokeKernels = [&](std::size_t threadId) { + barrier.wait(); + q.single_task([] {}); + }; + + ThreadPool MPool(ThreadCount, ConcurrentInvokeKernels); + } + q.wait_and_throw(); + + CheckNumberOfEntriesInCacheAndEvictionList(*CtxImpl, 0); +} From 023cb2b4ca9f7272388d9968ee89f4ca360a0c19 Mon Sep 17 00:00:00 2001 From: Pietro Ghiglio Date: Wed, 20 Nov 2024 10:37:45 +0100 Subject: [PATCH 4/4] [SYCL][NATIVECPU] Fix header inclusion in shuffle_abi test (#16124) Fixes the headers included in `sycl/test/check_device_code/native_cpu/shuffle_abi.cpp`, using just `sycl/sycl.hpp`. --- sycl/test/check_device_code/native_cpu/shuffle_abi.cpp | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/sycl/test/check_device_code/native_cpu/shuffle_abi.cpp b/sycl/test/check_device_code/native_cpu/shuffle_abi.cpp index cdbaab90ce65c..8a94745f08100 100644 --- a/sycl/test/check_device_code/native_cpu/shuffle_abi.cpp +++ b/sycl/test/check_device_code/native_cpu/shuffle_abi.cpp @@ -13,9 +13,7 @@ // Tests that sub-group shuffles work even when abi is different to what is // expected -#include -#include -#include +#include static constexpr size_t NumElems = VEC_WIDTH; static constexpr size_t NumWorkItems = 64;