From 8915b7afcfe1ba9836d571d81341dd1839d1e671 Mon Sep 17 00:00:00 2001 From: Hugh Delaney Date: Tue, 13 Feb 2024 17:01:01 +0000 Subject: [PATCH 1/8] Add force range rounding option and introduce new compiler flag This commit adds a new preference for range rounding, force, such that if the compile flag is used, only the range rounded parallel_for kernel will be generated. This can make binaries smaller as there is no duplication of SYCL range kernels across range rounded and unrounded versions. I have also added the flag: -fsycl-range-rounding, which can have values: on, force or disable. This flag aims to supercede the fsycl-disable-range-rounding flag. I have also added to existing tests to check for the functionality of the new flag and refactored the range rounding sycl-e2e test. --- clang/include/clang/Basic/LangOptions.def | 3 +- clang/include/clang/Basic/LangOptions.h | 6 + clang/include/clang/Driver/Options.td | 15 +- clang/lib/Driver/Driver.cpp | 4 + clang/lib/Driver/ToolChains/Clang.cpp | 2 +- clang/lib/Frontend/InitPreprocessor.cpp | 7 +- clang/lib/Sema/SemaSYCL.cpp | 10 +- .../integration_header_ppmacros.cpp | 10 +- clang/test/Driver/sycl-offload-intelfpga.cpp | 14 +- clang/test/Driver/sycl-offload.c | 6 +- clang/test/Preprocessor/predefined-macros.c | 54 +++- sycl/include/sycl/handler.hpp | 13 +- .../Basic/parallel_for_range_roundup.cpp | 277 ++++++++++-------- 13 files changed, 265 insertions(+), 156 deletions(-) diff --git a/clang/include/clang/Basic/LangOptions.def b/clang/include/clang/Basic/LangOptions.def index 526498d1675e3..431dc20038941 100644 --- a/clang/include/clang/Basic/LangOptions.def +++ b/clang/include/clang/Basic/LangOptions.def @@ -298,7 +298,8 @@ LANGOPT( "SYCL compiler assumes value fits within MAX_INT for member function of " "get/operator[], get_id/operator[] and get_global_id/get_global_linear_id " "in SYCL class id, iterm and nd_iterm") -LANGOPT(SYCLDisableRangeRounding, 1, 0, "Disable parallel for range rounding") +ENUM_LANGOPT(SYCLRangeRounding, SYCLRangeRoundingPreference, 2, SYCLRangeRoundingPreference::On, + "Preference for SYCL parallel_for range rounding") LANGOPT(SYCLEnableIntHeaderDiags, 1, 0, "Enable diagnostics that require the " "SYCL integration header") LANGOPT(SYCLAllowVirtualFunctions, 1, 0, diff --git a/clang/include/clang/Basic/LangOptions.h b/clang/include/clang/Basic/LangOptions.h index 2c508c32674c3..96c58a308168a 100644 --- a/clang/include/clang/Basic/LangOptions.h +++ b/clang/include/clang/Basic/LangOptions.h @@ -151,6 +151,12 @@ class LangOptionsBase { undefined }; + enum class SYCLRangeRoundingPreference { + On = 0, + Disable = 1, + Force = 2, + }; + enum HLSLLangStd { HLSL_Unset = 0, HLSL_2015 = 2015, diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td index 0eb5701a3a455..ebef5188982bb 100644 --- a/clang/include/clang/Driver/Options.td +++ b/clang/include/clang/Driver/Options.td @@ -8256,9 +8256,20 @@ defm sycl_allow_func_ptr: BoolFOption<"sycl-allow-func-ptr", def fenable_sycl_dae : Flag<["-"], "fenable-sycl-dae">, HelpText<"Enable Dead Argument Elimination in SPIR kernels">, MarshallingInfoFlag>; +def fsycl_range_rounding_EQ : Joined<["-"], "fsycl-range-rounding=">, + Visibility<[ClangOption, CLOption, DXCOption]>, + Values<"on,disable,force">, + NormalizedValuesScope<"LangOptions::SYCLRangeRoundingPreference">, + NormalizedValues<["On", "Disable", "Force"]>, + MarshallingInfoEnum, "On">, + HelpText<"Options for range rounding of SYCL range kernels: " + "disable (do not generate range rounded kernels) " + "force (only generate range rounded kernels) " + "on (generate range rounded kernels as well as unrounded kernels). Default is 'on'">; def fsycl_disable_range_rounding : Flag<["-"], "fsycl-disable-range-rounding">, - HelpText<"Disable parallel for range rounding.">, - MarshallingInfoFlag>; + Alias, AliasArgs<["disable"]>, + HelpText<"Deprecated: please use -fsycl-range-rounding=disable instead.">, + Flags<[Deprecated]>; def fsycl_enable_int_header_diags: Flag<["-"], "fsycl-enable-int-header-diags">, HelpText<"Enable diagnostics that require the SYCL integration header.">, MarshallingInfoFlag>; diff --git a/clang/lib/Driver/Driver.cpp b/clang/lib/Driver/Driver.cpp index eb9c374741d81..13dbf17fad334 100644 --- a/clang/lib/Driver/Driver.cpp +++ b/clang/lib/Driver/Driver.cpp @@ -1158,6 +1158,10 @@ void Driver::CreateOffloadingDeviceToolChains(Compilation &C, checkSingleArgValidity(DeviceCodeSplit, {"per_kernel", "per_source", "auto", "off"}); + Arg *RangeRoundingPreference = + C.getInputArgs().getLastArg(options::OPT_fsycl_range_rounding_EQ); + checkSingleArgValidity(RangeRoundingPreference, {"disable", "force", "on"}); + Arg *SYCLForceTarget = getArgRequiringSYCLRuntime(options::OPT_fsycl_force_target_EQ); if (SYCLForceTarget) { diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp index 9b94b761a0897..2fcd80364bf7e 100644 --- a/clang/lib/Driver/ToolChains/Clang.cpp +++ b/clang/lib/Driver/ToolChains/Clang.cpp @@ -5454,7 +5454,7 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA, DisableRangeRounding = true; } if (DisableRangeRounding || HasFPGA) - CmdArgs.push_back("-fsycl-disable-range-rounding"); + CmdArgs.push_back("-fsycl-range-rounding=disable"); if (HasFPGA) { // Pass -fintelfpga to both the host and device SYCL compilations if set. diff --git a/clang/lib/Frontend/InitPreprocessor.cpp b/clang/lib/Frontend/InitPreprocessor.cpp index 6ea2be70b6d3d..073660d86c0a6 100644 --- a/clang/lib/Frontend/InitPreprocessor.cpp +++ b/clang/lib/Frontend/InitPreprocessor.cpp @@ -579,8 +579,13 @@ static void InitializeStandardPredefinedMacros(const TargetInfo &TI, // Set __SYCL_DISABLE_PARALLEL_FOR_RANGE_ROUNDING__ macro for // both host and device compilations if -fsycl-disable-range-rounding // flag is used. - if (LangOpts.SYCLDisableRangeRounding) + if (LangOpts.getSYCLRangeRounding() == + LangOptions::SYCLRangeRoundingPreference::Disable) Builder.defineMacro("__SYCL_DISABLE_PARALLEL_FOR_RANGE_ROUNDING__"); + + if (LangOpts.getSYCLRangeRounding() == + LangOptions::SYCLRangeRoundingPreference::Force) + Builder.defineMacro("__SYCL_FORCE_PARALLEL_FOR_RANGE_ROUNDING__"); } if (LangOpts.DeclareSPIRVBuiltins) { diff --git a/clang/lib/Sema/SemaSYCL.cpp b/clang/lib/Sema/SemaSYCL.cpp index 89705ca50ab83..a9b3f8698f668 100644 --- a/clang/lib/Sema/SemaSYCL.cpp +++ b/clang/lib/Sema/SemaSYCL.cpp @@ -5172,12 +5172,20 @@ void SYCLIntegrationHeader::emit(raw_ostream &O) { O << "#endif //" << Macro.first << "\n\n"; } - if (S.getLangOpts().SYCLDisableRangeRounding) { + if (S.getLangOpts().getSYCLRangeRounding() == + LangOptions::SYCLRangeRoundingPreference::Disable) { O << "#ifndef __SYCL_DISABLE_PARALLEL_FOR_RANGE_ROUNDING__ \n"; O << "#define __SYCL_DISABLE_PARALLEL_FOR_RANGE_ROUNDING__ 1\n"; O << "#endif //__SYCL_DISABLE_PARALLEL_FOR_RANGE_ROUNDING__\n\n"; } + if (S.getLangOpts().getSYCLRangeRounding() == + LangOptions::SYCLRangeRoundingPreference::Force) { + O << "#ifndef __SYCL_FORCE_PARALLEL_FOR_RANGE_ROUNDING__ \n"; + O << "#define __SYCL_FORCE_PARALLEL_FOR_RANGE_ROUNDING__ 1\n"; + O << "#endif //__SYCL_FORCE_PARALLEL_FOR_RANGE_ROUNDING__\n\n"; + } + if (SpecConsts.size() > 0) { O << "// Forward declarations of templated spec constant types:\n"; for (const auto &SC : SpecConsts) diff --git a/clang/test/CodeGenSYCL/integration_header_ppmacros.cpp b/clang/test/CodeGenSYCL/integration_header_ppmacros.cpp index 0cd39fd53fee2..752189ca53847 100644 --- a/clang/test/CodeGenSYCL/integration_header_ppmacros.cpp +++ b/clang/test/CodeGenSYCL/integration_header_ppmacros.cpp @@ -2,8 +2,10 @@ // RUN: FileCheck -input-file=%t.h %s --check-prefix=CHECK-SYCL2020 // RUN: %clang_cc1 -fsycl-is-device -triple spir64-unknown-unknown -sycl-std=2017 -fsycl-int-header=%t.h %s // RUN: FileCheck -input-file=%t.h %s --check-prefix=CHECK-SYCL2017 -// RUN: %clang_cc1 -fsycl-is-device -triple spir64-unknown-unknown -fsycl-disable-range-rounding -fsycl-int-header=%t.h %s +// RUN: %clang_cc1 -fsycl-is-device -triple spir64-unknown-unknown -fsycl-range-rounding=disable -fsycl-int-header=%t.h %s // RUN: FileCheck -input-file=%t.h %s --check-prefix=CHECK-RANGE +// RUN: %clang_cc1 -fsycl-is-device -triple spir64-unknown-unknown -fsycl-range-rounding=force -fsycl-int-header=%t.h %s +// RUN: FileCheck -input-file=%t.h %s --check-prefix=CHECK-FORCE-RANGE // RUN: %clang_cc1 -fsycl-is-device -triple spir64-unknown-unknown -fsycl-int-header=%t.h %s // RUN: FileCheck -input-file=%t.h %s --check-prefix=CHECK-NO-RANGE @@ -33,4 +35,10 @@ int main() { // CHECK-RANGE: #ifndef __SYCL_DISABLE_PARALLEL_FOR_RANGE_ROUNDING__ // CHECK-RANGE-NEXT: #define __SYCL_DISABLE_PARALLEL_FOR_RANGE_ROUNDING__ 1 // CHECK-RANGE-NEXT: #endif //__SYCL_DISABLE_PARALLEL_FOR_RANGE_ROUNDING__ + +// CHECK-FORCE-RANGE: #ifndef __SYCL_FORCE_PARALLEL_FOR_RANGE_ROUNDING__ +// CHECK-FORCE-RANGE-NEXT: #define __SYCL_FORCE_PARALLEL_FOR_RANGE_ROUNDING__ 1 +// CHECK-FORCE-RANGE-NEXT: #endif //__SYCL_FORCE_PARALLEL_FOR_RANGE_ROUNDING__ + // CHECK-NO-RANGE-NOT: #define __SYCL_DISABLE_PARALLEL_FOR_RANGE_ROUNDING__ 1 +// CHECK-NO-RANGE-NOT: #define __SYCL_FORCE_PARALLEL_FOR_RANGE_ROUNDING__ 1 diff --git a/clang/test/Driver/sycl-offload-intelfpga.cpp b/clang/test/Driver/sycl-offload-intelfpga.cpp index 92fddf11cc877..f86f2cc7e4f7e 100644 --- a/clang/test/Driver/sycl-offload-intelfpga.cpp +++ b/clang/test/Driver/sycl-offload-intelfpga.cpp @@ -26,13 +26,13 @@ // CHK-HOST-DEVICE: clang{{.*}} "-cc1"{{.*}} "-fsycl-is-device"{{.*}} "-fintelfpga" // CHK-HOST-DEVICE: clang{{.*}} "-cc1"{{.*}} "-fintelfpga"{{.*}} "-fsycl-is-host" -/// FPGA target implies -fsycl-disable-range-rounding +/// FPGA target implies -fsycl-range-rounding=disable // RUN: %clangxx -### -target x86_64-unknown-linux-gnu -fintelfpga %s 2>&1 \ // RUN: | FileCheck -check-prefix=CHK-RANGE-ROUNDING %s // RUN: %clangxx -### -target x86_64-unknown-linux-gnu -fsycl -fsycl-targets=spir64_fpga-unknown-unknown %s 2>&1 \ // RUN: | FileCheck -check-prefix=CHK-RANGE-ROUNDING %s -// CHK-RANGE-ROUNDING: clang{{.*}} "-fsycl-is-device"{{.*}} "-fsycl-disable-range-rounding" -// CHK-RANGE-ROUNDING: clang{{.*}} "-fsycl-disable-range-rounding"{{.*}} "-fsycl-is-host" +// CHK-RANGE-ROUNDING: clang{{.*}} "-fsycl-is-device"{{.*}} "-fsycl-range-rounding=disable" +// CHK-RANGE-ROUNDING: clang{{.*}} "-fsycl-range-rounding=disable"{{.*}} "-fsycl-is-host" /// FPGA target implies -emit-only-kernels-as-entry-points in sycl-post-link // RUN: %clangxx -### -target x86_64-unknown-linux-gnu -fintelfpga %s 2>&1 \ @@ -41,12 +41,12 @@ // RUN: | FileCheck -check-prefix=CHK-NON-KERNEL-ENTRY-POINTS %s // CHK-NON-KERNEL-ENTRY-POINTS: sycl-post-link{{.*}} "-emit-only-kernels-as-entry-points" -/// -fsycl-disable-range-rounding is applied to all compilations if fpga is used +/// -fsycl-range-rounding=disable is applied to all compilations if fpga is used // RUN: %clangxx -### -target x86_64-unknown-linux-gnu -fsycl -fsycl-targets=spir64_fpga-unknown-unknown,spir64_gen-unknown-unknown %s 2>&1 \ // RUN: | FileCheck -check-prefix=CHK-RANGE-ROUNDING-MULTI %s -// CHK-RANGE-ROUNDING-MULTI: clang{{.*}} "-triple" "spir64_gen-unknown-unknown"{{.*}} "-fsycl-is-device"{{.*}} "-fsycl-disable-range-rounding" -// CHK-RANGE-ROUNDING-MULTI: clang{{.*}} "-fsycl-disable-range-rounding"{{.*}} "-fsycl-is-host" -// CHK-RANGE-ROUNDING-MULTI: clang{{.*}} "-triple" "spir64_fpga-unknown-unknown"{{.*}} "-fsycl-is-device"{{.*}} "-fsycl-disable-range-rounding" +// CHK-RANGE-ROUNDING-MULTI: clang{{.*}} "-triple" "spir64_gen-unknown-unknown"{{.*}} "-fsycl-is-device"{{.*}} "-fsycl-range-rounding=disable" +// CHK-RANGE-ROUNDING-MULTI: clang{{.*}} "-fsycl-range-rounding=disable"{{.*}} "-fsycl-is-host" +// CHK-RANGE-ROUNDING-MULTI: clang{{.*}} "-triple" "spir64_fpga-unknown-unknown"{{.*}} "-fsycl-is-device"{{.*}} "-fsycl-range-rounding=disable" /// -fintelfpga with -reuse-exe= // RUN: touch %t.cpp diff --git a/clang/test/Driver/sycl-offload.c b/clang/test/Driver/sycl-offload.c index 636d9e89b8092..3d6916cfe3b75 100644 --- a/clang/test/Driver/sycl-offload.c +++ b/clang/test/Driver/sycl-offload.c @@ -508,13 +508,13 @@ // RUN: | FileCheck -check-prefix=CHK-TOOLS-OPTS2 %s // CHK-TOOLS-OPTS2: clang-offload-wrapper{{.*}} "-link-opts=-DFOO1 -DFOO2" -/// -fsycl-disable-range-rounding settings +/// -fsycl-range-rounding settings // RUN: %clang -### -target x86_64-unknown-linux-gnu -fsycl \ // RUN: -fsycl-targets=spir64 -O0 %s 2>&1 \ // RUN: | FileCheck -check-prefix=CHK-DISABLE-RANGE-ROUNDING %s // RUN: %clang_cl -### -fsycl -fsycl-targets=spir64 -Od %s 2>&1 \ // RUN: | FileCheck -check-prefix=CHK-DISABLE-RANGE-ROUNDING %s -// CHK-DISABLE-RANGE-ROUNDING: "-fsycl-disable-range-rounding" +// CHK-DISABLE-RANGE-ROUNDING: "-fsycl-range-rounding=disable" // RUN: %clang -### -target x86_64-unknown-linux-gnu -fsycl \ // RUN: -fsycl-targets=spir64 -O2 %s 2>&1 \ @@ -527,6 +527,8 @@ // RUN: %clang_cl -### -fsycl -fsycl-targets=spir64 %s 2>&1 \ // RUN: | FileCheck -check-prefix=CHK-RANGE-ROUNDING %s // CHK-RANGE-ROUNDING-NOT: "-fsycl-disable-range-rounding" +// CHK-RANGE-ROUNDING-NOT: "-fsycl-range-rounding=disable" +// CHK-RANGE-ROUNDING-NOT: "-fsycl-range-rounding=force" /// ########################################################################### diff --git a/clang/test/Preprocessor/predefined-macros.c b/clang/test/Preprocessor/predefined-macros.c index 119350ebf3fab..199fd03bee63b 100644 --- a/clang/test/Preprocessor/predefined-macros.c +++ b/clang/test/Preprocessor/predefined-macros.c @@ -284,32 +284,60 @@ // CHECK-RDC: #define __CLANG_RDC__ 1 // RUN: %clang_cc1 %s -E -dM -fsycl-is-device \ -// RUN: -triple spir64-unknown-unknown -fsycl-disable-range-rounding -o - \ -// RUN: | FileCheck -match-full-lines %s --check-prefix=CHECK-RANGE +// RUN: -triple spir64-unknown-unknown -fsycl-range-rounding=disable -o - \ +// RUN: | FileCheck -match-full-lines %s --check-prefix=CHECK-DISABLE-RANGE // RUN: %clang_cc1 %s -E -dM -fsycl-is-device \ // RUN: -triple spir64_fpga-unknown-unknown -o - \ -// RUN: | FileCheck -match-full-lines %s --check-prefix=CHECK-NO-RANGE +// RUN: | FileCheck -match-full-lines %s --check-prefix=CHECK-DISABLE-NO-RANGE -// RUN: %clang_cc1 %s -E -dM -fsycl-is-device -fsycl-disable-range-rounding \ +// RUN: %clang_cc1 %s -E -dM -fsycl-is-device -fsycl-range-rounding=disable \ // RUN: -triple spir64_fpga-unknown-unknown -o - \ -// RUN: | FileCheck -match-full-lines %s --check-prefix=CHECK-RANGE +// RUN: | FileCheck -match-full-lines %s --check-prefix=CHECK-DISABLE-RANGE // RUN: %clang_cc1 %s -E -dM -fsycl-is-device -o - \ -// RUN: | FileCheck -match-full-lines %s --check-prefix=CHECK-NO-RANGE +// RUN: | FileCheck -match-full-lines %s --check-prefix=CHECK-DISABLE-NO-RANGE // RUN: %clang_cc1 %s -E -dM -o - \ -// RUN: | FileCheck -match-full-lines %s --check-prefix=CHECK-NO-RANGE +// RUN: | FileCheck -match-full-lines %s --check-prefix=CHECK-DISABLE-NO-RANGE // RUN: %clang_cc1 %s -E -dM -fsycl-is-host \ -// RUN: -triple x86_64-unknown-linux-gnu -fsycl-disable-range-rounding -o - \ -// RUN: | FileCheck -match-full-lines %s --check-prefix=CHECK-RANGE +// RUN: -triple x86_64-unknown-linux-gnu -fsycl-range-rounding=disable -o - \ +// RUN: | FileCheck -match-full-lines %s --check-prefix=CHECK-DISABLE-RANGE // RUN: %clang_cc1 %s -E -dM -fsycl-is-host -o - \ -// RUN: | FileCheck -match-full-lines %s --check-prefix=CHECK-NO-RANGE +// RUN: | FileCheck -match-full-lines %s --check-prefix=CHECK-DISABLE-NO-RANGE -// CHECK-RANGE: #define __SYCL_DISABLE_PARALLEL_FOR_RANGE_ROUNDING__ 1 -// CHECK-NO-RANGE-NOT: #define __SYCL_DISABLE_PARALLEL_FOR_RANGE_ROUNDING__ 1 +// CHECK-DISABLE-RANGE: #define __SYCL_DISABLE_PARALLEL_FOR_RANGE_ROUNDING__ 1 +// CHECK-DISABLE-NO-RANGE-NOT: #define __SYCL_DISABLE_PARALLEL_FOR_RANGE_ROUNDING__ 1 + +// RUN: %clang_cc1 %s -E -dM -fsycl-is-device \ +// RUN: -triple spir64-unknown-unknown -fsycl-range-rounding=force -o - \ +// RUN: | FileCheck -match-full-lines %s --check-prefix=CHECK-FORCE-RANGE + +// RUN: %clang_cc1 %s -E -dM -fsycl-is-device \ +// RUN: -triple spir64_fpga-unknown-unknown -o - \ +// RUN: | FileCheck -match-full-lines %s --check-prefix=CHECK-FORCE-NO-RANGE + +// RUN: %clang_cc1 %s -E -dM -fsycl-is-device -fsycl-range-rounding=force \ +// RUN: -triple spir64_fpga-unknown-unknown -o - \ +// RUN: | FileCheck -match-full-lines %s --check-prefix=CHECK-FORCE-RANGE + +// RUN: %clang_cc1 %s -E -dM -fsycl-is-device -o - \ +// RUN: | FileCheck -match-full-lines %s --check-prefix=CHECK-FORCE-NO-RANGE + +// RUN: %clang_cc1 %s -E -dM -o - \ +// RUN: | FileCheck -match-full-lines %s --check-prefix=CHECK-FORCE-NO-RANGE + +// RUN: %clang_cc1 %s -E -dM -fsycl-is-host \ +// RUN: -triple x86_64-unknown-linux-gnu -fsycl-range-rounding=force -o - \ +// RUN: | FileCheck -match-full-lines %s --check-prefix=CHECK-FORCE-RANGE + +// RUN: %clang_cc1 %s -E -dM -fsycl-is-host -o - \ +// RUN: | FileCheck -match-full-lines %s --check-prefix=CHECK-FORCE-NO-RANGE + +// CHECK-FORCE-RANGE: #define __SYCL_FORCE_PARALLEL_FOR_RANGE_ROUNDING__ 1 +// CHECK-FORCE-NO-RANGE-NOT: #define __SYCL_FORCE_PARALLEL_FOR_RANGE_ROUNDING__ 1 // RUN: %clang_cc1 %s -E -dM -o - -x hip -triple x86_64-unknown-linux-gnu \ // RUN: -fgpu-default-stream=per-thread \ @@ -334,4 +362,4 @@ // RUN: -triple amdgcn-amd-amdhsa -fcuda-is-device | FileCheck -match-full-lines \ // RUN: %s --check-prefix=CHECK-HIPSTDPAR-INTERPOSE-DEV-NEG // CHECK-HIPSTDPAR-INTERPOSE-DEV-NEG: #define __HIPSTDPAR__ 1 -// CHECK-HIPSTDPAR-INTERPOSE-DEV-NEG-NOT: #define __HIPSTDPAR_INTERPOSE_ALLOC__ 1 \ No newline at end of file +// CHECK-HIPSTDPAR-INTERPOSE-DEV-NEG-NOT: #define __HIPSTDPAR_INTERPOSE_ALLOC__ 1 diff --git a/sycl/include/sycl/handler.hpp b/sycl/include/sycl/handler.hpp index 51e2f41de9d75..90b53fedf7090 100644 --- a/sycl/include/sycl/handler.hpp +++ b/sycl/include/sycl/handler.hpp @@ -1181,7 +1181,6 @@ class __SYCL_EXPORT handler { // non-32-bit global range, we wrap the old kernel in a new kernel // that has each work item peform multiple invocations the old // kernel in a 32-bit global range. - auto Dev = detail::getSyclObjImpl(detail::getDeviceFromHandler(*this)); id MaxNWGs = [&] { auto [MaxWGs, HasMaxWGs] = getMaxWorkGroups_v2(); if (!HasMaxWGs) { @@ -1224,6 +1223,11 @@ class __SYCL_EXPORT handler { // will yield a rounded-up value for the total range. Adjust(0, ((RoundedRange[0] + GoodFactor - 1) / GoodFactor) * GoodFactor); } +#ifdef __SYCL_FORCE_PARALLEL_FOR_RANGE_ROUNDING__ + // If we are forcing range rounding kernels to be used, we always want the + // rounded range kernel to be generated, even if rounding isn't needed + DidAdjust = true; +#endif // __SYCL_FORCE_PARALLEL_FOR_RANGE_ROUNDING__ for (int i = 0; i < Dims; ++i) if (RoundedRange[i] > MaxRange[i]) @@ -1330,6 +1334,10 @@ class __SYCL_EXPORT handler { { (void)UserRange; (void)Props; +#ifndef __SYCL_FORCE_PARALLEL_FOR_RANGE_ROUNDING__ // If parallel_for range + // rounding is forced then + // only range rounded kernel + // is generated kernel_parallel_for_wrapper(KernelFunc); #ifndef __SYCL_DEVICE_ONLY__ @@ -1340,6 +1348,9 @@ class __SYCL_EXPORT handler { std::move(KernelFunc)); setType(detail::CG::Kernel); #endif +#else + (void)KernelFunc; +#endif // __SYCL_FORCE_PARALLEL_FOR_RANGE_ROUNDING__ } } diff --git a/sycl/test-e2e/Basic/parallel_for_range_roundup.cpp b/sycl/test-e2e/Basic/parallel_for_range_roundup.cpp index 0a456ef02ea00..9146ca22e947e 100644 --- a/sycl/test-e2e/Basic/parallel_for_range_roundup.cpp +++ b/sycl/test-e2e/Basic/parallel_for_range_roundup.cpp @@ -1,152 +1,163 @@ // REQUIRES: gpu // RUN: %{build} -o %t.out -// RUN: env SYCL_PARALLEL_FOR_RANGE_ROUNDING_TRACE=1 %{run} %t.out | FileCheck %s - +// RUN: env SYCL_PARALLEL_FOR_RANGE_ROUNDING_TRACE=1 %{run} %t.out | FileCheck %s --check-prefix=CHECK-DEFAULT + +// RUN: %{build} -fsycl-range-rounding=force -o %t.out +// RUN: env SYCL_PARALLEL_FOR_RANGE_ROUNDING_TRACE=1 %{run} %t.out | FileCheck %s --check-prefix=CHECK-DEFAULT + +// These tests test 3 things: +// +// 1. The user range is the same as the in kernel range (using BufRange) as +// reported by get_range(). +// 2. That the effective range is the same as the reported range (using +// BufCounter). ie check that the mapping of effective range to user range is +// "onto"? +// 3. That every index in a 1, 2, or 3 dim range is active the execution (using +// BufIndexes). ie check that the mapping of effective range to user range is +// "1-to-1"? +// #include #include + using namespace sycl; +constexpr size_t MagicY = 33, MagicZ = 64; + range<1> Range1 = {0}; range<2> Range2 = {0, 0}; range<3> Range3 = {0, 0, 0}; +template class Kernel1; +template class Kernel2; +template class Kernel3; + void check(const char *msg, size_t v, size_t ref) { std::cout << msg << v << std::endl; assert(v == ref); } -int try_item1(size_t size) { - range<1> Size{size}; - int Counter = 0; - { - buffer, 1> BufRange(&Range1, 1); - buffer BufCounter(&Counter, 1); - queue myQueue; - - myQueue.submit([&](handler &cgh) { - auto AccRange = BufRange.get_access(cgh); - auto AccCounter = BufCounter.get_access(cgh); - cgh.parallel_for(Size, [=](item<1> ITEM) { - AccCounter[0].fetch_add(1); - AccRange[0] = ITEM.get_range(0); - }); - }); - myQueue.wait(); - } - check("Size seen by user = ", Range1.get(0), size); - check("Counter = ", Counter, size); - return 0; +template void checkVec(vec a, vec b) { + assert(Dims == 1 || Dims == 2 || Dims == 3); + assert(a[0] == b[0]); + if constexpr (Dims > 1) + assert(a[1] == b[1]); + if constexpr (Dims > 2) + assert(a[2] == b[2]); } -void try_item2(size_t size) { - range<2> Size{size, 10}; - int Counter = 0; - { - buffer, 1> BufRange(&Range2, 1); - buffer BufCounter(&Counter, 1); - queue myQueue; - - myQueue.submit([&](handler &cgh) { - auto AccRange = BufRange.get_access(cgh); - auto AccCounter = BufCounter.get_access(cgh); - cgh.parallel_for(Size, [=](item<2> ITEM) { - AccCounter[0].fetch_add(1); - AccRange[0][0] = ITEM.get_range(0); - }); - }); - myQueue.wait(); - } - check("Size seen by user = ", Range2.get(0), size); - check("Counter = ", Counter, size * 10); -} - -void try_item3(size_t size) { - range<3> Size{size, 10, 10}; - int Counter = 0; - { - buffer, 1> BufRange(&Range3, 1); - buffer BufCounter(&Counter, 1); - queue myQueue; - - myQueue.submit([&](handler &cgh) { - auto AccRange = BufRange.get_access(cgh); - auto AccCounter = BufCounter.get_access(cgh); - cgh.parallel_for(Size, [=](item<3> ITEM) { - AccCounter[0].fetch_add(1); - AccRange[0][0] = ITEM.get_range(0); - }); - }); - myQueue.wait(); - } - check("Size seen by user = ", Range3.get(0), size); - check("Counter = ", Counter, size * 10 * 10); -} - -void try_id1(size_t size) { +template void try_1d_range(size_t size) { + using IndexCheckT = int; range<1> Size{size}; int Counter = 0; + std::vector ItemIndexes(Size[0]); { buffer, 1> BufRange(&Range1, 1); buffer BufCounter(&Counter, 1); + buffer BufIndexes(ItemIndexes); queue myQueue; myQueue.submit([&](handler &cgh) { auto AccRange = BufRange.get_access(cgh); auto AccCounter = BufCounter.get_access(cgh); - cgh.parallel_for(Size, [=](id<1> ID) { + auto AccIndexes = BufIndexes.get_access(cgh); + cgh.parallel_for>(Size, [=](KernelIdT I) { AccCounter[0].fetch_add(1); - AccRange[0] = ID[0]; + if constexpr (std::is_same_v>) + AccRange[0] = sycl::range<1>(I.get_range(0)); + int Idx = I[0]; + AccIndexes[Idx] = IndexCheckT(I[0]); }); }); myQueue.wait(); } + if constexpr (std::is_same_v>) { + check("Size seen by user at Dim 0 = ", Range1.get(0), size); + } check("Counter = ", Counter, size); + for (auto i = 0; i < Size[0]; ++i) { + checkVec<1>(vec(ItemIndexes[i]), vec(i)); + } + std::cout << "Correct kernel indexes used\n"; } -void try_id2(size_t size) { - range<2> Size{size, 10}; +template void try_2d_range(size_t size) { + using IndexCheckT = int2; + range<2> Size{size, MagicY}; int Counter = 0; + std::vector ItemIndexes(Size[0] * Size[1]); { buffer, 1> BufRange(&Range2, 1); buffer BufCounter(&Counter, 1); + buffer BufIndexes(ItemIndexes); queue myQueue; myQueue.submit([&](handler &cgh) { auto AccRange = BufRange.get_access(cgh); auto AccCounter = BufCounter.get_access(cgh); - cgh.parallel_for(Size, [=](id<2> ID) { + auto AccIndexes = BufIndexes.get_access(cgh); + cgh.parallel_for>(Size, [=](KernelIdT I) { AccCounter[0].fetch_add(1); - AccRange[0][0] = ID[0]; + if constexpr (std::is_same_v>) + AccRange[0] = sycl::range<2>(I.get_range(0), I.get_range(1)); + int Idx = I[0] * Size[1] + I[1]; + AccIndexes[Idx] = IndexCheckT(I[0], I[1]); }); }); myQueue.wait(); } - check("Counter = ", Counter, size * 10); + if constexpr (std::is_same_v>) { + check("Size seen by user at Dim 0 = ", Range2.get(0), Size[0]); + check("Size seen by user at Dim 1 = ", Range2.get(1), Size[1]); + } + check("Counter = ", Counter, size * MagicY); + for (auto i = 0; i < Size[0]; ++i) + for (auto j = 0; j < Size[1]; ++j) + checkVec<2>(ItemIndexes[i * Size[1] + j], IndexCheckT(i, j)); + std::cout << "Correct kernel indexes used\n"; } -void try_id3(size_t size) { - range<3> Size{size, 10, 10}; +template void try_3d_range(size_t size) { + using IndexCheckT = int3; + range<3> Size{size, MagicY, MagicZ}; int Counter = 0; + std::vector ItemIndexes(Size[0] * Size[1] * Size[2]); { buffer, 1> BufRange(&Range3, 1); buffer BufCounter(&Counter, 1); + buffer BufIndexes(ItemIndexes); queue myQueue; myQueue.submit([&](handler &cgh) { auto AccRange = BufRange.get_access(cgh); auto AccCounter = BufCounter.get_access(cgh); - cgh.parallel_for(Size, [=](id<3> ID) { + auto AccIndexes = BufIndexes.get_access(cgh); + cgh.parallel_for>(Size, [=](KernelIdT I) { AccCounter[0].fetch_add(1); - AccRange[0][0] = ID[0]; + if constexpr (std::is_same_v>) + AccRange[0] = + sycl::range<3>(I.get_range(0), I.get_range(1), I.get_range(2)); + int Idx = I[0] * Size[1] * Size[2] + I[1] * Size[2] + I[2]; + AccIndexes[Idx] = IndexCheckT(I[0], I[1], I[2]); }); }); myQueue.wait(); } - check("Counter = ", Counter, size * 10 * 10); + if constexpr (std::is_same_v>) { + check("Size seen by user at Dim 0 = ", Range3.get(0), Size[0]); + check("Size seen by user at Dim 1 = ", Range3.get(1), Size[1]); + check("Size seen by user at Dim 2 = ", Range3.get(2), Size[2]); + } + check("Counter = ", Counter, size * MagicY * MagicZ); + for (auto i = 0; i < Size[0]; ++i) + for (auto j = 0; j < Size[1]; ++j) + for (auto k = 0; k < Size[2]; ++k) + checkVec<3>(ItemIndexes[i * Size[1] * Size[2] + j * Size[2] + k], + IndexCheckT(i, j, k)); + std::cout << "Correct kernel indexes used\n"; } void try_unnamed_lambda(size_t size) { - range<3> Size{size, 10, 10}; + range<3> Size{size, MagicY, MagicZ}; int Counter = 0; { buffer, 1> BufRange(&Range3, 1); @@ -163,57 +174,71 @@ void try_unnamed_lambda(size_t size) { }); myQueue.wait(); } - check("Counter = ", Counter, size * 10 * 10); + check("Counter = ", Counter, size * MagicY * MagicZ); } int main() { - int x; - - x = 1500; - try_item1(x); - try_item2(x); - try_item3(x); - try_id1(x); - try_id2(x); - try_id3(x); + int x = 1500; + try_1d_range>(x); + try_1d_range>(x); + try_2d_range>(x); + try_2d_range>(x); + try_3d_range>(x); + try_3d_range>(x); try_unnamed_lambda(x); x = 256; - try_item1(x); - try_item2(x); - try_item3(x); - try_id1(x); - try_id2(x); - try_id3(x); + try_1d_range>(x); + try_1d_range>(x); + try_2d_range>(x); + try_2d_range>(x); + try_3d_range>(x); + try_3d_range>(x); try_unnamed_lambda(x); - - return 0; } -// CHECK: parallel_for range adjusted at dim 0 from 1500 to 1504 -// CHECK-NEXT: Size seen by user = 1500 -// CHECK-NEXT: Counter = 1500 -// CHECK-NEXT: parallel_for range adjusted at dim 0 from 1500 to 1504 -// CHECK-NEXT: Size seen by user = 1500 -// CHECK-NEXT: Counter = 15000 -// CHECK-NEXT: parallel_for range adjusted at dim 0 from 1500 to 1504 -// CHECK-NEXT: Size seen by user = 1500 -// CHECK-NEXT: Counter = 150000 -// CHECK-NEXT: parallel_for range adjusted at dim 0 from 1500 to 1504 -// CHECK-NEXT: Counter = 1500 -// CHECK-NEXT: parallel_for range adjusted at dim 0 from 1500 to 1504 -// CHECK-NEXT: Counter = 15000 -// CHECK-NEXT: parallel_for range adjusted at dim 0 from 1500 to 1504 -// CHECK-NEXT: Counter = 150000 -// CHECK-NEXT: parallel_for range adjusted at dim 0 from 1500 to 1504 -// CHECK-NEXT: Counter = 150000 -// CHECK-NEXT: Size seen by user = 256 -// CHECK-NEXT: Counter = 256 -// CHECK-NEXT: Size seen by user = 256 -// CHECK-NEXT: Counter = 2560 -// CHECK-NEXT: Size seen by user = 256 -// CHECK-NEXT: Counter = 25600 -// CHECK-NEXT: Counter = 256 -// CHECK-NEXT: Counter = 2560 -// CHECK-NEXT: Counter = 25600 -// CHECK-NEXT: Counter = 25600 +// CHECK-DEFAULT: parallel_for range adjusted at dim 0 from 1500 to 1504 +// CHECK-DEFAULT-NEXT: Size seen by user at Dim 0 = 1500 +// CHECK-DEFAULT-NEXT: Counter = 1500 +// CHECK-DEFAULT-NEXT: Correct kernel indexes used +// CHECK-DEFAULT-NEXT: parallel_for range adjusted at dim 0 from 1500 to 1504 +// CHECK-DEFAULT-NEXT: Counter = 1500 +// CHECK-DEFAULT-NEXT: Correct kernel indexes used +// CHECK-DEFAULT-NEXT: parallel_for range adjusted at dim 0 from 1500 to 1504 +// CHECK-DEFAULT-NEXT: Size seen by user at Dim 0 = 1500 +// CHECK-DEFAULT-NEXT: Size seen by user at Dim 1 = 33 +// CHECK-DEFAULT-NEXT: Counter = 49500 +// CHECK-DEFAULT-NEXT: Correct kernel indexes used +// CHECK-DEFAULT-NEXT: parallel_for range adjusted at dim 0 from 1500 to 1504 +// CHECK-DEFAULT-NEXT: Counter = 49500 +// CHECK-DEFAULT-NEXT: Correct kernel indexes used +// CHECK-DEFAULT-NEXT: parallel_for range adjusted at dim 0 from 1500 to 1504 +// CHECK-DEFAULT-NEXT: Size seen by user at Dim 0 = 1500 +// CHECK-DEFAULT-NEXT: Size seen by user at Dim 1 = 33 +// CHECK-DEFAULT-NEXT: Size seen by user at Dim 2 = 64 +// CHECK-DEFAULT-NEXT: Counter = 3168000 +// CHECK-DEFAULT-NEXT: Correct kernel indexes used +// CHECK-DEFAULT-NEXT: parallel_for range adjusted at dim 0 from 1500 to 1504 +// CHECK-DEFAULT-NEXT: Counter = 3168000 +// CHECK-DEFAULT-NEXT: Correct kernel indexes used +// CHECK-DEFAULT-NEXT: parallel_for range adjusted at dim 0 from 1500 to 1504 +// CHECK-DEFAULT-NEXT: Counter = 3168000 +// CHECK-DEFAULT-NEXT: Size seen by user at Dim 0 = 256 +// CHECK-DEFAULT-NEXT: Counter = 256 +// CHECK-DEFAULT-NEXT: Correct kernel indexes used +// CHECK-DEFAULT-NEXT: Counter = 256 +// CHECK-DEFAULT-NEXT: Correct kernel indexes used +// CHECK-DEFAULT-NEXT: Size seen by user at Dim 0 = 256 +// CHECK-DEFAULT-NEXT: Size seen by user at Dim 1 = 33 +// CHECK-DEFAULT-NEXT: Counter = 8448 +// CHECK-DEFAULT-NEXT: Correct kernel indexes used +// CHECK-DEFAULT-NEXT: Counter = 8448 +// CHECK-DEFAULT-NEXT: Correct kernel indexes used +// CHECK-DEFAULT-NEXT: Size seen by user at Dim 0 = 256 +// CHECK-DEFAULT-NEXT: Size seen by user at Dim 1 = 33 +// CHECK-DEFAULT-NEXT: Size seen by user at Dim 2 = 64 +// CHECK-DEFAULT-NEXT: Counter = 540672 +// CHECK-DEFAULT-NEXT: Correct kernel indexes used +// CHECK-DEFAULT-NEXT: Counter = 540672 +// CHECK-DEFAULT-NEXT: Correct kernel indexes used +// CHECK-DEFAULT-NEXT: Counter = 540672 From 7fccec20e9a4f30c86cf6ab1e22906e8e2079a3e Mon Sep 17 00:00:00 2001 From: Hugh Delaney Date: Wed, 14 Feb 2024 17:30:11 +0000 Subject: [PATCH 2/8] Update documentation Add some description of how the -fsycl-range-rounding flag should be used. --- sycl/doc/design/ParallelForRangeRounding.md | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/sycl/doc/design/ParallelForRangeRounding.md b/sycl/doc/design/ParallelForRangeRounding.md index a4199aed8e800..7f43cafe6e96e 100644 --- a/sycl/doc/design/ParallelForRangeRounding.md +++ b/sycl/doc/design/ParallelForRangeRounding.md @@ -42,5 +42,8 @@ rounding will only be used if the SYCL runtime X dimension exceeds some minimum value, which can be configured using the `SYCL_PARALLEL_FOR_RANGE_ROUNDING_PARAMS` environment variable. -Generation of range rounded kernels can be disabled by using the compiler flag -`-fsycl-disable-range-rounding`. +In order to reduce binary size, the user can tell the compiler only to generate +the range rounded kernel, `-fsycl-range-rounding=force`. The user can also tell +the SYCL implementation to only produce the unrounded kernel using the flag +`-fsycl-range-rounding=disable`. By default both kernels will be generated, +which is equivalent to `-fsycl-range-rounding=on`. From 2695911b00f9ecff94539a4c0671b33a8d00296f Mon Sep 17 00:00:00 2001 From: Hugh Delaney Date: Thu, 15 Feb 2024 11:17:42 +0000 Subject: [PATCH 3/8] Fix test Compiler invocation was missing -Xclang --- sycl/test-e2e/Basic/parallel_for_range_roundup.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sycl/test-e2e/Basic/parallel_for_range_roundup.cpp b/sycl/test-e2e/Basic/parallel_for_range_roundup.cpp index 9146ca22e947e..e48b0ec7cd161 100644 --- a/sycl/test-e2e/Basic/parallel_for_range_roundup.cpp +++ b/sycl/test-e2e/Basic/parallel_for_range_roundup.cpp @@ -1,8 +1,8 @@ // REQUIRES: gpu -// RUN: %{build} -o %t.out +// RUN: %{build} -Xclang -fsycl-range-rounding=disable -o %t.out // RUN: env SYCL_PARALLEL_FOR_RANGE_ROUNDING_TRACE=1 %{run} %t.out | FileCheck %s --check-prefix=CHECK-DEFAULT -// RUN: %{build} -fsycl-range-rounding=force -o %t.out +// RUN: %{build} -Xclang -fsycl-range-rounding=force -o %t.out // RUN: env SYCL_PARALLEL_FOR_RANGE_ROUNDING_TRACE=1 %{run} %t.out | FileCheck %s --check-prefix=CHECK-DEFAULT // These tests test 3 things: From 8bb45ed78cf05c999b5082152957337d7a33d9da Mon Sep 17 00:00:00 2001 From: Hugh Delaney Date: Thu, 15 Feb 2024 11:41:51 +0000 Subject: [PATCH 4/8] Respond to comments - Change if else to switch in integration header emission and init preprocessor - Change comment in handler.hpp - Change comments and use static_assert with message in test-e2e - Change enum to have no defined int values - Wrap long line in LangOptions.def # --- clang/include/clang/Basic/LangOptions.def | 3 ++- clang/include/clang/Basic/LangOptions.h | 6 +++--- clang/lib/Frontend/InitPreprocessor.cpp | 13 ++++++++----- clang/lib/Sema/SemaSYCL.cpp | 13 +++++++------ sycl/include/sycl/handler.hpp | 7 +++---- sycl/test-e2e/Basic/parallel_for_range_roundup.cpp | 13 +++++++------ 6 files changed, 30 insertions(+), 25 deletions(-) diff --git a/clang/include/clang/Basic/LangOptions.def b/clang/include/clang/Basic/LangOptions.def index 431dc20038941..7fe80879f7664 100644 --- a/clang/include/clang/Basic/LangOptions.def +++ b/clang/include/clang/Basic/LangOptions.def @@ -298,7 +298,8 @@ LANGOPT( "SYCL compiler assumes value fits within MAX_INT for member function of " "get/operator[], get_id/operator[] and get_global_id/get_global_linear_id " "in SYCL class id, iterm and nd_iterm") -ENUM_LANGOPT(SYCLRangeRounding, SYCLRangeRoundingPreference, 2, SYCLRangeRoundingPreference::On, +ENUM_LANGOPT(SYCLRangeRounding, SYCLRangeRoundingPreference, 2, + SYCLRangeRoundingPreference::On, "Preference for SYCL parallel_for range rounding") LANGOPT(SYCLEnableIntHeaderDiags, 1, 0, "Enable diagnostics that require the " "SYCL integration header") diff --git a/clang/include/clang/Basic/LangOptions.h b/clang/include/clang/Basic/LangOptions.h index 96c58a308168a..c8081a77d65c9 100644 --- a/clang/include/clang/Basic/LangOptions.h +++ b/clang/include/clang/Basic/LangOptions.h @@ -152,9 +152,9 @@ class LangOptionsBase { }; enum class SYCLRangeRoundingPreference { - On = 0, - Disable = 1, - Force = 2, + On, + Disable, + Force, }; enum HLSLLangStd { diff --git a/clang/lib/Frontend/InitPreprocessor.cpp b/clang/lib/Frontend/InitPreprocessor.cpp index 073660d86c0a6..0dde49bde21a0 100644 --- a/clang/lib/Frontend/InitPreprocessor.cpp +++ b/clang/lib/Frontend/InitPreprocessor.cpp @@ -579,13 +579,16 @@ static void InitializeStandardPredefinedMacros(const TargetInfo &TI, // Set __SYCL_DISABLE_PARALLEL_FOR_RANGE_ROUNDING__ macro for // both host and device compilations if -fsycl-disable-range-rounding // flag is used. - if (LangOpts.getSYCLRangeRounding() == - LangOptions::SYCLRangeRoundingPreference::Disable) + switch (LangOpts.getSYCLRangeRounding()) { + case LangOptions::SYCLRangeRoundingPreference::Disable: Builder.defineMacro("__SYCL_DISABLE_PARALLEL_FOR_RANGE_ROUNDING__"); - - if (LangOpts.getSYCLRangeRounding() == - LangOptions::SYCLRangeRoundingPreference::Force) + break; + case LangOptions::SYCLRangeRoundingPreference::Force: Builder.defineMacro("__SYCL_FORCE_PARALLEL_FOR_RANGE_ROUNDING__"); + break; + default: + break; + } } if (LangOpts.DeclareSPIRVBuiltins) { diff --git a/clang/lib/Sema/SemaSYCL.cpp b/clang/lib/Sema/SemaSYCL.cpp index a9b3f8698f668..0d07d673f3880 100644 --- a/clang/lib/Sema/SemaSYCL.cpp +++ b/clang/lib/Sema/SemaSYCL.cpp @@ -5172,18 +5172,19 @@ void SYCLIntegrationHeader::emit(raw_ostream &O) { O << "#endif //" << Macro.first << "\n\n"; } - if (S.getLangOpts().getSYCLRangeRounding() == - LangOptions::SYCLRangeRoundingPreference::Disable) { + switch (S.getLangOpts().getSYCLRangeRounding()) { + case LangOptions::SYCLRangeRoundingPreference::Disable: O << "#ifndef __SYCL_DISABLE_PARALLEL_FOR_RANGE_ROUNDING__ \n"; O << "#define __SYCL_DISABLE_PARALLEL_FOR_RANGE_ROUNDING__ 1\n"; O << "#endif //__SYCL_DISABLE_PARALLEL_FOR_RANGE_ROUNDING__\n\n"; - } - - if (S.getLangOpts().getSYCLRangeRounding() == - LangOptions::SYCLRangeRoundingPreference::Force) { + break; + case LangOptions::SYCLRangeRoundingPreference::Force: O << "#ifndef __SYCL_FORCE_PARALLEL_FOR_RANGE_ROUNDING__ \n"; O << "#define __SYCL_FORCE_PARALLEL_FOR_RANGE_ROUNDING__ 1\n"; O << "#endif //__SYCL_FORCE_PARALLEL_FOR_RANGE_ROUNDING__\n\n"; + break; + default: + break; } if (SpecConsts.size() > 0) { diff --git a/sycl/include/sycl/handler.hpp b/sycl/include/sycl/handler.hpp index 90b53fedf7090..672f8da32c91c 100644 --- a/sycl/include/sycl/handler.hpp +++ b/sycl/include/sycl/handler.hpp @@ -1334,10 +1334,9 @@ class __SYCL_EXPORT handler { { (void)UserRange; (void)Props; -#ifndef __SYCL_FORCE_PARALLEL_FOR_RANGE_ROUNDING__ // If parallel_for range - // rounding is forced then - // only range rounded kernel - // is generated +#ifndef __SYCL_FORCE_PARALLEL_FOR_RANGE_ROUNDING__ + // If parallel_for range rounding is forced then only range rounded + // kernel is generated kernel_parallel_for_wrapper(KernelFunc); #ifndef __SYCL_DEVICE_ONLY__ diff --git a/sycl/test-e2e/Basic/parallel_for_range_roundup.cpp b/sycl/test-e2e/Basic/parallel_for_range_roundup.cpp index e48b0ec7cd161..d40a278aa488e 100644 --- a/sycl/test-e2e/Basic/parallel_for_range_roundup.cpp +++ b/sycl/test-e2e/Basic/parallel_for_range_roundup.cpp @@ -10,11 +10,11 @@ // 1. The user range is the same as the in kernel range (using BufRange) as // reported by get_range(). // 2. That the effective range is the same as the reported range (using -// BufCounter). ie check that the mapping of effective range to user range is -// "onto"? -// 3. That every index in a 1, 2, or 3 dim range is active the execution (using -// BufIndexes). ie check that the mapping of effective range to user range is -// "1-to-1"? +// BufCounter). i.e. check that the mapping of effective range to user range +// is "onto". +// 3. That every index in a 1, 2, or 3 dimension range is active the execution +// (using BufIndexes). i.e. check that the mapping of effective range to user +// range is "one-to-one". // #include #include @@ -37,7 +37,8 @@ void check(const char *msg, size_t v, size_t ref) { } template void checkVec(vec a, vec b) { - assert(Dims == 1 || Dims == 2 || Dims == 3); + static_assert(Dims == 1 || Dims == 2 || Dims == 3, + "Should only be use for 1, 2 or 3 dimensional vectors"); assert(a[0] == b[0]); if constexpr (Dims > 1) assert(a[1] == b[1]); From 4676c358b1179b3099f36b5d21ce2baf90a44f7a Mon Sep 17 00:00:00 2001 From: Hugh Delaney Date: Fri, 16 Feb 2024 14:54:27 +0000 Subject: [PATCH 5/8] Make flag a driver flag, not just cc1 Makes -fsycl-range-rounding= accessible to driver calls, not just cc1 invocations. --- clang/include/clang/Driver/Options.td | 29 ++++++++++--------- clang/lib/Driver/ToolChains/Clang.cpp | 3 ++ clang/test/Driver/sycl-offload.c | 13 +++++++++ .../Basic/parallel_for_range_roundup.cpp | 4 +-- 4 files changed, 33 insertions(+), 16 deletions(-) diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td index ebef5188982bb..f770a6edb66d1 100644 --- a/clang/include/clang/Driver/Options.td +++ b/clang/include/clang/Driver/Options.td @@ -3997,6 +3997,21 @@ def fsycl_host_compiler_options_EQ : Joined<["-"], "fsycl-host-compiler-options= Visibility<[ClangOption, CLOption, DXCOption]>, HelpText<"When performing the host compilation with " "-fsycl-host-compiler specified, use the given options during that compile. " "Options are expected to be a quoted list of space separated options.">; +def fsycl_range_rounding_EQ : Joined<["-"], "fsycl-range-rounding=">, + Visibility<[ClangOption, CLOption, DXCOption, CC1Option]>, + Values<"on,disable,force">, + NormalizedValuesScope<"LangOptions::SYCLRangeRoundingPreference">, + NormalizedValues<["On", "Disable", "Force"]>, + MarshallingInfoEnum, "On">, + HelpText<"Options for range rounding of SYCL range kernels: " + "disable (do not generate range rounded kernels) " + "force (only generate range rounded kernels) " + "on (generate range rounded kernels as well as unrounded kernels). Default is 'on'">; +def fsycl_disable_range_rounding : Flag<["-"], "fsycl-disable-range-rounding">, + Visibility<[ClangOption, CLOption, DXCOption, CC1Option]>, + Alias, AliasArgs<["disable"]>, + HelpText<"Deprecated: please use -fsycl-range-rounding=disable instead.">, + Flags<[Deprecated]>; def fno_sycl_use_footer : Flag<["-"], "fno-sycl-use-footer">, Visibility<[ClangOption, CLOption, DXCOption]>, HelpText<"Disable usage of the integration footer during SYCL enabled " "compilations.">; @@ -8256,20 +8271,6 @@ defm sycl_allow_func_ptr: BoolFOption<"sycl-allow-func-ptr", def fenable_sycl_dae : Flag<["-"], "fenable-sycl-dae">, HelpText<"Enable Dead Argument Elimination in SPIR kernels">, MarshallingInfoFlag>; -def fsycl_range_rounding_EQ : Joined<["-"], "fsycl-range-rounding=">, - Visibility<[ClangOption, CLOption, DXCOption]>, - Values<"on,disable,force">, - NormalizedValuesScope<"LangOptions::SYCLRangeRoundingPreference">, - NormalizedValues<["On", "Disable", "Force"]>, - MarshallingInfoEnum, "On">, - HelpText<"Options for range rounding of SYCL range kernels: " - "disable (do not generate range rounded kernels) " - "force (only generate range rounded kernels) " - "on (generate range rounded kernels as well as unrounded kernels). Default is 'on'">; -def fsycl_disable_range_rounding : Flag<["-"], "fsycl-disable-range-rounding">, - Alias, AliasArgs<["disable"]>, - HelpText<"Deprecated: please use -fsycl-range-rounding=disable instead.">, - Flags<[Deprecated]>; def fsycl_enable_int_header_diags: Flag<["-"], "fsycl-enable-int-header-diags">, HelpText<"Enable diagnostics that require the SYCL integration header.">, MarshallingInfoFlag>; diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp index 2fcd80364bf7e..81d5fa1a94742 100644 --- a/clang/lib/Driver/ToolChains/Clang.cpp +++ b/clang/lib/Driver/ToolChains/Clang.cpp @@ -5427,6 +5427,9 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA, options::OPT_fno_sycl_esimd_force_stateless_mem, true)) CmdArgs.push_back("-fno-sycl-esimd-force-stateless-mem"); + if (Arg *A = Args.getLastArg(options::OPT_fsycl_range_rounding_EQ)) + A->render(Args, CmdArgs); + // Add the Unique ID prefix StringRef UniqueID = D.getSYCLUniqueID(Input.getBaseInput()); if (!UniqueID.empty()) diff --git a/clang/test/Driver/sycl-offload.c b/clang/test/Driver/sycl-offload.c index 3d6916cfe3b75..b6c2891010152 100644 --- a/clang/test/Driver/sycl-offload.c +++ b/clang/test/Driver/sycl-offload.c @@ -509,6 +509,19 @@ // CHK-TOOLS-OPTS2: clang-offload-wrapper{{.*}} "-link-opts=-DFOO1 -DFOO2" /// -fsycl-range-rounding settings +/// +/// // Check that driver flag is passed to cc1 +// RUN: %clang -### -fsycl -fsycl-range-rounding=disable %s 2>&1 \ +// RUN: | FileCheck -check-prefix=CHK-DRIVER-RANGE-ROUNDING-DISABLE %s +// RUN: %clang -### -fsycl -fsycl-range-rounding=force %s 2>&1 \ +// RUN: | FileCheck -check-prefix=CHK-DRIVER-RANGE-ROUNDING-FORCE %s +// RUN: %clang -### -fsycl -fsycl-range-rounding=on %s 2>&1 \ +// RUN: | FileCheck -check-prefix=CHK-DRIVER-RANGE-ROUNDING-ON %s +// CHK-DRIVER-RANGE-ROUNDING-DISABLE: "-cc1{{.*}}-fsycl-range-rounding=disable" +// CHK-DRIVER-RANGE-ROUNDING-FORCE: "-cc1{{.*}}-fsycl-range-rounding=force" +// CHK-DRIVER-RANGE-ROUNDING-ON: "-cc1{{.*}}-fsycl-range-rounding=on" +/// +/// // RUN: %clang -### -target x86_64-unknown-linux-gnu -fsycl \ // RUN: -fsycl-targets=spir64 -O0 %s 2>&1 \ // RUN: | FileCheck -check-prefix=CHK-DISABLE-RANGE-ROUNDING %s diff --git a/sycl/test-e2e/Basic/parallel_for_range_roundup.cpp b/sycl/test-e2e/Basic/parallel_for_range_roundup.cpp index d40a278aa488e..60022d18e8bf3 100644 --- a/sycl/test-e2e/Basic/parallel_for_range_roundup.cpp +++ b/sycl/test-e2e/Basic/parallel_for_range_roundup.cpp @@ -1,8 +1,8 @@ // REQUIRES: gpu -// RUN: %{build} -Xclang -fsycl-range-rounding=disable -o %t.out +// RUN: %{build} -fsycl-range-rounding=disable -o %t.out // RUN: env SYCL_PARALLEL_FOR_RANGE_ROUNDING_TRACE=1 %{run} %t.out | FileCheck %s --check-prefix=CHECK-DEFAULT -// RUN: %{build} -Xclang -fsycl-range-rounding=force -o %t.out +// RUN: %{build} -fsycl-range-rounding=force -o %t.out // RUN: env SYCL_PARALLEL_FOR_RANGE_ROUNDING_TRACE=1 %{run} %t.out | FileCheck %s --check-prefix=CHECK-DEFAULT // These tests test 3 things: From 8883f2d5ee0cdc8aa7a9956c8aa334a133484f7f Mon Sep 17 00:00:00 2001 From: Hugh Delaney Date: Fri, 16 Feb 2024 16:29:10 +0000 Subject: [PATCH 6/8] Remove erroneous range rounding disable flag Range rounding disable is tested in another test. --- sycl/test-e2e/Basic/parallel_for_range_roundup.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sycl/test-e2e/Basic/parallel_for_range_roundup.cpp b/sycl/test-e2e/Basic/parallel_for_range_roundup.cpp index 60022d18e8bf3..9e6d74dcb3f85 100644 --- a/sycl/test-e2e/Basic/parallel_for_range_roundup.cpp +++ b/sycl/test-e2e/Basic/parallel_for_range_roundup.cpp @@ -1,5 +1,5 @@ // REQUIRES: gpu -// RUN: %{build} -fsycl-range-rounding=disable -o %t.out +// RUN: %{build} -o %t.out // RUN: env SYCL_PARALLEL_FOR_RANGE_ROUNDING_TRACE=1 %{run} %t.out | FileCheck %s --check-prefix=CHECK-DEFAULT // RUN: %{build} -fsycl-range-rounding=force -o %t.out From 2e60a20b0fae0a00502abcc394eb86886d05b218 Mon Sep 17 00:00:00 2001 From: Hugh Delaney Date: Fri, 16 Feb 2024 17:32:21 +0000 Subject: [PATCH 7/8] Make user range rounding preference override -O0 Range rounding is disabled for -O0 but now a user preference for range rounding can override this. --- clang/lib/Driver/ToolChains/Clang.cpp | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp index 81d5fa1a94742..ea652823950cc 100644 --- a/clang/lib/Driver/ToolChains/Clang.cpp +++ b/clang/lib/Driver/ToolChains/Clang.cpp @@ -5454,7 +5454,10 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA, bool DisableRangeRounding = false; if (Arg *A = Args.getLastArg(options::OPT_O_Group)) { if (A->getOption().matches(options::OPT_O0)) - DisableRangeRounding = true; + // If the user has set some range rounding preference then let that + // override not range rounding at -O0 + if (!Args.getLastArg(options::OPT_fsycl_range_rounding_EQ)) + DisableRangeRounding = true; } if (DisableRangeRounding || HasFPGA) CmdArgs.push_back("-fsycl-range-rounding=disable"); From d0b0d75866717bcb932b654ba771cd2c14145086 Mon Sep 17 00:00:00 2001 From: Hugh Delaney Date: Mon, 19 Feb 2024 09:27:15 +0000 Subject: [PATCH 8/8] Update test Make sure that if -fsycl-range-rounding=force is used, there is no emission of the unrounded range kernel at -O0 and -Od --- clang/test/Driver/sycl-offload.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/clang/test/Driver/sycl-offload.c b/clang/test/Driver/sycl-offload.c index b6c2891010152..f67ca70bbb717 100644 --- a/clang/test/Driver/sycl-offload.c +++ b/clang/test/Driver/sycl-offload.c @@ -527,7 +527,14 @@ // RUN: | FileCheck -check-prefix=CHK-DISABLE-RANGE-ROUNDING %s // RUN: %clang_cl -### -fsycl -fsycl-targets=spir64 -Od %s 2>&1 \ // RUN: | FileCheck -check-prefix=CHK-DISABLE-RANGE-ROUNDING %s +// RUN: %clang -### -target x86_64-unknown-linux-gnu -fsycl \ +// RUN: -O0 -fsycl-range-rounding=force %s 2>&1 \ +// RUN: | FileCheck -check-prefix=CHK-OVERRIDE-RANGE-ROUNDING %s +// RUN: %clang_cl -### -fsycl -Od %s 2>&1 -fsycl-range-rounding=force %s 2>&1 \ +// RUN: | FileCheck -check-prefix=CHK-OVERRIDE-RANGE-ROUNDING %s // CHK-DISABLE-RANGE-ROUNDING: "-fsycl-range-rounding=disable" +// CHK-OVERRIDE-RANGE-ROUNDING: "-fsycl-range-rounding=force" +// CHK-OVERRIDE-RANGE-ROUNDING-NOT: "-fsycl-range-rounding=disable" // RUN: %clang -### -target x86_64-unknown-linux-gnu -fsycl \ // RUN: -fsycl-targets=spir64 -O2 %s 2>&1 \