Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[SYCL] Add Experimental Range Rounding #12690

Merged
merged 9 commits into from
Apr 1, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions clang/include/clang/Basic/LangOptions.def
Original file line number Diff line number Diff line change
Expand Up @@ -303,6 +303,7 @@ LANGOPT(
ENUM_LANGOPT(SYCLRangeRounding, SYCLRangeRoundingPreference, 2,
SYCLRangeRoundingPreference::On,
"Preference for SYCL parallel_for range rounding")
LANGOPT(SYCLExperimentalRangeRounding, 1, 0, "Use experimental parallel for range rounding")
hdelan marked this conversation as resolved.
Show resolved Hide resolved
LANGOPT(SYCLEnableIntHeaderDiags, 1, 0, "Enable diagnostics that require the "
"SYCL integration header")
LANGOPT(SYCLAllowVirtualFunctions, 1, 0,
Expand Down
4 changes: 4 additions & 0 deletions clang/include/clang/Driver/Options.td
Original file line number Diff line number Diff line change
Expand Up @@ -4012,6 +4012,10 @@ def fsycl_disable_range_rounding : Flag<["-"], "fsycl-disable-range-rounding">,
Alias<fsycl_range_rounding_EQ>, AliasArgs<["disable"]>,
HelpText<"Deprecated: please use -fsycl-range-rounding=disable instead.">,
Flags<[Deprecated]>;
def fsycl_exp_range_rounding : Flag<["-"], "fsycl-exp-range-rounding">,
Visibility<[ClangOption, CLOption, DXCOption, CC1Option]>,
HelpText<"Use experimental range rounding.">,
MarshallingInfoFlag<LangOpts<"SYCLExperimentalRangeRounding">>;
def fno_sycl_use_footer : Flag<["-"], "fno-sycl-use-footer">, Visibility<[ClangOption, CLOption, DXCOption]>,
HelpText<"Disable usage of the integration footer during SYCL enabled "
"compilations.">;
Expand Down
3 changes: 3 additions & 0 deletions clang/lib/Driver/ToolChains/Clang.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5430,6 +5430,9 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA,
if (Arg *A = Args.getLastArg(options::OPT_fsycl_range_rounding_EQ))
A->render(Args, CmdArgs);

if (Arg *A = Args.getLastArg(options::OPT_fsycl_exp_range_rounding))
A->render(Args, CmdArgs);
hdelan marked this conversation as resolved.
Show resolved Hide resolved

// Add the Unique ID prefix
StringRef UniqueID = D.getSYCLUniqueID(Input.getBaseInput());
if (!UniqueID.empty())
Expand Down
6 changes: 6 additions & 0 deletions clang/lib/Frontend/InitPreprocessor.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -589,6 +589,12 @@ static void InitializeStandardPredefinedMacros(const TargetInfo &TI,
default:
break;
}

// Set __SYCL_EXP_PARALLEL_FOR_RANGE_ROUNDING__ macro for
// both host and device compilations if -fsycl-exp-range-rounding
// flag is used.
if (LangOpts.SYCLExperimentalRangeRounding)
Builder.defineMacro("__SYCL_EXP_PARALLEL_FOR_RANGE_ROUNDING__");
}

if (LangOpts.DeclareSPIRVBuiltins) {
Expand Down
6 changes: 6 additions & 0 deletions clang/lib/Sema/SemaSYCL.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5187,6 +5187,12 @@ void SYCLIntegrationHeader::emit(raw_ostream &O) {
break;
}

if (S.getLangOpts().SYCLExperimentalRangeRounding) {
O << "#ifndef __SYCL_EXP_PARALLEL_FOR_RANGE_ROUNDING__ \n";
O << "#define __SYCL_EXP_PARALLEL_FOR_RANGE_ROUNDING__ 1\n";
O << "#endif //__SYCL_EXP_PARALLEL_FOR_RANGE_ROUNDING__\n\n";
}

if (SpecConsts.size() > 0) {
O << "// Forward declarations of templated spec constant types:\n";
for (const auto &SC : SpecConsts)
Expand Down
28 changes: 28 additions & 0 deletions clang/test/Preprocessor/predefined-macros.c
Original file line number Diff line number Diff line change
Expand Up @@ -339,6 +339,34 @@
// CHECK-FORCE-RANGE: #define __SYCL_FORCE_PARALLEL_FOR_RANGE_ROUNDING__ 1
// CHECK-FORCE-NO-RANGE-NOT: #define __SYCL_FORCE_PARALLEL_FOR_RANGE_ROUNDING__ 1

// RUN: %clang_cc1 %s -E -dM -fsycl-is-device \
// RUN: -triple spir64-unknown-unknown -fsycl-exp-range-rounding -o - \
// RUN: | FileCheck -match-full-lines %s --check-prefix=CHECK-EXP-RANGE

// RUN: %clang_cc1 %s -E -dM -fsycl-is-device \
// RUN: -triple spir64_fpga-unknown-unknown -o - \
// RUN: | FileCheck -match-full-lines %s --check-prefix=CHECK-EXP-NO-RANGE

// RUN: %clang_cc1 %s -E -dM -fsycl-is-device -fsycl-exp-range-rounding \
// RUN: -triple spir64_fpga-unknown-unknown -o - \
// RUN: | FileCheck -match-full-lines %s --check-prefix=CHECK-EXP-RANGE

// RUN: %clang_cc1 %s -E -dM -fsycl-is-device -o - \
// RUN: | FileCheck -match-full-lines %s --check-prefix=CHECK-EXP-NO-RANGE

// RUN: %clang_cc1 %s -E -dM -o - \
// RUN: | FileCheck -match-full-lines %s --check-prefix=CHECK-EXP-NO-RANGE

// RUN: %clang_cc1 %s -E -dM -fsycl-is-host \
// RUN: -triple x86_64-unknown-linux-gnu -fsycl-exp-range-rounding -o - \
// RUN: | FileCheck -match-full-lines %s --check-prefix=CHECK-EXP-RANGE

// RUN: %clang_cc1 %s -E -dM -fsycl-is-host -o - \
// RUN: | FileCheck -match-full-lines %s --check-prefix=CHECK-EXP-NO-RANGE

// CHECK-EXP-RANGE: #define __SYCL_EXP_PARALLEL_FOR_RANGE_ROUNDING__ 1
// CHECK-EXP-NO-RANGE-NOT: #define __SYCL_EXP_PARALLEL_FOR_RANGE_ROUNDING__ 1

// RUN: %clang_cc1 %s -E -dM -o - -x hip -triple x86_64-unknown-linux-gnu \
// RUN: -fgpu-default-stream=per-thread \
// RUN: | FileCheck -match-full-lines %s --check-prefix=CHECK-PTH
Expand Down
32 changes: 32 additions & 0 deletions sycl/doc/design/ParallelForRangeRounding.md
Original file line number Diff line number Diff line change
Expand Up @@ -47,3 +47,35 @@ the range rounded kernel, `-fsycl-range-rounding=force`. The user can also tell
the SYCL implementation to only produce the unrounded kernel using the flag
`-fsycl-range-rounding=disable`. By default both kernels will be generated,
which is equivalent to `-fsycl-range-rounding=on`.

## Experimental Range Rounding

Experimental range rounding will perform rounding in all dimensions.
Experimental range rounding can be set using the `-fsycl-exp-range-rounding`
flag.

Some oddly shaped ranges and how they might round:

```
{43} -> {64}
{43, 79} -> {64, 96}
{43, 79, 7} -> {64, 96, 8}
```

The user can specify the factor that they want the rounded range to be a
multiple of in all dimensions using the
`SYCL_PARALLEL_FOR_RANGE_ROUNDING_PARAMS` environment variable. When
experimental range rounding is used, only the middle value in
`SYCL_PARALLEL_FOR_RANGE_ROUNDING_PARAMS` is used.
If `SYCL_PARALLEL_FOR_RANGE_ROUNDING_PARAMS` is set to `1:256:1`, the rounded
range will divide `256` in all dimensions

```
{43} -> {256}
{43, 257} -> {256, 512}
{43, 257, 7} -> {256, 512, 256}
```

`-fsycl-range-rounding=disable` will override `-fsycl-exp-range-rounding`. If
both are used in conjunction then no range rounding will happen.

24 changes: 24 additions & 0 deletions sycl/include/sycl/handler.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -1241,6 +1241,29 @@ class __SYCL_EXPORT handler {
DidAdjust = true;
};

#ifdef __SYCL_EXP_PARALLEL_FOR_RANGE_ROUNDING__
hdelan marked this conversation as resolved.
Show resolved Hide resolved
size_t GoodExpFactor = 1;
switch (Dims) {
case 1:
GoodExpFactor = 32; // Make global range multiple of {32}
break;
case 2:
GoodExpFactor = 16; // Make global range multiple of {16, 16}
break;
case 3:
GoodExpFactor = 8; // Make global range multiple of {8, 8, 8}
break;
}

// Check if rounding parameters have been set through environment:
// SYCL_PARALLEL_FOR_RANGE_ROUNDING_PARAMS=MinRound:PreferredRound:MinRange
this->GetRangeRoundingSettings(MinFactorX, GoodExpFactor, MinRangeX);

for (auto i = 0; i < Dims; ++i)
if (UserRange[i] % GoodExpFactor) {
Adjust(i, ((UserRange[i] / GoodExpFactor) + 1) * GoodExpFactor);
}
#else
// Perform range rounding if there are sufficient work-items to
// need rounding and the user-specified range is not a multiple of
// a "good" value.
Expand All @@ -1251,6 +1274,7 @@ class __SYCL_EXPORT handler {
// will yield a rounded-up value for the total range.
Adjust(0, ((RoundedRange[0] + GoodFactor - 1) / GoodFactor) * GoodFactor);
}
#endif // __SYCL_EXP_PARALLEL_FOR_RANGE_ROUNDING__
#ifdef __SYCL_FORCE_PARALLEL_FOR_RANGE_ROUNDING__
// If we are forcing range rounding kernels to be used, we always want the
// rounded range kernel to be generated, even if rounding isn't needed
Expand Down
62 changes: 62 additions & 0 deletions sycl/test-e2e/Basic/parallel_for_range_roundup.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,12 @@
// RUN: %{build} -fsycl-range-rounding=force -o %t.out
// RUN: env SYCL_PARALLEL_FOR_RANGE_ROUNDING_TRACE=1 %{run} %t.out | FileCheck %s --check-prefix=CHECK-DEFAULT

// RUN: %{build} -fsycl-exp-range-rounding -o %t.out
// RUN: env SYCL_PARALLEL_FOR_RANGE_ROUNDING_TRACE=1 %{run} %t.out | FileCheck %s --check-prefix=CHECK-EXP

// RUN: %{build} -fsycl-range-rounding=force -fsycl-exp-range-rounding -o %t.out
// RUN: env SYCL_PARALLEL_FOR_RANGE_ROUNDING_TRACE=1 %{run} %t.out | FileCheck %s --check-prefix=CHECK-EXP
//
// These tests test 3 things:
//
// 1. The user range is the same as the in kernel range (using BufRange) as
Expand Down Expand Up @@ -243,3 +249,59 @@ int main() {
// CHECK-DEFAULT-NEXT: Counter = 540672
// CHECK-DEFAULT-NEXT: Correct kernel indexes used
// CHECK-DEFAULT-NEXT: Counter = 540672

// CHECK-EXP: parallel_for range adjusted at dim 0 from 1500 to 1504
// CHECK-EXP-NEXT: Size seen by user at Dim 0 = 1500
// CHECK-EXP-NEXT: Counter = 1500
// CHECK-EXP-NEXT: Correct kernel indexes used
// CHECK-EXP-NEXT: parallel_for range adjusted at dim 0 from 1500 to 1504
// CHECK-EXP-NEXT: Counter = 1500
// CHECK-EXP-NEXT: Correct kernel indexes used
// CHECK-EXP-NEXT: parallel_for range adjusted at dim 0 from 1500 to 1504
// CHECK-EXP-NEXT: parallel_for range adjusted at dim 1 from 33 to 48
// CHECK-EXP-NEXT: Size seen by user at Dim 0 = 1500
// CHECK-EXP-NEXT: Size seen by user at Dim 1 = 33
// CHECK-EXP-NEXT: Counter = 49500
// CHECK-EXP-NEXT: Correct kernel indexes used
// CHECK-EXP-NEXT: parallel_for range adjusted at dim 0 from 1500 to 1504
// CHECK-EXP-NEXT: parallel_for range adjusted at dim 1 from 33 to 48
// CHECK-EXP-NEXT: Counter = 49500
// CHECK-EXP-NEXT: Correct kernel indexes used
// CHECK-EXP-NEXT: parallel_for range adjusted at dim 0 from 1500 to 1504
// CHECK-EXP-NEXT: parallel_for range adjusted at dim 1 from 33 to 40
// CHECK-EXP-NEXT: Size seen by user at Dim 0 = 1500
// CHECK-EXP-NEXT: Size seen by user at Dim 1 = 33
// CHECK-EXP-NEXT: Size seen by user at Dim 2 = 64
// CHECK-EXP-NEXT: Counter = 3168000
// CHECK-EXP-NEXT: Correct kernel indexes used
// CHECK-EXP-NEXT: parallel_for range adjusted at dim 0 from 1500 to 1504
// CHECK-EXP-NEXT: parallel_for range adjusted at dim 1 from 33 to 40
// CHECK-EXP-NEXT: Counter = 3168000
// CHECK-EXP-NEXT: Correct kernel indexes used
// CHECK-EXP-NEXT: parallel_for range adjusted at dim 0 from 1500 to 1504
// CHECK-EXP-NEXT: parallel_for range adjusted at dim 1 from 33 to 40
// CHECK-EXP-NEXT: Counter = 3168000
// CHECK-EXP-NEXT: Size seen by user at Dim 0 = 256
// CHECK-EXP-NEXT: Counter = 256
// CHECK-EXP-NEXT: Correct kernel indexes used
// CHECK-EXP-NEXT: Counter = 256
// CHECK-EXP-NEXT: Correct kernel indexes used
// CHECK-EXP-NEXT: parallel_for range adjusted at dim 1 from 33 to 48
// CHECK-EXP-NEXT: Size seen by user at Dim 0 = 256
// CHECK-EXP-NEXT: Size seen by user at Dim 1 = 33
// CHECK-EXP-NEXT: Counter = 8448
// CHECK-EXP-NEXT: Correct kernel indexes used
// CHECK-EXP-NEXT: parallel_for range adjusted at dim 1 from 33 to 48
// CHECK-EXP-NEXT: Counter = 8448
// CHECK-EXP-NEXT: Correct kernel indexes used
// CHECK-EXP-NEXT: parallel_for range adjusted at dim 1 from 33 to 40
// CHECK-EXP-NEXT: Size seen by user at Dim 0 = 256
// CHECK-EXP-NEXT: Size seen by user at Dim 1 = 33
// CHECK-EXP-NEXT: Size seen by user at Dim 2 = 64
// CHECK-EXP-NEXT: Counter = 540672
// CHECK-EXP-NEXT: Correct kernel indexes used
// CHECK-EXP-NEXT: parallel_for range adjusted at dim 1 from 33 to 40
// CHECK-EXP-NEXT: Counter = 540672
// CHECK-EXP-NEXT: Correct kernel indexes used
// CHECK-EXP-NEXT: parallel_for range adjusted at dim 1 from 33 to 40
// CHECK-EXP-NEXT: Counter = 540672
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
// RUN: echo "Running parallel_for benchmark without range rounding"
// RUN: %{build} -fsycl-range-rounding=disable -o %t.out
// RUN: %{run} %t.out

// RUN: echo "Running parallel_for benchmark with normal range rounding"
// RUN: %{build} -fsycl-range-rounding=force -o %t.out
// RUN: %{run} %t.out

// RUN: echo "Running parallel_for benchmark with experimental range rounding"
// RUN: %{build} -fsycl-exp-range-rounding -fsycl-range-rounding=force -o %t.out
// RUN: %{run} %t.out

#include <chrono>
#include <iostream>
#include <sycl/sycl.hpp>

class FillData;
class Compute;

int main() {
constexpr static size_t width{788};
constexpr static size_t height{1888};
constexpr static size_t N{width * height};
constexpr static size_t iterations{1000};

sycl::queue q{};
float *A{sycl::malloc_device<float>(N, q)};
float *B{sycl::malloc_device<float>(N, q)};
float *C{sycl::malloc_device<float>(N, q)};

q.submit([&](sycl::handler &cgh) {
cgh.parallel_for<FillData>(
sycl::range<2>{height, width}, [=](sycl::id<2> id) {
unsigned int row{static_cast<unsigned int>(id[0])};
unsigned int col{static_cast<unsigned int>(id[1])};
unsigned int ix{row * static_cast<unsigned int>(width) + col};
A[ix] = id[0];
B[ix] = id[1];
});
}).wait_and_throw();

auto start{std::chrono::steady_clock::now()};
for (size_t i{0}; i < iterations; ++i) {
q.submit([&](sycl::handler &cgh) {
cgh.parallel_for<Compute>(
sycl::range<2>{height, width}, [=](sycl::id<2> id) {
unsigned int row{static_cast<unsigned int>(id[0])};
unsigned int col{static_cast<unsigned int>(id[1])};
unsigned int ix{row * static_cast<unsigned int>(width) + col};
if (ix >= static_cast<unsigned int>(N)) {
return;
}
if (A[ix] > B[ix]) {
C[ix] = A[ix];
} else {
C[ix] = B[ix];
}
});
}).wait_and_throw();
}
auto end{std::chrono::steady_clock::now()};
std::cout << std::chrono::duration_cast<std::chrono::milliseconds>(end -
start)
.count()
<< " ms" << std::endl;
}
Loading